diff options
Diffstat (limited to 'kernel/rcu')
-rw-r--r-- | kernel/rcu/Kconfig | 37 | ||||
-rw-r--r-- | kernel/rcu/Kconfig.debug | 18 | ||||
-rw-r--r-- | kernel/rcu/rcu.h | 13 | ||||
-rw-r--r-- | kernel/rcu/rcutorture.c | 126 | ||||
-rw-r--r-- | kernel/rcu/refscale.c | 32 | ||||
-rw-r--r-- | kernel/rcu/srcutiny.c | 20 | ||||
-rw-r--r-- | kernel/rcu/srcutree.c | 209 | ||||
-rw-r--r-- | kernel/rcu/tasks.h | 7 | ||||
-rw-r--r-- | kernel/rcu/tiny.c | 29 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 73 | ||||
-rw-r--r-- | kernel/rcu/tree_exp.h | 6 | ||||
-rw-r--r-- | kernel/rcu/tree_nocb.h | 24 | ||||
-rw-r--r-- | kernel/rcu/tree_plugin.h | 22 |
13 files changed, 424 insertions, 192 deletions
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index b9b6bc55185d..4d9b21f69eaa 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -18,7 +18,7 @@ config TREE_RCU config PREEMPT_RCU bool - default y if PREEMPTION + default y if (PREEMPT || PREEMPT_RT || PREEMPT_DYNAMIC) select TREE_RCU help This option selects the RCU implementation that is @@ -65,6 +65,19 @@ config TREE_SRCU help This option selects the full-fledged version of SRCU. +config FORCE_NEED_SRCU_NMI_SAFE + bool "Force selection of NEED_SRCU_NMI_SAFE" + depends on !TINY_SRCU + depends on RCU_EXPERT + depends on ARCH_HAS_NMI_SAFE_THIS_CPU_OPS + select NEED_SRCU_NMI_SAFE + default n + help + This option forces selection of the NEED_SRCU_NMI_SAFE + Kconfig option, allowing testing of srcu_read_lock_nmisafe() + and srcu_read_unlock_nmisafe() on architectures (like x86) + that select the ARCH_HAS_NMI_SAFE_THIS_CPU_OPS Kconfig option. + config NEED_SRCU_NMI_SAFE def_bool HAVE_NMI && !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !TINY_SRCU @@ -91,7 +104,7 @@ config NEED_TASKS_RCU config TASKS_RCU bool - default NEED_TASKS_RCU && (PREEMPTION || PREEMPT_AUTO) + default NEED_TASKS_RCU && PREEMPTION select IRQ_WORK config FORCE_TASKS_RUDE_RCU @@ -323,21 +336,27 @@ config RCU_LAZY depends on RCU_NOCB_CPU default n help - To save power, batch RCU callbacks and flush after delay, memory - pressure, or callback list growing too big. + To save power, batch RCU callbacks and delay starting the + corresponding grace period for multiple seconds. The grace + period will be started after this delay, in case of memory + pressure, or if the corresponding CPU's callback list grows + too large. - Requires rcu_nocbs=all to be set. + These delays happen only on rcu_nocbs CPUs, that is, CPUs + whose callbacks have been offloaded. - Use rcutree.enable_rcu_lazy=0 to turn it off at boot time. + Use the rcutree.enable_rcu_lazy=0 kernel-boot parameter to + globally disable these delays. config RCU_LAZY_DEFAULT_OFF bool "Turn RCU lazy invocation off by default" depends on RCU_LAZY default n help - Allows building the kernel with CONFIG_RCU_LAZY=y yet keep it default - off. Boot time param rcutree.enable_rcu_lazy=1 can be used to switch - it back on. + Build the kernel with CONFIG_RCU_LAZY=y, but cause the kernel + to boot with these energy-efficiency delays disabled. Use the + rcutree.enable_rcu_lazy=0 kernel-boot parameter to override + the this option at boot time, thus re-enabling these delays. config RCU_DOUBLE_CHECK_CB_TIME bool "RCU callback-batch backup time check" diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 6af90510a1ca..12e4c64ebae1 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -54,7 +54,7 @@ config RCU_TORTURE_TEST Say N if you are unsure. config RCU_TORTURE_TEST_CHK_RDR_STATE - tristate "Check rcutorture reader state" + bool "Check rcutorture reader state" depends on RCU_TORTURE_TEST default n help @@ -70,7 +70,7 @@ config RCU_TORTURE_TEST_CHK_RDR_STATE Say N if you are unsure. config RCU_TORTURE_TEST_LOG_CPU - tristate "Log CPU for rcutorture failures" + bool "Log CPU for rcutorture failures" depends on RCU_TORTURE_TEST default n help @@ -84,6 +84,20 @@ config RCU_TORTURE_TEST_LOG_CPU Say Y here if you want CPU IDs logged. Say N if you are unsure. +config RCU_TORTURE_TEST_LOG_GP + bool "Log grace-period numbers for rcutorture failures" + depends on RCU_TORTURE_TEST + default n + help + This option causes rcutorture to decorate each entry of its + log of failure/close-call rcutorture reader segments with the + corresponding grace-period sequence numbers. This information + can be useful, but it does incur additional overhead, overhead + that can make both failures and close calls less probable. + + Say Y here if you want grace-period sequence numbers logged. + Say N if you are unsure. + config RCU_REF_SCALE_TEST tristate "Scalability tests for read-side synchronization (RCU and others)" depends on DEBUG_KERNEL diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index feb3ac1dc5d5..eed2951a4962 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -162,7 +162,7 @@ static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s) { unsigned long cur_s = READ_ONCE(*sp); - return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_STATE_MASK + 1)); + return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (3 * RCU_SEQ_STATE_MASK + 1)); } /* @@ -590,6 +590,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename, #endif static inline void rcu_gp_set_torture_wait(int duration) { } #endif +unsigned long long rcutorture_gather_gp_seqs(void); +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len); #ifdef CONFIG_TINY_SRCU @@ -611,8 +613,6 @@ void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags, static inline bool rcu_watching_zero_in_eqs(int cpu, int *vp) { return false; } static inline unsigned long rcu_get_gp_seq(void) { return 0; } static inline unsigned long rcu_exp_batches_completed(void) { return 0; } -static inline unsigned long -srcu_batches_completed(struct srcu_struct *sp) { return 0; } static inline void rcu_force_quiescent_state(void) { } static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { return true; } static inline void show_rcu_gp_kthreads(void) { } @@ -624,7 +624,6 @@ static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { } bool rcu_watching_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); -unsigned long srcu_batches_completed(struct srcu_struct *sp); bool rcu_check_boost_fail(unsigned long gp_state, int *cpup); void show_rcu_gp_kthreads(void); int rcu_get_gp_kthreads_prio(void); @@ -636,6 +635,12 @@ void rcu_gp_slow_register(atomic_t *rgssp); void rcu_gp_slow_unregister(atomic_t *rgssp); #endif /* #else #ifdef CONFIG_TINY_RCU */ +#ifdef CONFIG_TINY_SRCU +static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) { return 0; } +#else // #ifdef CONFIG_TINY_SRCU +unsigned long srcu_batches_completed(struct srcu_struct *sp); +#endif // #else // #ifdef CONFIG_TINY_SRCU + #ifdef CONFIG_RCU_NOCB_CPU void rcu_bind_current_to_nocb(void); #else diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d26fb1d33ed9..4fa7772be183 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -135,6 +135,7 @@ torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s torture_param(int, stutter, 5, "Number of seconds to run/halt test"); torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds."); +torture_param(int, test_boost_holdoff, 0, "Holdoff time from rcutorture start, seconds."); torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds."); torture_param(int, test_nmis, 0, "End-test NMI tests, 0 to disable."); torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs"); @@ -147,6 +148,7 @@ MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)"); static int nrealnocbers; static int nrealreaders; +static int nrealfakewriters; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; @@ -272,6 +274,9 @@ struct rt_read_seg { bool rt_preempted; int rt_cpu; int rt_end_cpu; + unsigned long long rt_gp_seq; + unsigned long long rt_gp_seq_end; + u64 rt_ts; }; static int err_segs_recorded; static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS]; @@ -406,6 +411,8 @@ struct rcu_torture_ops { void (*gp_slow_register)(atomic_t *rgssp); void (*gp_slow_unregister)(atomic_t *rgssp); bool (*reader_blocked)(void); + unsigned long long (*gather_gp_seqs)(void); + void (*format_gp_seqs)(unsigned long long seqs, char *cp, size_t len); long cbflood_max; int irq_capable; int can_boost; @@ -610,6 +617,8 @@ static struct rcu_torture_ops rcu_ops = { .reader_blocked = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU) ? has_rcu_reader_blocked : NULL, + .gather_gp_seqs = rcutorture_gather_gp_seqs, + .format_gp_seqs = rcutorture_format_gp_seqs, .irq_capable = 1, .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), .extendables = RCUTORTURE_MAX_EXTEND, @@ -655,6 +664,8 @@ static struct rcu_torture_ops rcu_busted_ops = { .sync = synchronize_rcu_busted, .exp_sync = synchronize_rcu_busted, .call = call_rcu_busted, + .gather_gp_seqs = rcutorture_gather_gp_seqs, + .format_gp_seqs = rcutorture_format_gp_seqs, .irq_capable = 1, .extendables = RCUTORTURE_MAX_EXTEND, .name = "busted" @@ -677,8 +688,11 @@ static void srcu_get_gp_data(int *flags, unsigned long *gp_seq) static int srcu_torture_read_lock(void) { int idx; + struct srcu_ctr __percpu *scp; int ret = 0; + WARN_ON_ONCE(reader_flavor & ~SRCU_READ_FLAVOR_ALL); + if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) { idx = srcu_read_lock(srcu_ctlp); WARN_ON_ONCE(idx & ~0x1); @@ -694,6 +708,12 @@ static int srcu_torture_read_lock(void) WARN_ON_ONCE(idx & ~0x1); ret += idx << 2; } + if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + scp = srcu_read_lock_fast(srcu_ctlp); + idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx << 3; + } return ret; } @@ -719,6 +739,8 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) static void srcu_torture_read_unlock(int idx) { WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); + if (reader_flavor & SRCU_READ_FLAVOR_FAST) + srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); if (reader_flavor & SRCU_READ_FLAVOR_LITE) srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2); if (reader_flavor & SRCU_READ_FLAVOR_NMI) @@ -791,6 +813,7 @@ static struct rcu_torture_ops srcu_ops = { .readunlock = srcu_torture_read_unlock, .readlock_held = torture_srcu_read_lock_held, .get_gp_seq = srcu_torture_completed, + .gp_diff = rcu_seq_diff, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, @@ -834,6 +857,7 @@ static struct rcu_torture_ops srcud_ops = { .readunlock = srcu_torture_read_unlock, .readlock_held = torture_srcu_read_lock_held, .get_gp_seq = srcu_torture_completed, + .gp_diff = rcu_seq_diff, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, @@ -1148,8 +1172,19 @@ static int rcu_torture_boost(void *arg) unsigned long gp_state; unsigned long gp_state_time; unsigned long oldstarttime; + unsigned long booststarttime = get_torture_init_jiffies() + test_boost_holdoff * HZ; - VERBOSE_TOROUT_STRING("rcu_torture_boost started"); + if (test_boost_holdoff <= 0 || time_after(jiffies, booststarttime)) { + VERBOSE_TOROUT_STRING("rcu_torture_boost started"); + } else { + VERBOSE_TOROUT_STRING("rcu_torture_boost started holdoff period"); + while (time_before(jiffies, booststarttime)) { + schedule_timeout_idle(HZ); + if (kthread_should_stop()) + goto cleanup; + } + VERBOSE_TOROUT_STRING("rcu_torture_boost finished holdoff period"); + } /* Set real-time priority. */ sched_set_fifo_low(current); @@ -1225,6 +1260,7 @@ checkwait: if (stutter_wait("rcu_torture_boost")) sched_set_fifo_low(current); } while (!torture_must_stop()); +cleanup: /* Clean up and exit. */ while (!kthread_should_stop()) { torture_shutdown_absorb("rcu_torture_boost"); @@ -1728,7 +1764,7 @@ rcu_torture_fakewriter(void *arg) do { torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand); if (cur_ops->cb_barrier != NULL && - torture_random(&rand) % (nfakewriters * 8) == 0) { + torture_random(&rand) % (nrealfakewriters * 8) == 0) { cur_ops->cb_barrier(); } else { switch (synctype[torture_random(&rand) % nsynctypes]) { @@ -1873,6 +1909,8 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp, #define ROEC_ARGS "%s %s: Current %#x To add %#x To remove %#x preempt_count() %#x\n", __func__, s, curstate, new, old, preempt_count() static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, bool insoftirq) { + int mask; + if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE)) return; @@ -1899,11 +1937,27 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, WARN_ONCE(cur_ops->extendables && !(curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) && (preempt_count() & SOFTIRQ_MASK), ROEC_ARGS); - WARN_ONCE(cur_ops->extendables && - !(curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) && + + /* + * non-preemptible RCU in a preemptible kernel uses preempt_disable() + * as rcu_read_lock(). + */ + mask = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED; + if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) + mask |= RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2; + + WARN_ONCE(cur_ops->extendables && !(curstate & mask) && (preempt_count() & PREEMPT_MASK), ROEC_ARGS); - WARN_ONCE(cur_ops->readlock_nesting && - !(curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) && + + /* + * non-preemptible RCU in a preemptible kernel uses "preempt_count() & + * PREEMPT_MASK" as ->readlock_nesting(). + */ + mask = RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2; + if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) + mask |= RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED; + + WARN_ONCE(cur_ops->readlock_nesting && !(curstate & mask) && cur_ops->readlock_nesting() > 0, ROEC_ARGS); } @@ -1965,6 +2019,13 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, rtrsp[-1].rt_preempted = cur_ops->reader_blocked(); } } + // Sample grace-period sequence number, as good a place as any. + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && cur_ops->gather_gp_seqs) { + rtrsp->rt_gp_seq = cur_ops->gather_gp_seqs(); + rtrsp->rt_ts = ktime_get_mono_fast_ns(); + if (!first) + rtrsp[-1].rt_gp_seq_end = rtrsp->rt_gp_seq; + } /* * Next, remove old protection, in decreasing order of strength @@ -2263,7 +2324,7 @@ rcu_torture_reader(void *arg) stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) { - del_timer_sync(&t); + timer_delete_sync(&t); destroy_timer_on_stack(&t); } tick_dep_clear_task(current, TICK_DEP_BIT_RCU); @@ -2512,7 +2573,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "shuffle_interval=%d stutter=%d irqreader=%d " "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " "test_boost=%d/%d test_boost_interval=%d " - "test_boost_duration=%d shutdown_secs=%d " + "test_boost_duration=%d test_boost_holdoff=%d shutdown_secs=%d " "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " "stall_cpu_block=%d stall_cpu_repeat=%d " "n_barrier_cbs=%d " @@ -2522,11 +2583,11 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "nocbs_nthreads=%d nocbs_toggle=%d " "test_nmis=%d " "preempt_duration=%d preempt_interval=%d\n", - torture_type, tag, nrealreaders, nfakewriters, + torture_type, tag, nrealreaders, nrealfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, test_boost, cur_ops->can_boost, - test_boost_interval, test_boost_duration, shutdown_secs, + test_boost_interval, test_boost_duration, test_boost_holdoff, shutdown_secs, stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, stall_cpu_block, stall_cpu_repeat, n_barrier_cbs, @@ -3553,6 +3614,7 @@ rcu_torture_cleanup(void) int flags = 0; unsigned long gp_seq = 0; int i; + int j; if (torture_cleanup_begin()) { if (cur_ops->cb_barrier != NULL) { @@ -3597,7 +3659,7 @@ rcu_torture_cleanup(void) rcu_torture_reader_mbchk = NULL; if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) + for (i = 0; i < nrealfakewriters; i++) torture_stop_kthread(rcu_torture_fakewriter, fakewriter_tasks[i]); kfree(fakewriter_tasks); @@ -3635,7 +3697,11 @@ rcu_torture_cleanup(void) pr_alert("\t: No segments recorded!!!\n"); firsttime = 1; for (i = 0; i < rt_read_nsegs; i++) { - pr_alert("\t%d: %#4x", i, err_segs[i].rt_readstate); + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP)) + pr_alert("\t%lluus ", div64_u64(err_segs[i].rt_ts, 1000ULL)); + else + pr_alert("\t"); + pr_cont("%d: %#4x", i, err_segs[i].rt_readstate); if (err_segs[i].rt_delay_jiffies != 0) { pr_cont("%s%ldjiffies", firsttime ? "" : "+", err_segs[i].rt_delay_jiffies); @@ -3648,6 +3714,27 @@ rcu_torture_cleanup(void) else pr_cont(" ..."); } + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && + cur_ops->gather_gp_seqs && cur_ops->format_gp_seqs) { + char buf1[20+1]; + char buf2[20+1]; + char sepchar = '-'; + + cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq, + buf1, ARRAY_SIZE(buf1)); + cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq_end, + buf2, ARRAY_SIZE(buf2)); + if (err_segs[i].rt_gp_seq == err_segs[i].rt_gp_seq_end) { + if (buf2[0]) { + for (j = 0; buf2[j]; j++) + buf2[j] = '.'; + if (j) + buf2[j - 1] = ' '; + } + sepchar = ' '; + } + pr_cont(" %s%c%s", buf1, sepchar, buf2); + } if (err_segs[i].rt_delay_ms != 0) { pr_cont(" %s%ldms", firsttime ? "" : "+", err_segs[i].rt_delay_ms); @@ -3994,6 +4081,14 @@ rcu_torture_init(void) rcu_torture_init_srcu_lockdep(); + if (nfakewriters >= 0) { + nrealfakewriters = nfakewriters; + } else { + nrealfakewriters = num_online_cpus() - 2 - nfakewriters; + if (nrealfakewriters <= 0) + nrealfakewriters = 1; + } + if (nreaders >= 0) { nrealreaders = nreaders; } else { @@ -4050,8 +4145,9 @@ rcu_torture_init(void) writer_task); if (torture_init_error(firsterr)) goto unwind; - if (nfakewriters > 0) { - fakewriter_tasks = kcalloc(nfakewriters, + + if (nrealfakewriters > 0) { + fakewriter_tasks = kcalloc(nrealfakewriters, sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { @@ -4060,7 +4156,7 @@ rcu_torture_init(void) goto unwind; } } - for (i = 0; i < nfakewriters; i++) { + for (i = 0; i < nrealfakewriters; i++) { firsterr = torture_create_kthread(rcu_torture_fakewriter, NULL, fakewriter_tasks[i]); if (torture_init_error(firsterr)) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 1b47376acdc4..f11a7c2af778 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -216,6 +216,36 @@ static const struct ref_scale_ops srcu_ops = { .name = "srcu" }; +static void srcu_fast_ref_scale_read_section(const int nloops) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = srcu_read_lock_fast(srcu_ctlp); + srcu_read_unlock_fast(srcu_ctlp, scp); + } +} + +static void srcu_fast_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = srcu_read_lock_fast(srcu_ctlp); + un_delay(udl, ndl); + srcu_read_unlock_fast(srcu_ctlp, scp); + } +} + +static const struct ref_scale_ops srcu_fast_ops = { + .init = rcu_sync_scale_init, + .readsection = srcu_fast_ref_scale_read_section, + .delaysection = srcu_fast_ref_scale_delay_section, + .name = "srcu-fast" +}; + static void srcu_lite_ref_scale_read_section(const int nloops) { int i; @@ -1163,7 +1193,7 @@ ref_scale_init(void) long i; int firsterr = 0; static const struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS + &rcu_ops, &srcu_ops, &srcu_fast_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &sched_clock_ops, &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 4dcbf8aa80ff..6e9fe2ce1075 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -20,7 +20,11 @@ #include "rcu_segcblist.h" #include "rcu.h" +#ifndef CONFIG_TREE_RCU int rcu_scheduler_active __read_mostly; +#else // #ifndef CONFIG_TREE_RCU +extern int rcu_scheduler_active; +#endif // #else // #ifndef CONFIG_TREE_RCU static LIST_HEAD(srcu_boot_list); static bool srcu_init_done; @@ -98,7 +102,7 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { int newval; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); preempt_enable(); @@ -120,7 +124,7 @@ void srcu_drive_gp(struct work_struct *wp) struct srcu_struct *ssp; ssp = container_of(wp, struct srcu_struct, srcu_work); - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) { preempt_enable(); return; /* Already running or nothing to do. */ @@ -138,7 +142,7 @@ void srcu_drive_gp(struct work_struct *wp) WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ preempt_enable(); swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx])); - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); preempt_enable(); @@ -159,7 +163,7 @@ void srcu_drive_gp(struct work_struct *wp) * at interrupt level, but the ->srcu_gp_running checks will * straighten that out. */ - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY WRITE_ONCE(ssp->srcu_gp_running, false); idx = ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)); preempt_enable(); @@ -172,7 +176,7 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp) { unsigned long cookie; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY cookie = get_state_synchronize_srcu(ssp); if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) { preempt_enable(); @@ -199,7 +203,7 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rhp->func = func; rhp->next = NULL; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY local_irq_save(flags); *ssp->srcu_cb_tail = rhp; ssp->srcu_cb_tail = &rhp->next; @@ -261,7 +265,7 @@ unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp) { unsigned long ret; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY ret = get_state_synchronize_srcu(ssp); srcu_gp_start_if_needed(ssp); preempt_enable(); @@ -282,11 +286,13 @@ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) } EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); +#ifndef CONFIG_TREE_RCU /* Lockdep diagnostics. */ void __init rcu_scheduler_starting(void) { rcu_scheduler_active = RCU_SCHEDULER_RUNNING; } +#endif // #ifndef CONFIG_TREE_RCU /* * Queue work for srcu_struct structures with early boot callbacks. diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index b83c74c4dcc0..9a59b071501b 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -116,8 +116,9 @@ do { \ /* * Initialize SRCU per-CPU data. Note that statically allocated * srcu_struct structures might already have srcu_read_lock() and - * srcu_read_unlock() running against them. So if the is_static parameter - * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. + * srcu_read_unlock() running against them. So if the is_static + * parameter is set, don't initialize ->srcu_ctrs[].srcu_locks and + * ->srcu_ctrs[].srcu_unlocks. */ static void init_srcu_struct_data(struct srcu_struct *ssp) { @@ -128,8 +129,6 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) * Initialize the per-CPU srcu_data array, which feeds into the * leaves of the srcu_node tree. */ - BUILD_BUG_ON(ARRAY_SIZE(sdp->srcu_lock_count) != - ARRAY_SIZE(sdp->srcu_unlock_count)); for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); @@ -247,15 +246,16 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) ssp->srcu_sup->node = NULL; mutex_init(&ssp->srcu_sup->srcu_cb_mutex); mutex_init(&ssp->srcu_sup->srcu_gp_mutex); - ssp->srcu_idx = 0; ssp->srcu_sup->srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL; ssp->srcu_sup->srcu_barrier_seq = 0; mutex_init(&ssp->srcu_sup->srcu_barrier_mutex); atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0); INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu); ssp->srcu_sup->sda_is_static = is_static; - if (!is_static) + if (!is_static) { ssp->sda = alloc_percpu(struct srcu_data); + ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0]; + } if (!ssp->sda) goto err_free_sup; init_srcu_struct_data(ssp); @@ -429,10 +429,10 @@ static bool srcu_gp_is_expedited(struct srcu_struct *ssp) } /* - * Computes approximate total of the readers' ->srcu_lock_count[] values - * for the rank of per-CPU counters specified by idx, and returns true if - * the caller did the proper barrier (gp), and if the count of the locks - * matches that of the unlocks passed in. + * Computes approximate total of the readers' ->srcu_ctrs[].srcu_locks + * values for the rank of per-CPU counters specified by idx, and returns + * true if the caller did the proper barrier (gp), and if the count of + * the locks matches that of the unlocks passed in. */ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, unsigned long unlocks) { @@ -443,20 +443,20 @@ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, uns for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&sdp->srcu_lock_count[idx]); + sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks); if (IS_ENABLED(CONFIG_PROVE_RCU)) mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), "Mixed reader flavors for srcu_struct at %ps.\n", ssp); - if (mask & SRCU_READ_FLAVOR_LITE && !gp) + if (mask & SRCU_READ_FLAVOR_SLOWGP && !gp) return false; return sum == unlocks; } /* - * Returns approximate total of the readers' ->srcu_unlock_count[] values - * for the rank of per-CPU counters specified by idx. + * Returns approximate total of the readers' ->srcu_ctrs[].srcu_unlocks + * values for the rank of per-CPU counters specified by idx. */ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, unsigned long *rdm) { @@ -467,7 +467,7 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, u for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&sdp->srcu_unlock_count[idx]); + sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks); mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), @@ -487,7 +487,7 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) unsigned long unlocks; unlocks = srcu_readers_unlock_idx(ssp, idx, &rdm); - did_gp = !!(rdm & SRCU_READ_FLAVOR_LITE); + did_gp = !!(rdm & SRCU_READ_FLAVOR_SLOWGP); /* * Make sure that a lock is always counted if the corresponding @@ -509,48 +509,49 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) * If the locks are the same as the unlocks, then there must have * been no readers on this index at some point in this function. * But there might be more readers, as a task might have read - * the current ->srcu_idx but not yet have incremented its CPU's - * ->srcu_lock_count[idx] counter. In fact, it is possible + * the current ->srcu_ctrp but not yet have incremented its CPU's + * ->srcu_ctrs[idx].srcu_locks counter. In fact, it is possible * that most of the tasks have been preempted between fetching - * ->srcu_idx and incrementing ->srcu_lock_count[idx]. And there - * could be almost (ULONG_MAX / sizeof(struct task_struct)) tasks - * in a system whose address space was fully populated with memory. - * Call this quantity Nt. + * ->srcu_ctrp and incrementing ->srcu_ctrs[idx].srcu_locks. And + * there could be almost (ULONG_MAX / sizeof(struct task_struct)) + * tasks in a system whose address space was fully populated + * with memory. Call this quantity Nt. * - * So suppose that the updater is preempted at this point in the - * code for a long time. That now-preempted updater has already - * flipped ->srcu_idx (possibly during the preceding grace period), - * done an smp_mb() (again, possibly during the preceding grace - * period), and summed up the ->srcu_unlock_count[idx] counters. - * How many times can a given one of the aforementioned Nt tasks - * increment the old ->srcu_idx value's ->srcu_lock_count[idx] - * counter, in the absence of nesting? + * So suppose that the updater is preempted at this + * point in the code for a long time. That now-preempted + * updater has already flipped ->srcu_ctrp (possibly during + * the preceding grace period), done an smp_mb() (again, + * possibly during the preceding grace period), and summed up + * the ->srcu_ctrs[idx].srcu_unlocks counters. How many times + * can a given one of the aforementioned Nt tasks increment the + * old ->srcu_ctrp value's ->srcu_ctrs[idx].srcu_locks counter, + * in the absence of nesting? * * It can clearly do so once, given that it has already fetched - * the old value of ->srcu_idx and is just about to use that value - * to index its increment of ->srcu_lock_count[idx]. But as soon as - * it leaves that SRCU read-side critical section, it will increment - * ->srcu_unlock_count[idx], which must follow the updater's above - * read from that same value. Thus, as soon the reading task does - * an smp_mb() and a later fetch from ->srcu_idx, that task will be - * guaranteed to get the new index. Except that the increment of - * ->srcu_unlock_count[idx] in __srcu_read_unlock() is after the - * smp_mb(), and the fetch from ->srcu_idx in __srcu_read_lock() - * is before the smp_mb(). Thus, that task might not see the new - * value of ->srcu_idx until the -second- __srcu_read_lock(), - * which in turn means that this task might well increment - * ->srcu_lock_count[idx] for the old value of ->srcu_idx twice, - * not just once. + * the old value of ->srcu_ctrp and is just about to use that + * value to index its increment of ->srcu_ctrs[idx].srcu_locks. + * But as soon as it leaves that SRCU read-side critical section, + * it will increment ->srcu_ctrs[idx].srcu_unlocks, which must + * follow the updater's above read from that same value. Thus, + as soon the reading task does an smp_mb() and a later fetch from + * ->srcu_ctrp, that task will be guaranteed to get the new index. + * Except that the increment of ->srcu_ctrs[idx].srcu_unlocks + * in __srcu_read_unlock() is after the smp_mb(), and the fetch + * from ->srcu_ctrp in __srcu_read_lock() is before the smp_mb(). + * Thus, that task might not see the new value of ->srcu_ctrp until + * the -second- __srcu_read_lock(), which in turn means that this + * task might well increment ->srcu_ctrs[idx].srcu_locks for the + * old value of ->srcu_ctrp twice, not just once. * * However, it is important to note that a given smp_mb() takes * effect not just for the task executing it, but also for any * later task running on that same CPU. * - * That is, there can be almost Nt + Nc further increments of - * ->srcu_lock_count[idx] for the old index, where Nc is the number - * of CPUs. But this is OK because the size of the task_struct - * structure limits the value of Nt and current systems limit Nc - * to a few thousand. + * That is, there can be almost Nt + Nc further increments + * of ->srcu_ctrs[idx].srcu_locks for the old index, where Nc + * is the number of CPUs. But this is OK because the size of + * the task_struct structure limits the value of Nt and current + * systems limit Nc to a few thousand. * * OK, but what about nesting? This does impose a limit on * nesting of half of the size of the task_struct structure @@ -581,10 +582,10 @@ static bool srcu_readers_active(struct srcu_struct *ssp) for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&sdp->srcu_lock_count[0]); - sum += atomic_long_read(&sdp->srcu_lock_count[1]); - sum -= atomic_long_read(&sdp->srcu_unlock_count[0]); - sum -= atomic_long_read(&sdp->srcu_unlock_count[1]); + sum += atomic_long_read(&sdp->srcu_ctrs[0].srcu_locks); + sum += atomic_long_read(&sdp->srcu_ctrs[1].srcu_locks); + sum -= atomic_long_read(&sdp->srcu_ctrs[0].srcu_unlocks); + sum -= atomic_long_read(&sdp->srcu_ctrs[1].srcu_unlocks); } return sum; } @@ -647,6 +648,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) unsigned long jbase = SRCU_INTERVAL; struct srcu_usage *sup = ssp->srcu_sup; + lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); if (srcu_gp_is_expedited(ssp)) jbase = 0; if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) { @@ -674,9 +676,13 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) void cleanup_srcu_struct(struct srcu_struct *ssp) { int cpu; + unsigned long delay; struct srcu_usage *sup = ssp->srcu_sup; - if (WARN_ON(!srcu_get_delay(ssp))) + spin_lock_irq_rcu_node(ssp->srcu_sup); + delay = srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); + if (WARN_ON(!delay)) return; /* Just leak it! */ if (WARN_ON(srcu_readers_active(ssp))) return; /* Just leak it! */ @@ -684,7 +690,7 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - del_timer_sync(&sdp->delay_work); + timer_delete_sync(&sdp->delay_work); flush_work(&sdp->work); if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist))) return; /* Forgot srcu_barrier(), so just leak it! */ @@ -743,12 +749,11 @@ EXPORT_SYMBOL_GPL(__srcu_check_read_flavor); */ int __srcu_read_lock(struct srcu_struct *ssp) { - int idx; + struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); - idx = READ_ONCE(ssp->srcu_idx) & 0x1; - this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); + this_cpu_inc(scp->srcu_locks.counter); smp_mb(); /* B */ /* Avoid leaking the critical section. */ - return idx; + return __srcu_ptr_to_ctr(ssp, scp); } EXPORT_SYMBOL_GPL(__srcu_read_lock); @@ -760,7 +765,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter); + this_cpu_inc(__srcu_ctr_to_ptr(ssp, idx)->srcu_unlocks.counter); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -773,13 +778,12 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); */ int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) { - int idx; - struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); + struct srcu_ctr __percpu *scpp = READ_ONCE(ssp->srcu_ctrp); + struct srcu_ctr *scp = raw_cpu_ptr(scpp); - idx = READ_ONCE(ssp->srcu_idx) & 0x1; - atomic_long_inc(&sdp->srcu_lock_count[idx]); + atomic_long_inc(&scp->srcu_locks); smp_mb__after_atomic(); /* B */ /* Avoid leaking the critical section. */ - return idx; + return __srcu_ptr_to_ctr(ssp, scpp); } EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe); @@ -790,10 +794,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe); */ void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) { - struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); - smp_mb__before_atomic(); /* C */ /* Avoid leaking the critical section. */ - atomic_long_inc(&sdp->srcu_unlock_count[idx]); + atomic_long_inc(&raw_cpu_ptr(__srcu_ctr_to_ptr(ssp, idx))->srcu_unlocks); } EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe); @@ -1096,13 +1098,15 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, /* * Wait until all readers counted by array index idx complete, but * loop an additional time if there is an expedited grace period pending. - * The caller must ensure that ->srcu_idx is not changed while checking. + * The caller must ensure that ->srcu_ctrp is not changed while checking. */ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) { unsigned long curdelay; + spin_lock_irq_rcu_node(ssp->srcu_sup); curdelay = !srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); for (;;) { if (srcu_readers_active_idx_check(ssp, idx)) @@ -1114,30 +1118,30 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) } /* - * Increment the ->srcu_idx counter so that future SRCU readers will + * Increment the ->srcu_ctrp counter so that future SRCU readers will * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows * us to wait for pre-existing readers in a starvation-free manner. */ static void srcu_flip(struct srcu_struct *ssp) { /* - * Because the flip of ->srcu_idx is executed only if the + * Because the flip of ->srcu_ctrp is executed only if the * preceding call to srcu_readers_active_idx_check() found that - * the ->srcu_unlock_count[] and ->srcu_lock_count[] sums matched - * and because that summing uses atomic_long_read(), there is - * ordering due to a control dependency between that summing and - * the WRITE_ONCE() in this call to srcu_flip(). This ordering - * ensures that if this updater saw a given reader's increment from - * __srcu_read_lock(), that reader was using a value of ->srcu_idx - * from before the previous call to srcu_flip(), which should be - * quite rare. This ordering thus helps forward progress because - * the grace period could otherwise be delayed by additional - * calls to __srcu_read_lock() using that old (soon to be new) - * value of ->srcu_idx. + * the ->srcu_ctrs[].srcu_unlocks and ->srcu_ctrs[].srcu_locks sums + * matched and because that summing uses atomic_long_read(), + * there is ordering due to a control dependency between that + * summing and the WRITE_ONCE() in this call to srcu_flip(). + * This ordering ensures that if this updater saw a given reader's + * increment from __srcu_read_lock(), that reader was using a value + * of ->srcu_ctrp from before the previous call to srcu_flip(), + * which should be quite rare. This ordering thus helps forward + * progress because the grace period could otherwise be delayed + * by additional calls to __srcu_read_lock() using that old (soon + * to be new) value of ->srcu_ctrp. * * This sum-equality check and ordering also ensures that if * a given call to __srcu_read_lock() uses the new value of - * ->srcu_idx, this updater's earlier scans cannot have seen + * ->srcu_ctrp, this updater's earlier scans cannot have seen * that reader's increments, which is all to the good, because * this grace period need not wait on that reader. After all, * if those earlier scans had seen that reader, there would have @@ -1152,7 +1156,8 @@ static void srcu_flip(struct srcu_struct *ssp) */ smp_mb(); /* E */ /* Pairs with B and C. */ - WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); // Flip the counter. + WRITE_ONCE(ssp->srcu_ctrp, + &ssp->sda->srcu_ctrs[!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0])]); /* * Ensure that if the updater misses an __srcu_read_unlock() @@ -1198,7 +1203,7 @@ static bool srcu_should_expedite(struct srcu_struct *ssp) check_init_srcu_struct(ssp); /* If _lite() readers, don't do unsolicited expediting. */ - if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE) + if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_SLOWGP) return false; /* If the local srcu_data structure has callbacks, not idle. */ sdp = raw_cpu_ptr(ssp->sda); @@ -1398,8 +1403,12 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, * read-side critical sections are delimited by srcu_read_lock() and * srcu_read_unlock(), and may be nested. * - * The callback will be invoked from process context, but must nevertheless - * be fast and must not block. + * The callback will be invoked from process context, but with bh + * disabled. The callback function must therefore be fast and must + * not block. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. */ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rcu_callback_t func) @@ -1465,8 +1474,9 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); * * Wait for the count to drain to zero of both indexes. To avoid the * possible starvation of synchronize_srcu(), it waits for the count of - * the index=((->srcu_idx & 1) ^ 1) to drain to zero at first, - * and then flip the srcu_idx and wait for the count of the other index. + * the index=!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]) to drain to zero + * at first, and then flip the ->srcu_ctrp and wait for the count of the + * other index. * * Can block; must be called from process context. * @@ -1675,7 +1685,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier); */ unsigned long srcu_batches_completed(struct srcu_struct *ssp) { - return READ_ONCE(ssp->srcu_idx); + return READ_ONCE(ssp->srcu_sup->srcu_gp_seq); } EXPORT_SYMBOL_GPL(srcu_batches_completed); @@ -1692,7 +1702,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) /* * Because readers might be delayed for an extended period after - * fetching ->srcu_idx for their index, at any point in time there + * fetching ->srcu_ctrp for their index, at any point in time there * might well be readers using both idx=0 and idx=1. We therefore * need to wait for readers to clear from both index values before * invoking a callback. @@ -1720,7 +1730,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) } if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) { - idx = 1 ^ (ssp->srcu_idx & 1); + idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]); if (!try_check_zero(ssp, idx, 1)) { mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* readers present, retry later. */ @@ -1738,7 +1748,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) * SRCU read-side critical sections are normally short, * so check at least twice in quick succession after a flip. */ - idx = 1 ^ (ssp->srcu_idx & 1); + idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]); if (!try_check_zero(ssp, idx, 2)) { mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* readers present, retry later. */ @@ -1849,7 +1859,9 @@ static void process_srcu(struct work_struct *work) ssp = sup->srcu_ssp; srcu_advance_state(ssp); + spin_lock_irq_rcu_node(ssp->srcu_sup); curdelay = srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); if (curdelay) { WRITE_ONCE(sup->reschedule_count, 0); } else { @@ -1896,7 +1908,7 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state); int ss_state_idx = ss_state; - idx = ssp->srcu_idx & 0x1; + idx = ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]; if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name)) ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1; pr_alert("%s%s Tree SRCU g%ld state %d (%s)", @@ -1914,8 +1926,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) struct srcu_data *sdp; sdp = per_cpu_ptr(ssp->sda, cpu); - u0 = data_race(atomic_long_read(&sdp->srcu_unlock_count[!idx])); - u1 = data_race(atomic_long_read(&sdp->srcu_unlock_count[idx])); + u0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_unlocks)); + u1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks)); /* * Make sure that a lock is always counted if the corresponding @@ -1923,8 +1935,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) */ smp_rmb(); - l0 = data_race(atomic_long_read(&sdp->srcu_lock_count[!idx])); - l1 = data_race(atomic_long_read(&sdp->srcu_lock_count[idx])); + l0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_locks)); + l1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks)); c0 = l0 - u0; c1 = l1 - u1; @@ -2001,6 +2013,7 @@ static int srcu_module_coming(struct module *mod) ssp->sda = alloc_percpu(struct srcu_data); if (WARN_ON_ONCE(!ssp->sda)) return -ENOMEM; + ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0]; } return 0; } diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 59314da5eb60..c0cc7ae41106 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1086,7 +1086,7 @@ static void rcu_tasks_postscan(struct list_head *hop) } if (!IS_ENABLED(CONFIG_TINY_RCU)) - del_timer_sync(&tasks_rcu_exit_srcu_stall_timer); + timer_delete_sync(&tasks_rcu_exit_srcu_stall_timer); } /* See if tasks are still holding out, complain if so. */ @@ -2256,7 +2256,7 @@ void __init tasks_cblist_init_generic(void) #endif } -void __init rcu_init_tasks_generic(void) +static int __init rcu_init_tasks_generic(void) { #ifdef CONFIG_TASKS_RCU rcu_spawn_tasks_kthread(); @@ -2272,7 +2272,10 @@ void __init rcu_init_tasks_generic(void) // Run the self-tests. rcu_tasks_initiate_self_tests(); + + return 0; } +core_initcall(rcu_init_tasks_generic); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ static inline void rcu_tasks_bootup_oddness(void) {} diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 4b3f31911465..c1ebfd51768b 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -85,15 +85,8 @@ void rcu_sched_clock_irq(int user) static inline bool rcu_reclaim_tiny(struct rcu_head *head) { rcu_callback_t f; - unsigned long offset = (unsigned long)head->func; rcu_lock_acquire(&rcu_callback_map); - if (__is_kvfree_rcu_offset(offset)) { - trace_rcu_invoke_kvfree_callback("", head, offset); - kvfree((void *)head - offset); - rcu_lock_release(&rcu_callback_map); - return true; - } trace_rcu_invoke_callback("", head); f = head->func; @@ -159,10 +152,6 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); -static void tiny_rcu_leak_callback(struct rcu_head *rhp) -{ -} - /* * Post an RCU callback to be invoked after the end of an RCU grace * period. But since we have but one CPU, that would be after any @@ -178,9 +167,6 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func); mem_dump_obj(head); } - - if (!__is_kvfree_rcu_offset((unsigned long)head->func)) - WRITE_ONCE(head->func, tiny_rcu_leak_callback); return; } @@ -246,15 +232,18 @@ bool poll_state_synchronize_rcu(unsigned long oldstate) } EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); -#ifdef CONFIG_KASAN_GENERIC -void kvfree_call_rcu(struct rcu_head *head, void *ptr) +#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) +unsigned long long rcutorture_gather_gp_seqs(void) { - if (head) - kasan_record_aux_stack(ptr); + return READ_ONCE(rcu_ctrlblk.gp_seq) & 0xffffULL; +} +EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); - __kvfree_call_rcu(head, ptr); +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len) +{ + snprintf(cp, len, "g%04llx", seqs & 0xffffULL); } -EXPORT_SYMBOL_GPL(kvfree_call_rcu); +EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); #endif void __init rcu_init(void) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 475f31deed14..659f83e71048 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -538,6 +538,26 @@ void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq) } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); +/* Gather grace-period sequence numbers for rcutorture diagnostics. */ +unsigned long long rcutorture_gather_gp_seqs(void) +{ + return ((READ_ONCE(rcu_state.gp_seq) & 0xffffULL) << 40) | + ((READ_ONCE(rcu_state.expedited_sequence) & 0xffffffULL) << 16) | + (READ_ONCE(rcu_state.gp_seq_polled) & 0xffffULL); +} +EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); + +/* Format grace-period sequence numbers for rcutorture diagnostics. */ +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len) +{ + unsigned int egp = (seqs >> 16) & 0xffffffULL; + unsigned int ggp = (seqs >> 40) & 0xffffULL; + unsigned int pgp = seqs & 0xffffULL; + + snprintf(cp, len, "g%04x:e%06x:p%04x", ggp, egp, pgp); +} +EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); + #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) /* * An empty function that will trigger a reschedule on @@ -1254,7 +1274,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) /* Handle the ends of any preceding grace periods first. */ if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || - unlikely(READ_ONCE(rdp->gpwrap))) { + unlikely(rdp->gpwrap)) { if (!offloaded) ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */ rdp->core_needs_qs = false; @@ -1268,7 +1288,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) /* Now handle the beginnings of any new-to-this-CPU grace periods. */ if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) || - unlikely(READ_ONCE(rdp->gpwrap))) { + unlikely(rdp->gpwrap)) { /* * If the current grace period is waiting for this CPU, * set up to detect a quiescent state, otherwise don't @@ -1283,7 +1303,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap) WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); - if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap)) + if (IS_ENABLED(CONFIG_PROVE_RCU) && rdp->gpwrap) WRITE_ONCE(rdp->last_sched_clock, jiffies); WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); @@ -1612,12 +1632,10 @@ static void rcu_sr_normal_complete(struct llist_node *node) { struct rcu_synchronize *rs = container_of( (struct rcu_head *) node, struct rcu_synchronize, head); - unsigned long oldstate = (unsigned long) rs->head.func; WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && - !poll_state_synchronize_rcu(oldstate), - "A full grace period is not passed yet: %lu", - rcu_seq_diff(get_state_synchronize_rcu(), oldstate)); + !poll_state_synchronize_rcu_full(&rs->oldstate), + "A full grace period is not passed yet!\n"); /* Finally. */ complete(&rs->completion); @@ -1801,10 +1819,14 @@ static noinline_for_stack bool rcu_gp_init(void) /* Advance to a new grace period and initialize state. */ record_gp_stall_check_time(); + /* + * A new wait segment must be started before gp_seq advanced, so + * that previous gp waiters won't observe the new gp_seq. + */ + start_new_poll = rcu_sr_normal_gp_init(); /* Record GP times before starting GP, hence rcu_seq_start(). */ rcu_seq_start(&rcu_state.gp_seq); ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); - start_new_poll = rcu_sr_normal_gp_init(); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); @@ -2931,13 +2953,8 @@ static int __init rcu_spawn_core_kthreads(void) static void rcutree_enqueue(struct rcu_data *rdp, struct rcu_head *head, rcu_callback_t func) { rcu_segcblist_enqueue(&rdp->cblist, head); - if (__is_kvfree_rcu_offset((unsigned long)func)) - trace_rcu_kvfree_callback(rcu_state.name, head, - (unsigned long)func, - rcu_segcblist_n_cbs(&rdp->cblist)); - else - trace_rcu_callback(rcu_state.name, head, - rcu_segcblist_n_cbs(&rdp->cblist)); + trace_rcu_callback(rcu_state.name, head, + rcu_segcblist_n_cbs(&rdp->cblist)); trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued")); } @@ -3107,7 +3124,7 @@ module_param(enable_rcu_lazy, bool, 0444); * critical sections have completed. * * Use this API instead of call_rcu() if you don't want the callback to be - * invoked after very long periods of time, which can happen on systems without + * delayed for very long periods of time, which can happen on systems without * memory pressure and on systems which are lightly loaded or mostly idle. * This function will cause callbacks to be invoked sooner than later at the * expense of extra power. Other than that, this function is identical to, and @@ -3138,6 +3155,12 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry); * might well execute concurrently with RCU read-side critical sections * that started after call_rcu() was invoked. * + * It is perfectly legal to repost an RCU callback, potentially with + * a different callback function, from within its callback function. + * The specified function will be invoked after another full grace period + * has elapsed. This use case is similar in form to the common practice + * of reposting a timer from within its own handler. + * * RCU read-side critical sections are delimited by rcu_read_lock() * and rcu_read_unlock(), and may be nested. In addition, but only in * v5.0 and later, regions of code across which interrupts, preemption, @@ -3166,6 +3189,13 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry); * * Implementation of these memory-ordering guarantees is described here: * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. + * + * Specific to call_rcu() (as opposed to the other call_rcu*() functions), + * in kernels built with CONFIG_RCU_LAZY=y, call_rcu() might delay for many + * seconds before starting the grace period needed by the corresponding + * callback. This delay can significantly improve energy-efficiency + * on low-utilization battery-powered devices. To avoid this delay, + * in latency-sensitive kernel code, use call_rcu_hurry(). */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { @@ -3214,7 +3244,7 @@ static void synchronize_rcu_normal(void) * snapshot before adding a request. */ if (IS_ENABLED(CONFIG_PROVE_RCU)) - rs.head.func = (void *) get_state_synchronize_rcu(); + get_state_synchronize_rcu_full(&rs.oldstate); rcu_sr_normal_add_req(&rs); @@ -3357,14 +3387,17 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); */ void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) { - struct rcu_node *rnp = rcu_get_root(); - /* * Any prior manipulation of RCU-protected data must happen * before the loads from ->gp_seq and ->expedited_sequence. */ smp_mb(); /* ^^^ */ - rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq); + + // Yes, rcu_state.gp_seq, not rnp_root->gp_seq, the latter's use + // in poll_state_synchronize_rcu_full() notwithstanding. Use of + // the latter here would result in too-short grace periods due to + // interactions with newly onlined CPUs. + rgosp->rgos_norm = rcu_seq_snap(&rcu_state.gp_seq); rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence); } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 77efed89c79e..8d4895c854c5 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -230,17 +230,19 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake) * specified leaf rcu_node structure, which is acquired by the caller. */ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, unsigned long flags, - unsigned long mask, bool wake) + unsigned long mask_in, bool wake) __releases(rnp->lock) { int cpu; + unsigned long mask; struct rcu_data *rdp; raw_lockdep_assert_held_rcu_node(rnp); - if (!(rnp->expmask & mask)) { + if (!(rnp->expmask & mask_in)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } + mask = mask_in & rnp->expmask; WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask); for_each_leaf_node_cpu_mask(rnp, cpu, mask) { rdp = per_cpu_ptr(&rcu_data, cpu); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 2605dd234a13..fa269d34167a 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -206,7 +206,7 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp, if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - del_timer(&rdp_gp->nocb_timer); + timer_delete(&rdp_gp->nocb_timer); } if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { @@ -822,7 +822,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - del_timer(&my_rdp->nocb_timer); + timer_delete(&my_rdp->nocb_timer); } WRITE_ONCE(my_rdp->nocb_gp_sleep, true); raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); @@ -1557,8 +1557,11 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp) /* Dump out nocb kthread state for the specified rcu_data structure. */ static void show_rcu_nocb_state(struct rcu_data *rdp) { - char bufw[20]; - char bufr[20]; + char bufd[22]; + char bufw[45]; + char bufr[45]; + char bufn[22]; + char bufb[22]; struct rcu_data *nocb_next_rdp; struct rcu_segcblist *rsclp = &rdp->cblist; bool waslocked; @@ -1572,9 +1575,13 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) typeof(*rdp), nocb_entry_rdp); - sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]); - sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); - pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", + sprintf(bufd, "%ld", rsclp->seglen[RCU_DONE_TAIL]); + sprintf(bufw, "%ld(%ld)", rsclp->seglen[RCU_WAIT_TAIL], rsclp->gp_seq[RCU_WAIT_TAIL]); + sprintf(bufr, "%ld(%ld)", rsclp->seglen[RCU_NEXT_READY_TAIL], + rsclp->gp_seq[RCU_NEXT_READY_TAIL]); + sprintf(bufn, "%ld", rsclp->seglen[RCU_NEXT_TAIL]); + sprintf(bufb, "%ld", rcu_cblist_n_cbs(&rdp->nocb_bypass)); + pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%s%c%s%c%s%c%s%c%s q%ld %c CPU %d%s\n", rdp->cpu, rdp->nocb_gp_rdp->cpu, nocb_next_rdp ? nocb_next_rdp->cpu : -1, "kK"[!!rdp->nocb_cb_kthread], @@ -1586,12 +1593,15 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) jiffies - rdp->nocb_nobypass_last, rdp->nocb_nobypass_count, ".D"[rcu_segcblist_ready_cbs(rsclp)], + rcu_segcblist_segempty(rsclp, RCU_DONE_TAIL) ? "" : bufd, ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)], rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw, ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)], rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr, ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)], + rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL) ? "" : bufn, ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], + !rcu_cblist_n_cbs(&rdp->nocb_bypass) ? "" : bufb, rcu_segcblist_n_cbs(&rdp->cblist), rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.', rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1, diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3600152b858e..3c0bbbbb686f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -833,8 +833,17 @@ void rcu_read_unlock_strict(void) { struct rcu_data *rdp; - if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) + if (irqs_disabled() || in_atomic_preempt_off() || !rcu_state.gp_kthread) return; + + /* + * rcu_report_qs_rdp() can only be invoked with a stable rdp and + * from the local CPU. + * + * The in_atomic_preempt_off() check ensures that we come here holding + * the last preempt_count (which will get dropped once we return to + * __rcu_read_unlock(). + */ rdp = this_cpu_ptr(&rcu_data); rdp->cpu_no_qs.b.norm = false; rcu_report_qs_rdp(rdp); @@ -975,13 +984,16 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) */ static void rcu_flavor_sched_clock_irq(int user) { - if (user || rcu_is_cpu_rrupt_from_idle()) { + if (user || rcu_is_cpu_rrupt_from_idle() || + (IS_ENABLED(CONFIG_PREEMPT_COUNT) && + (preempt_count() == HARDIRQ_OFFSET))) { /* * Get here if this CPU took its interrupt from user - * mode or from the idle loop, and if this is not a - * nested interrupt. In this case, the CPU is in - * a quiescent state, so note it. + * mode, from the idle loop without this being a nested + * interrupt, or while not holding the task preempt count + * (with PREEMPT_COUNT=y). In this case, the CPU is in a + * quiescent state, so note it. * * No memory barrier is required here because rcu_qs() * references only CPU-local variables that other CPUs |