summaryrefslogtreecommitdiff
path: root/kernel/rcu
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/rcu')
-rw-r--r--kernel/rcu/Kconfig80
-rw-r--r--kernel/rcu/Kconfig.debug44
-rw-r--r--kernel/rcu/rcu.h51
-rw-r--r--kernel/rcu/rcuscale.c109
-rw-r--r--kernel/rcu/rcutorture.c854
-rw-r--r--kernel/rcu/refscale.c471
-rw-r--r--kernel/rcu/srcutiny.c54
-rw-r--r--kernel/rcu/srcutree.c564
-rw-r--r--kernel/rcu/tasks.h731
-rw-r--r--kernel/rcu/tiny.c37
-rw-r--r--kernel/rcu/tree.c288
-rw-r--r--kernel/rcu/tree.h20
-rw-r--r--kernel/rcu/tree_exp.h77
-rw-r--r--kernel/rcu/tree_nocb.h243
-rw-r--r--kernel/rcu/tree_plugin.h161
-rw-r--r--kernel/rcu/tree_stall.h76
-rw-r--r--kernel/rcu/update.c32
17 files changed, 2299 insertions, 1593 deletions
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index b9b6bc55185d..762299291e09 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -18,7 +18,7 @@ config TREE_RCU
config PREEMPT_RCU
bool
- default y if PREEMPTION
+ default y if (PREEMPT || PREEMPT_RT || PREEMPT_DYNAMIC)
select TREE_RCU
help
This option selects the RCU implementation that is
@@ -65,11 +65,24 @@ config TREE_SRCU
help
This option selects the full-fledged version of SRCU.
+config FORCE_NEED_SRCU_NMI_SAFE
+ bool "Force selection of NEED_SRCU_NMI_SAFE"
+ depends on !TINY_SRCU
+ depends on RCU_EXPERT
+ depends on ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
+ select NEED_SRCU_NMI_SAFE
+ default n
+ help
+ This option forces selection of the NEED_SRCU_NMI_SAFE
+ Kconfig option, allowing testing of srcu_read_lock_nmisafe()
+ and srcu_read_unlock_nmisafe() on architectures (like x86)
+ that select the ARCH_HAS_NMI_SAFE_THIS_CPU_OPS Kconfig option.
+
config NEED_SRCU_NMI_SAFE
def_bool HAVE_NMI && !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !TINY_SRCU
config TASKS_RCU_GENERIC
- def_bool TASKS_RCU || TASKS_RUDE_RCU || TASKS_TRACE_RCU
+ def_bool TASKS_RCU || TASKS_RUDE_RCU
help
This option enables generic infrastructure code supporting
task-based RCU implementations. Not for manual selection.
@@ -91,7 +104,7 @@ config NEED_TASKS_RCU
config TASKS_RCU
bool
- default NEED_TASKS_RCU && (PREEMPTION || PREEMPT_AUTO)
+ default NEED_TASKS_RCU && PREEMPTION
select IRQ_WORK
config FORCE_TASKS_RUDE_RCU
@@ -129,6 +142,29 @@ config TASKS_TRACE_RCU
default n
select IRQ_WORK
+config TASKS_TRACE_RCU_NO_MB
+ bool "Override RCU Tasks Trace inclusion of read-side memory barriers"
+ depends on RCU_EXPERT && TASKS_TRACE_RCU
+ default ARCH_WANTS_NO_INSTR
+ help
+ This option prevents the use of read-side memory barriers in
+ rcu_read_lock_tasks_trace() and rcu_read_unlock_tasks_trace()
+ even in kernels built with CONFIG_ARCH_WANTS_NO_INSTR=n, that is,
+ in kernels that do not have noinstr set up in entry/exit code.
+ By setting this option, you are promising to carefully review
+ use of ftrace, BPF, and friends to ensure that no tracing
+ operation is attached to a function that runs in that portion
+ of the entry/exit code that RCU does not watch, that is,
+ where rcu_is_watching() returns false. Alternatively, you
+ might choose to never remove traces except by rebooting.
+
+ Those wishing to disable read-side memory barriers for an entire
+ architecture can select this Kconfig option, hence the polarity.
+
+ Say Y here if you need speed and will review use of tracing.
+ Say N here for certain esoteric testing of RCU itself.
+ Take the default if you are unsure.
+
config RCU_STALL_COMMON
def_bool TREE_RCU
help
@@ -300,44 +336,32 @@ config RCU_NOCB_CPU_CB_BOOST
Say Y here if you want to set RT priority for offloading kthreads.
Say N here if you are building a !PREEMPT_RT kernel and are unsure.
-config TASKS_TRACE_RCU_READ_MB
- bool "Tasks Trace RCU readers use memory barriers in user and idle"
- depends on RCU_EXPERT && TASKS_TRACE_RCU
- default PREEMPT_RT || NR_CPUS < 8
- help
- Use this option to further reduce the number of IPIs sent
- to CPUs executing in userspace or idle during tasks trace
- RCU grace periods. Given that a reasonable setting of
- the rcupdate.rcu_task_ipi_delay kernel boot parameter
- eliminates such IPIs for many workloads, proper setting
- of this Kconfig option is important mostly for aggressive
- real-time installations and for battery-powered devices,
- hence the default chosen above.
-
- Say Y here if you hate IPIs.
- Say N here if you hate read-side memory barriers.
- Take the default if you are unsure.
-
config RCU_LAZY
bool "RCU callback lazy invocation functionality"
depends on RCU_NOCB_CPU
default n
help
- To save power, batch RCU callbacks and flush after delay, memory
- pressure, or callback list growing too big.
+ To save power, batch RCU callbacks and delay starting the
+ corresponding grace period for multiple seconds. The grace
+ period will be started after this delay, in case of memory
+ pressure, or if the corresponding CPU's callback list grows
+ too large.
- Requires rcu_nocbs=all to be set.
+ These delays happen only on rcu_nocbs CPUs, that is, CPUs
+ whose callbacks have been offloaded.
- Use rcutree.enable_rcu_lazy=0 to turn it off at boot time.
+ Use the rcutree.enable_rcu_lazy=0 kernel-boot parameter to
+ globally disable these delays.
config RCU_LAZY_DEFAULT_OFF
bool "Turn RCU lazy invocation off by default"
depends on RCU_LAZY
default n
help
- Allows building the kernel with CONFIG_RCU_LAZY=y yet keep it default
- off. Boot time param rcutree.enable_rcu_lazy=1 can be used to switch
- it back on.
+ Build the kernel with CONFIG_RCU_LAZY=y, but cause the kernel
+ to boot with these energy-efficiency delays disabled. Use the
+ rcutree.enable_rcu_lazy=0 kernel-boot parameter to override
+ the this option at boot time, thus re-enabling these delays.
config RCU_DOUBLE_CHECK_CB_TIME
bool "RCU callback-batch backup time check"
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 6af90510a1ca..e078e988773d 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -54,7 +54,7 @@ config RCU_TORTURE_TEST
Say N if you are unsure.
config RCU_TORTURE_TEST_CHK_RDR_STATE
- tristate "Check rcutorture reader state"
+ bool "Check rcutorture reader state"
depends on RCU_TORTURE_TEST
default n
help
@@ -70,7 +70,7 @@ config RCU_TORTURE_TEST_CHK_RDR_STATE
Say N if you are unsure.
config RCU_TORTURE_TEST_LOG_CPU
- tristate "Log CPU for rcutorture failures"
+ bool "Log CPU for rcutorture failures"
depends on RCU_TORTURE_TEST
default n
help
@@ -84,6 +84,20 @@ config RCU_TORTURE_TEST_LOG_CPU
Say Y here if you want CPU IDs logged.
Say N if you are unsure.
+config RCU_TORTURE_TEST_LOG_GP
+ bool "Log grace-period numbers for rcutorture failures"
+ depends on RCU_TORTURE_TEST
+ default n
+ help
+ This option causes rcutorture to decorate each entry of its
+ log of failure/close-call rcutorture reader segments with the
+ corresponding grace-period sequence numbers. This information
+ can be useful, but it does incur additional overhead, overhead
+ that can make both failures and close calls less probable.
+
+ Say Y here if you want grace-period sequence numbers logged.
+ Say N if you are unsure.
+
config RCU_REF_SCALE_TEST
tristate "Scalability tests for read-side synchronization (RCU and others)"
depends on DEBUG_KERNEL
@@ -199,4 +213,30 @@ config RCU_STRICT_GRACE_PERIOD
when looking for certain types of RCU usage bugs, for example,
too-short RCU read-side critical sections.
+
+config RCU_DYNTICKS_TORTURE
+ bool "Minimize RCU dynticks counter size"
+ depends on RCU_EXPERT && !COMPILE_TEST
+ default n
+ help
+ This option sets the width of the dynticks counter to its
+ minimum usable value. This minimum width greatly increases
+ the probability of flushing out bugs involving counter wrap,
+ but it also increases the probability of extending grace period
+ durations. This Kconfig option should therefore be avoided in
+ production due to the consequent increased probability of OOMs.
+
+ This has no value for production and is only for testing.
+
+config TRIVIAL_PREEMPT_RCU
+ bool "Textbook trivial preemptible RCU in rcutorture"
+ depends on RCU_EXPERT && RCU_TORTURE_TEST
+ default n
+ help
+ This option enables a textbook preemptible RCU that is
+ implemented in rcutorture. Its sole purpose is to validate
+ code used in books, papers, and presentations.
+
+ This has no value for production and is only for testing.
+
endmenu # "RCU Debugging"
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index feb3ac1dc5d5..fa6d30ce73d1 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -57,6 +57,9 @@
/* Low-order bit definition for polled grace-period APIs. */
#define RCU_GET_STATE_COMPLETED 0x1
+/* A complete grace period count */
+#define RCU_SEQ_GP (RCU_SEQ_STATE_MASK + 1)
+
extern int sysctl_sched_rt_runtime;
/*
@@ -157,12 +160,21 @@ static inline bool rcu_seq_done(unsigned long *sp, unsigned long s)
* Given a snapshot from rcu_seq_snap(), determine whether or not a
* full update-side operation has occurred, but do not allow the
* (ULONG_MAX / 2) safety-factor/guard-band.
+ *
+ * The token returned by get_state_synchronize_rcu_full() is based on
+ * rcu_state.gp_seq but it is tested in poll_state_synchronize_rcu_full()
+ * against the root rnp->gp_seq. Since rcu_seq_start() is first called
+ * on rcu_state.gp_seq and only later reflected on the root rnp->gp_seq,
+ * it is possible that rcu_seq_snap(rcu_state.gp_seq) returns 2 full grace
+ * periods ahead of the root rnp->gp_seq. To prevent false-positives with the
+ * full polling API that a wrap around instantly completed the GP, when nothing
+ * like that happened, adjust for the 2 GPs in the ULONG_CMP_LT().
*/
static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s)
{
unsigned long cur_s = READ_ONCE(*sp);
- return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_STATE_MASK + 1));
+ return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_GP));
}
/*
@@ -490,6 +502,15 @@ do { \
___locked; \
})
+#define raw_spin_trylock_irqsave_rcu_node(p, flags) \
+({ \
+ bool ___locked = raw_spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
+ \
+ if (___locked) \
+ smp_mb__after_unlock_lock(); \
+ ___locked; \
+})
+
#define raw_lockdep_assert_held_rcu_node(p) \
lockdep_assert_held(&ACCESS_PRIVATE(p, lock))
@@ -532,10 +553,6 @@ struct task_struct *get_rcu_tasks_rude_gp_kthread(void);
void rcu_tasks_rude_get_gp_data(int *flags, unsigned long *gp_seq);
#endif // # ifdef CONFIG_TASKS_RUDE_RCU
-#ifdef CONFIG_TASKS_TRACE_RCU
-void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq);
-#endif
-
#ifdef CONFIG_TASKS_RCU_GENERIC
void tasks_cblist_init_generic(void);
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
@@ -572,6 +589,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
unsigned long c_old,
unsigned long c);
void rcu_gp_set_torture_wait(int duration);
+void rcu_set_gpwrap_lag(unsigned long lag);
+int rcu_get_gpwrap_count(int cpu);
#else
static inline void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq)
{
@@ -589,7 +608,11 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
do { } while (0)
#endif
static inline void rcu_gp_set_torture_wait(int duration) { }
+static inline void rcu_set_gpwrap_lag(unsigned long lag) { }
+static inline int rcu_get_gpwrap_count(int cpu) { return 0; }
#endif
+unsigned long long rcutorture_gather_gp_seqs(void);
+void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len);
#ifdef CONFIG_TINY_SRCU
@@ -611,8 +634,6 @@ void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags,
static inline bool rcu_watching_zero_in_eqs(int cpu, int *vp) { return false; }
static inline unsigned long rcu_get_gp_seq(void) { return 0; }
static inline unsigned long rcu_exp_batches_completed(void) { return 0; }
-static inline unsigned long
-srcu_batches_completed(struct srcu_struct *sp) { return 0; }
static inline void rcu_force_quiescent_state(void) { }
static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { return true; }
static inline void show_rcu_gp_kthreads(void) { }
@@ -624,7 +645,6 @@ static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { }
bool rcu_watching_zero_in_eqs(int cpu, int *vp);
unsigned long rcu_get_gp_seq(void);
unsigned long rcu_exp_batches_completed(void);
-unsigned long srcu_batches_completed(struct srcu_struct *sp);
bool rcu_check_boost_fail(unsigned long gp_state, int *cpup);
void show_rcu_gp_kthreads(void);
int rcu_get_gp_kthreads_prio(void);
@@ -636,6 +656,12 @@ void rcu_gp_slow_register(atomic_t *rgssp);
void rcu_gp_slow_unregister(atomic_t *rgssp);
#endif /* #else #ifdef CONFIG_TINY_RCU */
+#ifdef CONFIG_TINY_SRCU
+static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) { return 0; }
+#else // #ifdef CONFIG_TINY_SRCU
+unsigned long srcu_batches_completed(struct srcu_struct *sp);
+#endif // #else // #ifdef CONFIG_TINY_SRCU
+
#ifdef CONFIG_RCU_NOCB_CPU
void rcu_bind_current_to_nocb(void);
#else
@@ -652,11 +678,6 @@ void show_rcu_tasks_rude_gp_kthread(void);
#else
static inline void show_rcu_tasks_rude_gp_kthread(void) {}
#endif
-#if !defined(CONFIG_TINY_RCU) && defined(CONFIG_TASKS_TRACE_RCU)
-void show_rcu_tasks_trace_gp_kthread(void);
-#else
-static inline void show_rcu_tasks_trace_gp_kthread(void) {}
-#endif
#ifdef CONFIG_TINY_RCU
static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; }
@@ -670,4 +691,8 @@ int rcu_stall_notifier_call_chain(unsigned long val, void *v);
static inline int rcu_stall_notifier_call_chain(unsigned long val, void *v) { return NOTIFY_DONE; }
#endif // #else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
+#ifdef CONFIG_TRIVIAL_PREEMPT_RCU
+void synchronize_rcu_trivial_preempt(void);
+#endif // #ifdef CONFIG_TRIVIAL_PREEMPT_RCU
+
#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 0f3059b1b80d..ac0b1c6b7dae 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -79,12 +79,6 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
* test-end checks, and the pair of calls through pointers.
*/
-#ifdef MODULE
-# define RCUSCALE_SHUTDOWN 0
-#else
-# define RCUSCALE_SHUTDOWN 1
-#endif
-
torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives");
torture_param(int, gp_async_max, 1000, "Max # outstanding waits per writer");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
@@ -92,8 +86,8 @@ torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
torture_param(int, minruntime, 0, "Minimum run time (s)");
torture_param(int, nreaders, -1, "Number of RCU reader threads");
torture_param(int, nwriters, -1, "Number of RCU updater threads");
-torture_param(bool, shutdown, RCUSCALE_SHUTDOWN,
- "Shutdown at end of scalability tests.");
+torture_param(int, shutdown_secs, !IS_MODULE(CONFIG_RCU_SCALE_TEST) * 300,
+ "Shutdown at end of scalability tests or at specified timeout (s).");
torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
torture_param(int, writer_holdoff_jiffies, 0, "Holdoff (jiffies) between GPs, zero to disable");
@@ -123,7 +117,6 @@ static int nrealreaders;
static int nrealwriters;
static struct task_struct **writer_tasks;
static struct task_struct **reader_tasks;
-static struct task_struct *shutdown_task;
static u64 **writer_durations;
static bool *writer_done;
@@ -132,7 +125,6 @@ static int *writer_n_durations;
static atomic_t n_rcu_scale_reader_started;
static atomic_t n_rcu_scale_writer_started;
static atomic_t n_rcu_scale_writer_finished;
-static wait_queue_head_t shutdown_wq;
static u64 t_rcu_scale_writer_started;
static u64 t_rcu_scale_writer_finished;
static unsigned long b_rcu_gp_test_started;
@@ -400,11 +392,6 @@ static void tasks_trace_scale_read_unlock(int idx)
rcu_read_unlock_trace();
}
-static void rcu_tasks_trace_scale_stats(void)
-{
- rcu_tasks_trace_torture_stats_print(scale_type, SCALE_FLAG);
-}
-
static struct rcu_scale_ops tasks_tracing_ops = {
.ptype = RCU_TASKS_FLAVOR,
.init = rcu_sync_scale_init,
@@ -416,8 +403,6 @@ static struct rcu_scale_ops tasks_tracing_ops = {
.gp_barrier = rcu_barrier_tasks_trace,
.sync = synchronize_rcu_tasks_trace,
.exp_sync = synchronize_rcu_tasks_trace,
- .rso_gp_kthread = get_rcu_tasks_trace_gp_kthread,
- .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_trace_scale_stats,
.name = "tasks-tracing"
};
@@ -526,6 +511,8 @@ static void rcu_scale_async_cb(struct rcu_head *rhp)
rcu_scale_free(wmbp);
}
+static void rcu_scale_cleanup(void);
+
/*
* RCU scale writer kthread. Repeatedly does a grace period.
*/
@@ -629,9 +616,11 @@ rcu_scale_writer(void *arg)
b_rcu_gp_test_finished =
cur_ops->get_gp_seq();
}
- if (shutdown) {
+ if (shutdown_secs) {
+ writer_tasks[me] = NULL;
smp_mb(); /* Assign before wake. */
- wake_up(&shutdown_wq);
+ rcu_scale_cleanup();
+ kernel_power_off();
}
}
}
@@ -675,8 +664,8 @@ static void
rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag)
{
pr_alert("%s" SCALE_FLAG
- "--- %s: gp_async=%d gp_async_max=%d gp_exp=%d holdoff=%d minruntime=%d nreaders=%d nwriters=%d writer_holdoff=%d writer_holdoff_jiffies=%d verbose=%d shutdown=%d\n",
- scale_type, tag, gp_async, gp_async_max, gp_exp, holdoff, minruntime, nrealreaders, nrealwriters, writer_holdoff, writer_holdoff_jiffies, verbose, shutdown);
+ "--- %s: gp_async=%d gp_async_max=%d gp_exp=%d holdoff=%d minruntime=%d nreaders=%d nwriters=%d writer_holdoff=%d writer_holdoff_jiffies=%d verbose=%d shutdown_secs=%d\n",
+ scale_type, tag, gp_async, gp_async_max, gp_exp, holdoff, minruntime, nrealreaders, nrealwriters, writer_holdoff, writer_holdoff_jiffies, verbose, shutdown_secs);
}
/*
@@ -729,6 +718,8 @@ static void kfree_call_rcu(struct rcu_head *rh)
kfree(obj);
}
+static void kfree_scale_cleanup(void);
+
static int
kfree_scale_thread(void *arg)
{
@@ -762,7 +753,7 @@ kfree_scale_thread(void *arg)
}
for (i = 0; i < kfree_alloc_num; i++) {
- alloc_ptr = kmalloc(kfree_mult * sizeof(struct kfree_obj), GFP_KERNEL);
+ alloc_ptr = kzalloc_objs(struct kfree_obj, kfree_mult);
if (!alloc_ptr)
return -ENOMEM;
@@ -796,11 +787,13 @@ kfree_scale_thread(void *arg)
pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n",
(unsigned long long)(end_time - start_time), kfree_loops,
rcuscale_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started),
- (mem_begin - mem_during) >> (20 - PAGE_SHIFT));
+ PAGES_TO_MB(mem_begin - mem_during));
- if (shutdown) {
+ if (shutdown_secs) {
+ kfree_reader_tasks[me] = NULL;
smp_mb(); /* Assign before wake. */
- wake_up(&shutdown_wq);
+ kfree_scale_cleanup();
+ kernel_power_off();
}
}
@@ -827,22 +820,6 @@ kfree_scale_cleanup(void)
torture_cleanup_end();
}
-/*
- * shutdown kthread. Just waits to be awakened, then shuts down system.
- */
-static int
-kfree_scale_shutdown(void *arg)
-{
- wait_event_idle(shutdown_wq,
- atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads);
-
- smp_mb(); /* Wake before output. */
-
- kfree_scale_cleanup();
- kernel_power_off();
- return -EINVAL;
-}
-
// Used if doing RCU-kfree'ing via call_rcu().
static unsigned long jiffies_at_lazy_cb;
static struct rcu_head lazy_test1_rh;
@@ -902,21 +879,18 @@ kfree_scale_init(void)
kfree_nrealthreads = compute_real(kfree_nthreads);
/* Start up the kthreads. */
- if (shutdown) {
- init_waitqueue_head(&shutdown_wq);
- firsterr = torture_create_kthread(kfree_scale_shutdown, NULL,
- shutdown_task);
+ if (shutdown_secs) {
+ firsterr = torture_shutdown_init(shutdown_secs, kfree_scale_cleanup);
if (torture_init_error(firsterr))
goto unwind;
- schedule_timeout_uninterruptible(1);
}
pr_alert("kfree object size=%zu, kfree_by_call_rcu=%d\n",
kfree_mult * sizeof(struct kfree_obj),
kfree_by_call_rcu);
- kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]),
- GFP_KERNEL);
+ kfree_reader_tasks = kzalloc_objs(kfree_reader_tasks[0],
+ kfree_nrealthreads);
if (kfree_reader_tasks == NULL) {
firsterr = -ENOMEM;
goto unwind;
@@ -1065,20 +1039,6 @@ rcu_scale_cleanup(void)
torture_cleanup_end();
}
-/*
- * RCU scalability shutdown kthread. Just waits to be awakened, then shuts
- * down system.
- */
-static int
-rcu_scale_shutdown(void *arg)
-{
- wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters);
- smp_mb(); /* Wake before output. */
- rcu_scale_cleanup();
- kernel_power_off();
- return -EINVAL;
-}
-
static int __init
rcu_scale_init(void)
{
@@ -1128,16 +1088,12 @@ rcu_scale_init(void)
/* Start up the kthreads. */
- if (shutdown) {
- init_waitqueue_head(&shutdown_wq);
- firsterr = torture_create_kthread(rcu_scale_shutdown, NULL,
- shutdown_task);
+ if (shutdown_secs) {
+ firsterr = torture_shutdown_init(shutdown_secs, rcu_scale_cleanup);
if (torture_init_error(firsterr))
goto unwind;
- schedule_timeout_uninterruptible(1);
}
- reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]),
- GFP_KERNEL);
+ reader_tasks = kzalloc_objs(reader_tasks[0], nrealreaders);
if (reader_tasks == NULL) {
SCALEOUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
@@ -1151,10 +1107,10 @@ rcu_scale_init(void)
}
while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders)
schedule_timeout_uninterruptible(1);
- writer_tasks = kcalloc(nrealwriters, sizeof(writer_tasks[0]), GFP_KERNEL);
+ writer_tasks = kzalloc_objs(writer_tasks[0], nrealwriters);
writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), GFP_KERNEL);
- writer_n_durations = kcalloc(nrealwriters, sizeof(*writer_n_durations), GFP_KERNEL);
- writer_done = kcalloc(nrealwriters, sizeof(writer_done[0]), GFP_KERNEL);
+ writer_n_durations = kzalloc_objs(*writer_n_durations, nrealwriters);
+ writer_done = kzalloc_objs(writer_done[0], nrealwriters);
if (gp_async) {
if (gp_async_max <= 0) {
pr_warn("%s: gp_async_max = %d must be greater than zero.\n",
@@ -1163,7 +1119,8 @@ rcu_scale_init(void)
firsterr = -EINVAL;
goto unwind;
}
- writer_freelists = kcalloc(nrealwriters, sizeof(writer_freelists[0]), GFP_KERNEL);
+ writer_freelists = kzalloc_objs(writer_freelists[0],
+ nrealwriters);
}
if (!writer_tasks || !writer_durations || !writer_n_durations || !writer_done ||
(gp_async && !writer_freelists)) {
@@ -1184,8 +1141,8 @@ rcu_scale_init(void)
init_llist_head(&wflp->ws_lhg);
init_llist_head(&wflp->ws_lhp);
- wflp->ws_mblocks = kcalloc(gp_async_max, sizeof(wflp->ws_mblocks[0]),
- GFP_KERNEL);
+ wflp->ws_mblocks = kzalloc_objs(wflp->ws_mblocks[0],
+ gp_async_max);
if (!wflp->ws_mblocks) {
firsterr = -ENOMEM;
goto unwind;
@@ -1208,7 +1165,7 @@ rcu_scale_init(void)
unwind:
torture_init_end();
rcu_scale_cleanup();
- if (shutdown) {
+ if (shutdown_secs) {
WARN_ON(!IS_MODULE(CONFIG_RCU_SCALE_TEST));
kernel_power_off();
}
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index d26fb1d33ed9..5f2848b828dc 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -55,22 +55,24 @@ MODULE_DESCRIPTION("Read-Copy Update module-based torture test facility");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
-/* Bits for ->extendables field, extendables param, and related definitions. */
-#define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */
-#define RCUTORTURE_RDR_MASK_1 (0xff << RCUTORTURE_RDR_SHIFT_1)
-#define RCUTORTURE_RDR_SHIFT_2 16 /* Put SRCU index in upper bits. */
-#define RCUTORTURE_RDR_MASK_2 (0xff << RCUTORTURE_RDR_SHIFT_2)
-#define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */
-#define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */
-#define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */
-#define RCUTORTURE_RDR_RBH 0x08 /* ... rcu_read_lock_bh(). */
-#define RCUTORTURE_RDR_SCHED 0x10 /* ... rcu_read_lock_sched(). */
-#define RCUTORTURE_RDR_RCU_1 0x20 /* ... entering another RCU reader. */
-#define RCUTORTURE_RDR_RCU_2 0x40 /* ... entering another RCU reader. */
-#define RCUTORTURE_RDR_NBITS 7 /* Number of bits defined above. */
-#define RCUTORTURE_MAX_EXTEND \
+// Bits for ->extendables field, extendables param, and related definitions.
+#define RCUTORTURE_RDR_SHIFT_1 8 // Put SRCU index in upper bits.
+#define RCUTORTURE_RDR_MASK_1 (0xff << RCUTORTURE_RDR_SHIFT_1)
+#define RCUTORTURE_RDR_SHIFT_2 16 // Put SRCU index in upper bits.
+#define RCUTORTURE_RDR_MASK_2 (0xff << RCUTORTURE_RDR_SHIFT_2)
+#define RCUTORTURE_RDR_BH 0x01 // Extend readers by disabling bh.
+#define RCUTORTURE_RDR_IRQ 0x02 // ... disabling interrupts.
+#define RCUTORTURE_RDR_PREEMPT 0x04 // ... disabling preemption.
+#define RCUTORTURE_RDR_RBH 0x08 // ... rcu_read_lock_bh().
+#define RCUTORTURE_RDR_SCHED 0x10 // ... rcu_read_lock_sched().
+#define RCUTORTURE_RDR_RCU_1 0x20 // ... entering another RCU reader.
+#define RCUTORTURE_RDR_RCU_2 0x40 // ... entering another RCU reader.
+#define RCUTORTURE_RDR_UPDOWN 0x80 // ... up-read from task, down-read from timer.
+ // Note: Manual start, automatic end.
+#define RCUTORTURE_RDR_NBITS 8 // Number of bits defined above.
+#define RCUTORTURE_MAX_EXTEND \
(RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \
- RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED)
+ RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED) // Intentionally omit RCUTORTURE_RDR_UPDOWN.
#define RCUTORTURE_RDR_ALLBITS \
(RCUTORTURE_MAX_EXTEND | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2 | \
RCUTORTURE_RDR_MASK_1 | RCUTORTURE_RDR_MASK_2)
@@ -110,11 +112,16 @@ torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers");
torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing");
+torture_param(int, n_up_down, 32, "# of concurrent up/down hrtimer-based RCU readers");
torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads");
torture_param(int, nreaders, -1, "Number of RCU reader threads");
torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing");
torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable");
+torture_param(bool, gpwrap_lag, true, "Enable grace-period wrap lag testing");
+torture_param(int, gpwrap_lag_gps, 8, "Value to set for set_gpwrap_lag during an active testing period.");
+torture_param(int, gpwrap_lag_cycle_mins, 30, "Total cycle duration for gpwrap lag testing (in minutes)");
+torture_param(int, gpwrap_lag_active_mins, 5, "Duration for which gpwrap lag is active within each cycle (in minutes)");
torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable");
torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)");
torture_param(int, preempt_duration, 0, "Preemption duration (ms), zero to disable");
@@ -135,6 +142,7 @@ torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s
torture_param(int, stutter, 5, "Number of seconds to run/halt test");
torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds.");
+torture_param(int, test_boost_holdoff, 0, "Holdoff time from rcutorture start, seconds.");
torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds.");
torture_param(int, test_nmis, 0, "End-test NMI tests, 0 to disable.");
torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs");
@@ -147,9 +155,11 @@ MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)");
static int nrealnocbers;
static int nrealreaders;
+static int nrealfakewriters;
static struct task_struct *writer_task;
static struct task_struct **fakewriter_tasks;
static struct task_struct **reader_tasks;
+static struct task_struct *updown_task;
static struct task_struct **nocb_tasks;
static struct task_struct *stats_task;
static struct task_struct *fqs_task;
@@ -272,6 +282,9 @@ struct rt_read_seg {
bool rt_preempted;
int rt_cpu;
int rt_end_cpu;
+ unsigned long long rt_gp_seq;
+ unsigned long long rt_gp_seq_end;
+ u64 rt_ts;
};
static int err_segs_recorded;
static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS];
@@ -369,11 +382,14 @@ struct rcu_torture_ops {
void (*readunlock)(int idx);
int (*readlock_held)(void); // lockdep.
int (*readlock_nesting)(void); // actual nesting, if available, -1 if not.
+ int (*down_read)(void);
+ void (*up_read)(int idx);
unsigned long (*get_gp_seq)(void);
unsigned long (*gp_diff)(unsigned long new, unsigned long old);
void (*deferred_free)(struct rcu_torture *p);
void (*sync)(void);
void (*exp_sync)(void);
+ void (*exp_current)(void);
unsigned long (*get_gp_state_exp)(void);
unsigned long (*start_gp_poll_exp)(void);
void (*start_gp_poll_exp_full)(struct rcu_gp_oldstate *rgosp);
@@ -406,6 +422,10 @@ struct rcu_torture_ops {
void (*gp_slow_register)(atomic_t *rgssp);
void (*gp_slow_unregister)(atomic_t *rgssp);
bool (*reader_blocked)(void);
+ unsigned long long (*gather_gp_seqs)(void);
+ void (*format_gp_seqs)(unsigned long long seqs, char *cp, size_t len);
+ void (*set_gpwrap_lag)(unsigned long lag);
+ int (*get_gpwrap_count)(int cpu);
long cbflood_max;
int irq_capable;
int can_boost;
@@ -414,6 +434,7 @@ struct rcu_torture_ops {
int no_pi_lock;
int debug_objects;
int start_poll_irqsoff;
+ int have_up_down;
const char *name;
};
@@ -451,7 +472,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) {
started = cur_ops->get_gp_seq();
ts = rcu_trace_clock_local();
- if (preempt_count() & (SOFTIRQ_MASK | HARDIRQ_MASK))
+ if ((preempt_count() & HARDIRQ_MASK) || softirq_count())
longdelay_ms = 5; /* Avoid triggering BH limits. */
mdelay(longdelay_ms);
rtrsp->rt_delay_ms = longdelay_ms;
@@ -610,6 +631,10 @@ static struct rcu_torture_ops rcu_ops = {
.reader_blocked = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)
? has_rcu_reader_blocked
: NULL,
+ .gather_gp_seqs = rcutorture_gather_gp_seqs,
+ .format_gp_seqs = rcutorture_format_gp_seqs,
+ .set_gpwrap_lag = rcu_set_gpwrap_lag,
+ .get_gpwrap_count = rcu_get_gpwrap_count,
.irq_capable = 1,
.can_boost = IS_ENABLED(CONFIG_RCU_BOOST),
.extendables = RCUTORTURE_MAX_EXTEND,
@@ -655,6 +680,8 @@ static struct rcu_torture_ops rcu_busted_ops = {
.sync = synchronize_rcu_busted,
.exp_sync = synchronize_rcu_busted,
.call = call_rcu_busted,
+ .gather_gp_seqs = rcutorture_gather_gp_seqs,
+ .format_gp_seqs = rcutorture_format_gp_seqs,
.irq_capable = 1,
.extendables = RCUTORTURE_MAX_EXTEND,
.name = "busted"
@@ -665,10 +692,29 @@ static struct rcu_torture_ops rcu_busted_ops = {
*/
DEFINE_STATIC_SRCU(srcu_ctl);
+DEFINE_STATIC_SRCU_FAST(srcu_ctlf);
+DEFINE_STATIC_SRCU_FAST_UPDOWN(srcu_ctlfud);
static struct srcu_struct srcu_ctld;
static struct srcu_struct *srcu_ctlp = &srcu_ctl;
static struct rcu_torture_ops srcud_ops;
+static void srcu_torture_init(void)
+{
+ rcu_sync_torture_init();
+ if (!reader_flavor || (reader_flavor & SRCU_READ_FLAVOR_NORMAL))
+ VERBOSE_TOROUT_STRING("srcu_torture_init normal SRCU");
+ if (reader_flavor & SRCU_READ_FLAVOR_NMI)
+ VERBOSE_TOROUT_STRING("srcu_torture_init NMI-safe SRCU");
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST) {
+ srcu_ctlp = &srcu_ctlf;
+ VERBOSE_TOROUT_STRING("srcu_torture_init fast SRCU");
+ }
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) {
+ srcu_ctlp = &srcu_ctlfud;
+ VERBOSE_TOROUT_STRING("srcu_torture_init fast-up/down SRCU");
+ }
+}
+
static void srcu_get_gp_data(int *flags, unsigned long *gp_seq)
{
srcutorture_get_gp_data(srcu_ctlp, flags, gp_seq);
@@ -677,8 +723,11 @@ static void srcu_get_gp_data(int *flags, unsigned long *gp_seq)
static int srcu_torture_read_lock(void)
{
int idx;
+ struct srcu_ctr __percpu *scp;
int ret = 0;
+ WARN_ON_ONCE(reader_flavor & ~SRCU_READ_FLAVOR_ALL);
+
if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) {
idx = srcu_read_lock(srcu_ctlp);
WARN_ON_ONCE(idx & ~0x1);
@@ -689,11 +738,18 @@ static int srcu_torture_read_lock(void)
WARN_ON_ONCE(idx & ~0x1);
ret += idx << 1;
}
- if (reader_flavor & SRCU_READ_FLAVOR_LITE) {
- idx = srcu_read_lock_lite(srcu_ctlp);
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST) {
+ scp = srcu_read_lock_fast(srcu_ctlp);
+ idx = __srcu_ptr_to_ctr(srcu_ctlp, scp);
WARN_ON_ONCE(idx & ~0x1);
ret += idx << 2;
}
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) {
+ scp = srcu_read_lock_fast_updown(srcu_ctlp);
+ idx = __srcu_ptr_to_ctr(srcu_ctlp, scp);
+ WARN_ON_ONCE(idx & ~0x1);
+ ret += idx << 3;
+ }
return ret;
}
@@ -719,8 +775,11 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
static void srcu_torture_read_unlock(int idx)
{
WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1)));
- if (reader_flavor & SRCU_READ_FLAVOR_LITE)
- srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2);
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN)
+ srcu_read_unlock_fast_updown(srcu_ctlp,
+ __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3));
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST)
+ srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x4) >> 2));
if (reader_flavor & SRCU_READ_FLAVOR_NMI)
srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1);
if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL))
@@ -732,6 +791,50 @@ static int torture_srcu_read_lock_held(void)
return srcu_read_lock_held(srcu_ctlp);
}
+static bool srcu_torture_have_up_down(void)
+{
+ int rf = reader_flavor;
+
+ if (!rf)
+ rf = SRCU_READ_FLAVOR_NORMAL;
+ return !!(cur_ops->have_up_down & rf);
+}
+
+static int srcu_torture_down_read(void)
+{
+ int idx;
+ struct srcu_ctr __percpu *scp;
+
+ WARN_ON_ONCE(reader_flavor & ~SRCU_READ_FLAVOR_ALL);
+ WARN_ON_ONCE(reader_flavor & (reader_flavor - 1));
+
+ if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) {
+ idx = srcu_down_read(srcu_ctlp);
+ WARN_ON_ONCE(idx & ~0x1);
+ return idx;
+ }
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) {
+ scp = srcu_down_read_fast(srcu_ctlp);
+ idx = __srcu_ptr_to_ctr(srcu_ctlp, scp);
+ WARN_ON_ONCE(idx & ~0x1);
+ return idx << 3;
+ }
+ WARN_ON_ONCE(1);
+ return 0;
+}
+
+static void srcu_torture_up_read(int idx)
+{
+ WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1)));
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN)
+ srcu_up_read_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3));
+ else if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) ||
+ !(reader_flavor & SRCU_READ_FLAVOR_ALL))
+ srcu_up_read(srcu_ctlp, idx & 0x1);
+ else
+ WARN_ON_ONCE(1);
+}
+
static unsigned long srcu_torture_completed(void)
{
return srcu_batches_completed(srcu_ctlp);
@@ -739,7 +842,14 @@ static unsigned long srcu_torture_completed(void)
static void srcu_torture_deferred_free(struct rcu_torture *rp)
{
+ unsigned long flags;
+ bool lockit = jiffies & 0x1;
+
+ if (lockit)
+ raw_spin_lock_irqsave(&current->pi_lock, flags);
call_srcu(srcu_ctlp, &rp->rtort_rcu, rcu_torture_cb);
+ if (lockit)
+ raw_spin_unlock_irqrestore(&current->pi_lock, flags);
}
static void srcu_torture_synchronize(void)
@@ -783,17 +893,26 @@ static void srcu_torture_synchronize_expedited(void)
synchronize_srcu_expedited(srcu_ctlp);
}
+static void srcu_torture_expedite_current(void)
+{
+ srcu_expedite_current(srcu_ctlp);
+}
+
static struct rcu_torture_ops srcu_ops = {
.ttype = SRCU_FLAVOR,
- .init = rcu_sync_torture_init,
+ .init = srcu_torture_init,
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
+ .down_read = srcu_torture_down_read,
+ .up_read = srcu_torture_up_read,
.readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
+ .gp_diff = rcu_seq_diff,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
.exp_sync = srcu_torture_synchronize_expedited,
+ .exp_current = srcu_torture_expedite_current,
.same_gp_state = same_state_synchronize_srcu,
.get_comp_state = get_completed_synchronize_srcu,
.get_gp_state = srcu_torture_get_gp_state,
@@ -808,13 +927,29 @@ static struct rcu_torture_ops srcu_ops = {
.irq_capable = 1,
.no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.debug_objects = 1,
+ .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU)
+ ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST_UPDOWN,
.name = "srcu"
};
-static void srcu_torture_init(void)
+static void srcud_torture_init(void)
{
rcu_sync_torture_init();
- WARN_ON(init_srcu_struct(&srcu_ctld));
+ if (!reader_flavor || (reader_flavor & SRCU_READ_FLAVOR_NORMAL)) {
+ WARN_ON(init_srcu_struct(&srcu_ctld));
+ VERBOSE_TOROUT_STRING("srcud_torture_init normal SRCU");
+ } else if (reader_flavor & SRCU_READ_FLAVOR_NMI) {
+ WARN_ON(init_srcu_struct(&srcu_ctld));
+ VERBOSE_TOROUT_STRING("srcud_torture_init NMI-safe SRCU");
+ } else if (reader_flavor & SRCU_READ_FLAVOR_FAST) {
+ WARN_ON(init_srcu_struct_fast(&srcu_ctld));
+ VERBOSE_TOROUT_STRING("srcud_torture_init fast SRCU");
+ } else if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) {
+ WARN_ON(init_srcu_struct_fast_updown(&srcu_ctld));
+ VERBOSE_TOROUT_STRING("srcud_torture_init fast-up/down SRCU");
+ } else {
+ WARN_ON(init_srcu_struct(&srcu_ctld));
+ }
srcu_ctlp = &srcu_ctld;
}
@@ -827,16 +962,20 @@ static void srcu_torture_cleanup(void)
/* As above, but dynamically allocated. */
static struct rcu_torture_ops srcud_ops = {
.ttype = SRCU_FLAVOR,
- .init = srcu_torture_init,
+ .init = srcud_torture_init,
.cleanup = srcu_torture_cleanup,
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
.readlock_held = torture_srcu_read_lock_held,
+ .down_read = srcu_torture_down_read,
+ .up_read = srcu_torture_up_read,
.get_gp_seq = srcu_torture_completed,
+ .gp_diff = rcu_seq_diff,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
.exp_sync = srcu_torture_synchronize_expedited,
+ .exp_current = srcu_torture_expedite_current,
.same_gp_state = same_state_synchronize_srcu,
.get_comp_state = get_completed_synchronize_srcu,
.get_gp_state = srcu_torture_get_gp_state,
@@ -851,6 +990,8 @@ static struct rcu_torture_ops srcud_ops = {
.irq_capable = 1,
.no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.debug_objects = 1,
+ .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU)
+ ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST_UPDOWN,
.name = "srcud"
};
@@ -878,7 +1019,8 @@ static struct rcu_torture_ops busted_srcud_ops = {
/*
* Definitions for trivial CONFIG_PREEMPT=n-only torture testing.
- * This implementation does not necessarily work well with CPU hotplug.
+ * This implementation does not work well with CPU hotplug nor
+ * with rcutorture's shuffling.
*/
static void synchronize_rcu_trivial(void)
@@ -891,6 +1033,16 @@ static void synchronize_rcu_trivial(void)
}
}
+static void rcu_sync_torture_init_trivial(void)
+{
+ rcu_sync_torture_init();
+ // if (onoff_interval || shuffle_interval) {
+ if (WARN_ONCE(onoff_interval || shuffle_interval, "%s: Non-zero onoff_interval (%d) or shuffle_interval (%d) breaks trivial RCU, resetting to zero", __func__, onoff_interval, shuffle_interval)) {
+ onoff_interval = 0;
+ shuffle_interval = 0;
+ }
+}
+
static int rcu_torture_read_lock_trivial(void)
{
preempt_disable();
@@ -904,7 +1056,7 @@ static void rcu_torture_read_unlock_trivial(int idx)
static struct rcu_torture_ops trivial_ops = {
.ttype = RCU_TRIVIAL_FLAVOR,
- .init = rcu_sync_torture_init,
+ .init = rcu_sync_torture_init_trivial,
.readlock = rcu_torture_read_lock_trivial,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_torture_read_unlock_trivial,
@@ -916,6 +1068,61 @@ static struct rcu_torture_ops trivial_ops = {
.name = "trivial"
};
+#ifdef CONFIG_TRIVIAL_PREEMPT_RCU
+
+/*
+ * Definitions for trivial CONFIG_PREEMPT=y torture testing. This
+ * implementation does not work well with large numbers of tasks or with
+ * long-term preemption. Either or both get you RCU CPU stall warnings.
+ */
+
+static void rcu_sync_torture_init_trivial_preempt(void)
+{
+ rcu_sync_torture_init();
+ if (WARN_ONCE(onoff_interval || shuffle_interval, "%s: Non-zero onoff_interval (%d) or shuffle_interval (%d) breaks trivial RCU, resetting to zero", __func__, onoff_interval, shuffle_interval)) {
+ onoff_interval = 0;
+ shuffle_interval = 0;
+ }
+}
+
+static int rcu_torture_read_lock_trivial_preempt(void)
+{
+ struct task_struct *t = current;
+
+ WRITE_ONCE(t->rcu_trivial_preempt_nesting, t->rcu_trivial_preempt_nesting + 1);
+ smp_mb();
+ return 0;
+}
+
+static void rcu_torture_read_unlock_trivial_preempt(int idx)
+{
+ struct task_struct *t = current;
+
+ smp_store_release(&t->rcu_trivial_preempt_nesting, t->rcu_trivial_preempt_nesting - 1);
+}
+
+static struct rcu_torture_ops trivial_preempt_ops = {
+ .ttype = RCU_TRIVIAL_FLAVOR,
+ .init = rcu_sync_torture_init_trivial_preempt,
+ .readlock = rcu_torture_read_lock_trivial_preempt,
+ .read_delay = rcu_read_delay, // just reuse rcu's version.
+ .readunlock = rcu_torture_read_unlock_trivial_preempt,
+ .readlock_held = torture_readlock_not_held,
+ .get_gp_seq = rcu_no_completed,
+ .sync = synchronize_rcu_trivial_preempt,
+ .exp_sync = synchronize_rcu_trivial_preempt,
+ .irq_capable = 0, // In theory it should be, but let's keep it trivial.
+ .name = "trivial-preempt"
+};
+
+#define TRIVIAL_PREEMPT_OPS &trivial_preempt_ops,
+
+#else // #ifdef CONFIG_TRIVIAL_PREEMPT_RCU
+
+#define TRIVIAL_PREEMPT_OPS
+
+#endif // #else // #ifdef CONFIG_TRIVIAL_PREEMPT_RCU
+
#ifdef CONFIG_TASKS_RCU
/*
@@ -1033,10 +1240,9 @@ static struct rcu_torture_ops tasks_tracing_ops = {
.deferred_free = rcu_tasks_tracing_torture_deferred_free,
.sync = synchronize_rcu_tasks_trace,
.exp_sync = synchronize_rcu_tasks_trace,
+ .exp_current = rcu_tasks_trace_expedite_current,
.call = call_rcu_tasks_trace,
.cb_barrier = rcu_barrier_tasks_trace,
- .gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread,
- .get_gp_data = rcu_tasks_trace_get_gp_data,
.cbflood_max = 50000,
.irq_capable = 1,
.slow_gps = 1,
@@ -1148,8 +1354,19 @@ static int rcu_torture_boost(void *arg)
unsigned long gp_state;
unsigned long gp_state_time;
unsigned long oldstarttime;
+ unsigned long booststarttime = get_torture_init_jiffies() + test_boost_holdoff * HZ;
- VERBOSE_TOROUT_STRING("rcu_torture_boost started");
+ if (test_boost_holdoff <= 0 || time_after(jiffies, booststarttime)) {
+ VERBOSE_TOROUT_STRING("rcu_torture_boost started");
+ } else {
+ VERBOSE_TOROUT_STRING("rcu_torture_boost started holdoff period");
+ while (time_before(jiffies, booststarttime)) {
+ schedule_timeout_idle(HZ);
+ if (kthread_should_stop())
+ goto cleanup;
+ }
+ VERBOSE_TOROUT_STRING("rcu_torture_boost finished holdoff period");
+ }
/* Set real-time priority. */
sched_set_fifo_low(current);
@@ -1225,6 +1442,7 @@ checkwait: if (stutter_wait("rcu_torture_boost"))
sched_set_fifo_low(current);
} while (!torture_must_stop());
+cleanup:
/* Clean up and exit. */
while (!kthread_should_stop()) {
torture_shutdown_absorb("rcu_torture_boost");
@@ -1421,7 +1639,7 @@ static void do_rtws_sync(struct torture_random_state *trsp, void (*sync)(void))
static int
rcu_torture_writer(void *arg)
{
- bool boot_ended;
+ bool booting_still = false;
bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal();
unsigned long cookie;
struct rcu_gp_oldstate cookie_full;
@@ -1432,6 +1650,7 @@ rcu_torture_writer(void *arg)
struct rcu_gp_oldstate gp_snap1_full;
int i;
int idx;
+ unsigned long j;
int oldnice = task_nice(current);
struct rcu_gp_oldstate *rgo = NULL;
int rgo_size = 0;
@@ -1464,16 +1683,26 @@ rcu_torture_writer(void *arg)
return 0;
}
if (cur_ops->poll_active > 0) {
- ulo = kzalloc(cur_ops->poll_active * sizeof(ulo[0]), GFP_KERNEL);
+ ulo = kcalloc(cur_ops->poll_active, sizeof(*ulo), GFP_KERNEL);
if (!WARN_ON(!ulo))
ulo_size = cur_ops->poll_active;
}
if (cur_ops->poll_active_full > 0) {
- rgo = kzalloc(cur_ops->poll_active_full * sizeof(rgo[0]), GFP_KERNEL);
+ rgo = kzalloc_objs(*rgo, cur_ops->poll_active_full);
if (!WARN_ON(!rgo))
rgo_size = cur_ops->poll_active_full;
}
+ // If the system is still booting, let it finish.
+ j = jiffies;
+ while (!torture_must_stop() && !rcu_inkernel_boot_has_ended()) {
+ booting_still = true;
+ schedule_timeout_interruptible(HZ);
+ }
+ if (booting_still)
+ pr_alert("%s" TORTURE_FLAG " Waited %lu jiffies for boot to complete.\n",
+ torture_type, jiffies - j);
+
do {
rcu_torture_writer_state = RTWS_FIXED_DELAY;
torture_hrtimeout_us(500, 1000, &rand);
@@ -1582,6 +1811,8 @@ rcu_torture_writer(void *arg)
ulo[i] = cur_ops->get_comp_state();
gp_snap = cur_ops->start_gp_poll();
rcu_torture_writer_state = RTWS_POLL_WAIT;
+ if (cur_ops->exp_current && !(torture_random(&rand) & 0xff))
+ cur_ops->exp_current();
while (!cur_ops->poll_gp_state(gp_snap)) {
gp_snap1 = cur_ops->get_gp_state();
for (i = 0; i < ulo_size; i++)
@@ -1602,6 +1833,8 @@ rcu_torture_writer(void *arg)
cur_ops->get_comp_state_full(&rgo[i]);
cur_ops->start_gp_poll_full(&gp_snap_full);
rcu_torture_writer_state = RTWS_POLL_WAIT_FULL;
+ if (cur_ops->exp_current && !(torture_random(&rand) & 0xff))
+ cur_ops->exp_current();
while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
cur_ops->get_gp_state_full(&gp_snap1_full);
for (i = 0; i < rgo_size; i++)
@@ -1662,13 +1895,11 @@ rcu_torture_writer(void *arg)
!rcu_gp_is_normal();
}
rcu_torture_writer_state = RTWS_STUTTER;
- boot_ended = rcu_inkernel_boot_has_ended();
stutter_waited = stutter_wait("rcu_torture_writer");
if (stutter_waited &&
!atomic_read(&rcu_fwd_cb_nodelay) &&
!cur_ops->slow_gps &&
!torture_must_stop() &&
- boot_ended &&
time_after(jiffies, stallsdone))
for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++)
if (list_empty(&rcu_tortures[i].rtort_free) &&
@@ -1678,6 +1909,7 @@ rcu_torture_writer(void *arg)
cur_ops->gp_kthread_dbg();
WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
rcu_ftrace_dump(DUMP_ALL);
+ break;
}
if (stutter_waited)
sched_set_normal(current, oldnice);
@@ -1728,7 +1960,7 @@ rcu_torture_fakewriter(void *arg)
do {
torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand);
if (cur_ops->cb_barrier != NULL &&
- torture_random(&rand) % (nfakewriters * 8) == 0) {
+ torture_random(&rand) % (nrealfakewriters * 8) == 0) {
cur_ops->cb_barrier();
} else {
switch (synctype[torture_random(&rand) % nsynctypes]) {
@@ -1871,12 +2103,14 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp,
// Verify the specified RCUTORTURE_RDR* state.
#define ROEC_ARGS "%s %s: Current %#x To add %#x To remove %#x preempt_count() %#x\n", __func__, s, curstate, new, old, preempt_count()
-static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, bool insoftirq)
+static void rcutorture_one_extend_check(char *s, int curstate, int new, int old)
{
- if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE))
+ int mask;
+
+ if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE) || in_nmi())
return;
- WARN_ONCE(!(curstate & RCUTORTURE_RDR_IRQ) && irqs_disabled(), ROEC_ARGS);
+ WARN_ONCE(!(curstate & RCUTORTURE_RDR_IRQ) && irqs_disabled() && !in_hardirq(), ROEC_ARGS);
WARN_ONCE((curstate & RCUTORTURE_RDR_IRQ) && !irqs_disabled(), ROEC_ARGS);
// If CONFIG_PREEMPT_COUNT=n, further checks are unreliable.
@@ -1884,26 +2118,45 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old,
return;
WARN_ONCE((curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) &&
- !(preempt_count() & SOFTIRQ_MASK), ROEC_ARGS);
+ !softirq_count(), ROEC_ARGS);
WARN_ONCE((curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) &&
!(preempt_count() & PREEMPT_MASK), ROEC_ARGS);
WARN_ONCE(cur_ops->readlock_nesting &&
(curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) &&
cur_ops->readlock_nesting() == 0, ROEC_ARGS);
- // Timer handlers have all sorts of stuff disabled, so ignore
+ // Interrupt handlers have all sorts of stuff disabled, so ignore
// unintended disabling.
- if (insoftirq)
+ if (in_serving_softirq() || in_hardirq())
return;
WARN_ONCE(cur_ops->extendables &&
!(curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) &&
- (preempt_count() & SOFTIRQ_MASK), ROEC_ARGS);
- WARN_ONCE(cur_ops->extendables &&
- !(curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) &&
+ softirq_count(), ROEC_ARGS);
+
+ /*
+ * non-preemptible RCU in a preemptible kernel uses preempt_disable()
+ * as rcu_read_lock().
+ */
+ mask = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
+ if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
+ mask |= RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2;
+
+ WARN_ONCE(cur_ops->extendables && !(curstate & mask) &&
(preempt_count() & PREEMPT_MASK), ROEC_ARGS);
- WARN_ONCE(cur_ops->readlock_nesting &&
- !(curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) &&
+
+ /*
+ * non-preemptible RCU in a preemptible kernel uses "preempt_count() &
+ * PREEMPT_MASK" as ->readlock_nesting().
+ */
+ mask = RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2;
+ if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
+ mask |= RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && softirq_count())
+ mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
+
+ WARN_ONCE(cur_ops->readlock_nesting && !(curstate & mask) &&
cur_ops->readlock_nesting() > 0, ROEC_ARGS);
}
@@ -1916,8 +2169,7 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old,
* beginning or end of the critical section and if there was actually a
* change, do a ->read_delay().
*/
-static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq,
- struct torture_random_state *trsp,
+static void rcutorture_one_extend(int *readstate, int newstate, struct torture_random_state *trsp,
struct rt_read_seg *rtrsp)
{
bool first;
@@ -1931,8 +2183,8 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq,
first = idxold1 == 0;
WARN_ON_ONCE(idxold2 < 0);
- WARN_ON_ONCE(idxold2 & ~RCUTORTURE_RDR_ALLBITS);
- rcutorture_one_extend_check("before change", idxold1, statesnew, statesold, insoftirq);
+ WARN_ON_ONCE(idxold2 & ~(RCUTORTURE_RDR_ALLBITS | RCUTORTURE_RDR_UPDOWN));
+ rcutorture_one_extend_check("before change", idxold1, statesnew, statesold);
rtrsp->rt_readstate = newstate;
/* First, put new protection in place to avoid critical-section gap. */
@@ -1952,8 +2204,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq,
idxnew2 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_2) & RCUTORTURE_RDR_MASK_2;
// Complain unless both the old and the new protection is in place.
- rcutorture_one_extend_check("during change",
- idxold1 | statesnew, statesnew, statesold, insoftirq);
+ rcutorture_one_extend_check("during change", idxold1 | statesnew, statesnew, statesold);
// Sample CPU under both sets of protections to reduce confusion.
if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) {
@@ -1965,6 +2216,13 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq,
rtrsp[-1].rt_preempted = cur_ops->reader_blocked();
}
}
+ // Sample grace-period sequence number, as good a place as any.
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && cur_ops->gather_gp_seqs) {
+ rtrsp->rt_gp_seq = cur_ops->gather_gp_seqs();
+ rtrsp->rt_ts = ktime_get_mono_fast_ns();
+ if (!first)
+ rtrsp[-1].rt_gp_seq_end = rtrsp->rt_gp_seq;
+ }
/*
* Next, remove old protection, in decreasing order of strength
@@ -2000,6 +2258,11 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq,
if (lockit)
raw_spin_unlock_irqrestore(&current->pi_lock, flags);
}
+ if (statesold & RCUTORTURE_RDR_UPDOWN) {
+ cur_ops->up_read((idxold1 & RCUTORTURE_RDR_MASK_1) >> RCUTORTURE_RDR_SHIFT_1);
+ WARN_ON_ONCE(idxnew1 != -1);
+ idxold1 = 0;
+ }
/* Delay if neither beginning nor end and there was a change. */
if ((statesnew || statesold) && *readstate && newstate)
@@ -2016,7 +2279,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq,
WARN_ON_ONCE(*readstate < 0);
if (WARN_ON_ONCE(*readstate & ~RCUTORTURE_RDR_ALLBITS))
pr_info("Unexpected readstate value of %#x\n", *readstate);
- rcutorture_one_extend_check("after change", *readstate, statesnew, statesold, insoftirq);
+ rcutorture_one_extend_check("after change", *readstate, statesnew, statesold);
}
/* Return the biggest extendables mask given current RCU and boot parameters. */
@@ -2083,8 +2346,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
* critical section.
*/
static struct rt_read_seg *
-rcutorture_loop_extend(int *readstate, bool insoftirq, struct torture_random_state *trsp,
- struct rt_read_seg *rtrsp)
+rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, struct rt_read_seg *rtrsp)
{
int i;
int j;
@@ -2098,58 +2360,77 @@ rcutorture_loop_extend(int *readstate, bool insoftirq, struct torture_random_sta
i = ((i | (i >> 3)) & RCUTORTURE_RDR_MAX_LOOPS) + 1;
for (j = 0; j < i; j++) {
mask = rcutorture_extend_mask(*readstate, trsp);
- rcutorture_one_extend(readstate, mask, insoftirq, trsp, &rtrsp[j]);
+ WARN_ON_ONCE(mask & RCUTORTURE_RDR_UPDOWN);
+ rcutorture_one_extend(readstate, mask, trsp, &rtrsp[j]);
}
return &rtrsp[j];
}
-/*
- * Do one read-side critical section, returning false if there was
- * no data to read. Can be invoked both from process context and
- * from a timer handler.
- */
-static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
-{
- bool checkpolling = !(torture_random(trsp) & 0xfff);
+struct rcu_torture_one_read_state {
+ bool checkpolling;
unsigned long cookie;
struct rcu_gp_oldstate cookie_full;
- int i;
unsigned long started;
- unsigned long completed;
- int newstate;
struct rcu_torture *p;
- int pipe_count;
- bool preempted = false;
- int readstate = 0;
- struct rt_read_seg rtseg[RCUTORTURE_RDR_MAX_SEGS] = { { 0 } };
- struct rt_read_seg *rtrsp = &rtseg[0];
- struct rt_read_seg *rtrsp1;
+ int readstate;
+ struct rt_read_seg rtseg[RCUTORTURE_RDR_MAX_SEGS];
+ struct rt_read_seg *rtrsp;
unsigned long long ts;
+};
- WARN_ON_ONCE(!rcu_is_watching());
- newstate = rcutorture_extend_mask(readstate, trsp);
- rcutorture_one_extend(&readstate, newstate, myid < 0, trsp, rtrsp++);
- if (checkpolling) {
+static void init_rcu_torture_one_read_state(struct rcu_torture_one_read_state *rtorsp,
+ struct torture_random_state *trsp)
+{
+ memset(rtorsp, 0, sizeof(*rtorsp));
+ rtorsp->checkpolling = !(torture_random(trsp) & 0xfff);
+ rtorsp->rtrsp = &rtorsp->rtseg[0];
+}
+
+/*
+ * Set up the first segment of a series of overlapping read-side
+ * critical sections. The caller must have actually initiated the
+ * outermost read-side critical section.
+ */
+static bool rcu_torture_one_read_start(struct rcu_torture_one_read_state *rtorsp,
+ struct torture_random_state *trsp, long myid)
+{
+ if (rtorsp->checkpolling) {
if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
- cookie = cur_ops->get_gp_state();
+ rtorsp->cookie = cur_ops->get_gp_state();
if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
- cur_ops->get_gp_state_full(&cookie_full);
+ cur_ops->get_gp_state_full(&rtorsp->cookie_full);
}
- started = cur_ops->get_gp_seq();
- ts = rcu_trace_clock_local();
- p = rcu_dereference_check(rcu_torture_current,
- !cur_ops->readlock_held || cur_ops->readlock_held());
- if (p == NULL) {
+ rtorsp->started = cur_ops->get_gp_seq();
+ rtorsp->ts = rcu_trace_clock_local();
+ rtorsp->p = rcu_dereference_check(rcu_torture_current,
+ !cur_ops->readlock_held || cur_ops->readlock_held() ||
+ (rtorsp->readstate & RCUTORTURE_RDR_UPDOWN));
+ if (rtorsp->p == NULL) {
/* Wait for rcu_torture_writer to get underway */
- rcutorture_one_extend(&readstate, 0, myid < 0, trsp, rtrsp);
+ rcutorture_one_extend(&rtorsp->readstate, 0, trsp, rtorsp->rtrsp);
return false;
}
- if (p->rtort_mbtest == 0)
+ if (rtorsp->p->rtort_mbtest == 0)
atomic_inc(&n_rcu_torture_mberror);
- rcu_torture_reader_do_mbchk(myid, p, trsp);
- rtrsp = rcutorture_loop_extend(&readstate, myid < 0, trsp, rtrsp);
+ rcu_torture_reader_do_mbchk(myid, rtorsp->p, trsp);
+ return true;
+}
+
+/*
+ * Complete the last segment of a series of overlapping read-side
+ * critical sections and check for errors.
+ */
+static void rcu_torture_one_read_end(struct rcu_torture_one_read_state *rtorsp,
+ struct torture_random_state *trsp)
+{
+ int i;
+ unsigned long completed;
+ int pipe_count;
+ bool preempted = false;
+ struct rt_read_seg *rtrsp1;
+
preempt_disable();
- pipe_count = READ_ONCE(p->rtort_pipe_count);
+ pipe_count = READ_ONCE(rtorsp->p->rtort_pipe_count);
if (pipe_count > RCU_TORTURE_PIPE_LEN) {
// Should not happen in a correct RCU implementation,
// happens quite often for torture_type=busted.
@@ -2157,28 +2438,28 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
}
completed = cur_ops->get_gp_seq();
if (pipe_count > 1) {
- do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
- ts, started, completed);
+ do_trace_rcu_torture_read(cur_ops->name, &rtorsp->p->rtort_rcu,
+ rtorsp->ts, rtorsp->started, completed);
rcu_ftrace_dump(DUMP_ALL);
}
__this_cpu_inc(rcu_torture_count[pipe_count]);
- completed = rcutorture_seq_diff(completed, started);
+ completed = rcutorture_seq_diff(completed, rtorsp->started);
if (completed > RCU_TORTURE_PIPE_LEN) {
/* Should not happen, but... */
completed = RCU_TORTURE_PIPE_LEN;
}
__this_cpu_inc(rcu_torture_batch[completed]);
preempt_enable();
- if (checkpolling) {
+ if (rtorsp->checkpolling) {
if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
- WARN_ONCE(cur_ops->poll_gp_state(cookie),
+ WARN_ONCE(cur_ops->poll_gp_state(rtorsp->cookie),
"%s: Cookie check 2 failed %s(%d) %lu->%lu\n",
__func__,
rcu_torture_writer_state_getname(),
rcu_torture_writer_state,
- cookie, cur_ops->get_gp_state());
+ rtorsp->cookie, cur_ops->get_gp_state());
if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
- WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full),
+ WARN_ONCE(cur_ops->poll_gp_state_full(&rtorsp->cookie_full),
"%s: Cookie check 6 failed %s(%d) online %*pbl\n",
__func__,
rcu_torture_writer_state_getname(),
@@ -2187,21 +2468,41 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
}
if (cur_ops->reader_blocked)
preempted = cur_ops->reader_blocked();
- rcutorture_one_extend(&readstate, 0, myid < 0, trsp, rtrsp);
- WARN_ON_ONCE(readstate);
+ rcutorture_one_extend(&rtorsp->readstate, 0, trsp, rtorsp->rtrsp);
+ WARN_ON_ONCE(rtorsp->readstate);
// This next splat is expected behavior if leakpointer, especially
// for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels.
- WARN_ON_ONCE(leakpointer && READ_ONCE(p->rtort_pipe_count) > 1);
+ WARN_ON_ONCE(leakpointer && READ_ONCE(rtorsp->p->rtort_pipe_count) > 1);
/* If error or close call, record the sequence of reader protections. */
if ((pipe_count > 1 || completed > 1) && !xchg(&err_segs_recorded, 1)) {
i = 0;
- for (rtrsp1 = &rtseg[0]; rtrsp1 < rtrsp; rtrsp1++)
+ for (rtrsp1 = &rtorsp->rtseg[0]; rtrsp1 < rtorsp->rtrsp; rtrsp1++)
err_segs[i++] = *rtrsp1;
rt_read_nsegs = i;
rt_read_preempted = preempted;
}
+}
+/*
+ * Do one read-side critical section, returning false if there was
+ * no data to read. Can be invoked both from process context and
+ * from a timer handler.
+ */
+static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
+{
+ int newstate;
+ struct rcu_torture_one_read_state rtors;
+
+ WARN_ON_ONCE(!rcu_is_watching());
+ init_rcu_torture_one_read_state(&rtors, trsp);
+ newstate = rcutorture_extend_mask(rtors.readstate, trsp);
+ WARN_ON_ONCE(newstate & RCUTORTURE_RDR_UPDOWN);
+ rcutorture_one_extend(&rtors.readstate, newstate, trsp, rtors.rtrsp++);
+ if (!rcu_torture_one_read_start(&rtors, trsp, myid))
+ return false;
+ rtors.rtrsp = rcutorture_loop_extend(&rtors.readstate, trsp, rtors.rtrsp);
+ rcu_torture_one_read_end(&rtors, trsp);
return true;
}
@@ -2215,12 +2516,15 @@ static DEFINE_TORTURE_RANDOM_PERCPU(rcu_torture_timer_rand);
*/
static void rcu_torture_timer(struct timer_list *unused)
{
+ WARN_ON_ONCE(!in_serving_softirq());
+ WARN_ON_ONCE(in_hardirq());
+ WARN_ON_ONCE(in_nmi());
atomic_long_inc(&n_rcu_torture_timers);
(void)rcu_torture_one_read(this_cpu_ptr(&rcu_torture_timer_rand), -1);
/* Test call_rcu() invocation from interrupt handler. */
if (cur_ops->call) {
- struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_NOWAIT);
+ struct rcu_head *rhp = kmalloc_obj(*rhp, GFP_NOWAIT);
if (rhp)
cur_ops->call(rhp, rcu_torture_timer_cb);
@@ -2246,7 +2550,7 @@ rcu_torture_reader(void *arg)
set_user_nice(current, MAX_NICE);
if (irqreader && cur_ops->irq_capable)
timer_setup_on_stack(&t, rcu_torture_timer, 0);
- tick_dep_set_task(current, TICK_DEP_BIT_RCU);
+ tick_dep_set_task(current, TICK_DEP_BIT_RCU); // CPU bound, so need tick.
do {
if (irqreader && cur_ops->irq_capable) {
if (!timer_pending(&t))
@@ -2258,19 +2562,166 @@ rcu_torture_reader(void *arg)
torture_hrtimeout_us(500, 1000, &rand);
lastsleep = jiffies + 10;
}
- while (torture_num_online_cpus() < mynumonline && !torture_must_stop())
+ while (!torture_must_stop() &&
+ (torture_num_online_cpus() < mynumonline || !rcu_inkernel_boot_has_ended()))
schedule_timeout_interruptible(HZ / 5);
stutter_wait("rcu_torture_reader");
} while (!torture_must_stop());
if (irqreader && cur_ops->irq_capable) {
- del_timer_sync(&t);
- destroy_timer_on_stack(&t);
+ timer_delete_sync(&t);
+ timer_destroy_on_stack(&t);
}
tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
torture_kthread_stopping("rcu_torture_reader");
return 0;
}
+struct rcu_torture_one_read_state_updown {
+ struct hrtimer rtorsu_hrt;
+ bool rtorsu_inuse;
+ ktime_t rtorsu_kt;
+ int rtorsu_cpu;
+ unsigned long rtorsu_j;
+ unsigned long rtorsu_ndowns;
+ unsigned long rtorsu_nups;
+ unsigned long rtorsu_nmigrates;
+ struct torture_random_state rtorsu_trs;
+ struct rcu_torture_one_read_state rtorsu_rtors;
+};
+
+static struct rcu_torture_one_read_state_updown *updownreaders;
+static DEFINE_TORTURE_RANDOM(rcu_torture_updown_rand);
+static int rcu_torture_updown(void *arg);
+
+static enum hrtimer_restart rcu_torture_updown_hrt(struct hrtimer *hrtp)
+{
+ int cpu = raw_smp_processor_id();
+ struct rcu_torture_one_read_state_updown *rtorsup;
+
+ rtorsup = container_of(hrtp, struct rcu_torture_one_read_state_updown, rtorsu_hrt);
+ rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs);
+ WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders);
+ WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1);
+ WRITE_ONCE(rtorsup->rtorsu_nmigrates,
+ rtorsup->rtorsu_nmigrates + (cpu != rtorsup->rtorsu_cpu));
+ smp_store_release(&rtorsup->rtorsu_inuse, false);
+ return HRTIMER_NORESTART;
+}
+
+static int rcu_torture_updown_init(void)
+{
+ int i;
+ struct torture_random_state *rand = &rcu_torture_updown_rand;
+ int ret;
+
+ if (n_up_down < 0)
+ return 0;
+ if (!srcu_torture_have_up_down()) {
+ VERBOSE_TOROUT_STRING("rcu_torture_updown_init: Disabling up/down reader tests due to lack of primitives");
+ return 0;
+ }
+ updownreaders = kzalloc_objs(*updownreaders, n_up_down);
+ if (!updownreaders) {
+ VERBOSE_TOROUT_STRING("rcu_torture_updown_init: Out of memory, disabling up/down reader tests");
+ return -ENOMEM;
+ }
+ for (i = 0; i < n_up_down; i++) {
+ init_rcu_torture_one_read_state(&updownreaders[i].rtorsu_rtors, rand);
+ hrtimer_setup(&updownreaders[i].rtorsu_hrt, rcu_torture_updown_hrt, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL | HRTIMER_MODE_HARD);
+ torture_random_init(&updownreaders[i].rtorsu_trs);
+ init_rcu_torture_one_read_state(&updownreaders[i].rtorsu_rtors,
+ &updownreaders[i].rtorsu_trs);
+ }
+ ret = torture_create_kthread(rcu_torture_updown, rand, updown_task);
+ if (ret) {
+ kfree(updownreaders);
+ updownreaders = NULL;
+ }
+ return ret;
+}
+
+static void rcu_torture_updown_cleanup(void)
+{
+ struct rcu_torture_one_read_state_updown *rtorsup;
+
+ for (rtorsup = updownreaders; rtorsup < &updownreaders[n_up_down]; rtorsup++) {
+ if (!smp_load_acquire(&rtorsup->rtorsu_inuse))
+ continue;
+ if (hrtimer_cancel(&rtorsup->rtorsu_hrt) || WARN_ON_ONCE(rtorsup->rtorsu_inuse)) {
+ rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs);
+ WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders);
+ WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1);
+ smp_store_release(&rtorsup->rtorsu_inuse, false);
+ }
+
+ }
+ kfree(updownreaders);
+ updownreaders = NULL;
+}
+
+// Do one reader for rcu_torture_updown().
+static void rcu_torture_updown_one(struct rcu_torture_one_read_state_updown *rtorsup)
+{
+ int idx;
+ int rawidx;
+ ktime_t t;
+
+ init_rcu_torture_one_read_state(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs);
+ rawidx = cur_ops->down_read();
+ WRITE_ONCE(rtorsup->rtorsu_ndowns, rtorsup->rtorsu_ndowns + 1);
+ idx = (rawidx << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1;
+ rtorsup->rtorsu_rtors.readstate = idx | RCUTORTURE_RDR_UPDOWN;
+ rtorsup->rtorsu_rtors.rtrsp++;
+ rtorsup->rtorsu_cpu = raw_smp_processor_id();
+ if (!rcu_torture_one_read_start(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1)) {
+ WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders);
+ WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1);
+ schedule_timeout_idle(HZ);
+ return;
+ }
+ smp_store_release(&rtorsup->rtorsu_inuse, true);
+ t = torture_random(&rtorsup->rtorsu_trs) & 0xfffff; // One per million.
+ if (t < 10 * 1000)
+ t = 200 * 1000 * 1000;
+ hrtimer_start(&rtorsup->rtorsu_hrt, t, HRTIMER_MODE_REL | HRTIMER_MODE_HARD);
+ smp_mb(); // Sample jiffies after posting hrtimer.
+ rtorsup->rtorsu_j = jiffies; // Not used by hrtimer handler.
+ rtorsup->rtorsu_kt = t;
+}
+
+/*
+ * RCU torture up/down reader kthread, starting RCU readers in kthread
+ * context and ending them in hrtimer handlers. Otherwise similar to
+ * rcu_torture_reader().
+ */
+static int
+rcu_torture_updown(void *arg)
+{
+ unsigned long j;
+ struct rcu_torture_one_read_state_updown *rtorsup;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_updown task started");
+ do {
+ for (rtorsup = updownreaders; rtorsup < &updownreaders[n_up_down]; rtorsup++) {
+ if (torture_must_stop())
+ break;
+ j = smp_load_acquire(&jiffies); // Time before ->rtorsu_inuse.
+ if (smp_load_acquire(&rtorsup->rtorsu_inuse)) {
+ WARN_ONCE(time_after(j, rtorsup->rtorsu_j + 1 + HZ * 10),
+ "hrtimer queued at jiffies %lu for %lld ns took %lu jiffies\n", rtorsup->rtorsu_j, rtorsup->rtorsu_kt, j - rtorsup->rtorsu_j);
+ continue;
+ }
+ rcu_torture_updown_one(rtorsup);
+ }
+ torture_hrtimeout_ms(1, 1000, &rcu_torture_updown_rand);
+ stutter_wait("rcu_torture_updown");
+ } while (!torture_must_stop());
+ rcu_torture_updown_cleanup();
+ torture_kthread_stopping("rcu_torture_updown");
+ return 0;
+}
+
/*
* Randomly Toggle CPUs' callback-offload state. This uses hrtimers to
* increase race probabilities and fuzzes the interval between toggling.
@@ -2333,6 +2784,11 @@ rcu_torture_stats_print(void)
int i;
long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
+ long n_gpwraps = 0;
+ unsigned long ndowns = 0;
+ unsigned long nunexpired = 0;
+ unsigned long nmigrates = 0;
+ unsigned long nups = 0;
struct rcu_torture *rtcp;
static unsigned long rtcv_snap = ULONG_MAX;
static bool splatted;
@@ -2343,11 +2799,21 @@ rcu_torture_stats_print(void)
pipesummary[i] += READ_ONCE(per_cpu(rcu_torture_count, cpu)[i]);
batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]);
}
+ if (cur_ops->get_gpwrap_count)
+ n_gpwraps += cur_ops->get_gpwrap_count(cpu);
+ }
+ if (updownreaders) {
+ for (i = 0; i < n_up_down; i++) {
+ ndowns += READ_ONCE(updownreaders[i].rtorsu_ndowns);
+ nups += READ_ONCE(updownreaders[i].rtorsu_nups);
+ nunexpired += READ_ONCE(updownreaders[i].rtorsu_inuse);
+ nmigrates += READ_ONCE(updownreaders[i].rtorsu_nmigrates);
+ }
}
for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) {
if (pipesummary[i] != 0)
break;
- }
+ } // The value of variable "i" is used later, so don't clobber it!
pr_alert("%s%s ", torture_type, TORTURE_FLAG);
rtcp = rcu_access_pointer(rcu_torture_current);
@@ -2368,14 +2834,17 @@ rcu_torture_stats_print(void)
n_rcu_torture_boost_failure,
n_rcu_torture_boosts,
atomic_long_read(&n_rcu_torture_timers));
+ if (updownreaders)
+ pr_cont("ndowns: %lu nups: %lu nhrt: %lu nmigrates: %lu ", ndowns, nups, nunexpired, nmigrates);
torture_onoff_stats();
pr_cont("barrier: %ld/%ld:%ld ",
data_race(n_barrier_successes),
data_race(n_barrier_attempts),
data_race(n_rcu_torture_barrier_error));
pr_cont("read-exits: %ld ", data_race(n_read_exits)); // Statistic.
- pr_cont("nocb-toggles: %ld:%ld\n",
+ pr_cont("nocb-toggles: %ld:%ld ",
atomic_long_read(&n_nocb_offload), atomic_long_read(&n_nocb_deoffload));
+ pr_cont("gpwraps: %ld\n", n_gpwraps);
pr_alert("%s%s ", torture_type, TORTURE_FLAG);
if (atomic_read(&n_rcu_torture_mberror) ||
@@ -2413,7 +2882,8 @@ rcu_torture_stats_print(void)
cur_ops->stats();
if (rtcv_snap == rcu_torture_current_version &&
rcu_access_pointer(rcu_torture_current) &&
- !rcu_stall_is_suppressed()) {
+ !rcu_stall_is_suppressed() &&
+ rcu_inkernel_boot_has_ended()) {
int __maybe_unused flags = 0;
unsigned long __maybe_unused gp_seq = 0;
@@ -2483,7 +2953,7 @@ static void rcu_torture_mem_dump_obj(void)
mem_dump_obj(&z);
kmem_cache_free(kcp, rhp);
kmem_cache_destroy(kcp);
- rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
+ rhp = kmalloc_obj(*rhp);
if (WARN_ON_ONCE(!rhp))
return;
pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
@@ -2512,7 +2982,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"shuffle_interval=%d stutter=%d irqreader=%d "
"fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
"test_boost=%d/%d test_boost_interval=%d "
- "test_boost_duration=%d shutdown_secs=%d "
+ "test_boost_duration=%d test_boost_holdoff=%d shutdown_secs=%d "
"stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d "
"stall_cpu_block=%d stall_cpu_repeat=%d "
"n_barrier_cbs=%d "
@@ -2521,12 +2991,12 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"reader_flavor=%x "
"nocbs_nthreads=%d nocbs_toggle=%d "
"test_nmis=%d "
- "preempt_duration=%d preempt_interval=%d\n",
- torture_type, tag, nrealreaders, nfakewriters,
+ "preempt_duration=%d preempt_interval=%d n_up_down=%d\n",
+ torture_type, tag, nrealreaders, nrealfakewriters,
stat_interval, verbose, test_no_idle_hz, shuffle_interval,
stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
test_boost, cur_ops->can_boost,
- test_boost_interval, test_boost_duration, shutdown_secs,
+ test_boost_interval, test_boost_duration, test_boost_holdoff, shutdown_secs,
stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff,
stall_cpu_block, stall_cpu_repeat,
n_barrier_cbs,
@@ -2535,7 +3005,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
reader_flavor,
nocbs_nthreads, nocbs_toggle,
test_nmis,
- preempt_duration, preempt_interval);
+ preempt_duration, preempt_interval, n_up_down);
}
static int rcutorture_booster_cleanup(unsigned int cpu)
@@ -2975,7 +3445,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
cver = READ_ONCE(rcu_torture_current_version);
gps = cur_ops->get_gp_seq();
rfp->rcu_launder_gp_seq_start = gps;
- tick_dep_set_task(current, TICK_DEP_BIT_RCU);
+ tick_dep_set_task(current, TICK_DEP_BIT_RCU); // CPU bound, so need tick.
while (time_before(jiffies, stopat) &&
!shutdown_time_arrived() &&
!READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
@@ -2991,7 +3461,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
n_launders++;
n_launders_sa++;
} else if (!cur_ops->cbflood_max || cur_ops->cbflood_max > n_max_cbs) {
- rfcp = kmalloc(sizeof(*rfcp), GFP_KERNEL);
+ rfcp = kmalloc_obj(*rfcp);
if (WARN_ON_ONCE(!rfcp)) {
schedule_timeout_interruptible(1);
continue;
@@ -3103,6 +3573,8 @@ static int rcu_torture_fwd_prog(void *args)
int tested_tries = 0;
VERBOSE_TOROUT_STRING("rcu_torture_fwd_progress task started");
+ while (!rcu_inkernel_boot_has_ended())
+ schedule_timeout_interruptible(HZ / 10);
rcu_bind_current_to_nocb();
if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST))
set_user_nice(current, MAX_NICE);
@@ -3177,8 +3649,8 @@ static int __init rcu_torture_fwd_prog_init(void)
fwd_progress_holdoff = 1;
if (fwd_progress_div <= 0)
fwd_progress_div = 4;
- rfp = kcalloc(fwd_progress, sizeof(*rfp), GFP_KERNEL);
- fwd_prog_tasks = kcalloc(fwd_progress, sizeof(*fwd_prog_tasks), GFP_KERNEL);
+ rfp = kzalloc_objs(*rfp, fwd_progress);
+ fwd_prog_tasks = kzalloc_objs(*fwd_prog_tasks, fwd_progress);
if (!rfp || !fwd_prog_tasks) {
kfree(rfp);
kfree(fwd_prog_tasks);
@@ -3344,10 +3816,9 @@ static int rcu_torture_barrier_init(void)
atomic_set(&barrier_cbs_count, 0);
atomic_set(&barrier_cbs_invoked, 0);
barrier_cbs_tasks =
- kcalloc(n_barrier_cbs, sizeof(barrier_cbs_tasks[0]),
- GFP_KERNEL);
+ kzalloc_objs(barrier_cbs_tasks[0], n_barrier_cbs);
barrier_cbs_wq =
- kcalloc(n_barrier_cbs, sizeof(barrier_cbs_wq[0]), GFP_KERNEL);
+ kzalloc_objs(barrier_cbs_wq[0], n_barrier_cbs);
if (barrier_cbs_tasks == NULL || !barrier_cbs_wq)
return -ENOMEM;
for (i = 0; i < n_barrier_cbs; i++) {
@@ -3546,6 +4017,57 @@ static int rcu_torture_preempt(void *unused)
static enum cpuhp_state rcutor_hp;
+static struct hrtimer gpwrap_lag_timer;
+static bool gpwrap_lag_active;
+
+/* Timer handler for toggling RCU grace-period sequence overflow test lag value */
+static enum hrtimer_restart rcu_gpwrap_lag_timer(struct hrtimer *timer)
+{
+ ktime_t next_delay;
+
+ if (gpwrap_lag_active) {
+ pr_alert("rcu-torture: Disabling gpwrap lag (value=0)\n");
+ cur_ops->set_gpwrap_lag(0);
+ gpwrap_lag_active = false;
+ next_delay = ktime_set((gpwrap_lag_cycle_mins - gpwrap_lag_active_mins) * 60, 0);
+ } else {
+ pr_alert("rcu-torture: Enabling gpwrap lag (value=%d)\n", gpwrap_lag_gps);
+ cur_ops->set_gpwrap_lag(gpwrap_lag_gps);
+ gpwrap_lag_active = true;
+ next_delay = ktime_set(gpwrap_lag_active_mins * 60, 0);
+ }
+
+ if (torture_must_stop_irq())
+ return HRTIMER_NORESTART;
+
+ hrtimer_forward_now(timer, next_delay);
+ return HRTIMER_RESTART;
+}
+
+static int rcu_gpwrap_lag_init(void)
+{
+ if (!gpwrap_lag)
+ return 0;
+
+ if (gpwrap_lag_cycle_mins <= 0 || gpwrap_lag_active_mins <= 0) {
+ pr_alert("rcu-torture: lag timing parameters must be positive\n");
+ return -EINVAL;
+ }
+
+ hrtimer_setup(&gpwrap_lag_timer, rcu_gpwrap_lag_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ gpwrap_lag_active = false;
+ hrtimer_start(&gpwrap_lag_timer,
+ ktime_set((gpwrap_lag_cycle_mins - gpwrap_lag_active_mins) * 60, 0), HRTIMER_MODE_REL);
+
+ return 0;
+}
+
+static void rcu_gpwrap_lag_cleanup(void)
+{
+ hrtimer_cancel(&gpwrap_lag_timer);
+ cur_ops->set_gpwrap_lag(0);
+ gpwrap_lag_active = false;
+}
static void
rcu_torture_cleanup(void)
{
@@ -3553,6 +4075,7 @@ rcu_torture_cleanup(void)
int flags = 0;
unsigned long gp_seq = 0;
int i;
+ int j;
if (torture_cleanup_begin()) {
if (cur_ops->cb_barrier != NULL) {
@@ -3586,6 +4109,10 @@ rcu_torture_cleanup(void)
nocb_tasks = NULL;
}
+ if (updown_task) {
+ torture_stop_kthread(rcu_torture_updown, updown_task);
+ updown_task = NULL;
+ }
if (reader_tasks) {
for (i = 0; i < nrealreaders; i++)
torture_stop_kthread(rcu_torture_reader,
@@ -3597,7 +4124,7 @@ rcu_torture_cleanup(void)
rcu_torture_reader_mbchk = NULL;
if (fakewriter_tasks) {
- for (i = 0; i < nfakewriters; i++)
+ for (i = 0; i < nrealfakewriters; i++)
torture_stop_kthread(rcu_torture_fakewriter,
fakewriter_tasks[i]);
kfree(fakewriter_tasks);
@@ -3635,7 +4162,11 @@ rcu_torture_cleanup(void)
pr_alert("\t: No segments recorded!!!\n");
firsttime = 1;
for (i = 0; i < rt_read_nsegs; i++) {
- pr_alert("\t%d: %#4x", i, err_segs[i].rt_readstate);
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP))
+ pr_alert("\t%lluus ", div64_u64(err_segs[i].rt_ts, 1000ULL));
+ else
+ pr_alert("\t");
+ pr_cont("%d: %#4x", i, err_segs[i].rt_readstate);
if (err_segs[i].rt_delay_jiffies != 0) {
pr_cont("%s%ldjiffies", firsttime ? "" : "+",
err_segs[i].rt_delay_jiffies);
@@ -3648,6 +4179,27 @@ rcu_torture_cleanup(void)
else
pr_cont(" ...");
}
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) &&
+ cur_ops->gather_gp_seqs && cur_ops->format_gp_seqs) {
+ char buf1[20+1];
+ char buf2[20+1];
+ char sepchar = '-';
+
+ cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq,
+ buf1, ARRAY_SIZE(buf1));
+ cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq_end,
+ buf2, ARRAY_SIZE(buf2));
+ if (err_segs[i].rt_gp_seq == err_segs[i].rt_gp_seq_end) {
+ if (buf2[0]) {
+ for (j = 0; buf2[j]; j++)
+ buf2[j] = '.';
+ if (j)
+ buf2[j - 1] = ' ';
+ }
+ sepchar = ' ';
+ }
+ pr_cont(" %s%c%s", buf1, sepchar, buf2);
+ }
if (err_segs[i].rt_delay_ms != 0) {
pr_cont(" %s%ldms", firsttime ? "" : "+",
err_segs[i].rt_delay_ms);
@@ -3689,6 +4241,9 @@ rcu_torture_cleanup(void)
torture_cleanup_end();
if (cur_ops->gp_slow_unregister)
cur_ops->gp_slow_unregister(NULL);
+
+ if (gpwrap_lag && cur_ops->set_gpwrap_lag)
+ rcu_gpwrap_lag_cleanup();
}
static void rcu_torture_leak_cb(struct rcu_head *rhp)
@@ -3730,7 +4285,7 @@ static void rcu_test_debug_objects(void)
(!cur_ops->call || !cur_ops->cb_barrier)))
return;
- struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
+ struct rcu_head *rhp = kmalloc_obj(*rhp);
init_rcu_head_on_stack(&rh1);
init_rcu_head_on_stack(&rh2);
@@ -3956,7 +4511,7 @@ rcu_torture_init(void)
static struct rcu_torture_ops *torture_ops[] = {
&rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops,
TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS
- &trivial_ops,
+ &trivial_ops, TRIVIAL_PREEMPT_OPS
};
if (!torture_init_begin(torture_type, verbose))
@@ -3994,6 +4549,14 @@ rcu_torture_init(void)
rcu_torture_init_srcu_lockdep();
+ if (nfakewriters >= 0) {
+ nrealfakewriters = nfakewriters;
+ } else {
+ nrealfakewriters = num_online_cpus() - 2 - nfakewriters;
+ if (nrealfakewriters <= 0)
+ nrealfakewriters = 1;
+ }
+
if (nreaders >= 0) {
nrealreaders = nreaders;
} else {
@@ -4046,30 +4609,24 @@ rcu_torture_init(void)
/* Start up the kthreads. */
rcu_torture_write_types();
- firsterr = torture_create_kthread(rcu_torture_writer, NULL,
- writer_task);
- if (torture_init_error(firsterr))
- goto unwind;
- if (nfakewriters > 0) {
- fakewriter_tasks = kcalloc(nfakewriters,
- sizeof(fakewriter_tasks[0]),
- GFP_KERNEL);
+ if (nrealfakewriters > 0) {
+ fakewriter_tasks = kzalloc_objs(fakewriter_tasks[0],
+ nrealfakewriters);
if (fakewriter_tasks == NULL) {
TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
}
- for (i = 0; i < nfakewriters; i++) {
+ for (i = 0; i < nrealfakewriters; i++) {
firsterr = torture_create_kthread(rcu_torture_fakewriter,
NULL, fakewriter_tasks[i]);
if (torture_init_error(firsterr))
goto unwind;
}
- reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]),
- GFP_KERNEL);
- rcu_torture_reader_mbchk = kcalloc(nrealreaders, sizeof(*rcu_torture_reader_mbchk),
- GFP_KERNEL);
+ reader_tasks = kzalloc_objs(reader_tasks[0], nrealreaders);
+ rcu_torture_reader_mbchk = kzalloc_objs(*rcu_torture_reader_mbchk,
+ nrealreaders);
if (!reader_tasks || !rcu_torture_reader_mbchk) {
TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
@@ -4082,13 +4639,22 @@ rcu_torture_init(void)
if (torture_init_error(firsterr))
goto unwind;
}
+
+ firsterr = torture_create_kthread(rcu_torture_writer, NULL,
+ writer_task);
+ if (torture_init_error(firsterr))
+ goto unwind;
+
+ firsterr = rcu_torture_updown_init();
+ if (torture_init_error(firsterr))
+ goto unwind;
nrealnocbers = nocbs_nthreads;
if (WARN_ON(nrealnocbers < 0))
nrealnocbers = 1;
if (WARN_ON(nocbs_toggle < 0))
nocbs_toggle = HZ;
if (nrealnocbers > 0) {
- nocb_tasks = kcalloc(nrealnocbers, sizeof(nocb_tasks[0]), GFP_KERNEL);
+ nocb_tasks = kzalloc_objs(nocb_tasks[0], nrealnocbers);
if (nocb_tasks == NULL) {
TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
@@ -4176,9 +4742,17 @@ rcu_torture_init(void)
}
if (object_debug)
rcu_test_debug_objects();
- torture_init_end();
+
if (cur_ops->gp_slow_register && !WARN_ON_ONCE(!cur_ops->gp_slow_unregister))
cur_ops->gp_slow_register(&rcu_fwd_cb_nodelay);
+
+ if (gpwrap_lag && cur_ops->set_gpwrap_lag) {
+ firsterr = rcu_gpwrap_lag_init();
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
+
+ torture_init_end();
return 0;
unwind:
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 1b47376acdc4..a2d9d75d88a1 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -85,22 +85,16 @@ torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0,
// Number of typesafe_lookup structures, that is, the degree of concurrency.
torture_param(long, lookup_instances, 0, "Number of typesafe_lookup structures.");
// Number of loops per experiment, all readers execute operations concurrently.
-torture_param(long, loops, 10000, "Number of loops per experiment.");
+torture_param(int, loops, 10000, "Number of loops per experiment.");
// Number of readers, with -1 defaulting to about 75% of the CPUs.
torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs.");
// Number of runs.
torture_param(int, nruns, 30, "Number of experiments to run.");
// Reader delay in nanoseconds, 0 for no delay.
torture_param(int, readdelay, 0, "Read-side delay in nanoseconds.");
-
-#ifdef MODULE
-# define REFSCALE_SHUTDOWN 0
-#else
-# define REFSCALE_SHUTDOWN 1
-#endif
-
-torture_param(bool, shutdown, REFSCALE_SHUTDOWN,
- "Shutdown at end of scalability tests.");
+// Maximum shutdown delay in seconds, or zero for no shutdown.
+torture_param(int, shutdown_secs, !IS_MODULE(CONFIG_REPRO_TEST) * 300,
+ "Shutdown at end of scalability tests or at specified timeout (s).");
struct reader_task {
struct task_struct *task;
@@ -109,12 +103,8 @@ struct reader_task {
u64 last_duration_ns;
};
-static struct task_struct *shutdown_task;
-static wait_queue_head_t shutdown_wq;
-
static struct task_struct *main_task;
static wait_queue_head_t main_wq;
-static int shutdown_start;
static struct reader_task *reader_tasks;
@@ -136,6 +126,7 @@ struct ref_scale_ops {
void (*cleanup)(void);
void (*readsection)(const int nloops);
void (*delaysection)(const int nloops, const int udl, const int ndl);
+ bool enable_irqs;
const char *name;
};
@@ -184,6 +175,8 @@ static const struct ref_scale_ops rcu_ops = {
// Definitions for SRCU ref scale testing.
DEFINE_STATIC_SRCU(srcu_refctl_scale);
+DEFINE_STATIC_SRCU_FAST(srcu_fast_refctl_scale);
+DEFINE_STATIC_SRCU_FAST_UPDOWN(srcu_fast_updown_refctl_scale);
static struct srcu_struct *srcu_ctlp = &srcu_refctl_scale;
static void srcu_ref_scale_read_section(const int nloops)
@@ -216,34 +209,76 @@ static const struct ref_scale_ops srcu_ops = {
.name = "srcu"
};
-static void srcu_lite_ref_scale_read_section(const int nloops)
+static bool srcu_fast_sync_scale_init(void)
+{
+ srcu_ctlp = &srcu_fast_refctl_scale;
+ return true;
+}
+
+static void srcu_fast_ref_scale_read_section(const int nloops)
{
int i;
- int idx;
+ struct srcu_ctr __percpu *scp;
for (i = nloops; i >= 0; i--) {
- idx = srcu_read_lock_lite(srcu_ctlp);
- srcu_read_unlock_lite(srcu_ctlp, idx);
+ scp = srcu_read_lock_fast(srcu_ctlp);
+ srcu_read_unlock_fast(srcu_ctlp, scp);
}
}
-static void srcu_lite_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+static void srcu_fast_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
{
int i;
- int idx;
+ struct srcu_ctr __percpu *scp;
for (i = nloops; i >= 0; i--) {
- idx = srcu_read_lock_lite(srcu_ctlp);
+ scp = srcu_read_lock_fast(srcu_ctlp);
un_delay(udl, ndl);
- srcu_read_unlock_lite(srcu_ctlp, idx);
+ srcu_read_unlock_fast(srcu_ctlp, scp);
}
}
-static const struct ref_scale_ops srcu_lite_ops = {
- .init = rcu_sync_scale_init,
- .readsection = srcu_lite_ref_scale_read_section,
- .delaysection = srcu_lite_ref_scale_delay_section,
- .name = "srcu-lite"
+static const struct ref_scale_ops srcu_fast_ops = {
+ .init = srcu_fast_sync_scale_init,
+ .readsection = srcu_fast_ref_scale_read_section,
+ .delaysection = srcu_fast_ref_scale_delay_section,
+ .name = "srcu-fast"
+};
+
+static bool srcu_fast_updown_sync_scale_init(void)
+{
+ srcu_ctlp = &srcu_fast_updown_refctl_scale;
+ return true;
+}
+
+static void srcu_fast_updown_ref_scale_read_section(const int nloops)
+{
+ int i;
+ struct srcu_ctr __percpu *scp;
+
+ for (i = nloops; i >= 0; i--) {
+ scp = srcu_read_lock_fast_updown(srcu_ctlp);
+ srcu_read_unlock_fast_updown(srcu_ctlp, scp);
+ }
+}
+
+static void srcu_fast_updown_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+ struct srcu_ctr __percpu *scp;
+
+ for (i = nloops; i >= 0; i--) {
+ scp = srcu_read_lock_fast_updown(srcu_ctlp);
+ un_delay(udl, ndl);
+ srcu_read_unlock_fast_updown(srcu_ctlp, scp);
+ }
+}
+
+static const struct ref_scale_ops srcu_fast_updown_ops = {
+ .init = srcu_fast_updown_sync_scale_init,
+ .readsection = srcu_fast_updown_ref_scale_read_section,
+ .delaysection = srcu_fast_updown_ref_scale_delay_section,
+ .name = "srcu-fast-updown"
};
#ifdef CONFIG_TASKS_RCU
@@ -323,6 +358,9 @@ static const struct ref_scale_ops rcu_trace_ops = {
// Definitions for reference count
static atomic_t refcnt;
+// Definitions acquire-release.
+static DEFINE_PER_CPU(unsigned long, test_acqrel);
+
static void ref_refcnt_section(const int nloops)
{
int i;
@@ -351,6 +389,184 @@ static const struct ref_scale_ops refcnt_ops = {
.name = "refcnt"
};
+static void ref_percpuinc_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ this_cpu_inc(test_acqrel);
+ this_cpu_dec(test_acqrel);
+ }
+}
+
+static void ref_percpuinc_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ this_cpu_inc(test_acqrel);
+ un_delay(udl, ndl);
+ this_cpu_dec(test_acqrel);
+ }
+}
+
+static const struct ref_scale_ops percpuinc_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = ref_percpuinc_section,
+ .delaysection = ref_percpuinc_delay_section,
+ .name = "percpuinc"
+};
+
+// Note that this can lose counts in preemptible kernels.
+static void ref_incpercpu_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ unsigned long *tap = this_cpu_ptr(&test_acqrel);
+
+ WRITE_ONCE(*tap, READ_ONCE(*tap) + 1);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) - 1);
+ }
+}
+
+static void ref_incpercpu_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ unsigned long *tap = this_cpu_ptr(&test_acqrel);
+
+ WRITE_ONCE(*tap, READ_ONCE(*tap) + 1);
+ un_delay(udl, ndl);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) - 1);
+ }
+}
+
+static const struct ref_scale_ops incpercpu_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = ref_incpercpu_section,
+ .delaysection = ref_incpercpu_delay_section,
+ .name = "incpercpu"
+};
+
+static void ref_incpercpupreempt_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ unsigned long *tap;
+
+ preempt_disable();
+ tap = this_cpu_ptr(&test_acqrel);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) + 1);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) - 1);
+ preempt_enable();
+ }
+}
+
+static void ref_incpercpupreempt_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ unsigned long *tap;
+
+ preempt_disable();
+ tap = this_cpu_ptr(&test_acqrel);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) + 1);
+ un_delay(udl, ndl);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) - 1);
+ preempt_enable();
+ }
+}
+
+static const struct ref_scale_ops incpercpupreempt_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = ref_incpercpupreempt_section,
+ .delaysection = ref_incpercpupreempt_delay_section,
+ .name = "incpercpupreempt"
+};
+
+static void ref_incpercpubh_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ unsigned long *tap;
+
+ local_bh_disable();
+ tap = this_cpu_ptr(&test_acqrel);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) + 1);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) - 1);
+ local_bh_enable();
+ }
+}
+
+static void ref_incpercpubh_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ unsigned long *tap;
+
+ local_bh_disable();
+ tap = this_cpu_ptr(&test_acqrel);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) + 1);
+ un_delay(udl, ndl);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) - 1);
+ local_bh_enable();
+ }
+}
+
+static const struct ref_scale_ops incpercpubh_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = ref_incpercpubh_section,
+ .delaysection = ref_incpercpubh_delay_section,
+ .enable_irqs = true,
+ .name = "incpercpubh"
+};
+
+static void ref_incpercpuirqsave_section(const int nloops)
+{
+ int i;
+ unsigned long flags;
+
+ for (i = nloops; i >= 0; i--) {
+ unsigned long *tap;
+
+ local_irq_save(flags);
+ tap = this_cpu_ptr(&test_acqrel);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) + 1);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) - 1);
+ local_irq_restore(flags);
+ }
+}
+
+static void ref_incpercpuirqsave_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+ unsigned long flags;
+
+ for (i = nloops; i >= 0; i--) {
+ unsigned long *tap;
+
+ local_irq_save(flags);
+ tap = this_cpu_ptr(&test_acqrel);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) + 1);
+ un_delay(udl, ndl);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) - 1);
+ local_irq_restore(flags);
+ }
+}
+
+static const struct ref_scale_ops incpercpuirqsave_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = ref_incpercpuirqsave_section,
+ .delaysection = ref_incpercpuirqsave_delay_section,
+ .name = "incpercpuirqsave"
+};
+
// Definitions for rwlock
static rwlock_t test_rwlock;
@@ -494,9 +710,6 @@ static const struct ref_scale_ops lock_irq_ops = {
.name = "lock-irq"
};
-// Definitions acquire-release.
-static DEFINE_PER_CPU(unsigned long, test_acqrel);
-
static void ref_acqrel_section(const int nloops)
{
unsigned long x;
@@ -629,6 +842,133 @@ static const struct ref_scale_ops jiffies_ops = {
.name = "jiffies"
};
+static void ref_preempt_section(const int nloops)
+{
+ int i;
+
+ migrate_disable();
+ for (i = nloops; i >= 0; i--) {
+ preempt_disable();
+ preempt_enable();
+ }
+ migrate_enable();
+}
+
+static void ref_preempt_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ migrate_disable();
+ for (i = nloops; i >= 0; i--) {
+ preempt_disable();
+ un_delay(udl, ndl);
+ preempt_enable();
+ }
+ migrate_enable();
+}
+
+static const struct ref_scale_ops preempt_ops = {
+ .readsection = ref_preempt_section,
+ .delaysection = ref_preempt_delay_section,
+ .name = "preempt"
+};
+
+static void ref_bh_section(const int nloops)
+{
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ local_bh_disable();
+ local_bh_enable();
+ }
+ preempt_enable();
+}
+
+static void ref_bh_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ local_bh_disable();
+ un_delay(udl, ndl);
+ local_bh_enable();
+ }
+ preempt_enable();
+}
+
+static const struct ref_scale_ops bh_ops = {
+ .readsection = ref_bh_section,
+ .delaysection = ref_bh_delay_section,
+ .enable_irqs = true,
+ .name = "bh"
+};
+
+static void ref_irq_section(const int nloops)
+{
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ local_irq_disable();
+ local_irq_enable();
+ }
+ preempt_enable();
+}
+
+static void ref_irq_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ local_irq_disable();
+ un_delay(udl, ndl);
+ local_irq_enable();
+ }
+ preempt_enable();
+}
+
+static const struct ref_scale_ops irq_ops = {
+ .readsection = ref_irq_section,
+ .delaysection = ref_irq_delay_section,
+ .name = "irq"
+};
+
+static void ref_irqsave_section(const int nloops)
+{
+ unsigned long flags;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ local_irq_save(flags);
+ local_irq_restore(flags);
+ }
+ preempt_enable();
+}
+
+static void ref_irqsave_delay_section(const int nloops, const int udl, const int ndl)
+{
+ unsigned long flags;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ local_irq_save(flags);
+ un_delay(udl, ndl);
+ local_irq_restore(flags);
+ }
+ preempt_enable();
+}
+
+static const struct ref_scale_ops irqsave_ops = {
+ .readsection = ref_irqsave_section,
+ .delaysection = ref_irqsave_delay_section,
+ .name = "irqsave"
+};
+
////////////////////////////////////////////////////////////////////////
//
// Methods leveraging SLAB_TYPESAFE_BY_RCU.
@@ -793,7 +1133,7 @@ static bool typesafe_init(void)
else if (si == 0)
si = nr_cpu_ids;
rtsarray_size = si;
- rtsarray = kcalloc(si, sizeof(*rtsarray), GFP_KERNEL);
+ rtsarray = kzalloc_objs(*rtsarray, si);
if (!rtsarray)
return false;
for (idx = 0; idx < rtsarray_size; idx++) {
@@ -924,15 +1264,18 @@ repeat:
if (!atomic_dec_return(&n_warmedup))
while (atomic_read_acquire(&n_warmedup))
rcu_scale_one_reader();
- // Also keep interrupts disabled. This also has the effect
- // of preventing entries into slow path for rcu_read_unlock().
- local_irq_save(flags);
+ // Also keep interrupts disabled when it is safe to do so, which
+ // it is not for local_bh_enable(). This also has the effect of
+ // preventing entries into slow path for rcu_read_unlock().
+ if (!cur_ops->enable_irqs)
+ local_irq_save(flags);
start = ktime_get_mono_fast_ns();
rcu_scale_one_reader();
duration = ktime_get_mono_fast_ns() - start;
- local_irq_restore(flags);
+ if (!cur_ops->enable_irqs)
+ local_irq_restore(flags);
rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration;
// To reduce runtime-skew noise, do maintain-load invocations until
@@ -1004,6 +1347,8 @@ static u64 process_durations(int n)
return sum;
}
+static void ref_scale_cleanup(void);
+
// The main_func is the main orchestrator, it performs a bunch of
// experiments. For every experiment, it orders all the readers
// involved to start and waits for them to finish the experiment. It
@@ -1021,7 +1366,7 @@ static int main_func(void *arg)
set_user_nice(current, MAX_NICE);
VERBOSE_SCALEOUT("main_func task started");
- result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL);
+ result_avg = kcalloc(nruns, sizeof(*result_avg), GFP_KERNEL);
buf = kzalloc(800 + 64, GFP_KERNEL);
if (!result_avg || !buf) {
SCALEOUT_ERRSTRING("out of memory");
@@ -1090,9 +1435,10 @@ static int main_func(void *arg)
oom_exit:
// This will shutdown everything including us.
- if (shutdown) {
- shutdown_start = 1;
- wake_up(&shutdown_wq);
+ if (shutdown_secs) {
+ main_task = NULL; // Avoid self-kill deadlock.
+ ref_scale_cleanup();
+ kernel_power_off();
}
// Wait for torture to stop us
@@ -1110,8 +1456,8 @@ static void
ref_scale_print_module_parms(const struct ref_scale_ops *cur_ops, const char *tag)
{
pr_alert("%s" SCALE_FLAG
- "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
- verbose, verbose_batched, shutdown, holdoff, lookup_instances, loops, nreaders, nruns, readdelay);
+ "--- %s: verbose=%d verbose_batched=%d shutdown_secs=%d holdoff=%d lookup_instances=%ld loops=%d nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
+ verbose, verbose_batched, shutdown_secs, holdoff, lookup_instances, loops, nreaders, nruns, readdelay);
}
static void
@@ -1133,9 +1479,9 @@ ref_scale_cleanup(void)
reader_tasks[i].task);
}
kfree(reader_tasks);
+ reader_tasks = NULL;
torture_stop_kthread("main_task", main_task);
- kfree(main_task);
// Do scale-type-specific cleanup operations.
if (cur_ops->cleanup != NULL)
@@ -1144,28 +1490,19 @@ ref_scale_cleanup(void)
torture_cleanup_end();
}
-// Shutdown kthread. Just waits to be awakened, then shuts down system.
-static int
-ref_scale_shutdown(void *arg)
-{
- wait_event_idle(shutdown_wq, shutdown_start);
-
- smp_mb(); // Wake before output.
- ref_scale_cleanup();
- kernel_power_off();
-
- return -EINVAL;
-}
-
static int __init
ref_scale_init(void)
{
long i;
int firsterr = 0;
static const struct ref_scale_ops *scale_ops[] = {
- &rcu_ops, &srcu_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS
- &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops,
- &acqrel_ops, &sched_clock_ops, &clock_ops, &jiffies_ops,
+ &rcu_ops, &srcu_ops, &srcu_fast_ops, &srcu_fast_updown_ops,
+ RCU_TRACE_OPS RCU_TASKS_OPS
+ &refcnt_ops, &percpuinc_ops, &incpercpu_ops, &incpercpupreempt_ops,
+ &incpercpubh_ops, &incpercpuirqsave_ops,
+ &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops,
+ &sched_clock_ops, &clock_ops, &jiffies_ops,
+ &preempt_ops, &bh_ops, &irq_ops, &irqsave_ops,
&typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops,
};
@@ -1196,26 +1533,26 @@ ref_scale_init(void)
ref_scale_print_module_parms(cur_ops, "Start of test");
// Shutdown task
- if (shutdown) {
- init_waitqueue_head(&shutdown_wq);
- firsterr = torture_create_kthread(ref_scale_shutdown, NULL,
- shutdown_task);
+ if (shutdown_secs) {
+ firsterr = torture_shutdown_init(shutdown_secs, ref_scale_cleanup);
if (torture_init_error(firsterr))
goto unwind;
- schedule_timeout_uninterruptible(1);
}
// Reader tasks (default to ~75% of online CPUs).
if (nreaders < 0)
nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2);
- if (WARN_ONCE(loops <= 0, "%s: loops = %ld, adjusted to 1\n", __func__, loops))
+ if (WARN_ONCE(loops <= 0, "%s: loops = %d, adjusted to 1\n", __func__, loops))
loops = 1;
if (WARN_ONCE(nreaders <= 0, "%s: nreaders = %d, adjusted to 1\n", __func__, nreaders))
nreaders = 1;
if (WARN_ONCE(nruns <= 0, "%s: nruns = %d, adjusted to 1\n", __func__, nruns))
nruns = 1;
- reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]),
- GFP_KERNEL);
+ if (WARN_ONCE(loops > INT_MAX / nreaders,
+ "%s: nreaders * loops will overflow, adjusted loops to %d",
+ __func__, INT_MAX / nreaders))
+ loops = INT_MAX / nreaders;
+ reader_tasks = kzalloc_objs(reader_tasks[0], nreaders);
if (!reader_tasks) {
SCALEOUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
@@ -1244,7 +1581,7 @@ ref_scale_init(void)
unwind:
torture_init_end();
ref_scale_cleanup();
- if (shutdown) {
+ if (shutdown_secs) {
WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST));
kernel_power_off();
}
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 4dcbf8aa80ff..a2e2d516e51b 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -9,6 +9,7 @@
*/
#include <linux/export.h>
+#include <linux/irq_work.h>
#include <linux/mutex.h>
#include <linux/preempt.h>
#include <linux/rcupdate_wait.h>
@@ -20,7 +21,11 @@
#include "rcu_segcblist.h"
#include "rcu.h"
+#ifndef CONFIG_TREE_RCU
int rcu_scheduler_active __read_mostly;
+#else // #ifndef CONFIG_TREE_RCU
+extern int rcu_scheduler_active;
+#endif // #else // #ifndef CONFIG_TREE_RCU
static LIST_HEAD(srcu_boot_list);
static bool srcu_init_done;
@@ -37,6 +42,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp)
ssp->srcu_idx_max = 0;
INIT_WORK(&ssp->srcu_work, srcu_drive_gp);
INIT_LIST_HEAD(&ssp->srcu_work.entry);
+ init_irq_work(&ssp->srcu_irq_work, srcu_tiny_irq_work);
return 0;
}
@@ -80,6 +86,7 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
void cleanup_srcu_struct(struct srcu_struct *ssp)
{
WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]);
+ irq_work_sync(&ssp->srcu_irq_work);
flush_work(&ssp->srcu_work);
WARN_ON(ssp->srcu_gp_running);
WARN_ON(ssp->srcu_gp_waiting);
@@ -98,19 +105,19 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
{
int newval;
- preempt_disable(); // Needed for PREEMPT_AUTO
+ preempt_disable(); // Needed for PREEMPT_LAZY
newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval);
preempt_enable();
- if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task())
+ if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task() && !irqs_disabled())
swake_up_one(&ssp->srcu_wq);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
/*
* Workqueue handler to drive one grace period and invoke any callbacks
- * that become ready as a result. Single-CPU and !PREEMPTION operation
- * means that we get away with murder on synchronization. ;-)
+ * that become ready as a result. Single-CPU operation and preemption
+ * disabling mean that we get away with murder on synchronization. ;-)
*/
void srcu_drive_gp(struct work_struct *wp)
{
@@ -120,7 +127,7 @@ void srcu_drive_gp(struct work_struct *wp)
struct srcu_struct *ssp;
ssp = container_of(wp, struct srcu_struct, srcu_work);
- preempt_disable(); // Needed for PREEMPT_AUTO
+ preempt_disable(); // Needed for PREEMPT_LAZY
if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) {
preempt_enable();
return; /* Already running or nothing to do. */
@@ -137,8 +144,13 @@ void srcu_drive_gp(struct work_struct *wp)
WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
preempt_enable();
- swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx]));
- preempt_disable(); // Needed for PREEMPT_AUTO
+ do {
+ // Deadlock issues prevent __srcu_read_unlock() from
+ // doing an unconditional wakeup, so polling is required.
+ swait_event_timeout_exclusive(ssp->srcu_wq,
+ !READ_ONCE(ssp->srcu_lock_nesting[idx]), HZ / 10);
+ } while (READ_ONCE(ssp->srcu_lock_nesting[idx]));
+ preempt_disable(); // Needed for PREEMPT_LAZY
WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
preempt_enable();
@@ -159,7 +171,7 @@ void srcu_drive_gp(struct work_struct *wp)
* at interrupt level, but the ->srcu_gp_running checks will
* straighten that out.
*/
- preempt_disable(); // Needed for PREEMPT_AUTO
+ preempt_disable(); // Needed for PREEMPT_LAZY
WRITE_ONCE(ssp->srcu_gp_running, false);
idx = ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max));
preempt_enable();
@@ -168,24 +180,36 @@ void srcu_drive_gp(struct work_struct *wp)
}
EXPORT_SYMBOL_GPL(srcu_drive_gp);
+/*
+ * Use an irq_work to defer schedule_work() to avoid acquiring the workqueue
+ * pool->lock while the caller might hold scheduler locks, causing lockdep
+ * splats due to workqueue_init() doing a wakeup.
+ */
+void srcu_tiny_irq_work(struct irq_work *irq_work)
+{
+ struct srcu_struct *ssp;
+
+ ssp = container_of(irq_work, struct srcu_struct, srcu_irq_work);
+ schedule_work(&ssp->srcu_work);
+}
+EXPORT_SYMBOL_GPL(srcu_tiny_irq_work);
+
static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
{
unsigned long cookie;
- preempt_disable(); // Needed for PREEMPT_AUTO
+ lockdep_assert_preemption_disabled(); // Needed for PREEMPT_LAZY
cookie = get_state_synchronize_srcu(ssp);
if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) {
- preempt_enable();
return;
}
WRITE_ONCE(ssp->srcu_idx_max, cookie);
if (!READ_ONCE(ssp->srcu_gp_running)) {
if (likely(srcu_init_done))
- schedule_work(&ssp->srcu_work);
+ irq_work_queue(&ssp->srcu_irq_work);
else if (list_empty(&ssp->srcu_work.entry))
list_add(&ssp->srcu_work.entry, &srcu_boot_list);
}
- preempt_enable();
}
/*
@@ -199,7 +223,7 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
rhp->func = func;
rhp->next = NULL;
- preempt_disable(); // Needed for PREEMPT_AUTO
+ preempt_disable(); // Needed for PREEMPT_LAZY
local_irq_save(flags);
*ssp->srcu_cb_tail = rhp;
ssp->srcu_cb_tail = &rhp->next;
@@ -261,7 +285,7 @@ unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
{
unsigned long ret;
- preempt_disable(); // Needed for PREEMPT_AUTO
+ preempt_disable(); // Needed for PREEMPT_LAZY
ret = get_state_synchronize_srcu(ssp);
srcu_gp_start_if_needed(ssp);
preempt_enable();
@@ -282,11 +306,13 @@ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
}
EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
+#ifndef CONFIG_TREE_RCU
/* Lockdep diagnostics. */
void __init rcu_scheduler_starting(void)
{
rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
}
+#endif // #ifndef CONFIG_TREE_RCU
/*
* Queue work for srcu_struct structures with early boot callbacks.
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index b83c74c4dcc0..7c2f7cc131f7 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -19,6 +19,7 @@
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
+#include <linux/irq_work.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched.h>
#include <linux/smp.h>
@@ -75,49 +76,15 @@ static bool __read_mostly srcu_init_done;
static void srcu_invoke_callbacks(struct work_struct *work);
static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay);
static void process_srcu(struct work_struct *work);
+static void srcu_irq_work(struct irq_work *work);
static void srcu_delay_timer(struct timer_list *t);
-/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */
-#define spin_lock_rcu_node(p) \
-do { \
- spin_lock(&ACCESS_PRIVATE(p, lock)); \
- smp_mb__after_unlock_lock(); \
-} while (0)
-
-#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock))
-
-#define spin_lock_irq_rcu_node(p) \
-do { \
- spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
- smp_mb__after_unlock_lock(); \
-} while (0)
-
-#define spin_unlock_irq_rcu_node(p) \
- spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
-
-#define spin_lock_irqsave_rcu_node(p, flags) \
-do { \
- spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
- smp_mb__after_unlock_lock(); \
-} while (0)
-
-#define spin_trylock_irqsave_rcu_node(p, flags) \
-({ \
- bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
- \
- if (___locked) \
- smp_mb__after_unlock_lock(); \
- ___locked; \
-})
-
-#define spin_unlock_irqrestore_rcu_node(p, flags) \
- spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
-
/*
* Initialize SRCU per-CPU data. Note that statically allocated
* srcu_struct structures might already have srcu_read_lock() and
- * srcu_read_unlock() running against them. So if the is_static parameter
- * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
+ * srcu_read_unlock() running against them. So if the is_static
+ * parameter is set, don't initialize ->srcu_ctrs[].srcu_locks and
+ * ->srcu_ctrs[].srcu_unlocks.
*/
static void init_srcu_struct_data(struct srcu_struct *ssp)
{
@@ -128,11 +95,9 @@ static void init_srcu_struct_data(struct srcu_struct *ssp)
* Initialize the per-CPU srcu_data array, which feeds into the
* leaves of the srcu_node tree.
*/
- BUILD_BUG_ON(ARRAY_SIZE(sdp->srcu_lock_count) !=
- ARRAY_SIZE(sdp->srcu_unlock_count));
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(ssp->sda, cpu);
- spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
+ raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
rcu_segcblist_init(&sdp->srcu_cblist);
sdp->srcu_cblist_invoking = false;
sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq;
@@ -174,7 +139,8 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
/* Initialize geometry if it has not already been initialized. */
rcu_init_geometry();
- ssp->srcu_sup->node = kcalloc(rcu_num_nodes, sizeof(*ssp->srcu_sup->node), gfp_flags);
+ ssp->srcu_sup->node = kzalloc_objs(*ssp->srcu_sup->node, rcu_num_nodes,
+ gfp_flags);
if (!ssp->srcu_sup->node)
return false;
@@ -186,7 +152,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
/* Each pass through this loop initializes one srcu_node structure. */
srcu_for_each_node_breadth_first(ssp, snp) {
- spin_lock_init(&ACCESS_PRIVATE(snp, lock));
+ raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock));
BUILD_BUG_ON(ARRAY_SIZE(snp->srcu_have_cbs) !=
ARRAY_SIZE(snp->srcu_data_have_cbs));
for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
@@ -238,33 +204,38 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
{
if (!is_static)
- ssp->srcu_sup = kzalloc(sizeof(*ssp->srcu_sup), GFP_KERNEL);
+ ssp->srcu_sup = kzalloc_obj(*ssp->srcu_sup);
if (!ssp->srcu_sup)
return -ENOMEM;
if (!is_static)
- spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ raw_spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL;
ssp->srcu_sup->node = NULL;
mutex_init(&ssp->srcu_sup->srcu_cb_mutex);
mutex_init(&ssp->srcu_sup->srcu_gp_mutex);
- ssp->srcu_idx = 0;
ssp->srcu_sup->srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL;
ssp->srcu_sup->srcu_barrier_seq = 0;
mutex_init(&ssp->srcu_sup->srcu_barrier_mutex);
atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0);
INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu);
+ init_irq_work(&ssp->srcu_sup->irq_work, srcu_irq_work);
ssp->srcu_sup->sda_is_static = is_static;
- if (!is_static)
+ if (!is_static) {
ssp->sda = alloc_percpu(struct srcu_data);
+ ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0];
+ }
if (!ssp->sda)
goto err_free_sup;
init_srcu_struct_data(ssp);
ssp->srcu_sup->srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL;
ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns();
if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
- if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC))
+ if (!preemptible())
+ WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_ALLOC);
+ else if (init_srcu_struct_nodes(ssp, GFP_KERNEL))
+ WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
+ else
goto err_free_sda;
- WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
}
ssp->srcu_sup->srcu_ssp = ssp;
smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed,
@@ -286,32 +257,92 @@ err_free_sup:
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
- struct lock_class_key *key)
+static int
+__init_srcu_struct_common(struct srcu_struct *ssp, const char *name, struct lock_class_key *key)
{
/* Don't re-initialize a lock while it is held. */
debug_check_no_locks_freed((void *)ssp, sizeof(*ssp));
lockdep_init_map(&ssp->dep_map, name, key, 0);
return init_srcu_struct_fields(ssp, false);
}
+
+int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key)
+{
+ ssp->srcu_reader_flavor = 0;
+ return __init_srcu_struct_common(ssp, name, key);
+}
EXPORT_SYMBOL_GPL(__init_srcu_struct);
+int __init_srcu_struct_fast(struct srcu_struct *ssp, const char *name, struct lock_class_key *key)
+{
+ ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST;
+ return __init_srcu_struct_common(ssp, name, key);
+}
+EXPORT_SYMBOL_GPL(__init_srcu_struct_fast);
+
+int __init_srcu_struct_fast_updown(struct srcu_struct *ssp, const char *name,
+ struct lock_class_key *key)
+{
+ ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST_UPDOWN;
+ return __init_srcu_struct_common(ssp, name, key);
+}
+EXPORT_SYMBOL_GPL(__init_srcu_struct_fast_updown);
+
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/**
* init_srcu_struct - initialize a sleep-RCU structure
* @ssp: structure to initialize.
*
- * Must invoke this on a given srcu_struct before passing that srcu_struct
+ * Use this in place of DEFINE_SRCU() and DEFINE_STATIC_SRCU()
+ * for non-static srcu_struct structures that are to be passed to
+ * srcu_read_lock(), srcu_read_lock_nmisafe(), and friends. It is necessary
+ * to invoke this on a given srcu_struct before passing that srcu_struct
* to any other function. Each srcu_struct represents a separate domain
* of SRCU protection.
*/
int init_srcu_struct(struct srcu_struct *ssp)
{
+ ssp->srcu_reader_flavor = 0;
return init_srcu_struct_fields(ssp, false);
}
EXPORT_SYMBOL_GPL(init_srcu_struct);
+/**
+ * init_srcu_struct_fast - initialize a fast-reader sleep-RCU structure
+ * @ssp: structure to initialize.
+ *
+ * Use this in place of DEFINE_SRCU_FAST() and DEFINE_STATIC_SRCU_FAST()
+ * for non-static srcu_struct structures that are to be passed to
+ * srcu_read_lock_fast() and friends. It is necessary to invoke this on a
+ * given srcu_struct before passing that srcu_struct to any other function.
+ * Each srcu_struct represents a separate domain of SRCU protection.
+ */
+int init_srcu_struct_fast(struct srcu_struct *ssp)
+{
+ ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST;
+ return init_srcu_struct_fields(ssp, false);
+}
+EXPORT_SYMBOL_GPL(init_srcu_struct_fast);
+
+/**
+ * init_srcu_struct_fast_updown - initialize a fast-reader up/down sleep-RCU structure
+ * @ssp: structure to initialize.
+ *
+ * Use this function in place of DEFINE_SRCU_FAST_UPDOWN() and
+ * DEFINE_STATIC_SRCU_FAST_UPDOWN() for non-static srcu_struct
+ * structures that are to be passed to srcu_read_lock_fast_updown(),
+ * srcu_down_read_fast(), and friends. It is necessary to invoke this on a
+ * given srcu_struct before passing that srcu_struct to any other function.
+ * Each srcu_struct represents a separate domain of SRCU protection.
+ */
+int init_srcu_struct_fast_updown(struct srcu_struct *ssp)
+{
+ ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST_UPDOWN;
+ return init_srcu_struct_fields(ssp, false);
+}
+EXPORT_SYMBOL_GPL(init_srcu_struct_fast_updown);
+
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/*
@@ -333,20 +364,20 @@ static void srcu_transition_to_big(struct srcu_struct *ssp)
/* Double-checked locking on ->srcu_size-state. */
if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL)
return;
- spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
+ raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) {
- spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+ raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
return;
}
__srcu_transition_to_big(ssp);
- spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+ raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
}
/*
* Check to see if the just-encountered contention event justifies
* a transition to SRCU_SIZE_BIG.
*/
-static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp)
+static void raw_spin_lock_irqsave_check_contention(struct srcu_struct *ssp)
{
unsigned long j;
@@ -368,16 +399,16 @@ static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp)
* to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module
* parameter permits this.
*/
-static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags)
+static void raw_spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags)
{
struct srcu_struct *ssp = sdp->ssp;
- if (spin_trylock_irqsave_rcu_node(sdp, *flags))
+ if (raw_spin_trylock_irqsave_rcu_node(sdp, *flags))
return;
- spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
- spin_lock_irqsave_check_contention(ssp);
- spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags);
- spin_lock_irqsave_rcu_node(sdp, *flags);
+ raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
+ raw_spin_lock_irqsave_check_contention(ssp);
+ raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags);
+ raw_spin_lock_irqsave_rcu_node(sdp, *flags);
}
/*
@@ -386,12 +417,12 @@ static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned lon
* to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module
* parameter permits this.
*/
-static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags)
+static void raw_spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags)
{
- if (spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags))
+ if (raw_spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags))
return;
- spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
- spin_lock_irqsave_check_contention(ssp);
+ raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
+ raw_spin_lock_irqsave_check_contention(ssp);
}
/*
@@ -409,13 +440,13 @@ static void check_init_srcu_struct(struct srcu_struct *ssp)
/* The smp_load_acquire() pairs with the smp_store_release(). */
if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed))) /*^^^*/
return; /* Already initialized. */
- spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
+ raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq_needed)) {
- spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+ raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
return;
}
init_srcu_struct_fields(ssp, true);
- spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+ raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
}
/*
@@ -429,10 +460,10 @@ static bool srcu_gp_is_expedited(struct srcu_struct *ssp)
}
/*
- * Computes approximate total of the readers' ->srcu_lock_count[] values
- * for the rank of per-CPU counters specified by idx, and returns true if
- * the caller did the proper barrier (gp), and if the count of the locks
- * matches that of the unlocks passed in.
+ * Computes approximate total of the readers' ->srcu_ctrs[].srcu_locks
+ * values for the rank of per-CPU counters specified by idx, and returns
+ * true if the caller did the proper barrier (gp), and if the count of
+ * the locks matches that of the unlocks passed in.
*/
static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, unsigned long unlocks)
{
@@ -443,31 +474,31 @@ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, uns
for_each_possible_cpu(cpu) {
struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
- sum += atomic_long_read(&sdp->srcu_lock_count[idx]);
+ sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks);
if (IS_ENABLED(CONFIG_PROVE_RCU))
mask = mask | READ_ONCE(sdp->srcu_reader_flavor);
}
WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)),
"Mixed reader flavors for srcu_struct at %ps.\n", ssp);
- if (mask & SRCU_READ_FLAVOR_LITE && !gp)
+ if (mask & SRCU_READ_FLAVOR_SLOWGP && !gp)
return false;
return sum == unlocks;
}
/*
- * Returns approximate total of the readers' ->srcu_unlock_count[] values
- * for the rank of per-CPU counters specified by idx.
+ * Returns approximate total of the readers' ->srcu_ctrs[].srcu_unlocks
+ * values for the rank of per-CPU counters specified by idx.
*/
static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, unsigned long *rdm)
{
int cpu;
- unsigned long mask = 0;
+ unsigned long mask = ssp->srcu_reader_flavor;
unsigned long sum = 0;
for_each_possible_cpu(cpu) {
struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
- sum += atomic_long_read(&sdp->srcu_unlock_count[idx]);
+ sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks);
mask = mask | READ_ONCE(sdp->srcu_reader_flavor);
}
WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)),
@@ -487,7 +518,7 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
unsigned long unlocks;
unlocks = srcu_readers_unlock_idx(ssp, idx, &rdm);
- did_gp = !!(rdm & SRCU_READ_FLAVOR_LITE);
+ did_gp = !!(rdm & SRCU_READ_FLAVOR_SLOWGP);
/*
* Make sure that a lock is always counted if the corresponding
@@ -502,6 +533,8 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
*/
if (!did_gp)
smp_mb(); /* A */
+ else if (srcu_gp_is_expedited(ssp))
+ synchronize_rcu_expedited(); /* X */
else
synchronize_rcu(); /* X */
@@ -509,48 +542,49 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
* If the locks are the same as the unlocks, then there must have
* been no readers on this index at some point in this function.
* But there might be more readers, as a task might have read
- * the current ->srcu_idx but not yet have incremented its CPU's
- * ->srcu_lock_count[idx] counter. In fact, it is possible
+ * the current ->srcu_ctrp but not yet have incremented its CPU's
+ * ->srcu_ctrs[idx].srcu_locks counter. In fact, it is possible
* that most of the tasks have been preempted between fetching
- * ->srcu_idx and incrementing ->srcu_lock_count[idx]. And there
- * could be almost (ULONG_MAX / sizeof(struct task_struct)) tasks
- * in a system whose address space was fully populated with memory.
- * Call this quantity Nt.
+ * ->srcu_ctrp and incrementing ->srcu_ctrs[idx].srcu_locks. And
+ * there could be almost (ULONG_MAX / sizeof(struct task_struct))
+ * tasks in a system whose address space was fully populated
+ * with memory. Call this quantity Nt.
*
- * So suppose that the updater is preempted at this point in the
- * code for a long time. That now-preempted updater has already
- * flipped ->srcu_idx (possibly during the preceding grace period),
- * done an smp_mb() (again, possibly during the preceding grace
- * period), and summed up the ->srcu_unlock_count[idx] counters.
- * How many times can a given one of the aforementioned Nt tasks
- * increment the old ->srcu_idx value's ->srcu_lock_count[idx]
- * counter, in the absence of nesting?
+ * So suppose that the updater is preempted at this
+ * point in the code for a long time. That now-preempted
+ * updater has already flipped ->srcu_ctrp (possibly during
+ * the preceding grace period), done an smp_mb() (again,
+ * possibly during the preceding grace period), and summed up
+ * the ->srcu_ctrs[idx].srcu_unlocks counters. How many times
+ * can a given one of the aforementioned Nt tasks increment the
+ * old ->srcu_ctrp value's ->srcu_ctrs[idx].srcu_locks counter,
+ * in the absence of nesting?
*
* It can clearly do so once, given that it has already fetched
- * the old value of ->srcu_idx and is just about to use that value
- * to index its increment of ->srcu_lock_count[idx]. But as soon as
- * it leaves that SRCU read-side critical section, it will increment
- * ->srcu_unlock_count[idx], which must follow the updater's above
- * read from that same value. Thus, as soon the reading task does
- * an smp_mb() and a later fetch from ->srcu_idx, that task will be
- * guaranteed to get the new index. Except that the increment of
- * ->srcu_unlock_count[idx] in __srcu_read_unlock() is after the
- * smp_mb(), and the fetch from ->srcu_idx in __srcu_read_lock()
- * is before the smp_mb(). Thus, that task might not see the new
- * value of ->srcu_idx until the -second- __srcu_read_lock(),
- * which in turn means that this task might well increment
- * ->srcu_lock_count[idx] for the old value of ->srcu_idx twice,
- * not just once.
+ * the old value of ->srcu_ctrp and is just about to use that
+ * value to index its increment of ->srcu_ctrs[idx].srcu_locks.
+ * But as soon as it leaves that SRCU read-side critical section,
+ * it will increment ->srcu_ctrs[idx].srcu_unlocks, which must
+ * follow the updater's above read from that same value. Thus,
+ as soon the reading task does an smp_mb() and a later fetch from
+ * ->srcu_ctrp, that task will be guaranteed to get the new index.
+ * Except that the increment of ->srcu_ctrs[idx].srcu_unlocks
+ * in __srcu_read_unlock() is after the smp_mb(), and the fetch
+ * from ->srcu_ctrp in __srcu_read_lock() is before the smp_mb().
+ * Thus, that task might not see the new value of ->srcu_ctrp until
+ * the -second- __srcu_read_lock(), which in turn means that this
+ * task might well increment ->srcu_ctrs[idx].srcu_locks for the
+ * old value of ->srcu_ctrp twice, not just once.
*
* However, it is important to note that a given smp_mb() takes
* effect not just for the task executing it, but also for any
* later task running on that same CPU.
*
- * That is, there can be almost Nt + Nc further increments of
- * ->srcu_lock_count[idx] for the old index, where Nc is the number
- * of CPUs. But this is OK because the size of the task_struct
- * structure limits the value of Nt and current systems limit Nc
- * to a few thousand.
+ * That is, there can be almost Nt + Nc further increments
+ * of ->srcu_ctrs[idx].srcu_locks for the old index, where Nc
+ * is the number of CPUs. But this is OK because the size of
+ * the task_struct structure limits the value of Nt and current
+ * systems limit Nc to a few thousand.
*
* OK, but what about nesting? This does impose a limit on
* nesting of half of the size of the task_struct structure
@@ -581,10 +615,10 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
for_each_possible_cpu(cpu) {
struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
- sum += atomic_long_read(&sdp->srcu_lock_count[0]);
- sum += atomic_long_read(&sdp->srcu_lock_count[1]);
- sum -= atomic_long_read(&sdp->srcu_unlock_count[0]);
- sum -= atomic_long_read(&sdp->srcu_unlock_count[1]);
+ sum += atomic_long_read(&sdp->srcu_ctrs[0].srcu_locks);
+ sum += atomic_long_read(&sdp->srcu_ctrs[1].srcu_locks);
+ sum -= atomic_long_read(&sdp->srcu_ctrs[0].srcu_unlocks);
+ sum -= atomic_long_read(&sdp->srcu_ctrs[1].srcu_unlocks);
}
return sum;
}
@@ -647,6 +681,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
unsigned long jbase = SRCU_INTERVAL;
struct srcu_usage *sup = ssp->srcu_sup;
+ lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
if (srcu_gp_is_expedited(ssp))
jbase = 0;
if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) {
@@ -674,17 +709,23 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
void cleanup_srcu_struct(struct srcu_struct *ssp)
{
int cpu;
+ unsigned long delay;
struct srcu_usage *sup = ssp->srcu_sup;
- if (WARN_ON(!srcu_get_delay(ssp)))
+ raw_spin_lock_irq_rcu_node(ssp->srcu_sup);
+ delay = srcu_get_delay(ssp);
+ raw_spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ if (WARN_ON(!delay))
return; /* Just leak it! */
if (WARN_ON(srcu_readers_active(ssp)))
return; /* Just leak it! */
+ /* Wait for irq_work to finish first as it may queue a new work. */
+ irq_work_sync(&sup->irq_work);
flush_delayed_work(&sup->work);
for_each_possible_cpu(cpu) {
struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
- del_timer_sync(&sdp->delay_work);
+ timer_delete_sync(&sdp->delay_work);
flush_work(&sdp->work);
if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist)))
return; /* Forgot srcu_barrier(), so just leak it! */
@@ -721,11 +762,16 @@ void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor)
struct srcu_data *sdp;
/* NMI-unsafe use in NMI is a bad sign, as is multi-bit read_flavor values. */
- WARN_ON_ONCE((read_flavor != SRCU_READ_FLAVOR_NMI) && in_nmi());
+ WARN_ON_ONCE(read_flavor != SRCU_READ_FLAVOR_NMI &&
+ read_flavor != SRCU_READ_FLAVOR_FAST && in_nmi());
WARN_ON_ONCE(read_flavor & (read_flavor - 1));
sdp = raw_cpu_ptr(ssp->sda);
old_read_flavor = READ_ONCE(sdp->srcu_reader_flavor);
+ WARN_ON_ONCE(ssp->srcu_reader_flavor && read_flavor != ssp->srcu_reader_flavor);
+ WARN_ON_ONCE(old_read_flavor && ssp->srcu_reader_flavor &&
+ old_read_flavor != ssp->srcu_reader_flavor);
+ WARN_ON_ONCE(read_flavor == SRCU_READ_FLAVOR_FAST && !ssp->srcu_reader_flavor);
if (!old_read_flavor) {
old_read_flavor = cmpxchg(&sdp->srcu_reader_flavor, 0, read_flavor);
if (!old_read_flavor)
@@ -743,12 +789,11 @@ EXPORT_SYMBOL_GPL(__srcu_check_read_flavor);
*/
int __srcu_read_lock(struct srcu_struct *ssp)
{
- int idx;
+ struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp);
- idx = READ_ONCE(ssp->srcu_idx) & 0x1;
- this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter);
+ this_cpu_inc(scp->srcu_locks.counter);
smp_mb(); /* B */ /* Avoid leaking the critical section. */
- return idx;
+ return __srcu_ptr_to_ctr(ssp, scp);
}
EXPORT_SYMBOL_GPL(__srcu_read_lock);
@@ -760,7 +805,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
{
smp_mb(); /* C */ /* Avoid leaking the critical section. */
- this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter);
+ this_cpu_inc(__srcu_ctr_to_ptr(ssp, idx)->srcu_unlocks.counter);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
@@ -773,13 +818,12 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
*/
int __srcu_read_lock_nmisafe(struct srcu_struct *ssp)
{
- int idx;
- struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);
+ struct srcu_ctr __percpu *scpp = READ_ONCE(ssp->srcu_ctrp);
+ struct srcu_ctr *scp = raw_cpu_ptr(scpp);
- idx = READ_ONCE(ssp->srcu_idx) & 0x1;
- atomic_long_inc(&sdp->srcu_lock_count[idx]);
+ atomic_long_inc(&scp->srcu_locks);
smp_mb__after_atomic(); /* B */ /* Avoid leaking the critical section. */
- return idx;
+ return __srcu_ptr_to_ctr(ssp, scpp);
}
EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe);
@@ -790,10 +834,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe);
*/
void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
{
- struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);
-
smp_mb__before_atomic(); /* C */ /* Avoid leaking the critical section. */
- atomic_long_inc(&sdp->srcu_unlock_count[idx]);
+ atomic_long_inc(&raw_cpu_ptr(__srcu_ctr_to_ptr(ssp, idx))->srcu_unlocks);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe);
@@ -855,11 +897,9 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp
{
int cpu;
- for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
- if (!(mask & (1UL << (cpu - snp->grplo))))
- continue;
- srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay);
- }
+ for (cpu = snp->grplo; cpu <= snp->grphi; cpu++)
+ if ((mask & (1UL << (cpu - snp->grplo))) && rcu_cpu_beenfullyonline(cpu))
+ srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay);
}
/*
@@ -890,7 +930,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
mutex_lock(&sup->srcu_cb_mutex);
/* End the current grace period. */
- spin_lock_irq_rcu_node(sup);
+ raw_spin_lock_irq_rcu_node(sup);
idx = rcu_seq_state(sup->srcu_gp_seq);
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
if (srcu_gp_is_expedited(ssp))
@@ -901,7 +941,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
gpseq = rcu_seq_current(&sup->srcu_gp_seq);
if (ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, gpseq))
WRITE_ONCE(sup->srcu_gp_seq_needed_exp, gpseq);
- spin_unlock_irq_rcu_node(sup);
+ raw_spin_unlock_irq_rcu_node(sup);
mutex_unlock(&sup->srcu_gp_mutex);
/* A new grace period can start at this point. But only one. */
@@ -913,7 +953,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
} else {
idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
srcu_for_each_node_breadth_first(ssp, snp) {
- spin_lock_irq_rcu_node(snp);
+ raw_spin_lock_irq_rcu_node(snp);
cbs = false;
last_lvl = snp >= sup->level[rcu_num_lvls - 1];
if (last_lvl)
@@ -928,7 +968,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
else
mask = snp->srcu_data_have_cbs[idx];
snp->srcu_data_have_cbs[idx] = 0;
- spin_unlock_irq_rcu_node(snp);
+ raw_spin_unlock_irq_rcu_node(snp);
if (cbs)
srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay);
}
@@ -938,27 +978,27 @@ static void srcu_gp_end(struct srcu_struct *ssp)
if (!(gpseq & counter_wrap_check))
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(ssp->sda, cpu);
- spin_lock_irq_rcu_node(sdp);
+ raw_spin_lock_irq_rcu_node(sdp);
if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100))
sdp->srcu_gp_seq_needed = gpseq;
if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100))
sdp->srcu_gp_seq_needed_exp = gpseq;
- spin_unlock_irq_rcu_node(sdp);
+ raw_spin_unlock_irq_rcu_node(sdp);
}
/* Callback initiation done, allow grace periods after next. */
mutex_unlock(&sup->srcu_cb_mutex);
/* Start a new grace period if needed. */
- spin_lock_irq_rcu_node(sup);
+ raw_spin_lock_irq_rcu_node(sup);
gpseq = rcu_seq_current(&sup->srcu_gp_seq);
if (!rcu_seq_state(gpseq) &&
ULONG_CMP_LT(gpseq, sup->srcu_gp_seq_needed)) {
srcu_gp_start(ssp);
- spin_unlock_irq_rcu_node(sup);
+ raw_spin_unlock_irq_rcu_node(sup);
srcu_reschedule(ssp, 0);
} else {
- spin_unlock_irq_rcu_node(sup);
+ raw_spin_unlock_irq_rcu_node(sup);
}
/* Transition to big if needed. */
@@ -989,19 +1029,19 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) ||
(!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)))
return;
- spin_lock_irqsave_rcu_node(snp, flags);
+ raw_spin_lock_irqsave_rcu_node(snp, flags);
sgsne = snp->srcu_gp_seq_needed_exp;
if (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)) {
- spin_unlock_irqrestore_rcu_node(snp, flags);
+ raw_spin_unlock_irqrestore_rcu_node(snp, flags);
return;
}
WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
- spin_unlock_irqrestore_rcu_node(snp, flags);
+ raw_spin_unlock_irqrestore_rcu_node(snp, flags);
}
- spin_lock_irqsave_ssp_contention(ssp, &flags);
+ raw_spin_lock_irqsave_ssp_contention(ssp, &flags);
if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, s))
WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, s);
- spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+ raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
}
/*
@@ -1039,12 +1079,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) {
if (WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && snp != snp_leaf)
return; /* GP already done and CBs recorded. */
- spin_lock_irqsave_rcu_node(snp, flags);
+ raw_spin_lock_irqsave_rcu_node(snp, flags);
snp_seq = snp->srcu_have_cbs[idx];
if (!srcu_invl_snp_seq(snp_seq) && ULONG_CMP_GE(snp_seq, s)) {
if (snp == snp_leaf && snp_seq == s)
snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
- spin_unlock_irqrestore_rcu_node(snp, flags);
+ raw_spin_unlock_irqrestore_rcu_node(snp, flags);
if (snp == snp_leaf && snp_seq != s) {
srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0);
return;
@@ -1059,11 +1099,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
sgsne = snp->srcu_gp_seq_needed_exp;
if (!do_norm && (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, s)))
WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
- spin_unlock_irqrestore_rcu_node(snp, flags);
+ raw_spin_unlock_irqrestore_rcu_node(snp, flags);
}
/* Top of tree, must ensure the grace period will be started. */
- spin_lock_irqsave_ssp_contention(ssp, &flags);
+ raw_spin_lock_irqsave_ssp_contention(ssp, &flags);
if (ULONG_CMP_LT(sup->srcu_gp_seq_needed, s)) {
/*
* Record need for grace period s. Pair with load
@@ -1084,25 +1124,31 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
// it isn't. And it does not have to be. After all, it
// can only be executed during early boot when there is only
// the one boot CPU running with interrupts still disabled.
+ //
+ // Use an irq_work here to avoid acquiring runqueue lock with
+ // srcu rcu_node::lock held. BPF instrument could introduce the
+ // opposite dependency, hence we need to break the possible
+ // locking dependency here.
if (likely(srcu_init_done))
- queue_delayed_work(rcu_gp_wq, &sup->work,
- !!srcu_get_delay(ssp));
+ irq_work_queue(&sup->irq_work);
else if (list_empty(&sup->work.work.entry))
list_add(&sup->work.work.entry, &srcu_boot_list);
}
- spin_unlock_irqrestore_rcu_node(sup, flags);
+ raw_spin_unlock_irqrestore_rcu_node(sup, flags);
}
/*
* Wait until all readers counted by array index idx complete, but
* loop an additional time if there is an expedited grace period pending.
- * The caller must ensure that ->srcu_idx is not changed while checking.
+ * The caller must ensure that ->srcu_ctrp is not changed while checking.
*/
static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
{
unsigned long curdelay;
+ raw_spin_lock_irq_rcu_node(ssp->srcu_sup);
curdelay = !srcu_get_delay(ssp);
+ raw_spin_unlock_irq_rcu_node(ssp->srcu_sup);
for (;;) {
if (srcu_readers_active_idx_check(ssp, idx))
@@ -1114,30 +1160,30 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
}
/*
- * Increment the ->srcu_idx counter so that future SRCU readers will
+ * Increment the ->srcu_ctrp counter so that future SRCU readers will
* use the other rank of the ->srcu_(un)lock_count[] arrays. This allows
* us to wait for pre-existing readers in a starvation-free manner.
*/
static void srcu_flip(struct srcu_struct *ssp)
{
/*
- * Because the flip of ->srcu_idx is executed only if the
+ * Because the flip of ->srcu_ctrp is executed only if the
* preceding call to srcu_readers_active_idx_check() found that
- * the ->srcu_unlock_count[] and ->srcu_lock_count[] sums matched
- * and because that summing uses atomic_long_read(), there is
- * ordering due to a control dependency between that summing and
- * the WRITE_ONCE() in this call to srcu_flip(). This ordering
- * ensures that if this updater saw a given reader's increment from
- * __srcu_read_lock(), that reader was using a value of ->srcu_idx
- * from before the previous call to srcu_flip(), which should be
- * quite rare. This ordering thus helps forward progress because
- * the grace period could otherwise be delayed by additional
- * calls to __srcu_read_lock() using that old (soon to be new)
- * value of ->srcu_idx.
+ * the ->srcu_ctrs[].srcu_unlocks and ->srcu_ctrs[].srcu_locks sums
+ * matched and because that summing uses atomic_long_read(),
+ * there is ordering due to a control dependency between that
+ * summing and the WRITE_ONCE() in this call to srcu_flip().
+ * This ordering ensures that if this updater saw a given reader's
+ * increment from __srcu_read_lock(), that reader was using a value
+ * of ->srcu_ctrp from before the previous call to srcu_flip(),
+ * which should be quite rare. This ordering thus helps forward
+ * progress because the grace period could otherwise be delayed
+ * by additional calls to __srcu_read_lock() using that old (soon
+ * to be new) value of ->srcu_ctrp.
*
* This sum-equality check and ordering also ensures that if
* a given call to __srcu_read_lock() uses the new value of
- * ->srcu_idx, this updater's earlier scans cannot have seen
+ * ->srcu_ctrp, this updater's earlier scans cannot have seen
* that reader's increments, which is all to the good, because
* this grace period need not wait on that reader. After all,
* if those earlier scans had seen that reader, there would have
@@ -1152,7 +1198,8 @@ static void srcu_flip(struct srcu_struct *ssp)
*/
smp_mb(); /* E */ /* Pairs with B and C. */
- WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); // Flip the counter.
+ WRITE_ONCE(ssp->srcu_ctrp,
+ &ssp->sda->srcu_ctrs[!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0])]);
/*
* Ensure that if the updater misses an __srcu_read_unlock()
@@ -1161,6 +1208,16 @@ static void srcu_flip(struct srcu_struct *ssp)
* counter update. Note that both this memory barrier and the
* one in srcu_readers_active_idx_check() provide the guarantee
* for __srcu_read_lock().
+ *
+ * Note that this is a performance optimization, in which we spend
+ * an otherwise unnecessary smp_mb() in order to reduce the number
+ * of full per-CPU-variable scans in srcu_readers_lock_idx() and
+ * srcu_readers_unlock_idx(). But this performance optimization
+ * is not so optimal for SRCU-fast, where we would be spending
+ * not smp_mb(), but rather synchronize_rcu(). At the same time,
+ * the overhead of the smp_mb() is in the noise, so there is no
+ * point in omitting it in the SRCU-fast case. So the same code
+ * is executed either way.
*/
smp_mb(); /* D */ /* Pairs with C. */
}
@@ -1198,16 +1255,16 @@ static bool srcu_should_expedite(struct srcu_struct *ssp)
check_init_srcu_struct(ssp);
/* If _lite() readers, don't do unsolicited expediting. */
- if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE)
+ if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_SLOWGP)
return false;
/* If the local srcu_data structure has callbacks, not idle. */
sdp = raw_cpu_ptr(ssp->sda);
- spin_lock_irqsave_rcu_node(sdp, flags);
+ raw_spin_lock_irqsave_rcu_node(sdp, flags);
if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
- spin_unlock_irqrestore_rcu_node(sdp, flags);
+ raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
return false; /* Callbacks already present, so not idle. */
}
- spin_unlock_irqrestore_rcu_node(sdp, flags);
+ raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
/*
* No local callbacks, so probabilistically probe global state.
@@ -1263,11 +1320,13 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
*/
idx = __srcu_read_lock_nmisafe(ssp);
ss_state = smp_load_acquire(&ssp->srcu_sup->srcu_size_state);
- if (ss_state < SRCU_SIZE_WAIT_CALL)
+ // If !rcu_cpu_beenfullyonline(), interrupts are still disabled,
+ // so no migration is possible in either direction from this CPU.
+ if (ss_state < SRCU_SIZE_WAIT_CALL || !rcu_cpu_beenfullyonline(raw_smp_processor_id()))
sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
else
sdp = raw_cpu_ptr(ssp->sda);
- spin_lock_irqsave_sdp_contention(sdp, &flags);
+ raw_spin_lock_irqsave_sdp_contention(sdp, &flags);
if (rhp)
rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
/*
@@ -1327,7 +1386,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
sdp->srcu_gp_seq_needed_exp = s;
needexp = true;
}
- spin_unlock_irqrestore_rcu_node(sdp, flags);
+ raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
/* Ensure that snp node tree is fully initialized before traversing it */
if (ss_state < SRCU_SIZE_WAIT_BARRIER)
@@ -1398,8 +1457,12 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
* read-side critical sections are delimited by srcu_read_lock() and
* srcu_read_unlock(), and may be nested.
*
- * The callback will be invoked from process context, but must nevertheless
- * be fast and must not block.
+ * The callback will be invoked from process context, but with bh
+ * disabled. The callback function must therefore be fast and must
+ * not block.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
*/
void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
rcu_callback_t func)
@@ -1435,7 +1498,7 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
/*
* Make sure that later code is ordered after the SRCU grace
- * period. This pairs with the spin_lock_irq_rcu_node()
+ * period. This pairs with the raw_spin_lock_irq_rcu_node()
* in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed
* because the current CPU might have been totally uninvolved with
* (and thus unordered against) that grace period.
@@ -1465,8 +1528,9 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
*
* Wait for the count to drain to zero of both indexes. To avoid the
* possible starvation of synchronize_srcu(), it waits for the count of
- * the index=((->srcu_idx & 1) ^ 1) to drain to zero at first,
- * and then flip the srcu_idx and wait for the count of the other index.
+ * the index=!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]) to drain to zero
+ * at first, and then flip the ->srcu_ctrp and wait for the count of the
+ * other index.
*
* Can block; must be called from process context.
*
@@ -1579,7 +1643,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
{
if (cookie != SRCU_GET_STATE_COMPLETED &&
- !rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie))
+ !rcu_seq_done_exact(&ssp->srcu_sup->srcu_gp_seq, cookie))
return false;
// Ensure that the end of the SRCU grace period happens before
// any subsequent code that the caller might execute.
@@ -1613,7 +1677,7 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
*/
static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp)
{
- spin_lock_irq_rcu_node(sdp);
+ raw_spin_lock_irq_rcu_node(sdp);
atomic_inc(&ssp->srcu_sup->srcu_barrier_cpu_cnt);
sdp->srcu_barrier_head.func = srcu_barrier_cb;
debug_rcu_head_queue(&sdp->srcu_barrier_head);
@@ -1622,7 +1686,7 @@ static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp)
debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
atomic_dec(&ssp->srcu_sup->srcu_barrier_cpu_cnt);
}
- spin_unlock_irq_rcu_node(sdp);
+ raw_spin_unlock_irq_rcu_node(sdp);
}
/**
@@ -1666,6 +1730,64 @@ void srcu_barrier(struct srcu_struct *ssp)
}
EXPORT_SYMBOL_GPL(srcu_barrier);
+/* Callback for srcu_expedite_current() usage. */
+static void srcu_expedite_current_cb(struct rcu_head *rhp)
+{
+ unsigned long flags;
+ bool needcb = false;
+ struct srcu_data *sdp = container_of(rhp, struct srcu_data, srcu_ec_head);
+
+ raw_spin_lock_irqsave_sdp_contention(sdp, &flags);
+ if (sdp->srcu_ec_state == SRCU_EC_IDLE) {
+ WARN_ON_ONCE(1);
+ } else if (sdp->srcu_ec_state == SRCU_EC_PENDING) {
+ sdp->srcu_ec_state = SRCU_EC_IDLE;
+ } else {
+ WARN_ON_ONCE(sdp->srcu_ec_state != SRCU_EC_REPOST);
+ sdp->srcu_ec_state = SRCU_EC_PENDING;
+ needcb = true;
+ }
+ raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
+ // If needed, requeue ourselves as an expedited SRCU callback.
+ if (needcb)
+ __call_srcu(sdp->ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false);
+}
+
+/**
+ * srcu_expedite_current - Expedite the current SRCU grace period
+ * @ssp: srcu_struct to expedite.
+ *
+ * Cause the current SRCU grace period to become expedited. The grace
+ * period following the current one might also be expedited. If there is
+ * no current grace period, one might be created. If the current grace
+ * period is currently sleeping, that sleep will complete before expediting
+ * will take effect.
+ */
+void srcu_expedite_current(struct srcu_struct *ssp)
+{
+ unsigned long flags;
+ bool needcb = false;
+ struct srcu_data *sdp;
+
+ migrate_disable();
+ sdp = this_cpu_ptr(ssp->sda);
+ raw_spin_lock_irqsave_sdp_contention(sdp, &flags);
+ if (sdp->srcu_ec_state == SRCU_EC_IDLE) {
+ sdp->srcu_ec_state = SRCU_EC_PENDING;
+ needcb = true;
+ } else if (sdp->srcu_ec_state == SRCU_EC_PENDING) {
+ sdp->srcu_ec_state = SRCU_EC_REPOST;
+ } else {
+ WARN_ON_ONCE(sdp->srcu_ec_state != SRCU_EC_REPOST);
+ }
+ raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
+ // If needed, queue an expedited SRCU callback.
+ if (needcb)
+ __call_srcu(ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false);
+ migrate_enable();
+}
+EXPORT_SYMBOL_GPL(srcu_expedite_current);
+
/**
* srcu_batches_completed - return batches completed.
* @ssp: srcu_struct on which to report batch completion.
@@ -1675,7 +1797,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier);
*/
unsigned long srcu_batches_completed(struct srcu_struct *ssp)
{
- return READ_ONCE(ssp->srcu_idx);
+ return READ_ONCE(ssp->srcu_sup->srcu_gp_seq);
}
EXPORT_SYMBOL_GPL(srcu_batches_completed);
@@ -1692,7 +1814,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
/*
* Because readers might be delayed for an extended period after
- * fetching ->srcu_idx for their index, at any point in time there
+ * fetching ->srcu_ctrp for their index, at any point in time there
* might well be readers using both idx=0 and idx=1. We therefore
* need to wait for readers to clear from both index values before
* invoking a callback.
@@ -1702,17 +1824,17 @@ static void srcu_advance_state(struct srcu_struct *ssp)
*/
idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq)); /* ^^^ */
if (idx == SRCU_STATE_IDLE) {
- spin_lock_irq_rcu_node(ssp->srcu_sup);
+ raw_spin_lock_irq_rcu_node(ssp->srcu_sup);
if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) {
WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq));
- spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ raw_spin_unlock_irq_rcu_node(ssp->srcu_sup);
mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return;
}
idx = rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq));
if (idx == SRCU_STATE_IDLE)
srcu_gp_start(ssp);
- spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ raw_spin_unlock_irq_rcu_node(ssp->srcu_sup);
if (idx != SRCU_STATE_IDLE) {
mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* Someone else started the grace period. */
@@ -1720,16 +1842,16 @@ static void srcu_advance_state(struct srcu_struct *ssp)
}
if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
- idx = 1 ^ (ssp->srcu_idx & 1);
+ idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]);
if (!try_check_zero(ssp, idx, 1)) {
mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* readers present, retry later. */
}
srcu_flip(ssp);
- spin_lock_irq_rcu_node(ssp->srcu_sup);
+ raw_spin_lock_irq_rcu_node(ssp->srcu_sup);
rcu_seq_set_state(&ssp->srcu_sup->srcu_gp_seq, SRCU_STATE_SCAN2);
ssp->srcu_sup->srcu_n_exp_nodelay = 0;
- spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ raw_spin_unlock_irq_rcu_node(ssp->srcu_sup);
}
if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
@@ -1738,7 +1860,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
* SRCU read-side critical sections are normally short,
* so check at least twice in quick succession after a flip.
*/
- idx = 1 ^ (ssp->srcu_idx & 1);
+ idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]);
if (!try_check_zero(ssp, idx, 2)) {
mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* readers present, retry later. */
@@ -1767,7 +1889,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
ssp = sdp->ssp;
rcu_cblist_init(&ready_cbs);
- spin_lock_irq_rcu_node(sdp);
+ raw_spin_lock_irq_rcu_node(sdp);
WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL));
rcu_segcblist_advance(&sdp->srcu_cblist,
rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
@@ -1778,7 +1900,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
*/
if (sdp->srcu_cblist_invoking ||
!rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
- spin_unlock_irq_rcu_node(sdp);
+ raw_spin_unlock_irq_rcu_node(sdp);
return; /* Someone else on the job or nothing to do. */
}
@@ -1786,7 +1908,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
sdp->srcu_cblist_invoking = true;
rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
len = ready_cbs.len;
- spin_unlock_irq_rcu_node(sdp);
+ raw_spin_unlock_irq_rcu_node(sdp);
rhp = rcu_cblist_dequeue(&ready_cbs);
for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
debug_rcu_head_unqueue(rhp);
@@ -1801,11 +1923,11 @@ static void srcu_invoke_callbacks(struct work_struct *work)
* Update counts, accelerate new callbacks, and if needed,
* schedule another round of callback invocation.
*/
- spin_lock_irq_rcu_node(sdp);
+ raw_spin_lock_irq_rcu_node(sdp);
rcu_segcblist_add_len(&sdp->srcu_cblist, -len);
sdp->srcu_cblist_invoking = false;
more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
- spin_unlock_irq_rcu_node(sdp);
+ raw_spin_unlock_irq_rcu_node(sdp);
/* An SRCU barrier or callbacks from previous nesting work pending */
if (more)
srcu_schedule_cbs_sdp(sdp, 0);
@@ -1819,7 +1941,7 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay)
{
bool pushgp = true;
- spin_lock_irq_rcu_node(ssp->srcu_sup);
+ raw_spin_lock_irq_rcu_node(ssp->srcu_sup);
if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) {
if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq))) {
/* All requests fulfilled, time to go idle. */
@@ -1829,7 +1951,7 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay)
/* Outstanding request and no GP. Start one. */
srcu_gp_start(ssp);
}
- spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ raw_spin_unlock_irq_rcu_node(ssp->srcu_sup);
if (pushgp)
queue_delayed_work(rcu_gp_wq, &ssp->srcu_sup->work, delay);
@@ -1849,7 +1971,9 @@ static void process_srcu(struct work_struct *work)
ssp = sup->srcu_ssp;
srcu_advance_state(ssp);
+ raw_spin_lock_irq_rcu_node(ssp->srcu_sup);
curdelay = srcu_get_delay(ssp);
+ raw_spin_unlock_irq_rcu_node(ssp->srcu_sup);
if (curdelay) {
WRITE_ONCE(sup->reschedule_count, 0);
} else {
@@ -1867,6 +1991,23 @@ static void process_srcu(struct work_struct *work)
srcu_reschedule(ssp, curdelay);
}
+static void srcu_irq_work(struct irq_work *work)
+{
+ struct srcu_struct *ssp;
+ struct srcu_usage *sup;
+ unsigned long delay;
+ unsigned long flags;
+
+ sup = container_of(work, struct srcu_usage, irq_work);
+ ssp = sup->srcu_ssp;
+
+ raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
+ delay = srcu_get_delay(ssp);
+ raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+
+ queue_delayed_work(rcu_gp_wq, &sup->work, !!delay);
+}
+
void srcutorture_get_gp_data(struct srcu_struct *ssp, int *flags,
unsigned long *gp_seq)
{
@@ -1896,7 +2037,7 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state);
int ss_state_idx = ss_state;
- idx = ssp->srcu_idx & 0x1;
+ idx = ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0];
if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name))
ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1;
pr_alert("%s%s Tree SRCU g%ld state %d (%s)",
@@ -1914,8 +2055,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
struct srcu_data *sdp;
sdp = per_cpu_ptr(ssp->sda, cpu);
- u0 = data_race(atomic_long_read(&sdp->srcu_unlock_count[!idx]));
- u1 = data_race(atomic_long_read(&sdp->srcu_unlock_count[idx]));
+ u0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_unlocks));
+ u1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks));
/*
* Make sure that a lock is always counted if the corresponding
@@ -1923,8 +2064,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
*/
smp_rmb();
- l0 = data_race(atomic_long_read(&sdp->srcu_lock_count[!idx]));
- l1 = data_race(atomic_long_read(&sdp->srcu_lock_count[idx]));
+ l0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_locks));
+ l1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks));
c0 = l0 - u0;
c1 = l1 - u1;
@@ -2001,6 +2142,7 @@ static int srcu_module_coming(struct module *mod)
ssp->sda = alloc_percpu(struct srcu_data);
if (WARN_ON_ONCE(!ssp->sda))
return -ENOMEM;
+ ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0];
}
return 0;
}
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 59314da5eb60..48f0d803c8e2 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -161,11 +161,6 @@ static void tasks_rcu_exit_srcu_stall(struct timer_list *unused);
static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall);
#endif
-/* Avoid IPIing CPUs early in the grace period. */
-#define RCU_TASK_IPI_DELAY (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) ? HZ / 2 : 0)
-static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY;
-module_param(rcu_task_ipi_delay, int, 0644);
-
/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
#define RCU_TASK_BOOT_STALL_TIMEOUT (HZ * 30)
#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
@@ -264,7 +259,8 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
}
lim = rcu_task_enqueue_lim;
- rtp->rtpcp_array = kcalloc(num_possible_cpus(), sizeof(struct rcu_tasks_percpu *), GFP_KERNEL);
+ rtp->rtpcp_array = kzalloc_objs(struct rcu_tasks_percpu *,
+ num_possible_cpus());
BUG_ON(!rtp->rtpcp_array);
for_each_possible_cpu(cpu) {
@@ -295,9 +291,9 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
shift = ilog2(rcu_task_cpu_ids / lim);
if (((rcu_task_cpu_ids - 1) >> shift) >= lim)
shift++;
- WRITE_ONCE(rtp->percpu_enqueue_shift, shift);
- WRITE_ONCE(rtp->percpu_dequeue_lim, lim);
- smp_store_release(&rtp->percpu_enqueue_lim, lim);
+ rtp->percpu_enqueue_shift = shift;
+ rtp->percpu_dequeue_lim = lim;
+ rtp->percpu_enqueue_lim = lim;
pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d rcu_task_cpu_ids=%d.\n",
rtp->name, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim),
@@ -316,7 +312,8 @@ static void call_rcu_tasks_generic_timer(struct timer_list *tlp)
unsigned long flags;
bool needwake = false;
struct rcu_tasks *rtp;
- struct rcu_tasks_percpu *rtpcp = from_timer(rtpcp, tlp, lazy_timer);
+ struct rcu_tasks_percpu *rtpcp = timer_container_of(rtpcp, tlp,
+ lazy_timer);
rtp = rtpcp->rtpp;
raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
@@ -552,13 +549,13 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
rtpcp_next = rtp->rtpcp_array[index];
if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? rtpcp_next->cpu : WORK_CPU_UNBOUND;
- queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work);
+ queue_work_on(cpuwq, system_percpu_wq, &rtpcp_next->rtp_work);
index++;
if (index < num_possible_cpus()) {
rtpcp_next = rtp->rtpcp_array[index];
if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? rtpcp_next->cpu : WORK_CPU_UNBOUND;
- queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work);
+ queue_work_on(cpuwq, system_percpu_wq, &rtpcp_next->rtp_work);
}
}
}
@@ -717,7 +714,6 @@ static void __init rcu_tasks_bootup_oddness(void)
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
-
/* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */
static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
{
@@ -800,9 +796,7 @@ static void rcu_tasks_torture_stats_print_generic(struct rcu_tasks *rtp, char *t
#endif // #ifndef CONFIG_TINY_RCU
-static void exit_tasks_rcu_finish_trace(struct task_struct *t);
-
-#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU)
+#if defined(CONFIG_TASKS_RCU)
////////////////////////////////////////////////////////////////////////
//
@@ -897,7 +891,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
rtp->postgp_func(rtp);
}
-#endif /* #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) */
+#endif /* #if defined(CONFIG_TASKS_RCU) */
#ifdef CONFIG_TASKS_RCU
@@ -1086,7 +1080,7 @@ static void rcu_tasks_postscan(struct list_head *hop)
}
if (!IS_ENABLED(CONFIG_TINY_RCU))
- del_timer_sync(&tasks_rcu_exit_srcu_stall_timer);
+ timer_delete_sync(&tasks_rcu_exit_srcu_stall_timer);
}
/* See if tasks are still holding out, complain if so. */
@@ -1321,13 +1315,11 @@ void exit_tasks_rcu_finish(void)
raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
list_del_init(&t->rcu_tasks_exit_list);
raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
-
- exit_tasks_rcu_finish_trace(t);
}
#else /* #ifdef CONFIG_TASKS_RCU */
void exit_tasks_rcu_start(void) { }
-void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
+void exit_tasks_rcu_finish(void) { }
#endif /* #else #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_RUDE_RCU
@@ -1448,682 +1440,11 @@ EXPORT_SYMBOL_GPL(rcu_tasks_rude_get_gp_data);
#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */
-////////////////////////////////////////////////////////////////////////
-//
-// Tracing variant of Tasks RCU. This variant is designed to be used
-// to protect tracing hooks, including those of BPF. This variant
-// therefore:
-//
-// 1. Has explicit read-side markers to allow finite grace periods
-// in the face of in-kernel loops for PREEMPT=n builds.
-//
-// 2. Protects code in the idle loop, exception entry/exit, and
-// CPU-hotplug code paths, similar to the capabilities of SRCU.
-//
-// 3. Avoids expensive read-side instructions, having overhead similar
-// to that of Preemptible RCU.
-//
-// There are of course downsides. For example, the grace-period code
-// can send IPIs to CPUs, even when those CPUs are in the idle loop or
-// in nohz_full userspace. If needed, these downsides can be at least
-// partially remedied.
-//
-// Perhaps most important, this variant of RCU does not affect the vanilla
-// flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace
-// readers can operate from idle, offline, and exception entry/exit in no
-// way allows rcu_preempt and rcu_sched readers to also do so.
-//
-// The implementation uses rcu_tasks_wait_gp(), which relies on function
-// pointers in the rcu_tasks structure. The rcu_spawn_tasks_trace_kthread()
-// function sets these function pointers up so that rcu_tasks_wait_gp()
-// invokes these functions in this order:
-//
-// rcu_tasks_trace_pregp_step():
-// Disables CPU hotplug, adds all currently executing tasks to the
-// holdout list, then checks the state of all tasks that blocked
-// or were preempted within their current RCU Tasks Trace read-side
-// critical section, adding them to the holdout list if appropriate.
-// Finally, this function re-enables CPU hotplug.
-// The ->pertask_func() pointer is NULL, so there is no per-task processing.
-// rcu_tasks_trace_postscan():
-// Invokes synchronize_rcu() to wait for late-stage exiting tasks
-// to finish exiting.
-// check_all_holdout_tasks_trace(), repeatedly until holdout list is empty:
-// Scans the holdout list, attempting to identify a quiescent state
-// for each task on the list. If there is a quiescent state, the
-// corresponding task is removed from the holdout list. Once this
-// list is empty, the grace period has completed.
-// rcu_tasks_trace_postgp():
-// Provides the needed full memory barrier and does debug checks.
-//
-// The exit_tasks_rcu_finish_trace() synchronizes with exiting tasks.
-//
-// Pre-grace-period update-side code is ordered before the grace period
-// via the ->cbs_lock and barriers in rcu_tasks_kthread(). Pre-grace-period
-// read-side code is ordered before the grace period by atomic operations
-// on .b.need_qs flag of each task involved in this process, or by scheduler
-// context-switch ordering (for locked-down non-running readers).
-
-// The lockdep state must be outside of #ifdef to be useful.
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static struct lock_class_key rcu_lock_trace_key;
-struct lockdep_map rcu_trace_lock_map =
- STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_trace", &rcu_lock_trace_key);
-EXPORT_SYMBOL_GPL(rcu_trace_lock_map);
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
-#ifdef CONFIG_TASKS_TRACE_RCU
-
-// Record outstanding IPIs to each CPU. No point in sending two...
-static DEFINE_PER_CPU(bool, trc_ipi_to_cpu);
-
-// The number of detections of task quiescent state relying on
-// heavyweight readers executing explicit memory barriers.
-static unsigned long n_heavy_reader_attempts;
-static unsigned long n_heavy_reader_updates;
-static unsigned long n_heavy_reader_ofl_updates;
-static unsigned long n_trc_holdouts;
-
-void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func);
-DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace,
- "RCU Tasks Trace");
-
-/* Load from ->trc_reader_special.b.need_qs with proper ordering. */
-static u8 rcu_ld_need_qs(struct task_struct *t)
-{
- smp_mb(); // Enforce full grace-period ordering.
- return smp_load_acquire(&t->trc_reader_special.b.need_qs);
-}
-
-/* Store to ->trc_reader_special.b.need_qs with proper ordering. */
-static void rcu_st_need_qs(struct task_struct *t, u8 v)
-{
- smp_store_release(&t->trc_reader_special.b.need_qs, v);
- smp_mb(); // Enforce full grace-period ordering.
-}
-
-/*
- * Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for
- * the four-byte operand-size restriction of some platforms.
- *
- * Returns the old value, which is often ignored.
- */
-u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new)
-{
- return cmpxchg(&t->trc_reader_special.b.need_qs, old, new);
-}
-EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs);
-
-/*
- * If we are the last reader, signal the grace-period kthread.
- * Also remove from the per-CPU list of blocked tasks.
- */
-void rcu_read_unlock_trace_special(struct task_struct *t)
-{
- unsigned long flags;
- struct rcu_tasks_percpu *rtpcp;
- union rcu_special trs;
-
- // Open-coded full-word version of rcu_ld_need_qs().
- smp_mb(); // Enforce full grace-period ordering.
- trs = smp_load_acquire(&t->trc_reader_special);
-
- if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb)
- smp_mb(); // Pairs with update-side barriers.
- // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers.
- if (trs.b.need_qs == (TRC_NEED_QS_CHECKED | TRC_NEED_QS)) {
- u8 result = rcu_trc_cmpxchg_need_qs(t, TRC_NEED_QS_CHECKED | TRC_NEED_QS,
- TRC_NEED_QS_CHECKED);
-
- WARN_ONCE(result != trs.b.need_qs, "%s: result = %d", __func__, result);
- }
- if (trs.b.blocked) {
- rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, t->trc_blkd_cpu);
- raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
- list_del_init(&t->trc_blkd_node);
- WRITE_ONCE(t->trc_reader_special.b.blocked, false);
- raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
- }
- WRITE_ONCE(t->trc_reader_nesting, 0);
-}
-EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special);
-
-/* Add a newly blocked reader task to its CPU's list. */
-void rcu_tasks_trace_qs_blkd(struct task_struct *t)
-{
- unsigned long flags;
- struct rcu_tasks_percpu *rtpcp;
-
- local_irq_save(flags);
- rtpcp = this_cpu_ptr(rcu_tasks_trace.rtpcpu);
- raw_spin_lock_rcu_node(rtpcp); // irqs already disabled
- t->trc_blkd_cpu = smp_processor_id();
- if (!rtpcp->rtp_blkd_tasks.next)
- INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
- list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks);
- WRITE_ONCE(t->trc_reader_special.b.blocked, true);
- raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
-}
-EXPORT_SYMBOL_GPL(rcu_tasks_trace_qs_blkd);
-
-/* Add a task to the holdout list, if it is not already on the list. */
-static void trc_add_holdout(struct task_struct *t, struct list_head *bhp)
-{
- if (list_empty(&t->trc_holdout_list)) {
- get_task_struct(t);
- list_add(&t->trc_holdout_list, bhp);
- n_trc_holdouts++;
- }
-}
-
-/* Remove a task from the holdout list, if it is in fact present. */
-static void trc_del_holdout(struct task_struct *t)
-{
- if (!list_empty(&t->trc_holdout_list)) {
- list_del_init(&t->trc_holdout_list);
- put_task_struct(t);
- n_trc_holdouts--;
- }
-}
-
-/* IPI handler to check task state. */
-static void trc_read_check_handler(void *t_in)
-{
- int nesting;
- struct task_struct *t = current;
- struct task_struct *texp = t_in;
-
- // If the task is no longer running on this CPU, leave.
- if (unlikely(texp != t))
- goto reset_ipi; // Already on holdout list, so will check later.
-
- // If the task is not in a read-side critical section, and
- // if this is the last reader, awaken the grace-period kthread.
- nesting = READ_ONCE(t->trc_reader_nesting);
- if (likely(!nesting)) {
- rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
- goto reset_ipi;
- }
- // If we are racing with an rcu_read_unlock_trace(), try again later.
- if (unlikely(nesting < 0))
- goto reset_ipi;
-
- // Get here if the task is in a read-side critical section.
- // Set its state so that it will update state for the grace-period
- // kthread upon exit from that critical section.
- rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED);
-
-reset_ipi:
- // Allow future IPIs to be sent on CPU and for task.
- // Also order this IPI handler against any later manipulations of
- // the intended task.
- smp_store_release(per_cpu_ptr(&trc_ipi_to_cpu, smp_processor_id()), false); // ^^^
- smp_store_release(&texp->trc_ipi_to_cpu, -1); // ^^^
-}
-
-/* Callback function for scheduler to check locked-down task. */
-static int trc_inspect_reader(struct task_struct *t, void *bhp_in)
-{
- struct list_head *bhp = bhp_in;
- int cpu = task_cpu(t);
- int nesting;
- bool ofl = cpu_is_offline(cpu);
-
- if (task_curr(t) && !ofl) {
- // If no chance of heavyweight readers, do it the hard way.
- if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
- return -EINVAL;
-
- // If heavyweight readers are enabled on the remote task,
- // we can inspect its state despite its currently running.
- // However, we cannot safely change its state.
- n_heavy_reader_attempts++;
- // Check for "running" idle tasks on offline CPUs.
- if (!rcu_watching_zero_in_eqs(cpu, &t->trc_reader_nesting))
- return -EINVAL; // No quiescent state, do it the hard way.
- n_heavy_reader_updates++;
- nesting = 0;
- } else {
- // The task is not running, so C-language access is safe.
- nesting = t->trc_reader_nesting;
- WARN_ON_ONCE(ofl && task_curr(t) && (t != idle_task(task_cpu(t))));
- if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && ofl)
- n_heavy_reader_ofl_updates++;
- }
-
- // If not exiting a read-side critical section, mark as checked
- // so that the grace-period kthread will remove it from the
- // holdout list.
- if (!nesting) {
- rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
- return 0; // In QS, so done.
- }
- if (nesting < 0)
- return -EINVAL; // Reader transitioning, try again later.
-
- // The task is in a read-side critical section, so set up its
- // state so that it will update state upon exit from that critical
- // section.
- if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED))
- trc_add_holdout(t, bhp);
- return 0;
-}
-
-/* Attempt to extract the state for the specified task. */
-static void trc_wait_for_one_reader(struct task_struct *t,
- struct list_head *bhp)
-{
- int cpu;
-
- // If a previous IPI is still in flight, let it complete.
- if (smp_load_acquire(&t->trc_ipi_to_cpu) != -1) // Order IPI
- return;
-
- // The current task had better be in a quiescent state.
- if (t == current) {
- rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
- WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
- return;
- }
-
- // Attempt to nail down the task for inspection.
- get_task_struct(t);
- if (!task_call_func(t, trc_inspect_reader, bhp)) {
- put_task_struct(t);
- return;
- }
- put_task_struct(t);
-
- // If this task is not yet on the holdout list, then we are in
- // an RCU read-side critical section. Otherwise, the invocation of
- // trc_add_holdout() that added it to the list did the necessary
- // get_task_struct(). Either way, the task cannot be freed out
- // from under this code.
-
- // If currently running, send an IPI, either way, add to list.
- trc_add_holdout(t, bhp);
- if (task_curr(t) &&
- time_after(jiffies + 1, rcu_tasks_trace.gp_start + rcu_task_ipi_delay)) {
- // The task is currently running, so try IPIing it.
- cpu = task_cpu(t);
-
- // If there is already an IPI outstanding, let it happen.
- if (per_cpu(trc_ipi_to_cpu, cpu) || t->trc_ipi_to_cpu >= 0)
- return;
-
- per_cpu(trc_ipi_to_cpu, cpu) = true;
- t->trc_ipi_to_cpu = cpu;
- rcu_tasks_trace.n_ipis++;
- if (smp_call_function_single(cpu, trc_read_check_handler, t, 0)) {
- // Just in case there is some other reason for
- // failure than the target CPU being offline.
- WARN_ONCE(1, "%s(): smp_call_function_single() failed for CPU: %d\n",
- __func__, cpu);
- rcu_tasks_trace.n_ipis_fails++;
- per_cpu(trc_ipi_to_cpu, cpu) = false;
- t->trc_ipi_to_cpu = -1;
- }
- }
-}
-
-/*
- * Initialize for first-round processing for the specified task.
- * Return false if task is NULL or already taken care of, true otherwise.
- */
-static bool rcu_tasks_trace_pertask_prep(struct task_struct *t, bool notself)
-{
- // During early boot when there is only the one boot CPU, there
- // is no idle task for the other CPUs. Also, the grace-period
- // kthread is always in a quiescent state. In addition, just return
- // if this task is already on the list.
- if (unlikely(t == NULL) || (t == current && notself) || !list_empty(&t->trc_holdout_list))
- return false;
-
- rcu_st_need_qs(t, 0);
- t->trc_ipi_to_cpu = -1;
- return true;
-}
-
-/* Do first-round processing for the specified task. */
-static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop)
-{
- if (rcu_tasks_trace_pertask_prep(t, true))
- trc_wait_for_one_reader(t, hop);
-}
-
-/* Initialize for a new RCU-tasks-trace grace period. */
-static void rcu_tasks_trace_pregp_step(struct list_head *hop)
-{
- LIST_HEAD(blkd_tasks);
- int cpu;
- unsigned long flags;
- struct rcu_tasks_percpu *rtpcp;
- struct task_struct *t;
-
- // There shouldn't be any old IPIs, but...
- for_each_possible_cpu(cpu)
- WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu));
-
- // Disable CPU hotplug across the CPU scan for the benefit of
- // any IPIs that might be needed. This also waits for all readers
- // in CPU-hotplug code paths.
- cpus_read_lock();
-
- // These rcu_tasks_trace_pertask_prep() calls are serialized to
- // allow safe access to the hop list.
- for_each_online_cpu(cpu) {
- rcu_read_lock();
- // Note that cpu_curr_snapshot() picks up the target
- // CPU's current task while its runqueue is locked with
- // an smp_mb__after_spinlock(). This ensures that either
- // the grace-period kthread will see that task's read-side
- // critical section or the task will see the updater's pre-GP
- // accesses. The trailing smp_mb() in cpu_curr_snapshot()
- // does not currently play a role other than simplify
- // that function's ordering semantics. If these simplified
- // ordering semantics continue to be redundant, that smp_mb()
- // might be removed.
- t = cpu_curr_snapshot(cpu);
- if (rcu_tasks_trace_pertask_prep(t, true))
- trc_add_holdout(t, hop);
- rcu_read_unlock();
- cond_resched_tasks_rcu_qs();
- }
-
- // Only after all running tasks have been accounted for is it
- // safe to take care of the tasks that have blocked within their
- // current RCU tasks trace read-side critical section.
- for_each_possible_cpu(cpu) {
- rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, cpu);
- raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
- list_splice_init(&rtpcp->rtp_blkd_tasks, &blkd_tasks);
- while (!list_empty(&blkd_tasks)) {
- rcu_read_lock();
- t = list_first_entry(&blkd_tasks, struct task_struct, trc_blkd_node);
- list_del_init(&t->trc_blkd_node);
- list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks);
- raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
- rcu_tasks_trace_pertask(t, hop);
- rcu_read_unlock();
- raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
- }
- raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
- cond_resched_tasks_rcu_qs();
- }
-
- // Re-enable CPU hotplug now that the holdout list is populated.
- cpus_read_unlock();
-}
-
-/*
- * Do intermediate processing between task and holdout scans.
- */
-static void rcu_tasks_trace_postscan(struct list_head *hop)
-{
- // Wait for late-stage exiting tasks to finish exiting.
- // These might have passed the call to exit_tasks_rcu_finish().
-
- // If you remove the following line, update rcu_trace_implies_rcu_gp()!!!
- synchronize_rcu();
- // Any tasks that exit after this point will set
- // TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs.
-}
-
-/* Communicate task state back to the RCU tasks trace stall warning request. */
-struct trc_stall_chk_rdr {
- int nesting;
- int ipi_to_cpu;
- u8 needqs;
-};
-
-static int trc_check_slow_task(struct task_struct *t, void *arg)
-{
- struct trc_stall_chk_rdr *trc_rdrp = arg;
-
- if (task_curr(t) && cpu_online(task_cpu(t)))
- return false; // It is running, so decline to inspect it.
- trc_rdrp->nesting = READ_ONCE(t->trc_reader_nesting);
- trc_rdrp->ipi_to_cpu = READ_ONCE(t->trc_ipi_to_cpu);
- trc_rdrp->needqs = rcu_ld_need_qs(t);
- return true;
-}
-
-/* Show the state of a task stalling the current RCU tasks trace GP. */
-static void show_stalled_task_trace(struct task_struct *t, bool *firstreport)
-{
- int cpu;
- struct trc_stall_chk_rdr trc_rdr;
- bool is_idle_tsk = is_idle_task(t);
-
- if (*firstreport) {
- pr_err("INFO: rcu_tasks_trace detected stalls on tasks:\n");
- *firstreport = false;
- }
- cpu = task_cpu(t);
- if (!task_call_func(t, trc_check_slow_task, &trc_rdr))
- pr_alert("P%d: %c%c\n",
- t->pid,
- ".I"[t->trc_ipi_to_cpu >= 0],
- ".i"[is_idle_tsk]);
- else
- pr_alert("P%d: %c%c%c%c nesting: %d%c%c cpu: %d%s\n",
- t->pid,
- ".I"[trc_rdr.ipi_to_cpu >= 0],
- ".i"[is_idle_tsk],
- ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)],
- ".B"[!!data_race(t->trc_reader_special.b.blocked)],
- trc_rdr.nesting,
- " !CN"[trc_rdr.needqs & 0x3],
- " ?"[trc_rdr.needqs > 0x3],
- cpu, cpu_online(cpu) ? "" : "(offline)");
- sched_show_task(t);
-}
-
-/* List stalled IPIs for RCU tasks trace. */
-static void show_stalled_ipi_trace(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- if (per_cpu(trc_ipi_to_cpu, cpu))
- pr_alert("\tIPI outstanding to CPU %d\n", cpu);
-}
-
-/* Do one scan of the holdout list. */
-static void check_all_holdout_tasks_trace(struct list_head *hop,
- bool needreport, bool *firstreport)
-{
- struct task_struct *g, *t;
-
- // Disable CPU hotplug across the holdout list scan for IPIs.
- cpus_read_lock();
-
- list_for_each_entry_safe(t, g, hop, trc_holdout_list) {
- // If safe and needed, try to check the current task.
- if (READ_ONCE(t->trc_ipi_to_cpu) == -1 &&
- !(rcu_ld_need_qs(t) & TRC_NEED_QS_CHECKED))
- trc_wait_for_one_reader(t, hop);
-
- // If check succeeded, remove this task from the list.
- if (smp_load_acquire(&t->trc_ipi_to_cpu) == -1 &&
- rcu_ld_need_qs(t) == TRC_NEED_QS_CHECKED)
- trc_del_holdout(t);
- else if (needreport)
- show_stalled_task_trace(t, firstreport);
- cond_resched_tasks_rcu_qs();
- }
-
- // Re-enable CPU hotplug now that the holdout list scan has completed.
- cpus_read_unlock();
-
- if (needreport) {
- if (*firstreport)
- pr_err("INFO: rcu_tasks_trace detected stalls? (Late IPI?)\n");
- show_stalled_ipi_trace();
- }
-}
-
-static void rcu_tasks_trace_empty_fn(void *unused)
-{
-}
-
-/* Wait for grace period to complete and provide ordering. */
-static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
-{
- int cpu;
-
- // Wait for any lingering IPI handlers to complete. Note that
- // if a CPU has gone offline or transitioned to userspace in the
- // meantime, all IPI handlers should have been drained beforehand.
- // Yes, this assumes that CPUs process IPIs in order. If that ever
- // changes, there will need to be a recheck and/or timed wait.
- for_each_online_cpu(cpu)
- if (WARN_ON_ONCE(smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu))))
- smp_call_function_single(cpu, rcu_tasks_trace_empty_fn, NULL, 1);
-
- smp_mb(); // Caller's code must be ordered after wakeup.
- // Pairs with pretty much every ordering primitive.
-}
-
-/* Report any needed quiescent state for this exiting task. */
-static void exit_tasks_rcu_finish_trace(struct task_struct *t)
-{
- union rcu_special trs = READ_ONCE(t->trc_reader_special);
-
- rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
- WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
- if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS || trs.b.blocked))
- rcu_read_unlock_trace_special(t);
- else
- WRITE_ONCE(t->trc_reader_nesting, 0);
-}
-
-/**
- * call_rcu_tasks_trace() - Queue a callback trace task-based grace period
- * @rhp: structure to be used for queueing the RCU updates.
- * @func: actual callback function to be invoked after the grace period
- *
- * The callback function will be invoked some time after a trace rcu-tasks
- * grace period elapses, in other words after all currently executing
- * trace rcu-tasks read-side critical sections have completed. These
- * read-side critical sections are delimited by calls to rcu_read_lock_trace()
- * and rcu_read_unlock_trace().
- *
- * See the description of call_rcu() for more detailed information on
- * memory ordering guarantees.
- */
-void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func)
-{
- call_rcu_tasks_generic(rhp, func, &rcu_tasks_trace);
-}
-EXPORT_SYMBOL_GPL(call_rcu_tasks_trace);
-
-/**
- * synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period
- *
- * Control will return to the caller some time after a trace rcu-tasks
- * grace period has elapsed, in other words after all currently executing
- * trace rcu-tasks read-side critical sections have elapsed. These read-side
- * critical sections are delimited by calls to rcu_read_lock_trace()
- * and rcu_read_unlock_trace().
- *
- * This is a very specialized primitive, intended only for a few uses in
- * tracing and other situations requiring manipulation of function preambles
- * and profiling hooks. The synchronize_rcu_tasks_trace() function is not
- * (yet) intended for heavy use from multiple CPUs.
- *
- * See the description of synchronize_rcu() for more detailed information
- * on memory ordering guarantees.
- */
-void synchronize_rcu_tasks_trace(void)
-{
- RCU_LOCKDEP_WARN(lock_is_held(&rcu_trace_lock_map), "Illegal synchronize_rcu_tasks_trace() in RCU Tasks Trace read-side critical section");
- synchronize_rcu_tasks_generic(&rcu_tasks_trace);
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_trace);
-
-/**
- * rcu_barrier_tasks_trace - Wait for in-flight call_rcu_tasks_trace() callbacks.
- *
- * Although the current implementation is guaranteed to wait, it is not
- * obligated to, for example, if there are no pending callbacks.
- */
-void rcu_barrier_tasks_trace(void)
-{
- rcu_barrier_tasks_generic(&rcu_tasks_trace);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace);
-
-int rcu_tasks_trace_lazy_ms = -1;
-module_param(rcu_tasks_trace_lazy_ms, int, 0444);
-
-static int __init rcu_spawn_tasks_trace_kthread(void)
-{
- if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) {
- rcu_tasks_trace.gp_sleep = HZ / 10;
- rcu_tasks_trace.init_fract = HZ / 10;
- } else {
- rcu_tasks_trace.gp_sleep = HZ / 200;
- if (rcu_tasks_trace.gp_sleep <= 0)
- rcu_tasks_trace.gp_sleep = 1;
- rcu_tasks_trace.init_fract = HZ / 200;
- if (rcu_tasks_trace.init_fract <= 0)
- rcu_tasks_trace.init_fract = 1;
- }
- if (rcu_tasks_trace_lazy_ms >= 0)
- rcu_tasks_trace.lazy_jiffies = msecs_to_jiffies(rcu_tasks_trace_lazy_ms);
- rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step;
- rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan;
- rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace;
- rcu_tasks_trace.postgp_func = rcu_tasks_trace_postgp;
- rcu_spawn_tasks_kthread_generic(&rcu_tasks_trace);
- return 0;
-}
-
-#if !defined(CONFIG_TINY_RCU)
-void show_rcu_tasks_trace_gp_kthread(void)
-{
- char buf[64];
-
- snprintf(buf, sizeof(buf), "N%lu h:%lu/%lu/%lu",
- data_race(n_trc_holdouts),
- data_race(n_heavy_reader_ofl_updates),
- data_race(n_heavy_reader_updates),
- data_race(n_heavy_reader_attempts));
- show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf);
-}
-EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread);
-
-void rcu_tasks_trace_torture_stats_print(char *tt, char *tf)
-{
- rcu_tasks_torture_stats_print_generic(&rcu_tasks_trace, tt, tf, "");
-}
-EXPORT_SYMBOL_GPL(rcu_tasks_trace_torture_stats_print);
-#endif // !defined(CONFIG_TINY_RCU)
-
-struct task_struct *get_rcu_tasks_trace_gp_kthread(void)
-{
- return rcu_tasks_trace.kthread_ptr;
-}
-EXPORT_SYMBOL_GPL(get_rcu_tasks_trace_gp_kthread);
-
-void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq)
-{
- *flags = 0;
- *gp_seq = rcu_seq_current(&rcu_tasks_trace.tasks_gp_seq);
-}
-EXPORT_SYMBOL_GPL(rcu_tasks_trace_get_gp_data);
-
-#else /* #ifdef CONFIG_TASKS_TRACE_RCU */
-static void exit_tasks_rcu_finish_trace(struct task_struct *t) { }
-#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */
-
#ifndef CONFIG_TINY_RCU
void show_rcu_tasks_gp_kthreads(void)
{
show_rcu_tasks_classic_gp_kthread();
show_rcu_tasks_rude_gp_kthread();
- show_rcu_tasks_trace_gp_kthread();
}
#endif /* #ifndef CONFIG_TINY_RCU */
@@ -2250,13 +1571,9 @@ void __init tasks_cblist_init_generic(void)
#ifdef CONFIG_TASKS_RUDE_RCU
cblist_init_generic(&rcu_tasks_rude);
#endif
-
-#ifdef CONFIG_TASKS_TRACE_RCU
- cblist_init_generic(&rcu_tasks_trace);
-#endif
}
-void __init rcu_init_tasks_generic(void)
+static int __init rcu_init_tasks_generic(void)
{
#ifdef CONFIG_TASKS_RCU
rcu_spawn_tasks_kthread();
@@ -2266,14 +1583,26 @@ void __init rcu_init_tasks_generic(void)
rcu_spawn_tasks_rude_kthread();
#endif
-#ifdef CONFIG_TASKS_TRACE_RCU
- rcu_spawn_tasks_trace_kthread();
-#endif
-
// Run the self-tests.
rcu_tasks_initiate_self_tests();
+
+ return 0;
}
+core_initcall(rcu_init_tasks_generic);
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
static inline void rcu_tasks_bootup_oddness(void) {}
#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+
+////////////////////////////////////////////////////////////////////////
+//
+// Tracing variant of Tasks RCU. This variant is designed to be used
+// to protect tracing hooks, including those of BPF. This variant
+// is implemented via a straightforward mapping onto SRCU-fast.
+
+DEFINE_SRCU_FAST(rcu_tasks_trace_srcu_struct);
+EXPORT_SYMBOL_GPL(rcu_tasks_trace_srcu_struct);
+
+#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 4b3f31911465..585cade21010 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -70,12 +70,10 @@ void rcu_qs(void)
*/
void rcu_sched_clock_irq(int user)
{
- if (user) {
+ if (user)
rcu_qs();
- } else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) {
- set_tsk_need_resched(current);
- set_preempt_need_resched();
- }
+ else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail)
+ set_need_resched_current();
}
/*
@@ -85,15 +83,8 @@ void rcu_sched_clock_irq(int user)
static inline bool rcu_reclaim_tiny(struct rcu_head *head)
{
rcu_callback_t f;
- unsigned long offset = (unsigned long)head->func;
rcu_lock_acquire(&rcu_callback_map);
- if (__is_kvfree_rcu_offset(offset)) {
- trace_rcu_invoke_kvfree_callback("", head, offset);
- kvfree((void *)head - offset);
- rcu_lock_release(&rcu_callback_map);
- return true;
- }
trace_rcu_invoke_callback("", head);
f = head->func;
@@ -159,10 +150,6 @@ void synchronize_rcu(void)
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
-static void tiny_rcu_leak_callback(struct rcu_head *rhp)
-{
-}
-
/*
* Post an RCU callback to be invoked after the end of an RCU grace
* period. But since we have but one CPU, that would be after any
@@ -178,9 +165,6 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func);
mem_dump_obj(head);
}
-
- if (!__is_kvfree_rcu_offset((unsigned long)head->func))
- WRITE_ONCE(head->func, tiny_rcu_leak_callback);
return;
}
@@ -246,15 +230,18 @@ bool poll_state_synchronize_rcu(unsigned long oldstate)
}
EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
-#ifdef CONFIG_KASAN_GENERIC
-void kvfree_call_rcu(struct rcu_head *head, void *ptr)
+#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST)
+unsigned long long rcutorture_gather_gp_seqs(void)
{
- if (head)
- kasan_record_aux_stack(ptr);
+ return READ_ONCE(rcu_ctrlblk.gp_seq) & 0xffffULL;
+}
+EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs);
- __kvfree_call_rcu(head, ptr);
+void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len)
+{
+ snprintf(cp, len, "g%04llx", seqs & 0xffffULL);
}
-EXPORT_SYMBOL_GPL(kvfree_call_rcu);
+EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs);
#endif
void __init rcu_init(void)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 475f31deed14..55df6d37145e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -80,6 +80,15 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
.gpwrap = true,
};
+
+int rcu_get_gpwrap_count(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ return READ_ONCE(rdp->gpwrap_count);
+}
+EXPORT_SYMBOL_GPL(rcu_get_gpwrap_count);
+
static struct rcu_state rcu_state = {
.level = { &rcu_state.node[0] },
.gp_state = RCU_GP_IDLE,
@@ -151,7 +160,7 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
unsigned long gps, unsigned long flags);
static void invoke_rcu_core(void);
static void rcu_report_exp_rdp(struct rcu_data *rdp);
-static void sync_sched_exp_online_cleanup(int cpu);
+static void rcu_report_qs_rdp(struct rcu_data *rdp);
static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
static bool rcu_rdp_cpu_online(struct rcu_data *rdp);
@@ -368,7 +377,7 @@ EXPORT_SYMBOL_GPL(rcu_momentary_eqs);
*/
static int rcu_is_cpu_rrupt_from_idle(void)
{
- long nesting;
+ long nmi_nesting = ct_nmi_nesting();
/*
* Usually called from the tick; but also used from smp_function_call()
@@ -380,21 +389,28 @@ static int rcu_is_cpu_rrupt_from_idle(void)
/* Check for counter underflows */
RCU_LOCKDEP_WARN(ct_nesting() < 0,
"RCU nesting counter underflow!");
- RCU_LOCKDEP_WARN(ct_nmi_nesting() <= 0,
- "RCU nmi_nesting counter underflow/zero!");
- /* Are we at first interrupt nesting level? */
- nesting = ct_nmi_nesting();
- if (nesting > 1)
+ /* Non-idle interrupt or nested idle interrupt */
+ if (nmi_nesting > 1)
return false;
/*
- * If we're not in an interrupt, we must be in the idle task!
+ * Non nested idle interrupt (interrupting section where RCU
+ * wasn't watching).
*/
- WARN_ON_ONCE(!nesting && !is_idle_task(current));
+ if (nmi_nesting == 1)
+ return true;
+
+ /* Not in an interrupt */
+ if (!nmi_nesting) {
+ RCU_LOCKDEP_WARN(!in_task() || !is_idle_task(current),
+ "RCU nmi_nesting counter not in idle task!");
+ return !rcu_is_watching_curr_cpu();
+ }
+
+ RCU_LOCKDEP_WARN(1, "RCU nmi_nesting counter underflow/zero!");
- /* Does CPU appear to be idle from an RCU standpoint? */
- return ct_nesting() == 0;
+ return false;
}
#define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10)
@@ -538,7 +554,27 @@ void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq)
}
EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
-#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK))
+/* Gather grace-period sequence numbers for rcutorture diagnostics. */
+unsigned long long rcutorture_gather_gp_seqs(void)
+{
+ return ((READ_ONCE(rcu_state.gp_seq) & 0xffffULL) << 40) |
+ ((READ_ONCE(rcu_state.expedited_sequence) & 0xffffffULL) << 16) |
+ (READ_ONCE(rcu_state.gp_seq_polled) & 0xffffULL);
+}
+EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs);
+
+/* Format grace-period sequence numbers for rcutorture diagnostics. */
+void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len)
+{
+ unsigned int egp = (seqs >> 16) & 0xffffffULL;
+ unsigned int ggp = (seqs >> 40) & 0xffffULL;
+ unsigned int pgp = seqs & 0xffffULL;
+
+ snprintf(cp, len, "g%04x:e%06x:p%04x", ggp, egp, pgp);
+}
+EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs);
+
+#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_VIRT_XFER_TO_GUEST_WORK))
/*
* An empty function that will trigger a reschedule on
* IRQ tail once IRQs get re-enabled on userspace/guest resume.
@@ -567,7 +603,7 @@ noinstr void rcu_irq_work_resched(void)
if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU))
return;
- if (IS_ENABLED(CONFIG_KVM_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU))
+ if (IS_ENABLED(CONFIG_VIRT_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU))
return;
instrumentation_begin();
@@ -576,7 +612,7 @@ noinstr void rcu_irq_work_resched(void)
}
instrumentation_end();
}
-#endif /* #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) */
+#endif /* #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_VIRT_XFER_TO_GUEST_WORK)) */
#ifdef CONFIG_PROVE_RCU
/**
@@ -737,6 +773,25 @@ void rcu_request_urgent_qs_task(struct task_struct *t)
smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true);
}
+static unsigned long seq_gpwrap_lag = ULONG_MAX / 4;
+
+/**
+ * rcu_set_gpwrap_lag - Set RCU GP sequence overflow lag value.
+ * @lag_gps: Set overflow lag to this many grace period worth of counters
+ * which is used by rcutorture to quickly force a gpwrap situation.
+ * @lag_gps = 0 means we reset it back to the boot-time value.
+ */
+void rcu_set_gpwrap_lag(unsigned long lag_gps)
+{
+ unsigned long lag_seq_count;
+
+ lag_seq_count = (lag_gps == 0)
+ ? ULONG_MAX / 4
+ : lag_gps << RCU_SEQ_CTR_SHIFT;
+ WRITE_ONCE(seq_gpwrap_lag, lag_seq_count);
+}
+EXPORT_SYMBOL_GPL(rcu_set_gpwrap_lag);
+
/*
* When trying to report a quiescent state on behalf of some other CPU,
* it is our responsibility to check for and handle potential overflow
@@ -747,9 +802,11 @@ void rcu_request_urgent_qs_task(struct task_struct *t)
static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
{
raw_lockdep_assert_held_rcu_node(rnp);
- if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / 4,
- rnp->gp_seq))
+ if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + seq_gpwrap_lag,
+ rnp->gp_seq)) {
WRITE_ONCE(rdp->gpwrap, true);
+ WRITE_ONCE(rdp->gpwrap_count, READ_ONCE(rdp->gpwrap_count) + 1);
+ }
if (ULONG_CMP_LT(rdp->rcu_iw_gp_seq + ULONG_MAX / 4, rnp->gp_seq))
rdp->rcu_iw_gp_seq = rnp->gp_seq + ULONG_MAX / 4;
}
@@ -781,6 +838,10 @@ static int rcu_watching_snap_save(struct rcu_data *rdp)
return 0;
}
+#ifndef arch_irq_stat_cpu
+#define arch_irq_stat_cpu(cpu) 0
+#endif
+
/*
* Returns positive if the specified CPU has passed through a quiescent state
* by virtue of being in or having passed through an dynticks idle state since
@@ -916,9 +977,9 @@ static int rcu_watching_snap_recheck(struct rcu_data *rdp)
rsrp->cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu);
rsrp->cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu);
rsrp->cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
- rsrp->nr_hardirqs = kstat_cpu_irqs_sum(rdp->cpu);
- rsrp->nr_softirqs = kstat_cpu_softirqs_sum(rdp->cpu);
- rsrp->nr_csw = nr_context_switches_cpu(rdp->cpu);
+ rsrp->nr_hardirqs = kstat_cpu_irqs_sum(cpu) + arch_irq_stat_cpu(cpu);
+ rsrp->nr_softirqs = kstat_cpu_softirqs_sum(cpu);
+ rsrp->nr_csw = nr_context_switches_cpu(cpu);
rsrp->jiffies = jiffies;
rsrp->gp_seq = rdp->gp_seq;
}
@@ -1040,38 +1101,6 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
return needmore;
}
-static void swake_up_one_online_ipi(void *arg)
-{
- struct swait_queue_head *wqh = arg;
-
- swake_up_one(wqh);
-}
-
-static void swake_up_one_online(struct swait_queue_head *wqh)
-{
- int cpu = get_cpu();
-
- /*
- * If called from rcutree_report_cpu_starting(), wake up
- * is dangerous that late in the CPU-down hotplug process. The
- * scheduler might queue an ignored hrtimer. Defer the wake up
- * to an online CPU instead.
- */
- if (unlikely(cpu_is_offline(cpu))) {
- int target;
-
- target = cpumask_any_and(housekeeping_cpumask(HK_TYPE_RCU),
- cpu_online_mask);
-
- smp_call_function_single(target, swake_up_one_online_ipi,
- wqh, 0);
- put_cpu();
- } else {
- put_cpu();
- swake_up_one(wqh);
- }
-}
-
/*
* Awaken the grace-period kthread. Don't do a self-awaken (unless in an
* interrupt or softirq handler, in which case we just might immediately
@@ -1096,7 +1125,7 @@ static void rcu_gp_kthread_wake(void)
return;
WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
- swake_up_one_online(&rcu_state.gp_wq);
+ swake_up_one(&rcu_state.gp_wq);
}
/*
@@ -1254,7 +1283,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
/* Handle the ends of any preceding grace periods first. */
if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) ||
- unlikely(READ_ONCE(rdp->gpwrap))) {
+ unlikely(rdp->gpwrap)) {
if (!offloaded)
ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */
rdp->core_needs_qs = false;
@@ -1268,7 +1297,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
/* Now handle the beginnings of any new-to-this-CPU grace periods. */
if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) ||
- unlikely(READ_ONCE(rdp->gpwrap))) {
+ unlikely(rdp->gpwrap)) {
/*
* If the current grace period is waiting for this CPU,
* set up to detect a quiescent state, otherwise don't
@@ -1283,7 +1312,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */
if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap)
WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
- if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap))
+ if (IS_ENABLED(CONFIG_PROVE_RCU) && rdp->gpwrap)
WRITE_ONCE(rdp->last_sched_clock, jiffies);
WRITE_ONCE(rdp->gpwrap, false);
rcu_gpnum_ovf(rnp, rdp);
@@ -1603,8 +1632,10 @@ static void rcu_sr_put_wait_head(struct llist_node *node)
atomic_set_release(&sr_wn->inuse, 0);
}
-/* Disabled by default. */
-static int rcu_normal_wake_from_gp;
+/* Enable rcu_normal_wake_from_gp automatically on small systems. */
+#define WAKE_FROM_GP_CPU_THRESHOLD 16
+
+static int rcu_normal_wake_from_gp = -1;
module_param(rcu_normal_wake_from_gp, int, 0644);
static struct workqueue_struct *sync_wq;
@@ -1612,12 +1643,10 @@ static void rcu_sr_normal_complete(struct llist_node *node)
{
struct rcu_synchronize *rs = container_of(
(struct rcu_head *) node, struct rcu_synchronize, head);
- unsigned long oldstate = (unsigned long) rs->head.func;
WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) &&
- !poll_state_synchronize_rcu(oldstate),
- "A full grace period is not passed yet: %lu",
- rcu_seq_diff(get_state_synchronize_rcu(), oldstate));
+ !poll_state_synchronize_rcu_full(&rs->oldstate),
+ "A full grace period is not passed yet!\n");
/* Finally. */
complete(&rs->completion);
@@ -1780,6 +1809,7 @@ static noinline_for_stack bool rcu_gp_init(void)
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root();
bool start_new_poll;
+ unsigned long old_gp_seq;
WRITE_ONCE(rcu_state.gp_activity, jiffies);
raw_spin_lock_irq_rcu_node(rnp);
@@ -1801,10 +1831,31 @@ static noinline_for_stack bool rcu_gp_init(void)
/* Advance to a new grace period and initialize state. */
record_gp_stall_check_time();
+ /*
+ * A new wait segment must be started before gp_seq advanced, so
+ * that previous gp waiters won't observe the new gp_seq.
+ */
+ start_new_poll = rcu_sr_normal_gp_init();
/* Record GP times before starting GP, hence rcu_seq_start(). */
+ old_gp_seq = rcu_state.gp_seq;
+ /*
+ * Critical ordering: rcu_seq_start() must happen BEFORE the CPU hotplug
+ * scan below. Otherwise we risk a race where a newly onlining CPU could
+ * be missed by the current grace period, potentially leading to
+ * use-after-free errors. For a detailed explanation of this race, see
+ * Documentation/RCU/Design/Requirements/Requirements.rst in the
+ * "Hotplug CPU" section.
+ *
+ * Also note that the root rnp's gp_seq is kept separate from, and lags,
+ * the rcu_state's gp_seq, for a reason. See the Quick-Quiz on
+ * Single-node systems for more details (in Data-Structures.rst).
+ */
rcu_seq_start(&rcu_state.gp_seq);
+ /* Ensure that rcu_seq_done_exact() guardband doesn't give false positives. */
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) &&
+ rcu_seq_done_exact(&old_gp_seq, rcu_seq_snap(&rcu_state.gp_seq)));
+
ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
- start_new_poll = rcu_sr_normal_gp_init();
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap);
raw_spin_unlock_irq_rcu_node(rnp);
@@ -1835,6 +1886,10 @@ static noinline_for_stack bool rcu_gp_init(void)
/* Exclude CPU hotplug operations. */
rcu_for_each_leaf_node(rnp) {
local_irq_disable();
+ /*
+ * Serialize with CPU offline. See Requirements.rst > Hotplug CPU >
+ * Concurrent Quiescent State Reporting for Offline CPUs.
+ */
arch_spin_lock(&rcu_state.ofl_lock);
raw_spin_lock_rcu_node(rnp);
if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
@@ -1909,7 +1964,12 @@ static noinline_for_stack bool rcu_gp_init(void)
trace_rcu_grace_period_init(rcu_state.name, rnp->gp_seq,
rnp->level, rnp->grplo,
rnp->grphi, rnp->qsmask);
- /* Quiescent states for tasks on any now-offline CPUs. */
+ /*
+ * Quiescent states for tasks on any now-offline CPUs. Since we
+ * released the ofl and rnp lock before this loop, CPUs might
+ * have gone offline and we have to report QS on their behalf.
+ * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting.
+ */
mask = rnp->qsmask & ~rnp->qsmaskinitnext;
rnp->rcu_gp_init_mask = mask;
if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp))
@@ -1924,6 +1984,17 @@ static noinline_for_stack bool rcu_gp_init(void)
if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
on_each_cpu(rcu_strict_gp_boundary, NULL, 0);
+ /*
+ * Immediately report QS for the GP kthread's CPU. The GP kthread
+ * cannot be in an RCU read-side critical section while running
+ * the FQS scan. This eliminates the need for a second FQS wait
+ * when all CPUs are idle.
+ */
+ preempt_disable();
+ rcu_qs();
+ rcu_report_qs_rdp(this_cpu_ptr(&rcu_data));
+ preempt_enable();
+
return true;
}
@@ -2637,10 +2708,8 @@ void rcu_sched_clock_irq(int user)
/* The load-acquire pairs with the store-release setting to true. */
if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
/* Idle and userspace execution already are quiescent states. */
- if (!rcu_is_cpu_rrupt_from_idle() && !user) {
- set_tsk_need_resched(current);
- set_preempt_need_resched();
- }
+ if (!rcu_is_cpu_rrupt_from_idle() && !user)
+ set_need_resched_current();
__this_cpu_write(rcu_data.rcu_urgent_qs, false);
}
rcu_flavor_sched_clock_irq(user);
@@ -2765,7 +2834,6 @@ static void strict_work_handler(struct work_struct *work)
/* Perform RCU core processing work for the current CPU. */
static __latent_entropy void rcu_core(void)
{
- unsigned long flags;
struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
@@ -2778,8 +2846,8 @@ static __latent_entropy void rcu_core(void)
if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) {
rcu_preempt_deferred_qs(current);
} else if (rcu_preempt_need_deferred_qs(current)) {
- set_tsk_need_resched(current);
- set_preempt_need_resched();
+ guard(irqsave)();
+ set_need_resched_current();
}
/* Update RCU state based on any recent quiescent states. */
@@ -2788,10 +2856,9 @@ static __latent_entropy void rcu_core(void)
/* No grace period and unregistered callbacks? */
if (!rcu_gp_in_progress() &&
rcu_segcblist_is_enabled(&rdp->cblist) && !rcu_rdp_is_offloaded(rdp)) {
- local_irq_save(flags);
+ guard(irqsave)();
if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
rcu_accelerate_cbs_unlocked(rnp, rdp);
- local_irq_restore(flags);
}
rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
@@ -2931,13 +2998,8 @@ static int __init rcu_spawn_core_kthreads(void)
static void rcutree_enqueue(struct rcu_data *rdp, struct rcu_head *head, rcu_callback_t func)
{
rcu_segcblist_enqueue(&rdp->cblist, head);
- if (__is_kvfree_rcu_offset((unsigned long)func))
- trace_rcu_kvfree_callback(rcu_state.name, head,
- (unsigned long)func,
- rcu_segcblist_n_cbs(&rdp->cblist));
- else
- trace_rcu_callback(rcu_state.name, head,
- rcu_segcblist_n_cbs(&rdp->cblist));
+ trace_rcu_callback(rcu_state.name, head,
+ rcu_segcblist_n_cbs(&rdp->cblist));
trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued"));
}
@@ -3047,6 +3109,10 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
/* Misaligned rcu_head! */
WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
+ /* Avoid NULL dereference if callback is NULL. */
+ if (WARN_ON_ONCE(!func))
+ return;
+
if (debug_rcu_head_queue(head)) {
/*
* Probable double call_rcu(), so leak the callback.
@@ -3107,7 +3173,7 @@ module_param(enable_rcu_lazy, bool, 0444);
* critical sections have completed.
*
* Use this API instead of call_rcu() if you don't want the callback to be
- * invoked after very long periods of time, which can happen on systems without
+ * delayed for very long periods of time, which can happen on systems without
* memory pressure and on systems which are lightly loaded or mostly idle.
* This function will cause callbacks to be invoked sooner than later at the
* expense of extra power. Other than that, this function is identical to, and
@@ -3138,6 +3204,12 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry);
* might well execute concurrently with RCU read-side critical sections
* that started after call_rcu() was invoked.
*
+ * It is perfectly legal to repost an RCU callback, potentially with
+ * a different callback function, from within its callback function.
+ * The specified function will be invoked after another full grace period
+ * has elapsed. This use case is similar in form to the common practice
+ * of reposting a timer from within its own handler.
+ *
* RCU read-side critical sections are delimited by rcu_read_lock()
* and rcu_read_unlock(), and may be nested. In addition, but only in
* v5.0 and later, regions of code across which interrupts, preemption,
@@ -3166,6 +3238,13 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry);
*
* Implementation of these memory-ordering guarantees is described here:
* Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
+ *
+ * Specific to call_rcu() (as opposed to the other call_rcu*() functions),
+ * in kernels built with CONFIG_RCU_LAZY=y, call_rcu() might delay for many
+ * seconds before starting the grace period needed by the corresponding
+ * callback. This delay can significantly improve energy-efficiency
+ * on low-utilization battery-powered devices. To avoid this delay,
+ * in latency-sensitive kernel code, use call_rcu_hurry().
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
@@ -3201,7 +3280,7 @@ static void synchronize_rcu_normal(void)
trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("request"));
- if (!READ_ONCE(rcu_normal_wake_from_gp)) {
+ if (READ_ONCE(rcu_normal_wake_from_gp) < 1) {
wait_rcu_gp(call_rcu_hurry);
goto trace_complete_out;
}
@@ -3214,7 +3293,7 @@ static void synchronize_rcu_normal(void)
* snapshot before adding a request.
*/
if (IS_ENABLED(CONFIG_PROVE_RCU))
- rs.head.func = (void *) get_state_synchronize_rcu();
+ get_state_synchronize_rcu_full(&rs.oldstate);
rcu_sr_normal_add_req(&rs);
@@ -3357,14 +3436,17 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
*/
void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
{
- struct rcu_node *rnp = rcu_get_root();
-
/*
* Any prior manipulation of RCU-protected data must happen
* before the loads from ->gp_seq and ->expedited_sequence.
*/
smp_mb(); /* ^^^ */
- rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq);
+
+ // Yes, rcu_state.gp_seq, not rnp_root->gp_seq, the latter's use
+ // in poll_state_synchronize_rcu_full() notwithstanding. Use of
+ // the latter here would result in too-short grace periods due to
+ // interactions with newly onlined CPUs.
+ rgosp->rgos_norm = rcu_seq_snap(&rcu_state.gp_seq);
rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence);
}
EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full);
@@ -3699,7 +3781,7 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
}
rcu_nocb_unlock(rdp);
if (wake_nocb)
- wake_nocb_gp(rdp, false);
+ wake_nocb_gp(rdp);
smp_store_release(&rdp->barrier_seq_snap, gseq);
}
@@ -3726,6 +3808,11 @@ static void rcu_barrier_handler(void *cpu_in)
* to complete. For example, if there are no RCU callbacks queued anywhere
* in the system, then rcu_barrier() is within its rights to return
* immediately, without waiting for anything, much less an RCU grace period.
+ * In fact, rcu_barrier() will normally not result in any RCU grace periods
+ * beyond those that were already destined to be executed.
+ *
+ * In kernels built with CONFIG_RCU_LAZY=y, this function also hurries all
+ * pending lazy RCU callbacks.
*/
void rcu_barrier(void)
{
@@ -3942,7 +4029,7 @@ bool rcu_cpu_online(int cpu)
* RCU on an offline processor during initial boot, hence the check for
* rcu_scheduler_fully_active.
*/
-bool rcu_lockdep_current_cpu_online(void)
+bool notrace rcu_lockdep_current_cpu_online(void)
{
struct rcu_data *rdp;
bool ret = false;
@@ -4188,6 +4275,8 @@ int rcutree_prepare_cpu(unsigned int cpu)
rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+ rcu_preempt_deferred_qs_init(rdp);
rcu_spawn_rnp_kthreads(rnp);
rcu_spawn_cpu_nocb_kthread(cpu);
ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus);
@@ -4223,7 +4312,6 @@ int rcutree_online_cpu(unsigned int cpu)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
return 0; /* Too early in boot for scheduler work. */
- sync_sched_exp_online_cleanup(cpu);
// Stop-machine done, so allow nohz_full to disable tick.
tick_dep_clear(TICK_DEP_BIT_RCU);
@@ -4313,6 +4401,12 @@ void rcutree_report_cpu_dead(void)
* may introduce a new READ-side while it is actually off the QS masks.
*/
lockdep_assert_irqs_disabled();
+ /*
+ * CPUHP_AP_SMPCFD_DYING was the last call for rcu_exp_handler() execution.
+ * The requested QS must have been reported on the last context switch
+ * from stop machine to idle.
+ */
+ WARN_ON_ONCE(rdp->cpu_no_qs.b.exp);
// Do any dangling deferred wakeups.
do_nocb_deferred_wakeup(rdp);
@@ -4320,6 +4414,13 @@ void rcutree_report_cpu_dead(void)
/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
mask = rdp->grpmask;
+
+ /*
+ * Hold the ofl_lock and rnp lock to avoid races between CPU going
+ * offline and doing a QS report (as below), versus rcu_gp_init().
+ * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting section
+ * for more details.
+ */
arch_spin_lock(&rcu_state.ofl_lock);
raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
@@ -4330,6 +4431,7 @@ void rcutree_report_cpu_dead(void)
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
raw_spin_lock_irqsave_rcu_node(rnp, flags);
}
+ /* Clear from ->qsmaskinitnext to mark offline. */
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
arch_spin_unlock(&rcu_state.ofl_lock);
@@ -4796,12 +4898,18 @@ void __init rcu_init(void)
rcutree_online_cpu(cpu);
/* Create workqueue for Tree SRCU and for expedited GPs. */
- rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
+ rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM | WQ_PERCPU, 0);
WARN_ON(!rcu_gp_wq);
- sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM, 0);
+ sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
WARN_ON(!sync_wq);
+ /* Respect if explicitly disabled via a boot parameter. */
+ if (rcu_normal_wake_from_gp < 0) {
+ if (num_possible_cpus() <= WAKE_FROM_GP_CPU_THRESHOLD)
+ rcu_normal_wake_from_gp = 1;
+ }
+
/* Fill in default value for rcutree.qovld boot parameter. */
/* -After- the rcu_node ->lock fields are initialized! */
if (qovld < 0)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index a9a811d9d7a3..7dfc57e9adb1 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -168,12 +168,23 @@ struct rcu_snap_record {
u64 cputime_irq; /* Accumulated cputime of hard irqs */
u64 cputime_softirq;/* Accumulated cputime of soft irqs */
u64 cputime_system; /* Accumulated cputime of kernel tasks */
- unsigned long nr_hardirqs; /* Accumulated number of hard irqs */
+ u64 nr_hardirqs; /* Accumulated number of hard irqs */
unsigned int nr_softirqs; /* Accumulated number of soft irqs */
unsigned long long nr_csw; /* Accumulated number of task switches */
unsigned long jiffies; /* Track jiffies value */
};
+/*
+ * An IRQ work (deferred_qs_iw) is used by RCU to get the scheduler's attention.
+ * to report quiescent states at the soonest possible time.
+ * The request can be in one of the following states:
+ * - DEFER_QS_IDLE: An IRQ work is yet to be scheduled.
+ * - DEFER_QS_PENDING: An IRQ work was scheduled but either not yet run, or it
+ * ran and we still haven't reported a quiescent state.
+ */
+#define DEFER_QS_IDLE 0
+#define DEFER_QS_PENDING 1
+
/* Per-CPU data for read-copy update. */
struct rcu_data {
/* 1) quiescent-state and grace-period handling : */
@@ -183,6 +194,7 @@ struct rcu_data {
bool core_needs_qs; /* Core waits for quiescent state. */
bool beenonline; /* CPU online at least once. */
bool gpwrap; /* Possible ->gp_seq wrap. */
+ unsigned int gpwrap_count; /* Count of GP sequence wrap. */
bool cpu_started; /* RCU watching this onlining CPU. */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
@@ -191,7 +203,7 @@ struct rcu_data {
/* during and after the last grace */
/* period it is aware of. */
struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */
- bool defer_qs_iw_pending; /* Scheduler attention pending? */
+ int defer_qs_pending; /* irqwork or softirq pending? */
struct work_struct strict_work; /* Schedule readers for strict GPs. */
/* 2) batch handling */
@@ -289,7 +301,6 @@ struct rcu_data {
#define RCU_NOCB_WAKE_BYPASS 1
#define RCU_NOCB_WAKE_LAZY 2
#define RCU_NOCB_WAKE 3
-#define RCU_NOCB_WAKE_FORCE 4
#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
/* For jiffies_till_first_fqs and */
@@ -476,6 +487,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp);
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
static void rcu_flavor_sched_clock_irq(int user);
static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
+static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
static bool rcu_is_callbacks_kthread(struct rcu_data *rdp);
@@ -487,7 +499,7 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
static void rcu_init_one_nocb(struct rcu_node *rnp);
-static bool wake_nocb_gp(struct rcu_data *rdp, bool force);
+static bool wake_nocb_gp(struct rcu_data *rdp);
static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
unsigned long j, bool lazy);
static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 77efed89c79e..82cada459e5d 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -141,6 +141,13 @@ static void __maybe_unused sync_exp_reset_tree(void)
raw_spin_lock_irqsave_rcu_node(rnp, flags);
WARN_ON_ONCE(rnp->expmask);
WRITE_ONCE(rnp->expmask, rnp->expmaskinit);
+ /*
+ * Need to wait for any blocked tasks as well. Note that
+ * additional blocking tasks will also block the expedited GP
+ * until such time as the ->expmask bits are cleared.
+ */
+ if (rcu_is_leaf_node(rnp) && rcu_preempt_has_tasks(rnp))
+ WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
@@ -200,7 +207,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
if (rnp->parent == NULL) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (wake)
- swake_up_one_online(&rcu_state.expedited_wq);
+ swake_up_one(&rcu_state.expedited_wq);
break;
}
@@ -230,17 +237,19 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake)
* specified leaf rcu_node structure, which is acquired by the caller.
*/
static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, unsigned long flags,
- unsigned long mask, bool wake)
+ unsigned long mask_in, bool wake)
__releases(rnp->lock)
{
int cpu;
+ unsigned long mask;
struct rcu_data *rdp;
raw_lockdep_assert_held_rcu_node(rnp);
- if (!(rnp->expmask & mask)) {
+ if (!(rnp->expmask & mask_in)) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
+ mask = mask_in & rnp->expmask;
WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
rdp = per_cpu_ptr(&rcu_data, cpu);
@@ -391,13 +400,6 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp)
}
mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
- /*
- * Need to wait for any blocked tasks as well. Note that
- * additional blocking tasks will also block the expedited GP
- * until such time as the ->expmask bits are cleared.
- */
- if (rcu_preempt_has_tasks(rnp))
- WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
/* IPI the remaining CPUs for expedited quiescent state. */
@@ -587,7 +589,12 @@ static void synchronize_rcu_expedited_stall(unsigned long jiffies_start, unsigne
pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
j - jiffies_start, rcu_state.expedited_sequence, data_race(rnp_root->expmask),
".T"[!!data_race(rnp_root->exp_tasks)]);
- if (ndetected) {
+ if (!ndetected) {
+ // This is invoked from the grace-period worker, so
+ // a new grace period cannot have started. And if this
+ // worker were stalled, we would not get here. ;-)
+ pr_err("INFO: Expedited stall ended before state dump start\n");
+ } else {
pr_err("blocking rcu_node structures (internal RCU debug):");
rcu_for_each_node_breadth_first(rnp) {
if (rnp == rnp_root)
@@ -727,8 +734,7 @@ static void rcu_exp_need_qs(void)
__this_cpu_write(rcu_data.cpu_no_qs.b.exp, true);
/* Store .exp before .rcu_urgent_qs. */
smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true);
- set_tsk_need_resched(current);
- set_preempt_need_resched();
+ set_need_resched_current();
}
#ifdef CONFIG_PREEMPT_RCU
@@ -749,12 +755,8 @@ static void rcu_exp_handler(void *unused)
struct task_struct *t = current;
/*
- * First, is there no need for a quiescent state from this CPU,
- * or is this CPU already looking for a quiescent state for the
- * current grace period? If either is the case, just leave.
- * However, this should not happen due to the preemptible
- * sync_sched_exp_online_cleanup() implementation being a no-op,
- * so warn if this does happen.
+ * WARN if the CPU is unexpectedly already looking for a
+ * QS or has already reported one.
*/
ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp);
if (WARN_ON_ONCE(!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
@@ -801,11 +803,6 @@ static void rcu_exp_handler(void *unused)
WARN_ON_ONCE(1);
}
-/* PREEMPTION=y, so no PREEMPTION=n expedited grace period to clean up after. */
-static void sync_sched_exp_online_cleanup(int cpu)
-{
-}
-
/*
* Scan the current list of tasks blocked within RCU read-side critical
* sections, printing out the tid of each that is blocking the current
@@ -883,38 +880,6 @@ static void rcu_exp_handler(void *unused)
rcu_exp_need_qs();
}
-/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
-static void sync_sched_exp_online_cleanup(int cpu)
-{
- unsigned long flags;
- int my_cpu;
- struct rcu_data *rdp;
- int ret;
- struct rcu_node *rnp;
-
- rdp = per_cpu_ptr(&rcu_data, cpu);
- rnp = rdp->mynode;
- my_cpu = get_cpu();
- /* Quiescent state either not needed or already requested, leave. */
- if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
- READ_ONCE(rdp->cpu_no_qs.b.exp)) {
- put_cpu();
- return;
- }
- /* Quiescent state needed on current CPU, so set it up locally. */
- if (my_cpu == cpu) {
- local_irq_save(flags);
- rcu_exp_need_qs();
- local_irq_restore(flags);
- put_cpu();
- return;
- }
- /* Quiescent state needed on some other CPU, send IPI. */
- ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
- put_cpu();
- WARN_ON_ONCE(ret);
-}
-
/*
* Because preemptible RCU does not exist, we never have to check for
* tasks blocked within RCU read-side critical sections that are
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 2605dd234a13..1047b30cd46b 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -190,9 +190,18 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
init_swait_queue_head(&rnp->nocb_gp_wq[1]);
}
+/* Clear any pending deferred wakeup timer (nocb_gp_lock must be held). */
+static void nocb_defer_wakeup_cancel(struct rcu_data *rdp_gp)
+{
+ if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ timer_delete(&rdp_gp->nocb_timer);
+ }
+}
+
static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
struct rcu_data *rdp,
- bool force, unsigned long flags)
+ unsigned long flags)
__releases(rdp_gp->nocb_gp_lock)
{
bool needwake = false;
@@ -204,19 +213,16 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
return false;
}
- if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
- WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- del_timer(&rdp_gp->nocb_timer);
- }
+ nocb_defer_wakeup_cancel(rdp_gp);
- if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
+ if (READ_ONCE(rdp_gp->nocb_gp_sleep)) {
WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
needwake = true;
}
raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
if (needwake) {
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
- swake_up_one_online(&rdp_gp->nocb_gp_wq);
+ swake_up_one(&rdp_gp->nocb_gp_wq);
}
return needwake;
@@ -225,13 +231,13 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
/*
* Kick the GP kthread for this NOCB group.
*/
-static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
+static bool wake_nocb_gp(struct rcu_data *rdp)
{
unsigned long flags;
struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
- return __wake_nocb_gp(rdp_gp, rdp, force, flags);
+ return __wake_nocb_gp(rdp_gp, rdp, flags);
}
#ifdef CONFIG_RCU_LAZY
@@ -276,7 +282,7 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
* callback storms, no need to wake up too early.
*/
if (waketype == RCU_NOCB_WAKE_LAZY &&
- rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) {
+ rdp_gp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) {
mod_timer(&rdp_gp->nocb_timer, jiffies + rcu_get_jiffies_lazy_flush());
WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
} else if (waketype == RCU_NOCB_WAKE_BYPASS) {
@@ -373,6 +379,38 @@ static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
}
/*
+ * Determine if the bypass queue needs to be flushed based on time and size.
+ * For lazy-only bypass queues, use the lazy flush timeout; otherwise flush
+ * based on jiffy advancement. The flush_faster controls flush aggressiveness.
+ */
+static bool nocb_bypass_needs_flush(struct rcu_data *rdp, long bypass_ncbs,
+ long lazy_ncbs, unsigned long j,
+ bool flush_faster)
+{
+ bool bypass_is_lazy;
+ unsigned long bypass_first;
+ unsigned long flush_timeout;
+ long qhimark_thresh;
+
+ if (!bypass_ncbs)
+ return false;
+
+ qhimark_thresh = flush_faster ? qhimark : 2 * qhimark;
+ if (bypass_ncbs >= qhimark_thresh)
+ return true;
+
+ bypass_first = READ_ONCE(rdp->nocb_bypass_first);
+ bypass_is_lazy = (bypass_ncbs == lazy_ncbs);
+
+ if (bypass_is_lazy)
+ flush_timeout = rcu_get_jiffies_lazy_flush();
+ else
+ flush_timeout = flush_faster ? 0 : 1;
+
+ return time_after(j, bypass_first + flush_timeout);
+}
+
+/*
* See whether it is appropriate to use the ->nocb_bypass list in order
* to control contention on ->nocb_lock. A limited number of direct
* enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass
@@ -398,7 +436,8 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
unsigned long cur_gp_seq;
unsigned long j = jiffies;
long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- bool bypass_is_lazy = (ncbs == READ_ONCE(rdp->lazy_len));
+ long lazy_len = READ_ONCE(rdp->lazy_len);
+ bool bypass_is_lazy = (ncbs == lazy_len);
lockdep_assert_irqs_disabled();
@@ -450,10 +489,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
// If ->nocb_bypass has been used too long or is too full,
// flush ->nocb_bypass to ->cblist.
- if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) ||
- (ncbs && bypass_is_lazy &&
- (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()))) ||
- ncbs >= qhimark) {
+ if (nocb_bypass_needs_flush(rdp, ncbs, lazy_len, j, true)) {
rcu_nocb_lock(rdp);
*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
@@ -518,22 +554,17 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
}
/*
- * Awaken the no-CBs grace-period kthread if needed, either due to it
- * legitimately being asleep or due to overload conditions.
- *
- * If warranted, also wake up the kthread servicing this CPUs queues.
+ * Awaken the no-CBs grace-period kthread if needed due to it legitimately
+ * being asleep.
*/
static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
unsigned long flags)
__releases(rdp->nocb_lock)
{
long bypass_len;
- unsigned long cur_gp_seq;
- unsigned long j;
long lazy_len;
long len;
struct task_struct *t;
- struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
// If we are being polled or there is no kthread, just leave.
t = READ_ONCE(rdp->nocb_gp_kthread);
@@ -549,53 +580,26 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
lazy_len = READ_ONCE(rdp->lazy_len);
if (was_alldone) {
rdp->qlen_last_fqs_check = len;
+ rcu_nocb_unlock(rdp);
// Only lazy CBs in bypass list
if (lazy_len && bypass_len == lazy_len) {
- rcu_nocb_unlock(rdp);
wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY,
TPS("WakeLazy"));
- } else if (!irqs_disabled_flags(flags) && cpu_online(rdp->cpu)) {
+ } else if (!irqs_disabled_flags(flags)) {
/* ... if queue was empty ... */
- rcu_nocb_unlock(rdp);
- wake_nocb_gp(rdp, false);
+ wake_nocb_gp(rdp);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("WakeEmpty"));
} else {
- /*
- * Don't do the wake-up upfront on fragile paths.
- * Also offline CPUs can't call swake_up_one_online() from
- * (soft-)IRQs. Rely on the final deferred wake-up from
- * rcutree_report_cpu_dead()
- */
- rcu_nocb_unlock(rdp);
wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
TPS("WakeEmptyIsDeferred"));
}
- } else if (len > rdp->qlen_last_fqs_check + qhimark) {
- /* ... or if many callbacks queued. */
- rdp->qlen_last_fqs_check = len;
- j = jiffies;
- if (j != rdp->nocb_gp_adv_time &&
- rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
- rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
- rcu_advance_cbs_nowake(rdp->mynode, rdp);
- rdp->nocb_gp_adv_time = j;
- }
- smp_mb(); /* Enqueue before timer_pending(). */
- if ((rdp->nocb_cb_sleep ||
- !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
- !timer_pending(&rdp_gp->nocb_timer)) {
- rcu_nocb_unlock(rdp);
- wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
- TPS("WakeOvfIsDeferred"));
- } else {
- rcu_nocb_unlock(rdp);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
- }
- } else {
- rcu_nocb_unlock(rdp);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
+
+ return;
}
+
+ rcu_nocb_unlock(rdp);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
}
static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
@@ -699,15 +703,8 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
lazy_ncbs = READ_ONCE(rdp->lazy_len);
- if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) &&
- (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()) ||
- bypass_ncbs > 2 * qhimark)) {
- flush_bypass = true;
- } else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) &&
- (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
- bypass_ncbs > 2 * qhimark)) {
- flush_bypass = true;
- } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
+ flush_bypass = nocb_bypass_needs_flush(rdp, bypass_ncbs, lazy_ncbs, j, false);
+ if (!flush_bypass && !bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
rcu_nocb_unlock_irqrestore(rdp, flags);
continue; /* No callbacks here, try next. */
}
@@ -820,10 +817,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
if (rdp_toggling)
my_rdp->nocb_toggling_rdp = NULL;
- if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
- WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- del_timer(&my_rdp->nocb_timer);
- }
+ nocb_defer_wakeup_cancel(my_rdp);
WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
} else {
@@ -972,7 +966,6 @@ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
unsigned long flags)
__releases(rdp_gp->nocb_gp_lock)
{
- int ndw;
int ret;
if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) {
@@ -980,8 +973,7 @@ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
return false;
}
- ndw = rdp_gp->nocb_defer_wakeup;
- ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+ ret = __wake_nocb_gp(rdp_gp, rdp, flags);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
return ret;
@@ -991,13 +983,12 @@ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
{
unsigned long flags;
- struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
+ struct rcu_data *rdp = timer_container_of(rdp, t, nocb_timer);
WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags);
- smp_mb__after_spinlock(); /* Timer expire before wakeup. */
do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags);
}
@@ -1113,30 +1104,6 @@ static int rcu_nocb_rdp_deoffload(struct rcu_data *rdp)
return 0;
}
-int rcu_nocb_cpu_deoffload(int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- int ret = 0;
-
- cpus_read_lock();
- mutex_lock(&rcu_state.nocb_mutex);
- if (rcu_rdp_is_offloaded(rdp)) {
- if (!cpu_online(cpu)) {
- ret = rcu_nocb_rdp_deoffload(rdp);
- if (!ret)
- cpumask_clear_cpu(cpu, rcu_nocb_mask);
- } else {
- pr_info("NOCB: Cannot CB-deoffload online CPU %d\n", rdp->cpu);
- ret = -EINVAL;
- }
- }
- mutex_unlock(&rcu_state.nocb_mutex);
- cpus_read_unlock();
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
-
static bool rcu_nocb_rdp_offload_wait_cond(struct rcu_data *rdp)
{
unsigned long flags;
@@ -1152,7 +1119,6 @@ static bool rcu_nocb_rdp_offload_wait_cond(struct rcu_data *rdp)
static int rcu_nocb_rdp_offload(struct rcu_data *rdp)
{
int wake_gp;
- struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
WARN_ON_ONCE(cpu_online(rdp->cpu));
/*
@@ -1162,7 +1128,7 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp)
if (!rdp->nocb_gp_rdp)
return -EINVAL;
- if (WARN_ON_ONCE(!rdp_gp->nocb_gp_kthread))
+ if (WARN_ON_ONCE(!rdp->nocb_gp_kthread))
return -EINVAL;
pr_info("Offloading %d\n", rdp->cpu);
@@ -1172,7 +1138,7 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp)
wake_gp = rcu_nocb_queue_toggle_rdp(rdp);
if (wake_gp)
- wake_up_process(rdp_gp->nocb_gp_kthread);
+ wake_up_process(rdp->nocb_gp_kthread);
swait_event_exclusive(rdp->nocb_state_wq,
rcu_nocb_rdp_offload_wait_cond(rdp));
@@ -1182,28 +1148,52 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp)
return 0;
}
-int rcu_nocb_cpu_offload(int cpu)
+/* Common helper for CPU offload/deoffload operations. */
+static int rcu_nocb_cpu_toggle_offload(int cpu, bool offload)
{
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
int ret = 0;
cpus_read_lock();
mutex_lock(&rcu_state.nocb_mutex);
- if (!rcu_rdp_is_offloaded(rdp)) {
- if (!cpu_online(cpu)) {
- ret = rcu_nocb_rdp_offload(rdp);
- if (!ret)
- cpumask_set_cpu(cpu, rcu_nocb_mask);
- } else {
- pr_info("NOCB: Cannot CB-offload online CPU %d\n", rdp->cpu);
- ret = -EINVAL;
- }
+
+ /* Already in desired state, nothing to do. */
+ if (rcu_rdp_is_offloaded(rdp) == offload)
+ goto out_unlock;
+
+ if (cpu_online(cpu)) {
+ pr_info("NOCB: Cannot CB-%soffload online CPU %d\n",
+ offload ? "" : "de", rdp->cpu);
+ ret = -EINVAL;
+ goto out_unlock;
}
+
+ if (offload) {
+ ret = rcu_nocb_rdp_offload(rdp);
+ if (!ret)
+ cpumask_set_cpu(cpu, rcu_nocb_mask);
+ } else {
+ ret = rcu_nocb_rdp_deoffload(rdp);
+ if (!ret)
+ cpumask_clear_cpu(cpu, rcu_nocb_mask);
+ }
+
+out_unlock:
mutex_unlock(&rcu_state.nocb_mutex);
cpus_read_unlock();
-
return ret;
}
+
+int rcu_nocb_cpu_deoffload(int cpu)
+{
+ return rcu_nocb_cpu_toggle_offload(cpu, false /* de-offload */);
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
+
+int rcu_nocb_cpu_offload(int cpu)
+{
+ return rcu_nocb_cpu_toggle_offload(cpu, true /* offload */);
+}
EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
#ifdef CONFIG_RCU_LAZY
@@ -1279,7 +1269,7 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
}
rcu_nocb_try_flush_bypass(rdp, jiffies);
rcu_nocb_unlock_irqrestore(rdp, flags);
- wake_nocb_gp(rdp, false);
+ wake_nocb_gp(rdp);
sc->nr_to_scan -= _count;
count += _count;
if (sc->nr_to_scan <= 0)
@@ -1557,8 +1547,11 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
/* Dump out nocb kthread state for the specified rcu_data structure. */
static void show_rcu_nocb_state(struct rcu_data *rdp)
{
- char bufw[20];
- char bufr[20];
+ char bufd[22];
+ char bufw[45];
+ char bufr[45];
+ char bufn[22];
+ char bufb[22];
struct rcu_data *nocb_next_rdp;
struct rcu_segcblist *rsclp = &rdp->cblist;
bool waslocked;
@@ -1567,14 +1560,21 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
if (rdp->nocb_gp_rdp == rdp)
show_rcu_nocb_gp_state(rdp);
+ if (!rcu_segcblist_is_offloaded(&rdp->cblist))
+ return;
+
nocb_next_rdp = list_next_or_null_rcu(&rdp->nocb_gp_rdp->nocb_head_rdp,
&rdp->nocb_entry_rdp,
typeof(*rdp),
nocb_entry_rdp);
- sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
- sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
- pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
+ sprintf(bufd, "%ld", rsclp->seglen[RCU_DONE_TAIL]);
+ sprintf(bufw, "%ld(%ld)", rsclp->seglen[RCU_WAIT_TAIL], rsclp->gp_seq[RCU_WAIT_TAIL]);
+ sprintf(bufr, "%ld(%ld)", rsclp->seglen[RCU_NEXT_READY_TAIL],
+ rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
+ sprintf(bufn, "%ld", rsclp->seglen[RCU_NEXT_TAIL]);
+ sprintf(bufb, "%ld", rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%s%c%s%c%s%c%s%c%s q%ld %c CPU %d%s\n",
rdp->cpu, rdp->nocb_gp_rdp->cpu,
nocb_next_rdp ? nocb_next_rdp->cpu : -1,
"kK"[!!rdp->nocb_cb_kthread],
@@ -1586,12 +1586,15 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
jiffies - rdp->nocb_nobypass_last,
rdp->nocb_nobypass_count,
".D"[rcu_segcblist_ready_cbs(rsclp)],
+ rcu_segcblist_segempty(rsclp, RCU_DONE_TAIL) ? "" : bufd,
".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)],
rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw,
".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)],
rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr,
".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)],
+ rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL) ? "" : bufn,
".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
+ !rcu_cblist_n_cbs(&rdp->nocb_bypass) ? "" : bufb,
rcu_segcblist_n_cbs(&rdp->cblist),
rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1,
@@ -1651,7 +1654,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
{
}
-static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
+static bool wake_nocb_gp(struct rcu_data *rdp)
{
return false;
}
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 3600152b858e..95ad967adcf3 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -29,7 +29,7 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
(IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) ||
lockdep_is_held(&rdp->nocb_lock) ||
lockdep_is_held(&rcu_state.nocb_mutex) ||
- (!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) &&
+ ((!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) || softirq_count()) &&
rdp == this_cpu_ptr(&rcu_data)) ||
rcu_current_is_nocb_kthread(rdp)),
"Unsafe read of RCU_NOCB offloaded state"
@@ -486,13 +486,16 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
struct rcu_node *rnp;
union rcu_special special;
+ rdp = this_cpu_ptr(&rcu_data);
+ if (rdp->defer_qs_pending == DEFER_QS_PENDING)
+ rdp->defer_qs_pending = DEFER_QS_IDLE;
+
/*
* If RCU core is waiting for this CPU to exit its critical section,
* report the fact that it has exited. Because irqs are disabled,
* t->rcu_read_unlock_special cannot change.
*/
special = t->rcu_read_unlock_special;
- rdp = this_cpu_ptr(&rcu_data);
if (!special.s && !rdp->cpu_no_qs.b.exp) {
local_irq_restore(flags);
return;
@@ -534,7 +537,6 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq &&
(!empty_norm || rnp->qsmask));
empty_exp = sync_rcu_exp_done(rnp);
- smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
np = rcu_next_node_entry(t, rnp);
list_del_init(&t->rcu_node_entry);
t->rcu_blocked_node = NULL;
@@ -626,8 +628,93 @@ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
{
struct rcu_data *rdp;
+ lockdep_assert_irqs_disabled();
rdp = container_of(iwp, struct rcu_data, defer_qs_iw);
- rdp->defer_qs_iw_pending = false;
+
+ /*
+ * If the IRQ work handler happens to run in the middle of RCU read-side
+ * critical section, it could be ineffective in getting the scheduler's
+ * attention to report a deferred quiescent state (the whole point of the
+ * IRQ work). For this reason, requeue the IRQ work.
+ *
+ * Basically, we want to avoid following situation:
+ * 1. rcu_read_unlock() queues IRQ work (state -> DEFER_QS_PENDING)
+ * 2. CPU enters new rcu_read_lock()
+ * 3. IRQ work runs but cannot report QS due to rcu_preempt_depth() > 0
+ * 4. rcu_read_unlock() does not re-queue work (state still PENDING)
+ * 5. Deferred QS reporting does not happen.
+ */
+ if (rcu_preempt_depth() > 0)
+ WRITE_ONCE(rdp->defer_qs_pending, DEFER_QS_IDLE);
+}
+
+/*
+ * Check if expedited grace period processing during unlock is needed.
+ *
+ * This function determines whether expedited handling is required based on:
+ * 1. Task blocking an expedited grace period (based on a heuristic, could be
+ * false-positive, see below.)
+ * 2. CPU participating in an expedited grace period
+ * 3. Strict grace period mode requiring expedited handling
+ * 4. RCU priority deboosting needs when interrupts were disabled
+ *
+ * @t: The task being checked
+ * @rdp: The per-CPU RCU data
+ * @rnp: The RCU node for this CPU
+ * @irqs_were_disabled: Whether interrupts were disabled before rcu_read_unlock()
+ *
+ * Returns true if expedited processing of the rcu_read_unlock() is needed.
+ */
+static bool rcu_unlock_needs_exp_handling(struct task_struct *t,
+ struct rcu_data *rdp,
+ struct rcu_node *rnp,
+ bool irqs_were_disabled)
+{
+ /*
+ * Check if this task is blocking an expedited grace period. If the
+ * task was preempted within an RCU read-side critical section and is
+ * on the expedited grace period blockers list (exp_tasks), we need
+ * expedited handling to unblock the expedited GP. This is not an exact
+ * check because 't' might not be on the exp_tasks list at all - its
+ * just a fast heuristic that can be false-positive sometimes.
+ */
+ if (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks))
+ return true;
+
+ /*
+ * Check if this CPU is participating in an expedited grace period.
+ * The expmask bitmap tracks which CPUs need to check in for the
+ * current expedited GP. If our CPU's bit is set, we need expedited
+ * handling to help complete the expedited GP.
+ */
+ if (rdp->grpmask & READ_ONCE(rnp->expmask))
+ return true;
+
+ /*
+ * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, all grace periods
+ * are treated as short for testing purposes even if that means
+ * disturbing the system more. Check if either:
+ * - This CPU has not yet reported a quiescent state, or
+ * - This task was preempted within an RCU critical section
+ * In either case, require expedited handling for strict GP mode.
+ */
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
+ ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node))
+ return true;
+
+ /*
+ * RCU priority boosting case: If a task is subject to RCU priority
+ * boosting and exits an RCU read-side critical section with interrupts
+ * disabled, we need expedited handling to ensure timely deboosting.
+ * Without this, a low-priority task could incorrectly run at high
+ * real-time priority for an extended period degrading real-time
+ * responsiveness. This applies to all CONFIG_RCU_BOOST=y kernels,
+ * not just to PREEMPT_RT.
+ */
+ if (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && t->rcu_blocked_node)
+ return true;
+
+ return false;
}
/*
@@ -649,41 +736,33 @@ static void rcu_read_unlock_special(struct task_struct *t)
local_irq_save(flags);
irqs_were_disabled = irqs_disabled_flags(flags);
if (preempt_bh_were_disabled || irqs_were_disabled) {
- bool expboost; // Expedited GP in flight or possible boosting.
+ bool needs_exp; // Expedited handling needed.
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
- expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) ||
- (rdp->grpmask & READ_ONCE(rnp->expmask)) ||
- (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
- ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) ||
- (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled &&
- t->rcu_blocked_node);
+ needs_exp = rcu_unlock_needs_exp_handling(t, rdp, rnp, irqs_were_disabled);
+
// Need to defer quiescent state until everything is enabled.
- if (use_softirq && (in_hardirq() || (expboost && !irqs_were_disabled))) {
+ if (use_softirq && (in_hardirq() || (needs_exp && !irqs_were_disabled))) {
// Using softirq, safe to awaken, and either the
// wakeup is free or there is either an expedited
// GP in flight or a potential need to deboost.
- raise_softirq_irqoff(RCU_SOFTIRQ);
+ if (rdp->defer_qs_pending != DEFER_QS_PENDING) {
+ rdp->defer_qs_pending = DEFER_QS_PENDING;
+ raise_softirq_irqoff(RCU_SOFTIRQ);
+ }
} else {
// Enabling BH or preempt does reschedule, so...
// Also if no expediting and no possible deboosting,
// slow is OK. Plus nohz_full CPUs eventually get
// tick enabled.
- set_tsk_need_resched(current);
- set_preempt_need_resched();
+ set_need_resched_current();
if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
- expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) {
+ needs_exp && rdp->defer_qs_pending != DEFER_QS_PENDING &&
+ cpu_online(rdp->cpu)) {
// Get scheduler to re-evaluate and call hooks.
// If !IRQ_WORK, FQS scan will eventually IPI.
- if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
- IS_ENABLED(CONFIG_PREEMPT_RT))
- rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(
- rcu_preempt_deferred_qs_handler);
- else
- init_irq_work(&rdp->defer_qs_iw,
- rcu_preempt_deferred_qs_handler);
- rdp->defer_qs_iw_pending = true;
+ rdp->defer_qs_pending = DEFER_QS_PENDING;
irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
}
}
@@ -736,10 +815,8 @@ static void rcu_flavor_sched_clock_irq(int user)
if (rcu_preempt_depth() > 0 ||
(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) {
/* No QS, force context switch if deferred. */
- if (rcu_preempt_need_deferred_qs(t)) {
- set_tsk_need_resched(t);
- set_preempt_need_resched();
- }
+ if (rcu_preempt_need_deferred_qs(t))
+ set_need_resched_current();
} else if (rcu_preempt_need_deferred_qs(t)) {
rcu_preempt_deferred_qs(t); /* Report deferred QS. */
return;
@@ -822,6 +899,10 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
}
}
+static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp)
+{
+ rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler);
+}
#else /* #ifdef CONFIG_PREEMPT_RCU */
/*
@@ -833,8 +914,17 @@ void rcu_read_unlock_strict(void)
{
struct rcu_data *rdp;
- if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread)
+ if (irqs_disabled() || in_atomic_preempt_off() || !rcu_state.gp_kthread)
return;
+
+ /*
+ * rcu_report_qs_rdp() can only be invoked with a stable rdp and
+ * from the local CPU.
+ *
+ * The in_atomic_preempt_off() check ensures that we come here holding
+ * the last preempt_count (which will get dropped once we return to
+ * __rcu_read_unlock().
+ */
rdp = this_cpu_ptr(&rcu_data);
rdp->cpu_no_qs.b.norm = false;
rcu_report_qs_rdp(rdp);
@@ -975,13 +1065,16 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
*/
static void rcu_flavor_sched_clock_irq(int user)
{
- if (user || rcu_is_cpu_rrupt_from_idle()) {
+ if (user || rcu_is_cpu_rrupt_from_idle() ||
+ (IS_ENABLED(CONFIG_PREEMPT_COUNT) &&
+ (preempt_count() == HARDIRQ_OFFSET))) {
/*
* Get here if this CPU took its interrupt from user
- * mode or from the idle loop, and if this is not a
- * nested interrupt. In this case, the CPU is in
- * a quiescent state, so note it.
+ * mode, from the idle loop without this being a nested
+ * interrupt, or while not holding the task preempt count
+ * (with PREEMPT_COUNT=y). In this case, the CPU is in a
+ * quiescent state, so note it.
*
* No memory barrier is required here because rcu_qs()
* references only CPU-local variables that other CPUs
@@ -1009,6 +1102,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks));
}
+static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) { }
+
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
/*
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 925fcdad5dea..b67532cb8770 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -17,8 +17,59 @@
// Controlling CPU stall warnings, including delay calculation.
/* panic() on RCU Stall sysctl. */
-int sysctl_panic_on_rcu_stall __read_mostly;
-int sysctl_max_rcu_stall_to_panic __read_mostly;
+static int sysctl_panic_on_rcu_stall __read_mostly;
+static int sysctl_max_rcu_stall_to_panic __read_mostly;
+
+static const struct ctl_table rcu_stall_sysctl_table[] = {
+ {
+ .procname = "panic_on_rcu_stall",
+ .data = &sysctl_panic_on_rcu_stall,
+ .maxlen = sizeof(sysctl_panic_on_rcu_stall),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "max_rcu_stall_to_panic",
+ .data = &sysctl_max_rcu_stall_to_panic,
+ .maxlen = sizeof(sysctl_max_rcu_stall_to_panic),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
+ },
+};
+
+static int __init init_rcu_stall_sysctl(void)
+{
+ register_sysctl_init("kernel", rcu_stall_sysctl_table);
+ return 0;
+}
+
+subsys_initcall(init_rcu_stall_sysctl);
+
+#ifdef CONFIG_SYSFS
+
+static unsigned int rcu_stall_count;
+
+static ssize_t rcu_stall_count_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *page)
+{
+ return sysfs_emit(page, "%u\n", rcu_stall_count);
+}
+
+static struct kobj_attribute rcu_stall_count_attr = __ATTR_RO(rcu_stall_count);
+
+static __init int kernel_rcu_stall_sysfs_init(void)
+{
+ sysfs_add_file_to_group(kernel_kobj, &rcu_stall_count_attr.attr, NULL);
+ return 0;
+}
+
+late_initcall(kernel_rcu_stall_sysfs_init);
+
+#endif // CONFIG_SYSFS
#ifdef CONFIG_PROVE_RCU
#define RCU_STALL_DELAY_DELTA (5 * HZ)
@@ -112,6 +163,13 @@ static void panic_on_rcu_stall(void)
{
static int cpu_stall;
+ /*
+ * Attempt to kick out the BPF scheduler if it's installed and defer
+ * the panic to give the system a chance to recover.
+ */
+ if (scx_rcu_cpu_stall())
+ return;
+
if (++cpu_stall < sysctl_max_rcu_stall_to_panic)
return;
@@ -435,8 +493,8 @@ static void print_cpu_stat_info(int cpu)
rsr.cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
pr_err("\t hardirqs softirqs csw/system\n");
- pr_err("\t number: %8ld %10d %12lld\n",
- kstat_cpu_irqs_sum(cpu) - rsrp->nr_hardirqs,
+ pr_err("\t number: %8lld %10d %12lld\n",
+ kstat_cpu_irqs_sum(cpu) + arch_irq_stat_cpu(cpu) - rsrp->nr_hardirqs,
kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs,
nr_context_switches_cpu(cpu) - rsrp->nr_csw);
pr_err("\tcputime: %8lld %10lld %12lld ==> %d(ms)\n",
@@ -705,8 +763,7 @@ static void print_cpu_stall(unsigned long gp_seq, unsigned long gps)
* progress and it could be we're stuck in kernel space without context
* switches for an entirely unreasonable amount of time.
*/
- set_tsk_need_resched(current);
- set_preempt_need_resched();
+ set_need_resched_current();
}
static bool csd_lock_suppress_rcu_stall;
@@ -784,6 +841,10 @@ static void check_cpu_stall(struct rcu_data *rdp)
if (kvm_check_and_clear_guest_paused())
return;
+#ifdef CONFIG_SYSFS
+ ++rcu_stall_count;
+#endif
+
rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps);
if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) {
pr_err("INFO: %s detected stall, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name);
@@ -927,8 +988,7 @@ void show_rcu_gp_kthreads(void)
for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(&rcu_data, cpu);
cbs += data_race(READ_ONCE(rdp->n_cbs_invoked));
- if (rcu_segcblist_is_offloaded(&rdp->cblist))
- show_rcu_nocb_state(rdp);
+ show_rcu_nocb_state(rdp);
}
pr_info("RCU callbacks invoked since boot: %lu\n", cbs);
show_rcu_tasks_gp_kthreads();
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index c912b594ba98..b62735a67884 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -117,7 +117,7 @@ static bool rcu_read_lock_held_common(bool *ret)
return false;
}
-int rcu_read_lock_sched_held(void)
+int notrace rcu_read_lock_sched_held(void)
{
bool ret;
@@ -342,7 +342,7 @@ EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
* Note that rcu_read_lock() is disallowed if the CPU is either idle or
* offline from an RCU perspective, so check for those as well.
*/
-int rcu_read_lock_held(void)
+int notrace rcu_read_lock_held(void)
{
bool ret;
@@ -367,7 +367,7 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_held);
* Note that rcu_read_lock_bh() is disallowed if the CPU is either idle or
* offline from an RCU perspective, so check for those as well.
*/
-int rcu_read_lock_bh_held(void)
+int notrace rcu_read_lock_bh_held(void)
{
bool ret;
@@ -377,7 +377,7 @@ int rcu_read_lock_bh_held(void)
}
EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
-int rcu_read_lock_any_held(void)
+int notrace rcu_read_lock_any_held(void)
{
bool ret;
@@ -538,6 +538,28 @@ long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool do
EXPORT_SYMBOL_GPL(torture_sched_setaffinity);
#endif
+#if IS_ENABLED(CONFIG_TRIVIAL_PREEMPT_RCU)
+// Trivial and stupid grace-period wait. Defined here so that lockdep
+// kernels can find tasklist_lock.
+void synchronize_rcu_trivial_preempt(void)
+{
+ struct task_struct *g;
+ struct task_struct *t;
+
+ smp_mb(); // Order prior accesses before grace-period start.
+ rcu_read_lock(); // Protect task list.
+ for_each_process_thread(g, t) {
+ if (t == current)
+ continue; // Don't deadlock on ourselves!
+ // Order later rcu_read_lock() on other tasks after QS.
+ while (smp_load_acquire(&t->rcu_trivial_preempt_nesting))
+ continue;
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_trivial_preempt);
+#endif // #if IS_ENABLED(CONFIG_TRIVIAL_PREEMPT_RCU)
+
int rcu_cpu_stall_notifiers __read_mostly; // !0 = provide stall notifiers (rarely useful)
EXPORT_SYMBOL_GPL(rcu_cpu_stall_notifiers);
@@ -614,7 +636,7 @@ static void early_boot_test_call_rcu(void)
call_rcu(&head, test_callback);
early_srcu_cookie = start_poll_synchronize_srcu(&early_srcu);
call_srcu(&early_srcu, &shead, test_callback);
- rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
+ rhp = kmalloc_obj(*rhp);
if (!WARN_ON_ONCE(!rhp))
kfree_rcu(rhp, rh);
}