From 4704bd317108c94b6e2d8309f3dbb70d2015568a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 16 Nov 2020 11:18:16 +0100 Subject: list: Fix a typo at the kernel-doc markup hlist_add_behing -> hlist_add_behind Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paul E. McKenney --- include/linux/list.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index 89bdc92e75c3..f2af4b4aa4e9 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -901,7 +901,7 @@ static inline void hlist_add_before(struct hlist_node *n, } /** - * hlist_add_behing - add a new entry after the one specified + * hlist_add_behind - add a new entry after the one specified * @n: new entry to be added * @prev: hlist node to add it after, which must be non-NULL */ -- cgit v1.2.3 From 5130b8fd06901c1b3a4bd0d0f5c5ea99b2b0a6f0 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Fri, 20 Nov 2020 12:49:16 +0100 Subject: rcu: Introduce kfree_rcu() single-argument macro There is a kvfree_rcu() single argument macro that handles pointers returned by kvmalloc(). Even though it also handles pointer returned by kmalloc(), readability suffers. This commit therefore updates the kfree_rcu() macro to explicitly pair with kmalloc(), thus improving readability. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index de0826411311..b95373ea3b02 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -851,8 +851,9 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) /** * kfree_rcu() - kfree an object after a grace period. - * @ptr: pointer to kfree - * @rhf: the name of the struct rcu_head within the type of @ptr. + * @ptr: pointer to kfree for both single- and double-argument invocations. + * @rhf: the name of the struct rcu_head within the type of @ptr, + * but only for double-argument invocations. * * Many rcu callbacks functions just call kfree() on the base structure. * These functions are trivial, but their size adds up, and furthermore @@ -875,13 +876,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * The BUILD_BUG_ON check must not involve any function calls, hence the * checks are done in macros here. */ -#define kfree_rcu(ptr, rhf) \ -do { \ - typeof (ptr) ___p = (ptr); \ - \ - if (___p) \ - __kvfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \ -} while (0) +#define kfree_rcu kvfree_rcu /** * kvfree_rcu() - kvfree an object after a grace period. @@ -913,7 +908,14 @@ do { \ kvfree_rcu_arg_2, kvfree_rcu_arg_1)(__VA_ARGS__) #define KVFREE_GET_MACRO(_1, _2, NAME, ...) NAME -#define kvfree_rcu_arg_2(ptr, rhf) kfree_rcu(ptr, rhf) +#define kvfree_rcu_arg_2(ptr, rhf) \ +do { \ + typeof (ptr) ___p = (ptr); \ + \ + if (___p) \ + __kvfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \ +} while (0) + #define kvfree_rcu_arg_1(ptr) \ do { \ typeof(ptr) ___p = (ptr); \ -- cgit v1.2.3 From 5ea5d1ed572cb5ac173674fe770252253d2d9e27 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Fri, 20 Nov 2020 12:49:17 +0100 Subject: rcu: Eliminate the __kvfree_rcu() macro This commit open-codes the __kvfree_rcu() macro, thus saving a few lines of code and improving readability. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index b95373ea3b02..f1576cde5951 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -840,15 +840,6 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) */ #define __is_kvfree_rcu_offset(offset) ((offset) < 4096) -/* - * Helper macro for kfree_rcu() to prevent argument-expansion eyestrain. - */ -#define __kvfree_rcu(head, offset) \ - do { \ - BUILD_BUG_ON(!__is_kvfree_rcu_offset(offset)); \ - kvfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \ - } while (0) - /** * kfree_rcu() - kfree an object after a grace period. * @ptr: pointer to kfree for both single- and double-argument invocations. @@ -866,7 +857,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * Because the functions are not allowed in the low-order 4096 bytes of * kernel virtual memory, offsets up to 4095 bytes can be accommodated. * If the offset is larger than 4095 bytes, a compile-time error will - * be generated in __kvfree_rcu(). If this error is triggered, you can + * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can * either fall back to use of call_rcu() or rearrange the structure to * position the rcu_head structure into the first 4096 bytes. * @@ -912,8 +903,11 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) do { \ typeof (ptr) ___p = (ptr); \ \ - if (___p) \ - __kvfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \ + if (___p) { \ + BUILD_BUG_ON(!__is_kvfree_rcu_offset(offsetof(typeof(*(ptr)), rhf))); \ + kvfree_call_rcu(&((___p)->rhf), (rcu_callback_t)(unsigned long) \ + (offsetof(typeof(*(ptr)), rhf))); \ + } \ } while (0) #define kvfree_rcu_arg_1(ptr) \ -- cgit v1.2.3 From 74612a07b83fc46c2b2e6f71a541d55b024ebefc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Nov 2020 16:34:09 -0800 Subject: srcu: Make Tiny SRCU use multi-bit grace-period counter There is a need for a polling interface for SRCU grace periods. This polling needs to distinguish between an SRCU instance being idle on the one hand or in the middle of a grace period on the other. This commit therefore converts the Tiny SRCU srcu_struct structure's srcu_idx from a defacto boolean to a free-running counter, using the bottom bit to indicate that a grace period is in progress. The second-from-bottom bit is thus used as the index returned by srcu_read_lock(). Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/ Reported-by: Kent Overstreet [ paulmck: Fix ->srcu_lock_nesting[] indexing per Neeraj Upadhyay. ] Reviewed-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- include/linux/srcutiny.h | 6 +++--- kernel/rcu/srcutiny.c | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 5a5a1941ca15..b8b42d0a24f1 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -15,7 +15,7 @@ struct srcu_struct { short srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */ - short srcu_idx; /* Current reader array element. */ + unsigned short srcu_idx; /* Current reader array element in bit 0x2. */ u8 srcu_gp_running; /* GP workqueue running? */ u8 srcu_gp_waiting; /* GP waiting for readers? */ struct swait_queue_head srcu_wq; @@ -59,7 +59,7 @@ static inline int __srcu_read_lock(struct srcu_struct *ssp) { int idx; - idx = READ_ONCE(ssp->srcu_idx); + idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], ssp->srcu_lock_nesting[idx] + 1); return idx; } @@ -80,7 +80,7 @@ static inline void srcu_torture_stats_print(struct srcu_struct *ssp, { int idx; - idx = READ_ONCE(ssp->srcu_idx) & 0x1; + idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1; pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n", tt, tf, idx, READ_ONCE(ssp->srcu_lock_nesting[!idx]), diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 6208c1dae5c9..5598cf6f1edc 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -124,11 +124,12 @@ void srcu_drive_gp(struct work_struct *wp) ssp->srcu_cb_head = NULL; ssp->srcu_cb_tail = &ssp->srcu_cb_head; local_irq_enable(); - idx = ssp->srcu_idx; - WRITE_ONCE(ssp->srcu_idx, !ssp->srcu_idx); + idx = (ssp->srcu_idx & 0x2) / 2; + WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx])); WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ + WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); /* Invoke the callbacks we removed above. */ while (lh) { -- cgit v1.2.3 From 8b5bd67cf6422b63ee100d76d8de8960ca2df7f0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Nov 2020 12:54:48 -0800 Subject: srcu: Provide polling interfaces for Tiny SRCU grace periods There is a need for a polling interface for SRCU grace periods, so this commit supplies get_state_synchronize_srcu(), start_poll_synchronize_srcu(), and poll_state_synchronize_srcu() for this purpose. The first can be used if future grace periods are inevitable (perhaps due to a later call_srcu() invocation), the second if future grace periods might not otherwise happen, and the third to check if a grace period has elapsed since the corresponding call to either of the first two. As with get_state_synchronize_rcu() and cond_synchronize_rcu(), the return value from either get_state_synchronize_srcu() or start_poll_synchronize_srcu() must be passed in to a later call to poll_state_synchronize_srcu(). Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/ Reported-by: Kent Overstreet [ paulmck: Add EXPORT_SYMBOL_GPL() per kernel test robot feedback. ] [ paulmck: Apply feedback from Neeraj Upadhyay. ] Link: https://lore.kernel.org/lkml/20201117004017.GA7444@paulmck-ThinkPad-P72/ Reviewed-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 ++ include/linux/srcu.h | 3 +++ include/linux/srcutiny.h | 1 + kernel/rcu/srcutiny.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 59 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index de0826411311..e09c0d87b3c3 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -33,6 +33,8 @@ #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) #define ulong2long(a) (*(long *)(&(a))) +#define USHORT_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b))) +#define USHORT_CMP_LT(a, b) (USHRT_MAX / 2 < (unsigned short)((a) - (b))) /* Exported common interfaces */ void call_rcu(struct rcu_head *head, rcu_callback_t func); diff --git a/include/linux/srcu.h b/include/linux/srcu.h index e432cc92c73d..a0895bbf71ce 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -60,6 +60,9 @@ void cleanup_srcu_struct(struct srcu_struct *ssp); int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); void synchronize_srcu(struct srcu_struct *ssp); +unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp); +unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp); +bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie); #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index b8b42d0a24f1..0e0cf4d6a72a 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -16,6 +16,7 @@ struct srcu_struct { short srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */ unsigned short srcu_idx; /* Current reader array element in bit 0x2. */ + unsigned short srcu_idx_max; /* Furthest future srcu_idx request. */ u8 srcu_gp_running; /* GP workqueue running? */ u8 srcu_gp_waiting; /* GP waiting for readers? */ struct swait_queue_head srcu_wq; diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 3bac1db85a85..26344dc6483b 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -34,6 +34,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp) ssp->srcu_gp_running = false; ssp->srcu_gp_waiting = false; ssp->srcu_idx = 0; + ssp->srcu_idx_max = 0; INIT_WORK(&ssp->srcu_work, srcu_drive_gp); INIT_LIST_HEAD(&ssp->srcu_work.entry); return 0; @@ -84,6 +85,8 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) WARN_ON(ssp->srcu_gp_waiting); WARN_ON(ssp->srcu_cb_head); WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail); + WARN_ON(ssp->srcu_idx != ssp->srcu_idx_max); + WARN_ON(ssp->srcu_idx & 0x1); } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); @@ -114,7 +117,7 @@ void srcu_drive_gp(struct work_struct *wp) struct srcu_struct *ssp; ssp = container_of(wp, struct srcu_struct, srcu_work); - if (ssp->srcu_gp_running || !READ_ONCE(ssp->srcu_cb_head)) + if (ssp->srcu_gp_running || USHORT_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) return; /* Already running or nothing to do. */ /* Remove recently arrived callbacks and wait for readers. */ @@ -147,13 +150,19 @@ void srcu_drive_gp(struct work_struct *wp) * straighten that out. */ WRITE_ONCE(ssp->srcu_gp_running, false); - if (READ_ONCE(ssp->srcu_cb_head)) + if (USHORT_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) schedule_work(&ssp->srcu_work); } EXPORT_SYMBOL_GPL(srcu_drive_gp); static void srcu_gp_start_if_needed(struct srcu_struct *ssp) { + unsigned short cookie; + + cookie = get_state_synchronize_srcu(ssp); + if (USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) + return; + WRITE_ONCE(ssp->srcu_idx_max, cookie); if (!READ_ONCE(ssp->srcu_gp_running)) { if (likely(srcu_init_done)) schedule_work(&ssp->srcu_work); @@ -196,6 +205,48 @@ void synchronize_srcu(struct srcu_struct *ssp) } EXPORT_SYMBOL_GPL(synchronize_srcu); +/* + * get_state_synchronize_srcu - Provide an end-of-grace-period cookie + */ +unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp) +{ + unsigned long ret; + + barrier(); + ret = (READ_ONCE(ssp->srcu_idx) + 3) & ~0x1; + barrier(); + return ret & USHRT_MAX; +} +EXPORT_SYMBOL_GPL(get_state_synchronize_srcu); + +/* + * start_poll_synchronize_srcu - Provide cookie and start grace period + * + * The difference between this and get_state_synchronize_srcu() is that + * this function ensures that the poll_state_synchronize_srcu() will + * eventually return the value true. + */ +unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp) +{ + unsigned long ret = get_state_synchronize_srcu(ssp); + + srcu_gp_start_if_needed(ssp); + return ret; +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); + +/* + * poll_state_synchronize_srcu - Has cookie's grace period ended? + */ +bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) +{ + bool ret = USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx), cookie); + + barrier(); + return ret; +} +EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); + /* Lockdep diagnostics. */ void __init rcu_scheduler_starting(void) { -- cgit v1.2.3 From ae5c2341ed3987bd434ed495bd4f3d8b2bc3e623 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 23 Sep 2020 11:22:09 -0400 Subject: rcu/segcblist: Add counters to segcblist datastructure Add counting of segment lengths of segmented callback list. This will be useful for a number of things such as knowing how big the ready-to-execute segment have gotten. The immediate benefit is ability to trace how the callbacks in the segmented callback list change. Also this patch remove hacks related to using donecbs's ->len field as a temporary variable to save the segmented callback list's length. This cannot be done anymore and is not needed. Also fix SRCU: The negative counting of the unsegmented list cannot be used to adjust the segmented one. To fix this, sample the unsegmented length in advance, and use it after CB execution to adjust the segmented list's length. Reviewed-by: Frederic Weisbecker Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/linux/rcu_segcblist.h | 1 + kernel/rcu/rcu_segcblist.c | 120 +++++++++++++++++++++++++++--------------- kernel/rcu/rcu_segcblist.h | 2 - kernel/rcu/srcutree.c | 5 +- 4 files changed, 82 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index b36afe7b22c9..6c01f09a6456 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -72,6 +72,7 @@ struct rcu_segcblist { #else long len; #endif + long seglen[RCU_CBLIST_NSEGS]; u8 enabled; u8 offloaded; }; diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 3cff8003f707..777780447887 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -7,10 +7,10 @@ * Authors: Paul E. McKenney */ -#include -#include +#include #include -#include +#include +#include #include "rcu_segcblist.h" @@ -88,6 +88,46 @@ static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v) #endif } +/* Get the length of a segment of the rcu_segcblist structure. */ +static long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg) +{ + return READ_ONCE(rsclp->seglen[seg]); +} + +/* Set the length of a segment of the rcu_segcblist structure. */ +static void rcu_segcblist_set_seglen(struct rcu_segcblist *rsclp, int seg, long v) +{ + WRITE_ONCE(rsclp->seglen[seg], v); +} + +/* Increase the numeric length of a segment by a specified amount. */ +static void rcu_segcblist_add_seglen(struct rcu_segcblist *rsclp, int seg, long v) +{ + WRITE_ONCE(rsclp->seglen[seg], rsclp->seglen[seg] + v); +} + +/* Move from's segment length to to's segment. */ +static void rcu_segcblist_move_seglen(struct rcu_segcblist *rsclp, int from, int to) +{ + long len; + + if (from == to) + return; + + len = rcu_segcblist_get_seglen(rsclp, from); + if (!len) + return; + + rcu_segcblist_add_seglen(rsclp, to, len); + rcu_segcblist_set_seglen(rsclp, from, 0); +} + +/* Increment segment's length. */ +static void rcu_segcblist_inc_seglen(struct rcu_segcblist *rsclp, int seg) +{ + rcu_segcblist_add_seglen(rsclp, seg, 1); +} + /* * Increase the numeric length of an rcu_segcblist structure by the * specified amount, which can be negative. This can cause the ->len @@ -179,26 +219,6 @@ void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp) rcu_segcblist_add_len(rsclp, 1); } -/* - * Exchange the numeric length of the specified rcu_segcblist structure - * with the specified value. This can cause the ->len field to disagree - * with the actual number of callbacks on the structure. This exchange is - * fully ordered with respect to the callers accesses both before and after. - */ -static long rcu_segcblist_xchg_len(struct rcu_segcblist *rsclp, long v) -{ -#ifdef CONFIG_RCU_NOCB_CPU - return atomic_long_xchg(&rsclp->len, v); -#else - long ret = rsclp->len; - - smp_mb(); /* Up to the caller! */ - WRITE_ONCE(rsclp->len, v); - smp_mb(); /* Up to the caller! */ - return ret; -#endif -} - /* * Initialize an rcu_segcblist structure. */ @@ -209,8 +229,10 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp) BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq)); BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq)); rsclp->head = NULL; - for (i = 0; i < RCU_CBLIST_NSEGS; i++) + for (i = 0; i < RCU_CBLIST_NSEGS; i++) { rsclp->tails[i] = &rsclp->head; + rcu_segcblist_set_seglen(rsclp, i, 0); + } rcu_segcblist_set_len(rsclp, 0); rsclp->enabled = 1; } @@ -306,6 +328,7 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, { rcu_segcblist_inc_len(rsclp); smp_mb(); /* Ensure counts are updated before callback is enqueued. */ + rcu_segcblist_inc_seglen(rsclp, RCU_NEXT_TAIL); rhp->next = NULL; WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp); WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], &rhp->next); @@ -334,27 +357,13 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) if (rsclp->tails[i] != rsclp->tails[i - 1]) break; + rcu_segcblist_inc_seglen(rsclp, i); WRITE_ONCE(*rsclp->tails[i], rhp); for (; i <= RCU_NEXT_TAIL; i++) WRITE_ONCE(rsclp->tails[i], &rhp->next); return true; } -/* - * Extract only the counts from the specified rcu_segcblist structure, - * and place them in the specified rcu_cblist structure. This function - * supports both callback orphaning and invocation, hence the separation - * of counts and callbacks. (Callbacks ready for invocation must be - * orphaned and adopted separately from pending callbacks, but counts - * apply to all callbacks. Locking must be used to make sure that - * both orphaned-callbacks lists are consistent.) - */ -void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - rclp->len = rcu_segcblist_xchg_len(rsclp, 0); -} - /* * Extract only those callbacks ready to be invoked from the specified * rcu_segcblist structure and place them in the specified rcu_cblist @@ -367,6 +376,7 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, if (!rcu_segcblist_ready_cbs(rsclp)) return; /* Nothing to do. */ + rclp->len = rcu_segcblist_get_seglen(rsclp, RCU_DONE_TAIL); *rclp->tail = rsclp->head; WRITE_ONCE(rsclp->head, *rsclp->tails[RCU_DONE_TAIL]); WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL); @@ -374,6 +384,7 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) WRITE_ONCE(rsclp->tails[i], &rsclp->head); + rcu_segcblist_set_seglen(rsclp, RCU_DONE_TAIL, 0); } /* @@ -390,11 +401,15 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, if (!rcu_segcblist_pend_cbs(rsclp)) return; /* Nothing to do. */ + rclp->len = 0; *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL); - for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) + for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) { + rclp->len += rcu_segcblist_get_seglen(rsclp, i); WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_DONE_TAIL]); + rcu_segcblist_set_seglen(rsclp, i, 0); + } } /* @@ -405,7 +420,6 @@ void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp) { rcu_segcblist_add_len(rsclp, rclp->len); - rclp->len = 0; } /* @@ -419,6 +433,7 @@ void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, if (!rclp->head) return; /* No callbacks to move. */ + rcu_segcblist_add_seglen(rsclp, RCU_DONE_TAIL, rclp->len); *rclp->tail = rsclp->head; WRITE_ONCE(rsclp->head, rclp->head); for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) @@ -439,6 +454,8 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, { if (!rclp->head) return; /* Nothing to do. */ + + rcu_segcblist_add_seglen(rsclp, RCU_NEXT_TAIL, rclp->len); WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rclp->head); WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail); } @@ -463,6 +480,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) break; WRITE_ONCE(rsclp->tails[RCU_DONE_TAIL], rsclp->tails[i]); + rcu_segcblist_move_seglen(rsclp, i, RCU_DONE_TAIL); } /* If no callbacks moved, nothing more need be done. */ @@ -483,6 +501,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) break; /* No more callbacks. */ WRITE_ONCE(rsclp->tails[j], rsclp->tails[i]); + rcu_segcblist_move_seglen(rsclp, i, j); rsclp->gp_seq[j] = rsclp->gp_seq[i]; } } @@ -504,7 +523,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) */ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) { - int i; + int i, j; WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) @@ -547,6 +566,10 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) if (rcu_segcblist_restempty(rsclp, i) || ++i >= RCU_NEXT_TAIL) return false; + /* Accounting: everything below i is about to get merged into i. */ + for (j = i + 1; j <= RCU_NEXT_TAIL; j++) + rcu_segcblist_move_seglen(rsclp, j, i); + /* * Merge all later callbacks, including newly arrived callbacks, * into the segment located by the for-loop above. Assign "seq" @@ -574,13 +597,24 @@ void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp, struct rcu_cblist donecbs; struct rcu_cblist pendcbs; + lockdep_assert_cpus_held(); + rcu_cblist_init(&donecbs); rcu_cblist_init(&pendcbs); - rcu_segcblist_extract_count(src_rsclp, &donecbs); + rcu_segcblist_extract_done_cbs(src_rsclp, &donecbs); rcu_segcblist_extract_pend_cbs(src_rsclp, &pendcbs); + + /* + * No need smp_mb() before setting length to 0, because CPU hotplug + * lock excludes rcu_barrier. + */ + rcu_segcblist_set_len(src_rsclp, 0); + rcu_segcblist_insert_count(dst_rsclp, &donecbs); + rcu_segcblist_insert_count(dst_rsclp, &pendcbs); rcu_segcblist_insert_done_cbs(dst_rsclp, &donecbs); rcu_segcblist_insert_pend_cbs(dst_rsclp, &pendcbs); + rcu_segcblist_init(src_rsclp); } diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 1d2d61406463..cd35c9faaf51 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -89,8 +89,6 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, struct rcu_head *rhp); bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, struct rcu_head *rhp); -void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp); void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp); void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 0f23d20d485a..79b7081143a7 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1160,6 +1160,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) */ static void srcu_invoke_callbacks(struct work_struct *work) { + long len; bool more; struct rcu_cblist ready_cbs; struct rcu_head *rhp; @@ -1182,6 +1183,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) /* We are on the job! Extract and invoke ready callbacks. */ sdp->srcu_cblist_invoking = true; rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); + len = ready_cbs.len; spin_unlock_irq_rcu_node(sdp); rhp = rcu_cblist_dequeue(&ready_cbs); for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { @@ -1190,13 +1192,14 @@ static void srcu_invoke_callbacks(struct work_struct *work) rhp->func(rhp); local_bh_enable(); } + WARN_ON_ONCE(ready_cbs.len); /* * Update counts, accelerate new callbacks, and if needed, * schedule another round of callback invocation. */ spin_lock_irq_rcu_node(sdp); - rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); + rcu_segcblist_add_len(&sdp->srcu_cblist, -len); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, rcu_seq_snap(&ssp->srcu_gp_seq)); sdp->srcu_cblist_invoking = false; -- cgit v1.2.3 From 65e560327fe68153a9ad7452d5fd3171a1927d33 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:16 +0100 Subject: rcu/nocb: Turn enabled/offload states into a common flag This commit gathers the rcu_segcblist ->enabled and ->offloaded property field into a single ->flags bitmask to avoid further proliferation of individual u8 fields in the structure. This change prepares for the state formerly known as ->offloaded state to be modified at runtime. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- include/linux/rcu_segcblist.h | 6 ++++-- kernel/rcu/rcu_segcblist.c | 6 +++--- kernel/rcu/rcu_segcblist.h | 23 +++++++++++++++++++++-- 3 files changed, 28 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 6c01f09a6456..4714b0263c76 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -63,6 +63,9 @@ struct rcu_cblist { #define RCU_NEXT_TAIL 3 #define RCU_CBLIST_NSEGS 4 +#define SEGCBLIST_ENABLED BIT(0) +#define SEGCBLIST_OFFLOADED BIT(1) + struct rcu_segcblist { struct rcu_head *head; struct rcu_head **tails[RCU_CBLIST_NSEGS]; @@ -73,8 +76,7 @@ struct rcu_segcblist { long len; #endif long seglen[RCU_CBLIST_NSEGS]; - u8 enabled; - u8 offloaded; + u8 flags; }; #define RCU_SEGCBLIST_INITIALIZER(n) \ diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 89e0dff890bf..934945d72ca5 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -246,7 +246,7 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp) rcu_segcblist_set_seglen(rsclp, i, 0); } rcu_segcblist_set_len(rsclp, 0); - rsclp->enabled = 1; + rcu_segcblist_set_flags(rsclp, SEGCBLIST_ENABLED); } /* @@ -257,7 +257,7 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) { WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); - rsclp->enabled = 0; + rcu_segcblist_clear_flags(rsclp, SEGCBLIST_ENABLED); } /* @@ -266,7 +266,7 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) */ void rcu_segcblist_offload(struct rcu_segcblist *rsclp) { - rsclp->offloaded = 1; + rcu_segcblist_set_flags(rsclp, SEGCBLIST_OFFLOADED); } /* diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 18e101de8747..ff372db09f8b 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -53,19 +53,38 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) #endif } +static inline void rcu_segcblist_set_flags(struct rcu_segcblist *rsclp, + int flags) +{ + rsclp->flags |= flags; +} + +static inline void rcu_segcblist_clear_flags(struct rcu_segcblist *rsclp, + int flags) +{ + rsclp->flags &= ~flags; +} + +static inline bool rcu_segcblist_test_flags(struct rcu_segcblist *rsclp, + int flags) +{ + return READ_ONCE(rsclp->flags) & flags; +} + /* * Is the specified rcu_segcblist enabled, for example, not corresponding * to an offline CPU? */ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) { - return rsclp->enabled; + return rcu_segcblist_test_flags(rsclp, SEGCBLIST_ENABLED); } /* Is the specified rcu_segcblist offloaded? */ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) { - return IS_ENABLED(CONFIG_RCU_NOCB_CPU) && rsclp->offloaded; + return IS_ENABLED(CONFIG_RCU_NOCB_CPU) && + rcu_segcblist_test_flags(rsclp, SEGCBLIST_OFFLOADED); } /* -- cgit v1.2.3 From 8d346d438f93b5344e99d429727ec9c2f392d4ec Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:17 +0100 Subject: rcu/nocb: Provide basic callback offloading state machine bits Offloading and de-offloading RCU callback processes must be done carefully. There must never be a time at which callback processing is disabled because the task driving the offloading or de-offloading might be preempted or otherwise stalled at that point in time, which would result in OOM due to calbacks piling up indefinitely. This implies that there will be times during which a given CPU's callbacks might be concurrently invoked by both that CPU's RCU_SOFTIRQ handler (or, equivalently, that CPU's rcuc kthread) and by that CPU's rcuo kthread. This situation could fatally confuse both rcu_barrier() and the CPU-hotplug offlining process, so these must be excluded during any concurrent-callback-invocation period. In addition, during times of concurrent callback invocation, changes to ->cblist must be protected both as needed for RCU_SOFTIRQ and as needed for the rcuo kthread. This commit therefore defines and documents the states for a state machine that coordinates offloading and deoffloading. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- include/linux/rcu_segcblist.h | 115 +++++++++++++++++++++++++++++++++++++++++- kernel/rcu/rcu_segcblist.c | 1 + kernel/rcu/rcu_segcblist.h | 12 ++++- kernel/rcu/tree.c | 3 ++ 4 files changed, 128 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 4714b0263c76..8afe886e85f1 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -63,8 +63,121 @@ struct rcu_cblist { #define RCU_NEXT_TAIL 3 #define RCU_CBLIST_NSEGS 4 + +/* + * ==NOCB Offloading state machine== + * + * + * ---------------------------------------------------------------------------- + * | SEGCBLIST_SOFTIRQ_ONLY | + * | | + * | Callbacks processed by rcu_core() from softirqs or local | + * | rcuc kthread, without holding nocb_lock. | + * ---------------------------------------------------------------------------- + * | + * v + * ---------------------------------------------------------------------------- + * | SEGCBLIST_OFFLOADED | + * | | + * | Callbacks processed by rcu_core() from softirqs or local | + * | rcuc kthread, while holding nocb_lock. Waking up CB and GP kthreads, | + * | allowing nocb_timer to be armed. | + * ---------------------------------------------------------------------------- + * | + * v + * ----------------------------------- + * | | + * v v + * --------------------------------------- ----------------------------------| + * | SEGCBLIST_OFFLOADED | | | SEGCBLIST_OFFLOADED | | + * | SEGCBLIST_KTHREAD_CB | | SEGCBLIST_KTHREAD_GP | + * | | | | + * | | | | + * | CB kthread woke up and | | GP kthread woke up and | + * | acknowledged SEGCBLIST_OFFLOADED. | | acknowledged SEGCBLIST_OFFLOADED| + * | Processes callbacks concurrently | | | + * | with rcu_core(), holding | | | + * | nocb_lock. | | | + * --------------------------------------- ----------------------------------- + * | | + * ----------------------------------- + * | + * v + * |--------------------------------------------------------------------------| + * | SEGCBLIST_OFFLOADED | | + * | SEGCBLIST_KTHREAD_CB | | + * | SEGCBLIST_KTHREAD_GP | + * | | + * | Kthreads handle callbacks holding nocb_lock, local rcu_core() stops | + * | handling callbacks. | + * ---------------------------------------------------------------------------- + */ + + + +/* + * ==NOCB De-Offloading state machine== + * + * + * |--------------------------------------------------------------------------| + * | SEGCBLIST_OFFLOADED | | + * | SEGCBLIST_KTHREAD_CB | | + * | SEGCBLIST_KTHREAD_GP | + * | | + * | CB/GP kthreads handle callbacks holding nocb_lock, local rcu_core() | + * | ignores callbacks. | + * ---------------------------------------------------------------------------- + * | + * v + * |--------------------------------------------------------------------------| + * | SEGCBLIST_KTHREAD_CB | | + * | SEGCBLIST_KTHREAD_GP | + * | | + * | CB/GP kthreads and local rcu_core() handle callbacks concurrently | + * | holding nocb_lock. Wake up CB and GP kthreads if necessary. | + * ---------------------------------------------------------------------------- + * | + * v + * ----------------------------------- + * | | + * v v + * ---------------------------------------------------------------------------| + * | | + * | SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP | + * | | | + * | GP kthread woke up and | CB kthread woke up and | + * | acknowledged the fact that | acknowledged the fact that | + * | SEGCBLIST_OFFLOADED got cleared. | SEGCBLIST_OFFLOADED got cleared. | + * | | The CB kthread goes to sleep | + * | The callbacks from the target CPU | until it ever gets re-offloaded. | + * | will be ignored from the GP kthread | | + * | loop. | | + * ---------------------------------------------------------------------------- + * | | + * ----------------------------------- + * | + * v + * ---------------------------------------------------------------------------- + * | 0 | + * | | + * | Callbacks processed by rcu_core() from softirqs or local | + * | rcuc kthread, while holding nocb_lock. Forbid nocb_timer to be armed. | + * | Flush pending nocb_timer. Flush nocb bypass callbacks. | + * ---------------------------------------------------------------------------- + * | + * v + * ---------------------------------------------------------------------------- + * | SEGCBLIST_SOFTIRQ_ONLY | + * | | + * | Callbacks processed by rcu_core() from softirqs or local | + * | rcuc kthread, without holding nocb_lock. | + * ---------------------------------------------------------------------------- + */ #define SEGCBLIST_ENABLED BIT(0) -#define SEGCBLIST_OFFLOADED BIT(1) +#define SEGCBLIST_SOFTIRQ_ONLY BIT(1) +#define SEGCBLIST_KTHREAD_CB BIT(2) +#define SEGCBLIST_KTHREAD_GP BIT(3) +#define SEGCBLIST_OFFLOADED BIT(4) struct rcu_segcblist { struct rcu_head *head; diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 934945d72ca5..7fc6362625b2 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -266,6 +266,7 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) */ void rcu_segcblist_offload(struct rcu_segcblist *rsclp) { + rcu_segcblist_clear_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY); rcu_segcblist_set_flags(rsclp, SEGCBLIST_OFFLOADED); } diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index ff372db09f8b..e05952ab9b87 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -83,8 +83,16 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) /* Is the specified rcu_segcblist offloaded? */ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) { - return IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - rcu_segcblist_test_flags(rsclp, SEGCBLIST_OFFLOADED); + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU)) { + /* + * Complete de-offloading happens only when SEGCBLIST_SOFTIRQ_ONLY + * is set. + */ + if (!rcu_segcblist_test_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY)) + return true; + } + + return false; } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8086c0467c15..7cfc2e8a2500 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -83,6 +83,9 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .dynticks_nesting = 1, .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), +#ifdef CONFIG_RCU_NOCB_CPU + .cblist.flags = SEGCBLIST_SOFTIRQ_ONLY, +#endif }; static struct rcu_state rcu_state = { .level = { &rcu_state.node[0] }, -- cgit v1.2.3 From d97b078182406c0bd0aacd36fc0a693e118e608f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:19 +0100 Subject: rcu/nocb: De-offloading CB kthread To de-offload callback processing back onto a CPU, it is necessary to clear SEGCBLIST_OFFLOAD and notify the nocb CB kthread, which will then clear its own bit flag and go to sleep to stop handling callbacks. This commit makes that change. It will also be necessary to notify the nocb GP kthread in this same way, which is the subject of a follow-on commit. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker [ paulmck: Add export per kernel test robot feedback. ] Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 + kernel/rcu/rcu_segcblist.c | 10 ++-- kernel/rcu/rcu_segcblist.h | 2 +- kernel/rcu/tree.h | 1 + kernel/rcu/tree_plugin.h | 130 ++++++++++++++++++++++++++++++++++++++------- 5 files changed, 123 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index de0826411311..40266eb418b6 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -104,8 +104,10 @@ static inline void rcu_user_exit(void) { } #ifdef CONFIG_RCU_NOCB_CPU void rcu_init_nohz(void); +int rcu_nocb_cpu_deoffload(int cpu); #else /* #ifdef CONFIG_RCU_NOCB_CPU */ static inline void rcu_init_nohz(void) { } +static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ /** diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 7fc6362625b2..7f181c9675f7 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -264,10 +264,14 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) * Mark the specified rcu_segcblist structure as offloaded. This * structure must be empty. */ -void rcu_segcblist_offload(struct rcu_segcblist *rsclp) +void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload) { - rcu_segcblist_clear_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY); - rcu_segcblist_set_flags(rsclp, SEGCBLIST_OFFLOADED); + if (offload) { + rcu_segcblist_clear_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY); + rcu_segcblist_set_flags(rsclp, SEGCBLIST_OFFLOADED); + } else { + rcu_segcblist_clear_flags(rsclp, SEGCBLIST_OFFLOADED); + } } /* diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index e05952ab9b87..28c9a5225afc 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -109,7 +109,7 @@ void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp); void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v); void rcu_segcblist_init(struct rcu_segcblist *rsclp); void rcu_segcblist_disable(struct rcu_segcblist *rsclp); -void rcu_segcblist_offload(struct rcu_segcblist *rsclp); +void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload); bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7708ed161f4a..e0deb4829847 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -201,6 +201,7 @@ struct rcu_data { /* 5) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */ + struct swait_queue_head nocb_state_wq; /* For offloading state changes */ struct task_struct *nocb_gp_kthread; raw_spinlock_t nocb_lock; /* Guard following pair of fields. */ atomic_t nocb_lock_contended; /* Contention experienced. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7e291ce0a1d6..1b870d0d2445 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2081,16 +2081,29 @@ static int rcu_nocb_gp_kthread(void *arg) return 0; } +static inline bool nocb_cb_can_run(struct rcu_data *rdp) +{ + u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB; + return rcu_segcblist_test_flags(&rdp->cblist, flags); +} + +static inline bool nocb_cb_wait_cond(struct rcu_data *rdp) +{ + return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep); +} + /* * Invoke any ready callbacks from the corresponding no-CBs CPU, * then, if there are no more, wait for more to appear. */ static void nocb_cb_wait(struct rcu_data *rdp) { + struct rcu_segcblist *cblist = &rdp->cblist; + struct rcu_node *rnp = rdp->mynode; + bool needwake_state = false; + bool needwake_gp = false; unsigned long cur_gp_seq; unsigned long flags; - bool needwake_gp = false; - struct rcu_node *rnp = rdp->mynode; local_irq_save(flags); rcu_momentary_dyntick_idle(); @@ -2100,32 +2113,50 @@ static void nocb_cb_wait(struct rcu_data *rdp) local_bh_enable(); lockdep_assert_irqs_enabled(); rcu_nocb_lock_irqsave(rdp, flags); - if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && + if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) && rcu_seq_done(&rnp->gp_seq, cur_gp_seq) && raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */ needwake_gp = rcu_advance_cbs(rdp->mynode, rdp); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ } - if (rcu_segcblist_ready_cbs(&rdp->cblist)) { - rcu_nocb_unlock_irqrestore(rdp, flags); - if (needwake_gp) - rcu_gp_kthread_wake(); - return; - } - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); WRITE_ONCE(rdp->nocb_cb_sleep, true); + + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { + if (rcu_segcblist_ready_cbs(cblist)) + WRITE_ONCE(rdp->nocb_cb_sleep, false); + } else { + /* + * De-offloading. Clear our flag and notify the de-offload worker. + * We won't touch the callbacks and keep sleeping until we ever + * get re-offloaded. + */ + WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB); + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) + needwake_state = true; + } + + if (rdp->nocb_cb_sleep) + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); + rcu_nocb_unlock_irqrestore(rdp, flags); if (needwake_gp) rcu_gp_kthread_wake(); - swait_event_interruptible_exclusive(rdp->nocb_cb_wq, - !READ_ONCE(rdp->nocb_cb_sleep)); - if (!smp_load_acquire(&rdp->nocb_cb_sleep)) { /* VVV */ + + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); + + do { + swait_event_interruptible_exclusive(rdp->nocb_cb_wq, + nocb_cb_wait_cond(rdp)); + /* ^^^ Ensure CB invocation follows _sleep test. */ - return; - } - WARN_ON(signal_pending(current)); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); + if (smp_load_acquire(&rdp->nocb_cb_sleep)) { + WARN_ON(signal_pending(current)); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); + } + } while (!nocb_cb_can_run(rdp)); } /* @@ -2187,6 +2218,67 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) do_nocb_deferred_wakeup_common(rdp); } +static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) +{ + struct rcu_segcblist *cblist = &rdp->cblist; + bool wake_cb = false; + unsigned long flags; + + printk("De-offloading %d\n", rdp->cpu); + + rcu_nocb_lock_irqsave(rdp, flags); + rcu_segcblist_offload(cblist, false); + + if (rdp->nocb_cb_sleep) { + rdp->nocb_cb_sleep = false; + wake_cb = true; + } + rcu_nocb_unlock_irqrestore(rdp, flags); + + if (wake_cb) + swake_up_one(&rdp->nocb_cb_wq); + + swait_event_exclusive(rdp->nocb_state_wq, + !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)); + + return 0; +} + +static long rcu_nocb_rdp_deoffload(void *arg) +{ + struct rcu_data *rdp = arg; + + WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); + return __rcu_nocb_rdp_deoffload(rdp); +} + +int rcu_nocb_cpu_deoffload(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + int ret = 0; + + if (rdp == rdp->nocb_gp_rdp) { + pr_info("Can't deoffload an rdp GP leader (yet)\n"); + return -EINVAL; + } + mutex_lock(&rcu_state.barrier_mutex); + cpus_read_lock(); + if (rcu_segcblist_is_offloaded(&rdp->cblist)) { + if (cpu_online(cpu)) { + ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp); + } else { + ret = __rcu_nocb_rdp_deoffload(rdp); + } + if (!ret) + cpumask_clear_cpu(cpu, rcu_nocb_mask); + } + cpus_read_unlock(); + mutex_unlock(&rcu_state.barrier_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload); + void __init rcu_init_nohz(void) { int cpu; @@ -2229,7 +2321,8 @@ void __init rcu_init_nohz(void) rdp = per_cpu_ptr(&rcu_data, cpu); if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); - rcu_segcblist_offload(&rdp->cblist); + rcu_segcblist_offload(&rdp->cblist, true); + rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB); } rcu_organize_nocb_kthreads(); } @@ -2239,6 +2332,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { init_swait_queue_head(&rdp->nocb_cb_wq); init_swait_queue_head(&rdp->nocb_gp_wq); + init_swait_queue_head(&rdp->nocb_state_wq); raw_spin_lock_init(&rdp->nocb_lock); raw_spin_lock_init(&rdp->nocb_bypass_lock); raw_spin_lock_init(&rdp->nocb_gp_lock); -- cgit v1.2.3 From 254e11efde66ca0a0ce0c99a62c377314b5984ff Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:22 +0100 Subject: rcu/nocb: Re-offload support To re-offload the callback processing off of a CPU, it is necessary to clear SEGCBLIST_SOFTIRQ_ONLY, set SEGCBLIST_OFFLOADED, and then notify both the CB and GP kthreads so that they both set their own bit flag and start processing the callbacks remotely. The re-offloading worker is then notified that it can stop the RCU_SOFTIRQ handler (or rcuc kthread, as the case may be) from processing the callbacks locally. Ordering must be carefully enforced so that the callbacks that used to be processed locally without locking will have the same ordering properties when they are invoked by the nocb CB and GP kthreads. This commit makes this change. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker [ paulmck: Export rcu_nocb_cpu_offload(). ] Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 + kernel/rcu/tree_plugin.h | 158 ++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 138 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 40266eb418b6..e0ee52e2756d 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -104,9 +104,11 @@ static inline void rcu_user_exit(void) { } #ifdef CONFIG_RCU_NOCB_CPU void rcu_init_nohz(void); +int rcu_nocb_cpu_offload(int cpu); int rcu_nocb_cpu_deoffload(int cpu); #else /* #ifdef CONFIG_RCU_NOCB_CPU */ static inline void rcu_init_nohz(void) { } +static inline int rcu_nocb_cpu_offload(int cpu) { return -EINVAL; } static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fe46e7008667..03ae1ce4790f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1928,6 +1928,20 @@ static void do_nocb_bypass_wakeup_timer(struct timer_list *t) __call_rcu_nocb_wake(rdp, true, flags); } +/* + * Check if we ignore this rdp. + * + * We check that without holding the nocb lock but + * we make sure not to miss a freshly offloaded rdp + * with the current ordering: + * + * rdp_offload_toggle() nocb_gp_enabled_cb() + * ------------------------- ---------------------------- + * WRITE flags LOCK nocb_gp_lock + * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep + * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock + * UNLOCK nocb_gp_lock READ flags + */ static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp) { u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP; @@ -1940,6 +1954,11 @@ static inline bool nocb_gp_update_state(struct rcu_data *rdp, bool *needwake_sta struct rcu_segcblist *cblist = &rdp->cblist; if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { + rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) + *needwake_state = true; + } return true; } else { /* @@ -2003,6 +2022,8 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) { rcu_nocb_unlock_irqrestore(rdp, flags); + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); continue; /* No callbacks here, try next. */ } if (bypass_ncbs) { @@ -2054,6 +2075,8 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) } if (needwake_gp) rcu_gp_kthread_wake(); + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); } my_rdp->nocb_gp_bypass = bypass; @@ -2159,6 +2182,11 @@ static void nocb_cb_wait(struct rcu_data *rdp) WRITE_ONCE(rdp->nocb_cb_sleep, true); if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) { + rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB); + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) + needwake_state = true; + } if (rcu_segcblist_ready_cbs(cblist)) WRITE_ONCE(rdp->nocb_cb_sleep, false); } else { @@ -2254,35 +2282,25 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) do_nocb_deferred_wakeup_common(rdp); } -static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) +static int rdp_offload_toggle(struct rcu_data *rdp, + bool offload, unsigned long flags) + __releases(rdp->nocb_lock) { struct rcu_segcblist *cblist = &rdp->cblist; struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - bool wake_cb = false, wake_gp = false; - unsigned long flags; - - printk("De-offloading %d\n", rdp->cpu); - - rcu_nocb_lock_irqsave(rdp, flags); - /* - * If there are still pending work offloaded, the offline - * CPU won't help much handling them. - */ - if (cpu_is_offline(rdp->cpu) && !rcu_segcblist_empty(&rdp->cblist)) { - rcu_nocb_unlock_irqrestore(rdp, flags); - return -EBUSY; - } + bool wake_gp = false; - rcu_segcblist_offload(cblist, false); + rcu_segcblist_offload(cblist, offload); - if (rdp->nocb_cb_sleep) { + if (rdp->nocb_cb_sleep) rdp->nocb_cb_sleep = false; - wake_cb = true; - } rcu_nocb_unlock_irqrestore(rdp, flags); - if (wake_cb) - swake_up_one(&rdp->nocb_cb_wq); + /* + * Ignore former value of nocb_cb_sleep and force wake up as it could + * have been spuriously set to false already. + */ + swake_up_one(&rdp->nocb_cb_wq); raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); if (rdp_gp->nocb_gp_sleep) { @@ -2294,10 +2312,32 @@ static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) if (wake_gp) wake_up_process(rdp_gp->nocb_gp_kthread); + return 0; +} + +static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) +{ + struct rcu_segcblist *cblist = &rdp->cblist; + unsigned long flags; + int ret; + + printk("De-offloading %d\n", rdp->cpu); + + rcu_nocb_lock_irqsave(rdp, flags); + /* + * If there are still pending work offloaded, the offline + * CPU won't help much handling them. + */ + if (cpu_is_offline(rdp->cpu) && !rcu_segcblist_empty(&rdp->cblist)) { + rcu_nocb_unlock_irqrestore(rdp, flags); + return -EBUSY; + } + + ret = rdp_offload_toggle(rdp, false, flags); swait_event_exclusive(rdp->nocb_state_wq, !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP)); - return 0; + return ret; } static long rcu_nocb_rdp_deoffload(void *arg) @@ -2335,6 +2375,80 @@ int rcu_nocb_cpu_deoffload(int cpu) } EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload); +static int __rcu_nocb_rdp_offload(struct rcu_data *rdp) +{ + struct rcu_segcblist *cblist = &rdp->cblist; + unsigned long flags; + int ret; + + /* + * For now we only support re-offload, ie: the rdp must have been + * offloaded on boot first. + */ + if (!rdp->nocb_gp_rdp) + return -EINVAL; + + printk("Offloading %d\n", rdp->cpu); + /* + * Can't use rcu_nocb_lock_irqsave() while we are in + * SEGCBLIST_SOFTIRQ_ONLY mode. + */ + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + /* + * We didn't take the nocb lock while working on the + * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode. + * Every modifications that have been done previously on + * rdp->cblist must be visible remotely by the nocb kthreads + * upon wake up after reading the cblist flags. + * + * The layout against nocb_lock enforces that ordering: + * + * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait() + * ------------------------- ---------------------------- + * WRITE callbacks rcu_nocb_lock() + * rcu_nocb_lock() READ flags + * WRITE flags READ callbacks + * rcu_nocb_unlock() rcu_nocb_unlock() + */ + ret = rdp_offload_toggle(rdp, true, flags); + swait_event_exclusive(rdp->nocb_state_wq, + rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) && + rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); + + return ret; +} + +static long rcu_nocb_rdp_offload(void *arg) +{ + struct rcu_data *rdp = arg; + + WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); + return __rcu_nocb_rdp_offload(rdp); +} + +int rcu_nocb_cpu_offload(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + int ret = 0; + + mutex_lock(&rcu_state.barrier_mutex); + cpus_read_lock(); + if (!rcu_segcblist_is_offloaded(&rdp->cblist)) { + if (cpu_online(cpu)) { + ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp); + } else { + ret = __rcu_nocb_rdp_offload(rdp); + } + if (!ret) + cpumask_set_cpu(cpu, rcu_nocb_mask); + } + cpus_read_unlock(); + mutex_unlock(&rcu_state.barrier_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload); + void __init rcu_init_nohz(void) { int cpu; -- cgit v1.2.3 From 43759fe5a137389e94ed6d4680c3c63c17273158 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Nov 2020 23:53:13 +0100 Subject: cpu/hotplug: Add lockdep_is_cpus_held() This commit adds a lockdep_is_cpus_held() function to verify that the proper locks are held and that various operations are running in the correct context. Signed-off-by: Frederic Weisbecker Cc: Paul E. McKenney Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Cc: Boqun Feng Signed-off-by: Paul E. McKenney --- include/linux/cpu.h | 2 ++ kernel/cpu.c | 7 +++++++ 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index d6428aaf67e7..3aaa0687e8df 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -111,6 +111,8 @@ static inline void cpu_maps_update_done(void) #endif /* CONFIG_SMP */ extern struct bus_type cpu_subsys; +extern int lockdep_is_cpus_held(void); + #ifdef CONFIG_HOTPLUG_CPU extern void cpus_write_lock(void); extern void cpus_write_unlock(void); diff --git a/kernel/cpu.c b/kernel/cpu.c index 4e11e91010e1..1b6302ecbabe 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -330,6 +330,13 @@ void lockdep_assert_cpus_held(void) percpu_rwsem_assert_held(&cpu_hotplug_lock); } +#ifdef CONFIG_LOCKDEP +int lockdep_is_cpus_held(void) +{ + return percpu_rwsem_is_held(&cpu_hotplug_lock); +} +#endif + static void lockdep_acquire_cpus_lock(void) { rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_); -- cgit v1.2.3 From dcd42591ebb8a25895b551a5297ea9c24414ba54 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:33 +0100 Subject: timer: Add timer_curr_running() This commit adds a timer_curr_running() function that verifies that the current code is running in the context of the specified timer's handler. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- include/linux/timer.h | 2 ++ kernel/time/timer.c | 13 +++++++++++++ 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index fda13c9d1256..4118a97e62fb 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -192,6 +192,8 @@ extern int try_to_del_timer_sync(struct timer_list *timer); #define del_singleshot_timer_sync(t) del_timer_sync(t) +extern bool timer_curr_running(struct timer_list *timer); + extern void init_timers(void); struct hrtimer; extern enum hrtimer_restart it_real_fn(struct hrtimer *); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 8dbc008f8942..f9b2096456e5 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1237,6 +1237,19 @@ int try_to_del_timer_sync(struct timer_list *timer) } EXPORT_SYMBOL(try_to_del_timer_sync); +bool timer_curr_running(struct timer_list *timer) +{ + int i; + + for (i = 0; i < NR_BASES; i++) { + struct timer_base *base = this_cpu_ptr(&timer_bases[i]); + if (base->running_timer == timer) + return true; + } + + return false; +} + #ifdef CONFIG_PREEMPT_RT static __init void timer_base_init_expiry_lock(struct timer_base *base) { -- cgit v1.2.3 From ae19aaafae95a5487469433e9cae4c208f8d15cd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Nov 2020 11:30:18 -0800 Subject: torture: Add fuzzed hrtimer-based sleep functions This commit adds torture_hrtimeout_ns(), torture_hrtimeout_us(), torture_hrtimeout_ms(), torture_hrtimeout_jiffies(), and torture_hrtimeout_s(), each of which uses hrtimers to block for a fuzzed time interval. These functions are intended to be used by the various torture tests to decouple wakeups from the timer wheel, thus providing more opportunity for Murphy to insert destructive race conditions. Signed-off-by: Paul E. McKenney --- include/linux/torture.h | 7 +++++ kernel/torture.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index 7f65bd1dd307..32941f8106b2 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -61,6 +61,13 @@ static inline void torture_random_init(struct torture_random_state *trsp) trsp->trs_count = 0; } +/* Definitions for high-resolution-timer sleeps. */ +int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_state *trsp); +int torture_hrtimeout_us(u32 baset_us, u32 fuzzt_ns, struct torture_random_state *trsp); +int torture_hrtimeout_ms(u32 baset_ms, u32 fuzzt_us, struct torture_random_state *trsp); +int torture_hrtimeout_jiffies(u32 baset_j, struct torture_random_state *trsp); +int torture_hrtimeout_s(u32 baset_s, u32 fuzzt_ms, struct torture_random_state *trsp); + /* Task shuffler, which causes CPUs to occasionally go idle. */ void torture_shuffle_task_register(struct task_struct *tp); int torture_shuffle_init(long shuffint); diff --git a/kernel/torture.c b/kernel/torture.c index 8562ac18d2eb..7548634e8a89 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -58,6 +58,81 @@ static int verbose; static int fullstop = FULLSTOP_RMMOD; static DEFINE_MUTEX(fullstop_mutex); +/* + * Schedule a high-resolution-timer sleep in nanoseconds, with a 32-bit + * nanosecond random fuzz. This function and its friends desynchronize + * testing from the timer wheel. + */ +int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_state *trsp) +{ + ktime_t hto = baset_ns; + + if (trsp) + hto += (torture_random(trsp) >> 3) % fuzzt_ns; + set_current_state(TASK_UNINTERRUPTIBLE); + return schedule_hrtimeout(&hto, HRTIMER_MODE_REL); +} +EXPORT_SYMBOL_GPL(torture_hrtimeout_ns); + +/* + * Schedule a high-resolution-timer sleep in microseconds, with a 32-bit + * nanosecond (not microsecond!) random fuzz. + */ +int torture_hrtimeout_us(u32 baset_us, u32 fuzzt_ns, struct torture_random_state *trsp) +{ + ktime_t baset_ns = baset_us * NSEC_PER_USEC; + + return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp); +} +EXPORT_SYMBOL_GPL(torture_hrtimeout_us); + +/* + * Schedule a high-resolution-timer sleep in milliseconds, with a 32-bit + * microsecond (not millisecond!) random fuzz. + */ +int torture_hrtimeout_ms(u32 baset_ms, u32 fuzzt_us, struct torture_random_state *trsp) +{ + ktime_t baset_ns = baset_ms * NSEC_PER_MSEC; + u32 fuzzt_ns; + + if ((u32)~0U / NSEC_PER_USEC < fuzzt_us) + fuzzt_ns = (u32)~0U; + else + fuzzt_ns = fuzzt_us * NSEC_PER_USEC; + return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp); +} +EXPORT_SYMBOL_GPL(torture_hrtimeout_ms); + +/* + * Schedule a high-resolution-timer sleep in jiffies, with an + * implied one-jiffy random fuzz. This is intended to replace calls to + * schedule_timeout_interruptible() and friends. + */ +int torture_hrtimeout_jiffies(u32 baset_j, struct torture_random_state *trsp) +{ + ktime_t baset_ns = jiffies_to_nsecs(baset_j); + + return torture_hrtimeout_ns(baset_ns, jiffies_to_nsecs(1), trsp); +} +EXPORT_SYMBOL_GPL(torture_hrtimeout_jiffies); + +/* + * Schedule a high-resolution-timer sleep in milliseconds, with a 32-bit + * millisecond (not second!) random fuzz. + */ +int torture_hrtimeout_s(u32 baset_s, u32 fuzzt_ms, struct torture_random_state *trsp) +{ + ktime_t baset_ns = baset_s * NSEC_PER_SEC; + u32 fuzzt_ns; + + if ((u32)~0U / NSEC_PER_MSEC < fuzzt_ms) + fuzzt_ns = (u32)~0U; + else + fuzzt_ns = fuzzt_ms * NSEC_PER_MSEC; + return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp); +} +EXPORT_SYMBOL_GPL(torture_hrtimeout_s); + #ifdef CONFIG_HOTPLUG_CPU /* -- cgit v1.2.3 From 8a67a20bf257ca378d6e5588fbe4382966395ac8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 Nov 2020 13:00:04 -0800 Subject: torture: Throttle VERBOSE_TOROUT_*() output This commit adds kernel boot parameters torture.verbose_sleep_frequency and torture.verbose_sleep_duration, which allow VERBOSE_TOROUT_*() output to be throttled with periodic sleeps on large systems. Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 8 ++++++++ include/linux/torture.h | 15 +++++++++++++-- kernel/torture.c | 20 ++++++++++++++++++++ 3 files changed, 41 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3244f9e04a64..18f3aa759e54 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5337,6 +5337,14 @@ are running concurrently, especially on systems with rotating-rust storage. + torture.verbose_sleep_frequency= [KNL] + Specifies how many verbose printk()s should be + emitted between each sleep. The default of zero + disables verbose-printk() sleeping. + + torture.verbose_sleep_duration= [KNL] + Duration of each verbose-printk() sleep in jiffies. + tp720= [HW,PS2] tpm_suspend_pcr=[HW,TPM] diff --git a/include/linux/torture.h b/include/linux/torture.h index 32941f8106b2..d62d13c8c69a 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -32,9 +32,20 @@ #define TOROUT_STRING(s) \ pr_alert("%s" TORTURE_FLAG " %s\n", torture_type, s) #define VERBOSE_TOROUT_STRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG " %s\n", torture_type, s); } while (0) +do { \ + if (verbose) { \ + verbose_torout_sleep(); \ + pr_alert("%s" TORTURE_FLAG " %s\n", torture_type, s); \ + } \ +} while (0) #define VERBOSE_TOROUT_ERRSTRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! %s\n", torture_type, s); } while (0) +do { \ + if (verbose) { \ + verbose_torout_sleep(); \ + pr_alert("%s" TORTURE_FLAG "!!! %s\n", torture_type, s); \ + } \ +} while (0) +void verbose_torout_sleep(void); /* Definitions for online/offline exerciser. */ typedef void torture_ofl_func(void); diff --git a/kernel/torture.c b/kernel/torture.c index 7ad5c9681580..93eeeb2b3d88 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -48,6 +48,12 @@ module_param(disable_onoff_at_boot, bool, 0444); static bool ftrace_dump_at_shutdown; module_param(ftrace_dump_at_shutdown, bool, 0444); +static int verbose_sleep_frequency; +module_param(verbose_sleep_frequency, int, 0444); + +static int verbose_sleep_duration = 1; +module_param(verbose_sleep_duration, int, 0444); + static char *torture_type; static int verbose; @@ -58,6 +64,20 @@ static int verbose; static int fullstop = FULLSTOP_RMMOD; static DEFINE_MUTEX(fullstop_mutex); +static atomic_t verbose_sleep_counter; + +/* + * Sleep if needed from VERBOSE_TOROUT*(). + */ +void verbose_torout_sleep(void) +{ + if (verbose_sleep_frequency > 0 && + verbose_sleep_duration > 0 && + !(atomic_inc_return(&verbose_sleep_counter) % verbose_sleep_frequency)) + schedule_timeout_uninterruptible(verbose_sleep_duration); +} +EXPORT_SYMBOL_GPL(verbose_torout_sleep); + /* * Schedule a high-resolution-timer sleep in nanoseconds, with a 32-bit * nanosecond random fuzz. This function and its friends desynchronize -- cgit v1.2.3 From 1afb95fee0342b8d9e05b0433e8e44a6dfd7c4a3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 19 Dec 2020 07:34:35 -0800 Subject: torture: Maintain torture-specific set of CPUs-online books The TREE01 rcutorture scenario intentionally creates confusion as to the number of available CPUs by specifying the "maxcpus=8 nr_cpus=43" kernel boot parameters. This can disable rcutorture's load shedding, which currently uses num_online_cpus(), which would count the extra 35 CPUs. However, the rcutorture guest OS will be provisioned with only 8 CPUs, which means that rcutorture will present full load even when all but one of the original 8 CPUs are offline. This can result in spurious errors due to extreme overloading of that single remaining CPU. This commit therefore keeps a separate set of books on the number of usable online CPUs, so that torture_num_online_cpus() is used for load shedding instead of num_online_cpus(). Note that initial sizing must use num_online_cpus() because torture_num_online_cpus() will return NR_CPUS until shortly after torture_onoff_init() is invoked. Reported-by: Frederic Weisbecker [ paulmck: Apply feedback from kernel test robot. ] Signed-off-by: Paul E. McKenney --- include/linux/torture.h | 5 +++++ kernel/rcu/rcutorture.c | 4 ++-- kernel/torture.c | 16 ++++++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index d62d13c8c69a..0910c5803f35 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -48,6 +48,11 @@ do { \ void verbose_torout_sleep(void); /* Definitions for online/offline exerciser. */ +#ifdef CONFIG_HOTPLUG_CPU +int torture_num_online_cpus(void); +#else /* #ifdef CONFIG_HOTPLUG_CPU */ +static inline int torture_num_online_cpus(void) { return 1; } +#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ typedef void torture_ofl_func(void); bool torture_offline(int cpu, long *n_onl_attempts, long *n_onl_successes, unsigned long *sum_offl, int *min_onl, int *max_onl); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 76c838696366..a816df4e86e0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1338,7 +1338,7 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp, struct torture_random_state *trsp) { unsigned long loops; - int noc = num_online_cpus(); + int noc = torture_num_online_cpus(); int rdrchked; int rdrchker; struct rcu_torture_reader_check *rtrcp; // Me. @@ -1658,7 +1658,7 @@ rcu_torture_reader(void *arg) torture_hrtimeout_us(500, 1000, &rand); lastsleep = jiffies + 10; } - while (num_online_cpus() < mynumonline && !torture_must_stop()) + while (torture_num_online_cpus() < mynumonline && !torture_must_stop()) schedule_timeout_interruptible(HZ / 5); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); diff --git a/kernel/torture.c b/kernel/torture.c index 507a20be6950..01e336f1e5b2 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -175,6 +175,19 @@ static unsigned long sum_online; static int min_online = -1; static int max_online; +static int torture_online_cpus = NR_CPUS; + +/* + * Some torture testing leverages confusion as to the number of online + * CPUs. This function returns the torture-testing view of this number, + * which allows torture tests to load-balance appropriately. + */ +int torture_num_online_cpus(void) +{ + return READ_ONCE(torture_online_cpus); +} +EXPORT_SYMBOL_GPL(torture_num_online_cpus); + /* * Attempt to take a CPU offline. Return false if the CPU is already * offline or if it is not subject to CPU-hotplug operations. The @@ -229,6 +242,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, *min_offl = delta; if (*max_offl < delta) *max_offl = delta; + WRITE_ONCE(torture_online_cpus, torture_online_cpus - 1); + WARN_ON_ONCE(torture_online_cpus <= 0); } return true; @@ -285,6 +300,7 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, *min_onl = delta; if (*max_onl < delta) *max_onl = delta; + WRITE_ONCE(torture_online_cpus, torture_online_cpus + 1); } return true; -- cgit v1.2.3 From 8e7f37f2aaa56b723a24f6872817cf9c6410b613 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 7 Dec 2020 17:41:02 -0800 Subject: mm: Add mem_dump_obj() to print source of memory block There are kernel facilities such as per-CPU reference counts that give error messages in generic handlers or callbacks, whose messages are unenlightening. In the case of per-CPU reference-count underflow, this is not a problem when creating a new use of this facility because in that case the bug is almost certainly in the code implementing that new use. However, trouble arises when deploying across many systems, which might exercise corner cases that were not seen during development and testing. Here, it would be really nice to get some kind of hint as to which of several uses the underflow was caused by. This commit therefore exposes a mem_dump_obj() function that takes a pointer to memory (which must still be allocated if it has been dynamically allocated) and prints available information on where that memory came from. This pointer can reference the middle of the block as well as the beginning of the block, as needed by things like RCU callback functions and timer handlers that might not know where the beginning of the memory block is. These functions and handlers can use mem_dump_obj() to print out better hints as to where the problem might lie. The information printed can depend on kernel configuration. For example, the allocation return address can be printed only for slab and slub, and even then only when the necessary debug has been enabled. For slab, build with CONFIG_DEBUG_SLAB=y, and either use sizes with ample space to the next power of two or use the SLAB_STORE_USER when creating the kmem_cache structure. For slub, build with CONFIG_SLUB_DEBUG=y and boot with slub_debug=U, or pass SLAB_STORE_USER to kmem_cache_create() if more focused use is desired. Also for slub, use CONFIG_STACKTRACE to enable printing of the allocation-time stack trace. Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrew Morton Cc: Reported-by: Andrii Nakryiko [ paulmck: Convert to printing and change names per Joonsoo Kim. ] [ paulmck: Move slab definition per Stephen Rothwell and kbuild test robot. ] [ paulmck: Handle CONFIG_MMU=n case where vmalloc() is kmalloc(). ] [ paulmck: Apply Vlastimil Babka feedback on slab.c kmem_provenance(). ] [ paulmck: Extract more info from !SLUB_DEBUG per Joonsoo Kim. ] [ paulmck: Explicitly check for small pointers per Naresh Kamboju. ] Acked-by: Joonsoo Kim Acked-by: Vlastimil Babka Tested-by: Naresh Kamboju Signed-off-by: Paul E. McKenney --- include/linux/mm.h | 2 ++ include/linux/slab.h | 2 ++ mm/slab.c | 20 ++++++++++++++ mm/slab.h | 12 +++++++++ mm/slab_common.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/slob.c | 6 +++++ mm/slub.c | 40 ++++++++++++++++++++++++++++ mm/util.c | 24 +++++++++++++++++ 8 files changed, 181 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5299b90a6c40..af7d050900e7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3169,5 +3169,7 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping, extern int sysctl_nr_trim_pages; +void mem_dump_obj(void *object); + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/slab.h b/include/linux/slab.h index be4ba5867ac5..7ae604076767 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -186,6 +186,8 @@ void kfree(const void *); void kfree_sensitive(const void *); size_t __ksize(const void *); size_t ksize(const void *); +bool kmem_valid_obj(void *object); +void kmem_dump_obj(void *object); #ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR void __check_heap_object(const void *ptr, unsigned long n, struct page *page, diff --git a/mm/slab.c b/mm/slab.c index d7c8da9319c7..dcc55e78f353 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3635,6 +3635,26 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags, EXPORT_SYMBOL(__kmalloc_node_track_caller); #endif /* CONFIG_NUMA */ +void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) +{ + struct kmem_cache *cachep; + unsigned int objnr; + void *objp; + + kpp->kp_ptr = object; + kpp->kp_page = page; + cachep = page->slab_cache; + kpp->kp_slab_cache = cachep; + objp = object - obj_offset(cachep); + kpp->kp_data_offset = obj_offset(cachep); + page = virt_to_head_page(objp); + objnr = obj_to_index(cachep, page, objp); + objp = index_to_obj(cachep, page, objnr); + kpp->kp_objp = objp; + if (DEBUG && cachep->flags & SLAB_STORE_USER) + kpp->kp_ret = *dbg_userword(cachep, objp); +} + /** * __do_kmalloc - allocate memory * @size: how many bytes of memory are required. diff --git a/mm/slab.h b/mm/slab.h index 1a756a359fa8..ecad9b57bc44 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -615,4 +615,16 @@ static inline bool slab_want_init_on_free(struct kmem_cache *c) return false; } +#define KS_ADDRS_COUNT 16 +struct kmem_obj_info { + void *kp_ptr; + struct page *kp_page; + void *kp_objp; + unsigned long kp_data_offset; + struct kmem_cache *kp_slab_cache; + void *kp_ret; + void *kp_stack[KS_ADDRS_COUNT]; +}; +void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page); + #endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index e981c80d216c..adbace4256ef 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -537,6 +537,81 @@ bool slab_is_available(void) return slab_state >= UP; } +/** + * kmem_valid_obj - does the pointer reference a valid slab object? + * @object: pointer to query. + * + * Return: %true if the pointer is to a not-yet-freed object from + * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer + * is to an already-freed object, and %false otherwise. + */ +bool kmem_valid_obj(void *object) +{ + struct page *page; + + /* Some arches consider ZERO_SIZE_PTR to be a valid address. */ + if (object < (void *)PAGE_SIZE || !virt_addr_valid(object)) + return false; + page = virt_to_head_page(object); + return PageSlab(page); +} + +/** + * kmem_dump_obj - Print available slab provenance information + * @object: slab object for which to find provenance information. + * + * This function uses pr_cont(), so that the caller is expected to have + * printed out whatever preamble is appropriate. The provenance information + * depends on the type of object and on how much debugging is enabled. + * For a slab-cache object, the fact that it is a slab object is printed, + * and, if available, the slab name, return address, and stack trace from + * the allocation of that object. + * + * This function will splat if passed a pointer to a non-slab object. + * If you are not sure what type of object you have, you should instead + * use mem_dump_obj(). + */ +void kmem_dump_obj(void *object) +{ + char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc"; + int i; + struct page *page; + unsigned long ptroffset; + struct kmem_obj_info kp = { }; + + if (WARN_ON_ONCE(!virt_addr_valid(object))) + return; + page = virt_to_head_page(object); + if (WARN_ON_ONCE(!PageSlab(page))) { + pr_cont(" non-slab memory.\n"); + return; + } + kmem_obj_info(&kp, object, page); + if (kp.kp_slab_cache) + pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name); + else + pr_cont(" slab%s", cp); + if (kp.kp_objp) + pr_cont(" start %px", kp.kp_objp); + if (kp.kp_data_offset) + pr_cont(" data offset %lu", kp.kp_data_offset); + if (kp.kp_objp) { + ptroffset = ((char *)object - (char *)kp.kp_objp) - kp.kp_data_offset; + pr_cont(" pointer offset %lu", ptroffset); + } + if (kp.kp_slab_cache && kp.kp_slab_cache->usersize) + pr_cont(" size %u", kp.kp_slab_cache->usersize); + if (kp.kp_ret) + pr_cont(" allocated at %pS\n", kp.kp_ret); + else + pr_cont("\n"); + for (i = 0; i < ARRAY_SIZE(kp.kp_stack); i++) { + if (!kp.kp_stack[i]) + break; + pr_info(" %pS\n", kp.kp_stack[i]); + } +} + #ifndef CONFIG_SLOB /* Create a cache during boot when no slab services are available yet */ void __init create_boot_cache(struct kmem_cache *s, const char *name, diff --git a/mm/slob.c b/mm/slob.c index 8d4bfa46247f..ef87ada8705d 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -461,6 +461,12 @@ out: spin_unlock_irqrestore(&slob_lock, flags); } +void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) +{ + kpp->kp_ptr = object; + kpp->kp_page = page; +} + /* * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. */ diff --git a/mm/slub.c b/mm/slub.c index 0c8b43a5b3b0..3c1a84316fd7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3919,6 +3919,46 @@ int __kmem_cache_shutdown(struct kmem_cache *s) return 0; } +void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) +{ + void *base; + int __maybe_unused i; + unsigned int objnr; + void *objp; + void *objp0; + struct kmem_cache *s = page->slab_cache; + struct track __maybe_unused *trackp; + + kpp->kp_ptr = object; + kpp->kp_page = page; + kpp->kp_slab_cache = s; + base = page_address(page); + objp0 = kasan_reset_tag(object); +#ifdef CONFIG_SLUB_DEBUG + objp = restore_red_left(s, objp0); +#else + objp = objp0; +#endif + objnr = obj_to_index(s, page, objp); + kpp->kp_data_offset = (unsigned long)((char *)objp0 - (char *)objp); + objp = base + s->size * objnr; + kpp->kp_objp = objp; + if (WARN_ON_ONCE(objp < base || objp >= base + page->objects * s->size || (objp - base) % s->size) || + !(s->flags & SLAB_STORE_USER)) + return; +#ifdef CONFIG_SLUB_DEBUG + trackp = get_track(s, objp, TRACK_ALLOC); + kpp->kp_ret = (void *)trackp->addr; +#ifdef CONFIG_STACKTRACE + for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) { + kpp->kp_stack[i] = (void *)trackp->addrs[i]; + if (!kpp->kp_stack[i]) + break; + } +#endif +#endif +} + /******************************************************************** * Kmalloc subsystem *******************************************************************/ diff --git a/mm/util.c b/mm/util.c index 8c9b7d1e7c49..da46f9d6c382 100644 --- a/mm/util.c +++ b/mm/util.c @@ -982,3 +982,27 @@ int __weak memcmp_pages(struct page *page1, struct page *page2) kunmap_atomic(addr1); return ret; } + +/** + * mem_dump_obj - Print available provenance information + * @object: object for which to find provenance information. + * + * This function uses pr_cont(), so that the caller is expected to have + * printed out whatever preamble is appropriate. The provenance information + * depends on the type of object and on how much debugging is enabled. + * For example, for a slab-cache object, the slab name is printed, and, + * if available, the return address and stack trace from the allocation + * of that object. + */ +void mem_dump_obj(void *object) +{ + if (!virt_addr_valid(object)) { + pr_cont(" non-paged (local) memory.\n"); + return; + } + if (kmem_valid_obj(object)) { + kmem_dump_obj(object); + return; + } + pr_cont(" non-slab memory.\n"); +} -- cgit v1.2.3 From 98f180837a896ecedf8f7e12af22b57f271d43c9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 Dec 2020 16:13:57 -0800 Subject: mm: Make mem_dump_obj() handle vmalloc() memory This commit adds vmalloc() support to mem_dump_obj(). Note that the vmalloc_dump_obj() function combines the checking and dumping, in contrast with the split between kmem_valid_obj() and kmem_dump_obj(). The reason for the difference is that the checking in the vmalloc() case involves acquiring a global lock, and redundant acquisitions of global locks should be avoided, even on not-so-fast paths. Note that this change causes on-stack variables to be reported as vmalloc() storage from kernel_clone() or similar, depending on the degree of inlining that your compiler does. This is likely more helpful than the earlier "non-paged (local) memory". Cc: Andrew Morton Cc: Joonsoo Kim Cc: Reported-by: Andrii Nakryiko Acked-by: Vlastimil Babka Tested-by: Naresh Kamboju Signed-off-by: Paul E. McKenney --- include/linux/vmalloc.h | 6 ++++++ mm/util.c | 14 ++++++++------ mm/vmalloc.c | 12 ++++++++++++ 3 files changed, 26 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 80c0181c411d..c18f4751a704 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -246,4 +246,10 @@ pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) int register_vmap_purge_notifier(struct notifier_block *nb); int unregister_vmap_purge_notifier(struct notifier_block *nb); +#ifdef CONFIG_MMU +bool vmalloc_dump_obj(void *object); +#else +static inline bool vmalloc_dump_obj(void *object) { return false; } +#endif + #endif /* _LINUX_VMALLOC_H */ diff --git a/mm/util.c b/mm/util.c index 92f23d2c1277..54870226cea6 100644 --- a/mm/util.c +++ b/mm/util.c @@ -996,18 +996,20 @@ int __weak memcmp_pages(struct page *page1, struct page *page2) */ void mem_dump_obj(void *object) { + if (kmem_valid_obj(object)) { + kmem_dump_obj(object); + return; + } + if (vmalloc_dump_obj(object)) + return; if (!virt_addr_valid(object)) { if (object == NULL) pr_cont(" NULL pointer.\n"); else if (object == ZERO_SIZE_PTR) pr_cont(" zero-size pointer.\n"); else - pr_cont(" non-paged (local) memory.\n"); - return; - } - if (kmem_valid_obj(object)) { - kmem_dump_obj(object); + pr_cont(" non-paged memory.\n"); return; } - pr_cont(" non-slab memory.\n"); + pr_cont(" non-slab/vmalloc memory.\n"); } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4d88fe5a277a..c274ea4f14ea 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3448,6 +3448,18 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) } #endif /* CONFIG_SMP */ +bool vmalloc_dump_obj(void *object) +{ + struct vm_struct *vm; + void *objp = (void *)PAGE_ALIGN((unsigned long)object); + + vm = find_vm_area(objp); + if (!vm) + return false; + pr_cont(" vmalloc allocated at %pS\n", vm->caller); + return true; +} + #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) __acquires(&vmap_purge_lock) -- cgit v1.2.3