diff options
Diffstat (limited to 'kernel')
58 files changed, 992 insertions, 713 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 38ef6d06888e..ce1435cb08b1 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -30,7 +30,7 @@ choice 250 Hz is a good compromise choice allowing server performance while also showing good interactive responsiveness even on SMP and NUMA systems. If you are going to be using NTSC video - or multimedia, selected 300Hz instead. + or multimedia, select 300Hz instead. config HZ_300 bool "300 HZ" diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 11ea8d24ac72..fa24c032ed6f 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -851,7 +851,7 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent if (kernfs_type(kn) != KERNFS_DIR) return -ENOTDIR; - if (kn->parent != new_parent) + if (rcu_access_pointer(kn->__parent) != new_parent) return -EIO; /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f231fe3a0744..3caf2cd86e65 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -633,9 +633,22 @@ int cgroup_task_count(const struct cgroup *cgrp) return count; } +static struct cgroup *kn_priv(struct kernfs_node *kn) +{ + struct kernfs_node *parent; + /* + * The parent can not be replaced due to KERNFS_ROOT_INVARIANT_PARENT. + * Therefore it is always safe to dereference this pointer outside of a + * RCU section. + */ + parent = rcu_dereference_check(kn->__parent, + kernfs_root_flags(kn) & KERNFS_ROOT_INVARIANT_PARENT); + return parent->priv; +} + struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { - struct cgroup *cgrp = of->kn->parent->priv; + struct cgroup *cgrp = kn_priv(of->kn); struct cftype *cft = of_cft(of); /* @@ -1612,7 +1625,7 @@ void cgroup_kn_unlock(struct kernfs_node *kn) if (kernfs_type(kn) == KERNFS_DIR) cgrp = kn->priv; else - cgrp = kn->parent->priv; + cgrp = kn_priv(kn); cgroup_unlock(); @@ -1644,7 +1657,7 @@ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline) if (kernfs_type(kn) == KERNFS_DIR) cgrp = kn->priv; else - cgrp = kn->parent->priv; + cgrp = kn_priv(kn); /* * We're gonna grab cgroup_mutex which nests outside kernfs @@ -1682,7 +1695,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) cfile->kn = NULL; spin_unlock_irq(&cgroup_file_kn_lock); - del_timer_sync(&cfile->notify_timer); + timer_delete_sync(&cfile->notify_timer); } kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); @@ -2118,7 +2131,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) root->kf_root = kernfs_create_root(kf_sops, KERNFS_ROOT_CREATE_DEACTIVATED | KERNFS_ROOT_SUPPORT_EXPORTOP | - KERNFS_ROOT_SUPPORT_USER_XATTR, + KERNFS_ROOT_SUPPORT_USER_XATTR | + KERNFS_ROOT_INVARIANT_PARENT, root_cgrp); if (IS_ERR(root->kf_root)) { ret = PTR_ERR(root->kf_root); @@ -4115,7 +4129,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup_file_ctx *ctx = of->priv; - struct cgroup *cgrp = of->kn->parent->priv; + struct cgroup *cgrp = kn_priv(of->kn); struct cftype *cft = of_cft(of); struct cgroup_subsys_state *css; int ret; @@ -5909,6 +5923,12 @@ static void kill_css(struct cgroup_subsys_state *css) if (css->flags & CSS_DYING) return; + /* + * Call css_killed(), if defined, before setting the CSS_DYING flag + */ + if (css->ss->css_killed) + css->ss->css_killed(css); + css->flags |= CSS_DYING; /* diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 976a8bc3ff60..383963e28ac6 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -33,6 +33,7 @@ enum prs_errcode { PERR_CPUSEMPTY, PERR_HKEEPING, PERR_ACCESS, + PERR_REMOTE, }; /* bits in struct cpuset flags field */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 39c1fc643d77..306b60430091 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -61,10 +61,17 @@ static const char * const perr_strings[] = { [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty", [PERR_HKEEPING] = "partition config conflicts with housekeeping setup", [PERR_ACCESS] = "Enable partition not permitted", + [PERR_REMOTE] = "Have remote partition underneath", }; /* - * Exclusive CPUs distributed out to sub-partitions of top_cpuset + * For local partitions, update to subpartitions_cpus & isolated_cpus is done + * in update_parent_effective_cpumask(). For remote partitions, it is done in + * the remote_partition_*() and remote_cpus_update() helpers. + */ +/* + * Exclusive CPUs distributed out to local or remote sub-partitions of + * top_cpuset */ static cpumask_var_t subpartitions_cpus; @@ -86,7 +93,6 @@ static struct list_head remote_children; * A flag to force sched domain rebuild at the end of an operation. * It can be set in * - update_partition_sd_lb() - * - remote_partition_check() * - update_cpumasks_hier() * - cpuset_update_flag() * - cpuset_hotplug_update_tasks() @@ -1089,9 +1095,14 @@ void cpuset_reset_sched_domains(void) * * Iterate through each task of @cs updating its cpus_allowed to the * effective cpuset's. As this function is called with cpuset_mutex held, - * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask() - * is used instead of effective_cpus to make sure all offline CPUs are also - * included as hotplug code won't update cpumasks for tasks in top_cpuset. + * cpuset membership stays stable. + * + * For top_cpuset, task_cpu_possible_mask() is used instead of effective_cpus + * to make sure all offline CPUs are also included as hotplug code won't + * update cpumasks for tasks in top_cpuset. + * + * As task_cpu_possible_mask() can be task dependent in arm64, we have to + * do cpu masking per task instead of doing it once for all. */ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) { @@ -1151,7 +1162,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, * * Return: 0 if successful, an error code otherwise */ -static int update_partition_exclusive(struct cpuset *cs, int new_prs) +static int update_partition_exclusive_flag(struct cpuset *cs, int new_prs) { bool exclusive = (new_prs > PRS_MEMBER); @@ -1234,12 +1245,12 @@ static void reset_partition_data(struct cpuset *cs) } /* - * partition_xcpus_newstate - Exclusive CPUs state change + * isolated_cpus_update - Update the isolated_cpus mask * @old_prs: old partition_root_state * @new_prs: new partition_root_state * @xcpus: exclusive CPUs with state change */ -static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus) +static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus) { WARN_ON_ONCE(old_prs == new_prs); if (new_prs == PRS_ISOLATED) @@ -1273,8 +1284,8 @@ static bool partition_xcpus_add(int new_prs, struct cpuset *parent, isolcpus_updated = (new_prs != parent->partition_root_state); if (isolcpus_updated) - partition_xcpus_newstate(parent->partition_root_state, new_prs, - xcpus); + isolated_cpus_update(parent->partition_root_state, new_prs, + xcpus); cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus); return isolcpus_updated; @@ -1304,8 +1315,8 @@ static bool partition_xcpus_del(int old_prs, struct cpuset *parent, isolcpus_updated = (old_prs != parent->partition_root_state); if (isolcpus_updated) - partition_xcpus_newstate(old_prs, parent->partition_root_state, - xcpus); + isolated_cpus_update(old_prs, parent->partition_root_state, + xcpus); cpumask_and(xcpus, xcpus, cpu_active_mask); cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus); @@ -1340,20 +1351,57 @@ EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); * compute_effective_exclusive_cpumask - compute effective exclusive CPUs * @cs: cpuset * @xcpus: effective exclusive CPUs value to be set - * Return: true if xcpus is not empty, false otherwise. + * @real_cs: the real cpuset (can be NULL) + * Return: 0 if there is no sibling conflict, > 0 otherwise * - * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set), - * it must be a subset of parent's effective_xcpus. + * If exclusive_cpus isn't explicitly set or a real_cs is provided, we have to + * scan the sibling cpusets and exclude their exclusive_cpus or effective_xcpus + * as well. The provision of real_cs means that a cpumask is being changed and + * the given cs is a trial one. */ -static bool compute_effective_exclusive_cpumask(struct cpuset *cs, - struct cpumask *xcpus) +static int compute_effective_exclusive_cpumask(struct cpuset *cs, + struct cpumask *xcpus, + struct cpuset *real_cs) { + struct cgroup_subsys_state *css; struct cpuset *parent = parent_cs(cs); + struct cpuset *sibling; + int retval = 0; if (!xcpus) xcpus = cs->effective_xcpus; - return cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus); + cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus); + + if (!real_cs) { + if (!cpumask_empty(cs->exclusive_cpus)) + return 0; + } else { + cs = real_cs; + } + + /* + * Exclude exclusive CPUs from siblings + */ + rcu_read_lock(); + cpuset_for_each_child(sibling, css, parent) { + if (sibling == cs) + continue; + + if (!cpumask_empty(sibling->exclusive_cpus) && + cpumask_intersects(xcpus, sibling->exclusive_cpus)) { + cpumask_andnot(xcpus, xcpus, sibling->exclusive_cpus); + retval++; + continue; + } + if (!cpumask_empty(sibling->effective_xcpus) && + cpumask_intersects(xcpus, sibling->effective_xcpus)) { + cpumask_andnot(xcpus, xcpus, sibling->effective_xcpus); + retval++; + } + } + rcu_read_unlock(); + return retval; } static inline bool is_remote_partition(struct cpuset *cs) @@ -1395,7 +1443,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, * remote partition root underneath it, its exclusive_cpus must * have overlapped with subpartitions_cpus. */ - compute_effective_exclusive_cpumask(cs, tmp->new_cpus); + compute_effective_exclusive_cpumask(cs, tmp->new_cpus, NULL); if (cpumask_empty(tmp->new_cpus) || cpumask_intersects(tmp->new_cpus, subpartitions_cpus) || cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) @@ -1404,8 +1452,11 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, spin_lock_irq(&callback_lock); isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus); list_add(&cs->remote_sibling, &remote_children); + cpumask_copy(cs->effective_xcpus, tmp->new_cpus); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); + cpuset_force_rebuild(); + cs->prs_err = 0; /* * Propagate changes in top_cpuset's effective_cpus down the hierarchy. @@ -1428,20 +1479,24 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) { bool isolcpus_updated; - compute_effective_exclusive_cpumask(cs, tmp->new_cpus); WARN_ON_ONCE(!is_remote_partition(cs)); - WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus)); + WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); spin_lock_irq(&callback_lock); list_del_init(&cs->remote_sibling); isolcpus_updated = partition_xcpus_del(cs->partition_root_state, - NULL, tmp->new_cpus); - cs->partition_root_state = -cs->partition_root_state; - if (!cs->prs_err) - cs->prs_err = PERR_INVCPUS; + NULL, cs->effective_xcpus); + if (cs->prs_err) + cs->partition_root_state = -cs->partition_root_state; + else + cs->partition_root_state = PRS_MEMBER; + + /* effective_xcpus may need to be changed */ + compute_effective_exclusive_cpumask(cs, NULL, NULL); reset_partition_data(cs); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); + cpuset_force_rebuild(); /* * Propagate changes in top_cpuset's effective_cpus down the hierarchy. @@ -1453,14 +1508,15 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) /* * remote_cpus_update - cpus_exclusive change of remote partition * @cs: the cpuset to be updated - * @newmask: the new effective_xcpus mask + * @xcpus: the new exclusive_cpus mask, if non-NULL + * @excpus: the new effective_xcpus mask * @tmp: temporary masks * * top_cpuset and subpartitions_cpus will be updated or partition can be * invalidated. */ -static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, - struct tmpmasks *tmp) +static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, + struct cpumask *excpus, struct tmpmasks *tmp) { bool adding, deleting; int prs = cs->partition_root_state; @@ -1471,29 +1527,45 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); - if (cpumask_empty(newmask)) + if (cpumask_empty(excpus)) { + cs->prs_err = PERR_CPUSEMPTY; goto invalidate; + } - adding = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus); - deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask); + adding = cpumask_andnot(tmp->addmask, excpus, cs->effective_xcpus); + deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, excpus); /* * Additions of remote CPUs is only allowed if those CPUs are * not allocated to other partitions and there are effective_cpus * left in the top cpuset. */ - if (adding && (!capable(CAP_SYS_ADMIN) || - cpumask_intersects(tmp->addmask, subpartitions_cpus) || - cpumask_subset(top_cpuset.effective_cpus, tmp->addmask))) - goto invalidate; + if (adding) { + if (!capable(CAP_SYS_ADMIN)) + cs->prs_err = PERR_ACCESS; + else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) || + cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)) + cs->prs_err = PERR_NOCPUS; + if (cs->prs_err) + goto invalidate; + } spin_lock_irq(&callback_lock); if (adding) isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask); if (deleting) isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask); + /* + * Need to update effective_xcpus and exclusive_cpus now as + * update_sibling_cpumasks() below may iterate back to the same cs. + */ + cpumask_copy(cs->effective_xcpus, excpus); + if (xcpus) + cpumask_copy(cs->exclusive_cpus, xcpus); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); + if (adding || deleting) + cpuset_force_rebuild(); /* * Propagate changes in top_cpuset's effective_cpus down the hierarchy. @@ -1507,47 +1579,6 @@ invalidate: } /* - * remote_partition_check - check if a child remote partition needs update - * @cs: the cpuset to be updated - * @newmask: the new effective_xcpus mask - * @delmask: temporary mask for deletion (not in tmp) - * @tmp: temporary masks - * - * This should be called before the given cs has updated its cpus_allowed - * and/or effective_xcpus. - */ -static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask, - struct cpumask *delmask, struct tmpmasks *tmp) -{ - struct cpuset *child, *next; - int disable_cnt = 0; - - /* - * Compute the effective exclusive CPUs that will be deleted. - */ - if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) || - !cpumask_intersects(delmask, subpartitions_cpus)) - return; /* No deletion of exclusive CPUs in partitions */ - - /* - * Searching the remote children list to look for those that will - * be impacted by the deletion of exclusive CPUs. - * - * Since a cpuset must be removed from the remote children list - * before it can go offline and holding cpuset_mutex will prevent - * any change in cpuset status. RCU read lock isn't needed. - */ - lockdep_assert_held(&cpuset_mutex); - list_for_each_entry_safe(child, next, &remote_children, remote_sibling) - if (cpumask_intersects(child->effective_cpus, delmask)) { - remote_partition_disable(child, tmp); - disable_cnt++; - } - if (disable_cnt) - cpuset_force_rebuild(); -} - -/* * prstate_housekeeping_conflict - check for partition & housekeeping conflicts * @prstate: partition root state to be checked * @new_cpus: cpu mask @@ -1601,7 +1632,7 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) * The partcmd_update command is used by update_cpumasks_hier() with newmask * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used * by update_cpumask() with NULL newmask. In both cases, the callers won't - * check for error and so partition_root_state and prs_error will be updated + * check for error and so partition_root_state and prs_err will be updated * directly. */ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, @@ -1614,11 +1645,12 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, int old_prs, new_prs; int part_error = PERR_NONE; /* Partition error? */ int subparts_delta = 0; - struct cpumask *xcpus; /* cs effective_xcpus */ int isolcpus_updated = 0; + struct cpumask *xcpus = user_xcpus(cs); bool nocpu; lockdep_assert_held(&cpuset_mutex); + WARN_ON_ONCE(is_remote_partition(cs)); /* * new_prs will only be changed for the partcmd_update and @@ -1626,7 +1658,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, */ adding = deleting = false; old_prs = new_prs = cs->partition_root_state; - xcpus = user_xcpus(cs); if (cmd == partcmd_invalidate) { if (is_prs_invalid(old_prs)) @@ -1661,12 +1692,19 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) { /* + * Need to call compute_effective_exclusive_cpumask() in case + * exclusive_cpus not set. Sibling conflict should only happen + * if exclusive_cpus isn't set. + */ + xcpus = tmp->new_cpus; + if (compute_effective_exclusive_cpumask(cs, xcpus, NULL)) + WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus)); + + /* * Enabling partition root is not allowed if its - * effective_xcpus is empty or doesn't overlap with - * parent's effective_xcpus. + * effective_xcpus is empty. */ - if (cpumask_empty(xcpus) || - !cpumask_intersects(xcpus, parent->effective_xcpus)) + if (cpumask_empty(xcpus)) return PERR_INVCPUS; if (prstate_housekeeping_conflict(new_prs, xcpus)) @@ -1679,19 +1717,22 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if (nocpu) return PERR_NOCPUS; - cpumask_copy(tmp->delmask, xcpus); - deleting = true; - subparts_delta++; + deleting = cpumask_and(tmp->delmask, xcpus, parent->effective_xcpus); + if (deleting) + subparts_delta++; new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; } else if (cmd == partcmd_disable) { /* - * May need to add cpus to parent's effective_cpus for - * valid partition root. + * May need to add cpus back to parent's effective_cpus + * (and maybe removed from subpartitions_cpus/isolated_cpus) + * for valid partition root. xcpus may contain CPUs that + * shouldn't be removed from the two global cpumasks. */ - adding = !is_prs_invalid(old_prs) && - cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); - if (adding) + if (is_partition_valid(cs)) { + cpumask_copy(tmp->addmask, cs->effective_xcpus); + adding = true; subparts_delta--; + } new_prs = PRS_MEMBER; } else if (newmask) { /* @@ -1701,6 +1742,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, part_error = PERR_CPUSEMPTY; goto write_error; } + /* Check newmask again, whether cpus are available for parent/cs */ nocpu |= tasks_nocpu_error(parent, cs, newmask); @@ -1829,7 +1871,7 @@ write_error: * CPU lists in cs haven't been updated yet. So defer it to later. */ if ((old_prs != new_prs) && (cmd != partcmd_update)) { - int err = update_partition_exclusive(cs, new_prs); + int err = update_partition_exclusive_flag(cs, new_prs); if (err) return err; @@ -1867,7 +1909,7 @@ write_error: update_unbound_workqueue_cpumask(isolcpus_updated); if ((old_prs != new_prs) && (cmd == partcmd_update)) - update_partition_exclusive(cs, new_prs); + update_partition_exclusive_flag(cs, new_prs); if (adding || deleting) { cpuset_update_tasks_cpumask(parent, tmp->addmask); @@ -1917,7 +1959,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, * 2) All the effective_cpus will be used up and cp * has tasks */ - compute_effective_exclusive_cpumask(cs, new_ecpus); + compute_effective_exclusive_cpumask(cs, new_ecpus, NULL); cpumask_and(new_ecpus, new_ecpus, cpu_active_mask); rcu_read_lock(); @@ -1925,6 +1967,11 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, if (!is_partition_valid(child)) continue; + /* + * There shouldn't be a remote partition underneath another + * partition root. + */ + WARN_ON_ONCE(is_remote_partition(child)); child->prs_err = 0; if (!cpumask_subset(child->effective_xcpus, cs->effective_xcpus)) @@ -1980,32 +2027,39 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, bool remote = is_remote_partition(cp); bool update_parent = false; + old_prs = new_prs = cp->partition_root_state; + /* - * Skip descendent remote partition that acquires CPUs - * directly from top cpuset unless it is cs. + * For child remote partition root (!= cs), we need to call + * remote_cpus_update() if effective_xcpus will be changed. + * Otherwise, we can skip the whole subtree. + * + * remote_cpus_update() will reuse tmp->new_cpus only after + * its value is being processed. */ if (remote && (cp != cs)) { - pos_css = css_rightmost_descendant(pos_css); - continue; - } + compute_effective_exclusive_cpumask(cp, tmp->new_cpus, NULL); + if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + rcu_read_unlock(); + remote_cpus_update(cp, NULL, tmp->new_cpus, tmp); + rcu_read_lock(); - /* - * Update effective_xcpus if exclusive_cpus set. - * The case when exclusive_cpus isn't set is handled later. - */ - if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) { - spin_lock_irq(&callback_lock); - compute_effective_exclusive_cpumask(cp, NULL); - spin_unlock_irq(&callback_lock); + /* Remote partition may be invalidated */ + new_prs = cp->partition_root_state; + remote = (new_prs == old_prs); } - old_prs = new_prs = cp->partition_root_state; - if (remote || (is_partition_valid(parent) && - is_partition_valid(cp))) + if (remote || (is_partition_valid(parent) && is_partition_valid(cp))) compute_partition_effective_cpumask(cp, tmp->new_cpus); else compute_effective_cpumask(tmp->new_cpus, cp, parent); + if (remote) + goto get_css; /* Ready to update cpuset data */ + /* * A partition with no effective_cpus is allowed as long as * there is no task associated with it. Call @@ -2025,9 +2079,6 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) cpumask_copy(tmp->new_cpus, parent->effective_cpus); - if (remote) - goto get_css; - /* * Skip the whole subtree if * 1) the cpumask remains the same, @@ -2088,6 +2139,9 @@ get_css: spin_lock_irq(&callback_lock); cpumask_copy(cp->effective_cpus, tmp->new_cpus); cp->partition_root_state = new_prs; + if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) + compute_effective_exclusive_cpumask(cp, NULL, NULL); + /* * Make sure effective_xcpus is properly set for a valid * partition root. @@ -2174,7 +2228,14 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, parent); if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) continue; + } else if (is_remote_partition(sibling)) { + /* + * Change in a sibling cpuset won't affect a remote + * partition root. + */ + continue; } + if (!css_tryget_online(&sibling->css)) continue; @@ -2231,8 +2292,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * trialcs->effective_xcpus is used as a temporary cpumask * for checking validity of the partition root. */ + trialcs->partition_root_state = PRS_MEMBER; if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs)) - compute_effective_exclusive_cpumask(trialcs, NULL); + compute_effective_exclusive_cpumask(trialcs, NULL, cs); } /* Nothing to do if the cpus didn't change */ @@ -2305,19 +2367,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Call remote_cpus_update() to handle valid remote partition */ if (is_remote_partition(cs)) - remote_cpus_update(cs, xcpus, &tmp); + remote_cpus_update(cs, NULL, xcpus, &tmp); else if (invalidate) update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, &tmp); else update_parent_effective_cpumask(cs, partcmd_update, xcpus, &tmp); - } else if (!cpumask_empty(cs->exclusive_cpus)) { - /* - * Use trialcs->effective_cpus as a temp cpumask - */ - remote_partition_check(cs, trialcs->effective_xcpus, - trialcs->effective_cpus, &tmp); } spin_lock_irq(&callback_lock); @@ -2369,8 +2425,15 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus)) return 0; - if (*buf) - compute_effective_exclusive_cpumask(trialcs, NULL); + if (*buf) { + trialcs->partition_root_state = PRS_MEMBER; + /* + * Reject the change if there is exclusive CPUs conflict with + * the siblings. + */ + if (compute_effective_exclusive_cpumask(trialcs, NULL, cs)) + return -EINVAL; + } /* * Check all the descendants in update_cpumasks_hier() if @@ -2401,8 +2464,8 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (invalidate) remote_partition_disable(cs, &tmp); else - remote_cpus_update(cs, trialcs->effective_xcpus, - &tmp); + remote_cpus_update(cs, trialcs->exclusive_cpus, + trialcs->effective_xcpus, &tmp); } else if (invalidate) { update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, &tmp); @@ -2410,12 +2473,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, update_parent_effective_cpumask(cs, partcmd_update, trialcs->effective_xcpus, &tmp); } - } else if (!cpumask_empty(trialcs->exclusive_cpus)) { - /* - * Use trialcs->effective_cpus as a temp cpumask - */ - remote_partition_check(cs, trialcs->effective_xcpus, - trialcs->effective_cpus, &tmp); } spin_lock_irq(&callback_lock); cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus); @@ -2782,7 +2839,7 @@ static int update_prstate(struct cpuset *cs, int new_prs) int err = PERR_NONE, old_prs = cs->partition_root_state; struct cpuset *parent = parent_cs(cs); struct tmpmasks tmpmask; - bool new_xcpus_state = false; + bool isolcpus_updated = false; if (old_prs == new_prs) return 0; @@ -2796,18 +2853,7 @@ static int update_prstate(struct cpuset *cs, int new_prs) if (alloc_cpumasks(NULL, &tmpmask)) return -ENOMEM; - /* - * Setup effective_xcpus if not properly set yet, it will be cleared - * later if partition becomes invalid. - */ - if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) { - spin_lock_irq(&callback_lock); - cpumask_and(cs->effective_xcpus, - cs->cpus_allowed, parent->effective_xcpus); - spin_unlock_irq(&callback_lock); - } - - err = update_partition_exclusive(cs, new_prs); + err = update_partition_exclusive_flag(cs, new_prs); if (err) goto out; @@ -2821,6 +2867,19 @@ static int update_prstate(struct cpuset *cs, int new_prs) } /* + * We don't support the creation of a new local partition with + * a remote partition underneath it. This unsupported + * setting can happen only if parent is the top_cpuset because + * a remote partition cannot be created underneath an existing + * local or remote partition. + */ + if ((parent == &top_cpuset) && + cpumask_intersects(cs->exclusive_cpus, subpartitions_cpus)) { + err = PERR_REMOTE; + goto out; + } + + /* * If parent is valid partition, enable local partiion. * Otherwise, enable a remote partition. */ @@ -2835,8 +2894,9 @@ static int update_prstate(struct cpuset *cs, int new_prs) } else if (old_prs && new_prs) { /* * A change in load balance state only, no change in cpumasks. + * Need to update isolated_cpus. */ - new_xcpus_state = true; + isolcpus_updated = true; } else { /* * Switching back to member is always allowed even if it @@ -2860,7 +2920,7 @@ out: */ if (err) { new_prs = -new_prs; - update_partition_exclusive(cs, new_prs); + update_partition_exclusive_flag(cs, new_prs); } spin_lock_irq(&callback_lock); @@ -2868,14 +2928,18 @@ out: WRITE_ONCE(cs->prs_err, err); if (!is_partition_valid(cs)) reset_partition_data(cs); - else if (new_xcpus_state) - partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus); + else if (isolcpus_updated) + isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus); spin_unlock_irq(&callback_lock); - update_unbound_workqueue_cpumask(new_xcpus_state); + update_unbound_workqueue_cpumask(isolcpus_updated); - /* Force update if switching back to member */ + /* Force update if switching back to member & update effective_xcpus */ update_cpumasks_hier(cs, &tmpmask, !new_prs); + /* A newly created partition must have effective_xcpus set */ + WARN_ON_ONCE(!old_prs && (new_prs > 0) + && cpumask_empty(cs->effective_xcpus)); + /* Update sched domains and load balance flag */ update_partition_sd_lb(cs, old_prs); @@ -3208,7 +3272,7 @@ int cpuset_common_seq_show(struct seq_file *sf, void *v) return ret; } -static int sched_partition_show(struct seq_file *seq, void *v) +static int cpuset_partition_show(struct seq_file *seq, void *v) { struct cpuset *cs = css_cs(seq_css(seq)); const char *err, *type = NULL; @@ -3239,7 +3303,7 @@ static int sched_partition_show(struct seq_file *seq, void *v) return 0; } -static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, +static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cpuset *cs = css_cs(of_css(of)); @@ -3260,11 +3324,8 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, css_get(&cs->css); cpus_read_lock(); mutex_lock(&cpuset_mutex); - if (!is_cpuset_online(cs)) - goto out_unlock; - - retval = update_prstate(cs, val); -out_unlock: + if (is_cpuset_online(cs)) + retval = update_prstate(cs, val); mutex_unlock(&cpuset_mutex); cpus_read_unlock(); css_put(&cs->css); @@ -3308,8 +3369,8 @@ static struct cftype dfl_files[] = { { .name = "cpus.partition", - .seq_show = sched_partition_show, - .write = sched_partition_write, + .seq_show = cpuset_partition_show, + .write = cpuset_partition_write, .private = FILE_PARTITION_ROOT, .flags = CFTYPE_NOT_ON_ROOT, .file_offset = offsetof(struct cpuset, partition_file), @@ -3475,9 +3536,6 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpus_read_lock(); mutex_lock(&cpuset_mutex); - if (is_partition_valid(cs)) - update_prstate(cs, 0); - if (!cpuset_v2() && is_sched_load_balance(cs)) cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); @@ -3488,6 +3546,22 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpus_read_unlock(); } +static void cpuset_css_killed(struct cgroup_subsys_state *css) +{ + struct cpuset *cs = css_cs(css); + + cpus_read_lock(); + mutex_lock(&cpuset_mutex); + + /* Reset valid partition back to member */ + if (is_partition_valid(cs)) + update_prstate(cs, PRS_MEMBER); + + mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); + +} + static void cpuset_css_free(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); @@ -3609,6 +3683,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, .css_offline = cpuset_css_offline, + .css_killed = cpuset_css_killed, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, @@ -3739,10 +3814,10 @@ retry: if (remote && cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)) { + cs->prs_err = PERR_HOTPLUG; remote_partition_disable(cs, tmp); compute_effective_cpumask(&new_cpus, cs, parent); remote = false; - cpuset_force_rebuild(); } /* diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 4bb587d5d34f..b2239156b7de 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -318,10 +318,11 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp) might_sleep(); for_each_possible_cpu(cpu) { - struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu); + struct cgroup *pos; /* Reacquire for each CPU to avoid disabling IRQs too long */ __cgroup_rstat_lock(cgrp, cpu); + pos = cgroup_rstat_updated_list(cgrp, cpu); for (; pos; pos = pos->rstat_flush_next) { struct cgroup_subsys_state *css; diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config index 20552f163930..8aafd050b754 100644 --- a/kernel/configs/debug.config +++ b/kernel/configs/debug.config @@ -73,7 +73,6 @@ CONFIG_DEBUG_VM=y CONFIG_DEBUG_VM_PGFLAGS=y CONFIG_DEBUG_VM_RB=y CONFIG_DEBUG_VM_VMACACHE=y -CONFIG_GENERIC_PTDUMP=y CONFIG_KASAN=y CONFIG_KASAN_GENERIC=y CONFIG_KASAN_INLINE=y diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index a620fb4b2116..aff7c0fdbefa 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -375,11 +375,10 @@ static int __init reserve_crashkernel_low(unsigned long long low_size) return 0; } -void __init reserve_crashkernel_generic(char *cmdline, - unsigned long long crash_size, - unsigned long long crash_base, - unsigned long long crash_low_size, - bool high) +void __init reserve_crashkernel_generic(unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high) { unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0; bool fixed_base = false; diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index ce1bb2301c06..0b9495187fba 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -837,10 +837,6 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) { struct kgdb_state kgdb_var; struct kgdb_state *ks = &kgdb_var; - int ret = 0; - - if (arch_kgdb_ops.enable_nmi) - arch_kgdb_ops.enable_nmi(0); /* * Avoid entering the debugger if we were triggered due to an oops * but panic_timeout indicates the system should automatically @@ -858,15 +854,11 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) ks->linux_regs = regs; if (kgdb_reenter_check(ks)) - goto out; /* Ouch, double exception ! */ + return 0; /* Ouch, double exception ! */ if (kgdb_info[ks->cpu].enter_kgdb != 0) - goto out; + return 0; - ret = kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); -out: - if (arch_kgdb_ops.enable_nmi) - arch_kgdb_ops.enable_nmi(1); - return ret; + return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); } NOKPROBE_SYMBOL(kgdb_handle_exception); diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 6a77f1c779c4..9b11b10b120c 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -334,7 +334,7 @@ poll_again: *cp = '\0'; p_tmp = strrchr(buffer, ' '); p_tmp = (p_tmp ? p_tmp + 1 : buffer); - strscpy(tmpbuffer, p_tmp, sizeof(tmpbuffer)); + strscpy(tmpbuffer, p_tmp); *cp = tmp; len = strlen(tmpbuffer); @@ -452,7 +452,7 @@ poll_again: char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt) { if (prompt && kdb_prompt_str != prompt) - strscpy(kdb_prompt_str, prompt, CMD_BUFLEN); + strscpy(kdb_prompt_str, prompt); kdb_printf("%s", kdb_prompt_str); kdb_nextline = 1; /* Prompt and input resets line number */ return kdb_read(buffer, bufsize); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 5f4be507d79f..7a4d2d4689a5 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -25,7 +25,6 @@ #include <linux/smp.h> #include <linux/utsname.h> #include <linux/vmalloc.h> -#include <linux/atomic.h> #include <linux/moduleparam.h> #include <linux/mm.h> #include <linux/init.h> @@ -105,7 +104,7 @@ static kdbmsg_t kdbmsgs[] = { KDBMSG(NOENVVALUE, "Environment variable should have value"), KDBMSG(NOTIMP, "Command not implemented"), KDBMSG(ENVFULL, "Environment full"), - KDBMSG(ENVBUFFULL, "Environment buffer full"), + KDBMSG(KMALLOCFAILED, "Failed to allocate memory"), KDBMSG(TOOMANYBPT, "Too many breakpoints defined"), #ifdef CONFIG_CPU_XSCALE KDBMSG(TOOMANYDBREGS, "More breakpoints than ibcr registers defined"), @@ -130,13 +129,9 @@ static const int __nkdb_err = ARRAY_SIZE(kdbmsgs); /* - * Initial environment. This is all kept static and local to - * this file. We don't want to rely on the memory allocation - * mechanisms in the kernel, so we use a very limited allocate-only - * heap for new and altered environment variables. The entire - * environment is limited to a fixed number of entries (add more - * to __env[] if required) and a fixed amount of heap (add more to - * KDB_ENVBUFSIZE if required). + * Initial environment. This is all kept static and local to this file. + * The entire environment is limited to a fixed number of entries + * (add more to __env[] if required) */ static char *__env[31] = { @@ -259,35 +254,6 @@ char *kdbgetenv(const char *match) } /* - * kdballocenv - This function is used to allocate bytes for - * environment entries. - * Parameters: - * bytes The number of bytes to allocate in the static buffer. - * Returns: - * A pointer to the allocated space in the buffer on success. - * NULL if bytes > size available in the envbuffer. - * Remarks: - * We use a static environment buffer (envbuffer) to hold the values - * of dynamically generated environment variables (see kdb_set). Buffer - * space once allocated is never free'd, so over time, the amount of space - * (currently 512 bytes) will be exhausted if env variables are changed - * frequently. - */ -static char *kdballocenv(size_t bytes) -{ -#define KDB_ENVBUFSIZE 512 - static char envbuffer[KDB_ENVBUFSIZE]; - static int envbufsize; - char *ep = NULL; - - if ((KDB_ENVBUFSIZE - envbufsize) >= bytes) { - ep = &envbuffer[envbufsize]; - envbufsize += bytes; - } - return ep; -} - -/* * kdbgetulenv - This function will return the value of an unsigned * long-valued environment variable. * Parameters: @@ -348,9 +314,9 @@ static int kdb_setenv(const char *var, const char *val) varlen = strlen(var); vallen = strlen(val); - ep = kdballocenv(varlen + vallen + 2); - if (ep == (char *)0) - return KDB_ENVBUFFULL; + ep = kmalloc(varlen + vallen + 2, GFP_KDB); + if (!ep) + return KDB_KMALLOCFAILED; sprintf(ep, "%s=%s", var, val); @@ -359,6 +325,7 @@ static int kdb_setenv(const char *var, const char *val) && ((strncmp(__env[i], var, varlen) == 0) && ((__env[i][varlen] == '\0') || (__env[i][varlen] == '=')))) { + kfree_const(__env[i]); __env[i] = ep; return 0; } @@ -2119,32 +2086,6 @@ static int kdb_dmesg(int argc, const char **argv) return 0; } #endif /* CONFIG_PRINTK */ - -/* Make sure we balance enable/disable calls, must disable first. */ -static atomic_t kdb_nmi_disabled; - -static int kdb_disable_nmi(int argc, const char *argv[]) -{ - if (atomic_read(&kdb_nmi_disabled)) - return 0; - atomic_set(&kdb_nmi_disabled, 1); - arch_kgdb_ops.enable_nmi(0); - return 0; -} - -static int kdb_param_enable_nmi(const char *val, const struct kernel_param *kp) -{ - if (!atomic_add_unless(&kdb_nmi_disabled, -1, 0)) - return -EINVAL; - arch_kgdb_ops.enable_nmi(1); - return 0; -} - -static const struct kernel_param_ops kdb_param_ops_enable_nmi = { - .set = kdb_param_enable_nmi, -}; -module_param_cb(enable_nmi, &kdb_param_ops_enable_nmi, NULL, 0600); - /* * kdb_cpu - This function implements the 'cpu' command. * cpu [<cpunum>] @@ -2836,20 +2777,10 @@ static kdbtab_t maintab[] = { }, }; -static kdbtab_t nmicmd = { - .name = "disable_nmi", - .func = kdb_disable_nmi, - .usage = "", - .help = "Disable NMI entry to KDB", - .flags = KDB_ENABLE_ALWAYS_SAFE, -}; - /* Initialize the kdb command table. */ static void __init kdb_inittab(void) { kdb_register_table(maintab, ARRAY_SIZE(maintab)); - if (arch_kgdb_ops.enable_nmi) - kdb_register_table(&nmicmd, 1); } /* Execute any commands defined in kdb_cmds. */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 0bb21659e252..128db74e9eab 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2451,6 +2451,7 @@ ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event) #define DETACH_GROUP 0x01UL #define DETACH_CHILD 0x02UL #define DETACH_DEAD 0x04UL +#define DETACH_EXIT 0x08UL /* * Cross CPU call to remove a performance event @@ -2465,6 +2466,7 @@ __perf_remove_from_context(struct perf_event *event, void *info) { struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx; + enum perf_event_state state = PERF_EVENT_STATE_OFF; unsigned long flags = (unsigned long)info; ctx_time_update(cpuctx, ctx); @@ -2473,16 +2475,19 @@ __perf_remove_from_context(struct perf_event *event, * Ensure event_sched_out() switches to OFF, at the very least * this avoids raising perf_pending_task() at this time. */ - if (flags & DETACH_DEAD) + if (flags & DETACH_EXIT) + state = PERF_EVENT_STATE_EXIT; + if (flags & DETACH_DEAD) { event->pending_disable = 1; + state = PERF_EVENT_STATE_DEAD; + } event_sched_out(event, ctx); + perf_event_set_state(event, min(event->state, state)); if (flags & DETACH_GROUP) perf_group_detach(event); if (flags & DETACH_CHILD) perf_child_detach(event); list_del_event(event, ctx); - if (flags & DETACH_DEAD) - event->state = PERF_EVENT_STATE_DEAD; if (!pmu_ctx->nr_events) { pmu_ctx->rotate_necessary = 0; @@ -13731,12 +13736,7 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) mutex_lock(&parent_event->child_mutex); } - perf_remove_from_context(event, detach_flags); - - raw_spin_lock_irq(&ctx->lock); - if (event->state > PERF_EVENT_STATE_EXIT) - perf_event_set_state(event, PERF_EVENT_STATE_EXIT); - raw_spin_unlock_irq(&ctx->lock); + perf_remove_from_context(event, detach_flags | DETACH_EXIT); /* * Child events can be freed. diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 70c84b9d7be3..615b4e6d22c7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -173,6 +173,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0); int err; struct mmu_notifier_range range; + pte_t pte; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + PAGE_SIZE); @@ -192,6 +193,16 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (!page_vma_mapped_walk(&pvmw)) goto unlock; VM_BUG_ON_PAGE(addr != pvmw.address, old_page); + pte = ptep_get(pvmw.pte); + + /* + * Handle PFN swap PTES, such as device-exclusive ones, that actually + * map pages: simply trigger GUP again to fix it up. + */ + if (unlikely(!pte_present(pte))) { + page_vma_mapped_walk_done(&pvmw); + goto unlock; + } if (new_page) { folio_get(new_folio); @@ -206,7 +217,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, inc_mm_counter(mm, MM_ANONPAGES); } - flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte))); + flush_cache_page(vma, addr, pte_pfn(pte)); ptep_clear_flush(vma, addr, pvmw.pte); if (new_page) set_pte_at(mm, addr, pvmw.pte, @@ -1692,7 +1703,8 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) } vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE, - VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, + VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO| + VM_SEALED_SYSMAP, &xol_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); diff --git a/kernel/exit.c b/kernel/exit.c index c2e6c7b7779f..1b51dc099f1e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -268,6 +268,9 @@ repeat: leader = p->group_leader; if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { + /* for pidfs_exit() and do_notify_parent() */ + if (leader->signal->flags & SIGNAL_GROUP_EXIT) + leader->exit_code = leader->signal->group_exit_code; /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, @@ -756,12 +759,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); tsk->exit_state = EXIT_ZOMBIE; - /* - * Ignore thread-group leaders that exited before all - * subthreads did. - */ - if (!delay_group_leader(tsk)) - do_notify_pidfd(tsk); if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && @@ -774,6 +771,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead) do_notify_parent(tsk, tsk->exit_signal); } else { autoreap = true; + /* untraced sub-thread */ + do_notify_pidfd(tsk); } if (autoreap) { diff --git a/kernel/fork.c b/kernel/fork.c index 1b659b07ecd5..c4b26cd8998b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -311,11 +311,9 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) * so memcg accounting is performed manually on assigning/releasing * stacks to tasks. Drop __GFP_ACCOUNT. */ - stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, - VMALLOC_START, VMALLOC_END, + stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, THREADINFO_GFP & ~__GFP_ACCOUNT, - PAGE_KERNEL, - 0, node, __builtin_return_address(0)); + node, __builtin_return_address(0)); if (!stack) return -ENOMEM; @@ -436,35 +434,6 @@ static struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -#ifdef CONFIG_PER_VMA_LOCK - -/* SLAB cache for vm_area_struct.lock */ -static struct kmem_cache *vma_lock_cachep; - -static bool vma_lock_alloc(struct vm_area_struct *vma) -{ - vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL); - if (!vma->vm_lock) - return false; - - init_rwsem(&vma->vm_lock->lock); - vma->vm_lock_seq = UINT_MAX; - - return true; -} - -static inline void vma_lock_free(struct vm_area_struct *vma) -{ - kmem_cache_free(vma_lock_cachep, vma->vm_lock); -} - -#else /* CONFIG_PER_VMA_LOCK */ - -static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; } -static inline void vma_lock_free(struct vm_area_struct *vma) {} - -#endif /* CONFIG_PER_VMA_LOCK */ - struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { struct vm_area_struct *vma; @@ -474,14 +443,46 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) return NULL; vma_init(vma, mm); - if (!vma_lock_alloc(vma)) { - kmem_cache_free(vm_area_cachep, vma); - return NULL; - } return vma; } +static void vm_area_init_from(const struct vm_area_struct *src, + struct vm_area_struct *dest) +{ + dest->vm_mm = src->vm_mm; + dest->vm_ops = src->vm_ops; + dest->vm_start = src->vm_start; + dest->vm_end = src->vm_end; + dest->anon_vma = src->anon_vma; + dest->vm_pgoff = src->vm_pgoff; + dest->vm_file = src->vm_file; + dest->vm_private_data = src->vm_private_data; + vm_flags_init(dest, src->vm_flags); + memcpy(&dest->vm_page_prot, &src->vm_page_prot, + sizeof(dest->vm_page_prot)); + /* + * src->shared.rb may be modified concurrently when called from + * dup_mmap(), but the clone will reinitialize it. + */ + data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared))); + memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx, + sizeof(dest->vm_userfaultfd_ctx)); +#ifdef CONFIG_ANON_VMA_NAME + dest->anon_name = src->anon_name; +#endif +#ifdef CONFIG_SWAP + memcpy(&dest->swap_readahead_info, &src->swap_readahead_info, + sizeof(dest->swap_readahead_info)); +#endif +#ifndef CONFIG_MMU + dest->vm_region = src->vm_region; +#endif +#ifdef CONFIG_NUMA + dest->vm_policy = src->vm_policy; +#endif +} + struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) { struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); @@ -491,15 +492,8 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); ASSERT_EXCLUSIVE_WRITER(orig->vm_file); - /* - * orig->shared.rb may be modified concurrently, but the clone - * will be reinitialized. - */ - data_race(memcpy(new, orig, sizeof(*new))); - if (!vma_lock_alloc(new)) { - kmem_cache_free(vm_area_cachep, new); - return NULL; - } + vm_area_init_from(orig, new); + vma_lock_init(new, true); INIT_LIST_HEAD(&new->anon_vma_chain); vma_numab_state_init(new); dup_anon_vma_name(orig, new); @@ -511,35 +505,15 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) return new; } -void __vm_area_free(struct vm_area_struct *vma) +void vm_area_free(struct vm_area_struct *vma) { + /* The vma should be detached while being destroyed. */ + vma_assert_detached(vma); vma_numab_state_free(vma); free_anon_vma_name(vma); - vma_lock_free(vma); kmem_cache_free(vm_area_cachep, vma); } -#ifdef CONFIG_PER_VMA_LOCK -static void vm_area_free_rcu_cb(struct rcu_head *head) -{ - struct vm_area_struct *vma = container_of(head, struct vm_area_struct, - vm_rcu); - - /* The vma should not be locked while being destroyed. */ - VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma); - __vm_area_free(vma); -} -#endif - -void vm_area_free(struct vm_area_struct *vma) -{ -#ifdef CONFIG_PER_VMA_LOCK - call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb); -#else - __vm_area_free(vma); -#endif -} - static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { @@ -830,6 +804,36 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) #define mm_free_pgd(mm) #endif /* CONFIG_MMU */ +#ifdef CONFIG_MM_ID +static DEFINE_IDA(mm_ida); + +static inline int mm_alloc_id(struct mm_struct *mm) +{ + int ret; + + ret = ida_alloc_range(&mm_ida, MM_ID_MIN, MM_ID_MAX, GFP_KERNEL); + if (ret < 0) + return ret; + mm->mm_id = ret; + return 0; +} + +static inline void mm_free_id(struct mm_struct *mm) +{ + const mm_id_t id = mm->mm_id; + + mm->mm_id = MM_ID_DUMMY; + if (id == MM_ID_DUMMY) + return; + if (WARN_ON_ONCE(id < MM_ID_MIN || id > MM_ID_MAX)) + return; + ida_free(&mm_ida, id); +} +#else /* !CONFIG_MM_ID */ +static inline int mm_alloc_id(struct mm_struct *mm) { return 0; } +static inline void mm_free_id(struct mm_struct *mm) {} +#endif /* CONFIG_MM_ID */ + static void check_mm(struct mm_struct *mm) { int i; @@ -933,6 +937,7 @@ void __mmdrop(struct mm_struct *mm) WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); + mm_free_id(mm); destroy_context(mm); mmu_notifier_subscriptions_destroy(mm); check_mm(mm); @@ -1267,6 +1272,15 @@ static void mm_init_uprobes_state(struct mm_struct *mm) #endif } +static void mmap_init_lock(struct mm_struct *mm) +{ + init_rwsem(&mm->mmap_lock); + mm_lock_seqcount_init(mm); +#ifdef CONFIG_PER_VMA_LOCK + rcuwait_init(&mm->vma_writer_wait); +#endif +} + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { @@ -1308,6 +1322,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, if (mm_alloc_pgd(mm)) goto fail_nopgd; + if (mm_alloc_id(mm)) + goto fail_noid; + if (init_new_context(p, mm)) goto fail_nocontext; @@ -1327,6 +1344,8 @@ fail_pcpu: fail_cid: destroy_context(mm); fail_nocontext: + mm_free_id(mm); +fail_noid: mm_free_pgd(mm); fail_nopgd: free_mm(mm); @@ -1563,6 +1582,17 @@ struct mm_struct *get_task_mm(struct task_struct *task) } EXPORT_SYMBOL_GPL(get_task_mm); +static bool may_access_mm(struct mm_struct *mm, struct task_struct *task, unsigned int mode) +{ + if (mm == current->mm) + return true; + if (ptrace_may_access(task, mode)) + return true; + if ((mode & PTRACE_MODE_READ) && perfmon_capable()) + return true; + return false; +} + struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) { struct mm_struct *mm; @@ -1575,7 +1605,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) mm = get_task_mm(task); if (!mm) { mm = ERR_PTR(-ESRCH); - } else if (mm != current->mm && !ptrace_may_access(task, mode)) { + } else if (!may_access_mm(mm, task, mode)) { mmput(mm); mm = ERR_PTR(-EACCES); } @@ -3183,6 +3213,11 @@ void __init mm_cache_init(void) void __init proc_caches_init(void) { + struct kmem_cache_args args = { + .use_freeptr_offset = true, + .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr), + }; + sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| @@ -3199,11 +3234,10 @@ void __init proc_caches_init(void) sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); - - vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); -#ifdef CONFIG_PER_VMA_LOCK - vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT); -#endif + vm_area_cachep = kmem_cache_create("vm_area_struct", + sizeof(struct vm_area_struct), &args, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| + SLAB_ACCOUNT); mmap_init(); nsproxy_cache_init(); } diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 00529c81cc40..c9e5dc068e85 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -89,7 +89,6 @@ rm -f "${tmpdir}.contents.txt" # Create archive and try to normalize metadata for reproducibility. tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ - --exclude=".__afs*" --exclude=".nfs*" \ --owner=0 --group=0 --sort=name --numeric-owner --mode=u=rw,go=r,a+X \ -I $XZ -cf $tarfile -C "${tmpdir}/" . > /dev/null diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 04efa7a6e69b..dc898ec93463 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -93,6 +93,43 @@ static struct notifier_block panic_block = { .notifier_call = hung_task_panic, }; + +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER +static void debug_show_blocker(struct task_struct *task) +{ + struct task_struct *g, *t; + unsigned long owner; + struct mutex *lock; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held"); + + lock = READ_ONCE(task->blocker_mutex); + if (!lock) + return; + + owner = mutex_get_owner(lock); + if (unlikely(!owner)) { + pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n", + task->comm, task->pid); + return; + } + + /* Ensure the owner information is correct. */ + for_each_process_thread(g, t) { + if ((unsigned long)t == owner) { + pr_err("INFO: task %s:%d is blocked on a mutex likely owned by task %s:%d.\n", + task->comm, task->pid, t->comm, t->pid); + sched_show_task(t); + return; + } + } +} +#else +static inline void debug_show_blocker(struct task_struct *task) +{ +} +#endif + static void check_hung_task(struct task_struct *t, unsigned long timeout) { unsigned long switch_count = t->nvcsw + t->nivcsw; @@ -152,6 +189,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); + debug_show_blocker(t); hung_task_show_lock = true; if (sysctl_hung_task_all_cpu_backtrace) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 2861f89880af..9d5c8651492d 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -398,7 +398,7 @@ void irq_domain_remove(struct irq_domain *domain) * If the going away domain is the default one, reset it. */ if (unlikely(irq_default_domain == domain)) - irq_set_default_host(NULL); + irq_set_default_domain(NULL); mutex_unlock(&irq_domain_mutex); @@ -573,7 +573,7 @@ struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, EXPORT_SYMBOL_GPL(irq_find_matching_fwspec); /** - * irq_set_default_host() - Set a "default" irq domain + * irq_set_default_domain() - Set a "default" irq domain * @domain: default domain pointer * * For convenience, it's possible to set a "default" domain that will be used @@ -581,16 +581,16 @@ EXPORT_SYMBOL_GPL(irq_find_matching_fwspec); * platforms that want to manipulate a few hard coded interrupt numbers that * aren't properly represented in the device-tree. */ -void irq_set_default_host(struct irq_domain *domain) +void irq_set_default_domain(struct irq_domain *domain) { pr_debug("Default domain set to @0x%p\n", domain); irq_default_domain = domain; } -EXPORT_SYMBOL_GPL(irq_set_default_host); +EXPORT_SYMBOL_GPL(irq_set_default_domain); /** - * irq_get_default_host() - Retrieve the "default" irq domain + * irq_get_default_domain() - Retrieve the "default" irq domain * * Returns: the default domain, if any. * @@ -598,11 +598,11 @@ EXPORT_SYMBOL_GPL(irq_set_default_host); * systems that cannot implement a firmware->fwnode mapping (which * both DT and ACPI provide). */ -struct irq_domain *irq_get_default_host(void) +struct irq_domain *irq_get_default_domain(void) { return irq_default_domain; } -EXPORT_SYMBOL_GPL(irq_get_default_host); +EXPORT_SYMBOL_GPL(irq_get_default_domain); static bool irq_domain_is_nomap(struct irq_domain *domain) { diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 147cabb4c077..f2b2929986ff 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -37,7 +37,7 @@ bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear) void irq_force_complete_move(struct irq_desc *desc) { - for (struct irq_data *d = irq_desc_get_irq_data(desc); d; d = d->parent_data) { + for (struct irq_data *d = irq_desc_get_irq_data(desc); d; d = irqd_get_parent_data(d)) { if (d->chip && d->chip->irq_force_complete_move) { d->chip->irq_force_complete_move(d); return; diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 117d9d4d3c3b..6ce73cceaf53 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -1500,7 +1500,7 @@ static int access_thread(void *arg) func(); } } while (!torture_must_stop()); - del_timer_sync(&timer); + timer_delete_sync(&timer); destroy_timer_on_stack(&timer); torture_kthread_stopping("access_thread"); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index c22ad51c4317..3e62b944c883 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -210,6 +210,16 @@ int sanity_check_segment_list(struct kimage *image) } #endif + /* + * The destination addresses are searched from system RAM rather than + * being allocated from the buddy allocator, so they are not guaranteed + * to be accepted by the current kernel. Accept the destination + * addresses before kexec swaps their content with the segments' source + * pages to avoid accessing memory before it is accepted. + */ + for (i = 0; i < nr_segments; i++) + accept_memory(image->segment[i].mem, image->segment[i].memsz); + return 0; } diff --git a/kernel/kexec_elf.c b/kernel/kexec_elf.c index d3689632e8b9..3a5c25b2adc9 100644 --- a/kernel/kexec_elf.c +++ b/kernel/kexec_elf.c @@ -390,7 +390,7 @@ int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, struct kexec_buf *kbuf, unsigned long *lowest_load_addr) { - unsigned long lowest_addr = UINT_MAX; + unsigned long lowest_addr = ULONG_MAX; int ret; size_t i; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 3eedb8c226ad..fba686487e3b 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -464,6 +464,12 @@ static int locate_mem_hole_top_down(unsigned long start, unsigned long end, continue; } + /* Make sure this does not conflict with exclude range */ + if (arch_check_excluded_range(image, temp_start, temp_end)) { + temp_start = temp_start - PAGE_SIZE; + continue; + } + /* We found a suitable memory range */ break; } while (1); @@ -498,6 +504,12 @@ static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, continue; } + /* Make sure this does not conflict with exclude range */ + if (arch_check_excluded_range(image, temp_start, temp_end)) { + temp_start = temp_start + PAGE_SIZE; + continue; + } + /* We found a suitable memory range */ break; } while (1); diff --git a/kernel/kthread.c b/kernel/kthread.c index 5dc5b0d7238e..77c44924cf54 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1362,14 +1362,14 @@ static void kthread_cancel_delayed_work_timer(struct kthread_work *work, struct kthread_worker *worker = work->worker; /* - * del_timer_sync() must be called to make sure that the timer + * timer_delete_sync() must be called to make sure that the timer * callback is not running. The lock must be temporary released * to avoid a deadlock with the callback. In the meantime, * any queuing is blocked by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, *flags); - del_timer_sync(&dwork->timer); + timer_delete_sync(&dwork->timer); raw_spin_lock_irqsave(&worker->lock, *flags); work->canceling--; } diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 19b636f60a24..555e2b3a665a 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -72,6 +72,14 @@ static inline unsigned long __owner_flags(unsigned long owner) return owner & MUTEX_FLAGS; } +/* Do not use the return value as a pointer directly. */ +unsigned long mutex_get_owner(struct mutex *lock) +{ + unsigned long owner = atomic_long_read(&lock->owner); + + return (unsigned long)__owner_task(owner); +} + /* * Returns: __mutex_owner(lock) on failure or NULL on success. */ @@ -182,6 +190,9 @@ static void __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct list_head *list) { +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER + WRITE_ONCE(current->blocker_mutex, lock); +#endif debug_mutex_add_waiter(lock, waiter, current); list_add_tail(&waiter->list, list); @@ -197,6 +208,9 @@ __mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter) __mutex_clear_flag(lock, MUTEX_FLAGS); debug_mutex_remove_waiter(lock, waiter, current); +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER + WRITE_ONCE(current->blocker_mutex, NULL); +#endif } /* diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 6083883c4fe0..d6964fc29f51 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -184,7 +184,7 @@ EXPORT_SYMBOL_GPL(__percpu_down_read); #define per_cpu_sum(var) \ ({ \ - typeof(var) __sum = 0; \ + TYPEOF_UNQUAL(var) __sum = 0; \ int cpu; \ compiletime_assert_atomic_type(__sum); \ for_each_possible_cpu(cpu) \ diff --git a/kernel/panic.c b/kernel/panic.c index 0c55eec9e874..a3889f38153d 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -833,9 +833,15 @@ device_initcall(register_warn_debugfs); */ __visible noinstr void __stack_chk_fail(void) { + unsigned long flags; + instrumentation_begin(); + flags = user_access_save(); + panic("stack-protector: Kernel stack is corrupted in: %pB", __builtin_return_address(0)); + + user_access_restore(flags); instrumentation_end(); } EXPORT_SYMBOL(__stack_chk_fail); diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index aa42de4d2768..4d9b21f69eaa 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -68,6 +68,8 @@ config TREE_SRCU config FORCE_NEED_SRCU_NMI_SAFE bool "Force selection of NEED_SRCU_NMI_SAFE" depends on !TINY_SRCU + depends on RCU_EXPERT + depends on ARCH_HAS_NMI_SAFE_THIS_CPU_OPS select NEED_SRCU_NMI_SAFE default n help diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 65095664f5c5..4fa7772be183 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2324,7 +2324,7 @@ rcu_torture_reader(void *arg) stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) { - del_timer_sync(&t); + timer_delete_sync(&t); destroy_timer_on_stack(&t); } tick_dep_clear_task(current, TICK_DEP_BIT_RCU); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d2a694944553..9a59b071501b 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -690,7 +690,7 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - del_timer_sync(&sdp->delay_work); + timer_delete_sync(&sdp->delay_work); flush_work(&sdp->work); if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist))) return; /* Forgot srcu_barrier(), so just leak it! */ diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 466668eb4fad..c0cc7ae41106 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1086,7 +1086,7 @@ static void rcu_tasks_postscan(struct list_head *hop) } if (!IS_ENABLED(CONFIG_TINY_RCU)) - del_timer_sync(&tasks_rcu_exit_srcu_stall_timer); + timer_delete_sync(&tasks_rcu_exit_srcu_stall_timer); } /* See if tasks are still holding out, complain if so. */ diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 5ff3bc56ff51..fa269d34167a 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -206,7 +206,7 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp, if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - del_timer(&rdp_gp->nocb_timer); + timer_delete(&rdp_gp->nocb_timer); } if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { @@ -822,7 +822,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - del_timer(&my_rdp->nocb_timer); + timer_delete(&my_rdp->nocb_timer); } WRITE_ONCE(my_rdp->nocb_gp_sleep, true); raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); diff --git a/kernel/reboot.c b/kernel/reboot.c index 41ab9e1ba357..ec087827c85c 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -36,6 +36,8 @@ enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; EXPORT_SYMBOL_GPL(reboot_mode); enum reboot_mode panic_reboot_mode = REBOOT_UNDEFINED; +static enum hw_protection_action hw_protection_action = HWPROT_ACT_SHUTDOWN; + /* * This variable is used privately to keep track of whether or not * reboot_type is still set to its default value (i.e., reboot= hasn't @@ -229,6 +231,9 @@ EXPORT_SYMBOL(unregister_restart_handler); /** * do_kernel_restart - Execute kernel restart handler call chain * + * @cmd: pointer to buffer containing command to execute for restart + * or %NULL + * * Calls functions registered with register_restart_handler. * * Expected to be called from machine_restart as last step of the restart @@ -933,61 +938,86 @@ void orderly_reboot(void) } EXPORT_SYMBOL_GPL(orderly_reboot); +static const char *hw_protection_action_str(enum hw_protection_action action) +{ + switch (action) { + case HWPROT_ACT_SHUTDOWN: + return "shutdown"; + case HWPROT_ACT_REBOOT: + return "reboot"; + default: + return "undefined"; + } +} + +static enum hw_protection_action hw_failure_emergency_action; + /** - * hw_failure_emergency_poweroff_func - emergency poweroff work after a known delay - * @work: work_struct associated with the emergency poweroff function + * hw_failure_emergency_action_func - emergency action work after a known delay + * @work: work_struct associated with the emergency action function * * This function is called in very critical situations to force - * a kernel poweroff after a configurable timeout value. + * a kernel poweroff or reboot after a configurable timeout value. */ -static void hw_failure_emergency_poweroff_func(struct work_struct *work) +static void hw_failure_emergency_action_func(struct work_struct *work) { + const char *action_str = hw_protection_action_str(hw_failure_emergency_action); + + pr_emerg("Hardware protection timed-out. Trying forced %s\n", + action_str); + /* - * We have reached here after the emergency shutdown waiting period has - * expired. This means orderly_poweroff has not been able to shut off - * the system for some reason. + * We have reached here after the emergency action waiting period has + * expired. This means orderly_poweroff/reboot has not been able to + * shut off the system for some reason. * - * Try to shut down the system immediately using kernel_power_off - * if populated + * Try to shut off the system immediately if possible */ - pr_emerg("Hardware protection timed-out. Trying forced poweroff\n"); - kernel_power_off(); + + if (hw_failure_emergency_action == HWPROT_ACT_REBOOT) + kernel_restart(NULL); + else + kernel_power_off(); /* * Worst of the worst case trigger emergency restart */ - pr_emerg("Hardware protection shutdown failed. Trying emergency restart\n"); + pr_emerg("Hardware protection %s failed. Trying emergency restart\n", + action_str); emergency_restart(); } -static DECLARE_DELAYED_WORK(hw_failure_emergency_poweroff_work, - hw_failure_emergency_poweroff_func); +static DECLARE_DELAYED_WORK(hw_failure_emergency_action_work, + hw_failure_emergency_action_func); /** - * hw_failure_emergency_poweroff - Trigger an emergency system poweroff + * hw_failure_emergency_schedule - Schedule an emergency system shutdown or reboot + * + * @action: The hardware protection action to be taken + * @action_delay_ms: Time in milliseconds to elapse before triggering action * * This may be called from any critical situation to trigger a system shutdown - * after a given period of time. If time is negative this is not scheduled. + * or reboot after a given period of time. + * If time is negative this is not scheduled. */ -static void hw_failure_emergency_poweroff(int poweroff_delay_ms) +static void hw_failure_emergency_schedule(enum hw_protection_action action, + int action_delay_ms) { - if (poweroff_delay_ms <= 0) + if (action_delay_ms <= 0) return; - schedule_delayed_work(&hw_failure_emergency_poweroff_work, - msecs_to_jiffies(poweroff_delay_ms)); + hw_failure_emergency_action = action; + schedule_delayed_work(&hw_failure_emergency_action_work, + msecs_to_jiffies(action_delay_ms)); } /** - * __hw_protection_shutdown - Trigger an emergency system shutdown or reboot + * __hw_protection_trigger - Trigger an emergency system shutdown or reboot * * @reason: Reason of emergency shutdown or reboot to be printed. * @ms_until_forced: Time to wait for orderly shutdown or reboot before * triggering it. Negative value disables the forced * shutdown or reboot. - * @shutdown: If true, indicates that a shutdown will happen - * after the critical tempeature is reached. - * If false, indicates that a reboot will happen - * after the critical tempeature is reached. + * @action: The hardware protection action to be taken. * * Initiate an emergency system shutdown or reboot in order to protect * hardware from further damage. Usage examples include a thermal protection. @@ -995,11 +1025,16 @@ static void hw_failure_emergency_poweroff(int poweroff_delay_ms) * pending even if the previous request has given a large timeout for forced * shutdown/reboot. */ -void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown) +void __hw_protection_trigger(const char *reason, int ms_until_forced, + enum hw_protection_action action) { static atomic_t allow_proceed = ATOMIC_INIT(1); - pr_emerg("HARDWARE PROTECTION shutdown (%s)\n", reason); + if (action == HWPROT_ACT_DEFAULT) + action = hw_protection_action; + + pr_emerg("HARDWARE PROTECTION %s (%s)\n", + hw_protection_action_str(action), reason); /* Shutdown should be initiated only once. */ if (!atomic_dec_and_test(&allow_proceed)) @@ -1009,13 +1044,55 @@ void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shut * Queue a backup emergency shutdown in the event of * orderly_poweroff failure */ - hw_failure_emergency_poweroff(ms_until_forced); - if (shutdown) + hw_failure_emergency_schedule(action, ms_until_forced); + if (action == HWPROT_ACT_REBOOT) + orderly_reboot(); + else orderly_poweroff(true); +} +EXPORT_SYMBOL_GPL(__hw_protection_trigger); + +static bool hw_protection_action_parse(const char *str, + enum hw_protection_action *action) +{ + if (sysfs_streq(str, "shutdown")) + *action = HWPROT_ACT_SHUTDOWN; + else if (sysfs_streq(str, "reboot")) + *action = HWPROT_ACT_REBOOT; else - orderly_reboot(); + return false; + + return true; +} + +static int __init hw_protection_setup(char *str) +{ + hw_protection_action_parse(str, &hw_protection_action); + return 1; +} +__setup("hw_protection=", hw_protection_setup); + +#ifdef CONFIG_SYSFS +static ssize_t hw_protection_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", + hw_protection_action_str(hw_protection_action)); +} +static ssize_t hw_protection_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, + size_t count) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!hw_protection_action_parse(buf, &hw_protection_action)) + return -EINVAL; + + return count; } -EXPORT_SYMBOL_GPL(__hw_protection_shutdown); +static struct kobj_attribute hw_protection_attr = __ATTR_RW(hw_protection); +#endif static int __init reboot_setup(char *str) { @@ -1276,6 +1353,7 @@ static struct kobj_attribute reboot_cpu_attr = __ATTR_RW(cpu); #endif static struct attribute *reboot_attrs[] = { + &hw_protection_attr.attr, &reboot_mode_attr.attr, #ifdef CONFIG_X86 &reboot_force_attr.attr, diff --git a/kernel/relay.c b/kernel/relay.c index a8ae436dc77e..5ac7e711e4b6 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -351,10 +351,9 @@ static struct dentry *relay_create_buf_file(struct rchan *chan, struct dentry *dentry; char *tmpname; - tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL); + tmpname = kasprintf(GFP_KERNEL, "%s%d", chan->base_filename, cpu); if (!tmpname) return NULL; - snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu); /* Create file in fs */ dentry = chan->cb->create_buf_file(tmpname, chan->parent, diff --git a/kernel/resource.c b/kernel/resource.c index 12004452d999..8d3e6ed0bdc1 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -561,8 +561,7 @@ static int __region_intersects(struct resource *parent, resource_size_t start, struct resource res, o; bool covered; - res.start = start; - res.end = start + size - 1; + res = DEFINE_RES(start, size, 0); for (p = parent->child; p ; p = p->sibling) { if (!resource_intersection(p, &res, &o)) @@ -1714,18 +1713,13 @@ static int __init reserve_setup(char *str) * I/O port space; otherwise assume it's memory. */ if (io_start < 0x10000) { - res->flags = IORESOURCE_IO; + *res = DEFINE_RES_IO_NAMED(io_start, io_num, "reserved"); parent = &ioport_resource; } else { - res->flags = IORESOURCE_MEM; + *res = DEFINE_RES_MEM_NAMED(io_start, io_num, "reserved"); parent = &iomem_resource; } - res->name = "reserved"; - res->start = io_start; - res->end = io_start + io_num - 1; res->flags |= IORESOURCE_BUSY; - res->desc = IORES_DESC_NONE; - res->child = NULL; if (request_resource(parent, res) == 0) reserved = x+1; } @@ -1975,11 +1969,7 @@ get_free_mem_region(struct device *dev, struct resource *base, */ revoke_iomem(res); } else { - res->start = addr; - res->end = addr + size - 1; - res->name = name; - res->desc = desc; - res->flags = IORESOURCE_MEM; + *res = DEFINE_RES_NAMED_DESC(addr, size, name, IORESOURCE_MEM, desc); /* * Only succeed if the resource hosts an exclusive diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cfaca3040b2f..c81cf642dba0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10703,7 +10703,6 @@ void sched_mm_cid_after_execve(struct task_struct *t) smp_mb(); t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm); } - rseq_set_notify_resume(t); } void sched_mm_cid_fork(struct task_struct *t) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 21575d39c376..66bcd40a28ca 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4171,8 +4171,8 @@ static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) init_dsq(dsq, dsq_id); - ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node, - dsq_hash_params); + ret = rhashtable_lookup_insert_fast(&dsq_hash, &dsq->hash_node, + dsq_hash_params); if (ret) { kfree(dsq); return ERR_PTR(ret); @@ -5361,6 +5361,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) */ cpus_read_lock(); + scx_idle_enable(ops); + if (scx_ops.init) { ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init); if (ret) { @@ -5427,8 +5429,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (scx_ops.cpu_acquire || scx_ops.cpu_release) static_branch_enable(&scx_ops_cpu_preempt); - scx_idle_enable(ops); - /* * Lock out forks, cgroup on/offlining and moves before opening the * floodgate so that they don't wander into the operations prematurely. diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 52c36a70a3d0..cb343ca889e0 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -544,7 +544,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 * core. */ if (flags & SCX_PICK_IDLE_CORE) { - cpu = prev_cpu; + cpu = -EBUSY; goto out_unlock; } } @@ -584,8 +584,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 * increasing distance. */ cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags); - if (cpu >= 0) - goto out_unlock; out_unlock: rcu_read_unlock(); @@ -723,14 +721,14 @@ static void reset_idle_masks(struct sched_ext_ops *ops) void scx_idle_enable(struct sched_ext_ops *ops) { if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) - static_branch_enable(&scx_builtin_idle_enabled); + static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); else - static_branch_disable(&scx_builtin_idle_enabled); + static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) - static_branch_enable(&scx_builtin_idle_per_node); + static_branch_enable_cpuslocked(&scx_builtin_idle_per_node); else - static_branch_disable(&scx_builtin_idle_per_node); + static_branch_disable_cpuslocked(&scx_builtin_idle_per_node); #ifdef CONFIG_SMP reset_idle_masks(ops); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index bb56805e3d47..1396674fa722 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1440,7 +1440,7 @@ void psi_trigger_destroy(struct psi_trigger *t) group->rtpoll_task, lockdep_is_held(&group->rtpoll_trigger_lock)); rcu_assign_pointer(group->rtpoll_task, NULL); - del_timer(&group->rtpoll_timer); + timer_delete(&group->rtpoll_timer); } } mutex_unlock(&group->rtpoll_trigger_lock); diff --git a/kernel/signal.c b/kernel/signal.c index 86ba66d95da5..f8859faa26c5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -176,9 +176,10 @@ static bool recalc_sigpending_tsk(struct task_struct *t) void recalc_sigpending(void) { - if (!recalc_sigpending_tsk(current) && !freezing(current)) - clear_thread_flag(TIF_SIGPENDING); - + if (!recalc_sigpending_tsk(current) && !freezing(current)) { + if (unlikely(test_thread_flag(TIF_SIGPENDING))) + clear_thread_flag(TIF_SIGPENDING); + } } EXPORT_SYMBOL(recalc_sigpending); @@ -2179,11 +2180,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig) WARN_ON_ONCE(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); - /* - * Notify for thread-group leaders without subthreads. - */ - if (thread_group_empty(tsk)) - do_notify_pidfd(tsk); + + /* ptraced, or group-leader without sub-threads */ + do_notify_pidfd(tsk); if (sig != SIGCHLD) { /* diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e0eeacbe2521..bb48498ebb5a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -619,7 +619,7 @@ static inline void clocksource_stop_watchdog(void) { if (!watchdog_running || (watchdog && !list_empty(&watchdog_list))) return; - del_timer(&watchdog_timer); + timer_delete(&watchdog_timer); watchdog_running = 0; } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 22376a1a75b9..517ee2590a29 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -465,19 +465,17 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer, static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif -static inline void -debug_init(struct hrtimer *timer, clockid_t clockid, - enum hrtimer_mode mode) +static inline void debug_setup(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) { debug_hrtimer_init(timer); - trace_hrtimer_init(timer, clockid, mode); + trace_hrtimer_setup(timer, clockid, mode); } -static inline void debug_init_on_stack(struct hrtimer *timer, clockid_t clockid, - enum hrtimer_mode mode) +static inline void debug_setup_on_stack(struct hrtimer *timer, clockid_t clockid, + enum hrtimer_mode mode) { debug_hrtimer_init_on_stack(timer); - trace_hrtimer_init(timer, clockid, mode); + trace_hrtimer_setup(timer, clockid, mode); } static inline void debug_activate(struct hrtimer *timer, @@ -1316,8 +1314,6 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, struct hrtimer_clock_base *base; unsigned long flags; - if (WARN_ON_ONCE(!timer->function)) - return; /* * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard @@ -1429,7 +1425,7 @@ static __always_inline bool is_migration_base(struct hrtimer_clock_base *base) * running. * * This prevents priority inversion: if the soft irq thread is preempted - * in the middle of a timer callback, then calling del_timer_sync() can + * in the middle of a timer callback, then calling hrtimer_cancel() can * lead to two issues: * * - If the caller is on a remote CPU then it has to spin wait for the timer @@ -1592,8 +1588,9 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) } } -static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) +static void __hrtimer_setup(struct hrtimer *timer, + enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t clock_id, enum hrtimer_mode mode) { bool softtimer = !!(mode & HRTIMER_MODE_SOFT); struct hrtimer_cpu_base *cpu_base; @@ -1626,39 +1623,12 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, timer->is_hard = !!(mode & HRTIMER_MODE_HARD); timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); -} - -static void __hrtimer_setup(struct hrtimer *timer, - enum hrtimer_restart (*function)(struct hrtimer *), - clockid_t clock_id, enum hrtimer_mode mode) -{ - __hrtimer_init(timer, clock_id, mode); if (WARN_ON_ONCE(!function)) - timer->function = hrtimer_dummy_timeout; + ACCESS_PRIVATE(timer, function) = hrtimer_dummy_timeout; else - timer->function = function; -} - -/** - * hrtimer_init - initialize a timer to the given clock - * @timer: the timer to be initialized - * @clock_id: the clock to be used - * @mode: The modes which are relevant for initialization: - * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, - * HRTIMER_MODE_REL_SOFT - * - * The PINNED variants of the above can be handed in, - * but the PINNED bit is ignored as pinning happens - * when the hrtimer is started - */ -void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) -{ - debug_init(timer, clock_id, mode); - __hrtimer_init(timer, clock_id, mode); + ACCESS_PRIVATE(timer, function) = function; } -EXPORT_SYMBOL_GPL(hrtimer_init); /** * hrtimer_setup - initialize a timer to the given clock @@ -1676,7 +1646,7 @@ EXPORT_SYMBOL_GPL(hrtimer_init); void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { - debug_init(timer, clock_id, mode); + debug_setup(timer, clock_id, mode); __hrtimer_setup(timer, function, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup); @@ -1695,7 +1665,7 @@ void hrtimer_setup_on_stack(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { - debug_init_on_stack(timer, clock_id, mode); + debug_setup_on_stack(timer, clock_id, mode); __hrtimer_setup(timer, function, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup_on_stack); @@ -1769,7 +1739,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, raw_write_seqcount_barrier(&base->seq); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); - fn = timer->function; + fn = ACCESS_PRIVATE(timer, function); /* * Clear the 'is relative' flag for the TIME_LOW_RES case. If the @@ -2044,7 +2014,7 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, * Make the enqueue delivery mode check work on RT. If the sleeper * was initialized for hard interrupt delivery, force the mode bit. * This is a special case for hrtimer_sleepers because - * __hrtimer_init_sleeper() determines the delivery mode on RT so the + * __hrtimer_setup_sleeper() determines the delivery mode on RT so the * fiddling with this decision is avoided at the call sites. */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) @@ -2054,8 +2024,8 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, } EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); -static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, - clockid_t clock_id, enum hrtimer_mode mode) +static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl, + clockid_t clock_id, enum hrtimer_mode mode) { /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitly @@ -2081,8 +2051,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, mode |= HRTIMER_MODE_HARD; } - __hrtimer_init(&sl->timer, clock_id, mode); - sl->timer.function = hrtimer_wakeup; + __hrtimer_setup(&sl->timer, hrtimer_wakeup, clock_id, mode); sl->task = current; } @@ -2095,8 +2064,8 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { - debug_init_on_stack(&sl->timer, clock_id, mode); - __hrtimer_init_sleeper(sl, clock_id, mode); + debug_setup_on_stack(&sl->timer, clock_id, mode); + __hrtimer_setup_sleeper(sl, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup_sleeper_on_stack); diff --git a/kernel/time/sleep_timeout.c b/kernel/time/sleep_timeout.c index dfe939f6e4ec..c0e960a5de39 100644 --- a/kernel/time/sleep_timeout.c +++ b/kernel/time/sleep_timeout.c @@ -97,7 +97,7 @@ signed long __sched schedule_timeout(signed long timeout) timer.timer.expires = expire; add_timer(&timer.timer); schedule(); - del_timer_sync(&timer.timer); + timer_delete_sync(&timer.timer); /* Remove the timer from the object tracker */ destroy_timer_on_stack(&timer.timer); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 929846b8b45a..1e67d076f195 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -682,19 +682,20 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act } /** - * timekeeping_forward - update clock to given cycle now value + * timekeeping_forward_now - update clock to the current time * @tk: Pointer to the timekeeper to update - * @cycle_now: Current clocksource read value * * Forward the current clock to update its state since the last call to * update_wall_time(). This is useful before significant clock changes, * as it avoids having to deal with this time offset explicitly. */ -static void timekeeping_forward(struct timekeeper *tk, u64 cycle_now) +static void timekeeping_forward_now(struct timekeeper *tk) { - u64 delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask, - tk->tkr_mono.clock->max_raw_delta); + u64 cycle_now, delta; + cycle_now = tk_clock_read(&tk->tkr_mono); + delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask, + tk->tkr_mono.clock->max_raw_delta); tk->tkr_mono.cycle_last = cycle_now; tk->tkr_raw.cycle_last = cycle_now; @@ -710,21 +711,6 @@ static void timekeeping_forward(struct timekeeper *tk, u64 cycle_now) } /** - * timekeeping_forward_now - update clock to the current time - * @tk: Pointer to the timekeeper to update - * - * Forward the current clock to update its state since the last call to - * update_wall_time(). This is useful before significant clock changes, - * as it avoids having to deal with this time offset explicitly. - */ -static void timekeeping_forward_now(struct timekeeper *tk) -{ - u64 cycle_now = tk_clock_read(&tk->tkr_mono); - - timekeeping_forward(tk, cycle_now); -} - -/** * ktime_get_real_ts64 - Returns the time of day in a timespec64. * @ts: pointer to the timespec to be set * @@ -2165,54 +2151,6 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, return offset; } -static u64 timekeeping_accumulate(struct timekeeper *tk, u64 offset, - enum timekeeping_adv_mode mode, - unsigned int *clock_set) -{ - int shift = 0, maxshift; - - /* - * TK_ADV_FREQ indicates that adjtimex(2) directly set the - * frequency or the tick length. - * - * Accumulate the offset, so that the new multiplier starts from - * now. This is required as otherwise for offsets, which are - * smaller than tk::cycle_interval, timekeeping_adjust() could set - * xtime_nsec backwards, which subsequently causes time going - * backwards in the coarse time getters. But even for the case - * where offset is greater than tk::cycle_interval the periodic - * accumulation does not have much value. - * - * Also reset tk::ntp_error as it does not make sense to keep the - * old accumulated error around in this case. - */ - if (mode == TK_ADV_FREQ) { - timekeeping_forward(tk, tk->tkr_mono.cycle_last + offset); - tk->ntp_error = 0; - return 0; - } - - /* - * With NO_HZ we may have to accumulate many cycle_intervals - * (think "ticks") worth of time at once. To do this efficiently, - * we calculate the largest doubling multiple of cycle_intervals - * that is smaller than the offset. We then accumulate that - * chunk in one go, and then try to consume the next smaller - * doubled multiple. - */ - shift = ilog2(offset) - ilog2(tk->cycle_interval); - shift = max(0, shift); - /* Bound shift to one less than what overflows tick_length */ - maxshift = (64 - (ilog2(ntp_tick_length()) + 1)) - 1; - shift = min(shift, maxshift); - while (offset >= tk->cycle_interval) { - offset = logarithmic_accumulation(tk, offset, shift, clock_set); - if (offset < tk->cycle_interval << shift) - shift--; - } - return offset; -} - /* * timekeeping_advance - Updates the timekeeper to the current time and * current NTP tick length @@ -2222,6 +2160,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) struct timekeeper *tk = &tk_core.shadow_timekeeper; struct timekeeper *real_tk = &tk_core.timekeeper; unsigned int clock_set = 0; + int shift = 0, maxshift; u64 offset; guard(raw_spinlock_irqsave)(&tk_core.lock); @@ -2238,7 +2177,24 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) return false; - offset = timekeeping_accumulate(tk, offset, mode, &clock_set); + /* + * With NO_HZ we may have to accumulate many cycle_intervals + * (think "ticks") worth of time at once. To do this efficiently, + * we calculate the largest doubling multiple of cycle_intervals + * that is smaller than the offset. We then accumulate that + * chunk in one go, and then try to consume the next smaller + * doubled multiple. + */ + shift = ilog2(offset) - ilog2(tk->cycle_interval); + shift = max(0, shift); + /* Bound shift to one less than what overflows tick_length */ + maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; + shift = min(shift, maxshift); + while (offset >= tk->cycle_interval) { + offset = logarithmic_accumulation(tk, offset, shift, &clock_set); + if (offset < tk->cycle_interval<<shift) + shift--; + } /* Adjust the multiplier to correct NTP error */ timekeeping_adjust(tk, offset); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index c8f776dc6ee0..4d915c0a263c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -744,7 +744,7 @@ static bool timer_fixup_init(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_ACTIVE: - del_timer_sync(timer); + timer_delete_sync(timer); debug_object_init(timer, &timer_debug_descr); return true; default: @@ -790,7 +790,7 @@ static bool timer_fixup_free(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_ACTIVE: - del_timer_sync(timer); + timer_delete_sync(timer); debug_object_free(timer, &timer_debug_descr); return true; default: @@ -1212,10 +1212,10 @@ EXPORT_SYMBOL(mod_timer_pending); * * mod_timer(timer, expires) is equivalent to: * - * del_timer(timer); timer->expires = expires; add_timer(timer); + * timer_delete(timer); timer->expires = expires; add_timer(timer); * * mod_timer() is more efficient than the above open coded sequence. In - * case that the timer is inactive, the del_timer() part is a NOP. The + * case that the timer is inactive, the timer_delete() part is a NOP. The * timer is in any case activated with the new expiry time @expires. * * Note that if there are multiple unserialized concurrent users of the diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index cfbb46cc4e76..b03d0ada6469 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -46,7 +46,7 @@ static void print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { - SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, timer->function); + SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, ACCESS_PRIVATE(timer, function)); SEQ_printf(m, ", S:%02x", timer->state); SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 033fba0633cf..a3f35c7d83b6 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -265,8 +265,7 @@ config FUNCTION_GRAPH_RETADDR config FUNCTION_TRACE_ARGS bool - depends on HAVE_FUNCTION_ARG_ACCESS_API - depends on DEBUG_INFO_BTF + depends on PROBE_EVENTS_BTF_ARGS default y help If supported with function argument access API and BTF, then diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 33082c4e8154..95c6e3473a76 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -89,8 +89,11 @@ static bool delete_fprobe_node(struct fprobe_hlist_node *node) { lockdep_assert_held(&fprobe_mutex); - WRITE_ONCE(node->fp, NULL); - hlist_del_rcu(&node->hlist); + /* Avoid double deleting */ + if (READ_ONCE(node->fp) != NULL) { + WRITE_ONCE(node->fp, NULL); + hlist_del_rcu(&node->hlist); + } return !!find_first_fprobe_node(node->addr); } @@ -411,6 +414,102 @@ static void fprobe_graph_remove_ips(unsigned long *addrs, int num) ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0); } +#ifdef CONFIG_MODULES + +#define FPROBE_IPS_BATCH_INIT 8 +/* instruction pointer address list */ +struct fprobe_addr_list { + int index; + int size; + unsigned long *addrs; +}; + +static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long addr) +{ + unsigned long *addrs; + + if (alist->index >= alist->size) + return -ENOMEM; + + alist->addrs[alist->index++] = addr; + if (alist->index < alist->size) + return 0; + + /* Expand the address list */ + addrs = kcalloc(alist->size * 2, sizeof(*addrs), GFP_KERNEL); + if (!addrs) + return -ENOMEM; + + memcpy(addrs, alist->addrs, alist->size * sizeof(*addrs)); + alist->size *= 2; + kfree(alist->addrs); + alist->addrs = addrs; + + return 0; +} + +static void fprobe_remove_node_in_module(struct module *mod, struct hlist_head *head, + struct fprobe_addr_list *alist) +{ + struct fprobe_hlist_node *node; + int ret = 0; + + hlist_for_each_entry_rcu(node, head, hlist) { + if (!within_module(node->addr, mod)) + continue; + if (delete_fprobe_node(node)) + continue; + /* + * If failed to update alist, just continue to update hlist. + * Therefore, at list user handler will not hit anymore. + */ + if (!ret) + ret = fprobe_addr_list_add(alist, node->addr); + } +} + +/* Handle module unloading to manage fprobe_ip_table. */ +static int fprobe_module_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct fprobe_addr_list alist = {.size = FPROBE_IPS_BATCH_INIT}; + struct module *mod = data; + int i; + + if (val != MODULE_STATE_GOING) + return NOTIFY_DONE; + + alist.addrs = kcalloc(alist.size, sizeof(*alist.addrs), GFP_KERNEL); + /* If failed to alloc memory, we can not remove ips from hash. */ + if (!alist.addrs) + return NOTIFY_DONE; + + mutex_lock(&fprobe_mutex); + for (i = 0; i < FPROBE_IP_TABLE_SIZE; i++) + fprobe_remove_node_in_module(mod, &fprobe_ip_table[i], &alist); + + if (alist.index < alist.size && alist.index > 0) + ftrace_set_filter_ips(&fprobe_graph_ops.ops, + alist.addrs, alist.index, 1, 0); + mutex_unlock(&fprobe_mutex); + + kfree(alist.addrs); + + return NOTIFY_DONE; +} + +static struct notifier_block fprobe_module_nb = { + .notifier_call = fprobe_module_callback, + .priority = 0, +}; + +static int __init init_fprobe_module(void) +{ + return register_module_notifier(&fprobe_module_nb); +} +early_initcall(init_fprobe_module); +#endif + static int symbols_cmp(const void *a, const void *b) { const char **str_a = (const char **) a; @@ -445,6 +544,7 @@ struct filter_match_data { size_t index; size_t size; unsigned long *addrs; + struct module **mods; }; static int filter_match_callback(void *data, const char *name, unsigned long addr) @@ -458,30 +558,47 @@ static int filter_match_callback(void *data, const char *name, unsigned long add if (!ftrace_location(addr)) return 0; - if (match->addrs) - match->addrs[match->index] = addr; + if (match->addrs) { + struct module *mod = __module_text_address(addr); + + if (mod && !try_module_get(mod)) + return 0; + match->mods[match->index] = mod; + match->addrs[match->index] = addr; + } match->index++; return match->index == match->size; } /* * Make IP list from the filter/no-filter glob patterns. - * Return the number of matched symbols, or -ENOENT. + * Return the number of matched symbols, or errno. + * If @addrs == NULL, this just counts the number of matched symbols. If @addrs + * is passed with an array, we need to pass the an @mods array of the same size + * to increment the module refcount for each symbol. + * This means we also need to call `module_put` for each element of @mods after + * using the @addrs. */ -static int ip_list_from_filter(const char *filter, const char *notfilter, - unsigned long *addrs, size_t size) +static int get_ips_from_filter(const char *filter, const char *notfilter, + unsigned long *addrs, struct module **mods, + size_t size) { struct filter_match_data match = { .filter = filter, .notfilter = notfilter, - .index = 0, .size = size, .addrs = addrs}; + .index = 0, .size = size, .addrs = addrs, .mods = mods}; int ret; + if (addrs && !mods) + return -EINVAL; + ret = kallsyms_on_each_symbol(filter_match_callback, &match); if (ret < 0) return ret; - ret = module_kallsyms_on_each_symbol(NULL, filter_match_callback, &match); - if (ret < 0) - return ret; + if (IS_ENABLED(CONFIG_MODULES)) { + ret = module_kallsyms_on_each_symbol(NULL, filter_match_callback, &match); + if (ret < 0) + return ret; + } return match.index ?: -ENOENT; } @@ -543,24 +660,35 @@ static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num) */ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter) { - unsigned long *addrs; - int ret; + unsigned long *addrs __free(kfree) = NULL; + struct module **mods __free(kfree) = NULL; + int ret, num; if (!fp || !filter) return -EINVAL; - ret = ip_list_from_filter(filter, notfilter, NULL, FPROBE_IPS_MAX); - if (ret < 0) - return ret; + num = get_ips_from_filter(filter, notfilter, NULL, NULL, FPROBE_IPS_MAX); + if (num < 0) + return num; - addrs = kcalloc(ret, sizeof(unsigned long), GFP_KERNEL); + addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL); if (!addrs) return -ENOMEM; - ret = ip_list_from_filter(filter, notfilter, addrs, ret); - if (ret > 0) - ret = register_fprobe_ips(fp, addrs, ret); - kfree(addrs); + mods = kcalloc(num, sizeof(*mods), GFP_KERNEL); + if (!mods) + return -ENOMEM; + + ret = get_ips_from_filter(filter, notfilter, addrs, mods, num); + if (ret < 0) + return ret; + + ret = register_fprobe_ips(fp, addrs, ret); + + for (int i = 0; i < num; i++) { + if (mods[i]) + module_put(mods[i]); + } return ret; } EXPORT_SYMBOL_GPL(register_fprobe); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 92015de6203d..1a48aedb5255 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6855,6 +6855,7 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer) } } } + cond_resched(); } while_for_each_ftrace_rec(); return fail ? -EINVAL : 0; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d8d7b28e2c2f..c0f877d39a24 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -6016,7 +6016,7 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) meta->read = cpu_buffer->read; /* Some archs do not have data cache coherency between kernel and user-space */ - flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page)); + flush_kernel_vmap_range(cpu_buffer->meta_page, PAGE_SIZE); } static void @@ -7319,7 +7319,8 @@ consume: out: /* Some archs do not have data cache coherency between kernel and user-space */ - flush_dcache_folio(virt_to_folio(cpu_buffer->reader_page->page)); + flush_kernel_vmap_range(cpu_buffer->reader_page->page, + buffer->subbuf_size + BUF_PAGE_HDR_SIZE); rb_update_meta_page(cpu_buffer); diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 50344aa9f7f9..968c5c3b0246 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -809,7 +809,8 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent) if (p && rv_is_nested_monitor(p)) { pr_info("Parent monitor %s is already nested, cannot nest further\n", parent->name); - return -EINVAL; + retval = -EINVAL; + goto out_unlock; } r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 103b193875b3..b581e388a9d9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -50,6 +50,7 @@ #include <linux/irq_work.h> #include <linux/workqueue.h> #include <linux/sort.h> +#include <linux/io.h> /* vmap_page_range() */ #include <asm/setup.h> /* COMMAND_LINE_SIZE */ @@ -3341,10 +3342,9 @@ out_nobuffer: } EXPORT_SYMBOL_GPL(trace_vbprintk); -__printf(3, 0) -static int -__trace_array_vprintk(struct trace_buffer *buffer, - unsigned long ip, const char *fmt, va_list args) +static __printf(3, 0) +int __trace_array_vprintk(struct trace_buffer *buffer, + unsigned long ip, const char *fmt, va_list args) { struct ring_buffer_event *event; int len = 0, size; @@ -3394,7 +3394,6 @@ out_nobuffer: return len; } -__printf(3, 0) int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { @@ -3424,7 +3423,6 @@ int trace_array_vprintk(struct trace_array *tr, * Note, trace_array_init_printk() must be called on @tr before this * can be used. */ -__printf(3, 0) int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...) { @@ -3469,7 +3467,6 @@ int trace_array_init_printk(struct trace_array *tr) } EXPORT_SYMBOL_GPL(trace_array_init_printk); -__printf(3, 4) int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...) { @@ -3485,7 +3482,6 @@ int trace_array_printk_buf(struct trace_buffer *buffer, return ret; } -__printf(2, 0) int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { return trace_array_vprintk(printk_trace, ip, fmt, args); @@ -8505,6 +8501,10 @@ static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) struct trace_iterator *iter = &info->iter; int ret = 0; + /* A memmap'ed buffer is not supported for user space mmap */ + if (iter->tr->flags & TRACE_ARRAY_FL_MEMMAP) + return -ENODEV; + /* Currently the boot mapped buffer is not supported for mmap */ if (iter->tr->flags & TRACE_ARRAY_FL_BOOT) return -ENODEV; @@ -9609,13 +9609,11 @@ static void free_trace_buffers(struct trace_array *tr) return; free_trace_buffer(&tr->array_buffer); + kfree(tr->module_delta); #ifdef CONFIG_TRACER_MAX_TRACE free_trace_buffer(&tr->max_buffer); #endif - - if (tr->range_addr_start) - vunmap((void *)tr->range_addr_start); } static void init_trace_flags_index(struct trace_array *tr) @@ -9808,29 +9806,27 @@ static int instance_mkdir(const char *name) return ret; } -static u64 map_pages(u64 start, u64 size) +static u64 map_pages(unsigned long start, unsigned long size) { - struct page **pages; - phys_addr_t page_start; - unsigned int page_count; - unsigned int i; - void *vaddr; - - page_count = DIV_ROUND_UP(size, PAGE_SIZE); + unsigned long vmap_start, vmap_end; + struct vm_struct *area; + int ret; - page_start = start; - pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL); - if (!pages) + area = get_vm_area(size, VM_IOREMAP); + if (!area) return 0; - for (i = 0; i < page_count; i++) { - phys_addr_t addr = page_start + i * PAGE_SIZE; - pages[i] = pfn_to_page(addr >> PAGE_SHIFT); + vmap_start = (unsigned long) area->addr; + vmap_end = vmap_start + size; + + ret = vmap_page_range(vmap_start, vmap_end, + start, pgprot_nx(PAGE_KERNEL)); + if (ret < 0) { + free_vm_area(area); + return 0; } - vaddr = vmap(pages, page_count, VM_MAP, PAGE_KERNEL); - kfree(pages); - return (u64)(unsigned long)vaddr; + return (u64)vmap_start; } /** @@ -10709,6 +10705,7 @@ static inline void do_allocate_snapshot(const char *name) { } __init static void enable_instances(void) { struct trace_array *tr; + bool memmap_area = false; char *curr_str; char *name; char *str; @@ -10777,6 +10774,7 @@ __init static void enable_instances(void) name); continue; } + memmap_area = true; } else if (tok) { if (!reserve_mem_find_by_name(tok, &start, &size)) { start = 0; @@ -10787,7 +10785,20 @@ __init static void enable_instances(void) } if (start) { - addr = map_pages(start, size); + /* Start and size must be page aligned */ + if (start & ~PAGE_MASK) { + pr_warn("Tracing: mapping start addr %pa is not page aligned\n", &start); + continue; + } + if (size & ~PAGE_MASK) { + pr_warn("Tracing: mapping size %pa is not page aligned\n", &size); + continue; + } + + if (memmap_area) + addr = map_pages(start, size); + else + addr = (unsigned long)phys_to_virt(start); if (addr) { pr_info("Tracing: mapped boot instance %s at physical memory %pa of size 0x%lx\n", name, &start, (unsigned long)size); @@ -10814,10 +10825,13 @@ __init static void enable_instances(void) update_printk_trace(tr); /* - * If start is set, then this is a mapped buffer, and - * cannot be deleted by user space, so keep the reference - * to it. + * memmap'd buffers can not be freed. */ + if (memmap_area) { + tr->flags |= TRACE_ARRAY_FL_MEMMAP; + tr->ref++; + } + if (start) { tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT; tr->range_name = no_free_ptr(rname); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c20f6bcc200a..79be1995db44 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -447,6 +447,7 @@ enum { TRACE_ARRAY_FL_BOOT = BIT(1), TRACE_ARRAY_FL_LAST_BOOT = BIT(2), TRACE_ARRAY_FL_MOD_INIT = BIT(3), + TRACE_ARRAY_FL_MEMMAP = BIT(4), }; #ifdef CONFIG_MODULES @@ -852,13 +853,15 @@ static inline void __init disable_tracing_selftest(const char *reason) extern void *head_page(struct trace_array_cpu *data); extern unsigned long long ns2usecs(u64 nsec); -extern int -trace_vbprintk(unsigned long ip, const char *fmt, va_list args); -extern int -trace_vprintk(unsigned long ip, const char *fmt, va_list args); -extern int -trace_array_vprintk(struct trace_array *tr, - unsigned long ip, const char *fmt, va_list args); + +__printf(2, 0) +int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); +__printf(2, 0) +int trace_vprintk(unsigned long ip, const char *fmt, va_list args); +__printf(3, 0) +int trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args); +__printf(3, 4) int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...); void trace_printk_seq(struct trace_seq *s); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 8638b7f7ff85..069e92856bda 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -470,6 +470,7 @@ static void test_event_printk(struct trace_event_call *call) case '%': continue; case 'p': + do_pointer: /* Find dereferencing fields */ switch (fmt[i + 1]) { case 'B': case 'R': case 'r': @@ -498,6 +499,12 @@ static void test_event_printk(struct trace_event_call *call) continue; if (fmt[i + j] == '*') { star = true; + /* Handle %*pbl case */ + if (!j && fmt[i + 1] == 'p') { + arg++; + i++; + goto do_pointer; + } continue; } if ((fmt[i + j] == 's')) { diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index 5d7ca80173ea..b40fa59159ac 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -919,9 +919,15 @@ static void __find_tracepoint_module_cb(struct tracepoint *tp, struct module *mo struct __find_tracepoint_cb_data *data = priv; if (!data->tpoint && !strcmp(data->tp_name, tp->name)) { - data->tpoint = tp; - if (!data->mod) + /* If module is not specified, try getting module refcount. */ + if (!data->mod && mod) { + /* If failed to get refcount, ignore this tracepoint. */ + if (!try_module_get(mod)) + return; + data->mod = mod; + } + data->tpoint = tp; } } @@ -933,7 +939,11 @@ static void __find_tracepoint_cb(struct tracepoint *tp, void *priv) data->tpoint = tp; } -/* Find a tracepoint from kernel and module. */ +/* + * Find a tracepoint from kernel and module. If the tracepoint is on the module, + * the module's refcount is incremented and returned as *@tp_mod. Thus, if it is + * not NULL, caller must call module_put(*tp_mod) after used the tracepoint. + */ static struct tracepoint *find_tracepoint(const char *tp_name, struct module **tp_mod) { @@ -962,7 +972,10 @@ static void reenable_trace_fprobe(struct trace_fprobe *tf) } } -/* Find a tracepoint from specified module. */ +/* + * Find a tracepoint from specified module. In this case, this does not get the + * module's refcount. The caller must ensure the module is not freed. + */ static struct tracepoint *find_tracepoint_in_module(struct module *mod, const char *tp_name) { @@ -1169,11 +1182,6 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], if (is_tracepoint) { ctx->flags |= TPARG_FL_TPOINT; tpoint = find_tracepoint(symbol, &tp_mod); - /* lock module until register this tprobe. */ - if (tp_mod && !try_module_get(tp_mod)) { - tpoint = NULL; - tp_mod = NULL; - } if (tpoint) { ctx->funcname = kallsyms_lookup( (unsigned long)tpoint->probestub, diff --git a/kernel/ucount.c b/kernel/ucount.c index 86c5f1c0bad9..8686e329b8f2 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -11,11 +11,14 @@ struct ucounts init_ucounts = { .ns = &init_user_ns, .uid = GLOBAL_ROOT_UID, - .count = ATOMIC_INIT(1), + .count = RCUREF_INIT(1), }; #define UCOUNTS_HASHTABLE_BITS 10 -static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)]; +#define UCOUNTS_HASHTABLE_ENTRIES (1 << UCOUNTS_HASHTABLE_BITS) +static struct hlist_nulls_head ucounts_hashtable[UCOUNTS_HASHTABLE_ENTRIES] = { + [0 ... UCOUNTS_HASHTABLE_ENTRIES - 1] = HLIST_NULLS_HEAD_INIT(0) +}; static DEFINE_SPINLOCK(ucounts_lock); #define ucounts_hashfn(ns, uid) \ @@ -24,7 +27,6 @@ static DEFINE_SPINLOCK(ucounts_lock); #define ucounts_hashentry(ns, uid) \ (ucounts_hashtable + ucounts_hashfn(ns, uid)) - #ifdef CONFIG_SYSCTL static struct ctl_table_set * set_lookup(struct ctl_table_root *root) @@ -127,88 +129,73 @@ void retire_userns_sysctls(struct user_namespace *ns) #endif } -static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent) +static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, + struct hlist_nulls_head *hashent) { struct ucounts *ucounts; + struct hlist_nulls_node *pos; - hlist_for_each_entry(ucounts, hashent, node) { - if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns)) - return ucounts; + guard(rcu)(); + hlist_nulls_for_each_entry_rcu(ucounts, pos, hashent, node) { + if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns)) { + if (rcuref_get(&ucounts->count)) + return ucounts; + } } return NULL; } static void hlist_add_ucounts(struct ucounts *ucounts) { - struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid); + struct hlist_nulls_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid); + spin_lock_irq(&ucounts_lock); - hlist_add_head(&ucounts->node, hashent); + hlist_nulls_add_head_rcu(&ucounts->node, hashent); spin_unlock_irq(&ucounts_lock); } -static inline bool get_ucounts_or_wrap(struct ucounts *ucounts) +struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) { - /* Returns true on a successful get, false if the count wraps. */ - return !atomic_add_negative(1, &ucounts->count); -} + struct hlist_nulls_head *hashent = ucounts_hashentry(ns, uid); + struct ucounts *ucounts, *new; -struct ucounts *get_ucounts(struct ucounts *ucounts) -{ - if (!get_ucounts_or_wrap(ucounts)) { - put_ucounts(ucounts); - ucounts = NULL; - } - return ucounts; -} + ucounts = find_ucounts(ns, uid, hashent); + if (ucounts) + return ucounts; -struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) -{ - struct hlist_head *hashent = ucounts_hashentry(ns, uid); - bool wrapped; - struct ucounts *ucounts, *new = NULL; + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + new->ns = ns; + new->uid = uid; + rcuref_init(&new->count, 1); spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); - if (!ucounts) { + if (ucounts) { spin_unlock_irq(&ucounts_lock); - - new = kzalloc(sizeof(*new), GFP_KERNEL); - if (!new) - return NULL; - - new->ns = ns; - new->uid = uid; - atomic_set(&new->count, 1); - - spin_lock_irq(&ucounts_lock); - ucounts = find_ucounts(ns, uid, hashent); - if (!ucounts) { - hlist_add_head(&new->node, hashent); - get_user_ns(new->ns); - spin_unlock_irq(&ucounts_lock); - return new; - } + kfree(new); + return ucounts; } - wrapped = !get_ucounts_or_wrap(ucounts); + hlist_nulls_add_head_rcu(&new->node, hashent); + get_user_ns(new->ns); spin_unlock_irq(&ucounts_lock); - kfree(new); - if (wrapped) { - put_ucounts(ucounts); - return NULL; - } - return ucounts; + return new; } void put_ucounts(struct ucounts *ucounts) { unsigned long flags; - if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) { - hlist_del_init(&ucounts->node); + if (rcuref_put(&ucounts->count)) { + spin_lock_irqsave(&ucounts_lock, flags); + hlist_nulls_del_rcu(&ucounts->node); spin_unlock_irqrestore(&ucounts_lock, flags); + put_user_ns(ucounts->ns); - kfree(ucounts); + kfree_rcu(ucounts, rcu); } } diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index a78ff092d636..75af12ff774e 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -269,12 +269,10 @@ void __init hardlockup_config_perf_event(const char *str) } else { unsigned int len = comma - str; - if (len >= sizeof(buf)) + if (len > sizeof(buf)) return; - if (strscpy(buf, str, sizeof(buf)) < 0) - return; - buf[len] = 0; + strscpy(buf, str, len); if (kstrtoull(buf, 16, &config)) return; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bfe030b443e2..cf6203282737 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2057,11 +2057,11 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags, struct delayed_work *dwork = to_delayed_work(work); /* - * dwork->timer is irqsafe. If del_timer() fails, it's + * dwork->timer is irqsafe. If timer_delete() fails, it's * guaranteed that the timer is not queued anywhere and not * running on the local CPU. */ - if (likely(del_timer(&dwork->timer))) + if (likely(timer_delete(&dwork->timer))) return 1; } @@ -3069,7 +3069,7 @@ restart: break; } - del_timer_sync(&pool->mayday_timer); + timer_delete_sync(&pool->mayday_timer); raw_spin_lock_irq(&pool->lock); /* * This is necessary even after a new worker was just successfully @@ -4281,7 +4281,7 @@ EXPORT_SYMBOL_GPL(flush_work); bool flush_delayed_work(struct delayed_work *dwork) { local_irq_disable(); - if (del_timer_sync(&dwork->timer)) + if (timer_delete_sync(&dwork->timer)) __queue_work(dwork->cpu, dwork->wq, &dwork->work); local_irq_enable(); return flush_work(&dwork->work); @@ -4984,9 +4984,9 @@ static void put_unbound_pool(struct worker_pool *pool) reap_dying_workers(&cull_list); /* shut down the timers */ - del_timer_sync(&pool->idle_timer); + timer_delete_sync(&pool->idle_timer); cancel_work_sync(&pool->idle_cull_work); - del_timer_sync(&pool->mayday_timer); + timer_delete_sync(&pool->mayday_timer); /* RCU protected to allow dereferences from get_work_pool() */ call_rcu(&pool->rcu, rcu_free_pool); @@ -7637,7 +7637,7 @@ notrace void wq_watchdog_touch(int cpu) static void wq_watchdog_set_thresh(unsigned long thresh) { wq_watchdog_thresh = 0; - del_timer_sync(&wq_watchdog_timer); + timer_delete_sync(&wq_watchdog_timer); if (thresh) { wq_watchdog_thresh = thresh; |