summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/core.c7
-rw-r--r--kernel/bpf/hashtab.c4
-rw-r--r--kernel/bpf/helpers.c30
-rw-r--r--kernel/bpf/verifier.c1
-rw-r--r--kernel/cfi.c8
-rw-r--r--kernel/cgroup/rstat.c19
-rw-r--r--kernel/cpu_pm.c50
-rw-r--r--kernel/events/core.c35
-rw-r--r--kernel/irq/chip.c5
-rw-r--r--kernel/irq/msi.c13
-rw-r--r--kernel/irq/timings.c5
-rw-r--r--kernel/locking/rtmutex.c2
-rw-r--r--kernel/notifier.c19
-rw-r--r--kernel/power/energy_model.c4
-rw-r--r--kernel/power/main.c2
-rw-r--r--kernel/power/suspend.c4
-rw-r--r--kernel/power/suspend_test.c2
-rw-r--r--kernel/sched/core.c90
-rw-r--r--kernel/sched/cpufreq_schedutil.c16
-rw-r--r--kernel/seccomp.c2
-rw-r--r--kernel/time/timer.c6
-rw-r--r--kernel/trace/Kconfig5
-rw-r--r--kernel/trace/bpf_trace.c13
-rw-r--r--kernel/trace/trace.c22
-rw-r--r--kernel/trace/trace.h32
-rw-r--r--kernel/trace/trace_events_hist.c26
-rw-r--r--kernel/trace/trace_hwlat.c2
-rw-r--r--kernel/trace/trace_osnoise.c56
-rw-r--r--kernel/tracepoint.c155
-rw-r--r--kernel/ucount.c29
30 files changed, 454 insertions, 210 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b1a5fc04492b..0a28a8095d3e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1362,11 +1362,13 @@ u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)
}
/**
- * __bpf_prog_run - run eBPF program on a given context
+ * ___bpf_prog_run - run eBPF program on a given context
* @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
* @insn: is the array of eBPF instructions
*
* Decode and execute eBPF instructions.
+ *
+ * Return: whatever value is in %BPF_R0 at program exit
*/
static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
{
@@ -1878,6 +1880,9 @@ static void bpf_prog_select_func(struct bpf_prog *fp)
*
* Try to JIT eBPF program, if JIT is not available, use interpreter.
* The BPF program will be executed via BPF_PROG_RUN() macro.
+ *
+ * Return: the &fp argument along with &err set to 0 for success or
+ * a negative errno code on failure
*/
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 72c58cc516a3..9c011f3a2687 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1565,8 +1565,8 @@ alloc:
/* We cannot do copy_from_user or copy_to_user inside
* the rcu_read_lock. Allocate enough space here.
*/
- keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN);
- values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN);
+ keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
+ values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN);
if (!keys || !values) {
ret = -ENOMEM;
goto after_loop;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 62cf00383910..55f83ea09dae 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -353,9 +353,15 @@ const struct bpf_func_proto bpf_jiffies64_proto = {
#ifdef CONFIG_CGROUPS
BPF_CALL_0(bpf_get_current_cgroup_id)
{
- struct cgroup *cgrp = task_dfl_cgroup(current);
+ struct cgroup *cgrp;
+ u64 cgrp_id;
- return cgroup_id(cgrp);
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(current);
+ cgrp_id = cgroup_id(cgrp);
+ rcu_read_unlock();
+
+ return cgrp_id;
}
const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
@@ -366,13 +372,17 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level)
{
- struct cgroup *cgrp = task_dfl_cgroup(current);
+ struct cgroup *cgrp;
struct cgroup *ancestor;
+ u64 cgrp_id;
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(current);
ancestor = cgroup_ancestor(cgrp, ancestor_level);
- if (!ancestor)
- return 0;
- return cgroup_id(ancestor);
+ cgrp_id = ancestor ? cgroup_id(ancestor) : 0;
+ rcu_read_unlock();
+
+ return cgrp_id;
}
const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
@@ -397,8 +407,8 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
void *ptr;
int i;
- for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
- if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
+ for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) {
+ if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
continue;
storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]);
@@ -1070,12 +1080,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_probe_read_user:
return &bpf_probe_read_user_proto;
case BPF_FUNC_probe_read_kernel:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_kernel_proto;
case BPF_FUNC_probe_read_user_str:
return &bpf_probe_read_user_str_proto;
case BPF_FUNC_probe_read_kernel_str:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_kernel_str_proto;
case BPF_FUNC_snprintf_btf:
return &bpf_snprintf_btf_proto;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f9bda5476ea5..381d3d6f24bc 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -11663,6 +11663,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env)
if (aux_data[i].seen)
continue;
memcpy(insn + i, &trap, sizeof(trap));
+ aux_data[i].zext_dst = false;
}
}
diff --git a/kernel/cfi.c b/kernel/cfi.c
index e17a56639766..9594cfd1cf2c 100644
--- a/kernel/cfi.c
+++ b/kernel/cfi.c
@@ -248,9 +248,9 @@ static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr)
{
cfi_check_fn fn;
- rcu_read_lock_sched();
+ rcu_read_lock_sched_notrace();
fn = ptr_to_check_fn(rcu_dereference_sched(cfi_shadow), ptr);
- rcu_read_unlock_sched();
+ rcu_read_unlock_sched_notrace();
return fn;
}
@@ -269,11 +269,11 @@ static inline cfi_check_fn find_module_check_fn(unsigned long ptr)
cfi_check_fn fn = NULL;
struct module *mod;
- rcu_read_lock_sched();
+ rcu_read_lock_sched_notrace();
mod = __module_address(ptr);
if (mod)
fn = mod->cfi_check;
- rcu_read_unlock_sched();
+ rcu_read_unlock_sched_notrace();
return fn;
}
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 7f0e58917432..b264ab5652ba 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -347,19 +347,20 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
}
static struct cgroup_rstat_cpu *
-cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp)
+cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
struct cgroup_rstat_cpu *rstatc;
rstatc = get_cpu_ptr(cgrp->rstat_cpu);
- u64_stats_update_begin(&rstatc->bsync);
+ *flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
return rstatc;
}
static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
- struct cgroup_rstat_cpu *rstatc)
+ struct cgroup_rstat_cpu *rstatc,
+ unsigned long flags)
{
- u64_stats_update_end(&rstatc->bsync);
+ u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
cgroup_rstat_updated(cgrp, smp_processor_id());
put_cpu_ptr(rstatc);
}
@@ -367,18 +368,20 @@ static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
struct cgroup_rstat_cpu *rstatc;
+ unsigned long flags;
- rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
+ rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
- cgroup_base_stat_cputime_account_end(cgrp, rstatc);
+ cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}
void __cgroup_account_cputime_field(struct cgroup *cgrp,
enum cpu_usage_stat index, u64 delta_exec)
{
struct cgroup_rstat_cpu *rstatc;
+ unsigned long flags;
- rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
+ rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
switch (index) {
case CPUTIME_USER:
@@ -394,7 +397,7 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
break;
}
- cgroup_base_stat_cputime_account_end(cgrp, rstatc);
+ cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}
/*
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index f7e1d0eccdbc..246efc74e3f3 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -13,19 +13,32 @@
#include <linux/spinlock.h>
#include <linux/syscore_ops.h>
-static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain);
+/*
+ * atomic_notifiers use a spinlock_t, which can block under PREEMPT_RT.
+ * Notifications for cpu_pm will be issued by the idle task itself, which can
+ * never block, IOW it requires using a raw_spinlock_t.
+ */
+static struct {
+ struct raw_notifier_head chain;
+ raw_spinlock_t lock;
+} cpu_pm_notifier = {
+ .chain = RAW_NOTIFIER_INIT(cpu_pm_notifier.chain),
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(cpu_pm_notifier.lock),
+};
static int cpu_pm_notify(enum cpu_pm_event event)
{
int ret;
/*
- * atomic_notifier_call_chain has a RCU read critical section, which
- * could be disfunctional in cpu idle. Copy RCU_NONIDLE code to let
- * RCU know this.
+ * This introduces a RCU read critical section, which could be
+ * disfunctional in cpu idle. Copy RCU_NONIDLE code to let RCU know
+ * this.
*/
rcu_irq_enter_irqson();
- ret = atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL);
+ rcu_read_lock();
+ ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL);
+ rcu_read_unlock();
rcu_irq_exit_irqson();
return notifier_to_errno(ret);
@@ -33,10 +46,13 @@ static int cpu_pm_notify(enum cpu_pm_event event)
static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event event_down)
{
+ unsigned long flags;
int ret;
rcu_irq_enter_irqson();
- ret = atomic_notifier_call_chain_robust(&cpu_pm_notifier_chain, event_up, event_down, NULL);
+ raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
+ ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL);
+ raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
rcu_irq_exit_irqson();
return notifier_to_errno(ret);
@@ -49,12 +65,17 @@ static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event ev
* Add a driver to a list of drivers that are notified about
* CPU and CPU cluster low power entry and exit.
*
- * This function may sleep, and has the same return conditions as
- * raw_notifier_chain_register.
+ * This function has the same return conditions as raw_notifier_chain_register.
*/
int cpu_pm_register_notifier(struct notifier_block *nb)
{
- return atomic_notifier_chain_register(&cpu_pm_notifier_chain, nb);
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
+ ret = raw_notifier_chain_register(&cpu_pm_notifier.chain, nb);
+ raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
+ return ret;
}
EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
@@ -64,12 +85,17 @@ EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
*
* Remove a driver from the CPU PM notifier list.
*
- * This function may sleep, and has the same return conditions as
- * raw_notifier_chain_unregister.
+ * This function has the same return conditions as raw_notifier_chain_unregister.
*/
int cpu_pm_unregister_notifier(struct notifier_block *nb)
{
- return atomic_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
+ ret = raw_notifier_chain_unregister(&cpu_pm_notifier.chain, nb);
+ raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
+ return ret;
}
EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 464917096e73..1cb1f9b8392e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11917,6 +11917,37 @@ again:
return gctx;
}
+static bool
+perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
+{
+ unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS;
+ bool is_capable = perfmon_capable();
+
+ if (attr->sigtrap) {
+ /*
+ * perf_event_attr::sigtrap sends signals to the other task.
+ * Require the current task to also have CAP_KILL.
+ */
+ rcu_read_lock();
+ is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL);
+ rcu_read_unlock();
+
+ /*
+ * If the required capabilities aren't available, checks for
+ * ptrace permissions: upgrade to ATTACH, since sending signals
+ * can effectively change the target task.
+ */
+ ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS;
+ }
+
+ /*
+ * Preserve ptrace permission check for backwards compatibility. The
+ * ptrace check also includes checks that the current task and other
+ * task have matching uids, and is therefore not done here explicitly.
+ */
+ return is_capable || ptrace_may_access(task, ptrace_mode);
+}
+
/**
* sys_perf_event_open - open a performance event, associate it to a task/cpu
*
@@ -12163,15 +12194,13 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_file;
/*
- * Preserve ptrace permission check for backwards compatibility.
- *
* We must hold exec_update_lock across this and any potential
* perf_install_in_context() call for this new event to
* serialize against exec() altering our credentials (and the
* perf_event_exit_task() that could imply).
*/
err = -EACCES;
- if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
+ if (!perf_check_permission(&attr, task))
goto err_cred;
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 7f04c7d8296e..a98bcfc4be7b 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -265,8 +265,11 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
} else {
switch (__irq_startup_managed(desc, aff, force)) {
case IRQ_STARTUP_NORMAL:
+ if (d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP)
+ irq_setup_affinity(desc);
ret = __irq_startup(desc);
- irq_setup_affinity(desc);
+ if (!(d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP))
+ irq_setup_affinity(desc);
break;
case IRQ_STARTUP_MANAGED:
irq_do_set_affinity(d, aff, false);
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index c41965e348b5..85df3ca03efe 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -476,11 +476,6 @@ skip_activate:
return 0;
cleanup:
- for_each_msi_vector(desc, i, dev) {
- irq_data = irq_domain_get_irq_data(domain, i);
- if (irqd_is_activated(irq_data))
- irq_domain_deactivate_irq(irq_data);
- }
msi_domain_free_irqs(domain, dev);
return ret;
}
@@ -505,7 +500,15 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
{
+ struct irq_data *irq_data;
struct msi_desc *desc;
+ int i;
+
+ for_each_msi_vector(desc, i, dev) {
+ irq_data = irq_domain_get_irq_data(domain, i);
+ if (irqd_is_activated(irq_data))
+ irq_domain_deactivate_irq(irq_data);
+ }
for_each_msi_entry(desc, dev) {
/*
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index d309d6fbf5bd..4d2a702d7aa9 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -453,6 +453,11 @@ static __always_inline void __irq_timings_store(int irq, struct irqt_stat *irqs,
*/
index = irq_timings_interval_index(interval);
+ if (index > PREDICTION_BUFFER_SIZE - 1) {
+ irqs->count = 0;
+ return;
+ }
+
/*
* Store the index as an element of the pattern in another
* circular array.
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index b5d9bb5202c6..ad0db322ed3b 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -343,7 +343,7 @@ static __always_inline bool
rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
enum rtmutex_chainwalk chwalk)
{
- if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEX))
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
return waiter != NULL;
return chwalk == RT_MUTEX_FULL_CHAINWALK;
}
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 1b019cbca594..b8251dc0bc0f 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -172,25 +172,6 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
-int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh,
- unsigned long val_up, unsigned long val_down, void *v)
-{
- unsigned long flags;
- int ret;
-
- /*
- * Musn't use RCU; because then the notifier list can
- * change between the up and down traversal.
- */
- spin_lock_irqsave(&nh->lock, flags);
- ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v);
- spin_unlock_irqrestore(&nh->lock, flags);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(atomic_notifier_call_chain_robust);
-NOKPROBE_SYMBOL(atomic_notifier_call_chain_robust);
-
/**
* atomic_notifier_call_chain - Call functions in an atomic notifier chain
* @nh: Pointer to head of the atomic notifier chain
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 0f4530b3a8cd..a332ccd829e2 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -170,7 +170,9 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
/* Compute the cost of each performance state. */
fmax = (u64) table[nr_states - 1].frequency;
for (i = 0; i < nr_states; i++) {
- table[i].cost = div64_u64(fmax * table[i].power,
+ unsigned long power_res = em_scale_power(table[i].power);
+
+ table[i].cost = div64_u64(fmax * power_res,
table[i].frequency);
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 12c7e1bb442f..44169f3081fd 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -577,7 +577,7 @@ static inline void pm_print_times_init(void) {}
struct kobject *power_kobj;
-/**
+/*
* state - control system sleep states.
*
* show() returns available sleep state labels, which may be "mem", "standby",
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index d8cae434f9eb..eb75f394a059 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -96,7 +96,7 @@ static void s2idle_enter(void)
s2idle_state = S2IDLE_STATE_ENTER;
raw_spin_unlock_irq(&s2idle_lock);
- get_online_cpus();
+ cpus_read_lock();
cpuidle_resume();
/* Push all the CPUs into the idle loop. */
@@ -106,7 +106,7 @@ static void s2idle_enter(void)
s2idle_state == S2IDLE_STATE_WAKE);
cpuidle_pause();
- put_online_cpus();
+ cpus_read_unlock();
raw_spin_lock_irq(&s2idle_lock);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index e1ed58adb69e..d20526c5be15 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -129,7 +129,7 @@ static int __init has_wakealarm(struct device *dev, const void *data)
{
struct rtc_device *candidate = to_rtc_device(dev);
- if (!candidate->ops->set_alarm)
+ if (!test_bit(RTC_FEATURE_ALARM, candidate->features))
return 0;
if (!device_may_wakeup(candidate->dev.parent))
return 0;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d9ff40f4661..20ffcc044134 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1981,12 +1981,18 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
dequeue_task(rq, p, flags);
}
-/*
- * __normal_prio - return the priority that is based on the static prio
- */
-static inline int __normal_prio(struct task_struct *p)
+static inline int __normal_prio(int policy, int rt_prio, int nice)
{
- return p->static_prio;
+ int prio;
+
+ if (dl_policy(policy))
+ prio = MAX_DL_PRIO - 1;
+ else if (rt_policy(policy))
+ prio = MAX_RT_PRIO - 1 - rt_prio;
+ else
+ prio = NICE_TO_PRIO(nice);
+
+ return prio;
}
/*
@@ -1998,15 +2004,7 @@ static inline int __normal_prio(struct task_struct *p)
*/
static inline int normal_prio(struct task_struct *p)
{
- int prio;
-
- if (task_has_dl_policy(p))
- prio = MAX_DL_PRIO-1;
- else if (task_has_rt_policy(p))
- prio = MAX_RT_PRIO-1 - p->rt_priority;
- else
- prio = __normal_prio(p);
- return prio;
+ return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
}
/*
@@ -4099,7 +4097,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
} else if (PRIO_TO_NICE(p->static_prio) < 0)
p->static_prio = NICE_TO_PRIO(0);
- p->prio = p->normal_prio = __normal_prio(p);
+ p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);
/*
@@ -6341,6 +6339,18 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
}
EXPORT_SYMBOL(default_wake_function);
+static void __setscheduler_prio(struct task_struct *p, int prio)
+{
+ if (dl_prio(prio))
+ p->sched_class = &dl_sched_class;
+ else if (rt_prio(prio))
+ p->sched_class = &rt_sched_class;
+ else
+ p->sched_class = &fair_sched_class;
+
+ p->prio = prio;
+}
+
#ifdef CONFIG_RT_MUTEXES
static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
@@ -6456,22 +6466,19 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
} else {
p->dl.pi_se = &p->dl;
}
- p->sched_class = &dl_sched_class;
} else if (rt_prio(prio)) {
if (dl_prio(oldprio))
p->dl.pi_se = &p->dl;
if (oldprio < prio)
queue_flag |= ENQUEUE_HEAD;
- p->sched_class = &rt_sched_class;
} else {
if (dl_prio(oldprio))
p->dl.pi_se = &p->dl;
if (rt_prio(oldprio))
p->rt.timeout = 0;
- p->sched_class = &fair_sched_class;
}
- p->prio = prio;
+ __setscheduler_prio(p, prio);
if (queued)
enqueue_task(rq, p, queue_flag);
@@ -6824,35 +6831,6 @@ static void __setscheduler_params(struct task_struct *p,
set_load_weight(p, true);
}
-/* Actually do priority change: must hold pi & rq lock. */
-static void __setscheduler(struct rq *rq, struct task_struct *p,
- const struct sched_attr *attr, bool keep_boost)
-{
- /*
- * If params can't change scheduling class changes aren't allowed
- * either.
- */
- if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
- return;
-
- __setscheduler_params(p, attr);
-
- /*
- * Keep a potential priority boosting if called from
- * sched_setscheduler().
- */
- p->prio = normal_prio(p);
- if (keep_boost)
- p->prio = rt_effective_prio(p, p->prio);
-
- if (dl_prio(p->prio))
- p->sched_class = &dl_sched_class;
- else if (rt_prio(p->prio))
- p->sched_class = &rt_sched_class;
- else
- p->sched_class = &fair_sched_class;
-}
-
/*
* Check the target process has a UID that matches the current process's:
*/
@@ -6873,10 +6851,8 @@ static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user, bool pi)
{
- int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
- MAX_RT_PRIO - 1 - attr->sched_priority;
- int retval, oldprio, oldpolicy = -1, queued, running;
- int new_effective_prio, policy = attr->sched_policy;
+ int oldpolicy = -1, policy = attr->sched_policy;
+ int retval, oldprio, newprio, queued, running;
const struct sched_class *prev_class;
struct callback_head *head;
struct rq_flags rf;
@@ -7074,6 +7050,7 @@ change:
p->sched_reset_on_fork = reset_on_fork;
oldprio = p->prio;
+ newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
if (pi) {
/*
* Take priority boosted tasks into account. If the new
@@ -7082,8 +7059,8 @@ change:
* the runqueue. This will be done when the task deboost
* itself.
*/
- new_effective_prio = rt_effective_prio(p, newprio);
- if (new_effective_prio == oldprio)
+ newprio = rt_effective_prio(p, newprio);
+ if (newprio == oldprio)
queue_flags &= ~DEQUEUE_MOVE;
}
@@ -7096,7 +7073,10 @@ change:
prev_class = p->sched_class;
- __setscheduler(rq, p, attr, pi);
+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
+ __setscheduler_params(p, attr);
+ __setscheduler_prio(p, newprio);
+ }
__setscheduler_uclamp(p, attr);
if (queued) {
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 57124614363d..e7af18857371 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -537,9 +537,17 @@ static struct attribute *sugov_attrs[] = {
};
ATTRIBUTE_GROUPS(sugov);
+static void sugov_tunables_free(struct kobject *kobj)
+{
+ struct gov_attr_set *attr_set = container_of(kobj, struct gov_attr_set, kobj);
+
+ kfree(to_sugov_tunables(attr_set));
+}
+
static struct kobj_type sugov_tunables_ktype = {
.default_groups = sugov_groups,
.sysfs_ops = &governor_sysfs_ops,
+ .release = &sugov_tunables_free,
};
/********************** cpufreq governor interface *********************/
@@ -639,12 +647,10 @@ static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_polic
return tunables;
}
-static void sugov_tunables_free(struct sugov_tunables *tunables)
+static void sugov_clear_global_tunables(void)
{
if (!have_governor_per_policy())
global_tunables = NULL;
-
- kfree(tunables);
}
static int sugov_init(struct cpufreq_policy *policy)
@@ -707,7 +713,7 @@ out:
fail:
kobject_put(&tunables->attr_set.kobj);
policy->governor_data = NULL;
- sugov_tunables_free(tunables);
+ sugov_clear_global_tunables();
stop_kthread:
sugov_kthread_stop(sg_policy);
@@ -734,7 +740,7 @@ static void sugov_exit(struct cpufreq_policy *policy)
count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
policy->governor_data = NULL;
if (!count)
- sugov_tunables_free(tunables);
+ sugov_clear_global_tunables();
mutex_unlock(&global_tunables_lock);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 057e17f3215d..6469eca8078c 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -602,7 +602,7 @@ static inline void seccomp_sync_threads(unsigned long flags)
smp_store_release(&thread->seccomp.filter,
caller->seccomp.filter);
atomic_set(&thread->seccomp.filter_count,
- atomic_read(&thread->seccomp.filter_count));
+ atomic_read(&caller->seccomp.filter_count));
/*
* Don't let an unprivileged task work around
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 9eb11c2209e5..e3d2c23c413d 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1265,8 +1265,10 @@ static inline void timer_base_unlock_expiry(struct timer_base *base)
static void timer_sync_wait_running(struct timer_base *base)
{
if (atomic_read(&base->timer_waiters)) {
+ raw_spin_unlock_irq(&base->lock);
spin_unlock(&base->expiry_lock);
spin_lock(&base->expiry_lock);
+ raw_spin_lock_irq(&base->lock);
}
}
@@ -1457,14 +1459,14 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
if (timer->flags & TIMER_IRQSAFE) {
raw_spin_unlock(&base->lock);
call_timer_fn(timer, fn, baseclk);
- base->running_timer = NULL;
raw_spin_lock(&base->lock);
+ base->running_timer = NULL;
} else {
raw_spin_unlock_irq(&base->lock);
call_timer_fn(timer, fn, baseclk);
+ raw_spin_lock_irq(&base->lock);
base->running_timer = NULL;
timer_sync_wait_running(base);
- raw_spin_lock_irq(&base->lock);
}
}
}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d567b1717c4c..3ee23f4d437f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -219,6 +219,11 @@ config DYNAMIC_FTRACE_WITH_DIRECT_CALLS
depends on DYNAMIC_FTRACE_WITH_REGS
depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+config DYNAMIC_FTRACE_WITH_ARGS
+ def_bool y
+ depends on DYNAMIC_FTRACE
+ depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS
+
config FUNCTION_PROFILER
bool "Kernel function profiler"
depends on FUNCTION_TRACER
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b4916ef388ad..fdd14072fc3b 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -990,28 +990,29 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_numa_node_id_proto;
case BPF_FUNC_perf_event_read:
return &bpf_perf_event_read_proto;
- case BPF_FUNC_probe_write_user:
- return bpf_get_probe_write_proto();
case BPF_FUNC_current_task_under_cgroup:
return &bpf_current_task_under_cgroup_proto;
case BPF_FUNC_get_prandom_u32:
return &bpf_get_prandom_u32_proto;
+ case BPF_FUNC_probe_write_user:
+ return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ?
+ NULL : bpf_get_probe_write_proto();
case BPF_FUNC_probe_read_user:
return &bpf_probe_read_user_proto;
case BPF_FUNC_probe_read_kernel:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_kernel_proto;
case BPF_FUNC_probe_read_user_str:
return &bpf_probe_read_user_str_proto;
case BPF_FUNC_probe_read_kernel_str:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_kernel_str_proto;
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
case BPF_FUNC_probe_read:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_compat_proto;
case BPF_FUNC_probe_read_str:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_compat_str_proto;
#endif
#ifdef CONFIG_CGROUPS
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c59dd35a6da5..a1adb29ef5c1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2897,14 +2897,26 @@ int tracepoint_printk_sysctl(struct ctl_table *table, int write,
void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
{
+ enum event_trigger_type tt = ETT_NONE;
+ struct trace_event_file *file = fbuffer->trace_file;
+
+ if (__event_trigger_test_discard(file, fbuffer->buffer, fbuffer->event,
+ fbuffer->entry, &tt))
+ goto discard;
+
if (static_key_false(&tracepoint_printk_key.key))
output_printk(fbuffer);
if (static_branch_unlikely(&trace_event_exports_enabled))
ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT);
- event_trigger_unlock_commit_regs(fbuffer->trace_file, fbuffer->buffer,
- fbuffer->event, fbuffer->entry,
- fbuffer->trace_ctx, fbuffer->regs);
+
+ trace_buffer_unlock_commit_regs(file->tr, fbuffer->buffer,
+ fbuffer->event, fbuffer->trace_ctx, fbuffer->regs);
+
+discard:
+ if (tt)
+ event_triggers_post_call(file, tt);
+
}
EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
@@ -9135,8 +9147,10 @@ static int trace_array_create_dir(struct trace_array *tr)
return -EINVAL;
ret = event_trace_add_tracer(tr->dir, tr);
- if (ret)
+ if (ret) {
tracefs_remove(tr->dir);
+ return ret;
+ }
init_tracer_tracefs(tr, tr->dir);
__update_tracer_options(tr);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index a180abf76d4e..4a0e693000c6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1389,38 +1389,6 @@ event_trigger_unlock_commit(struct trace_event_file *file,
event_triggers_post_call(file, tt);
}
-/**
- * event_trigger_unlock_commit_regs - handle triggers and finish event commit
- * @file: The file pointer associated with the event
- * @buffer: The ring buffer that the event is being written to
- * @event: The event meta data in the ring buffer
- * @entry: The event itself
- * @trace_ctx: The tracing context flags.
- *
- * This is a helper function to handle triggers that require data
- * from the event itself. It also tests the event against filters and
- * if the event is soft disabled and should be discarded.
- *
- * Same as event_trigger_unlock_commit() but calls
- * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit().
- */
-static inline void
-event_trigger_unlock_commit_regs(struct trace_event_file *file,
- struct trace_buffer *buffer,
- struct ring_buffer_event *event,
- void *entry, unsigned int trace_ctx,
- struct pt_regs *regs)
-{
- enum event_trigger_type tt = ETT_NONE;
-
- if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
- trace_buffer_unlock_commit_regs(file->tr, buffer, event,
- trace_ctx, regs);
-
- if (tt)
- event_triggers_post_call(file, tt);
-}
-
#define FILTER_PRED_INVALID ((unsigned short)-1)
#define FILTER_PRED_IS_RIGHT (1 << 15)
#define FILTER_PRED_FOLD (1 << 15)
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 34325f41ebc0..a48aa2a2875b 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -65,7 +65,8 @@
C(INVALID_SORT_MODIFIER,"Invalid sort modifier"), \
C(EMPTY_SORT_FIELD, "Empty sort field"), \
C(TOO_MANY_SORT_FIELDS, "Too many sort fields (Max = 2)"), \
- C(INVALID_SORT_FIELD, "Sort field must be a key or a val"),
+ C(INVALID_SORT_FIELD, "Sort field must be a key or a val"), \
+ C(INVALID_STR_OPERAND, "String type can not be an operand in expression"),
#undef C
#define C(a, b) HIST_ERR_##a
@@ -2156,6 +2157,13 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
ret = PTR_ERR(operand1);
goto free;
}
+ if (operand1->flags & HIST_FIELD_FL_STRING) {
+ /* String type can not be the operand of unary operator. */
+ hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(str));
+ destroy_hist_field(operand1, 0);
+ ret = -EINVAL;
+ goto free;
+ }
expr->flags |= operand1->flags &
(HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
@@ -2257,6 +2265,11 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
operand1 = NULL;
goto free;
}
+ if (operand1->flags & HIST_FIELD_FL_STRING) {
+ hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(operand1_str));
+ ret = -EINVAL;
+ goto free;
+ }
/* rest of string could be another expression e.g. b+c in a+b+c */
operand_flags = 0;
@@ -2266,6 +2279,11 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
operand2 = NULL;
goto free;
}
+ if (operand2->flags & HIST_FIELD_FL_STRING) {
+ hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(str));
+ ret = -EINVAL;
+ goto free;
+ }
ret = check_expr_operands(file->tr, operand1, operand2);
if (ret)
@@ -2287,6 +2305,10 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
expr->operands[0] = operand1;
expr->operands[1] = operand2;
+
+ /* The operand sizes should be the same, so just pick one */
+ expr->size = operand1->size;
+
expr->operator = field_op;
expr->name = expr_str(expr, 0);
expr->type = kstrdup(operand1->type, GFP_KERNEL);
@@ -3408,6 +3430,8 @@ trace_action_create_field_var(struct hist_trigger_data *hist_data,
event = data->match_data.event;
}
+ if (!event)
+ goto free;
/*
* At this point, we're looking at a field on another
* event. Because we can't modify a hist trigger on
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index a6c0cdaf4b87..14f46aae1981 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -327,7 +327,7 @@ static void move_to_next_cpu(void)
get_online_cpus();
cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
- next_cpu = cpumask_next(smp_processor_id(), current_mask);
+ next_cpu = cpumask_next(raw_smp_processor_id(), current_mask);
put_online_cpus();
if (next_cpu >= nr_cpu_ids)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index a7e3c24dee13..b61eefe5ccf5 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -253,6 +253,7 @@ static struct osnoise_data {
*/
static bool osnoise_busy;
+#ifdef CONFIG_PREEMPT_RT
/*
* Print the osnoise header info.
*/
@@ -261,6 +262,35 @@ static void print_osnoise_headers(struct seq_file *s)
if (osnoise_data.tainted)
seq_puts(s, "# osnoise is tainted!\n");
+ seq_puts(s, "# _-------=> irqs-off\n");
+ seq_puts(s, "# / _------=> need-resched\n");
+ seq_puts(s, "# | / _-----=> need-resched-lazy\n");
+ seq_puts(s, "# || / _----=> hardirq/softirq\n");
+ seq_puts(s, "# ||| / _---=> preempt-depth\n");
+ seq_puts(s, "# |||| / _--=> preempt-lazy-depth\n");
+ seq_puts(s, "# ||||| / _-=> migrate-disable\n");
+
+ seq_puts(s, "# |||||| / ");
+ seq_puts(s, " MAX\n");
+
+ seq_puts(s, "# ||||| / ");
+ seq_puts(s, " SINGLE Interference counters:\n");
+
+ seq_puts(s, "# ||||||| RUNTIME ");
+ seq_puts(s, " NOISE %% OF CPU NOISE +-----------------------------+\n");
+
+ seq_puts(s, "# TASK-PID CPU# ||||||| TIMESTAMP IN US ");
+ seq_puts(s, " IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD\n");
+
+ seq_puts(s, "# | | | ||||||| | | ");
+ seq_puts(s, " | | | | | | | |\n");
+}
+#else /* CONFIG_PREEMPT_RT */
+static void print_osnoise_headers(struct seq_file *s)
+{
+ if (osnoise_data.tainted)
+ seq_puts(s, "# osnoise is tainted!\n");
+
seq_puts(s, "# _-----=> irqs-off\n");
seq_puts(s, "# / _----=> need-resched\n");
seq_puts(s, "# | / _---=> hardirq/softirq\n");
@@ -279,6 +309,7 @@ static void print_osnoise_headers(struct seq_file *s)
seq_puts(s, "# | | | |||| | | ");
seq_puts(s, " | | | | | | | |\n");
}
+#endif /* CONFIG_PREEMPT_RT */
/*
* osnoise_taint - report an osnoise error.
@@ -323,6 +354,24 @@ static void trace_osnoise_sample(struct osnoise_sample *sample)
/*
* Print the timerlat header info.
*/
+#ifdef CONFIG_PREEMPT_RT
+static void print_timerlat_headers(struct seq_file *s)
+{
+ seq_puts(s, "# _-------=> irqs-off\n");
+ seq_puts(s, "# / _------=> need-resched\n");
+ seq_puts(s, "# | / _-----=> need-resched-lazy\n");
+ seq_puts(s, "# || / _----=> hardirq/softirq\n");
+ seq_puts(s, "# ||| / _---=> preempt-depth\n");
+ seq_puts(s, "# |||| / _--=> preempt-lazy-depth\n");
+ seq_puts(s, "# ||||| / _-=> migrate-disable\n");
+ seq_puts(s, "# |||||| /\n");
+ seq_puts(s, "# ||||||| ACTIVATION\n");
+ seq_puts(s, "# TASK-PID CPU# ||||||| TIMESTAMP ID ");
+ seq_puts(s, " CONTEXT LATENCY\n");
+ seq_puts(s, "# | | | ||||||| | | ");
+ seq_puts(s, " | |\n");
+}
+#else /* CONFIG_PREEMPT_RT */
static void print_timerlat_headers(struct seq_file *s)
{
seq_puts(s, "# _-----=> irqs-off\n");
@@ -336,6 +385,7 @@ static void print_timerlat_headers(struct seq_file *s)
seq_puts(s, "# | | | |||| | | ");
seq_puts(s, " | |\n");
}
+#endif /* CONFIG_PREEMPT_RT */
/*
* Record an timerlat_sample into the tracer buffer.
@@ -1025,9 +1075,13 @@ diff_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *
/*
* osnoise_stop_tracing - Stop tracing and the tracer.
*/
-static void osnoise_stop_tracing(void)
+static __always_inline void osnoise_stop_tracing(void)
{
struct trace_array *tr = osnoise_trace;
+
+ trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_,
+ "stop tracing hit on cpu %d\n", smp_processor_id());
+
tracer_tracing_off(tr);
}
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index fc32821f8240..efd14c79fab4 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -15,12 +15,57 @@
#include <linux/sched/task.h>
#include <linux/static_key.h>
+enum tp_func_state {
+ TP_FUNC_0,
+ TP_FUNC_1,
+ TP_FUNC_2,
+ TP_FUNC_N,
+};
+
extern tracepoint_ptr_t __start___tracepoints_ptrs[];
extern tracepoint_ptr_t __stop___tracepoints_ptrs[];
DEFINE_SRCU(tracepoint_srcu);
EXPORT_SYMBOL_GPL(tracepoint_srcu);
+enum tp_transition_sync {
+ TP_TRANSITION_SYNC_1_0_1,
+ TP_TRANSITION_SYNC_N_2_1,
+
+ _NR_TP_TRANSITION_SYNC,
+};
+
+struct tp_transition_snapshot {
+ unsigned long rcu;
+ unsigned long srcu;
+ bool ongoing;
+};
+
+/* Protected by tracepoints_mutex */
+static struct tp_transition_snapshot tp_transition_snapshot[_NR_TP_TRANSITION_SYNC];
+
+static void tp_rcu_get_state(enum tp_transition_sync sync)
+{
+ struct tp_transition_snapshot *snapshot = &tp_transition_snapshot[sync];
+
+ /* Keep the latest get_state snapshot. */
+ snapshot->rcu = get_state_synchronize_rcu();
+ snapshot->srcu = start_poll_synchronize_srcu(&tracepoint_srcu);
+ snapshot->ongoing = true;
+}
+
+static void tp_rcu_cond_sync(enum tp_transition_sync sync)
+{
+ struct tp_transition_snapshot *snapshot = &tp_transition_snapshot[sync];
+
+ if (!snapshot->ongoing)
+ return;
+ cond_synchronize_rcu(snapshot->rcu);
+ if (!poll_state_synchronize_srcu(&tracepoint_srcu, snapshot->srcu))
+ synchronize_srcu(&tracepoint_srcu);
+ snapshot->ongoing = false;
+}
+
/* Set to 1 to enable tracepoint debug output */
static const int tracepoint_debug;
@@ -246,26 +291,29 @@ static void *func_remove(struct tracepoint_func **funcs,
return old;
}
-static void tracepoint_update_call(struct tracepoint *tp, struct tracepoint_func *tp_funcs, bool sync)
+/*
+ * Count the number of functions (enum tp_func_state) in a tp_funcs array.
+ */
+static enum tp_func_state nr_func_state(const struct tracepoint_func *tp_funcs)
+{
+ if (!tp_funcs)
+ return TP_FUNC_0;
+ if (!tp_funcs[1].func)
+ return TP_FUNC_1;
+ if (!tp_funcs[2].func)
+ return TP_FUNC_2;
+ return TP_FUNC_N; /* 3 or more */
+}
+
+static void tracepoint_update_call(struct tracepoint *tp, struct tracepoint_func *tp_funcs)
{
void *func = tp->iterator;
/* Synthetic events do not have static call sites */
if (!tp->static_call_key)
return;
-
- if (!tp_funcs[1].func) {
+ if (nr_func_state(tp_funcs) == TP_FUNC_1)
func = tp_funcs[0].func;
- /*
- * If going from the iterator back to a single caller,
- * we need to synchronize with __DO_TRACE to make sure
- * that the data passed to the callback is the one that
- * belongs to that callback.
- */
- if (sync)
- tracepoint_synchronize_unregister();
- }
-
__static_call_update(tp->static_call_key, tp->static_call_tramp, func);
}
@@ -299,9 +347,41 @@ static int tracepoint_add_func(struct tracepoint *tp,
* a pointer to it. This array is referenced by __DO_TRACE from
* include/linux/tracepoint.h using rcu_dereference_sched().
*/
- tracepoint_update_call(tp, tp_funcs, false);
- rcu_assign_pointer(tp->funcs, tp_funcs);
- static_key_enable(&tp->key);
+ switch (nr_func_state(tp_funcs)) {
+ case TP_FUNC_1: /* 0->1 */
+ /*
+ * Make sure new static func never uses old data after a
+ * 1->0->1 transition sequence.
+ */
+ tp_rcu_cond_sync(TP_TRANSITION_SYNC_1_0_1);
+ /* Set static call to first function */
+ tracepoint_update_call(tp, tp_funcs);
+ /* Both iterator and static call handle NULL tp->funcs */
+ rcu_assign_pointer(tp->funcs, tp_funcs);
+ static_key_enable(&tp->key);
+ break;
+ case TP_FUNC_2: /* 1->2 */
+ /* Set iterator static call */
+ tracepoint_update_call(tp, tp_funcs);
+ /*
+ * Iterator callback installed before updating tp->funcs.
+ * Requires ordering between RCU assign/dereference and
+ * static call update/call.
+ */
+ fallthrough;
+ case TP_FUNC_N: /* N->N+1 (N>1) */
+ rcu_assign_pointer(tp->funcs, tp_funcs);
+ /*
+ * Make sure static func never uses incorrect data after a
+ * N->...->2->1 (N>1) transition sequence.
+ */
+ if (tp_funcs[0].data != old[0].data)
+ tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
release_probes(old);
return 0;
@@ -328,17 +408,52 @@ static int tracepoint_remove_func(struct tracepoint *tp,
/* Failed allocating new tp_funcs, replaced func with stub */
return 0;
- if (!tp_funcs) {
+ switch (nr_func_state(tp_funcs)) {
+ case TP_FUNC_0: /* 1->0 */
/* Removed last function */
if (tp->unregfunc && static_key_enabled(&tp->key))
tp->unregfunc();
static_key_disable(&tp->key);
+ /* Set iterator static call */
+ tracepoint_update_call(tp, tp_funcs);
+ /* Both iterator and static call handle NULL tp->funcs */
+ rcu_assign_pointer(tp->funcs, NULL);
+ /*
+ * Make sure new static func never uses old data after a
+ * 1->0->1 transition sequence.
+ */
+ tp_rcu_get_state(TP_TRANSITION_SYNC_1_0_1);
+ break;
+ case TP_FUNC_1: /* 2->1 */
rcu_assign_pointer(tp->funcs, tp_funcs);
- } else {
+ /*
+ * Make sure static func never uses incorrect data after a
+ * N->...->2->1 (N>2) transition sequence. If the first
+ * element's data has changed, then force the synchronization
+ * to prevent current readers that have loaded the old data
+ * from calling the new function.
+ */
+ if (tp_funcs[0].data != old[0].data)
+ tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1);
+ tp_rcu_cond_sync(TP_TRANSITION_SYNC_N_2_1);
+ /* Set static call to first function */
+ tracepoint_update_call(tp, tp_funcs);
+ break;
+ case TP_FUNC_2: /* N->N-1 (N>2) */
+ fallthrough;
+ case TP_FUNC_N:
rcu_assign_pointer(tp->funcs, tp_funcs);
- tracepoint_update_call(tp, tp_funcs,
- tp_funcs[0].func != old[0].func);
+ /*
+ * Make sure static func never uses incorrect data after a
+ * N->...->2->1 (N>2) transition sequence.
+ */
+ if (tp_funcs[0].data != old[0].data)
+ tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
}
release_probes(old);
return 0;
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 87799e2379bd..bb51849e6375 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -58,14 +58,17 @@ static struct ctl_table_root set_root = {
.permissions = set_permissions,
};
-#define UCOUNT_ENTRY(name) \
- { \
- .procname = name, \
- .maxlen = sizeof(int), \
- .mode = 0644, \
- .proc_handler = proc_dointvec_minmax, \
- .extra1 = SYSCTL_ZERO, \
- .extra2 = SYSCTL_INT_MAX, \
+static long ue_zero = 0;
+static long ue_int_max = INT_MAX;
+
+#define UCOUNT_ENTRY(name) \
+ { \
+ .procname = name, \
+ .maxlen = sizeof(long), \
+ .mode = 0644, \
+ .proc_handler = proc_doulongvec_minmax, \
+ .extra1 = &ue_zero, \
+ .extra2 = &ue_int_max, \
}
static struct ctl_table user_table[] = {
UCOUNT_ENTRY("max_user_namespaces"),
@@ -160,6 +163,7 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
{
struct hlist_head *hashent = ucounts_hashentry(ns, uid);
struct ucounts *ucounts, *new;
+ long overflow;
spin_lock_irq(&ucounts_lock);
ucounts = find_ucounts(ns, uid, hashent);
@@ -184,8 +188,12 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
return new;
}
}
+ overflow = atomic_add_negative(1, &ucounts->count);
spin_unlock_irq(&ucounts_lock);
- ucounts = get_ucounts(ucounts);
+ if (overflow) {
+ put_ucounts(ucounts);
+ return NULL;
+ }
return ucounts;
}
@@ -193,8 +201,7 @@ void put_ucounts(struct ucounts *ucounts)
{
unsigned long flags;
- if (atomic_dec_and_test(&ucounts->count)) {
- spin_lock_irqsave(&ucounts_lock, flags);
+ if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) {
hlist_del_init(&ucounts->node);
spin_unlock_irqrestore(&ucounts_lock, flags);
kfree(ucounts);