| author | Jakub Kicinski <kuba@kernel.org> | 2023-01-05 15:34:11 -0800 |
|---|---|---|
| committer | Jakub Kicinski <kuba@kernel.org> | 2023-01-05 15:34:11 -0800 |
| commit | 4aea86b4033f92f01547e6d4388d4451ae9b0980 (patch) | |
| tree | 1e6e70b8133187b56d636ad9bb942c81b2654173 /kernel | |
| parent | 0471005efac9ac70bffd50532f8de07a28eac5aa (diff) | |
| parent | 50011c32f421215f6231996fcc84fd1fe81c4a48 (diff) | |
| download | lwn-4aea86b4033f92f01547e6d4388d4451ae9b0980.tar.gz, lwn-4aea86b4033f92f01547e6d4388d4451ae9b0980.zip | |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
No conflicts.
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/Makefile | 3 |
| -rw-r--r-- | kernel/bpf/bpf_lsm.c | 2 |
| -rw-r--r-- | kernel/bpf/task_iter.c | 39 |
| -rw-r--r-- | kernel/bpf/trampoline.c | 4 |
| -rw-r--r-- | kernel/bpf/verifier.c | 21 |
| -rw-r--r-- | kernel/dma/mapping.c | 4 |
| -rw-r--r-- | kernel/events/core.c | 54 |
| -rw-r--r-- | kernel/exit.c | 6 |
| -rw-r--r-- | kernel/futex/syscalls.c | 11 |
| -rw-r--r-- | kernel/gcov/gcc_4_7.c | 5 |
| -rw-r--r-- | kernel/kprobes.c | 24 |
| -rw-r--r-- | kernel/locking/rtmutex.c | 55 |
| -rw-r--r-- | kernel/locking/rtmutex_api.c | 6 |
| -rw-r--r-- | kernel/panic.c | 7 |
| -rw-r--r-- | kernel/trace/rv/monitors/wip/wip.h | 2 |
| -rw-r--r-- | kernel/trace/rv/monitors/wwnr/wwnr.h | 2 |
| -rw-r--r-- | kernel/trace/trace.c | 2 |
| -rw-r--r-- | kernel/trace/trace_probe.c | 65 |
| -rw-r--r-- | kernel/trace/trace_probe.h | 19 |
| -rw-r--r-- | kernel/trace/trace_probe_tmpl.h | 47 |
| -rw-r--r-- | kernel/trace/trace_uprobe.c | 3 |
21 files changed, 253 insertions, 128 deletions
```diff
diff --git a/kernel/Makefile b/kernel/Makefile
index e7fc37a68069..10ef068f598d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -41,9 +41,6 @@ UBSAN_SANITIZE_kcov.o := n
 KMSAN_SANITIZE_kcov.o := n
 CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector
 
-# Don't instrument error handlers
-CFLAGS_REMOVE_cfi.o := $(CC_FLAGS_CFI)
-
 obj-y += sched/
 obj-y += locking/
 obj-y += power/
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 9ea42a45da47..a4a41ee3e80b 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -351,8 +351,10 @@ BTF_ID(func, bpf_lsm_bpf_prog_alloc_security)
 BTF_ID(func, bpf_lsm_bpf_prog_free_security)
 BTF_ID(func, bpf_lsm_file_alloc_security)
 BTF_ID(func, bpf_lsm_file_free_security)
+#ifdef CONFIG_SECURITY_NETWORK
 BTF_ID(func, bpf_lsm_sk_alloc_security)
 BTF_ID(func, bpf_lsm_sk_free_security)
+#endif /* CONFIG_SECURITY_NETWORK */
 BTF_ID(func, bpf_lsm_task_free)
 BTF_SET_END(untrusted_lsm_hooks)
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index c2a2182ce570..c4ab9d6cdbe9 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -438,6 +438,7 @@ struct bpf_iter_seq_task_vma_info {
 	 */
 	struct bpf_iter_seq_task_common common;
 	struct task_struct *task;
+	struct mm_struct *mm;
 	struct vm_area_struct *vma;
 	u32 tid;
 	unsigned long prev_vm_start;
@@ -456,16 +457,19 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
 	enum bpf_task_vma_iter_find_op op;
 	struct vm_area_struct *curr_vma;
 	struct task_struct *curr_task;
+	struct mm_struct *curr_mm;
 	u32 saved_tid = info->tid;
 
 	/* If this function returns a non-NULL vma, it holds a reference to
-	 * the task_struct, and holds read lock on vma->mm->mmap_lock.
+	 * the task_struct, holds a refcount on mm->mm_users, and holds
+	 * read lock on vma->mm->mmap_lock.
 	 * If this function returns NULL, it does not hold any reference or
 	 * lock.
 	 */
 	if (info->task) {
 		curr_task = info->task;
 		curr_vma = info->vma;
+		curr_mm = info->mm;
 		/* In case of lock contention, drop mmap_lock to unblock
 		 * the writer.
 		 *
@@ -504,13 +508,15 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
 		 * 4.2) VMA2 and VMA2' covers different ranges, process
 		 *      VMA2'.
 		 */
-		if (mmap_lock_is_contended(curr_task->mm)) {
+		if (mmap_lock_is_contended(curr_mm)) {
 			info->prev_vm_start = curr_vma->vm_start;
 			info->prev_vm_end = curr_vma->vm_end;
 			op = task_vma_iter_find_vma;
-			mmap_read_unlock(curr_task->mm);
-			if (mmap_read_lock_killable(curr_task->mm))
+			mmap_read_unlock(curr_mm);
+			if (mmap_read_lock_killable(curr_mm)) {
+				mmput(curr_mm);
 				goto finish;
+			}
 		} else {
 			op = task_vma_iter_next_vma;
 		}
@@ -535,42 +541,47 @@ again:
 			op = task_vma_iter_find_vma;
 		}
 
-		if (!curr_task->mm)
+		curr_mm = get_task_mm(curr_task);
+		if (!curr_mm)
 			goto next_task;
 
-		if (mmap_read_lock_killable(curr_task->mm))
+		if (mmap_read_lock_killable(curr_mm)) {
+			mmput(curr_mm);
 			goto finish;
+		}
 	}
 
 	switch (op) {
 	case task_vma_iter_first_vma:
-		curr_vma = find_vma(curr_task->mm, 0);
+		curr_vma = find_vma(curr_mm, 0);
 		break;
 	case task_vma_iter_next_vma:
-		curr_vma = find_vma(curr_task->mm, curr_vma->vm_end);
+		curr_vma = find_vma(curr_mm, curr_vma->vm_end);
 		break;
 	case task_vma_iter_find_vma:
 		/* We dropped mmap_lock so it is necessary to use find_vma
 		 * to find the next vma. This is similar to the mechanism
 		 * in show_smaps_rollup().
 		 */
-		curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1);
+		curr_vma = find_vma(curr_mm, info->prev_vm_end - 1);
 		/* case 1) and 4.2) above just use curr_vma */
 
 		/* check for case 2) or case 4.1) above */
 		if (curr_vma &&
 		    curr_vma->vm_start == info->prev_vm_start &&
 		    curr_vma->vm_end == info->prev_vm_end)
-			curr_vma = find_vma(curr_task->mm, curr_vma->vm_end);
+			curr_vma = find_vma(curr_mm, curr_vma->vm_end);
 		break;
 	}
 	if (!curr_vma) {
 		/* case 3) above, or case 2) 4.1) with vma->next == NULL */
-		mmap_read_unlock(curr_task->mm);
+		mmap_read_unlock(curr_mm);
+		mmput(curr_mm);
 		goto next_task;
 	}
 	info->task = curr_task;
 	info->vma = curr_vma;
+	info->mm = curr_mm;
 	return curr_vma;
 
 next_task:
@@ -579,6 +590,7 @@ next_task:
 
 	put_task_struct(curr_task);
 	info->task = NULL;
+	info->mm = NULL;
 	info->tid++;
 	goto again;
 
@@ -587,6 +599,7 @@ finish:
 		put_task_struct(curr_task);
 	info->task = NULL;
 	info->vma = NULL;
+	info->mm = NULL;
 	return NULL;
 }
 
@@ -658,7 +671,9 @@ static void task_vma_seq_stop(struct seq_file *seq, void *v)
 		 */
 		info->prev_vm_start = ~0UL;
 		info->prev_vm_end = info->vma->vm_end;
-		mmap_read_unlock(info->task->mm);
+		mmap_read_unlock(info->mm);
+		mmput(info->mm);
+		info->mm = NULL;
 		put_task_struct(info->task);
 		info->task = NULL;
 	}
```
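The task_iter.c change above switches the iterator from borrowing `task->mm` to pinning it: `get_task_mm()` takes a reference on `mm->mm_users` so the mm cannot be freed while `mmap_lock` is dropped and re-taken under contention, and every exit path must pair the pin with `mmput()`. A minimal sketch of that pattern (hypothetical helper, not code from this commit):

```c
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>

/* Walk a task's VMAs while holding both an mm_users pin and mmap_lock. */
static int walk_task_vmas(struct task_struct *task)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = get_task_mm(task);	/* pin mm_users */

	if (!mm)
		return -ESRCH;	/* kernel thread or exiting task */

	if (mmap_read_lock_killable(mm)) {
		mmput(mm);	/* drop the pin on every error path */
		return -EINTR;
	}

	for (vma = find_vma(mm, 0); vma; vma = find_vma(mm, vma->vm_end))
		;	/* inspect each vma here */

	mmap_read_unlock(mm);
	mmput(mm);
	return 0;
}
```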
```diff
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 11f5ec0b8016..d0ed7d6f5eec 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -488,6 +488,10 @@ again:
 		/* reset fops->func and fops->trampoline for re-register */
 		tr->fops->func = NULL;
 		tr->fops->trampoline = 0;
+
+		/* reset im->image memory attr for arch_prepare_bpf_trampoline */
+		set_memory_nx((long)im->image, 1);
+		set_memory_rw((long)im->image, 1);
 		goto again;
 	}
 #endif
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4a25375ebb0d..fa4c911603e9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1054,6 +1054,8 @@ static void print_insn_state(struct bpf_verifier_env *env,
  */
 static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t flags)
 {
+	size_t alloc_bytes;
+	void *orig = dst;
 	size_t bytes;
 
 	if (ZERO_OR_NULL_PTR(src))
@@ -1062,11 +1064,11 @@ static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t
 	if (unlikely(check_mul_overflow(n, size, &bytes)))
 		return NULL;
 
-	if (ksize(dst) < ksize(src)) {
-		kfree(dst);
-		dst = kmalloc_track_caller(kmalloc_size_roundup(bytes), flags);
-		if (!dst)
-			return NULL;
+	alloc_bytes = max(ksize(orig), kmalloc_size_roundup(bytes));
+	dst = krealloc(orig, alloc_bytes, flags);
+	if (!dst) {
+		kfree(orig);
+		return NULL;
 	}
 
 	memcpy(dst, src, bytes);
@@ -11828,10 +11830,17 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	 *   register B - not null
 	 * for JNE A, B, ... - A is not null in the false branch;
 	 * for JEQ A, B, ... - A is not null in the true branch.
+	 *
+	 * Since PTR_TO_BTF_ID points to a kernel struct that does
+	 * not need to be null checked by the BPF program, i.e.,
+	 * could be null even without PTR_MAYBE_NULL marking, so
+	 * only propagate nullness when neither reg is that type.
 	 */
 	if (!is_jmp32 && BPF_SRC(insn->code) == BPF_X &&
 	    __is_pointer_value(false, src_reg) && __is_pointer_value(false, dst_reg) &&
-	    type_may_be_null(src_reg->type) != type_may_be_null(dst_reg->type)) {
+	    type_may_be_null(src_reg->type) != type_may_be_null(dst_reg->type) &&
+	    base_type(src_reg->type) != PTR_TO_BTF_ID &&
+	    base_type(dst_reg->type) != PTR_TO_BTF_ID) {
 		eq_branch_regs = NULL;
 		switch (opcode) {
 		case BPF_JEQ:
```
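The copy_array() fix above replaces a `ksize()`-based reuse check with `krealloc()` sized from the actual requested byte count. A minimal sketch of the idiom, with a hypothetical helper name; note that `krealloc()` does not free the old buffer on failure, so the caller must:

```c
#include <linux/slab.h>
#include <linux/string.h>

/* Grow dst to hold at least bytes, reusing the old allocation when possible. */
static void *grow_and_copy(void *dst, const void *src, size_t bytes, gfp_t flags)
{
	void *p = krealloc(dst, kmalloc_size_roundup(bytes), flags);

	if (!p) {
		kfree(dst);	/* krealloc() leaves dst allocated on failure */
		return NULL;
	}

	memcpy(p, src, bytes);
	return p;
}
```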
```diff
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index c026a5a5e046..68106e3791f6 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -560,6 +560,8 @@ static struct page *__dma_alloc_pages(struct device *dev, size_t size,
 		return NULL;
 	if (WARN_ON_ONCE(gfp & (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM)))
 		return NULL;
+	if (WARN_ON_ONCE(gfp & __GFP_COMP))
+		return NULL;
 
 	size = PAGE_ALIGN(size);
 	if (dma_alloc_direct(dev, ops))
@@ -645,6 +647,8 @@ struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
 
 	if (WARN_ON_ONCE(attrs & ~DMA_ATTR_ALLOC_SINGLE_PAGES))
 		return NULL;
+	if (WARN_ON_ONCE(gfp & __GFP_COMP))
+		return NULL;
 
 	if (ops && ops->alloc_noncontiguous)
 		sgt = ops->alloc_noncontiguous(dev, size, dir, gfp, attrs);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index eacc3702654d..d56328e5080e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -380,7 +380,6 @@ enum event_type_t {
 
 /*
  * perf_sched_events : >0 events exist
- * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
  */
 
 static void perf_sched_delayed(struct work_struct *work);
@@ -389,7 +388,6 @@ static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
 static DEFINE_MUTEX(perf_sched_mutex);
 static atomic_t perf_sched_count;
 
-static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 
 static atomic_t nr_mmap_events __read_mostly;
@@ -844,9 +842,16 @@ static void perf_cgroup_switch(struct task_struct *task)
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
 	struct perf_cgroup *cgrp;
 
-	cgrp = perf_cgroup_from_task(task, NULL);
+	/*
+	 * cpuctx->cgrp is set when the first cgroup event enabled,
+	 * and is cleared when the last cgroup event disabled.
+	 */
+	if (READ_ONCE(cpuctx->cgrp) == NULL)
+		return;
 
 	WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
+
+	cgrp = perf_cgroup_from_task(task, NULL);
 	if (READ_ONCE(cpuctx->cgrp) == cgrp)
 		return;
 
@@ -3631,8 +3636,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
 	 * to check if we have to switch out PMU state.
 	 * cgroup event are system-wide mode only
 	 */
-	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
-		perf_cgroup_switch(next);
+	perf_cgroup_switch(next);
 }
 
 static bool perf_less_group_idx(const void *l, const void *r)
@@ -4974,15 +4978,6 @@ static void unaccount_pmu_sb_event(struct perf_event *event)
 	detach_sb_event(event);
 }
 
-static void unaccount_event_cpu(struct perf_event *event, int cpu)
-{
-	if (event->parent)
-		return;
-
-	if (is_cgroup_event(event))
-		atomic_dec(&per_cpu(perf_cgroup_events, cpu));
-}
-
 #ifdef CONFIG_NO_HZ_FULL
 static DEFINE_SPINLOCK(nr_freq_lock);
 #endif
@@ -5048,8 +5043,6 @@ static void unaccount_event(struct perf_event *event)
 			schedule_delayed_work(&perf_sched_work, HZ);
 	}
 
-	unaccount_event_cpu(event, event->cpu);
-
 	unaccount_pmu_sb_event(event);
 }
 
@@ -11679,15 +11672,6 @@ static void account_pmu_sb_event(struct perf_event *event)
 	attach_sb_event(event);
 }
 
-static void account_event_cpu(struct perf_event *event, int cpu)
-{
-	if (event->parent)
-		return;
-
-	if (is_cgroup_event(event))
-		atomic_inc(&per_cpu(perf_cgroup_events, cpu));
-}
-
 /* Freq events need the tick to stay alive (see perf_event_task_tick). */
 static void account_freq_event_nohz(void)
 {
@@ -11775,8 +11759,6 @@ static void account_event(struct perf_event *event)
 	}
 enabled:
 
-	account_event_cpu(event, event->cpu);
-
 	account_pmu_sb_event(event);
 }
 
@@ -12339,12 +12321,12 @@ SYSCALL_DEFINE5(perf_event_open,
 	if (flags & ~PERF_FLAG_ALL)
 		return -EINVAL;
 
-	/* Do we allow access to perf_event_open(2) ? */
-	err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
+	err = perf_copy_attr(attr_uptr, &attr);
 	if (err)
 		return err;
 
-	err = perf_copy_attr(attr_uptr, &attr);
+	/* Do we allow access to perf_event_open(2) ? */
+	err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
 	if (err)
 		return err;
 
@@ -12689,7 +12671,8 @@ SYSCALL_DEFINE5(perf_event_open,
 	return event_fd;
 
 err_context:
-	/* event->pmu_ctx freed by free_event() */
+	put_pmu_ctx(event->pmu_ctx);
+	event->pmu_ctx = NULL; /* _free_event() */
 err_locked:
 	mutex_unlock(&ctx->mutex);
 	perf_unpin_context(ctx);
@@ -12802,6 +12785,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 
 err_pmu_ctx:
 	put_pmu_ctx(pmu_ctx);
+	event->pmu_ctx = NULL; /* _free_event() */
 err_unlock:
 	mutex_unlock(&ctx->mutex);
 	perf_unpin_context(ctx);
@@ -12822,13 +12806,11 @@ static void __perf_pmu_remove(struct perf_event_context *ctx,
 
 	perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) {
 		perf_remove_from_context(event, 0);
-		unaccount_event_cpu(event, cpu);
 		put_pmu_ctx(event->pmu_ctx);
 		list_add(&event->migrate_entry, events);
 
 		for_each_sibling_event(sibling, event) {
 			perf_remove_from_context(sibling, 0);
-			unaccount_event_cpu(sibling, cpu);
 			put_pmu_ctx(sibling->pmu_ctx);
 			list_add(&sibling->migrate_entry, events);
 		}
@@ -12847,7 +12829,6 @@ static void __perf_pmu_install_event(struct pmu *pmu,
 	if (event->state >= PERF_EVENT_STATE_OFF)
 		event->state = PERF_EVENT_STATE_INACTIVE;
 
-	account_event_cpu(event, cpu);
 	perf_install_in_context(ctx, event, cpu);
 }
 
@@ -13231,7 +13212,7 @@ inherit_event(struct perf_event *parent_event,
 	pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
 	if (IS_ERR(pmu_ctx)) {
 		free_event(child_event);
-		return NULL;
+		return ERR_CAST(pmu_ctx);
 	}
 	child_event->pmu_ctx = pmu_ctx;
 
@@ -13742,8 +13723,7 @@ static int __perf_cgroup_move(void *info)
 	struct task_struct *task = info;
 
 	preempt_disable();
-	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
-		perf_cgroup_switch(task);
+	perf_cgroup_switch(task);
 	preempt_enable();
 
 	return 0;
```
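Several of the perf fixes above are error-path hygiene; in particular, inherit_event() now forwards the ERR_PTR() it got from find_get_pmu_context() instead of collapsing it to NULL and losing the errno. A generic sketch of that idiom, with hypothetical types and helpers (only `IS_ERR()`/`ERR_CAST()` are the real API):

```c
#include <linux/err.h>

struct foo { int refcount; };
struct bar { int refcount; };

struct bar *get_bar(void);		/* may return ERR_PTR(-ENOMEM) */
struct foo *foo_from_bar(struct bar *b);

static struct foo *make_foo(void)
{
	struct bar *b = get_bar();

	if (IS_ERR(b))
		return ERR_CAST(b);	/* forward the errno, don't return NULL */

	return foo_from_bar(b);
}
```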
```diff
diff --git a/kernel/exit.c b/kernel/exit.c
index deffb8e4b1b2..15dc2ec80c46 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -931,6 +931,7 @@ void __noreturn make_task_dead(int signr)
 	 * Then do everything else.
 	 */
 	struct task_struct *tsk = current;
+	unsigned int limit;
 
 	if (unlikely(in_interrupt()))
 		panic("Aiee, killing interrupt handler!");
@@ -954,8 +955,9 @@ void __noreturn make_task_dead(int signr)
 	 * To make sure this can't happen, place an upper bound on how often the
 	 * kernel may oops without panic().
 	 */
-	if (atomic_inc_return(&oops_count) >= READ_ONCE(oops_limit) && oops_limit)
-		panic("Oopsed too often (kernel.oops_limit is %d)", oops_limit);
+	limit = READ_ONCE(oops_limit);
+	if (atomic_inc_return(&oops_count) >= limit && limit)
+		panic("Oopsed too often (kernel.oops_limit is %d)", limit);
 
 	/*
 	 * We're taking recursive faults here in make_task_dead. Safest is to just
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 086a22d1adb7..a8074079b09e 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -286,19 +286,22 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
 	}
 
 	futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL);
-	if (!futexv)
-		return -ENOMEM;
+	if (!futexv) {
+		ret = -ENOMEM;
+		goto destroy_timer;
+	}
 
 	ret = futex_parse_waitv(futexv, waiters, nr_futexes);
 	if (!ret)
 		ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL);
 
+	kfree(futexv);
+
+destroy_timer:
 	if (timeout) {
 		hrtimer_cancel(&to.timer);
 		destroy_hrtimer_on_stack(&to.timer);
 	}
-
-	kfree(futexv);
 	return ret;
 }
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 7971e989e425..74a4ef1da9ad 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -82,6 +82,7 @@ struct gcov_fn_info {
  * @version: gcov version magic indicating the gcc version used for compilation
  * @next: list head for a singly-linked list
  * @stamp: uniquifying time stamp
+ * @checksum: unique object checksum
  * @filename: name of the associated gcov data file
  * @merge: merge functions (null for unused counter type)
  * @n_functions: number of instrumented functions
@@ -94,6 +95,10 @@ struct gcov_info {
 	unsigned int version;
 	struct gcov_info *next;
 	unsigned int stamp;
+	/* Since GCC 12.1 a checksum field is added. */
+#if (__GNUC__ >= 12)
+	unsigned int checksum;
+#endif
 	const char *filename;
 	void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int);
 	unsigned int n_functions;
```
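The exit.c hunk (and the matching panic.c hunk further down) fix the same race: the sysctl limit was read twice, once for the comparison and once for the message, so a concurrent write could make the two disagree or bypass the zero-means-disabled check. The snapshot idiom, reduced to a hypothetical helper:

```c
#include <linux/atomic.h>
#include <linux/panic.h>

/* Read the tunable once so the test and the report see the same value. */
static void count_and_check(atomic_t *count, const int *tunable)
{
	unsigned int limit = READ_ONCE(*tunable);	/* single load */

	if (atomic_inc_return(count) >= limit && limit)
		panic("limit of %d exceeded", limit);
}
```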
```diff
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3050631e528d..1c18ecf9f98b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2213,13 +2213,9 @@ int register_kretprobe(struct kretprobe *rp)
 		rp->kp.post_handler = NULL;
 
 	/* Pre-allocate memory for max kretprobe instances */
-	if (rp->maxactive <= 0) {
-#ifdef CONFIG_PREEMPTION
+	if (rp->maxactive <= 0)
 		rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
-#else
-		rp->maxactive = num_possible_cpus();
-#endif
-	}
+
 #ifdef CONFIG_KRETPROBE_ON_RETHOOK
 	rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler);
 	if (!rp->rh)
@@ -2364,6 +2360,14 @@ static void kill_kprobe(struct kprobe *p)
 
 	lockdep_assert_held(&kprobe_mutex);
 
+	/*
+	 * The module is going away. We should disarm the kprobe which
+	 * is using ftrace, because ftrace framework is still available at
+	 * 'MODULE_STATE_GOING' notification.
+	 */
+	if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed)
+		disarm_kprobe_ftrace(p);
+
 	p->flags |= KPROBE_FLAG_GONE;
 	if (kprobe_aggrprobe(p)) {
 		/*
@@ -2380,14 +2384,6 @@ static void kill_kprobe(struct kprobe *p)
 	 * the original probed function (which will be freed soon) any more.
 	 */
 	arch_remove_kprobe(p);
-
-	/*
-	 * The module is going away. We should disarm the kprobe which
-	 * is using ftrace, because ftrace framework is still available at
-	 * 'MODULE_STATE_GOING' notification.
-	 */
-	if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed)
-		disarm_kprobe_ftrace(p);
 }
 
 /* Disable one kprobe */
```
```diff
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 7779ee8abc2a..010cf4e6d0b8 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -89,15 +89,31 @@ static inline int __ww_mutex_check_kill(struct rt_mutex *lock,
  * set this bit before looking at the lock.
  */
 
-static __always_inline void
-rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner)
+static __always_inline struct task_struct *
+rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner)
 {
 	unsigned long val = (unsigned long)owner;
 
 	if (rt_mutex_has_waiters(lock))
 		val |= RT_MUTEX_HAS_WAITERS;
 
-	WRITE_ONCE(lock->owner, (struct task_struct *)val);
+	return (struct task_struct *)val;
+}
+
+static __always_inline void
+rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner)
+{
+	/*
+	 * lock->wait_lock is held but explicit acquire semantics are needed
+	 * for a new lock owner so WRITE_ONCE is insufficient.
+	 */
+	xchg_acquire(&lock->owner, rt_mutex_owner_encode(lock, owner));
+}
+
+static __always_inline void rt_mutex_clear_owner(struct rt_mutex_base *lock)
+{
+	/* lock->wait_lock is held so the unlock provides release semantics. */
+	WRITE_ONCE(lock->owner, rt_mutex_owner_encode(lock, NULL));
 }
 
 static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock)
@@ -106,7 +122,8 @@ static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock)
 			((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
 }
 
-static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex_base *lock)
+static __always_inline void
+fixup_rt_mutex_waiters(struct rt_mutex_base *lock, bool acquire_lock)
 {
 	unsigned long owner, *p = (unsigned long *) &lock->owner;
 
@@ -172,8 +189,21 @@ static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex_base *lock)
 	 * still set.
 	 */
 	owner = READ_ONCE(*p);
-	if (owner & RT_MUTEX_HAS_WAITERS)
-		WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
+	if (owner & RT_MUTEX_HAS_WAITERS) {
+		/*
+		 * See rt_mutex_set_owner() and rt_mutex_clear_owner() on
+		 * why xchg_acquire() is used for updating owner for
+		 * locking and WRITE_ONCE() for unlocking.
+		 *
+		 * WRITE_ONCE() would work for the acquire case too, but
+		 * in case that the lock acquisition failed it might
+		 * force other lockers into the slow path unnecessarily.
+		 */
+		if (acquire_lock)
+			xchg_acquire(p, owner & ~RT_MUTEX_HAS_WAITERS);
+		else
+			WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
+	}
 }
 
 /*
@@ -208,6 +238,13 @@ static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock)
 		owner = *p;
 	} while (cmpxchg_relaxed(p, owner,
 				 owner | RT_MUTEX_HAS_WAITERS) != owner);
+
+	/*
+	 * The cmpxchg loop above is relaxed to avoid back-to-back ACQUIRE
+	 * operations in the event of contention. Ensure the successful
+	 * cmpxchg is visible.
+	 */
+	smp_mb__after_atomic();
 }
 
 /*
@@ -1243,7 +1280,7 @@ static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock)
 	 * try_to_take_rt_mutex() sets the lock waiters bit
 	 * unconditionally. Clean this up.
 	 */
-	fixup_rt_mutex_waiters(lock);
+	fixup_rt_mutex_waiters(lock, true);
 
 	return ret;
 }
@@ -1604,7 +1641,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
 	 * try_to_take_rt_mutex() sets the waiter bit
 	 * unconditionally. We might have to fix that up.
 	 */
-	fixup_rt_mutex_waiters(lock);
+	fixup_rt_mutex_waiters(lock, true);
 
 	trace_contention_end(lock, ret);
 
@@ -1719,7 +1756,7 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock)
 	 * try_to_take_rt_mutex() sets the waiter bit unconditionally.
 	 * We might have to fix that up:
 	 */
-	fixup_rt_mutex_waiters(lock);
+	fixup_rt_mutex_waiters(lock, true);
 	debug_rt_mutex_free_waiter(&waiter);
 
 	trace_contention_end(lock, 0);
```
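The mark_rt_mutex_waiters() hunk above pairs a relaxed cmpxchg() loop with a single full barrier after success, instead of paying for an ordered cmpxchg() on every retry under contention. Reduced to a sketch (hypothetical helper, mirroring the diff):

```c
#include <linux/atomic.h>

/* Set a flag bit with relaxed retries, then order the result once. */
static void set_bit_relaxed_then_fence(unsigned long *p, unsigned long bit)
{
	unsigned long old;

	do {
		old = *p;
	} while (cmpxchg_relaxed(p, old, old | bit) != old);

	/* one full barrier after the successful update, not per retry */
	smp_mb__after_atomic();
}
```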
```diff
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
index 900220941caa..cb9fdff76a8a 100644
--- a/kernel/locking/rtmutex_api.c
+++ b/kernel/locking/rtmutex_api.c
@@ -267,7 +267,7 @@ void __sched rt_mutex_init_proxy_locked(struct rt_mutex_base *lock,
 void __sched rt_mutex_proxy_unlock(struct rt_mutex_base *lock)
 {
 	debug_rt_mutex_proxy_unlock(lock);
-	rt_mutex_set_owner(lock, NULL);
+	rt_mutex_clear_owner(lock);
 }
 
 /**
@@ -382,7 +382,7 @@ int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock,
 	 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
 	 * have to fix that up.
 	 */
-	fixup_rt_mutex_waiters(lock);
+	fixup_rt_mutex_waiters(lock, true);
 
 	raw_spin_unlock_irq(&lock->wait_lock);
 
 	return ret;
@@ -438,7 +438,7 @@ bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock,
 	 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
 	 * have to fix that up.
 	 */
-	fixup_rt_mutex_waiters(lock);
+	fixup_rt_mutex_waiters(lock, false);
 
 	raw_spin_unlock_irq(&lock->wait_lock);
diff --git a/kernel/panic.c b/kernel/panic.c
index 326d91505f04..463c9295bc28 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -232,12 +232,15 @@ static void panic_print_sys_info(bool console_flush)
 
 void check_panic_on_warn(const char *origin)
 {
+	unsigned int limit;
+
 	if (panic_on_warn)
 		panic("%s: panic_on_warn set ...\n", origin);
 
-	if (atomic_inc_return(&warn_count) >= READ_ONCE(warn_limit) && warn_limit)
+	limit = READ_ONCE(warn_limit);
+	if (atomic_inc_return(&warn_count) >= limit && limit)
 		panic("%s: system warned too often (kernel.warn_limit is %d)",
-		      origin, warn_limit);
+		      origin, limit);
 }
 
 /**
diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h
index dacc37b62a2c..2e373f2c65ed 100644
--- a/kernel/trace/rv/monitors/wip/wip.h
+++ b/kernel/trace/rv/monitors/wip/wip.h
@@ -27,7 +27,7 @@ struct automaton_wip {
 	bool final_states[state_max_wip];
 };
 
-static struct automaton_wip automaton_wip = {
+static const struct automaton_wip automaton_wip = {
 	.state_names = {
 		"preemptive",
 		"non_preemptive"
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h
index 118e576b91b4..d0d9c4b8121b 100644
--- a/kernel/trace/rv/monitors/wwnr/wwnr.h
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.h
@@ -27,7 +27,7 @@ struct automaton_wwnr {
 	bool final_states[state_max_wwnr];
 };
 
-static struct automaton_wwnr automaton_wwnr = {
+static const struct automaton_wwnr automaton_wwnr = {
 	.state_names = {
 		"not_running",
 		"running"
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6d7ef130f57e..a555a861b978 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5617,7 +5617,7 @@ static const char readme_msg[] =
 	"\t           +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n"
 	"\t           type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n"
 	"\t           b<bit-width>@<bit-offset>/<container-size>, ustring,\n"
-	"\t           <type>\\[<array-size>\\]\n"
+	"\t           symstr, <type>\\[<array-size>\\]\n"
 #ifdef CONFIG_HIST_TRIGGERS
 	"\t    field: <stype> <name>;\n"
 	"\t    stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n"
```
```diff
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index bb2f95d7175c..01ebabbbe8c9 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -76,9 +76,11 @@ const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
 /* Fetch type information table */
 static const struct fetch_type probe_fetch_types[] = {
 	/* Special types */
-	__ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1,
+	__ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1, 1,
 			    "__data_loc char[]"),
-	__ASSIGN_FETCH_TYPE("ustring", string, string, sizeof(u32), 1,
+	__ASSIGN_FETCH_TYPE("ustring", string, string, sizeof(u32), 1, 1,
+			    "__data_loc char[]"),
+	__ASSIGN_FETCH_TYPE("symstr", string, string, sizeof(u32), 1, 1,
 			    "__data_loc char[]"),
 	/* Basic types */
 	ASSIGN_FETCH_TYPE(u8, u8, 0),
@@ -98,10 +100,15 @@ static const struct fetch_type probe_fetch_types[] = {
 	ASSIGN_FETCH_TYPE_END
 };
 
-static const struct fetch_type *find_fetch_type(const char *type)
+static const struct fetch_type *find_fetch_type(const char *type, unsigned long flags)
 {
 	int i;
 
+	/* Reject the symbol/symstr for uprobes */
+	if (type && (flags & TPARG_FL_USER) &&
+	    (!strcmp(type, "symbol") || !strcmp(type, "symstr")))
+		return NULL;
+
 	if (!type)
 		type = DEFAULT_FETCH_TYPE_STR;
 
@@ -119,13 +126,13 @@ static const struct fetch_type *find_fetch_type(const char *type)
 
 			switch (bs) {
 			case 8:
-				return find_fetch_type("u8");
+				return find_fetch_type("u8", flags);
 			case 16:
-				return find_fetch_type("u16");
+				return find_fetch_type("u16", flags);
 			case 32:
-				return find_fetch_type("u32");
+				return find_fetch_type("u32", flags);
 			case 64:
-				return find_fetch_type("u64");
+				return find_fetch_type("u64", flags);
 			default:
 				goto fail;
 			}
@@ -478,7 +485,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
 				  DEREF_OPEN_BRACE);
 			return -EINVAL;
 		} else {
-			const struct fetch_type *t2 = find_fetch_type(NULL);
+			const struct fetch_type *t2 = find_fetch_type(NULL, flags);
 
 			*tmp = '\0';
 			ret = parse_probe_arg(arg, t2, &code, end, flags, offs);
@@ -630,9 +637,9 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
 		/* The type of $comm must be "string", and not an array. */
 		if (parg->count || (t && strcmp(t, "string")))
 			goto out;
-		parg->type = find_fetch_type("string");
+		parg->type = find_fetch_type("string", flags);
 	} else
-		parg->type = find_fetch_type(t);
+		parg->type = find_fetch_type(t, flags);
 	if (!parg->type) {
 		trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE);
 		goto out;
@@ -662,16 +669,26 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
 
 	ret = -EINVAL;
 	/* Store operation */
-	if (!strcmp(parg->type->name, "string") ||
-	    !strcmp(parg->type->name, "ustring")) {
-		if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF &&
-		    code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM &&
-		    code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) {
-			trace_probe_log_err(offset + (t ? (t - arg) : 0),
-					    BAD_STRING);
-			goto fail;
+	if (parg->type->is_string) {
+		if (!strcmp(parg->type->name, "symstr")) {
+			if (code->op != FETCH_OP_REG && code->op != FETCH_OP_STACK &&
+			    code->op != FETCH_OP_RETVAL && code->op != FETCH_OP_ARG &&
+			    code->op != FETCH_OP_DEREF && code->op != FETCH_OP_TP_ARG) {
+				trace_probe_log_err(offset + (t ? (t - arg) : 0),
+						    BAD_SYMSTRING);
+				goto fail;
+			}
+		} else {
+			if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF &&
+			    code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM &&
+			    code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) {
+				trace_probe_log_err(offset + (t ? (t - arg) : 0),
+						    BAD_STRING);
+				goto fail;
+			}
 		}
-		if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM ||
+		if (!strcmp(parg->type->name, "symstr") ||
+		    (code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM ||
 		     code->op == FETCH_OP_DATA) || code->op == FETCH_OP_TP_ARG ||
 		    parg->count) {
 			/*
@@ -679,6 +696,8 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
 			 * must be kept, and if parg->count != 0, this is an
 			 * array of string pointers instead of string address
 			 * itself.
+			 * For the symstr, it doesn't need to dereference, thus
+			 * it just get the value.
 			 */
 			code++;
 			if (code->op != FETCH_OP_NOP) {
@@ -690,6 +709,8 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
 		if (!strcmp(parg->type->name, "ustring") ||
 		    code->op == FETCH_OP_UDEREF)
 			code->op = FETCH_OP_ST_USTRING;
+		else if (!strcmp(parg->type->name, "symstr"))
+			code->op = FETCH_OP_ST_SYMSTR;
 		else
 			code->op = FETCH_OP_ST_STRING;
 		code->size = parg->type->size;
@@ -919,8 +940,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
 	for (i = 0; i < tp->nr_args; i++) {
 		parg = tp->args + i;
 		if (parg->count) {
-			if ((strcmp(parg->type->name, "string") == 0) ||
-			    (strcmp(parg->type->name, "ustring") == 0))
+			if (parg->type->is_string)
 				fmt = ", __get_str(%s[%d])";
 			else
 				fmt = ", REC->%s[%d]";
@@ -928,8 +948,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
 				pos += snprintf(buf + pos, LEN_OR_ZERO,
 						fmt, parg->name, j);
 		} else {
-			if ((strcmp(parg->type->name, "string") == 0) ||
-			    (strcmp(parg->type->name, "ustring") == 0))
+			if (parg->type->is_string)
 				fmt = ", __get_str(%s)";
 			else
 				fmt = ", REC->%s";
```
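The trace_probe.c changes above add the `symstr` fetch type, which stores the symbol-resolved rendering of a fetched address rather than dereferencing it, for example in a probe argument like `$retval:symstr` (a hypothetical probe definition; as the parser change shows, symstr is rejected for uprobes). What ends up in the event buffer is what sprint_symbol() produces, roughly:

```c
#include <linux/kallsyms.h>
#include <linux/printk.h>
#include <linux/sched.h>

/* Illustration only (not from this commit): how a symstr value is formatted. */
static void show_symstr_rendering(void)
{
	char buf[KSYM_SYMBOL_LEN];

	sprint_symbol(buf, (unsigned long)schedule);
	pr_info("symstr records: %s\n", buf);	/* e.g. "schedule+0x0/0x..." */
}
```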
argument"), \ + C(BAD_SYMSTRING, "Symbol String doesn't accept data/userdata"), \ C(BAD_BITFIELD, "Invalid bitfield"), \ C(ARG_NAME_TOO_LONG, "Argument name is too long"), \ C(NO_ARG_NAME, "Argument name is not specified"), \ diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index b3bdb8ddb862..5cea672243f6 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -67,6 +67,37 @@ probe_mem_read(void *dest, void *src, size_t size); static nokprobe_inline int probe_mem_read_user(void *dest, void *src, size_t size); +static nokprobe_inline int +fetch_store_symstrlen(unsigned long addr) +{ + char namebuf[KSYM_SYMBOL_LEN]; + int ret; + + ret = sprint_symbol(namebuf, addr); + if (ret < 0) + return 0; + + return ret + 1; +} + +/* + * Fetch a null-terminated symbol string + offset. Caller MUST set *(u32 *)buf + * with max length and relative data location. + */ +static nokprobe_inline int +fetch_store_symstring(unsigned long addr, void *dest, void *base) +{ + int maxlen = get_loc_len(*(u32 *)dest); + void *__dest; + + if (unlikely(!maxlen)) + return -ENOMEM; + + __dest = get_loc_data(dest, base); + + return sprint_symbol(__dest, addr); +} + /* From the 2nd stage, routine is same */ static nokprobe_inline int process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val, @@ -99,16 +130,22 @@ stage2: stage3: /* 3rd stage: store value to buffer */ if (unlikely(!dest)) { - if (code->op == FETCH_OP_ST_STRING) { + switch (code->op) { + case FETCH_OP_ST_STRING: ret = fetch_store_strlen(val + code->offset); code++; goto array; - } else if (code->op == FETCH_OP_ST_USTRING) { + case FETCH_OP_ST_USTRING: ret += fetch_store_strlen_user(val + code->offset); code++; goto array; - } else + case FETCH_OP_ST_SYMSTR: + ret += fetch_store_symstrlen(val + code->offset); + code++; + goto array; + default: return -EILSEQ; + } } switch (code->op) { @@ -129,6 +166,10 @@ stage3: loc = *(u32 *)dest; ret = fetch_store_string_user(val + code->offset, dest, base); break; + case FETCH_OP_ST_SYMSTR: + loc = *(u32 *)dest; + ret = fetch_store_symstring(val + code->offset, dest, base); + break; default: return -EILSEQ; } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index fb58e86dd117..8d64b6553aed 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -691,7 +691,8 @@ static int __trace_uprobe_create(int argc, const char **argv) for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { trace_probe_log_set_index(i + 2); ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], - is_return ? TPARG_FL_RETURN : 0); + (is_return ? TPARG_FL_RETURN : 0) | + TPARG_FL_USER); if (ret) goto error; } |