diff options
Diffstat (limited to 'kernel')
59 files changed, 2944 insertions, 1726 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 7a63d567fdb5..7343b3a9bff0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -117,6 +117,10 @@ obj-$(CONFIG_HAS_IOMEM) += iomem.o obj-$(CONFIG_ZONE_DEVICE) += memremap.o obj-$(CONFIG_RSEQ) += rseq.o +obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o +KASAN_SANITIZE_stackleak.o := n +KCOV_INSTRUMENT_stackleak.o := n + $(obj)/configs.o: $(obj)/config_data.h targets += config_data.gz diff --git a/kernel/bounds.c b/kernel/bounds.c index c373e887c066..9795d75b09b2 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -13,7 +13,7 @@ #include <linux/log2.h> #include <linux/spinlock_types.h> -void foo(void) +int main(void) { /* The enum constants to put into include/generated/bounds.h */ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); @@ -23,4 +23,6 @@ void foo(void) #endif DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); /* End of constants */ + + return 0; } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 378cef70341c..ee4c82667d65 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2067,56 +2067,47 @@ static int btf_check_sec_info(struct btf_verifier_env *env, return 0; } -static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data, - u32 btf_data_size) +static int btf_parse_hdr(struct btf_verifier_env *env) { + u32 hdr_len, hdr_copy, btf_data_size; const struct btf_header *hdr; - u32 hdr_len, hdr_copy; - /* - * Minimal part of the "struct btf_header" that - * contains the hdr_len. - */ - struct btf_min_header { - u16 magic; - u8 version; - u8 flags; - u32 hdr_len; - } __user *min_hdr; struct btf *btf; int err; btf = env->btf; - min_hdr = btf_data; + btf_data_size = btf->data_size; - if (btf_data_size < sizeof(*min_hdr)) { + if (btf_data_size < + offsetof(struct btf_header, hdr_len) + sizeof(hdr->hdr_len)) { btf_verifier_log(env, "hdr_len not found"); return -EINVAL; } - if (get_user(hdr_len, &min_hdr->hdr_len)) - return -EFAULT; - + hdr = btf->data; + hdr_len = hdr->hdr_len; if (btf_data_size < hdr_len) { btf_verifier_log(env, "btf_header not found"); return -EINVAL; } - err = bpf_check_uarg_tail_zero(btf_data, sizeof(btf->hdr), hdr_len); - if (err) { - if (err == -E2BIG) - btf_verifier_log(env, "Unsupported btf_header"); - return err; + /* Ensure the unsupported header fields are zero */ + if (hdr_len > sizeof(btf->hdr)) { + u8 *expected_zero = btf->data + sizeof(btf->hdr); + u8 *end = btf->data + hdr_len; + + for (; expected_zero < end; expected_zero++) { + if (*expected_zero) { + btf_verifier_log(env, "Unsupported btf_header"); + return -E2BIG; + } + } } hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr)); - if (copy_from_user(&btf->hdr, btf_data, hdr_copy)) - return -EFAULT; + memcpy(&btf->hdr, btf->data, hdr_copy); hdr = &btf->hdr; - if (hdr->hdr_len != hdr_len) - return -EINVAL; - btf_verifier_log_hdr(env, btf_data_size); if (hdr->magic != BTF_MAGIC) { @@ -2186,10 +2177,6 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, } env->btf = btf; - err = btf_parse_hdr(env, btf_data, btf_data_size); - if (err) - goto errout; - data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN); if (!data) { err = -ENOMEM; @@ -2198,13 +2185,18 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, btf->data = data; btf->data_size = btf_data_size; - btf->nohdr_data = btf->data + btf->hdr.hdr_len; if (copy_from_user(data, btf_data, btf_data_size)) { err = -EFAULT; goto errout; } + err = btf_parse_hdr(env); + if (err) + goto errout; + + btf->nohdr_data = btf->data + btf->hdr.hdr_len; + err = btf_parse_str_sec(env); if (err) goto errout; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7c7eeea8cffc..6377225b2082 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -365,10 +365,13 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) } #ifdef CONFIG_BPF_JIT +# define BPF_JIT_LIMIT_DEFAULT (PAGE_SIZE * 40000) + /* All BPF JIT sysctl knobs here. */ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); int bpf_jit_harden __read_mostly; int bpf_jit_kallsyms __read_mostly; +int bpf_jit_limit __read_mostly = BPF_JIT_LIMIT_DEFAULT; static __always_inline void bpf_get_prog_addr_region(const struct bpf_prog *prog, @@ -577,27 +580,64 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, return ret; } +static atomic_long_t bpf_jit_current; + +#if defined(MODULES_VADDR) +static int __init bpf_jit_charge_init(void) +{ + /* Only used as heuristic here to derive limit. */ + bpf_jit_limit = min_t(u64, round_up((MODULES_END - MODULES_VADDR) >> 2, + PAGE_SIZE), INT_MAX); + return 0; +} +pure_initcall(bpf_jit_charge_init); +#endif + +static int bpf_jit_charge_modmem(u32 pages) +{ + if (atomic_long_add_return(pages, &bpf_jit_current) > + (bpf_jit_limit >> PAGE_SHIFT)) { + if (!capable(CAP_SYS_ADMIN)) { + atomic_long_sub(pages, &bpf_jit_current); + return -EPERM; + } + } + + return 0; +} + +static void bpf_jit_uncharge_modmem(u32 pages) +{ + atomic_long_sub(pages, &bpf_jit_current); +} + struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_binary_header *hdr; - unsigned int size, hole, start; + u32 size, hole, start, pages; /* Most of BPF filters are really small, but if some of them * fill a page, allow at least 128 extra bytes to insert a * random section of illegal instructions. */ size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); + pages = size / PAGE_SIZE; + + if (bpf_jit_charge_modmem(pages)) + return NULL; hdr = module_alloc(size); - if (hdr == NULL) + if (!hdr) { + bpf_jit_uncharge_modmem(pages); return NULL; + } /* Fill space with illegal/arch-dep instructions. */ bpf_fill_ill_insns(hdr, size); - hdr->pages = size / PAGE_SIZE; + hdr->pages = pages; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = (get_random_int() % hole) & ~(alignment - 1); @@ -610,7 +650,10 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, void bpf_jit_binary_free(struct bpf_binary_header *hdr) { + u32 pages = hdr->pages; + module_memfree(hdr); + bpf_jit_uncharge_modmem(pages); } /* This symbol is only overridden by archs that have different diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 141710b82a6c..191b79948424 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -512,8 +512,7 @@ static int dev_map_notification(struct notifier_block *notifier, struct bpf_dtab_netdev *dev, *odev; dev = READ_ONCE(dtab->netdev_map[i]); - if (!dev || - dev->dev->ifindex != netdev->ifindex) + if (!dev || netdev != dev->dev) continue; odev = cmpxchg(&dtab->netdev_map[i], dev, NULL); if (dev == odev) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ab0d5e3f9892..a74972b07e74 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -99,7 +99,6 @@ BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value) const struct bpf_func_proto bpf_map_pop_elem_proto = { .func = bpf_map_pop_elem, .gpl_only = false, - .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, @@ -113,7 +112,6 @@ BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) const struct bpf_func_proto bpf_map_peek_elem_proto = { .func = bpf_map_pop_elem, .gpl_only = false, - .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 12a93fb37449..8bbd72d3a121 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -122,6 +122,7 @@ static int __queue_map_get(struct bpf_map *map, void *value, bool delete) raw_spin_lock_irqsave(&qs->lock, flags); if (queue_stack_map_is_empty(qs)) { + memset(value, 0, qs->map.value_size); err = -ENOENT; goto out; } @@ -151,6 +152,7 @@ static int __stack_map_get(struct bpf_map *map, void *value, bool delete) raw_spin_lock_irqsave(&qs->lock, flags); if (queue_stack_map_is_empty(qs)) { + memset(value, 0, qs->map.value_size); err = -ENOENT; goto out; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 98fa0be35370..1971ca325fb4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1387,21 +1387,24 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, enum bpf_access_type t) { switch (env->prog->type) { + /* Program types only with direct read access go here! */ case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_SEG6LOCAL: case BPF_PROG_TYPE_SK_REUSEPORT: - /* dst_input() and dst_output() can't write for now */ + case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_CGROUP_SKB: if (t == BPF_WRITE) return false; /* fallthrough */ + + /* Program types with direct read + write access go here! */ case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: - case BPF_PROG_TYPE_FLOW_DISSECTOR: if (meta) return meta->pkt_access; @@ -2849,10 +2852,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL || fn->ret_type == RET_PTR_TO_MAP_VALUE) { - if (fn->ret_type == RET_PTR_TO_MAP_VALUE) - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; - else - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); /* remember map_ptr, so that check_map_access() @@ -2865,7 +2864,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; - regs[BPF_REG_0].id = ++env->id_gen; + if (fn->ret_type == RET_PTR_TO_MAP_VALUE) { + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; + } else { + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; + } } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { int id = acquire_reference_state(env, insn_idx); if (id < 0) @@ -3043,7 +3047,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->umax_value = umax_ptr; dst_reg->var_off = ptr_reg->var_off; dst_reg->off = ptr_reg->off + smin_val; - dst_reg->range = ptr_reg->range; + dst_reg->raw = ptr_reg->raw; break; } /* A new variable offset is created. Note that off_reg->off @@ -3073,10 +3077,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; + dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ - dst_reg->range = 0; + dst_reg->raw = 0; } break; case BPF_SUB: @@ -3105,7 +3110,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->var_off = ptr_reg->var_off; dst_reg->id = ptr_reg->id; dst_reg->off = ptr_reg->off - smin_val; - dst_reg->range = ptr_reg->range; + dst_reg->raw = ptr_reg->raw; break; } /* A new variable offset is created. If the subtrahend is known @@ -3131,11 +3136,12 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; + dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ if (smin_val < 0) - dst_reg->range = 0; + dst_reg->raw = 0; } break; case BPF_AND: @@ -5706,7 +5712,11 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) bool is_narrower_load; u32 target_size; - if (ops->gen_prologue) { + if (ops->gen_prologue || env->seen_direct_write) { + if (!ops->gen_prologue) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, env->prog); if (cnt >= ARRAY_SIZE(insn_buf)) { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4a3dae2a8283..6aaf5dd5383b 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -55,6 +55,7 @@ #include <linux/nsproxy.h> #include <linux/file.h> #include <linux/sched/cputime.h> +#include <linux/psi.h> #include <net/sock.h> #define CREATE_TRACE_POINTS @@ -832,7 +833,7 @@ static void css_set_move_task(struct task_struct *task, */ WARN_ON_ONCE(task->flags & PF_EXITING); - rcu_assign_pointer(task->cgroups, to_cset); + cgroup_move_task(task, to_cset); list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : &to_cset->tasks); } @@ -3416,6 +3417,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v) return ret; } +#ifdef CONFIG_PSI +static int cgroup_io_pressure_show(struct seq_file *seq, void *v) +{ + return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO); +} +static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) +{ + return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM); +} +static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) +{ + return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); +} +#endif + static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; @@ -4546,6 +4562,23 @@ static struct cftype cgroup_base_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_stat_show, }, +#ifdef CONFIG_PSI + { + .name = "io.pressure", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_io_pressure_show, + }, + { + .name = "memory.pressure", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_memory_pressure_show, + }, + { + .name = "cpu.pressure", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_cpu_pressure_show, + }, +#endif { } /* terminate */ }; @@ -4606,6 +4639,7 @@ static void css_free_rwork_fn(struct work_struct *work) */ cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); + psi_cgroup_free(cgrp); if (cgroup_on_dfl(cgrp)) cgroup_rstat_exit(cgrp); kfree(cgrp); @@ -4862,10 +4896,15 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgrp->self.parent = &parent->self; cgrp->root = root; cgrp->level = level; - ret = cgroup_bpf_inherit(cgrp); + + ret = psi_cgroup_alloc(cgrp); if (ret) goto out_idr_free; + ret = cgroup_bpf_inherit(cgrp); + if (ret) + goto out_psi_free; + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; @@ -4903,6 +4942,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent) return cgrp; +out_psi_free: + psi_cgroup_free(cgrp); out_idr_free: cgroup_idr_remove(&root->cgroup_idr, cgrp->id); out_stat_exit: diff --git a/kernel/compat.c b/kernel/compat.c index 8e40efc2928a..089d00d0da9c 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -93,28 +93,28 @@ int compat_put_timex(struct compat_timex __user *utp, const struct timex *txc) return 0; } -static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) +static int __compat_get_timeval(struct timeval *tv, const struct old_timeval32 __user *ctv) { return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) || __get_user(tv->tv_sec, &ctv->tv_sec) || __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; } -static int __compat_put_timeval(const struct timeval *tv, struct compat_timeval __user *ctv) +static int __compat_put_timeval(const struct timeval *tv, struct old_timeval32 __user *ctv) { return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) || __put_user(tv->tv_sec, &ctv->tv_sec) || __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; } -static int __compat_get_timespec(struct timespec *ts, const struct compat_timespec __user *cts) +static int __compat_get_timespec(struct timespec *ts, const struct old_timespec32 __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || __get_user(ts->tv_sec, &cts->tv_sec) || __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } -static int __compat_put_timespec(const struct timespec *ts, struct compat_timespec __user *cts) +static int __compat_put_timespec(const struct timespec *ts, struct old_timespec32 __user *cts) { return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) || __put_user(ts->tv_sec, &cts->tv_sec) || diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config index 108fecc20fc1..208481d91090 100644 --- a/kernel/configs/kvm_guest.config +++ b/kernel/configs/kvm_guest.config @@ -20,6 +20,7 @@ CONFIG_PARAVIRT=y CONFIG_KVM_GUEST=y CONFIG_S390_GUEST=y CONFIG_VIRTIO=y +CONFIG_VIRTIO_MENU=y CONFIG_VIRTIO_PCI=y CONFIG_VIRTIO_BLK=y CONFIG_VIRTIO_CONSOLE=y diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 2ddfce8f1e8f..bb4fe4e1a601 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2556,16 +2556,11 @@ static int kdb_summary(int argc, const char **argv) } kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60); - /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */ - -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n", LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]), LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]), LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2])); -#undef LOAD_INT -#undef LOAD_FRAC + /* Display in kilobytes */ #define K(x) ((x) << (PAGE_SHIFT - 10)) kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n" diff --git a/kernel/delayacct.c b/kernel/delayacct.c index ca8ac2824f0b..2a12b988c717 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -135,9 +135,12 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; tmp = d->freepages_delay_total + tsk->delays->freepages_delay; d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp; + tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay; + d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count; + d->thrashing_count += tsk->delays->thrashing_count; raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; @@ -169,3 +172,15 @@ void __delayacct_freepages_end(void) ¤t->delays->freepages_count); } +void __delayacct_thrashing_start(void) +{ + current->delays->thrashing_start = ktime_get_ns(); +} + +void __delayacct_thrashing_end(void) +{ + delayacct_end(¤t->delays->lock, + ¤t->delays->thrashing_start, + ¤t->delays->thrashing_delay, + ¤t->delays->thrashing_count); +} diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 87a6bc2a96c0..22a12ab5a5e9 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -4,7 +4,7 @@ * * DMA operations that map physical memory directly without using an IOMMU. */ -#include <linux/bootmem.h> /* for max_pfn */ +#include <linux/memblock.h> /* for max_pfn */ #include <linux/export.h> #include <linux/mm.h> #include <linux/dma-direct.h> @@ -14,8 +14,6 @@ #include <linux/pfn.h> #include <linux/set_memory.h> -#define DIRECT_MAPPING_ERROR 0 - /* * Most architectures use ZONE_DMA for the first 16 Megabytes, but * some use it for entirely different regions: diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 4f8a6dbf0b60..5731daa09a32 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -21,6 +21,7 @@ #include <linux/cache.h> #include <linux/dma-direct.h> +#include <linux/dma-noncoherent.h> #include <linux/mm.h> #include <linux/export.h> #include <linux/spinlock.h> @@ -39,7 +40,7 @@ #include <asm/dma.h> #include <linux/init.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/iommu-helper.h> #define CREATE_TRACE_POINTS @@ -73,13 +74,6 @@ static phys_addr_t io_tlb_start, io_tlb_end; static unsigned long io_tlb_nslabs; /* - * When the IOMMU overflows we return a fallback buffer. This sets the size. - */ -static unsigned long io_tlb_overflow = 32*1024; - -static phys_addr_t io_tlb_overflow_buffer; - -/* * This is a free list describing the number of free entries available from * each index */ @@ -126,7 +120,6 @@ setup_io_tlb_npages(char *str) return 0; } early_param("swiotlb", setup_io_tlb_npages); -/* make io_tlb_overflow tunable too? */ unsigned long swiotlb_nr_tbl(void) { @@ -194,16 +187,10 @@ void __init swiotlb_update_mem_attributes(void) bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT); set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT); memset(vaddr, 0, bytes); - - vaddr = phys_to_virt(io_tlb_overflow_buffer); - bytes = PAGE_ALIGN(io_tlb_overflow); - set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT); - memset(vaddr, 0, bytes); } int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) { - void *v_overflow_buffer; unsigned long i, bytes; bytes = nslabs << IO_TLB_SHIFT; @@ -213,25 +200,14 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) io_tlb_end = io_tlb_start + bytes; /* - * Get the overflow emergency buffer - */ - v_overflow_buffer = memblock_virt_alloc_low_nopanic( - PAGE_ALIGN(io_tlb_overflow), - PAGE_SIZE); - if (!v_overflow_buffer) - return -ENOMEM; - - io_tlb_overflow_buffer = __pa(v_overflow_buffer); - - /* * Allocate and initialize the free list array. This array is used * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE * between io_tlb_start and io_tlb_end. */ - io_tlb_list = memblock_virt_alloc( + io_tlb_list = memblock_alloc( PAGE_ALIGN(io_tlb_nslabs * sizeof(int)), PAGE_SIZE); - io_tlb_orig_addr = memblock_virt_alloc( + io_tlb_orig_addr = memblock_alloc( PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)), PAGE_SIZE); for (i = 0; i < io_tlb_nslabs; i++) { @@ -266,7 +242,7 @@ swiotlb_init(int verbose) bytes = io_tlb_nslabs << IO_TLB_SHIFT; /* Get IO TLB memory from the low pages */ - vstart = memblock_virt_alloc_low_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE); + vstart = memblock_alloc_low_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE); if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) return; @@ -330,7 +306,6 @@ int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) { unsigned long i, bytes; - unsigned char *v_overflow_buffer; bytes = nslabs << IO_TLB_SHIFT; @@ -342,19 +317,6 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) memset(tlb, 0, bytes); /* - * Get the overflow emergency buffer - */ - v_overflow_buffer = (void *)__get_free_pages(GFP_DMA, - get_order(io_tlb_overflow)); - if (!v_overflow_buffer) - goto cleanup2; - - set_memory_decrypted((unsigned long)v_overflow_buffer, - io_tlb_overflow >> PAGE_SHIFT); - memset(v_overflow_buffer, 0, io_tlb_overflow); - io_tlb_overflow_buffer = virt_to_phys(v_overflow_buffer); - - /* * Allocate and initialize the free list array. This array is used * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE * between io_tlb_start and io_tlb_end. @@ -390,10 +352,6 @@ cleanup4: sizeof(int))); io_tlb_list = NULL; cleanup3: - free_pages((unsigned long)v_overflow_buffer, - get_order(io_tlb_overflow)); - io_tlb_overflow_buffer = 0; -cleanup2: io_tlb_end = 0; io_tlb_start = 0; io_tlb_nslabs = 0; @@ -407,8 +365,6 @@ void __init swiotlb_exit(void) return; if (late_alloc) { - free_pages((unsigned long)phys_to_virt(io_tlb_overflow_buffer), - get_order(io_tlb_overflow)); free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs * sizeof(phys_addr_t))); free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * @@ -416,8 +372,6 @@ void __init swiotlb_exit(void) free_pages((unsigned long)phys_to_virt(io_tlb_start), get_order(io_tlb_nslabs << IO_TLB_SHIFT)); } else { - memblock_free_late(io_tlb_overflow_buffer, - PAGE_ALIGN(io_tlb_overflow)); memblock_free_late(__pa(io_tlb_orig_addr), PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); memblock_free_late(__pa(io_tlb_list), @@ -429,7 +383,7 @@ void __init swiotlb_exit(void) max_segment = 0; } -int is_swiotlb_buffer(phys_addr_t paddr) +static int is_swiotlb_buffer(phys_addr_t paddr) { return paddr >= io_tlb_start && paddr < io_tlb_end; } @@ -591,26 +545,6 @@ found: } /* - * Allocates bounce buffer and returns its physical address. - */ -static phys_addr_t -map_single(struct device *hwdev, phys_addr_t phys, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - dma_addr_t start_dma_addr; - - if (swiotlb_force == SWIOTLB_NO_FORCE) { - dev_warn_ratelimited(hwdev, "Cannot do DMA to address %pa\n", - &phys); - return SWIOTLB_MAP_ERROR; - } - - start_dma_addr = __phys_to_dma(hwdev, io_tlb_start); - return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, - dir, attrs); -} - -/* * tlb_addr is the physical address of the bounce buffer to unmap. */ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, @@ -689,104 +623,32 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, } } -static inline bool dma_coherent_ok(struct device *dev, dma_addr_t addr, - size_t size) -{ - u64 mask = DMA_BIT_MASK(32); - - if (dev && dev->coherent_dma_mask) - mask = dev->coherent_dma_mask; - return addr + size - 1 <= mask; -} - -static void * -swiotlb_alloc_buffer(struct device *dev, size_t size, dma_addr_t *dma_handle, - unsigned long attrs) +static dma_addr_t swiotlb_bounce_page(struct device *dev, phys_addr_t *phys, + size_t size, enum dma_data_direction dir, unsigned long attrs) { - phys_addr_t phys_addr; - - if (swiotlb_force == SWIOTLB_NO_FORCE) - goto out_warn; - - phys_addr = swiotlb_tbl_map_single(dev, - __phys_to_dma(dev, io_tlb_start), - 0, size, DMA_FROM_DEVICE, attrs); - if (phys_addr == SWIOTLB_MAP_ERROR) - goto out_warn; - - *dma_handle = __phys_to_dma(dev, phys_addr); - if (!dma_coherent_ok(dev, *dma_handle, size)) - goto out_unmap; - - memset(phys_to_virt(phys_addr), 0, size); - return phys_to_virt(phys_addr); + dma_addr_t dma_addr; -out_unmap: - dev_warn(dev, "hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n", - (unsigned long long)dev->coherent_dma_mask, - (unsigned long long)*dma_handle); - - /* - * DMA_TO_DEVICE to avoid memcpy in unmap_single. - * DMA_ATTR_SKIP_CPU_SYNC is optional. - */ - swiotlb_tbl_unmap_single(dev, phys_addr, size, DMA_TO_DEVICE, - DMA_ATTR_SKIP_CPU_SYNC); -out_warn: - if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) { - dev_warn(dev, - "swiotlb: coherent allocation failed, size=%zu\n", - size); - dump_stack(); + if (unlikely(swiotlb_force == SWIOTLB_NO_FORCE)) { + dev_warn_ratelimited(dev, + "Cannot do DMA to address %pa\n", phys); + return DIRECT_MAPPING_ERROR; } - return NULL; -} - -static bool swiotlb_free_buffer(struct device *dev, size_t size, - dma_addr_t dma_addr) -{ - phys_addr_t phys_addr = dma_to_phys(dev, dma_addr); - WARN_ON_ONCE(irqs_disabled()); - - if (!is_swiotlb_buffer(phys_addr)) - return false; - - /* - * DMA_TO_DEVICE to avoid memcpy in swiotlb_tbl_unmap_single. - * DMA_ATTR_SKIP_CPU_SYNC is optional. - */ - swiotlb_tbl_unmap_single(dev, phys_addr, size, DMA_TO_DEVICE, - DMA_ATTR_SKIP_CPU_SYNC); - return true; -} - -static void -swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir, - int do_panic) -{ - if (swiotlb_force == SWIOTLB_NO_FORCE) - return; - - /* - * Ran out of IOMMU space for this operation. This is very bad. - * Unfortunately the drivers cannot handle this operation properly. - * unless they check for dma_mapping_error (most don't) - * When the mapping is small enough return a static buffer to limit - * the damage, or panic when the transfer is too big. - */ - dev_err_ratelimited(dev, "DMA: Out of SW-IOMMU space for %zu bytes\n", - size); + /* Oh well, have to allocate and map a bounce buffer. */ + *phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start), + *phys, size, dir, attrs); + if (*phys == SWIOTLB_MAP_ERROR) + return DIRECT_MAPPING_ERROR; - if (size <= io_tlb_overflow || !do_panic) - return; + /* Ensure that the address returned is DMA'ble */ + dma_addr = __phys_to_dma(dev, *phys); + if (unlikely(!dma_capable(dev, dma_addr, size))) { + swiotlb_tbl_unmap_single(dev, *phys, size, dir, + attrs | DMA_ATTR_SKIP_CPU_SYNC); + return DIRECT_MAPPING_ERROR; + } - if (dir == DMA_BIDIRECTIONAL) - panic("DMA: Random memory could be DMA accessed\n"); - if (dir == DMA_FROM_DEVICE) - panic("DMA: Random memory could be DMA written\n"); - if (dir == DMA_TO_DEVICE) - panic("DMA: Random memory could be DMA read\n"); + return dma_addr; } /* @@ -801,7 +663,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, enum dma_data_direction dir, unsigned long attrs) { - phys_addr_t map, phys = page_to_phys(page) + offset; + phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t dev_addr = phys_to_dma(dev, phys); BUG_ON(dir == DMA_NONE); @@ -810,28 +672,17 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, * we can safely return the device addr and not worry about bounce * buffering it. */ - if (dma_capable(dev, dev_addr, size) && swiotlb_force != SWIOTLB_FORCE) - return dev_addr; - - trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); - - /* Oh well, have to allocate and map a bounce buffer. */ - map = map_single(dev, phys, size, dir, attrs); - if (map == SWIOTLB_MAP_ERROR) { - swiotlb_full(dev, size, dir, 1); - return __phys_to_dma(dev, io_tlb_overflow_buffer); + if (!dma_capable(dev, dev_addr, size) || + swiotlb_force == SWIOTLB_FORCE) { + trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); + dev_addr = swiotlb_bounce_page(dev, &phys, size, dir, attrs); } - dev_addr = __phys_to_dma(dev, map); + if (!dev_is_dma_coherent(dev) && + (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0) + arch_sync_dma_for_device(dev, phys, size, dir); - /* Ensure that the address returned is DMA'ble */ - if (dma_capable(dev, dev_addr, size)) - return dev_addr; - - attrs |= DMA_ATTR_SKIP_CPU_SYNC; - swiotlb_tbl_unmap_single(dev, map, size, dir, attrs); - - return __phys_to_dma(dev, io_tlb_overflow_buffer); + return dev_addr; } /* @@ -842,14 +693,18 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, * After this call, reads by the cpu to the buffer are guaranteed to see * whatever the device wrote there. */ -static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) +void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) { phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); BUG_ON(dir == DMA_NONE); + if (!dev_is_dma_coherent(hwdev) && + (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0) + arch_sync_dma_for_cpu(hwdev, paddr, size, dir); + if (is_swiotlb_buffer(paddr)) { swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs); return; @@ -867,13 +722,6 @@ static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, dma_mark_clean(phys_to_virt(paddr), size); } -void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - unmap_single(hwdev, dev_addr, size, dir, attrs); -} - /* * Make physical memory consistent for a single streaming mode DMA translation * after a transfer. @@ -893,15 +741,17 @@ swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, BUG_ON(dir == DMA_NONE); - if (is_swiotlb_buffer(paddr)) { + if (!dev_is_dma_coherent(hwdev) && target == SYNC_FOR_CPU) + arch_sync_dma_for_cpu(hwdev, paddr, size, dir); + + if (is_swiotlb_buffer(paddr)) swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target); - return; - } - if (dir != DMA_FROM_DEVICE) - return; + if (!dev_is_dma_coherent(hwdev) && target == SYNC_FOR_DEVICE) + arch_sync_dma_for_device(hwdev, paddr, size, dir); - dma_mark_clean(phys_to_virt(paddr), size); + if (!is_swiotlb_buffer(paddr) && dir == DMA_FROM_DEVICE) + dma_mark_clean(phys_to_virt(paddr), size); } void @@ -925,48 +775,31 @@ swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, * appropriate dma address and length. They are obtained via * sg_dma_{address,length}(SG). * - * NOTE: An implementation may be able to use a smaller number of - * DMA address/length pairs than there are SG table elements. - * (for example via virtual mapping capabilities) - * The routine returns the number of addr/length pairs actually - * used, at most nents. - * * Device ownership issues as mentioned above for swiotlb_map_page are the * same here. */ int -swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, +swiotlb_map_sg_attrs(struct device *dev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *sg; int i; - BUG_ON(dir == DMA_NONE); - for_each_sg(sgl, sg, nelems, i) { - phys_addr_t paddr = sg_phys(sg); - dma_addr_t dev_addr = phys_to_dma(hwdev, paddr); - - if (swiotlb_force == SWIOTLB_FORCE || - !dma_capable(hwdev, dev_addr, sg->length)) { - phys_addr_t map = map_single(hwdev, sg_phys(sg), - sg->length, dir, attrs); - if (map == SWIOTLB_MAP_ERROR) { - /* Don't panic here, we expect map_sg users - to do proper error handling. */ - swiotlb_full(hwdev, sg->length, dir, 0); - attrs |= DMA_ATTR_SKIP_CPU_SYNC; - swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, - attrs); - sg_dma_len(sgl) = 0; - return 0; - } - sg->dma_address = __phys_to_dma(hwdev, map); - } else - sg->dma_address = dev_addr; + sg->dma_address = swiotlb_map_page(dev, sg_page(sg), sg->offset, + sg->length, dir, attrs); + if (sg->dma_address == DIRECT_MAPPING_ERROR) + goto out_error; sg_dma_len(sg) = sg->length; } + return nelems; + +out_error: + swiotlb_unmap_sg_attrs(dev, sgl, i, dir, + attrs | DMA_ATTR_SKIP_CPU_SYNC); + sg_dma_len(sgl) = 0; + return 0; } /* @@ -984,7 +817,7 @@ swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, BUG_ON(dir == DMA_NONE); for_each_sg(sgl, sg, nelems, i) - unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir, + swiotlb_unmap_page(hwdev, sg->dma_address, sg_dma_len(sg), dir, attrs); } @@ -1022,12 +855,6 @@ swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE); } -int -swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) -{ - return (dma_addr == __phys_to_dma(hwdev, io_tlb_overflow_buffer)); -} - /* * Return whether the given device DMA address mask can be supported * properly. For example, if your device can only drive the low 24-bits @@ -1040,39 +867,10 @@ swiotlb_dma_supported(struct device *hwdev, u64 mask) return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask; } -void *swiotlb_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, unsigned long attrs) -{ - void *vaddr; - - /* temporary workaround: */ - if (gfp & __GFP_NOWARN) - attrs |= DMA_ATTR_NO_WARN; - - /* - * Don't print a warning when the first allocation attempt fails. - * swiotlb_alloc_coherent() will print a warning when the DMA memory - * allocation ultimately failed. - */ - gfp |= __GFP_NOWARN; - - vaddr = dma_direct_alloc(dev, size, dma_handle, gfp, attrs); - if (!vaddr) - vaddr = swiotlb_alloc_buffer(dev, size, dma_handle, attrs); - return vaddr; -} - -void swiotlb_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr, unsigned long attrs) -{ - if (!swiotlb_free_buffer(dev, size, dma_addr)) - dma_direct_free(dev, size, vaddr, dma_addr, attrs); -} - const struct dma_map_ops swiotlb_dma_ops = { - .mapping_error = swiotlb_dma_mapping_error, - .alloc = swiotlb_alloc, - .free = swiotlb_free, + .mapping_error = dma_direct_mapping_error, + .alloc = dma_direct_alloc, + .free = dma_direct_free, .sync_single_for_cpu = swiotlb_sync_single_for_cpu, .sync_single_for_device = swiotlb_sync_single_for_device, .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, diff --git a/kernel/events/core.c b/kernel/events/core.c index 5a97f34bc14c..8c490130c4fb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8376,30 +8376,39 @@ static struct pmu perf_tracepoint = { * * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe * if not set, create kprobe/uprobe + * + * The following values specify a reference counter (or semaphore in the + * terminology of tools like dtrace, systemtap, etc.) Userspace Statically + * Defined Tracepoints (USDT). Currently, we use 40 bit for the offset. + * + * PERF_UPROBE_REF_CTR_OFFSET_BITS # of bits in config as th offset + * PERF_UPROBE_REF_CTR_OFFSET_SHIFT # of bits to shift left */ enum perf_probe_config { PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */ + PERF_UPROBE_REF_CTR_OFFSET_BITS = 32, + PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS, }; PMU_FORMAT_ATTR(retprobe, "config:0"); +#endif -static struct attribute *probe_attrs[] = { +#ifdef CONFIG_KPROBE_EVENTS +static struct attribute *kprobe_attrs[] = { &format_attr_retprobe.attr, NULL, }; -static struct attribute_group probe_format_group = { +static struct attribute_group kprobe_format_group = { .name = "format", - .attrs = probe_attrs, + .attrs = kprobe_attrs, }; -static const struct attribute_group *probe_attr_groups[] = { - &probe_format_group, +static const struct attribute_group *kprobe_attr_groups[] = { + &kprobe_format_group, NULL, }; -#endif -#ifdef CONFIG_KPROBE_EVENTS static int perf_kprobe_event_init(struct perf_event *event); static struct pmu perf_kprobe = { .task_ctx_nr = perf_sw_context, @@ -8409,7 +8418,7 @@ static struct pmu perf_kprobe = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, - .attr_groups = probe_attr_groups, + .attr_groups = kprobe_attr_groups, }; static int perf_kprobe_event_init(struct perf_event *event) @@ -8441,6 +8450,24 @@ static int perf_kprobe_event_init(struct perf_event *event) #endif /* CONFIG_KPROBE_EVENTS */ #ifdef CONFIG_UPROBE_EVENTS +PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63"); + +static struct attribute *uprobe_attrs[] = { + &format_attr_retprobe.attr, + &format_attr_ref_ctr_offset.attr, + NULL, +}; + +static struct attribute_group uprobe_format_group = { + .name = "format", + .attrs = uprobe_attrs, +}; + +static const struct attribute_group *uprobe_attr_groups[] = { + &uprobe_format_group, + NULL, +}; + static int perf_uprobe_event_init(struct perf_event *event); static struct pmu perf_uprobe = { .task_ctx_nr = perf_sw_context, @@ -8450,12 +8477,13 @@ static struct pmu perf_uprobe = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, - .attr_groups = probe_attr_groups, + .attr_groups = uprobe_attr_groups, }; static int perf_uprobe_event_init(struct perf_event *event) { int err; + unsigned long ref_ctr_offset; bool is_retprobe; if (event->attr.type != perf_uprobe.type) @@ -8471,7 +8499,8 @@ static int perf_uprobe_event_init(struct perf_event *event) return -EOPNOTSUPP; is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; - err = perf_uprobe_init(event, is_retprobe); + ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT; + err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe); if (err) return err; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2bf792d22087..96d4bee83489 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -73,6 +73,7 @@ struct uprobe { struct uprobe_consumer *consumers; struct inode *inode; /* Also hold a ref to inode */ loff_t offset; + loff_t ref_ctr_offset; unsigned long flags; /* @@ -88,6 +89,15 @@ struct uprobe { struct arch_uprobe arch; }; +struct delayed_uprobe { + struct list_head list; + struct uprobe *uprobe; + struct mm_struct *mm; +}; + +static DEFINE_MUTEX(delayed_uprobe_lock); +static LIST_HEAD(delayed_uprobe_list); + /* * Execute out of line area: anonymous executable mapping installed * by the probed task to execute the copy of the original instruction @@ -282,6 +292,166 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t return 1; } +static struct delayed_uprobe * +delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm) +{ + struct delayed_uprobe *du; + + list_for_each_entry(du, &delayed_uprobe_list, list) + if (du->uprobe == uprobe && du->mm == mm) + return du; + return NULL; +} + +static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm) +{ + struct delayed_uprobe *du; + + if (delayed_uprobe_check(uprobe, mm)) + return 0; + + du = kzalloc(sizeof(*du), GFP_KERNEL); + if (!du) + return -ENOMEM; + + du->uprobe = uprobe; + du->mm = mm; + list_add(&du->list, &delayed_uprobe_list); + return 0; +} + +static void delayed_uprobe_delete(struct delayed_uprobe *du) +{ + if (WARN_ON(!du)) + return; + list_del(&du->list); + kfree(du); +} + +static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm) +{ + struct list_head *pos, *q; + struct delayed_uprobe *du; + + if (!uprobe && !mm) + return; + + list_for_each_safe(pos, q, &delayed_uprobe_list) { + du = list_entry(pos, struct delayed_uprobe, list); + + if (uprobe && du->uprobe != uprobe) + continue; + if (mm && du->mm != mm) + continue; + + delayed_uprobe_delete(du); + } +} + +static bool valid_ref_ctr_vma(struct uprobe *uprobe, + struct vm_area_struct *vma) +{ + unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset); + + return uprobe->ref_ctr_offset && + vma->vm_file && + file_inode(vma->vm_file) == uprobe->inode && + (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && + vma->vm_start <= vaddr && + vma->vm_end > vaddr; +} + +static struct vm_area_struct * +find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm) +{ + struct vm_area_struct *tmp; + + for (tmp = mm->mmap; tmp; tmp = tmp->vm_next) + if (valid_ref_ctr_vma(uprobe, tmp)) + return tmp; + + return NULL; +} + +static int +__update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) +{ + void *kaddr; + struct page *page; + struct vm_area_struct *vma; + int ret; + short *ptr; + + if (!vaddr || !d) + return -EINVAL; + + ret = get_user_pages_remote(NULL, mm, vaddr, 1, + FOLL_WRITE, &page, &vma, NULL); + if (unlikely(ret <= 0)) { + /* + * We are asking for 1 page. If get_user_pages_remote() fails, + * it may return 0, in that case we have to return error. + */ + return ret == 0 ? -EBUSY : ret; + } + + kaddr = kmap_atomic(page); + ptr = kaddr + (vaddr & ~PAGE_MASK); + + if (unlikely(*ptr + d < 0)) { + pr_warn("ref_ctr going negative. vaddr: 0x%lx, " + "curr val: %d, delta: %d\n", vaddr, *ptr, d); + ret = -EINVAL; + goto out; + } + + *ptr += d; + ret = 0; +out: + kunmap_atomic(kaddr); + put_page(page); + return ret; +} + +static void update_ref_ctr_warn(struct uprobe *uprobe, + struct mm_struct *mm, short d) +{ + pr_warn("ref_ctr %s failed for inode: 0x%lx offset: " + "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n", + d > 0 ? "increment" : "decrement", uprobe->inode->i_ino, + (unsigned long long) uprobe->offset, + (unsigned long long) uprobe->ref_ctr_offset, mm); +} + +static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, + short d) +{ + struct vm_area_struct *rc_vma; + unsigned long rc_vaddr; + int ret = 0; + + rc_vma = find_ref_ctr_vma(uprobe, mm); + + if (rc_vma) { + rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset); + ret = __update_ref_ctr(mm, rc_vaddr, d); + if (ret) + update_ref_ctr_warn(uprobe, mm, d); + + if (d > 0) + return ret; + } + + mutex_lock(&delayed_uprobe_lock); + if (d > 0) + ret = delayed_uprobe_add(uprobe, mm); + else + delayed_uprobe_remove(uprobe, mm); + mutex_unlock(&delayed_uprobe_lock); + + return ret; +} + /* * NOTE: * Expect the breakpoint instruction to be the smallest size instruction for @@ -302,9 +472,13 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t opcode) { + struct uprobe *uprobe; struct page *old_page, *new_page; struct vm_area_struct *vma; - int ret; + int ret, is_register, ref_ctr_updated = 0; + + is_register = is_swbp_insn(&opcode); + uprobe = container_of(auprobe, struct uprobe, arch); retry: /* Read the page with vaddr into memory */ @@ -317,6 +491,15 @@ retry: if (ret <= 0) goto put_old; + /* We are going to replace instruction, update ref_ctr. */ + if (!ref_ctr_updated && uprobe->ref_ctr_offset) { + ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); + if (ret) + goto put_old; + + ref_ctr_updated = 1; + } + ret = anon_vma_prepare(vma); if (ret) goto put_old; @@ -337,6 +520,11 @@ put_old: if (unlikely(ret == -EAGAIN)) goto retry; + + /* Revert back reference counter if instruction update failed. */ + if (ret && is_register && ref_ctr_updated) + update_ref_ctr(uprobe, mm, -1); + return ret; } @@ -378,8 +566,15 @@ static struct uprobe *get_uprobe(struct uprobe *uprobe) static void put_uprobe(struct uprobe *uprobe) { - if (atomic_dec_and_test(&uprobe->ref)) + if (atomic_dec_and_test(&uprobe->ref)) { + /* + * If application munmap(exec_vma) before uprobe_unregister() + * gets called, we don't get a chance to remove uprobe from + * delayed_uprobe_list from remove_breakpoint(). Do it here. + */ + delayed_uprobe_remove(uprobe, NULL); kfree(uprobe); + } } static int match_uprobe(struct uprobe *l, struct uprobe *r) @@ -484,7 +679,18 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) return u; } -static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) +static void +ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe) +{ + pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx " + "ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n", + uprobe->inode->i_ino, (unsigned long long) uprobe->offset, + (unsigned long long) cur_uprobe->ref_ctr_offset, + (unsigned long long) uprobe->ref_ctr_offset); +} + +static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, + loff_t ref_ctr_offset) { struct uprobe *uprobe, *cur_uprobe; @@ -494,6 +700,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->inode = inode; uprobe->offset = offset; + uprobe->ref_ctr_offset = ref_ctr_offset; init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); @@ -501,6 +708,12 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) cur_uprobe = insert_uprobe(uprobe); /* a uprobe exists for this inode:offset combination */ if (cur_uprobe) { + if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) { + ref_ctr_mismatch_warn(cur_uprobe, uprobe); + put_uprobe(cur_uprobe); + kfree(uprobe); + return ERR_PTR(-EINVAL); + } kfree(uprobe); uprobe = cur_uprobe; } @@ -895,7 +1108,7 @@ EXPORT_SYMBOL_GPL(uprobe_unregister); * else return 0 (success) */ static int __uprobe_register(struct inode *inode, loff_t offset, - struct uprobe_consumer *uc) + loff_t ref_ctr_offset, struct uprobe_consumer *uc) { struct uprobe *uprobe; int ret; @@ -912,9 +1125,12 @@ static int __uprobe_register(struct inode *inode, loff_t offset, return -EINVAL; retry: - uprobe = alloc_uprobe(inode, offset); + uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); if (!uprobe) return -ENOMEM; + if (IS_ERR(uprobe)) + return PTR_ERR(uprobe); + /* * We can race with uprobe_unregister()->delete_uprobe(). * Check uprobe_is_active() and retry if it is false. @@ -938,10 +1154,17 @@ static int __uprobe_register(struct inode *inode, loff_t offset, int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) { - return __uprobe_register(inode, offset, uc); + return __uprobe_register(inode, offset, 0, uc); } EXPORT_SYMBOL_GPL(uprobe_register); +int uprobe_register_refctr(struct inode *inode, loff_t offset, + loff_t ref_ctr_offset, struct uprobe_consumer *uc) +{ + return __uprobe_register(inode, offset, ref_ctr_offset, uc); +} +EXPORT_SYMBOL_GPL(uprobe_register_refctr); + /* * uprobe_apply - unregister an already registered probe. * @inode: the file in which the probe has to be removed. @@ -1060,6 +1283,35 @@ static void build_probe_list(struct inode *inode, spin_unlock(&uprobes_treelock); } +/* @vma contains reference counter, not the probed instruction. */ +static int delayed_ref_ctr_inc(struct vm_area_struct *vma) +{ + struct list_head *pos, *q; + struct delayed_uprobe *du; + unsigned long vaddr; + int ret = 0, err = 0; + + mutex_lock(&delayed_uprobe_lock); + list_for_each_safe(pos, q, &delayed_uprobe_list) { + du = list_entry(pos, struct delayed_uprobe, list); + + if (du->mm != vma->vm_mm || + !valid_ref_ctr_vma(du->uprobe, vma)) + continue; + + vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset); + ret = __update_ref_ctr(vma->vm_mm, vaddr, 1); + if (ret) { + update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1); + if (!err) + err = ret; + } + delayed_uprobe_delete(du); + } + mutex_unlock(&delayed_uprobe_lock); + return err; +} + /* * Called from mmap_region/vma_adjust with mm->mmap_sem acquired. * @@ -1072,7 +1324,15 @@ int uprobe_mmap(struct vm_area_struct *vma) struct uprobe *uprobe, *u; struct inode *inode; - if (no_uprobe_events() || !valid_vma(vma, true)) + if (no_uprobe_events()) + return 0; + + if (vma->vm_file && + (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && + test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags)) + delayed_ref_ctr_inc(vma); + + if (!valid_vma(vma, true)) return 0; inode = file_inode(vma->vm_file); @@ -1246,6 +1506,10 @@ void uprobe_clear_state(struct mm_struct *mm) { struct xol_area *area = mm->uprobes_state.xol_area; + mutex_lock(&delayed_uprobe_lock); + delayed_uprobe_remove(NULL, mm); + mutex_unlock(&delayed_uprobe_lock); + if (!area) return; diff --git a/kernel/fail_function.c b/kernel/fail_function.c index bc80a4e268c0..17f75b545f66 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -173,8 +173,7 @@ static void fei_debugfs_remove_attr(struct fei_attr *attr) struct dentry *dir; dir = debugfs_lookup(attr->kp.symbol_name, fei_debugfs_dir); - if (dir) - debugfs_remove_recursive(dir); + debugfs_remove_recursive(dir); } static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs) diff --git a/kernel/fork.c b/kernel/fork.c index f0b58479534f..07cddff89c7b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -91,6 +91,7 @@ #include <linux/kcov.h> #include <linux/livepatch.h> #include <linux/thread_info.h> +#include <linux/stackleak.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -223,9 +224,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) return s->addr; } + /* + * Allocated stacks are cached and later reused by new threads, + * so memcg accounting is performed manually on assigning/releasing + * stacks to tasks. Drop __GFP_ACCOUNT. + */ stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, VMALLOC_START, VMALLOC_END, - THREADINFO_GFP, + THREADINFO_GFP & ~__GFP_ACCOUNT, PAGE_KERNEL, 0, node, __builtin_return_address(0)); @@ -248,9 +254,19 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) static inline void free_thread_stack(struct task_struct *tsk) { #ifdef CONFIG_VMAP_STACK - if (task_stack_vm_area(tsk)) { + struct vm_struct *vm = task_stack_vm_area(tsk); + + if (vm) { int i; + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { + mod_memcg_page_state(vm->pages[i], + MEMCG_KERNEL_STACK_KB, + -(int)(PAGE_SIZE / 1024)); + + memcg_kmem_uncharge(vm->pages[i], 0); + } + for (i = 0; i < NR_CACHED_STACKS; i++) { if (this_cpu_cmpxchg(cached_stacks[i], NULL, tsk->stack_vm_area) != NULL) @@ -351,10 +367,6 @@ static void account_kernel_stack(struct task_struct *tsk, int account) NR_KERNEL_STACK_KB, PAGE_SIZE / 1024 * account); } - - /* All stack pages belong to the same memcg. */ - mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); } else { /* * All stack pages are in the same zone and belong to the @@ -370,6 +382,35 @@ static void account_kernel_stack(struct task_struct *tsk, int account) } } +static int memcg_charge_kernel_stack(struct task_struct *tsk) +{ +#ifdef CONFIG_VMAP_STACK + struct vm_struct *vm = task_stack_vm_area(tsk); + int ret; + + if (vm) { + int i; + + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { + /* + * If memcg_kmem_charge() fails, page->mem_cgroup + * pointer is NULL, and both memcg_kmem_uncharge() + * and mod_memcg_page_state() in free_thread_stack() + * will ignore this page. So it's safe. + */ + ret = memcg_kmem_charge(vm->pages[i], GFP_KERNEL, 0); + if (ret) + return ret; + + mod_memcg_page_state(vm->pages[i], + MEMCG_KERNEL_STACK_KB, + PAGE_SIZE / 1024); + } + } +#endif + return 0; +} + static void release_task_stack(struct task_struct *tsk) { if (WARN_ON(tsk->state != TASK_DEAD)) @@ -807,6 +848,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (!stack) goto free_tsk; + if (memcg_charge_kernel_stack(tsk)) + goto free_stack; + stack_vm_area = task_stack_vm_area(tsk); err = arch_dup_task_struct(tsk, orig); @@ -1779,6 +1823,10 @@ static __latent_entropy struct task_struct *copy_process( p->default_timer_slack_ns = current->timer_slack_ns; +#ifdef CONFIG_PSI + p->psi_flags = 0; +#endif + task_io_accounting_init(&p->ioac); acct_clear_integrals(p); @@ -1879,6 +1927,8 @@ static __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_cleanup_io; + stackleak_task_init(p); + if (pid != &init_struct_pid) { pid = alloc_pid(p->nsproxy->pid_ns_for_children); if (IS_ERR(pid)) { diff --git a/kernel/futex.c b/kernel/futex.c index 3e2de8fc1891..f423f9b6577e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -65,7 +65,7 @@ #include <linux/sched/mm.h> #include <linux/hugetlb.h> #include <linux/freezer.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/fault-inject.h> #include <asm/futex.h> diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 83f830acbb5f..410a77a8f6e2 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -173,7 +173,7 @@ err_unlock: } COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - struct compat_timespec __user *, utime, u32 __user *, uaddr2, + struct old_timespec32 __user *, utime, u32 __user *, uaddr2, u32, val3) { struct timespec ts; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index b9132d1269ef..cb8e3e8ac7b9 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -15,6 +15,7 @@ #include <linux/lockdep.h> #include <linux/export.h> #include <linux/sysctl.h> +#include <linux/suspend.h> #include <linux/utsname.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> @@ -242,6 +243,28 @@ void reset_hung_task_detector(void) } EXPORT_SYMBOL_GPL(reset_hung_task_detector); +static bool hung_detector_suspended; + +static int hungtask_pm_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + switch (action) { + case PM_SUSPEND_PREPARE: + case PM_HIBERNATION_PREPARE: + case PM_RESTORE_PREPARE: + hung_detector_suspended = true; + break; + case PM_POST_SUSPEND: + case PM_POST_HIBERNATION: + case PM_POST_RESTORE: + hung_detector_suspended = false; + break; + default: + break; + } + return NOTIFY_OK; +} + /* * kthread which checks for tasks stuck in D state */ @@ -261,7 +284,8 @@ static int watchdog(void *dummy) interval = min_t(unsigned long, interval, timeout); t = hung_timeout_jiffies(hung_last_checked, interval); if (t <= 0) { - if (!atomic_xchg(&reset_hung_task, 0)) + if (!atomic_xchg(&reset_hung_task, 0) && + !hung_detector_suspended) check_hung_uninterruptible_tasks(timeout); hung_last_checked = jiffies; continue; @@ -275,6 +299,10 @@ static int watchdog(void *dummy) static int __init hung_task_init(void) { atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + + /* Disable hung task detector on suspend */ + pm_notifier(hungtask_pm_notify, 0); + watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); return 0; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 5d9fc01b60a6..3366d11c3e02 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -183,7 +183,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, * unhappy about. Replace them with ':', which does * the trick and is not as offensive as '\'... */ - name = kstrdup(of_node_full_name(of_node), GFP_KERNEL); + name = kasprintf(GFP_KERNEL, "%pOF", of_node); if (!name) { kfree(domain); return NULL; @@ -867,7 +867,7 @@ void irq_dispose_mapping(unsigned int virq) EXPORT_SYMBOL_GPL(irq_dispose_mapping); /** - * irq_find_mapping() - Find a linux irq from an hw irq number. + * irq_find_mapping() - Find a linux irq from a hw irq number. * @domain: domain owning this hardware interrupt * @hwirq: hardware irq number in that domain space */ @@ -1741,6 +1741,7 @@ static void debugfs_add_domain_dir(struct irq_domain *d) static void debugfs_remove_domain_dir(struct irq_domain *d) { debugfs_remove(d->debugfs_file); + d->debugfs_file = NULL; } void __init irq_domain_debugfs_init(struct dentry *root) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index fb86146037a7..9dbdccab3b6a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -927,6 +927,9 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) local_bh_disable(); ret = action->thread_fn(action->irq, action->dev_id); + if (ret == IRQ_HANDLED) + atomic_inc(&desc->threads_handled); + irq_finalize_oneshot(desc, action); local_bh_enable(); return ret; @@ -943,6 +946,9 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc, irqreturn_t ret; ret = action->thread_fn(action->irq, action->dev_id); + if (ret == IRQ_HANDLED) + atomic_inc(&desc->threads_handled); + irq_finalize_oneshot(desc, action); return ret; } @@ -1020,8 +1026,6 @@ static int irq_thread(void *data) irq_thread_check_affinity(desc, action); action_ret = handler_fn(desc, action); - if (action_ret == IRQ_HANDLED) - atomic_inc(&desc->threads_handled); if (action_ret == IRQ_WAKE_THREAD) irq_wake_secondary(desc, action); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 02a0b01380d8..f3a04994e063 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -37,7 +37,7 @@ extern const u8 kallsyms_names[] __weak; * Tell the compiler that the count isn't in the small data section if the arch * has one (eg: FRV). */ -extern const unsigned long kallsyms_num_syms +extern const unsigned int kallsyms_num_syms __attribute__((weak, section(".rodata"))); extern const unsigned long kallsyms_relative_base @@ -46,7 +46,7 @@ __attribute__((weak, section(".rodata"))); extern const u8 kallsyms_token_table[] __weak; extern const u16 kallsyms_token_index[] __weak; -extern const unsigned long kallsyms_markers[] __weak; +extern const unsigned int kallsyms_markers[] __weak; /* * Expand a compressed symbol data into the resulting uncompressed string, diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 0130e488ebfe..8f36c27c1794 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -4,7 +4,7 @@ #endif #include <linux/hash.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/debug_locks.h> /* diff --git a/kernel/memremap.c b/kernel/memremap.c index 5b8600d39931..9eced2cc9f94 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -1,47 +1,21 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright(c) 2015 Intel Corporation. All rights reserved. */ -#include <linux/radix-tree.h> #include <linux/device.h> -#include <linux/types.h> -#include <linux/pfn_t.h> #include <linux/io.h> #include <linux/kasan.h> -#include <linux/mm.h> #include <linux/memory_hotplug.h> +#include <linux/mm.h> +#include <linux/pfn_t.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/types.h> #include <linux/wait_bit.h> +#include <linux/xarray.h> -static DEFINE_MUTEX(pgmap_lock); -static RADIX_TREE(pgmap_radix, GFP_KERNEL); +static DEFINE_XARRAY(pgmap_array); #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) #define SECTION_SIZE (1UL << PA_SECTION_SHIFT) -static unsigned long order_at(struct resource *res, unsigned long pgoff) -{ - unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff; - unsigned long nr_pages, mask; - - nr_pages = PHYS_PFN(resource_size(res)); - if (nr_pages == pgoff) - return ULONG_MAX; - - /* - * What is the largest aligned power-of-2 range available from - * this resource pgoff to the end of the resource range, - * considering the alignment of the current pgoff? - */ - mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff); - if (!mask) - return ULONG_MAX; - - return find_first_bit(&mask, BITS_PER_LONG); -} - -#define foreach_order_pgoff(res, order, pgoff) \ - for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \ - pgoff += 1UL << order, order = order_at((res), pgoff)) - #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, unsigned long addr, @@ -70,18 +44,10 @@ vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, EXPORT_SYMBOL(device_private_entry_fault); #endif /* CONFIG_DEVICE_PRIVATE */ -static void pgmap_radix_release(struct resource *res, unsigned long end_pgoff) +static void pgmap_array_delete(struct resource *res) { - unsigned long pgoff, order; - - mutex_lock(&pgmap_lock); - foreach_order_pgoff(res, order, pgoff) { - if (pgoff >= end_pgoff) - break; - radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff); - } - mutex_unlock(&pgmap_lock); - + xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end), + NULL, GFP_KERNEL); synchronize_rcu(); } @@ -142,7 +108,7 @@ static void devm_memremap_pages_release(void *data) mem_hotplug_done(); untrack_pfn(NULL, PHYS_PFN(align_start), align_size); - pgmap_radix_release(res, -1); + pgmap_array_delete(res); dev_WARN_ONCE(dev, pgmap->altmap.alloc, "%s: failed to free all reserved pages\n", __func__); } @@ -175,10 +141,9 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) struct vmem_altmap *altmap = pgmap->altmap_valid ? &pgmap->altmap : NULL; struct resource *res = &pgmap->res; - unsigned long pfn, pgoff, order; + struct dev_pagemap *conflict_pgmap; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; - struct dev_pagemap *conflict_pgmap; align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -216,20 +181,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) pgmap->dev = dev; - mutex_lock(&pgmap_lock); - error = 0; - - foreach_order_pgoff(res, order, pgoff) { - error = __radix_tree_insert(&pgmap_radix, - PHYS_PFN(res->start) + pgoff, order, pgmap); - if (error) { - dev_err(dev, "%s: failed: %d\n", __func__, error); - break; - } - } - mutex_unlock(&pgmap_lock); + error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start), + PHYS_PFN(res->end), pgmap, GFP_KERNEL)); if (error) - goto err_radix; + goto err_array; nid = dev_to_node(dev); if (nid < 0) @@ -256,19 +211,14 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) if (error) goto err_add_memory; - for_each_device_pfn(pfn, pgmap) { - struct page *page = pfn_to_page(pfn); - - /* - * ZONE_DEVICE pages union ->lru with a ->pgmap back - * pointer. It is a bug if a ZONE_DEVICE page is ever - * freed or placed on a driver-private list. Seed the - * storage with LIST_POISON* values. - */ - list_del(&page->lru); - page->pgmap = pgmap; - percpu_ref_get(pgmap->ref); - } + /* + * Initialization of the pages has been deferred until now in order + * to allow us to do the work while not holding the hotplug lock. + */ + memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], + align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT, pgmap); + percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap)); devm_add_action(dev, devm_memremap_pages_release, pgmap); @@ -279,8 +229,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_kasan: untrack_pfn(NULL, PHYS_PFN(align_start), align_size); err_pfn_remap: - err_radix: - pgmap_radix_release(res, pgoff); + pgmap_array_delete(res); + err_array: return ERR_PTR(error); } EXPORT_SYMBOL(devm_memremap_pages); @@ -320,7 +270,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, /* fall back to slow path lookup */ rcu_read_lock(); - pgmap = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys)); + pgmap = xa_load(&pgmap_array, PHYS_PFN(phys)); if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) pgmap = NULL; rcu_read_unlock(); diff --git a/kernel/panic.c b/kernel/panic.c index 8b2e002d52eb..f6d549a29a5c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -136,7 +136,7 @@ void panic(const char *fmt, ...) { static char buf[1024]; va_list args; - long i, i_next = 0; + long i, i_next = 0, len; int state = 0; int old_cpu, this_cpu; bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; @@ -173,8 +173,12 @@ void panic(const char *fmt, ...) console_verbose(); bust_spinlocks(1); va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); + len = vscnprintf(buf, sizeof(buf), fmt, args); va_end(args); + + if (len && buf[len - 1] == '\n') + buf[len - 1] = '\0'; + pr_emerg("Kernel panic - not syncing: %s\n", buf); #ifdef CONFIG_DEBUG_BUGVERBOSE /* @@ -631,7 +635,7 @@ device_initcall(register_warn_debugfs); */ __visible void __stack_chk_fail(void) { - panic("stack-protector: Kernel stack is corrupted in: %pB\n", + panic("stack-protector: Kernel stack is corrupted in: %pB", __builtin_return_address(0)); } EXPORT_SYMBOL(__stack_chk_fail); diff --git a/kernel/pid.c b/kernel/pid.c index cdf63e53a014..b2f6c506035d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -31,7 +31,7 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/rculist.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/hash.h> #include <linux/pid_namespace.h> #include <linux/init_task.h> diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3d37c279c090..b0308a2c6000 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -23,7 +23,7 @@ #include <linux/pm.h> #include <linux/device.h> #include <linux/init.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/nmi.h> #include <linux/syscalls.h> #include <linux/console.h> @@ -963,7 +963,8 @@ void __init __register_nosave_region(unsigned long start_pfn, BUG_ON(!region); } else { /* This allocation cannot fail */ - region = memblock_virt_alloc(sizeof(struct nosave_region), 0); + region = memblock_alloc(sizeof(struct nosave_region), + SMP_CACHE_BYTES); } region->start_pfn = start_pfn; region->end_pfn = end_pfn; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9bf5404397e0..1b2a029360b7 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -16,6 +16,8 @@ * 01Mar01 Andrew Morton */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/mm.h> #include <linux/tty.h> @@ -29,7 +31,6 @@ #include <linux/delay.h> #include <linux/smp.h> #include <linux/security.h> -#include <linux/bootmem.h> #include <linux/memblock.h> #include <linux/syscalls.h> #include <linux/crash_core.h> @@ -192,16 +193,7 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, return 0; } -/* - * Number of registered extended console drivers. - * - * If extended consoles are present, in-kernel cont reassembly is disabled - * and each fragment is stored as a separate log entry with proper - * continuation flag so that every emitted message has full metadata. This - * doesn't change the result for regular consoles or /proc/kmsg. For - * /dev/kmsg, as long as the reader concatenates messages according to - * consecutive continuation flags, the end result should be the same too. - */ +/* Number of registered extended console drivers. */ static int nr_ext_console_drivers; /* @@ -423,6 +415,7 @@ static u32 log_next_idx; /* the next printk record to write to the console */ static u64 console_seq; static u32 console_idx; +static u64 exclusive_console_stop_seq; /* the next printk record to read after the last 'clear' command */ static u64 clear_seq; @@ -437,6 +430,7 @@ static u32 clear_idx; /* record buffer */ #define LOG_ALIGN __alignof__(struct printk_log) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) +#define LOG_BUF_LEN_MAX (u32)(1 << 31) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; @@ -1037,18 +1031,28 @@ void log_buf_vmcoreinfo_setup(void) static unsigned long __initdata new_log_buf_len; /* we practice scaling the ring buffer by powers of 2 */ -static void __init log_buf_len_update(unsigned size) +static void __init log_buf_len_update(u64 size) { + if (size > (u64)LOG_BUF_LEN_MAX) { + size = (u64)LOG_BUF_LEN_MAX; + pr_err("log_buf over 2G is not supported.\n"); + } + if (size) size = roundup_pow_of_two(size); if (size > log_buf_len) - new_log_buf_len = size; + new_log_buf_len = (unsigned long)size; } /* save requested log_buf_len since it's too early to process it */ static int __init log_buf_len_setup(char *str) { - unsigned size = memparse(str, &str); + u64 size; + + if (!str) + return -EINVAL; + + size = memparse(str, &str); log_buf_len_update(size); @@ -1093,7 +1097,7 @@ void __init setup_log_buf(int early) { unsigned long flags; char *new_log_buf; - int free; + unsigned int free; if (log_buf != __log_buf) return; @@ -1106,14 +1110,14 @@ void __init setup_log_buf(int early) if (early) { new_log_buf = - memblock_virt_alloc(new_log_buf_len, LOG_ALIGN); + memblock_alloc(new_log_buf_len, LOG_ALIGN); } else { - new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, + new_log_buf = memblock_alloc_nopanic(new_log_buf_len, LOG_ALIGN); } if (unlikely(!new_log_buf)) { - pr_err("log_buf_len: %ld bytes not available\n", + pr_err("log_buf_len: %lu bytes not available\n", new_log_buf_len); return; } @@ -1126,8 +1130,8 @@ void __init setup_log_buf(int early) memcpy(log_buf, __log_buf, __LOG_BUF_LEN); logbuf_unlock_irqrestore(flags); - pr_info("log_buf_len: %d bytes\n", log_buf_len); - pr_info("early log buf free: %d(%d%%)\n", + pr_info("log_buf_len: %u bytes\n", log_buf_len); + pr_info("early log buf free: %u(%u%%)\n", free, (free * 100) / __LOG_BUF_LEN); } @@ -1767,12 +1771,8 @@ static void cont_flush(void) static bool cont_add(int facility, int level, enum log_flags flags, const char *text, size_t len) { - /* - * If ext consoles are present, flush and skip in-kernel - * continuation. See nr_ext_console_drivers definition. Also, if - * the line gets too long, split it up in separate records. - */ - if (nr_ext_console_drivers || cont.len + len > sizeof(cont.buf)) { + /* If the line gets too long, split it up in separate records. */ + if (cont.len + len > sizeof(cont.buf)) { cont_flush(); return false; } @@ -1795,9 +1795,6 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char * cont_flush(); } - if (cont.len > (sizeof(cont.buf) * 80) / 100) - cont_flush(); - return true; } @@ -1889,8 +1886,9 @@ asmlinkage int vprintk_emit(int facility, int level, const char *fmt, va_list args) { int printed_len; - bool in_sched = false; + bool in_sched = false, pending_output; unsigned long flags; + u64 curr_log_seq; if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; @@ -1902,11 +1900,13 @@ asmlinkage int vprintk_emit(int facility, int level, /* This stops the holder of console_sem just where we want him */ logbuf_lock_irqsave(flags); + curr_log_seq = log_next_seq; printed_len = vprintk_store(facility, level, dict, dictlen, fmt, args); + pending_output = (curr_log_seq != log_next_seq); logbuf_unlock_irqrestore(flags); /* If called from the scheduler, we can not call up(). */ - if (!in_sched) { + if (!in_sched && pending_output) { /* * Disable preemption to avoid being preempted while holding * console_sem which would prevent anyone from printing to @@ -1923,7 +1923,8 @@ asmlinkage int vprintk_emit(int facility, int level, preempt_enable(); } - wake_up_klogd(); + if (pending_output) + wake_up_klogd(); return printed_len; } EXPORT_SYMBOL(vprintk_emit); @@ -2009,6 +2010,7 @@ static u64 syslog_seq; static u32 syslog_idx; static u64 console_seq; static u32 console_idx; +static u64 exclusive_console_stop_seq; static u64 log_first_seq; static u32 log_first_idx; static u64 log_next_seq; @@ -2351,8 +2353,9 @@ again: printk_safe_enter_irqsave(flags); raw_spin_lock(&logbuf_lock); if (console_seq < log_first_seq) { - len = sprintf(text, "** %u printk messages dropped **\n", - (unsigned)(log_first_seq - console_seq)); + len = sprintf(text, + "** %llu printk messages dropped **\n", + log_first_seq - console_seq); /* messages are gone, move to first one */ console_seq = log_first_seq; @@ -2376,6 +2379,12 @@ skip: goto skip; } + /* Output to all consoles once old messages replayed. */ + if (unlikely(exclusive_console && + console_seq >= exclusive_console_stop_seq)) { + exclusive_console = NULL; + } + len += msg_print_text(msg, console_msg_format & MSG_FORMAT_SYSLOG, text + len, @@ -2418,10 +2427,6 @@ skip: console_locked = 0; - /* Release the exclusive_console once it is used */ - if (unlikely(exclusive_console)) - exclusive_console = NULL; - raw_spin_unlock(&logbuf_lock); up_console_sem(); @@ -2688,8 +2693,7 @@ void register_console(struct console *newcon) } if (newcon->flags & CON_EXTENDED) - if (!nr_ext_console_drivers++) - pr_info("printk: continuation disabled due to ext consoles, expect more fragments in /dev/kmsg\n"); + nr_ext_console_drivers++; if (newcon->flags & CON_PRINTBUFFER) { /* @@ -2699,13 +2703,18 @@ void register_console(struct console *newcon) logbuf_lock_irqsave(flags); console_seq = syslog_seq; console_idx = syslog_idx; - logbuf_unlock_irqrestore(flags); /* * We're about to replay the log buffer. Only do this to the * just-registered console to avoid excessive message spam to * the already-registered consoles. + * + * Set exclusive_console with disabled interrupts to reduce + * race window with eventual console_flush_on_panic() that + * ignores console_lock. */ exclusive_console = newcon; + exclusive_console_stop_seq = console_seq; + logbuf_unlock_irqrestore(flags); } console_unlock(); console_sysfs_notify(); diff --git a/kernel/profile.c b/kernel/profile.c index 9aa2a4445b0d..9c08a2c7cb1d 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -16,7 +16,7 @@ #include <linux/export.h> #include <linux/profile.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/notifier.h> #include <linux/mm.h> #include <linux/cpumask.h> diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7fe183404c38..21fb5a5662b5 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -29,3 +29,4 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o +obj-$(CONFIG_PSI) += psi.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fe0223121883..f12225f26b70 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -722,8 +722,10 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & ENQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & ENQUEUE_RESTORE)) + if (!(flags & ENQUEUE_RESTORE)) { sched_info_queued(rq, p); + psi_enqueue(p, flags & ENQUEUE_WAKEUP); + } p->sched_class->enqueue_task(rq, p, flags); } @@ -733,8 +735,10 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & DEQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & DEQUEUE_SAVE)) + if (!(flags & DEQUEUE_SAVE)) { sched_info_dequeued(rq, p); + psi_dequeue(p, flags & DEQUEUE_SLEEP); + } p->sched_class->dequeue_task(rq, p, flags); } @@ -2037,6 +2041,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; + psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } @@ -2876,6 +2881,18 @@ unsigned long long nr_context_switches(void) } /* + * Consumers of these two interfaces, like for example the cpuidle menu + * governor, are using nonsensical data. Preferring shallow idle state selection + * for a CPU that has IO-wait which might not even end up running the task when + * it does become runnable. + */ + +unsigned long nr_iowait_cpu(int cpu) +{ + return atomic_read(&cpu_rq(cpu)->nr_iowait); +} + +/* * IO-wait accounting, and how its mostly bollocks (on SMP). * * The idea behind IO-wait account is to account the idle time that we could @@ -2910,31 +2927,11 @@ unsigned long nr_iowait(void) unsigned long i, sum = 0; for_each_possible_cpu(i) - sum += atomic_read(&cpu_rq(i)->nr_iowait); + sum += nr_iowait_cpu(i); return sum; } -/* - * Consumers of these two interfaces, like for example the cpuidle menu - * governor, are using nonsensical data. Preferring shallow idle state selection - * for a CPU that has IO-wait which might not even end up running the task when - * it does become runnable. - */ - -unsigned long nr_iowait_cpu(int cpu) -{ - struct rq *this = cpu_rq(cpu); - return atomic_read(&this->nr_iowait); -} - -void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) -{ - struct rq *rq = this_rq(); - *nr_waiters = atomic_read(&rq->nr_iowait); - *load = rq->load.weight; -} - #ifdef CONFIG_SMP /* @@ -3051,6 +3048,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); cpu_load_update_active(rq); calc_global_load_tick(rq); + psi_task_tick(rq); rq_unlock(rq, &rf); @@ -4933,9 +4931,7 @@ static void do_sched_yield(void) struct rq_flags rf; struct rq *rq; - local_irq_disable(); - rq = this_rq(); - rq_lock(rq, &rf); + rq = this_rq_lock_irq(&rf); schedstat_inc(rq->yld_count); current->sched_class->yield_task(rq); @@ -5244,7 +5240,7 @@ out_unlock: * an error code. */ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, - struct timespec __user *, interval) + struct __kernel_timespec __user *, interval) { struct timespec64 t; int retval = sched_rr_get_interval(pid, &t); @@ -5255,16 +5251,16 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, return retval; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, compat_pid_t, pid, - struct compat_timespec __user *, interval) + struct old_timespec32 __user *, interval) { struct timespec64 t; int retval = sched_rr_get_interval(pid, &t); if (retval == 0) - retval = compat_put_timespec64(&t, interval); + retval = put_old_timespec32(&t, interval); return retval; } #endif @@ -6069,6 +6065,8 @@ void __init sched_init(void) init_schedstats(); + psi_init(); + scheduler_running = 1; } diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index a171c1258109..28a516575c18 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -91,19 +91,73 @@ long calc_load_fold_active(struct rq *this_rq, long adjust) return delta; } -/* - * a1 = a0 * e + a * (1 - e) +/** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. */ static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) { - unsigned long newload; + unsigned long result = 1UL << frac_bits; + + if (n) { + for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + } - newload = load * exp + active * (FIXED_1 - exp); - if (active >= load) - newload += FIXED_1-1; + return result; +} - return newload / FIXED_1; +/* + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + * = (a0 * e + a * (1 - e)) * e + a * (1 - e) + * = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + * ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] + * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + * = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + * n 1 - x^(n+1) + * S_n := \Sum x^i = ------------- + * i=0 1 - x + */ +unsigned long +calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n) +{ + return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); } #ifdef CONFIG_NO_HZ_COMMON @@ -225,75 +279,6 @@ static long calc_load_nohz_fold(void) return delta; } -/** - * fixed_power_int - compute: x^n, in O(log n) time - * - * @x: base of the power - * @frac_bits: fractional bits of @x - * @n: power to raise @x to. - * - * By exploiting the relation between the definition of the natural power - * function: x^n := x*x*...*x (x multiplied by itself for n times), and - * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, - * (where: n_i \elem {0, 1}, the binary vector representing n), - * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is - * of course trivially computable in O(log_2 n), the length of our binary - * vector. - */ -static unsigned long -fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) -{ - unsigned long result = 1UL << frac_bits; - - if (n) { - for (;;) { - if (n & 1) { - result *= x; - result += 1UL << (frac_bits - 1); - result >>= frac_bits; - } - n >>= 1; - if (!n) - break; - x *= x; - x += 1UL << (frac_bits - 1); - x >>= frac_bits; - } - } - - return result; -} - -/* - * a1 = a0 * e + a * (1 - e) - * - * a2 = a1 * e + a * (1 - e) - * = (a0 * e + a * (1 - e)) * e + a * (1 - e) - * = a0 * e^2 + a * (1 - e) * (1 + e) - * - * a3 = a2 * e + a * (1 - e) - * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) - * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) - * - * ... - * - * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] - * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) - * = a0 * e^n + a * (1 - e^n) - * - * [1] application of the geometric series: - * - * n 1 - x^(n+1) - * S_n := \Sum x^i = ------------- - * i=0 1 - x - */ -static unsigned long -calc_load_n(unsigned long load, unsigned long exp, - unsigned long active, unsigned int n) -{ - return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); -} - /* * NO_HZ can leave us missing all per-CPU ticks calling * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c new file mode 100644 index 000000000000..7cdecfc010af --- /dev/null +++ b/kernel/sched/psi.c @@ -0,0 +1,759 @@ +/* + * Pressure stall information for CPU, memory and IO + * + * Copyright (c) 2018 Facebook, Inc. + * Author: Johannes Weiner <hannes@cmpxchg.org> + * + * When CPU, memory and IO are contended, tasks experience delays that + * reduce throughput and introduce latencies into the workload. Memory + * and IO contention, in addition, can cause a full loss of forward + * progress in which the CPU goes idle. + * + * This code aggregates individual task delays into resource pressure + * metrics that indicate problems with both workload health and + * resource utilization. + * + * Model + * + * The time in which a task can execute on a CPU is our baseline for + * productivity. Pressure expresses the amount of time in which this + * potential cannot be realized due to resource contention. + * + * This concept of productivity has two components: the workload and + * the CPU. To measure the impact of pressure on both, we define two + * contention states for a resource: SOME and FULL. + * + * In the SOME state of a given resource, one or more tasks are + * delayed on that resource. This affects the workload's ability to + * perform work, but the CPU may still be executing other tasks. + * + * In the FULL state of a given resource, all non-idle tasks are + * delayed on that resource such that nobody is advancing and the CPU + * goes idle. This leaves both workload and CPU unproductive. + * + * (Naturally, the FULL state doesn't exist for the CPU resource.) + * + * SOME = nr_delayed_tasks != 0 + * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0 + * + * The percentage of wallclock time spent in those compound stall + * states gives pressure numbers between 0 and 100 for each resource, + * where the SOME percentage indicates workload slowdowns and the FULL + * percentage indicates reduced CPU utilization: + * + * %SOME = time(SOME) / period + * %FULL = time(FULL) / period + * + * Multiple CPUs + * + * The more tasks and available CPUs there are, the more work can be + * performed concurrently. This means that the potential that can go + * unrealized due to resource contention *also* scales with non-idle + * tasks and CPUs. + * + * Consider a scenario where 257 number crunching tasks are trying to + * run concurrently on 256 CPUs. If we simply aggregated the task + * states, we would have to conclude a CPU SOME pressure number of + * 100%, since *somebody* is waiting on a runqueue at all + * times. However, that is clearly not the amount of contention the + * workload is experiencing: only one out of 256 possible exceution + * threads will be contended at any given time, or about 0.4%. + * + * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any + * given time *one* of the tasks is delayed due to a lack of memory. + * Again, looking purely at the task state would yield a memory FULL + * pressure number of 0%, since *somebody* is always making forward + * progress. But again this wouldn't capture the amount of execution + * potential lost, which is 1 out of 4 CPUs, or 25%. + * + * To calculate wasted potential (pressure) with multiple processors, + * we have to base our calculation on the number of non-idle tasks in + * conjunction with the number of available CPUs, which is the number + * of potential execution threads. SOME becomes then the proportion of + * delayed tasks to possibe threads, and FULL is the share of possible + * threads that are unproductive due to delays: + * + * threads = min(nr_nonidle_tasks, nr_cpus) + * SOME = min(nr_delayed_tasks / threads, 1) + * FULL = (threads - min(nr_running_tasks, threads)) / threads + * + * For the 257 number crunchers on 256 CPUs, this yields: + * + * threads = min(257, 256) + * SOME = min(1 / 256, 1) = 0.4% + * FULL = (256 - min(257, 256)) / 256 = 0% + * + * For the 1 out of 4 memory-delayed tasks, this yields: + * + * threads = min(4, 4) + * SOME = min(1 / 4, 1) = 25% + * FULL = (4 - min(3, 4)) / 4 = 25% + * + * [ Substitute nr_cpus with 1, and you can see that it's a natural + * extension of the single-CPU model. ] + * + * Implementation + * + * To assess the precise time spent in each such state, we would have + * to freeze the system on task changes and start/stop the state + * clocks accordingly. Obviously that doesn't scale in practice. + * + * Because the scheduler aims to distribute the compute load evenly + * among the available CPUs, we can track task state locally to each + * CPU and, at much lower frequency, extrapolate the global state for + * the cumulative stall times and the running averages. + * + * For each runqueue, we track: + * + * tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0) + * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu]) + * tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0) + * + * and then periodically aggregate: + * + * tNONIDLE = sum(tNONIDLE[i]) + * + * tSOME = sum(tSOME[i] * tNONIDLE[i]) / tNONIDLE + * tFULL = sum(tFULL[i] * tNONIDLE[i]) / tNONIDLE + * + * %SOME = tSOME / period + * %FULL = tFULL / period + * + * This gives us an approximation of pressure that is practical + * cost-wise, yet way more sensitive and accurate than periodic + * sampling of the aggregate task states would be. + */ + +#include <linux/sched/loadavg.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> +#include <linux/seqlock.h> +#include <linux/cgroup.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/psi.h> +#include "sched.h" + +static int psi_bug __read_mostly; + +bool psi_disabled __read_mostly; +core_param(psi_disabled, psi_disabled, bool, 0644); + +/* Running averages - we need to be higher-res than loadavg */ +#define PSI_FREQ (2*HZ+1) /* 2 sec intervals */ +#define EXP_10s 1677 /* 1/exp(2s/10s) as fixed-point */ +#define EXP_60s 1981 /* 1/exp(2s/60s) */ +#define EXP_300s 2034 /* 1/exp(2s/300s) */ + +/* Sampling frequency in nanoseconds */ +static u64 psi_period __read_mostly; + +/* System-level pressure and stall tracking */ +static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu); +static struct psi_group psi_system = { + .pcpu = &system_group_pcpu, +}; + +static void psi_update_work(struct work_struct *work); + +static void group_init(struct psi_group *group) +{ + int cpu; + + for_each_possible_cpu(cpu) + seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); + group->next_update = sched_clock() + psi_period; + INIT_DELAYED_WORK(&group->clock_work, psi_update_work); + mutex_init(&group->stat_lock); +} + +void __init psi_init(void) +{ + if (psi_disabled) + return; + + psi_period = jiffies_to_nsecs(PSI_FREQ); + group_init(&psi_system); +} + +static bool test_state(unsigned int *tasks, enum psi_states state) +{ + switch (state) { + case PSI_IO_SOME: + return tasks[NR_IOWAIT]; + case PSI_IO_FULL: + return tasks[NR_IOWAIT] && !tasks[NR_RUNNING]; + case PSI_MEM_SOME: + return tasks[NR_MEMSTALL]; + case PSI_MEM_FULL: + return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]; + case PSI_CPU_SOME: + return tasks[NR_RUNNING] > 1; + case PSI_NONIDLE: + return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || + tasks[NR_RUNNING]; + default: + return false; + } +} + +static void get_recent_times(struct psi_group *group, int cpu, u32 *times) +{ + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + unsigned int tasks[NR_PSI_TASK_COUNTS]; + u64 now, state_start; + unsigned int seq; + int s; + + /* Snapshot a coherent view of the CPU state */ + do { + seq = read_seqcount_begin(&groupc->seq); + now = cpu_clock(cpu); + memcpy(times, groupc->times, sizeof(groupc->times)); + memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); + state_start = groupc->state_start; + } while (read_seqcount_retry(&groupc->seq, seq)); + + /* Calculate state time deltas against the previous snapshot */ + for (s = 0; s < NR_PSI_STATES; s++) { + u32 delta; + /* + * In addition to already concluded states, we also + * incorporate currently active states on the CPU, + * since states may last for many sampling periods. + * + * This way we keep our delta sampling buckets small + * (u32) and our reported pressure close to what's + * actually happening. + */ + if (test_state(tasks, s)) + times[s] += now - state_start; + + delta = times[s] - groupc->times_prev[s]; + groupc->times_prev[s] = times[s]; + + times[s] = delta; + } +} + +static void calc_avgs(unsigned long avg[3], int missed_periods, + u64 time, u64 period) +{ + unsigned long pct; + + /* Fill in zeroes for periods of no activity */ + if (missed_periods) { + avg[0] = calc_load_n(avg[0], EXP_10s, 0, missed_periods); + avg[1] = calc_load_n(avg[1], EXP_60s, 0, missed_periods); + avg[2] = calc_load_n(avg[2], EXP_300s, 0, missed_periods); + } + + /* Sample the most recent active period */ + pct = div_u64(time * 100, period); + pct *= FIXED_1; + avg[0] = calc_load(avg[0], EXP_10s, pct); + avg[1] = calc_load(avg[1], EXP_60s, pct); + avg[2] = calc_load(avg[2], EXP_300s, pct); +} + +static bool update_stats(struct psi_group *group) +{ + u64 deltas[NR_PSI_STATES - 1] = { 0, }; + unsigned long missed_periods = 0; + unsigned long nonidle_total = 0; + u64 now, expires, period; + int cpu; + int s; + + mutex_lock(&group->stat_lock); + + /* + * Collect the per-cpu time buckets and average them into a + * single time sample that is normalized to wallclock time. + * + * For averaging, each CPU is weighted by its non-idle time in + * the sampling period. This eliminates artifacts from uneven + * loading, or even entirely idle CPUs. + */ + for_each_possible_cpu(cpu) { + u32 times[NR_PSI_STATES]; + u32 nonidle; + + get_recent_times(group, cpu, times); + + nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]); + nonidle_total += nonidle; + + for (s = 0; s < PSI_NONIDLE; s++) + deltas[s] += (u64)times[s] * nonidle; + } + + /* + * Integrate the sample into the running statistics that are + * reported to userspace: the cumulative stall times and the + * decaying averages. + * + * Pressure percentages are sampled at PSI_FREQ. We might be + * called more often when the user polls more frequently than + * that; we might be called less often when there is no task + * activity, thus no data, and clock ticks are sporadic. The + * below handles both. + */ + + /* total= */ + for (s = 0; s < NR_PSI_STATES - 1; s++) + group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); + + /* avgX= */ + now = sched_clock(); + expires = group->next_update; + if (now < expires) + goto out; + if (now - expires > psi_period) + missed_periods = div_u64(now - expires, psi_period); + + /* + * The periodic clock tick can get delayed for various + * reasons, especially on loaded systems. To avoid clock + * drift, we schedule the clock in fixed psi_period intervals. + * But the deltas we sample out of the per-cpu buckets above + * are based on the actual time elapsing between clock ticks. + */ + group->next_update = expires + ((1 + missed_periods) * psi_period); + period = now - (group->last_update + (missed_periods * psi_period)); + group->last_update = now; + + for (s = 0; s < NR_PSI_STATES - 1; s++) { + u32 sample; + + sample = group->total[s] - group->total_prev[s]; + /* + * Due to the lockless sampling of the time buckets, + * recorded time deltas can slip into the next period, + * which under full pressure can result in samples in + * excess of the period length. + * + * We don't want to report non-sensical pressures in + * excess of 100%, nor do we want to drop such events + * on the floor. Instead we punt any overage into the + * future until pressure subsides. By doing this we + * don't underreport the occurring pressure curve, we + * just report it delayed by one period length. + * + * The error isn't cumulative. As soon as another + * delta slips from a period P to P+1, by definition + * it frees up its time T in P. + */ + if (sample > period) + sample = period; + group->total_prev[s] += sample; + calc_avgs(group->avg[s], missed_periods, sample, period); + } +out: + mutex_unlock(&group->stat_lock); + return nonidle_total; +} + +static void psi_update_work(struct work_struct *work) +{ + struct delayed_work *dwork; + struct psi_group *group; + bool nonidle; + + dwork = to_delayed_work(work); + group = container_of(dwork, struct psi_group, clock_work); + + /* + * If there is task activity, periodically fold the per-cpu + * times and feed samples into the running averages. If things + * are idle and there is no data to process, stop the clock. + * Once restarted, we'll catch up the running averages in one + * go - see calc_avgs() and missed_periods. + */ + + nonidle = update_stats(group); + + if (nonidle) { + unsigned long delay = 0; + u64 now; + + now = sched_clock(); + if (group->next_update > now) + delay = nsecs_to_jiffies(group->next_update - now) + 1; + schedule_delayed_work(dwork, delay); + } +} + +static void record_times(struct psi_group_cpu *groupc, int cpu, + bool memstall_tick) +{ + u32 delta; + u64 now; + + now = cpu_clock(cpu); + delta = now - groupc->state_start; + groupc->state_start = now; + + if (test_state(groupc->tasks, PSI_IO_SOME)) { + groupc->times[PSI_IO_SOME] += delta; + if (test_state(groupc->tasks, PSI_IO_FULL)) + groupc->times[PSI_IO_FULL] += delta; + } + + if (test_state(groupc->tasks, PSI_MEM_SOME)) { + groupc->times[PSI_MEM_SOME] += delta; + if (test_state(groupc->tasks, PSI_MEM_FULL)) + groupc->times[PSI_MEM_FULL] += delta; + else if (memstall_tick) { + u32 sample; + /* + * Since we care about lost potential, a + * memstall is FULL when there are no other + * working tasks, but also when the CPU is + * actively reclaiming and nothing productive + * could run even if it were runnable. + * + * When the timer tick sees a reclaiming CPU, + * regardless of runnable tasks, sample a FULL + * tick (or less if it hasn't been a full tick + * since the last state change). + */ + sample = min(delta, (u32)jiffies_to_nsecs(1)); + groupc->times[PSI_MEM_FULL] += sample; + } + } + + if (test_state(groupc->tasks, PSI_CPU_SOME)) + groupc->times[PSI_CPU_SOME] += delta; + + if (test_state(groupc->tasks, PSI_NONIDLE)) + groupc->times[PSI_NONIDLE] += delta; +} + +static void psi_group_change(struct psi_group *group, int cpu, + unsigned int clear, unsigned int set) +{ + struct psi_group_cpu *groupc; + unsigned int t, m; + + groupc = per_cpu_ptr(group->pcpu, cpu); + + /* + * First we assess the aggregate resource states this CPU's + * tasks have been in since the last change, and account any + * SOME and FULL time these may have resulted in. + * + * Then we update the task counts according to the state + * change requested through the @clear and @set bits. + */ + write_seqcount_begin(&groupc->seq); + + record_times(groupc, cpu, false); + + for (t = 0, m = clear; m; m &= ~(1 << t), t++) { + if (!(m & (1 << t))) + continue; + if (groupc->tasks[t] == 0 && !psi_bug) { + printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n", + cpu, t, groupc->tasks[0], + groupc->tasks[1], groupc->tasks[2], + clear, set); + psi_bug = 1; + } + groupc->tasks[t]--; + } + + for (t = 0; set; set &= ~(1 << t), t++) + if (set & (1 << t)) + groupc->tasks[t]++; + + write_seqcount_end(&groupc->seq); + + if (!delayed_work_pending(&group->clock_work)) + schedule_delayed_work(&group->clock_work, PSI_FREQ); +} + +static struct psi_group *iterate_groups(struct task_struct *task, void **iter) +{ +#ifdef CONFIG_CGROUPS + struct cgroup *cgroup = NULL; + + if (!*iter) + cgroup = task->cgroups->dfl_cgrp; + else if (*iter == &psi_system) + return NULL; + else + cgroup = cgroup_parent(*iter); + + if (cgroup && cgroup_parent(cgroup)) { + *iter = cgroup; + return cgroup_psi(cgroup); + } +#else + if (*iter) + return NULL; +#endif + *iter = &psi_system; + return &psi_system; +} + +void psi_task_change(struct task_struct *task, int clear, int set) +{ + int cpu = task_cpu(task); + struct psi_group *group; + void *iter = NULL; + + if (!task->pid) + return; + + if (((task->psi_flags & set) || + (task->psi_flags & clear) != clear) && + !psi_bug) { + printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n", + task->pid, task->comm, cpu, + task->psi_flags, clear, set); + psi_bug = 1; + } + + task->psi_flags &= ~clear; + task->psi_flags |= set; + + while ((group = iterate_groups(task, &iter))) + psi_group_change(group, cpu, clear, set); +} + +void psi_memstall_tick(struct task_struct *task, int cpu) +{ + struct psi_group *group; + void *iter = NULL; + + while ((group = iterate_groups(task, &iter))) { + struct psi_group_cpu *groupc; + + groupc = per_cpu_ptr(group->pcpu, cpu); + write_seqcount_begin(&groupc->seq); + record_times(groupc, cpu, true); + write_seqcount_end(&groupc->seq); + } +} + +/** + * psi_memstall_enter - mark the beginning of a memory stall section + * @flags: flags to handle nested sections + * + * Marks the calling task as being stalled due to a lack of memory, + * such as waiting for a refault or performing reclaim. + */ +void psi_memstall_enter(unsigned long *flags) +{ + struct rq_flags rf; + struct rq *rq; + + if (psi_disabled) + return; + + *flags = current->flags & PF_MEMSTALL; + if (*flags) + return; + /* + * PF_MEMSTALL setting & accounting needs to be atomic wrt + * changes to the task's scheduling state, otherwise we can + * race with CPU migration. + */ + rq = this_rq_lock_irq(&rf); + + current->flags |= PF_MEMSTALL; + psi_task_change(current, 0, TSK_MEMSTALL); + + rq_unlock_irq(rq, &rf); +} + +/** + * psi_memstall_leave - mark the end of an memory stall section + * @flags: flags to handle nested memdelay sections + * + * Marks the calling task as no longer stalled due to lack of memory. + */ +void psi_memstall_leave(unsigned long *flags) +{ + struct rq_flags rf; + struct rq *rq; + + if (psi_disabled) + return; + + if (*flags) + return; + /* + * PF_MEMSTALL clearing & accounting needs to be atomic wrt + * changes to the task's scheduling state, otherwise we could + * race with CPU migration. + */ + rq = this_rq_lock_irq(&rf); + + current->flags &= ~PF_MEMSTALL; + psi_task_change(current, TSK_MEMSTALL, 0); + + rq_unlock_irq(rq, &rf); +} + +#ifdef CONFIG_CGROUPS +int psi_cgroup_alloc(struct cgroup *cgroup) +{ + if (psi_disabled) + return 0; + + cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); + if (!cgroup->psi.pcpu) + return -ENOMEM; + group_init(&cgroup->psi); + return 0; +} + +void psi_cgroup_free(struct cgroup *cgroup) +{ + if (psi_disabled) + return; + + cancel_delayed_work_sync(&cgroup->psi.clock_work); + free_percpu(cgroup->psi.pcpu); +} + +/** + * cgroup_move_task - move task to a different cgroup + * @task: the task + * @to: the target css_set + * + * Move task to a new cgroup and safely migrate its associated stall + * state between the different groups. + * + * This function acquires the task's rq lock to lock out concurrent + * changes to the task's scheduling state and - in case the task is + * running - concurrent changes to its stall state. + */ +void cgroup_move_task(struct task_struct *task, struct css_set *to) +{ + bool move_psi = !psi_disabled; + unsigned int task_flags = 0; + struct rq_flags rf; + struct rq *rq; + + if (move_psi) { + rq = task_rq_lock(task, &rf); + + if (task_on_rq_queued(task)) + task_flags = TSK_RUNNING; + else if (task->in_iowait) + task_flags = TSK_IOWAIT; + + if (task->flags & PF_MEMSTALL) + task_flags |= TSK_MEMSTALL; + + if (task_flags) + psi_task_change(task, task_flags, 0); + } + + /* + * Lame to do this here, but the scheduler cannot be locked + * from the outside, so we move cgroups from inside sched/. + */ + rcu_assign_pointer(task->cgroups, to); + + if (move_psi) { + if (task_flags) + psi_task_change(task, 0, task_flags); + + task_rq_unlock(rq, task, &rf); + } +} +#endif /* CONFIG_CGROUPS */ + +int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) +{ + int full; + + if (psi_disabled) + return -EOPNOTSUPP; + + update_stats(group); + + for (full = 0; full < 2 - (res == PSI_CPU); full++) { + unsigned long avg[3]; + u64 total; + int w; + + for (w = 0; w < 3; w++) + avg[w] = group->avg[res * 2 + full][w]; + total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC); + + seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", + full ? "full" : "some", + LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), + LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), + LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), + total); + } + + return 0; +} + +static int psi_io_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IO); +} + +static int psi_memory_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_MEM); +} + +static int psi_cpu_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_CPU); +} + +static int psi_io_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_io_show, NULL); +} + +static int psi_memory_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_memory_show, NULL); +} + +static int psi_cpu_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_cpu_show, NULL); +} + +static const struct file_operations psi_io_fops = { + .open = psi_io_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations psi_memory_fops = { + .open = psi_memory_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations psi_cpu_fops = { + .open = psi_cpu_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init psi_proc_init(void) +{ + proc_mkdir("pressure", NULL); + proc_create("pressure/io", 0, NULL, &psi_io_fops); + proc_create("pressure/memory", 0, NULL, &psi_memory_fops); + proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops); + return 0; +} +module_init(psi_proc_init); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b8c007713b3b..618577fc9aa8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -54,6 +54,7 @@ #include <linux/proc_fs.h> #include <linux/prefetch.h> #include <linux/profile.h> +#include <linux/psi.h> #include <linux/rcupdate_wait.h> #include <linux/security.h> #include <linux/stop_machine.h> @@ -319,6 +320,7 @@ extern bool dl_cpu_busy(unsigned int cpu); #ifdef CONFIG_CGROUP_SCHED #include <linux/cgroup.h> +#include <linux/psi.h> struct cfs_rq; struct rt_rq; @@ -957,6 +959,8 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) +extern void update_rq_clock(struct rq *rq); + static inline u64 __rq_clock_broken(struct rq *rq) { return READ_ONCE(rq->clock); @@ -1075,6 +1079,98 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) #endif } +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(rq->lock); + +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(p->pi_lock) + __acquires(rq->lock); + +static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); +} + +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) + __releases(rq->lock) + __releases(p->pi_lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); +} + +static inline void +rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irqsave(&rq->lock, rf->flags); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock_irq(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irq(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_relock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_repin_lock(rq, rf); +} + +static inline void +rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irqrestore(&rq->lock, rf->flags); +} + +static inline void +rq_unlock_irq(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irq(&rq->lock); +} + +static inline void +rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); +} + +static inline struct rq * +this_rq_lock_irq(struct rq_flags *rf) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + rq_lock(rq, rf); + return rq; +} + #ifdef CONFIG_NUMA enum numa_topology_type { NUMA_DIRECT, @@ -1717,8 +1813,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) sched_update_tick_dependency(rq); } -extern void update_rq_clock(struct rq *rq); - extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); @@ -1783,86 +1877,6 @@ unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) #endif #endif -struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) - __acquires(rq->lock); - -struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) - __acquires(p->pi_lock) - __acquires(rq->lock); - -static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); -} - -static inline void -task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) - __releases(rq->lock) - __releases(p->pi_lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -} - -static inline void -rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock_irqsave(&rq->lock, rf->flags); - rq_pin_lock(rq, rf); -} - -static inline void -rq_lock_irq(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock_irq(&rq->lock); - rq_pin_lock(rq, rf); -} - -static inline void -rq_lock(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock(&rq->lock); - rq_pin_lock(rq, rf); -} - -static inline void -rq_relock(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock(&rq->lock); - rq_repin_lock(rq, rf); -} - -static inline void -rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock_irqrestore(&rq->lock, rf->flags); -} - -static inline void -rq_unlock_irq(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock_irq(&rq->lock); -} - -static inline void -rq_unlock(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); -} - #ifdef CONFIG_SMP #ifdef CONFIG_PREEMPT diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 8aea199a39b4..4904c4677000 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -55,6 +55,92 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt # define schedstat_val_or_zero(var) 0 #endif /* CONFIG_SCHEDSTATS */ +#ifdef CONFIG_PSI +/* + * PSI tracks state that persists across sleeps, such as iowaits and + * memory stalls. As a result, it has to distinguish between sleeps, + * where a task's runnable state changes, and requeues, where a task + * and its state are being moved between CPUs and runqueues. + */ +static inline void psi_enqueue(struct task_struct *p, bool wakeup) +{ + int clear = 0, set = TSK_RUNNING; + + if (psi_disabled) + return; + + if (!wakeup || p->sched_psi_wake_requeue) { + if (p->flags & PF_MEMSTALL) + set |= TSK_MEMSTALL; + if (p->sched_psi_wake_requeue) + p->sched_psi_wake_requeue = 0; + } else { + if (p->in_iowait) + clear |= TSK_IOWAIT; + } + + psi_task_change(p, clear, set); +} + +static inline void psi_dequeue(struct task_struct *p, bool sleep) +{ + int clear = TSK_RUNNING, set = 0; + + if (psi_disabled) + return; + + if (!sleep) { + if (p->flags & PF_MEMSTALL) + clear |= TSK_MEMSTALL; + } else { + if (p->in_iowait) + set |= TSK_IOWAIT; + } + + psi_task_change(p, clear, set); +} + +static inline void psi_ttwu_dequeue(struct task_struct *p) +{ + if (psi_disabled) + return; + /* + * Is the task being migrated during a wakeup? Make sure to + * deregister its sleep-persistent psi states from the old + * queue, and let psi_enqueue() know it has to requeue. + */ + if (unlikely(p->in_iowait || (p->flags & PF_MEMSTALL))) { + struct rq_flags rf; + struct rq *rq; + int clear = 0; + + if (p->in_iowait) + clear |= TSK_IOWAIT; + if (p->flags & PF_MEMSTALL) + clear |= TSK_MEMSTALL; + + rq = __task_rq_lock(p, &rf); + psi_task_change(p, clear, 0); + p->sched_psi_wake_requeue = 1; + __task_rq_unlock(rq, &rf); + } +} + +static inline void psi_task_tick(struct rq *rq) +{ + if (psi_disabled) + return; + + if (unlikely(rq->curr->flags & PF_MEMSTALL)) + psi_memstall_tick(rq->curr, cpu_of(rq)); +} +#else /* CONFIG_PSI */ +static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} +static inline void psi_dequeue(struct task_struct *p, bool sleep) {} +static inline void psi_ttwu_dequeue(struct task_struct *p) {} +static inline void psi_task_tick(struct rq *rq) {} +#endif /* CONFIG_PSI */ + #ifdef CONFIG_SCHED_INFO static inline void sched_info_reset_dequeued(struct task_struct *t) { diff --git a/kernel/signal.c b/kernel/signal.c index dbd2e4db24cf..9a32bc2088c9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -892,7 +892,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) /* * The first thread which returns from do_signal_stop() * will take ->siglock, notice SIGNAL_CLD_MASK, and - * notify its parent. See get_signal_to_deliver(). + * notify its parent. See get_signal(). */ signal_set_stop_flags(signal, why | SIGNAL_STOP_CONTINUED); signal->group_stop_count = 0; @@ -3165,7 +3165,7 @@ int copy_siginfo_from_user32(struct kernel_siginfo *to, * @ts: upper bound on process time suspension */ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, - const struct timespec *ts) + const struct timespec64 *ts) { ktime_t *to = NULL, timeout = KTIME_MAX; struct task_struct *tsk = current; @@ -3173,9 +3173,9 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, int sig, ret = 0; if (ts) { - if (!timespec_valid(ts)) + if (!timespec64_valid(ts)) return -EINVAL; - timeout = timespec_to_ktime(*ts); + timeout = timespec64_to_ktime(*ts); to = &timeout; } @@ -3223,11 +3223,12 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, * @sigsetsize: size of sigset_t type */ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, - siginfo_t __user *, uinfo, const struct timespec __user *, uts, + siginfo_t __user *, uinfo, + const struct __kernel_timespec __user *, uts, size_t, sigsetsize) { sigset_t these; - struct timespec ts; + struct timespec64 ts; kernel_siginfo_t info; int ret; @@ -3239,7 +3240,7 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, return -EFAULT; if (uts) { - if (copy_from_user(&ts, uts, sizeof(ts))) + if (get_timespec64(&ts, uts)) return -EFAULT; } @@ -3256,10 +3257,10 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, struct compat_siginfo __user *, uinfo, - struct compat_timespec __user *, uts, compat_size_t, sigsetsize) + struct old_timespec32 __user *, uts, compat_size_t, sigsetsize) { sigset_t s; - struct timespec t; + struct timespec64 t; kernel_siginfo_t info; long ret; @@ -3270,7 +3271,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, return -EFAULT; if (uts) { - if (compat_get_timespec(&t, uts)) + if (get_old_timespec32(&t, uts)) return -EFAULT; } diff --git a/kernel/softirq.c b/kernel/softirq.c index 7a0720a20003..d28813306b2c 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -257,9 +257,9 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) int softirq_bit; /* - * Mask out PF_MEMALLOC s current task context is borrowed for the - * softirq. A softirq handled such as network RX might set PF_MEMALLOC - * again if the socket is related to swap + * Mask out PF_MEMALLOC as the current task context is borrowed for the + * softirq. A softirq handled, such as network RX, might set PF_MEMALLOC + * again if the socket is related to swapping. */ current->flags &= ~PF_MEMALLOC; diff --git a/kernel/stackleak.c b/kernel/stackleak.c new file mode 100644 index 000000000000..e42892926244 --- /dev/null +++ b/kernel/stackleak.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This code fills the used part of the kernel stack with a poison value + * before returning to userspace. It's part of the STACKLEAK feature + * ported from grsecurity/PaX. + * + * Author: Alexander Popov <alex.popov@linux.com> + * + * STACKLEAK reduces the information which kernel stack leak bugs can + * reveal and blocks some uninitialized stack variable attacks. + */ + +#include <linux/stackleak.h> + +#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE +#include <linux/jump_label.h> +#include <linux/sysctl.h> + +static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass); + +int stack_erasing_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = 0; + int state = !static_branch_unlikely(&stack_erasing_bypass); + int prev_state = state; + + table->data = &state; + table->maxlen = sizeof(int); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + state = !!state; + if (ret || !write || state == prev_state) + return ret; + + if (state) + static_branch_disable(&stack_erasing_bypass); + else + static_branch_enable(&stack_erasing_bypass); + + pr_warn("stackleak: kernel stack erasing is %s\n", + state ? "enabled" : "disabled"); + return ret; +} + +#define skip_erasing() static_branch_unlikely(&stack_erasing_bypass) +#else +#define skip_erasing() false +#endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */ + +asmlinkage void stackleak_erase(void) +{ + /* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */ + unsigned long kstack_ptr = current->lowest_stack; + unsigned long boundary = (unsigned long)end_of_stack(current); + unsigned int poison_count = 0; + const unsigned int depth = STACKLEAK_SEARCH_DEPTH / sizeof(unsigned long); + + if (skip_erasing()) + return; + + /* Check that 'lowest_stack' value is sane */ + if (unlikely(kstack_ptr - boundary >= THREAD_SIZE)) + kstack_ptr = boundary; + + /* Search for the poison value in the kernel stack */ + while (kstack_ptr > boundary && poison_count <= depth) { + if (*(unsigned long *)kstack_ptr == STACKLEAK_POISON) + poison_count++; + else + poison_count = 0; + + kstack_ptr -= sizeof(unsigned long); + } + + /* + * One 'long int' at the bottom of the thread stack is reserved and + * should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK=y). + */ + if (kstack_ptr == boundary) + kstack_ptr += sizeof(unsigned long); + +#ifdef CONFIG_STACKLEAK_METRICS + current->prev_lowest_stack = kstack_ptr; +#endif + + /* + * Now write the poison value to the kernel stack. Start from + * 'kstack_ptr' and move up till the new 'boundary'. We assume that + * the stack pointer doesn't change when we write poison. + */ + if (on_thread_stack()) + boundary = current_stack_pointer; + else + boundary = current_top_of_stack(); + + while (kstack_ptr < boundary) { + *(unsigned long *)kstack_ptr = STACKLEAK_POISON; + kstack_ptr += sizeof(unsigned long); + } + + /* Reset the 'lowest_stack' value for the next syscall */ + current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64; +} + +void __used stackleak_track_stack(void) +{ + /* + * N.B. stackleak_erase() fills the kernel stack with the poison value, + * which has the register width. That code assumes that the value + * of 'lowest_stack' is aligned on the register width boundary. + * + * That is true for x86 and x86_64 because of the kernel stack + * alignment on these platforms (for details, see 'cc_stack_align' in + * arch/x86/Makefile). Take care of that when you port STACKLEAK to + * new platforms. + */ + unsigned long sp = (unsigned long)&sp; + + /* + * Having CONFIG_STACKLEAK_TRACK_MIN_SIZE larger than + * STACKLEAK_SEARCH_DEPTH makes the poison search in + * stackleak_erase() unreliable. Let's prevent that. + */ + BUILD_BUG_ON(CONFIG_STACKLEAK_TRACK_MIN_SIZE > STACKLEAK_SEARCH_DEPTH); + + if (sp < current->lowest_stack && + sp >= (unsigned long)task_stack_page(current) + + sizeof(unsigned long)) { + current->lowest_stack = sp; + } +} +EXPORT_SYMBOL(stackleak_track_stack); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cc02050fd0c4..3ae223f7b5df 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -91,7 +91,9 @@ #ifdef CONFIG_CHR_DEV_SG #include <scsi/sg.h> #endif - +#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE +#include <linux/stackleak.h> +#endif #ifdef CONFIG_LOCKUP_DETECTOR #include <linux/nmi.h> #endif @@ -1233,6 +1235,17 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif +#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE + { + .procname = "stack_erasing", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = stack_erasing_sysctl, + .extra1 = &zero, + .extra2 = &one, + }, +#endif { } }; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e1a549c9e399..9cdd74bd2d27 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1660,7 +1660,7 @@ int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) switch(restart->nanosleep.type) { #ifdef CONFIG_COMPAT_32BIT_TIME case TT_COMPAT: - if (compat_put_timespec64(ts, restart->nanosleep.compat_rmtp)) + if (put_old_timespec32(ts, restart->nanosleep.compat_rmtp)) return -EFAULT; break; #endif @@ -1780,12 +1780,12 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, - struct compat_timespec __user *, rmtp) +COMPAT_SYSCALL_DEFINE2(nanosleep, struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, rmtp) { struct timespec64 tu; - if (compat_get_timespec64(&tu, rqtp)) + if (get_old_timespec32(&tu, rqtp)) return -EFAULT; if (!timespec64_valid(&tu)) diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 2c6847d5d69b..989ccf028bde 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -162,20 +162,20 @@ COMPAT_SYS_NI(setitimer); #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { struct timespec64 new_tp; if (which_clock != CLOCK_REALTIME) return -EINVAL; - if (compat_get_timespec64(&new_tp, tp)) + if (get_old_timespec32(&new_tp, tp)) return -EFAULT; return do_sys_settimeofday64(&new_tp, NULL); } COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { int ret; struct timespec64 kernel_tp; @@ -184,13 +184,13 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, if (ret) return ret; - if (compat_put_timespec64(&kernel_tp, tp)) + if (put_old_timespec32(&kernel_tp, tp)) return -EFAULT; return 0; } COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { struct timespec64 rtn_tp = { .tv_sec = 0, @@ -201,7 +201,7 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, case CLOCK_REALTIME: case CLOCK_MONOTONIC: case CLOCK_BOOTTIME: - if (compat_put_timespec64(&rtn_tp, tp)) + if (put_old_timespec32(&rtn_tp, tp)) return -EFAULT; return 0; default: @@ -210,8 +210,8 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, } COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, - struct compat_timespec __user *, rqtp, - struct compat_timespec __user *, rmtp) + struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, rmtp) { struct timespec64 t; @@ -224,7 +224,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, return -EINVAL; } - if (compat_get_timespec64(&t, rqtp)) + if (get_old_timespec32(&t, rqtp)) return -EFAULT; if (!timespec64_valid(&t)) return -EINVAL; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index eabb4c22728d..bd62b5eeb5a0 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -755,13 +755,13 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, - struct compat_itimerspec __user *, setting) + struct old_itimerspec32 __user *, setting) { struct itimerspec64 cur_setting; int ret = do_timer_gettime(timer_id, &cur_setting); if (!ret) { - if (put_compat_itimerspec64(&cur_setting, setting)) + if (put_old_itimerspec32(&cur_setting, setting)) ret = -EFAULT; } return ret; @@ -928,8 +928,8 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, - struct compat_itimerspec __user *, new, - struct compat_itimerspec __user *, old) + struct old_itimerspec32 __user *, new, + struct old_itimerspec32 __user *, old) { struct itimerspec64 new_spec, old_spec; struct itimerspec64 *rtn = old ? &old_spec : NULL; @@ -937,12 +937,12 @@ COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, if (!new) return -EINVAL; - if (get_compat_itimerspec64(&new_spec, new)) + if (get_old_itimerspec32(&new_spec, new)) return -EFAULT; error = do_timer_settime(timer_id, flags, &new_spec, rtn); if (!error && old) { - if (put_compat_itimerspec64(&old_spec, old)) + if (put_old_itimerspec32(&old_spec, old)) error = -EFAULT; } return error; @@ -1115,7 +1115,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1123,14 +1123,14 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, if (!kc || !kc->clock_set) return -EINVAL; - if (compat_get_timespec64(&ts, tp)) + if (get_old_timespec32(&ts, tp)) return -EFAULT; return kc->clock_set(which_clock, &ts); } COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1141,7 +1141,7 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, err = kc->clock_get(which_clock, &ts); - if (!err && compat_put_timespec64(&ts, tp)) + if (!err && put_old_timespec32(&ts, tp)) err = -EFAULT; return err; @@ -1180,7 +1180,7 @@ COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1190,7 +1190,7 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, return -EINVAL; err = kc->clock_getres(which_clock, &ts); - if (!err && tp && compat_put_timespec64(&ts, tp)) + if (!err && tp && put_old_timespec32(&ts, tp)) return -EFAULT; return err; @@ -1237,8 +1237,8 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, - struct compat_timespec __user *, rqtp, - struct compat_timespec __user *, rmtp) + struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, rmtp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 t; @@ -1248,7 +1248,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, if (!kc->nsleep) return -EOPNOTSUPP; - if (compat_get_timespec64(&t, rqtp)) + if (get_old_timespec32(&t, rqtp)) return -EFAULT; if (!timespec64_valid(&t)) diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index aa2094d5dd27..be0aac2b4300 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -400,8 +400,6 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) if (tick_broadcast_forced) break; cpumask_clear_cpu(cpu, tick_broadcast_on); - if (!tick_device_is_functional(dev)) - break; if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 5b33e2f5c0ed..69e673b88474 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -885,7 +885,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (need_resched()) return false; - if (unlikely(local_softirq_pending() && cpu_online(cpu))) { + if (unlikely(local_softirq_pending())) { static int ratelimit; if (ratelimit < 10 && diff --git a/kernel/time/time.c b/kernel/time/time.c index ccdb351277ee..e3a7f7fd3abc 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -104,12 +104,12 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr) #ifdef CONFIG_COMPAT #ifdef __ARCH_WANT_COMPAT_SYS_TIME -/* compat_time_t is a 32 bit "long" and needs to get converted. */ -COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) +/* old_time32_t is a 32 bit "long" and needs to get converted. */ +COMPAT_SYSCALL_DEFINE1(time, old_time32_t __user *, tloc) { - compat_time_t i; + old_time32_t i; - i = (compat_time_t)ktime_get_real_seconds(); + i = (old_time32_t)ktime_get_real_seconds(); if (tloc) { if (put_user(i,tloc)) @@ -119,7 +119,7 @@ COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) return i; } -COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) +COMPAT_SYSCALL_DEFINE1(stime, old_time32_t __user *, tptr) { struct timespec64 tv; int err; @@ -144,9 +144,11 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, struct timezone __user *, tz) { if (likely(tv != NULL)) { - struct timeval ktv; - do_gettimeofday(&ktv); - if (copy_to_user(tv, &ktv, sizeof(ktv))) + struct timespec64 ts; + + ktime_get_real_ts64(&ts); + if (put_user(ts.tv_sec, &tv->tv_sec) || + put_user(ts.tv_nsec / 1000, &tv->tv_usec)) return -EFAULT; } if (unlikely(tz != NULL)) { @@ -223,14 +225,15 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, } #ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, +COMPAT_SYSCALL_DEFINE2(gettimeofday, struct old_timeval32 __user *, tv, struct timezone __user *, tz) { if (tv) { - struct timeval ktv; + struct timespec64 ts; - do_gettimeofday(&ktv); - if (compat_put_timeval(&ktv, tv)) + ktime_get_real_ts64(&ts); + if (put_user(ts.tv_sec, &tv->tv_sec) || + put_user(ts.tv_nsec / 1000, &tv->tv_usec)) return -EFAULT; } if (tz) { @@ -241,7 +244,7 @@ COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, return 0; } -COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, +COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, struct timezone __user *, tz) { struct timespec64 new_ts; @@ -342,30 +345,6 @@ unsigned int jiffies_to_usecs(const unsigned long j) } EXPORT_SYMBOL(jiffies_to_usecs); -/** - * timespec_trunc - Truncate timespec to a granularity - * @t: Timespec - * @gran: Granularity in ns. - * - * Truncate a timespec to a granularity. Always rounds down. gran must - * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns). - */ -struct timespec timespec_trunc(struct timespec t, unsigned gran) -{ - /* Avoid division in the common cases 1 ns and 1 s. */ - if (gran == 1) { - /* nothing */ - } else if (gran == NSEC_PER_SEC) { - t.tv_nsec = 0; - } else if (gran > 1 && gran < NSEC_PER_SEC) { - t.tv_nsec -= t.tv_nsec % gran; - } else { - WARN(1, "illegal file time granularity: %u", gran); - } - return t; -} -EXPORT_SYMBOL(timespec_trunc); - /* * mktime64 - Converts date to seconds. * Converts Gregorian date to seconds since 1970-01-01 00:00:00. @@ -884,10 +863,10 @@ int put_timespec64(const struct timespec64 *ts, } EXPORT_SYMBOL_GPL(put_timespec64); -int __compat_get_timespec64(struct timespec64 *ts64, - const struct compat_timespec __user *cts) +static int __get_old_timespec32(struct timespec64 *ts64, + const struct old_timespec32 __user *cts) { - struct compat_timespec ts; + struct old_timespec32 ts; int ret; ret = copy_from_user(&ts, cts, sizeof(ts)); @@ -900,33 +879,33 @@ int __compat_get_timespec64(struct timespec64 *ts64, return 0; } -int __compat_put_timespec64(const struct timespec64 *ts64, - struct compat_timespec __user *cts) +static int __put_old_timespec32(const struct timespec64 *ts64, + struct old_timespec32 __user *cts) { - struct compat_timespec ts = { + struct old_timespec32 ts = { .tv_sec = ts64->tv_sec, .tv_nsec = ts64->tv_nsec }; return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; } -int compat_get_timespec64(struct timespec64 *ts, const void __user *uts) +int get_old_timespec32(struct timespec64 *ts, const void __user *uts) { if (COMPAT_USE_64BIT_TIME) return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; else - return __compat_get_timespec64(ts, uts); + return __get_old_timespec32(ts, uts); } -EXPORT_SYMBOL_GPL(compat_get_timespec64); +EXPORT_SYMBOL_GPL(get_old_timespec32); -int compat_put_timespec64(const struct timespec64 *ts, void __user *uts) +int put_old_timespec32(const struct timespec64 *ts, void __user *uts) { if (COMPAT_USE_64BIT_TIME) return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; else - return __compat_put_timespec64(ts, uts); + return __put_old_timespec32(ts, uts); } -EXPORT_SYMBOL_GPL(compat_put_timespec64); +EXPORT_SYMBOL_GPL(put_old_timespec32); int get_itimerspec64(struct itimerspec64 *it, const struct __kernel_itimerspec __user *uit) @@ -958,23 +937,23 @@ int put_itimerspec64(const struct itimerspec64 *it, } EXPORT_SYMBOL_GPL(put_itimerspec64); -int get_compat_itimerspec64(struct itimerspec64 *its, - const struct compat_itimerspec __user *uits) +int get_old_itimerspec32(struct itimerspec64 *its, + const struct old_itimerspec32 __user *uits) { - if (__compat_get_timespec64(&its->it_interval, &uits->it_interval) || - __compat_get_timespec64(&its->it_value, &uits->it_value)) + if (__get_old_timespec32(&its->it_interval, &uits->it_interval) || + __get_old_timespec32(&its->it_value, &uits->it_value)) return -EFAULT; return 0; } -EXPORT_SYMBOL_GPL(get_compat_itimerspec64); +EXPORT_SYMBOL_GPL(get_old_itimerspec32); -int put_compat_itimerspec64(const struct itimerspec64 *its, - struct compat_itimerspec __user *uits) +int put_old_itimerspec32(const struct itimerspec64 *its, + struct old_itimerspec32 __user *uits) { - if (__compat_put_timespec64(&its->it_interval, &uits->it_interval) || - __compat_put_timespec64(&its->it_value, &uits->it_value)) + if (__put_old_timespec32(&its->it_interval, &uits->it_interval) || + __put_old_timespec32(&its->it_value, &uits->it_value)) return -EFAULT; return 0; } -EXPORT_SYMBOL_GPL(put_compat_itimerspec64); +EXPORT_SYMBOL_GPL(put_old_itimerspec32); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f3b22f456fac..2d110c948805 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1212,22 +1212,6 @@ int get_device_system_crosststamp(int (*get_time_fn) EXPORT_SYMBOL_GPL(get_device_system_crosststamp); /** - * do_gettimeofday - Returns the time of day in a timeval - * @tv: pointer to the timeval to be set - * - * NOTE: Users should be converted to using getnstimeofday() - */ -void do_gettimeofday(struct timeval *tv) -{ - struct timespec64 now; - - getnstimeofday64(&now); - tv->tv_sec = now.tv_sec; - tv->tv_usec = now.tv_nsec/1000; -} -EXPORT_SYMBOL(do_gettimeofday); - -/** * do_settimeofday64 - Sets the time of day. * @ts: pointer to the timespec64 variable containing the new time * @@ -2174,14 +2158,6 @@ void getboottime64(struct timespec64 *ts) } EXPORT_SYMBOL_GPL(getboottime64); -unsigned long get_seconds(void) -{ - struct timekeeper *tk = &tk_core.timekeeper; - - return tk->xtime_sec; -} -EXPORT_SYMBOL(get_seconds); - void ktime_get_coarse_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bf6f1d70484d..ff1c4b20cd0a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2727,6 +2727,7 @@ void trace_dump_stack(int skip) __ftrace_trace_stack(global_trace.trace_buffer.buffer, flags, skip, preempt_count(), NULL); } +EXPORT_SYMBOL_GPL(trace_dump_stack); static DEFINE_PER_CPU(int, user_stack_count); @@ -4621,13 +4622,18 @@ static const char readme_msg[] = "place (kretprobe): [<module>:]<symbol>[+<offset>]|<memaddr>\n" #endif #ifdef CONFIG_UPROBE_EVENTS - "\t place: <path>:<offset>\n" + " place (uprobe): <path>:<offset>[(ref_ctr_offset)]\n" #endif "\t args: <name>=fetcharg[:type]\n" "\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n" +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + "\t $stack<index>, $stack, $retval, $comm, $arg<N>\n" +#else "\t $stack<index>, $stack, $retval, $comm\n" - "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string,\n" - "\t b<bit-width>@<bit-offset>/<container-size>\n" +#endif + "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n" + "\t b<bit-width>@<bit-offset>/<container-size>,\n" + "\t <type>\\[<array-size>\\]\n" #endif " events/\t\t- Directory containing all trace event subsystems:\n" " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 69a3fe926e8c..76217bbef815 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -290,7 +290,8 @@ void perf_kprobe_destroy(struct perf_event *p_event) #endif /* CONFIG_KPROBE_EVENTS */ #ifdef CONFIG_UPROBE_EVENTS -int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe) +int perf_uprobe_init(struct perf_event *p_event, + unsigned long ref_ctr_offset, bool is_retprobe) { int ret; char *path = NULL; @@ -312,8 +313,8 @@ int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe) goto out; } - tp_event = create_local_trace_uprobe( - path, p_event->attr.probe_offset, is_retprobe); + tp_event = create_local_trace_uprobe(path, p_event->attr.probe_offset, + ref_ctr_offset, is_retprobe); if (IS_ERR(tp_event)) { ret = PTR_ERR(tp_event); goto out; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index d239004aaf29..eb908ef2ecec 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1063,8 +1063,10 @@ static int create_synth_event(int argc, char **argv) event = NULL; ret = -EEXIST; goto out; - } else if (delete_event) + } else if (delete_event) { + ret = -ENOENT; goto out; + } if (argc < 2) { ret = -EINVAL; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c30032367aab..fec67188c4d2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -14,6 +14,7 @@ #include "trace_kprobe_selftest.h" #include "trace_probe.h" +#include "trace_probe_tmpl.h" #define KPROBE_EVENT_SYSTEM "kprobes" #define KRETPROBE_MAXACTIVE_MAX 4096 @@ -61,9 +62,23 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, return strncmp(mod->name, name, len) == 0 && name[len] == ':'; } -static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk) +static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) { - return !!strchr(trace_kprobe_symbol(tk), ':'); + char *p; + bool ret; + + if (!tk->symbol) + return false; + p = strchr(tk->symbol, ':'); + if (!p) + return true; + *p = '\0'; + mutex_lock(&module_mutex); + ret = !!find_module(tk->symbol); + mutex_unlock(&module_mutex); + *p = ':'; + + return ret; } static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) @@ -120,184 +135,6 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs); -/* Memory fetching by symbol */ -struct symbol_cache { - char *symbol; - long offset; - unsigned long addr; -}; - -unsigned long update_symbol_cache(struct symbol_cache *sc) -{ - sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); - - if (sc->addr) - sc->addr += sc->offset; - - return sc->addr; -} - -void free_symbol_cache(struct symbol_cache *sc) -{ - kfree(sc->symbol); - kfree(sc); -} - -struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) -{ - struct symbol_cache *sc; - - if (!sym || strlen(sym) == 0) - return NULL; - - sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); - if (!sc) - return NULL; - - sc->symbol = kstrdup(sym, GFP_KERNEL); - if (!sc->symbol) { - kfree(sc); - return NULL; - } - sc->offset = offset; - update_symbol_cache(sc); - - return sc; -} - -/* - * Kprobes-specific fetch functions - */ -#define DEFINE_FETCH_stack(type) \ -static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \ - void *offset, void *dest) \ -{ \ - *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ - (unsigned int)((unsigned long)offset)); \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(stack, type)); - -DEFINE_BASIC_FETCH_FUNCS(stack) -/* No string on the stack entry */ -#define fetch_stack_string NULL -#define fetch_stack_string_size NULL - -#define DEFINE_FETCH_memory(type) \ -static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \ - void *addr, void *dest) \ -{ \ - type retval; \ - if (probe_kernel_address(addr, retval)) \ - *(type *)dest = 0; \ - else \ - *(type *)dest = retval; \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, type)); - -DEFINE_BASIC_FETCH_FUNCS(memory) -/* - * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max - * length and relative data location. - */ -static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, - void *addr, void *dest) -{ - int maxlen = get_rloc_len(*(u32 *)dest); - u8 *dst = get_rloc_data(dest); - long ret; - - if (!maxlen) - return; - - /* - * Try to get string again, since the string can be changed while - * probing. - */ - ret = strncpy_from_unsafe(dst, addr, maxlen); - - if (ret < 0) { /* Failed to fetch string */ - dst[0] = '\0'; - *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); - } else { - *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); - } -} -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string)); - -/* Return the length of string -- including null terminal byte */ -static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, - void *addr, void *dest) -{ - mm_segment_t old_fs; - int ret, len = 0; - u8 c; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - pagefault_disable(); - - do { - ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); - len++; - } while (c && ret == 0 && len < MAX_STRING_SIZE); - - pagefault_enable(); - set_fs(old_fs); - - if (ret < 0) /* Failed to check the length */ - *(u32 *)dest = 0; - else - *(u32 *)dest = len; -} -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string_size)); - -#define DEFINE_FETCH_symbol(type) \ -void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, void *data, void *dest)\ -{ \ - struct symbol_cache *sc = data; \ - if (sc->addr) \ - fetch_memory_##type(regs, (void *)sc->addr, dest); \ - else \ - *(type *)dest = 0; \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(symbol, type)); - -DEFINE_BASIC_FETCH_FUNCS(symbol) -DEFINE_FETCH_symbol(string) -DEFINE_FETCH_symbol(string_size) - -/* kprobes don't support file_offset fetch methods */ -#define fetch_file_offset_u8 NULL -#define fetch_file_offset_u16 NULL -#define fetch_file_offset_u32 NULL -#define fetch_file_offset_u64 NULL -#define fetch_file_offset_string NULL -#define fetch_file_offset_string_size NULL - -/* Fetch type information table */ -static const struct fetch_type kprobes_fetch_type_table[] = { - /* Special types */ - [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, - sizeof(u32), 1, "__data_loc char[]"), - [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, - string_size, sizeof(u32), 0, "u32"), - /* Basic types */ - ASSIGN_FETCH_TYPE(u8, u8, 0), - ASSIGN_FETCH_TYPE(u16, u16, 0), - ASSIGN_FETCH_TYPE(u32, u32, 0), - ASSIGN_FETCH_TYPE(u64, u64, 0), - ASSIGN_FETCH_TYPE(s8, u8, 1), - ASSIGN_FETCH_TYPE(s16, u16, 1), - ASSIGN_FETCH_TYPE(s32, u32, 1), - ASSIGN_FETCH_TYPE(s64, u64, 1), - ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0), - ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), - ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), - ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), - - ASSIGN_FETCH_TYPE_END -}; - /* * Allocate new trace_probe and initialize it (including kprobes). */ @@ -540,8 +377,11 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) return -EINVAL; } - for (i = 0; i < tk->tp.nr_args; i++) - traceprobe_update_arg(&tk->tp.args[i]); + for (i = 0; i < tk->tp.nr_args; i++) { + ret = traceprobe_update_arg(&tk->tp.args[i]); + if (ret) + return ret; + } /* Set/clear disabled flag according to tp->flag */ if (trace_probe_is_enabled(&tk->tp)) @@ -554,19 +394,13 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) else ret = register_kprobe(&tk->rp.kp); - if (ret == 0) + if (ret == 0) { tk->tp.flags |= TP_FLAG_REGISTERED; - else { - if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) { - pr_warn("This probe might be able to register after target module is loaded. Continue.\n"); - ret = 0; - } else if (ret == -EILSEQ) { - pr_warn("Probing address(0x%p) is not an instruction boundary.\n", - tk->rp.kp.addr); - ret = -EINVAL; - } + } else if (ret == -EILSEQ) { + pr_warn("Probing address(0x%p) is not an instruction boundary.\n", + tk->rp.kp.addr); + ret = -EINVAL; } - return ret; } @@ -629,6 +463,11 @@ static int register_trace_kprobe(struct trace_kprobe *tk) /* Register k*probe */ ret = __register_trace_kprobe(tk); + if (ret == -ENOENT && !trace_kprobe_module_exist(tk)) { + pr_warn("This probe might be able to register after target module is loaded. Continue.\n"); + ret = 0; + } + if (ret < 0) unregister_kprobe_event(tk); else @@ -713,13 +552,15 @@ static int create_trace_kprobe(int argc, char **argv) long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; + unsigned int flags = TPARG_FL_KERNEL; /* argc must be >= 1 */ if (argv[0][0] == 'p') is_return = false; - else if (argv[0][0] == 'r') + else if (argv[0][0] == 'r') { is_return = true; - else if (argv[0][0] == '-') + flags |= TPARG_FL_RETURN; + } else if (argv[0][0] == '-') is_delete = true; else { pr_info("Probe definition must be started with 'p', 'r' or" @@ -749,10 +590,13 @@ static int create_trace_kprobe(int argc, char **argv) } if (event) { - if (strchr(event, '/')) { + char *slash; + + slash = strchr(event, '/'); + if (slash) { group = event; - event = strchr(group, '/') + 1; - event[-1] = '\0'; + event = slash + 1; + slash[0] = '\0'; if (strlen(group) == 0) { pr_info("Group name is not specified\n"); return -EINVAL; @@ -802,8 +646,9 @@ static int create_trace_kprobe(int argc, char **argv) pr_info("Failed to parse either an address or a symbol.\n"); return ret; } - if (offset && is_return && - !kprobe_on_func_entry(NULL, symbol, offset)) { + if (kprobe_on_func_entry(NULL, symbol, offset)) + flags |= TPARG_FL_FENTRY; + if (offset && is_return && !(flags & TPARG_FL_FENTRY)) { pr_info("Given offset is not valid for return probe.\n"); return -EINVAL; } @@ -873,8 +718,7 @@ static int create_trace_kprobe(int argc, char **argv) /* Parse fetch argument */ ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, - is_return, true, - kprobes_fetch_type_table); + flags); if (ret) { pr_info("Parse error at argument[%d]. (%d)\n", i, ret); goto error; @@ -1028,6 +872,106 @@ static const struct file_operations kprobe_profile_ops = { .release = seq_release, }; +/* Kprobe specific fetch functions */ + +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +fetch_store_strlen(unsigned long addr) +{ + mm_segment_t old_fs; + int ret, len = 0; + u8 c; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + pagefault_disable(); + + do { + ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); + len++; + } while (c && ret == 0 && len < MAX_STRING_SIZE); + + pagefault_enable(); + set_fs(old_fs); + + return (ret < 0) ? ret : len; +} + +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max + * length and relative data location. + */ +static nokprobe_inline int +fetch_store_string(unsigned long addr, void *dest, void *base) +{ + int maxlen = get_loc_len(*(u32 *)dest); + u8 *dst = get_loc_data(dest, base); + long ret; + + if (unlikely(!maxlen)) + return -ENOMEM; + /* + * Try to get string again, since the string can be changed while + * probing. + */ + ret = strncpy_from_unsafe(dst, (void *)addr, maxlen); + + if (ret >= 0) + *(u32 *)dest = make_data_loc(ret, (void *)dst - base); + return ret; +} + +static nokprobe_inline int +probe_mem_read(void *dest, void *src, size_t size) +{ + return probe_kernel_read(dest, src, size); +} + +/* Note that we don't verify it, since the code does not come from user space */ +static int +process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, + void *base) +{ + unsigned long val; + +retry: + /* 1st stage: get value from context */ + switch (code->op) { + case FETCH_OP_REG: + val = regs_get_register(regs, code->param); + break; + case FETCH_OP_STACK: + val = regs_get_kernel_stack_nth(regs, code->param); + break; + case FETCH_OP_STACKP: + val = kernel_stack_pointer(regs); + break; + case FETCH_OP_RETVAL: + val = regs_return_value(regs); + break; + case FETCH_OP_IMM: + val = code->immediate; + break; + case FETCH_OP_COMM: + val = (unsigned long)current->comm; + break; +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + case FETCH_OP_ARG: + val = regs_get_kernel_argument(regs, code->param); + break; +#endif + case FETCH_NOP_SYMBOL: /* Ignore a place holder */ + code++; + goto retry; + default: + return -EILSEQ; + } + code++; + + return process_fetch_insn_bottom(code, val, dest, base); +} +NOKPROBE_SYMBOL(process_fetch_insn) + /* Kprobe handler */ static nokprobe_inline void __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, @@ -1059,7 +1003,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, entry = ring_buffer_event_data(event); entry->ip = (unsigned long)tk->rp.kp.addr; - store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); event_trigger_unlock_commit_regs(trace_file, buffer, event, entry, irq_flags, pc, regs); @@ -1108,7 +1052,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, entry = ring_buffer_event_data(event); entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); event_trigger_unlock_commit_regs(trace_file, buffer, event, entry, irq_flags, pc, regs); @@ -1133,8 +1077,6 @@ print_kprobe_event(struct trace_iterator *iter, int flags, struct kprobe_trace_entry_head *field; struct trace_seq *s = &iter->seq; struct trace_probe *tp; - u8 *data; - int i; field = (struct kprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); @@ -1146,11 +1088,9 @@ print_kprobe_event(struct trace_iterator *iter, int flags, trace_seq_putc(s, ')'); - data = (u8 *)&field[1]; - for (i = 0; i < tp->nr_args; i++) - if (!tp->args[i].type->print(s, tp->args[i].name, - data + tp->args[i].offset, field)) - goto out; + if (print_probe_args(s, tp->args, tp->nr_args, + (u8 *)&field[1], field) < 0) + goto out; trace_seq_putc(s, '\n'); out: @@ -1164,8 +1104,6 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, struct kretprobe_trace_entry_head *field; struct trace_seq *s = &iter->seq; struct trace_probe *tp; - u8 *data; - int i; field = (struct kretprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); @@ -1182,11 +1120,9 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, trace_seq_putc(s, ')'); - data = (u8 *)&field[1]; - for (i = 0; i < tp->nr_args; i++) - if (!tp->args[i].type->print(s, tp->args[i].name, - data + tp->args[i].offset, field)) - goto out; + if (print_probe_args(s, tp->args, tp->nr_args, + (u8 *)&field[1], field) < 0) + goto out; trace_seq_putc(s, '\n'); @@ -1197,49 +1133,25 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, static int kprobe_event_define_fields(struct trace_event_call *event_call) { - int ret, i; + int ret; struct kprobe_trace_entry_head field; struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); - /* Set argument names as fields */ - for (i = 0; i < tk->tp.nr_args; i++) { - struct probe_arg *parg = &tk->tp.args[i]; - ret = trace_define_field(event_call, parg->type->fmttype, - parg->name, - sizeof(field) + parg->offset, - parg->type->size, - parg->type->is_signed, - FILTER_OTHER); - if (ret) - return ret; - } - return 0; + return traceprobe_define_arg_fields(event_call, sizeof(field), &tk->tp); } static int kretprobe_event_define_fields(struct trace_event_call *event_call) { - int ret, i; + int ret; struct kretprobe_trace_entry_head field; struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); - /* Set argument names as fields */ - for (i = 0; i < tk->tp.nr_args; i++) { - struct probe_arg *parg = &tk->tp.args[i]; - ret = trace_define_field(event_call, parg->type->fmttype, - parg->name, - sizeof(field) + parg->offset, - parg->type->size, - parg->type->is_signed, - FILTER_OTHER); - if (ret) - return ret; - } - return 0; + return traceprobe_define_arg_fields(event_call, sizeof(field), &tk->tp); } #ifdef CONFIG_PERF_EVENTS @@ -1286,7 +1198,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); - store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); return 0; @@ -1322,7 +1234,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); } @@ -1457,7 +1369,7 @@ static int register_kprobe_event(struct trace_kprobe *tk) init_trace_event_call(tk, call); - if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) + if (traceprobe_set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) return -ENOMEM; ret = register_trace_event(&call->event); if (!ret) { @@ -1514,7 +1426,7 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, init_trace_event_call(tk, &tk->tp.call); - if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) { + if (traceprobe_set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) { ret = -ENOMEM; goto error; } diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index b0875b327f5c..c3fd849d4a8f 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -115,7 +115,7 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self, * section, then we need to read the link list pointers. The trick is * we pass the address of the string to the seq function just like * we do for the kernel core formats. To get back the structure that - * holds the format, we simply use containerof() and then go to the + * holds the format, we simply use container_of() and then go to the * next format in the list. */ static const char ** diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index e99c3ce7aa65..3ef15a6683c0 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -26,14 +26,12 @@ const char *reserved_field_names[] = { /* Printing in basic type function template */ #define DEFINE_BASIC_PRINT_TYPE_FUNC(tname, type, fmt) \ -int PRINT_TYPE_FUNC_NAME(tname)(struct trace_seq *s, const char *name, \ - void *data, void *ent) \ +int PRINT_TYPE_FUNC_NAME(tname)(struct trace_seq *s, void *data, void *ent)\ { \ - trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ + trace_seq_printf(s, fmt, *(type *)data); \ return !trace_seq_has_overflowed(s); \ } \ -const char PRINT_TYPE_FMT_NAME(tname)[] = fmt; \ -NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(tname)); +const char PRINT_TYPE_FMT_NAME(tname)[] = fmt; DEFINE_BASIC_PRINT_TYPE_FUNC(u8, u8, "%u") DEFINE_BASIC_PRINT_TYPE_FUNC(u16, u16, "%u") @@ -48,193 +46,52 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(x16, u16, "0x%x") DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x") DEFINE_BASIC_PRINT_TYPE_FUNC(x64, u64, "0x%Lx") +int PRINT_TYPE_FUNC_NAME(symbol)(struct trace_seq *s, void *data, void *ent) +{ + trace_seq_printf(s, "%pS", (void *)*(unsigned long *)data); + return !trace_seq_has_overflowed(s); +} +const char PRINT_TYPE_FMT_NAME(symbol)[] = "%pS"; + /* Print type function for string type */ -int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, - void *data, void *ent) +int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, void *data, void *ent) { int len = *(u32 *)data >> 16; if (!len) - trace_seq_printf(s, " %s=(fault)", name); + trace_seq_puts(s, "(fault)"); else - trace_seq_printf(s, " %s=\"%s\"", name, + trace_seq_printf(s, "\"%s\"", (const char *)get_loc_data(data, ent)); return !trace_seq_has_overflowed(s); } -NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string)); const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; -#define CHECK_FETCH_FUNCS(method, fn) \ - (((FETCH_FUNC_NAME(method, u8) == fn) || \ - (FETCH_FUNC_NAME(method, u16) == fn) || \ - (FETCH_FUNC_NAME(method, u32) == fn) || \ - (FETCH_FUNC_NAME(method, u64) == fn) || \ - (FETCH_FUNC_NAME(method, string) == fn) || \ - (FETCH_FUNC_NAME(method, string_size) == fn)) \ - && (fn != NULL)) - -/* Data fetch function templates */ -#define DEFINE_FETCH_reg(type) \ -void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, void *offset, void *dest) \ -{ \ - *(type *)dest = (type)regs_get_register(regs, \ - (unsigned int)((unsigned long)offset)); \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(reg, type)); -DEFINE_BASIC_FETCH_FUNCS(reg) -/* No string on the register */ -#define fetch_reg_string NULL -#define fetch_reg_string_size NULL - -#define DEFINE_FETCH_retval(type) \ -void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \ - void *dummy, void *dest) \ -{ \ - *(type *)dest = (type)regs_return_value(regs); \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(retval, type)); -DEFINE_BASIC_FETCH_FUNCS(retval) -/* No string on the retval */ -#define fetch_retval_string NULL -#define fetch_retval_string_size NULL - -/* Dereference memory access function */ -struct deref_fetch_param { - struct fetch_param orig; - long offset; - fetch_func_t fetch; - fetch_func_t fetch_size; +/* Fetch type information table */ +static const struct fetch_type probe_fetch_types[] = { + /* Special types */ + __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1, + "__data_loc char[]"), + /* Basic types */ + ASSIGN_FETCH_TYPE(u8, u8, 0), + ASSIGN_FETCH_TYPE(u16, u16, 0), + ASSIGN_FETCH_TYPE(u32, u32, 0), + ASSIGN_FETCH_TYPE(u64, u64, 0), + ASSIGN_FETCH_TYPE(s8, u8, 1), + ASSIGN_FETCH_TYPE(s16, u16, 1), + ASSIGN_FETCH_TYPE(s32, u32, 1), + ASSIGN_FETCH_TYPE(s64, u64, 1), + ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0), + ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), + ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), + ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), + ASSIGN_FETCH_TYPE_ALIAS(symbol, ADDR_FETCH_TYPE, ADDR_FETCH_TYPE, 0), + + ASSIGN_FETCH_TYPE_END }; -#define DEFINE_FETCH_deref(type) \ -void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \ - void *data, void *dest) \ -{ \ - struct deref_fetch_param *dprm = data; \ - unsigned long addr; \ - call_fetch(&dprm->orig, regs, &addr); \ - if (addr) { \ - addr += dprm->offset; \ - dprm->fetch(regs, (void *)addr, dest); \ - } else \ - *(type *)dest = 0; \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, type)); -DEFINE_BASIC_FETCH_FUNCS(deref) -DEFINE_FETCH_deref(string) - -void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, - void *data, void *dest) -{ - struct deref_fetch_param *dprm = data; - unsigned long addr; - - call_fetch(&dprm->orig, regs, &addr); - if (addr && dprm->fetch_size) { - addr += dprm->offset; - dprm->fetch_size(regs, (void *)addr, dest); - } else - *(string_size *)dest = 0; -} -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, string_size)); - -static void update_deref_fetch_param(struct deref_fetch_param *data) -{ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - update_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - update_symbol_cache(data->orig.data); -} -NOKPROBE_SYMBOL(update_deref_fetch_param); - -static void free_deref_fetch_param(struct deref_fetch_param *data) -{ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - free_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - free_symbol_cache(data->orig.data); - kfree(data); -} -NOKPROBE_SYMBOL(free_deref_fetch_param); - -/* Bitfield fetch function */ -struct bitfield_fetch_param { - struct fetch_param orig; - unsigned char hi_shift; - unsigned char low_shift; -}; - -#define DEFINE_FETCH_bitfield(type) \ -void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \ - void *data, void *dest) \ -{ \ - struct bitfield_fetch_param *bprm = data; \ - type buf = 0; \ - call_fetch(&bprm->orig, regs, &buf); \ - if (buf) { \ - buf <<= bprm->hi_shift; \ - buf >>= bprm->low_shift; \ - } \ - *(type *)dest = buf; \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(bitfield, type)); -DEFINE_BASIC_FETCH_FUNCS(bitfield) -#define fetch_bitfield_string NULL -#define fetch_bitfield_string_size NULL - -static void -update_bitfield_fetch_param(struct bitfield_fetch_param *data) -{ - /* - * Don't check the bitfield itself, because this must be the - * last fetch function. - */ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - update_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - update_symbol_cache(data->orig.data); -} - -static void -free_bitfield_fetch_param(struct bitfield_fetch_param *data) -{ - /* - * Don't check the bitfield itself, because this must be the - * last fetch function. - */ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - free_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - free_symbol_cache(data->orig.data); - - kfree(data); -} - -void FETCH_FUNC_NAME(comm, string)(struct pt_regs *regs, - void *data, void *dest) -{ - int maxlen = get_rloc_len(*(u32 *)dest); - u8 *dst = get_rloc_data(dest); - long ret; - - if (!maxlen) - return; - - ret = strlcpy(dst, current->comm, maxlen); - *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); -} -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string)); - -void FETCH_FUNC_NAME(comm, string_size)(struct pt_regs *regs, - void *data, void *dest) -{ - *(u32 *)dest = strlen(current->comm) + 1; -} -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string_size)); - -static const struct fetch_type *find_fetch_type(const char *type, - const struct fetch_type *ftbl) +static const struct fetch_type *find_fetch_type(const char *type) { int i; @@ -255,58 +112,27 @@ static const struct fetch_type *find_fetch_type(const char *type, switch (bs) { case 8: - return find_fetch_type("u8", ftbl); + return find_fetch_type("u8"); case 16: - return find_fetch_type("u16", ftbl); + return find_fetch_type("u16"); case 32: - return find_fetch_type("u32", ftbl); + return find_fetch_type("u32"); case 64: - return find_fetch_type("u64", ftbl); + return find_fetch_type("u64"); default: goto fail; } } - for (i = 0; ftbl[i].name; i++) { - if (strcmp(type, ftbl[i].name) == 0) - return &ftbl[i]; + for (i = 0; probe_fetch_types[i].name; i++) { + if (strcmp(type, probe_fetch_types[i].name) == 0) + return &probe_fetch_types[i]; } fail: return NULL; } -/* Special function : only accept unsigned long */ -static void fetch_kernel_stack_address(struct pt_regs *regs, void *dummy, void *dest) -{ - *(unsigned long *)dest = kernel_stack_pointer(regs); -} -NOKPROBE_SYMBOL(fetch_kernel_stack_address); - -static void fetch_user_stack_address(struct pt_regs *regs, void *dummy, void *dest) -{ - *(unsigned long *)dest = user_stack_pointer(regs); -} -NOKPROBE_SYMBOL(fetch_user_stack_address); - -static fetch_func_t get_fetch_size_function(const struct fetch_type *type, - fetch_func_t orig_fn, - const struct fetch_type *ftbl) -{ - int i; - - if (type != &ftbl[FETCH_TYPE_STRING]) - return NULL; /* Only string type needs size function */ - - for (i = 0; i < FETCH_MTD_END; i++) - if (type->fetch[i] == orig_fn) - return ftbl[FETCH_TYPE_STRSIZE].fetch[i]; - - WARN_ON(1); /* This should not happen */ - - return NULL; -} - /* Split symbol and offset. */ int traceprobe_split_symbol_offset(char *symbol, long *offset) { @@ -331,41 +157,44 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset) #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) static int parse_probe_vars(char *arg, const struct fetch_type *t, - struct fetch_param *f, bool is_return, - bool is_kprobe) + struct fetch_insn *code, unsigned int flags) { int ret = 0; unsigned long param; if (strcmp(arg, "retval") == 0) { - if (is_return) - f->fn = t->fetch[FETCH_MTD_retval]; + if (flags & TPARG_FL_RETURN) + code->op = FETCH_OP_RETVAL; else ret = -EINVAL; } else if (strncmp(arg, "stack", 5) == 0) { if (arg[5] == '\0') { - if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR)) - return -EINVAL; - - if (is_kprobe) - f->fn = fetch_kernel_stack_address; - else - f->fn = fetch_user_stack_address; + code->op = FETCH_OP_STACKP; } else if (isdigit(arg[5])) { ret = kstrtoul(arg + 5, 10, ¶m); - if (ret || (is_kprobe && param > PARAM_MAX_STACK)) + if (ret || ((flags & TPARG_FL_KERNEL) && + param > PARAM_MAX_STACK)) ret = -EINVAL; else { - f->fn = t->fetch[FETCH_MTD_stack]; - f->data = (void *)param; + code->op = FETCH_OP_STACK; + code->param = (unsigned int)param; } } else ret = -EINVAL; } else if (strcmp(arg, "comm") == 0) { - if (strcmp(t->name, "string") != 0 && - strcmp(t->name, "string_size") != 0) + code->op = FETCH_OP_COMM; +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + } else if (((flags & TPARG_FL_MASK) == + (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) && + strncmp(arg, "arg", 3) == 0) { + if (!isdigit(arg[3])) + return -EINVAL; + ret = kstrtoul(arg + 3, 10, ¶m); + if (ret || !param || param > PARAM_MAX_STACK) return -EINVAL; - f->fn = t->fetch[FETCH_MTD_comm]; + code->op = FETCH_OP_ARG; + code->param = (unsigned int)param - 1; +#endif } else ret = -EINVAL; @@ -373,25 +202,27 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, } /* Recursive argument parser */ -static int parse_probe_arg(char *arg, const struct fetch_type *t, - struct fetch_param *f, bool is_return, bool is_kprobe, - const struct fetch_type *ftbl) +static int +parse_probe_arg(char *arg, const struct fetch_type *type, + struct fetch_insn **pcode, struct fetch_insn *end, + unsigned int flags) { + struct fetch_insn *code = *pcode; unsigned long param; - long offset; + long offset = 0; char *tmp; int ret = 0; switch (arg[0]) { case '$': - ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); + ret = parse_probe_vars(arg + 1, type, code, flags); break; case '%': /* named register */ ret = regs_query_register_offset(arg + 1); if (ret >= 0) { - f->fn = t->fetch[FETCH_MTD_reg]; - f->data = (void *)(unsigned long)ret; + code->op = FETCH_OP_REG; + code->param = (unsigned int)ret; ret = 0; } break; @@ -401,33 +232,42 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, ret = kstrtoul(arg + 1, 0, ¶m); if (ret) break; - - f->fn = t->fetch[FETCH_MTD_memory]; - f->data = (void *)param; + /* load address */ + code->op = FETCH_OP_IMM; + code->immediate = param; } else if (arg[1] == '+') { /* kprobes don't support file offsets */ - if (is_kprobe) + if (flags & TPARG_FL_KERNEL) return -EINVAL; ret = kstrtol(arg + 2, 0, &offset); if (ret) break; - f->fn = t->fetch[FETCH_MTD_file_offset]; - f->data = (void *)offset; + code->op = FETCH_OP_FOFFS; + code->immediate = (unsigned long)offset; // imm64? } else { /* uprobes don't support symbols */ - if (!is_kprobe) + if (!(flags & TPARG_FL_KERNEL)) return -EINVAL; - ret = traceprobe_split_symbol_offset(arg + 1, &offset); - if (ret) - break; + /* Preserve symbol for updating */ + code->op = FETCH_NOP_SYMBOL; + code->data = kstrdup(arg + 1, GFP_KERNEL); + if (!code->data) + return -ENOMEM; + if (++code == end) + return -E2BIG; - f->data = alloc_symbol_cache(arg + 1, offset); - if (f->data) - f->fn = t->fetch[FETCH_MTD_symbol]; + code->op = FETCH_OP_IMM; + code->immediate = 0; } + /* These are fetching from memory */ + if (++code == end) + return -E2BIG; + *pcode = code; + code->op = FETCH_OP_DEREF; + code->offset = offset; break; case '+': /* deref memory */ @@ -435,11 +275,10 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, case '-': tmp = strchr(arg, '('); if (!tmp) - break; + return -EINVAL; *tmp = '\0'; ret = kstrtol(arg, 0, &offset); - if (ret) break; @@ -447,36 +286,27 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, tmp = strrchr(arg, ')'); if (tmp) { - struct deref_fetch_param *dprm; - const struct fetch_type *t2; + const struct fetch_type *t2 = find_fetch_type(NULL); - t2 = find_fetch_type(NULL, ftbl); *tmp = '\0'; - dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL); - - if (!dprm) - return -ENOMEM; - - dprm->offset = offset; - dprm->fetch = t->fetch[FETCH_MTD_memory]; - dprm->fetch_size = get_fetch_size_function(t, - dprm->fetch, ftbl); - ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, - is_kprobe, ftbl); + ret = parse_probe_arg(arg, t2, &code, end, flags); if (ret) - kfree(dprm); - else { - f->fn = t->fetch[FETCH_MTD_deref]; - f->data = (void *)dprm; - } + break; + if (code->op == FETCH_OP_COMM) + return -EINVAL; + if (++code == end) + return -E2BIG; + *pcode = code; + + code->op = FETCH_OP_DEREF; + code->offset = offset; } break; } - if (!ret && !f->fn) { /* Parsed, but do not find fetch method */ - pr_info("%s type has no corresponding fetch method.\n", t->name); + if (!ret && code->op == FETCH_OP_NOP) { + /* Parsed, but do not find fetch method */ ret = -EINVAL; } - return ret; } @@ -485,22 +315,15 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, /* Bitfield type needs to be parsed into a fetch function */ static int __parse_bitfield_probe_arg(const char *bf, const struct fetch_type *t, - struct fetch_param *f) + struct fetch_insn **pcode) { - struct bitfield_fetch_param *bprm; + struct fetch_insn *code = *pcode; unsigned long bw, bo; char *tail; if (*bf != 'b') return 0; - bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); - if (!bprm) - return -ENOMEM; - - bprm->orig = *f; - f->fn = t->fetch[FETCH_MTD_bitfield]; - f->data = (void *)bprm; bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ if (bw == 0 || *tail != '@') @@ -511,20 +334,26 @@ static int __parse_bitfield_probe_arg(const char *bf, if (tail == bf || *tail != '/') return -EINVAL; + code++; + if (code->op != FETCH_OP_NOP) + return -E2BIG; + *pcode = code; - bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); - bprm->low_shift = bprm->hi_shift + bo; + code->op = FETCH_OP_MOD_BF; + code->lshift = BYTES_TO_BITS(t->size) - (bw + bo); + code->rshift = BYTES_TO_BITS(t->size) - bw; + code->basesize = t->size; return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0; } /* String length checking wrapper */ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, - struct probe_arg *parg, bool is_return, bool is_kprobe, - const struct fetch_type *ftbl) + struct probe_arg *parg, unsigned int flags) { - const char *t; - int ret; + struct fetch_insn *code, *scode, *tmp = NULL; + char *t, *t2; + int ret, len; if (strlen(arg) > MAX_ARGSTR_LEN) { pr_info("Argument is too long.: %s\n", arg); @@ -535,36 +364,128 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, pr_info("Failed to allocate memory for command '%s'.\n", arg); return -ENOMEM; } - t = strchr(parg->comm, ':'); + t = strchr(arg, ':'); if (t) { - arg[t - parg->comm] = '\0'; - t++; + *t = '\0'; + t2 = strchr(++t, '['); + if (t2) { + *t2 = '\0'; + parg->count = simple_strtoul(t2 + 1, &t2, 0); + if (strcmp(t2, "]") || parg->count == 0) + return -EINVAL; + if (parg->count > MAX_ARRAY_LEN) + return -E2BIG; + } } /* * The default type of $comm should be "string", and it can't be * dereferenced. */ if (!t && strcmp(arg, "$comm") == 0) - t = "string"; - parg->type = find_fetch_type(t, ftbl); + parg->type = find_fetch_type("string"); + else + parg->type = find_fetch_type(t); if (!parg->type) { pr_info("Unsupported type: %s\n", t); return -EINVAL; } parg->offset = *size; - *size += parg->type->size; - ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, - is_kprobe, ftbl); - - if (ret >= 0 && t != NULL) - ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); - - if (ret >= 0) { - parg->fetch_size.fn = get_fetch_size_function(parg->type, - parg->fetch.fn, - ftbl); - parg->fetch_size.data = parg->fetch.data; + *size += parg->type->size * (parg->count ?: 1); + + if (parg->count) { + len = strlen(parg->type->fmttype) + 6; + parg->fmt = kmalloc(len, GFP_KERNEL); + if (!parg->fmt) + return -ENOMEM; + snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype, + parg->count); + } + + code = tmp = kzalloc(sizeof(*code) * FETCH_INSN_MAX, GFP_KERNEL); + if (!code) + return -ENOMEM; + code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; + + ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], + flags); + if (ret) + goto fail; + + /* Store operation */ + if (!strcmp(parg->type->name, "string")) { + if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_IMM && + code->op != FETCH_OP_COMM) { + pr_info("string only accepts memory or address.\n"); + ret = -EINVAL; + goto fail; + } + if (code->op != FETCH_OP_DEREF || parg->count) { + /* + * IMM and COMM is pointing actual address, those must + * be kept, and if parg->count != 0, this is an array + * of string pointers instead of string address itself. + */ + code++; + if (code->op != FETCH_OP_NOP) { + ret = -E2BIG; + goto fail; + } + } + code->op = FETCH_OP_ST_STRING; /* In DEREF case, replace it */ + code->size = parg->type->size; + parg->dynamic = true; + } else if (code->op == FETCH_OP_DEREF) { + code->op = FETCH_OP_ST_MEM; + code->size = parg->type->size; + } else { + code++; + if (code->op != FETCH_OP_NOP) { + ret = -E2BIG; + goto fail; + } + code->op = FETCH_OP_ST_RAW; + code->size = parg->type->size; + } + scode = code; + /* Modify operation */ + if (t != NULL) { + ret = __parse_bitfield_probe_arg(t, parg->type, &code); + if (ret) + goto fail; } + /* Loop(Array) operation */ + if (parg->count) { + if (scode->op != FETCH_OP_ST_MEM && + scode->op != FETCH_OP_ST_STRING) { + pr_info("array only accepts memory or address\n"); + ret = -EINVAL; + goto fail; + } + code++; + if (code->op != FETCH_OP_NOP) { + ret = -E2BIG; + goto fail; + } + code->op = FETCH_OP_LP_ARRAY; + code->param = parg->count; + } + code++; + code->op = FETCH_OP_END; + + /* Shrink down the code buffer */ + parg->code = kzalloc(sizeof(*code) * (code - tmp + 1), GFP_KERNEL); + if (!parg->code) + ret = -ENOMEM; + else + memcpy(parg->code, tmp, sizeof(*code) * (code - tmp + 1)); + +fail: + if (ret) { + for (code = tmp; code < tmp + FETCH_INSN_MAX; code++) + if (code->op == FETCH_NOP_SYMBOL) + kfree(code->data); + } + kfree(tmp); return ret; } @@ -586,35 +507,63 @@ int traceprobe_conflict_field_name(const char *name, return 0; } -void traceprobe_update_arg(struct probe_arg *arg) -{ - if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) - update_bitfield_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) - update_deref_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) - update_symbol_cache(arg->fetch.data); -} - void traceprobe_free_probe_arg(struct probe_arg *arg) { - if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) - free_bitfield_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) - free_deref_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) - free_symbol_cache(arg->fetch.data); + struct fetch_insn *code = arg->code; + while (code && code->op != FETCH_OP_END) { + if (code->op == FETCH_NOP_SYMBOL) + kfree(code->data); + code++; + } + kfree(arg->code); kfree(arg->name); kfree(arg->comm); + kfree(arg->fmt); } +int traceprobe_update_arg(struct probe_arg *arg) +{ + struct fetch_insn *code = arg->code; + long offset; + char *tmp; + char c; + int ret = 0; + + while (code && code->op != FETCH_OP_END) { + if (code->op == FETCH_NOP_SYMBOL) { + if (code[1].op != FETCH_OP_IMM) + return -EINVAL; + + tmp = strpbrk("+-", code->data); + if (tmp) + c = *tmp; + ret = traceprobe_split_symbol_offset(code->data, + &offset); + if (ret) + return ret; + + code[1].immediate = + (unsigned long)kallsyms_lookup_name(code->data); + if (tmp) + *tmp = c; + if (!code[1].immediate) + return -ENOENT; + code[1].immediate += offset; + } + code++; + } + return 0; +} + +/* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, bool is_return) { - int i; + struct probe_arg *parg; + int i, j; int pos = 0; - const char *fmt, *arg; if (!is_return) { @@ -625,35 +574,51 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; } - /* When len=0, we just calculate the needed length */ -#define LEN_OR_ZERO (len ? len - pos : 0) - pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); for (i = 0; i < tp->nr_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", - tp->args[i].name, tp->args[i].type->fmt); + parg = tp->args + i; + pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=", parg->name); + if (parg->count) { + pos += snprintf(buf + pos, LEN_OR_ZERO, "{%s", + parg->type->fmt); + for (j = 1; j < parg->count; j++) + pos += snprintf(buf + pos, LEN_OR_ZERO, ",%s", + parg->type->fmt); + pos += snprintf(buf + pos, LEN_OR_ZERO, "}"); + } else + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", + parg->type->fmt); } pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); for (i = 0; i < tp->nr_args; i++) { - if (strcmp(tp->args[i].type->name, "string") == 0) + parg = tp->args + i; + if (parg->count) { + if (strcmp(parg->type->name, "string") == 0) + fmt = ", __get_str(%s[%d])"; + else + fmt = ", REC->%s[%d]"; + for (j = 0; j < parg->count; j++) + pos += snprintf(buf + pos, LEN_OR_ZERO, + fmt, parg->name, j); + } else { + if (strcmp(parg->type->name, "string") == 0) + fmt = ", __get_str(%s)"; + else + fmt = ", REC->%s"; pos += snprintf(buf + pos, LEN_OR_ZERO, - ", __get_str(%s)", - tp->args[i].name); - else - pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", - tp->args[i].name); + fmt, parg->name); + } } -#undef LEN_OR_ZERO - /* return the length of print_fmt */ return pos; } +#undef LEN_OR_ZERO -int set_print_fmt(struct trace_probe *tp, bool is_return) +int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return) { int len; char *print_fmt; @@ -670,3 +635,28 @@ int set_print_fmt(struct trace_probe *tp, bool is_return) return 0; } + +int traceprobe_define_arg_fields(struct trace_event_call *event_call, + size_t offset, struct trace_probe *tp) +{ + int ret, i; + + /* Set argument names as fields */ + for (i = 0; i < tp->nr_args; i++) { + struct probe_arg *parg = &tp->args[i]; + const char *fmt = parg->type->fmttype; + int size = parg->type->size; + + if (parg->fmt) + fmt = parg->fmt; + if (parg->count) + size *= parg->count; + ret = trace_define_field(event_call, fmt, parg->name, + offset + parg->offset, size, + parg->type->is_signed, + FILTER_OTHER); + if (ret) + return ret; + } + return 0; +} diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 5f52668e165d..974afc1a3e73 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -23,6 +23,7 @@ #include <linux/stringify.h> #include <linux/limits.h> #include <linux/uaccess.h> +#include <linux/bitops.h> #include <asm/bitsperlong.h> #include "trace.h" @@ -30,6 +31,7 @@ #define MAX_TRACE_ARGS 128 #define MAX_ARGSTR_LEN 63 +#define MAX_ARRAY_LEN 64 #define MAX_STRING_SIZE PATH_MAX /* Reserved field names */ @@ -54,50 +56,74 @@ #define TP_FLAG_PROFILE 2 #define TP_FLAG_REGISTERED 4 +/* data_loc: data location, compatible with u32 */ +#define make_data_loc(len, offs) \ + (((u32)(len) << 16) | ((u32)(offs) & 0xffff)) +#define get_loc_len(dl) ((u32)(dl) >> 16) +#define get_loc_offs(dl) ((u32)(dl) & 0xffff) -/* data_rloc: data relative location, compatible with u32 */ -#define make_data_rloc(len, roffs) \ - (((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) -#define get_rloc_len(dl) ((u32)(dl) >> 16) -#define get_rloc_offs(dl) ((u32)(dl) & 0xffff) - -/* - * Convert data_rloc to data_loc: - * data_rloc stores the offset from data_rloc itself, but data_loc - * stores the offset from event entry. - */ -#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) - -static nokprobe_inline void *get_rloc_data(u32 *dl) +static nokprobe_inline void *get_loc_data(u32 *dl, void *ent) { - return (u8 *)dl + get_rloc_offs(*dl); + return (u8 *)ent + get_loc_offs(*dl); } -/* For data_loc conversion */ -static nokprobe_inline void *get_loc_data(u32 *dl, void *ent) +static nokprobe_inline u32 update_data_loc(u32 loc, int consumed) { - return (u8 *)ent + get_rloc_offs(*dl); + u32 maxlen = get_loc_len(loc); + u32 offset = get_loc_offs(loc); + + return make_data_loc(maxlen - consumed, offset + consumed); } -/* Data fetch function type */ -typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); /* Printing function type */ -typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, void *); - -/* Fetch types */ -enum { - FETCH_MTD_reg = 0, - FETCH_MTD_stack, - FETCH_MTD_retval, - FETCH_MTD_comm, - FETCH_MTD_memory, - FETCH_MTD_symbol, - FETCH_MTD_deref, - FETCH_MTD_bitfield, - FETCH_MTD_file_offset, - FETCH_MTD_END, +typedef int (*print_type_func_t)(struct trace_seq *, void *, void *); + +enum fetch_op { + FETCH_OP_NOP = 0, + // Stage 1 (load) ops + FETCH_OP_REG, /* Register : .param = offset */ + FETCH_OP_STACK, /* Stack : .param = index */ + FETCH_OP_STACKP, /* Stack pointer */ + FETCH_OP_RETVAL, /* Return value */ + FETCH_OP_IMM, /* Immediate : .immediate */ + FETCH_OP_COMM, /* Current comm */ + FETCH_OP_ARG, /* Function argument : .param */ + FETCH_OP_FOFFS, /* File offset: .immediate */ + // Stage 2 (dereference) op + FETCH_OP_DEREF, /* Dereference: .offset */ + // Stage 3 (store) ops + FETCH_OP_ST_RAW, /* Raw: .size */ + FETCH_OP_ST_MEM, /* Mem: .offset, .size */ + FETCH_OP_ST_STRING, /* String: .offset, .size */ + // Stage 4 (modify) op + FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */ + // Stage 5 (loop) op + FETCH_OP_LP_ARRAY, /* Array: .param = loop count */ + FETCH_OP_END, + FETCH_NOP_SYMBOL, /* Unresolved Symbol holder */ }; +struct fetch_insn { + enum fetch_op op; + union { + unsigned int param; + struct { + unsigned int size; + int offset; + }; + struct { + unsigned char basesize; + unsigned char lshift; + unsigned char rshift; + }; + unsigned long immediate; + void *data; + }; +}; + +/* fetch + deref*N + store + mod + end <= 16, this allows N=12, enough */ +#define FETCH_INSN_MAX 16 + /* Fetch type information table */ struct fetch_type { const char *name; /* Name of type */ @@ -106,13 +132,6 @@ struct fetch_type { print_type_func_t print; /* Print functions */ const char *fmt; /* Fromat string */ const char *fmttype; /* Name in format file */ - /* Fetch functions */ - fetch_func_t fetch[FETCH_MTD_END]; -}; - -struct fetch_param { - fetch_func_t fn; - void *data; }; /* For defining macros, define string/string_size types */ @@ -124,8 +143,7 @@ typedef u32 string_size; /* Printing in basic type function template */ #define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \ -int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ - void *data, void *ent); \ +int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, void *data, void *ent);\ extern const char PRINT_TYPE_FMT_NAME(type)[] DECLARE_BASIC_PRINT_TYPE_FUNC(u8); @@ -142,57 +160,7 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(x32); DECLARE_BASIC_PRINT_TYPE_FUNC(x64); DECLARE_BASIC_PRINT_TYPE_FUNC(string); - -#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type - -/* Declare macro for basic types */ -#define DECLARE_FETCH_FUNC(method, type) \ -extern void FETCH_FUNC_NAME(method, type)(struct pt_regs *regs, \ - void *data, void *dest) - -#define DECLARE_BASIC_FETCH_FUNCS(method) \ -DECLARE_FETCH_FUNC(method, u8); \ -DECLARE_FETCH_FUNC(method, u16); \ -DECLARE_FETCH_FUNC(method, u32); \ -DECLARE_FETCH_FUNC(method, u64) - -DECLARE_BASIC_FETCH_FUNCS(reg); -#define fetch_reg_string NULL -#define fetch_reg_string_size NULL - -DECLARE_BASIC_FETCH_FUNCS(retval); -#define fetch_retval_string NULL -#define fetch_retval_string_size NULL - -DECLARE_BASIC_FETCH_FUNCS(symbol); -DECLARE_FETCH_FUNC(symbol, string); -DECLARE_FETCH_FUNC(symbol, string_size); - -DECLARE_BASIC_FETCH_FUNCS(deref); -DECLARE_FETCH_FUNC(deref, string); -DECLARE_FETCH_FUNC(deref, string_size); - -DECLARE_BASIC_FETCH_FUNCS(bitfield); -#define fetch_bitfield_string NULL -#define fetch_bitfield_string_size NULL - -/* comm only makes sense as a string */ -#define fetch_comm_u8 NULL -#define fetch_comm_u16 NULL -#define fetch_comm_u32 NULL -#define fetch_comm_u64 NULL -DECLARE_FETCH_FUNC(comm, string); -DECLARE_FETCH_FUNC(comm, string_size); - -/* - * Define macro for basic types - we don't need to define s* types, because - * we have to care only about bitwidth at recording time. - */ -#define DEFINE_BASIC_FETCH_FUNCS(method) \ -DEFINE_FETCH_##method(u8) \ -DEFINE_FETCH_##method(u16) \ -DEFINE_FETCH_##method(u32) \ -DEFINE_FETCH_##method(u64) +DECLARE_BASIC_PRINT_TYPE_FUNC(symbol); /* Default (unsigned long) fetch type */ #define __DEFAULT_FETCH_TYPE(t) x##t @@ -200,8 +168,9 @@ DEFINE_FETCH_##method(u64) #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) -#define ASSIGN_FETCH_FUNC(method, type) \ - [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) +#define __ADDR_FETCH_TYPE(t) u##t +#define _ADDR_FETCH_TYPE(t) __ADDR_FETCH_TYPE(t) +#define ADDR_FETCH_TYPE _ADDR_FETCH_TYPE(BITS_PER_LONG) #define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ {.name = _name, \ @@ -210,64 +179,23 @@ DEFINE_FETCH_##method(u64) .print = PRINT_TYPE_FUNC_NAME(ptype), \ .fmt = PRINT_TYPE_FMT_NAME(ptype), \ .fmttype = _fmttype, \ - .fetch = { \ -ASSIGN_FETCH_FUNC(reg, ftype), \ -ASSIGN_FETCH_FUNC(stack, ftype), \ -ASSIGN_FETCH_FUNC(retval, ftype), \ -ASSIGN_FETCH_FUNC(comm, ftype), \ -ASSIGN_FETCH_FUNC(memory, ftype), \ -ASSIGN_FETCH_FUNC(symbol, ftype), \ -ASSIGN_FETCH_FUNC(deref, ftype), \ -ASSIGN_FETCH_FUNC(bitfield, ftype), \ -ASSIGN_FETCH_FUNC(file_offset, ftype), \ - } \ } - +#define _ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ + __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, #_fmttype) #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ - __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) + _ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, ptype) /* If ptype is an alias of atype, use this macro (show atype in format) */ #define ASSIGN_FETCH_TYPE_ALIAS(ptype, atype, ftype, sign) \ - __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #atype) + _ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, atype) #define ASSIGN_FETCH_TYPE_END {} - -#define FETCH_TYPE_STRING 0 -#define FETCH_TYPE_STRSIZE 1 +#define MAX_ARRAY_LEN 64 #ifdef CONFIG_KPROBE_EVENTS -struct symbol_cache; -unsigned long update_symbol_cache(struct symbol_cache *sc); -void free_symbol_cache(struct symbol_cache *sc); -struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); bool trace_kprobe_on_func_entry(struct trace_event_call *call); bool trace_kprobe_error_injectable(struct trace_event_call *call); #else -/* uprobes do not support symbol fetch methods */ -#define fetch_symbol_u8 NULL -#define fetch_symbol_u16 NULL -#define fetch_symbol_u32 NULL -#define fetch_symbol_u64 NULL -#define fetch_symbol_string NULL -#define fetch_symbol_string_size NULL - -struct symbol_cache { -}; -static inline unsigned long __used update_symbol_cache(struct symbol_cache *sc) -{ - return 0; -} - -static inline void __used free_symbol_cache(struct symbol_cache *sc) -{ -} - -static inline struct symbol_cache * __used -alloc_symbol_cache(const char *sym, long offset) -{ - return NULL; -} - static inline bool trace_kprobe_on_func_entry(struct trace_event_call *call) { return false; @@ -280,11 +208,13 @@ static inline bool trace_kprobe_error_injectable(struct trace_event_call *call) #endif /* CONFIG_KPROBE_EVENTS */ struct probe_arg { - struct fetch_param fetch; - struct fetch_param fetch_size; + struct fetch_insn *code; + bool dynamic;/* Dynamic array (string) is used */ unsigned int offset; /* Offset from argument entry */ + unsigned int count; /* Array count */ const char *name; /* Name of this argument */ const char *comm; /* Command of this argument */ + char *fmt; /* Format string if needed */ const struct fetch_type *type; /* Type of this argument */ }; @@ -313,12 +243,6 @@ static inline bool trace_probe_is_registered(struct trace_probe *tp) return !!(tp->flags & TP_FLAG_REGISTERED); } -static nokprobe_inline void call_fetch(struct fetch_param *fprm, - struct pt_regs *regs, void *dest) -{ - return fprm->fn(regs, fprm->data, dest); -} - /* Check the name is good for event/group/fields */ static inline bool is_good_name(const char *name) { @@ -343,67 +267,23 @@ find_event_file_link(struct trace_probe *tp, struct trace_event_file *file) return NULL; } +#define TPARG_FL_RETURN BIT(0) +#define TPARG_FL_KERNEL BIT(1) +#define TPARG_FL_FENTRY BIT(2) +#define TPARG_FL_MASK GENMASK(2, 0) + extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, - struct probe_arg *parg, bool is_return, bool is_kprobe, - const struct fetch_type *ftbl); + struct probe_arg *parg, unsigned int flags); extern int traceprobe_conflict_field_name(const char *name, struct probe_arg *args, int narg); -extern void traceprobe_update_arg(struct probe_arg *arg); +extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); extern int traceprobe_split_symbol_offset(char *symbol, long *offset); -/* Sum up total data length for dynamic arraies (strings) */ -static nokprobe_inline int -__get_data_size(struct trace_probe *tp, struct pt_regs *regs) -{ - int i, ret = 0; - u32 len; - - for (i = 0; i < tp->nr_args; i++) - if (unlikely(tp->args[i].fetch_size.fn)) { - call_fetch(&tp->args[i].fetch_size, regs, &len); - ret += len; - } - - return ret; -} - -/* Store the value of each argument */ -static nokprobe_inline void -store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, - u8 *data, int maxlen) -{ - int i; - u32 end = tp->size; - u32 *dl; /* Data (relative) location */ - - for (i = 0; i < tp->nr_args; i++) { - if (unlikely(tp->args[i].fetch_size.fn)) { - /* - * First, we set the relative location and - * maximum data length to *dl - */ - dl = (u32 *)(data + tp->args[i].offset); - *dl = make_data_rloc(maxlen, end - tp->args[i].offset); - /* Then try to fetch string or dynamic array data */ - call_fetch(&tp->args[i].fetch, regs, dl); - /* Reduce maximum length */ - end += get_rloc_len(*dl); - maxlen -= get_rloc_len(*dl); - /* Trick here, convert data_rloc to data_loc */ - *dl = convert_rloc_to_loc(*dl, - ent_size + tp->args[i].offset); - } else - /* Just fetching data normally */ - call_fetch(&tp->args[i].fetch, regs, - data + tp->args[i].offset); - } -} - -extern int set_print_fmt(struct trace_probe *tp, bool is_return); +extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return); #ifdef CONFIG_PERF_EVENTS extern struct trace_event_call * @@ -412,6 +292,9 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, extern void destroy_local_trace_kprobe(struct trace_event_call *event_call); extern struct trace_event_call * -create_local_trace_uprobe(char *name, unsigned long offs, bool is_return); +create_local_trace_uprobe(char *name, unsigned long offs, + unsigned long ref_ctr_offset, bool is_return); extern void destroy_local_trace_uprobe(struct trace_event_call *event_call); #endif +extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, + size_t offset, struct trace_probe *tp); diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h new file mode 100644 index 000000000000..5c56afc17cf8 --- /dev/null +++ b/kernel/trace/trace_probe_tmpl.h @@ -0,0 +1,216 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Traceprobe fetch helper inlines + */ + +static nokprobe_inline void +fetch_store_raw(unsigned long val, struct fetch_insn *code, void *buf) +{ + switch (code->size) { + case 1: + *(u8 *)buf = (u8)val; + break; + case 2: + *(u16 *)buf = (u16)val; + break; + case 4: + *(u32 *)buf = (u32)val; + break; + case 8: + //TBD: 32bit signed + *(u64 *)buf = (u64)val; + break; + default: + *(unsigned long *)buf = val; + } +} + +static nokprobe_inline void +fetch_apply_bitfield(struct fetch_insn *code, void *buf) +{ + switch (code->basesize) { + case 1: + *(u8 *)buf <<= code->lshift; + *(u8 *)buf >>= code->rshift; + break; + case 2: + *(u16 *)buf <<= code->lshift; + *(u16 *)buf >>= code->rshift; + break; + case 4: + *(u32 *)buf <<= code->lshift; + *(u32 *)buf >>= code->rshift; + break; + case 8: + *(u64 *)buf <<= code->lshift; + *(u64 *)buf >>= code->rshift; + break; + } +} + +/* + * These functions must be defined for each callsite. + * Return consumed dynamic data size (>= 0), or error (< 0). + * If dest is NULL, don't store result and return required dynamic data size. + */ +static int +process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, + void *dest, void *base); +static nokprobe_inline int fetch_store_strlen(unsigned long addr); +static nokprobe_inline int +fetch_store_string(unsigned long addr, void *dest, void *base); +static nokprobe_inline int +probe_mem_read(void *dest, void *src, size_t size); + +/* From the 2nd stage, routine is same */ +static nokprobe_inline int +process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val, + void *dest, void *base) +{ + struct fetch_insn *s3 = NULL; + int total = 0, ret = 0, i = 0; + u32 loc = 0; + unsigned long lval = val; + +stage2: + /* 2nd stage: dereference memory if needed */ + while (code->op == FETCH_OP_DEREF) { + lval = val; + ret = probe_mem_read(&val, (void *)val + code->offset, + sizeof(val)); + if (ret) + return ret; + code++; + } + + s3 = code; +stage3: + /* 3rd stage: store value to buffer */ + if (unlikely(!dest)) { + if (code->op == FETCH_OP_ST_STRING) { + ret += fetch_store_strlen(val + code->offset); + code++; + goto array; + } else + return -EILSEQ; + } + + switch (code->op) { + case FETCH_OP_ST_RAW: + fetch_store_raw(val, code, dest); + break; + case FETCH_OP_ST_MEM: + probe_mem_read(dest, (void *)val + code->offset, code->size); + break; + case FETCH_OP_ST_STRING: + loc = *(u32 *)dest; + ret = fetch_store_string(val + code->offset, dest, base); + break; + default: + return -EILSEQ; + } + code++; + + /* 4th stage: modify stored value if needed */ + if (code->op == FETCH_OP_MOD_BF) { + fetch_apply_bitfield(code, dest); + code++; + } + +array: + /* the last stage: Loop on array */ + if (code->op == FETCH_OP_LP_ARRAY) { + total += ret; + if (++i < code->param) { + code = s3; + if (s3->op != FETCH_OP_ST_STRING) { + dest += s3->size; + val += s3->size; + goto stage3; + } + code--; + val = lval + sizeof(char *); + if (dest) { + dest += sizeof(u32); + *(u32 *)dest = update_data_loc(loc, ret); + } + goto stage2; + } + code++; + ret = total; + } + + return code->op == FETCH_OP_END ? ret : -EILSEQ; +} + +/* Sum up total data length for dynamic arraies (strings) */ +static nokprobe_inline int +__get_data_size(struct trace_probe *tp, struct pt_regs *regs) +{ + struct probe_arg *arg; + int i, len, ret = 0; + + for (i = 0; i < tp->nr_args; i++) { + arg = tp->args + i; + if (unlikely(arg->dynamic)) { + len = process_fetch_insn(arg->code, regs, NULL, NULL); + if (len > 0) + ret += len; + } + } + + return ret; +} + +/* Store the value of each argument */ +static nokprobe_inline void +store_trace_args(void *data, struct trace_probe *tp, struct pt_regs *regs, + int header_size, int maxlen) +{ + struct probe_arg *arg; + void *base = data - header_size; + void *dyndata = data + tp->size; + u32 *dl; /* Data location */ + int ret, i; + + for (i = 0; i < tp->nr_args; i++) { + arg = tp->args + i; + dl = data + arg->offset; + /* Point the dynamic data area if needed */ + if (unlikely(arg->dynamic)) + *dl = make_data_loc(maxlen, dyndata - base); + ret = process_fetch_insn(arg->code, regs, dl, base); + if (unlikely(ret < 0 && arg->dynamic)) + *dl = make_data_loc(0, dyndata - base); + else + dyndata += ret; + } +} + +static inline int +print_probe_args(struct trace_seq *s, struct probe_arg *args, int nr_args, + u8 *data, void *field) +{ + void *p; + int i, j; + + for (i = 0; i < nr_args; i++) { + struct probe_arg *a = args + i; + + trace_seq_printf(s, " %s=", a->name); + if (likely(!a->count)) { + if (!a->type->print(s, data + a->offset, field)) + return -ENOMEM; + continue; + } + trace_seq_putc(s, '{'); + p = data + a->offset; + for (j = 0; j < a->count; j++) { + if (!a->type->print(s, p, field)) + return -ENOMEM; + trace_seq_putc(s, j == a->count - 1 ? '}' : ','); + p += a->type->size; + } + } + return 0; +} diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 4237eba4ef20..2b0d1ee3241c 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -111,7 +111,7 @@ check_stack(unsigned long ip, unsigned long *stack) stack_trace_max_size = this_size; stack_trace_max.nr_entries = 0; - stack_trace_max.skip = 3; + stack_trace_max.skip = 0; save_stack_trace(&stack_trace_max); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index e696667da29a..31ea48eceda1 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -15,6 +15,7 @@ #include <linux/rculist.h> #include "trace_probe.h" +#include "trace_probe_tmpl.h" #define UPROBE_EVENT_SYSTEM "uprobes" @@ -47,6 +48,7 @@ struct trace_uprobe { struct inode *inode; char *filename; unsigned long offset; + unsigned long ref_ctr_offset; unsigned long nhit; struct trace_probe tp; }; @@ -98,74 +100,52 @@ static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n) /* * Uprobes-specific fetch functions */ -#define DEFINE_FETCH_stack(type) \ -static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \ - void *offset, void *dest) \ -{ \ - *(type *)dest = (type)get_user_stack_nth(regs, \ - ((unsigned long)offset)); \ -} -DEFINE_BASIC_FETCH_FUNCS(stack) -/* No string on the stack entry */ -#define fetch_stack_string NULL -#define fetch_stack_string_size NULL - -#define DEFINE_FETCH_memory(type) \ -static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \ - void *addr, void *dest) \ -{ \ - type retval; \ - void __user *vaddr = (void __force __user *) addr; \ - \ - if (copy_from_user(&retval, vaddr, sizeof(type))) \ - *(type *)dest = 0; \ - else \ - *(type *) dest = retval; \ +static nokprobe_inline int +probe_mem_read(void *dest, void *src, size_t size) +{ + void __user *vaddr = (void __force __user *)src; + + return copy_from_user(dest, vaddr, size) ? -EFAULT : 0; } -DEFINE_BASIC_FETCH_FUNCS(memory) /* * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max * length and relative data location. */ -static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, - void *addr, void *dest) +static nokprobe_inline int +fetch_store_string(unsigned long addr, void *dest, void *base) { long ret; - u32 rloc = *(u32 *)dest; - int maxlen = get_rloc_len(rloc); - u8 *dst = get_rloc_data(dest); + u32 loc = *(u32 *)dest; + int maxlen = get_loc_len(loc); + u8 *dst = get_loc_data(dest, base); void __user *src = (void __force __user *) addr; - if (!maxlen) - return; + if (unlikely(!maxlen)) + return -ENOMEM; ret = strncpy_from_user(dst, src, maxlen); - if (ret == maxlen) - dst[--ret] = '\0'; - - if (ret < 0) { /* Failed to fetch string */ - ((u8 *)get_rloc_data(dest))[0] = '\0'; - *(u32 *)dest = make_data_rloc(0, get_rloc_offs(rloc)); - } else { - *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(rloc)); + if (ret >= 0) { + if (ret == maxlen) + dst[ret - 1] = '\0'; + *(u32 *)dest = make_data_loc(ret, (void *)dst - base); } + + return ret; } -static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, - void *addr, void *dest) +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +fetch_store_strlen(unsigned long addr) { int len; void __user *vaddr = (void __force __user *) addr; len = strnlen_user(vaddr, MAX_STRING_SIZE); - if (len == 0 || len > MAX_STRING_SIZE) /* Failed to check length */ - *(u32 *)dest = 0; - else - *(u32 *)dest = len; + return (len > MAX_STRING_SIZE) ? 0 : len; } -static unsigned long translate_user_vaddr(void *file_offset) +static unsigned long translate_user_vaddr(unsigned long file_offset) { unsigned long base_addr; struct uprobe_dispatch_data *udd; @@ -173,44 +153,44 @@ static unsigned long translate_user_vaddr(void *file_offset) udd = (void *) current->utask->vaddr; base_addr = udd->bp_addr - udd->tu->offset; - return base_addr + (unsigned long)file_offset; + return base_addr + file_offset; } -#define DEFINE_FETCH_file_offset(type) \ -static void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs, \ - void *offset, void *dest)\ -{ \ - void *vaddr = (void *)translate_user_vaddr(offset); \ - \ - FETCH_FUNC_NAME(memory, type)(regs, vaddr, dest); \ +/* Note that we don't verify it, since the code does not come from user space */ +static int +process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, + void *base) +{ + unsigned long val; + + /* 1st stage: get value from context */ + switch (code->op) { + case FETCH_OP_REG: + val = regs_get_register(regs, code->param); + break; + case FETCH_OP_STACK: + val = get_user_stack_nth(regs, code->param); + break; + case FETCH_OP_STACKP: + val = user_stack_pointer(regs); + break; + case FETCH_OP_RETVAL: + val = regs_return_value(regs); + break; + case FETCH_OP_IMM: + val = code->immediate; + break; + case FETCH_OP_FOFFS: + val = translate_user_vaddr(code->immediate); + break; + default: + return -EILSEQ; + } + code++; + + return process_fetch_insn_bottom(code, val, dest, base); } -DEFINE_BASIC_FETCH_FUNCS(file_offset) -DEFINE_FETCH_file_offset(string) -DEFINE_FETCH_file_offset(string_size) - -/* Fetch type information table */ -static const struct fetch_type uprobes_fetch_type_table[] = { - /* Special types */ - [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, - sizeof(u32), 1, "__data_loc char[]"), - [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, - string_size, sizeof(u32), 0, "u32"), - /* Basic types */ - ASSIGN_FETCH_TYPE(u8, u8, 0), - ASSIGN_FETCH_TYPE(u16, u16, 0), - ASSIGN_FETCH_TYPE(u32, u32, 0), - ASSIGN_FETCH_TYPE(u64, u64, 0), - ASSIGN_FETCH_TYPE(s8, u8, 1), - ASSIGN_FETCH_TYPE(s16, u16, 1), - ASSIGN_FETCH_TYPE(s32, u32, 1), - ASSIGN_FETCH_TYPE(s64, u64, 1), - ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0), - ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), - ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), - ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), - - ASSIGN_FETCH_TYPE_END -}; +NOKPROBE_SYMBOL(process_fetch_insn) static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) { @@ -311,6 +291,35 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu) return 0; } +/* + * Uprobe with multiple reference counter is not allowed. i.e. + * If inode and offset matches, reference counter offset *must* + * match as well. Though, there is one exception: If user is + * replacing old trace_uprobe with new one(same group/event), + * then we allow same uprobe with new reference counter as far + * as the new one does not conflict with any other existing + * ones. + */ +static struct trace_uprobe *find_old_trace_uprobe(struct trace_uprobe *new) +{ + struct trace_uprobe *tmp, *old = NULL; + struct inode *new_inode = d_real_inode(new->path.dentry); + + old = find_probe_event(trace_event_name(&new->tp.call), + new->tp.call.class->system); + + list_for_each_entry(tmp, &uprobe_list, list) { + if ((old ? old != tmp : true) && + new_inode == d_real_inode(tmp->path.dentry) && + new->offset == tmp->offset && + new->ref_ctr_offset != tmp->ref_ctr_offset) { + pr_warn("Reference counter offset mismatch."); + return ERR_PTR(-EINVAL); + } + } + return old; +} + /* Register a trace_uprobe and probe_event */ static int register_trace_uprobe(struct trace_uprobe *tu) { @@ -320,8 +329,12 @@ static int register_trace_uprobe(struct trace_uprobe *tu) mutex_lock(&uprobe_lock); /* register as an event */ - old_tu = find_probe_event(trace_event_name(&tu->tp.call), - tu->tp.call.class->system); + old_tu = find_old_trace_uprobe(tu); + if (IS_ERR(old_tu)) { + ret = PTR_ERR(old_tu); + goto end; + } + if (old_tu) { /* delete old event */ ret = unregister_trace_uprobe(old_tu); @@ -352,10 +365,10 @@ end: static int create_trace_uprobe(int argc, char **argv) { struct trace_uprobe *tu; - char *arg, *event, *group, *filename; + char *arg, *event, *group, *filename, *rctr, *rctr_end; char buf[MAX_EVENT_NAME_LEN]; struct path path; - unsigned long offset; + unsigned long offset, ref_ctr_offset; bool is_delete, is_return; int i, ret; @@ -364,6 +377,7 @@ static int create_trace_uprobe(int argc, char **argv) is_return = false; event = NULL; group = NULL; + ref_ctr_offset = 0; /* argc must be >= 1 */ if (argv[0][0] == '-') @@ -438,6 +452,26 @@ static int create_trace_uprobe(int argc, char **argv) goto fail_address_parse; } + /* Parse reference counter offset if specified. */ + rctr = strchr(arg, '('); + if (rctr) { + rctr_end = strchr(rctr, ')'); + if (rctr > rctr_end || *(rctr_end + 1) != 0) { + ret = -EINVAL; + pr_info("Invalid reference counter offset.\n"); + goto fail_address_parse; + } + + *rctr++ = '\0'; + *rctr_end = '\0'; + ret = kstrtoul(rctr, 0, &ref_ctr_offset); + if (ret) { + pr_info("Invalid reference counter offset.\n"); + goto fail_address_parse; + } + } + + /* Parse uprobe offset. */ ret = kstrtoul(arg, 0, &offset); if (ret) goto fail_address_parse; @@ -472,6 +506,7 @@ static int create_trace_uprobe(int argc, char **argv) goto fail_address_parse; } tu->offset = offset; + tu->ref_ctr_offset = ref_ctr_offset; tu->path = path; tu->filename = kstrdup(filename, GFP_KERNEL); @@ -522,8 +557,7 @@ static int create_trace_uprobe(int argc, char **argv) /* Parse fetch argument */ ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, - is_return, false, - uprobes_fetch_type_table); + is_return ? TPARG_FL_RETURN : 0); if (ret) { pr_info("Parse error at argument[%d]. (%d)\n", i, ret); goto error; @@ -590,6 +624,9 @@ static int probes_seq_show(struct seq_file *m, void *v) trace_event_name(&tu->tp.call), tu->filename, (int)(sizeof(void *) * 2), tu->offset); + if (tu->ref_ctr_offset) + seq_printf(m, "(0x%lx)", tu->ref_ctr_offset); + for (i = 0; i < tu->tp.nr_args; i++) seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); @@ -833,7 +870,6 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e struct trace_seq *s = &iter->seq; struct trace_uprobe *tu; u8 *data; - int i; entry = (struct uprobe_trace_entry_head *)iter->ent; tu = container_of(event, struct trace_uprobe, tp.call.event); @@ -850,12 +886,8 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e data = DATAOF_TRACE_ENTRY(entry, false); } - for (i = 0; i < tu->tp.nr_args; i++) { - struct probe_arg *parg = &tu->tp.args[i]; - - if (!parg->type->print(s, parg->name, data + parg->offset, entry)) - goto out; - } + if (print_probe_args(s, tu->tp.args, tu->tp.nr_args, data, entry) < 0) + goto out; trace_seq_putc(s, '\n'); @@ -905,7 +937,13 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, tu->consumer.filter = filter; tu->inode = d_real_inode(tu->path.dentry); - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + if (tu->ref_ctr_offset) { + ret = uprobe_register_refctr(tu->inode, tu->offset, + tu->ref_ctr_offset, &tu->consumer); + } else { + ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + } + if (ret) goto err_buffer; @@ -958,7 +996,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file) static int uprobe_event_define_fields(struct trace_event_call *event_call) { - int ret, i, size; + int ret, size; struct uprobe_trace_entry_head field; struct trace_uprobe *tu = event_call->data; @@ -970,19 +1008,8 @@ static int uprobe_event_define_fields(struct trace_event_call *event_call) DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0); size = SIZEOF_TRACE_ENTRY(false); } - /* Set argument names as fields */ - for (i = 0; i < tu->tp.nr_args; i++) { - struct probe_arg *parg = &tu->tp.args[i]; - - ret = trace_define_field(event_call, parg->type->fmttype, - parg->name, size + parg->offset, - parg->type->size, parg->type->is_signed, - FILTER_OTHER); - if (ret) - return ret; - } - return 0; + return traceprobe_define_arg_fields(event_call, size, &tu->tp); } #ifdef CONFIG_PERF_EVENTS @@ -1233,7 +1260,7 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); ucb = uprobe_buffer_get(); - store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); + store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize); if (tu->tp.flags & TP_FLAG_TRACE) ret |= uprobe_trace_func(tu, regs, ucb, dsize); @@ -1268,7 +1295,7 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); ucb = uprobe_buffer_get(); - store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); + store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize); if (tu->tp.flags & TP_FLAG_TRACE) uretprobe_trace_func(tu, func, regs, ucb, dsize); @@ -1304,7 +1331,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) init_trace_event_call(tu, call); - if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) + if (traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) return -ENOMEM; ret = register_trace_event(&call->event); @@ -1340,7 +1367,8 @@ static int unregister_uprobe_event(struct trace_uprobe *tu) #ifdef CONFIG_PERF_EVENTS struct trace_event_call * -create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) +create_local_trace_uprobe(char *name, unsigned long offs, + unsigned long ref_ctr_offset, bool is_return) { struct trace_uprobe *tu; struct path path; @@ -1372,10 +1400,11 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) tu->offset = offs; tu->path = path; + tu->ref_ctr_offset = ref_ctr_offset; tu->filename = kstrdup(name, GFP_KERNEL); init_trace_event_call(tu, &tu->tp.call); - if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) { + if (traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) { ret = -ENOMEM; goto error; } |