diff options
Diffstat (limited to 'kernel')
91 files changed, 2216 insertions, 735 deletions
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c3223e0db2f5..eacb701bc2be 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -8526,6 +8526,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_SOCK_OPS: return BTF_KFUNC_HOOK_CGROUP; case BPF_PROG_TYPE_SCHED_ACT: return BTF_KFUNC_HOOK_SCHED_ACT; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 774accbd4a22..67e8a2fc1a99 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -33,8 +33,8 @@ #include <trace/events/xdp.h> #include <linux/btf_ids.h> -#include <linux/netdevice.h> /* netif_receive_skb_list */ -#include <linux/etherdevice.h> /* eth_type_trans */ +#include <linux/netdevice.h> +#include <net/gro.h> /* General idea: XDP packets getting XDP redirected to another CPU, * will maximum be stored/queued for one driver ->poll() call. It is @@ -68,6 +68,7 @@ struct bpf_cpu_map_entry { struct bpf_cpumap_val value; struct bpf_prog *prog; + struct gro_node gro; struct completion kthread_running; struct rcu_work free_work; @@ -133,22 +134,23 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring) } } -static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, - struct list_head *listp, - struct xdp_cpumap_stats *stats) +static u32 cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, + void **skbs, u32 skb_n, + struct xdp_cpumap_stats *stats) { - struct sk_buff *skb, *tmp; struct xdp_buff xdp; - u32 act; + u32 act, pass = 0; int err; - list_for_each_entry_safe(skb, tmp, listp, list) { + for (u32 i = 0; i < skb_n; i++) { + struct sk_buff *skb = skbs[i]; + act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog); switch (act) { case XDP_PASS: + skbs[pass++] = skb; break; case XDP_REDIRECT: - skb_list_del_init(skb); err = xdp_do_generic_redirect(skb->dev, skb, &xdp, rcpu->prog); if (unlikely(err)) { @@ -157,7 +159,7 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, } else { stats->redirect++; } - return; + break; default: bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act); fallthrough; @@ -165,12 +167,15 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, trace_xdp_exception(skb->dev, rcpu->prog, act); fallthrough; case XDP_DROP: - skb_list_del_init(skb); - kfree_skb(skb); + napi_consume_skb(skb, true); stats->drop++; - return; + break; } } + + stats->pass += pass; + + return pass; } static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, @@ -204,7 +209,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, stats->drop++; } else { frames[nframes++] = xdpf; - stats->pass++; } break; case XDP_REDIRECT: @@ -228,43 +232,65 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, } xdp_clear_return_frame_no_direct(); + stats->pass += nframes; return nframes; } #define CPUMAP_BATCH 8 -static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, - int xdp_n, struct xdp_cpumap_stats *stats, - struct list_head *list) +struct cpu_map_ret { + u32 xdp_n; + u32 skb_n; +}; + +static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, + void **skbs, struct cpu_map_ret *ret, + struct xdp_cpumap_stats *stats) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; - int nframes; if (!rcpu->prog) - return xdp_n; + goto out; - rcu_read_lock_bh(); + rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); - nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats); + ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats); + if (unlikely(ret->skb_n)) + ret->skb_n = cpu_map_bpf_prog_run_skb(rcpu, skbs, ret->skb_n, + stats); if (stats->redirect) xdp_do_flush(); - if (unlikely(!list_empty(list))) - cpu_map_bpf_prog_run_skb(rcpu, list, stats); - bpf_net_ctx_clear(bpf_net_ctx); - rcu_read_unlock_bh(); /* resched point, may call do_softirq() */ + rcu_read_unlock(); - return nframes; +out: + if (unlikely(ret->skb_n) && ret->xdp_n) + memmove(&skbs[ret->xdp_n], skbs, ret->skb_n * sizeof(*skbs)); +} + +static void cpu_map_gro_flush(struct bpf_cpu_map_entry *rcpu, bool empty) +{ + /* + * If the ring is not empty, there'll be a new iteration soon, and we + * only need to do a full flush if a tick is long (> 1 ms). + * If the ring is empty, to not hold GRO packets in the stack for too + * long, do a full flush. + * This is equivalent to how NAPI decides whether to perform a full + * flush. + */ + gro_flush(&rcpu->gro, !empty && HZ >= 1000); + gro_normal_list(&rcpu->gro); } static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; unsigned long last_qs = jiffies; + u32 packets = 0; complete(&rcpu->kthread_running); set_current_state(TASK_INTERRUPTIBLE); @@ -277,11 +303,11 @@ static int cpu_map_kthread_run(void *data) while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { struct xdp_cpumap_stats stats = {}; /* zero stats */ unsigned int kmem_alloc_drops = 0, sched = 0; - gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; - int i, n, m, nframes, xdp_n; + struct cpu_map_ret ret = { }; void *frames[CPUMAP_BATCH]; void *skbs[CPUMAP_BATCH]; - LIST_HEAD(list); + u32 i, n, m; + bool empty; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -306,7 +332,7 @@ static int cpu_map_kthread_run(void *data) */ n = __ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); - for (i = 0, xdp_n = 0; i < n; i++) { + for (i = 0; i < n; i++) { void *f = frames[i]; struct page *page; @@ -314,11 +340,11 @@ static int cpu_map_kthread_run(void *data) struct sk_buff *skb = f; __ptr_clear_bit(0, &skb); - list_add_tail(&skb->list, &list); + skbs[ret.skb_n++] = skb; continue; } - frames[xdp_n++] = f; + frames[ret.xdp_n++] = f; page = virt_to_page(f); /* Bring struct page memory area to curr CPU. Read by @@ -328,40 +354,51 @@ static int cpu_map_kthread_run(void *data) prefetchw(page); } + local_bh_disable(); + /* Support running another XDP prog on this CPU */ - nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list); - if (nframes) { - m = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, - gfp, nframes, skbs); - if (unlikely(m == 0)) { - for (i = 0; i < nframes; i++) - skbs[i] = NULL; /* effect: xdp_return_frame */ - kmem_alloc_drops += nframes; - } + cpu_map_bpf_prog_run(rcpu, frames, skbs, &ret, &stats); + if (!ret.xdp_n) + goto stats; + + m = napi_skb_cache_get_bulk(skbs, ret.xdp_n); + if (unlikely(m < ret.xdp_n)) { + for (i = m; i < ret.xdp_n; i++) + xdp_return_frame(frames[i]); + + if (ret.skb_n) + memmove(&skbs[m], &skbs[ret.xdp_n], + ret.skb_n * sizeof(*skbs)); + + kmem_alloc_drops += ret.xdp_n - m; + ret.xdp_n = m; } - local_bh_disable(); - for (i = 0; i < nframes; i++) { + for (i = 0; i < ret.xdp_n; i++) { struct xdp_frame *xdpf = frames[i]; - struct sk_buff *skb = skbs[i]; - skb = __xdp_build_skb_from_frame(xdpf, skb, - xdpf->dev_rx); - if (!skb) { - xdp_return_frame(xdpf); - continue; - } - - list_add_tail(&skb->list, &list); + /* Can fail only when !skb -- already handled above */ + __xdp_build_skb_from_frame(xdpf, skbs[i], xdpf->dev_rx); } +stats: /* Feedback loop via tracepoint. * NB: keep before recv to allow measuring enqueue/dequeue latency. */ trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops, sched, &stats); - netif_receive_skb_list(&list); + for (i = 0; i < ret.xdp_n + ret.skb_n; i++) + gro_receive_skb(&rcpu->gro, skbs[i]); + + /* Flush either every 64 packets or in case of empty ring */ + packets += n; + empty = __ptr_ring_empty(rcpu->queue); + if (packets >= NAPI_POLL_WEIGHT || empty) { + cpu_map_gro_flush(rcpu, empty); + packets = 0; + } + local_bh_enable(); /* resched point, may call do_softirq() */ } __set_current_state(TASK_RUNNING); @@ -430,6 +467,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, rcpu->cpu = cpu; rcpu->map_id = map->id; rcpu->value.qsize = value->qsize; + gro_init(&rcpu->gro); if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd)) goto free_ptr_ring; @@ -458,6 +496,7 @@ free_prog: if (rcpu->prog) bpf_prog_put(rcpu->prog); free_ptr_ring: + gro_cleanup(&rcpu->gro); ptr_ring_cleanup(rcpu->queue, NULL); free_queue: kfree(rcpu->queue); @@ -487,6 +526,7 @@ static void __cpu_map_entry_free(struct work_struct *work) if (rcpu->prog) bpf_prog_put(rcpu->prog); + gro_cleanup(&rcpu->gro); /* The queue should be empty at this point */ __cpu_map_ring_cleanup(rcpu->queue); ptr_ring_cleanup(rcpu->queue, NULL); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 1a4fec330eaa..42ae8d595c2c 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -25,6 +25,7 @@ #include <linux/rhashtable.h> #include <linux/rtnetlink.h> #include <linux/rwsem.h> +#include <net/netdev_lock.h> #include <net/xdp.h> /* Protects offdevs, members of bpf_offload_netdev and offload members @@ -528,13 +529,14 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&offmap->map, attr); - rtnl_lock(); - down_write(&bpf_devs_lock); offmap->netdev = __dev_get_by_index(net, attr->map_ifindex); err = bpf_dev_offload_check(offmap->netdev); if (err) - goto err_unlock; + goto err_unlock_rtnl; + + netdev_lock_ops(offmap->netdev); + down_write(&bpf_devs_lock); ondev = bpf_offload_find_netdev(offmap->netdev); if (!ondev) { @@ -548,12 +550,15 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) list_add_tail(&offmap->offloads, &ondev->maps); up_write(&bpf_devs_lock); + netdev_unlock_ops(offmap->netdev); rtnl_unlock(); return &offmap->map; err_unlock: up_write(&bpf_devs_lock); + netdev_unlock_ops(offmap->netdev); +err_unlock_rtnl: rtnl_unlock(); bpf_map_area_free(offmap); return ERR_PTR(err); diff --git a/kernel/capability.c b/kernel/capability.c index e089d2628c29..829f49ae07b9 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -286,22 +286,6 @@ bool has_ns_capability(struct task_struct *t, } /** - * has_capability - Does a task have a capability in init_user_ns - * @t: The task in question - * @cap: The capability to be tested for - * - * Return true if the specified task has the given superior capability - * currently in effect to the initial user namespace, false if not. - * - * Note that this does not set PF_SUPERPRIV on the task. - */ -bool has_capability(struct task_struct *t, int cap) -{ - return has_ns_capability(t, &init_user_ns, cap); -} -EXPORT_SYMBOL(has_capability); - -/** * has_ns_capability_noaudit - Does a task have a capability (unaudited) * in a specific user ns. * @t: The task in question diff --git a/kernel/cpu.c b/kernel/cpu.c index ad755db29efd..b08bb34b1718 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -526,6 +526,7 @@ void lockdep_assert_cpus_held(void) percpu_rwsem_assert_held(&cpu_hotplug_lock); } +EXPORT_SYMBOL_GPL(lockdep_assert_cpus_held); #ifdef CONFIG_LOCKDEP int lockdep_is_cpus_held(void) diff --git a/kernel/events/core.c b/kernel/events/core.c index 5d2221ec6d7c..0bb21659e252 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4886,7 +4886,7 @@ find_get_context(struct task_struct *task, struct perf_event *event) if (!task) { /* Must be root to operate on a CPU event: */ - err = perf_allow_cpu(&event->attr); + err = perf_allow_cpu(); if (err) return ERR_PTR(err); @@ -12848,7 +12848,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, } /* privileged levels capture (kernel, hv): check permissions */ if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) { - ret = perf_allow_kernel(attr); + ret = perf_allow_kernel(); if (ret) return ret; } @@ -13105,12 +13105,12 @@ SYSCALL_DEFINE5(perf_event_open, return err; /* Do we allow access to perf_event_open(2) ? */ - err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); + err = security_perf_event_open(PERF_SECURITY_OPEN); if (err) return err; if (!attr.exclude_kernel) { - err = perf_allow_kernel(&attr); + err = perf_allow_kernel(); if (err) return err; } @@ -13130,7 +13130,7 @@ SYSCALL_DEFINE5(perf_event_open, /* Only privileged users can get physical addresses */ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) { - err = perf_allow_kernel(&attr); + err = perf_allow_kernel(); if (err) return err; } @@ -13969,12 +13969,12 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event) return &event->attr; } -int perf_allow_kernel(struct perf_event_attr *attr) +int perf_allow_kernel(void) { if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) return -EACCES; - return security_perf_event_open(attr, PERF_SECURITY_KERNEL); + return security_perf_event_open(PERF_SECURITY_KERNEL); } EXPORT_SYMBOL_GPL(perf_allow_kernel); diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 875f25ed6f71..3f02a0e45254 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -47,10 +47,6 @@ config GENERIC_IRQ_INJECTION config HARDIRQS_SW_RESEND bool -# Edge style eoi based handler (cell) -config IRQ_EDGE_EOI_HANDLER - bool - # Generic configurable interrupt chip implementation config GENERIC_IRQ_CHIP bool @@ -96,6 +92,7 @@ config GENERIC_MSI_IRQ bool select IRQ_DOMAIN_HIERARCHY +# irqchip drivers should select this if they call iommu_dma_prepare_msi() config IRQ_MSI_IOMMU bool diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 0ff987d3a799..36cf1b09cc84 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -838,53 +838,6 @@ out_unlock: } EXPORT_SYMBOL(handle_edge_irq); -#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER -/** - * handle_edge_eoi_irq - edge eoi type IRQ handler - * @desc: the interrupt description structure for this irq - * - * Similar as the above handle_edge_irq, but using eoi and w/o the - * mask/unmask logic. - */ -void handle_edge_eoi_irq(struct irq_desc *desc) -{ - struct irq_chip *chip = irq_desc_get_chip(desc); - - raw_spin_lock(&desc->lock); - - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - - if (!irq_may_run(desc)) { - desc->istate |= IRQS_PENDING; - goto out_eoi; - } - - /* - * If its disabled or no action available then mask it and get - * out of here. - */ - if (irqd_irq_disabled(&desc->irq_data) || !desc->action) { - desc->istate |= IRQS_PENDING; - goto out_eoi; - } - - kstat_incr_irqs_this_cpu(desc); - - do { - if (unlikely(!desc->action)) - goto out_eoi; - - handle_irq_event(desc); - - } while ((desc->istate & IRQS_PENDING) && - !irqd_irq_disabled(&desc->irq_data)); - -out_eoi: - chip->irq_eoi(&desc->irq_data); - raw_spin_unlock(&desc->lock); -} -#endif - /** * handle_percpu_irq - Per CPU local irq handler * @desc: the interrupt description structure for this irq diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index ad4a85e31160..5c8d43cdb0a3 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -270,11 +270,16 @@ fail: return ret; } +void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg) +{ + *msg = entry->msg; +} + void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) { struct msi_desc *entry = irq_get_msi_desc(irq); - *msg = entry->msg; + __get_cached_msi_msg(entry, msg); } EXPORT_SYMBOL_GPL(get_cached_msi_msg); @@ -338,30 +343,26 @@ int msi_setup_device_data(struct device *dev) } /** - * __msi_lock_descs - Lock the MSI descriptor storage of a device + * msi_lock_descs - Lock the MSI descriptor storage of a device * @dev: Device to operate on - * - * Internal function for guard(msi_descs_lock). Don't use in code. */ -void __msi_lock_descs(struct device *dev) +void msi_lock_descs(struct device *dev) { mutex_lock(&dev->msi.data->mutex); } -EXPORT_SYMBOL_GPL(__msi_lock_descs); +EXPORT_SYMBOL_GPL(msi_lock_descs); /** - * __msi_unlock_descs - Unlock the MSI descriptor storage of a device + * msi_unlock_descs - Unlock the MSI descriptor storage of a device * @dev: Device to operate on - * - * Internal function for guard(msi_descs_lock). Don't use in code. */ -void __msi_unlock_descs(struct device *dev) +void msi_unlock_descs(struct device *dev) { /* Invalidate the index which was cached by the iterator */ dev->msi.data->__iter_idx = MSI_XA_MAX_INDEX; mutex_unlock(&dev->msi.data->mutex); } -EXPORT_SYMBOL_GPL(__msi_unlock_descs); +EXPORT_SYMBOL_GPL(msi_unlock_descs); static struct msi_desc *msi_find_desc(struct msi_device_data *md, unsigned int domid, enum msi_desc_filter filter) @@ -447,6 +448,7 @@ EXPORT_SYMBOL_GPL(msi_next_desc); unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigned int index) { struct msi_desc *desc; + unsigned int ret = 0; bool pcimsi = false; struct xarray *xa; @@ -460,7 +462,7 @@ unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigne if (dev_is_pci(dev) && domid == MSI_DEFAULT_DOMAIN) pcimsi = to_pci_dev(dev)->msi_enabled; - guard(msi_descs_lock)(dev); + msi_lock_descs(dev); xa = &dev->msi.data->__domains[domid].store; desc = xa_load(xa, pcimsi ? 0 : index); if (desc && desc->irq) { @@ -469,12 +471,16 @@ unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigne * PCI-MSIX and platform MSI use a descriptor per * interrupt. */ - if (!pcimsi) - return desc->irq; - if (index < desc->nvec_used) - return desc->irq + index; + if (pcimsi) { + if (index < desc->nvec_used) + ret = desc->irq + index; + } else { + ret = desc->irq; + } } - return 0; + + msi_unlock_descs(dev); + return ret; } EXPORT_SYMBOL_GPL(msi_domain_get_virq); @@ -992,8 +998,9 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, void *chip_data) { struct irq_domain *domain, *parent = dev->msi.domain; + struct fwnode_handle *fwnode, *fwnalloced = NULL; + struct msi_domain_template *bundle; const struct msi_parent_ops *pops; - struct fwnode_handle *fwnode; if (!irq_domain_is_msi_parent(parent)) return false; @@ -1001,8 +1008,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, if (domid >= MSI_MAX_DEVICE_IRQDOMAINS) return false; - struct msi_domain_template *bundle __free(kfree) = - bundle = kmemdup(template, sizeof(*bundle), GFP_KERNEL); + bundle = kmemdup(template, sizeof(*bundle), GFP_KERNEL); if (!bundle) return false; @@ -1025,36 +1031,41 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, * node as they are not guaranteed to have a fwnode. They are never * looked up and always handled in the context of the device. */ - struct fwnode_handle *fwnode_alloced __free(irq_domain_free_fwnode) = NULL; - - if (!(bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE)) - fwnode = fwnode_alloced = irq_domain_alloc_named_fwnode(bundle->name); - else + if (bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE) fwnode = dev->fwnode; + else + fwnode = fwnalloced = irq_domain_alloc_named_fwnode(bundle->name); if (!fwnode) - return false; + goto free_bundle; if (msi_setup_device_data(dev)) - return false; + goto free_fwnode; + + msi_lock_descs(dev); - guard(msi_descs_lock)(dev); if (WARN_ON_ONCE(msi_get_device_domain(dev, domid))) - return false; + goto fail; if (!pops->init_dev_msi_info(dev, parent, parent, &bundle->info)) - return false; + goto fail; domain = __msi_create_irq_domain(fwnode, &bundle->info, IRQ_DOMAIN_FLAG_MSI_DEVICE, parent); if (!domain) - return false; + goto fail; - /* @bundle and @fwnode_alloced are now in use. Prevent cleanup */ - retain_ptr(bundle); - retain_ptr(fwnode_alloced); domain->dev = dev; dev->msi.data->__domains[domid].domain = domain; + msi_unlock_descs(dev); return true; + +fail: + msi_unlock_descs(dev); +free_fwnode: + irq_domain_free_fwnode(fwnalloced); +free_bundle: + kfree(bundle); + return false; } /** @@ -1068,10 +1079,12 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid) struct msi_domain_info *info; struct irq_domain *domain; - guard(msi_descs_lock)(dev); + msi_lock_descs(dev); + domain = msi_get_device_domain(dev, domid); + if (!domain || !irq_domain_is_msi_device(domain)) - return; + goto unlock; dev->msi.data->__domains[domid].domain = NULL; info = domain->host_data; @@ -1080,6 +1093,9 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid) irq_domain_remove(domain); irq_domain_free_fwnode(fwnode); kfree(container_of(info, struct msi_domain_template, info)); + +unlock: + msi_unlock_descs(dev); } /** @@ -1095,14 +1111,16 @@ bool msi_match_device_irq_domain(struct device *dev, unsigned int domid, { struct msi_domain_info *info; struct irq_domain *domain; + bool ret = false; - guard(msi_descs_lock)(dev); + msi_lock_descs(dev); domain = msi_get_device_domain(dev, domid); if (domain && irq_domain_is_msi_device(domain)) { info = domain->host_data; - return info->bus_token == bus_token; + ret = info->bus_token == bus_token; } - return false; + msi_unlock_descs(dev); + return ret; } static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, @@ -1144,7 +1162,7 @@ static bool msi_check_reservation_mode(struct irq_domain *domain, if (!(info->flags & MSI_FLAG_MUST_REACTIVATE)) return false; - if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask) + if (info->flags & MSI_FLAG_NO_MASK) return false; /* @@ -1334,17 +1352,21 @@ static int msi_domain_alloc_locked(struct device *dev, struct msi_ctrl *ctrl) } /** - * msi_domain_alloc_irqs_range - Allocate interrupts from a MSI interrupt domain + * msi_domain_alloc_irqs_range_locked - Allocate interrupts from a MSI interrupt domain * @dev: Pointer to device struct of the device for which the interrupts * are allocated * @domid: Id of the interrupt domain to operate on * @first: First index to allocate (inclusive) * @last: Last index to allocate (inclusive) * + * Must be invoked from within a msi_lock_descs() / msi_unlock_descs() + * pair. Use this for MSI irqdomains which implement their own descriptor + * allocation/free. + * * Return: %0 on success or an error code. */ -int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, - unsigned int first, unsigned int last) +int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last) { struct msi_ctrl ctrl = { .domid = domid, @@ -1353,9 +1375,29 @@ int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, .nirqs = last + 1 - first, }; - guard(msi_descs_lock)(dev); return msi_domain_alloc_locked(dev, &ctrl); } + +/** + * msi_domain_alloc_irqs_range - Allocate interrupts from a MSI interrupt domain + * @dev: Pointer to device struct of the device for which the interrupts + * are allocated + * @domid: Id of the interrupt domain to operate on + * @first: First index to allocate (inclusive) + * @last: Last index to allocate (inclusive) + * + * Return: %0 on success or an error code. + */ +int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last) +{ + int ret; + + msi_lock_descs(dev); + ret = msi_domain_alloc_irqs_range_locked(dev, domid, first, last); + msi_unlock_descs(dev); + return ret; +} EXPORT_SYMBOL_GPL(msi_domain_alloc_irqs_range); /** @@ -1458,8 +1500,12 @@ struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, u const struct irq_affinity_desc *affdesc, union msi_instance_cookie *icookie) { - guard(msi_descs_lock)(dev); - return __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie); + struct msi_map map; + + msi_lock_descs(dev); + map = __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie); + msi_unlock_descs(dev); + return map; } /** @@ -1496,11 +1542,13 @@ int msi_device_domain_alloc_wired(struct irq_domain *domain, unsigned int hwirq, icookie.value = ((u64)type << 32) | hwirq; - guard(msi_descs_lock)(dev); + msi_lock_descs(dev); if (WARN_ON_ONCE(msi_get_device_domain(dev, domid) != domain)) map.index = -EINVAL; else map = __msi_domain_alloc_irq_at(dev, domid, MSI_ANY_INDEX, NULL, &icookie); + msi_unlock_descs(dev); + return map.index >= 0 ? map.virq : map.index; } @@ -1570,8 +1618,8 @@ static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl) * @first: First index to free (inclusive) * @last: Last index to free (inclusive) */ -static void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid, - unsigned int first, unsigned int last) +void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last) { struct msi_ctrl ctrl = { .domid = domid, @@ -1593,8 +1641,9 @@ static void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int d void msi_domain_free_irqs_range(struct device *dev, unsigned int domid, unsigned int first, unsigned int last) { - guard(msi_descs_lock)(dev); + msi_lock_descs(dev); msi_domain_free_irqs_range_locked(dev, domid, first, last); + msi_unlock_descs(dev); } EXPORT_SYMBOL_GPL(msi_domain_free_irqs_all); @@ -1624,8 +1673,9 @@ void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid) */ void msi_domain_free_irqs_all(struct device *dev, unsigned int domid) { - guard(msi_descs_lock)(dev); + msi_lock_descs(dev); msi_domain_free_irqs_all_locked(dev, domid); + msi_unlock_descs(dev); } /** @@ -1644,11 +1694,12 @@ void msi_device_domain_free_wired(struct irq_domain *domain, unsigned int virq) if (WARN_ON_ONCE(!dev || !desc || domain->bus_token != DOMAIN_BUS_WIRED_TO_MSI)) return; - guard(msi_descs_lock)(dev); - if (WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain)) - return; - msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index, - desc->msi_index); + msi_lock_descs(dev); + if (!WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain)) { + msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index, + desc->msi_index); + } + msi_unlock_descs(dev); } /** diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index c0bdc1686154..c22ad51c4317 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1013,7 +1013,7 @@ int kernel_kexec(void) error = -EBUSY; goto Restore_console; } - suspend_console(); + console_suspend_all(); error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Resume_console; @@ -1072,7 +1072,7 @@ int kernel_kexec(void) Resume_devices: dpm_resume_end(PMSG_RESTORE); Resume_console: - resume_console(); + console_resume_all(); thaw_processes(); Restore_console: pm_restore_console(); diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 0cd39954d5a1..4a0fb7978d0d 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -601,9 +601,12 @@ static int klp_add_object_nops(struct klp_patch *patch, } /* - * Add 'nop' functions which simply return to the caller to run - * the original function. The 'nop' functions are added to a - * patch to facilitate a 'replace' mode. + * Add 'nop' functions which simply return to the caller to run the + * original function. + * + * They are added only when the atomic replace mode is used and only for + * functions which are currently livepatched but are no longer included + * in the new livepatch. */ static int klp_add_nops(struct klp_patch *patch) { diff --git a/kernel/panic.c b/kernel/panic.c index d8635d5cecb2..0c55eec9e874 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -511,6 +511,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { TAINT_FLAG(AUX, 'X', ' ', true), TAINT_FLAG(RANDSTRUCT, 'T', ' ', true), TAINT_FLAG(TEST, 'N', ' ', true), + TAINT_FLAG(FWCTL, 'J', ' ', true), }; #undef TAINT_FLAG diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ca947ed32e3d..54a623680019 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -380,8 +380,7 @@ config CPU_PM config ENERGY_MODEL bool "Energy Model for devices with DVFS (CPUs, GPUs, etc)" - depends on SMP - depends on CPU_FREQ + depends on CPU_FREQ || PM_DEVFREQ help Several subsystems (thermal and/or the task scheduler for example) can leverage information about the energy consumed by devices to diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 3874f0e97651..d9b7e2b38c7a 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -161,22 +161,10 @@ static void em_debug_create_pd(struct device *dev) {} static void em_debug_remove_pd(struct device *dev) {} #endif -static void em_destroy_table_rcu(struct rcu_head *rp) -{ - struct em_perf_table __rcu *table; - - table = container_of(rp, struct em_perf_table, rcu); - kfree(table); -} - static void em_release_table_kref(struct kref *kref) { - struct em_perf_table __rcu *table; - /* It was the last owner of this table so we can free */ - table = container_of(kref, struct em_perf_table, kref); - - call_rcu(&table->rcu, em_destroy_table_rcu); + kfree_rcu(container_of(kref, struct em_perf_table, kref), rcu); } /** @@ -185,7 +173,7 @@ static void em_release_table_kref(struct kref *kref) * * No return values. */ -void em_table_free(struct em_perf_table __rcu *table) +void em_table_free(struct em_perf_table *table) { kref_put(&table->kref, em_release_table_kref); } @@ -198,9 +186,9 @@ void em_table_free(struct em_perf_table __rcu *table) * has a user. * Returns allocated table or NULL. */ -struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd) +struct em_perf_table *em_table_alloc(struct em_perf_domain *pd) { - struct em_perf_table __rcu *table; + struct em_perf_table *table; int table_size; table_size = sizeof(struct em_perf_state) * pd->nr_perf_states; @@ -239,7 +227,7 @@ static void em_init_performance(struct device *dev, struct em_perf_domain *pd, } static int em_compute_costs(struct device *dev, struct em_perf_state *table, - struct em_data_callback *cb, int nr_states, + const struct em_data_callback *cb, int nr_states, unsigned long flags) { unsigned long prev_cost = ULONG_MAX; @@ -308,9 +296,9 @@ int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, * Return 0 on success or an error code on failure. */ int em_dev_update_perf_domain(struct device *dev, - struct em_perf_table __rcu *new_table) + struct em_perf_table *new_table) { - struct em_perf_table __rcu *old_table; + struct em_perf_table *old_table; struct em_perf_domain *pd; if (!dev) @@ -327,7 +315,8 @@ int em_dev_update_perf_domain(struct device *dev, kref_get(&new_table->kref); - old_table = pd->em_table; + old_table = rcu_dereference_protected(pd->em_table, + lockdep_is_held(&em_pd_mutex)); rcu_assign_pointer(pd->em_table, new_table); em_cpufreq_update_efficiencies(dev, new_table->state); @@ -341,7 +330,7 @@ EXPORT_SYMBOL_GPL(em_dev_update_perf_domain); static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, struct em_perf_state *table, - struct em_data_callback *cb, + const struct em_data_callback *cb, unsigned long flags) { unsigned long power, freq, prev_freq = 0; @@ -396,10 +385,11 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, } static int em_create_pd(struct device *dev, int nr_states, - struct em_data_callback *cb, cpumask_t *cpus, + const struct em_data_callback *cb, + const cpumask_t *cpus, unsigned long flags) { - struct em_perf_table __rcu *em_table; + struct em_perf_table *em_table; struct em_perf_domain *pd; struct device *cpu_dev; int cpu, ret, num_cpus; @@ -556,9 +546,10 @@ EXPORT_SYMBOL_GPL(em_cpu_get); * Return 0 on success */ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, - struct em_data_callback *cb, cpumask_t *cpus, - bool microwatts) + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts) { + struct em_perf_table *em_table; unsigned long cap, prev_cap = 0; unsigned long flags = 0; int cpu, ret; @@ -631,7 +622,9 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, dev->em_pd->min_perf_state = 0; dev->em_pd->max_perf_state = nr_states - 1; - em_cpufreq_update_efficiencies(dev, dev->em_pd->em_table->state); + em_table = rcu_dereference_protected(dev->em_pd->em_table, + lockdep_is_held(&em_pd_mutex)); + em_cpufreq_update_efficiencies(dev, em_table->state); em_debug_create_pd(dev); dev_info(dev, "EM: created perf domain\n"); @@ -668,7 +661,8 @@ void em_dev_unregister_perf_domain(struct device *dev) mutex_lock(&em_pd_mutex); em_debug_remove_pd(dev); - em_table_free(dev->em_pd->em_table); + em_table_free(rcu_dereference_protected(dev->em_pd->em_table, + lockdep_is_held(&em_pd_mutex))); kfree(dev->em_pd); dev->em_pd = NULL; @@ -676,9 +670,9 @@ void em_dev_unregister_perf_domain(struct device *dev) } EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain); -static struct em_perf_table __rcu *em_table_dup(struct em_perf_domain *pd) +static struct em_perf_table *em_table_dup(struct em_perf_domain *pd) { - struct em_perf_table __rcu *em_table; + struct em_perf_table *em_table; struct em_perf_state *ps, *new_ps; int ps_size; @@ -700,7 +694,7 @@ static struct em_perf_table __rcu *em_table_dup(struct em_perf_domain *pd) } static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd, - struct em_perf_table __rcu *em_table) + struct em_perf_table *em_table) { int ret; @@ -728,10 +722,9 @@ free_em_table: * are correctly calculated. */ static void em_adjust_new_capacity(struct device *dev, - struct em_perf_domain *pd, - u64 max_cap) + struct em_perf_domain *pd) { - struct em_perf_table __rcu *em_table; + struct em_perf_table *em_table; em_table = em_table_dup(pd); if (!em_table) { @@ -775,7 +768,8 @@ static void em_check_capacity_update(void) } cpufreq_cpu_put(policy); - pd = em_cpu_get(cpu); + dev = get_cpu_device(cpu); + pd = em_pd_get(dev); if (!pd || em_is_artificial(pd)) continue; @@ -799,8 +793,7 @@ static void em_check_capacity_update(void) pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n", cpu, cpu_capacity, em_max_perf); - dev = get_cpu_device(cpu); - em_adjust_new_capacity(dev, pd, cpu_capacity); + em_adjust_new_capacity(dev, pd); } free_cpumask_var(cpu_done_mask); @@ -822,7 +815,7 @@ static void em_update_workfn(struct work_struct *work) */ int em_dev_update_chip_binning(struct device *dev) { - struct em_perf_table __rcu *em_table; + struct em_perf_table *em_table; struct em_perf_domain *pd; int i, ret; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 10a01af63a80..23c0f4e6cb2f 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -11,6 +11,7 @@ #define pr_fmt(fmt) "PM: hibernation: " fmt +#include <crypto/acompress.h> #include <linux/blkdev.h> #include <linux/export.h> #include <linux/suspend.h> @@ -411,7 +412,7 @@ int hibernation_snapshot(int platform_mode) goto Thaw; } - suspend_console(); + console_suspend_all(); pm_restrict_gfp_mask(); error = dpm_suspend(PMSG_FREEZE); @@ -437,7 +438,7 @@ int hibernation_snapshot(int platform_mode) if (error || !in_suspend) pm_restore_gfp_mask(); - resume_console(); + console_resume_all(); dpm_complete(msg); Close: @@ -547,7 +548,7 @@ int hibernation_restore(int platform_mode) int error; pm_prepare_console(); - suspend_console(); + console_suspend_all(); pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { @@ -561,7 +562,7 @@ int hibernation_restore(int platform_mode) } dpm_resume_end(PMSG_RECOVER); pm_restore_gfp_mask(); - resume_console(); + console_resume_all(); pm_restore_console(); return error; } @@ -586,7 +587,7 @@ int hibernation_platform_enter(void) goto Close; entering_platform_hibernation = true; - suspend_console(); + console_suspend_all(); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -639,7 +640,7 @@ int hibernation_platform_enter(void) Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); - resume_console(); + console_resume_all(); Close: hibernation_ops->end(); @@ -757,7 +758,7 @@ int hibernate(void) */ if (!nocompress) { strscpy(hib_comp_algo, hibernate_compressor, sizeof(hib_comp_algo)); - if (crypto_has_comp(hib_comp_algo, 0, 0) != 1) { + if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) { pr_err("%s compression is not available\n", hib_comp_algo); return -EOPNOTSUPP; } @@ -901,7 +902,7 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data) if (error) goto dpm_complete; - suspend_console(); + console_suspend_all(); error = dpm_suspend(PMSG_FREEZE); if (error) @@ -925,7 +926,7 @@ skip: dpm_resume: dpm_resume(PMSG_THAW); - resume_console(); + console_resume_all(); dpm_complete: dpm_complete(PMSG_THAW); @@ -1008,7 +1009,7 @@ static int software_resume(void) strscpy(hib_comp_algo, COMPRESSION_ALGO_LZ4, sizeof(hib_comp_algo)); else strscpy(hib_comp_algo, COMPRESSION_ALGO_LZO, sizeof(hib_comp_algo)); - if (crypto_has_comp(hib_comp_algo, 0, 0) != 1) { + if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) { pr_err("%s compression is not available\n", hib_comp_algo); error = -EOPNOTSUPP; goto Unlock; @@ -1446,10 +1447,10 @@ static const char * const comp_alg_enabled[] = { static int hibernate_compressor_param_set(const char *compressor, const struct kernel_param *kp) { - unsigned int sleep_flags; int index, ret; - sleep_flags = lock_system_sleep(); + if (!mutex_trylock(&system_transition_mutex)) + return -EBUSY; index = sysfs_match_string(comp_alg_enabled, compressor); if (index >= 0) { @@ -1461,7 +1462,7 @@ static int hibernate_compressor_param_set(const char *compressor, ret = index; } - unlock_system_sleep(sleep_flags); + mutex_unlock(&system_transition_mutex); if (ret) pr_debug("Cannot set specified compressor %s\n", diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index c9fb559a6399..4e6e24e8b854 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -2270,9 +2270,9 @@ int snapshot_read_next(struct snapshot_handle *handle) */ void *kaddr; - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); copy_page(buffer, kaddr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); handle->buffer = buffer; } else { handle->buffer = page_address(page); @@ -2561,9 +2561,9 @@ static void copy_last_highmem_page(void) if (last_highmem_page) { void *dst; - dst = kmap_atomic(last_highmem_page); + dst = kmap_local_page(last_highmem_page); copy_page(dst, buffer); - kunmap_atomic(dst); + kunmap_local(dst); last_highmem_page = NULL; } } @@ -2881,13 +2881,13 @@ static inline void swap_two_pages_data(struct page *p1, struct page *p2, { void *kaddr1, *kaddr2; - kaddr1 = kmap_atomic(p1); - kaddr2 = kmap_atomic(p2); + kaddr1 = kmap_local_page(p1); + kaddr2 = kmap_local_page(p2); copy_page(buf, kaddr1); copy_page(kaddr1, kaddr2); copy_page(kaddr2, buf); - kunmap_atomic(kaddr2); - kunmap_atomic(kaddr1); + kunmap_local(kaddr2); + kunmap_local(kaddr1); } /** diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 09f8397bae15..8eaec4ab121d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -91,6 +91,16 @@ static void s2idle_enter(void) { trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true); + /* + * The correctness of the code below depends on the number of online + * CPUs being stable, but CPUs cannot be taken offline or put online + * while it is running. + * + * The s2idle_lock must be acquired before the pending wakeup check to + * prevent pm_system_wakeup() from running as a whole between that check + * and the subsequent s2idle_state update in which case a wakeup event + * would get lost. + */ raw_spin_lock_irq(&s2idle_lock); if (pm_wakeup_pending()) goto out; @@ -98,8 +108,6 @@ static void s2idle_enter(void) s2idle_state = S2IDLE_STATE_ENTER; raw_spin_unlock_irq(&s2idle_lock); - cpus_read_lock(); - /* Push all the CPUs into the idle loop. */ wake_up_all_idle_cpus(); /* Make the current CPU wait so it can enter the idle loop too. */ @@ -112,8 +120,6 @@ static void s2idle_enter(void) */ wake_up_all_idle_cpus(); - cpus_read_unlock(); - raw_spin_lock_irq(&s2idle_lock); out: @@ -502,7 +508,7 @@ int suspend_devices_and_enter(suspend_state_t state) if (error) goto Close; - suspend_console(); + console_suspend_all(); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { @@ -521,9 +527,9 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); - trace_suspend_resume(TPS("resume_console"), state, true); - resume_console(); - trace_suspend_resume(TPS("resume_console"), state, false); + trace_suspend_resume(TPS("console_resume_all"), state, true); + console_resume_all(); + trace_suspend_resume(TPS("console_resume_all"), state, false); Close: platform_resume_end(state); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 82b884b67152..80ff5f933a62 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) "PM: " fmt +#include <crypto/acompress.h> #include <linux/module.h> #include <linux/file.h> #include <linux/delay.h> @@ -635,7 +636,8 @@ static int crc32_threadfn(void *data) */ struct cmp_data { struct task_struct *thr; /* thread */ - struct crypto_comp *cc; /* crypto compressor stream */ + struct crypto_acomp *cc; /* crypto compressor */ + struct acomp_req *cr; /* crypto request */ atomic_t ready; /* ready to start flag */ atomic_t stop; /* ready to stop flag */ int ret; /* return code */ @@ -656,7 +658,6 @@ static atomic_t compressed_size = ATOMIC_INIT(0); static int compress_threadfn(void *data) { struct cmp_data *d = data; - unsigned int cmp_len = 0; while (1) { wait_event(d->go, atomic_read_acquire(&d->ready) || @@ -670,11 +671,13 @@ static int compress_threadfn(void *data) } atomic_set(&d->ready, 0); - cmp_len = CMP_SIZE - CMP_HEADER; - d->ret = crypto_comp_compress(d->cc, d->unc, d->unc_len, - d->cmp + CMP_HEADER, - &cmp_len); - d->cmp_len = cmp_len; + acomp_request_set_callback(d->cr, CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); + acomp_request_set_src_nondma(d->cr, d->unc, d->unc_len); + acomp_request_set_dst_nondma(d->cr, d->cmp + CMP_HEADER, + CMP_SIZE - CMP_HEADER); + d->ret = crypto_acomp_compress(d->cr); + d->cmp_len = d->cr->dlen; atomic_set(&compressed_size, atomic_read(&compressed_size) + d->cmp_len); atomic_set_release(&d->stop, 1); @@ -745,13 +748,20 @@ static int save_compressed_image(struct swap_map_handle *handle, init_waitqueue_head(&data[thr].go); init_waitqueue_head(&data[thr].done); - data[thr].cc = crypto_alloc_comp(hib_comp_algo, 0, 0); + data[thr].cc = crypto_alloc_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC); if (IS_ERR_OR_NULL(data[thr].cc)) { pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc)); ret = -EFAULT; goto out_clean; } + data[thr].cr = acomp_request_alloc(data[thr].cc); + if (!data[thr].cr) { + pr_err("Could not allocate comp request\n"); + ret = -ENOMEM; + goto out_clean; + } + data[thr].thr = kthread_run(compress_threadfn, &data[thr], "image_compress/%u", thr); @@ -899,8 +909,8 @@ out_clean: for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) kthread_stop(data[thr].thr); - if (data[thr].cc) - crypto_free_comp(data[thr].cc); + acomp_request_free(data[thr].cr); + crypto_free_acomp(data[thr].cc); } vfree(data); } @@ -1142,7 +1152,8 @@ static int load_image(struct swap_map_handle *handle, */ struct dec_data { struct task_struct *thr; /* thread */ - struct crypto_comp *cc; /* crypto compressor stream */ + struct crypto_acomp *cc; /* crypto compressor */ + struct acomp_req *cr; /* crypto request */ atomic_t ready; /* ready to start flag */ atomic_t stop; /* ready to stop flag */ int ret; /* return code */ @@ -1160,7 +1171,6 @@ struct dec_data { static int decompress_threadfn(void *data) { struct dec_data *d = data; - unsigned int unc_len = 0; while (1) { wait_event(d->go, atomic_read_acquire(&d->ready) || @@ -1174,10 +1184,13 @@ static int decompress_threadfn(void *data) } atomic_set(&d->ready, 0); - unc_len = UNC_SIZE; - d->ret = crypto_comp_decompress(d->cc, d->cmp + CMP_HEADER, d->cmp_len, - d->unc, &unc_len); - d->unc_len = unc_len; + acomp_request_set_callback(d->cr, CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); + acomp_request_set_src_nondma(d->cr, d->cmp + CMP_HEADER, + d->cmp_len); + acomp_request_set_dst_nondma(d->cr, d->unc, UNC_SIZE); + d->ret = crypto_acomp_decompress(d->cr); + d->unc_len = d->cr->dlen; if (clean_pages_on_decompress) flush_icache_range((unsigned long)d->unc, @@ -1254,13 +1267,20 @@ static int load_compressed_image(struct swap_map_handle *handle, init_waitqueue_head(&data[thr].go); init_waitqueue_head(&data[thr].done); - data[thr].cc = crypto_alloc_comp(hib_comp_algo, 0, 0); + data[thr].cc = crypto_alloc_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC); if (IS_ERR_OR_NULL(data[thr].cc)) { pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc)); ret = -EFAULT; goto out_clean; } + data[thr].cr = acomp_request_alloc(data[thr].cc); + if (!data[thr].cr) { + pr_err("Could not allocate comp request\n"); + ret = -ENOMEM; + goto out_clean; + } + data[thr].thr = kthread_run(decompress_threadfn, &data[thr], "image_decompress/%u", thr); @@ -1507,8 +1527,8 @@ out_clean: for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) kthread_stop(data[thr].thr); - if (data[thr].cc) - crypto_free_comp(data[thr].cc); + acomp_request_free(data[thr].cr); + crypto_free_acomp(data[thr].cc); } vfree(data); } diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index a91bdf802967..48a24e7b309d 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -64,6 +64,7 @@ struct dev_printk_info; extern struct printk_ringbuffer *prb; extern bool printk_kthreads_running; +extern bool debug_non_panic_cpus; __printf(4, 0) int vprintk_store(int facility, int level, diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 057db78876cd..1eea80d0648e 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2375,6 +2375,22 @@ void printk_legacy_allow_panic_sync(void) } } +bool __read_mostly debug_non_panic_cpus; + +#ifdef CONFIG_PRINTK_CALLER +static int __init debug_non_panic_cpus_setup(char *str) +{ + debug_non_panic_cpus = true; + pr_info("allow messages from non-panic CPUs in panic()\n"); + + return 0; +} +early_param("debug_non_panic_cpus", debug_non_panic_cpus_setup); +module_param(debug_non_panic_cpus, bool, 0644); +MODULE_PARM_DESC(debug_non_panic_cpus, + "allow messages from non-panic CPUs in panic()"); +#endif + asmlinkage int vprintk_emit(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) @@ -2391,7 +2407,9 @@ asmlinkage int vprintk_emit(int facility, int level, * non-panic CPUs are generating any messages, they will be * silently dropped. */ - if (other_cpu_in_panic() && !panic_triggering_all_cpu_backtrace) + if (other_cpu_in_panic() && + !debug_non_panic_cpus && + !panic_triggering_all_cpu_backtrace) return 0; printk_get_console_flush_type(&ft); @@ -2731,11 +2749,11 @@ module_param_named(console_no_auto_verbose, printk_console_no_auto_verbose, bool MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to highest on oops/panic/etc"); /** - * suspend_console - suspend the console subsystem + * console_suspend_all - suspend the console subsystem * * This disables printk() while we go into suspend states */ -void suspend_console(void) +void console_suspend_all(void) { struct console *con; @@ -2758,7 +2776,7 @@ void suspend_console(void) synchronize_srcu(&console_srcu); } -void resume_console(void) +void console_resume_all(void) { struct console_flush_type ft; struct console *con; @@ -3340,7 +3358,12 @@ void console_unblank(void) */ cookie = console_srcu_read_lock(); for_each_console_srcu(c) { - if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) { + short flags = console_srcu_read_flags(c); + + if (flags & CON_SUSPENDED) + continue; + + if ((flags & CON_ENABLED) && c->unblank) { found_unblank = true; break; } @@ -3377,7 +3400,12 @@ void console_unblank(void) cookie = console_srcu_read_lock(); for_each_console_srcu(c) { - if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) + short flags = console_srcu_read_flags(c); + + if (flags & CON_SUSPENDED) + continue; + + if ((flags & CON_ENABLED) && c->unblank) c->unblank(); } console_srcu_read_unlock(cookie); @@ -3495,10 +3523,10 @@ struct tty_driver *console_device(int *index) /* * Prevent further output on the passed console device so that (for example) - * serial drivers can disable console output before suspending a port, and can + * serial drivers can suspend console output before suspending a port, and can * re-enable output afterwards. */ -void console_stop(struct console *console) +void console_suspend(struct console *console) { __pr_flush(console, 1000, true); console_list_lock(); @@ -3513,9 +3541,9 @@ void console_stop(struct console *console) */ synchronize_srcu(&console_srcu); } -EXPORT_SYMBOL(console_stop); +EXPORT_SYMBOL(console_suspend); -void console_start(struct console *console) +void console_resume(struct console *console) { struct console_flush_type ft; bool is_nbcon; @@ -3540,7 +3568,7 @@ void console_start(struct console *console) __pr_flush(console, 1000, true); } -EXPORT_SYMBOL(console_start); +EXPORT_SYMBOL(console_resume); #ifdef CONFIG_PRINTK static int unregister_console_locked(struct console *console); @@ -4275,6 +4303,11 @@ void __init console_init(void) initcall_t call; initcall_entry_t *ce; +#ifdef CONFIG_NULL_TTY_DEFAULT_CONSOLE + if (!console_set_on_cmdline) + add_preferred_console("ttynull", 0, NULL); +#endif + /* Setup the default TTY line discipline. */ n_tty_init(); diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 88e8f3a61922..d9fb053cff67 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -2133,9 +2133,9 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, * there may be other finalized records beyond that * need to be printed for a panic situation. If this * is the panic CPU, skip this - * non-existent/non-finalized record unless it is - * at or beyond the head, in which case it is not - * possible to continue. + * non-existent/non-finalized record unless non-panic + * CPUs are still running and their debugging is + * explicitly enabled. * * Note that new messages printed on panic CPU are * finalized when we are here. The only exception @@ -2143,10 +2143,13 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, * But it would have the sequence number returned * by "prb_next_reserve_seq() - 1". */ - if (this_cpu_in_panic() && ((*seq + 1) < prb_next_reserve_seq(rb))) + if (this_cpu_in_panic() && + (!debug_non_panic_cpus || legacy_allow_panic_sync) && + ((*seq + 1) < prb_next_reserve_seq(rb))) { (*seq)++; - else + } else { return false; + } } } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 87540217fc09..cfaca3040b2f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -488,6 +488,16 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } #endif /* CONFIG_SCHED_CORE */ +/* need a wrapper since we may need to trace from modules */ +EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp); + +/* Call via the helper macro trace_set_current_state. */ +void __trace_set_current_state(int state_value) +{ + trace_sched_set_state_tp(current, state_value); +} +EXPORT_SYMBOL(__trace_set_current_state); + /* * Serialization rules: * @@ -5295,6 +5305,12 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) */ finish_task_switch(prev); + /* + * This is a special case: the newly created task has just + * switched the context for the first time. It is returning from + * schedule for the first time in this path. + */ + trace_sched_exit_tp(true, CALLER_ADDR0); preempt_enable(); if (current->set_child_tid) @@ -6634,12 +6650,15 @@ static void __sched notrace __schedule(int sched_mode) * as a preemption by schedule_debug() and RCU. */ bool preempt = sched_mode > SM_NONE; + bool is_switch = false; unsigned long *switch_count; unsigned long prev_state; struct rq_flags rf; struct rq *rq; int cpu; + trace_sched_entry_tp(preempt, CALLER_ADDR0); + cpu = smp_processor_id(); rq = cpu_rq(cpu); prev = rq->curr; @@ -6705,7 +6724,8 @@ picked: clear_preempt_need_resched(); rq->last_seen_need_resched_ns = 0; - if (likely(prev != next)) { + is_switch = prev != next; + if (likely(is_switch)) { rq->nr_switches++; /* * RCU users of rcu_dereference(rq->curr) may not see @@ -6750,6 +6770,7 @@ picked: __balance_callbacks(rq); raw_spin_rq_unlock_irq(rq); } + trace_sched_exit_tp(is_switch, CALLER_ADDR0); } void __noreturn do_task_dead(void) diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c index bb7d066a7c39..a297790b7333 100644 --- a/kernel/static_call_inline.c +++ b/kernel/static_call_inline.c @@ -206,7 +206,7 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func) continue; } - arch_static_call_transform(site_addr, NULL, func, + arch_static_call_transform(site_addr, tramp, func, static_call_is_tail(site)); } } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4ebe6136b08d..3b7a7308e35b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -20,9 +20,6 @@ */ #include <linux/module.h> -#include <linux/mm.h> -#include <linux/swap.h> -#include <linux/slab.h> #include <linux/sysctl.h> #include <linux/bitmap.h> #include <linux/signal.h> @@ -31,7 +28,6 @@ #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/ctype.h> -#include <linux/kmemleak.h> #include <linux/filter.h> #include <linux/fs.h> #include <linux/init.h> @@ -42,25 +38,20 @@ #include <linux/highuid.h> #include <linux/writeback.h> #include <linux/ratelimit.h> -#include <linux/hugetlb.h> #include <linux/initrd.h> #include <linux/key.h> #include <linux/times.h> #include <linux/limits.h> -#include <linux/dcache.h> #include <linux/syscalls.h> -#include <linux/vmstat.h> #include <linux/nfs_fs.h> #include <linux/acpi.h> #include <linux/reboot.h> #include <linux/ftrace.h> -#include <linux/oom.h> #include <linux/kmod.h> #include <linux/capability.h> #include <linux/binfmts.h> #include <linux/sched/sysctl.h> #include <linux/mount.h> -#include <linux/userfaultfd_k.h> #include <linux/pid.h> #include "../lib/kstrtox.h" @@ -122,12 +113,6 @@ enum sysctl_writes_mode { static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; #endif /* CONFIG_PROC_SYSCTL */ - -#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ - defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) -int sysctl_legacy_va_layout; -#endif - #endif /* CONFIG_SYSCTL */ /* @@ -1787,15 +1772,6 @@ static const struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_MAXOLDUID, }, -#ifdef CONFIG_S390 - { - .procname = "userprocess_debug", - .data = &show_unhandled_signals, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif { .procname = "panic_on_oops", .data = &panic_on_oops, @@ -1843,15 +1819,6 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#if defined(CONFIG_S390) && defined(CONFIG_SMP) - { - .procname = "spin_retry", - .data = &spin_retry, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN { .procname = "ignore-unaligned-usertrap", @@ -1901,215 +1868,9 @@ static const struct ctl_table kern_table[] = { #endif }; -static const struct ctl_table vm_table[] = { - { - .procname = "overcommit_memory", - .data = &sysctl_overcommit_memory, - .maxlen = sizeof(sysctl_overcommit_memory), - .mode = 0644, - .proc_handler = overcommit_policy_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, - { - .procname = "overcommit_ratio", - .data = &sysctl_overcommit_ratio, - .maxlen = sizeof(sysctl_overcommit_ratio), - .mode = 0644, - .proc_handler = overcommit_ratio_handler, - }, - { - .procname = "overcommit_kbytes", - .data = &sysctl_overcommit_kbytes, - .maxlen = sizeof(sysctl_overcommit_kbytes), - .mode = 0644, - .proc_handler = overcommit_kbytes_handler, - }, - { - .procname = "page-cluster", - .data = &page_cluster, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&page_cluster_max, - }, - { - .procname = "dirtytime_expire_seconds", - .data = &dirtytime_expire_interval, - .maxlen = sizeof(dirtytime_expire_interval), - .mode = 0644, - .proc_handler = dirtytime_interval_handler, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "swappiness", - .data = &vm_swappiness, - .maxlen = sizeof(vm_swappiness), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO_HUNDRED, - }, -#ifdef CONFIG_NUMA - { - .procname = "numa_stat", - .data = &sysctl_vm_numa_stat, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_vm_numa_stat_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif - { - .procname = "drop_caches", - .data = &sysctl_drop_caches, - .maxlen = sizeof(int), - .mode = 0200, - .proc_handler = drop_caches_sysctl_handler, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_FOUR, - }, - { - .procname = "page_lock_unfairness", - .data = &sysctl_page_lock_unfairness, - .maxlen = sizeof(sysctl_page_lock_unfairness), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#ifdef CONFIG_MMU - { - .procname = "max_map_count", - .data = &sysctl_max_map_count, - .maxlen = sizeof(sysctl_max_map_count), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#else - { - .procname = "nr_trim_pages", - .data = &sysctl_nr_trim_pages, - .maxlen = sizeof(sysctl_nr_trim_pages), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#endif - { - .procname = "vfs_cache_pressure", - .data = &sysctl_vfs_cache_pressure, - .maxlen = sizeof(sysctl_vfs_cache_pressure), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ - defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) - { - .procname = "legacy_va_layout", - .data = &sysctl_legacy_va_layout, - .maxlen = sizeof(sysctl_legacy_va_layout), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#endif -#ifdef CONFIG_NUMA - { - .procname = "zone_reclaim_mode", - .data = &node_reclaim_mode, - .maxlen = sizeof(node_reclaim_mode), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#endif -#ifdef CONFIG_SMP - { - .procname = "stat_interval", - .data = &sysctl_stat_interval, - .maxlen = sizeof(sysctl_stat_interval), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "stat_refresh", - .data = NULL, - .maxlen = 0, - .mode = 0600, - .proc_handler = vmstat_refresh, - }, -#endif -#ifdef CONFIG_MMU - { - .procname = "mmap_min_addr", - .data = &dac_mmap_min_addr, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = mmap_min_addr_handler, - }, -#endif -#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ - (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) - { - .procname = "vdso_enabled", -#ifdef CONFIG_X86_32 - .data = &vdso32_enabled, - .maxlen = sizeof(vdso32_enabled), -#else - .data = &vdso_enabled, - .maxlen = sizeof(vdso_enabled), -#endif - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = SYSCTL_ZERO, - }, -#endif - { - .procname = "user_reserve_kbytes", - .data = &sysctl_user_reserve_kbytes, - .maxlen = sizeof(sysctl_user_reserve_kbytes), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "admin_reserve_kbytes", - .data = &sysctl_admin_reserve_kbytes, - .maxlen = sizeof(sysctl_admin_reserve_kbytes), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS - { - .procname = "mmap_rnd_bits", - .data = &mmap_rnd_bits, - .maxlen = sizeof(mmap_rnd_bits), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&mmap_rnd_bits_min, - .extra2 = (void *)&mmap_rnd_bits_max, - }, -#endif -#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS - { - .procname = "mmap_rnd_compat_bits", - .data = &mmap_rnd_compat_bits, - .maxlen = sizeof(mmap_rnd_compat_bits), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&mmap_rnd_compat_bits_min, - .extra2 = (void *)&mmap_rnd_compat_bits_max, - }, -#endif -}; - int __init sysctl_init_bases(void) { register_sysctl_init("kernel", kern_table); - register_sysctl_init("vm", vm_table); return 0; } diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 7f4e4fb7381e..101a0f7c43e0 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -109,6 +109,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp) goto out; } pccontext->clk = clk; + pccontext->fp = fp; if (clk->ops.open) { err = clk->ops.open(pccontext, fp->f_mode); if (err) { @@ -229,7 +230,7 @@ static int pc_clock_adjtime(clockid_t id, struct __kernel_timex *tx) if (err) return err; - if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + if (tx->modes && (cd.fp->f_mode & FMODE_WRITE) == 0) { err = -EACCES; goto out; } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d570b8b9c0a9..033fba0633cf 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -263,6 +263,18 @@ config FUNCTION_GRAPH_RETADDR the function is called. This feature is off by default, and you can enable it via the trace option funcgraph-retaddr. +config FUNCTION_TRACE_ARGS + bool + depends on HAVE_FUNCTION_ARG_ACCESS_API + depends on DEBUG_INFO_BTF + default y + help + If supported with function argument access API and BTF, then + the function tracer and function graph tracer will support printing + of function arguments. This feature is off by default, and can be + enabled via the trace option func-args (for the function tracer) and + funcgraph-args (for the function graph tracer) + config DYNAMIC_FTRACE bool "enable/disable function tracing dynamically" depends on FUNCTION_TRACER diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 5dddfc2149f6..8d925cbdce3a 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -865,7 +865,7 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe } /* - * After all architecures have selected HAVE_FUNCTION_GRAPH_FREGS, we can + * After all architectures have selected HAVE_FUNCTION_GRAPH_FREGS, we can * leave only ftrace_return_to_handler(fregs). */ #ifdef CONFIG_HAVE_FUNCTION_GRAPH_FREGS diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fc88e0688daf..92015de6203d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1293,6 +1293,8 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) void ftrace_free_filter(struct ftrace_ops *ops) { ftrace_ops_init(ops); + if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) + return; free_ftrace_hash(ops->func_hash->filter_hash); free_ftrace_hash(ops->func_hash->notrace_hash); } @@ -7016,6 +7018,7 @@ static int ftrace_process_locs(struct module *mod, unsigned long *p; unsigned long addr; unsigned long flags = 0; /* Shut up gcc */ + unsigned long pages; int ret = -ENOMEM; count = end - start; @@ -7023,6 +7026,8 @@ static int ftrace_process_locs(struct module *mod, if (!count) return 0; + pages = DIV_ROUND_UP(count, ENTRIES_PER_PAGE); + /* * Sorting mcount in vmlinux at build time depend on * CONFIG_BUILDTIME_MCOUNT_SORT, while mcount loc in @@ -7067,7 +7072,9 @@ static int ftrace_process_locs(struct module *mod, pg = start_pg; while (p < end) { unsigned long end_offset; - addr = ftrace_call_adjust(*p++); + + addr = *p++; + /* * Some architecture linkers will pad between * the different mcount_loc sections of different @@ -7079,6 +7086,19 @@ static int ftrace_process_locs(struct module *mod, continue; } + /* + * If this is core kernel, make sure the address is in core + * or inittext, as weak functions get zeroed and KASLR can + * move them to something other than zero. It just will not + * move it to an area where kernel text is. + */ + if (!mod && !(is_kernel_text(addr) || is_kernel_inittext(addr))) { + skipped++; + continue; + } + + addr = ftrace_call_adjust(addr); + end_offset = (pg->index+1) * sizeof(pg->records[0]); if (end_offset > PAGE_SIZE << pg->order) { /* We should have allocated enough */ @@ -7118,11 +7138,41 @@ static int ftrace_process_locs(struct module *mod, /* We should have used all pages unless we skipped some */ if (pg_unuse) { - WARN_ON(!skipped); + unsigned long pg_remaining, remaining = 0; + unsigned long skip; + + /* Count the number of entries unused and compare it to skipped. */ + pg_remaining = (ENTRIES_PER_PAGE << pg->order) - pg->index; + + if (!WARN(skipped < pg_remaining, "Extra allocated pages for ftrace")) { + + skip = skipped - pg_remaining; + + for (pg = pg_unuse; pg; pg = pg->next) + remaining += 1 << pg->order; + + pages -= remaining; + + skip = DIV_ROUND_UP(skip, ENTRIES_PER_PAGE); + + /* + * Check to see if the number of pages remaining would + * just fit the number of entries skipped. + */ + WARN(skip != remaining, "Extra allocated pages for ftrace: %lu with %lu skipped", + remaining, skipped); + } /* Need to synchronize with ftrace_location_range() */ synchronize_rcu(); ftrace_free_pages(pg_unuse); } + + if (!mod) { + count -= skipped; + pr_info("ftrace: allocating %ld entries in %ld pages\n", + count, pages); + } + return ret; } @@ -7768,9 +7818,6 @@ void __init ftrace_init(void) goto failed; } - pr_info("ftrace: allocating %ld entries in %ld pages\n", - count, DIV_ROUND_UP(count, ENTRIES_PER_PAGE)); - ret = ftrace_process_locs(NULL, __start_mcount_loc, __stop_mcount_loc); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bb6089c2951e..9d4d951090d3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5318,7 +5318,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) * moving it. The page before the header page has the * flag bit '1' set if it is pointing to the page we want. * but if the writer is in the process of moving it - * than it will be '2' or already moved '0'. + * then it will be '2' or already moved '0'. */ ret = rb_head_page_replace(reader, cpu_buffer->reader_page); diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 8226352a0062..b39f36013ef2 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -27,6 +27,13 @@ menuconfig RV source "kernel/trace/rv/monitors/wip/Kconfig" source "kernel/trace/rv/monitors/wwnr/Kconfig" +source "kernel/trace/rv/monitors/sched/Kconfig" +source "kernel/trace/rv/monitors/tss/Kconfig" +source "kernel/trace/rv/monitors/sco/Kconfig" +source "kernel/trace/rv/monitors/snroc/Kconfig" +source "kernel/trace/rv/monitors/scpd/Kconfig" +source "kernel/trace/rv/monitors/snep/Kconfig" +source "kernel/trace/rv/monitors/sncid/Kconfig" # Add new monitors here config RV_REACTORS diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index 188b64668e1f..f9b2cd0483c3 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -5,6 +5,13 @@ ccflags-y += -I $(src) # needed for trace events obj-$(CONFIG_RV) += rv.o obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o +obj-$(CONFIG_RV_MON_SCHED) += monitors/sched/sched.o +obj-$(CONFIG_RV_MON_TSS) += monitors/tss/tss.o +obj-$(CONFIG_RV_MON_SCO) += monitors/sco/sco.o +obj-$(CONFIG_RV_MON_SNROC) += monitors/snroc/snroc.o +obj-$(CONFIG_RV_MON_SCPD) += monitors/scpd/scpd.o +obj-$(CONFIG_RV_MON_SNEP) += monitors/snep/snep.o +obj-$(CONFIG_RV_MON_SNCID) += monitors/sncid/sncid.o # Add new monitors here obj-$(CONFIG_RV_REACTORS) += rv_reactors.o obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o diff --git a/kernel/trace/rv/monitors/sched/Kconfig b/kernel/trace/rv/monitors/sched/Kconfig new file mode 100644 index 000000000000..ae3eb410abd7 --- /dev/null +++ b/kernel/trace/rv/monitors/sched/Kconfig @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SCHED + depends on RV + bool "sched monitor" + help + Collection of monitors to check the scheduler behaves according to specifications. + Enable this to enable all scheduler specification supported by the current kernel. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/sched/sched.c b/kernel/trace/rv/monitors/sched/sched.c new file mode 100644 index 000000000000..905e03c3c934 --- /dev/null +++ b/kernel/trace/rv/monitors/sched/sched.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> + +#define MODULE_NAME "sched" + +#include "sched.h" + +struct rv_monitor rv_sched; + +struct rv_monitor rv_sched = { + .name = "sched", + .description = "container for several scheduler monitor specifications.", + .enable = NULL, + .disable = NULL, + .reset = NULL, + .enabled = 0, +}; + +static int __init register_sched(void) +{ + rv_register_monitor(&rv_sched, NULL); + return 0; +} + +static void __exit unregister_sched(void) +{ + rv_unregister_monitor(&rv_sched); +} + +module_init(register_sched); +module_exit(unregister_sched); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("sched: container for several scheduler monitor specifications."); diff --git a/kernel/trace/rv/monitors/sched/sched.h b/kernel/trace/rv/monitors/sched/sched.h new file mode 100644 index 000000000000..ba148dd8d48b --- /dev/null +++ b/kernel/trace/rv/monitors/sched/sched.h @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +extern struct rv_monitor rv_sched; diff --git a/kernel/trace/rv/monitors/sco/Kconfig b/kernel/trace/rv/monitors/sco/Kconfig new file mode 100644 index 000000000000..097c96cccdd7 --- /dev/null +++ b/kernel/trace/rv/monitors/sco/Kconfig @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SCO + depends on RV + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "sco monitor" + help + Monitor to ensure sched_set_state happens only in thread context. + This monitor is part of the sched monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/sco/sco.c b/kernel/trace/rv/monitors/sco/sco.c new file mode 100644 index 000000000000..4cff59220bfc --- /dev/null +++ b/kernel/trace/rv/monitors/sco/sco.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "sco" + +#include <trace/events/sched.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "sco.h" + +static struct rv_monitor rv_sco; +DECLARE_DA_MON_PER_CPU(sco, unsigned char); + +static void handle_sched_set_state(void *data, struct task_struct *tsk, int state) +{ + da_handle_start_event_sco(sched_set_state_sco); +} + +static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +{ + da_handle_event_sco(schedule_entry_sco); +} + +static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +{ + da_handle_start_event_sco(schedule_exit_sco); +} + +static int enable_sco(void) +{ + int retval; + + retval = da_monitor_init_sco(); + if (retval) + return retval; + + rv_attach_trace_probe("sco", sched_set_state_tp, handle_sched_set_state); + rv_attach_trace_probe("sco", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("sco", sched_exit_tp, handle_schedule_exit); + + return 0; +} + +static void disable_sco(void) +{ + rv_sco.enabled = 0; + + rv_detach_trace_probe("sco", sched_set_state_tp, handle_sched_set_state); + rv_detach_trace_probe("sco", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("sco", sched_exit_tp, handle_schedule_exit); + + da_monitor_destroy_sco(); +} + +static struct rv_monitor rv_sco = { + .name = "sco", + .description = "scheduling context operations.", + .enable = enable_sco, + .disable = disable_sco, + .reset = da_monitor_reset_all_sco, + .enabled = 0, +}; + +static int __init register_sco(void) +{ + rv_register_monitor(&rv_sco, &rv_sched); + return 0; +} + +static void __exit unregister_sco(void) +{ + rv_unregister_monitor(&rv_sco); +} + +module_init(register_sco); +module_exit(unregister_sco); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("sco: scheduling context operations."); diff --git a/kernel/trace/rv/monitors/sco/sco.h b/kernel/trace/rv/monitors/sco/sco.h new file mode 100644 index 000000000000..7a4c1f2d5ca1 --- /dev/null +++ b/kernel/trace/rv/monitors/sco/sco.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of sco automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_sco { + thread_context_sco = 0, + scheduling_context_sco, + state_max_sco +}; + +#define INVALID_STATE state_max_sco + +enum events_sco { + sched_set_state_sco = 0, + schedule_entry_sco, + schedule_exit_sco, + event_max_sco +}; + +struct automaton_sco { + char *state_names[state_max_sco]; + char *event_names[event_max_sco]; + unsigned char function[state_max_sco][event_max_sco]; + unsigned char initial_state; + bool final_states[state_max_sco]; +}; + +static const struct automaton_sco automaton_sco = { + .state_names = { + "thread_context", + "scheduling_context" + }, + .event_names = { + "sched_set_state", + "schedule_entry", + "schedule_exit" + }, + .function = { + { thread_context_sco, scheduling_context_sco, INVALID_STATE }, + { INVALID_STATE, INVALID_STATE, thread_context_sco }, + }, + .initial_state = thread_context_sco, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/sco/sco_trace.h b/kernel/trace/rv/monitors/sco/sco_trace.h new file mode 100644 index 000000000000..b711cd9024ec --- /dev/null +++ b/kernel/trace/rv/monitors/sco/sco_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SCO +DEFINE_EVENT(event_da_monitor, event_sco, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_sco, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_SCO */ diff --git a/kernel/trace/rv/monitors/scpd/Kconfig b/kernel/trace/rv/monitors/scpd/Kconfig new file mode 100644 index 000000000000..b9114fbf680f --- /dev/null +++ b/kernel/trace/rv/monitors/scpd/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SCPD + depends on RV + depends on PREEMPT_TRACER + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "scpd monitor" + help + Monitor to ensure schedule is called with preemption disabled. + This monitor is part of the sched monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/scpd/scpd.c b/kernel/trace/rv/monitors/scpd/scpd.c new file mode 100644 index 000000000000..cbdd6a5f8d7f --- /dev/null +++ b/kernel/trace/rv/monitors/scpd/scpd.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "scpd" + +#include <trace/events/sched.h> +#include <trace/events/preemptirq.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "scpd.h" + +static struct rv_monitor rv_scpd; +DECLARE_DA_MON_PER_CPU(scpd, unsigned char); + +static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_scpd(preempt_disable_scpd); +} + +static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_start_event_scpd(preempt_enable_scpd); +} + +static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +{ + da_handle_event_scpd(schedule_entry_scpd); +} + +static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +{ + da_handle_event_scpd(schedule_exit_scpd); +} + +static int enable_scpd(void) +{ + int retval; + + retval = da_monitor_init_scpd(); + if (retval) + return retval; + + rv_attach_trace_probe("scpd", preempt_disable, handle_preempt_disable); + rv_attach_trace_probe("scpd", preempt_enable, handle_preempt_enable); + rv_attach_trace_probe("scpd", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("scpd", sched_exit_tp, handle_schedule_exit); + + return 0; +} + +static void disable_scpd(void) +{ + rv_scpd.enabled = 0; + + rv_detach_trace_probe("scpd", preempt_disable, handle_preempt_disable); + rv_detach_trace_probe("scpd", preempt_enable, handle_preempt_enable); + rv_detach_trace_probe("scpd", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("scpd", sched_exit_tp, handle_schedule_exit); + + da_monitor_destroy_scpd(); +} + +static struct rv_monitor rv_scpd = { + .name = "scpd", + .description = "schedule called with preemption disabled.", + .enable = enable_scpd, + .disable = disable_scpd, + .reset = da_monitor_reset_all_scpd, + .enabled = 0, +}; + +static int __init register_scpd(void) +{ + rv_register_monitor(&rv_scpd, &rv_sched); + return 0; +} + +static void __exit unregister_scpd(void) +{ + rv_unregister_monitor(&rv_scpd); +} + +module_init(register_scpd); +module_exit(unregister_scpd); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("scpd: schedule called with preemption disabled."); diff --git a/kernel/trace/rv/monitors/scpd/scpd.h b/kernel/trace/rv/monitors/scpd/scpd.h new file mode 100644 index 000000000000..295f735a5811 --- /dev/null +++ b/kernel/trace/rv/monitors/scpd/scpd.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of scpd automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_scpd { + cant_sched_scpd = 0, + can_sched_scpd, + state_max_scpd +}; + +#define INVALID_STATE state_max_scpd + +enum events_scpd { + preempt_disable_scpd = 0, + preempt_enable_scpd, + schedule_entry_scpd, + schedule_exit_scpd, + event_max_scpd +}; + +struct automaton_scpd { + char *state_names[state_max_scpd]; + char *event_names[event_max_scpd]; + unsigned char function[state_max_scpd][event_max_scpd]; + unsigned char initial_state; + bool final_states[state_max_scpd]; +}; + +static const struct automaton_scpd automaton_scpd = { + .state_names = { + "cant_sched", + "can_sched" + }, + .event_names = { + "preempt_disable", + "preempt_enable", + "schedule_entry", + "schedule_exit" + }, + .function = { + { can_sched_scpd, INVALID_STATE, INVALID_STATE, INVALID_STATE }, + { INVALID_STATE, cant_sched_scpd, can_sched_scpd, can_sched_scpd }, + }, + .initial_state = cant_sched_scpd, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/scpd/scpd_trace.h b/kernel/trace/rv/monitors/scpd/scpd_trace.h new file mode 100644 index 000000000000..6b0f4aa4732e --- /dev/null +++ b/kernel/trace/rv/monitors/scpd/scpd_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SCPD +DEFINE_EVENT(event_da_monitor, event_scpd, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_scpd, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_SCPD */ diff --git a/kernel/trace/rv/monitors/sncid/Kconfig b/kernel/trace/rv/monitors/sncid/Kconfig new file mode 100644 index 000000000000..76bcfef4fd10 --- /dev/null +++ b/kernel/trace/rv/monitors/sncid/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SNCID + depends on RV + depends on IRQSOFF_TRACER + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "sncid monitor" + help + Monitor to ensure schedule is not called with interrupt disabled. + This monitor is part of the sched monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/sncid/sncid.c b/kernel/trace/rv/monitors/sncid/sncid.c new file mode 100644 index 000000000000..f5037cd6214c --- /dev/null +++ b/kernel/trace/rv/monitors/sncid/sncid.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "sncid" + +#include <trace/events/sched.h> +#include <trace/events/preemptirq.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "sncid.h" + +static struct rv_monitor rv_sncid; +DECLARE_DA_MON_PER_CPU(sncid, unsigned char); + +static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_sncid(irq_disable_sncid); +} + +static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_start_event_sncid(irq_enable_sncid); +} + +static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +{ + da_handle_start_event_sncid(schedule_entry_sncid); +} + +static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +{ + da_handle_start_event_sncid(schedule_exit_sncid); +} + +static int enable_sncid(void) +{ + int retval; + + retval = da_monitor_init_sncid(); + if (retval) + return retval; + + rv_attach_trace_probe("sncid", irq_disable, handle_irq_disable); + rv_attach_trace_probe("sncid", irq_enable, handle_irq_enable); + rv_attach_trace_probe("sncid", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("sncid", sched_exit_tp, handle_schedule_exit); + + return 0; +} + +static void disable_sncid(void) +{ + rv_sncid.enabled = 0; + + rv_detach_trace_probe("sncid", irq_disable, handle_irq_disable); + rv_detach_trace_probe("sncid", irq_enable, handle_irq_enable); + rv_detach_trace_probe("sncid", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("sncid", sched_exit_tp, handle_schedule_exit); + + da_monitor_destroy_sncid(); +} + +static struct rv_monitor rv_sncid = { + .name = "sncid", + .description = "schedule not called with interrupt disabled.", + .enable = enable_sncid, + .disable = disable_sncid, + .reset = da_monitor_reset_all_sncid, + .enabled = 0, +}; + +static int __init register_sncid(void) +{ + rv_register_monitor(&rv_sncid, &rv_sched); + return 0; +} + +static void __exit unregister_sncid(void) +{ + rv_unregister_monitor(&rv_sncid); +} + +module_init(register_sncid); +module_exit(unregister_sncid); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("sncid: schedule not called with interrupt disabled."); diff --git a/kernel/trace/rv/monitors/sncid/sncid.h b/kernel/trace/rv/monitors/sncid/sncid.h new file mode 100644 index 000000000000..21304725142b --- /dev/null +++ b/kernel/trace/rv/monitors/sncid/sncid.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of sncid automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_sncid { + can_sched_sncid = 0, + cant_sched_sncid, + state_max_sncid +}; + +#define INVALID_STATE state_max_sncid + +enum events_sncid { + irq_disable_sncid = 0, + irq_enable_sncid, + schedule_entry_sncid, + schedule_exit_sncid, + event_max_sncid +}; + +struct automaton_sncid { + char *state_names[state_max_sncid]; + char *event_names[event_max_sncid]; + unsigned char function[state_max_sncid][event_max_sncid]; + unsigned char initial_state; + bool final_states[state_max_sncid]; +}; + +static const struct automaton_sncid automaton_sncid = { + .state_names = { + "can_sched", + "cant_sched" + }, + .event_names = { + "irq_disable", + "irq_enable", + "schedule_entry", + "schedule_exit" + }, + .function = { + { cant_sched_sncid, INVALID_STATE, can_sched_sncid, can_sched_sncid }, + { INVALID_STATE, can_sched_sncid, INVALID_STATE, INVALID_STATE }, + }, + .initial_state = can_sched_sncid, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/sncid/sncid_trace.h b/kernel/trace/rv/monitors/sncid/sncid_trace.h new file mode 100644 index 000000000000..3ce42a57671d --- /dev/null +++ b/kernel/trace/rv/monitors/sncid/sncid_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SNCID +DEFINE_EVENT(event_da_monitor, event_sncid, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_sncid, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_SNCID */ diff --git a/kernel/trace/rv/monitors/snep/Kconfig b/kernel/trace/rv/monitors/snep/Kconfig new file mode 100644 index 000000000000..77527f971232 --- /dev/null +++ b/kernel/trace/rv/monitors/snep/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SNEP + depends on RV + depends on PREEMPT_TRACER + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "snep monitor" + help + Monitor to ensure schedule does not enable preempt. + This monitor is part of the sched monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/snep/snep.c b/kernel/trace/rv/monitors/snep/snep.c new file mode 100644 index 000000000000..0076ba6d7ea4 --- /dev/null +++ b/kernel/trace/rv/monitors/snep/snep.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "snep" + +#include <trace/events/sched.h> +#include <trace/events/preemptirq.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "snep.h" + +static struct rv_monitor rv_snep; +DECLARE_DA_MON_PER_CPU(snep, unsigned char); + +static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_start_event_snep(preempt_disable_snep); +} + +static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_start_event_snep(preempt_enable_snep); +} + +static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +{ + da_handle_event_snep(schedule_entry_snep); +} + +static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +{ + da_handle_start_event_snep(schedule_exit_snep); +} + +static int enable_snep(void) +{ + int retval; + + retval = da_monitor_init_snep(); + if (retval) + return retval; + + rv_attach_trace_probe("snep", preempt_disable, handle_preempt_disable); + rv_attach_trace_probe("snep", preempt_enable, handle_preempt_enable); + rv_attach_trace_probe("snep", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("snep", sched_exit_tp, handle_schedule_exit); + + return 0; +} + +static void disable_snep(void) +{ + rv_snep.enabled = 0; + + rv_detach_trace_probe("snep", preempt_disable, handle_preempt_disable); + rv_detach_trace_probe("snep", preempt_enable, handle_preempt_enable); + rv_detach_trace_probe("snep", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("snep", sched_exit_tp, handle_schedule_exit); + + da_monitor_destroy_snep(); +} + +static struct rv_monitor rv_snep = { + .name = "snep", + .description = "schedule does not enable preempt.", + .enable = enable_snep, + .disable = disable_snep, + .reset = da_monitor_reset_all_snep, + .enabled = 0, +}; + +static int __init register_snep(void) +{ + rv_register_monitor(&rv_snep, &rv_sched); + return 0; +} + +static void __exit unregister_snep(void) +{ + rv_unregister_monitor(&rv_snep); +} + +module_init(register_snep); +module_exit(unregister_snep); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("snep: schedule does not enable preempt."); diff --git a/kernel/trace/rv/monitors/snep/snep.h b/kernel/trace/rv/monitors/snep/snep.h new file mode 100644 index 000000000000..6d16b9ad931e --- /dev/null +++ b/kernel/trace/rv/monitors/snep/snep.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of snep automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_snep { + non_scheduling_context_snep = 0, + scheduling_contex_snep, + state_max_snep +}; + +#define INVALID_STATE state_max_snep + +enum events_snep { + preempt_disable_snep = 0, + preempt_enable_snep, + schedule_entry_snep, + schedule_exit_snep, + event_max_snep +}; + +struct automaton_snep { + char *state_names[state_max_snep]; + char *event_names[event_max_snep]; + unsigned char function[state_max_snep][event_max_snep]; + unsigned char initial_state; + bool final_states[state_max_snep]; +}; + +static const struct automaton_snep automaton_snep = { + .state_names = { + "non_scheduling_context", + "scheduling_contex" + }, + .event_names = { + "preempt_disable", + "preempt_enable", + "schedule_entry", + "schedule_exit" + }, + .function = { + { non_scheduling_context_snep, non_scheduling_context_snep, scheduling_contex_snep, INVALID_STATE }, + { INVALID_STATE, INVALID_STATE, INVALID_STATE, non_scheduling_context_snep }, + }, + .initial_state = non_scheduling_context_snep, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/snep/snep_trace.h b/kernel/trace/rv/monitors/snep/snep_trace.h new file mode 100644 index 000000000000..01aad49a949a --- /dev/null +++ b/kernel/trace/rv/monitors/snep/snep_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SNEP +DEFINE_EVENT(event_da_monitor, event_snep, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_snep, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_SNEP */ diff --git a/kernel/trace/rv/monitors/snroc/Kconfig b/kernel/trace/rv/monitors/snroc/Kconfig new file mode 100644 index 000000000000..6e4365a2fea3 --- /dev/null +++ b/kernel/trace/rv/monitors/snroc/Kconfig @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SNROC + depends on RV + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_ID + bool "snroc monitor" + help + Monitor to ensure sched_set_state happens only in the respective task's context. + This monitor is part of the sched monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/snroc/snroc.c b/kernel/trace/rv/monitors/snroc/snroc.c new file mode 100644 index 000000000000..bb1f60d55296 --- /dev/null +++ b/kernel/trace/rv/monitors/snroc/snroc.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "snroc" + +#include <trace/events/sched.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "snroc.h" + +static struct rv_monitor rv_snroc; +DECLARE_DA_MON_PER_TASK(snroc, unsigned char); + +static void handle_sched_set_state(void *data, struct task_struct *tsk, int state) +{ + da_handle_event_snroc(tsk, sched_set_state_snroc); +} + +static void handle_sched_switch(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + da_handle_start_event_snroc(prev, sched_switch_out_snroc); + da_handle_event_snroc(next, sched_switch_in_snroc); +} + +static int enable_snroc(void) +{ + int retval; + + retval = da_monitor_init_snroc(); + if (retval) + return retval; + + rv_attach_trace_probe("snroc", sched_set_state_tp, handle_sched_set_state); + rv_attach_trace_probe("snroc", sched_switch, handle_sched_switch); + + return 0; +} + +static void disable_snroc(void) +{ + rv_snroc.enabled = 0; + + rv_detach_trace_probe("snroc", sched_set_state_tp, handle_sched_set_state); + rv_detach_trace_probe("snroc", sched_switch, handle_sched_switch); + + da_monitor_destroy_snroc(); +} + +static struct rv_monitor rv_snroc = { + .name = "snroc", + .description = "set non runnable on its own context.", + .enable = enable_snroc, + .disable = disable_snroc, + .reset = da_monitor_reset_all_snroc, + .enabled = 0, +}; + +static int __init register_snroc(void) +{ + rv_register_monitor(&rv_snroc, &rv_sched); + return 0; +} + +static void __exit unregister_snroc(void) +{ + rv_unregister_monitor(&rv_snroc); +} + +module_init(register_snroc); +module_exit(unregister_snroc); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("snroc: set non runnable on its own context."); diff --git a/kernel/trace/rv/monitors/snroc/snroc.h b/kernel/trace/rv/monitors/snroc/snroc.h new file mode 100644 index 000000000000..c3650a2b1b10 --- /dev/null +++ b/kernel/trace/rv/monitors/snroc/snroc.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of snroc automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_snroc { + other_context_snroc = 0, + own_context_snroc, + state_max_snroc +}; + +#define INVALID_STATE state_max_snroc + +enum events_snroc { + sched_set_state_snroc = 0, + sched_switch_in_snroc, + sched_switch_out_snroc, + event_max_snroc +}; + +struct automaton_snroc { + char *state_names[state_max_snroc]; + char *event_names[event_max_snroc]; + unsigned char function[state_max_snroc][event_max_snroc]; + unsigned char initial_state; + bool final_states[state_max_snroc]; +}; + +static const struct automaton_snroc automaton_snroc = { + .state_names = { + "other_context", + "own_context" + }, + .event_names = { + "sched_set_state", + "sched_switch_in", + "sched_switch_out" + }, + .function = { + { INVALID_STATE, own_context_snroc, INVALID_STATE }, + { own_context_snroc, INVALID_STATE, other_context_snroc }, + }, + .initial_state = other_context_snroc, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/snroc/snroc_trace.h b/kernel/trace/rv/monitors/snroc/snroc_trace.h new file mode 100644 index 000000000000..50114cef5122 --- /dev/null +++ b/kernel/trace/rv/monitors/snroc/snroc_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SNROC +DEFINE_EVENT(event_da_monitor_id, event_snroc, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_snroc, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); +#endif /* CONFIG_RV_MON_SNROC */ diff --git a/kernel/trace/rv/monitors/tss/Kconfig b/kernel/trace/rv/monitors/tss/Kconfig new file mode 100644 index 000000000000..479f86f52e60 --- /dev/null +++ b/kernel/trace/rv/monitors/tss/Kconfig @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_TSS + depends on RV + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "tss monitor" + help + Monitor to ensure sched_switch happens only in scheduling context. + This monitor is part of the sched monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/tss/tss.c b/kernel/trace/rv/monitors/tss/tss.c new file mode 100644 index 000000000000..542787e6524f --- /dev/null +++ b/kernel/trace/rv/monitors/tss/tss.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "tss" + +#include <trace/events/sched.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "tss.h" + +static struct rv_monitor rv_tss; +DECLARE_DA_MON_PER_CPU(tss, unsigned char); + +static void handle_sched_switch(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + da_handle_event_tss(sched_switch_tss); +} + +static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +{ + da_handle_event_tss(schedule_entry_tss); +} + +static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +{ + da_handle_start_event_tss(schedule_exit_tss); +} + +static int enable_tss(void) +{ + int retval; + + retval = da_monitor_init_tss(); + if (retval) + return retval; + + rv_attach_trace_probe("tss", sched_switch, handle_sched_switch); + rv_attach_trace_probe("tss", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("tss", sched_exit_tp, handle_schedule_exit); + + return 0; +} + +static void disable_tss(void) +{ + rv_tss.enabled = 0; + + rv_detach_trace_probe("tss", sched_switch, handle_sched_switch); + rv_detach_trace_probe("tss", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("tss", sched_exit_tp, handle_schedule_exit); + + da_monitor_destroy_tss(); +} + +static struct rv_monitor rv_tss = { + .name = "tss", + .description = "task switch while scheduling.", + .enable = enable_tss, + .disable = disable_tss, + .reset = da_monitor_reset_all_tss, + .enabled = 0, +}; + +static int __init register_tss(void) +{ + rv_register_monitor(&rv_tss, &rv_sched); + return 0; +} + +static void __exit unregister_tss(void) +{ + rv_unregister_monitor(&rv_tss); +} + +module_init(register_tss); +module_exit(unregister_tss); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("tss: task switch while scheduling."); diff --git a/kernel/trace/rv/monitors/tss/tss.h b/kernel/trace/rv/monitors/tss/tss.h new file mode 100644 index 000000000000..f0a36fda1b87 --- /dev/null +++ b/kernel/trace/rv/monitors/tss/tss.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of tss automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_tss { + thread_tss = 0, + sched_tss, + state_max_tss +}; + +#define INVALID_STATE state_max_tss + +enum events_tss { + sched_switch_tss = 0, + schedule_entry_tss, + schedule_exit_tss, + event_max_tss +}; + +struct automaton_tss { + char *state_names[state_max_tss]; + char *event_names[event_max_tss]; + unsigned char function[state_max_tss][event_max_tss]; + unsigned char initial_state; + bool final_states[state_max_tss]; +}; + +static const struct automaton_tss automaton_tss = { + .state_names = { + "thread", + "sched" + }, + .event_names = { + "sched_switch", + "schedule_entry", + "schedule_exit" + }, + .function = { + { INVALID_STATE, sched_tss, INVALID_STATE }, + { sched_tss, INVALID_STATE, thread_tss }, + }, + .initial_state = thread_tss, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/tss/tss_trace.h b/kernel/trace/rv/monitors/tss/tss_trace.h new file mode 100644 index 000000000000..4619dbb50cc0 --- /dev/null +++ b/kernel/trace/rv/monitors/tss/tss_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_TSS +DEFINE_EVENT(event_da_monitor, event_tss, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_tss, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_TSS */ diff --git a/kernel/trace/rv/monitors/wip/Kconfig b/kernel/trace/rv/monitors/wip/Kconfig index 3ef664b5cd90..e464b9294865 100644 --- a/kernel/trace/rv/monitors/wip/Kconfig +++ b/kernel/trace/rv/monitors/wip/Kconfig @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +# config RV_MON_WIP depends on RV depends on PREEMPT_TRACER diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c index db7389157c87..ed758fec8608 100644 --- a/kernel/trace/rv/monitors/wip/wip.c +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -71,7 +71,7 @@ static struct rv_monitor rv_wip = { static int __init register_wip(void) { - rv_register_monitor(&rv_wip); + rv_register_monitor(&rv_wip, NULL); return 0; } diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h index 2e373f2c65ed..c7193748bf36 100644 --- a/kernel/trace/rv/monitors/wip/wip.h +++ b/kernel/trace/rv/monitors/wip/wip.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Automatically generated C representation of wip automaton * For further information about this format, see kernel documentation: diff --git a/kernel/trace/rv/monitors/wwnr/Kconfig b/kernel/trace/rv/monitors/wwnr/Kconfig index ee741aa6d6b8..d3bfc20037db 100644 --- a/kernel/trace/rv/monitors/wwnr/Kconfig +++ b/kernel/trace/rv/monitors/wwnr/Kconfig @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +# config RV_MON_WWNR depends on RV select DA_MON_EVENTS_ID diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c index 3b16994a9984..172f31c4b0f3 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.c +++ b/kernel/trace/rv/monitors/wwnr/wwnr.c @@ -70,7 +70,7 @@ static struct rv_monitor rv_wwnr = { static int __init register_wwnr(void) { - rv_register_monitor(&rv_wwnr); + rv_register_monitor(&rv_wwnr, NULL); return 0; } diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h index d0d9c4b8121b..0a59d23edf61 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.h +++ b/kernel/trace/rv/monitors/wwnr/wwnr.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Automatically generated C representation of wwnr automaton * For further information about this format, see kernel documentation: diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 8657fc8806e7..50344aa9f7f9 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -162,7 +162,7 @@ struct dentry *get_monitors_root(void) /* * Interface for the monitor register. */ -static LIST_HEAD(rv_monitors_list); +LIST_HEAD(rv_monitors_list); static int task_monitor_count; static bool task_monitor_slots[RV_PER_TASK_MONITORS]; @@ -207,6 +207,30 @@ void rv_put_task_monitor_slot(int slot) } /* + * Monitors with a parent are nested, + * Monitors without a parent could be standalone or containers. + */ +bool rv_is_nested_monitor(struct rv_monitor_def *mdef) +{ + return mdef->parent != NULL; +} + +/* + * We set our list to have nested monitors listed after their parent + * if a monitor has a child element its a container. + * Containers can be also identified based on their function pointers: + * as they are not real monitors they do not need function definitions + * for enable()/disable(). Use this condition to find empty containers. + * Keep both conditions in case we have some non-compliant containers. + */ +bool rv_is_container_monitor(struct rv_monitor_def *mdef) +{ + struct rv_monitor_def *next = list_next_entry(mdef, list); + + return next->parent == mdef->monitor || !mdef->monitor->enable; +} + +/* * This section collects the monitor/ files and folders. */ static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf, size_t count, @@ -229,7 +253,8 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) if (mdef->monitor->enabled) { mdef->monitor->enabled = 0; - mdef->monitor->disable(); + if (mdef->monitor->disable) + mdef->monitor->disable(); /* * Wait for the execution of all events to finish. @@ -243,6 +268,60 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) return 0; } +static void rv_disable_single(struct rv_monitor_def *mdef) +{ + __rv_disable_monitor(mdef, true); +} + +static int rv_enable_single(struct rv_monitor_def *mdef) +{ + int retval; + + lockdep_assert_held(&rv_interface_lock); + + if (mdef->monitor->enabled) + return 0; + + retval = mdef->monitor->enable(); + + if (!retval) + mdef->monitor->enabled = 1; + + return retval; +} + +static void rv_disable_container(struct rv_monitor_def *mdef) +{ + struct rv_monitor_def *p = mdef; + int enabled = 0; + + list_for_each_entry_continue(p, &rv_monitors_list, list) { + if (p->parent != mdef->monitor) + break; + enabled += __rv_disable_monitor(p, false); + } + if (enabled) + tracepoint_synchronize_unregister(); + mdef->monitor->enabled = 0; +} + +static int rv_enable_container(struct rv_monitor_def *mdef) +{ + struct rv_monitor_def *p = mdef; + int retval = 0; + + list_for_each_entry_continue(p, &rv_monitors_list, list) { + if (retval || p->parent != mdef->monitor) + break; + retval = rv_enable_single(p); + } + if (retval) + rv_disable_container(mdef); + else + mdef->monitor->enabled = 1; + return retval; +} + /** * rv_disable_monitor - disable a given runtime monitor * @mdef: Pointer to the monitor definition structure. @@ -251,7 +330,11 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) */ int rv_disable_monitor(struct rv_monitor_def *mdef) { - __rv_disable_monitor(mdef, true); + if (rv_is_container_monitor(mdef)) + rv_disable_container(mdef); + else + rv_disable_single(mdef); + return 0; } @@ -265,15 +348,10 @@ int rv_enable_monitor(struct rv_monitor_def *mdef) { int retval; - lockdep_assert_held(&rv_interface_lock); - - if (mdef->monitor->enabled) - return 0; - - retval = mdef->monitor->enable(); - - if (!retval) - mdef->monitor->enabled = 1; + if (rv_is_container_monitor(mdef)) + retval = rv_enable_container(mdef); + else + retval = rv_enable_single(mdef); return retval; } @@ -336,9 +414,9 @@ static const struct file_operations interface_desc_fops = { * the monitor dir, where the specific options of the monitor * are exposed. */ -static int create_monitor_dir(struct rv_monitor_def *mdef) +static int create_monitor_dir(struct rv_monitor_def *mdef, struct rv_monitor_def *parent) { - struct dentry *root = get_monitors_root(); + struct dentry *root = parent ? parent->root_d : get_monitors_root(); const char *name = mdef->monitor->name; struct dentry *tmp; int retval; @@ -377,7 +455,11 @@ static int monitors_show(struct seq_file *m, void *p) { struct rv_monitor_def *mon_def = p; - seq_printf(m, "%s\n", mon_def->monitor->name); + if (mon_def->parent) + seq_printf(m, "%s:%s\n", mon_def->parent->name, + mon_def->monitor->name); + else + seq_printf(m, "%s\n", mon_def->monitor->name); return 0; } @@ -514,7 +596,7 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user struct rv_monitor_def *mdef; int retval = -EINVAL; bool enable = true; - char *ptr; + char *ptr, *tmp; int len; if (count < 1 || count > MAX_RV_MONITOR_NAME_SIZE + 1) @@ -541,6 +623,11 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user retval = -EINVAL; + /* we support 1 nesting level, trim the parent */ + tmp = strstr(ptr, ":"); + if (tmp) + ptr = tmp+1; + list_for_each_entry(mdef, &rv_monitors_list, list) { if (strcmp(ptr, mdef->monitor->name) != 0) continue; @@ -613,7 +700,7 @@ static void reset_all_monitors(void) struct rv_monitor_def *mdef; list_for_each_entry(mdef, &rv_monitors_list, list) { - if (mdef->monitor->enabled) + if (mdef->monitor->enabled && mdef->monitor->reset) mdef->monitor->reset(); } } @@ -685,18 +772,19 @@ static void destroy_monitor_dir(struct rv_monitor_def *mdef) /** * rv_register_monitor - register a rv monitor. * @monitor: The rv_monitor to be registered. + * @parent: The parent of the monitor to be registered, NULL if not nested. * * Returns 0 if successful, error otherwise. */ -int rv_register_monitor(struct rv_monitor *monitor) +int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent) { - struct rv_monitor_def *r; + struct rv_monitor_def *r, *p = NULL; int retval = 0; if (strlen(monitor->name) >= MAX_RV_MONITOR_NAME_SIZE) { pr_info("Monitor %s has a name longer than %d\n", monitor->name, MAX_RV_MONITOR_NAME_SIZE); - return -1; + return -EINVAL; } mutex_lock(&rv_interface_lock); @@ -704,11 +792,26 @@ int rv_register_monitor(struct rv_monitor *monitor) list_for_each_entry(r, &rv_monitors_list, list) { if (strcmp(monitor->name, r->monitor->name) == 0) { pr_info("Monitor %s is already registered\n", monitor->name); - retval = -1; + retval = -EEXIST; goto out_unlock; } } + if (parent) { + list_for_each_entry(r, &rv_monitors_list, list) { + if (strcmp(parent->name, r->monitor->name) == 0) { + p = r; + break; + } + } + } + + if (p && rv_is_nested_monitor(p)) { + pr_info("Parent monitor %s is already nested, cannot nest further\n", + parent->name); + return -EINVAL; + } + r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL); if (!r) { retval = -ENOMEM; @@ -716,14 +819,19 @@ int rv_register_monitor(struct rv_monitor *monitor) } r->monitor = monitor; + r->parent = parent; - retval = create_monitor_dir(r); + retval = create_monitor_dir(r, p); if (retval) { kfree(r); goto out_unlock; } - list_add_tail(&r->list, &rv_monitors_list); + /* keep children close to the parent for easier visualisation */ + if (p) + list_add(&r->list, &p->list); + else + list_add_tail(&r->list, &rv_monitors_list); out_unlock: mutex_unlock(&rv_interface_lock); diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h index db6cb0913dbd..98fca0a1adbc 100644 --- a/kernel/trace/rv/rv.h +++ b/kernel/trace/rv/rv.h @@ -21,6 +21,7 @@ struct rv_interface { #define MAX_RV_REACTOR_NAME_SIZE 32 extern struct mutex rv_interface_lock; +extern struct list_head rv_monitors_list; #ifdef CONFIG_RV_REACTORS struct rv_reactor_def { @@ -34,6 +35,7 @@ struct rv_reactor_def { struct rv_monitor_def { struct list_head list; struct rv_monitor *monitor; + struct rv_monitor *parent; struct dentry *root_d; #ifdef CONFIG_RV_REACTORS struct rv_reactor_def *rdef; @@ -45,6 +47,8 @@ struct rv_monitor_def { struct dentry *get_monitors_root(void); int rv_disable_monitor(struct rv_monitor_def *mdef); int rv_enable_monitor(struct rv_monitor_def *mdef); +bool rv_is_container_monitor(struct rv_monitor_def *mdef); +bool rv_is_nested_monitor(struct rv_monitor_def *mdef); #ifdef CONFIG_RV_REACTORS int reactor_populate_monitor(struct rv_monitor_def *mdef); diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c index 7b49cbe388d4..9501ca886d83 100644 --- a/kernel/trace/rv/rv_reactors.c +++ b/kernel/trace/rv/rv_reactors.c @@ -158,8 +158,9 @@ static const struct seq_operations monitor_reactors_seq_ops = { .show = monitor_reactor_show }; -static void monitor_swap_reactors(struct rv_monitor_def *mdef, struct rv_reactor_def *rdef, - bool reacting) +static void monitor_swap_reactors_single(struct rv_monitor_def *mdef, + struct rv_reactor_def *rdef, + bool reacting, bool nested) { bool monitor_enabled; @@ -179,10 +180,31 @@ static void monitor_swap_reactors(struct rv_monitor_def *mdef, struct rv_reactor mdef->reacting = reacting; mdef->monitor->react = rdef->reactor->react; - if (monitor_enabled) + /* enable only once if iterating through a container */ + if (monitor_enabled && !nested) rv_enable_monitor(mdef); } +static void monitor_swap_reactors(struct rv_monitor_def *mdef, + struct rv_reactor_def *rdef, bool reacting) +{ + struct rv_monitor_def *p = mdef; + + if (rv_is_container_monitor(mdef)) + list_for_each_entry_continue(p, &rv_monitors_list, list) { + if (p->parent != mdef->monitor) + break; + monitor_swap_reactors_single(p, rdef, reacting, true); + } + /* + * This call enables and disables the monitor if they were active. + * In case of a container, we already disabled all and will enable all. + * All nested monitors are enabled also if they were off, we may refine + * this logic in the future. + */ + monitor_swap_reactors_single(mdef, rdef, reacting, false); +} + static ssize_t monitor_reactors_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h index 5e65097423ba..422b75f58891 100644 --- a/kernel/trace/rv/rv_trace.h +++ b/kernel/trace/rv/rv_trace.h @@ -58,6 +58,11 @@ DECLARE_EVENT_CLASS(error_da_monitor, ); #include <monitors/wip/wip_trace.h> +#include <monitors/tss/tss_trace.h> +#include <monitors/sco/sco_trace.h> +#include <monitors/scpd/scpd_trace.h> +#include <monitors/snep/snep_trace.h> +#include <monitors/sncid/sncid_trace.h> // Add new monitors based on CONFIG_DA_MON_EVENTS_IMPLICIT here #endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */ @@ -118,6 +123,7 @@ DECLARE_EVENT_CLASS(error_da_monitor_id, ); #include <monitors/wwnr/wwnr_trace.h> +#include <monitors/snroc/snroc_trace.h> // Add new monitors based on CONFIG_DA_MON_EVENTS_ID here #endif /* CONFIG_DA_MON_EVENTS_ID */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fd3cb2b2ab82..826267f5b650 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -87,6 +87,7 @@ void __init disable_tracing_selftest(const char *reason) static struct trace_iterator *tracepoint_print_iter; int tracepoint_printk; static bool tracepoint_printk_stop_on_boot __initdata; +static bool traceoff_after_boot __initdata; static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key); /* For tracers that don't implement custom flags */ @@ -330,6 +331,13 @@ static int __init set_tracepoint_printk_stop(char *str) } __setup("tp_printk_stop_on_boot", set_tracepoint_printk_stop); +static int __init set_traceoff_after_boot(char *str) +{ + traceoff_after_boot = true; + return 1; +} +__setup("traceoff_after_boot", set_traceoff_after_boot); + unsigned long long ns2usecs(u64 nsec) { nsec += 500; @@ -2878,13 +2886,16 @@ trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, void trace_function(struct trace_array *tr, unsigned long ip, unsigned long - parent_ip, unsigned int trace_ctx) + parent_ip, unsigned int trace_ctx, struct ftrace_regs *fregs) { struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; + int size = sizeof(*entry); + + size += FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long); - event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), + event = __trace_buffer_lock_reserve(buffer, TRACE_FN, size, trace_ctx); if (!event) return; @@ -2892,6 +2903,13 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long entry->ip = ip; entry->parent_ip = parent_ip; +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + if (fregs) { + for (int i = 0; i < FTRACE_REGS_MAX_ARGS; i++) + entry->args[i] = ftrace_regs_get_argument(fregs, i); + } +#endif + if (static_branch_unlikely(&trace_function_exports_enabled)) ftrace_exports(event, TRACE_EXPORT_FUNCTION); __buffer_unlock_commit(buffer, event); @@ -10699,6 +10717,9 @@ __init static int late_trace_init(void) tracepoint_printk = 0; } + if (traceoff_after_boot) + tracing_off(); + tracing_set_default_clock(); clear_boot_tracer(); return 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9c21ba45b7af..4a6621e2a0fa 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -21,6 +21,7 @@ #include <linux/workqueue.h> #include <linux/ctype.h> #include <linux/once_lite.h> +#include <linux/ftrace_regs.h> #include "pid_list.h" @@ -697,7 +698,8 @@ unsigned long trace_total_entries(struct trace_array *tr); void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, - unsigned int trace_ctx); + unsigned int trace_ctx, + struct ftrace_regs *regs); void trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, @@ -897,6 +899,7 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) #define TRACE_GRAPH_PRINT_RETVAL 0x800 #define TRACE_GRAPH_PRINT_RETVAL_HEX 0x1000 #define TRACE_GRAPH_PRINT_RETADDR 0x2000 +#define TRACE_GRAPH_ARGS 0x4000 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) @@ -1714,7 +1717,7 @@ struct event_trigger_data { unsigned long count; int ref; int flags; - struct event_trigger_ops *ops; + const struct event_trigger_ops *ops; struct event_command *cmd_ops; struct event_filter __rcu *filter; char *filter_str; @@ -1959,7 +1962,7 @@ struct event_command { int (*set_filter)(char *filter_str, struct event_trigger_data *data, struct trace_event_file *file); - struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); + const struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); }; /** diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index fbfb396905a6..ee40d4e6ad1c 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -61,8 +61,9 @@ FTRACE_ENTRY_REG(function, ftrace_entry, TRACE_FN, F_STRUCT( - __field_fn( unsigned long, ip ) - __field_fn( unsigned long, parent_ip ) + __field_fn( unsigned long, ip ) + __field_fn( unsigned long, parent_ip ) + __dynamic_array( unsigned long, args ) ), F_printk(" %ps <-- %ps", @@ -72,17 +73,18 @@ FTRACE_ENTRY_REG(function, ftrace_entry, ); /* Function call entry */ -FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, +FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, TRACE_GRAPH_ENT, F_STRUCT( __field_struct( struct ftrace_graph_ent, graph_ent ) __field_packed( unsigned long, graph_ent, func ) - __field_packed( int, graph_ent, depth ) + __field_packed( unsigned long, graph_ent, depth ) + __dynamic_array(unsigned long, args ) ), - F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth) + F_printk("--> %ps (%lu)", (void *)__entry->func, __entry->depth) ); #ifdef CONFIG_FUNCTION_GRAPH_RETADDR diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 82fd637cfc19..c08355c3ef32 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -478,7 +478,7 @@ static void eprobe_trigger_func(struct event_trigger_data *data, __eprobe_trace_func(edata, rec); } -static struct event_trigger_ops eprobe_trigger_ops = { +static const struct event_trigger_ops eprobe_trigger_ops = { .trigger = eprobe_trigger_func, .print = eprobe_trigger_print, .init = eprobe_trigger_init, @@ -507,8 +507,8 @@ static void eprobe_trigger_unreg_func(char *glob, } -static struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd, - char *param) +static const struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd, + char *param) { return &eprobe_trigger_ops; } @@ -913,6 +913,8 @@ static int __trace_eprobe_create(int argc, const char *argv[]) } if (argc - 2 > MAX_TRACE_ARGS) { + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_ARGS); ret = -E2BIG; goto error; } diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 3ff9caa4a71b..a6bb7577e8c5 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -49,7 +49,7 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event, /* The ftrace function trace is allowed only for root. */ if (ftrace_event_is_function(tp_event)) { - ret = perf_allow_tracepoint(&p_event->attr); + ret = perf_allow_tracepoint(); if (ret) return ret; @@ -86,7 +86,7 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event, * ...otherwise raw tracepoint data can be a severe data leak, * only allow root to have these. */ - ret = perf_allow_tracepoint(&p_event->attr); + ret = perf_allow_tracepoint(); if (ret) return ret; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 513de9ceb80e..8e7603acca21 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -790,7 +790,9 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } - call->class->reg(call, TRACE_REG_UNREGISTER, file); + ret = call->class->reg(call, TRACE_REG_UNREGISTER, file); + + WARN_ON_ONCE(ret); } /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ if (file->flags & EVENT_FILE_FL_SOFT_MODE) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 53dc6719181e..1260c23cfa5f 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6203,7 +6203,7 @@ static void event_hist_trigger_free(struct event_trigger_data *data) } } -static struct event_trigger_ops event_hist_trigger_ops = { +static const struct event_trigger_ops event_hist_trigger_ops = { .trigger = event_hist_trigger, .print = event_hist_trigger_print, .init = event_hist_trigger_init, @@ -6235,15 +6235,15 @@ static void event_hist_trigger_named_free(struct event_trigger_data *data) } } -static struct event_trigger_ops event_hist_trigger_named_ops = { +static const struct event_trigger_ops event_hist_trigger_named_ops = { .trigger = event_hist_trigger, .print = event_hist_trigger_print, .init = event_hist_trigger_named_init, .free = event_hist_trigger_named_free, }; -static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd, - char *param) +static const struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd, + char *param) { return &event_hist_trigger_ops; } @@ -6838,38 +6838,38 @@ hist_enable_count_trigger(struct event_trigger_data *data, hist_enable_trigger(data, buffer, rec, event); } -static struct event_trigger_ops hist_enable_trigger_ops = { +static const struct event_trigger_ops hist_enable_trigger_ops = { .trigger = hist_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops hist_enable_count_trigger_ops = { +static const struct event_trigger_ops hist_enable_count_trigger_ops = { .trigger = hist_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops hist_disable_trigger_ops = { +static const struct event_trigger_ops hist_disable_trigger_ops = { .trigger = hist_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops hist_disable_count_trigger_ops = { +static const struct event_trigger_ops hist_disable_count_trigger_ops = { .trigger = hist_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops * +static const struct event_trigger_ops * hist_enable_get_trigger_ops(char *cmd, char *param) { - struct event_trigger_ops *ops; + const struct event_trigger_ops *ops; bool enable; enable = (strcmp(cmd, ENABLE_HIST_STR) == 0); diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index e3f7d09e5512..969f48742d72 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -207,7 +207,7 @@ static int synth_field_string_size(char *type) if (len == 0) return 0; /* variable-length string */ - strncpy(buf, start, len); + memcpy(buf, start, len); buf[len] = '\0'; err = kstrtouint(buf, 0, &size); @@ -305,7 +305,7 @@ static const char *synth_field_fmt(char *type) else if (strcmp(type, "gfp_t") == 0) fmt = "%x"; else if (synth_field_is_string(type)) - fmt = "%.*s"; + fmt = "%s"; else if (synth_field_is_stack(type)) fmt = "%s"; @@ -612,7 +612,7 @@ static int __set_synth_event_print_fmt(struct synth_event *event, fmt = synth_field_fmt(event->fields[i]->type); pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s", event->fields[i]->name, fmt, - i == event->n_fields - 1 ? "" : ", "); + i == event->n_fields - 1 ? "" : " "); } pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); @@ -852,6 +852,38 @@ static struct trace_event_fields synth_event_fields_array[] = { {} }; +static int synth_event_reg(struct trace_event_call *call, + enum trace_reg type, void *data) +{ + struct synth_event *event = container_of(call, struct synth_event, call); + + switch (type) { +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: +#endif + case TRACE_REG_REGISTER: + if (!try_module_get(event->mod)) + return -EBUSY; + break; + default: + break; + } + + int ret = trace_event_reg(call, type, data); + + switch (type) { +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_UNREGISTER: +#endif + case TRACE_REG_UNREGISTER: + module_put(event->mod); + break; + default: + break; + } + return ret; +} + static int register_synth_event(struct synth_event *event) { struct trace_event_call *call = &event->call; @@ -881,7 +913,7 @@ static int register_synth_event(struct synth_event *event) goto out; } call->flags = TRACE_EVENT_FL_TRACEPOINT; - call->class->reg = trace_event_reg; + call->class->reg = synth_event_reg; call->class->probe = trace_event_raw_event_synth; call->data = event; call->tp = event->tp; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d45448947094..b66b6d235d91 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -825,7 +825,7 @@ struct event_trigger_data *event_trigger_alloc(struct event_command *cmd_ops, void *private_data) { struct event_trigger_data *trigger_data; - struct event_trigger_ops *trigger_ops; + const struct event_trigger_ops *trigger_ops; trigger_ops = cmd_ops->get_trigger_ops(cmd, param); @@ -1367,38 +1367,38 @@ traceoff_trigger_print(struct seq_file *m, struct event_trigger_data *data) data->filter_str); } -static struct event_trigger_ops traceon_trigger_ops = { +static const struct event_trigger_ops traceon_trigger_ops = { .trigger = traceon_trigger, .print = traceon_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops traceon_count_trigger_ops = { +static const struct event_trigger_ops traceon_count_trigger_ops = { .trigger = traceon_count_trigger, .print = traceon_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops traceoff_trigger_ops = { +static const struct event_trigger_ops traceoff_trigger_ops = { .trigger = traceoff_trigger, .print = traceoff_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops traceoff_count_trigger_ops = { +static const struct event_trigger_ops traceoff_count_trigger_ops = { .trigger = traceoff_count_trigger, .print = traceoff_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops * +static const struct event_trigger_ops * onoff_get_trigger_ops(char *cmd, char *param) { - struct event_trigger_ops *ops; + const struct event_trigger_ops *ops; /* we register both traceon and traceoff to this callback */ if (strcmp(cmd, "traceon") == 0) @@ -1491,21 +1491,21 @@ snapshot_trigger_print(struct seq_file *m, struct event_trigger_data *data) data->filter_str); } -static struct event_trigger_ops snapshot_trigger_ops = { +static const struct event_trigger_ops snapshot_trigger_ops = { .trigger = snapshot_trigger, .print = snapshot_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops snapshot_count_trigger_ops = { +static const struct event_trigger_ops snapshot_count_trigger_ops = { .trigger = snapshot_count_trigger, .print = snapshot_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops * +static const struct event_trigger_ops * snapshot_get_trigger_ops(char *cmd, char *param) { return param ? &snapshot_count_trigger_ops : &snapshot_trigger_ops; @@ -1586,21 +1586,21 @@ stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data) data->filter_str); } -static struct event_trigger_ops stacktrace_trigger_ops = { +static const struct event_trigger_ops stacktrace_trigger_ops = { .trigger = stacktrace_trigger, .print = stacktrace_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops stacktrace_count_trigger_ops = { +static const struct event_trigger_ops stacktrace_count_trigger_ops = { .trigger = stacktrace_count_trigger, .print = stacktrace_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops * +static const struct event_trigger_ops * stacktrace_get_trigger_ops(char *cmd, char *param) { return param ? &stacktrace_count_trigger_ops : &stacktrace_trigger_ops; @@ -1711,28 +1711,28 @@ void event_enable_trigger_free(struct event_trigger_data *data) } } -static struct event_trigger_ops event_enable_trigger_ops = { +static const struct event_trigger_ops event_enable_trigger_ops = { .trigger = event_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops event_enable_count_trigger_ops = { +static const struct event_trigger_ops event_enable_count_trigger_ops = { .trigger = event_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops event_disable_trigger_ops = { +static const struct event_trigger_ops event_disable_trigger_ops = { .trigger = event_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops event_disable_count_trigger_ops = { +static const struct event_trigger_ops event_disable_count_trigger_ops = { .trigger = event_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, @@ -1916,10 +1916,10 @@ void event_enable_unregister_trigger(char *glob, data->ops->free(data); } -static struct event_trigger_ops * +static const struct event_trigger_ops * event_enable_get_trigger_ops(char *cmd, char *param) { - struct event_trigger_ops *ops; + const struct event_trigger_ops *ops; bool enable; #ifdef CONFIG_HIST_TRIGGERS diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 97325fbd6283..af42aaa3d172 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -455,7 +455,7 @@ static void user_event_enabler_fault_fixup(struct work_struct *work) if (ret && ret != -ENOENT) { struct user_event *user = enabler->event; - pr_warn("user_events: Fault for mm: 0x%pK @ 0x%llx event: %s\n", + pr_warn("user_events: Fault for mm: 0x%p @ 0x%llx event: %s\n", mm->mm, (unsigned long long)uaddr, EVENT_NAME(user)); } @@ -2793,11 +2793,8 @@ static int user_seq_show(struct seq_file *m, void *p) seq_printf(m, "%s", EVENT_TP_NAME(user)); - if (status != 0) - seq_puts(m, " #"); - if (status != 0) { - seq_puts(m, " Used by"); + seq_puts(m, " # Used by"); if (status & EVENT_STATUS_FTRACE) seq_puts(m, " ftrace"); if (status & EVENT_STATUS_PERF) diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index 985ff98272da..5d7ca80173ea 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -1199,8 +1199,11 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], argc = new_argc; argv = new_argv; } - if (argc > MAX_TRACE_ARGS) + if (argc > MAX_TRACE_ARGS) { + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_ARGS); return -E2BIG; + } ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); if (ret) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index df56f9b76010..98ccf3f00c51 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -25,6 +25,9 @@ static void function_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); static void +function_args_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); +static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); static void @@ -42,9 +45,10 @@ enum { TRACE_FUNC_NO_OPTS = 0x0, /* No flags set. */ TRACE_FUNC_OPT_STACK = 0x1, TRACE_FUNC_OPT_NO_REPEATS = 0x2, + TRACE_FUNC_OPT_ARGS = 0x4, /* Update this to next highest bit. */ - TRACE_FUNC_OPT_HIGHEST_BIT = 0x4 + TRACE_FUNC_OPT_HIGHEST_BIT = 0x8 }; #define TRACE_FUNC_OPT_MASK (TRACE_FUNC_OPT_HIGHEST_BIT - 1) @@ -114,6 +118,8 @@ static ftrace_func_t select_trace_function(u32 flags_val) switch (flags_val & TRACE_FUNC_OPT_MASK) { case TRACE_FUNC_NO_OPTS: return function_trace_call; + case TRACE_FUNC_OPT_ARGS: + return function_args_trace_call; case TRACE_FUNC_OPT_STACK: return function_stack_trace_call; case TRACE_FUNC_OPT_NO_REPEATS: @@ -220,7 +226,34 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, data = this_cpu_ptr(tr->array_buffer.data); if (!atomic_read(&data->disabled)) - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); + + ftrace_test_recursion_unlock(bit); +} + +static void +function_args_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + struct trace_array *tr = op->private; + struct trace_array_cpu *data; + unsigned int trace_ctx; + int bit; + int cpu; + + if (unlikely(!tr->function_enabled)) + return; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + + trace_ctx = tracing_gen_ctx(); + + cpu = smp_processor_id(); + data = per_cpu_ptr(tr->array_buffer.data, cpu); + if (!atomic_read(&data->disabled)) + trace_function(tr, ip, parent_ip, trace_ctx, fregs); ftrace_test_recursion_unlock(bit); } @@ -270,7 +303,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, if (likely(disabled == 1)) { trace_ctx = tracing_gen_ctx_flags(flags); - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); #ifdef CONFIG_UNWINDER_FRAME_POINTER if (ftrace_pids_enabled(op)) skip++; @@ -349,7 +382,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, trace_ctx = tracing_gen_ctx_dec(); process_repeats(tr, ip, parent_ip, last_info, trace_ctx); - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); out: ftrace_test_recursion_unlock(bit); @@ -389,7 +422,7 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, trace_ctx = tracing_gen_ctx_flags(flags); process_repeats(tr, ip, parent_ip, last_info, trace_ctx); - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); __trace_stack(tr, trace_ctx, STACK_SKIP); } @@ -403,6 +436,9 @@ static struct tracer_opt func_opts[] = { { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, #endif { TRACER_OPT(func-no-repeats, TRACE_FUNC_OPT_NO_REPEATS) }, +#ifdef CONFIG_FUNCTION_TRACE_ARGS + { TRACER_OPT(func-args, TRACE_FUNC_OPT_ARGS) }, +#endif { } /* Always set a last empty entry */ }; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 136c750b0b4d..2f077d4158e5 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -71,6 +71,10 @@ static struct tracer_opt trace_opts[] = { /* Display function return address ? */ { TRACER_OPT(funcgraph-retaddr, TRACE_GRAPH_PRINT_RETADDR) }, #endif +#ifdef CONFIG_FUNCTION_TRACE_ARGS + /* Display function arguments ? */ + { TRACER_OPT(funcgraph-args, TRACE_GRAPH_ARGS) }, +#endif /* Include sleep time (scheduled out) between entry and return */ { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) }, @@ -110,25 +114,43 @@ static void print_graph_duration(struct trace_array *tr, unsigned long long duration, struct trace_seq *s, u32 flags); -int __trace_graph_entry(struct trace_array *tr, - struct ftrace_graph_ent *trace, - unsigned int trace_ctx) +static int __graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, + unsigned int trace_ctx, struct ftrace_regs *fregs) { struct ring_buffer_event *event; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ftrace_graph_ent_entry *entry; + int size; - event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, - sizeof(*entry), trace_ctx); + /* If fregs is defined, add FTRACE_REGS_MAX_ARGS long size words */ + size = sizeof(*entry) + (FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long)); + + event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, size, trace_ctx); if (!event) return 0; - entry = ring_buffer_event_data(event); - entry->graph_ent = *trace; + + entry = ring_buffer_event_data(event); + entry->graph_ent = *trace; + +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + if (fregs) { + for (int i = 0; i < FTRACE_REGS_MAX_ARGS; i++) + entry->args[i] = ftrace_regs_get_argument(fregs, i); + } +#endif + trace_buffer_unlock_commit_nostack(buffer, event); return 1; } +int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned int trace_ctx) +{ + return __graph_entry(tr, trace, trace_ctx, NULL); +} + #ifdef CONFIG_FUNCTION_GRAPH_RETADDR int __trace_graph_retaddr_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, @@ -174,9 +196,9 @@ struct fgraph_times { unsigned long long sleeptime; /* may be optional! */ }; -int trace_graph_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops, - struct ftrace_regs *fregs) +static int graph_entry(struct ftrace_graph_ent *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { unsigned long *task_var = fgraph_get_task_var(gops); struct trace_array *tr = gops->private; @@ -246,7 +268,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace, unsigned long retaddr = ftrace_graph_top_ret_addr(current); ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr); } else { - ret = __trace_graph_entry(tr, trace, trace_ctx); + ret = __graph_entry(tr, trace, trace_ctx, fregs); } } preempt_enable_notrace(); @@ -254,6 +276,20 @@ int trace_graph_entry(struct ftrace_graph_ent *trace, return ret; } +int trace_graph_entry(struct ftrace_graph_ent *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) +{ + return graph_entry(trace, gops, NULL); +} + +static int trace_graph_entry_args(struct ftrace_graph_ent *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) +{ + return graph_entry(trace, gops, fregs); +} + static void __trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned int trace_ctx) @@ -418,14 +454,17 @@ static int graph_trace_init(struct trace_array *tr) { int ret; - tr->gops->entryfunc = trace_graph_entry; + if (tracer_flags_is_set(TRACE_GRAPH_ARGS)) + tr->gops->entryfunc = trace_graph_entry_args; + else + tr->gops->entryfunc = trace_graph_entry; if (tracing_thresh) tr->gops->retfunc = trace_graph_thresh_return; else tr->gops->retfunc = trace_graph_return; - /* Make gops functions are visible before we start tracing */ + /* Make gops functions visible before we start tracing */ smp_mb(); ret = register_ftrace_graph(tr->gops); @@ -436,6 +475,28 @@ static int graph_trace_init(struct trace_array *tr) return 0; } +static int ftrace_graph_trace_args(struct trace_array *tr, int set) +{ + trace_func_graph_ent_t entry; + + if (set) + entry = trace_graph_entry_args; + else + entry = trace_graph_entry; + + /* See if there's any changes */ + if (tr->gops->entryfunc == entry) + return 0; + + unregister_ftrace_graph(tr->gops); + + tr->gops->entryfunc = entry; + + /* Make gops functions visible before we start tracing */ + smp_mb(); + return register_ftrace_graph(tr->gops); +} + static void graph_trace_reset(struct trace_array *tr) { tracing_stop_cmdline_record(); @@ -775,7 +836,7 @@ static void print_graph_retaddr(struct trace_seq *s, struct fgraph_retaddr_ent_e static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entry *entry, struct ftrace_graph_ret *graph_ret, void *func, - u32 opt_flags, u32 trace_flags) + u32 opt_flags, u32 trace_flags, int args_size) { unsigned long err_code = 0; unsigned long retval = 0; @@ -809,7 +870,14 @@ static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entr if (entry->ent.type != TRACE_GRAPH_RETADDR_ENT) print_retaddr = false; - trace_seq_printf(s, "%ps();", func); + trace_seq_printf(s, "%ps", func); + + if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) { + print_function_args(s, entry->args, (unsigned long)func); + trace_seq_putc(s, ';'); + } else + trace_seq_puts(s, "();"); + if (print_retval || print_retaddr) trace_seq_puts(s, " /*"); else @@ -836,7 +904,8 @@ static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entr #else -#define print_graph_retval(_seq, _ent, _ret, _func, _opt_flags, _trace_flags) do {} while (0) +#define print_graph_retval(_seq, _ent, _ret, _func, _opt_flags, _trace_flags, args_size) \ + do {} while (0) #endif @@ -852,16 +921,17 @@ print_graph_entry_leaf(struct trace_iterator *iter, struct ftrace_graph_ret *graph_ret; struct ftrace_graph_ent *call; unsigned long long duration; - unsigned long func; + unsigned long ret_func; + int args_size; int cpu = iter->cpu; int i; + args_size = iter->ent_size - offsetof(struct ftrace_graph_ent_entry, args); + graph_ret = &ret_entry->ret; call = &entry->graph_ent; duration = ret_entry->rettime - ret_entry->calltime; - func = call->func + iter->tr->text_delta; - if (data) { struct fgraph_cpu_data *cpu_data; @@ -887,16 +957,25 @@ print_graph_entry_leaf(struct trace_iterator *iter, for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) trace_seq_putc(s, ' '); + ret_func = graph_ret->func + iter->tr->text_delta; + /* * Write out the function return value or return address */ if (flags & (__TRACE_GRAPH_PRINT_RETVAL | __TRACE_GRAPH_PRINT_RETADDR)) { print_graph_retval(s, entry, graph_ret, (void *)graph_ret->func + iter->tr->text_delta, - flags, tr->trace_flags); + flags, tr->trace_flags, args_size); } else { - trace_seq_printf(s, "%ps();\n", (void *)func); + trace_seq_printf(s, "%ps", (void *)ret_func); + + if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) { + print_function_args(s, entry->args, ret_func); + trace_seq_putc(s, ';'); + } else + trace_seq_puts(s, "();"); } + trace_seq_printf(s, "\n"); print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET, cpu, iter->ent->pid, flags); @@ -913,6 +992,7 @@ print_graph_entry_nested(struct trace_iterator *iter, struct fgraph_data *data = iter->private; struct trace_array *tr = iter->tr; unsigned long func; + int args_size; int i; if (data) { @@ -937,7 +1017,17 @@ print_graph_entry_nested(struct trace_iterator *iter, func = call->func + iter->tr->text_delta; - trace_seq_printf(s, "%ps() {", (void *)func); + trace_seq_printf(s, "%ps", (void *)func); + + args_size = iter->ent_size - offsetof(struct ftrace_graph_ent_entry, args); + + if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) + print_function_args(s, entry->args, func); + else + trace_seq_puts(s, "()"); + + trace_seq_puts(s, " {"); + if (flags & __TRACE_GRAPH_PRINT_RETADDR && entry->ent.type == TRACE_GRAPH_RETADDR_ENT) print_graph_retaddr(s, (struct fgraph_retaddr_ent_entry *)entry, @@ -1107,21 +1197,38 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, struct trace_iterator *iter, u32 flags) { struct fgraph_data *data = iter->private; - struct ftrace_graph_ent *call = &field->graph_ent; + struct ftrace_graph_ent *call; struct ftrace_graph_ret_entry *leaf_ret; static enum print_line_t ret; int cpu = iter->cpu; + /* + * print_graph_entry() may consume the current event, + * thus @field may become invalid, so we need to save it. + * sizeof(struct ftrace_graph_ent_entry) is very small, + * it can be safely saved at the stack. + */ + struct ftrace_graph_ent_entry *entry; + u8 save_buf[sizeof(*entry) + FTRACE_REGS_MAX_ARGS * sizeof(long)]; + + /* The ent_size is expected to be as big as the entry */ + if (iter->ent_size > sizeof(save_buf)) + iter->ent_size = sizeof(save_buf); + + entry = (void *)save_buf; + memcpy(entry, field, iter->ent_size); + + call = &entry->graph_ent; if (check_irq_entry(iter, flags, call->func, call->depth)) return TRACE_TYPE_HANDLED; print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags); - leaf_ret = get_return_for_leaf(iter, field); + leaf_ret = get_return_for_leaf(iter, entry); if (leaf_ret) - ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags); + ret = print_graph_entry_leaf(iter, entry, leaf_ret, s, flags); else - ret = print_graph_entry_nested(iter, field, s, cpu, flags); + ret = print_graph_entry_nested(iter, entry, s, cpu, flags); if (data) { /* @@ -1195,7 +1302,8 @@ print_graph_return(struct ftrace_graph_ret_entry *retentry, struct trace_seq *s, * funcgraph-retval option is enabled. */ if (flags & __TRACE_GRAPH_PRINT_RETVAL) { - print_graph_retval(s, NULL, trace, (void *)func, flags, tr->trace_flags); + print_graph_retval(s, NULL, trace, (void *)func, flags, + tr->trace_flags, 0); } else { /* * If the return function does not have a matching entry, @@ -1323,16 +1431,8 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) switch (entry->type) { case TRACE_GRAPH_ENT: { - /* - * print_graph_entry() may consume the current event, - * thus @field may become invalid, so we need to save it. - * sizeof(struct ftrace_graph_ent_entry) is very small, - * it can be safely saved at the stack. - */ - struct ftrace_graph_ent_entry saved; trace_assign_type(field, entry); - saved = *field; - return print_graph_entry(&saved, s, iter, flags); + return print_graph_entry(field, s, iter, flags); } #ifdef CONFIG_FUNCTION_GRAPH_RETADDR case TRACE_GRAPH_RETADDR_ENT: { @@ -1511,6 +1611,7 @@ void graph_trace_close(struct trace_iterator *iter) if (data) { free_percpu(data->cpu_data); kfree(data); + iter->private = NULL; } } @@ -1526,6 +1627,9 @@ func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (bit == TRACE_GRAPH_GRAPH_TIME) ftrace_graph_graph_time_control(set); + if (bit == TRACE_GRAPH_ARGS) + return ftrace_graph_trace_args(tr, set); + return 0; } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 7294ad676379..40c39e946940 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -150,7 +150,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, trace_ctx = tracing_gen_ctx_flags(flags); - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, fregs); atomic_dec(&data->disabled); } @@ -250,8 +250,6 @@ static void irqsoff_trace_open(struct trace_iterator *iter) { if (is_graph(iter->tr)) graph_trace_open(iter); - else - iter->private = NULL; } static void irqsoff_trace_close(struct trace_iterator *iter) @@ -295,11 +293,17 @@ __trace_function(struct trace_array *tr, if (is_graph(tr)) trace_graph_function(tr, ip, parent_ip, trace_ctx); else - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); } #else -#define __trace_function trace_function +static inline void +__trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned int trace_ctx) +{ + return trace_function(tr, ip, parent_ip, trace_ctx, NULL); +} static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d8d5f18a141a..8287b175667f 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1007,8 +1007,11 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], argc = new_argc; argv = new_argv; } - if (argc > MAX_TRACE_ARGS) + if (argc > MAX_TRACE_ARGS) { + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_ARGS); return -E2BIG; + } ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); if (ret) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 92e16f03fa4e..e732c9e37e14 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -316,33 +316,6 @@ static inline void osn_var_reset_all(void) bool trace_osnoise_callback_enabled; /* - * osnoise sample structure definition. Used to store the statistics of a - * sample run. - */ -struct osnoise_sample { - u64 runtime; /* runtime */ - u64 noise; /* noise */ - u64 max_sample; /* max single noise sample */ - int hw_count; /* # HW (incl. hypervisor) interference */ - int nmi_count; /* # NMIs during this sample */ - int irq_count; /* # IRQs during this sample */ - int softirq_count; /* # softirqs during this sample */ - int thread_count; /* # threads during this sample */ -}; - -#ifdef CONFIG_TIMERLAT_TRACER -/* - * timerlat sample structure definition. Used to store the statistics of - * a sample run. - */ -struct timerlat_sample { - u64 timer_latency; /* timer_latency */ - unsigned int seqnum; /* unique sequence */ - int context; /* timer context */ -}; -#endif - -/* * Tracer data. */ static struct osnoise_data { @@ -497,7 +470,7 @@ static void print_osnoise_headers(struct seq_file *s) * Record an osnoise_sample into the tracer buffer. */ static void -__trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer) +__record_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer) { struct ring_buffer_event *event; struct osnoise_entry *entry; @@ -520,17 +493,19 @@ __trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffe } /* - * Record an osnoise_sample on all osnoise instances. + * Record an osnoise_sample on all osnoise instances and fire trace event. */ -static void trace_osnoise_sample(struct osnoise_sample *sample) +static void record_osnoise_sample(struct osnoise_sample *sample) { struct osnoise_instance *inst; struct trace_buffer *buffer; + trace_osnoise_sample(sample); + rcu_read_lock(); list_for_each_entry_rcu(inst, &osnoise_instances, list) { buffer = inst->tr->array_buffer.buffer; - __trace_osnoise_sample(sample, buffer); + __record_osnoise_sample(sample, buffer); } rcu_read_unlock(); } @@ -574,7 +549,7 @@ static void print_timerlat_headers(struct seq_file *s) #endif /* CONFIG_PREEMPT_RT */ static void -__trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer) +__record_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer) { struct ring_buffer_event *event; struct timerlat_entry *entry; @@ -594,15 +569,17 @@ __trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buf /* * Record an timerlat_sample into the tracer buffer. */ -static void trace_timerlat_sample(struct timerlat_sample *sample) +static void record_timerlat_sample(struct timerlat_sample *sample) { struct osnoise_instance *inst; struct trace_buffer *buffer; + trace_timerlat_sample(sample); + rcu_read_lock(); list_for_each_entry_rcu(inst, &osnoise_instances, list) { buffer = inst->tr->array_buffer.buffer; - __trace_timerlat_sample(sample, buffer); + __record_timerlat_sample(sample, buffer); } rcu_read_unlock(); } @@ -1606,7 +1583,7 @@ static int run_osnoise(void) /* Save interference stats info */ diff_osn_sample_stats(osn_var, &s); - trace_osnoise_sample(&s); + record_osnoise_sample(&s); notify_new_max_latency(max_noise); @@ -1801,7 +1778,7 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer) s.timer_latency = diff; s.context = IRQ_CONTEXT; - trace_timerlat_sample(&s); + record_timerlat_sample(&s); if (osnoise_data.stop_tracing) { if (time_to_us(diff) >= osnoise_data.stop_tracing) { @@ -1920,7 +1897,7 @@ static int timerlat_main(void *data) s.timer_latency = diff; s.context = THREAD_CONTEXT; - trace_timerlat_sample(&s); + record_timerlat_sample(&s); notify_new_max_latency(diff); @@ -2029,7 +2006,6 @@ static int start_kthread(unsigned int cpu) if (IS_ERR(kthread)) { pr_err(BANNER "could not start sampling thread\n"); - stop_per_cpu_kthreads(); return -ENOMEM; } @@ -2525,7 +2501,7 @@ timerlat_fd_read(struct file *file, char __user *ubuf, size_t count, s.timer_latency = diff; s.context = THREAD_URET; - trace_timerlat_sample(&s); + record_timerlat_sample(&s); notify_new_max_latency(diff); @@ -2560,7 +2536,7 @@ timerlat_fd_read(struct file *file, char __user *ubuf, size_t count, s.timer_latency = diff; s.context = THREAD_CONTEXT; - trace_timerlat_sample(&s); + record_timerlat_sample(&s); if (osnoise_data.stop_tracing_total) { if (time_to_us(diff) >= osnoise_data.stop_tracing_total) { diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 03d56f711ad1..72b699f909e8 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -12,15 +12,19 @@ #include <linux/sched/clock.h> #include <linux/sched/mm.h> #include <linux/idr.h> +#include <linux/btf.h> +#include <linux/bpf.h> +#include <linux/hashtable.h> #include "trace_output.h" +#include "trace_btf.h" -/* must be a power of 2 */ -#define EVENT_HASHSIZE 128 +/* 2^7 = 128 */ +#define EVENT_HASH_BITS 7 DECLARE_RWSEM(trace_event_sem); -static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; +static DEFINE_HASHTABLE(event_hash, EVENT_HASH_BITS); enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) { @@ -684,6 +688,88 @@ int trace_print_lat_context(struct trace_iterator *iter) return !trace_seq_has_overflowed(s); } +#ifdef CONFIG_FUNCTION_TRACE_ARGS +void print_function_args(struct trace_seq *s, unsigned long *args, + unsigned long func) +{ + const struct btf_param *param; + const struct btf_type *t; + const char *param_name; + char name[KSYM_NAME_LEN]; + unsigned long arg; + struct btf *btf; + s32 tid, nr = 0; + int a, p, x; + + trace_seq_printf(s, "("); + + if (!args) + goto out; + if (lookup_symbol_name(func, name)) + goto out; + + /* TODO: Pass module name here too */ + t = btf_find_func_proto(name, &btf); + if (IS_ERR_OR_NULL(t)) + goto out; + + param = btf_get_func_param(t, &nr); + if (!param) + goto out_put; + + for (a = 0, p = 0; p < nr; a++, p++) { + if (p) + trace_seq_puts(s, ", "); + + /* This only prints what the arch allows (6 args by default) */ + if (a == FTRACE_REGS_MAX_ARGS) { + trace_seq_puts(s, "..."); + break; + } + + arg = args[a]; + + param_name = btf_name_by_offset(btf, param[p].name_off); + if (param_name) + trace_seq_printf(s, "%s=", param_name); + t = btf_type_skip_modifiers(btf, param[p].type, &tid); + + switch (t ? BTF_INFO_KIND(t->info) : BTF_KIND_UNKN) { + case BTF_KIND_UNKN: + trace_seq_putc(s, '?'); + /* Still print unknown type values */ + fallthrough; + case BTF_KIND_PTR: + trace_seq_printf(s, "0x%lx", arg); + break; + case BTF_KIND_INT: + trace_seq_printf(s, "%ld", arg); + break; + case BTF_KIND_ENUM: + trace_seq_printf(s, "%ld", arg); + break; + default: + /* This does not handle complex arguments */ + trace_seq_printf(s, "(%s)[0x%lx", btf_type_str(t), arg); + for (x = sizeof(long); x < t->size; x += sizeof(long)) { + trace_seq_putc(s, ':'); + if (++a == FTRACE_REGS_MAX_ARGS) { + trace_seq_puts(s, "...]"); + goto out_put; + } + trace_seq_printf(s, "0x%lx", args[a]); + } + trace_seq_putc(s, ']'); + break; + } + } +out_put: + btf_put(btf); +out: + trace_seq_printf(s, ")"); +} +#endif + /** * ftrace_find_event - find a registered event * @type: the type of event to look for @@ -694,11 +780,8 @@ int trace_print_lat_context(struct trace_iterator *iter) struct trace_event *ftrace_find_event(int type) { struct trace_event *event; - unsigned key; - - key = type & (EVENT_HASHSIZE - 1); - hlist_for_each_entry(event, &event_hash[key], node) { + hash_for_each_possible(event_hash, event, node, type) { if (event->type == type) return event; } @@ -753,7 +836,6 @@ void trace_event_read_unlock(void) */ int register_trace_event(struct trace_event *event) { - unsigned key; int ret = 0; down_write(&trace_event_sem); @@ -786,9 +868,7 @@ int register_trace_event(struct trace_event *event) if (event->funcs->binary == NULL) event->funcs->binary = trace_nop_print; - key = event->type & (EVENT_HASHSIZE - 1); - - hlist_add_head(&event->node, &event_hash[key]); + hash_add(event_hash, &event->node, event->type); ret = event->type; out: @@ -803,7 +883,7 @@ EXPORT_SYMBOL_GPL(register_trace_event); */ int __unregister_trace_event(struct trace_event *event) { - hlist_del(&event->node); + hash_del(&event->node); free_trace_event_type(event->type); return 0; } @@ -1005,12 +1085,15 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, } static void print_fn_trace(struct trace_seq *s, unsigned long ip, - unsigned long parent_ip, long delta, int flags) + unsigned long parent_ip, long delta, + unsigned long *args, int flags) { ip += delta; parent_ip += delta; seq_print_ip_sym(s, ip, flags); + if (args) + print_function_args(s, args, ip); if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) { trace_seq_puts(s, " <-"); @@ -1024,10 +1107,19 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, { struct ftrace_entry *field; struct trace_seq *s = &iter->seq; + unsigned long *args; + int args_size; trace_assign_type(field, iter->ent); - print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags); + args_size = iter->ent_size - offsetof(struct ftrace_entry, args); + if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) + args = field->args; + else + args = NULL; + + print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, + args, flags); trace_seq_putc(s, '\n'); return trace_handle_return(s); @@ -1700,7 +1792,7 @@ trace_func_repeats_print(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags); + print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, NULL, flags); trace_seq_printf(s, " (repeats: %u, last_ts:", field->count); trace_print_time(s, iter, iter->ts - FUNC_REPEATS_GET_DELTA_TS(field)); diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index dca40f1f1da4..2e305364f2a9 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -41,5 +41,14 @@ extern struct rw_semaphore trace_event_sem; #define SEQ_PUT_HEX_FIELD(s, x) \ trace_seq_putmem_hex(s, &(x), sizeof(x)) +#ifdef CONFIG_FUNCTION_TRACE_ARGS +void print_function_args(struct trace_seq *s, unsigned long *args, + unsigned long func); +#else +static inline void print_function_args(struct trace_seq *s, unsigned long *args, + unsigned long func) { + trace_seq_puts(s, "()"); +} +#endif #endif diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 8f58ee1e8858..2eeecb6c95ee 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -770,6 +770,10 @@ static int check_prepare_btf_string_fetch(char *typename, #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API +/* + * Add the entry code to store the 'argnum'th parameter and return the offset + * in the entry data buffer where the data will be stored. + */ static int __store_entry_arg(struct trace_probe *tp, int argnum) { struct probe_entry_arg *earg = tp->entry_arg; @@ -793,6 +797,20 @@ static int __store_entry_arg(struct trace_probe *tp, int argnum) tp->entry_arg = earg; } + /* + * The entry code array is repeating the pair of + * [FETCH_OP_ARG(argnum)][FETCH_OP_ST_EDATA(offset of entry data buffer)] + * and the rest of entries are filled with [FETCH_OP_END]. + * + * To reduce the redundant function parameter fetching, we scan the entry + * code array to find the FETCH_OP_ARG which already fetches the 'argnum' + * parameter. If it doesn't match, update 'offset' to find the last + * offset. + * If we find the FETCH_OP_END without matching FETCH_OP_ARG entry, we + * will save the entry with FETCH_OP_ARG and FETCH_OP_ST_EDATA, and + * return data offset so that caller can find the data offset in the entry + * data buffer. + */ offset = 0; for (i = 0; i < earg->size - 1; i++) { switch (earg->code[i].op) { @@ -826,6 +844,16 @@ int traceprobe_get_entry_data_size(struct trace_probe *tp) if (!earg) return 0; + /* + * earg->code[] array has an operation sequence which is run in + * the entry handler. + * The sequence stopped by FETCH_OP_END and each data stored in + * the entry data buffer by FETCH_OP_ST_EDATA. The FETCH_OP_ST_EDATA + * stores the data at the data buffer + its offset, and all data are + * "unsigned long" size. The offset must be increased when a data is + * stored. Thus we need to find the last FETCH_OP_ST_EDATA in the + * code array. + */ for (i = 0; i < earg->size; i++) { switch (earg->code[i].op) { case FETCH_OP_END: diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 96792bc4b092..854e5668f5ee 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -545,6 +545,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(BAD_BTF_TID, "Failed to get BTF type info."),\ C(BAD_TYPE4STR, "This type does not fit for string."),\ C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),\ + C(TOO_MANY_ARGS, "Too many arguments are specified"), \ C(TOO_MANY_EARGS, "Too many entry arguments specified"), #undef C diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index af30586f1aea..a0db3404f7f7 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -188,8 +188,6 @@ static void wakeup_trace_open(struct trace_iterator *iter) { if (is_graph(iter->tr)) graph_trace_open(iter); - else - iter->private = NULL; } static void wakeup_trace_close(struct trace_iterator *iter) @@ -242,7 +240,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, return; local_irq_save(flags); - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, fregs); local_irq_restore(flags); atomic_dec(&data->disabled); @@ -327,7 +325,7 @@ __trace_function(struct trace_array *tr, if (is_graph(tr)) trace_graph_function(tr, ip, parent_ip, trace_ctx); else - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); } static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index ccc762fbb69c..3386439ec9f6 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -562,8 +562,14 @@ static int __trace_uprobe_create(int argc, const char **argv) if (argc < 2) return -ECANCELED; - if (argc - 2 > MAX_TRACE_ARGS) + + trace_probe_log_init("trace_uprobe", argc, argv); + + if (argc - 2 > MAX_TRACE_ARGS) { + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_ARGS); return -E2BIG; + } if (argv[0][1] == ':') event = &argv[0][2]; @@ -582,7 +588,6 @@ static int __trace_uprobe_create(int argc, const char **argv) return -ECANCELED; } - trace_probe_log_init("trace_uprobe", argc, argv); trace_probe_log_set_index(1); /* filename is the 2nd argument */ *arg++ = '\0'; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 1848ce7e2976..62719d2941c9 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -127,7 +127,7 @@ static void debug_print_probes(struct tracepoint_func *funcs) return; for (i = 0; funcs[i].func; i++) - printk(KERN_DEBUG "Probe %d : %p\n", i, funcs[i].func); + printk(KERN_DEBUG "Probe %d : %pSb\n", i, funcs[i].func); } static struct tracepoint_func * |