From 877c685607925238e302cd3aa38788dca6c1b226 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 5 Mar 2013 11:38:08 +0800 Subject: perf: Remove include of cgroup.h from perf_event.h Move struct perf_cgroup_info and perf_cgroup to kernel/perf/core.c, and then we can remove include of cgroup.h. Signed-off-by: Li Zefan Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tejun Heo Link: http://lkml.kernel.org/r/513568A0.6020804@huawei.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index e47ee462c2f2..8737e1cee8b2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -21,7 +21,6 @@ */ #ifdef CONFIG_PERF_EVENTS -# include # include # include #endif @@ -299,22 +298,7 @@ struct swevent_hlist { #define PERF_ATTACH_GROUP 0x02 #define PERF_ATTACH_TASK 0x04 -#ifdef CONFIG_CGROUP_PERF -/* - * perf_cgroup_info keeps track of time_enabled for a cgroup. - * This is a per-cpu dynamically allocated data structure. - */ -struct perf_cgroup_info { - u64 time; - u64 timestamp; -}; - -struct perf_cgroup { - struct cgroup_subsys_state css; - struct perf_cgroup_info *info; /* timing info, one per cpu */ -}; -#endif - +struct perf_cgroup; struct ring_buffer; /** -- cgit v1.2.3 From 3a54aaa0a3ddb2cf2ec1b94a94024e9a8a8af962 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 24 Jan 2013 16:10:26 +0100 Subject: perf/x86: Improve sysfs event mapping with event string This patch extends Jiri's changes to make generic events mapping visible via sysfs. The patch extends the mechanism to non-generic events by allowing the mappings to be hardcoded in strings. This mechanism will be used by the PEBS-LL patch later on. Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: ak@linux.intel.com Cc: acme@redhat.com Cc: jolsa@redhat.com Cc: namhyung.kim@lge.com Link: http://lkml.kernel.org/r/1359040242-8269-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar [ fixed up conflict with 2663960 "perf: Make EVENT_ATTR global" ] Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/kernel/cpu/perf_event.c | 20 ++++++++++++-------- arch/x86/kernel/cpu/perf_event.h | 17 +++++++++++++++++ include/linux/perf_event.h | 1 + 3 files changed, 30 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c886dc8c63f8..6e8ab0427041 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1316,9 +1316,16 @@ static struct attribute_group x86_pmu_format_group = { */ static void __init filter_events(struct attribute **attrs) { + struct device_attribute *d; + struct perf_pmu_events_attr *pmu_attr; int i, j; for (i = 0; attrs[i]; i++) { + d = (struct device_attribute *)attrs[i]; + pmu_attr = container_of(d, struct perf_pmu_events_attr, attr); + /* str trumps id */ + if (pmu_attr->event_str) + continue; if (x86_pmu.event_map(i)) continue; @@ -1361,17 +1368,14 @@ static ssize_t events_sysfs_show(struct device *dev, struct device_attribute *at { struct perf_pmu_events_attr *pmu_attr = \ container_of(attr, struct perf_pmu_events_attr, attr); - u64 config = x86_pmu.event_map(pmu_attr->id); - return x86_pmu.events_sysfs_show(page, config); -} -#define EVENT_VAR(_id) event_attr_##_id -#define EVENT_PTR(_id) &event_attr_##_id.attr.attr + /* string trumps id */ + if (pmu_attr->event_str) + return sprintf(page, "%s", pmu_attr->event_str); -#define EVENT_ATTR(_name, _id) \ - PMU_EVENT_ATTR(_name, EVENT_VAR(_id), PERF_COUNT_HW_##_id, \ - events_sysfs_show) + return x86_pmu.events_sysfs_show(page, config); +} EVENT_ATTR(cpu-cycles, CPU_CYCLES ); EVENT_ATTR(instructions, INSTRUCTIONS ); diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 95152c12a8d9..b1518eed5f99 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -422,6 +422,23 @@ do { \ #define ERF_NO_HT_SHARING 1 #define ERF_HAS_RSP_1 2 +#define EVENT_VAR(_id) event_attr_##_id +#define EVENT_PTR(_id) &event_attr_##_id.attr.attr + +#define EVENT_ATTR(_name, _id) \ +static struct perf_pmu_events_attr EVENT_VAR(_id) = { \ + .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ + .id = PERF_COUNT_HW_##_id, \ + .event_str = NULL, \ +}; + +#define EVENT_ATTR_STR(_name, v, str) \ +static struct perf_pmu_events_attr event_attr_##v = { \ + .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ + .id = 0, \ + .event_str = str, \ +}; + extern struct x86_pmu x86_pmu __read_mostly; DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8737e1cee8b2..1c592114c437 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -809,6 +809,7 @@ do { \ struct perf_pmu_events_attr { struct device_attribute attr; u64 id; + const char *event_str; }; #define PMU_EVENT_ATTR(_name, _var, _id, _show) \ -- cgit v1.2.3 From 9fac2cf316b070ae43d2ae2525e381ff2d1d68aa Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 24 Jan 2013 16:10:27 +0100 Subject: perf/x86: Add flags to event constraints This patch adds a flags field to each event constraint. It can be used to store event specific features which can then later be used by scheduling code or low-level x86 code. The flags are propagated into event->hw.flags during the get_event_constraint() call. They are cleared during the put_event_constraint() call. This mechanism is going to be used by the PEBS-LL patches. It avoids defining yet another table to hold event specific information. Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: ak@linux.intel.com Cc: jolsa@redhat.com Cc: namhyung.kim@lge.com Link: http://lkml.kernel.org/r/1359040242-8269-4-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/kernel/cpu/perf_event.c | 2 +- arch/x86/kernel/cpu/perf_event.h | 8 +++++--- arch/x86/kernel/cpu/perf_event_intel.c | 6 +++++- arch/x86/kernel/cpu/perf_event_intel_ds.c | 4 +++- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 2 +- include/linux/perf_event.h | 1 + 6 files changed, 16 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 6e8ab0427041..8ba51518f689 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1489,7 +1489,7 @@ static int __init init_hw_perf_events(void) unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, - 0, x86_pmu.num_counters, 0); + 0, x86_pmu.num_counters, 0, 0); x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ x86_pmu_format_group.attrs = x86_pmu.format_attrs; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index b1518eed5f99..9686d38eb458 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -59,6 +59,7 @@ struct event_constraint { u64 cmask; int weight; int overlap; + int flags; }; struct amd_nb { @@ -170,16 +171,17 @@ struct cpu_hw_events { void *kfree_on_online; }; -#define __EVENT_CONSTRAINT(c, n, m, w, o) {\ +#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\ { .idxmsk64 = (n) }, \ .code = (c), \ .cmask = (m), \ .weight = (w), \ .overlap = (o), \ + .flags = f, \ } #define EVENT_CONSTRAINT(c, n, m) \ - __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0) + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0) /* * The overlap flag marks event constraints with overlapping counter @@ -203,7 +205,7 @@ struct cpu_hw_events { * and its counter masks must be kept at a minimum. */ #define EVENT_CONSTRAINT_OVERLAP(c, n, m) \ - __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1) + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1, 0) /* * Constraint on the Event code. diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index dab7580c47ae..df3beaac3397 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1392,8 +1392,11 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) if (x86_pmu.event_constraints) { for_each_event_constraint(c, x86_pmu.event_constraints) { - if ((event->hw.config & c->cmask) == c->code) + if ((event->hw.config & c->cmask) == c->code) { + /* hw.flags zeroed at initialization */ + event->hw.flags |= c->flags; return c; + } } } @@ -1438,6 +1441,7 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, static void intel_put_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { + event->hw.flags = 0; intel_put_shared_regs_event_constraints(cpuc, event); } diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 826054a4f2ee..f30d85bcbda9 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -430,8 +430,10 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event) if (x86_pmu.pebs_constraints) { for_each_event_constraint(c, x86_pmu.pebs_constraints) { - if ((event->hw.config & c->cmask) == c->code) + if ((event->hw.config & c->cmask) == c->code) { + event->hw.flags |= c->flags; return c; + } } } diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index b43200dbfe7e..75da9e18b128 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -2438,7 +2438,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type) type->unconstrainted = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1, - 0, type->num_counters, 0); + 0, type->num_counters, 0, 0); for (i = 0; i < type->num_boxes; i++) { pmus[i].func_id = -1; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1c592114c437..cd3bb2cd9494 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -127,6 +127,7 @@ struct hw_perf_event { int event_base_rdpmc; int idx; int last_cpu; + int flags; struct hw_perf_event_extra extra_reg; struct hw_perf_event_extra branch_reg; -- cgit v1.2.3 From c3feedf2aaf9ac8bad6f19f5d21e4ee0b4b87e9c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 24 Jan 2013 16:10:28 +0100 Subject: perf/core: Add weighted samples For some events it's useful to weight sample with a hardware provided number. This expresses how expensive the action the sample represent was. This allows the profiler to scale the samples to be more informative to the programmer. There is already the period which is used similarly, but it means something different, so I chose to not overload it. Instead a new sample type for WEIGHT is added. Can be used for multiple things. Initially it is used for TSX abort costs and profiling by memory latencies (so to make expensive load appear higher up in the histograms). The concept is quite generic and can be extended to many other kinds of events or architectures, as long as the hardware provides suitable auxillary values. In principle it could be also used for software tracepoints. This adds the generic glue. A new optional sample format for a 64-bit weight value. Signed-off-by: Andi Kleen Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: acme@redhat.com Cc: jolsa@redhat.com Cc: namhyung.kim@lge.com Link: http://lkml.kernel.org/r/1359040242-8269-5-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/perf_event.h | 2 ++ include/uapi/linux/perf_event.h | 6 +++++- kernel/events/core.c | 6 ++++++ 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index cd3bb2cd9494..7ce0b37b155b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -573,6 +573,7 @@ struct perf_sample_data { struct perf_branch_stack *br_stack; struct perf_regs_user regs_user; u64 stack_user_size; + u64 weight; }; static inline void perf_sample_data_init(struct perf_sample_data *data, @@ -586,6 +587,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->regs_user.abi = PERF_SAMPLE_REGS_ABI_NONE; data->regs_user.regs = NULL; data->stack_user_size = 0; + data->weight = 0; } extern void perf_output_sample(struct perf_output_handle *handle, diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 9fa9c622a7f4..cdc255da02e2 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -132,8 +132,10 @@ enum perf_event_sample_format { PERF_SAMPLE_BRANCH_STACK = 1U << 11, PERF_SAMPLE_REGS_USER = 1U << 12, PERF_SAMPLE_STACK_USER = 1U << 13, + PERF_SAMPLE_WEIGHT = 1U << 14, + + PERF_SAMPLE_MAX = 1U << 15, /* non-ABI */ - PERF_SAMPLE_MAX = 1U << 14, /* non-ABI */ }; /* @@ -588,6 +590,8 @@ enum perf_event_type { * { u64 size; * char data[size]; * u64 dyn_size; } && PERF_SAMPLE_STACK_USER + * + * { u64 weight; } && PERF_SAMPLE_WEIGHT * }; */ PERF_RECORD_SAMPLE = 9, diff --git a/kernel/events/core.c b/kernel/events/core.c index 7b4a55d41efc..9e3edb272b3e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -976,6 +976,9 @@ static void perf_event__header_size(struct perf_event *event) if (sample_type & PERF_SAMPLE_PERIOD) size += sizeof(data->period); + if (sample_type & PERF_SAMPLE_WEIGHT) + size += sizeof(data->weight); + if (sample_type & PERF_SAMPLE_READ) size += event->read_size; @@ -4193,6 +4196,9 @@ void perf_output_sample(struct perf_output_handle *handle, perf_output_sample_ustack(handle, data->stack_user_size, data->regs_user.regs); + + if (sample_type & PERF_SAMPLE_WEIGHT) + perf_output_put(handle, data->weight); } void perf_prepare_sample(struct perf_event_header *header, -- cgit v1.2.3 From d6be9ad6c960f43800a6f118932bc8a5a4eadcd1 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 24 Jan 2013 16:10:31 +0100 Subject: perf: Add generic memory sampling interface This patch adds PERF_SAMPLE_DATA_SRC. PERF_SAMPLE_DATA_SRC collects the data source, i.e., where did the data associated with the sampled instruction come from. Information is stored in a perf_mem_data_src structure. It contains opcode, mem level, tlb, snoop, lock information, subject to availability in hardware. Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: ak@linux.intel.com Cc: acme@redhat.com Cc: jolsa@redhat.com Cc: namhyung.kim@lge.com Link: http://lkml.kernel.org/r/1359040242-8269-8-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/perf_event.h | 2 ++ include/uapi/linux/perf_event.h | 68 +++++++++++++++++++++++++++++++++++++++-- kernel/events/core.c | 6 ++++ 3 files changed, 74 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 7ce0b37b155b..42a6daaf4e0a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -568,6 +568,7 @@ struct perf_sample_data { u32 reserved; } cpu_entry; u64 period; + union perf_mem_data_src data_src; struct perf_callchain_entry *callchain; struct perf_raw_record *raw; struct perf_branch_stack *br_stack; @@ -588,6 +589,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->regs_user.regs = NULL; data->stack_user_size = 0; data->weight = 0; + data->data_src.val = 0; } extern void perf_output_sample(struct perf_output_handle *handle, diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index cdc255da02e2..5b5762006855 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -133,9 +133,9 @@ enum perf_event_sample_format { PERF_SAMPLE_REGS_USER = 1U << 12, PERF_SAMPLE_STACK_USER = 1U << 13, PERF_SAMPLE_WEIGHT = 1U << 14, + PERF_SAMPLE_DATA_SRC = 1U << 15, - PERF_SAMPLE_MAX = 1U << 15, /* non-ABI */ - + PERF_SAMPLE_MAX = 1U << 16, /* non-ABI */ }; /* @@ -592,6 +592,7 @@ enum perf_event_type { * u64 dyn_size; } && PERF_SAMPLE_STACK_USER * * { u64 weight; } && PERF_SAMPLE_WEIGHT + * { u64 data_src; } && PERF_SAMPLE_DATA_SRC * }; */ PERF_RECORD_SAMPLE = 9, @@ -617,4 +618,67 @@ enum perf_callchain_context { #define PERF_FLAG_FD_OUTPUT (1U << 1) #define PERF_FLAG_PID_CGROUP (1U << 2) /* pid=cgroup id, per-cpu mode only */ +union perf_mem_data_src { + __u64 val; + struct { + __u64 mem_op:5, /* type of opcode */ + mem_lvl:14, /* memory hierarchy level */ + mem_snoop:5, /* snoop mode */ + mem_lock:2, /* lock instr */ + mem_dtlb:7, /* tlb access */ + mem_rsvd:31; + }; +}; + +/* type of opcode (load/store/prefetch,code) */ +#define PERF_MEM_OP_NA 0x01 /* not available */ +#define PERF_MEM_OP_LOAD 0x02 /* load instruction */ +#define PERF_MEM_OP_STORE 0x04 /* store instruction */ +#define PERF_MEM_OP_PFETCH 0x08 /* prefetch */ +#define PERF_MEM_OP_EXEC 0x10 /* code (execution) */ +#define PERF_MEM_OP_SHIFT 0 + +/* memory hierarchy (memory level, hit or miss) */ +#define PERF_MEM_LVL_NA 0x01 /* not available */ +#define PERF_MEM_LVL_HIT 0x02 /* hit level */ +#define PERF_MEM_LVL_MISS 0x04 /* miss level */ +#define PERF_MEM_LVL_L1 0x08 /* L1 */ +#define PERF_MEM_LVL_LFB 0x10 /* Line Fill Buffer */ +#define PERF_MEM_LVL_L2 0x20 /* L2 hit */ +#define PERF_MEM_LVL_L3 0x40 /* L3 hit */ +#define PERF_MEM_LVL_LOC_RAM 0x80 /* Local DRAM */ +#define PERF_MEM_LVL_REM_RAM1 0x100 /* Remote DRAM (1 hop) */ +#define PERF_MEM_LVL_REM_RAM2 0x200 /* Remote DRAM (2 hops) */ +#define PERF_MEM_LVL_REM_CCE1 0x400 /* Remote Cache (1 hop) */ +#define PERF_MEM_LVL_REM_CCE2 0x800 /* Remote Cache (2 hops) */ +#define PERF_MEM_LVL_IO 0x1000 /* I/O memory */ +#define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */ +#define PERF_MEM_LVL_SHIFT 5 + +/* snoop mode */ +#define PERF_MEM_SNOOP_NA 0x01 /* not available */ +#define PERF_MEM_SNOOP_NONE 0x02 /* no snoop */ +#define PERF_MEM_SNOOP_HIT 0x04 /* snoop hit */ +#define PERF_MEM_SNOOP_MISS 0x08 /* snoop miss */ +#define PERF_MEM_SNOOP_HITM 0x10 /* snoop hit modified */ +#define PERF_MEM_SNOOP_SHIFT 19 + +/* locked instruction */ +#define PERF_MEM_LOCK_NA 0x01 /* not available */ +#define PERF_MEM_LOCK_LOCKED 0x02 /* locked transaction */ +#define PERF_MEM_LOCK_SHIFT 24 + +/* TLB access */ +#define PERF_MEM_TLB_NA 0x01 /* not available */ +#define PERF_MEM_TLB_HIT 0x02 /* hit level */ +#define PERF_MEM_TLB_MISS 0x04 /* miss level */ +#define PERF_MEM_TLB_L1 0x08 /* L1 */ +#define PERF_MEM_TLB_L2 0x10 /* L2 */ +#define PERF_MEM_TLB_WK 0x20 /* Hardware Walker*/ +#define PERF_MEM_TLB_OS 0x40 /* OS fault handler */ +#define PERF_MEM_TLB_SHIFT 26 + +#define PERF_MEM_S(a, s) \ + (((u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) + #endif /* _UAPI_LINUX_PERF_EVENT_H */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 9e3edb272b3e..77c96d18c23a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -982,6 +982,9 @@ static void perf_event__header_size(struct perf_event *event) if (sample_type & PERF_SAMPLE_READ) size += event->read_size; + if (sample_type & PERF_SAMPLE_DATA_SRC) + size += sizeof(data->data_src.val); + event->header_size = size; } @@ -4199,6 +4202,9 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_WEIGHT) perf_output_put(handle, data->weight); + + if (sample_type & PERF_SAMPLE_DATA_SRC) + perf_output_put(handle, data->data_src.val); } void perf_prepare_sample(struct perf_event_header *header, -- cgit v1.2.3 From 2fe85427e3bf65d791700d065132772fc26e4d75 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 24 Jan 2013 16:10:39 +0100 Subject: perf: Add PERF_RECORD_MISC_MMAP_DATA to RECORD_MMAP Type of mapping was lost and made it hard for a tool to distinguish code vs. data mmaps. Perf has the ability to distinguish the two. Use a bit in the header->misc bitmask to keep track of the mmap type. If PERF_RECORD_MISC_MMAP_DATA is set then the mapping is not executable (!VM_EXEC). If not set, then the mapping is executable. Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: ak@linux.intel.com Cc: acme@redhat.com Cc: jolsa@redhat.com Cc: namhyung.kim@lge.com Link: http://lkml.kernel.org/r/1359040242-8269-16-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar Signed-off-by: Arnaldo Carvalho de Melo --- include/uapi/linux/perf_event.h | 1 + kernel/events/core.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 5b5762006855..964a450a6e2c 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -445,6 +445,7 @@ struct perf_event_mmap_page { #define PERF_RECORD_MISC_GUEST_KERNEL (4 << 0) #define PERF_RECORD_MISC_GUEST_USER (5 << 0) +#define PERF_RECORD_MISC_MMAP_DATA (1 << 13) /* * Indicates that the content of PERF_SAMPLE_IP points to * the actual instruction that triggered the event. See also diff --git a/kernel/events/core.c b/kernel/events/core.c index 77c96d18c23a..98c0845fcd20 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4791,6 +4791,9 @@ got_name: mmap_event->file_name = name; mmap_event->file_size = size; + if (!(vma->vm_flags & VM_EXEC)) + mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; + mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; rcu_read_lock(); -- cgit v1.2.3 From 0908ad6e56b5a6e86745680bc324bdbfac64d0b6 Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Fri, 22 Mar 2013 20:46:27 +0530 Subject: uprobes: Add trap variant helper Some architectures like powerpc have multiple variants of the trap instruction. Introduce an additional helper is_trap_insn() for run-time handling of non-uprobe traps on such architectures. While there, change is_swbp_at_addr() to is_trap_at_addr() for reading clarity. With this change, the uprobe registration path will supercede any trap instruction inserted at the requested location, while taking care of delivering the SIGTRAP for cases where the trap notification came in for an address without a uprobe. See [1] for a more detailed explanation. [1] https://lists.ozlabs.org/pipermail/linuxppc-dev/2013-March/104771.html This change was suggested by Oleg Nesterov. Signed-off-by: Ananth N Mavinakayanahalli Acked-by: Srikar Dronamraju Signed-off-by: Oleg Nesterov --- include/linux/uprobes.h | 1 + kernel/events/uprobes.c | 34 +++++++++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 02b83db8e2c5..19612881399a 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -100,6 +100,7 @@ struct uprobes_state { extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); extern bool __weak is_swbp_insn(uprobe_opcode_t *insn); +extern bool __weak is_trap_insn(uprobe_opcode_t *insn); extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 26bc2e24e9e3..ca9012930ce7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -173,6 +173,20 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn) return *insn == UPROBE_SWBP_INSN; } +/** + * is_trap_insn - check if instruction is breakpoint instruction. + * @insn: instruction to be checked. + * Default implementation of is_trap_insn + * Returns true if @insn is a breakpoint instruction. + * + * This function is needed for the case where an architecture has multiple + * trap instructions (like powerpc). + */ +bool __weak is_trap_insn(uprobe_opcode_t *insn) +{ + return is_swbp_insn(insn); +} + static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode) { void *kaddr = kmap_atomic(page); @@ -185,6 +199,15 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t uprobe_opcode_t old_opcode; bool is_swbp; + /* + * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here. + * We do not check if it is any other 'trap variant' which could + * be conditional trap instruction such as the one powerpc supports. + * + * The logic is that we do not care if the underlying instruction + * is a trap variant; uprobes always wins over any other (gdb) + * breakpoint. + */ copy_opcode(page, vaddr, &old_opcode); is_swbp = is_swbp_insn(&old_opcode); @@ -204,7 +227,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * Expect the breakpoint instruction to be the smallest size instruction for * the architecture. If an arch has variable length instruction and the * breakpoint instruction is not of the smallest length instruction - * supported by that architecture then we need to modify is_swbp_at_addr and + * supported by that architecture then we need to modify is_trap_at_addr and * write_opcode accordingly. This would never be a problem for archs that * have fixed length instructions. */ @@ -550,7 +573,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, goto out; ret = -ENOTSUPP; - if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) + if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn)) goto out; ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); @@ -1431,7 +1454,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm) clear_bit(MMF_HAS_UPROBES, &mm->flags); } -static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) +static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) { struct page *page; uprobe_opcode_t opcode; @@ -1452,7 +1475,8 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) copy_opcode(page, vaddr, &opcode); put_page(page); out: - return is_swbp_insn(&opcode); + /* This needs to return true for any variant of the trap insn */ + return is_trap_insn(&opcode); } static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) @@ -1472,7 +1496,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) } if (!uprobe) - *is_swbp = is_swbp_at_addr(mm, bp_vaddr); + *is_swbp = is_trap_at_addr(mm, bp_vaddr); } else { *is_swbp = -EFAULT; } -- cgit v1.2.3 From cc2f5a8adbc7ab1fdb7d9bcf4ea9838c73e82dfe Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Fri, 5 Apr 2013 16:49:41 +0200 Subject: perf: Fix comments in PERF_MEM_LVL bitmask This small patch fixes a mistake in the comments for the PERF_MEM_LVL_* events. The L2, L3 bits simply represent cache levels, not hits or misses. That is encoded in PERF_MEM_LVL_MISS/PERF_MEM_LVL_HIT. Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/20130405144941.GA30503@quad Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 964a450a6e2c..fb104e51496e 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -645,8 +645,8 @@ union perf_mem_data_src { #define PERF_MEM_LVL_MISS 0x04 /* miss level */ #define PERF_MEM_LVL_L1 0x08 /* L1 */ #define PERF_MEM_LVL_LFB 0x10 /* Line Fill Buffer */ -#define PERF_MEM_LVL_L2 0x20 /* L2 hit */ -#define PERF_MEM_LVL_L3 0x40 /* L3 hit */ +#define PERF_MEM_LVL_L2 0x20 /* L2 */ +#define PERF_MEM_LVL_L3 0x40 /* L3 */ #define PERF_MEM_LVL_LOC_RAM 0x80 /* Local DRAM */ #define PERF_MEM_LVL_REM_RAM1 0x100 /* Remote DRAM (1 hop) */ #define PERF_MEM_LVL_REM_RAM2 0x200 /* Remote DRAM (2 hops) */ -- cgit v1.2.3 From 324670b620ab1ed00ba160e435686bd2ae4a59ce Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 4 Apr 2013 19:40:50 +0900 Subject: kprobes: Move __kprobes definition into compiler.h Currently, __kprobes is defined in linux/kprobes.h which is too big to be included in small or basic headers that want to make use of this simple attribute. So move __kprobes definition into linux/compiler.h in which other compiler attributes are defined. Signed-off-by: Masami Hiramatsu Cc: Timo Juhani Lindfors Cc: Ananth N Mavinakayanahalli Cc: Pavel Emelyanov Cc: Jiri Kosina Cc: Nadia Yvette Chambers Cc: yrl.pp-manager.tt@hitachi.com Cc: David S. Miller Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20130404104049.21071.20908.stgit@mhiramat-M0-7522 [ Improved the attribute explanation a bit. ] Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 6 ++++++ include/linux/kprobes.h | 6 +----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 10b8f23fab0f..92669cd182a6 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -351,4 +351,10 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); */ #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) +/* Ignore/forbid kprobes attach on very low level functions marked by this attribute: */ +#ifdef CONFIG_KPROBES +# define __kprobes __attribute__((__section__(".kprobes.text"))) +#else +# define __kprobes +#endif #endif /* __LINUX_COMPILER_H */ diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 4b6ef4d33cc2..ca1d27a0d6a6 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -29,6 +29,7 @@ * and Prasanna S Panchamukhi * added function-return probes. */ +#include /* for __kprobes */ #include #include #include @@ -49,16 +50,11 @@ #define KPROBE_REENTER 0x00000004 #define KPROBE_HIT_SSDONE 0x00000008 -/* Attach to insert probes on any functions which should be ignored*/ -#define __kprobes __attribute__((__section__(".kprobes.text"))) - #else /* CONFIG_KPROBES */ typedef int kprobe_opcode_t; struct arch_specific_insn { int dummy; }; -#define __kprobes - #endif /* CONFIG_KPROBES */ struct kprobe; -- cgit v1.2.3 From ea024870cf10687b3fded66a9deb6253888f30b7 Mon Sep 17 00:00:00 2001 From: Anton Arapov Date: Wed, 3 Apr 2013 18:00:31 +0200 Subject: uretprobes: Introduce uprobe_consumer->ret_handler() Enclose return probes implementation, introduce ->ret_handler() and update existing code to rely on ->handler() *and* ->ret_handler() for uprobe and uretprobe respectively. Signed-off-by: Anton Arapov Acked-by: Srikar Dronamraju Signed-off-by: Oleg Nesterov --- include/linux/uprobes.h | 3 +++ kernel/events/uprobes.c | 17 ++++++++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 19612881399a..5c8d3290df41 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -46,6 +46,9 @@ enum uprobe_filter_ctx { struct uprobe_consumer { int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs); + int (*ret_handler)(struct uprobe_consumer *self, + unsigned long func, + struct pt_regs *regs); bool (*filter)(struct uprobe_consumer *self, enum uprobe_filter_ctx ctx, struct mm_struct *mm); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7312503caf2e..eb384e90ac92 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -838,6 +838,14 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * struct uprobe *uprobe; int ret; + /* Uprobe must have at least one set consumer */ + if (!uc->handler && !uc->ret_handler) + return -EINVAL; + + /* TODO: Implement return probes */ + if (uc->ret_handler) + return -ENOSYS; + /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) return -EINVAL; @@ -1497,10 +1505,13 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) down_read(&uprobe->register_rwsem); for (uc = uprobe->consumers; uc; uc = uc->next) { - int rc = uc->handler(uc, regs); + int rc = 0; - WARN(rc & ~UPROBE_HANDLER_MASK, - "bad rc=0x%x from %pf()\n", rc, uc->handler); + if (uc->handler) { + rc = uc->handler(uc, regs); + WARN(rc & ~UPROBE_HANDLER_MASK, + "bad rc=0x%x from %pf()\n", rc, uc->handler); + } remove &= rc; } -- cgit v1.2.3 From 0dfd0eb8e4d72ded8b21f4fee74ba5547408cbe9 Mon Sep 17 00:00:00 2001 From: Anton Arapov Date: Wed, 3 Apr 2013 18:00:35 +0200 Subject: uretprobes: Return probe entry, prepare_uretprobe() When a uprobe with return probe consumer is hit, prepare_uretprobe() function is invoked. It creates return_instance, hijacks return address and replaces it with the trampoline. * Return instances are kept as stack per uprobed task. * Return instance is chained, when the original return address is trampoline's page vaddr (e.g. recursive call of the probed function). Signed-off-by: Anton Arapov Acked-by: Srikar Dronamraju Signed-off-by: Oleg Nesterov --- include/linux/uprobes.h | 1 + kernel/events/uprobes.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 92 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 5c8d3290df41..b0507f24eeb0 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -71,6 +71,7 @@ struct uprobe_task { enum uprobe_task_state state; struct arch_uprobe_task autask; + struct return_instance *return_instances; struct uprobe *active_uprobe; unsigned long xol_vaddr; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d345b7c6cb2d..3798947b3b58 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -75,6 +75,15 @@ struct uprobe { struct arch_uprobe arch; }; +struct return_instance { + struct uprobe *uprobe; + unsigned long func; + unsigned long orig_ret_vaddr; /* original return address */ + bool chained; /* true, if instance is nested */ + + struct return_instance *next; /* keep as stack */ +}; + /* * valid_vma: Verify if the specified vma is an executable vma * Relax restrictions while unregistering: vm_flags might have @@ -1317,6 +1326,7 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; + struct return_instance *ri, *tmp; if (!utask) return; @@ -1324,6 +1334,15 @@ void uprobe_free_utask(struct task_struct *t) if (utask->active_uprobe) put_uprobe(utask->active_uprobe); + ri = utask->return_instances; + while (ri) { + tmp = ri; + ri = ri->next; + + put_uprobe(tmp->uprobe); + kfree(tmp); + } + xol_free_insn_slot(t); kfree(utask); t->utask = NULL; @@ -1371,6 +1390,65 @@ static unsigned long get_trampoline_vaddr(void) return trampoline_vaddr; } +static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) +{ + struct return_instance *ri; + struct uprobe_task *utask; + unsigned long orig_ret_vaddr, trampoline_vaddr; + bool chained = false; + + if (!get_xol_area()) + return; + + utask = get_utask(); + if (!utask) + return; + + ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL); + if (!ri) + goto fail; + + trampoline_vaddr = get_trampoline_vaddr(); + orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); + if (orig_ret_vaddr == -1) + goto fail; + + /* + * We don't want to keep trampoline address in stack, rather keep the + * original return address of first caller thru all the consequent + * instances. This also makes breakpoint unwrapping easier. + */ + if (orig_ret_vaddr == trampoline_vaddr) { + if (!utask->return_instances) { + /* + * This situation is not possible. Likely we have an + * attack from user-space. + */ + pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n", + current->pid, current->tgid); + goto fail; + } + + chained = true; + orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; + } + + atomic_inc(&uprobe->ref); + ri->uprobe = uprobe; + ri->func = instruction_pointer(regs); + ri->orig_ret_vaddr = orig_ret_vaddr; + ri->chained = chained; + + /* add instance to the stack */ + ri->next = utask->return_instances; + utask->return_instances = ri; + + return; + + fail: + kfree(ri); +} + /* Prepare to single-step probed instruction out of line. */ static int pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) @@ -1527,6 +1605,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) { struct uprobe_consumer *uc; int remove = UPROBE_HANDLER_REMOVE; + bool need_prep = false; /* prepare return uprobe, when needed */ down_read(&uprobe->register_rwsem); for (uc = uprobe->consumers; uc; uc = uc->next) { @@ -1537,9 +1616,16 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) WARN(rc & ~UPROBE_HANDLER_MASK, "bad rc=0x%x from %pf()\n", rc, uc->handler); } + + if (uc->ret_handler) + need_prep = true; + remove &= rc; } + if (need_prep && !remove) + prepare_uretprobe(uprobe, regs); /* put bp at return */ + if (remove && uprobe->consumers) { WARN_ON(!uprobe_is_active(uprobe)); unapply_uprobe(uprobe, current->mm); @@ -1658,7 +1744,11 @@ void uprobe_notify_resume(struct pt_regs *regs) */ int uprobe_pre_sstep_notifier(struct pt_regs *regs) { - if (!current->mm || !test_bit(MMF_HAS_UPROBES, ¤t->mm->flags)) + if (!current->mm) + return 0; + + if (!test_bit(MMF_HAS_UPROBES, ¤t->mm->flags) && + (!current->utask || !current->utask->return_instances)) return 0; set_thread_flag(TIF_UPROBE); -- cgit v1.2.3 From ded49c55309a37129dc30a5f0e85b8a64e5c1716 Mon Sep 17 00:00:00 2001 From: Anton Arapov Date: Wed, 3 Apr 2013 18:00:37 +0200 Subject: uretprobes: Limit the depth of return probe nestedness Unlike the kretprobes we can't trust userspace, thus must have protection from user space attacks. User-space have "unlimited" stack, and this patch limits the return probes nestedness as a simple remedy for it. Note that this implementation leaks return_instance on siglongjmp until exit()/exec(). The intention is to have KISS and bare minimum solution for the initial implementation in order to not complicate the uretprobes code. In the future we may come up with more sophisticated solution that remove this depth limitation. It is not easy task and lays beyond this patchset. Signed-off-by: Anton Arapov Acked-by: Srikar Dronamraju Signed-off-by: Oleg Nesterov --- include/linux/uprobes.h | 3 +++ kernel/events/uprobes.c | 11 +++++++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index b0507f24eeb0..06f28beed7c2 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -38,6 +38,8 @@ struct inode; #define UPROBE_HANDLER_REMOVE 1 #define UPROBE_HANDLER_MASK 1 +#define MAX_URETPROBE_DEPTH 64 + enum uprobe_filter_ctx { UPROBE_FILTER_REGISTER, UPROBE_FILTER_UNREGISTER, @@ -72,6 +74,7 @@ struct uprobe_task { struct arch_uprobe_task autask; struct return_instance *return_instances; + unsigned int depth; struct uprobe *active_uprobe; unsigned long xol_vaddr; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 65429ad2ce51..6ab00e090c87 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1404,6 +1404,13 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) if (!utask) return; + if (utask->depth >= MAX_URETPROBE_DEPTH) { + printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to" + " nestedness limit pid/tgid=%d/%d\n", + current->pid, current->tgid); + return; + } + ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL); if (!ri) goto fail; @@ -1439,6 +1446,8 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) ri->orig_ret_vaddr = orig_ret_vaddr; ri->chained = chained; + utask->depth++; + /* add instance to the stack */ ri->next = utask->return_instances; utask->return_instances = ri; @@ -1681,6 +1690,8 @@ static bool handle_trampoline(struct pt_regs *regs) if (!chained) break; + utask->depth--; + BUG_ON(!ri); } -- cgit v1.2.3