From 3b1efb196eee45b2f0c4994e0c43edb5e367f620 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 15 Jun 2016 22:47:14 +0200 Subject: bpf, maps: flush own entries on perf map release The behavior of perf event arrays are quite different from all others as they are tightly coupled to perf event fds, f.e. shown recently by commit e03e7ee34fdd ("perf/bpf: Convert perf_event_array to use struct file") to make refcounting on perf event more robust. A remaining issue that the current code still has is that since additions to the perf event array take a reference on the struct file via perf_event_get() and are only released via fput() (that cleans up the perf event eventually via perf_event_release_kernel()) when the element is either manually removed from the map from user space or automatically when the last reference on the perf event map is dropped. However, this leads us to dangling struct file's when the map gets pinned after the application owning the perf event descriptor exits, and since the struct file reference will in such case only be manually dropped or via pinned file removal, it leads to the perf event living longer than necessary, consuming needlessly resources for that time. Relations between perf event fds and bpf perf event map fds can be rather complex. F.e. maps can act as demuxers among different perf event fds that can possibly be owned by different threads and based on the index selection from the program, events get dispatched to one of the per-cpu fd endpoints. One perf event fd (or, rather a per-cpu set of them) can also live in multiple perf event maps at the same time, listening for events. Also, another requirement is that perf event fds can get closed from application side after they have been attached to the perf event map, so that on exit perf event map will take care of dropping their references eventually. Likewise, when such maps are pinned, the intended behavior is that a user application does bpf_obj_get(), puts its fds in there and on exit when fd is released, they are dropped from the map again, so the map acts rather as connector endpoint. This also makes perf event maps inherently different from program arrays as described in more detail in commit c9da161c6517 ("bpf: fix clearing on persistent program array maps"). To tackle this, map entries are marked by the map struct file that added the element to the map. And when the last reference to that map struct file is released from user space, then the tracked entries are purged from the map. This is okay, because new map struct files instances resp. frontends to the anon inode are provided via bpf_map_new_fd() that is called when we invoke bpf_obj_get_user() for retrieving a pinned map, but also when an initial instance is created via map_create(). The rest is resolved by the vfs layer automatically for us by keeping reference count on the map's struct file. Any concurrent updates on the map slot are fine as well, it just means that perf_event_fd_array_release() needs to delete less of its own entires. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 720b7bb01d43..037ea6ea3cb2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -192,18 +192,17 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) { struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_event_entry *ee; struct perf_event *event; - struct file *file; if (unlikely(index >= array->map.max_entries)) return -E2BIG; - file = READ_ONCE(array->ptrs[index]); - if (unlikely(!file)) + ee = READ_ONCE(array->ptrs[index]); + if (unlikely(!ee)) return -ENOENT; - event = file->private_data; - + event = ee->event; /* make sure event is local and doesn't have pmu::count */ if (event->oncpu != smp_processor_id() || event->pmu->count) @@ -233,8 +232,8 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) u64 index = flags & BPF_F_INDEX_MASK; void *data = (void *) (long) r4; struct perf_sample_data sample_data; + struct bpf_event_entry *ee; struct perf_event *event; - struct file *file; struct perf_raw_record raw = { .size = size, .data = data, @@ -247,12 +246,11 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) if (unlikely(index >= array->map.max_entries)) return -E2BIG; - file = READ_ONCE(array->ptrs[index]); - if (unlikely(!file)) + ee = READ_ONCE(array->ptrs[index]); + if (unlikely(!ee)) return -ENOENT; - event = file->private_data; - + event = ee->event; if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) return -EINVAL; -- cgit v1.2.3 From 1ca1cc98bf7418c680415bfce05699f67510a7fd Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 28 Jun 2016 12:18:23 +0200 Subject: bpf: minor cleanups on fd maps and helpers Some minor cleanups: i) Remove the unlikely() from fd array map lookups and let the CPU branch predictor do its job, scenarios where there is not always a map entry are very well valid. ii) Move the attribute type check in the bpf_perf_event_read() helper a bit earlier so it's consistent wrt checks with bpf_perf_event_output() helper as well. iii) remove some comments that are self-documenting in kprobe_prog_is_valid_access() and therefore make it consistent to tp_prog_is_valid_access() as well. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 3 +-- kernel/trace/bpf_trace.c | 18 ++++++------------ 2 files changed, 7 insertions(+), 14 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b94a36550591..d638062f66d6 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -719,14 +719,13 @@ select_insn: if (unlikely(index >= array->map.max_entries)) goto out; - if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT)) goto out; tail_call_cnt++; prog = READ_ONCE(array->ptrs[index]); - if (unlikely(!prog)) + if (!prog) goto out; /* ARG1 at this point is guaranteed to point to CTX from diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3de25fbed785..4e61f74a5d73 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -199,19 +199,19 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) return -E2BIG; ee = READ_ONCE(array->ptrs[index]); - if (unlikely(!ee)) + if (!ee) return -ENOENT; event = ee->event; + if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && + event->attr.type != PERF_TYPE_RAW)) + return -EINVAL; + /* make sure event is local and doesn't have pmu::count */ if (event->oncpu != smp_processor_id() || event->pmu->count) return -EINVAL; - if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && - event->attr.type != PERF_TYPE_RAW)) - return -EINVAL; - /* * we don't know if the function is run successfully by the * return value. It can be judged in other places, such as @@ -251,7 +251,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) return -E2BIG; ee = READ_ONCE(array->ptrs[index]); - if (unlikely(!ee)) + if (!ee) return -ENOENT; event = ee->event; @@ -354,18 +354,12 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, enum bpf_reg_type *reg_type) { - /* check bounds */ if (off < 0 || off >= sizeof(struct pt_regs)) return false; - - /* only read is allowed */ if (type != BPF_READ) return false; - - /* disallow misaligned access */ if (off % size != 0) return false; - return true; } -- cgit v1.2.3 From d79313303181d357d293453fb8486bdc87bfd2f4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 28 Jun 2016 12:18:24 +0200 Subject: bpf, trace: fetch current cpu only once We currently have two invocations, which is unnecessary. Fetch it only once and use the smp_processor_id() variant, so we also get preemption checks along with it when DEBUG_PREEMPT is set. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4e61f74a5d73..505f9e9cdb3b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -233,6 +233,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) struct pt_regs *regs = (struct pt_regs *) (long) r1; struct bpf_map *map = (struct bpf_map *) (long) r2; struct bpf_array *array = container_of(map, struct bpf_array, map); + unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; void *data = (void *) (long) r4; struct perf_sample_data sample_data; @@ -246,7 +247,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; if (index == BPF_F_CURRENT_CPU) - index = raw_smp_processor_id(); + index = cpu; if (unlikely(index >= array->map.max_entries)) return -E2BIG; @@ -259,7 +260,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) return -EINVAL; - if (unlikely(event->oncpu != smp_processor_id())) + if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; perf_sample_data_init(&sample_data, 0, 0); -- cgit v1.2.3 From 6816a7ffce32e999601825ddfd887f36d3052932 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 28 Jun 2016 12:18:25 +0200 Subject: bpf, trace: add BPF_F_CURRENT_CPU flag for bpf_perf_event_read Follow-up commit to 1e33759c788c ("bpf, trace: add BPF_F_CURRENT_CPU flag for bpf_perf_event_output") to add the same functionality into bpf_perf_event_read() helper. The split of index into flags and index component is also safe here, since such large maps are rejected during map allocation time. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 2 +- kernel/trace/bpf_trace.c | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 406459b935a2..58df2da3e9bf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -347,7 +347,7 @@ enum bpf_func_id { #define BPF_F_ZERO_CSUM_TX (1ULL << 1) #define BPF_F_DONT_FRAGMENT (1ULL << 2) -/* BPF_FUNC_perf_event_output flags. */ +/* BPF_FUNC_perf_event_output and BPF_FUNC_perf_event_read flags. */ #define BPF_F_INDEX_MASK 0xffffffffULL #define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 505f9e9cdb3b..19c5b4a5c3eb 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -188,13 +188,19 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } -static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) +static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5) { struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); + unsigned int cpu = smp_processor_id(); + u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; struct perf_event *event; + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + if (index == BPF_F_CURRENT_CPU) + index = cpu; if (unlikely(index >= array->map.max_entries)) return -E2BIG; @@ -208,8 +214,7 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) return -EINVAL; /* make sure event is local and doesn't have pmu::count */ - if (event->oncpu != smp_processor_id() || - event->pmu->count) + if (unlikely(event->oncpu != cpu || event->pmu->count)) return -EINVAL; /* -- cgit v1.2.3 From 606274c5abd8e245add01bc7145a8cbb92b69ba8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 6 Jul 2016 22:38:36 -0700 Subject: bpf: introduce bpf_get_current_task() helper over time there were multiple requests to access different data structures and fields of task_struct current, so finally add the helper to access 'current' as-is. Tracing bpf programs will do the rest of walking the pointers via bpf_probe_read(). Note that current can be null and bpf program has to deal it with, but even dumb passing null into bpf_probe_read() is still safe. Suggested-by: Brendan Gregg Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 7 +++++++ kernel/trace/bpf_trace.c | 13 +++++++++++++ 2 files changed, 20 insertions(+) (limited to 'kernel/trace') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c14ca1cd6297..262a7e883b19 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -357,6 +357,13 @@ enum bpf_func_id { */ BPF_FUNC_get_hash_recalc, + /** + * u64 bpf_get_current_task(void) + * Returns current task_struct + * Return: current + */ + BPF_FUNC_get_current_task, + __BPF_FUNC_MAX_ID, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 19c5b4a5c3eb..094c716154ed 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -312,6 +312,17 @@ const struct bpf_func_proto *bpf_get_event_output_proto(void) return &bpf_event_output_proto; } +static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return (long) current; +} + +static const struct bpf_func_proto bpf_get_current_task_proto = { + .func = bpf_get_current_task, + .gpl_only = true, + .ret_type = RET_INTEGER, +}; + static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -329,6 +340,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) return &bpf_tail_call_proto; case BPF_FUNC_get_current_pid_tgid: return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_current_task: + return &bpf_get_current_task_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_comm: -- cgit v1.2.3 From 7e3f977edd0bd9ea6104156feba95bb5ae9bdd38 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 14 Jul 2016 18:08:03 +0200 Subject: perf, events: add non-linear data support for raw records This patch adds support for non-linear data on raw records. It extends raw records to have one or multiple fragments that will be written linearly into the ring slot, where each fragment can optionally have a custom callback handler to walk and extract complex, possibly non-linear data. If a callback handler is provided for a fragment, then the new __output_custom() will be used instead of __output_copy() for the perf_output_sample() part. perf_prepare_sample() does all the size calculation only once, so perf_output_sample() doesn't need to redo the same work anymore, meaning real_size and padding will be cached in the raw record. The raw record becomes 32 bytes in size without holes; to not increase it further and to avoid doing unnecessary recalculations in fast-path, we can reuse next pointer of the last fragment, idea here is borrowed from ZERO_OR_NULL_PTR(), which should keep the perf_output_sample() path for PERF_SAMPLE_RAW minimal. This facility is needed for BPF's event output helper as a first user that will, in a follow-up, add an additional perf_raw_frag to its perf_raw_record in order to be able to more efficiently dump skb context after a linear head meta data related to it. skbs can be non-linear and thus need a custom output function to dump buffers. Currently, the skb data needs to be copied twice; with the help of __output_custom() this work only needs to be done once. Future users could be things like XDP/BPF programs that work on different context though and would thus also have a different callback function. The few users of raw records are adapted to initialize their frag data from the raw record itself, no change in behavior for them. The code is based upon a PoC diff provided by Peter Zijlstra [1]. [1] http://thread.gmane.org/gmane.linux.network/421294 Suggested-by: Peter Zijlstra Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/s390/kernel/perf_cpum_sf.c | 9 ++++-- arch/x86/events/amd/ibs.c | 8 +++-- include/linux/perf_event.h | 20 ++++++++++++- kernel/events/core.c | 66 ++++++++++++++++++++++++++++------------- kernel/events/internal.h | 16 +++++++--- kernel/trace/bpf_trace.c | 6 ++-- 6 files changed, 93 insertions(+), 32 deletions(-) (limited to 'kernel/trace') diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index a8e832166417..92619cce57ed 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -979,12 +979,15 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr) struct pt_regs regs; struct perf_sf_sde_regs *sde_regs; struct perf_sample_data data; - struct perf_raw_record raw; + struct perf_raw_record raw = { + .frag = { + .size = sfr->size, + .data = sfr, + }, + }; /* Setup perf sample */ perf_sample_data_init(&data, 0, event->hw.last_period); - raw.size = sfr->size; - raw.data = sfr; data.raw = &raw; /* Setup pt_regs to look like an CPU-measurement external interrupt diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index feb90f6730e8..72dea2f40fc4 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -655,8 +655,12 @@ fail: } if (event->attr.sample_type & PERF_SAMPLE_RAW) { - raw.size = sizeof(u32) + ibs_data.size; - raw.data = ibs_data.data; + raw = (struct perf_raw_record){ + .frag = { + .size = sizeof(u32) + ibs_data.size, + .data = ibs_data.data, + }, + }; data.raw = &raw; } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1a827cecd62f..e79e6c6fed89 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -69,9 +69,22 @@ struct perf_callchain_entry_ctx { bool contexts_maxed; }; +typedef unsigned long (*perf_copy_f)(void *dst, const void *src, + unsigned long len); + +struct perf_raw_frag { + union { + struct perf_raw_frag *next; + unsigned long pad; + }; + perf_copy_f copy; + void *data; + u32 size; +} __packed; + struct perf_raw_record { + struct perf_raw_frag frag; u32 size; - void *data; }; /* @@ -1283,6 +1296,11 @@ extern void perf_restore_debug_store(void); static inline void perf_restore_debug_store(void) { } #endif +static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag) +{ + return frag->pad < sizeof(u64); +} + #define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x)) /* diff --git a/kernel/events/core.c b/kernel/events/core.c index 9c51ec3f0f44..b1891b6b5c1f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5553,16 +5553,26 @@ void perf_output_sample(struct perf_output_handle *handle, } if (sample_type & PERF_SAMPLE_RAW) { - if (data->raw) { - u32 raw_size = data->raw->size; - u32 real_size = round_up(raw_size + sizeof(u32), - sizeof(u64)) - sizeof(u32); - u64 zero = 0; - - perf_output_put(handle, real_size); - __output_copy(handle, data->raw->data, raw_size); - if (real_size - raw_size) - __output_copy(handle, &zero, real_size - raw_size); + struct perf_raw_record *raw = data->raw; + + if (raw) { + struct perf_raw_frag *frag = &raw->frag; + + perf_output_put(handle, raw->size); + do { + if (frag->copy) { + __output_custom(handle, frag->copy, + frag->data, frag->size); + } else { + __output_copy(handle, frag->data, + frag->size); + } + if (perf_raw_frag_last(frag)) + break; + frag = frag->next; + } while (1); + if (frag->pad) + __output_skip(handle, NULL, frag->pad); } else { struct { u32 size; @@ -5687,14 +5697,28 @@ void perf_prepare_sample(struct perf_event_header *header, } if (sample_type & PERF_SAMPLE_RAW) { - int size = sizeof(u32); - - if (data->raw) - size += data->raw->size; - else - size += sizeof(u32); + struct perf_raw_record *raw = data->raw; + int size; + + if (raw) { + struct perf_raw_frag *frag = &raw->frag; + u32 sum = 0; + + do { + sum += frag->size; + if (perf_raw_frag_last(frag)) + break; + frag = frag->next; + } while (1); + + size = round_up(sum + sizeof(u32), sizeof(u64)); + raw->size = size - sizeof(u32); + frag->pad = raw->size - sum; + } else { + size = sizeof(u64); + } - header->size += round_up(size, sizeof(u64)); + header->size += size; } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { @@ -7331,7 +7355,7 @@ static struct pmu perf_swevent = { static int perf_tp_filter_match(struct perf_event *event, struct perf_sample_data *data) { - void *record = data->raw->data; + void *record = data->raw->frag.data; /* only top level events have filters set */ if (event->parent) @@ -7387,8 +7411,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct perf_event *event; struct perf_raw_record raw = { - .size = entry_size, - .data = record, + .frag = { + .size = entry_size, + .data = record, + }, }; perf_sample_data_init(&data, 0, 0); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 05f9f6d626df..2417eb5512cd 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -123,10 +123,7 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb) return rb->aux_nr_pages << PAGE_SHIFT; } -#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ -static inline unsigned long \ -func_name(struct perf_output_handle *handle, \ - const void *buf, unsigned long len) \ +#define __DEFINE_OUTPUT_COPY_BODY(memcpy_func) \ { \ unsigned long size, written; \ \ @@ -152,6 +149,17 @@ func_name(struct perf_output_handle *handle, \ return len; \ } +#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ +static inline unsigned long \ +func_name(struct perf_output_handle *handle, \ + const void *buf, unsigned long len) \ +__DEFINE_OUTPUT_COPY_BODY(memcpy_func) + +static inline unsigned long +__output_custom(struct perf_output_handle *handle, perf_copy_f copy_func, + const void *buf, unsigned long len) +__DEFINE_OUTPUT_COPY_BODY(copy_func) + static inline unsigned long memcpy_common(void *dst, const void *src, unsigned long n) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 094c716154ed..35ab1b2b041b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -245,8 +245,10 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) struct bpf_event_entry *ee; struct perf_event *event; struct perf_raw_record raw = { - .size = size, - .data = data, + .frag = { + .size = size, + .data = data, + }, }; if (unlikely(flags & ~(BPF_F_INDEX_MASK))) -- cgit v1.2.3 From 8e7a3920ac277dd4e690c0e70c9750176e3acb83 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 14 Jul 2016 18:08:04 +0200 Subject: bpf, perf: split bpf_perf_event_output Split the bpf_perf_event_output() helper as a preparation into two parts. The new bpf_perf_event_output() will prepare the raw record itself and test for unknown flags from BPF trace context, where the __bpf_perf_event_output() does the core work. The latter will be reused later on from bpf_event_output() directly. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 35ab1b2b041b..c35883a9bc11 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -233,26 +233,17 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = { .arg2_type = ARG_ANYTHING, }; -static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +static __always_inline u64 +__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, + u64 flags, struct perf_raw_record *raw) { - struct pt_regs *regs = (struct pt_regs *) (long) r1; - struct bpf_map *map = (struct bpf_map *) (long) r2; struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; - void *data = (void *) (long) r4; struct perf_sample_data sample_data; struct bpf_event_entry *ee; struct perf_event *event; - struct perf_raw_record raw = { - .frag = { - .size = size, - .data = data, - }, - }; - if (unlikely(flags & ~(BPF_F_INDEX_MASK))) - return -EINVAL; if (index == BPF_F_CURRENT_CPU) index = cpu; if (unlikely(index >= array->map.max_entries)) @@ -271,11 +262,29 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) return -EOPNOTSUPP; perf_sample_data_init(&sample_data, 0, 0); - sample_data.raw = &raw; + sample_data.raw = raw; perf_event_output(event, &sample_data, regs); return 0; } +static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +{ + struct pt_regs *regs = (struct pt_regs *)(long) r1; + struct bpf_map *map = (struct bpf_map *)(long) r2; + void *data = (void *)(long) r4; + struct perf_raw_record raw = { + .frag = { + .size = size, + .data = data, + }, + }; + + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + + return __bpf_perf_event_output(regs, map, flags, &raw); +} + static const struct bpf_func_proto bpf_perf_event_output_proto = { .func = bpf_perf_event_output, .gpl_only = true, -- cgit v1.2.3 From 555c8a8623a3a87b3c990ba30b7fd2e5914e41d2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 14 Jul 2016 18:08:05 +0200 Subject: bpf: avoid stack copy and use skb ctx for event output This work addresses a couple of issues bpf_skb_event_output() helper currently has: i) We need two copies instead of just a single one for the skb data when it should be part of a sample. The data can be non-linear and thus needs to be extracted via bpf_skb_load_bytes() helper first, and then copied once again into the ring buffer slot. ii) Since bpf_skb_load_bytes() currently needs to be used first, the helper needs to see a constant size on the passed stack buffer to make sure BPF verifier can do sanity checks on it during verification time. Thus, just passing skb->len (or any other non-constant value) wouldn't work, but changing bpf_skb_load_bytes() is also not the proper solution, since the two copies are generally still needed. iii) bpf_skb_load_bytes() is just for rather small buffers like headers, since they need to sit on the limited BPF stack anyway. Instead of working around in bpf_skb_load_bytes(), this work improves the bpf_skb_event_output() helper to address all 3 at once. We can make use of the passed in skb context that we have in the helper anyway, and use some of the reserved flag bits as a length argument. The helper will use the new __output_custom() facility from perf side with bpf_skb_copy() as callback helper to walk and extract the data. It will pass the data for setup to bpf_event_output(), which generates and pushes the raw record with an additional frag part. The linear data used in the first frag of the record serves as programmatically defined meta data passed along with the appended sample. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 7 ++++++- include/uapi/linux/bpf.h | 2 ++ kernel/bpf/core.c | 6 ++++-- kernel/trace/bpf_trace.c | 33 +++++++++++++++------------------ net/core/filter.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 69 insertions(+), 22 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b3336b4f5d04..c13e92b00bf5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -209,7 +209,12 @@ u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); -const struct bpf_func_proto *bpf_get_event_output_proto(void); + +typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src, + unsigned long len); + +u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, + void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy); #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 262a7e883b19..c4d922439d20 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -401,6 +401,8 @@ enum bpf_func_id { /* BPF_FUNC_perf_event_output and BPF_FUNC_perf_event_read flags. */ #define BPF_F_INDEX_MASK 0xffffffffULL #define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK +/* BPF_FUNC_perf_event_output for sk_buff input context. */ +#define BPF_F_CTXLEN_MASK (0xfffffULL << 32) /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d638062f66d6..03fd23d4d587 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1054,9 +1054,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) return NULL; } -const struct bpf_func_proto * __weak bpf_get_event_output_proto(void) +u64 __weak +bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, + void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { - return NULL; + return -ENOTSUPP; } /* Always built-in helper functions. */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index c35883a9bc11..ebfbb7dd7033 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -298,29 +298,26 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); -static u64 bpf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, + void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); + struct perf_raw_frag frag = { + .copy = ctx_copy, + .size = ctx_size, + .data = ctx, + }; + struct perf_raw_record raw = { + .frag = { + .next = ctx_size ? &frag : NULL, + .size = meta_size, + .data = meta, + }, + }; perf_fetch_caller_regs(regs); - return bpf_perf_event_output((long)regs, r2, flags, r4, size); -} - -static const struct bpf_func_proto bpf_event_output_proto = { - .func = bpf_event_output, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_STACK, - .arg5_type = ARG_CONST_STACK_SIZE, -}; - -const struct bpf_func_proto *bpf_get_event_output_proto(void) -{ - return &bpf_event_output_proto; + return __bpf_perf_event_output(regs, map, flags, &raw); } static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) diff --git a/net/core/filter.c b/net/core/filter.c index 10c4a2f9e8bb..22e3992c8b48 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2025,6 +2025,47 @@ bool bpf_helper_changes_skb_data(void *func) return false; } +static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, + unsigned long len) +{ + void *ptr = skb_header_pointer(skb, 0, len, dst_buff); + + if (unlikely(!ptr)) + return len; + if (ptr != dst_buff) + memcpy(dst_buff, ptr, len); + + return 0; +} + +static u64 bpf_skb_event_output(u64 r1, u64 r2, u64 flags, u64 r4, + u64 meta_size) +{ + struct sk_buff *skb = (struct sk_buff *)(long) r1; + struct bpf_map *map = (struct bpf_map *)(long) r2; + u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; + void *meta = (void *)(long) r4; + + if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) + return -EINVAL; + if (unlikely(skb_size > skb->len)) + return -EFAULT; + + return bpf_event_output(map, flags, meta, meta_size, skb, skb_size, + bpf_skb_copy); +} + +static const struct bpf_func_proto bpf_skb_event_output_proto = { + .func = bpf_skb_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_STACK, + .arg5_type = ARG_CONST_STACK_SIZE, +}; + static unsigned short bpf_tunnel_key_af(u64 flags) { return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; @@ -2357,7 +2398,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) case BPF_FUNC_get_hash_recalc: return &bpf_get_hash_recalc_proto; case BPF_FUNC_perf_event_output: - return bpf_get_event_output_proto(); + return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; #ifdef CONFIG_SOCK_CGROUP_DATA -- cgit v1.2.3 From 183fc1537ec39be242dc8b619f71fc11b393d295 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 18 Jul 2016 15:50:58 -0700 Subject: kernel/trace/bpf_trace.c: work around gcc-4.4.4 anon union initialization bug kernel/trace/bpf_trace.c: In function 'bpf_event_output': kernel/trace/bpf_trace.c:312: error: unknown field 'next' specified in initializer kernel/trace/bpf_trace.c:312: warning: missing braces around initializer kernel/trace/bpf_trace.c:312: warning: (near initialization for 'raw.frag.') Fixes: 555c8a8623a3a87 ("bpf: avoid stack copy and use skb ctx for event output") Acked-by: Daniel Borkmann Cc: Alexei Starovoitov Cc: David S. Miller Signed-off-by: Andrew Morton Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ebfbb7dd7033..a12bbd32c0a6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -309,7 +309,9 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, }; struct perf_raw_record raw = { .frag = { - .next = ctx_size ? &frag : NULL, + { + .next = ctx_size ? &frag : NULL, + }, .size = meta_size, .data = meta, }, -- cgit v1.2.3 From 96ae52279594470622ff0585621a13e96b700600 Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Mon, 25 Jul 2016 05:54:46 -0700 Subject: bpf: Add bpf_probe_write_user BPF helper to be called in tracers This allows user memory to be written to during the course of a kprobe. It shouldn't be used to implement any kind of security mechanism because of TOC-TOU attacks, but rather to debug, divert, and manipulate execution of semi-cooperative processes. Although it uses probe_kernel_write, we limit the address space the probe can write into by checking the space with access_ok. We do this as opposed to calling copy_to_user directly, in order to avoid sleeping. In addition we ensure the threads's current fs / segment is USER_DS and the thread isn't exiting nor a kernel thread. Given this feature is meant for experiments, and it has a risk of crashing the system, and running programs, we print a warning on when a proglet that attempts to use this helper is installed, along with the pid and process name. Signed-off-by: Sargun Dhillon Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 10 ++++++++++ kernel/trace/bpf_trace.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ samples/bpf/bpf_helpers.h | 2 ++ 3 files changed, 57 insertions(+) (limited to 'kernel/trace') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2b7076f5b5ad..da218fec6056 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -365,6 +365,16 @@ enum bpf_func_id { */ BPF_FUNC_get_current_task, + /** + * bpf_probe_write_user(void *dst, void *src, int len) + * safely attempt to write to a location + * @dst: destination address in userspace + * @src: source address on stack + * @len: number of bytes to copy + * Return: 0 on success or negative error + */ + BPF_FUNC_probe_write_user, + __BPF_FUNC_MAX_ID, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a12bbd32c0a6..b20438fdb029 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -81,6 +81,49 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .arg3_type = ARG_ANYTHING, }; +static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + void *unsafe_ptr = (void *) (long) r1; + void *src = (void *) (long) r2; + int size = (int) r3; + + /* + * Ensure we're in user context which is safe for the helper to + * run. This helper has no business in a kthread. + * + * access_ok() should prevent writing to non-user memory, but in + * some situations (nommu, temporary switch, etc) access_ok() does + * not provide enough validation, hence the check on KERNEL_DS. + */ + + if (unlikely(in_interrupt() || + current->flags & (PF_KTHREAD | PF_EXITING))) + return -EPERM; + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) + return -EPERM; + if (!access_ok(VERIFY_WRITE, unsafe_ptr, size)) + return -EPERM; + + return probe_kernel_write(unsafe_ptr, src, size); +} + +static const struct bpf_func_proto bpf_probe_write_user_proto = { + .func = bpf_probe_write_user, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, +}; + +static const struct bpf_func_proto *bpf_get_probe_write_proto(void) +{ + pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!", + current->comm, task_pid_nr(current)); + + return &bpf_probe_write_user_proto; +} + /* * limited trace_printk() * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed @@ -362,6 +405,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) return &bpf_get_smp_processor_id_proto; case BPF_FUNC_perf_event_read: return &bpf_perf_event_read_proto; + case BPF_FUNC_probe_write_user: + return bpf_get_probe_write_proto(); default: return NULL; } diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h index 84e3fd919a06..217c8d507f2e 100644 --- a/samples/bpf/bpf_helpers.h +++ b/samples/bpf/bpf_helpers.h @@ -41,6 +41,8 @@ static int (*bpf_perf_event_output)(void *ctx, void *map, int index, void *data, (void *) BPF_FUNC_perf_event_output; static int (*bpf_get_stackid)(void *ctx, void *map, int flags) = (void *) BPF_FUNC_get_stackid; +static int (*bpf_probe_write_user)(void *dst, void *src, int size) = + (void *) BPF_FUNC_probe_write_user; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions -- cgit v1.2.3