From 824bd0ce6c7c43a9e1e210abf124958e54d88342 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Feb 2016 22:39:53 -0800 Subject: bpf: introduce BPF_MAP_TYPE_PERCPU_HASH map Introduce BPF_MAP_TYPE_PERCPU_HASH map type which is used to do accurate counters without need to use BPF_XADD instruction which turned out to be too costly for high-performance network monitoring. In the typical use case the 'key' is the flow tuple or other long living object that sees a lot of events per second. bpf_map_lookup_elem() returns per-cpu area. Example: struct { u32 packets; u32 bytes; } * ptr = bpf_map_lookup_elem(&map, &key); /* ptr points to this_cpu area of the value, so the following * increments will not collide with other cpus */ ptr->packets ++; ptr->bytes += skb->len; bpf_update_elem() atomically creates a new element where all per-cpu values are zero initialized and this_cpu value is populated with given 'value'. Note that non-per-cpu hash map always allocates new element and then deletes old after rcu grace period to maintain atomicity of update. Per-cpu hash map updates element values in-place. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index aa6f8571de13..43ae40c8763e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -81,6 +81,7 @@ enum bpf_map_type { BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PROG_ARRAY, BPF_MAP_TYPE_PERF_EVENT_ARRAY, + BPF_MAP_TYPE_PERCPU_HASH, }; enum bpf_prog_type { -- cgit v1.2.3 From a10423b87a7eae75da79ce80a8d9475047a674ee Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Feb 2016 22:39:54 -0800 Subject: bpf: introduce BPF_MAP_TYPE_PERCPU_ARRAY map Primary use case is a histogram array of latency where bpf program computes the latency of block requests or other events and stores histogram of latency into array of 64 elements. All cpus are constantly running, so normal increment is not accurate, bpf_xadd causes cache ping-pong and this per-cpu approach allows fastest collision-free counters. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/arraymap.c | 102 ++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 93 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 83d1926c61e4..141fb0d45731 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -151,6 +151,7 @@ struct bpf_array { union { char value[0] __aligned(8); void *ptrs[0] __aligned(8); + void __percpu *pptrs[0] __aligned(8); }; }; #define MAX_TAIL_CALL_CNT 32 diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 43ae40c8763e..2ee0fde1bf96 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -82,6 +82,7 @@ enum bpf_map_type { BPF_MAP_TYPE_PROG_ARRAY, BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_MAP_TYPE_PERCPU_HASH, + BPF_MAP_TYPE_PERCPU_ARRAY, }; enum bpf_prog_type { diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 89ebbc4d1164..b9bf1d7949ca 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -17,11 +17,39 @@ #include #include +static void bpf_array_free_percpu(struct bpf_array *array) +{ + int i; + + for (i = 0; i < array->map.max_entries; i++) + free_percpu(array->pptrs[i]); +} + +static int bpf_array_alloc_percpu(struct bpf_array *array) +{ + void __percpu *ptr; + int i; + + for (i = 0; i < array->map.max_entries; i++) { + ptr = __alloc_percpu_gfp(array->elem_size, 8, + GFP_USER | __GFP_NOWARN); + if (!ptr) { + bpf_array_free_percpu(array); + return -ENOMEM; + } + array->pptrs[i] = ptr; + } + + return 0; +} + /* Called from syscall */ static struct bpf_map *array_map_alloc(union bpf_attr *attr) { + bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; struct bpf_array *array; - u32 elem_size, array_size; + u64 array_size; + u32 elem_size; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -36,12 +64,16 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) elem_size = round_up(attr->value_size, 8); - /* check round_up into zero and u32 overflow */ - if (elem_size == 0 || - attr->max_entries > (U32_MAX - PAGE_SIZE - sizeof(*array)) / elem_size) + array_size = sizeof(*array); + if (percpu) + array_size += (u64) attr->max_entries * sizeof(void *); + else + array_size += (u64) attr->max_entries * elem_size; + + /* make sure there is no u32 overflow later in round_up() */ + if (array_size >= U32_MAX - PAGE_SIZE) return ERR_PTR(-ENOMEM); - array_size = sizeof(*array) + attr->max_entries * elem_size; /* allocate all map elements and zero-initialize them */ array = kzalloc(array_size, GFP_USER | __GFP_NOWARN); @@ -52,12 +84,25 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) } /* copy mandatory map attributes */ + array->map.map_type = attr->map_type; array->map.key_size = attr->key_size; array->map.value_size = attr->value_size; array->map.max_entries = attr->max_entries; - array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; array->elem_size = elem_size; + if (!percpu) + goto out; + + array_size += (u64) attr->max_entries * elem_size * num_possible_cpus(); + + if (array_size >= U32_MAX - PAGE_SIZE || + elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) { + kvfree(array); + return ERR_PTR(-ENOMEM); + } +out: + array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; + return &array->map; } @@ -67,12 +112,24 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; - if (index >= array->map.max_entries) + if (unlikely(index >= array->map.max_entries)) return NULL; return array->value + array->elem_size * index; } +/* Called from eBPF program */ +static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (unlikely(index >= array->map.max_entries)) + return NULL; + + return this_cpu_ptr(array->pptrs[index]); +} + /* Called from syscall */ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { @@ -99,19 +156,24 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; - if (map_flags > BPF_EXIST) + if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; - if (index >= array->map.max_entries) + if (unlikely(index >= array->map.max_entries)) /* all elements were pre-allocated, cannot insert a new one */ return -E2BIG; - if (map_flags == BPF_NOEXIST) + if (unlikely(map_flags == BPF_NOEXIST)) /* all elements already exist */ return -EEXIST; - memcpy(array->value + array->elem_size * index, value, map->value_size); + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + memcpy(this_cpu_ptr(array->pptrs[index]), + value, map->value_size); + else + memcpy(array->value + array->elem_size * index, + value, map->value_size); return 0; } @@ -133,6 +195,9 @@ static void array_map_free(struct bpf_map *map) */ synchronize_rcu(); + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + bpf_array_free_percpu(array); + kvfree(array); } @@ -150,9 +215,24 @@ static struct bpf_map_type_list array_type __read_mostly = { .type = BPF_MAP_TYPE_ARRAY, }; +static const struct bpf_map_ops percpu_array_ops = { + .map_alloc = array_map_alloc, + .map_free = array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = percpu_array_map_lookup_elem, + .map_update_elem = array_map_update_elem, + .map_delete_elem = array_map_delete_elem, +}; + +static struct bpf_map_type_list percpu_array_type __read_mostly = { + .ops = &percpu_array_ops, + .type = BPF_MAP_TYPE_PERCPU_ARRAY, +}; + static int __init register_array_map(void) { bpf_register_map_type(&array_type); + bpf_register_map_type(&percpu_array_type); return 0; } late_initcall(register_array_map); -- cgit v1.2.3 From d5a3b1f691865be576c2bffa708549b8cdccda19 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 17 Feb 2016 19:58:58 -0800 Subject: bpf: introduce BPF_MAP_TYPE_STACK_TRACE add new map type to store stack traces and corresponding helper bpf_get_stackid(ctx, map, flags) - walk user or kernel stack and return id @ctx: struct pt_regs* @map: pointer to stack_trace map @flags: bits 0-7 - numer of stack frames to skip bit 8 - collect user stack instead of kernel bit 9 - compare stacks by hash only bit 10 - if two different stacks hash into the same stackid discard old other bits - reserved Return: >= 0 stackid on success or negative error stackid is a 32-bit integer handle that can be further combined with other data (including other stackid) and used as a key into maps. Userspace will access stackmap using standard lookup/delete syscall commands to retrieve full stack trace for given stackid. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 21 +++++ kernel/bpf/Makefile | 3 + kernel/bpf/stackmap.c | 237 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 6 +- kernel/trace/bpf_trace.c | 2 + 6 files changed, 269 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/stackmap.c (limited to 'include/uapi/linux/bpf.h') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 90ee6ab24bc5..0cadbb7456c0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -237,6 +237,7 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_skb_vlan_push_proto; extern const struct bpf_func_proto bpf_skb_vlan_pop_proto; +extern const struct bpf_func_proto bpf_get_stackid_proto; /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2ee0fde1bf96..d3e77da8e9e8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -83,6 +83,7 @@ enum bpf_map_type { BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_MAP_TYPE_PERCPU_HASH, BPF_MAP_TYPE_PERCPU_ARRAY, + BPF_MAP_TYPE_STACK_TRACE, }; enum bpf_prog_type { @@ -272,6 +273,20 @@ enum bpf_func_id { */ BPF_FUNC_perf_event_output, BPF_FUNC_skb_load_bytes, + + /** + * bpf_get_stackid(ctx, map, flags) - walk user or kernel stack and return id + * @ctx: struct pt_regs* + * @map: pointer to stack_trace map + * @flags: bits 0-7 - numer of stack frames to skip + * bit 8 - collect user stack instead of kernel + * bit 9 - compare stacks by hash only + * bit 10 - if two different stacks hash into the same stackid + * discard old + * other bits - reserved + * Return: >= 0 stackid on success or negative error + */ + BPF_FUNC_get_stackid, __BPF_FUNC_MAX_ID, }; @@ -294,6 +309,12 @@ enum bpf_func_id { /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ #define BPF_F_TUNINFO_IPV6 (1ULL << 0) +/* BPF_FUNC_get_stackid flags. */ +#define BPF_F_SKIP_FIELD_MASK 0xffULL +#define BPF_F_USER_STACK (1ULL << 8) +#define BPF_F_FAST_STACK_CMP (1ULL << 9) +#define BPF_F_REUSE_STACKID (1ULL << 10) + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 13272582eee0..8a932d079c24 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,3 +2,6 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o +ifeq ($(CONFIG_PERF_EVENTS),y) +obj-$(CONFIG_BPF_SYSCALL) += stackmap.o +endif diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c new file mode 100644 index 000000000000..8a60ee14a977 --- /dev/null +++ b/kernel/bpf/stackmap.c @@ -0,0 +1,237 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include + +struct stack_map_bucket { + struct rcu_head rcu; + u32 hash; + u32 nr; + u64 ip[]; +}; + +struct bpf_stack_map { + struct bpf_map map; + u32 n_buckets; + struct stack_map_bucket __rcu *buckets[]; +}; + +/* Called from syscall */ +static struct bpf_map *stack_map_alloc(union bpf_attr *attr) +{ + u32 value_size = attr->value_size; + struct bpf_stack_map *smap; + u64 cost, n_buckets; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + value_size < 8 || value_size % 8 || + value_size / 8 > PERF_MAX_STACK_DEPTH) + return ERR_PTR(-EINVAL); + + /* hash table size must be power of 2 */ + n_buckets = roundup_pow_of_two(attr->max_entries); + + cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-E2BIG); + + smap = kzalloc(cost, GFP_USER | __GFP_NOWARN); + if (!smap) { + smap = vzalloc(cost); + if (!smap) + return ERR_PTR(-ENOMEM); + } + + err = -E2BIG; + cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_smap; + + smap->map.map_type = attr->map_type; + smap->map.key_size = attr->key_size; + smap->map.value_size = value_size; + smap->map.max_entries = attr->max_entries; + smap->n_buckets = n_buckets; + smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + err = get_callchain_buffers(); + if (err) + goto free_smap; + + return &smap->map; + +free_smap: + kvfree(smap); + return ERR_PTR(err); +} + +static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +{ + struct pt_regs *regs = (struct pt_regs *) (long) r1; + struct bpf_map *map = (struct bpf_map *) (long) r2; + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + struct perf_callchain_entry *trace; + struct stack_map_bucket *bucket, *new_bucket, *old_bucket; + u32 max_depth = map->value_size / 8; + /* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */ + u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth; + u32 skip = flags & BPF_F_SKIP_FIELD_MASK; + u32 hash, id, trace_nr, trace_len; + bool user = flags & BPF_F_USER_STACK; + bool kernel = !user; + u64 *ips; + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) + return -EINVAL; + + trace = get_perf_callchain(regs, init_nr, kernel, user, false, false); + + if (unlikely(!trace)) + /* couldn't fetch the stack trace */ + return -EFAULT; + + /* get_perf_callchain() guarantees that trace->nr >= init_nr + * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth + */ + trace_nr = trace->nr - init_nr; + + if (trace_nr <= skip) + /* skipping more than usable stack trace */ + return -EFAULT; + + trace_nr -= skip; + trace_len = trace_nr * sizeof(u64); + ips = trace->ip + skip + init_nr; + hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); + id = hash & (smap->n_buckets - 1); + bucket = rcu_dereference(smap->buckets[id]); + + if (bucket && bucket->hash == hash) { + if (flags & BPF_F_FAST_STACK_CMP) + return id; + if (bucket->nr == trace_nr && + memcmp(bucket->ip, ips, trace_len) == 0) + return id; + } + + /* this call stack is not in the map, try to add it */ + if (bucket && !(flags & BPF_F_REUSE_STACKID)) + return -EEXIST; + + new_bucket = kmalloc(sizeof(struct stack_map_bucket) + map->value_size, + GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!new_bucket)) + return -ENOMEM; + + memcpy(new_bucket->ip, ips, trace_len); + memset(new_bucket->ip + trace_len / 8, 0, map->value_size - trace_len); + new_bucket->hash = hash; + new_bucket->nr = trace_nr; + + old_bucket = xchg(&smap->buckets[id], new_bucket); + if (old_bucket) + kfree_rcu(old_bucket, rcu); + return id; +} + +const struct bpf_func_proto bpf_get_stackid_proto = { + .func = bpf_get_stackid, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +/* Called from syscall or from eBPF program */ +static void *stack_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + struct stack_map_bucket *bucket; + u32 id = *(u32 *)key; + + if (unlikely(id >= smap->n_buckets)) + return NULL; + bucket = rcu_dereference(smap->buckets[id]); + return bucket ? bucket->ip : NULL; +} + +static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -EINVAL; +} + +static int stack_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + return -EINVAL; +} + +/* Called from syscall or from eBPF program */ +static int stack_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + struct stack_map_bucket *old_bucket; + u32 id = *(u32 *)key; + + if (unlikely(id >= smap->n_buckets)) + return -E2BIG; + + old_bucket = xchg(&smap->buckets[id], NULL); + if (old_bucket) { + kfree_rcu(old_bucket, rcu); + return 0; + } else { + return -ENOENT; + } +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void stack_map_free(struct bpf_map *map) +{ + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + int i; + + synchronize_rcu(); + + for (i = 0; i < smap->n_buckets; i++) + if (smap->buckets[i]) + kfree_rcu(smap->buckets[i], rcu); + kvfree(smap); + put_callchain_buffers(); +} + +static const struct bpf_map_ops stack_map_ops = { + .map_alloc = stack_map_alloc, + .map_free = stack_map_free, + .map_get_next_key = stack_map_get_next_key, + .map_lookup_elem = stack_map_lookup_elem, + .map_update_elem = stack_map_update_elem, + .map_delete_elem = stack_map_delete_elem, +}; + +static struct bpf_map_type_list stack_map_type __read_mostly = { + .ops = &stack_map_ops, + .type = BPF_MAP_TYPE_STACK_TRACE, +}; + +static int __init register_stack_map(void) +{ + bpf_register_map_type(&stack_map_type); + return 0; +} +late_initcall(register_stack_map); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d1d3e8f57de9..42ba4ccc020b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -246,6 +246,7 @@ static const struct { {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output}, + {BPF_MAP_TYPE_STACK_TRACE, BPF_FUNC_get_stackid}, }; static void print_verifier_state(struct verifier_env *env) @@ -911,8 +912,11 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) * don't allow any other map type to be passed into * the special func; */ - if (bool_func && bool_map != bool_func) + if (bool_func && bool_map != bool_func) { + verbose("cannot pass map_type %d into func %d\n", + map->map_type, func_id); return -EINVAL; + } } return 0; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 326a75e884db..4b8caa392b86 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -299,6 +299,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_perf_event_read_proto; case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto; + case BPF_FUNC_get_stackid: + return &bpf_get_stackid_proto; default: return NULL; } -- cgit v1.2.3 From 7d672345ed295b1356a5d9f7111da1d1d7d65867 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 19 Feb 2016 23:05:23 +0100 Subject: bpf: add generic bpf_csum_diff helper For L4 checksums, we currently have bpf_l4_csum_replace() helper. It's currently limited to handle 2 and 4 byte changes in a header and feeds the from/to into inet_proto_csum_replace{2,4}() helpers of the kernel. When working with IPv6, for example, this makes it rather cumbersome to deal with, similarly when editing larger parts of a header. Instead, extend the API in a more generic way: For bpf_l4_csum_replace(), add a case for header field mask of 0 to change the checksum at a given offset through inet_proto_csum_replace_by_diff(), and provide a helper bpf_csum_diff() that can generically calculate a from/to diff for arbitrary amounts of data. This can be used in multiple ways: for the bpf_l4_csum_replace() only part, this even provides us with the option to insert precalculated diffs from user space f.e. from a map, or from bpf_csum_diff() during runtime. bpf_csum_diff() has a optional from/to stack buffer input, so we can calculate a diff by using a scratchbuffer for scenarios where we're inserting (from is NULL), removing (to is NULL) or diffing (from/to buffers don't need to be of equal size) data. Also, bpf_csum_diff() allows to feed a previous csum into csum_partial(), so the function can also be cascaded. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 11 ++++++++++ net/core/filter.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d3e77da8e9e8..48d0a6c54609 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -287,6 +287,17 @@ enum bpf_func_id { * Return: >= 0 stackid on success or negative error */ BPF_FUNC_get_stackid, + + /** + * bpf_csum_diff(from, from_size, to, to_size, seed) - calculate csum diff + * @from: raw from buffer + * @from_size: length of from buffer + * @to: raw to buffer + * @to_size: length of to buffer + * @seed: optional seed + * Return: csum result + */ + BPF_FUNC_csum_diff, __BPF_FUNC_MAX_ID, }; diff --git a/net/core/filter.c b/net/core/filter.c index 2a6e9562f1ab..bf504f8fbe15 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1491,6 +1491,12 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return -EFAULT; switch (flags & BPF_F_HDR_FIELD_MASK) { + case 0: + if (unlikely(from != 0)) + return -EINVAL; + + inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo); + break; case 2: inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); break; @@ -1519,6 +1525,51 @@ const struct bpf_func_proto bpf_l4_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; +struct bpf_csum_scratchpad { + __be32 diff[128]; +}; + +static DEFINE_PER_CPU(struct bpf_csum_scratchpad, bpf_csum_sp); + +static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed) +{ + struct bpf_csum_scratchpad *sp = this_cpu_ptr(&bpf_csum_sp); + u64 diff_size = from_size + to_size; + __be32 *from = (__be32 *) (long) r1; + __be32 *to = (__be32 *) (long) r3; + int i, j = 0; + + /* This is quite flexible, some examples: + * + * from_size == 0, to_size > 0, seed := csum --> pushing data + * from_size > 0, to_size == 0, seed := csum --> pulling data + * from_size > 0, to_size > 0, seed := 0 --> diffing data + * + * Even for diffing, from_size and to_size don't need to be equal. + */ + if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) || + diff_size > sizeof(sp->diff))) + return -EINVAL; + + for (i = 0; i < from_size / sizeof(__be32); i++, j++) + sp->diff[j] = ~from[i]; + for (i = 0; i < to_size / sizeof(__be32); i++, j++) + sp->diff[j] = to[i]; + + return csum_partial(sp->diff, diff_size, seed); +} + +const struct bpf_func_proto bpf_csum_diff_proto = { + .func = bpf_csum_diff, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_STACK, + .arg4_type = ARG_CONST_STACK_SIZE_OR_ZERO, + .arg5_type = ARG_ANYTHING, +}; + static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2; @@ -1849,6 +1900,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: -- cgit v1.2.3 From 2f72959a9c1260ade234f353ccca91118151af66 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 19 Feb 2016 23:05:26 +0100 Subject: bpf: fix csum update in bpf_l4_csum_replace helper for udp When using this helper for updating UDP checksums, we need to extend this in order to write CSUM_MANGLED_0 for csum computations that result into 0 as sum. Reason we need this is because packets with a checksum could otherwise become incorrectly marked as a packet without a checksum. Likewise, if the user indicates BPF_F_MARK_MANGLED_0, then we should not turn packets without a checksum into ones with a checksum. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 48d0a6c54609..6496f98d3d68 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -313,6 +313,7 @@ enum bpf_func_id { /* BPF_FUNC_l4_csum_replace flags. */ #define BPF_F_PSEUDO_HDR (1ULL << 4) +#define BPF_F_MARK_MANGLED_0 (1ULL << 5) /* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ #define BPF_F_INGRESS (1ULL << 0) diff --git a/net/core/filter.c b/net/core/filter.c index f031b82128f3..8a0b8c3eb189 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1477,10 +1477,12 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) { struct sk_buff *skb = (struct sk_buff *) (long) r1; bool is_pseudo = flags & BPF_F_PSEUDO_HDR; + bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; int offset = (int) r2; __sum16 sum, *ptr; - if (unlikely(flags & ~(BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) + if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR | + BPF_F_HDR_FIELD_MASK))) return -EINVAL; if (unlikely((u32) offset > 0xffff)) return -EFAULT; @@ -1490,6 +1492,8 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); if (unlikely(!ptr)) return -EFAULT; + if (is_mmzero && !*ptr) + return 0; switch (flags & BPF_F_HDR_FIELD_MASK) { case 0: @@ -1508,6 +1512,8 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return -EINVAL; } + if (is_mmzero && !*ptr) + *ptr = CSUM_MANGLED_0; if (ptr == &sum) /* skb_store_bits guaranteed to not return -EFAULT here */ skb_store_bits(skb, offset, ptr, sizeof(sum)); -- cgit v1.2.3 From 8afd54c87ad7089734ef0527937a256586ba828a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Mar 2016 15:15:03 +0100 Subject: bpf: add flags to bpf_skb_store_bytes for clearing hash When overwriting parts of the packet with bpf_skb_store_bytes() that were fed previously into skb->hash calculation, we should clear the current hash with skb_clear_hash(), so that a next skb_get_hash() call can determine the correct hash related to this skb. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ee2193287cbe..2e3e90309904 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -305,6 +305,7 @@ enum bpf_func_id { /* BPF_FUNC_skb_store_bytes flags. */ #define BPF_F_RECOMPUTE_CSUM (1ULL << 0) +#define BPF_F_INVALIDATE_HASH (1ULL << 1) /* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags. * First 4 bits are for passing the header field size. diff --git a/net/core/filter.c b/net/core/filter.c index 356a251657a5..a1fe246a6147 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1353,7 +1353,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) unsigned int len = (unsigned int) r4; void *ptr; - if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM))) + if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) return -EINVAL; /* bpf verifier guarantees that: @@ -1384,6 +1384,8 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) if (flags & BPF_F_RECOMPUTE_CSUM) skb_postpush_rcsum(skb, ptr, len); + if (flags & BPF_F_INVALIDATE_HASH) + skb_clear_hash(skb); return 0; } -- cgit v1.2.3 From 2208087061c4ad88de188911367effc550144836 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Mar 2016 15:15:05 +0100 Subject: bpf: allow to propagate df in bpf_skb_set_tunnel_key Added by 9a628224a61b ("ip_tunnel: Add dont fragment flag."), allow to feed df flag into tunneling facilities (currently supported on TX by vxlan, geneve and gre) as a hint from eBPF's bpf_skb_set_tunnel_key() helper. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2e3e90309904..21ee6d52016f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -330,6 +330,7 @@ enum bpf_func_id { /* BPF_FUNC_skb_set_tunnel_key flags. */ #define BPF_F_ZERO_CSUM_TX (1ULL << 1) +#define BPF_F_DONT_FRAGMENT (1ULL << 2) /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure diff --git a/net/core/filter.c b/net/core/filter.c index ce4e18dd2c89..6c9d15561d04 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1819,7 +1819,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) u8 compat[sizeof(struct bpf_tunnel_key)]; struct ip_tunnel_info *info; - if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX))) + if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | + BPF_F_DONT_FRAGMENT))) return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { @@ -1844,6 +1845,9 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) info->mode = IP_TUNNEL_INFO_TX; info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM; + if (flags & BPF_F_DONT_FRAGMENT) + info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; + info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; info->key.ttl = from->tunnel_ttl; -- cgit v1.2.3 From 14ca0751c96f8d3d0f52e8ed3b3236f8b34d3460 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Mar 2016 15:15:06 +0100 Subject: bpf: support for access to tunnel options After eBPF being able to programmatically access/manage tunnel key meta data via commit d3aa45ce6b94 ("bpf: add helpers to access tunnel metadata") and more recently also for IPv6 through c6c33454072f ("bpf: support ipv6 for bpf_skb_{set,get}_tunnel_key"), this work adds two complementary helpers to generically access their auxiliary tunnel options. Geneve and vxlan support this facility. For geneve, TLVs can be pushed, and for the vxlan case its GBP extension. I.e. setting tunnel key for geneve case only makes sense, if we can also read/write TLVs into it. In the GBP case, it provides the flexibility to easily map the group policy ID in combination with other helpers or maps. I chose to model this as two separate helpers, bpf_skb_{set,get}_tunnel_opt(), for a couple of reasons. bpf_skb_{set,get}_tunnel_key() is already rather complex by itself, and there may be cases for tunnel key backends where tunnel options are not always needed. If we would have integrated this into bpf_skb_{set,get}_tunnel_key() nevertheless, we are very limited with remaining helper arguments, so keeping compatibility on structs in case of passing in a flat buffer gets more cumbersome. Separating both also allows for more flexibility and future extensibility, f.e. options could be fed directly from a map, etc. Moreover, change geneve's xmit path to test only for info->options_len instead of TUNNEL_GENEVE_OPT flag. This makes it more consistent with vxlan's xmit path and allows for avoiding to specify a protocol flag in the API on xmit, so it can be protocol agnostic. Having info->options_len is enough information that is needed. Tested with vxlan and geneve. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- drivers/net/geneve.c | 4 +-- include/uapi/linux/bpf.h | 11 +++++++ net/core/filter.c | 83 ++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 90 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux/bpf.h') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index bc5da357e16d..36db4cf0579c 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -940,7 +940,7 @@ static netdev_tx_t geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, u8 vni[3]; tunnel_id_to_vni(key->tun_id, vni); - if (key->tun_flags & TUNNEL_GENEVE_OPT) + if (info->options_len) opts = ip_tunnel_info_opts(info); if (key->tun_flags & TUNNEL_CSUM) @@ -1027,7 +1027,7 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, u8 vni[3]; tunnel_id_to_vni(key->tun_id, vni); - if (key->tun_flags & TUNNEL_GENEVE_OPT) + if (info->options_len) opts = ip_tunnel_info_opts(info); if (key->tun_flags & TUNNEL_CSUM) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 21ee6d52016f..9221f653fee3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -298,6 +298,17 @@ enum bpf_func_id { * Return: csum result */ BPF_FUNC_csum_diff, + + /** + * bpf_skb_[gs]et_tunnel_opt(skb, opt, size) + * retrieve or populate tunnel options metadata + * @skb: pointer to skb + * @opt: pointer to raw tunnel option data + * @size: size of @opt + * Return: 0 on success for set, option size for get + */ + BPF_FUNC_skb_get_tunnel_opt, + BPF_FUNC_skb_set_tunnel_opt, __BPF_FUNC_MAX_ID, }; diff --git a/net/core/filter.c b/net/core/filter.c index 6c9d15561d04..012a10c2da94 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1809,6 +1809,32 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; +static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + u8 *to = (u8 *) (long) r2; + const struct ip_tunnel_info *info = skb_tunnel_info(skb); + + if (unlikely(!info || + !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) + return -ENOENT; + if (unlikely(size < info->options_len)) + return -ENOMEM; + + ip_tunnel_info_opts_get(to, info); + + return info->options_len; +} + +static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { + .func = bpf_skb_get_tunnel_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, +}; + static struct metadata_dst __percpu *md_dst; static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) @@ -1875,17 +1901,58 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; -static const struct bpf_func_proto *bpf_get_skb_set_tunnel_key_proto(void) +#define BPF_TUNLEN_MAX 255 + +static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + u8 *from = (u8 *) (long) r2; + struct ip_tunnel_info *info = skb_tunnel_info(skb); + const struct metadata_dst *md = this_cpu_ptr(md_dst); + + if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) + return -EINVAL; + if (unlikely(size > BPF_TUNLEN_MAX)) + return -ENOMEM; + + ip_tunnel_info_opts_set(info, from, size); + + return 0; +} + +static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { + .func = bpf_skb_set_tunnel_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, +}; + +static const struct bpf_func_proto * +bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) { if (!md_dst) { - /* race is not possible, since it's called from - * verifier that is holding verifier mutex + BUILD_BUG_ON(FIELD_SIZEOF(struct ip_tunnel_info, + options_len) != 1); + + /* Race is not possible, since it's called from verifier + * that is holding verifier mutex. */ - md_dst = metadata_dst_alloc_percpu(0, GFP_KERNEL); + md_dst = metadata_dst_alloc_percpu(BPF_TUNLEN_MAX, + GFP_KERNEL); if (!md_dst) return NULL; } - return &bpf_skb_set_tunnel_key_proto; + + switch (which) { + case BPF_FUNC_skb_set_tunnel_key: + return &bpf_skb_set_tunnel_key_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return &bpf_skb_set_tunnel_opt_proto; + default: + return NULL; + } } static const struct bpf_func_proto * @@ -1939,7 +2006,11 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: - return bpf_get_skb_set_tunnel_key_proto(); + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_skb_get_tunnel_opt: + return &bpf_skb_get_tunnel_opt_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_redirect: return &bpf_redirect_proto; case BPF_FUNC_get_route_realm: -- cgit v1.2.3 From 6c90598174322b8888029e40dd84a4eb01f56afe Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Mar 2016 21:57:15 -0800 Subject: bpf: pre-allocate hash map elements If kprobe is placed on spin_unlock then calling kmalloc/kfree from bpf programs is not safe, since the following dead lock is possible: kfree->spin_lock(kmem_cache_node->lock)...spin_unlock->kprobe-> bpf_prog->map_update->kmalloc->spin_lock(of the same kmem_cache_node->lock) and deadlocks. The following solutions were considered and some implemented, but eventually discarded - kmem_cache_create for every map - add recursion check to slow-path of slub - use reserved memory in bpf_map_update for in_irq or in preempt_disabled - kmalloc via irq_work At the end pre-allocation of all map elements turned out to be the simplest solution and since the user is charged upfront for all the memory, such pre-allocation doesn't affect the user space visible behavior. Since it's impossible to tell whether kprobe is triggered in a safe location from kmalloc point of view, use pre-allocation by default and introduce new BPF_F_NO_PREALLOC flag. While testing of per-cpu hash maps it was discovered that alloc_percpu(GFP_ATOMIC) has odd corner cases and often fails to allocate memory even when 90% of it is free. The pre-allocation of per-cpu hash elements solves this problem as well. Turned out that bpf_map_update() quickly followed by bpf_map_lookup()+bpf_map_delete() is very common pattern used in many of iovisor/bcc/tools, so there is additional benefit of pre-allocation, since such use cases are must faster. Since all hash map elements are now pre-allocated we can remove atomic increment of htab->count and save few more cycles. Also add bpf_map_precharge_memlock() to check rlimit_memlock early to avoid large malloc/free done by users who don't have sufficient limits. Pre-allocation is done with vmalloc and alloc/free is done via percpu_freelist. Here are performance numbers for different pre-allocation algorithms that were implemented, but discarded in favor of percpu_freelist: 1 cpu: pcpu_ida 2.1M pcpu_ida nolock 2.3M bt 2.4M kmalloc 1.8M hlist+spinlock 2.3M pcpu_freelist 2.6M 4 cpu: pcpu_ida 1.5M pcpu_ida nolock 1.8M bt w/smp_align 1.7M bt no/smp_align 1.1M kmalloc 0.7M hlist+spinlock 0.2M pcpu_freelist 2.0M 8 cpu: pcpu_ida 0.7M bt w/smp_align 0.8M kmalloc 0.4M pcpu_freelist 1.5M 32 cpu: kmalloc 0.13M pcpu_freelist 0.49M pcpu_ida nolock is a modified percpu_ida algorithm without percpu_ida_cpu locks and without cross-cpu tag stealing. It's faster than existing percpu_ida, but not as fast as pcpu_freelist. bt is a variant of block/blk-mq-tag.c simlified and customized for bpf use case. bt w/smp_align is using cache line for every 'long' (similar to blk-mq-tag). bt no/smp_align allocates 'long' bitmasks continuously to save memory. It's comparable to percpu_ida and in some cases faster, but slower than percpu_freelist hlist+spinlock is the simplest free list with single spinlock. As expeceted it has very bad scaling in SMP. kmalloc is existing implementation which is still available via BPF_F_NO_PREALLOC flag. It's significantly slower in single cpu and in 8 cpu setup it's 3 times slower than pre-allocation with pcpu_freelist, but saves memory, so in cases where map->max_entries can be large and number of map update/delete per second is low, it may make sense to use it. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 2 + include/uapi/linux/bpf.h | 3 + kernel/bpf/hashtab.c | 240 +++++++++++++++++++++++++++++++++-------------- kernel/bpf/syscall.c | 15 ++- 4 files changed, 186 insertions(+), 74 deletions(-) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4b070827200d..efd1d4ca95c6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -37,6 +37,7 @@ struct bpf_map { u32 key_size; u32 value_size; u32 max_entries; + u32 map_flags; u32 pages; struct user_struct *user; const struct bpf_map_ops *ops; @@ -178,6 +179,7 @@ struct bpf_map *__bpf_map_get(struct fd f); void bpf_map_inc(struct bpf_map *map, bool uref); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); +int bpf_map_precharge_memlock(u32 pages); extern int sysctl_unprivileged_bpf_disabled; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9221f653fee3..0e30b19012a5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -101,12 +101,15 @@ enum bpf_prog_type { #define BPF_NOEXIST 1 /* create new element if it didn't exist */ #define BPF_EXIST 2 /* update existing element */ +#define BPF_F_NO_PREALLOC (1U << 0) + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ __u32 key_size; /* size of key in bytes */ __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* max number of entries in a map */ + __u32 map_flags; /* prealloc or not */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index a68e95133fcd..fff3650d52fc 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1,4 +1,5 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -13,6 +14,7 @@ #include #include #include +#include "percpu_freelist.h" struct bucket { struct hlist_head head; @@ -22,6 +24,8 @@ struct bucket { struct bpf_htab { struct bpf_map map; struct bucket *buckets; + void *elems; + struct pcpu_freelist freelist; atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ @@ -29,15 +33,86 @@ struct bpf_htab { /* each htab element is struct htab_elem + key + value */ struct htab_elem { - struct hlist_node hash_node; - struct rcu_head rcu; union { - u32 hash; - u32 key_size; + struct hlist_node hash_node; + struct bpf_htab *htab; + struct pcpu_freelist_node fnode; }; + struct rcu_head rcu; + u32 hash; char key[0] __aligned(8); }; +static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, + void __percpu *pptr) +{ + *(void __percpu **)(l->key + key_size) = pptr; +} + +static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) +{ + return *(void __percpu **)(l->key + key_size); +} + +static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) +{ + return (struct htab_elem *) (htab->elems + i * htab->elem_size); +} + +static void htab_free_elems(struct bpf_htab *htab) +{ + int i; + + if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + goto free_elems; + + for (i = 0; i < htab->map.max_entries; i++) { + void __percpu *pptr; + + pptr = htab_elem_get_ptr(get_htab_elem(htab, i), + htab->map.key_size); + free_percpu(pptr); + } +free_elems: + vfree(htab->elems); +} + +static int prealloc_elems_and_freelist(struct bpf_htab *htab) +{ + int err = -ENOMEM, i; + + htab->elems = vzalloc(htab->elem_size * htab->map.max_entries); + if (!htab->elems) + return -ENOMEM; + + if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + goto skip_percpu_elems; + + for (i = 0; i < htab->map.max_entries; i++) { + u32 size = round_up(htab->map.value_size, 8); + void __percpu *pptr; + + pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN); + if (!pptr) + goto free_elems; + htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, + pptr); + } + +skip_percpu_elems: + err = pcpu_freelist_init(&htab->freelist); + if (err) + goto free_elems; + + pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size, + htab->map.max_entries); + return 0; + +free_elems: + htab_free_elems(htab); + return err; +} + /* Called from syscall */ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { @@ -46,6 +121,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) int err, i; u64 cost; + if (attr->map_flags & ~BPF_F_NO_PREALLOC) + /* reserved bits should not be used */ + return ERR_PTR(-EINVAL); + htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); @@ -55,6 +134,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->map.key_size = attr->key_size; htab->map.value_size = attr->value_size; htab->map.max_entries = attr->max_entries; + htab->map.map_flags = attr->map_flags; /* check sanity of attributes. * value_size == 0 may be allowed in the future to use map as a set @@ -92,7 +172,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (percpu) htab->elem_size += sizeof(void *); else - htab->elem_size += htab->map.value_size; + htab->elem_size += round_up(htab->map.value_size, 8); /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || @@ -112,6 +192,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + /* if map size is larger than memlock limit, reject it early */ + err = bpf_map_precharge_memlock(htab->map.pages); + if (err) + goto free_htab; + err = -ENOMEM; htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket), GFP_USER | __GFP_NOWARN); @@ -127,10 +212,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) raw_spin_lock_init(&htab->buckets[i].lock); } - atomic_set(&htab->count, 0); + if (!(attr->map_flags & BPF_F_NO_PREALLOC)) { + err = prealloc_elems_and_freelist(htab); + if (err) + goto free_buckets; + } return &htab->map; +free_buckets: + kvfree(htab->buckets); free_htab: kfree(htab); return ERR_PTR(err); @@ -249,42 +340,42 @@ find_first_elem: } } - /* itereated over all buckets and all elements */ + /* iterated over all buckets and all elements */ return -ENOENT; } - -static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, - void __percpu *pptr) -{ - *(void __percpu **)(l->key + key_size) = pptr; -} - -static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) -{ - return *(void __percpu **)(l->key + key_size); -} - -static void htab_percpu_elem_free(struct htab_elem *l) +static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { - free_percpu(htab_elem_get_ptr(l, l->key_size)); + if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) + free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); kfree(l); + } -static void htab_percpu_elem_free_rcu(struct rcu_head *head) +static void htab_elem_free_rcu(struct rcu_head *head) { struct htab_elem *l = container_of(head, struct htab_elem, rcu); + struct bpf_htab *htab = l->htab; - htab_percpu_elem_free(l); + /* must increment bpf_prog_active to avoid kprobe+bpf triggering while + * we're calling kfree, otherwise deadlock is possible if kprobes + * are placed somewhere inside of slub + */ + preempt_disable(); + __this_cpu_inc(bpf_prog_active); + htab_elem_free(htab, l); + __this_cpu_dec(bpf_prog_active); + preempt_enable(); } -static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size) +static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { - if (percpu) { - l->key_size = key_size; - call_rcu(&l->rcu, htab_percpu_elem_free_rcu); + if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) { + pcpu_freelist_push(&htab->freelist, &l->fnode); } else { - kfree_rcu(l, rcu); + atomic_dec(&htab->count); + l->htab = htab; + call_rcu(&l->rcu, htab_elem_free_rcu); } } @@ -293,23 +384,39 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, bool percpu, bool onallcpus) { u32 size = htab->map.value_size; + bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC); struct htab_elem *l_new; void __percpu *pptr; - l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); - if (!l_new) - return NULL; + if (prealloc) { + l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist); + if (!l_new) + return ERR_PTR(-E2BIG); + } else { + if (atomic_inc_return(&htab->count) > htab->map.max_entries) { + atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); + if (!l_new) + return ERR_PTR(-ENOMEM); + } memcpy(l_new->key, key, key_size); if (percpu) { /* round up value_size to 8 bytes */ size = round_up(size, 8); - /* alloc_percpu zero-fills */ - pptr = __alloc_percpu_gfp(size, 8, GFP_ATOMIC | __GFP_NOWARN); - if (!pptr) { - kfree(l_new); - return NULL; + if (prealloc) { + pptr = htab_elem_get_ptr(l_new, key_size); + } else { + /* alloc_percpu zero-fills */ + pptr = __alloc_percpu_gfp(size, 8, + GFP_ATOMIC | __GFP_NOWARN); + if (!pptr) { + kfree(l_new); + return ERR_PTR(-ENOMEM); + } } if (!onallcpus) { @@ -324,7 +431,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, off += size; } } - htab_elem_set_ptr(l_new, key_size, pptr); + if (!prealloc) + htab_elem_set_ptr(l_new, key_size, pptr); } else { memcpy(l_new->key + round_up(key_size, 8), value, size); } @@ -336,12 +444,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, u64 map_flags) { - if (!l_old && unlikely(atomic_read(&htab->count) >= htab->map.max_entries)) - /* if elem with this 'key' doesn't exist and we've reached - * max_entries limit, fail insertion of new elem - */ - return -E2BIG; - if (l_old && map_flags == BPF_NOEXIST) /* elem already exists */ return -EEXIST; @@ -375,13 +477,6 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, hash = htab_map_hash(key, key_size); - /* allocate new element outside of the lock, since - * we're most likley going to insert it - */ - l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false); - if (!l_new) - return -ENOMEM; - b = __select_bucket(htab, hash); head = &b->head; @@ -394,21 +489,24 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (ret) goto err; + l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false); + if (IS_ERR(l_new)) { + /* all pre-allocated elements are in use or memory exhausted */ + ret = PTR_ERR(l_new); + goto err; + } + /* add new element to the head of the list, so that * concurrent search will find it before old elem */ hlist_add_head_rcu(&l_new->hash_node, head); if (l_old) { hlist_del_rcu(&l_old->hash_node); - kfree_rcu(l_old, rcu); - } else { - atomic_inc(&htab->count); + free_htab_elem(htab, l_old); } - raw_spin_unlock_irqrestore(&b->lock, flags); - return 0; + ret = 0; err: raw_spin_unlock_irqrestore(&b->lock, flags); - kfree(l_new); return ret; } @@ -466,12 +564,11 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, } else { l_new = alloc_htab_elem(htab, key, value, key_size, hash, true, onallcpus); - if (!l_new) { - ret = -ENOMEM; + if (IS_ERR(l_new)) { + ret = PTR_ERR(l_new); goto err; } hlist_add_head_rcu(&l_new->hash_node, head); - atomic_inc(&htab->count); } ret = 0; err: @@ -489,7 +586,6 @@ static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, static int htab_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_HASH; struct hlist_head *head; struct bucket *b; struct htab_elem *l; @@ -511,8 +607,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) if (l) { hlist_del_rcu(&l->hash_node); - atomic_dec(&htab->count); - free_htab_elem(l, percpu, key_size); + free_htab_elem(htab, l); ret = 0; } @@ -531,17 +626,10 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_del_rcu(&l->hash_node); - atomic_dec(&htab->count); - if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) { - l->key_size = htab->map.key_size; - htab_percpu_elem_free(l); - } else { - kfree(l); - } + htab_elem_free(htab, l); } } } - /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void htab_map_free(struct bpf_map *map) { @@ -554,10 +642,16 @@ static void htab_map_free(struct bpf_map *map) */ synchronize_rcu(); - /* some of kfree_rcu() callbacks for elements of this map may not have - * executed. It's ok. Proceed to free residual elements and map itself + /* some of free_htab_elem() callbacks for elements of this map may + * not have executed. Wait for them. */ - delete_all_elements(htab); + rcu_barrier(); + if (htab->map.map_flags & BPF_F_NO_PREALLOC) { + delete_all_elements(htab); + } else { + htab_free_elems(htab); + pcpu_freelist_destroy(&htab->freelist); + } kvfree(htab->buckets); kfree(htab); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dc99f6a000f5..cbd94b2144ff 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -48,6 +48,19 @@ void bpf_register_map_type(struct bpf_map_type_list *tl) list_add(&tl->list_node, &bpf_map_types); } +int bpf_map_precharge_memlock(u32 pages) +{ + struct user_struct *user = get_current_user(); + unsigned long memlock_limit, cur; + + memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + cur = atomic_long_read(&user->locked_vm); + free_uid(user); + if (cur + pages > memlock_limit) + return -EPERM; + return 0; +} + static int bpf_map_charge_memlock(struct bpf_map *map) { struct user_struct *user = get_current_user(); @@ -153,7 +166,7 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -#define BPF_MAP_CREATE_LAST_FIELD max_entries +#define BPF_MAP_CREATE_LAST_FIELD map_flags /* called via syscall */ static int map_create(union bpf_attr *attr) { -- cgit v1.2.3 From 4018ab1875e0d00b84ac61bc15427136ad55849e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 9 Mar 2016 03:00:05 +0100 Subject: bpf: support flow label for bpf_skb_{set, get}_tunnel_key This patch extends bpf_tunnel_key with a tunnel_label member, that maps to ip_tunnel_key's label so underlying backends like vxlan and geneve can propagate the label to udp_tunnel6_xmit_skb(), where it's being set in the IPv6 header. It allows for having 20 more bits to encode/decode flow related meta information programmatically. Tested with vxlan and geneve. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0e30b19012a5..924f537183fd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -375,6 +375,7 @@ struct bpf_tunnel_key { }; __u8 tunnel_tos; __u8 tunnel_ttl; + __u32 tunnel_label; }; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/net/core/filter.c b/net/core/filter.c index a66dc03c261f..6fc3893a6170 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1770,12 +1770,15 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) return -EPROTO; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { + case offsetof(struct bpf_tunnel_key, tunnel_label): + goto set_compat; case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have * a common path later on. */ if (ip_tunnel_info_af(info) != AF_INET) return -EINVAL; +set_compat: to = (struct bpf_tunnel_key *)compat; break; default: @@ -1787,11 +1790,13 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; - if (flags & BPF_F_TUNINFO_IPV6) + if (flags & BPF_F_TUNINFO_IPV6) { memcpy(to->remote_ipv6, &info->key.u.ipv6.src, sizeof(to->remote_ipv6)); - else + to->tunnel_label = be32_to_cpu(info->key.label); + } else { to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); + } if (unlikely(size != sizeof(struct bpf_tunnel_key))) memcpy((void *)(long) r2, to, size); @@ -1850,6 +1855,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { + case offsetof(struct bpf_tunnel_key, tunnel_label): case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have * a common path later on. @@ -1862,6 +1868,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) return -EINVAL; } } + if (unlikely(!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label)) + return -EINVAL; skb_dst_drop(skb); dst_hold((struct dst_entry *) md); @@ -1882,6 +1890,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) info->mode |= IP_TUNNEL_INFO_IPV6; memcpy(&info->key.u.ipv6.dst, from->remote_ipv6, sizeof(from->remote_ipv6)); + info->key.label = cpu_to_be32(from->tunnel_label) & + IPV6_FLOWLABEL_MASK; } else { info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); if (flags & BPF_F_ZERO_CSUM_TX) -- cgit v1.2.3