summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorAlexei Starovoitov <ast@fb.com>2017-03-21 19:05:04 -0700
committerDavid S. Miller <davem@davemloft.net>2017-03-22 14:12:18 -0700
commit8c290e60fa2a51806159522331c9ed41252a8fb3 (patch)
tree9b19f17d6d003d05cf5f17a6b60c7ac719c5ef53 /kernel
parentdd1ef79120e1600cb48320cf80a612ee6510110c (diff)
downloadlwn-8c290e60fa2a51806159522331c9ed41252a8fb3.tar.gz
lwn-8c290e60fa2a51806159522331c9ed41252a8fb3.zip
bpf: fix hashmap extra_elems logic
In both kmalloc and prealloc mode the bpf_map_update_elem() is using per-cpu extra_elems to do atomic update when the map is full. There are two issues with it. The logic can be misused, since it allows max_entries+num_cpus elements to be present in the map. And alloc_extra_elems() at map creation time can fail percpu alloc for large map values with a warn: WARNING: CPU: 3 PID: 2752 at ../mm/percpu.c:892 pcpu_alloc+0x119/0xa60 illegal size (32824) or align (8) for percpu allocation The fixes for both of these issues are different for kmalloc and prealloc modes. For prealloc mode allocate extra num_possible_cpus elements and store their pointers into extra_elems array instead of actual elements. Hence we can use these hidden(spare) elements not only when the map is full but during bpf_map_update_elem() that replaces existing element too. That also improves performance, since pcpu_freelist_pop/push is avoided. Unfortunately this approach cannot be used for kmalloc mode which needs to kfree elements after rcu grace period. Therefore switch it back to normal kmalloc even when full and old element exists like it was prior to commit 6c9059817432 ("bpf: pre-allocate hash map elements"). Add tests to check for over max_entries and large map values. Reported-by: Dave Jones <davej@codemonkey.org.uk> Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements") Signed-off-by: Alexei Starovoitov <ast@kernel.org> Acked-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Martin KaFai Lau <kafai@fb.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/hashtab.c144
1 files changed, 71 insertions, 73 deletions
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index afe5bab376c9..361a69dfe543 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -30,18 +30,12 @@ struct bpf_htab {
struct pcpu_freelist freelist;
struct bpf_lru lru;
};
- void __percpu *extra_elems;
+ struct htab_elem *__percpu *extra_elems;
atomic_t count; /* number of elements in this hashtable */
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
};
-enum extra_elem_state {
- HTAB_NOT_AN_EXTRA_ELEM = 0,
- HTAB_EXTRA_ELEM_FREE,
- HTAB_EXTRA_ELEM_USED
-};
-
/* each htab element is struct htab_elem + key + value */
struct htab_elem {
union {
@@ -56,7 +50,6 @@ struct htab_elem {
};
union {
struct rcu_head rcu;
- enum extra_elem_state state;
struct bpf_lru_node lru_node;
};
u32 hash;
@@ -77,6 +70,11 @@ static bool htab_is_percpu(const struct bpf_htab *htab)
htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}
+static bool htab_is_prealloc(const struct bpf_htab *htab)
+{
+ return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
+}
+
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
void __percpu *pptr)
{
@@ -128,17 +126,20 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
static int prealloc_init(struct bpf_htab *htab)
{
+ u32 num_entries = htab->map.max_entries;
int err = -ENOMEM, i;
- htab->elems = bpf_map_area_alloc(htab->elem_size *
- htab->map.max_entries);
+ if (!htab_is_percpu(htab) && !htab_is_lru(htab))
+ num_entries += num_possible_cpus();
+
+ htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries);
if (!htab->elems)
return -ENOMEM;
if (!htab_is_percpu(htab))
goto skip_percpu_elems;
- for (i = 0; i < htab->map.max_entries; i++) {
+ for (i = 0; i < num_entries; i++) {
u32 size = round_up(htab->map.value_size, 8);
void __percpu *pptr;
@@ -166,11 +167,11 @@ skip_percpu_elems:
if (htab_is_lru(htab))
bpf_lru_populate(&htab->lru, htab->elems,
offsetof(struct htab_elem, lru_node),
- htab->elem_size, htab->map.max_entries);
+ htab->elem_size, num_entries);
else
pcpu_freelist_populate(&htab->freelist,
htab->elems + offsetof(struct htab_elem, fnode),
- htab->elem_size, htab->map.max_entries);
+ htab->elem_size, num_entries);
return 0;
@@ -191,16 +192,22 @@ static void prealloc_destroy(struct bpf_htab *htab)
static int alloc_extra_elems(struct bpf_htab *htab)
{
- void __percpu *pptr;
+ struct htab_elem *__percpu *pptr, *l_new;
+ struct pcpu_freelist_node *l;
int cpu;
- pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
+ pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8,
+ GFP_USER | __GFP_NOWARN);
if (!pptr)
return -ENOMEM;
for_each_possible_cpu(cpu) {
- ((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
- HTAB_EXTRA_ELEM_FREE;
+ l = pcpu_freelist_pop(&htab->freelist);
+ /* pop will succeed, since prealloc_init()
+ * preallocated extra num_possible_cpus elements
+ */
+ l_new = container_of(l, struct htab_elem, fnode);
+ *per_cpu_ptr(pptr, cpu) = l_new;
}
htab->extra_elems = pptr;
return 0;
@@ -342,25 +349,25 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
raw_spin_lock_init(&htab->buckets[i].lock);
}
- if (!percpu && !lru) {
- /* lru itself can remove the least used element, so
- * there is no need for an extra elem during map_update.
- */
- err = alloc_extra_elems(htab);
- if (err)
- goto free_buckets;
- }
-
if (prealloc) {
err = prealloc_init(htab);
if (err)
- goto free_extra_elems;
+ goto free_buckets;
+
+ if (!percpu && !lru) {
+ /* lru itself can remove the least used element, so
+ * there is no need for an extra elem during map_update.
+ */
+ err = alloc_extra_elems(htab);
+ if (err)
+ goto free_prealloc;
+ }
}
return &htab->map;
-free_extra_elems:
- free_percpu(htab->extra_elems);
+free_prealloc:
+ prealloc_destroy(htab);
free_buckets:
bpf_map_area_free(htab->buckets);
free_htab:
@@ -575,12 +582,7 @@ static void htab_elem_free_rcu(struct rcu_head *head)
static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
- if (l->state == HTAB_EXTRA_ELEM_USED) {
- l->state = HTAB_EXTRA_ELEM_FREE;
- return;
- }
-
- if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
+ if (htab_is_prealloc(htab)) {
pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
atomic_dec(&htab->count);
@@ -610,47 +612,43 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
void *value, u32 key_size, u32 hash,
bool percpu, bool onallcpus,
- bool old_elem_exists)
+ struct htab_elem *old_elem)
{
u32 size = htab->map.value_size;
- bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
- struct htab_elem *l_new;
+ bool prealloc = htab_is_prealloc(htab);
+ struct htab_elem *l_new, **pl_new;
void __percpu *pptr;
- int err = 0;
if (prealloc) {
- struct pcpu_freelist_node *l;
+ if (old_elem) {
+ /* if we're updating the existing element,
+ * use per-cpu extra elems to avoid freelist_pop/push
+ */
+ pl_new = this_cpu_ptr(htab->extra_elems);
+ l_new = *pl_new;
+ *pl_new = old_elem;
+ } else {
+ struct pcpu_freelist_node *l;
- l = pcpu_freelist_pop(&htab->freelist);
- if (!l)
- err = -E2BIG;
- else
+ l = pcpu_freelist_pop(&htab->freelist);
+ if (!l)
+ return ERR_PTR(-E2BIG);
l_new = container_of(l, struct htab_elem, fnode);
- } else {
- if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
- atomic_dec(&htab->count);
- err = -E2BIG;
- } else {
- l_new = kmalloc(htab->elem_size,
- GFP_ATOMIC | __GFP_NOWARN);
- if (!l_new)
- return ERR_PTR(-ENOMEM);
}
- }
-
- if (err) {
- if (!old_elem_exists)
- return ERR_PTR(err);
-
- /* if we're updating the existing element and the hash table
- * is full, use per-cpu extra elems
- */
- l_new = this_cpu_ptr(htab->extra_elems);
- if (l_new->state != HTAB_EXTRA_ELEM_FREE)
- return ERR_PTR(-E2BIG);
- l_new->state = HTAB_EXTRA_ELEM_USED;
} else {
- l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
+ if (atomic_inc_return(&htab->count) > htab->map.max_entries)
+ if (!old_elem) {
+ /* when map is full and update() is replacing
+ * old element, it's ok to allocate, since
+ * old element will be freed immediately.
+ * Otherwise return an error
+ */
+ atomic_dec(&htab->count);
+ return ERR_PTR(-E2BIG);
+ }
+ l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+ if (!l_new)
+ return ERR_PTR(-ENOMEM);
}
memcpy(l_new->key, key, key_size);
@@ -731,7 +729,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
goto err;
l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
- !!l_old);
+ l_old);
if (IS_ERR(l_new)) {
/* all pre-allocated elements are in use or memory exhausted */
ret = PTR_ERR(l_new);
@@ -744,7 +742,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
if (l_old) {
hlist_nulls_del_rcu(&l_old->hash_node);
- free_htab_elem(htab, l_old);
+ if (!htab_is_prealloc(htab))
+ free_htab_elem(htab, l_old);
}
ret = 0;
err:
@@ -856,7 +855,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
value, onallcpus);
} else {
l_new = alloc_htab_elem(htab, key, value, key_size,
- hash, true, onallcpus, false);
+ hash, true, onallcpus, NULL);
if (IS_ERR(l_new)) {
ret = PTR_ERR(l_new);
goto err;
@@ -1024,8 +1023,7 @@ static void delete_all_elements(struct bpf_htab *htab)
hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
hlist_nulls_del_rcu(&l->hash_node);
- if (l->state != HTAB_EXTRA_ELEM_USED)
- htab_elem_free(htab, l);
+ htab_elem_free(htab, l);
}
}
}
@@ -1045,7 +1043,7 @@ static void htab_map_free(struct bpf_map *map)
* not have executed. Wait for them.
*/
rcu_barrier();
- if (htab->map.map_flags & BPF_F_NO_PREALLOC)
+ if (!htab_is_prealloc(htab))
delete_all_elements(htab);
else
prealloc_destroy(htab);