Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next

Pull networking updates from David Miller: 1) New offloading infrastructure and example 'rocker' driver for offloading of switching and routing to hardware. This work was done by a large group of dedicated individuals, not limited to: Scott Feldman, Jiri Pirko, Thomas Graf, John Fastabend, Jamal Hadi Salim, Andy Gospodarek, Florian Fainelli, Roopa Prabhu 2) Start making the networking operate on IOV iterators instead of modifying iov objects in-situ during transfers. Thanks to Al Viro and Herbert Xu. 3) A set of new netlink interfaces for the TIPC stack, from Richard Alpe. 4) Remove unnecessary looping during ipv6 routing lookups, from Martin KaFai Lau. 5) Add PAUSE frame generation support to gianfar driver, from Matei Pavaluca. 6) Allow for larger reordering levels in TCP, which are easily achievable in the real world right now, from Eric Dumazet. 7) Add a variable of napi_schedule that doesn't need to disable cpu interrupts, from Eric Dumazet. 8) Use a doubly linked list to optimize neigh_parms_release(), from Nicolas Dichtel. 9) Various enhancements to the kernel BPF verifier, and allow eBPF programs to actually be attached to sockets. From Alexei Starovoitov. 10) Support TSO/LSO in sunvnet driver, from David L Stevens. 11) Allow controlling ECN usage via routing metrics, from Florian Westphal. 12) Remote checksum offload, from Tom Herbert. 13) Add split-header receive, BQL, and xmit_more support to amd-xgbe driver, from Thomas Lendacky. 14) Add MPLS support to openvswitch, from Simon Horman. 15) Support wildcard tunnel endpoints in ipv6 tunnels, from Steffen Klassert. 16) Do gro flushes on a per-device basis using a timer, from Eric Dumazet. This tries to resolve the conflicting goals between the desired handling of bulk vs. RPC-like traffic. 17) Allow userspace to ask for the CPU upon what a packet was received/steered, via SO_INCOMING_CPU. From Eric Dumazet. 18) Limit GSO packets to half the current congestion window, from Eric Dumazet. 19) Add a generic helper so that all drivers set their RSS keys in a consistent way, from Eric Dumazet. 20) Add xmit_more support to enic driver, from Govindarajulu Varadarajan. 21) Add VLAN packet scheduler action, from Jiri Pirko. 22) Support configurable RSS hash functions via ethtool, from Eyal Perry. * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1820 commits) Fix race condition between vxlan_sock_add and vxlan_sock_release net/macb: fix compilation warning for print_hex_dump() called with skb->mac_header net/mlx4: Add support for A0 steering net/mlx4: Refactor QUERY_PORT net/mlx4_core: Add explicit error message when rule doesn't meet configuration net/mlx4: Add A0 hybrid steering net/mlx4: Add mlx4_bitmap zone allocator net/mlx4: Add a check if there are too many reserved QPs net/mlx4: Change QP allocation scheme net/mlx4_core: Use tasklet for user-space CQ completion events net/mlx4_core: Mask out host side virtualization features for guests net/mlx4_en: Set csum level for encapsulated packets be2net: Export tunnel offloads only when a VxLAN tunnel is created gianfar: Fix dma check map error when DMA_API_DEBUG is enabled cxgb4/csiostor: Don't use MASTER_MUST for fw_hello call net: fec: only enable mdio interrupt before phy device link up net: fec: clear all interrupt events to support i.MX6SX net: fec: reset fep link status in suspend function net: sock: fix access via invalid file descriptor net: introduce helper macro for_each_cmsghdr ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2014-12-11 14:27:06 -0800
committer: Linus Torvalds <torvalds@linux-foundation.org> 2014-12-11 14:27:06 -0800
commit: 70e71ca0af244f48a5dcf56dc435243792e3a495 (patch)
tree: f7d9c4c4d9a857a00043e9bf6aa2d6f533a34778 /kernel
parent: bae41e45b7400496b9bf0c70c6004419d9987819 (diff)
parent: 00c83b01d58068dfeb2e1351cca6fccf2a83fa8f (diff)
download: lwn-70e71ca0af244f48a5dcf56dc435243792e3a495.tar.gz
lwn-70e71ca0af244f48a5dcf56dc435243792e3a495.zip
7 files changed, 750 insertions, 97 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 0daf7f6ae7df..a5ae60f0b0a2 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,5 @@
 obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o
 ifdef CONFIG_TEST_BPF
 obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
 endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
new file mode 100644
index 000000000000..9eb4d8a7cd87
--- /dev/null
+++ b/kernel/bpf/arraymap.c
@@ -0,0 +1,156 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+struct bpf_array {
+	struct bpf_map map;
+	u32 elem_size;
+	char value[0] __aligned(8);
+};
+
+/* Called from syscall */
+static struct bpf_map *array_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_array *array;
+	u32 elem_size, array_size;
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    attr->value_size == 0)
+		return ERR_PTR(-EINVAL);
+
+	elem_size = round_up(attr->value_size, 8);
+
+	/* check round_up into zero and u32 overflow */
+	if (elem_size == 0 ||
+	    attr->max_entries > (U32_MAX - sizeof(*array)) / elem_size)
+		return ERR_PTR(-ENOMEM);
+
+	array_size = sizeof(*array) + attr->max_entries * elem_size;
+
+	/* allocate all map elements and zero-initialize them */
+	array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
+	if (!array) {
+		array = vzalloc(array_size);
+		if (!array)
+			return ERR_PTR(-ENOMEM);
+	}
+
+	/* copy mandatory map attributes */
+	array->map.key_size = attr->key_size;
+	array->map.value_size = attr->value_size;
+	array->map.max_entries = attr->max_entries;
+
+	array->elem_size = elem_size;
+
+	return &array->map;
+}
+
+/* Called from syscall or from eBPF program */
+static void *array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u32 index = *(u32 *)key;
+
+	if (index >= array->map.max_entries)
+		return NULL;
+
+	return array->value + array->elem_size * index;
+}
+
+/* Called from syscall */
+static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u32 index = *(u32 *)key;
+	u32 *next = (u32 *)next_key;
+
+	if (index >= array->map.max_entries) {
+		*next = 0;
+		return 0;
+	}
+
+	if (index == array->map.max_entries - 1)
+		return -ENOENT;
+
+	*next = index + 1;
+	return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
+				 u64 map_flags)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u32 index = *(u32 *)key;
+
+	if (map_flags > BPF_EXIST)
+		/* unknown flags */
+		return -EINVAL;
+
+	if (index >= array->map.max_entries)
+		/* all elements were pre-allocated, cannot insert a new one */
+		return -E2BIG;
+
+	if (map_flags == BPF_NOEXIST)
+		/* all elements already exist */
+		return -EEXIST;
+
+	memcpy(array->value + array->elem_size * index, value, array->elem_size);
+	return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int array_map_delete_elem(struct bpf_map *map, void *key)
+{
+	return -EINVAL;
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void array_map_free(struct bpf_map *map)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+	 * so the programs (can be more than one that used this map) were
+	 * disconnected from events. Wait for outstanding programs to complete
+	 * and free the array
+	 */
+	synchronize_rcu();
+
+	kvfree(array);
+}
+
+static struct bpf_map_ops array_ops = {
+	.map_alloc = array_map_alloc,
+	.map_free = array_map_free,
+	.map_get_next_key = array_map_get_next_key,
+	.map_lookup_elem = array_map_lookup_elem,
+	.map_update_elem = array_map_update_elem,
+	.map_delete_elem = array_map_delete_elem,
+};
+
+static struct bpf_map_type_list tl = {
+	.ops = &array_ops,
+	.type = BPF_MAP_TYPE_ARRAY,
+};
+
+static int __init register_array_map(void)
+{
+	bpf_register_map_type(&tl);
+	return 0;
+}
+late_initcall(register_array_map);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
new file mode 100644
index 000000000000..b3ba43674310
--- /dev/null
+++ b/kernel/bpf/hashtab.c
@@ -0,0 +1,367 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/jhash.h>
+#include <linux/filter.h>
+#include <linux/vmalloc.h>
+
+struct bpf_htab {
+	struct bpf_map map;
+	struct hlist_head *buckets;
+	spinlock_t lock;
+	u32 count;	/* number of elements in this hashtable */
+	u32 n_buckets;	/* number of hash buckets */
+	u32 elem_size;	/* size of each element in bytes */
+};
+
+/* each htab element is struct htab_elem + key + value */
+struct htab_elem {
+	struct hlist_node hash_node;
+	struct rcu_head rcu;
+	u32 hash;
+	char key[0] __aligned(8);
+};
+
+/* Called from syscall */
+static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_htab *htab;
+	int err, i;
+
+	htab = kzalloc(sizeof(*htab), GFP_USER);
+	if (!htab)
+		return ERR_PTR(-ENOMEM);
+
+	/* mandatory map attributes */
+	htab->map.key_size = attr->key_size;
+	htab->map.value_size = attr->value_size;
+	htab->map.max_entries = attr->max_entries;
+
+	/* check sanity of attributes.
+	 * value_size == 0 may be allowed in the future to use map as a set
+	 */
+	err = -EINVAL;
+	if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
+	    htab->map.value_size == 0)
+		goto free_htab;
+
+	/* hash table size must be power of 2 */
+	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
+
+	err = -E2BIG;
+	if (htab->map.key_size > MAX_BPF_STACK)
+		/* eBPF programs initialize keys on stack, so they cannot be
+		 * larger than max stack size
+		 */
+		goto free_htab;
+
+	err = -ENOMEM;
+	/* prevent zero size kmalloc and check for u32 overflow */
+	if (htab->n_buckets == 0 ||
+	    htab->n_buckets > U32_MAX / sizeof(struct hlist_head))
+		goto free_htab;
+
+	htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
+				      GFP_USER | __GFP_NOWARN);
+
+	if (!htab->buckets) {
+		htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
+		if (!htab->buckets)
+			goto free_htab;
+	}
+
+	for (i = 0; i < htab->n_buckets; i++)
+		INIT_HLIST_HEAD(&htab->buckets[i]);
+
+	spin_lock_init(&htab->lock);
+	htab->count = 0;
+
+	htab->elem_size = sizeof(struct htab_elem) +
+			  round_up(htab->map.key_size, 8) +
+			  htab->map.value_size;
+	return &htab->map;
+
+free_htab:
+	kfree(htab);
+	return ERR_PTR(err);
+}
+
+static inline u32 htab_map_hash(const void *key, u32 key_len)
+{
+	return jhash(key, key_len, 0);
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+	return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
+					 void *key, u32 key_size)
+{
+	struct htab_elem *l;
+
+	hlist_for_each_entry_rcu(l, head, hash_node)
+		if (l->hash == hash && !memcmp(&l->key, key, key_size))
+			return l;
+
+	return NULL;
+}
+
+/* Called from syscall or from eBPF program */
+static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_head *head;
+	struct htab_elem *l;
+	u32 hash, key_size;
+
+	/* Must be called with rcu_read_lock. */
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+
+	hash = htab_map_hash(key, key_size);
+
+	head = select_bucket(htab, hash);
+
+	l = lookup_elem_raw(head, hash, key, key_size);
+
+	if (l)
+		return l->key + round_up(map->key_size, 8);
+
+	return NULL;
+}
+
+/* Called from syscall */
+static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_head *head;
+	struct htab_elem *l, *next_l;
+	u32 hash, key_size;
+	int i;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+
+	hash = htab_map_hash(key, key_size);
+
+	head = select_bucket(htab, hash);
+
+	/* lookup the key */
+	l = lookup_elem_raw(head, hash, key, key_size);
+
+	if (!l) {
+		i = 0;
+		goto find_first_elem;
+	}
+
+	/* key was found, get next key in the same bucket */
+	next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+				  struct htab_elem, hash_node);
+
+	if (next_l) {
+		/* if next elem in this hash list is non-zero, just return it */
+		memcpy(next_key, next_l->key, key_size);
+		return 0;
+	}
+
+	/* no more elements in this hash list, go to the next bucket */
+	i = hash & (htab->n_buckets - 1);
+	i++;
+
+find_first_elem:
+	/* iterate over buckets */
+	for (; i < htab->n_buckets; i++) {
+		head = select_bucket(htab, i);
+
+		/* pick first element in the bucket */
+		next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+					  struct htab_elem, hash_node);
+		if (next_l) {
+			/* if it's not empty, just return it */
+			memcpy(next_key, next_l->key, key_size);
+			return 0;
+		}
+	}
+
+	/* itereated over all buckets and all elements */
+	return -ENOENT;
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
+				u64 map_flags)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct htab_elem *l_new, *l_old;
+	struct hlist_head *head;
+	unsigned long flags;
+	u32 key_size;
+	int ret;
+
+	if (map_flags > BPF_EXIST)
+		/* unknown flags */
+		return -EINVAL;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	/* allocate new element outside of lock */
+	l_new = kmalloc(htab->elem_size, GFP_ATOMIC);
+	if (!l_new)
+		return -ENOMEM;
+
+	key_size = map->key_size;
+
+	memcpy(l_new->key, key, key_size);
+	memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
+
+	l_new->hash = htab_map_hash(l_new->key, key_size);
+
+	/* bpf_map_update_elem() can be called in_irq() */
+	spin_lock_irqsave(&htab->lock, flags);
+
+	head = select_bucket(htab, l_new->hash);
+
+	l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
+
+	if (!l_old && unlikely(htab->count >= map->max_entries)) {
+		/* if elem with this 'key' doesn't exist and we've reached
+		 * max_entries limit, fail insertion of new elem
+		 */
+		ret = -E2BIG;
+		goto err;
+	}
+
+	if (l_old && map_flags == BPF_NOEXIST) {
+		/* elem already exists */
+		ret = -EEXIST;
+		goto err;
+	}
+
+	if (!l_old && map_flags == BPF_EXIST) {
+		/* elem doesn't exist, cannot update it */
+		ret = -ENOENT;
+		goto err;
+	}
+
+	/* add new element to the head of the list, so that concurrent
+	 * search will find it before old elem
+	 */
+	hlist_add_head_rcu(&l_new->hash_node, head);
+	if (l_old) {
+		hlist_del_rcu(&l_old->hash_node);
+		kfree_rcu(l_old, rcu);
+	} else {
+		htab->count++;
+	}
+	spin_unlock_irqrestore(&htab->lock, flags);
+
+	return 0;
+err:
+	spin_unlock_irqrestore(&htab->lock, flags);
+	kfree(l_new);
+	return ret;
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_head *head;
+	struct htab_elem *l;
+	unsigned long flags;
+	u32 hash, key_size;
+	int ret = -ENOENT;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+
+	hash = htab_map_hash(key, key_size);
+
+	spin_lock_irqsave(&htab->lock, flags);
+
+	head = select_bucket(htab, hash);
+
+	l = lookup_elem_raw(head, hash, key, key_size);
+
+	if (l) {
+		hlist_del_rcu(&l->hash_node);
+		htab->count--;
+		kfree_rcu(l, rcu);
+		ret = 0;
+	}
+
+	spin_unlock_irqrestore(&htab->lock, flags);
+	return ret;
+}
+
+static void delete_all_elements(struct bpf_htab *htab)
+{
+	int i;
+
+	for (i = 0; i < htab->n_buckets; i++) {
+		struct hlist_head *head = select_bucket(htab, i);
+		struct hlist_node *n;
+		struct htab_elem *l;
+
+		hlist_for_each_entry_safe(l, n, head, hash_node) {
+			hlist_del_rcu(&l->hash_node);
+			htab->count--;
+			kfree(l);
+		}
+	}
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void htab_map_free(struct bpf_map *map)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+	 * so the programs (can be more than one that used this map) were
+	 * disconnected from events. Wait for outstanding critical sections in
+	 * these programs to complete
+	 */
+	synchronize_rcu();
+
+	/* some of kfree_rcu() callbacks for elements of this map may not have
+	 * executed. It's ok. Proceed to free residual elements and map itself
+	 */
+	delete_all_elements(htab);
+	kvfree(htab->buckets);
+	kfree(htab);
+}
+
+static struct bpf_map_ops htab_ops = {
+	.map_alloc = htab_map_alloc,
+	.map_free = htab_map_free,
+	.map_get_next_key = htab_map_get_next_key,
+	.map_lookup_elem = htab_map_lookup_elem,
+	.map_update_elem = htab_map_update_elem,
+	.map_delete_elem = htab_map_delete_elem,
+};
+
+static struct bpf_map_type_list tl = {
+	.ops = &htab_ops,
+	.type = BPF_MAP_TYPE_HASH,
+};
+
+static int __init register_htab_map(void)
+{
+	bpf_register_map_type(&tl);
+	return 0;
+}
+late_initcall(register_htab_map);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
new file mode 100644
index 000000000000..9e3414d85459
--- /dev/null
+++ b/kernel/bpf/helpers.c
@@ -0,0 +1,89 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/rcupdate.h>
+
+/* If kernel subsystem is allowing eBPF programs to call this function,
+ * inside its own verifier_ops->get_func_proto() callback it should return
+ * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments
+ *
+ * Different map implementations will rely on rcu in map methods
+ * lookup/update/delete, therefore eBPF programs must run under rcu lock
+ * if program is allowed to access maps, so check rcu_read_lock_held in
+ * all three functions.
+ */
+static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	/* verifier checked that R1 contains a valid pointer to bpf_map
+	 * and R2 points to a program stack and map->key_size bytes were
+	 * initialized
+	 */
+	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+	void *key = (void *) (unsigned long) r2;
+	void *value;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	value = map->ops->map_lookup_elem(map, key);
+
+	/* lookup() returns either pointer to element value or NULL
+	 * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type
+	 */
+	return (unsigned long) value;
+}
+
+struct bpf_func_proto bpf_map_lookup_elem_proto = {
+	.func = bpf_map_lookup_elem,
+	.gpl_only = false,
+	.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+	.arg1_type = ARG_CONST_MAP_PTR,
+	.arg2_type = ARG_PTR_TO_MAP_KEY,
+};
+
+static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+	void *key = (void *) (unsigned long) r2;
+	void *value = (void *) (unsigned long) r3;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	return map->ops->map_update_elem(map, key, value, r4);
+}
+
+struct bpf_func_proto bpf_map_update_elem_proto = {
+	.func = bpf_map_update_elem,
+	.gpl_only = false,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_CONST_MAP_PTR,
+	.arg2_type = ARG_PTR_TO_MAP_KEY,
+	.arg3_type = ARG_PTR_TO_MAP_VALUE,
+	.arg4_type = ARG_ANYTHING,
+};
+
+static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+	void *key = (void *) (unsigned long) r2;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	return map->ops->map_delete_elem(map, key);
+}
+
+struct bpf_func_proto bpf_map_delete_elem_proto = {
+	.func = bpf_map_delete_elem,
+	.gpl_only = false,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_CONST_MAP_PTR,
+	.arg2_type = ARG_PTR_TO_MAP_KEY,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ba61c8c16032..088ac0b1b106 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -169,7 +169,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (copy_from_user(key, ukey, map->key_size) != 0)
 		goto free_key;
 
-	err = -ESRCH;
+	err = -ENOENT;
 	rcu_read_lock();
 	value = map->ops->map_lookup_elem(map, key);
 	if (!value)
@@ -190,7 +190,7 @@ err_put:
 	return err;
 }
 
-#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value
+#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
 
 static int map_update_elem(union bpf_attr *attr)
 {
@@ -231,7 +231,7 @@ static int map_update_elem(union bpf_attr *attr)
 	 * therefore all map accessors rely on this fact, so do the same here
 	 */
 	rcu_read_lock();
-	err = map->ops->map_update_elem(map, key, value);
+	err = map->ops->map_update_elem(map, key, value, attr->flags);
 	rcu_read_unlock();
 
 free_value:
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
index fcaddff4003e..0ceae1e6e8b5 100644
--- a/kernel/bpf/test_stub.c
+++ b/kernel/bpf/test_stub.c
@@ -18,26 +18,18 @@ struct bpf_context {
 	u64 arg2;
 };
 
-static u64 test_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
-{
-	return 0;
-}
-
-static struct bpf_func_proto test_funcs[] = {
-	[BPF_FUNC_unspec] = {
-		.func = test_func,
-		.gpl_only = true,
-		.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
-		.arg1_type = ARG_CONST_MAP_PTR,
-		.arg2_type = ARG_PTR_TO_MAP_KEY,
-	},
-};
-
 static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id)
 {
-	if (func_id < 0 || func_id >= ARRAY_SIZE(test_funcs))
+	switch (func_id) {
+	case BPF_FUNC_map_lookup_elem:
+		return &bpf_map_lookup_elem_proto;
+	case BPF_FUNC_map_update_elem:
+		return &bpf_map_update_elem_proto;
+	case BPF_FUNC_map_delete_elem:
+		return &bpf_map_delete_elem_proto;
+	default:
 		return NULL;
-	return &test_funcs[func_id];
+	}
 }
 
 static const struct bpf_context_access {
@@ -78,38 +70,8 @@ static struct bpf_prog_type_list tl_prog = {
 	.type = BPF_PROG_TYPE_UNSPEC,
 };
 
-static struct bpf_map *test_map_alloc(union bpf_attr *attr)
-{
-	struct bpf_map *map;
-
-	map = kzalloc(sizeof(*map), GFP_USER);
-	if (!map)
-		return ERR_PTR(-ENOMEM);
-
-	map->key_size = attr->key_size;
-	map->value_size = attr->value_size;
-	map->max_entries = attr->max_entries;
-	return map;
-}
-
-static void test_map_free(struct bpf_map *map)
-{
-	kfree(map);
-}
-
-static struct bpf_map_ops test_map_ops = {
-	.map_alloc = test_map_alloc,
-	.map_free = test_map_free,
-};
-
-static struct bpf_map_type_list tl_map = {
-	.ops = &test_map_ops,
-	.type = BPF_MAP_TYPE_UNSPEC,
-};
-
 static int __init register_test_ops(void)
 {
-	bpf_register_map_type(&tl_map);
 	bpf_register_prog_type(&tl_prog);
 	return 0;
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9f81818f2941..a28e09c7825d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -153,22 +153,19 @@ struct reg_state {
 
 enum bpf_stack_slot_type {
 	STACK_INVALID,    /* nothing was stored in this stack slot */
-	STACK_SPILL,      /* 1st byte of register spilled into stack */
-	STACK_SPILL_PART, /* other 7 bytes of register spill */
+	STACK_SPILL,      /* register spilled into stack */
 	STACK_MISC	  /* BPF program wrote some data into this slot */
 };
 
-struct bpf_stack_slot {
-	enum bpf_stack_slot_type stype;
-	struct reg_state reg_st;
-};
+#define BPF_REG_SIZE 8	/* size of eBPF register in bytes */
 
 /* state of the program:
  * type of all registers and stack info
  */
 struct verifier_state {
 	struct reg_state regs[MAX_BPF_REG];
-	struct bpf_stack_slot stack[MAX_BPF_STACK];
+	u8 stack_slot_type[MAX_BPF_STACK];
+	struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE];
 };
 
 /* linked list of verifier states used to prune search */
@@ -259,10 +256,10 @@ static void print_verifier_state(struct verifier_env *env)
 				env->cur_state.regs[i].map_ptr->key_size,
 				env->cur_state.regs[i].map_ptr->value_size);
 	}
-	for (i = 0; i < MAX_BPF_STACK; i++) {
-		if (env->cur_state.stack[i].stype == STACK_SPILL)
+	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
+		if (env->cur_state.stack_slot_type[i] == STACK_SPILL)
 			verbose(" fp%d=%s", -MAX_BPF_STACK + i,
-				reg_type_str[env->cur_state.stack[i].reg_st.type]);
+				reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]);
 	}
 	verbose("\n");
 }
@@ -539,8 +536,10 @@ static int bpf_size_to_bytes(int bpf_size)
 static int check_stack_write(struct verifier_state *state, int off, int size,
 			     int value_regno)
 {
-	struct bpf_stack_slot *slot;
 	int i;
+	/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
+	 * so it's aligned access and [off, off + size) are within stack limits
+	 */
 
 	if (value_regno >= 0 &&
 	    (state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
@@ -548,30 +547,24 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
 	     state->regs[value_regno].type == PTR_TO_CTX)) {
 
 		/* register containing pointer is being spilled into stack */
-		if (size != 8) {
+		if (size != BPF_REG_SIZE) {
 			verbose("invalid size of register spill\n");
 			return -EACCES;
 		}
 
-		slot = &state->stack[MAX_BPF_STACK + off];
-		slot->stype = STACK_SPILL;
 		/* save register state */
-		slot->reg_st = state->regs[value_regno];
-		for (i = 1; i < 8; i++) {
-			slot = &state->stack[MAX_BPF_STACK + off + i];
-			slot->stype = STACK_SPILL_PART;
-			slot->reg_st.type = UNKNOWN_VALUE;
-			slot->reg_st.map_ptr = NULL;
-		}
-	} else {
+		state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
+			state->regs[value_regno];
 
+		for (i = 0; i < BPF_REG_SIZE; i++)
+			state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
+	} else {
 		/* regular write of data into stack */
-		for (i = 0; i < size; i++) {
-			slot = &state->stack[MAX_BPF_STACK + off + i];
-			slot->stype = STACK_MISC;
-			slot->reg_st.type = UNKNOWN_VALUE;
-			slot->reg_st.map_ptr = NULL;
-		}
+		state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
+			(struct reg_state) {};
+
+		for (i = 0; i < size; i++)
+			state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
 	}
 	return 0;
 }
@@ -579,19 +572,18 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
 static int check_stack_read(struct verifier_state *state, int off, int size,
 			    int value_regno)
 {
+	u8 *slot_type;
 	int i;
-	struct bpf_stack_slot *slot;
 
-	slot = &state->stack[MAX_BPF_STACK + off];
+	slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
 
-	if (slot->stype == STACK_SPILL) {
-		if (size != 8) {
+	if (slot_type[0] == STACK_SPILL) {
+		if (size != BPF_REG_SIZE) {
 			verbose("invalid size of register spill\n");
 			return -EACCES;
 		}
-		for (i = 1; i < 8; i++) {
-			if (state->stack[MAX_BPF_STACK + off + i].stype !=
-			    STACK_SPILL_PART) {
+		for (i = 1; i < BPF_REG_SIZE; i++) {
+			if (slot_type[i] != STACK_SPILL) {
 				verbose("corrupted spill memory\n");
 				return -EACCES;
 			}
@@ -599,12 +591,12 @@ static int check_stack_read(struct verifier_state *state, int off, int size,
 
 		if (value_regno >= 0)
 			/* restore register state from stack */
-			state->regs[value_regno] = slot->reg_st;
+			state->regs[value_regno] =
+				state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE];
 		return 0;
 	} else {
 		for (i = 0; i < size; i++) {
-			if (state->stack[MAX_BPF_STACK + off + i].stype !=
-			    STACK_MISC) {
+			if (slot_type[i] != STACK_MISC) {
 				verbose("invalid read from stack off %d+%d size %d\n",
 					off, i, size);
 				return -EACCES;
@@ -747,7 +739,7 @@ static int check_stack_boundary(struct verifier_env *env,
 	}
 
 	for (i = 0; i < access_size; i++) {
-		if (state->stack[MAX_BPF_STACK + off + i].stype != STACK_MISC) {
+		if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
 			verbose("invalid indirect read from stack off %d+%d size %d\n",
 				off, i, access_size);
 			return -EACCES;
@@ -1180,6 +1172,70 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
 	return 0;
 }
 
+/* verify safety of LD_ABS|LD_IND instructions:
+ * - they can only appear in the programs where ctx == skb
+ * - since they are wrappers of function calls, they scratch R1-R5 registers,
+ *   preserve R6-R9, and store return value into R0
+ *
+ * Implicit input:
+ *   ctx == skb == R6 == CTX
+ *
+ * Explicit input:
+ *   SRC == any register
+ *   IMM == 32-bit immediate
+ *
+ * Output:
+ *   R0 - 8/16/32-bit skb data converted to cpu endianness
+ */
+static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
+{
+	struct reg_state *regs = env->cur_state.regs;
+	u8 mode = BPF_MODE(insn->code);
+	struct reg_state *reg;
+	int i, err;
+
+	if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) {
+		verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n");
+		return -EINVAL;
+	}
+
+	if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
+	    (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
+		verbose("BPF_LD_ABS uses reserved fields\n");
+		return -EINVAL;
+	}
+
+	/* check whether implicit source operand (register R6) is readable */
+	err = check_reg_arg(regs, BPF_REG_6, SRC_OP);
+	if (err)
+		return err;
+
+	if (regs[BPF_REG_6].type != PTR_TO_CTX) {
+		verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
+		return -EINVAL;
+	}
+
+	if (mode == BPF_IND) {
+		/* check explicit source operand */
+		err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+		if (err)
+			return err;
+	}
+
+	/* reset caller saved regs to unreadable */
+	for (i = 0; i < CALLER_SAVED_REGS; i++) {
+		reg = regs + caller_saved[i];
+		reg->type = NOT_INIT;
+		reg->imm = 0;
+	}
+
+	/* mark destination R0 register as readable, since it contains
+	 * the value fetched from the packet
+	 */
+	regs[BPF_REG_0].type = UNKNOWN_VALUE;
+	return 0;
+}
+
 /* non-recursive DFS pseudo code
  * 1  procedure DFS-iterative(G,v):
  * 2      label v as discovered
@@ -1417,12 +1473,33 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
 	}
 
 	for (i = 0; i < MAX_BPF_STACK; i++) {
-		if (memcmp(&old->stack[i], &cur->stack[i],
-			   sizeof(old->stack[0])) != 0) {
-			if (old->stack[i].stype == STACK_INVALID)
-				continue;
+		if (old->stack_slot_type[i] == STACK_INVALID)
+			continue;
+		if (old->stack_slot_type[i] != cur->stack_slot_type[i])
+			/* Ex: old explored (safe) state has STACK_SPILL in
+			 * this stack slot, but current has has STACK_MISC ->
+			 * this verifier states are not equivalent,
+			 * return false to continue verification of this path
+			 */
 			return false;
-		}
+		if (i % BPF_REG_SIZE)
+			continue;
+		if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
+			   &cur->spilled_regs[i / BPF_REG_SIZE],
+			   sizeof(old->spilled_regs[0])))
+			/* when explored and current stack slot types are
+			 * the same, check that stored pointers types
+			 * are the same as well.
+			 * Ex: explored safe path could have stored
+			 * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8}
+			 * but current path has stored:
+			 * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16}
+			 * such verifier states are not equivalent.
+			 * return false to continue verification of this path
+			 */
+			return false;
+		else
+			continue;
 	}
 	return true;
 }
@@ -1664,8 +1741,10 @@ process_bpf_exit:
 			u8 mode = BPF_MODE(insn->code);
 
 			if (mode == BPF_ABS || mode == BPF_IND) {
-				verbose("LD_ABS is not supported yet\n");
-				return -EINVAL;
+				err = check_ld_abs(env, insn);
+				if (err)
+					return err;
+
 			} else if (mode == BPF_IMM) {
 				err = check_ld_imm(env, insn);
 				if (err)
author	Linus Torvalds <torvalds@linux-foundation.org>	2014-12-11 14:27:06 -0800
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-12-11 14:27:06 -0800
commit	70e71ca0af244f48a5dcf56dc435243792e3a495 (patch)
tree	f7d9c4c4d9a857a00043e9bf6aa2d6f533a34778 /kernel
parent	bae41e45b7400496b9bf0c70c6004419d9987819 (diff)
parent	00c83b01d58068dfeb2e1351cca6fccf2a83fa8f (diff)
download	lwn-70e71ca0af244f48a5dcf56dc435243792e3a495.tar.gz lwn-70e71ca0af244f48a5dcf56dc435243792e3a495.zip