summaryrefslogtreecommitdiff
path: root/net/ipv4/fib_trie.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/fib_trie.c')
-rw-r--r--net/ipv4/fib_trie.c351
1 files changed, 260 insertions, 91 deletions
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index e2ffc2a5c7db..2919d1a10cfd 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -50,7 +50,7 @@
#define VERSION "0.409"
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -73,6 +73,7 @@
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
+#include <linux/notifier.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -80,10 +81,136 @@
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
-#include <net/switchdev.h>
#include <trace/events/fib.h>
#include "fib_lookup.h"
+static unsigned int fib_seq_sum(void)
+{
+ unsigned int fib_seq = 0;
+ struct net *net;
+
+ rtnl_lock();
+ for_each_net(net)
+ fib_seq += net->ipv4.fib_seq;
+ rtnl_unlock();
+
+ return fib_seq;
+}
+
+static ATOMIC_NOTIFIER_HEAD(fib_chain);
+
+static int call_fib_notifier(struct notifier_block *nb, struct net *net,
+ enum fib_event_type event_type,
+ struct fib_notifier_info *info)
+{
+ info->net = net;
+ return nb->notifier_call(nb, event_type, info);
+}
+
+static void fib_rules_notify(struct net *net, struct notifier_block *nb,
+ enum fib_event_type event_type)
+{
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ struct fib_notifier_info info;
+
+ if (net->ipv4.fib_has_custom_rules)
+ call_fib_notifier(nb, net, event_type, &info);
+#endif
+}
+
+static void fib_notify(struct net *net, struct notifier_block *nb,
+ enum fib_event_type event_type);
+
+static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
+ enum fib_event_type event_type, u32 dst,
+ int dst_len, struct fib_info *fi,
+ u8 tos, u8 type, u32 tb_id, u32 nlflags)
+{
+ struct fib_entry_notifier_info info = {
+ .dst = dst,
+ .dst_len = dst_len,
+ .fi = fi,
+ .tos = tos,
+ .type = type,
+ .tb_id = tb_id,
+ .nlflags = nlflags,
+ };
+ return call_fib_notifier(nb, net, event_type, &info.info);
+}
+
+static bool fib_dump_is_consistent(struct notifier_block *nb,
+ void (*cb)(struct notifier_block *nb),
+ unsigned int fib_seq)
+{
+ atomic_notifier_chain_register(&fib_chain, nb);
+ if (fib_seq == fib_seq_sum())
+ return true;
+ atomic_notifier_chain_unregister(&fib_chain, nb);
+ if (cb)
+ cb(nb);
+ return false;
+}
+
+#define FIB_DUMP_MAX_RETRIES 5
+int register_fib_notifier(struct notifier_block *nb,
+ void (*cb)(struct notifier_block *nb))
+{
+ int retries = 0;
+
+ do {
+ unsigned int fib_seq = fib_seq_sum();
+ struct net *net;
+
+ /* Mutex semantics guarantee that every change done to
+ * FIB tries before we read the change sequence counter
+ * is now visible to us.
+ */
+ rcu_read_lock();
+ for_each_net_rcu(net) {
+ fib_rules_notify(net, nb, FIB_EVENT_RULE_ADD);
+ fib_notify(net, nb, FIB_EVENT_ENTRY_ADD);
+ }
+ rcu_read_unlock();
+
+ if (fib_dump_is_consistent(nb, cb, fib_seq))
+ return 0;
+ } while (++retries < FIB_DUMP_MAX_RETRIES);
+
+ return -EBUSY;
+}
+EXPORT_SYMBOL(register_fib_notifier);
+
+int unregister_fib_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&fib_chain, nb);
+}
+EXPORT_SYMBOL(unregister_fib_notifier);
+
+int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
+ struct fib_notifier_info *info)
+{
+ net->ipv4.fib_seq++;
+ info->net = net;
+ return atomic_notifier_call_chain(&fib_chain, event_type, info);
+}
+
+static int call_fib_entry_notifiers(struct net *net,
+ enum fib_event_type event_type, u32 dst,
+ int dst_len, struct fib_info *fi,
+ u8 tos, u8 type, u32 tb_id, u32 nlflags)
+{
+ struct fib_entry_notifier_info info = {
+ .dst = dst,
+ .dst_len = dst_len,
+ .fi = fi,
+ .tos = tos,
+ .type = type,
+ .tb_id = tb_id,
+ .nlflags = nlflags,
+ };
+ return call_fib_notifiers(net, event_type, &info.info);
+}
+
#define MAX_STAT_DEPTH 32
#define KEYLENGTH (8*sizeof(t_key))
@@ -681,6 +808,13 @@ static unsigned char update_suffix(struct key_vector *tn)
{
unsigned char slen = tn->pos;
unsigned long stride, i;
+ unsigned char slen_max;
+
+ /* only vector 0 can have a suffix length greater than or equal to
+ * tn->pos + tn->bits, the second highest node will have a suffix
+ * length at most of tn->pos + tn->bits - 1
+ */
+ slen_max = min_t(unsigned char, tn->pos + tn->bits - 1, tn->slen);
/* search though the list of children looking for nodes that might
* have a suffix greater than the one we currently have. This is
@@ -698,12 +832,8 @@ static unsigned char update_suffix(struct key_vector *tn)
slen = n->slen;
i &= ~(stride - 1);
- /* if slen covers all but the last bit we can stop here
- * there will be nothing longer than that since only node
- * 0 and 1 << (bits - 1) could have that as their suffix
- * length.
- */
- if ((slen + 1) >= (tn->pos + tn->bits))
+ /* stop searching if we have hit the maximum possible value */
+ if (slen >= slen_max)
break;
}
@@ -875,39 +1005,27 @@ static struct key_vector *resize(struct trie *t, struct key_vector *tn)
return collapse(t, tn);
/* update parent in case halve failed */
- tp = node_parent(tn);
-
- /* Return if at least one deflate was run */
- if (max_work != MAX_WORK)
- return tp;
-
- /* push the suffix length to the parent node */
- if (tn->slen > tn->pos) {
- unsigned char slen = update_suffix(tn);
-
- if (slen > tp->slen)
- tp->slen = slen;
- }
-
- return tp;
+ return node_parent(tn);
}
-static void leaf_pull_suffix(struct key_vector *tp, struct key_vector *l)
+static void node_pull_suffix(struct key_vector *tn, unsigned char slen)
{
- while ((tp->slen > tp->pos) && (tp->slen > l->slen)) {
- if (update_suffix(tp) > l->slen)
+ unsigned char node_slen = tn->slen;
+
+ while ((node_slen > tn->pos) && (node_slen > slen)) {
+ slen = update_suffix(tn);
+ if (node_slen == slen)
break;
- tp = node_parent(tp);
+
+ tn = node_parent(tn);
+ node_slen = tn->slen;
}
}
-static void leaf_push_suffix(struct key_vector *tn, struct key_vector *l)
+static void node_push_suffix(struct key_vector *tn, unsigned char slen)
{
- /* if this is a new leaf then tn will be NULL and we can sort
- * out parent suffix lengths as a part of trie_rebalance
- */
- while (tn->slen < l->slen) {
- tn->slen = l->slen;
+ while (tn->slen < slen) {
+ tn->slen = slen;
tn = node_parent(tn);
}
}
@@ -1028,6 +1146,7 @@ static int fib_insert_node(struct trie *t, struct key_vector *tp,
}
/* Case 3: n is NULL, and will just insert a new leaf */
+ node_push_suffix(tp, new->fa_slen);
NODE_INIT_PARENT(l, tp);
put_child_root(tp, key, l);
trie_rebalance(t, tp);
@@ -1069,19 +1188,20 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp,
/* if we added to the tail node then we need to update slen */
if (l->slen < new->fa_slen) {
l->slen = new->fa_slen;
- leaf_push_suffix(tp, l);
+ node_push_suffix(tp, new->fa_slen);
}
return 0;
}
/* Caller must hold RTNL. */
-int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
+int fib_table_insert(struct net *net, struct fib_table *tb,
+ struct fib_config *cfg)
{
struct trie *t = (struct trie *)tb->tb_data;
struct fib_alias *fa, *new_fa;
struct key_vector *l, *tp;
- unsigned int nlflags = 0;
+ u16 nlflags = NLM_F_EXCL;
struct fib_info *fi;
u8 plen = cfg->fc_dst_len;
u8 slen = KEYLENGTH - plen;
@@ -1126,6 +1246,8 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
if (cfg->fc_nlflags & NLM_F_EXCL)
goto out;
+ nlflags &= ~NLM_F_EXCL;
+
/* We have 2 goals:
* 1. Find exact match for type, scope, fib_info to avoid
* duplicate routes
@@ -1151,6 +1273,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
struct fib_info *fi_drop;
u8 state;
+ nlflags |= NLM_F_REPLACE;
fa = fa_first;
if (fa_match) {
if (fa == fa_match)
@@ -1172,17 +1295,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
new_fa->tb_id = tb->tb_id;
new_fa->fa_default = -1;
- err = switchdev_fib_ipv4_add(key, plen, fi,
- new_fa->fa_tos,
- cfg->fc_type,
- cfg->fc_nlflags,
- tb->tb_id);
- if (err) {
- switchdev_fib_ipv4_abort(fi);
- kmem_cache_free(fn_alias_kmem, new_fa);
- goto out;
- }
-
hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
alias_free_mem_rcu(fa);
@@ -1190,8 +1302,13 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
fib_release_info(fi_drop);
if (state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net);
+
+ call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD,
+ key, plen, fi,
+ new_fa->fa_tos, cfg->fc_type,
+ tb->tb_id, cfg->fc_nlflags);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
- tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
+ tb->tb_id, &cfg->fc_nlinfo, nlflags);
goto succeeded;
}
@@ -1203,7 +1320,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
goto out;
if (cfg->fc_nlflags & NLM_F_APPEND)
- nlflags = NLM_F_APPEND;
+ nlflags |= NLM_F_APPEND;
else
fa = fa_first;
}
@@ -1211,6 +1328,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
if (!(cfg->fc_nlflags & NLM_F_CREATE))
goto out;
+ nlflags |= NLM_F_CREATE;
err = -ENOBUFS;
new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
if (!new_fa)
@@ -1224,30 +1342,22 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
new_fa->tb_id = tb->tb_id;
new_fa->fa_default = -1;
- /* (Optionally) offload fib entry to switch hardware. */
- err = switchdev_fib_ipv4_add(key, plen, fi, tos, cfg->fc_type,
- cfg->fc_nlflags, tb->tb_id);
- if (err) {
- switchdev_fib_ipv4_abort(fi);
- goto out_free_new_fa;
- }
-
/* Insert new entry to the list. */
err = fib_insert_alias(t, tp, l, new_fa, fa, key);
if (err)
- goto out_sw_fib_del;
+ goto out_free_new_fa;
if (!plen)
tb->tb_num_default++;
rt_cache_flush(cfg->fc_nlinfo.nl_net);
+ call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, key, plen, fi, tos,
+ cfg->fc_type, tb->tb_id, cfg->fc_nlflags);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
&cfg->fc_nlinfo, nlflags);
succeeded:
return 0;
-out_sw_fib_del:
- switchdev_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id);
out_free_new_fa:
kmem_cache_free(fn_alias_kmem, new_fa);
out:
@@ -1470,6 +1580,8 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp,
* out parent suffix lengths as a part of trie_rebalance
*/
if (hlist_empty(&l->leaf)) {
+ if (tp->slen == l->slen)
+ node_pull_suffix(tp, tp->pos);
put_child_root(tp, l->key, NULL);
node_free(l);
trie_rebalance(t, tp);
@@ -1482,11 +1594,12 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp,
/* update the trie with the latest suffix length */
l->slen = fa->fa_slen;
- leaf_pull_suffix(tp, l);
+ node_pull_suffix(tp, fa->fa_slen);
}
/* Caller must hold RTNL. */
-int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
+int fib_table_delete(struct net *net, struct fib_table *tb,
+ struct fib_config *cfg)
{
struct trie *t = (struct trie *) tb->tb_data;
struct fib_alias *fa, *fa_to_delete;
@@ -1539,9 +1652,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
if (!fa_to_delete)
return -ESRCH;
- switchdev_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos,
- cfg->fc_type, tb->tb_id);
-
+ call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen,
+ fa_to_delete->fa_info, tos, cfg->fc_type,
+ tb->tb_id, 0);
rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
&cfg->fc_nlinfo, 0);
@@ -1713,8 +1826,10 @@ struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
local_l = fib_find_node(lt, &local_tp, l->key);
if (fib_insert_alias(lt, local_tp, local_l, new_fa,
- NULL, l->key))
+ NULL, l->key)) {
+ kmem_cache_free(fn_alias_kmem, new_fa);
goto out;
+ }
}
/* stop loop if key wrapped back to 0 */
@@ -1751,6 +1866,10 @@ void fib_table_flush_external(struct fib_table *tb)
if (IS_TRIE(pn))
break;
+ /* update the suffix to address pulled leaves */
+ if (pn->slen > pn->pos)
+ update_suffix(pn);
+
/* resize completed node */
pn = resize(t, pn);
cindex = get_index(pkey, pn);
@@ -1772,8 +1891,6 @@ void fib_table_flush_external(struct fib_table *tb)
}
hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
- struct fib_info *fi = fa->fa_info;
-
/* if alias was cloned to local then we just
* need to remove the local copy from main
*/
@@ -1785,13 +1902,6 @@ void fib_table_flush_external(struct fib_table *tb)
/* record local slen */
slen = fa->fa_slen;
-
- if (!fi || !(fi->fib_flags & RTNH_F_OFFLOAD))
- continue;
-
- switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen,
- fi, fa->fa_tos, fa->fa_type,
- tb->tb_id);
}
/* update leaf slen */
@@ -1805,7 +1915,7 @@ void fib_table_flush_external(struct fib_table *tb)
}
/* Caller must hold RTNL. */
-int fib_table_flush(struct fib_table *tb)
+int fib_table_flush(struct net *net, struct fib_table *tb)
{
struct trie *t = (struct trie *)tb->tb_data;
struct key_vector *pn = t->kv;
@@ -1826,6 +1936,10 @@ int fib_table_flush(struct fib_table *tb)
if (IS_TRIE(pn))
break;
+ /* update the suffix to address pulled leaves */
+ if (pn->slen > pn->pos)
+ update_suffix(pn);
+
/* resize completed node */
pn = resize(t, pn);
cindex = get_index(pkey, pn);
@@ -1854,9 +1968,11 @@ int fib_table_flush(struct fib_table *tb)
continue;
}
- switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen,
- fi, fa->fa_tos, fa->fa_type,
- tb->tb_id);
+ call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
+ n->key,
+ KEYLENGTH - fa->fa_slen,
+ fi, fa->fa_tos, fa->fa_type,
+ tb->tb_id, 0);
hlist_del_rcu(&fa->fa_list);
fib_release_info(fa->fa_info);
alias_free_mem_rcu(fa);
@@ -1876,6 +1992,62 @@ int fib_table_flush(struct fib_table *tb)
return found;
}
+static void fib_leaf_notify(struct net *net, struct key_vector *l,
+ struct fib_table *tb, struct notifier_block *nb,
+ enum fib_event_type event_type)
+{
+ struct fib_alias *fa;
+
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ if (!fi)
+ continue;
+
+ /* local and main table can share the same trie,
+ * so don't notify twice for the same entry.
+ */
+ if (tb->tb_id != fa->tb_id)
+ continue;
+
+ call_fib_entry_notifier(nb, net, event_type, l->key,
+ KEYLENGTH - fa->fa_slen, fi, fa->fa_tos,
+ fa->fa_type, fa->tb_id, 0);
+ }
+}
+
+static void fib_table_notify(struct net *net, struct fib_table *tb,
+ struct notifier_block *nb,
+ enum fib_event_type event_type)
+{
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *l, *tp = t->kv;
+ t_key key = 0;
+
+ while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
+ fib_leaf_notify(net, l, tb, nb, event_type);
+
+ key = l->key + 1;
+ /* stop in case of wrap around */
+ if (key < l->key)
+ break;
+ }
+}
+
+static void fib_notify(struct net *net, struct notifier_block *nb,
+ enum fib_event_type event_type)
+{
+ unsigned int h;
+
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ struct fib_table *tb;
+
+ hlist_for_each_entry_rcu(tb, head, tb_hlist)
+ fib_table_notify(net, tb, nb, event_type);
+ }
+}
+
static void __trie_free_rcu(struct rcu_head *head)
{
struct fib_table *tb = container_of(head, struct fib_table, rcu);
@@ -2455,22 +2627,19 @@ static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter,
struct key_vector *l, **tp = &iter->tnode;
t_key key;
- /* use cache location of next-to-find key */
+ /* use cached location of previously found key */
if (iter->pos > 0 && pos >= iter->pos) {
- pos -= iter->pos;
key = iter->key;
} else {
- iter->pos = 0;
+ iter->pos = 1;
key = 0;
}
- while ((l = leaf_walk_rcu(tp, key)) != NULL) {
+ pos -= iter->pos;
+
+ while ((l = leaf_walk_rcu(tp, key)) && (pos-- > 0)) {
key = l->key + 1;
iter->pos++;
-
- if (--pos <= 0)
- break;
-
l = NULL;
/* handle unlikely case of a key wrap */
@@ -2479,7 +2648,7 @@ static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter,
}
if (l)
- iter->key = key; /* remember it */
+ iter->key = l->key; /* remember it */
else
iter->pos = 0; /* forget it */
@@ -2507,7 +2676,7 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
return fib_route_get_idx(iter, *pos);
iter->pos = 0;
- iter->key = 0;
+ iter->key = KEY_MAX;
return SEQ_START_TOKEN;
}
@@ -2516,7 +2685,7 @@ static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct fib_route_iter *iter = seq->private;
struct key_vector *l = NULL;
- t_key key = iter->key;
+ t_key key = iter->key + 1;
++*pos;
@@ -2525,7 +2694,7 @@ static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
l = leaf_walk_rcu(&iter->tnode, key);
if (l) {
- iter->key = l->key + 1;
+ iter->key = l->key;
iter->pos++;
} else {
iter->pos = 0;