From e1701c68c1d1aeb3213d7016593ea9a1d4309417 Mon Sep 17 00:00:00 2001 From: Thomas Graf <tgraf@suug.ch> Date: Sat, 24 Mar 2007 12:46:02 -0700 Subject: [NET]: Fix fib_rules compatibility breakage Based upon a patch from Patrick McHardy. The fib_rules netlink attribute policy introduced in 2.6.19 broke userspace compatibility. When specifying a rule with "from all" or "to all", iproute adds a zero byte long netlink attribute, but the policy requires all addresses to have a size equal to sizeof(struct in_addr)/sizeof(struct in6_addr), resulting in a validation error. Check attribute length of FRA_SRC/FRA_DST in the generic framework by letting the family specific rules implementation provide the length of an address. Report an error if address length is non zero but no address attribute is provided. Fix actual bug by checking address length for non-zero instead of relying on availability of attribute. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: Patrick McHardy <kaber@trash.net> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/net/fib_rules.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index bc3c26494c3d..d585ea9fa97d 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -34,6 +34,7 @@ struct fib_rules_ops int family; struct list_head list; int rule_size; + int addr_size; int (*action)(struct fib_rule *, struct flowi *, int, -- cgit v1.2.3 From ecbb416939da77c0d107409976499724baddce7b Mon Sep 17 00:00:00 2001 From: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> Date: Sat, 24 Mar 2007 12:52:16 -0700 Subject: [NET]: Fix neighbour destructor handling. ->neigh_destructor() is killed (not used), replaced with ->neigh_cleanup(), which is called when neighbor entry goes to dead state. At this point everything is still valid: neigh->dev, neigh->parms etc. 
The device should guarantee that dead neighbor entries (neigh->dead != 0) do not get private part initialized, otherwise nobody will clean it up. I think this is enough for ipoib which is the only user of this thing. Initialization of the private part of neighbor entries happens in the ipoib start_xmit routine, which is not reached when device is down. But it would be better to add explicit test for neigh->dead in any case. Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 6 +++--- include/net/neighbour.h | 2 +- net/atm/clip.c | 9 --------- net/core/neighbour.c | 14 ++++++++++---- 4 files changed, 14 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 0741c6d1337c..f2a40ae8e7d0 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -814,7 +814,7 @@ static void ipoib_set_mcast_list(struct net_device *dev) queue_work(ipoib_workqueue, &priv->restart_task); } -static void ipoib_neigh_destructor(struct neighbour *n) +static void ipoib_neigh_cleanup(struct neighbour *n) { struct ipoib_neigh *neigh; struct ipoib_dev_priv *priv = netdev_priv(n->dev); @@ -822,7 +822,7 @@ static void ipoib_neigh_destructor(struct neighbour *n) struct ipoib_ah *ah = NULL; ipoib_dbg(priv, - "neigh_destructor for %06x " IPOIB_GID_FMT "\n", + "neigh_cleanup for %06x " IPOIB_GID_FMT "\n", IPOIB_QPN(n->ha), IPOIB_GID_RAW_ARG(n->ha + 4)); @@ -874,7 +874,7 @@ void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh) static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms) { - parms->neigh_destructor = ipoib_neigh_destructor; + parms->neigh_cleanup = ipoib_neigh_cleanup; return 0; } diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 3725b93c52f3..ad7fe1121412 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -36,7 +36,7 @@ struct 
neigh_parms struct net_device *dev; struct neigh_parms *next; int (*neigh_setup)(struct neighbour *); - void (*neigh_destructor)(struct neighbour *); + void (*neigh_cleanup)(struct neighbour *); struct neigh_table *tbl; void *sysctl_table; diff --git a/net/atm/clip.c b/net/atm/clip.c index ebb5d0ce8b6f..8c3825816085 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -261,14 +261,6 @@ static void clip_pop(struct atm_vcc *vcc, struct sk_buff *skb) spin_unlock_irqrestore(&PRIV(dev)->xoff_lock, flags); } -static void clip_neigh_destroy(struct neighbour *neigh) -{ - DPRINTK("clip_neigh_destroy (neigh %p)\n", neigh); - if (NEIGH2ENTRY(neigh)->vccs) - printk(KERN_CRIT "clip_neigh_destroy: vccs != NULL !!!\n"); - NEIGH2ENTRY(neigh)->vccs = (void *) NEIGHBOR_DEAD; -} - static void clip_neigh_solicit(struct neighbour *neigh, struct sk_buff *skb) { DPRINTK("clip_neigh_solicit (neigh %p, skb %p)\n", neigh, skb); @@ -342,7 +334,6 @@ static struct neigh_table clip_tbl = { /* parameters are copied from ARP ... 
*/ .parms = { .tbl = &clip_tbl, - .neigh_destructor = clip_neigh_destroy, .base_reachable_time = 30 * HZ, .retrans_time = 1 * HZ, .gc_staletime = 60 * HZ, diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 3183142c6044..cfc60019cf92 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -140,6 +140,8 @@ static int neigh_forced_gc(struct neigh_table *tbl) n->dead = 1; shrunk = 1; write_unlock(&n->lock); + if (n->parms->neigh_cleanup) + n->parms->neigh_cleanup(n); neigh_release(n); continue; } @@ -211,6 +213,8 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) NEIGH_PRINTK2("neigh %p is stray.\n", n); } write_unlock(&n->lock); + if (n->parms->neigh_cleanup) + n->parms->neigh_cleanup(n); neigh_release(n); } } @@ -582,9 +586,6 @@ void neigh_destroy(struct neighbour *neigh) kfree(hh); } - if (neigh->parms->neigh_destructor) - (neigh->parms->neigh_destructor)(neigh); - skb_queue_purge(&neigh->arp_queue); dev_put(neigh->dev); @@ -675,6 +676,8 @@ static void neigh_periodic_timer(unsigned long arg) *np = n->next; n->dead = 1; write_unlock(&n->lock); + if (n->parms->neigh_cleanup) + n->parms->neigh_cleanup(n); neigh_release(n); continue; } @@ -2088,8 +2091,11 @@ void __neigh_for_each_release(struct neigh_table *tbl, } else np = &n->next; write_unlock(&n->lock); - if (release) + if (release) { + if (n->parms->neigh_cleanup) + n->parms->neigh_cleanup(n); neigh_release(n); + } } } } -- cgit v1.2.3 From f11e6659ce9058928d73ff440f9b40a818d628ab Mon Sep 17 00:00:00 2001 From: "David S. Miller" <davem@sunset.davemloft.net> Date: Sat, 24 Mar 2007 20:36:25 -0700 Subject: [IPV6]: Fix routing round-robin locking. As per RFC2461, section 6.3.6, item #2, when no routers on the matching list are known to be reachable or probably reachable we do round robin on those available routes so that we make sure to probe as many of them as possible to detect when one becomes reachable faster. 
Each routing table has a rwlock protecting the tree and the linked list of routes at each leaf. The round robin code executes during lookup and thus with the rwlock taken as a reader. A small local spinlock tries to provide protection but this does not work at all for two reasons: 1) The round-robin list manipulation, as coded, goes like this (with read lock held): walk routes finding head and tail spin_lock(); rotate list using head and tail spin_unlock(); While one thread is rotating the list, another thread can end up with stale values of head and tail and then proceed to corrupt the list when it gets the lock. This ends up causing the OOPS in fib6_add() later on that many people have been hitting. 2) All the other code paths that run with the rwlock held as a reader do not expect the list to change on them, they expect it to remain completely fixed while they hold the lock in that way. So, simply stated, it is impossible to implement this correctly using a manipulation of the list without violating the rwlock locking semantics. Reimplement using a per-fib6_node round-robin pointer. This way we don't need to manipulate the list at all, and since the round-robin pointer can only ever point to real existing entries we don't need to perform any locking on the changing of the round-robin pointer itself. We only need to reset the round-robin pointer to NULL when the entry it is pointing to is removed. The idea is from Thomas Graf and it is very similar to how this was implemented before the advanced router selection code went in. Signed-off-by: David S. 
Miller <davem@davemloft.net> --- include/net/ip6_fib.h | 1 + net/ipv6/ip6_fib.c | 8 +++++ net/ipv6/route.c | 97 +++++++++++++++++++++++++++++++-------------------- 3 files changed, 68 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 9eda572a2a65..cf355a3c2ad5 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -58,6 +58,7 @@ struct fib6_node __u16 fn_bit; /* bit key */ __u16 fn_flags; __u32 fn_sernum; + struct rt6_info *rr_ptr; }; #ifndef CONFIG_IPV6_SUBTREES diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index f4d7be77eb0f..268f476ef3db 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -658,6 +658,10 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, ins = &iter->u.dst.rt6_next; } + /* Reset round-robin state, if necessary */ + if (ins == &fn->leaf) + fn->rr_ptr = NULL; + /* * insert node */ @@ -1109,6 +1113,10 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, rt6_stats.fib_rt_entries--; rt6_stats.fib_discarded_routes++; + /* Reset round-robin state, if necessary */ + if (fn->rr_ptr == rt) + fn->rr_ptr = NULL; + /* Adjust walkers */ read_lock(&fib6_walker_lock); FOR_WALKERS(w) { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a6b3117df546..3931b33b25e8 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -363,55 +363,76 @@ static int rt6_score_route(struct rt6_info *rt, int oif, return m; } -static struct rt6_info *rt6_select(struct rt6_info **head, int oif, - int strict) +static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, + int *mpri, struct rt6_info *match) { - struct rt6_info *match = NULL, *last = NULL; - struct rt6_info *rt, *rt0 = *head; - u32 metric; + int m; + + if (rt6_check_expired(rt)) + goto out; + + m = rt6_score_route(rt, oif, strict); + if (m < 0) + goto out; + + if (m > *mpri) { + if (strict & RT6_LOOKUP_F_REACHABLE) + rt6_probe(match); + *mpri = m; + match = rt; + } 
else if (strict & RT6_LOOKUP_F_REACHABLE) { + rt6_probe(rt); + } + +out: + return match; +} + +static struct rt6_info *find_rr_leaf(struct fib6_node *fn, + struct rt6_info *rr_head, + u32 metric, int oif, int strict) +{ + struct rt6_info *rt, *match; int mpri = -1; - RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n", - __FUNCTION__, head, head ? *head : NULL, oif); + match = NULL; + for (rt = rr_head; rt && rt->rt6i_metric == metric; + rt = rt->u.dst.rt6_next) + match = find_match(rt, oif, strict, &mpri, match); + for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric; + rt = rt->u.dst.rt6_next) + match = find_match(rt, oif, strict, &mpri, match); - for (rt = rt0, metric = rt0->rt6i_metric; - rt && rt->rt6i_metric == metric && (!last || rt != rt0); - rt = rt->u.dst.rt6_next) { - int m; + return match; +} - if (rt6_check_expired(rt)) - continue; +static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict) +{ + struct rt6_info *match, *rt0; - last = rt; + RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n", + __FUNCTION__, fn->leaf, oif); - m = rt6_score_route(rt, oif, strict); - if (m < 0) - continue; + rt0 = fn->rr_ptr; + if (!rt0) + fn->rr_ptr = rt0 = fn->leaf; - if (m > mpri) { - if (strict & RT6_LOOKUP_F_REACHABLE) - rt6_probe(match); - match = rt; - mpri = m; - } else if (strict & RT6_LOOKUP_F_REACHABLE) { - rt6_probe(rt); - } - } + match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict); if (!match && - (strict & RT6_LOOKUP_F_REACHABLE) && - last && last != rt0) { + (strict & RT6_LOOKUP_F_REACHABLE)) { + struct rt6_info *next = rt0->u.dst.rt6_next; + /* no entries matched; do round-robin */ - static DEFINE_SPINLOCK(lock); - spin_lock(&lock); - *head = rt0->u.dst.rt6_next; - rt0->u.dst.rt6_next = last->u.dst.rt6_next; - last->u.dst.rt6_next = rt0; - spin_unlock(&lock); + if (!next || next->rt6i_metric != rt0->rt6i_metric) + next = fn->leaf; + + if (next != rt0) + fn->rr_ptr = next; } - RT6_TRACE("%s() => %p, score=%d\n", - __FUNCTION__, match, 
mpri); + RT6_TRACE("%s() => %p\n", + __FUNCTION__, match); return (match ? match : &ip6_null_entry); } @@ -657,7 +678,7 @@ restart_2: fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src); restart: - rt = rt6_select(&fn->leaf, fl->iif, strict | reachable); + rt = rt6_select(fn, fl->iif, strict | reachable); BACKTRACK(&fl->fl6_src); if (rt == &ip6_null_entry || rt->rt6i_flags & RTF_CACHE) @@ -752,7 +773,7 @@ restart_2: fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src); restart: - rt = rt6_select(&fn->leaf, fl->oif, strict | reachable); + rt = rt6_select(fn, fl->oif, strict | reachable); BACKTRACK(&fl->fl6_src); if (rt == &ip6_null_entry || rt->rt6i_flags & RTF_CACHE) -- cgit v1.2.3