From e1701c68c1d1aeb3213d7016593ea9a1d4309417 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Sat, 24 Mar 2007 12:46:02 -0700
Subject: [NET]: Fix fib_rules compatibility breakage

Based upon a patch from Patrick McHardy.

The fib_rules netlink attribute policy introduced in 2.6.19 broke
userspace compatibilty. When specifying a rule with "from all"
or "to all", iproute adds a zero byte long netlink attribute,
but the policy requires all addresses to have a size equal to
sizeof(struct in_addr)/sizeof(struct in6_addr), resulting in a
validation error.

Check attribute length of FRA_SRC/FRA_DST in the generic framework
by letting the family specific rules implementation provide the
length of an address. Report an error if address length is non
zero but no address attribute is provided. Fix actual bug by
checking address length for non-zero instead of relying on
availability of attribute.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/fib_rules.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index bc3c26494c3d..d585ea9fa97d 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -34,6 +34,7 @@ struct fib_rules_ops
 	int			family;
 	struct list_head	list;
 	int			rule_size;
+	int			addr_size;
 
 	int			(*action)(struct fib_rule *,
 					  struct flowi *, int,
-- 
cgit v1.2.3


From ecbb416939da77c0d107409976499724baddce7b Mon Sep 17 00:00:00 2001
From: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Date: Sat, 24 Mar 2007 12:52:16 -0700
Subject: [NET]: Fix neighbour destructor handling.

->neigh_destructor() is killed (not used), replaced with
->neigh_cleanup(), which is called when neighbor entry goes to dead
state. At this point everything is still valid: neigh->dev,
neigh->parms etc.

The device should guarantee that dead neighbor entries (neigh->dead !=
0) do not get private part initialized, otherwise nobody will cleanup
it.

I think this is enough for ipoib which is the only user of this thing.
Initialization private part of neighbor entries happens in ipib
start_xmit routine, which is not reached when device is down.  But it
would be better to add explicit test for neigh->dead in any case.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/ulp/ipoib/ipoib_main.c |  6 +++---
 include/net/neighbour.h                   |  2 +-
 net/atm/clip.c                            |  9 ---------
 net/core/neighbour.c                      | 14 ++++++++++----
 4 files changed, 14 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 0741c6d1337c..f2a40ae8e7d0 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -814,7 +814,7 @@ static void ipoib_set_mcast_list(struct net_device *dev)
 	queue_work(ipoib_workqueue, &priv->restart_task);
 }
 
-static void ipoib_neigh_destructor(struct neighbour *n)
+static void ipoib_neigh_cleanup(struct neighbour *n)
 {
 	struct ipoib_neigh *neigh;
 	struct ipoib_dev_priv *priv = netdev_priv(n->dev);
@@ -822,7 +822,7 @@ static void ipoib_neigh_destructor(struct neighbour *n)
 	struct ipoib_ah *ah = NULL;
 
 	ipoib_dbg(priv,
-		  "neigh_destructor for %06x " IPOIB_GID_FMT "\n",
+		  "neigh_cleanup for %06x " IPOIB_GID_FMT "\n",
 		  IPOIB_QPN(n->ha),
 		  IPOIB_GID_RAW_ARG(n->ha + 4));
 
@@ -874,7 +874,7 @@ void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh)
 
 static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms)
 {
-	parms->neigh_destructor = ipoib_neigh_destructor;
+	parms->neigh_cleanup = ipoib_neigh_cleanup;
 
 	return 0;
 }
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 3725b93c52f3..ad7fe1121412 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -36,7 +36,7 @@ struct neigh_parms
 	struct net_device *dev;
 	struct neigh_parms *next;
 	int	(*neigh_setup)(struct neighbour *);
-	void	(*neigh_destructor)(struct neighbour *);
+	void	(*neigh_cleanup)(struct neighbour *);
 	struct neigh_table *tbl;
 
 	void	*sysctl_table;
diff --git a/net/atm/clip.c b/net/atm/clip.c
index ebb5d0ce8b6f..8c3825816085 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -261,14 +261,6 @@ static void clip_pop(struct atm_vcc *vcc, struct sk_buff *skb)
 	spin_unlock_irqrestore(&PRIV(dev)->xoff_lock, flags);
 }
 
-static void clip_neigh_destroy(struct neighbour *neigh)
-{
-	DPRINTK("clip_neigh_destroy (neigh %p)\n", neigh);
-	if (NEIGH2ENTRY(neigh)->vccs)
-		printk(KERN_CRIT "clip_neigh_destroy: vccs != NULL !!!\n");
-	NEIGH2ENTRY(neigh)->vccs = (void *) NEIGHBOR_DEAD;
-}
-
 static void clip_neigh_solicit(struct neighbour *neigh, struct sk_buff *skb)
 {
 	DPRINTK("clip_neigh_solicit (neigh %p, skb %p)\n", neigh, skb);
@@ -342,7 +334,6 @@ static struct neigh_table clip_tbl = {
 	/* parameters are copied from ARP ... */
 	.parms = {
 		.tbl 			= &clip_tbl,
-		.neigh_destructor	= clip_neigh_destroy,
 		.base_reachable_time 	= 30 * HZ,
 		.retrans_time 		= 1 * HZ,
 		.gc_staletime 		= 60 * HZ,
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 3183142c6044..cfc60019cf92 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -140,6 +140,8 @@ static int neigh_forced_gc(struct neigh_table *tbl)
 				n->dead = 1;
 				shrunk	= 1;
 				write_unlock(&n->lock);
+				if (n->parms->neigh_cleanup)
+					n->parms->neigh_cleanup(n);
 				neigh_release(n);
 				continue;
 			}
@@ -211,6 +213,8 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
 				NEIGH_PRINTK2("neigh %p is stray.\n", n);
 			}
 			write_unlock(&n->lock);
+			if (n->parms->neigh_cleanup)
+				n->parms->neigh_cleanup(n);
 			neigh_release(n);
 		}
 	}
@@ -582,9 +586,6 @@ void neigh_destroy(struct neighbour *neigh)
 			kfree(hh);
 	}
 
-	if (neigh->parms->neigh_destructor)
-		(neigh->parms->neigh_destructor)(neigh);
-
 	skb_queue_purge(&neigh->arp_queue);
 
 	dev_put(neigh->dev);
@@ -675,6 +676,8 @@ static void neigh_periodic_timer(unsigned long arg)
 			*np = n->next;
 			n->dead = 1;
 			write_unlock(&n->lock);
+			if (n->parms->neigh_cleanup)
+				n->parms->neigh_cleanup(n);
 			neigh_release(n);
 			continue;
 		}
@@ -2088,8 +2091,11 @@ void __neigh_for_each_release(struct neigh_table *tbl,
 			} else
 				np = &n->next;
 			write_unlock(&n->lock);
-			if (release)
+			if (release) {
+				if (n->parms->neigh_cleanup)
+					n->parms->neigh_cleanup(n);
 				neigh_release(n);
+			}
 		}
 	}
 }
-- 
cgit v1.2.3


From f11e6659ce9058928d73ff440f9b40a818d628ab Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Sat, 24 Mar 2007 20:36:25 -0700
Subject: [IPV6]: Fix routing round-robin locking.

As per RFC2461, section 6.3.6, item #2, when no routers on the
matching list are known to be reachable or probably reachable we
do round robin on those available routes so that we make sure
to probe as many of them as possible to detect when one becomes
reachable faster.

Each routing table has a rwlock protecting the tree and the linked
list of routes at each leaf.  The round robin code executes during
lookup and thus with the rwlock taken as a reader.  A small local
spinlock tries to provide protection but this does not work at all
for two reasons:

1) The round-robin list manipulation, as coded, goes like this (with
   read lock held):

	walk routes finding head and tail

	spin_lock();
	rotate list using head and tail
	spin_unlock();

   While one thread is rotating the list, another thread can
   end up with stale values of head and tail and then proceed
   to corrupt the list when it gets the lock.  This ends up causing
   the OOPS in fib6_add() later onthat many people have been hitting.

2) All the other code paths that run with the rwlock held as
   a reader do not expect the list to change on them, they
   expect it to remain completely fixed while they hold the
   lock in that way.

So, simply stated, it is impossible to implement this correctly using
a manipulation of the list without violating the rwlock locking
semantics.

Reimplement using a per-fib6_node round-robin pointer.  This way we
don't need to manipulate the list at all, and since the round-robin
pointer can only ever point to real existing entries we don't need
to perform any locking on the changing of the round-robin pointer
itself.  We only need to reset the round-robin pointer to NULL when
the entry it is pointing to is removed.

The idea is from Thomas Graf and it is very similar to how this
was implemented before the advanced router selection code when in.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  1 +
 net/ipv6/ip6_fib.c    |  8 +++++
 net/ipv6/route.c      | 97 +++++++++++++++++++++++++++++++--------------------
 3 files changed, 68 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 9eda572a2a65..cf355a3c2ad5 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -58,6 +58,7 @@ struct fib6_node
 	__u16			fn_bit;		/* bit key */
 	__u16			fn_flags;
 	__u32			fn_sernum;
+	struct rt6_info		*rr_ptr;
 };
 
 #ifndef CONFIG_IPV6_SUBTREES
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index f4d7be77eb0f..268f476ef3db 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -658,6 +658,10 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 		ins = &iter->u.dst.rt6_next;
 	}
 
+	/* Reset round-robin state, if necessary */
+	if (ins == &fn->leaf)
+		fn->rr_ptr = NULL;
+
 	/*
 	 *	insert node
 	 */
@@ -1109,6 +1113,10 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
 	rt6_stats.fib_rt_entries--;
 	rt6_stats.fib_discarded_routes++;
 
+	/* Reset round-robin state, if necessary */
+	if (fn->rr_ptr == rt)
+		fn->rr_ptr = NULL;
+
 	/* Adjust walkers */
 	read_lock(&fib6_walker_lock);
 	FOR_WALKERS(w) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index a6b3117df546..3931b33b25e8 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -363,55 +363,76 @@ static int rt6_score_route(struct rt6_info *rt, int oif,
 	return m;
 }
 
-static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
-				   int strict)
+static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
+				   int *mpri, struct rt6_info *match)
 {
-	struct rt6_info *match = NULL, *last = NULL;
-	struct rt6_info *rt, *rt0 = *head;
-	u32 metric;
+	int m;
+
+	if (rt6_check_expired(rt))
+		goto out;
+
+	m = rt6_score_route(rt, oif, strict);
+	if (m < 0)
+		goto out;
+
+	if (m > *mpri) {
+		if (strict & RT6_LOOKUP_F_REACHABLE)
+			rt6_probe(match);
+		*mpri = m;
+		match = rt;
+	} else if (strict & RT6_LOOKUP_F_REACHABLE) {
+		rt6_probe(rt);
+	}
+
+out:
+	return match;
+}
+
+static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
+				     struct rt6_info *rr_head,
+				     u32 metric, int oif, int strict)
+{
+	struct rt6_info *rt, *match;
 	int mpri = -1;
 
-	RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
-		  __FUNCTION__, head, head ? *head : NULL, oif);
+	match = NULL;
+	for (rt = rr_head; rt && rt->rt6i_metric == metric;
+	     rt = rt->u.dst.rt6_next)
+		match = find_match(rt, oif, strict, &mpri, match);
+	for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
+	     rt = rt->u.dst.rt6_next)
+		match = find_match(rt, oif, strict, &mpri, match);
 
-	for (rt = rt0, metric = rt0->rt6i_metric;
-	     rt && rt->rt6i_metric == metric && (!last || rt != rt0);
-	     rt = rt->u.dst.rt6_next) {
-		int m;
+	return match;
+}
 
-		if (rt6_check_expired(rt))
-			continue;
+static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
+{
+	struct rt6_info *match, *rt0;
 
-		last = rt;
+	RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
+		  __FUNCTION__, fn->leaf, oif);
 
-		m = rt6_score_route(rt, oif, strict);
-		if (m < 0)
-			continue;
+	rt0 = fn->rr_ptr;
+	if (!rt0)
+		fn->rr_ptr = rt0 = fn->leaf;
 
-		if (m > mpri) {
-			if (strict & RT6_LOOKUP_F_REACHABLE)
-				rt6_probe(match);
-			match = rt;
-			mpri = m;
-		} else if (strict & RT6_LOOKUP_F_REACHABLE) {
-			rt6_probe(rt);
-		}
-	}
+	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
 
 	if (!match &&
-	    (strict & RT6_LOOKUP_F_REACHABLE) &&
-	    last && last != rt0) {
+	    (strict & RT6_LOOKUP_F_REACHABLE)) {
+		struct rt6_info *next = rt0->u.dst.rt6_next;
+
 		/* no entries matched; do round-robin */
-		static DEFINE_SPINLOCK(lock);
-		spin_lock(&lock);
-		*head = rt0->u.dst.rt6_next;
-		rt0->u.dst.rt6_next = last->u.dst.rt6_next;
-		last->u.dst.rt6_next = rt0;
-		spin_unlock(&lock);
+		if (!next || next->rt6i_metric != rt0->rt6i_metric)
+			next = fn->leaf;
+
+		if (next != rt0)
+			fn->rr_ptr = next;
 	}
 
-	RT6_TRACE("%s() => %p, score=%d\n",
-		  __FUNCTION__, match, mpri);
+	RT6_TRACE("%s() => %p\n",
+		  __FUNCTION__, match);
 
 	return (match ? match : &ip6_null_entry);
 }
@@ -657,7 +678,7 @@ restart_2:
 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
 
 restart:
-	rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
+	rt = rt6_select(fn, fl->iif, strict | reachable);
 	BACKTRACK(&fl->fl6_src);
 	if (rt == &ip6_null_entry ||
 	    rt->rt6i_flags & RTF_CACHE)
@@ -752,7 +773,7 @@ restart_2:
 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
 
 restart:
-	rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
+	rt = rt6_select(fn, fl->oif, strict | reachable);
 	BACKTRACK(&fl->fl6_src);
 	if (rt == &ip6_null_entry ||
 	    rt->rt6i_flags & RTF_CACHE)
-- 
cgit v1.2.3