From dfd25ffffc132c00070eed64200e8950da5d7e9d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 10 Mar 2012 09:20:21 +0000 Subject: tcp: fix syncookie regression commit ea4fc0d619 (ipv4: Don't use rt->rt_{src,dst} in ip_queue_xmit()) added a serious regression on synflood handling. Simon Kirby discovered a successful connection was delayed by 20 seconds before being responsive. In my tests, I discovered that xmit frames were lost, and needed ~4 retransmits and a socket dst rebuild before being really sent. In case of syncookie initiated connection, we use a different path to initialize the socket dst, and inet->cork.fl.u.ip4 is left cleared. As ip_queue_xmit() now depends on inet flow being setup, fix this by copying the temp flowi4 we use in cookie_v4_check(). Reported-by: Simon Kirby Bisected-by: Simon Kirby Signed-off-by: Eric Dumazet Tested-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/syncookies.c | 30 ++++++++++++++++-------------- net/ipv4/tcp_ipv4.c | 10 +++++++--- 2 files changed, 23 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 51fdbb490437..eab2a7fb15d1 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -278,6 +278,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct rtable *rt; __u8 rcv_wscale; bool ecn_ok = false; + struct flowi4 fl4; if (!sysctl_tcp_syncookies || !th->ack || th->rst) goto out; @@ -346,20 +347,16 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, * hasn't changed since we received the original syn, but I see * no easy way to do this. */ - { - struct flowi4 fl4; - - flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk), - RT_SCOPE_UNIVERSE, IPPROTO_TCP, - inet_sk_flowi_flags(sk), - (opt && opt->srr) ? opt->faddr : ireq->rmt_addr, - ireq->loc_addr, th->source, th->dest); - security_req_classify_flow(req, flowi4_to_flowi(&fl4)); - rt = ip_route_output_key(sock_net(sk), &fl4); - if (IS_ERR(rt)) { - reqsk_free(req); - goto out; - } + flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk), + RT_SCOPE_UNIVERSE, IPPROTO_TCP, + inet_sk_flowi_flags(sk), + (opt && opt->srr) ? opt->faddr : ireq->rmt_addr, + ireq->loc_addr, th->source, th->dest); + security_req_classify_flow(req, flowi4_to_flowi(&fl4)); + rt = ip_route_output_key(sock_net(sk), &fl4); + if (IS_ERR(rt)) { + reqsk_free(req); + goto out; } /* Try to redo what tcp_v4_send_synack did. */ @@ -373,5 +370,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, ireq->rcv_wscale = rcv_wscale; ret = get_cookie_sock(sk, skb, req, &rt->dst); + /* ip_queue_xmit() depends on our flow being setup + * Normal sockets get it right from inet_csk_route_child_sock() + */ + if (ret) + inet_sk(ret)->cork.fl.u.ip4 = fl4; out: return ret; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 94d683a61cba..fd54c5f8a255 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1466,9 +1466,13 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = newtp->write_seq ^ jiffies; - if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL) - goto put_and_exit; - + if (!dst) { + dst = inet_csk_route_child_sock(sk, newsk, req); + if (!dst) + goto put_and_exit; + } else { + /* syncookie case : see end of cookie_v4_check() */ + } sk_setup_caps(newsk, dst); tcp_mtup_init(newsk); -- cgit v1.2.3 From 122bdf67f15a22bcabf6c2cab3a545d17ccf68dc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 14 Mar 2012 21:13:11 +0000 Subject: ipv6: fix icmp6_dst_alloc() commit 87a115783 ( ipv6: Move xfrm_lookup() call down into icmp6_dst_alloc().) forgot to convert one error path, leading to crashes in mld_sendpack() Many thanks to Dave Jones for providing a very complete bug report. Reported-by: Dave Jones Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8c2e3ab58f2a..22b766407de1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1077,7 +1077,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct net *net = dev_net(dev); if (unlikely(!idev)) - return NULL; + return ERR_PTR(-ENODEV); rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0); if (unlikely(!rt)) { -- cgit v1.2.3 From cc34eb672eedb5ff248ac3bf9971a76f141fd141 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Mar 2012 18:04:25 +0000 Subject: sch_sfq: revert dont put new flow at the end of flows This reverts commit d47a0ac7b6 (sch_sfq: dont put new flow at the end of flows) As Jesper found out, patch sounded great but has bad side effects. In stress situation, pushing new flows in front of the queue can prevent old flows doing any progress. Packets can stay in SFQ queue for unlimited amount of time. It's possible to add heuristics to limit this problem, but this would add complexity outside of SFQ scope. A more sensible answer to Dave Taht concerns (who reported the issued I tried to solve in original commit) is probably to use a qdisc hierarchy so that high prio packets dont enter a potentially crowded SFQ qdisc. Reported-by: Jesper Dangaard Brouer Cc: Dave Taht Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_sfq.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 60d47180f043..02a21abea65e 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -469,11 +469,15 @@ enqueue: if (slot->qlen == 1) { /* The flow is new */ if (q->tail == NULL) { /* It is the first flow */ slot->next = x; - q->tail = slot; } else { slot->next = q->tail->next; q->tail->next = x; } + /* We put this flow at the end of our flow list. + * This might sound unfair for a new flow to wait after old ones, + * but we could endup servicing new flows only, and freeze old ones. + */ + q->tail = slot; /* We could use a bigger initial quantum for new flows */ slot->allot = q->scaled_quantum; } -- cgit v1.2.3 From c577923756b7fe9071f28a76b66b83b306d1d001 Mon Sep 17 00:00:00 2001 From: "RongQing.Li" Date: Thu, 15 Mar 2012 22:54:14 +0000 Subject: ipv6: Don't dev_hold(dev) in ip6_mc_find_dev_rcu. ip6_mc_find_dev_rcu() is called with rcu_read_lock(), so don't need to dev_hold(). With dev_hold(), not corresponding dev_put(), will lead to leak. [ bug introduced in 96b52e61be1 (ipv6: mcast: RCU conversions) ] Signed-off-by: RongQing.Li Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/mcast.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index b853f06cc148..16c33e308121 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -257,7 +257,6 @@ static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net, if (rt) { dev = rt->dst.dev; - dev_hold(dev); dst_release(&rt->dst); } } else -- cgit v1.2.3 From a16a1647fa6b6783c2e91623e72e86f0c2adac5e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 16 Mar 2012 02:00:34 +0000 Subject: netfilter: ctnetlink: fix race between delete and timeout expiration Kerin Millar reported hardlockups while running `conntrackd -c' in a busy firewall. That system (with several processors) was acting as backup in a primary-backup setup. After several tries, I found a race condition between the deletion operation of ctnetlink and timeout expiration. This patch fixes this problem. Tested-by: Kerin Millar Reported-by: Kerin Millar Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_netlink.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 10687692831e..b49da6c925b3 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -943,20 +943,21 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, } } - if (nf_conntrack_event_report(IPCT_DESTROY, ct, - NETLINK_CB(skb).pid, - nlmsg_report(nlh)) < 0) { + if (del_timer(&ct->timeout)) { + if (nf_conntrack_event_report(IPCT_DESTROY, ct, + NETLINK_CB(skb).pid, + nlmsg_report(nlh)) < 0) { + nf_ct_delete_from_lists(ct); + /* we failed to report the event, try later */ + nf_ct_insert_dying_list(ct); + nf_ct_put(ct); + return 0; + } + /* death_by_timeout would report the event again */ + set_bit(IPS_DYING_BIT, &ct->status); nf_ct_delete_from_lists(ct); - /* we failed to report the event, try later */ - nf_ct_insert_dying_list(ct); nf_ct_put(ct); - return 0; } - - /* death_by_timeout would report the event again */ - set_bit(IPS_DYING_BIT, &ct->status); - - nf_ct_kill(ct); nf_ct_put(ct); return 0; -- cgit v1.2.3