From ce447eb91409225f8a488f6b7b2a1bdf7b2d884f Mon Sep 17 00:00:00 2001 From: John Heffner Date: Tue, 29 Apr 2008 03:13:02 -0700 Subject: tcp: Allow send-limited cwnd to grow up to max_burst when gso disabled This changes the logic in tcp_is_cwnd_limited() so that cwnd may grow up to tcp_max_burst() even when sk_can_gso() is false, or when sysctl_tcp_tso_win_divisor != 0. Signed-off-by: John Heffner Signed-off-by: David S. Miller --- net/ipv4/tcp_cong.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 3a6be23d222f..bfb1996bd99f 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -285,14 +285,11 @@ int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) if (in_flight >= tp->snd_cwnd) return 1; - if (!sk_can_gso(sk)) - return 0; - left = tp->snd_cwnd - in_flight; - if (sysctl_tcp_tso_win_divisor) - return left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd; - else - return left <= tcp_max_burst(tp); + if (sk_can_gso(sk) && + left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd) + return 1; + return left <= tcp_max_burst(tp); } EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); -- cgit v1.2.3 From 246eb2af060fc32650f07203c02bdc0456ad76c7 Mon Sep 17 00:00:00 2001 From: John Heffner Date: Tue, 29 Apr 2008 03:13:52 -0700 Subject: tcp: Limit cwnd growth when deferring for GSO This fixes inappropriately large cwnd growth on sender-limited flows when GSO is enabled, limiting cwnd growth to 64k. Signed-off-by: John Heffner Signed-off-by: David S. 
Miller --- net/ipv4/tcp_cong.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index bfb1996bd99f..6a250828b767 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -287,7 +287,8 @@ int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) left = tp->snd_cwnd - in_flight; if (sk_can_gso(sk) && - left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd) + left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && + left * tp->mss_cache < sk->sk_gso_max_size) return 1; return left <= tcp_max_burst(tp); } -- cgit v1.2.3 From 9a732ed6d0e126d4c8a818f42a13f3df11755bee Mon Sep 17 00:00:00 2001 From: Arnaud Ebalard Date: Tue, 29 Apr 2008 03:16:34 -0700 Subject: netfilter: {nfnetlink,ip,ip6}_queue: fix skb_over_panic when enlarging packets While reinjecting *bigger* modified versions of IPv6 packets using libnetfilter_queue, things work fine on a 2.6.24 kernel (2.6.22 too) but I get the following on recents kernels (2.6.25, trace below is against today's net-2.6 git tree): skb_over_panic: text:c04fddb0 len:696 put:632 head:f7592c00 data:f7592c00 tail:0xf7592eb8 end:0xf7592e80 dev:eth0 ------------[ cut here ]------------ invalid opcode: 0000 [#1] PREEMPT Process sendd (pid: 3657, ti=f6014000 task=f77c31d0 task.ti=f6014000) Stack: c071e638 c04fddb0 000002b8 00000278 f7592c00 f7592c00 f7592eb8 f7592e80 f763c000 f6bc5200 f7592c40 f6015c34 c04cdbfc f6bc5200 00000278 f6015c60 c04fddb0 00000020 f72a10c0 f751b420 00000001 0000000a 000002b8 c065582c Call Trace: [] ? nfqnl_recv_verdict+0x1c0/0x2e0 [] ? skb_put+0x3c/0x40 [] ? nfqnl_recv_verdict+0x1c0/0x2e0 [] ? nfnetlink_rcv_msg+0xf5/0x160 [] ? nfnetlink_rcv_msg+0x1e/0x160 [] ? nfnetlink_rcv_msg+0x0/0x160 [] ? netlink_rcv_skb+0x77/0xa0 [] ? nfnetlink_rcv+0x1c/0x30 [] ? netlink_unicast+0x243/0x2b0 [] ? memcpy_fromiovec+0x4a/0x70 [] ? netlink_sendmsg+0x1c6/0x270 [] ? sock_sendmsg+0xc4/0xf0 [] ? set_next_entity+0x1d/0x50 [] ? 
autoremove_wake_function+0x0/0x40 [] ? __wake_up_common+0x3e/0x70 [] ? n_tty_receive_buf+0x34f/0x1280 [] ? __wake_up+0x68/0x70 [] ? copy_from_user+0x37/0x70 [] ? verify_iovec+0x2c/0x90 [] ? sys_sendmsg+0x10a/0x230 [] ? __dequeue_entity+0x2a/0xa0 [] ? set_next_entity+0x1d/0x50 [] ? pty_write+0x47/0x60 [] ? tty_default_put_char+0x1b/0x20 [] ? __wake_up+0x49/0x70 [] ? tty_ldisc_deref+0x39/0x90 [] ? tty_write+0x1a0/0x1b0 [] ? sys_socketcall+0x7f/0x260 [] ? sysenter_past_esp+0x6a/0x91 [] ? snd_intel8x0m_probe+0x270/0x6e0 ======================= Code: 00 00 89 5c 24 14 8b 98 9c 00 00 00 89 54 24 0c 89 5c 24 10 8b 40 50 89 4c 24 04 c7 04 24 38 e6 71 c0 89 44 24 08 e8 c4 46 c5 ff <0f> 0b eb fe 55 89 e5 56 89 d6 53 89 c3 83 ec 0c 8b 40 50 39 d0 EIP: [] skb_over_panic+0x5c/0x60 SS:ESP 0068:f6015bf8 Looking at the code, I ended up in nfq_mangle() function (called by nfqnl_recv_verdict()) which performs a call to skb_copy_expand() due to the increased size of data passed to the function. AFAICT, it should ask for 'diff' instead of 'diff - skb_tailroom(e->skb)'. Because the resulting sk_buff has not enough space to support the skb_put(skb, diff) call a few lines later, this results in the call to skb_over_panic(). The patch below asks for allocation of a copy with enough space for mangled packet and the same amount of headroom as old sk_buff. While looking at how the regression appeared (e2b58a67), I noticed the same pattern in ipq_mangle_ipv6() and ipq_mangle_ipv4(). The patch corrects those locations too. Tested with bigger reinjected IPv6 packets (nfqnl_mangle() path), things are ok (2.6.25 and today's net-2.6 git tree). Signed-off-by: Arnaud Ebalard Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- net/ipv4/netfilter/ip_queue.c | 5 ++--- net/ipv6/netfilter/ip6_queue.c | 5 ++--- net/netfilter/nfnetlink_queue.c | 5 ++--- 3 files changed, 6 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 719be29f7506..26a37cedcf2e 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -296,9 +296,8 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct nf_queue_entry *e) if (v->data_len > 0xFFFF) return -EINVAL; if (diff > skb_tailroom(e->skb)) { - nskb = skb_copy_expand(e->skb, 0, - diff - skb_tailroom(e->skb), - GFP_ATOMIC); + nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), + diff, GFP_ATOMIC); if (!nskb) { printk(KERN_WARNING "ip_queue: error " "in mangle, dropping packet\n"); diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index 92a36c9e5402..2eff3ae8977d 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -298,9 +298,8 @@ ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct nf_queue_entry *e) if (v->data_len > 0xFFFF) return -EINVAL; if (diff > skb_tailroom(e->skb)) { - nskb = skb_copy_expand(e->skb, 0, - diff - skb_tailroom(e->skb), - GFP_ATOMIC); + nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), + diff, GFP_ATOMIC); if (!nskb) { printk(KERN_WARNING "ip6_queue: OOM " "in mangle, dropping packet\n"); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 2c9fe5c12894..3447025ce068 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -454,9 +454,8 @@ nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e) if (data_len > 0xFFFF) return -EINVAL; if (diff > skb_tailroom(e->skb)) { - nskb = skb_copy_expand(e->skb, 0, - diff - skb_tailroom(e->skb), - GFP_ATOMIC); + nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), + diff, GFP_ATOMIC); if (!nskb) { printk(KERN_WARNING "nf_queue: OOM " "in mangle, dropping packet\n"); -- cgit 
v1.2.3 From 2ad17defd596ca7e8ba782d5fc6950ee0e99513c Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 29 Apr 2008 03:21:23 -0700 Subject: ipvs: fix oops in backup for fwmark conn templates Fixes bug http://bugzilla.kernel.org/show_bug.cgi?id=10556 where conn templates with protocol=IPPROTO_IP can oops backup box. Result from ip_vs_proto_get() should be checked because protocol value can be invalid or unsupported in backup. But for valid message we should not fail for templates which use IPPROTO_IP. Also, add checks to validate message limits and connection state. Show state NONE for templates using IPPROTO_IP. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- include/net/ip_vs.h | 3 +- net/ipv4/ipvs/ip_vs_proto.c | 2 +- net/ipv4/ipvs/ip_vs_proto_ah.c | 1 + net/ipv4/ipvs/ip_vs_proto_esp.c | 1 + net/ipv4/ipvs/ip_vs_proto_tcp.c | 1 + net/ipv4/ipvs/ip_vs_proto_udp.c | 1 + net/ipv4/ipvs/ip_vs_sync.c | 80 ++++++++++++++++++++++++++++++----------- 7 files changed, 66 insertions(+), 23 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 56f3c94ae620..9a51ebad3f1f 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -405,7 +405,8 @@ struct sk_buff; struct ip_vs_protocol { struct ip_vs_protocol *next; char *name; - __u16 protocol; + u16 protocol; + u16 num_states; int dont_defrag; atomic_t appcnt; /* counter of proto app incs */ int *timeout_table; /* protocol timeout table */ diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c index dde28a250d92..4b1c16cbb16b 100644 --- a/net/ipv4/ipvs/ip_vs_proto.c +++ b/net/ipv4/ipvs/ip_vs_proto.c @@ -148,7 +148,7 @@ const char * ip_vs_state_name(__u16 proto, int state) struct ip_vs_protocol *pp = ip_vs_proto_get(proto); if (pp == NULL || pp->state_name == NULL) - return "ERR!"; + return (IPPROTO_IP == proto) ? 
"NONE" : "ERR!"; return pp->state_name(state); } diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c index a842676e1c69..4bf835e1d86d 100644 --- a/net/ipv4/ipvs/ip_vs_proto_ah.c +++ b/net/ipv4/ipvs/ip_vs_proto_ah.c @@ -160,6 +160,7 @@ static void ah_exit(struct ip_vs_protocol *pp) struct ip_vs_protocol ip_vs_protocol_ah = { .name = "AH", .protocol = IPPROTO_AH, + .num_states = 1, .dont_defrag = 1, .init = ah_init, .exit = ah_exit, diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c index aef0d3ee8e44..db6a6b7b1a0b 100644 --- a/net/ipv4/ipvs/ip_vs_proto_esp.c +++ b/net/ipv4/ipvs/ip_vs_proto_esp.c @@ -159,6 +159,7 @@ static void esp_exit(struct ip_vs_protocol *pp) struct ip_vs_protocol ip_vs_protocol_esp = { .name = "ESP", .protocol = IPPROTO_ESP, + .num_states = 1, .dont_defrag = 1, .init = esp_init, .exit = esp_exit, diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index 620e40ff79a9..b83dc14b0a4d 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -594,6 +594,7 @@ static void ip_vs_tcp_exit(struct ip_vs_protocol *pp) struct ip_vs_protocol ip_vs_protocol_tcp = { .name = "TCP", .protocol = IPPROTO_TCP, + .num_states = IP_VS_TCP_S_LAST, .dont_defrag = 0, .appcnt = ATOMIC_INIT(0), .init = ip_vs_tcp_init, diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index 1caa2908373f..75771cb3cd6f 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -409,6 +409,7 @@ static void udp_exit(struct ip_vs_protocol *pp) struct ip_vs_protocol ip_vs_protocol_udp = { .name = "UDP", .protocol = IPPROTO_UDP, + .num_states = IP_VS_UDP_S_LAST, .dont_defrag = 0, .init = udp_init, .exit = udp_exit, diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 69c56663cc9a..eff54efe0351 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -288,11 +288,16 @@ static void 
ip_vs_process_message(const char *buffer, const size_t buflen) char *p; int i; + if (buflen < sizeof(struct ip_vs_sync_mesg)) { + IP_VS_ERR_RL("sync message header too short\n"); + return; + } + /* Convert size back to host byte order */ m->size = ntohs(m->size); if (buflen != m->size) { - IP_VS_ERR("bogus message\n"); + IP_VS_ERR_RL("bogus sync message size\n"); return; } @@ -307,9 +312,48 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) for (i=0; i<m->nr_conns; i++) { unsigned flags, state; - s = (struct ip_vs_sync_conn *)p; + if (p + SIMPLE_CONN_SIZE > buffer+buflen) { + IP_VS_ERR_RL("bogus conn in sync message\n"); + return; + } + s = (struct ip_vs_sync_conn *) p; flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; + flags &= ~IP_VS_CONN_F_HASHED; + if (flags & IP_VS_CONN_F_SEQ_MASK) { + opt = (struct ip_vs_sync_conn_options *)&s[1]; + p += FULL_CONN_SIZE; + if (p > buffer+buflen) { + IP_VS_ERR_RL("bogus conn options in sync message\n"); + return; + } + } else { + opt = NULL; + p += SIMPLE_CONN_SIZE; + } + state = ntohs(s->state); + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { + pp = ip_vs_proto_get(s->protocol); + if (!pp) { + IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n", + s->protocol); + continue; + } + if (state >= pp->num_states) { + IP_VS_DBG(2, "Invalid %s state %u in sync msg\n", + pp->name, state); + continue; + } + } else { + /* protocol in templates is not used for state/timeout */ + pp = NULL; + if (state > 0) { + IP_VS_DBG(2, "Invalid template state %u in sync msg\n", + state); + state = 0; + } + } + if (!(flags & IP_VS_CONN_F_TEMPLATE)) cp = ip_vs_conn_in_get(s->protocol, s->caddr, s->cport, @@ -345,14 +389,9 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) IP_VS_ERR("ip_vs_conn_new failed\n"); return; } - cp->state = state; } else if (!cp->dest) { dest = ip_vs_try_bind_dest(cp); - if (!dest) { - /* it is an unbound entry created by - * synchronization */ - cp->flags = flags | IP_VS_CONN_F_HASHED; 
- } else + if (dest) atomic_dec(&dest->refcnt); } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && (cp->state != state)) { @@ -371,23 +410,22 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) } } - if (flags & IP_VS_CONN_F_SEQ_MASK) { - opt = (struct ip_vs_sync_conn_options *)&s[1]; + if (opt) memcpy(&cp->in_seq, opt, sizeof(*opt)); - p += FULL_CONN_SIZE; - } else - p += SIMPLE_CONN_SIZE; - atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); cp->state = state; - pp = ip_vs_proto_get(s->protocol); - cp->timeout = pp->timeout_table[cp->state]; + cp->old_state = cp->state; + /* + * We can not recover the right timeout for templates + * in all cases, we can not find the right fwmark + * virtual service. If needed, we can do it for + * non-fwmark persistent services. + */ + if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table) + cp->timeout = pp->timeout_table[state]; + else + cp->timeout = (3*60*HZ); ip_vs_conn_put(cp); - - if (p > buffer+buflen) { - IP_VS_ERR("bogus message\n"); - return; - } } } -- cgit v1.2.3 From 42908c69f61f75dd70e424263ab89ee52040382b Mon Sep 17 00:00:00 2001 From: David L Stevens Date: Tue, 29 Apr 2008 03:23:22 -0700 Subject: net: Add compat support for getsockopt (MCAST_MSFILTER) This patch adds support for getsockopt for MCAST_MSFILTER for both IPv4 and IPv6. It depends on the previous setsockopt patch, and uses the same method. Signed-off-by: David L Stevens Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. 
Miller --- include/net/compat.h | 3 ++ net/compat.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/ip_sockglue.c | 9 +++++- net/ipv6/ipv6_sockglue.c | 4 +++ 4 files changed, 94 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/include/net/compat.h b/include/net/compat.h index 05fa5d0254ab..164cb682e220 100644 --- a/include/net/compat.h +++ b/include/net/compat.h @@ -42,5 +42,8 @@ extern int cmsghdr_from_user_compat_to_kern(struct msghdr *, struct sock *, unsi extern int compat_mc_setsockopt(struct sock *, int, int, char __user *, int, int (*)(struct sock *, int, int, char __user *, int)); +extern int compat_mc_getsockopt(struct sock *, int, int, char __user *, + int __user *, int (*)(struct sock *, int, int, char __user *, + int __user *)); #endif /* NET_COMPAT_H */ diff --git a/net/compat.c b/net/compat.c index 8146f654391c..c823f6f290cb 100644 --- a/net/compat.c +++ b/net/compat.c @@ -640,6 +640,85 @@ int compat_mc_setsockopt(struct sock *sock, int level, int optname, EXPORT_SYMBOL(compat_mc_setsockopt); +int compat_mc_getsockopt(struct sock *sock, int level, int optname, + char __user *optval, int __user *optlen, + int (*getsockopt)(struct sock *,int,int,char __user *,int __user *)) +{ + struct compat_group_filter __user *gf32 = (void *)optval; + struct group_filter __user *kgf; + int __user *koptlen; + u32 interface, fmode, numsrc; + int klen, ulen, err; + + if (optname != MCAST_MSFILTER) + return getsockopt(sock, level, optname, optval, optlen); + + koptlen = compat_alloc_user_space(sizeof(*koptlen)); + if (!access_ok(VERIFY_READ, optlen, sizeof(*optlen)) || + __get_user(ulen, optlen)) + return -EFAULT; + + /* adjust len for pad */ + klen = ulen + sizeof(*kgf) - sizeof(*gf32); + + if (klen < GROUP_FILTER_SIZE(0)) + return -EINVAL; + + if (!access_ok(VERIFY_WRITE, koptlen, sizeof(*koptlen)) || + __put_user(klen, koptlen)) + return -EFAULT; + + /* have to allow space for previous compat_alloc_user_space, too */ + kgf = 
compat_alloc_user_space(klen+sizeof(*optlen)); + + if (!access_ok(VERIFY_READ, gf32, __COMPAT_GF0_SIZE) || + __get_user(interface, &gf32->gf_interface) || + __get_user(fmode, &gf32->gf_fmode) || + __get_user(numsrc, &gf32->gf_numsrc) || + __put_user(interface, &kgf->gf_interface) || + __put_user(fmode, &kgf->gf_fmode) || + __put_user(numsrc, &kgf->gf_numsrc) || + copy_in_user(&kgf->gf_group,&gf32->gf_group,sizeof(kgf->gf_group))) + return -EFAULT; + + err = getsockopt(sock, level, optname, (char __user *)kgf, koptlen); + if (err) + return err; + + if (!access_ok(VERIFY_READ, koptlen, sizeof(*koptlen)) || + __get_user(klen, koptlen)) + return -EFAULT; + + ulen = klen - (sizeof(*kgf)-sizeof(*gf32)); + + if (!access_ok(VERIFY_WRITE, optlen, sizeof(*optlen)) || + __put_user(ulen, optlen)) + return -EFAULT; + + if (!access_ok(VERIFY_READ, kgf, klen) || + !access_ok(VERIFY_WRITE, gf32, ulen) || + __get_user(interface, &kgf->gf_interface) || + __get_user(fmode, &kgf->gf_fmode) || + __get_user(numsrc, &kgf->gf_numsrc) || + __put_user(interface, &gf32->gf_interface) || + __put_user(fmode, &gf32->gf_fmode) || + __put_user(numsrc, &gf32->gf_numsrc)) + return -EFAULT; + if (numsrc) { + int copylen; + + klen -= GROUP_FILTER_SIZE(0); + copylen = numsrc * sizeof(gf32->gf_slist[0]); + if (copylen > klen) + copylen = klen; + if (copy_in_user(gf32->gf_slist, kgf->gf_slist, copylen)) + return -EFAULT; + } + return err; +} + +EXPORT_SYMBOL(compat_mc_getsockopt); + /* Argument list sizes for compat_sys_socketcall */ #define AL(x) ((x) * sizeof(u32)) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 4d8d95404f45..e0514e82308e 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1186,7 +1186,14 @@ int ip_getsockopt(struct sock *sk, int level, int compat_ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { - int err = do_ip_getsockopt(sk, level, optname, optval, optlen); + int err; + + if (optname == 
MCAST_MSFILTER) + return compat_mc_getsockopt(sk, level, optname, optval, optlen, + ip_getsockopt); + + err = do_ip_getsockopt(sk, level, optname, optval, optlen); + #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index db6fdc1498aa..b4a26f2505f8 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -1089,6 +1089,10 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname, if(level != SOL_IPV6) return -ENOPROTOOPT; + if (optname == MCAST_MSFILTER) + return compat_mc_getsockopt(sk, level, optname, optval, optlen, + ipv6_getsockopt); + err = do_ipv6_getsockopt(sk, level, optname, optval, optlen); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ -- cgit v1.2.3 From 0010e46577a27c1d915034637f6c2fa57a9a091c Mon Sep 17 00:00:00 2001 From: Timo Teras Date: Tue, 29 Apr 2008 03:32:25 -0700 Subject: ipv4: Update MTU to all related cache entries in ip_rt_frag_needed() Add struct net_device parameter to ip_rt_frag_needed() and update MTU to cache entries where ifindex is specified. This is similar to what is already done in ip_rt_redirect(). Signed-off-by: Timo Teras Signed-off-by: David S. 
Miller --- include/net/route.h | 2 +- net/ipv4/icmp.c | 3 ++- net/ipv4/route.c | 38 ++++++++++++++++++++++---------------- 3 files changed, 25 insertions(+), 18 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/route.h b/include/net/route.h index c6338802e8f1..fc836ff824cc 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -116,7 +116,7 @@ extern int __ip_route_output_key(struct net *, struct rtable **, const struct f extern int ip_route_output_key(struct net *, struct rtable **, struct flowi *flp); extern int ip_route_output_flow(struct net *, struct rtable **rp, struct flowi *flp, struct sock *sk, int flags); extern int ip_route_input(struct sk_buff*, __be32 dst, __be32 src, u8 tos, struct net_device *devin); -extern unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, unsigned short new_mtu); +extern unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, unsigned short new_mtu, struct net_device *dev); extern void ip_rt_send_redirect(struct sk_buff *skb); extern unsigned inet_addr_type(struct net *net, __be32 addr); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index c67d00e8c600..87397351ddac 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -691,7 +691,8 @@ static void icmp_unreach(struct sk_buff *skb) NIPQUAD(iph->daddr)); } else { info = ip_rt_frag_needed(net, iph, - ntohs(icmph->un.frag.mtu)); + ntohs(icmph->un.frag.mtu), + skb->dev); if (!info) goto out; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ce25a13f3430..5e3685c5c407 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1430,11 +1430,13 @@ static inline unsigned short guess_mtu(unsigned short old_mtu) } unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, - unsigned short new_mtu) + unsigned short new_mtu, + struct net_device *dev) { - int i; + int i, k; unsigned short old_mtu = ntohs(iph->tot_len); struct rtable *rth; + int ikeys[2] = { dev->ifindex, 0 }; __be32 skeys[2] = { iph->saddr, 0, }; __be32 daddr = 
iph->daddr; unsigned short est_mtu = 0; @@ -1442,22 +1444,26 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, if (ipv4_config.no_pmtu_disc) return 0; - for (i = 0; i < 2; i++) { - unsigned hash = rt_hash(daddr, skeys[i], 0); + for (k = 0; k < 2; k++) { + for (i = 0; i < 2; i++) { + unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]); - rcu_read_lock(); - for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; - rth = rcu_dereference(rth->u.dst.rt_next)) { - if (rth->fl.fl4_dst == daddr && - rth->fl.fl4_src == skeys[i] && - rth->rt_dst == daddr && - rth->rt_src == iph->saddr && - rth->fl.iif == 0 && - !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) && - net_eq(dev_net(rth->u.dst.dev), net) && - rth->rt_genid == atomic_read(&rt_genid)) { + rcu_read_lock(); + for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; + rth = rcu_dereference(rth->u.dst.rt_next)) { unsigned short mtu = new_mtu; + if (rth->fl.fl4_dst != daddr || + rth->fl.fl4_src != skeys[i] || + rth->rt_dst != daddr || + rth->rt_src != iph->saddr || + rth->fl.oif != ikeys[k] || + rth->fl.iif != 0 || + dst_metric_locked(&rth->u.dst, RTAX_MTU) || + !net_eq(dev_net(rth->u.dst.dev), net) || + rth->rt_genid != atomic_read(&rt_genid)) + continue; + if (new_mtu < 68 || new_mtu >= old_mtu) { /* BSD 4.2 compatibility hack :-( */ @@ -1483,8 +1489,8 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, est_mtu = mtu; } } + rcu_read_unlock(); } - rcu_read_unlock(); } return est_mtu ? : new_mtu; } -- cgit v1.2.3 From 443a70d50bdc212e1292778e264ce3d0a85b896f Mon Sep 17 00:00:00 2001 From: Philip Craig Date: Tue, 29 Apr 2008 03:35:10 -0700 Subject: netfilter: nf_conntrack: padding breaks conntrack hash on ARM commit 0794935e "[NETFILTER]: nf_conntrack: optimize hash_conntrack()" results in ARM platforms hashing uninitialised padding. This padding doesn't exist on other architectures. Fix this by replacing NF_CT_TUPLE_U_BLANK() with memset() to ensure everything is initialised. 
There were only 4 bytes that NF_CT_TUPLE_U_BLANK() wasn't clearing anyway (or 12 bytes on ARM). Signed-off-by: Philip Craig Signed-off-by: David S. Miller --- include/net/netfilter/nf_conntrack_tuple.h | 10 ---------- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 2 +- net/netfilter/nf_conntrack_core.c | 4 ++-- 3 files changed, 3 insertions(+), 13 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/netfilter/nf_conntrack_tuple.h b/include/net/netfilter/nf_conntrack_tuple.h index 1bb7087833d3..a6874ba22d54 100644 --- a/include/net/netfilter/nf_conntrack_tuple.h +++ b/include/net/netfilter/nf_conntrack_tuple.h @@ -107,16 +107,6 @@ struct nf_conntrack_tuple_mask } src; }; -/* This is optimized opposed to a memset of the whole structure. Everything we - * really care about is the source/destination unions */ -#define NF_CT_TUPLE_U_BLANK(tuple) \ - do { \ - (tuple)->src.u.all = 0; \ - (tuple)->dst.u.all = 0; \ - memset(&(tuple)->src.u3, 0, sizeof((tuple)->src.u3)); \ - memset(&(tuple)->dst.u3, 0, sizeof((tuple)->dst.u3)); \ - } while (0) - #ifdef __KERNEL__ static inline void nf_ct_dump_tuple_ip(const struct nf_conntrack_tuple *t) diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index cacb9cb27dab..5a955c440364 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -303,7 +303,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) const struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; - NF_CT_TUPLE_U_BLANK(&tuple); + memset(&tuple, 0, sizeof(tuple)); tuple.src.u3.ip = inet->rcv_saddr; tuple.src.u.tcp.port = inet->sport; tuple.dst.u3.ip = inet->daddr; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 4eac65c74ed0..c4b1799da5d7 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -104,7 +104,7 @@ nf_ct_get_tuple(const struct 
sk_buff *skb, const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto) { - NF_CT_TUPLE_U_BLANK(tuple); + memset(tuple, 0, sizeof(*tuple)); tuple->src.l3num = l3num; if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0) @@ -151,7 +151,7 @@ nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto) { - NF_CT_TUPLE_U_BLANK(inverse); + memset(inverse, 0, sizeof(*inverse)); inverse->src.l3num = orig->src.l3num; if (l3proto->invert_tuple(inverse, orig) == 0) -- cgit v1.2.3 From 45e741b89000519bedd4da4e7075a35acf5c655b Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Tue, 29 Apr 2008 20:58:15 -0700 Subject: ipv4: annotate a few functions __init in ipconfig.c A few functions are only used from __init context. So annotate these with __init for consistency and silence the following warnings: WARNING: net/ipv4/built-in.o(.text+0x2a876): Section mismatch in reference from the function ic_bootp_init() to the variable .init.data:bootp_packet_type WARNING: net/ipv4/built-in.o(.text+0x2a907): Section mismatch in reference from the function ic_bootp_cleanup() to the variable .init.data:bootp_packet_type Note: The warnings only appear with CONFIG_DEBUG_SECTION_MISMATCH=y Signed-off-by: Sam Ravnborg Signed-off-by: David S. 
Miller --- net/ipv4/ipconfig.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 0f42d1c1f690..89dee4346f60 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -412,12 +412,12 @@ static struct packet_type rarp_packet_type __initdata = { .func = ic_rarp_recv, }; -static inline void ic_rarp_init(void) +static inline void __init ic_rarp_init(void) { dev_add_pack(&rarp_packet_type); } -static inline void ic_rarp_cleanup(void) +static inline void __init ic_rarp_cleanup(void) { dev_remove_pack(&rarp_packet_type); } @@ -682,7 +682,7 @@ static void __init ic_bootp_init_ext(u8 *e) /* * Initialize the DHCP/BOOTP mechanism. */ -static inline void ic_bootp_init(void) +static inline void __init ic_bootp_init(void) { int i; @@ -696,7 +696,7 @@ static inline void ic_bootp_init(void) /* * DHCP/BOOTP cleanup. */ -static inline void ic_bootp_cleanup(void) +static inline void __init ic_bootp_cleanup(void) { dev_remove_pack(&bootp_packet_type); } -- cgit v1.2.3 From be9164e769d57aa10b2bbe15d103edc041b9e7de Mon Sep 17 00:00:00 2001 From: Kostya B Date: Tue, 29 Apr 2008 22:36:30 -0700 Subject: [IPv4] UFO: prevent generation of chained skb destined to UFO device Problem: ip_append_data() could wrongly generate a chained skb for devices which support UFO. When sk_write_queue is not empty (e.g. MSG_MORE), __instead__ of appending data into the next nr_frag of the queued skb, a new chained skb is created. I would normally assume a UFO device should get data in nr_frags and not in frag_list. Later the udp4_hwcsum_outgoing() resets csum to NONE and skb_gso_segment() oopses. Proposal: 1. Even if length is less than mtu, employ ip_ufo_append_data() and append data to the __existing__ skb in the sk_write_queue. 2. ip_ufo_append_data() is fixed due to a wrong manipulation of peek-ing and later enqueue-ing of the same skb. 
Now, enqueuing is always performed, because on error the further ip_flush_pending_frames() would release the queued skb. Signed-off-by: Kostya B Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 08349267ceb4..e527628f56cf 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -753,23 +753,15 @@ static inline int ip_ufo_append_data(struct sock *sk, skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; sk->sk_sndmsg_off = 0; - } - err = skb_append_datato_frags(sk,skb, getfrag, from, - (length - transhdrlen)); - if (!err) { - /* specify the length of each IP datagram fragment*/ + /* specify the length of each IP datagram fragment */ skb_shinfo(skb)->gso_size = mtu - fragheaderlen; skb_shinfo(skb)->gso_type = SKB_GSO_UDP; __skb_queue_tail(&sk->sk_write_queue, skb); - - return 0; } - /* There is not enough support do UFO , - * so follow normal path - */ - kfree_skb(skb); - return err; + + return skb_append_datato_frags(sk, skb, getfrag, from, + (length - transhdrlen)); } /* @@ -863,9 +855,9 @@ int ip_append_data(struct sock *sk, csummode = CHECKSUM_PARTIAL; inet->cork.length += length; - if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) && - (rt->u.dst.dev->features & NETIF_F_UFO)) { - + if (((length> mtu) || !skb_queue_empty(&sk->sk_write_queue)) && + (sk->sk_protocol == IPPROTO_UDP) && + (rt->u.dst.dev->features & NETIF_F_UFO)) { err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, mtu, flags); -- cgit v1.2.3 From 159131149c2f56c1da5ae5e23ab9d5acef4916d1 Mon Sep 17 00:00:00 2001 From: Lachlan Andrew Date: Wed, 30 Apr 2008 01:04:03 -0700 Subject: tcp: Overflow bug in Vegas From: Lachlan Andrew There is an overflow bug in net/ipv4/tcp_vegas.c for large BDPs (e.g. 400Mbit/s, 400ms). 
The multiplication (old_wnd * vegas->baseRTT) << V_PARAM_SHIFT overflows a u32. [ Fix tcp_veno.c too, it has similar calculations. -DaveM ] Signed-off-by: David S. Miller --- net/ipv4/tcp_vegas.c | 10 ++++++---- net/ipv4/tcp_veno.c | 8 +++++--- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index be24d6ee34bd..0e1a8c91f78e 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -229,7 +229,8 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) */ tcp_reno_cong_avoid(sk, ack, in_flight); } else { - u32 rtt, target_cwnd, diff; + u32 rtt, diff; + u64 target_cwnd; /* We have enough RTT samples, so, using the Vegas * algorithm, we determine if we should increase or @@ -252,8 +253,9 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) * We keep it as a fixed point number with * V_PARAM_SHIFT bits to the right of the binary point. */ - target_cwnd = ((old_wnd * vegas->baseRTT) - << V_PARAM_SHIFT) / rtt; + target_cwnd = ((u64)old_wnd * vegas->baseRTT); + target_cwnd <<= V_PARAM_SHIFT; + do_div(target_cwnd, rtt); /* Calculate the difference between the window we had, * and the window we would like to have. This quantity @@ -279,7 +281,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) * utilization. */ tp->snd_cwnd = min(tp->snd_cwnd, - (target_cwnd >> + ((u32)target_cwnd >> V_PARAM_SHIFT)+1); } else if (tp->snd_cwnd <= tp->snd_ssthresh) { diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index d16689e98516..2bf618a3b00b 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -133,7 +133,8 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) */ tcp_reno_cong_avoid(sk, ack, in_flight); } else { - u32 rtt, target_cwnd; + u64 target_cwnd; + u32 rtt; /* We have enough rtt samples, so, using the Veno * algorithm, we determine the state of the network. 
@@ -141,8 +142,9 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) rtt = veno->minrtt; - target_cwnd = ((tp->snd_cwnd * veno->basertt) - << V_PARAM_SHIFT) / rtt; + target_cwnd = ((u64)tp->snd_cwnd * veno->basertt); + target_cwnd <<= V_PARAM_SHIFT; + do_div(target_cwnd, rtt); veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd; -- cgit v1.2.3