diff options
Diffstat (limited to 'net/ipv4')
48 files changed, 481 insertions, 1382 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 2e548eca3489..d678820e4306 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -944,6 +944,8 @@ const struct proto_ops inet_stream_ops = { .sendpage = inet_sendpage, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, + .sendmsg_locked = tcp_sendmsg_locked, + .sendpage_locked = tcp_sendpage_locked, .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, @@ -1219,10 +1221,9 @@ EXPORT_SYMBOL(inet_sk_rebuild_header); struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { - bool udpfrag = false, fixedid = false, gso_partial, encap; + bool fixedid = false, gso_partial, encap; struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; - unsigned int offset = 0; struct iphdr *iph; int proto, tot_len; int nhoff; @@ -1257,7 +1258,6 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, segs = ERR_PTR(-EPROTONOSUPPORT); if (!skb->encapsulation || encap) { - udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); /* fixed ID is invalid if DF bit is not set */ @@ -1277,13 +1277,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, skb = segs; do { iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); - if (udpfrag) { - iph->frag_off = htons(offset >> 3); - if (skb->next) - iph->frag_off |= htons(IP_MF); - offset += skb->len - nhoff - ihl; - tot_len = skb->len - nhoff; - } else if (skb_is_gso(skb)) { + if (skb_is_gso(skb)) { if (!fixedid) { iph->id = htons(id); id += skb_shinfo(skb)->gso_segs; @@ -1778,6 +1772,11 @@ static const struct net_offload ipip_offload = { }, }; +static int __init ipip_offload_init(void) +{ + return inet_add_offload(&ipip_offload, IPPROTO_IPIP); +} + static int __init ipv4_offload_init(void) { /* @@ -1787,9 +1786,10 @@ static int __init ipv4_offload_init(void) pr_crit("%s: Cannot add UDP protocol offload\n", __func__); if (tcpv4_offload_init() < 0) pr_crit("%s: Cannot add TCP protocol offload\n", __func__); + if (ipip_offload_init() < 0) + pr_crit("%s: Cannot add IPIP protocol offload\n", __func__); dev_add_offload(&ip_packet_offload); - inet_add_offload(&ipip_offload, IPPROTO_IPIP); return 0; } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 38d9af9b917c..d7adc0616599 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2491,9 +2491,9 @@ void __init devinet_init(void) rtnl_af_register(&inet_af_ops); - rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL); - rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL); - rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL); + rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, 0); + rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, 0); + rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, 0); rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf, - inet_netconf_dump_devconf, NULL); + inet_netconf_dump_devconf, 0); } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 044d2a159a3c..37819ab4cc74 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1247,22 +1247,28 @@ static int __net_init ip_fib_net_init(struct net *net) int err; size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ; - net->ipv4.fib_seq = 0; + err = fib4_notifier_init(net); + if (err) + return err; /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL); - if (!net->ipv4.fib_table_hash) - return -ENOMEM; + if (!net->ipv4.fib_table_hash) { + err = -ENOMEM; + goto err_table_hash_alloc; + } err = fib4_rules_init(net); if (err < 0) - goto fail; + goto err_rules_init; return 0; -fail: +err_rules_init: kfree(net->ipv4.fib_table_hash); +err_table_hash_alloc: + fib4_notifier_exit(net); return err; } @@ -1292,6 +1298,7 @@ static void ip_fib_net_exit(struct net *net) #endif rtnl_unlock(); kfree(net->ipv4.fib_table_hash); + fib4_notifier_exit(net); } static int __net_init fib_net_init(struct net *net) @@ -1341,7 +1348,7 @@ void __init ip_fib_init(void) register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); - rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL); - rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL); - rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL); + rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, 0); + rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, 0); + rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, 0); } diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c index e0714d975947..5d7afb145562 100644 --- a/net/ipv4/fib_notifier.c +++ b/net/ipv4/fib_notifier.c @@ -1,86 +1,71 @@ #include <linux/rtnetlink.h> #include <linux/notifier.h> -#include <linux/rcupdate.h> +#include <linux/socket.h> #include <linux/kernel.h> #include <net/net_namespace.h> +#include <net/fib_notifier.h> #include <net/netns/ipv4.h> #include <net/ip_fib.h> -static ATOMIC_NOTIFIER_HEAD(fib_chain); - -int call_fib_notifier(struct notifier_block *nb, struct net *net, - enum fib_event_type event_type, - struct fib_notifier_info *info) +int call_fib4_notifier(struct notifier_block *nb, struct net *net, + enum fib_event_type event_type, + struct fib_notifier_info *info) { - info->net = net; - return nb->notifier_call(nb, event_type, info); + info->family = AF_INET; + return call_fib_notifier(nb, net, event_type, info); } -int call_fib_notifiers(struct net *net, enum fib_event_type event_type, - struct fib_notifier_info *info) +int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, + struct fib_notifier_info *info) { + ASSERT_RTNL(); + + info->family = AF_INET; net->ipv4.fib_seq++; - info->net = net; - return atomic_notifier_call_chain(&fib_chain, event_type, info); + return call_fib_notifiers(net, event_type, info); } -static unsigned int fib_seq_sum(void) +static unsigned int fib4_seq_read(struct net *net) { - unsigned int fib_seq = 0; - struct net *net; - - rtnl_lock(); - for_each_net(net) - fib_seq += net->ipv4.fib_seq; - rtnl_unlock(); + ASSERT_RTNL(); - return fib_seq; + return net->ipv4.fib_seq + fib4_rules_seq_read(net); } -static bool fib_dump_is_consistent(struct notifier_block *nb, - void (*cb)(struct notifier_block *nb), - unsigned int fib_seq) +static int fib4_dump(struct net *net, struct notifier_block *nb) { - atomic_notifier_chain_register(&fib_chain, nb); - if (fib_seq == fib_seq_sum()) - return true; - atomic_notifier_chain_unregister(&fib_chain, nb); - if (cb) - cb(nb); - return false; + int err; + + err = fib4_rules_dump(net, nb); + if (err) + return err; + + fib_notify(net, nb); + + return 0; } -#define FIB_DUMP_MAX_RETRIES 5 -int register_fib_notifier(struct notifier_block *nb, - void (*cb)(struct notifier_block *nb)) -{ - int retries = 0; +static const struct fib_notifier_ops fib4_notifier_ops_template = { + .family = AF_INET, + .fib_seq_read = fib4_seq_read, + .fib_dump = fib4_dump, +}; - do { - unsigned int fib_seq = fib_seq_sum(); - struct net *net; +int __net_init fib4_notifier_init(struct net *net) +{ + struct fib_notifier_ops *ops; - /* Mutex semantics guarantee that every change done to - * FIB tries before we read the change sequence counter - * is now visible to us. - */ - rcu_read_lock(); - for_each_net_rcu(net) { - fib_rules_notify(net, nb); - fib_notify(net, nb); - } - rcu_read_unlock(); + net->ipv4.fib_seq = 0; - if (fib_dump_is_consistent(nb, cb, fib_seq)) - return 0; - } while (++retries < FIB_DUMP_MAX_RETRIES); + ops = fib_notifier_ops_register(&fib4_notifier_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + net->ipv4.notifier_ops = ops; - return -EBUSY; + return 0; } -EXPORT_SYMBOL(register_fib_notifier); -int unregister_fib_notifier(struct notifier_block *nb) +void __net_exit fib4_notifier_exit(struct net *net) { - return atomic_notifier_chain_unregister(&fib_chain, nb); + fib_notifier_ops_unregister(net->ipv4.notifier_ops); } -EXPORT_SYMBOL(unregister_fib_notifier); diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 778ecf977eb2..35d646a62ad4 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -68,6 +68,16 @@ bool fib4_rule_default(const struct fib_rule *rule) } EXPORT_SYMBOL_GPL(fib4_rule_default); +int fib4_rules_dump(struct net *net, struct notifier_block *nb) +{ + return fib_rules_dump(net, nb, AF_INET); +} + +unsigned int fib4_rules_seq_read(struct net *net) +{ + return fib_rules_seq_read(net, AF_INET); +} + int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res, unsigned int flags) { @@ -185,38 +195,6 @@ static struct fib_table *fib_empty_table(struct net *net) return NULL; } -static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net, - enum fib_event_type event_type, - struct fib_rule *rule) -{ - struct fib_rule_notifier_info info = { - .rule = rule, - }; - - return call_fib_notifier(nb, net, event_type, &info.info); -} - -static int call_fib_rule_notifiers(struct net *net, - enum fib_event_type event_type, - struct fib_rule *rule) -{ - struct fib_rule_notifier_info info = { - .rule = rule, - }; - - return call_fib_notifiers(net, event_type, &info.info); -} - -/* Called with rcu_read_lock() */ -void fib_rules_notify(struct net *net, struct notifier_block *nb) -{ - struct fib_rules_ops *ops = net->ipv4.rules_ops; - struct fib_rule *rule; - - list_for_each_entry_rcu(rule, &ops->rules_list, list) - call_fib_rule_notifier(nb, net, FIB_EVENT_RULE_ADD, rule); -} - static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = { FRA_GENERIC_POLICY, [FRA_FLOW] = { .type = NLA_U32 }, @@ -273,7 +251,6 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, rule4->tos = frh->tos; net->ipv4.fib_has_custom_rules = true; - call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule); err = 0; errout: @@ -295,7 +272,6 @@ static int fib4_rule_delete(struct fib_rule *rule) net->ipv4.fib_num_tclassid_users--; #endif net->ipv4.fib_has_custom_rules = true; - call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule); errout: return err; } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b8d18171cca3..632b454ce77c 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -44,6 +44,7 @@ #include <net/netlink.h> #include <net/nexthop.h> #include <net/lwtunnel.h> +#include <net/fib_notifier.h> #include "fib_lookup.h" @@ -1342,6 +1343,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) rtm->rtm_flags |= RTNH_F_DEAD; } + if (fi->fib_nh->nh_flags & RTNH_F_OFFLOAD) + rtm->rtm_flags |= RTNH_F_OFFLOAD; #ifdef CONFIG_IP_ROUTE_CLASSID if (fi->fib_nh[0].nh_tclassid && nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid)) @@ -1449,14 +1452,14 @@ static int call_fib_nh_notifiers(struct fib_nh *fib_nh, if (IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && fib_nh->nh_flags & RTNH_F_LINKDOWN) break; - return call_fib_notifiers(dev_net(fib_nh->nh_dev), event_type, - &info.info); + return call_fib4_notifiers(dev_net(fib_nh->nh_dev), event_type, + &info.info); case FIB_EVENT_NH_DEL: if ((in_dev && IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && fib_nh->nh_flags & RTNH_F_LINKDOWN) || (fib_nh->nh_flags & RTNH_F_DEAD)) - return call_fib_notifiers(dev_net(fib_nh->nh_dev), - event_type, &info.info); + return call_fib4_notifiers(dev_net(fib_nh->nh_dev), + event_type, &info.info); default: break; } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 64668c69dda6..1a6ffb0dab9c 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -81,6 +81,7 @@ #include <net/tcp.h> #include <net/sock.h> #include <net/ip_fib.h> +#include <net/fib_notifier.h> #include <trace/events/fib.h> #include "fib_lookup.h" @@ -97,7 +98,7 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, .type = type, .tb_id = tb_id, }; - return call_fib_notifier(nb, net, event_type, &info.info); + return call_fib4_notifier(nb, net, event_type, &info.info); } static int call_fib_entry_notifiers(struct net *net, @@ -113,7 +114,7 @@ static int call_fib_entry_notifiers(struct net *net, .type = type, .tb_id = tb_id, }; - return call_fib_notifiers(net, event_type, &info.info); + return call_fib4_notifiers(net, event_type, &info.info); } #define MAX_STAT_DEPTH 32 diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index d5cac99170b1..416bb304a281 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -24,7 +24,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, __be16 protocol = skb->protocol; u16 mac_len = skb->mac_len; int gre_offset, outer_hlen; - bool need_csum, ufo, gso_partial; + bool need_csum, gso_partial; if (!skb->encapsulation) goto out; @@ -47,20 +47,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM); skb->encap_hdr_csum = need_csum; - ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); - features &= skb->dev->hw_enc_features; - /* The only checksum offload we care about from here on out is the - * outer one so strip the existing checksum feature flags based - * on the fact that we will be computing our checksum in software. - */ - if (ufo) { - features &= ~NETIF_F_CSUM_MASK; - if (!need_csum) - features |= NETIF_F_HW_CSUM; - } - /* segment inner packet. */ segs = skb_mac_gso_segment(skb, features); if (IS_ERR_OR_NULL(segs)) { diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index c2be26b98b5f..681e33998e03 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -412,7 +412,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) int type = icmp_param->data.icmph.type; int code = icmp_param->data.icmph.code; - if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) + if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb)) return; /* Needed by both icmp_global_allow and icmp_xmit_lock */ @@ -694,7 +694,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) iph->tos; mark = IP4_REPLY_MARK(net, skb_in->mark); - if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in)) + if (ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in)) goto out_unlock; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 498706b072fb..9f86b5133605 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2549,7 +2549,8 @@ done: /* * check if a multicast source filter allows delivery for a given <src,dst,intf> */ -int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif) +int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, + int dif, int sdif) { struct inet_sock *inet = inet_sk(sk); struct ip_mc_socklist *pmc; @@ -2564,7 +2565,8 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif) rcu_read_lock(); for_each_pmc_rcu(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == loc_addr && - pmc->multi.imr_ifindex == dif) + (pmc->multi.imr_ifindex == dif || + (sdif && pmc->multi.imr_ifindex == sdif))) break; } ret = inet->mc_all; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 2e3389d614d1..597bb4cfe805 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -170,7 +170,7 @@ EXPORT_SYMBOL_GPL(__inet_inherit_port); static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const __be32 daddr, - const int dif, bool exact_dif) + const int dif, const int sdif, bool exact_dif) { int score = -1; struct inet_sock *inet = inet_sk(sk); @@ -185,9 +185,13 @@ static inline int compute_score(struct sock *sk, struct net *net, score += 4; } if (sk->sk_bound_dev_if || exact_dif) { - if (sk->sk_bound_dev_if != dif) + bool dev_match = (sk->sk_bound_dev_if == dif || + sk->sk_bound_dev_if == sdif); + + if (exact_dif && !dev_match) return -1; - score += 4; + if (sk->sk_bound_dev_if && dev_match) + score += 4; } if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; @@ -208,7 +212,7 @@ struct sock *__inet_lookup_listener(struct net *net, struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, - const int dif) + const int dif, const int sdif) { unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; @@ -218,7 +222,8 @@ struct sock *__inet_lookup_listener(struct net *net, u32 phash = 0; sk_for_each_rcu(sk, &ilb->head) { - score = compute_score(sk, net, hnum, daddr, dif, exact_dif); + score = compute_score(sk, net, hnum, daddr, + dif, sdif, exact_dif); if (score > hiscore) { reuseport = sk->sk_reuseport; if (reuseport) { @@ -268,7 +273,7 @@ struct sock *__inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, - const int dif) + const int dif, const int sdif) { INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(sport, hnum); @@ -286,11 +291,12 @@ begin: if (sk->sk_hash != hash) continue; if (likely(INET_MATCH(sk, net, acookie, - saddr, daddr, ports, dif))) { + saddr, daddr, ports, dif, sdif))) { if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) goto out; if (unlikely(!INET_MATCH(sk, net, acookie, - saddr, daddr, ports, dif))) { + saddr, daddr, ports, + dif, sdif))) { sock_gen_put(sk); goto begin; } @@ -321,9 +327,10 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, __be32 daddr = inet->inet_rcv_saddr; __be32 saddr = inet->inet_daddr; int dif = sk->sk_bound_dev_if; + struct net *net = sock_net(sk); + int sdif = l3mdev_master_ifindex_by_index(net, dif); INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); - struct net *net = sock_net(sk); unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); @@ -339,7 +346,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, continue; if (likely(INET_MATCH(sk2, net, acookie, - saddr, daddr, ports, dif))) { + saddr, daddr, ports, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); if (twsk_unique(sk, sk2, twp)) diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index c5a117cc6619..337ad41bb80a 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -33,7 +33,7 @@ * also be removed if the pool is overloaded i.e. if the total amount of * entries is greater-or-equal than the threshold. * - * Node pool is organised as an AVL tree. + * Node pool is organised as an RB tree. * Such an implementation has been chosen not just for fun. It's a way to * prevent easy and efficient DoS attacks by creating hash collisions. A huge * amount of long living nodes in a single hash slot would significantly delay @@ -45,7 +45,7 @@ * AND reference count being 0. * 3. Global variable peer_total is modified under the pool lock. * 4. struct inet_peer fields modification: - * avl_left, avl_right, avl_parent, avl_height: pool lock + * rb_node: pool lock * refcnt: atomically against modifications on other CPU; * usually under some other lock to prevent node disappearing * daddr: unchangeable @@ -53,30 +53,15 @@ static struct kmem_cache *peer_cachep __read_mostly; -static LIST_HEAD(gc_list); -static const int gc_delay = 60 * HZ; -static struct delayed_work gc_work; -static DEFINE_SPINLOCK(gc_lock); - -#define node_height(x) x->avl_height - -#define peer_avl_empty ((struct inet_peer *)&peer_fake_node) -#define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node) -static const struct inet_peer peer_fake_node = { - .avl_left = peer_avl_empty_rcu, - .avl_right = peer_avl_empty_rcu, - .avl_height = 0 -}; - void inet_peer_base_init(struct inet_peer_base *bp) { - bp->root = peer_avl_empty_rcu; + bp->rb_root = RB_ROOT; seqlock_init(&bp->lock); bp->total = 0; } EXPORT_SYMBOL_GPL(inet_peer_base_init); -#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ +#define PEER_MAX_GC 32 /* Exported for sysctl_net_ipv4. */ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more @@ -84,53 +69,6 @@ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries m int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ -static void inetpeer_gc_worker(struct work_struct *work) -{ - struct inet_peer *p, *n, *c; - struct list_head list; - - spin_lock_bh(&gc_lock); - list_replace_init(&gc_list, &list); - spin_unlock_bh(&gc_lock); - - if (list_empty(&list)) - return; - - list_for_each_entry_safe(p, n, &list, gc_list) { - - if (need_resched()) - cond_resched(); - - c = rcu_dereference_protected(p->avl_left, 1); - if (c != peer_avl_empty) { - list_add_tail(&c->gc_list, &list); - p->avl_left = peer_avl_empty_rcu; - } - - c = rcu_dereference_protected(p->avl_right, 1); - if (c != peer_avl_empty) { - list_add_tail(&c->gc_list, &list); - p->avl_right = peer_avl_empty_rcu; - } - - n = list_entry(p->gc_list.next, struct inet_peer, gc_list); - - if (refcount_read(&p->refcnt) == 1) { - list_del(&p->gc_list); - kmem_cache_free(peer_cachep, p); - } - } - - if (list_empty(&list)) - return; - - spin_lock_bh(&gc_lock); - list_splice(&list, &gc_list); - spin_unlock_bh(&gc_lock); - - schedule_delayed_work(&gc_work, gc_delay); -} - /* Called from ip_output.c:ip_init */ void __init inet_initpeers(void) { @@ -153,225 +91,62 @@ void __init inet_initpeers(void) sizeof(struct inet_peer), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); - - INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker); } -#define rcu_deref_locked(X, BASE) \ - rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock)) - -/* - * Called with local BH disabled and the pool lock held. - */ -#define lookup(_daddr, _stack, _base) \ -({ \ - struct inet_peer *u; \ - struct inet_peer __rcu **v; \ - \ - stackptr = _stack; \ - *stackptr++ = &_base->root; \ - for (u = rcu_deref_locked(_base->root, _base); \ - u != peer_avl_empty;) { \ - int cmp = inetpeer_addr_cmp(_daddr, &u->daddr); \ - if (cmp == 0) \ - break; \ - if (cmp == -1) \ - v = &u->avl_left; \ - else \ - v = &u->avl_right; \ - *stackptr++ = v; \ - u = rcu_deref_locked(*v, _base); \ - } \ - u; \ -}) - -/* - * Called with rcu_read_lock() - * Because we hold no lock against a writer, its quite possible we fall - * in an endless loop. - * But every pointer we follow is guaranteed to be valid thanks to RCU. - * We exit from this function if number of links exceeds PEER_MAXDEPTH - */ -static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr, - struct inet_peer_base *base) +/* Called with rcu_read_lock() or base->lock held */ +static struct inet_peer *lookup(const struct inetpeer_addr *daddr, + struct inet_peer_base *base, + unsigned int seq, + struct inet_peer *gc_stack[], + unsigned int *gc_cnt, + struct rb_node **parent_p, + struct rb_node ***pp_p) { - struct inet_peer *u = rcu_dereference(base->root); - int count = 0; + struct rb_node **pp, *parent; + struct inet_peer *p; + + pp = &base->rb_root.rb_node; + parent = NULL; + while (*pp) { + int cmp; - while (u != peer_avl_empty) { - int cmp = inetpeer_addr_cmp(daddr, &u->daddr); + parent = rcu_dereference_raw(*pp); + p = rb_entry(parent, struct inet_peer, rb_node); + cmp = inetpeer_addr_cmp(daddr, &p->daddr); if (cmp == 0) { - /* Before taking a reference, check if this entry was - * deleted (refcnt=0) - */ - if (!refcount_inc_not_zero(&u->refcnt)) { - u = NULL; - } - return u; + if (!refcount_inc_not_zero(&p->refcnt)) + break; + return p; + } + if (gc_stack) { + if (*gc_cnt < PEER_MAX_GC) + gc_stack[(*gc_cnt)++] = p; + } else if (unlikely(read_seqretry(&base->lock, seq))) { + break; } if (cmp == -1) - u = rcu_dereference(u->avl_left); + pp = &(*pp)->rb_left; else - u = rcu_dereference(u->avl_right); - if (unlikely(++count == PEER_MAXDEPTH)) - break; + pp = &(*pp)->rb_right; } + *parent_p = parent; + *pp_p = pp; return NULL; } -/* Called with local BH disabled and the pool lock held. */ -#define lookup_rightempty(start, base) \ -({ \ - struct inet_peer *u; \ - struct inet_peer __rcu **v; \ - *stackptr++ = &start->avl_left; \ - v = &start->avl_left; \ - for (u = rcu_deref_locked(*v, base); \ - u->avl_right != peer_avl_empty_rcu;) { \ - v = &u->avl_right; \ - *stackptr++ = v; \ - u = rcu_deref_locked(*v, base); \ - } \ - u; \ -}) - -/* Called with local BH disabled and the pool lock held. - * Variable names are the proof of operation correctness. - * Look into mm/map_avl.c for more detail description of the ideas. - */ -static void peer_avl_rebalance(struct inet_peer __rcu **stack[], - struct inet_peer __rcu ***stackend, - struct inet_peer_base *base) -{ - struct inet_peer __rcu **nodep; - struct inet_peer *node, *l, *r; - int lh, rh; - - while (stackend > stack) { - nodep = *--stackend; - node = rcu_deref_locked(*nodep, base); - l = rcu_deref_locked(node->avl_left, base); - r = rcu_deref_locked(node->avl_right, base); - lh = node_height(l); - rh = node_height(r); - if (lh > rh + 1) { /* l: RH+2 */ - struct inet_peer *ll, *lr, *lrl, *lrr; - int lrh; - ll = rcu_deref_locked(l->avl_left, base); - lr = rcu_deref_locked(l->avl_right, base); - lrh = node_height(lr); - if (lrh <= node_height(ll)) { /* ll: RH+1 */ - RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */ - RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ - node->avl_height = lrh + 1; /* RH+1 or RH+2 */ - RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH+1 */ - RCU_INIT_POINTER(l->avl_right, node); /* node: RH+1 or RH+2 */ - l->avl_height = node->avl_height + 1; - RCU_INIT_POINTER(*nodep, l); - } else { /* ll: RH, lr: RH+1 */ - lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */ - lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */ - RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */ - RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ - node->avl_height = rh + 1; /* node: RH+1 */ - RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH */ - RCU_INIT_POINTER(l->avl_right, lrl); /* lrl: RH or RH-1 */ - l->avl_height = rh + 1; /* l: RH+1 */ - RCU_INIT_POINTER(lr->avl_left, l); /* l: RH+1 */ - RCU_INIT_POINTER(lr->avl_right, node); /* node: RH+1 */ - lr->avl_height = rh + 2; - RCU_INIT_POINTER(*nodep, lr); - } - } else if (rh > lh + 1) { /* r: LH+2 */ - struct inet_peer *rr, *rl, *rlr, *rll; - int rlh; - rr = rcu_deref_locked(r->avl_right, base); - rl = rcu_deref_locked(r->avl_left, base); - rlh = node_height(rl); - if (rlh <= node_height(rr)) { /* rr: LH+1 */ - RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */ - RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ - node->avl_height = rlh + 1; /* LH+1 or LH+2 */ - RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH+1 */ - RCU_INIT_POINTER(r->avl_left, node); /* node: LH+1 or LH+2 */ - r->avl_height = node->avl_height + 1; - RCU_INIT_POINTER(*nodep, r); - } else { /* rr: RH, rl: RH+1 */ - rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */ - rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */ - RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */ - RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ - node->avl_height = lh + 1; /* node: LH+1 */ - RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH */ - RCU_INIT_POINTER(r->avl_left, rlr); /* rlr: LH or LH-1 */ - r->avl_height = lh + 1; /* r: LH+1 */ - RCU_INIT_POINTER(rl->avl_right, r); /* r: LH+1 */ - RCU_INIT_POINTER(rl->avl_left, node); /* node: LH+1 */ - rl->avl_height = lh + 2; - RCU_INIT_POINTER(*nodep, rl); - } - } else { - node->avl_height = (lh > rh ? lh : rh) + 1; - } - } -} - -/* Called with local BH disabled and the pool lock held. */ -#define link_to_pool(n, base) \ -do { \ - n->avl_height = 1; \ - n->avl_left = peer_avl_empty_rcu; \ - n->avl_right = peer_avl_empty_rcu; \ - /* lockless readers can catch us now */ \ - rcu_assign_pointer(**--stackptr, n); \ - peer_avl_rebalance(stack, stackptr, base); \ -} while (0) - static void inetpeer_free_rcu(struct rcu_head *head) { kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu)); } -static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base, - struct inet_peer __rcu **stack[PEER_MAXDEPTH]) -{ - struct inet_peer __rcu ***stackptr, ***delp; - - if (lookup(&p->daddr, stack, base) != p) - BUG(); - delp = stackptr - 1; /* *delp[0] == p */ - if (p->avl_left == peer_avl_empty_rcu) { - *delp[0] = p->avl_right; - --stackptr; - } else { - /* look for a node to insert instead of p */ - struct inet_peer *t; - t = lookup_rightempty(p, base); - BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t); - **--stackptr = t->avl_left; - /* t is removed, t->daddr > x->daddr for any - * x in p->avl_left subtree. - * Put t in the old place of p. */ - RCU_INIT_POINTER(*delp[0], t); - t->avl_left = p->avl_left; - t->avl_right = p->avl_right; - t->avl_height = p->avl_height; - BUG_ON(delp[1] != &p->avl_left); - delp[1] = &t->avl_left; /* was &p->avl_left */ - } - peer_avl_rebalance(stack, stackptr, base); - base->total--; - call_rcu(&p->rcu, inetpeer_free_rcu); -} - /* perform garbage collect on all items stacked during a lookup */ -static int inet_peer_gc(struct inet_peer_base *base, - struct inet_peer __rcu **stack[PEER_MAXDEPTH], - struct inet_peer __rcu ***stackptr) +static void inet_peer_gc(struct inet_peer_base *base, + struct inet_peer *gc_stack[], + unsigned int gc_cnt) { - struct inet_peer *p, *gchead = NULL; + struct inet_peer *p; __u32 delta, ttl; - int cnt = 0; + int i; if (base->total >= inet_peer_threshold) ttl = 0; /* be aggressive */ @@ -379,43 +154,38 @@ static int inet_peer_gc(struct inet_peer_base *base, ttl = inet_peer_maxttl - (inet_peer_maxttl - inet_peer_minttl) / HZ * base->total / inet_peer_threshold * HZ; - stackptr--; /* last stack slot is peer_avl_empty */ - while (stackptr > stack) { - stackptr--; - p = rcu_deref_locked(**stackptr, base); - if (refcount_read(&p->refcnt) == 1) { - smp_rmb(); - delta = (__u32)jiffies - p->dtime; - if (delta >= ttl && refcount_dec_if_one(&p->refcnt)) { - p->gc_next = gchead; - gchead = p; - } - } + for (i = 0; i < gc_cnt; i++) { + p = gc_stack[i]; + delta = (__u32)jiffies - p->dtime; + if (delta < ttl || !refcount_dec_if_one(&p->refcnt)) + gc_stack[i] = NULL; } - while ((p = gchead) != NULL) { - gchead = p->gc_next; - cnt++; - unlink_from_pool(p, base, stack); + for (i = 0; i < gc_cnt; i++) { + p = gc_stack[i]; + if (p) { + rb_erase(&p->rb_node, &base->rb_root); + base->total--; + call_rcu(&p->rcu, inetpeer_free_rcu); + } } - return cnt; } struct inet_peer *inet_getpeer(struct inet_peer_base *base, const struct inetpeer_addr *daddr, int create) { - struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; - struct inet_peer *p; - unsigned int sequence; - int invalidated, gccnt = 0; + struct inet_peer *p, *gc_stack[PEER_MAX_GC]; + struct rb_node **pp, *parent; + unsigned int gc_cnt, seq; + int invalidated; /* Attempt a lockless lookup first. * Because of a concurrent writer, we might not find an existing entry. */ rcu_read_lock(); - sequence = read_seqbegin(&base->lock); - p = lookup_rcu(daddr, base); - invalidated = read_seqretry(&base->lock, sequence); + seq = read_seqbegin(&base->lock); + p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp); + invalidated = read_seqretry(&base->lock, seq); rcu_read_unlock(); if (p) @@ -428,36 +198,31 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, /* retry an exact lookup, taking the lock before. * At least, nodes should be hot in our cache. */ + parent = NULL; write_seqlock_bh(&base->lock); -relookup: - p = lookup(daddr, stack, base); - if (p != peer_avl_empty) { - refcount_inc(&p->refcnt); - write_sequnlock_bh(&base->lock); - return p; - } - if (!gccnt) { - gccnt = inet_peer_gc(base, stack, stackptr); - if (gccnt && create) - goto relookup; - } - p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL; - if (p) { - p->daddr = *daddr; - refcount_set(&p->refcnt, 2); - atomic_set(&p->rid, 0); - p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; - p->rate_tokens = 0; - /* 60*HZ is arbitrary, but chosen enough high so that the first - * calculation of tokens is at its maximum. - */ - p->rate_last = jiffies - 60*HZ; - INIT_LIST_HEAD(&p->gc_list); - /* Link the node. */ - link_to_pool(p, base); - base->total++; + gc_cnt = 0; + p = lookup(daddr, base, seq, gc_stack, &gc_cnt, &parent, &pp); + if (!p && create) { + p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC); + if (p) { + p->daddr = *daddr; + refcount_set(&p->refcnt, 2); + atomic_set(&p->rid, 0); + p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; + p->rate_tokens = 0; + /* 60*HZ is arbitrary, but chosen enough high so that the first + * calculation of tokens is at its maximum. + */ + p->rate_last = jiffies - 60*HZ; + + rb_link_node(&p->rb_node, parent, pp); + rb_insert_color(&p->rb_node, &base->rb_root); + base->total++; + } } + if (gc_cnt) + inet_peer_gc(base, gc_stack, gc_cnt); write_sequnlock_bh(&base->lock); return p; @@ -467,8 +232,9 @@ EXPORT_SYMBOL_GPL(inet_getpeer); void inet_putpeer(struct inet_peer *p) { p->dtime = (__u32)jiffies; - smp_mb__before_atomic(); - refcount_dec(&p->refcnt); + + if (refcount_dec_and_test(&p->refcnt)) + call_rcu(&p->rcu, inetpeer_free_rcu); } EXPORT_SYMBOL_GPL(inet_putpeer); @@ -513,30 +279,16 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) } EXPORT_SYMBOL(inet_peer_xrlim_allow); -static void inetpeer_inval_rcu(struct rcu_head *head) -{ - struct inet_peer *p = container_of(head, struct inet_peer, gc_rcu); - - spin_lock_bh(&gc_lock); - list_add_tail(&p->gc_list, &gc_list); - spin_unlock_bh(&gc_lock); - - schedule_delayed_work(&gc_work, gc_delay); -} - void inetpeer_invalidate_tree(struct inet_peer_base *base) { - struct inet_peer *root; - - write_seqlock_bh(&base->lock); + struct inet_peer *p, *n; - root = rcu_deref_locked(base->root, base); - if (root != peer_avl_empty) { - base->root = peer_avl_empty_rcu; - base->total = 0; - call_rcu(&root->gc_rcu, inetpeer_inval_rcu); + rbtree_postorder_for_each_entry_safe(p, n, &base->rb_root, rb_node) { + inet_putpeer(p); + cond_resched(); } - write_sequnlock_bh(&base->lock); + base->rb_root = RB_ROOT; + base->total = 0; } EXPORT_SYMBOL(inetpeer_invalidate_tree); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 93157f2f4758..525ae88d1e58 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -86,8 +86,8 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, * NOTE: dopt cannot point to skb. */ -int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb, - const struct ip_options *sopt) +int __ip_options_echo(struct net *net, struct ip_options *dopt, + struct sk_buff *skb, const struct ip_options *sopt) { unsigned char *sptr, *dptr; int soffset, doffset; @@ -140,7 +140,7 @@ int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb, __be32 addr; memcpy(&addr, dptr+soffset-1, 4); - if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_UNICAST) { + if (inet_addr_type(net, addr) != RTN_UNICAST) { dopt->ts_needtime = 1; soffset += 8; } @@ -174,9 +174,6 @@ int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb, doffset -= 4; } if (doffset > 3) { - __be32 daddr = fib_compute_spec_dst(skb); - - memcpy(&start[doffset-1], &daddr, 4); dopt->faddr = faddr; dptr[0] = start[0]; dptr[1] = doffset+3; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e153c40c2436..73b0b15245b6 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -853,61 +853,6 @@ csum_page(struct page *page, int offset, int copy) return csum; } -static inline int ip_ufo_append_data(struct sock *sk, - struct sk_buff_head *queue, - int getfrag(void *from, char *to, int offset, int len, - int odd, struct sk_buff *skb), - void *from, int length, int hh_len, int fragheaderlen, - int transhdrlen, int maxfraglen, unsigned int flags) -{ - struct sk_buff *skb; - int err; - - /* There is support for UDP fragmentation offload by network - * device, so create one single skb packet containing complete - * udp datagram - */ - skb = skb_peek_tail(queue); - if (!skb) { - skb = sock_alloc_send_skb(sk, - hh_len + fragheaderlen + transhdrlen + 20, - (flags & MSG_DONTWAIT), &err); - - if (!skb) - return err; - - /* reserve space for Hardware header */ - skb_reserve(skb, hh_len); - - /* create space for UDP/IP header */ - skb_put(skb, fragheaderlen + transhdrlen); - - /* initialize network header pointer */ - skb_reset_network_header(skb); - - /* initialize protocol header pointer */ - skb->transport_header = skb->network_header + fragheaderlen; - - skb->csum = 0; - - if (flags & MSG_CONFIRM) - skb_set_dst_pending_confirm(skb, 1); - - __skb_queue_tail(queue, skb); - } else if (skb_is_gso(skb)) { - goto append; - } - - skb->ip_summed = CHECKSUM_PARTIAL; - /* specify the length of each IP datagram fragment */ - skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen; - skb_shinfo(skb)->gso_type = SKB_GSO_UDP; - -append: - return skb_append_datato_frags(sk, skb, getfrag, from, - (length - transhdrlen)); -} - static int __ip_append_data(struct sock *sk, struct flowi4 *fl4, struct sk_buff_head *queue, @@ -965,19 +910,6 @@ static int __ip_append_data(struct sock *sk, csummode = CHECKSUM_PARTIAL; cork->length += length; - if ((skb && skb_is_gso(skb)) || - (((length + (skb ? skb->len : fragheaderlen)) > mtu) && - (skb_queue_len(queue) <= 1) && - (sk->sk_protocol == IPPROTO_UDP) && - (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) && - (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx)) { - err = ip_ufo_append_data(sk, queue, getfrag, from, length, - hh_len, fragheaderlen, transhdrlen, - maxfraglen, flags); - if (err) - goto error; - return 0; - } /* So, what's going on in the loop below? * @@ -1288,16 +1220,6 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, if (!skb) return -EINVAL; - if ((size + skb->len > mtu) && - (skb_queue_len(&sk->sk_write_queue) == 1) && - (sk->sk_protocol == IPPROTO_UDP) && - (rt->dst.dev->features & NETIF_F_UFO)) { - if (skb->ip_summed != CHECKSUM_PARTIAL) - return -EOPNOTSUPP; - - skb_shinfo(skb)->gso_size = mtu - fragheaderlen; - skb_shinfo(skb)->gso_type = SKB_GSO_UDP; - } cork->length += size; while (size > 0) { @@ -1603,7 +1525,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, int err; int oif; - if (__ip_options_echo(&replyopts.opt.opt, skb, sopt)) + if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt)) return; ipc.addr = daddr; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index ecc4b4a2413e..dd68a9ed5e40 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -80,7 +80,8 @@ static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb) } -static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) +static void ip_cmsg_recv_retopts(struct net *net, struct msghdr *msg, + struct sk_buff *skb) { unsigned char optbuf[sizeof(struct ip_options) + 40]; struct ip_options *opt = (struct ip_options *)optbuf; @@ -88,7 +89,7 @@ static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) if (IPCB(skb)->opt.optlen == 0) return; - if (ip_options_echo(opt, skb)) { + if (ip_options_echo(net, opt, skb)) { msg->msg_flags |= MSG_CTRUNC; return; } @@ -204,7 +205,7 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk, } if (flags & IP_CMSG_RETOPTS) { - ip_cmsg_recv_retopts(msg, skb); + ip_cmsg_recv_retopts(sock_net(sk), msg, skb); flags &= ~IP_CMSG_RETOPTS; if (!flags) @@ -1227,14 +1228,7 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) pktinfo->ipi_ifindex = 0; pktinfo->ipi_spec_dst.s_addr = 0; } - /* We need to keep the dst for __ip_options_echo() - * We could restrict the test to opt.ts_needtime || opt.srr, - * but the following is good enough as IP options are not often used. - */ - if (unlikely(IPCB(skb)->opt.optlen)) - skb_dst_force(skb); - else - skb_dst_drop(skb); + skb_dst_drop(skb); } int ip_setsockopt(struct sock *sk, int level, diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 0192c255e508..5ed63d250950 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -584,33 +584,6 @@ static struct rtnl_link_ops vti_link_ops __read_mostly = { .get_link_net = ip_tunnel_get_link_net, }; -static bool is_vti_tunnel(const struct net_device *dev) -{ - return dev->netdev_ops == &vti_netdev_ops; -} - -static int vti_device_event(struct notifier_block *unused, - unsigned long event, void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct ip_tunnel *tunnel = netdev_priv(dev); - - if (!is_vti_tunnel(dev)) - return NOTIFY_DONE; - - switch (event) { - case NETDEV_DOWN: - if (!net_eq(tunnel->net, dev_net(dev))) - xfrm_garbage_collect(tunnel->net); - break; - } - return NOTIFY_DONE; -} - -static struct notifier_block vti_notifier_block __read_mostly = { - .notifier_call = vti_device_event, -}; - static int __init vti_init(void) { const char *msg; @@ -618,8 +591,6 @@ static int __init vti_init(void) pr_info("IPv4 over IPsec tunneling driver\n"); - register_netdevice_notifier(&vti_notifier_block); - msg = "tunnel device"; err = register_pernet_device(&vti_net_ops); if (err < 0) @@ -652,7 +623,6 @@ xfrm_proto_ah_failed: xfrm_proto_esp_failed: unregister_pernet_device(&vti_net_ops); pernet_dev_failed: - unregister_netdevice_notifier(&vti_notifier_block); pr_err("vti init: failed to register %s\n", msg); return err; } @@ -664,7 +634,6 @@ static void __exit vti_fini(void) xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); unregister_pernet_device(&vti_net_ops); - unregister_netdevice_notifier(&vti_notifier_block); } module_init(vti_init); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 06863ea3fc5b..c9b3e6e069ae 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -3114,14 +3114,14 @@ int __init ip_mr_init(void) } #endif rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, - ipmr_rtm_getroute, ipmr_rtm_dumproute, NULL); + ipmr_rtm_getroute, ipmr_rtm_dumproute, 0); rtnl_register(RTNL_FAMILY_IPMR, RTM_NEWROUTE, - ipmr_rtm_route, NULL, NULL); + ipmr_rtm_route, NULL, 0); rtnl_register(RTNL_FAMILY_IPMR, RTM_DELROUTE, - ipmr_rtm_route, NULL, NULL); + ipmr_rtm_route, NULL, 0); rtnl_register(RTNL_FAMILY_IPMR, RTM_GETLINK, - NULL, ipmr_rtm_dumplink, NULL); + NULL, ipmr_rtm_dumplink, 0); return 0; #ifdef CONFIG_IP_PIMSM_V2 diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 43eb6567b3a0..b6d3fe03feb3 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -206,14 +206,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST), SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS), SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS), - SNMP_MIB_ITEM("TCPPrequeued", LINUX_MIB_TCPPREQUEUED), - SNMP_MIB_ITEM("TCPDirectCopyFromBacklog", LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG), - SNMP_MIB_ITEM("TCPDirectCopyFromPrequeue", LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE), - SNMP_MIB_ITEM("TCPPrequeueDropped", LINUX_MIB_TCPPREQUEUEDROPPED), - SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS), - SNMP_MIB_ITEM("TCPHPHitsToUser", LINUX_MIB_TCPHPHITSTOUSER), SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS), - SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS), SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY), SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY), SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING), @@ -230,14 +223,12 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES), SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES), SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS), - SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS), SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS), SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS), SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES), SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY), SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL), SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL), - SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED), SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED), SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT), SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index b0bb5d0a30bd..33b70bfd1122 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -122,7 +122,8 @@ void raw_unhash_sk(struct sock *sk) EXPORT_SYMBOL_GPL(raw_unhash_sk); struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, - unsigned short num, __be32 raddr, __be32 laddr, int dif) + unsigned short num, __be32 raddr, __be32 laddr, + int dif, int sdif) { sk_for_each_from(sk) { struct inet_sock *inet = inet_sk(sk); @@ -130,7 +131,8 @@ struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, if (net_eq(sock_net(sk), net) && inet->inet_num == num && !(inet->inet_daddr && inet->inet_daddr != raddr) && !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && - !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) + !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && + sk->sk_bound_dev_if != sdif)) goto found; /* gotcha */ } sk = NULL; @@ -171,6 +173,7 @@ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb) */ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) { + int sdif = inet_sdif(skb); struct sock *sk; struct hlist_head *head; int delivered = 0; @@ -184,13 +187,13 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) net = dev_net(skb->dev); sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol, iph->saddr, iph->daddr, - skb->dev->ifindex); + skb->dev->ifindex, sdif); while (sk) { delivered = 1; if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) && ip_mc_sf_allow(sk, iph->daddr, iph->saddr, - skb->dev->ifindex)) { + skb->dev->ifindex, sdif)) { struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); /* Not releasing hash table! */ @@ -199,7 +202,7 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) } sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol, iph->saddr, iph->daddr, - skb->dev->ifindex); + skb->dev->ifindex, sdif); } out: read_unlock(&raw_v4_hashinfo.lock); @@ -297,12 +300,15 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) read_lock(&raw_v4_hashinfo.lock); raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); if (raw_sk) { + int dif = skb->dev->ifindex; + int sdif = inet_sdif(skb); + iph = (const struct iphdr *)skb->data; net = dev_net(skb->dev); while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol, iph->daddr, iph->saddr, - skb->dev->ifindex)) != NULL) { + dif, sdif)) != NULL) { raw_err(raw_sk, skb, info); raw_sk = sk_next(raw_sk); iph = (const struct iphdr *)skb->data; diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index e1a51ca68d23..c200065ef9a5 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -46,13 +46,13 @@ static struct sock *raw_lookup(struct net *net, struct sock *from, sk = __raw_v4_lookup(net, from, r->sdiag_raw_protocol, r->id.idiag_dst[0], r->id.idiag_src[0], - r->id.idiag_if); + r->id.idiag_if, 0); #if IS_ENABLED(CONFIG_IPV6) else sk = __raw_v6_lookup(net, from, r->sdiag_raw_protocol, (const struct in6_addr *)r->id.idiag_src, (const struct in6_addr *)r->id.idiag_dst, - r->id.idiag_if); + r->id.idiag_if, 0); #endif return sk; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 0383e66f59bc..2ef46294475f 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3067,7 +3067,7 @@ int __init ip_rt_init(void) xfrm_init(); xfrm4_init(); #endif - rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL); + rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, 0); #ifdef CONFIG_SYSCTL register_pernet_subsys(&sysctl_route_ops); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 03ad8778c395..b1bb1b3a1082 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -355,7 +355,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) */ - ireq->opt = tcp_v4_save_options(skb); + ireq->opt = tcp_v4_save_options(sock_net(sk), skb); if (security_inet_conn_request(sk, skb, req)) { reqsk_free(req); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 9bf809726066..0d3c038d7b04 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -45,6 +45,9 @@ static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; +/* obsolete */ +static int sysctl_tcp_low_latency __read_mostly; + /* Update system visible IP port range */ static void set_local_port_range(struct net *net, int range[2]) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 71ce33decd97..71b25567e787 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -388,6 +388,19 @@ static int retrans_to_secs(u8 retrans, int timeout, int rto_max) return period; } +static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp) +{ + u32 rate = READ_ONCE(tp->rate_delivered); + u32 intv = READ_ONCE(tp->rate_interval_us); + u64 rate64 = 0; + + if (rate && intv) { + rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC; + do_div(rate64, intv); + } + return rate64; +} + /* Address-family independent initialization for a tcp_sock. * * NOTE: A lot of things set to zero explicitly by call to @@ -400,7 +413,6 @@ void tcp_init_sock(struct sock *sk) tp->out_of_order_queue = RB_ROOT; tcp_init_xmit_timers(sk); - tcp_prequeue_init(tp); INIT_LIST_HEAD(&tp->tsq_node); icsk->icsk_rto = TCP_TIMEOUT_INIT; @@ -1034,23 +1046,29 @@ out_err: } EXPORT_SYMBOL_GPL(do_tcp_sendpages); -int tcp_sendpage(struct sock *sk, struct page *page, int offset, - size_t size, int flags) +int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, + size_t size, int flags) { - ssize_t res; - if (!(sk->sk_route_caps & NETIF_F_SG) || !sk_check_csum_caps(sk)) return sock_no_sendpage(sk->sk_socket, page, offset, size, flags); - lock_sock(sk); - tcp_rate_check_app_limited(sk); /* is sending application-limited? */ - res = do_tcp_sendpages(sk, page, offset, size, flags); + return do_tcp_sendpages(sk, page, offset, size, flags); +} + +int tcp_sendpage(struct sock *sk, struct page *page, int offset, + size_t size, int flags) +{ + int ret; + + lock_sock(sk); + ret = tcp_sendpage_locked(sk, page, offset, size, flags); release_sock(sk); - return res; + + return ret; } EXPORT_SYMBOL(tcp_sendpage); @@ -1144,9 +1162,10 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, return err; } -int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) { struct tcp_sock *tp = tcp_sk(sk); + struct ubuf_info *uarg = NULL; struct sk_buff *skb; struct sockcm_cookie sockc; int flags, err, copied = 0; @@ -1155,9 +1174,27 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) bool sg; long timeo; - lock_sock(sk); - flags = msg->msg_flags; + + if (flags & MSG_ZEROCOPY && size) { + if (sk->sk_state != TCP_ESTABLISHED) { + err = -EINVAL; + goto out_err; + } + + skb = tcp_send_head(sk) ? tcp_write_queue_tail(sk) : NULL; + uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb)); + if (!uarg) { + err = -ENOBUFS; + goto out_err; + } + + /* skb may be freed in main loop, keep extra ref on uarg */ + sock_zerocopy_get(uarg); + if (!(sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG)) + uarg->zerocopy = 0; + } + if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) { err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size); if (err == -EINPROGRESS && copied_syn > 0) @@ -1281,7 +1318,7 @@ new_segment: err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy); if (err) goto do_fault; - } else { + } else if (!uarg || !uarg->zerocopy) { bool merge = true; int i = skb_shinfo(skb)->nr_frags; struct page_frag *pfrag = sk_page_frag(sk); @@ -1319,6 +1356,13 @@ new_segment: page_ref_inc(pfrag->page); } pfrag->offset += copy; + } else { + err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); + if (err == -EMSGSIZE || err == -EEXIST) + goto new_segment; + if (err < 0) + goto do_error; + copy = err; } if (!copied) @@ -1365,7 +1409,7 @@ out: tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } out_nopush: - release_sock(sk); + sock_zerocopy_put(uarg); return copied + copied_syn; do_fault: @@ -1382,6 +1426,7 @@ do_error: if (copied + copied_syn) goto out; out_err: + sock_zerocopy_put_abort(uarg); err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && @@ -1389,9 +1434,19 @@ out_err: sk->sk_write_space(sk); tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } - release_sock(sk); return err; } + +int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + int ret; + + lock_sock(sk); + ret = tcp_sendmsg_locked(sk, msg, size); + release_sock(sk); + + return ret; +} EXPORT_SYMBOL(tcp_sendmsg); /* @@ -1525,20 +1580,6 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied) tcp_send_ack(sk); } -static void tcp_prequeue_process(struct sock *sk) -{ - struct sk_buff *skb; - struct tcp_sock *tp = tcp_sk(sk); - - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUED); - - while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) - sk_backlog_rcv(sk, skb); - - /* Clear memory counter. */ - tp->ucopy.memory = 0; -} - static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; @@ -1671,7 +1712,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int err; int target; /* Read at least this many bytes */ long timeo; - struct task_struct *user_recv = NULL; struct sk_buff *skb, *last; u32 urg_hole = 0; @@ -1806,51 +1846,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, tcp_cleanup_rbuf(sk, copied); - if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) { - /* Install new reader */ - if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) { - user_recv = current; - tp->ucopy.task = user_recv; - tp->ucopy.msg = msg; - } - - tp->ucopy.len = len; - - WARN_ON(tp->copied_seq != tp->rcv_nxt && - !(flags & (MSG_PEEK | MSG_TRUNC))); - - /* Ugly... If prequeue is not empty, we have to - * process it before releasing socket, otherwise - * order will be broken at second iteration. - * More elegant solution is required!!! - * - * Look: we have the following (pseudo)queues: - * - * 1. packets in flight - * 2. backlog - * 3. prequeue - * 4. receive_queue - * - * Each queue can be processed only if the next ones - * are empty. At this point we have empty receive_queue. - * But prequeue _can_ be not empty after 2nd iteration, - * when we jumped to start of loop because backlog - * processing added something to receive_queue. - * We cannot release_sock(), because backlog contains - * packets arrived _after_ prequeued ones. - * - * Shortly, algorithm is clear --- to process all - * the queues in order. We could make it more directly, - * requeueing packets from backlog to prequeue, if - * is not empty. It is more elegant, but eats cycles, - * unfortunately. - */ - if (!skb_queue_empty(&tp->ucopy.prequeue)) - goto do_prequeue; - - /* __ Set realtime policy in scheduler __ */ - } - if (copied >= target) { /* Do not sleep, just process backlog. */ release_sock(sk); @@ -1859,31 +1854,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, sk_wait_data(sk, &timeo, last); } - if (user_recv) { - int chunk; - - /* __ Restore normal policy in scheduler __ */ - - chunk = len - tp->ucopy.len; - if (chunk != 0) { - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); - len -= chunk; - copied += chunk; - } - - if (tp->rcv_nxt == tp->copied_seq && - !skb_queue_empty(&tp->ucopy.prequeue)) { -do_prequeue: - tcp_prequeue_process(sk); - - chunk = len - tp->ucopy.len; - if (chunk != 0) { - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); - len -= chunk; - copied += chunk; - } - } - } if ((flags & MSG_PEEK) && (peek_seq - copied - urg_hole != tp->copied_seq)) { net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n", @@ -1934,10 +1904,8 @@ do_prequeue: tcp_rcv_space_adjust(sk); skip_copy: - if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) { + if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) tp->urg_data = 0; - tcp_fast_path_check(sk); - } if (used + offset < skb->len) continue; @@ -1955,25 +1923,6 @@ skip_copy: break; } while (len > 0); - if (user_recv) { - if (!skb_queue_empty(&tp->ucopy.prequeue)) { - int chunk; - - tp->ucopy.len = copied > 0 ? len : 0; - - tcp_prequeue_process(sk); - - if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) { - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); - len -= chunk; - copied += chunk; - } - } - - tp->ucopy.task = NULL; - tp->ucopy.len = 0; - } - /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. I was just happy when found this 8) --ANK */ @@ -2823,7 +2772,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); - u32 now, intv; + u32 now; u64 rate64; bool slow; u32 rate; @@ -2922,13 +2871,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_data_segs_out = tp->data_segs_out; info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0; - rate = READ_ONCE(tp->rate_delivered); - intv = READ_ONCE(tp->rate_interval_us); - if (rate && intv) { - rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC; - do_div(rate64, intv); + rate64 = tcp_compute_delivery_rate(tp); + if (rate64) info->tcpi_delivery_rate = rate64; - } unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -2938,8 +2883,12 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *stats; struct tcp_info info; + u64 rate64; + u32 rate; - stats = alloc_skb(5 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC); + stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + + 3 * nla_total_size(sizeof(u32)) + + 2 * nla_total_size(sizeof(u8)), GFP_ATOMIC); if (!stats) return NULL; @@ -2954,6 +2903,20 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) tp->data_segs_out, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS, tp->total_retrans, TCP_NLA_PAD); + + rate = READ_ONCE(sk->sk_pacing_rate); + rate64 = rate != ~0U ? rate : ~0ULL; + nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD); + + rate64 = tcp_compute_delivery_rate(tp); + nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD); + + nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd); + nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering); + nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp)); + + nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); + nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); return stats; } diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 609965f0e298..fc3614377413 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -49,7 +49,6 @@ MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wma struct bictcp { u32 cnt; /* increase cwnd by 1 after ACKs */ u32 last_max_cwnd; /* last maximum snd_cwnd */ - u32 loss_cwnd; /* congestion window at last loss */ u32 last_cwnd; /* the last snd_cwnd */ u32 last_time; /* time when updated last_cwnd */ u32 epoch_start; /* beginning of an epoch */ @@ -72,7 +71,6 @@ static void bictcp_init(struct sock *sk) struct bictcp *ca = inet_csk_ca(sk); bictcp_reset(ca); - ca->loss_cwnd = 0; if (initial_ssthresh) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; @@ -172,22 +170,12 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk) else ca->last_max_cwnd = tp->snd_cwnd; - ca->loss_cwnd = tp->snd_cwnd; - if (tp->snd_cwnd <= low_window) return max(tp->snd_cwnd >> 1U, 2U); else return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); } -static u32 bictcp_undo_cwnd(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - const struct bictcp *ca = inet_csk_ca(sk); - - return max(tp->snd_cwnd, ca->loss_cwnd); -} - static void bictcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Loss) @@ -214,7 +202,7 @@ static struct tcp_congestion_ops bictcp __read_mostly = { .ssthresh = bictcp_recalc_ssthresh, .cong_avoid = bictcp_cong_avoid, .set_state = bictcp_state, - .undo_cwnd = bictcp_undo_cwnd, + .undo_cwnd = tcp_reno_undo_cwnd, .pkts_acked = bictcp_acked, .owner = THIS_MODULE, .name = "bic", diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index 50a0f3e51d5b..66ac69f7bd19 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -85,7 +85,6 @@ struct cdg { u8 state; u8 delack; u32 rtt_seq; - u32 undo_cwnd; u32 shadow_wnd; u16 backoff_cnt; u16 sample_cnt; @@ -330,8 +329,6 @@ static u32 tcp_cdg_ssthresh(struct sock *sk) struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); - ca->undo_cwnd = tp->snd_cwnd; - if (ca->state == CDG_BACKOFF) return max(2U, (tp->snd_cwnd * min(1024U, backoff_beta)) >> 10); @@ -344,13 +341,6 @@ static u32 tcp_cdg_ssthresh(struct sock *sk) return max(2U, tp->snd_cwnd >> 1); } -static u32 tcp_cdg_undo_cwnd(struct sock *sk) -{ - struct cdg *ca = inet_csk_ca(sk); - - return max(tcp_sk(sk)->snd_cwnd, ca->undo_cwnd); -} - static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev) { struct cdg *ca = inet_csk_ca(sk); @@ -403,7 +393,7 @@ struct tcp_congestion_ops tcp_cdg __read_mostly = { .cong_avoid = tcp_cdg_cong_avoid, .cwnd_event = tcp_cdg_cwnd_event, .pkts_acked = tcp_cdg_acked, - .undo_cwnd = tcp_cdg_undo_cwnd, + .undo_cwnd = tcp_reno_undo_cwnd, .ssthresh = tcp_cdg_ssthresh, .release = tcp_cdg_release, .init = tcp_cdg_init, diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index fde983f6376b..c2b174469645 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -456,7 +456,7 @@ u32 tcp_reno_undo_cwnd(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - return max(tp->snd_cwnd, tp->snd_ssthresh << 1); + return max(tp->snd_cwnd, tp->prior_cwnd); } EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 57ae5b5ae643..78bfadfcf342 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -83,7 +83,6 @@ MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (mse struct bictcp { u32 cnt; /* increase cwnd by 1 after ACKs */ u32 last_max_cwnd; /* last maximum snd_cwnd */ - u32 loss_cwnd; /* congestion window at last loss */ u32 last_cwnd; /* the last snd_cwnd */ u32 last_time; /* time when updated last_cwnd */ u32 bic_origin_point;/* origin point of bic function */ @@ -142,7 +141,6 @@ static void bictcp_init(struct sock *sk) struct bictcp *ca = inet_csk_ca(sk); bictcp_reset(ca); - ca->loss_cwnd = 0; if (hystart) bictcp_hystart_reset(sk); @@ -366,18 +364,9 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk) else ca->last_max_cwnd = tp->snd_cwnd; - ca->loss_cwnd = tp->snd_cwnd; - return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); } -static u32 bictcp_undo_cwnd(struct sock *sk) -{ - struct bictcp *ca = inet_csk_ca(sk); - - return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); -} - static void bictcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Loss) { @@ -470,7 +459,7 @@ static struct tcp_congestion_ops cubictcp __read_mostly = { .ssthresh = bictcp_recalc_ssthresh, .cong_avoid = bictcp_cong_avoid, .set_state = bictcp_state, - .undo_cwnd = bictcp_undo_cwnd, + .undo_cwnd = tcp_reno_undo_cwnd, .cwnd_event = bictcp_cwnd_event, .pkts_acked = bictcp_acked, .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 6d9879e93648..d1c33c91eadc 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -94,7 +94,6 @@ static const struct hstcp_aimd_val { struct hstcp { u32 ai; - u32 loss_cwnd; }; static void hstcp_init(struct sock *sk) @@ -153,22 +152,14 @@ static u32 hstcp_ssthresh(struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); struct hstcp *ca = inet_csk_ca(sk); - ca->loss_cwnd = tp->snd_cwnd; /* Do multiplicative decrease */ return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U); } -static u32 hstcp_cwnd_undo(struct sock *sk) -{ - const struct hstcp *ca = inet_csk_ca(sk); - - return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); -} - static struct tcp_congestion_ops tcp_highspeed __read_mostly = { .init = hstcp_init, .ssthresh = hstcp_ssthresh, - .undo_cwnd = hstcp_cwnd_undo, + .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = hstcp_cong_avoid, .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 3eb78cde6ff0..082d479462fa 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -66,7 +66,6 @@ static inline void htcp_reset(struct htcp *ca) static u32 htcp_cwnd_undo(struct sock *sk) { - const struct tcp_sock *tp = tcp_sk(sk); struct htcp *ca = inet_csk_ca(sk); if (ca->undo_last_cong) { @@ -76,7 +75,7 @@ static u32 htcp_cwnd_undo(struct sock *sk) ca->undo_last_cong = 0; } - return max(tp->snd_cwnd, (tp->snd_ssthresh << 7) / ca->beta); + return tcp_reno_undo_cwnd(sk); } static inline void measure_rtt(struct sock *sk, u32 srtt) diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 60352ff4f5a8..7c843578f233 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -48,7 +48,6 @@ struct illinois { u32 end_seq; /* right edge of current RTT */ u32 alpha; /* Additive increase */ u32 beta; /* Muliplicative decrease */ - u32 loss_cwnd; /* cwnd on loss */ u16 acked; /* # packets acked by current ACK */ u8 rtt_above; /* average rtt has gone above threshold */ u8 rtt_low; /* # of rtts measurements below threshold */ @@ -297,18 +296,10 @@ static u32 tcp_illinois_ssthresh(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct illinois *ca = inet_csk_ca(sk); - ca->loss_cwnd = tp->snd_cwnd; /* Multiplicative decrease */ return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U); } -static u32 tcp_illinois_cwnd_undo(struct sock *sk) -{ - const struct illinois *ca = inet_csk_ca(sk); - - return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); -} - /* Extract info for Tcp socket info provided via netlink. */ static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info) @@ -336,7 +327,7 @@ static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr, static struct tcp_congestion_ops tcp_illinois __read_mostly = { .init = tcp_illinois_init, .ssthresh = tcp_illinois_ssthresh, - .undo_cwnd = tcp_illinois_cwnd_undo, + .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = tcp_illinois_cong_avoid, .set_state = tcp_illinois_state, .get_info = tcp_illinois_info, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 53de1424c13c..d73903fe8c83 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -103,7 +103,6 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_DATA_SACKED 0x20 /* New SACK. */ #define FLAG_ECE 0x40 /* ECE in this ACK */ #define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */ -#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ @@ -1952,6 +1951,7 @@ void tcp_enter_loss(struct sock *sk) !after(tp->high_seq, tp->snd_una) || (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(sk); + tp->prior_cwnd = tp->snd_cwnd; tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tcp_ca_event(sk, CA_EVENT_LOSS); tcp_init_undo(tp); @@ -3372,12 +3372,6 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 if (tp->snd_wnd != nwin) { tp->snd_wnd = nwin; - /* Note, it is the only place, where - * fast path is recovered for sending TCP. - */ - tp->pred_flags = 0; - tcp_fast_path_check(sk); - if (tcp_send_head(sk)) tcp_slow_start_after_idle_check(sk); @@ -3559,6 +3553,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 lost = tp->lost; int acked = 0; /* Number of packets newly acked */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ + u32 ack_ev_flags = 0; sack_state.first_sackt = 0; sack_state.rate = &rs; @@ -3599,42 +3594,26 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (flag & FLAG_UPDATE_TS_RECENT) tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); - if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { - /* Window is constant, pure forward advance. - * No more checks are required. - * Note, we use the fact that SND.UNA>=SND.WL2. - */ - tcp_update_wl(tp, ack_seq); - tcp_snd_una_update(tp, ack); - flag |= FLAG_WIN_UPDATE; - - tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); - - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS); - } else { - u32 ack_ev_flags = CA_ACK_SLOWPATH; - - if (ack_seq != TCP_SKB_CB(skb)->end_seq) - flag |= FLAG_DATA; - else - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS); + if (ack_seq != TCP_SKB_CB(skb)->end_seq) + flag |= FLAG_DATA; + else + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS); - flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); + flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); - if (TCP_SKB_CB(skb)->sacked) - flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, - &sack_state); + if (TCP_SKB_CB(skb)->sacked) + flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, + &sack_state); - if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { - flag |= FLAG_ECE; - ack_ev_flags |= CA_ACK_ECE; - } + if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { + flag |= FLAG_ECE; + ack_ev_flags = CA_ACK_ECE; + } - if (flag & FLAG_WIN_UPDATE) - ack_ev_flags |= CA_ACK_WIN_UPDATE; + if (flag & FLAG_WIN_UPDATE) + ack_ev_flags |= CA_ACK_WIN_UPDATE; - tcp_in_ack_event(sk, ack_ev_flags); - } + tcp_in_ack_event(sk, ack_ev_flags); /* We passed data and got it acked, remove any soft error * log. Something worked... @@ -4402,8 +4381,6 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) return; } - /* Disable header prediction. */ - tp->pred_flags = 0; inet_csk_schedule_ack(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); @@ -4592,8 +4569,8 @@ err: static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - bool fragstolen = false; - int eaten = -1; + bool fragstolen; + int eaten; if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { __kfree_skb(skb); @@ -4615,32 +4592,13 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) goto out_of_window; /* Ok. In sequence. In window. */ - if (tp->ucopy.task == current && - tp->copied_seq == tp->rcv_nxt && tp->ucopy.len && - sock_owned_by_user(sk) && !tp->urg_data) { - int chunk = min_t(unsigned int, skb->len, - tp->ucopy.len); - - __set_current_state(TASK_RUNNING); - - if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) { - tp->ucopy.len -= chunk; - tp->copied_seq += chunk; - eaten = (chunk == skb->len); - tcp_rcv_space_adjust(sk); - } - } - - if (eaten <= 0) { queue_and_out: - if (eaten < 0) { - if (skb_queue_len(&sk->sk_receive_queue) == 0) - sk_forced_mem_schedule(sk, skb->truesize); - else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) - goto drop; - } - eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); - } + if (skb_queue_len(&sk->sk_receive_queue) == 0) + sk_forced_mem_schedule(sk, skb->truesize); + else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) + goto drop; + + eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); if (skb->len) tcp_event_data_recv(sk, skb); @@ -4660,8 +4618,6 @@ queue_and_out: if (tp->rx_opt.num_sacks) tcp_sack_remove(tp); - tcp_fast_path_check(sk); - if (eaten > 0) kfree_skb_partial(skb, fragstolen); if (!sock_flag(sk, SOCK_DEAD)) @@ -4987,7 +4943,6 @@ static int tcp_prune_queue(struct sock *sk) NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED); /* Massive buffer overcommit. */ - tp->pred_flags = 0; return -1; } @@ -5159,9 +5114,6 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) tp->urg_data = TCP_URG_NOTYET; tp->urg_seq = ptr; - - /* Disable header prediction. */ - tp->pred_flags = 0; } /* This is the 'fast' part of urgent handling. */ @@ -5190,26 +5142,6 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t } } -static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) -{ - struct tcp_sock *tp = tcp_sk(sk); - int chunk = skb->len - hlen; - int err; - - if (skb_csum_unnecessary(skb)) - err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk); - else - err = skb_copy_and_csum_datagram_msg(skb, hlen, tp->ucopy.msg); - - if (!err) { - tp->ucopy.len -= chunk; - tp->copied_seq += chunk; - tcp_rcv_space_adjust(sk); - } - - return err; -} - /* Accept RST for rcv_nxt - 1 after a FIN. * When tcp connections are abruptly terminated from Mac OSX (via ^C), a * FIN is sent followed by a RST packet. The RST is sent with the same @@ -5340,201 +5272,29 @@ discard: /* * TCP receive function for the ESTABLISHED state. - * - * It is split into a fast path and a slow path. The fast path is - * disabled when: - * - A zero window was announced from us - zero window probing - * is only handled properly in the slow path. - * - Out of order segments arrived. - * - Urgent data is expected. - * - There is no buffer space left - * - Unexpected TCP flags/window values/header lengths are received - * (detected by checking the TCP header against pred_flags) - * - Data is sent in both directions. Fast path only supports pure senders - * or pure receivers (this means either the sequence number or the ack - * value must stay constant) - * - Unexpected TCP option. - * - * When these conditions are not satisfied it drops into a standard - * receive procedure patterned after RFC793 to handle all cases. - * The first three cases are guaranteed by proper pred_flags setting, - * the rest is checked inline. Fast processing is turned on in - * tcp_data_queue when everything is OK. */ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len) + const struct tcphdr *th) { + unsigned int len = skb->len; struct tcp_sock *tp = tcp_sk(sk); tcp_mstamp_refresh(tp); if (unlikely(!sk->sk_rx_dst)) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); - /* - * Header prediction. - * The code loosely follows the one in the famous - * "30 instruction TCP receive" Van Jacobson mail. - * - * Van's trick is to deposit buffers into socket queue - * on a device interrupt, to call tcp_recv function - * on the receive process context and checksum and copy - * the buffer to user space. smart... - * - * Our current scheme is not silly either but we take the - * extra cost of the net_bh soft interrupt processing... - * We do checksum and copy also but from device to kernel. - */ tp->rx_opt.saw_tstamp = 0; - /* pred_flags is 0xS?10 << 16 + snd_wnd - * if header_prediction is to be made - * 'S' will always be tp->tcp_header_len >> 2 - * '?' will be 0 for the fast path, otherwise pred_flags is 0 to - * turn it off (when there are holes in the receive - * space for instance) - * PSH flag is ignored. - */ - - if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags && - TCP_SKB_CB(skb)->seq == tp->rcv_nxt && - !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { - int tcp_header_len = tp->tcp_header_len; - - /* Timestamp header prediction: tcp_header_len - * is automatically equal to th->doff*4 due to pred_flags - * match. - */ - - /* Check timestamp */ - if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { - /* No? Slow path! */ - if (!tcp_parse_aligned_timestamp(tp, th)) - goto slow_path; - - /* If PAWS failed, check it more carefully in slow path */ - if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0) - goto slow_path; - - /* DO NOT update ts_recent here, if checksum fails - * and timestamp was corrupted part, it will result - * in a hung connection since we will drop all - * future packets due to the PAWS test. - */ - } - - if (len <= tcp_header_len) { - /* Bulk data transfer: sender */ - if (len == tcp_header_len) { - /* Predicted packet is in window by definition. - * seq == rcv_nxt and rcv_wup <= rcv_nxt. - * Hence, check seq<=rcv_wup reduces to: - */ - if (tcp_header_len == - (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && - tp->rcv_nxt == tp->rcv_wup) - tcp_store_ts_recent(tp); - - /* We know that such packets are checksummed - * on entry. - */ - tcp_ack(sk, skb, 0); - __kfree_skb(skb); - tcp_data_snd_check(sk); - return; - } else { /* Header too small */ - TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); - goto discard; - } - } else { - int eaten = 0; - bool fragstolen = false; - - if (tp->ucopy.task == current && - tp->copied_seq == tp->rcv_nxt && - len - tcp_header_len <= tp->ucopy.len && - sock_owned_by_user(sk)) { - __set_current_state(TASK_RUNNING); - - if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) { - /* Predicted packet is in window by definition. - * seq == rcv_nxt and rcv_wup <= rcv_nxt. - * Hence, check seq<=rcv_wup reduces to: - */ - if (tcp_header_len == - (sizeof(struct tcphdr) + - TCPOLEN_TSTAMP_ALIGNED) && - tp->rcv_nxt == tp->rcv_wup) - tcp_store_ts_recent(tp); - - tcp_rcv_rtt_measure_ts(sk, skb); - - __skb_pull(skb, tcp_header_len); - tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPHPHITSTOUSER); - eaten = 1; - } - } - if (!eaten) { - if (tcp_checksum_complete(skb)) - goto csum_error; - - if ((int)skb->truesize > sk->sk_forward_alloc) - goto step5; - - /* Predicted packet is in window by definition. - * seq == rcv_nxt and rcv_wup <= rcv_nxt. - * Hence, check seq<=rcv_wup reduces to: - */ - if (tcp_header_len == - (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && - tp->rcv_nxt == tp->rcv_wup) - tcp_store_ts_recent(tp); - - tcp_rcv_rtt_measure_ts(sk, skb); - - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); - - /* Bulk data transfer: receiver */ - eaten = tcp_queue_rcv(sk, skb, tcp_header_len, - &fragstolen); - } - - tcp_event_data_recv(sk, skb); - - if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { - /* Well, only one small jumplet in fast path... */ - tcp_ack(sk, skb, FLAG_DATA); - tcp_data_snd_check(sk); - if (!inet_csk_ack_scheduled(sk)) - goto no_ack; - } - - __tcp_ack_snd_check(sk, 0); -no_ack: - if (eaten) - kfree_skb_partial(skb, fragstolen); - sk->sk_data_ready(sk); - return; - } - } - -slow_path: if (len < (th->doff << 2) || tcp_checksum_complete(skb)) goto csum_error; if (!th->ack && !th->rst && !th->syn) goto discard; - /* - * Standard slow path. - */ - if (!tcp_validate_incoming(sk, skb, th, 1)) return; -step5: - if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0) + if (tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT) < 0) goto discard; tcp_rcv_rtt_measure_ts(sk, skb); @@ -5587,12 +5347,6 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) if (sock_flag(sk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); - - if (!tp->rx_opt.snd_wscale) - __tcp_fast_path_on(tp, tp->snd_wnd); - else - tp->pred_flags = 0; - } static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, @@ -5721,7 +5475,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_ecn_rcv_synack(tp, th); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); - tcp_ack(sk, skb, FLAG_SLOWPATH); + tcp_ack(sk, skb, 0); /* Ok.. it's good. Set up sequence numbers and * move to established. @@ -5957,8 +5711,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) return 0; /* step 5: check the ACK field */ - acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH | - FLAG_UPDATE_TS_RECENT | + + acceptable = tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT | FLAG_NO_CHALLENGE_ACK) > 0; if (!acceptable) { @@ -6026,7 +5780,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tp->lsndtime = tcp_jiffies32; tcp_initialize_rcv_mss(sk); - tcp_fast_path_on(tp); break; case TCP_FIN_WAIT1: { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a20e7f03d5f7..c8784ab37852 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -85,8 +85,6 @@ #include <crypto/hash.h> #include <linux/scatterlist.h> -int sysctl_tcp_low_latency __read_mostly; - #ifdef CONFIG_TCP_MD5SIG static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, __be32 daddr, __be32 saddr, const struct tcphdr *th); @@ -385,7 +383,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, th->dest, iph->saddr, ntohs(th->source), - inet_iif(icmp_skb)); + inet_iif(icmp_skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return; @@ -661,7 +659,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, ip_hdr(skb)->saddr, th->source, ip_hdr(skb)->daddr, - ntohs(th->source), inet_iif(skb)); + ntohs(th->source), inet_iif(skb), + tcp_v4_sdif(skb)); /* don't send rst if it can't find key */ if (!sk1) goto out; @@ -1269,7 +1268,7 @@ static void tcp_v4_init_req(struct request_sock *req, sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); - ireq->opt = tcp_v4_save_options(skb); + ireq->opt = tcp_v4_save_options(sock_net(sk_listener), skb); } static struct dst_entry *tcp_v4_route_req(const struct sock *sk, @@ -1458,7 +1457,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) sk->sk_rx_dst = NULL; } } - tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len); + tcp_rcv_established(sk, skb, tcp_hdr(skb)); return 0; } @@ -1525,7 +1524,7 @@ void tcp_v4_early_demux(struct sk_buff *skb) sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, iph->saddr, th->source, iph->daddr, ntohs(th->dest), - skb->skb_iif); + skb->skb_iif, inet_sdif(skb)); if (sk) { skb->sk = sk; skb->destructor = sock_edemux; @@ -1541,61 +1540,6 @@ void tcp_v4_early_demux(struct sk_buff *skb) } } -/* Packet is added to VJ-style prequeue for processing in process - * context, if a reader task is waiting. Apparently, this exciting - * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93) - * failed somewhere. Latency? Burstiness? Well, at least now we will - * see, why it failed. 8)8) --ANK - * - */ -bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (sysctl_tcp_low_latency || !tp->ucopy.task) - return false; - - if (skb->len <= tcp_hdrlen(skb) && - skb_queue_len(&tp->ucopy.prequeue) == 0) - return false; - - /* Before escaping RCU protected region, we need to take care of skb - * dst. Prequeue is only enabled for established sockets. - * For such sockets, we might need the skb dst only to set sk->sk_rx_dst - * Instead of doing full sk_rx_dst validity here, let's perform - * an optimistic check. - */ - if (likely(sk->sk_rx_dst)) - skb_dst_drop(skb); - else - skb_dst_force_safe(skb); - - __skb_queue_tail(&tp->ucopy.prequeue, skb); - tp->ucopy.memory += skb->truesize; - if (skb_queue_len(&tp->ucopy.prequeue) >= 32 || - tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) { - struct sk_buff *skb1; - - BUG_ON(sock_owned_by_user(sk)); - __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED, - skb_queue_len(&tp->ucopy.prequeue)); - - while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) - sk_backlog_rcv(sk, skb1); - - tp->ucopy.memory = 0; - } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) { - wake_up_interruptible_sync_poll(sk_sleep(sk), - POLLIN | POLLRDNORM | POLLRDBAND); - if (!inet_csk_ack_scheduled(sk)) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - (3 * tcp_rto_min(sk)) / 4, - TCP_RTO_MAX); - } - return true; -} -EXPORT_SYMBOL(tcp_prequeue); - bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) { u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; @@ -1645,6 +1589,7 @@ EXPORT_SYMBOL(tcp_filter); int tcp_v4_rcv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); + int sdif = inet_sdif(skb); const struct iphdr *iph; const struct tcphdr *th; bool refcounted; @@ -1695,7 +1640,7 @@ int tcp_v4_rcv(struct sk_buff *skb) lookup: sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, - th->dest, &refcounted); + th->dest, sdif, &refcounted); if (!sk) goto no_tcp_socket; @@ -1770,8 +1715,7 @@ process: tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { - if (!tcp_prequeue(sk, skb)) - ret = tcp_v4_do_rcv(sk, skb); + ret = tcp_v4_do_rcv(sk, skb); } else if (tcp_add_backlog(sk, skb)) { goto discard_and_relse; } @@ -1824,7 +1768,8 @@ do_time_wait: __tcp_hdrlen(th), iph->saddr, th->source, iph->daddr, th->dest, - inet_iif(skb)); + inet_iif(skb), + sdif); if (sk2) { inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; @@ -1936,9 +1881,6 @@ void tcp_v4_destroy_sock(struct sock *sk) } #endif - /* Clean prequeue, it must be empty really */ - __skb_queue_purge(&tp->ucopy.prequeue); - /* Clean up a referenced TCP bind bucket. */ if (inet_csk(sk)->icsk_bind_hash) inet_put_port(sk); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 0ff83c1637d8..1537b87c657f 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -436,8 +436,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, struct tcp_sock *newtp = tcp_sk(newsk); /* Now setup tcp_sock */ - newtp->pred_flags = 0; - newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; newtp->segs_in = 1; @@ -445,7 +443,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; - tcp_prequeue_init(newtp); INIT_LIST_HEAD(&newtp->tsq_node); tcp_init_wl(newtp, treq->rcv_isn); diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c index 6d650ed3cb59..1ff73982e28c 100644 --- a/net/ipv4/tcp_nv.c +++ b/net/ipv4/tcp_nv.c @@ -86,7 +86,6 @@ struct tcpnv { * < 0 => less than 1 packet/RTT */ u8 available8; u16 available16; - u32 loss_cwnd; /* cwnd at last loss */ u8 nv_allow_cwnd_growth:1, /* whether cwnd can grow */ nv_reset:1, /* whether to reset values */ nv_catchup:1; /* whether we are growing because @@ -121,7 +120,6 @@ static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); ca->nv_reset = 0; - ca->loss_cwnd = 0; ca->nv_no_cong_cnt = 0; ca->nv_rtt_cnt = 0; ca->nv_last_rtt = 0; @@ -177,19 +175,10 @@ static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked) static u32 tcpnv_recalc_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - struct tcpnv *ca = inet_csk_ca(sk); - ca->loss_cwnd = tp->snd_cwnd; return max((tp->snd_cwnd * nv_loss_dec_factor) >> 10, 2U); } -static u32 tcpnv_undo_cwnd(struct sock *sk) -{ - struct tcpnv *ca = inet_csk_ca(sk); - - return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); -} - static void tcpnv_state(struct sock *sk, u8 new_state) { struct tcpnv *ca = inet_csk_ca(sk); @@ -446,7 +435,7 @@ static struct tcp_congestion_ops tcpnv __read_mostly = { .ssthresh = tcpnv_recalc_ssthresh, .cong_avoid = tcpnv_cong_avoid, .set_state = tcpnv_state, - .undo_cwnd = tcpnv_undo_cwnd, + .undo_cwnd = tcp_reno_undo_cwnd, .pkts_acked = tcpnv_acked, .get_info = tcpnv_get_info, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b7661a68d498..3e0d19631534 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -295,9 +295,7 @@ static u16 tcp_select_window(struct sock *sk) /* RFC1323 scaling applied */ new_win >>= tp->rx_opt.rcv_wscale; - /* If we advertise zero window, disable fast path. */ if (new_win == 0) { - tp->pred_flags = 0; if (old_win) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTOZEROWINDOWADV); @@ -2377,7 +2375,6 @@ bool tcp_schedule_loss_probe(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); u32 timeout, rto_delta_us; /* Don't do any loss probe on a Fast Open connection before 3WHS @@ -2398,15 +2395,19 @@ bool tcp_schedule_loss_probe(struct sock *sk) tcp_send_head(sk)) return false; - /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account + /* Probe timeout is 2*rtt. Add minimum RTO to account * for delayed ack when there's one outstanding packet. If no RTT * sample is available then probe after TCP_TIMEOUT_INIT. */ - timeout = rtt << 1 ? : TCP_TIMEOUT_INIT; - if (tp->packets_out == 1) - timeout = max_t(u32, timeout, - (rtt + (rtt >> 1) + TCP_DELACK_MAX)); - timeout = max_t(u32, timeout, msecs_to_jiffies(10)); + if (tp->srtt_us) { + timeout = usecs_to_jiffies(tp->srtt_us >> 2); + if (tp->packets_out == 1) + timeout += TCP_RTO_MIN; + else + timeout += TCP_TIMEOUT_MIN; + } else { + timeout = TCP_TIMEOUT_INIT; + } /* If the RTO formula yields an earlier time, then use that time. */ rto_delta_us = tcp_rto_delta_us(sk); /* How far in future is RTO? */ diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index f6c50af24a64..697f4c67b2e3 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -105,8 +105,9 @@ static inline int tcp_probe_avail(void) * Note: arguments must match tcp_rcv_established()! */ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len) + const struct tcphdr *th) { + unsigned int len = skb->len; const struct tcp_sock *tp = tcp_sk(sk); const struct inet_sock *inet = inet_sk(sk); @@ -145,7 +146,7 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, BUG(); } - p->length = skb->len; + p->length = len; p->snd_nxt = tp->snd_nxt; p->snd_una = tp->snd_una; p->snd_cwnd = tp->snd_cwnd; diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index fe9a493d0208..449cd914d58e 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -113,7 +113,7 @@ void tcp_rack_mark_lost(struct sock *sk) tp->rack.advanced = 0; tcp_rack_detect_loss(sk, &timeout); if (timeout) { - timeout = usecs_to_jiffies(timeout + TCP_REO_TIMEOUT_MIN); + timeout = usecs_to_jiffies(timeout) + TCP_TIMEOUT_MIN; inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT, timeout, inet_csk(sk)->icsk_rto); } diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index f2123075ce6e..addc122f8818 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -15,10 +15,6 @@ #define TCP_SCALABLE_AI_CNT 50U #define TCP_SCALABLE_MD_SCALE 3 -struct scalable { - u32 loss_cwnd; -}; - static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); @@ -36,23 +32,13 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) static u32 tcp_scalable_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - struct scalable *ca = inet_csk_ca(sk); - - ca->loss_cwnd = tp->snd_cwnd; return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); } -static u32 tcp_scalable_cwnd_undo(struct sock *sk) -{ - const struct scalable *ca = inet_csk_ca(sk); - - return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); -} - static struct tcp_congestion_ops tcp_scalable __read_mostly = { .ssthresh = tcp_scalable_ssthresh, - .undo_cwnd = tcp_scalable_cwnd_undo, + .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = tcp_scalable_cong_avoid, .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index e906014890b6..655dd8d7f064 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -239,7 +239,6 @@ static int tcp_write_timeout(struct sock *sk) /* Called with BH disabled */ void tcp_delack_timer_handler(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); sk_mem_reclaim_partial(sk); @@ -254,17 +253,6 @@ void tcp_delack_timer_handler(struct sock *sk) } icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; - if (!skb_queue_empty(&tp->ucopy.prequeue)) { - struct sk_buff *skb; - - __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED); - - while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) - sk_backlog_rcv(sk, skb); - - tp->ucopy.memory = 0; - } - if (inet_csk_ack_scheduled(sk)) { if (!icsk->icsk_ack.pingpong) { /* Delayed ACK missed: inflate ATO. */ diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 76005d4b8dfc..6fcf482d611b 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -30,7 +30,6 @@ struct veno { u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */ u32 inc; /* decide whether to increase cwnd */ u32 diff; /* calculate the diff rate */ - u32 loss_cwnd; /* cwnd when loss occured */ }; /* There are several situations when we must "re-start" Veno: @@ -194,7 +193,6 @@ static u32 tcp_veno_ssthresh(struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); struct veno *veno = inet_csk_ca(sk); - veno->loss_cwnd = tp->snd_cwnd; if (veno->diff < beta) /* in "non-congestive state", cut cwnd by 1/5 */ return max(tp->snd_cwnd * 4 / 5, 2U); @@ -203,17 +201,10 @@ static u32 tcp_veno_ssthresh(struct sock *sk) return max(tp->snd_cwnd >> 1U, 2U); } -static u32 tcp_veno_cwnd_undo(struct sock *sk) -{ - const struct veno *veno = inet_csk_ca(sk); - - return max(tcp_sk(sk)->snd_cwnd, veno->loss_cwnd); -} - static struct tcp_congestion_ops tcp_veno __read_mostly = { .init = tcp_veno_init, .ssthresh = tcp_veno_ssthresh, - .undo_cwnd = tcp_veno_cwnd_undo, + .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = tcp_veno_cong_avoid, .pkts_acked = tcp_veno_pkts_acked, .set_state = tcp_veno_state, diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index bec9cafbe3f9..e5de84310949 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -154,24 +154,6 @@ static inline void update_rtt_min(struct westwood *w) } /* - * @westwood_fast_bw - * It is called when we are in fast path. In particular it is called when - * header prediction is successful. In such case in fact update is - * straight forward and doesn't need any particular care. - */ -static inline void westwood_fast_bw(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct westwood *w = inet_csk_ca(sk); - - westwood_update_window(sk); - - w->bk += tp->snd_una - w->snd_una; - w->snd_una = tp->snd_una; - update_rtt_min(w); -} - -/* * @westwood_acked_count * This function evaluates cumul_ack for evaluating bk in case of * delayed or partial acks. @@ -223,17 +205,12 @@ static u32 tcp_westwood_bw_rttmin(const struct sock *sk) static void tcp_westwood_ack(struct sock *sk, u32 ack_flags) { - if (ack_flags & CA_ACK_SLOWPATH) { - struct westwood *w = inet_csk_ca(sk); - - westwood_update_window(sk); - w->bk += westwood_acked_count(sk); + struct westwood *w = inet_csk_ca(sk); - update_rtt_min(w); - return; - } + westwood_update_window(sk); + w->bk += westwood_acked_count(sk); - westwood_fast_bw(sk); + update_rtt_min(w); } static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index e6ff99c4bd3b..96e829b2e2fc 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -37,7 +37,6 @@ struct yeah { u32 fast_count; u32 pkts_acked; - u32 loss_cwnd; }; static void tcp_yeah_init(struct sock *sk) @@ -220,22 +219,14 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) yeah->fast_count = 0; yeah->reno_count = max(yeah->reno_count>>1, 2U); - yeah->loss_cwnd = tp->snd_cwnd; return max_t(int, tp->snd_cwnd - reduction, 2); } -static u32 tcp_yeah_cwnd_undo(struct sock *sk) -{ - const struct yeah *yeah = inet_csk_ca(sk); - - return max(tcp_sk(sk)->snd_cwnd, yeah->loss_cwnd); -} - static struct tcp_congestion_ops tcp_yeah __read_mostly = { .init = tcp_yeah_init, .ssthresh = tcp_yeah_ssthresh, - .undo_cwnd = tcp_yeah_cwnd_undo, + .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = tcp_yeah_cong_avoid, .set_state = tcp_vegas_state, .cwnd_event = tcp_vegas_cwnd_event, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index a7c804f73990..cb633884e825 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -380,8 +380,8 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum) static int compute_score(struct sock *sk, struct net *net, __be32 saddr, __be16 sport, - __be32 daddr, unsigned short hnum, int dif, - bool exact_dif) + __be32 daddr, unsigned short hnum, + int dif, int sdif, bool exact_dif) { int score; struct inet_sock *inet; @@ -413,10 +413,15 @@ static int compute_score(struct sock *sk, struct net *net, } if (sk->sk_bound_dev_if || exact_dif) { - if (sk->sk_bound_dev_if != dif) + bool dev_match = (sk->sk_bound_dev_if == dif || + sk->sk_bound_dev_if == sdif); + + if (exact_dif && !dev_match) return -1; - score += 4; + if (sk->sk_bound_dev_if && dev_match) + score += 4; } + if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; return score; @@ -436,10 +441,11 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr, /* called with rcu_read_lock() */ static struct sock *udp4_lib_lookup2(struct net *net, - __be32 saddr, __be16 sport, - __be32 daddr, unsigned int hnum, int dif, bool exact_dif, - struct udp_hslot *hslot2, - struct sk_buff *skb) + __be32 saddr, __be16 sport, + __be32 daddr, unsigned int hnum, + int dif, int sdif, bool exact_dif, + struct udp_hslot *hslot2, + struct sk_buff *skb) { struct sock *sk, *result; int score, badness, matches = 0, reuseport = 0; @@ -449,7 +455,7 @@ static struct sock *udp4_lib_lookup2(struct net *net, badness = 0; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, exact_dif); + daddr, hnum, dif, sdif, exact_dif); if (score > badness) { reuseport = sk->sk_reuseport; if (reuseport) { @@ -477,8 +483,8 @@ static struct sock *udp4_lib_lookup2(struct net *net, * harder than this. -DaveM */ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, - __be16 sport, __be32 daddr, __be16 dport, - int dif, struct udp_table *udptable, struct sk_buff *skb) + __be16 sport, __be32 daddr, __be16 dport, int dif, + int sdif, struct udp_table *udptable, struct sk_buff *skb) { struct sock *sk, *result; unsigned short hnum = ntohs(dport); @@ -496,7 +502,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, goto begin; result = udp4_lib_lookup2(net, saddr, sport, - daddr, hnum, dif, + daddr, hnum, dif, sdif, exact_dif, hslot2, skb); if (!result) { unsigned int old_slot2 = slot2; @@ -511,7 +517,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, goto begin; result = udp4_lib_lookup2(net, saddr, sport, - daddr, hnum, dif, + daddr, hnum, dif, sdif, exact_dif, hslot2, skb); } return result; @@ -521,7 +527,7 @@ begin: badness = 0; sk_for_each_rcu(sk, &hslot->head) { score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, exact_dif); + daddr, hnum, dif, sdif, exact_dif); if (score > badness) { reuseport = sk->sk_reuseport; if (reuseport) { @@ -554,7 +560,7 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport, iph->daddr, dport, inet_iif(skb), - udptable, skb); + inet_sdif(skb), udptable, skb); } struct sock *udp4_lib_lookup_skb(struct sk_buff *skb, @@ -576,7 +582,7 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, struct sock *sk; sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport, - dif, &udp_table, NULL); + dif, 0, &udp_table, NULL); if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; @@ -587,7 +593,7 @@ EXPORT_SYMBOL_GPL(udp4_lib_lookup); static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, __be16 loc_port, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, - int dif, unsigned short hnum) + int dif, int sdif, unsigned short hnum) { struct inet_sock *inet = inet_sk(sk); @@ -597,9 +603,10 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, (inet->inet_dport != rmt_port && inet->inet_dport) || (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) || ipv6_only_sock(sk) || - (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) + (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && + sk->sk_bound_dev_if != sdif)) return false; - if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif)) + if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif, sdif)) return false; return true; } @@ -628,8 +635,8 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) struct net *net = dev_net(skb->dev); sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, - iph->saddr, uh->source, skb->dev->ifindex, udptable, - NULL); + iph->saddr, uh->source, skb->dev->ifindex, 0, + udptable, NULL); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return; /* No socket for error */ @@ -1176,7 +1183,11 @@ static void udp_set_dev_scratch(struct sk_buff *skb) scratch->csum_unnecessary = !!skb_csum_unnecessary(skb); scratch->is_linear = !skb_is_nonlinear(skb); #endif - if (likely(!skb->_skb_refdst)) + /* all head states execept sp (dst, sk, nf) are always cleared by + * udp_rcv() and we need to preserve secpath, if present, to eventually + * process IP_CMSG_PASSSEC at recvmsg() time + */ + if (likely(!skb_sec_path(skb))) scratch->_tsize_state |= UDP_SKB_IS_STATELESS; } @@ -1782,13 +1793,6 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) sk_mark_napi_id_once(sk, skb); } - /* At recvmsg() time we may access skb->dst or skb->sp depending on - * the IP options and the cmsg flags, elsewhere can we clear all - * pending head states while they are hot in the cache - */ - if (likely(IPCB(skb)->opt.optlen == 0 && !skb_sec_path(skb))) - skb_release_head_state(skb); - rc = __udp_enqueue_schedule_skb(sk, skb); if (rc < 0) { int is_udplite = IS_UDPLITE(sk); @@ -1956,6 +1960,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); unsigned int offset = offsetof(typeof(*sk), sk_node); int dif = skb->dev->ifindex; + int sdif = inet_sdif(skb); struct hlist_node *node; struct sk_buff *nskb; @@ -1970,7 +1975,7 @@ start_lookup: sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) { if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr, - uh->source, saddr, dif, hnum)) + uh->source, saddr, dif, sdif, hnum)) continue; if (!first) { @@ -2160,7 +2165,7 @@ drop: static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, __be16 loc_port, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, - int dif) + int dif, int sdif) { struct sock *sk, *result; unsigned short hnum = ntohs(loc_port); @@ -2174,7 +2179,7 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, result = NULL; sk_for_each_rcu(sk, &hslot->head) { if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr, - rmt_port, rmt_addr, dif, hnum)) { + rmt_port, rmt_addr, dif, sdif, hnum)) { if (result) return NULL; result = sk; @@ -2191,7 +2196,7 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, static struct sock *__udp4_lib_demux_lookup(struct net *net, __be16 loc_port, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, - int dif) + int dif, int sdif) { unsigned short hnum = ntohs(loc_port); unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum); @@ -2203,7 +2208,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { if (INET_MATCH(sk, net, acookie, rmt_addr, - loc_addr, ports, dif)) + loc_addr, ports, dif, sdif)) return sk; /* Only check first socket in chain */ break; @@ -2219,6 +2224,7 @@ void udp_v4_early_demux(struct sk_buff *skb) struct sock *sk = NULL; struct dst_entry *dst; int dif = skb->dev->ifindex; + int sdif = inet_sdif(skb); int ours; /* validate the packet */ @@ -2244,10 +2250,11 @@ void udp_v4_early_demux(struct sk_buff *skb) } sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, - uh->source, iph->saddr, dif); + uh->source, iph->saddr, + dif, sdif); } else if (skb->pkt_type == PACKET_HOST) { sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr, - uh->source, iph->saddr, dif); + uh->source, iph->saddr, dif, sdif); } if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 4515836d2a3a..d0390d844ac8 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -45,7 +45,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, sk = __udp4_lib_lookup(net, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_dst[0], req->id.idiag_dport, - req->id.idiag_if, tbl, NULL); + req->id.idiag_if, 0, tbl, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) sk = __udp6_lib_lookup(net, @@ -53,7 +53,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, req->id.idiag_sport, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, - req->id.idiag_if, tbl, NULL); + req->id.idiag_if, 0, tbl, NULL); #endif if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; @@ -182,7 +182,7 @@ static int __udp_diag_destroy(struct sk_buff *in_skb, sk = __udp4_lib_lookup(net, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, - req->id.idiag_if, tbl, NULL); + req->id.idiag_if, 0, tbl, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) { if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && @@ -190,7 +190,7 @@ static int __udp_diag_destroy(struct sk_buff *in_skb, sk = __udp4_lib_lookup(net, req->id.idiag_dst[3], req->id.idiag_dport, req->id.idiag_src[3], req->id.idiag_sport, - req->id.idiag_if, tbl, NULL); + req->id.idiag_if, 0, tbl, NULL); else sk = __udp6_lib_lookup(net, @@ -198,7 +198,7 @@ static int __udp_diag_destroy(struct sk_buff *in_skb, req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, req->id.idiag_sport, - req->id.idiag_if, tbl, NULL); + req->id.idiag_if, 0, tbl, NULL); } #endif else { diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 0932c85b42af..97658bfc1b58 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -21,7 +21,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, __be16 new_protocol, bool is_ipv6) { int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); - bool remcsum, need_csum, offload_csum, ufo, gso_partial; + bool remcsum, need_csum, offload_csum, gso_partial; struct sk_buff *segs = ERR_PTR(-EINVAL); struct udphdr *uh = udp_hdr(skb); u16 mac_offset = skb->mac_header; @@ -61,8 +61,6 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); skb->remcsum_offload = remcsum; - ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); - need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb)); /* Try to offload checksum if possible */ offload_csum = !!(need_csum && @@ -77,7 +75,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, * outer one so strip the existing checksum feature flags and * instead set the flag based on our outer checksum offload value. */ - if (remcsum || ufo) { + if (remcsum) { features &= ~NETIF_F_CSUM_MASK; if (!need_csum || offload_csum) features |= NETIF_F_HW_CSUM; @@ -189,66 +187,16 @@ out_unlock: } EXPORT_SYMBOL(skb_udp_tunnel_segment); -static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, - netdev_features_t features) +static struct sk_buff *udp4_tunnel_segment(struct sk_buff *skb, + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); - unsigned int mss; - __wsum csum; - struct udphdr *uh; - struct iphdr *iph; if (skb->encapsulation && (skb_shinfo(skb)->gso_type & - (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) { + (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) segs = skb_udp_tunnel_segment(skb, features, false); - goto out; - } - if (!pskb_may_pull(skb, sizeof(struct udphdr))) - goto out; - - mss = skb_shinfo(skb)->gso_size; - if (unlikely(skb->len <= mss)) - goto out; - - if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { - /* Packet is from an untrusted source, reset gso_segs. */ - - skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); - - segs = NULL; - goto out; - } - - /* Do software UFO. Complete and fill in the UDP checksum as - * HW cannot do checksum of UDP packets sent as multiple - * IP fragments. - */ - - uh = udp_hdr(skb); - iph = ip_hdr(skb); - - uh->check = 0; - csum = skb_checksum(skb, 0, skb->len, 0); - uh->check = udp_v4_check(skb->len, iph->saddr, iph->daddr, csum); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; - - skb->ip_summed = CHECKSUM_UNNECESSARY; - - /* If there is no outer header we can fake a checksum offload - * due to the fact that we have already done the checksum in - * software prior to segmenting the frame. - */ - if (!skb->encap_hdr_csum) - features |= NETIF_F_HW_CSUM; - - /* Fragment the skb. IP headers of the fragments are updated in - * inet_gso_segment() - */ - segs = skb_segment(skb, features); -out: return segs; } @@ -382,7 +330,7 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload udpv4_offload = { .callbacks = { - .gso_segment = udp4_ufo_fragment, + .gso_segment = udp4_tunnel_segment, .gro_receive = udp4_gro_receive, .gro_complete = udp4_gro_complete, }, diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 58bd39fb14b4..6539ff15e9a3 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -82,7 +82,8 @@ void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, struct sock *sk = sock->sk; struct udp_tunnel_info ti; - if (!dev->netdev_ops->ndo_udp_tunnel_add) + if (!dev->netdev_ops->ndo_udp_tunnel_add || + !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) return; ti.type = type; @@ -93,6 +94,24 @@ void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, } EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port); +void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock, + unsigned short type) +{ + struct sock *sk = sock->sk; + struct udp_tunnel_info ti; + + if (!dev->netdev_ops->ndo_udp_tunnel_del || + !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + return; + + ti.type = type; + ti.sa_family = sk->sk_family; + ti.port = inet_sk(sk)->inet_sport; + + dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti); +} +EXPORT_SYMBOL_GPL(udp_tunnel_drop_rx_port); + /* Notify netdevs that UDP port started listening */ void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type) { @@ -109,6 +128,8 @@ void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type) for_each_netdev_rcu(net, dev) { if (!dev->netdev_ops->ndo_udp_tunnel_add) continue; + if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + continue; dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti); } rcu_read_unlock(); @@ -131,6 +152,8 @@ void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type) for_each_netdev_rcu(net, dev) { if (!dev->netdev_ops->ndo_udp_tunnel_del) continue; + if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + continue; dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti); } rcu_read_unlock(); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 71b4ecc195c7..4aefb149fe0a 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -213,14 +213,6 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) fl4->flowi4_tos = iph->tos; } -static inline int xfrm4_garbage_collect(struct dst_ops *ops) -{ - struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops); - - xfrm_garbage_collect_deferred(net); - return (dst_entries_get_slow(ops) > ops->gc_thresh * 2); -} - static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu) { @@ -259,14 +251,13 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, static struct dst_ops xfrm4_dst_ops_template = { .family = AF_INET, - .gc = xfrm4_garbage_collect, .update_pmtu = xfrm4_update_pmtu, .redirect = xfrm4_redirect, .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm4_dst_destroy, .ifdown = xfrm4_dst_ifdown, .local_out = __ip_local_out, - .gc_thresh = INT_MAX, + .gc_thresh = 32768, }; static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = { |