Diffstat (limited to 'net/ipv4')
45 files changed, 1155 insertions, 812 deletions
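The bulk of this batch converts IPv4 control paths (arp_ioctl(), ip_rt_ioctl(), inet_rtm_newroute()/inet_rtm_delroute(), the fib pernet init/exit hooks) from the global rtnl_lock() to the per-netns RTNL section taken with rtnl_net_lock()/rtnl_net_unlock(), so writers in unrelated namespaces no longer contend on a single lock. A minimal sketch of that pattern, assuming a handler that already has the relevant struct net in scope (example_doit(), example_cfg and do_something_under_rtnl() are placeholders for illustration, not symbols from this diff):

	/* Per-netns RTNL section: serializes writers within one netns only. */
	static int example_doit(struct net *net, struct example_cfg *cfg)
	{
		int err;

		rtnl_net_lock(net);	/* replaces rtnl_lock() */
		/* Work done here may use __in_dev_get_rtnl_net() and
		 * ASSERT_RTNL_NET(net), as the converted call sites below do.
		 */
		err = do_something_under_rtnl(net, cfg);
		rtnl_net_unlock(net);	/* replaces rtnl_unlock() */

		return err;
	}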
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 21f46ee7b6e9..5df1f1325259 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -153,7 +153,7 @@ void inet_sock_destruct(struct sock *sk) WARN_ON_ONCE(atomic_read(&sk->sk_rmem_alloc)); WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); WARN_ON_ONCE(sk->sk_wmem_queued); - WARN_ON_ONCE(sk_forward_alloc_get(sk)); + WARN_ON_ONCE(sk->sk_forward_alloc); kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 814300eee39d..a648fff71ea7 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1064,8 +1064,8 @@ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on) IPV4_DEVCONF_ALL(net, PROXY_ARP) = on; return 0; } - if (__in_dev_get_rtnl(dev)) { - IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on); + if (__in_dev_get_rtnl_net(dev)) { + IN_DEV_CONF_SET(__in_dev_get_rtnl_net(dev), PROXY_ARP, on); return 0; } return -ENXIO; @@ -1295,14 +1295,14 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) switch (cmd) { case SIOCDARP: - rtnl_lock(); + rtnl_net_lock(net); err = arp_req_delete(net, &r); - rtnl_unlock(); + rtnl_net_unlock(net); break; case SIOCSARP: - rtnl_lock(); + rtnl_net_lock(net); err = arp_req_set(net, &r); - rtnl_unlock(); + rtnl_net_unlock(net); break; case SIOCGARP: rcu_read_lock(); diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 554804774628..e01492234b0b 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -121,7 +121,7 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, BPF_CALL_2(bpf_tcp_send_ack, struct tcp_sock *, tp, u32, rcv_nxt) { /* bpf_tcp_ca prog cannot have NULL tp */ - __tcp_send_ack((struct sock *)tp, rcv_nxt); + __tcp_send_ack((struct sock *)tp, rcv_nxt, 0); return 0; } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 55b8151759bc..754f60fb6e25 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -46,6 +46,7 @@ #include <linux/notifier.h> #include <linux/inetdevice.h> #include <linux/igmp.h> +#include "igmp_internal.h" #include <linux/slab.h> #include <linux/hash.h> #ifdef CONFIG_SYSCTL @@ -107,15 +108,6 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_PROTO] = { .type = NLA_U8 }, }; -struct inet_fill_args { - u32 portid; - u32 seq; - int event; - unsigned int flags; - int netnsid; - int ifindex; -}; - #define IN4_ADDR_HSIZE_SHIFT 8 #define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT) @@ -1847,9 +1839,38 @@ static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, return 0; } -static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb, - struct netlink_callback *cb, int *s_ip_idx, - struct inet_fill_args *fillargs) +static int in_dev_dump_ifmcaddr(struct in_device *in_dev, struct sk_buff *skb, + struct netlink_callback *cb, int *s_ip_idx, + struct inet_fill_args *fillargs) +{ + struct ip_mc_list *im; + int ip_idx = 0; + int err; + + for (im = rcu_dereference(in_dev->mc_list); + im; + im = rcu_dereference(im->next_rcu)) { + if (ip_idx < *s_ip_idx) { + ip_idx++; + continue; + } + err = inet_fill_ifmcaddr(skb, in_dev->dev, im, fillargs); + if (err < 0) + goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + ip_idx++; + } + err = 0; + ip_idx = 0; +done: + *s_ip_idx = ip_idx; + return err; +} + +static int in_dev_dump_ifaddr(struct in_device *in_dev, struct sk_buff *skb, + struct netlink_callback *cb, int *s_ip_idx, + struct inet_fill_args 
*fillargs) { struct in_ifaddr *ifa; int ip_idx = 0; @@ -1875,6 +1896,21 @@ done: return err; } +static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb, + struct netlink_callback *cb, int *s_ip_idx, + struct inet_fill_args *fillargs) +{ + switch (fillargs->event) { + case RTM_NEWADDR: + return in_dev_dump_ifaddr(in_dev, skb, cb, s_ip_idx, fillargs); + case RTM_GETMULTICAST: + return in_dev_dump_ifmcaddr(in_dev, skb, cb, s_ip_idx, + fillargs); + default: + return -EINVAL; + } +} + /* Combine dev_addr_genid and dev_base_seq to detect changes. */ static u32 inet_base_seq(const struct net *net) @@ -1890,13 +1926,14 @@ static u32 inet_base_seq(const struct net *net) return res; } -static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) +static int inet_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, + int event) { const struct nlmsghdr *nlh = cb->nlh; struct inet_fill_args fillargs = { .portid = NETLINK_CB(cb->skb).portid, .seq = nlh->nlmsg_seq, - .event = RTM_NEWADDR, + .event = event, .flags = NLM_F_MULTI, .netnsid = -1, }; @@ -1950,6 +1987,16 @@ done: return err; } +static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + return inet_dump_addr(skb, cb, RTM_NEWADDR); +} + +static int inet_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + return inet_dump_addr(skb, cb, RTM_GETMULTICAST); +} + static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid) { @@ -2846,6 +2893,8 @@ static const struct rtnl_msg_handler devinet_rtnl_msg_handlers[] __initconst = { {.protocol = PF_INET, .msgtype = RTM_GETNETCONF, .doit = inet_netconf_get_devconf, .dumpit = inet_netconf_dump_devconf, .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_INET, .msgtype = RTM_GETMULTICAST, + .dumpit = inet_dump_ifmcaddr, .flags = RTNL_FLAG_DUMP_UNLOCKED}, }; void __init devinet_init(void) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 272e42d81323..3f4e629998fa 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -553,18 +553,16 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, const struct in_ifaddr *ifa; struct in_device *in_dev; - in_dev = __in_dev_get_rtnl(dev); + in_dev = __in_dev_get_rtnl_net(dev); if (!in_dev) return -ENODEV; *colon = ':'; - rcu_read_lock(); - in_dev_for_each_ifa_rcu(ifa, in_dev) { + in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev) { if (strcmp(ifa->ifa_label, devname) == 0) break; } - rcu_read_unlock(); if (!ifa) return -ENODEV; @@ -635,7 +633,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt) if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; - rtnl_lock(); + rtnl_net_lock(net); err = rtentry_to_fib_config(net, cmd, rt, &cfg); if (err == 0) { struct fib_table *tb; @@ -659,7 +657,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt) /* allocated by rtentry_to_fib_config() */ kfree(cfg.fc_mx); } - rtnl_unlock(); + rtnl_net_unlock(net); return err; } return -EINVAL; @@ -809,7 +807,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, case RTA_MULTIPATH: err = lwtunnel_valid_encap_type_attr(nla_data(attr), nla_len(attr), - extack); + extack, false); if (err < 0) goto errout; cfg->fc_mp = nla_data(attr); @@ -827,7 +825,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, case RTA_ENCAP_TYPE: cfg->fc_encap_type = nla_get_u16(attr); err = 
lwtunnel_valid_encap_type(cfg->fc_encap_type, - extack); + extack, false); if (err < 0) goto errout; break; @@ -837,19 +835,33 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, } } + if (cfg->fc_dst_len > 32) { + NL_SET_ERR_MSG(extack, "Invalid prefix length"); + err = -EINVAL; + goto errout; + } + + if (cfg->fc_dst_len < 32 && (ntohl(cfg->fc_dst) << cfg->fc_dst_len)) { + NL_SET_ERR_MSG(extack, "Invalid prefix for given prefix length"); + err = -EINVAL; + goto errout; + } + if (cfg->fc_nh_id) { if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_encap || cfg->fc_mp) { NL_SET_ERR_MSG(extack, "Nexthop specification and nexthop id are mutually exclusive"); - return -EINVAL; + err = -EINVAL; + goto errout; } } if (has_gw && has_via) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); - return -EINVAL; + err = -EINVAL; + goto errout; } if (!cfg->fc_table) @@ -872,20 +884,24 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; + rtnl_net_lock(net); + if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); err = -EINVAL; - goto errout; + goto unlock; } tb = fib_get_table(net, cfg.fc_table); if (!tb) { NL_SET_ERR_MSG(extack, "FIB table does not exist"); err = -ESRCH; - goto errout; + goto unlock; } err = fib_table_delete(net, tb, &cfg, extack); +unlock: + rtnl_net_unlock(net); errout: return err; } @@ -902,15 +918,20 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; + rtnl_net_lock(net); + tb = fib_new_table(net, cfg.fc_table); if (!tb) { err = -ENOBUFS; - goto errout; + goto unlock; } err = fib_table_insert(net, tb, &cfg, extack); if (!err && cfg.fc_type == RTN_LOCAL) net->ipv4.fib_has_custom_local_routes = true; + +unlock: + rtnl_net_unlock(net); errout: return err; } @@ -1450,7 +1471,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); - rt_cache_flush(dev_net(dev)); + rt_cache_flush(net); break; case NETDEV_DOWN: fib_del_ifaddr(ifa, NULL); @@ -1461,7 +1482,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, */ fib_disable_ip(dev, event, true); } else { - rt_cache_flush(dev_net(dev)); + rt_cache_flush(net); } break; } @@ -1575,7 +1596,7 @@ static void ip_fib_net_exit(struct net *net) { int i; - ASSERT_RTNL(); + ASSERT_RTNL_NET(net); #ifdef CONFIG_IP_MULTIPLE_TABLES RCU_INIT_POINTER(net->ipv4.fib_main, NULL); RCU_INIT_POINTER(net->ipv4.fib_default, NULL); @@ -1615,9 +1636,15 @@ static int __net_init fib_net_init(struct net *net) error = ip_fib_net_init(net); if (error < 0) goto out; + + error = fib4_semantics_init(net); + if (error) + goto out_semantics; + error = nl_fib_lookup_init(net); if (error < 0) goto out_nlfl; + error = fib_proc_init(net); if (error < 0) goto out_proc; @@ -1627,9 +1654,11 @@ out: out_proc: nl_fib_lookup_exit(net); out_nlfl: - rtnl_lock(); + fib4_semantics_exit(net); +out_semantics: + rtnl_net_lock(net); ip_fib_net_exit(net); - rtnl_unlock(); + rtnl_net_unlock(net); goto out; } @@ -1644,10 +1673,15 @@ static void __net_exit fib_net_exit_batch(struct list_head *net_list) struct net *net; rtnl_lock(); - list_for_each_entry(net, net_list, exit_list) + list_for_each_entry(net, net_list, exit_list) { + __rtnl_net_lock(net); ip_fib_net_exit(net); - + __rtnl_net_unlock(net); + } rtnl_unlock(); + + list_for_each_entry(net, net_list, 
exit_list) + fib4_semantics_exit(net); } static struct pernet_operations fib_net_ops = { @@ -1658,9 +1692,9 @@ static struct pernet_operations fib_net_ops = { static const struct rtnl_msg_handler fib_rtnl_msg_handlers[] __initconst = { {.protocol = PF_INET, .msgtype = RTM_NEWROUTE, - .doit = inet_rtm_newroute}, + .doit = inet_rtm_newroute, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_DELROUTE, - .doit = inet_rtm_delroute}, + .doit = inet_rtm_delroute, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_GETROUTE, .dumpit = inet_dump_fib, .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, }; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 9517b8667e00..fa58d6620ed6 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -37,6 +37,7 @@ struct fib4_rule { u8 dst_len; u8 src_len; dscp_t dscp; + dscp_t dscp_mask; u8 dscp_full:1; /* DSCP or TOS selector */ __be32 src; __be32 srcmask; @@ -192,7 +193,8 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule, * to mask the upper three DSCP bits prior to matching to maintain * legacy behavior. */ - if (r->dscp_full && r->dscp != inet_dsfield_to_dscp(fl4->flowi4_tos)) + if (r->dscp_full && + (r->dscp ^ inet_dsfield_to_dscp(fl4->flowi4_tos)) & r->dscp_mask) return 0; else if (!r->dscp_full && r->dscp && !fib_dscp_masked_match(r->dscp, fl4)) @@ -201,12 +203,12 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule, if (rule->ip_proto && (rule->ip_proto != fl4->flowi4_proto)) return 0; - if (fib_rule_port_range_set(&rule->sport_range) && - !fib_rule_port_inrange(&rule->sport_range, fl4->fl4_sport)) + if (!fib_rule_port_match(&rule->sport_range, rule->sport_mask, + fl4->fl4_sport)) return 0; - if (fib_rule_port_range_set(&rule->dport_range) && - !fib_rule_port_inrange(&rule->dport_range, fl4->fl4_dport)) + if (!fib_rule_port_match(&rule->dport_range, rule->dport_mask, + fl4->fl4_dport)) return 0; return 1; @@ -235,19 +237,43 @@ static int fib4_nl2rule_dscp(const struct nlattr *nla, struct fib4_rule *rule4, } rule4->dscp = inet_dsfield_to_dscp(nla_get_u8(nla) << 2); + rule4->dscp_mask = inet_dsfield_to_dscp(INET_DSCP_MASK); rule4->dscp_full = true; return 0; } +static int fib4_nl2rule_dscp_mask(const struct nlattr *nla, + struct fib4_rule *rule4, + struct netlink_ext_ack *extack) +{ + dscp_t dscp_mask; + + if (!rule4->dscp_full) { + NL_SET_ERR_MSG_ATTR(extack, nla, + "Cannot specify DSCP mask without DSCP value"); + return -EINVAL; + } + + dscp_mask = inet_dsfield_to_dscp(nla_get_u8(nla) << 2); + if (rule4->dscp & ~dscp_mask) { + NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid DSCP mask"); + return -EINVAL; + } + + rule4->dscp_mask = dscp_mask; + + return 0; +} + static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, struct netlink_ext_ack *extack) { - struct net *net = sock_net(skb->sk); + struct fib4_rule *rule4 = (struct fib4_rule *)rule; + struct net *net = rule->fr_net; int err = -EINVAL; - struct fib4_rule *rule4 = (struct fib4_rule *) rule; if (tb[FRA_FLOWLABEL] || tb[FRA_FLOWLABEL_MASK]) { NL_SET_ERR_MSG(extack, @@ -271,6 +297,10 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, fib4_nl2rule_dscp(tb[FRA_DSCP], rule4, extack) < 0) goto errout; + if (tb[FRA_DSCP_MASK] && + fib4_nl2rule_dscp_mask(tb[FRA_DSCP_MASK], rule4, extack) < 0) + goto errout; + /* split local/main if they are not already split */ err = fib_unmerge(net); if (err) @@ -366,6 +396,14 @@ 
static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, return 0; } + if (tb[FRA_DSCP_MASK]) { + dscp_t dscp_mask; + + dscp_mask = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP_MASK]) << 2); + if (!rule4->dscp_full || rule4->dscp_mask != dscp_mask) + return 0; + } + #ifdef CONFIG_IP_ROUTE_CLASSID if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW]))) return 0; @@ -391,7 +429,9 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb, if (rule4->dscp_full) { frh->tos = 0; if (nla_put_u8(skb, FRA_DSCP, - inet_dscp_to_dsfield(rule4->dscp) >> 2)) + inet_dscp_to_dsfield(rule4->dscp) >> 2) || + nla_put_u8(skb, FRA_DSCP_MASK, + inet_dscp_to_dsfield(rule4->dscp_mask) >> 2)) goto nla_put_failure; } else { frh->tos = inet_dscp_to_dsfield(rule4->dscp); @@ -418,7 +458,8 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule) return nla_total_size(4) /* dst */ + nla_total_size(4) /* src */ + nla_total_size(4) /* flow */ - + nla_total_size(1); /* dscp */ + + nla_total_size(1) /* dscp */ + + nla_total_size(1); /* dscp mask */ } static void fib4_rule_flush_cache(struct fib_rules_ops *ops) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d2cee5c314f5..f68bb9e34c34 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -50,12 +50,6 @@ #include "fib_lookup.h" -static struct hlist_head *fib_info_hash; -static struct hlist_head *fib_info_laddrhash; -static unsigned int fib_info_hash_size; -static unsigned int fib_info_hash_bits; -static unsigned int fib_info_cnt; - /* for_nexthops and change_nexthops only used when nexthop object * is not set in a fib_info. The logic within can reference fib_nh. */ @@ -258,8 +252,7 @@ void fib_release_info(struct fib_info *fi) ASSERT_RTNL(); if (fi && refcount_dec_and_test(&fi->fib_treeref)) { hlist_del(&fi->fib_hash); - - fib_info_cnt--; + fi->fib_net->ipv4.fib_info_cnt--; if (fi->fib_prefsrc) hlist_del(&fi->fib_lhash); @@ -335,11 +328,12 @@ static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope, static unsigned int fib_info_hashfn_result(const struct net *net, unsigned int val) { - return hash_32(val ^ net_hash_mix(net), fib_info_hash_bits); + return hash_32(val ^ net_hash_mix(net), net->ipv4.fib_info_hash_bits); } -static inline unsigned int fib_info_hashfn(struct fib_info *fi) +static struct hlist_head *fib_info_hash_bucket(struct fib_info *fi) { + struct net *net = fi->fib_net; unsigned int val; val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol, @@ -354,7 +348,70 @@ static inline unsigned int fib_info_hashfn(struct fib_info *fi) } endfor_nexthops(fi) } - return fib_info_hashfn_result(fi->fib_net, val); + return &net->ipv4.fib_info_hash[fib_info_hashfn_result(net, val)]; +} + +static struct hlist_head *fib_info_laddrhash_bucket(const struct net *net, + __be32 val) +{ + unsigned int hash_bits = net->ipv4.fib_info_hash_bits; + u32 slot; + + slot = hash_32(net_hash_mix(net) ^ (__force u32)val, hash_bits); + + return &net->ipv4.fib_info_hash[(1 << hash_bits) + slot]; +} + +static struct hlist_head *fib_info_hash_alloc(unsigned int hash_bits) +{ + /* The second half is used for prefsrc */ + return kvcalloc((1 << hash_bits) * 2, sizeof(struct hlist_head *), + GFP_KERNEL); +} + +static void fib_info_hash_free(struct hlist_head *head) +{ + kvfree(head); +} + +static void fib_info_hash_grow(struct net *net) +{ + unsigned int old_size = 1 << net->ipv4.fib_info_hash_bits; + struct hlist_head *new_info_hash, *old_info_hash; + unsigned int i; + + if 
(net->ipv4.fib_info_cnt < old_size) + return; + + new_info_hash = fib_info_hash_alloc(net->ipv4.fib_info_hash_bits + 1); + if (!new_info_hash) + return; + + old_info_hash = net->ipv4.fib_info_hash; + net->ipv4.fib_info_hash = new_info_hash; + net->ipv4.fib_info_hash_bits += 1; + + for (i = 0; i < old_size; i++) { + struct hlist_head *head = &old_info_hash[i]; + struct hlist_node *n; + struct fib_info *fi; + + hlist_for_each_entry_safe(fi, n, head, fib_hash) + hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi)); + } + + for (i = 0; i < old_size; i++) { + struct hlist_head *lhead = &old_info_hash[old_size + i]; + struct hlist_node *n; + struct fib_info *fi; + + hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) + hlist_add_head(&fi->fib_lhash, + fib_info_laddrhash_bucket(fi->fib_net, + fi->fib_prefsrc)); + } + + fib_info_hash_free(old_info_hash); } /* no metrics, only nexthop id */ @@ -370,13 +427,12 @@ static struct fib_info *fib_find_info_nh(struct net *net, (__force u32)cfg->fc_prefsrc, cfg->fc_priority); hash = fib_info_hashfn_result(net, hash); - head = &fib_info_hash[hash]; + head = &net->ipv4.fib_info_hash[hash]; hlist_for_each_entry(fi, head, fib_hash) { - if (!net_eq(fi->fib_net, net)) - continue; if (!fi->nh || fi->nh->id != cfg->fc_nh_id) continue; + if (cfg->fc_protocol == fi->fib_protocol && cfg->fc_scope == fi->fib_scope && cfg->fc_prefsrc == fi->fib_prefsrc && @@ -392,18 +448,13 @@ static struct fib_info *fib_find_info_nh(struct net *net, static struct fib_info *fib_find_info(struct fib_info *nfi) { - struct hlist_head *head; + struct hlist_head *head = fib_info_hash_bucket(nfi); struct fib_info *fi; - unsigned int hash; - - hash = fib_info_hashfn(nfi); - head = &fib_info_hash[hash]; hlist_for_each_entry(fi, head, fib_hash) { - if (!net_eq(fi->fib_net, nfi->fib_net)) - continue; if (fi->fib_nhs != nfi->fib_nhs) continue; + if (nfi->fib_protocol == fi->fib_protocol && nfi->fib_scope == fi->fib_scope && nfi->fib_prefsrc == fi->fib_prefsrc && @@ -1239,64 +1290,6 @@ int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, return err; } -static struct hlist_head * -fib_info_laddrhash_bucket(const struct net *net, __be32 val) -{ - u32 slot = hash_32(net_hash_mix(net) ^ (__force u32)val, - fib_info_hash_bits); - - return &fib_info_laddrhash[slot]; -} - -static void fib_info_hash_move(struct hlist_head *new_info_hash, - struct hlist_head *new_laddrhash, - unsigned int new_size) -{ - struct hlist_head *old_info_hash, *old_laddrhash; - unsigned int old_size = fib_info_hash_size; - unsigned int i; - - ASSERT_RTNL(); - old_info_hash = fib_info_hash; - old_laddrhash = fib_info_laddrhash; - fib_info_hash_size = new_size; - fib_info_hash_bits = ilog2(new_size); - - for (i = 0; i < old_size; i++) { - struct hlist_head *head = &fib_info_hash[i]; - struct hlist_node *n; - struct fib_info *fi; - - hlist_for_each_entry_safe(fi, n, head, fib_hash) { - struct hlist_head *dest; - unsigned int new_hash; - - new_hash = fib_info_hashfn(fi); - dest = &new_info_hash[new_hash]; - hlist_add_head(&fi->fib_hash, dest); - } - } - fib_info_hash = new_info_hash; - - fib_info_laddrhash = new_laddrhash; - for (i = 0; i < old_size; i++) { - struct hlist_head *lhead = &old_laddrhash[i]; - struct hlist_node *n; - struct fib_info *fi; - - hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) { - struct hlist_head *ldest; - - ldest = fib_info_laddrhash_bucket(fi->fib_net, - fi->fib_prefsrc); - hlist_add_head(&fi->fib_lhash, ldest); - } - } - - kvfree(old_info_hash); - kvfree(old_laddrhash); -} - 
__be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc, unsigned char scope) { @@ -1409,32 +1402,14 @@ struct fib_info *fib_create_info(struct fib_config *cfg, } #endif - err = -ENOBUFS; - - if (fib_info_cnt >= fib_info_hash_size) { - unsigned int new_size = fib_info_hash_size << 1; - struct hlist_head *new_info_hash; - struct hlist_head *new_laddrhash; - size_t bytes; - - if (!new_size) - new_size = 16; - bytes = (size_t)new_size * sizeof(struct hlist_head *); - new_info_hash = kvzalloc(bytes, GFP_KERNEL); - new_laddrhash = kvzalloc(bytes, GFP_KERNEL); - if (!new_info_hash || !new_laddrhash) { - kvfree(new_info_hash); - kvfree(new_laddrhash); - } else { - fib_info_hash_move(new_info_hash, new_laddrhash, new_size); - } - if (!fib_info_hash_size) - goto failure; - } + fib_info_hash_grow(net); fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL); - if (!fi) + if (!fi) { + err = -ENOBUFS; goto failure; + } + fi->fib_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack); if (IS_ERR(fi->fib_metrics)) { err = PTR_ERR(fi->fib_metrics); @@ -1571,9 +1546,9 @@ link_it: refcount_set(&fi->fib_treeref, 1); refcount_set(&fi->fib_clntref, 1); - fib_info_cnt++; - hlist_add_head(&fi->fib_hash, - &fib_info_hash[fib_info_hashfn(fi)]); + net->ipv4.fib_info_cnt++; + hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi)); + if (fi->fib_prefsrc) { struct hlist_head *head; @@ -1855,7 +1830,7 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local) struct fib_info *fi; int ret = 0; - if (!fib_info_laddrhash || local == 0) + if (!local) return 0; head = fib_info_laddrhash_bucket(net, local); @@ -2257,3 +2232,22 @@ check_saddr: fl4->saddr = inet_select_addr(l3mdev, 0, RT_SCOPE_LINK); } } + +int __net_init fib4_semantics_init(struct net *net) +{ + unsigned int hash_bits = 4; + + net->ipv4.fib_info_hash = fib_info_hash_alloc(hash_bits); + if (!net->ipv4.fib_info_hash) + return -ENOMEM; + + net->ipv4.fib_info_hash_bits = hash_bits; + net->ipv4.fib_info_cnt = 0; + + return 0; +} + +void __net_exit fib4_semantics_exit(struct net *net) +{ + fib_info_hash_free(net->ipv4.fib_info_hash); +} diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index d6411ac81096..59a6f0a9638f 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1187,22 +1187,6 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp, return 0; } -static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack) -{ - if (plen > KEYLENGTH) { - NL_SET_ERR_MSG(extack, "Invalid prefix length"); - return false; - } - - if ((plen < KEYLENGTH) && (key << plen)) { - NL_SET_ERR_MSG(extack, - "Invalid prefix for given prefix length"); - return false; - } - - return true; -} - static void fib_remove_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *old); @@ -1223,9 +1207,6 @@ int fib_table_insert(struct net *net, struct fib_table *tb, key = ntohl(cfg->fc_dst); - if (!fib_valid_key_len(key, plen, extack)) - return -EINVAL; - pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); fi = fib_create_info(cfg, extack); @@ -1717,9 +1698,6 @@ int fib_table_delete(struct net *net, struct fib_table *tb, key = ntohl(cfg->fc_dst); - if (!fib_valid_key_len(key, plen, extack)) - return -EINVAL; - l = fib_find_node(t, &tp, key); if (!l) return -ESRCH; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 5482edb5aade..717cb7d3607a 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -405,7 +405,6 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff 
*skb) struct ipcm_cookie ipc; struct flowi4 fl4; struct sock *sk; - struct inet_sock *inet; __be32 daddr, saddr; u32 mark = IP4_REPLY_MARK(net, skb->mark); int type = icmp_param->data.icmph.type; @@ -424,12 +423,11 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) sk = icmp_xmit_lock(net); if (!sk) goto out_bh_enable; - inet = inet_sk(sk); icmp_param->data.icmph.checksum = 0; ipcm_init(&ipc); - inet->tos = ip_hdr(skb)->tos; + ipc.tos = ip_hdr(skb)->tos; ipc.sockc.mark = mark; daddr = ipc.addr = ip_hdr(skb)->saddr; saddr = fib_compute_spec_dst(skb); @@ -737,8 +735,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, icmp_param.data.icmph.checksum = 0; icmp_param.skb = skb_in; icmp_param.offset = skb_network_offset(skb_in); - inet_sk(sk)->tos = tos; ipcm_init(&ipc); + ipc.tos = tos; ipc.addr = iph->saddr; ipc.opt = &icmp_param.replyopts.opt; ipc.sockc.mark = mark; @@ -1250,22 +1248,6 @@ int icmp_rcv(struct sk_buff *skb) goto reason_check; } - if (icmph->type == ICMP_EXT_ECHOREPLY) { - reason = ping_rcv(skb); - goto reason_check; - } - - /* - * 18 is the highest 'known' ICMP type. Anything else is a mystery - * - * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently - * discarded. - */ - if (icmph->type > NR_ICMP_TYPES) { - reason = SKB_DROP_REASON_UNHANDLED_PROTO; - goto error; - } - /* * Parse the ICMP message */ @@ -1292,6 +1274,23 @@ int icmp_rcv(struct sk_buff *skb) } } + if (icmph->type == ICMP_EXT_ECHOREPLY || + icmph->type == ICMP_ECHOREPLY) { + reason = ping_rcv(skb); + return reason ? NET_RX_DROP : NET_RX_SUCCESS; + } + + /* + * 18 is the highest 'known' ICMP type. Anything else is a mystery + * + * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently + * discarded. + */ + if (icmph->type > NR_ICMP_TYPES) { + reason = SKB_DROP_REASON_UNHANDLED_PROTO; + goto error; + } + reason = icmp_pointers[icmph->type].handler(skb); reason_check: if (!reason) { diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 3da126cea884..2c394c364cb9 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -81,6 +81,7 @@ #include <linux/skbuff.h> #include <linux/inetdevice.h> #include <linux/igmp.h> +#include "igmp_internal.h" #include <linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/times.h> @@ -1432,14 +1433,16 @@ static void ip_mc_hash_remove(struct in_device *in_dev, *mc_hash = im->next_hash; } -static int inet_fill_ifmcaddr(struct sk_buff *skb, struct net_device *dev, - const struct ip_mc_list *im, int event) +int inet_fill_ifmcaddr(struct sk_buff *skb, struct net_device *dev, + const struct ip_mc_list *im, + struct inet_fill_args *args) { struct ifa_cacheinfo ci; struct ifaddrmsg *ifm; struct nlmsghdr *nlh; - nlh = nlmsg_put(skb, 0, 0, event, sizeof(struct ifaddrmsg), 0); + nlh = nlmsg_put(skb, args->portid, args->seq, args->event, + sizeof(struct ifaddrmsg), args->flags); if (!nlh) return -EMSGSIZE; @@ -1468,6 +1471,9 @@ static int inet_fill_ifmcaddr(struct sk_buff *skb, struct net_device *dev, static void inet_ifmcaddr_notify(struct net_device *dev, const struct ip_mc_list *im, int event) { + struct inet_fill_args fillargs = { + .event = event, + }; struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOMEM; @@ -1479,7 +1485,7 @@ static void inet_ifmcaddr_notify(struct net_device *dev, if (!skb) goto error; - err = inet_fill_ifmcaddr(skb, dev, im, event); + err = inet_fill_ifmcaddr(skb, dev, im, &fillargs); if (err < 0) { WARN_ON_ONCE(err == -EMSGSIZE); nlmsg_free(skb); diff --git a/net/ipv4/igmp_internal.h 
b/net/ipv4/igmp_internal.h new file mode 100644 index 000000000000..0a1bcc8ec8e1 --- /dev/null +++ b/net/ipv4/igmp_internal.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _LINUX_IGMP_INTERNAL_H +#define _LINUX_IGMP_INTERNAL_H + +struct inet_fill_args { + u32 portid; + u32 seq; + int event; + unsigned int flags; + int netnsid; + int ifindex; +}; + +int inet_fill_ifmcaddr(struct sk_buff *skb, struct net_device *dev, + const struct ip_mc_list *im, + struct inet_fill_args *args); +#endif diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index e4decfb270fa..dd5cf8914a28 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -157,12 +157,10 @@ static bool inet_use_bhash2_on_bind(const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { - int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); - - if (addr_type == IPV6_ADDR_ANY) + if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) return false; - if (addr_type != IPV6_ADDR_MAPPED) + if (!ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) return true; } #endif @@ -600,7 +598,7 @@ fail_unlock: if (bhash2_created) inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, tb2); if (bhash_created) - inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); + inet_bind_bucket_destroy(tb); } if (head2_lock_acquired) spin_unlock(&head2->lock); @@ -799,18 +797,6 @@ void inet_csk_clear_xmit_timers_sync(struct sock *sk) sk_stop_timer_sync(sk, &sk->sk_timer); } -void inet_csk_delete_keepalive_timer(struct sock *sk) -{ - sk_stop_timer(sk, &sk->sk_timer); -} -EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); - -void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) -{ - sk_reset_timer(sk, &sk->sk_timer, jiffies + len); -} -EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); - struct dst_entry *inet_csk_route_req(const struct sock *sk, struct flowi4 *fl4, const struct request_sock *req) @@ -1249,39 +1235,59 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, const gfp_t priority) { struct sock *newsk = sk_clone_lock(sk, priority); + struct inet_connection_sock *newicsk; + struct inet_request_sock *ireq; + struct inet_sock *newinet; - if (newsk) { - struct inet_connection_sock *newicsk = inet_csk(newsk); + if (!newsk) + return NULL; - inet_sk_set_state(newsk, TCP_SYN_RECV); - newicsk->icsk_bind_hash = NULL; - newicsk->icsk_bind2_hash = NULL; + newicsk = inet_csk(newsk); + newinet = inet_sk(newsk); + ireq = inet_rsk(req); - inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port; - inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num; - inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num); + newicsk->icsk_bind_hash = NULL; + newicsk->icsk_bind2_hash = NULL; - /* listeners have SOCK_RCU_FREE, not the children */ - sock_reset_flag(newsk, SOCK_RCU_FREE); + newinet->inet_dport = ireq->ir_rmt_port; + newinet->inet_num = ireq->ir_num; + newinet->inet_sport = htons(ireq->ir_num); - inet_sk(newsk)->mc_list = NULL; + newsk->sk_bound_dev_if = ireq->ir_iif; - newsk->sk_mark = inet_rsk(req)->ir_mark; - atomic64_set(&newsk->sk_cookie, - atomic64_read(&inet_rsk(req)->ir_cookie)); + newsk->sk_daddr = ireq->ir_rmt_addr; + newsk->sk_rcv_saddr = ireq->ir_loc_addr; + newinet->inet_saddr = ireq->ir_loc_addr; - newicsk->icsk_retransmits = 0; - newicsk->icsk_backoff = 0; - newicsk->icsk_probes_out = 0; - newicsk->icsk_probes_tstamp = 0; +#if IS_ENABLED(CONFIG_IPV6) + newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr; + newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; +#endif - /* 
Deinitialize accept_queue to trap illegal accesses. */ - memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); + /* listeners have SOCK_RCU_FREE, not the children */ + sock_reset_flag(newsk, SOCK_RCU_FREE); - inet_clone_ulp(req, newsk, priority); + inet_sk(newsk)->mc_list = NULL; + + newsk->sk_mark = inet_rsk(req)->ir_mark; + atomic64_set(&newsk->sk_cookie, + atomic64_read(&inet_rsk(req)->ir_cookie)); + + newicsk->icsk_retransmits = 0; + newicsk->icsk_backoff = 0; + newicsk->icsk_probes_out = 0; + newicsk->icsk_probes_tstamp = 0; + + /* Deinitialize accept_queue to trap illegal accesses. */ + memset(&newicsk->icsk_accept_queue, 0, + sizeof(newicsk->icsk_accept_queue)); + + inet_sk_set_state(newsk, TCP_SYN_RECV); + + inet_clone_ulp(req, newsk, priority); + + security_inet_csk_clone(newsk, req); - security_inet_csk_clone(newsk, req); - } return newsk; } EXPORT_SYMBOL_GPL(inet_csk_clone_lock); @@ -1547,17 +1553,6 @@ skip_child_forget: } EXPORT_SYMBOL_GPL(inet_csk_listen_stop); -void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) -{ - struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; - const struct inet_sock *inet = inet_sk(sk); - - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = inet->inet_daddr; - sin->sin_port = inet->inet_dport; -} -EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); - static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl) { const struct inet_sock *inet = inet_sk(sk); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 321acc8abf17..c2bb91d9e9ff 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -282,7 +282,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct inet_diag_meminfo minfo = { .idiag_rmem = sk_rmem_alloc_get(sk), .idiag_wmem = READ_ONCE(sk->sk_wmem_queued), - .idiag_fmem = sk_forward_alloc_get(sk), + .idiag_fmem = READ_ONCE(sk->sk_forward_alloc), .idiag_tmem = sk_wmem_alloc_get(sk), }; @@ -315,12 +315,12 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, r->idiag_timer = 1; r->idiag_retrans = icsk->icsk_retransmits; r->idiag_expires = - jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies); + jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies); } else if (icsk_pending == ICSK_TIME_PROBE0) { r->idiag_timer = 4; r->idiag_retrans = icsk->icsk_probes_out; r->idiag_expires = - jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies); + jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies); } else if (timer_pending(&sk->sk_timer)) { r->idiag_timer = 2; r->idiag_retrans = icsk->icsk_probes_out; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index d179a2c84222..19fae4811ab2 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -145,8 +145,7 @@ static void inet_frags_free_cb(void *ptr, void *arg) } spin_unlock_bh(&fq->lock); - if (refcount_sub_and_test(count, &fq->refcnt)) - inet_frag_destroy(fq); + inet_frag_putn(fq, count); } static LLIST_HEAD(fqdir_free_list); @@ -226,10 +225,10 @@ void fqdir_exit(struct fqdir *fqdir) } EXPORT_SYMBOL(fqdir_exit); -void inet_frag_kill(struct inet_frag_queue *fq) +void inet_frag_kill(struct inet_frag_queue *fq, int *refs) { if (del_timer(&fq->timer)) - refcount_dec(&fq->refcnt); + (*refs)++; if (!(fq->flags & INET_FRAG_COMPLETE)) { struct fqdir *fqdir = fq->fqdir; @@ -244,7 +243,7 @@ void inet_frag_kill(struct inet_frag_queue *fq) if (!READ_ONCE(fqdir->dead)) { rhashtable_remove_fast(&fqdir->rhashtable, &fq->node, fqdir->f->rhash_params); - 
refcount_dec(&fq->refcnt); + (*refs)++; } else { fq->flags |= INET_FRAG_HASH_DEAD; } @@ -328,7 +327,8 @@ static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir, timer_setup(&q->timer, f->frag_expire, 0); spin_lock_init(&q->lock); - refcount_set(&q->refcnt, 3); + /* One reference for the timer, one for the hash table. */ + refcount_set(&q->refcnt, 2); return q; } @@ -350,15 +350,20 @@ static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir, *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key, &q->node, f->rhash_params); if (*prev) { + /* We could not insert in the hash table, + * we need to cancel what inet_frag_alloc() + * anticipated. + */ + int refs = 1; + q->flags |= INET_FRAG_COMPLETE; - inet_frag_kill(q); - inet_frag_destroy(q); + inet_frag_kill(q, &refs); + inet_frag_putn(q, refs); return NULL; } return q; } -/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key) { /* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */ @@ -368,17 +373,11 @@ struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key) if (!high_thresh || frag_mem_limit(fqdir) > high_thresh) return NULL; - rcu_read_lock(); - prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params); if (!prev) fq = inet_frag_create(fqdir, key, &prev); - if (!IS_ERR_OR_NULL(prev)) { + if (!IS_ERR_OR_NULL(prev)) fq = prev; - if (!refcount_inc_not_zero(&fq->refcnt)) - fq = NULL; - } - rcu_read_unlock(); return fq; } EXPORT_SYMBOL(inet_frag_find); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 9bfcfd016e18..5bf163f756e9 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -35,8 +35,8 @@ u32 inet_ehashfn(const struct net *net, const __be32 laddr, { net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret)); - return __inet_ehashfn(laddr, lport, faddr, fport, - inet_ehash_secret + net_hash_mix(net)); + return lport + __inet_ehashfn(laddr, 0, faddr, fport, + inet_ehash_secret + net_hash_mix(net)); } EXPORT_SYMBOL_GPL(inet_ehashfn); @@ -76,7 +76,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, tb->fastreuse = 0; tb->fastreuseport = 0; INIT_HLIST_HEAD(&tb->bhash2); - hlist_add_head(&tb->node, &head->chain); + hlist_add_head_rcu(&tb->node, &head->chain); } return tb; } @@ -84,11 +84,11 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, /* * Caller must hold hashbucket lock for this tb with local BH disabled */ -void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb) +void inet_bind_bucket_destroy(struct inet_bind_bucket *tb) { if (hlist_empty(&tb->bhash2)) { - __hlist_del(&tb->node); - kmem_cache_free(cachep, tb); + hlist_del_rcu(&tb->node); + kfree_rcu(tb, rcu); } } @@ -201,7 +201,7 @@ static void __inet_put_port(struct sock *sk) } spin_unlock(&head2->lock); - inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + inet_bind_bucket_destroy(tb); spin_unlock(&head->lock); } @@ -285,7 +285,7 @@ bhash2_find: error: if (created_inet_bind_bucket) - inet_bind_bucket_destroy(table->bind_bucket_cachep, tb); + inet_bind_bucket_destroy(tb); spin_unlock(&head2->lock); spin_unlock(&head->lock); return -ENOMEM; @@ -537,7 +537,9 @@ EXPORT_SYMBOL_GPL(__inet_lookup_established); /* called with local bh disabled */ static int __inet_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, __u16 lport, - struct 
inet_timewait_sock **twp) + struct inet_timewait_sock **twp, + bool rcu_lookup, + u32 hash) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); @@ -548,14 +550,25 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, int sdif = l3mdev_master_ifindex_by_index(net, dif); INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); - unsigned int hash = inet_ehashfn(net, daddr, lport, - saddr, inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); - spinlock_t *lock = inet_ehash_lockp(hinfo, hash); - struct sock *sk2; - const struct hlist_nulls_node *node; struct inet_timewait_sock *tw = NULL; + const struct hlist_nulls_node *node; + struct sock *sk2; + spinlock_t *lock; + if (rcu_lookup) { + sk_nulls_for_each(sk2, node, &head->chain) { + if (sk2->sk_hash != hash || + !inet_match(net, sk2, acookie, ports, dif, sdif)) + continue; + if (sk2->sk_state == TCP_TIME_WAIT) + break; + return -EADDRNOTAVAIL; + } + return 0; + } + + lock = inet_ehash_lockp(hinfo, hash); spin_lock(lock); sk_nulls_for_each(sk2, node, &head->chain) { @@ -993,8 +1006,10 @@ static u32 *table_perturb; int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset, + u32 hash_port0, int (*check_established)(struct inet_timewait_death_row *, - struct sock *, __u16, struct inet_timewait_sock **)) + struct sock *, __u16, struct inet_timewait_sock **, + bool rcu_lookup, u32 hash)) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_bind_hashbucket *head, *head2; @@ -1012,7 +1027,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, if (port) { local_bh_disable(); - ret = check_established(death_row, sk, port, NULL); + ret = check_established(death_row, sk, port, NULL, false, + hash_port0 + port); local_bh_enable(); return ret; } @@ -1048,6 +1064,22 @@ other_parity_scan: continue; head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; + rcu_read_lock(); + hlist_for_each_entry_rcu(tb, &head->chain, node) { + if (!inet_bind_bucket_match(tb, net, port, l3mdev)) + continue; + if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) { + rcu_read_unlock(); + goto next_port; + } + if (!check_established(death_row, sk, port, &tw, true, + hash_port0 + port)) + break; + rcu_read_unlock(); + goto next_port; + } + rcu_read_unlock(); + spin_lock_bh(&head->lock); /* Does not bother with rcv_saddr checks, because @@ -1057,12 +1089,13 @@ other_parity_scan: if (inet_bind_bucket_match(tb, net, port, l3mdev)) { if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) - goto next_port; + goto next_port_unlock; WARN_ON(hlist_empty(&tb->bhash2)); if (!check_established(death_row, sk, - port, &tw)) + port, &tw, false, + hash_port0 + port)) goto ok; - goto next_port; + goto next_port_unlock; } } @@ -1076,8 +1109,9 @@ other_parity_scan: tb->fastreuse = -1; tb->fastreuseport = -1; goto ok; -next_port: +next_port_unlock: spin_unlock_bh(&head->lock); +next_port: cond_resched(); } @@ -1149,7 +1183,7 @@ error: spin_unlock(&head2->lock); if (tb_created) - inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); + inet_bind_bucket_destroy(tb); spin_unlock(&head->lock); if (tw) @@ -1166,11 +1200,18 @@ error: int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { + const struct inet_sock *inet = inet_sk(sk); + const struct net *net = sock_net(sk); u64 port_offset = 0; + u32 hash_port0; if (!inet_sk(sk)->inet_num) port_offset = 
inet_sk_port_offset(sk); - return __inet_hash_connect(death_row, sk, port_offset, + + hash_port0 = inet_ehashfn(net, inet->inet_rcv_saddr, 0, + inet->inet_daddr, inet->inet_dport); + + return __inet_hash_connect(death_row, sk, port_offset, hash_port0, __inet_check_established); } EXPORT_SYMBOL_GPL(inet_hash_connect); @@ -1230,22 +1271,37 @@ int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { unsigned int locksz = sizeof(spinlock_t); unsigned int i, nblocks = 1; + spinlock_t *ptr = NULL; - if (locksz != 0) { - /* allocate 2 cache lines or at least one spinlock per cpu */ - nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U); - nblocks = roundup_pow_of_two(nblocks * num_possible_cpus()); + if (locksz == 0) + goto set_mask; - /* no more locks than number of hash buckets */ - nblocks = min(nblocks, hashinfo->ehash_mask + 1); + /* Allocate 2 cache lines or at least one spinlock per cpu. */ + nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U) * num_possible_cpus(); - hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL); - if (!hashinfo->ehash_locks) - return -ENOMEM; + /* At least one page per NUMA node. */ + nblocks = max(nblocks, num_online_nodes() * PAGE_SIZE / locksz); + + nblocks = roundup_pow_of_two(nblocks); + + /* No more locks than number of hash buckets. */ + nblocks = min(nblocks, hashinfo->ehash_mask + 1); - for (i = 0; i < nblocks; i++) - spin_lock_init(&hashinfo->ehash_locks[i]); + if (num_online_nodes() > 1) { + /* Use vmalloc() to allow NUMA policy to spread pages + * on all available nodes if desired. + */ + ptr = vmalloc_array(nblocks, locksz); + } + if (!ptr) { + ptr = kvmalloc_array(nblocks, locksz, GFP_KERNEL); + if (!ptr) + return -ENOMEM; } + for (i = 0; i < nblocks; i++) + spin_lock_init(&ptr[i]); + hashinfo->ehash_locks = ptr; +set_mask: hashinfo->ehash_locks_mask = nblocks - 1; return 0; } diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 337390ba85b4..aded4bf1bc16 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -39,7 +39,7 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, tw->tw_tb = NULL; tw->tw_tb2 = NULL; inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); - inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + inet_bind_bucket_destroy(tb); __sock_put((struct sock *)tw); } diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index b8b23a77ceb4..7b1e0a2d6906 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -60,7 +60,7 @@ void inet_peer_base_init(struct inet_peer_base *bp) seqlock_init(&bp->lock); bp->total = 0; } -EXPORT_SYMBOL_GPL(inet_peer_base_init); +EXPORT_IPV6_MOD_GPL(inet_peer_base_init); #define PEER_MAX_GC 32 @@ -218,7 +218,7 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, return p; } -EXPORT_SYMBOL_GPL(inet_getpeer); +EXPORT_IPV6_MOD_GPL(inet_getpeer); void inet_putpeer(struct inet_peer *p) { @@ -269,7 +269,7 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) WRITE_ONCE(peer->rate_tokens, token); return rc; } -EXPORT_SYMBOL(inet_peer_xrlim_allow); +EXPORT_IPV6_MOD(inet_peer_xrlim_allow); void inetpeer_invalidate_tree(struct inet_peer_base *base) { @@ -286,4 +286,4 @@ void inetpeer_invalidate_tree(struct inet_peer_base *base) base->total = 0; } -EXPORT_SYMBOL(inetpeer_invalidate_tree); +EXPORT_IPV6_MOD(inetpeer_invalidate_tree); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 7a435746a22d..77f395b28ec7 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -76,7 +76,8 @@ 
static u8 ip4_frag_ecn(u8 tos) static struct inet_frags ip4_frags; static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev); + struct sk_buff *prev_tail, struct net_device *dev, + int *refs); static void ip4_frag_init(struct inet_frag_queue *q, const void *a) @@ -107,22 +108,6 @@ static void ip4_frag_free(struct inet_frag_queue *q) inet_putpeer(qp->peer); } - -/* Destruction primitives. */ - -static void ipq_put(struct ipq *ipq) -{ - inet_frag_put(&ipq->q); -} - -/* Kill ipq entry. It is not destroyed immediately, - * because caller (and someone more) holds reference count. - */ -static void ipq_kill(struct ipq *ipq) -{ - inet_frag_kill(&ipq->q); -} - static bool frag_expire_skip_icmp(u32 user) { return user == IP_DEFRAG_AF_PACKET || @@ -143,6 +128,7 @@ static void ip_expire(struct timer_list *t) struct sk_buff *head = NULL; struct net *net; struct ipq *qp; + int refs = 1; qp = container_of(frag, struct ipq, q); net = qp->q.fqdir->net; @@ -159,7 +145,7 @@ static void ip_expire(struct timer_list *t) goto out; qp->q.flags |= INET_FRAG_DROP; - ipq_kill(qp); + inet_frag_kill(&qp->q, &refs); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); @@ -202,7 +188,7 @@ out: out_rcu_unlock: rcu_read_unlock(); kfree_skb_reason(head, reason); - ipq_put(qp); + inet_frag_putn(&qp->q, refs); } /* Find the correct entry in the "incomplete datagrams" queue for @@ -278,7 +264,7 @@ static int ip_frag_reinit(struct ipq *qp) } /* Add new segment to existing queue. */ -static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) +static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb, int *refs) { struct net *net = qp->q.fqdir->net; int ihl, end, flags, offset; @@ -298,7 +284,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && unlikely(ip_frag_too_far(qp)) && unlikely(err = ip_frag_reinit(qp))) { - ipq_kill(qp); + inet_frag_kill(&qp->q, refs); goto err; } @@ -382,10 +368,10 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - err = ip_frag_reasm(qp, skb, prev_tail, dev); + err = ip_frag_reasm(qp, skb, prev_tail, dev, refs); skb->_skb_refdst = orefdst; if (err) - inet_frag_kill(&qp->q); + inet_frag_kill(&qp->q, refs); return err; } @@ -402,7 +388,7 @@ insert_error: err = -EINVAL; __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); discard_qp: - inet_frag_kill(&qp->q); + inet_frag_kill(&qp->q, refs); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); err: kfree_skb_reason(skb, reason); @@ -416,7 +402,8 @@ static bool ip_frag_coalesce_ok(const struct ipq *qp) /* Build a new IP datagram from all its fragments. 
*/ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev) + struct sk_buff *prev_tail, struct net_device *dev, + int *refs) { struct net *net = qp->q.fqdir->net; struct iphdr *iph; @@ -424,7 +411,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, int len, err; u8 ecn; - ipq_kill(qp); + inet_frag_kill(&qp->q, refs); ecn = ip_frag_ecn_table[qp->ecn]; if (unlikely(ecn == 0xff)) { @@ -496,18 +483,21 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) __IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS); /* Lookup (or create) queue header */ + rcu_read_lock(); qp = ip_find(net, ip_hdr(skb), user, vif); if (qp) { - int ret; + int ret, refs = 0; spin_lock(&qp->q.lock); - ret = ip_frag_queue(qp, skb); + ret = ip_frag_queue(qp, skb, &refs); spin_unlock(&qp->q.lock); - ipq_put(qp); + rcu_read_unlock(); + inet_frag_putn(&qp->q, refs); return ret; } + rcu_read_unlock(); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); kfree_skb(skb); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index ed1b6b44faf8..26d15f907551 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -141,7 +141,6 @@ static int ipgre_err(struct sk_buff *skb, u32 info, const struct iphdr *iph; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; - unsigned int data_len = 0; struct ip_tunnel *t; if (tpi->proto == htons(ETH_P_TEB)) @@ -182,7 +181,6 @@ static int ipgre_err(struct sk_buff *skb, u32 info, case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) return 0; - data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ break; case ICMP_REDIRECT: @@ -190,10 +188,16 @@ static int ipgre_err(struct sk_buff *skb, u32 info, } #if IS_ENABLED(CONFIG_IPV6) - if (tpi->proto == htons(ETH_P_IPV6) && - !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, - type, data_len)) - return 0; + if (tpi->proto == htons(ETH_P_IPV6)) { + unsigned int data_len = 0; + + if (type == ICMP_TIME_EXCEEDED) + data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ + + if (!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, + type, data_len)) + return 0; + } #endif if (t->parms.iph.daddr == 0 || @@ -1392,10 +1396,12 @@ ipgre_newlink_encap_setup(struct net_device *dev, struct nlattr *data[]) return 0; } -static int ipgre_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ipgre_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip_tunnel_parm_kern p; __u32 fwmark = 0; int err; @@ -1407,13 +1413,16 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark); if (err < 0) return err; - return ip_tunnel_newlink(dev, tb, &p, fwmark); + return ip_tunnel_newlink(params->link_net ? 
: dev_net(dev), dev, tb, &p, + fwmark); } -static int erspan_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int erspan_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip_tunnel_parm_kern p; __u32 fwmark = 0; int err; @@ -1425,7 +1434,8 @@ static int erspan_newlink(struct net *src_net, struct net_device *dev, err = erspan_netlink_parms(dev, data, tb, &p, &fwmark); if (err) return err; - return ip_tunnel_newlink(dev, tb, &p, fwmark); + return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &p, + fwmark); } static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], @@ -1693,6 +1703,7 @@ static struct rtnl_link_ops erspan_link_ops __read_mostly = { struct net_device *gretap_fb_dev_create(struct net *net, const char *name, u8 name_assign_type) { + struct rtnl_newlink_params params = { .src_net = net }; struct nlattr *tb[IFLA_MAX + 1]; struct net_device *dev; LIST_HEAD(list_kill); @@ -1700,6 +1711,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, int err; memset(&tb, 0, sizeof(tb)); + params.tb = tb; dev = rtnl_create_link(net, name, name_assign_type, &ipgre_tap_ops, tb, NULL); @@ -1710,7 +1722,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, t = netdev_priv(dev); t->collect_md = true; - err = ipgre_newlink(net, dev, tb, NULL, NULL); + err = ipgre_newlink(dev, ¶ms, NULL); if (err < 0) { free_netdev(dev); return ERR_PTR(err); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index ea7a260bec8a..6e18d7ec5062 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -75,7 +75,6 @@ #include <net/checksum.h> #include <net/gso.h> #include <net/inetpeer.h> -#include <net/inet_ecn.h> #include <net/lwtunnel.h> #include <net/inet_dscp.h> #include <linux/bpf-cgroup.h> @@ -1640,7 +1639,7 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, if (IS_ERR(rt)) return; - inet_sk(sk)->tos = arg->tos & ~INET_ECN_MASK; + inet_sk(sk)->tos = arg->tos; sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 09b73acf037a..1024f961ec9a 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -40,6 +40,7 @@ #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/netdev_lock.h> #include <net/rtnetlink.h> #include <net/udp.h> #include <net/dst_metadata.h> @@ -1162,7 +1163,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, * Allowing to move it to another netns is clearly unsafe. 
*/ if (!IS_ERR(itn->fb_tunnel_dev)) { - itn->fb_tunnel_dev->netns_local = true; + itn->fb_tunnel_dev->netns_immutable = true; itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev); ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev)); itn->type = itn->fb_tunnel_dev->type; @@ -1213,11 +1214,11 @@ void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, } EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets); -int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm_kern *p, __u32 fwmark) +int ip_tunnel_newlink(struct net *net, struct net_device *dev, + struct nlattr *tb[], struct ip_tunnel_parm_kern *p, + __u32 fwmark) { struct ip_tunnel *nt; - struct net *net = dev_net(dev); struct ip_tunnel_net *itn; int mtu; int err; @@ -1326,7 +1327,6 @@ int ip_tunnel_init(struct net_device *dev) } tunnel->dev = dev; - tunnel->net = dev_net(dev); strscpy(tunnel->parms.name, dev->name); iph->version = 4; iph->ihl = 5; diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index f0b4419cef34..159b4473290e 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -575,15 +575,18 @@ static void vti_netlink_parms(struct nlattr *data[], *fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]); } -static int vti_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int vti_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; struct ip_tunnel_parm_kern parms; + struct nlattr **tb = params->tb; __u32 fwmark = 0; vti_netlink_parms(data, &parms, &fwmark); - return ip_tunnel_newlink(dev, tb, &parms, fwmark); + return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, + &parms, fwmark); } static int vti_changelink(struct net_device *dev, struct nlattr *tb[], diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index dc0db5895e0e..bab0bf90c908 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -436,11 +436,13 @@ static void ipip_netlink_parms(struct nlattr *data[], *fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } -static int ipip_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ipip_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip_tunnel_encap ipencap; struct ip_tunnel_parm_kern p; __u32 fwmark = 0; @@ -453,7 +455,8 @@ static int ipip_newlink(struct net *src_net, struct net_device *dev, } ipip_netlink_parms(data, &p, &t->collect_md, &fwmark); - return ip_tunnel_newlink(dev, tb, &p, fwmark); + return ip_tunnel_newlink(params->link_net ? 
: dev_net(dev), dev, tb, &p, + fwmark); } static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 21ae7594a852..b81c8131e23f 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -563,7 +563,7 @@ static void reg_vif_setup(struct net_device *dev) dev->flags = IFF_NOARP; dev->netdev_ops = &reg_vif_netdev_ops; dev->needs_free_netdev = true; - dev->netns_local = true; + dev->netns_immutable = true; } static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 625adbc42037..9082ca17e845 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -71,6 +71,11 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct net_device *oif; const struct net_device *found; + if (nft_fib_can_skip(pkt)) { + nft_fib_store_result(dest, priv, nft_in(pkt)); + return; + } + /* * Do not set flowi4_oif, it restricts results (for example, asking * for oif 3 will get RTN_UNICAST result even if the daddr exits @@ -85,12 +90,6 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, else oif = NULL; - if (nft_hook(pkt) == NF_INET_PRE_ROUTING && - nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { - nft_fib_store_result(dest, priv, nft_in(pkt)); - return; - } - iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph); if (!iph) { regs->verdict.code = NFT_BREAK; diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 09a3d73b45ba..467151517023 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -1272,10 +1272,8 @@ static int nh_check_attr_group(struct net *net, u16 nh_grp_type, struct netlink_ext_ack *extack) { unsigned int len = nla_len(tb[NHA_GROUP]); - u8 nh_family = AF_UNSPEC; struct nexthop_grp *nhg; unsigned int i, j; - u8 nhg_fdb = 0; if (!len || len & (sizeof(struct nexthop_grp) - 1)) { NL_SET_ERR_MSG(extack, @@ -1307,10 +1305,41 @@ static int nh_check_attr_group(struct net *net, } } - if (tb[NHA_FDB]) - nhg_fdb = 1; nhg = nla_data(tb[NHA_GROUP]); - for (i = 0; i < len; ++i) { + for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) { + if (!tb[i]) + continue; + switch (i) { + case NHA_HW_STATS_ENABLE: + case NHA_FDB: + continue; + case NHA_RES_GROUP: + if (nh_grp_type == NEXTHOP_GRP_TYPE_RES) + continue; + break; + } + NL_SET_ERR_MSG(extack, + "No other attributes can be set in nexthop groups"); + return -EINVAL; + } + + return 0; +} + +static int nh_check_attr_group_rtnl(struct net *net, struct nlattr *tb[], + struct netlink_ext_ack *extack) +{ + u8 nh_family = AF_UNSPEC; + struct nexthop_grp *nhg; + unsigned int len; + unsigned int i; + u8 nhg_fdb; + + len = nla_len(tb[NHA_GROUP]) / sizeof(*nhg); + nhg = nla_data(tb[NHA_GROUP]); + nhg_fdb = !!tb[NHA_FDB]; + + for (i = 0; i < len; i++) { struct nexthop *nh; bool is_fdb_nh; @@ -1330,22 +1359,6 @@ static int nh_check_attr_group(struct net *net, return -EINVAL; } } - for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) { - if (!tb[i]) - continue; - switch (i) { - case NHA_HW_STATS_ENABLE: - case NHA_FDB: - continue; - case NHA_RES_GROUP: - if (nh_grp_type == NEXTHOP_GRP_TYPE_RES) - continue; - break; - } - NL_SET_ERR_MSG(extack, - "No other attributes can be set in nexthop groups"); - return -EINVAL; - } return 0; } @@ -2679,9 +2692,6 @@ static struct nexthop *nexthop_create_group(struct net *net, int err; int i; - if (WARN_ON(!num_nh)) - return ERR_PTR(-EINVAL); - nh = nexthop_alloc(); if (!nh) return ERR_PTR(-ENOMEM);
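The nh_check_attr_group() hunk above keeps the NHA_GROUP length check as `len & (sizeof(struct nexthop_grp) - 1)`, which only behaves like a modulo because the uapi struct nexthop_grp has a power-of-two size (8 bytes). A tiny standalone illustration of that idiom, using a hypothetical 8-byte stand-in rather than the real <linux/nexthop.h> definition:

#include <assert.h>
#include <stddef.h>

/* Stand-in for the uapi struct nexthop_grp: 8 bytes, a power of two,
 * which is what makes the mask trick equivalent to len % size. */
struct grp_entry {
	unsigned int id;
	unsigned char weight;
	unsigned char pad[3];
};

static int group_len_ok(size_t len)
{
	/* Same shape as the kernel check: reject an empty payload and any
	 * payload that is not a whole number of entries. */
	return len && !(len & (sizeof(struct grp_entry) - 1));
}

int main(void)
{
	assert(group_len_ok(8));	/* one entry */
	assert(group_len_ok(24));	/* three entries */
	assert(!group_len_ok(0));	/* empty group rejected */
	assert(!group_len_ok(12));	/* truncated second entry rejected */
	return 0;
}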
@@ -2915,11 +2925,6 @@ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg, struct nexthop *nh; int err; - if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) { - NL_SET_ERR_MSG(extack, "Replace requires nexthop id"); - return ERR_PTR(-EINVAL); - } - if (!cfg->nh_id) { cfg->nh_id = nh_find_unused_id(net); if (!cfg->nh_id) { @@ -3016,19 +3021,13 @@ static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg, } static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nh_config *cfg, + struct nlmsghdr *nlh, struct nlattr **tb, + struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nhmsg *nhm = nlmsg_data(nlh); - struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)]; int err; - err = nlmsg_parse(nlh, sizeof(*nhm), tb, - ARRAY_SIZE(rtm_nh_policy_new) - 1, - rtm_nh_policy_new, extack); - if (err < 0) - return err; - err = -EINVAL; if (nhm->resvd || nhm->nh_scope) { NL_SET_ERR_MSG(extack, "Invalid values in ancillary header"); @@ -3093,7 +3092,8 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, NL_SET_ERR_MSG(extack, "Invalid group type"); goto out; } - err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb), + + err = nh_check_attr_group(net, tb, ARRAY_SIZE(rtm_nh_policy_new), cfg->nh_grp_type, extack); if (err) goto out; @@ -3126,25 +3126,6 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, goto out; } - if (!cfg->nh_fdb && tb[NHA_OIF]) { - cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); - if (cfg->nh_ifindex) - cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex); - - if (!cfg->dev) { - NL_SET_ERR_MSG(extack, "Invalid device index"); - goto out; - } else if (!(cfg->dev->flags & IFF_UP)) { - NL_SET_ERR_MSG(extack, "Nexthop device is not up"); - err = -ENETDOWN; - goto out; - } else if (!netif_carrier_ok(cfg->dev)) { - NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); - err = -ENETDOWN; - goto out; - } - } - err = -EINVAL; if (tb[NHA_GATEWAY]) { struct nlattr *gwa = tb[NHA_GATEWAY]; @@ -3187,7 +3168,8 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, } cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]); - err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack); + err = lwtunnel_valid_encap_type(cfg->nh_encap_type, + extack, false); if (err < 0) goto out; @@ -3206,22 +3188,76 @@ out: return err; } +static int rtm_to_nh_config_rtnl(struct net *net, struct nlattr **tb, + struct nh_config *cfg, + struct netlink_ext_ack *extack) +{ + if (tb[NHA_GROUP]) + return nh_check_attr_group_rtnl(net, tb, extack); + + if (tb[NHA_OIF]) { + cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); + if (cfg->nh_ifindex) + cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex); + + if (!cfg->dev) { + NL_SET_ERR_MSG(extack, "Invalid device index"); + return -EINVAL; + } + + if (!(cfg->dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Nexthop device is not up"); + return -ENETDOWN; + } + + if (!netif_carrier_ok(cfg->dev)) { + NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); + return -ENETDOWN; + } + } + + return 0; +} + /* rtnl */ static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)]; struct net *net = sock_net(skb->sk); struct nh_config cfg; struct nexthop *nh; int err; - err = rtm_to_nh_config(net, skb, nlh, &cfg, extack); - if (!err) { - nh = nexthop_add(net, &cfg, extack); - if (IS_ERR(nh)) - err = PTR_ERR(nh); + err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, + 
ARRAY_SIZE(rtm_nh_policy_new) - 1, + rtm_nh_policy_new, extack); + if (err < 0) + goto out; + + err = rtm_to_nh_config(net, skb, nlh, tb, &cfg, extack); + if (err) + goto out; + + if (cfg.nlflags & NLM_F_REPLACE && !cfg.nh_id) { + NL_SET_ERR_MSG(extack, "Replace requires nexthop id"); + err = -EINVAL; + goto out; } + rtnl_net_lock(net); + + err = rtm_to_nh_config_rtnl(net, tb, &cfg, extack); + if (err) + goto unlock; + + nh = nexthop_add(net, &cfg, extack); + if (IS_ERR(nh)) + err = PTR_ERR(nh); + +unlock: + rtnl_net_unlock(net); +out: return err; } @@ -3278,13 +3314,17 @@ static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, if (err) return err; + rtnl_net_lock(net); + nh = nexthop_find_by_id(net, id); - if (!nh) - return -ENOENT; + if (nh) + remove_nexthop(net, nh, &nlinfo); + else + err = -ENOENT; - remove_nexthop(net, nh, &nlinfo); + rtnl_net_unlock(net); - return 0; + return err; } /* rtnl */ @@ -4036,18 +4076,20 @@ static struct pernet_operations nexthop_net_ops = { }; static const struct rtnl_msg_handler nexthop_rtnl_msg_handlers[] __initconst = { - {.msgtype = RTM_NEWNEXTHOP, .doit = rtm_new_nexthop}, - {.msgtype = RTM_DELNEXTHOP, .doit = rtm_del_nexthop}, + {.msgtype = RTM_NEWNEXTHOP, .doit = rtm_new_nexthop, + .flags = RTNL_FLAG_DOIT_PERNET}, + {.msgtype = RTM_DELNEXTHOP, .doit = rtm_del_nexthop, + .flags = RTNL_FLAG_DOIT_PERNET}, {.msgtype = RTM_GETNEXTHOP, .doit = rtm_get_nexthop, .dumpit = rtm_dump_nexthop}, {.msgtype = RTM_GETNEXTHOPBUCKET, .doit = rtm_get_nexthop_bucket, .dumpit = rtm_dump_nexthop_bucket}, {.protocol = PF_INET, .msgtype = RTM_NEWNEXTHOP, - .doit = rtm_new_nexthop}, + .doit = rtm_new_nexthop, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_GETNEXTHOP, .dumpit = rtm_dump_nexthop}, {.protocol = PF_INET6, .msgtype = RTM_NEWNEXTHOP, - .doit = rtm_new_nexthop}, + .doit = rtm_new_nexthop, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET6, .msgtype = RTM_GETNEXTHOP, .dumpit = rtm_dump_nexthop}, }; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 619ddc087957..c14baa6589c7 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -705,7 +705,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct ip_options_data opt_copy; int free = 0; __be32 saddr, daddr, faddr; - u8 tos, scope; + u8 scope; int err; pr_debug("ping_v4_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); @@ -768,7 +768,6 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } faddr = ipc.opt->opt.faddr; } - tos = get_rttos(&ipc, inet); scope = ip_sendmsg_scope(inet, &ipc, msg); if (ipv4_is_multicast(daddr)) { @@ -779,7 +778,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } else if (!ipc.oif) ipc.oif = READ_ONCE(inet->uc_index); - flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, scope, + flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, + ipc.tos & INET_DSCP_MASK, scope, sk->sk_protocol, inet_sk_flowi_flags(sk), faddr, saddr, 0, 0, sk->sk_uid); @@ -966,10 +966,9 @@ EXPORT_SYMBOL_GPL(ping_queue_rcv_skb); enum skb_drop_reason ping_rcv(struct sk_buff *skb) { - enum skb_drop_reason reason = SKB_DROP_REASON_NO_SOCKET; - struct sock *sk; struct net *net = dev_net(skb->dev); struct icmphdr *icmph = icmp_hdr(skb); + struct sock *sk; /* We assume the packet has already been checked by icmp_rcv */ @@ -980,20 +979,11 @@ enum skb_drop_reason ping_rcv(struct sk_buff *skb) skb_push(skb, skb->data - (u8 *)icmph); sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); - if (sk) { - 
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); - - pr_debug("rcv on socket %p\n", sk); - if (skb2) - reason = __ping_queue_rcv_skb(sk, skb2); - else - reason = SKB_DROP_REASON_NOMEM; - } - - if (reason) - pr_debug("no socket, dropping\n"); + if (sk) + return __ping_queue_rcv_skb(sk, skb); - return reason; + kfree_skb_reason(skb, SKB_DROP_REASON_NO_SOCKET); + return SKB_DROP_REASON_NO_SOCKET; } EXPORT_SYMBOL_GPL(ping_rcv); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index affd21a0f572..10cbeb76c274 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -189,6 +189,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED), SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED), SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED), + SNMP_MIB_ITEM("TSEcrRejected", LINUX_MIB_TSECRREJECTED), SNMP_MIB_ITEM("PAWSOldAck", LINUX_MIB_PAWS_OLD_ACK), SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS), SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED), diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 4304a68d1db0..6aace4d55733 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -486,7 +486,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct ipcm_cookie ipc; struct rtable *rt = NULL; struct flowi4 fl4; - u8 tos, scope; + u8 scope; int free = 0; __be32 daddr; __be32 saddr; @@ -581,7 +581,6 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) daddr = ipc.opt->opt.faddr; } } - tos = get_rttos(&ipc, inet); scope = ip_sendmsg_scope(inet, &ipc, msg); uc_index = READ_ONCE(inet->uc_index); @@ -606,7 +605,8 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } } - flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, scope, + flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, + ipc.tos & INET_DSCP_MASK, scope, hdrincl ? ipc.protocol : sk->sk_protocol, inet_sk_flowi_flags(sk) | (hdrincl ? 
FLOWI_FLAG_KNOWN_NH : 0), diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 1948d15f1f28..5459a78b9809 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -222,7 +222,7 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, return NULL; } -EXPORT_SYMBOL(tcp_get_cookie_sock); +EXPORT_IPV6_MOD(tcp_get_cookie_sock); /* * when syncookies are in effect and tcp timestamps are enabled we stored @@ -259,7 +259,7 @@ bool cookie_timestamp_decode(const struct net *net, return READ_ONCE(net->ipv4.sysctl_tcp_window_scaling) != 0; } -EXPORT_SYMBOL(cookie_timestamp_decode); +EXPORT_IPV6_MOD(cookie_timestamp_decode); static int cookie_tcp_reqsk_init(struct sock *sk, struct sk_buff *skb, struct request_sock *req) @@ -279,6 +279,7 @@ static int cookie_tcp_reqsk_init(struct sock *sk, struct sk_buff *skb, ireq->smc_ok = 0; treq->snt_synack = 0; + treq->snt_tsval_first = 0; treq->tfo_listener = false; treq->txhash = net_tx_rndhash(); treq->rcv_isn = ntohl(th->seq) - 1; @@ -310,7 +311,7 @@ struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb) return req; } -EXPORT_SYMBOL_GPL(cookie_bpf_check); +EXPORT_IPV6_MOD_GPL(cookie_bpf_check); #endif struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, @@ -351,7 +352,7 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, return req; } -EXPORT_SYMBOL_GPL(cookie_tcp_reqsk_alloc); +EXPORT_IPV6_MOD_GPL(cookie_tcp_reqsk_alloc); static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 42cb5dc9cb24..3a43010d726f 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -28,6 +28,7 @@ static int tcp_adv_win_scale_max = 31; static int tcp_app_win_max = 31; static int tcp_min_snd_mss_min = TCP_MIN_SND_MSS; static int tcp_min_snd_mss_max = 65535; +static int tcp_rto_max_max = TCP_RTO_MAX_SEC * MSEC_PER_SEC; static int ip_privileged_port_min; static int ip_privileged_port_max = 65535; static int ip_ttl_min = 1; @@ -1583,6 +1584,15 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, + { + .procname = "tcp_rto_max_ms", + .data = &init_net.ipv4.sysctl_tcp_rto_max_ms, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE_THOUSAND, + .extra2 = &tcp_rto_max_max, + }, }; static __net_init int ipv4_sysctl_init_net(struct net *net) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 57df7c1d2faa..ea8de00f669d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -300,10 +300,10 @@ DEFINE_PER_CPU(u32, tcp_tw_isn); EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn); long sysctl_tcp_mem[3] __read_mostly; -EXPORT_SYMBOL(sysctl_tcp_mem); +EXPORT_IPV6_MOD(sysctl_tcp_mem); atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; /* Current allocated memory. */ -EXPORT_SYMBOL(tcp_memory_allocated); +EXPORT_IPV6_MOD(tcp_memory_allocated); DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc); @@ -316,7 +316,7 @@ EXPORT_SYMBOL(tcp_have_smc); * Current number of TCP sockets. 
*/ struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp; -EXPORT_SYMBOL(tcp_sockets_allocated); +EXPORT_IPV6_MOD(tcp_sockets_allocated); /* * TCP splice context @@ -349,7 +349,7 @@ void tcp_enter_memory_pressure(struct sock *sk) if (!cmpxchg(&tcp_memory_pressure, 0, val)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES); } -EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure); +EXPORT_IPV6_MOD_GPL(tcp_enter_memory_pressure); void tcp_leave_memory_pressure(struct sock *sk) { @@ -362,7 +362,7 @@ void tcp_leave_memory_pressure(struct sock *sk) NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO, jiffies_to_msecs(jiffies - val)); } -EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure); +EXPORT_IPV6_MOD_GPL(tcp_leave_memory_pressure); /* Convert seconds to retransmits based on initial and max timeout */ static u8 secs_to_retrans(int seconds, int timeout, int rto_max) @@ -423,7 +423,7 @@ void tcp_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - int rto_min_us; + int rto_min_us, rto_max_ms; tp->out_of_order_queue = RB_ROOT; sk->tcp_rtx_queue = RB_ROOT; @@ -432,6 +432,10 @@ void tcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; + + rto_max_ms = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_max_ms); + icsk->icsk_rto_max = msecs_to_jiffies(rto_max_ms); + rto_min_us = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_min_us); icsk->icsk_rto_min = usecs_to_jiffies(rto_min_us); icsk->icsk_delack_max = TCP_DELACK_MAX; @@ -475,7 +479,7 @@ void tcp_init_sock(struct sock *sk) sk_sockets_allocated_inc(sk); xa_init_flags(&sk->sk_user_frags, XA_FLAGS_ALLOC1); } -EXPORT_SYMBOL(tcp_init_sock); +EXPORT_IPV6_MOD(tcp_init_sock); static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc) { @@ -488,10 +492,14 @@ static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc) sock_tx_timestamp(sk, sockc, &shinfo->tx_flags); if (tsflags & SOF_TIMESTAMPING_TX_ACK) - tcb->txstamp_ack = 1; + tcb->txstamp_ack |= TSTAMP_ACK_SK; if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; } + + if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) && + SK_BPF_CB_FLAG_TEST(sk, SK_BPF_CB_TX_TIMESTAMPING) && skb) + bpf_skops_tx_timestamping(sk, skb, BPF_SOCK_OPS_TSTAMP_SENDMSG_CB); } static bool tcp_stream_is_readable(struct sock *sk, int target) @@ -660,7 +668,7 @@ int tcp_ioctl(struct sock *sk, int cmd, int *karg) *karg = answ; return 0; } -EXPORT_SYMBOL(tcp_ioctl); +EXPORT_IPV6_MOD(tcp_ioctl); void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) { @@ -876,7 +884,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, return ret; } -EXPORT_SYMBOL(tcp_splice_read); +EXPORT_IPV6_MOD(tcp_splice_read); struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, bool force_schedule) @@ -1123,7 +1131,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) /* 'common' sending to sendq */ } - sockcm_init(&sockc, sk); + sockc = (struct sockcm_cookie) { .tsflags = READ_ONCE(sk->sk_tsflags)}; if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) { @@ -1376,7 +1384,7 @@ void tcp_splice_eof(struct socket *sock) tcp_push(sk, 0, mss_now, tp->nonagle, size_goal); release_sock(sk); } -EXPORT_SYMBOL_GPL(tcp_splice_eof); +EXPORT_IPV6_MOD_GPL(tcp_splice_eof); /* * Handle reading urgent data. 
BSD has very simple semantics for @@ -1517,11 +1525,25 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) __tcp_cleanup_rbuf(sk, copied); } +/* private version of sock_rfree() avoiding one atomic_sub() */ +void tcp_sock_rfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + unsigned int len = skb->truesize; + + sock_owned_by_me(sk); + atomic_set(&sk->sk_rmem_alloc, + atomic_read(&sk->sk_rmem_alloc) - len); + + sk_forward_alloc_add(sk, len); + sk_mem_reclaim(sk); +} + static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); - if (likely(skb->destructor == sock_rfree)) { - sock_rfree(skb); + if (likely(skb->destructor == tcp_sock_rfree)) { + tcp_sock_rfree(skb); skb->destructor = NULL; skb->sk = NULL; return skb_attempt_defer_free(skb); @@ -1686,7 +1708,7 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) } return copied; } -EXPORT_SYMBOL(tcp_read_skb); +EXPORT_IPV6_MOD(tcp_read_skb); void tcp_read_done(struct sock *sk, size_t len) { @@ -1731,7 +1753,7 @@ int tcp_peek_len(struct socket *sock) { return tcp_inq(sock->sk); } -EXPORT_SYMBOL(tcp_peek_len); +EXPORT_IPV6_MOD(tcp_peek_len); /* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */ int tcp_set_rcvlowat(struct sock *sk, int val) @@ -1758,7 +1780,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val) } return 0; } -EXPORT_SYMBOL(tcp_set_rcvlowat); +EXPORT_IPV6_MOD(tcp_set_rcvlowat); void tcp_update_recv_tstamps(struct sk_buff *skb, struct scm_timestamping_internal *tss) @@ -1791,7 +1813,7 @@ int tcp_mmap(struct file *file, struct socket *sock, vma->vm_ops = &tcp_vm_ops; return 0; } -EXPORT_SYMBOL(tcp_mmap); +EXPORT_IPV6_MOD(tcp_mmap); static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb, u32 *offset_frag) @@ -2493,6 +2515,11 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, } niov = skb_frag_net_iov(frag); + if (!net_is_devmem_iov(niov)) { + err = -ENODEV; + goto out; + } + end = start + skb_frag_size(frag); copy = end - offset; @@ -2511,7 +2538,7 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, /* Will perform the exchange later */ dmabuf_cmsg.frag_token = tcp_xa_pool.tokens[tcp_xa_pool.idx]; - dmabuf_cmsg.dmabuf_id = net_iov_binding_id(niov); + dmabuf_cmsg.dmabuf_id = net_devmem_iov_binding_id(niov); offset += copy; remaining_len -= copy; @@ -2877,7 +2904,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, } return ret; } -EXPORT_SYMBOL(tcp_recvmsg); +EXPORT_IPV6_MOD(tcp_recvmsg); void tcp_set_state(struct sock *sk, int state) { @@ -3007,7 +3034,7 @@ void tcp_shutdown(struct sock *sk, int how) tcp_send_fin(sk); } } -EXPORT_SYMBOL(tcp_shutdown); +EXPORT_IPV6_MOD(tcp_shutdown); int tcp_orphan_count_sum(void) { @@ -3187,7 +3214,7 @@ adjudge_to_death: const int tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { - inet_csk_reset_keepalive_timer(sk, + tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); @@ -3339,8 +3366,8 @@ int tcp_disconnect(struct sock *sk, int flags) icsk->icsk_probes_out = 0; icsk->icsk_probes_tstamp = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; - icsk->icsk_rto_min = TCP_RTO_MIN; - icsk->icsk_delack_max = TCP_DELACK_MAX; + WRITE_ONCE(icsk->icsk_rto_min, TCP_RTO_MIN); + WRITE_ONCE(icsk->icsk_delack_max, TCP_DELACK_MAX); tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tcp_snd_cwnd_set(tp, TCP_INIT_CWND); tp->snd_cwnd_cnt = 0; @@ -3506,7 +3533,7 @@ static int tcp_repair_options_est(struct 
sock *sk, sockptr_t optbuf, } DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); -EXPORT_SYMBOL(tcp_tx_delay_enabled); +EXPORT_IPV6_MOD(tcp_tx_delay_enabled); static void tcp_enable_tx_delay(void) { @@ -3640,7 +3667,7 @@ int tcp_sock_set_keepidle_locked(struct sock *sk, int val) elapsed = tp->keepalive_time - elapsed; else elapsed = 0; - inet_csk_reset_keepalive_timer(sk, elapsed); + tcp_reset_keepalive_timer(sk, elapsed); } return 0; @@ -3680,32 +3707,32 @@ EXPORT_SYMBOL(tcp_sock_set_keepcnt); int tcp_set_window_clamp(struct sock *sk, int val) { + u32 old_window_clamp, new_window_clamp, new_rcv_ssthresh; struct tcp_sock *tp = tcp_sk(sk); if (!val) { if (sk->sk_state != TCP_CLOSE) return -EINVAL; WRITE_ONCE(tp->window_clamp, 0); - } else { - u32 new_rcv_ssthresh, old_window_clamp = tp->window_clamp; - u32 new_window_clamp = val < SOCK_MIN_RCVBUF / 2 ? - SOCK_MIN_RCVBUF / 2 : val; + return 0; + } - if (new_window_clamp == old_window_clamp) - return 0; + old_window_clamp = tp->window_clamp; + new_window_clamp = max_t(int, SOCK_MIN_RCVBUF / 2, val); - WRITE_ONCE(tp->window_clamp, new_window_clamp); - if (new_window_clamp < old_window_clamp) { - /* need to apply the reserved mem provisioning only - * when shrinking the window clamp - */ - __tcp_adjust_rcv_ssthresh(sk, tp->window_clamp); + if (new_window_clamp == old_window_clamp) + return 0; - } else { - new_rcv_ssthresh = min(tp->rcv_wnd, tp->window_clamp); - tp->rcv_ssthresh = max(new_rcv_ssthresh, - tp->rcv_ssthresh); - } + WRITE_ONCE(tp->window_clamp, new_window_clamp); + + /* Need to apply the reserved mem provisioning only + * when shrinking the window clamp. + */ + if (new_window_clamp < old_window_clamp) { + __tcp_adjust_rcv_ssthresh(sk, new_window_clamp); + } else { + new_rcv_ssthresh = min(tp->rcv_wnd, new_window_clamp); + tp->rcv_ssthresh = max(new_rcv_ssthresh, tp->rcv_ssthresh); } return 0; } @@ -3815,6 +3842,27 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ)); return 0; + case TCP_RTO_MAX_MS: + if (val < MSEC_PER_SEC || val > TCP_RTO_MAX_SEC * MSEC_PER_SEC) + return -EINVAL; + WRITE_ONCE(inet_csk(sk)->icsk_rto_max, msecs_to_jiffies(val)); + return 0; + case TCP_RTO_MIN_US: { + int rto_min = usecs_to_jiffies(val); + + if (rto_min > TCP_RTO_MIN || rto_min < TCP_TIMEOUT_MIN) + return -EINVAL; + WRITE_ONCE(inet_csk(sk)->icsk_rto_min, rto_min); + return 0; + } + case TCP_DELACK_MAX_US: { + int delack_max = usecs_to_jiffies(val); + + if (delack_max > TCP_DELACK_MAX || delack_max < TCP_TIMEOUT_MIN) + return -EINVAL; + WRITE_ONCE(inet_csk(sk)->icsk_delack_max, delack_max); + return 0; + } } sockopt_lock_sock(sk); @@ -4044,7 +4092,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, optval, optlen); return do_tcp_setsockopt(sk, level, optname, optval, optlen); } -EXPORT_SYMBOL(tcp_setsockopt); +EXPORT_IPV6_MOD(tcp_setsockopt); static void tcp_get_info_chrono_stats(const struct tcp_sock *tp, struct tcp_info *info) @@ -4120,7 +4168,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; } - if (tp->ecn_flags & TCP_ECN_OK) + if (tcp_ecn_mode_any(tp)) info->tcpi_options |= TCPI_OPT_ECN; if (tp->ecn_flags & TCP_ECN_SEEN) info->tcpi_options |= TCPI_OPT_ECN_SEEN; @@ -4651,6 +4699,15 @@ zerocopy_rcv_out: case TCP_IS_MPTCP: val = 0; break; + case TCP_RTO_MAX_MS: + val = jiffies_to_msecs(tcp_rto_max(sk)); + break; + case TCP_RTO_MIN_US: + val = 
jiffies_to_usecs(READ_ONCE(inet_csk(sk)->icsk_rto_min)); + break; + case TCP_DELACK_MAX_US: + val = jiffies_to_usecs(READ_ONCE(inet_csk(sk)->icsk_delack_max)); + break; default: return -ENOPROTOOPT; } @@ -4672,7 +4729,7 @@ bool tcp_bpf_bypass_getsockopt(int level, int optname) return false; } -EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt); +EXPORT_IPV6_MOD(tcp_bpf_bypass_getsockopt); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) @@ -4686,11 +4743,11 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, return do_tcp_getsockopt(sk, level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); } -EXPORT_SYMBOL(tcp_getsockopt); +EXPORT_IPV6_MOD(tcp_getsockopt); #ifdef CONFIG_TCP_MD5SIG int tcp_md5_sigpool_id = -1; -EXPORT_SYMBOL_GPL(tcp_md5_sigpool_id); +EXPORT_IPV6_MOD_GPL(tcp_md5_sigpool_id); int tcp_md5_alloc_sigpool(void) { @@ -4736,7 +4793,7 @@ int tcp_md5_hash_key(struct tcp_sigpool *hp, */ return data_race(crypto_ahash_update(hp->req)); } -EXPORT_SYMBOL(tcp_md5_hash_key); +EXPORT_IPV6_MOD(tcp_md5_hash_key); /* Called with rcu_read_lock() */ static enum skb_drop_reason @@ -4856,7 +4913,7 @@ tcp_inbound_hash(struct sock *sk, const struct request_sock *req, return tcp_inbound_md5_hash(sk, skb, saddr, daddr, family, l3index, md5_location); } -EXPORT_SYMBOL_GPL(tcp_inbound_hash); +EXPORT_IPV6_MOD_GPL(tcp_inbound_hash); void tcp_done(struct sock *sk) { @@ -5005,7 +5062,12 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh); +#if IS_ENABLED(CONFIG_TLS_DEVICE) + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tcp_clean_acked); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 77); +#else CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 69); +#endif /* TX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out); diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 8a45a4aea933..03abe0848420 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -90,7 +90,7 @@ __bpf_kfunc static void dctcp_init(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - if ((tp->ecn_flags & TCP_ECN_OK) || + if (tcp_ecn_mode_any(tp) || (sk->sk_state == TCP_LISTEN || sk->sk_state == TCP_CLOSE)) { struct dctcp *ca = inet_csk_ca(sk); diff --git a/net/ipv4/tcp_dctcp.h b/net/ipv4/tcp_dctcp.h index d69a77cbd0c7..4b0259111d81 100644 --- a/net/ipv4/tcp_dctcp.h +++ b/net/ipv4/tcp_dctcp.h @@ -28,7 +28,7 @@ static inline void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt, */ if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) { dctcp_ece_ack_cwr(sk, *ce_state); - __tcp_send_ack(sk, *prior_rcv_nxt); + __tcp_send_ack(sk, *prior_rcv_nxt, 0); } inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; } diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index f428ecf9120f..45e174b8cd22 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -83,7 +83,7 @@ static int tcp_diag_put_md5sig(struct sk_buff *skb, #endif static int tcp_diag_put_ulp(struct sk_buff *skb, struct sock *sk, - const struct tcp_ulp_ops *ulp_ops) + const struct tcp_ulp_ops *ulp_ops, bool net_admin) { struct nlattr *nest; int err; @@ -97,7 +97,7 @@ static int tcp_diag_put_ulp(struct sk_buff *skb, struct sock *sk, goto 
nla_failure; if (ulp_ops->get_info) - err = ulp_ops->get_info(sk, skb); + err = ulp_ops->get_info(sk, skb, net_admin); if (err) goto nla_failure; @@ -113,6 +113,7 @@ static int tcp_diag_get_aux(struct sock *sk, bool net_admin, struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcp_ulp_ops *ulp_ops; int err = 0; #ifdef CONFIG_TCP_MD5SIG @@ -129,15 +130,13 @@ static int tcp_diag_get_aux(struct sock *sk, bool net_admin, } #endif - if (net_admin) { - const struct tcp_ulp_ops *ulp_ops; - - ulp_ops = icsk->icsk_ulp_ops; - if (ulp_ops) - err = tcp_diag_put_ulp(skb, sk, ulp_ops); - if (err) + ulp_ops = icsk->icsk_ulp_ops; + if (ulp_ops) { + err = tcp_diag_put_ulp(skb, sk, ulp_ops, net_admin); + if (err < 0) return err; } + return 0; } @@ -164,7 +163,7 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) } #endif - if (net_admin && sk_fullsock(sk)) { + if (sk_fullsock(sk)) { const struct tcp_ulp_ops *ulp_ops; ulp_ops = icsk->icsk_ulp_ops; @@ -172,7 +171,7 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) size += nla_total_size(0) + nla_total_size(TCP_ULP_NAME_MAX); if (ulp_ops->get_info_size) - size += ulp_ops->get_info_size(sk); + size += ulp_ops->get_info_size(sk, net_admin); } } return size; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 32b28fc21b63..ca40665145c6 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -189,7 +189,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) tcp_segs_in(tp, skb); __skb_pull(skb, tcp_hdrlen(skb)); sk_forced_mem_schedule(sk, skb->truesize); - skb_set_owner_r(skb, sk); + tcp_skb_set_owner_r(skb, sk); TCP_SKB_CB(skb)->seq++; TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN; @@ -274,8 +274,8 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, * because it's been added to the accept queue directly. 
*/ req->timeout = tcp_timeout_init(child); - inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, - req->timeout, TCP_RTO_MAX); + tcp_reset_xmit_timer(child, ICSK_TIME_RETRANS, + req->timeout, false); refcount_set(&req->rsk_refcnt, 2); @@ -468,7 +468,7 @@ bool tcp_fastopen_defer_connect(struct sock *sk, int *err) } return false; } -EXPORT_SYMBOL(tcp_fastopen_defer_connect); +EXPORT_IPV6_MOD(tcp_fastopen_defer_connect); /* * The following code block is to deal with middle box issues with TFO: diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0cbf81bf3d45..e1f952fbac48 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -102,6 +102,7 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE; #define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ #define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */ #define FLAG_DSACK_TLP 0x20000 /* DSACK for tail loss probe */ +#define FLAG_TS_PROGRESS 0x40000 /* Positive timestamp delta */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -118,18 +119,18 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE; #if IS_ENABLED(CONFIG_TLS_DEVICE) static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ); -void clean_acked_data_enable(struct inet_connection_sock *icsk, +void clean_acked_data_enable(struct tcp_sock *tp, void (*cad)(struct sock *sk, u32 ack_seq)) { - icsk->icsk_clean_acked = cad; + tp->tcp_clean_acked = cad; static_branch_deferred_inc(&clean_acked_data_enabled); } EXPORT_SYMBOL_GPL(clean_acked_data_enable); -void clean_acked_data_disable(struct inet_connection_sock *icsk) +void clean_acked_data_disable(struct tcp_sock *tp) { static_branch_slow_dec_deferred(&clean_acked_data_enabled); - icsk->icsk_clean_acked = NULL; + tp->tcp_clean_acked = NULL; } EXPORT_SYMBOL_GPL(clean_acked_data_disable); @@ -169,6 +170,7 @@ static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB; sock_ops.is_fullsock = 1; + sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); @@ -185,6 +187,7 @@ static void bpf_skops_established(struct sock *sk, int bpf_op, memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = bpf_op; sock_ops.is_fullsock = 1; + sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */ if (skb) @@ -331,15 +334,14 @@ static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) static bool tcp_in_quickack_mode(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); - const struct dst_entry *dst = __sk_dst_get(sk); - return (dst && dst_metric(dst, RTAX_QUICKACK)) || + return icsk->icsk_ack.dst_quick_ack || (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk)); } static void tcp_ecn_queue_cwr(struct tcp_sock *tp) { - if (tp->ecn_flags & TCP_ECN_OK) + if (tcp_ecn_mode_rfc3168(tp)) tp->ecn_flags |= TCP_ECN_QUEUE_CWR; } @@ -362,10 +364,13 @@ static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; } -static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) +static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + if (tcp_ecn_disabled(tp)) + return; + switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) { case INET_ECN_NOT_ECT: /* Funny 
extension: if ECT is not set on a segment, @@ -394,31 +399,39 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) } } -static void tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) -{ - if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK) - __tcp_ecn_check_ce(sk, skb); -} - static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) { - if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) - tp->ecn_flags &= ~TCP_ECN_OK; + if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr)) + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); } static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) { - if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) - tp->ecn_flags &= ~TCP_ECN_OK; + if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr)) + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); } static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) { - if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) + if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp)) return true; return false; } +static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count) +{ + tp->delivered_ce += ecn_count; +} + +/* Updates the delivered and delivered_ce counts */ +static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, + bool ece_ack) +{ + tp->delivered += delivered; + if (ece_ack) + tcp_count_delivered_ce(tp, delivered); +} + /* Buffer size and advertised window tuning. * * 1. Tuning sk->sk_sndbuf, when connection enters established state. @@ -636,7 +649,7 @@ void tcp_initialize_rcv_mss(struct sock *sk) inet_csk(sk)->icsk_ack.rcv_mss = hint; } -EXPORT_SYMBOL(tcp_initialize_rcv_mss); +EXPORT_IPV6_MOD(tcp_initialize_rcv_mss); /* Receiver "autotuning" code. * @@ -857,7 +870,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) icsk->icsk_ack.lrcvtime = now; tcp_save_lrcv_flowlabel(sk, skb); - tcp_ecn_check_ce(sk, skb); + tcp_data_ecn_check(sk, skb); if (skb->len >= 128) tcp_grow_window(sk, skb, true); @@ -1154,15 +1167,6 @@ void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) } } -/* Updates the delivered and delivered_ce counts */ -static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, - bool ece_ack) -{ - tp->delivered += delivered; - if (ece_ack) - tp->delivered_ce += delivered; -} - /* This procedure tags the retransmission queue when SACKs arrive. * * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). 
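The tcp_input.c hunks above, like the tcp_get_info() change earlier in this diff, replace direct `tp->ecn_flags & TCP_ECN_OK` tests with the new tcp_ecn_mode_*() helpers; what userspace observes through TCP_INFO is meant to be unchanged. A minimal check of that observable state, assuming only today's stock uapi header (<linux/tcp.h> provides struct tcp_info, TCP_INFO and the TCPI_OPT_* bits) and a socket that is already connected:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>		/* struct tcp_info, TCP_INFO, TCPI_OPT_* */

/* Report whether ECN was negotiated on a connected TCP socket and whether
 * a CE mark has been seen, as filled in by tcp_get_info(). */
static void report_ecn(int fd)
{
	struct tcp_info info;
	socklen_t len = sizeof(info);

	memset(&info, 0, sizeof(info));
	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) < 0) {
		perror("getsockopt(TCP_INFO)");
		return;
	}
	printf("ECN negotiated: %s, CE seen: %s\n",
	       (info.tcpi_options & TCPI_OPT_ECN) ? "yes" : "no",
	       (info.tcpi_options & TCPI_OPT_ECN_SEEN) ? "yes" : "no");
}

Call report_ecn() on any established TCP socket; with this series applied, TCPI_OPT_ECN is set whenever tcp_ecn_mode_any(tp) holds, exactly as the tcp_get_info() hunk above shows.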
@@ -2258,8 +2262,7 @@ static bool tcp_check_sack_reneging(struct sock *sk, int *ack_flag) unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4), msecs_to_jiffies(10)); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - delay, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, false); *ack_flag &= ~FLAG_SET_XMIT_TIMER; return true; } @@ -2716,6 +2719,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd)) return; + trace_tcp_cwnd_reduction_tp(sk, newly_acked_sacked, newly_lost, flag); + tp->prr_delivered += newly_acked_sacked; if (delta < 0) { u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + @@ -2898,7 +2903,7 @@ void tcp_simple_retransmit(struct sock *sk) */ tcp_non_congestion_loss_retransmit(sk); } -EXPORT_SYMBOL(tcp_simple_retransmit); +EXPORT_IPV6_MOD(tcp_simple_retransmit); void tcp_enter_recovery(struct sock *sk, bool ece_ack) { @@ -3288,8 +3293,7 @@ void tcp_rearm_rto(struct sock *sk) */ rto = usecs_to_jiffies(max_t(int, delta_us, 1)); } - tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, - TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, true); } } @@ -3566,10 +3570,10 @@ static void tcp_ack_probe(struct sock *sk) * This function is not for random using! */ } else { - unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); + unsigned long when = tcp_probe0_when(sk, tcp_rto_max(sk)); when = tcp_clamp_probe0_to_user_timeout(sk, when); - tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, true); } } @@ -3814,8 +3818,16 @@ static void tcp_store_ts_recent(struct tcp_sock *tp) tp->rx_opt.ts_recent_stamp = ktime_get_seconds(); } -static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) +static int __tcp_replace_ts_recent(struct tcp_sock *tp, s32 tstamp_delta) +{ + tcp_store_ts_recent(tp); + return tstamp_delta > 0 ? FLAG_TS_PROGRESS : 0; +} + +static int tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) { + s32 delta; + if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) { /* PAWS bug workaround wrt. ACK frames, the PAWS discard * extra check below makes sure this can only happen @@ -3824,9 +3836,13 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) * Not only, also it occurs for expired timestamps. */ - if (tcp_paws_check(&tp->rx_opt, 0)) - tcp_store_ts_recent(tp); + if (tcp_paws_check(&tp->rx_opt, 0)) { + delta = tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent; + return __tcp_replace_ts_recent(tp, delta); + } } + + return 0; } /* This routine deals with acks during a TLP episode and ends an episode by @@ -3862,12 +3878,23 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) } } -static inline void tcp_in_ack_event(struct sock *sk, u32 flags) +static void tcp_in_ack_event(struct sock *sk, int flag) { const struct inet_connection_sock *icsk = inet_csk(sk); - if (icsk->icsk_ca_ops->in_ack_event) - icsk->icsk_ca_ops->in_ack_event(sk, flags); + if (icsk->icsk_ca_ops->in_ack_event) { + u32 ack_ev_flags = 0; + + if (flag & FLAG_WIN_UPDATE) + ack_ev_flags |= CA_ACK_WIN_UPDATE; + if (flag & FLAG_SLOWPATH) { + ack_ev_flags |= CA_ACK_SLOWPATH; + if (flag & FLAG_ECE) + ack_ev_flags |= CA_ACK_ECE; + } + + icsk->icsk_ca_ops->in_ack_event(sk, ack_ev_flags); + } } /* Congestion control has updated the cwnd already. 
So if we're in @@ -3960,8 +3987,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) #if IS_ENABLED(CONFIG_TLS_DEVICE) if (static_branch_unlikely(&clean_acked_data_enabled.key)) - if (icsk->icsk_clean_acked) - icsk->icsk_clean_acked(sk, ack); + if (tp->tcp_clean_acked) + tp->tcp_clean_acked(sk, ack); #endif } @@ -3972,7 +3999,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) * is in window. */ if (flag & FLAG_UPDATE_TS_RECENT) - tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); + flag |= tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) == FLAG_SND_UNA_ADVANCED) { @@ -3984,12 +4011,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_snd_una_update(tp, ack); flag |= FLAG_WIN_UPDATE; - tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS); } else { - u32 ack_ev_flags = CA_ACK_SLOWPATH; - if (ack_seq != TCP_SKB_CB(skb)->end_seq) flag |= FLAG_DATA; else @@ -4001,19 +4024,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { + if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) flag |= FLAG_ECE; - ack_ev_flags |= CA_ACK_ECE; - } if (sack_state.sack_delivered) tcp_count_delivered(tp, sack_state.sack_delivered, flag & FLAG_ECE); - - if (flag & FLAG_WIN_UPDATE) - ack_ev_flags |= CA_ACK_WIN_UPDATE; - - tcp_in_ack_event(sk, ack_ev_flags); } /* This is a deviation from RFC3168 since it states that: @@ -4040,6 +4056,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_rack_update_reo_wnd(sk, &rs); + tcp_in_ack_event(sk, flag); + if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); @@ -4071,6 +4089,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) return 1; no_queue: + tcp_in_ack_event(sk, flag); /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) { tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, @@ -4180,7 +4199,6 @@ u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) } return mss; } -EXPORT_SYMBOL_GPL(tcp_parse_mss_option); /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when @@ -4530,7 +4548,7 @@ void tcp_done_with_error(struct sock *sk, int err) if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } -EXPORT_SYMBOL(tcp_done_with_error); +EXPORT_IPV6_MOD(tcp_done_with_error); /* When we get a reset we do this. 
*/ void tcp_reset(struct sock *sk, struct sk_buff *skb) @@ -5019,7 +5037,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) bool fragstolen; tcp_save_lrcv_flowlabel(sk, skb); - tcp_ecn_check_ce(sk, skb); + tcp_data_ecn_check(sk, skb); if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP); @@ -5153,7 +5171,7 @@ end: if (tcp_is_sack(tp)) tcp_grow_window(sk, skb, false); skb_condense(skb); - skb_set_owner_r(skb, sk); + tcp_skb_set_owner_r(skb, sk); } } @@ -5169,7 +5187,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { tcp_add_receive_queue(sk, skb); - skb_set_owner_r(skb, sk); + tcp_skb_set_owner_r(skb, sk); } return eaten; } @@ -5486,7 +5504,7 @@ skip_this: __skb_queue_before(list, skb, nskb); else __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */ - skb_set_owner_r(nskb, sk); + tcp_skb_set_owner_r(nskb, sk); mptcp_skb_ext_move(nskb, skb); /* Copy data, releasing collapsed skbs. */ @@ -6156,6 +6174,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->seq == tp->rcv_nxt && !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { int tcp_header_len = tp->tcp_header_len; + s32 delta = 0; + int flag = 0; /* Timestamp header prediction: tcp_header_len * is automatically equal to th->doff*4 due to pred_flags @@ -6168,8 +6188,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) if (!tcp_parse_aligned_timestamp(tp, th)) goto slow_path; + delta = tp->rx_opt.rcv_tsval - + tp->rx_opt.ts_recent; /* If PAWS failed, check it more carefully in slow path */ - if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0) + if (delta < 0) goto slow_path; /* DO NOT update ts_recent here, if checksum fails @@ -6189,12 +6211,13 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) if (tcp_header_len == (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && tp->rcv_nxt == tp->rcv_wup) - tcp_store_ts_recent(tp); + flag |= __tcp_replace_ts_recent(tp, + delta); /* We know that such packets are checksummed * on entry. */ - tcp_ack(sk, skb, 0); + tcp_ack(sk, skb, flag); __kfree_skb(skb); tcp_data_snd_check(sk); /* When receiving pure ack in fast path, update @@ -6225,7 +6248,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) if (tcp_header_len == (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && tp->rcv_nxt == tp->rcv_wup) - tcp_store_ts_recent(tp); + flag |= __tcp_replace_ts_recent(tp, + delta); tcp_rcv_rtt_measure_ts(sk, skb); @@ -6240,7 +6264,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { /* Well, only one small jumplet in fast path... 
*/ - tcp_ack(sk, skb, FLAG_DATA); + tcp_ack(sk, skb, flag | FLAG_DATA); tcp_data_snd_check(sk); if (!inet_csk_ack_scheduled(sk)) goto no_ack; @@ -6300,7 +6324,7 @@ csum_error: discard: tcp_drop_reason(sk, skb, reason); } -EXPORT_SYMBOL(tcp_rcv_established); +EXPORT_IPV6_MOD(tcp_rcv_established); void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb) { @@ -6353,7 +6377,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) tp->lsndtime = tcp_jiffies32; if (sock_flag(sk, SOCK_KEEPOPEN)) - inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); + tcp_reset_keepalive_timer(sk, keepalive_time_when(tp)); if (!tp->rx_opt.snd_wscale) __tcp_fast_path_on(tp, tp->snd_wnd); @@ -6476,9 +6500,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { /* Previous FIN/ACK or RST/ACK might be ignored. */ if (icsk->icsk_retransmits == 0) - inet_csk_reset_xmit_timer(sk, - ICSK_TIME_RETRANS, - TCP_TIMEOUT_MIN, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + TCP_TIMEOUT_MIN, false); SKB_DR_SET(reason, TCP_INVALID_ACK_SEQUENCE); goto reset_and_undo; } @@ -6593,8 +6616,8 @@ consume: */ inet_csk_schedule_ack(sk); tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - TCP_DELACK_MAX, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MAX, false); goto consume; } tcp_send_ack(sk); @@ -6812,10 +6835,9 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && sk->sk_state != TCP_FIN_WAIT1); - if (!tcp_check_req(sk, skb, req, true, &req_stolen)) { - SKB_DR_SET(reason, TCP_FASTOPEN); + SKB_DR_SET(reason, TCP_FASTOPEN); + if (!tcp_check_req(sk, skb, req, true, &req_stolen, &reason)) goto discard; - } } if (!th->ack && !th->rst && !th->syn) { @@ -6928,7 +6950,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { - inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); } else if (th->fin || sock_owned_by_user(sk)) { /* Bad case. We could lose such FIN otherwise. * It is not a big problem, but it looks confusing @@ -6936,7 +6958,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) * if it spins in bh_lock_sock(), but it is really * marginal case. */ - inet_csk_reset_keepalive_timer(sk, tmo); + tcp_reset_keepalive_timer(sk, tmo); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto consume; @@ -7014,7 +7036,7 @@ consume: __kfree_skb(skb); return 0; } -EXPORT_SYMBOL(tcp_rcv_state_process); +EXPORT_IPV6_MOD(tcp_rcv_state_process); static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) { @@ -7081,6 +7103,7 @@ static void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tcp_rsk(req)->snt_synack = 0; + tcp_rsk(req)->snt_tsval_first = 0; tcp_rsk(req)->last_oow_ack_time = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? 
rx_opt->rcv_tsval : 0; @@ -7196,7 +7219,7 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, return mss; } -EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss); +EXPORT_IPV6_MOD_GPL(tcp_get_syncookie_mss); int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, @@ -7377,4 +7400,4 @@ drop: tcp_listendrop(sk); return 0; } -EXPORT_SYMBOL(tcp_conn_request); +EXPORT_IPV6_MOD(tcp_conn_request); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2632844d2c35..8cce0d5489da 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -66,6 +66,7 @@ #include <net/transp_v6.h> #include <net/ipv6.h> #include <net/inet_common.h> +#include <net/inet_ecn.h> #include <net/timewait_sock.h> #include <net/xfrm.h> #include <net/secure_seq.h> @@ -92,7 +93,6 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, #endif struct inet_hashinfo tcp_hashinfo; -EXPORT_SYMBOL(tcp_hashinfo); static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), @@ -199,7 +199,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) return 0; } -EXPORT_SYMBOL_GPL(tcp_twsk_unique); +EXPORT_IPV6_MOD_GPL(tcp_twsk_unique); static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) @@ -359,7 +359,7 @@ failure: inet->inet_dport = 0; return err; } -EXPORT_SYMBOL(tcp_v4_connect); +EXPORT_IPV6_MOD(tcp_v4_connect); /* * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. @@ -400,7 +400,7 @@ void tcp_v4_mtu_reduced(struct sock *sk) tcp_simple_retransmit(sk); } /* else let the usual retransmit timer handle it */ } -EXPORT_SYMBOL(tcp_v4_mtu_reduced); +EXPORT_IPV6_MOD(tcp_v4_mtu_reduced); static void do_redirect(struct sk_buff *skb, struct sock *sk) { @@ -434,7 +434,7 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort) } reqsk_put(req); } -EXPORT_SYMBOL(tcp_req_err); +EXPORT_IPV6_MOD(tcp_req_err); /* TCP-LD (RFC 6069) logic */ void tcp_ld_RTO_revert(struct sock *sk, u32 seq) @@ -458,15 +458,14 @@ void tcp_ld_RTO_revert(struct sock *sk, u32 seq) icsk->icsk_backoff--; icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; - icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); + icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk)); tcp_mstamp_refresh(tp); delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); if (remaining > 0) { - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - remaining, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false); } else { /* RTO revert clocked out retransmission. * Will retransmit now. 
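tcp_ld_RTO_revert() above now clamps the backed-off RTO with tcp_rto_max(sk) instead of the global TCP_RTO_MAX constant, tying into the per-socket TCP_RTO_MAX_MS option and the net.ipv4.tcp_rto_max_ms sysctl added earlier in this diff. A small userspace sketch of the per-socket knob; it assumes uapi headers new enough to define TCP_RTO_MAX_MS (the option name comes from this series), the 5000 ms figure is arbitrary, and per the do_tcp_setsockopt() hunk above the kernel rejects values below 1000 ms or above TCP_RTO_MAX_SEC seconds:

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>	/* TCP_RTO_MAX_MS: only in uapi headers carrying this series */

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int max_ms = 5000;	/* cap retransmission backoff at 5 s for this socket */
	socklen_t len = sizeof(max_ms);

	if (fd < 0)
		return 1;
	if (setsockopt(fd, IPPROTO_TCP, TCP_RTO_MAX_MS, &max_ms, sizeof(max_ms)) < 0) {
		perror("setsockopt(TCP_RTO_MAX_MS)");
		return 1;
	}
	max_ms = 0;
	if (getsockopt(fd, IPPROTO_TCP, TCP_RTO_MAX_MS, &max_ms, &len) == 0)
		printf("rto_max is now %d ms\n", max_ms);	/* kernel reports jiffies_to_msecs(tcp_rto_max(sk)) */
	return 0;
}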
@@ -474,7 +473,7 @@ void tcp_ld_RTO_revert(struct sock *sk, u32 seq) tcp_retransmit_timer(sk); } } -EXPORT_SYMBOL(tcp_ld_RTO_revert); +EXPORT_IPV6_MOD(tcp_ld_RTO_revert); /* * This routine is called by the ICMP module when it gets some @@ -496,14 +495,14 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); - struct tcp_sock *tp; + struct net *net = dev_net_rcu(skb->dev); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; - struct sock *sk; struct request_sock *fastopen; + struct tcp_sock *tp; u32 seq, snd_una; + struct sock *sk; int err; - struct net *net = dev_net(skb->dev); sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, iph->daddr, th->dest, iph->saddr, @@ -676,7 +675,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); } -EXPORT_SYMBOL(tcp_v4_send_check); +EXPORT_IPV6_MOD(tcp_v4_send_check); #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32)) @@ -788,7 +787,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, arg.iov[0].iov_base = (unsigned char *)&rep; arg.iov[0].iov_len = sizeof(rep.th); - net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); + net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev); /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh)) @@ -889,7 +888,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != offsetof(struct inet_timewait_sock, tw_bound_dev_if)); - arg.tos = ip_hdr(skb)->tos; + /* ECN bits of TW reset are cleared */ + arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK; arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); local_bh_disable(); local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); @@ -1035,11 +1035,21 @@ static void tcp_v4_send_ack(const struct sock *sk, local_bh_enable(); } -static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) +static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb, + enum tcp_tw_status tw_status) { struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); struct tcp_key key = {}; + u8 tos = tw->tw_tos; + + /* Cleaning only ECN bits of TW ACKs of oow data or is paws_reject, + * while not cleaning ECN bits of other TW ACKs to avoid these ACKs + * being placed in a different service queues (Classic rather than L4S) + */ + if (tw_status == TCP_TW_ACK_OOW) + tos &= ~INET_ECN_MASK; + #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao_info; @@ -1083,7 +1093,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) READ_ONCE(tcptw->tw_ts_recent), tw->tw_bound_dev_if, &key, tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, - tw->tw_tos, + tos, tw->tw_txhash); inet_twsk_put(tw); @@ -1153,14 +1163,15 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, key.type = TCP_KEY_MD5; } + /* Cleaning ECN bits of TW ACKs of oow data or is paws_reject */ tcp_v4_send_ack(sk, skb, seq, tcp_rsk(req)->rcv_nxt, tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), - READ_ONCE(req->ts_recent), + req->ts_recent, 0, &key, inet_rsk(req)->no_srccheck ? 
IP_REPLY_ARG_NOSRCCHECK : 0, - ip_hdr(skb)->tos, + ip_hdr(skb)->tos & ~INET_ECN_MASK, READ_ONCE(tcp_rsk(req)->txhash)); if (tcp_key_is_ao(&key)) kfree(key.traffic_key); @@ -1231,7 +1242,7 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) */ DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ); -EXPORT_SYMBOL(tcp_md5_needed); +EXPORT_IPV6_MOD(tcp_md5_needed); static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) { @@ -1290,7 +1301,7 @@ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, } return best_match; } -EXPORT_SYMBOL(__tcp_md5_do_lookup); +EXPORT_IPV6_MOD(__tcp_md5_do_lookup); static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, const union tcp_md5_addr *addr, @@ -1337,7 +1348,7 @@ struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); } -EXPORT_SYMBOL(tcp_v4_md5_lookup); +EXPORT_IPV6_MOD(tcp_v4_md5_lookup); static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) { @@ -1433,7 +1444,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags, newkey, newkeylen, GFP_KERNEL); } -EXPORT_SYMBOL(tcp_md5_do_add); +EXPORT_IPV6_MOD(tcp_md5_do_add); int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, @@ -1465,7 +1476,7 @@ int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, key->flags, key->key, key->keylen, sk_gfp_mask(sk, GFP_ATOMIC)); } -EXPORT_SYMBOL(tcp_md5_key_copy); +EXPORT_IPV6_MOD(tcp_md5_key_copy); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags) @@ -1480,7 +1491,7 @@ int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, kfree_rcu(key, rcu); return 0; } -EXPORT_SYMBOL(tcp_md5_do_del); +EXPORT_IPV6_MOD(tcp_md5_do_del); void tcp_clear_md5_list(struct sock *sk) { @@ -1659,7 +1670,7 @@ clear_hash_nostart: memset(md5_hash, 0, 16); return 1; } -EXPORT_SYMBOL(tcp_v4_md5_hash_skb); +EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb); #endif @@ -1732,7 +1743,7 @@ drop: tcp_listendrop(sk); return 0; } -EXPORT_SYMBOL(tcp_v4_conn_request); +EXPORT_IPV6_MOD(tcp_v4_conn_request); /* @@ -1770,10 +1781,6 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, newtp = tcp_sk(newsk); newinet = inet_sk(newsk); ireq = inet_rsk(req); - sk_daddr_set(newsk, ireq->ir_rmt_addr); - sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); - newsk->sk_bound_dev_if = ireq->ir_iif; - newinet->inet_saddr = ireq->ir_loc_addr; inet_opt = rcu_dereference(ireq->ireq_opt); RCU_INIT_POINTER(newinet->inet_opt, inet_opt); newinet->mc_index = inet_iif(skb); @@ -1856,7 +1863,7 @@ put_and_exit: tcp_done(newsk); goto exit; } -EXPORT_SYMBOL(tcp_v4_syn_recv_sock); +EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock); static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) { @@ -1967,7 +1974,7 @@ EXPORT_SYMBOL(tcp_v4_do_rcv); int tcp_v4_early_demux(struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); const struct iphdr *iph; const struct tcphdr *th; struct sock *sk; @@ -2057,7 +2064,8 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, !((TCP_SKB_CB(tail)->tcp_flags & TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || ((TCP_SKB_CB(tail)->tcp_flags ^ - TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 
+ TCP_SKB_CB(skb)->tcp_flags) & + (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) || !tcp_skb_can_collapse_rx(tail, skb) || thtail->doff != th->doff || memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) @@ -2135,7 +2143,7 @@ no_coalesce: } return false; } -EXPORT_SYMBOL(tcp_add_backlog); +EXPORT_IPV6_MOD(tcp_add_backlog); int tcp_filter(struct sock *sk, struct sk_buff *skb) { @@ -2143,7 +2151,7 @@ int tcp_filter(struct sock *sk, struct sk_buff *skb) return sk_filter_trim_cap(sk, skb, th->doff * 4); } -EXPORT_SYMBOL(tcp_filter); +EXPORT_IPV6_MOD(tcp_filter); static void tcp_v4_restore_cb(struct sk_buff *skb) { @@ -2165,7 +2173,7 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); - TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); + TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th); TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->has_rxtstamp = @@ -2178,8 +2186,9 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, int tcp_v4_rcv(struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); enum skb_drop_reason drop_reason; + enum tcp_tw_status tw_status; int sdif = inet_sdif(skb); int dif = inet_iif(skb); const struct iphdr *iph; @@ -2271,7 +2280,8 @@ lookup: th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); tcp_v4_fill_cb(skb, iph, th); - nsk = tcp_check_req(sk, skb, req, false, &req_stolen); + nsk = tcp_check_req(sk, skb, req, false, &req_stolen, + &drop_reason); } else { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; } @@ -2406,7 +2416,9 @@ do_time_wait: inet_twsk_put(inet_twsk(sk)); goto csum_error; } - switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) { + + tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn); + switch (tw_status) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, @@ -2427,7 +2439,8 @@ do_time_wait: /* to ACK */ fallthrough; case TCP_TW_ACK: - tcp_v4_timewait_ack(sk, skb); + case TCP_TW_ACK_OOW: + tcp_v4_timewait_ack(sk, skb, tw_status); break; case TCP_TW_RST: tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); @@ -2452,7 +2465,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) sk->sk_rx_dst_ifindex = skb->skb_iif; } } -EXPORT_SYMBOL(inet_sk_rx_dst_set); +EXPORT_IPV6_MOD(inet_sk_rx_dst_set); const struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, @@ -2464,11 +2477,9 @@ const struct inet_connection_sock_af_ops ipv4_specific = { .net_header_len = sizeof(struct iphdr), .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, - .addr2sockaddr = inet_csk_addr2sockaddr, - .sockaddr_len = sizeof(struct sockaddr_in), .mtu_reduced = tcp_v4_mtu_reduced, }; -EXPORT_SYMBOL(ipv4_specific); +EXPORT_IPV6_MOD(ipv4_specific); #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { @@ -2578,7 +2589,7 @@ void tcp_v4_destroy_sock(struct sock *sk) sk_sockets_allocated_dec(sk); } -EXPORT_SYMBOL(tcp_v4_destroy_sock); +EXPORT_IPV6_MOD(tcp_v4_destroy_sock); #ifdef CONFIG_PROC_FS /* Proc filesystem TCP sock list dumping. 
*/ @@ -2814,7 +2825,7 @@ out: st->last_pos = *pos; return rc; } -EXPORT_SYMBOL(tcp_seq_start); +EXPORT_IPV6_MOD(tcp_seq_start); void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { @@ -2845,7 +2856,7 @@ out: st->last_pos = *pos; return rc; } -EXPORT_SYMBOL(tcp_seq_next); +EXPORT_IPV6_MOD(tcp_seq_next); void tcp_seq_stop(struct seq_file *seq, void *v) { @@ -2863,7 +2874,7 @@ void tcp_seq_stop(struct seq_file *seq, void *v) break; } } -EXPORT_SYMBOL(tcp_seq_stop); +EXPORT_IPV6_MOD(tcp_seq_stop); static void get_openreq4(const struct request_sock *req, struct seq_file *f, int i) @@ -2912,10 +2923,10 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; - timer_expires = icsk->icsk_timeout; + timer_expires = icsk_timeout(icsk); } else if (icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; - timer_expires = icsk->icsk_timeout; + timer_expires = icsk_timeout(icsk); } else if (timer_pending(&sk->sk_timer)) { timer_active = 2; timer_expires = sk->sk_timer.expires; @@ -3533,6 +3544,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_pingpong_thresh = 1; net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); + net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC; return 0; } diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 95669935494e..4251670e328c 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -170,7 +170,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, bool reclaim = false; spin_lock_bh(&tcp_metrics_lock); - net = dev_net(dst->dev); + net = dev_net_rcu(dst->dev); /* While waiting for the spin-lock the cache might have been populated * with this entry and so we have to check again. @@ -273,7 +273,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, return NULL; } - net = dev_net(dst->dev); + net = dev_net_rcu(dst->dev); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); @@ -318,7 +318,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, else return NULL; - net = dev_net(dst->dev); + net = dev_net_rcu(dst->dev); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index dfdb7a4608a8..fb9349be36b8 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -44,7 +44,7 @@ tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw, /* Send ACK. Note, we do not put the bucket, * it will be released by caller. */ - return TCP_TW_ACK; + return TCP_TW_ACK_OOW; } /* We are rate-limiting, so just release the tw sock and drop skb. */ @@ -264,7 +264,7 @@ kill: inet_twsk_put(tw); return TCP_TW_SUCCESS; } -EXPORT_SYMBOL(tcp_timewait_state_process); +EXPORT_IPV6_MOD(tcp_timewait_state_process); static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw) { @@ -398,7 +398,7 @@ void tcp_twsk_destructor(struct sock *sk) #endif tcp_ao_destroy_sock(sk, true); } -EXPORT_SYMBOL_GPL(tcp_twsk_destructor); +EXPORT_IPV6_MOD_GPL(tcp_twsk_destructor); void tcp_twsk_purge(struct list_head *net_exit_list) { @@ -457,12 +457,13 @@ void tcp_openreq_init_rwin(struct request_sock *req, rcv_wnd); ireq->rcv_wscale = rcv_wscale; } -EXPORT_SYMBOL(tcp_openreq_init_rwin); static void tcp_ecn_openreq_child(struct tcp_sock *tp, const struct request_sock *req) { - tp->ecn_flags = inet_rsk(req)->ecn_ok ? 
TCP_ECN_OK : 0; + tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? + TCP_ECN_MODE_RFC3168 : + TCP_ECN_DISABLED); } void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) @@ -492,7 +493,7 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) tcp_set_ca_state(sk, TCP_CA_Open); } -EXPORT_SYMBOL_GPL(tcp_ca_openreq_child); +EXPORT_IPV6_MOD_GPL(tcp_ca_openreq_child); static void smc_check_reset_syn_req(const struct tcp_sock *oldtp, struct request_sock *req, @@ -566,8 +567,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1); if (sock_flag(newsk, SOCK_KEEPOPEN)) - inet_csk_reset_keepalive_timer(newsk, - keepalive_time_when(newtp)); + tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp)); newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; newtp->rx_opt.sack_ok = ireq->sack_ok; @@ -587,7 +587,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, if (newtp->rx_opt.tstamp_ok) { newtp->tcp_usec_ts = treq->req_usec_ts; - newtp->rx_opt.ts_recent = READ_ONCE(req->ts_recent); + newtp->rx_opt.ts_recent = req->ts_recent; newtp->rx_opt.ts_recent_stamp = ktime_get_seconds(); newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; } else { @@ -659,12 +659,14 @@ EXPORT_SYMBOL(tcp_create_openreq_child); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - bool fastopen, bool *req_stolen) + bool fastopen, bool *req_stolen, + enum skb_drop_reason *drop_reason) { struct tcp_options_received tmp_opt; struct sock *child; const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); + bool tsecr_reject = false; bool paws_reject = false; bool own_req; @@ -673,9 +675,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { - tmp_opt.ts_recent = READ_ONCE(req->ts_recent); - if (tmp_opt.rcv_tsecr) + tmp_opt.ts_recent = req->ts_recent; + if (tmp_opt.rcv_tsecr) { + if (inet_rsk(req)->tstamp_ok && !fastopen) + tsecr_reject = !between(tmp_opt.rcv_tsecr, + tcp_rsk(req)->snt_tsval_first, + READ_ONCE(tcp_rsk(req)->snt_tsval_last)); tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off; + } /* We do not store true stamp, but it is not required, * it can be estimated (approximately) * from another data. @@ -790,26 +797,29 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, tcp_rsk(req)->snt_isn + 1)) return sk; - /* Also, it would be not so bad idea to check rcv_tsecr, which - * is essentially ACK extension and too early or too late values - * should cause reset in unsynchronized states. - */ - /* RFC793: "first check sequence number". */ - if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->end_seq, - tcp_rsk(req)->rcv_nxt, - tcp_rsk(req)->rcv_nxt + - tcp_synack_window(req))) { + if (paws_reject || tsecr_reject || + !tcp_in_window(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, + tcp_rsk(req)->rcv_nxt, + tcp_rsk(req)->rcv_nxt + + tcp_synack_window(req))) { /* Out of window: send ACK and drop. 
*/ if (!(flg & TCP_FLAG_RST) && !tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, &tcp_rsk(req)->last_oow_ack_time)) req->rsk_ops->send_ack(sk, skb, req); - if (paws_reject) + if (paws_reject) { + SKB_DR_SET(*drop_reason, TCP_RFC7323_PAWS); NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); + } else if (tsecr_reject) { + SKB_DR_SET(*drop_reason, TCP_RFC7323_TSECR); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TSECRREJECTED); + } else { + SKB_DR_SET(*drop_reason, TCP_OVERWINDOW); + } return NULL; } @@ -879,6 +889,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: + SKB_DR_SET(*drop_reason, TCP_LISTEN_OVERFLOW); if (sk != req->rsk_listener) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); @@ -908,7 +919,7 @@ embryonic_reset: } return NULL; } -EXPORT_SYMBOL(tcp_check_req); +EXPORT_IPV6_MOD(tcp_check_req); /* * Queue segment on the new socket if the new socket is active, @@ -950,4 +961,4 @@ enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, sock_put(child); return reason; } -EXPORT_SYMBOL(tcp_child_process); +EXPORT_IPV6_MOD(tcp_child_process); diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 2dfac79dc78b..934f777f29d3 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -142,6 +142,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, struct sk_buff *gso_skb = skb; __sum16 newcheck; bool ooo_okay, copy_destructor; + bool ecn_cwr_mask; __wsum delta; th = tcp_hdr(skb); @@ -201,6 +202,8 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, newcheck = ~csum_fold(csum_add(csum_unfold(th->check), delta)); + ecn_cwr_mask = !!(skb_shinfo(gso_skb)->gso_type & SKB_GSO_TCP_ACCECN); + while (skb->next) { th->fin = th->psh = 0; th->check = newcheck; @@ -220,7 +223,8 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, th = tcp_hdr(skb); th->seq = htonl(seq); - th->cwr = 0; + + th->cwr &= ecn_cwr_mask; } /* Following permits TCP Small Queues to work well with GSO : @@ -328,7 +332,7 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, th2 = tcp_hdr(p); flush = (__force int)(flags & TCP_FLAG_CWR); flush |= (__force int)((flags ^ tcp_flag_word(th2)) & - ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); + ~(TCP_FLAG_FIN | TCP_FLAG_PSH)); flush |= (__force int)(th->ack_seq ^ th2->ack_seq); for (i = sizeof(*th); i < thlen; i += 4) flush |= *(u32 *)((u8 *)th + i) ^ @@ -404,7 +408,7 @@ void tcp_gro_complete(struct sk_buff *skb) shinfo->gso_segs = NAPI_GRO_CB(skb)->count; if (th->cwr) - shinfo->gso_type |= SKB_GSO_TCP_ECN; + shinfo->gso_type |= SKB_GSO_TCP_ACCECN; } EXPORT_SYMBOL(tcp_gro_complete); @@ -428,7 +432,7 @@ static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, inet_get_iif_sdif(skb, &iif, &sdif); iph = skb_gro_network_header(skb); - net = dev_net(skb->dev); + net = dev_net_rcu(skb->dev); sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, iph->saddr, th->source, iph->daddr, ntohs(th->dest), diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bc95d2a5924f..13295a59d22e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -250,7 +250,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, WRITE_ONCE(*__window_clamp, min_t(__u32, U16_MAX << (*rcv_wscale), window_clamp)); } -EXPORT_SYMBOL(tcp_select_initial_window); +EXPORT_IPV6_MOD(tcp_select_initial_window); /* Chose a new window to 
advertise, update state in tcp_sock for the * socket, and return result with RFC1323 scaling applied. The return @@ -325,7 +325,7 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) const struct tcp_sock *tp = tcp_sk(sk); TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; - if (!(tp->ecn_flags & TCP_ECN_OK)) + if (tcp_ecn_disabled(tp)) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; else if (tcp_ca_needs_ecn(sk) || tcp_bpf_ca_needs_ecn(sk)) @@ -351,7 +351,7 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) if (use_ecn) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; - tp->ecn_flags = TCP_ECN_OK; + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) INET_ECN_xmit(sk); } @@ -381,7 +381,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); - if (tp->ecn_flags & TCP_ECN_OK) { + if (tcp_ecn_mode_rfc3168(tp)) { /* Not-retransmitted data segment: set ECT and inject CWR. */ if (skb->len != tcp_header_len && !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { @@ -403,7 +403,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, /* Constructs common control bits of non-data skb. If SYN/FIN is present, * auto increment end seqno. */ -static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) +static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u16 flags) { skb->ip_summed = CHECKSUM_PARTIAL; @@ -525,6 +525,7 @@ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, sock_owned_by_me(sk); sock_ops.is_fullsock = 1; + sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; } @@ -570,6 +571,7 @@ static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, sock_owned_by_me(sk); sock_ops.is_fullsock = 1; + sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; } @@ -941,7 +943,13 @@ static unsigned int tcp_synack_options(const struct sock *sk, opts->options |= OPTION_TS; opts->tsval = tcp_skb_timestamp_ts(tcp_rsk(req)->req_usec_ts, skb) + tcp_rsk(req)->ts_off; - opts->tsecr = READ_ONCE(req->ts_recent); + if (!tcp_rsk(req)->snt_tsval_first) { + if (!opts->tsval) + opts->tsval = ~0U; + tcp_rsk(req)->snt_tsval_first = opts->tsval; + } + WRITE_ONCE(tcp_rsk(req)->snt_tsval_last, opts->tsval); + opts->tsecr = req->ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } if (likely(ireq->sack_ok)) { @@ -1171,7 +1179,7 @@ void tcp_release_cb(struct sock *sk) if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk)) tcp_send_ack(sk); } -EXPORT_SYMBOL(tcp_release_cb); +EXPORT_IPV6_MOD(tcp_release_cb); void __init tcp_tasklet_init(void) { @@ -1387,7 +1395,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, th->seq = htonl(tcb->seq); th->ack_seq = htonl(rcv_nxt); *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | - tcb->tcp_flags); + (tcb->tcp_flags & TCPHDR_FLAGS_MASK)); th->check = 0; th->urg_ptr = 0; @@ -1608,8 +1616,8 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, struct sk_buff *buff; int old_factor; long limit; + u16 flags; int nlen; - u8 flags; if (WARN_ON(len > skb->len)) return -EINVAL; @@ -1783,7 +1791,7 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu) return __tcp_mtu_to_mss(sk, pmtu) - (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr)); } -EXPORT_SYMBOL(tcp_mtu_to_mss); +EXPORT_IPV6_MOD(tcp_mtu_to_mss); /* Inverse of above */ int tcp_mss_to_mtu(struct sock *sk, int mss) @@ -1813,7 +1821,6 @@ void tcp_mtup_init(struct sock *sk) if (icsk->icsk_mtup.enabled) icsk->icsk_mtup.probe_timestamp = 
tcp_jiffies32; } -EXPORT_SYMBOL(tcp_mtup_init); /* This function synchronize snd mss to current pmtu/exthdr set. @@ -1857,7 +1864,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) return mss_now; } -EXPORT_SYMBOL(tcp_sync_mss); +EXPORT_IPV6_MOD(tcp_sync_mss); /* Compute the current effective MSS, taking SACKs and IP options, * and even PMTU discovery events into account. @@ -2164,7 +2171,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, { int nlen = skb->len - len; struct sk_buff *buff; - u8 flags; + u16 flags; /* All of a TSO frame must be composed of paged data. */ DEBUG_NET_WARN_ON_ONCE(skb->len != skb->data_len); @@ -2911,7 +2918,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) if (rto_delta_us > 0) timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); - tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, true); return true; } @@ -3545,8 +3552,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) } if (rearm_timer) tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, - TCP_RTO_MAX); + inet_csk(sk)->icsk_rto, true); } /* We allow to exceed memory limits for FIN packets to expedite @@ -3853,7 +3859,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, return skb; } -EXPORT_SYMBOL(tcp_make_synack); +EXPORT_IPV6_MOD(tcp_make_synack); static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst) { @@ -4163,8 +4169,8 @@ int tcp_connect(struct sock *sk) TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, false); return 0; } EXPORT_SYMBOL(tcp_connect); @@ -4173,7 +4179,7 @@ u32 tcp_delack_max(const struct sock *sk) { u32 delack_from_rto_min = max(tcp_rto_min(sk), 2) - 1; - return min(inet_csk(sk)->icsk_delack_max, delack_from_rto_min); + return min(READ_ONCE(inet_csk(sk)->icsk_delack_max), delack_from_rto_min); } /* Send out a delayed ack, the caller does the policy checking @@ -4219,22 +4225,21 @@ void tcp_send_delayed_ack(struct sock *sk) /* Use new timeout only if there wasn't a older one earlier. */ if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { /* If delack timer is about to expire, send ACK now. */ - if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) { + if (time_before_eq(icsk_delack_timeout(icsk), jiffies + (ato >> 2))) { tcp_send_ack(sk); return; } - if (!time_before(timeout, icsk->icsk_ack.timeout)) - timeout = icsk->icsk_ack.timeout; + if (!time_before(timeout, icsk_delack_timeout(icsk))) + timeout = icsk_delack_timeout(icsk); } smp_store_release(&icsk->icsk_ack.pending, icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER); - icsk->icsk_ack.timeout = timeout; sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); } /* This routine sends an ack and also updates the window. 
*/ -void __tcp_send_ack(struct sock *sk, u32 rcv_nxt) +void __tcp_send_ack(struct sock *sk, u32 rcv_nxt, u16 flags) { struct sk_buff *buff; @@ -4253,17 +4258,17 @@ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt) unsigned long delay; delay = TCP_DELACK_MAX << icsk->icsk_ack.retry; - if (delay < TCP_RTO_MAX) + if (delay < tcp_rto_max(sk)) icsk->icsk_ack.retry++; inet_csk_schedule_ack(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, false); return; } /* Reserve space for headers and prepare control bits. */ skb_reserve(buff, MAX_TCP_HEADER); - tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK); + tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK | flags); /* We do not want pure acks influencing TCP Small Queues or fq/pacing * too much. @@ -4278,7 +4283,7 @@ EXPORT_SYMBOL_GPL(__tcp_send_ack); void tcp_send_ack(struct sock *sk) { - __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt); + __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt, 0); } /* This routine sends a packet with an out of date sequence @@ -4393,7 +4398,7 @@ void tcp_send_probe0(struct sock *sk) if (err <= 0) { if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2)) icsk->icsk_backoff++; - timeout = tcp_probe0_when(sk, TCP_RTO_MAX); + timeout = tcp_probe0_when(sk, tcp_rto_max(sk)); } else { /* If packet was not sent due to local congestion, * Let senders fight for local resources conservatively. @@ -4402,7 +4407,7 @@ void tcp_send_probe0(struct sock *sk) } timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout); - tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, true); } int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) @@ -4430,4 +4435,4 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) } return res; } -EXPORT_SYMBOL(tcp_rtx_synack); +EXPORT_IPV6_MOD(tcp_rtx_synack); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b412ed88ccd9..e4c616bbd727 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -109,7 +109,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) /* If peer does not open window for long time, or did not transmit * anything for long time, penalize it. */ - if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) + if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*tcp_rto_max(sk) || !do_reset) shift++; /* If some dubious ICMP arrived, penalize even more. 
*/ @@ -189,12 +189,12 @@ static unsigned int tcp_model_timeout(struct sock *sk, { unsigned int linear_backoff_thresh, timeout; - linear_backoff_thresh = ilog2(TCP_RTO_MAX / rto_base); + linear_backoff_thresh = ilog2(tcp_rto_max(sk) / rto_base); if (boundary <= linear_backoff_thresh) timeout = ((2 << boundary) - 1) * rto_base; else timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + - (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + (boundary - linear_backoff_thresh) * tcp_rto_max(sk); return jiffies_to_msecs(timeout); } /** @@ -268,7 +268,7 @@ static int tcp_write_timeout(struct sock *sk) retry_until = READ_ONCE(net->ipv4.sysctl_tcp_retries2); if (sock_flag(sk, SOCK_DEAD)) { - const bool alive = icsk->icsk_rto < TCP_RTO_MAX; + const bool alive = icsk->icsk_rto < tcp_rto_max(sk); retry_until = tcp_orphan_retries(sk, alive); do_reset = alive || @@ -322,8 +322,9 @@ void tcp_delack_timer_handler(struct sock *sk) if (!(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) return; - if (time_after(icsk->icsk_ack.timeout, jiffies)) { - sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); + if (time_after(icsk_delack_timeout(icsk), jiffies)) { + sk_reset_timer(sk, &icsk->icsk_delack_timer, + icsk_delack_timeout(icsk)); return; } icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; @@ -416,7 +417,8 @@ static void tcp_probe_timer(struct sock *sk) } max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2); if (sock_flag(sk, SOCK_DEAD)) { - const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; + unsigned int rto_max = tcp_rto_max(sk); + const bool alive = inet_csk_rto_backoff(icsk, rto_max) < rto_max; max_probes = tcp_orphan_retries(sk, alive); if (!alive && icsk->icsk_backoff >= max_probes) @@ -481,8 +483,8 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) tcp_update_rto_stats(sk); if (!tp->retrans_stamp) tp->retrans_stamp = tcp_time_stamp_ts(tp); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - req->timeout << req->num_timeout, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + req->timeout << req->num_timeout, false); } static bool tcp_rtx_probe0_timed_out(const struct sock *sk, @@ -492,7 +494,7 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk, const struct inet_connection_sock *icsk = inet_csk(sk); u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout); const struct tcp_sock *tp = tcp_sk(sk); - int timeout = TCP_RTO_MAX * 2; + int timeout = tcp_rto_max(sk) * 2; s32 rcv_delta; if (user_timeout) { @@ -508,7 +510,7 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk, * and tp->rcv_tstamp might very well have been written recently. * rcv_delta can thus be negative. */ - rcv_delta = icsk->icsk_timeout - tp->rcv_tstamp; + rcv_delta = icsk_timeout(icsk) - tp->rcv_tstamp; if (rcv_delta <= timeout) return false; @@ -626,9 +628,9 @@ void tcp_retransmit_timer(struct sock *sk) /* Retransmission failed because of local congestion, * Let senders fight for local resources conservatively. */ - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - TCP_RESOURCE_PROBE_INTERVAL, - TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + TCP_RESOURCE_PROBE_INTERVAL, + false); goto out; } @@ -665,7 +667,7 @@ out_reset_timer: icsk->icsk_backoff = 0; icsk->icsk_rto = clamp(__tcp_set_rto(tp), tcp_rto_min(sk), - TCP_RTO_MAX); + tcp_rto_max(sk)); } else if (sk->sk_state != TCP_SYN_SENT || tp->total_rto > READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts)) { @@ -673,10 +675,10 @@ out_reset_timer: * activated. 
*/ icsk->icsk_backoff++; - icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); + icsk->icsk_rto = min(icsk->icsk_rto << 1, tcp_rto_max(sk)); } - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - tcp_clamp_rto_to_user_timeout(sk), TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + tcp_clamp_rto_to_user_timeout(sk), false); if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1) + 1, 0)) __sk_dst_reset(sk); @@ -684,7 +686,8 @@ out:; } /* Called with bottom-half processing disabled. - Called by tcp_write_timer() */ + * Called by tcp_write_timer() and tcp_release_cb(). + */ void tcp_write_timer_handler(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -694,11 +697,11 @@ void tcp_write_timer_handler(struct sock *sk) !icsk->icsk_pending) return; - if (time_after(icsk->icsk_timeout, jiffies)) { - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); + if (time_after(icsk_timeout(icsk), jiffies)) { + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, + icsk_timeout(icsk)); return; } - tcp_mstamp_refresh(tcp_sk(sk)); event = icsk->icsk_pending; @@ -749,7 +752,17 @@ void tcp_syn_ack_timeout(const struct request_sock *req) __NET_INC_STATS(net, LINUX_MIB_TCPTIMEOUTS); } -EXPORT_SYMBOL(tcp_syn_ack_timeout); +EXPORT_IPV6_MOD(tcp_syn_ack_timeout); + +void tcp_reset_keepalive_timer(struct sock *sk, unsigned long len) +{ + sk_reset_timer(sk, &sk->sk_timer, jiffies + len); +} + +static void tcp_delete_keepalive_timer(struct sock *sk) +{ + sk_stop_timer(sk, &sk->sk_timer); +} void tcp_set_keepalive(struct sock *sk, int val) { @@ -757,14 +770,13 @@ void tcp_set_keepalive(struct sock *sk, int val) return; if (val && !sock_flag(sk, SOCK_KEEPOPEN)) - inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); + tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); else if (!val) - inet_csk_delete_keepalive_timer(sk); + tcp_delete_keepalive_timer(sk); } -EXPORT_SYMBOL_GPL(tcp_set_keepalive); - +EXPORT_IPV6_MOD_GPL(tcp_set_keepalive); -static void tcp_keepalive_timer (struct timer_list *t) +static void tcp_keepalive_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); struct inet_connection_sock *icsk = inet_csk(sk); @@ -775,7 +787,7 @@ static void tcp_keepalive_timer (struct timer_list *t) bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. 
*/ - inet_csk_reset_keepalive_timer (sk, HZ/20); + tcp_reset_keepalive_timer(sk, HZ/20); goto out; } @@ -841,7 +853,7 @@ static void tcp_keepalive_timer (struct timer_list *t) } resched: - inet_csk_reset_keepalive_timer (sk, elapsed); + tcp_reset_keepalive_timer(sk, elapsed); goto out; death: @@ -884,11 +896,9 @@ void tcp_init_xmit_timers(struct sock *sk) { inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, &tcp_keepalive_timer); - hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_PINNED_SOFT); - tcp_sk(sk)->pacing_timer.function = tcp_pace_kick; + hrtimer_setup(&tcp_sk(sk)->pacing_timer, tcp_pace_kick, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED_SOFT); - hrtimer_init(&tcp_sk(sk)->compressed_ack_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_PINNED_SOFT); - tcp_sk(sk)->compressed_ack_timer.function = tcp_compressed_ack_kick; + hrtimer_setup(&tcp_sk(sk)->compressed_ack_timer, tcp_compressed_ack_kick, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED_SOFT); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index a9bb9ce5438e..d0bffcfa56d8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -121,13 +121,12 @@ #endif struct udp_table udp_table __read_mostly; -EXPORT_SYMBOL(udp_table); long sysctl_udp_mem[3] __read_mostly; -EXPORT_SYMBOL(sysctl_udp_mem); +EXPORT_IPV6_MOD(sysctl_udp_mem); atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp; -EXPORT_SYMBOL(udp_memory_allocated); +EXPORT_IPV6_MOD(udp_memory_allocated); DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc); EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc); @@ -352,7 +351,7 @@ fail_unlock: fail: return error; } -EXPORT_SYMBOL(udp_lib_get_port); +EXPORT_IPV6_MOD(udp_lib_get_port); int udp_v4_get_port(struct sock *sk, unsigned short snum) { @@ -418,7 +417,7 @@ u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, return __inet_ehashfn(laddr, lport, faddr, fport, udp_ehash_secret + net_hash_mix(net)); } -EXPORT_SYMBOL(udp_ehashfn); +EXPORT_IPV6_MOD(udp_ehashfn); /** * udp4_lib_lookup1() - Simplified lookup using primary hash (destination port) @@ -653,7 +652,7 @@ void udp_lib_hash4(struct sock *sk, u16 hash) spin_unlock_bh(&hslot->lock); } -EXPORT_SYMBOL(udp_lib_hash4); +EXPORT_IPV6_MOD(udp_lib_hash4); /* call with sock lock */ void udp4_hash4(struct sock *sk) @@ -669,7 +668,7 @@ void udp4_hash4(struct sock *sk) udp_lib_hash4(sk, hash); } -EXPORT_SYMBOL(udp4_hash4); +EXPORT_IPV6_MOD(udp4_hash4); #endif /* CONFIG_BASE_SMALL */ /* UDP is nearly always wildcards out the wazoo, it makes no sense to try @@ -809,11 +808,11 @@ static inline bool __udp_is_mcast_sock(struct net *net, const struct sock *sk, } DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); -EXPORT_SYMBOL(udp_encap_needed_key); +EXPORT_IPV6_MOD(udp_encap_needed_key); #if IS_ENABLED(CONFIG_IPV6) DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); -EXPORT_SYMBOL(udpv6_encap_needed_key); +EXPORT_IPV6_MOD(udpv6_encap_needed_key); #endif void udp_encap_enable(void) @@ -1041,7 +1040,7 @@ void udp_flush_pending_frames(struct sock *sk) ip_flush_pending_frames(sk); } } -EXPORT_SYMBOL(udp_flush_pending_frames); +EXPORT_IPV6_MOD(udp_flush_pending_frames); /** * udp4_hwcsum - handle outgoing HW checksumming @@ -1229,7 +1228,7 @@ out: WRITE_ONCE(up->pending, 0); return err; } -EXPORT_SYMBOL(udp_push_pending_frames); +EXPORT_IPV6_MOD(udp_push_pending_frames); static int __udp_cmsg_send(struct cmsghdr *cmsg, u16 *gso_size) { @@ -1266,7 +1265,7 @@ int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size) return need_ip; } 
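The tcp_timer.c hunks above replace the compile-time TCP_RTO_MAX constant with tcp_rto_max(sk), presumably backed by the new per-netns field sysctl_tcp_rto_max_ms seeded in the tcp_sk_init() hunk, so the retransmit backoff now saturates at a configurable cap instead of a hard 120 s. Below is a minimal userspace sketch of the backoff arithmetic from the tcp_model_timeout() hunk; it works in milliseconds rather than jiffies, and the helper names plus the 200 ms base and 120 s cap are illustrative assumptions — only the formula is taken from the diff.

#include <stdio.h>

/* Sketch only: mirrors the arithmetic in the tcp_model_timeout() hunk above,
 * with the per-socket rto_max passed in explicitly.  Units are milliseconds
 * rather than jiffies; all names and sample values here are made up.
 */
static unsigned int ilog2_u32(unsigned int v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

static unsigned int model_timeout_ms(unsigned int boundary,
				     unsigned int rto_base_ms,
				     unsigned int rto_max_ms)
{
	unsigned int thresh = ilog2_u32(rto_max_ms / rto_base_ms);

	/* Doubling phase until the cap is reached, then linear in the cap,
	 * exactly as in the kernel formula. */
	if (boundary <= thresh)
		return ((2 << boundary) - 1) * rto_base_ms;
	return ((2 << thresh) - 1) * rto_base_ms +
	       (boundary - thresh) * rto_max_ms;
}

int main(void)
{
	unsigned int boundary;

	/* 200 ms base and 120000 ms cap are example values only; the cap is
	 * what lowering the new rto_max sysctl would shrink. */
	for (boundary = 1; boundary <= 12; boundary++)
		printf("boundary=%2u total=%u ms\n",
		       boundary, model_timeout_ms(boundary, 200, 120000));
	return 0;
}

Lowering the cap shortens the tail of this series, which is the practical effect the new knob has on the tcp_write_timeout()/tcp_orphan_retries() decisions shown in the same file.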
-EXPORT_SYMBOL_GPL(udp_cmsg_send); +EXPORT_IPV6_MOD_GPL(udp_cmsg_send); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { @@ -1281,7 +1280,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int free = 0; int connected = 0; __be32 daddr, faddr, saddr; - u8 tos, scope; + u8 scope; __be16 dport; int err, is_udplite = IS_UDPLITE(sk); int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE; @@ -1405,7 +1404,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) faddr = ipc.opt->opt.faddr; connected = 0; } - tos = get_rttos(&ipc, inet); scope = ip_sendmsg_scope(inet, &ipc, msg); if (scope == RT_SCOPE_LINK) connected = 0; @@ -1442,7 +1440,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl4 = &fl4_stack; - flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos, scope, + flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, + ipc.tos & INET_DSCP_MASK, scope, sk->sk_protocol, flow_flags, faddr, saddr, dport, inet->inet_sport, sk->sk_uid); @@ -1561,7 +1560,7 @@ void udp_splice_eof(struct socket *sock) udp_push_pending_frames(sk); release_sock(sk); } -EXPORT_SYMBOL_GPL(udp_splice_eof); +EXPORT_IPV6_MOD_GPL(udp_splice_eof); #define UDP_SKB_IS_STATELESS 0x80000000 @@ -1678,7 +1677,7 @@ void udp_skb_destructor(struct sock *sk, struct sk_buff *skb) prefetch(&skb->data); udp_rmem_release(sk, udp_skb_truesize(skb), 1, false); } -EXPORT_SYMBOL(udp_skb_destructor); +EXPORT_IPV6_MOD(udp_skb_destructor); /* as above, but the caller held the rx queue lock, too */ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb) @@ -1785,7 +1784,7 @@ drop: busylock_release(busy); return err; } -EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb); +EXPORT_IPV6_MOD_GPL(__udp_enqueue_schedule_skb); void udp_destruct_common(struct sock *sk) { @@ -1801,7 +1800,7 @@ void udp_destruct_common(struct sock *sk) } udp_rmem_release(sk, total, 0, true); } -EXPORT_SYMBOL_GPL(udp_destruct_common); +EXPORT_IPV6_MOD_GPL(udp_destruct_common); static void udp_destruct_sock(struct sock *sk) { @@ -1832,7 +1831,7 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) skb_release_head_state(skb); __consume_stateless_skb(skb); } -EXPORT_SYMBOL_GPL(skb_consume_udp); +EXPORT_IPV6_MOD_GPL(skb_consume_udp); static struct sk_buff *__first_packet_length(struct sock *sk, struct sk_buff_head *rcvq, @@ -1849,7 +1848,7 @@ static struct sk_buff *__first_packet_length(struct sock *sk, atomic_inc(&sk->sk_drops); __skb_unlink(skb, rcvq); *total += skb->truesize; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); } else { udp_skb_csum_unnecessary_set(skb); break; @@ -1914,7 +1913,7 @@ int udp_ioctl(struct sock *sk, int cmd, int *karg) return 0; } -EXPORT_SYMBOL(udp_ioctl); +EXPORT_IPV6_MOD(udp_ioctl); struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, int *off, int *err) @@ -2003,14 +2002,14 @@ try_again: __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite); __UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite); atomic_inc(&sk->sk_drops); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); goto try_again; } WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk)); return recv_actor(sk, skb); } -EXPORT_SYMBOL(udp_read_skb); +EXPORT_IPV6_MOD(udp_read_skb); /* * This should be easy, if there is something there we @@ -2118,7 +2117,7 @@ csum_copy_err: UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } - kfree_skb(skb); + kfree_skb_reason(skb, 
SKB_DROP_REASON_UDP_CSUM); /* starting over for a new packet, but check if we need to yield */ cond_resched(); @@ -2137,7 +2136,7 @@ int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len); } -EXPORT_SYMBOL(udp_pre_connect); +EXPORT_IPV6_MOD(udp_pre_connect); static int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -2186,7 +2185,7 @@ int udp_disconnect(struct sock *sk, int flags) release_sock(sk); return 0; } -EXPORT_SYMBOL(udp_disconnect); +EXPORT_IPV6_MOD(udp_disconnect); void udp_lib_unhash(struct sock *sk) { @@ -2216,7 +2215,7 @@ void udp_lib_unhash(struct sock *sk) spin_unlock_bh(&hslot->lock); } } -EXPORT_SYMBOL(udp_lib_unhash); +EXPORT_IPV6_MOD(udp_lib_unhash); /* * inet_rcv_saddr was changed, we must rehash secondary hash @@ -2280,7 +2279,7 @@ void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4) } } } -EXPORT_SYMBOL(udp_lib_rehash); +EXPORT_IPV6_MOD(udp_lib_rehash); void udp_v4_rehash(struct sock *sk) { @@ -2485,7 +2484,7 @@ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) } return false; } -EXPORT_SYMBOL(udp_sk_rx_dst_set); +EXPORT_IPV6_MOD(udp_sk_rx_dst_set); /* * Multicasts and broadcasts go to each listener. @@ -3041,7 +3040,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, return err; } -EXPORT_SYMBOL(udp_lib_setsockopt); +EXPORT_IPV6_MOD(udp_lib_setsockopt); int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) @@ -3112,7 +3111,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, return -EFAULT; return 0; } -EXPORT_SYMBOL(udp_lib_getsockopt); +EXPORT_IPV6_MOD(udp_lib_getsockopt); int udp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) @@ -3154,7 +3153,7 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait) return mask; } -EXPORT_SYMBOL(udp_poll); +EXPORT_IPV6_MOD(udp_poll); int udp_abort(struct sock *sk, int err) { @@ -3177,7 +3176,7 @@ out: return 0; } -EXPORT_SYMBOL_GPL(udp_abort); +EXPORT_IPV6_MOD_GPL(udp_abort); struct proto udp_prot = { .name = "UDP", @@ -3311,7 +3310,7 @@ void *udp_seq_start(struct seq_file *seq, loff_t *pos) return *pos ? 
udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN; } -EXPORT_SYMBOL(udp_seq_start); +EXPORT_IPV6_MOD(udp_seq_start); void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { @@ -3325,7 +3324,7 @@ void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++*pos; return sk; } -EXPORT_SYMBOL(udp_seq_next); +EXPORT_IPV6_MOD(udp_seq_next); void udp_seq_stop(struct seq_file *seq, void *v) { @@ -3337,7 +3336,7 @@ void udp_seq_stop(struct seq_file *seq, void *v) if (state->bucket <= udptable->mask) spin_unlock_bh(&udptable->hash[state->bucket].lock); } -EXPORT_SYMBOL(udp_seq_stop); +EXPORT_IPV6_MOD(udp_seq_stop); /* ------------------------------------------------------------------------ */ static void udp4_format_sock(struct sock *sp, struct seq_file *f, @@ -3616,7 +3615,7 @@ const struct seq_operations udp_seq_ops = { .stop = udp_seq_stop, .show = udp4_seq_show, }; -EXPORT_SYMBOL(udp_seq_ops); +EXPORT_IPV6_MOD(udp_seq_ops); static struct udp_seq_afinfo udp4_seq_afinfo = { .family = AF_INET, diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index ecfca59f31f1..2c0725583be3 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -634,7 +634,7 @@ static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport) { const struct iphdr *iph = skb_gro_network_header(skb); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); int iif, sdif; inet_get_iif_sdif(skb, &iif, &sdif); |
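A large share of the churn in this section is the mechanical EXPORT_SYMBOL() to EXPORT_IPV6_MOD() conversion (tcp_filter, tcp_add_backlog, the udp_lib_* helpers, the seq_file hooks, and so on). The macro itself is not part of this diff; the sketch below shows the shape such a helper would plausibly take — export the symbol only when IPv6, its sole modular consumer, is built as a module — and both the guard condition and the header it would live in are assumptions, not something this patch shows.

/* Approximation only: EXPORT_IPV6_MOD()/EXPORT_IPV6_MOD_GPL() as used all
 * over this diff presumably reduce to something like the following in a
 * shared header (the IS_MODULE(CONFIG_IPV6) guard is an assumption).
 */
#if IS_MODULE(CONFIG_IPV6)
#define EXPORT_IPV6_MOD(sym)		EXPORT_SYMBOL(sym)
#define EXPORT_IPV6_MOD_GPL(sym)	EXPORT_SYMBOL_GPL(sym)
#else
/* IPv6 is built in (or disabled), so no module ever links against these
 * symbols and they need not be exported at all. */
#define EXPORT_IPV6_MOD(sym)
#define EXPORT_IPV6_MOD_GPL(sym)
#endif

Usage stays identical to EXPORT_SYMBOL(), e.g. EXPORT_IPV6_MOD(tcp_filter);, so each call site in the diff is a one-line substitution with no behavioural change for CONFIG_IPV6=m kernels.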