diff options
author | Greg Kroah-Hartman <gregkh@linuxfoundation.org> | 2023-08-13 22:14:51 +0200 |
---|---|---|
committer | Greg Kroah-Hartman <gregkh@linuxfoundation.org> | 2023-08-13 22:14:51 +0200 |
commit | e75850b4573a092078d5ff1493d3d9ee16b98821 (patch) | |
tree | ece3bd9b45b43ec13c75de84967043a88fedda6e /net | |
parent | 5a652fe5e38d906621e9c54a7d14ca4e030ab4f6 (diff) | |
parent | 2ccdd1b13c591d306f0401d98dedc4bdcd02b421 (diff) | |
download | lwn-e75850b4573a092078d5ff1493d3d9ee16b98821.tar.gz lwn-e75850b4573a092078d5ff1493d3d9ee16b98821.zip |
Merge 6.5-rc6 into char-misc-next
We need the char/misc fixes in here as well to build on top of.
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'net')
63 files changed, 883 insertions, 466 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index e40aa3e3641c..b3662119ddbc 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -384,8 +384,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, dev->name); vlan_vid_add(dev, htons(ETH_P_8021Q), 0); } - if (event == NETDEV_DOWN && - (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) + if (event == NETDEV_DOWN) vlan_vid_del(dev, htons(ETH_P_8021Q), 0); vlan_info = rtnl_dereference(dev->vlan_info); diff --git a/net/can/raw.c b/net/can/raw.c index ba6b52b1d776..e10f59375659 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -865,7 +865,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) skb->dev = dev; skb->priority = sk->sk_priority; - skb->mark = sk->sk_mark; + skb->mark = READ_ONCE(sk->sk_mark); skb->tstamp = sockc.transmit_time; skb_setup_tx_timestamp(skb, sockc.tsflags); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 11c04e7d928e..658a6f2320cf 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -3334,17 +3334,24 @@ static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq) int ret; dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); - ret = wait_for_completion_interruptible(&lreq->reg_commit_wait); + ret = wait_for_completion_killable(&lreq->reg_commit_wait); return ret ?: lreq->reg_commit_error; } -static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq) +static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq, + unsigned long timeout) { - int ret; + long left; dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); - ret = wait_for_completion_interruptible(&lreq->notify_finish_wait); - return ret ?: lreq->notify_finish_error; + left = wait_for_completion_killable_timeout(&lreq->notify_finish_wait, + ceph_timeout_jiffies(timeout)); + if (left <= 0) + left = left ?: -ETIMEDOUT; + else + left = lreq->notify_finish_error; /* completed */ + + return left; } /* @@ -4896,7 +4903,8 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc, linger_submit(lreq); ret = linger_reg_commit_wait(lreq); if (!ret) - ret = linger_notify_finish_wait(lreq); + ret = linger_notify_finish_wait(lreq, + msecs_to_jiffies(2 * timeout * MSEC_PER_SEC)); else dout("lreq %p failed to initiate notify %d\n", lreq, ret); diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index d4172534dfa8..cca7594be92e 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -496,8 +496,11 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) return ERR_PTR(-EPERM); nla_for_each_nested(nla, nla_stgs, rem) { - if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD) + if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD) { + if (nla_len(nla) != sizeof(u32)) + return ERR_PTR(-EINVAL); nr_maps++; + } } diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL); diff --git a/net/core/filter.c b/net/core/filter.c index 06ba0e56e369..28a59596987a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4116,12 +4116,6 @@ BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) if (unlikely(data_end > data_hard_end)) return -EINVAL; - /* ALL drivers MUST init xdp->frame_sz, chicken check below */ - if (unlikely(xdp->frame_sz > PAGE_SIZE)) { - WARN_ONCE(1, "Too BIG xdp->frame_sz = %d\n", xdp->frame_sz); - return -EINVAL; - } - if (unlikely(data_end < xdp->data + ETH_HLEN)) return -EINVAL; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3ad4e030846d..aef25aa5cf1d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -5140,13 +5140,17 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { nla_for_each_nested(attr, br_spec, rem) { - if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { + if (nla_type(attr) == IFLA_BRIDGE_FLAGS && !have_flags) { if (nla_len(attr) < sizeof(flags)) return -EINVAL; have_flags = true; flags = nla_get_u16(attr); - break; + } + + if (nla_type(attr) == IFLA_BRIDGE_MODE) { + if (nla_len(attr) < sizeof(u16)) + return -EINVAL; } } } diff --git a/net/core/skmsg.c b/net/core/skmsg.c index a29508e1ff35..ef1a2eb6520b 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1120,13 +1120,19 @@ static void sk_psock_strp_data_ready(struct sock *sk) int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) { + int ret; + static const struct strp_callbacks cb = { .rcv_msg = sk_psock_strp_read, .read_sock_done = sk_psock_strp_read_done, .parse_msg = sk_psock_strp_parse, }; - return strp_init(&psock->strp, sk, &cb); + ret = strp_init(&psock->strp, sk, &cb); + if (!ret) + sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED); + + return ret; } void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) @@ -1154,7 +1160,7 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) static void sk_psock_done_strp(struct sk_psock *psock) { /* Parser has been stopped */ - if (psock->progs.stream_parser) + if (sk_psock_test_state(psock, SK_PSOCK_RX_STRP_ENABLED)) strp_done(&psock->strp); } #else diff --git a/net/core/sock.c b/net/core/sock.c index 9370fd50aa2c..732fc37a4771 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -429,6 +429,7 @@ static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, { struct __kernel_sock_timeval tv; int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval); + long val; if (err) return err; @@ -439,7 +440,7 @@ static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, if (tv.tv_sec < 0) { static int warned __read_mostly; - *timeo_p = 0; + WRITE_ONCE(*timeo_p, 0); if (warned < 10 && net_ratelimit()) { warned++; pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", @@ -447,11 +448,12 @@ static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, } return 0; } - *timeo_p = MAX_SCHEDULE_TIMEOUT; - if (tv.tv_sec == 0 && tv.tv_usec == 0) - return 0; - if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) - *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ); + val = MAX_SCHEDULE_TIMEOUT; + if ((tv.tv_sec || tv.tv_usec) && + (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))) + val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, + USEC_PER_SEC / HZ); + WRITE_ONCE(*timeo_p, val); return 0; } @@ -804,7 +806,7 @@ EXPORT_SYMBOL(sock_no_linger); void sock_set_priority(struct sock *sk, u32 priority) { lock_sock(sk); - sk->sk_priority = priority; + WRITE_ONCE(sk->sk_priority, priority); release_sock(sk); } EXPORT_SYMBOL(sock_set_priority); @@ -813,9 +815,9 @@ void sock_set_sndtimeo(struct sock *sk, s64 secs) { lock_sock(sk); if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) - sk->sk_sndtimeo = secs * HZ; + WRITE_ONCE(sk->sk_sndtimeo, secs * HZ); else - sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT); release_sock(sk); } EXPORT_SYMBOL(sock_set_sndtimeo); @@ -988,7 +990,7 @@ EXPORT_SYMBOL(sock_set_rcvbuf); static void __sock_set_mark(struct sock *sk, u32 val) { if (val != sk->sk_mark) { - sk->sk_mark = val; + WRITE_ONCE(sk->sk_mark, val); sk_dst_reset(sk); } } @@ -1007,7 +1009,7 @@ static void sock_release_reserved_memory(struct sock *sk, int bytes) bytes = round_down(bytes, PAGE_SIZE); WARN_ON(bytes > sk->sk_reserved_mem); - sk->sk_reserved_mem -= bytes; + WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes); sk_mem_reclaim(sk); } @@ -1044,7 +1046,8 @@ static int sock_reserve_memory(struct sock *sk, int bytes) } sk->sk_forward_alloc += pages << PAGE_SHIFT; - sk->sk_reserved_mem += pages << PAGE_SHIFT; + WRITE_ONCE(sk->sk_reserved_mem, + sk->sk_reserved_mem + (pages << PAGE_SHIFT)); return 0; } @@ -1213,7 +1216,7 @@ set_sndbuf: if ((val >= 0 && val <= 6) || sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) - sk->sk_priority = val; + WRITE_ONCE(sk->sk_priority, val); else ret = -EPERM; break; @@ -1438,7 +1441,8 @@ set_sndbuf: cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); - sk->sk_max_pacing_rate = ulval; + /* Pairs with READ_ONCE() from sk_getsockopt() */ + WRITE_ONCE(sk->sk_max_pacing_rate, ulval); sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval); break; } @@ -1533,7 +1537,9 @@ set_sndbuf: } if ((u8)val == SOCK_TXREHASH_DEFAULT) val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); - /* Paired with READ_ONCE() in tcp_rtx_synack() */ + /* Paired with READ_ONCE() in tcp_rtx_synack() + * and sk_getsockopt(). + */ WRITE_ONCE(sk->sk_txrehash, (u8)val); break; @@ -1633,11 +1639,11 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_SNDBUF: - v.val = sk->sk_sndbuf; + v.val = READ_ONCE(sk->sk_sndbuf); break; case SO_RCVBUF: - v.val = sk->sk_rcvbuf; + v.val = READ_ONCE(sk->sk_rcvbuf); break; case SO_REUSEADDR: @@ -1679,7 +1685,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_PRIORITY: - v.val = sk->sk_priority; + v.val = READ_ONCE(sk->sk_priority); break; case SO_LINGER: @@ -1717,16 +1723,18 @@ int sk_getsockopt(struct sock *sk, int level, int optname, case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: - lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname); + lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v, + SO_RCVTIMEO_OLD == optname); break; case SO_SNDTIMEO_OLD: case SO_SNDTIMEO_NEW: - lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname); + lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v, + SO_SNDTIMEO_OLD == optname); break; case SO_RCVLOWAT: - v.val = sk->sk_rcvlowat; + v.val = READ_ONCE(sk->sk_rcvlowat); break; case SO_SNDLOWAT: @@ -1770,7 +1778,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, spin_unlock(&sk->sk_peer_lock); if (!peer_pid) - return -ESRCH; + return -ENODATA; pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file); put_pid(peer_pid); @@ -1843,7 +1851,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, optval, optlen, len); case SO_MARK: - v.val = sk->sk_mark; + v.val = READ_ONCE(sk->sk_mark); break; case SO_RCVMARK: @@ -1862,7 +1870,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, if (!sock->ops->set_peek_off) return -EOPNOTSUPP; - v.val = sk->sk_peek_off; + v.val = READ_ONCE(sk->sk_peek_off); break; case SO_NOFCS: v.val = sock_flag(sk, SOCK_NOFCS); @@ -1892,7 +1900,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, #ifdef CONFIG_NET_RX_BUSY_POLL case SO_BUSY_POLL: - v.val = sk->sk_ll_usec; + v.val = READ_ONCE(sk->sk_ll_usec); break; case SO_PREFER_BUSY_POLL: v.val = READ_ONCE(sk->sk_prefer_busy_poll); @@ -1900,12 +1908,14 @@ int sk_getsockopt(struct sock *sk, int level, int optname, #endif case SO_MAX_PACING_RATE: + /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */ if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { lv = sizeof(v.ulval); - v.ulval = sk->sk_max_pacing_rate; + v.ulval = READ_ONCE(sk->sk_max_pacing_rate); } else { /* 32bit version */ - v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U); + v.val = min_t(unsigned long, ~0U, + READ_ONCE(sk->sk_max_pacing_rate)); } break; @@ -1973,11 +1983,12 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_RESERVE_MEM: - v.val = sk->sk_reserved_mem; + v.val = READ_ONCE(sk->sk_reserved_mem); break; case SO_TXREHASH: - v.val = sk->sk_txrehash; + /* Paired with WRITE_ONCE() in sk_setsockopt() */ + v.val = READ_ONCE(sk->sk_txrehash); break; default: @@ -3168,7 +3179,7 @@ EXPORT_SYMBOL(__sk_mem_reclaim); int sk_set_peek_off(struct sock *sk, int val) { - sk->sk_peek_off = val; + WRITE_ONCE(sk->sk_peek_off, val); return 0; } EXPORT_SYMBOL_GPL(sk_set_peek_off); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 19538d628714..8f07fea39d9e 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -115,7 +115,6 @@ static void sock_map_sk_acquire(struct sock *sk) __acquires(&sk->sk_lock.slock) { lock_sock(sk); - preempt_disable(); rcu_read_lock(); } @@ -123,7 +122,6 @@ static void sock_map_sk_release(struct sock *sk) __releases(&sk->sk_lock.slock) { rcu_read_unlock(); - preempt_enable(); release_sock(sk); } @@ -148,13 +146,13 @@ static void sock_map_del_link(struct sock *sk, list_for_each_entry_safe(link, tmp, &psock->link, list) { if (link->link_raw == link_raw) { struct bpf_map *map = link->map; - struct bpf_stab *stab = container_of(map, struct bpf_stab, - map); - if (psock->saved_data_ready && stab->progs.stream_parser) + struct sk_psock_progs *progs = sock_map_progs(map); + + if (psock->saved_data_ready && progs->stream_parser) strp_stop = true; - if (psock->saved_data_ready && stab->progs.stream_verdict) + if (psock->saved_data_ready && progs->stream_verdict) verdict_stop = true; - if (psock->saved_data_ready && stab->progs.skb_verdict) + if (psock->saved_data_ready && progs->skb_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index c0c438128575..2e6b8c8fd2de 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -980,7 +980,7 @@ static int dcbnl_bcn_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_BCN_ATTR_MAX, - tb[DCB_ATTR_BCN], dcbnl_pfc_up_nest, + tb[DCB_ATTR_BCN], dcbnl_bcn_nest, NULL); if (ret) return ret; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 7249ef218178..d29d1163203d 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -238,8 +238,8 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); - err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass, - sk->sk_priority); + err = ip6_xmit(sk, skb, &fl6, READ_ONCE(sk->sk_mark), opt, + np->tclass, sk->sk_priority); rcu_read_unlock(); err = net_xmit_eval(err); } diff --git a/net/dccp/output.c b/net/dccp/output.c index b8a24734385e..fd2eb148d24d 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -187,7 +187,7 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) /* And store cached results */ icsk->icsk_pmtu_cookie = pmtu; - dp->dccps_mss_cache = cur_mps; + WRITE_ONCE(dp->dccps_mss_cache, cur_mps); return cur_mps; } diff --git a/net/dccp/proto.c b/net/dccp/proto.c index f331e5977a84..4e3266e4d7c3 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -630,7 +630,7 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, return dccp_getsockopt_service(sk, len, (__be32 __user *)optval, optlen); case DCCP_SOCKOPT_GET_CUR_MPS: - val = dp->dccps_mss_cache; + val = READ_ONCE(dp->dccps_mss_cache); break; case DCCP_SOCKOPT_AVAILABLE_CCIDS: return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); @@ -739,7 +739,7 @@ int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) trace_dccp_probe(sk, len); - if (len > dp->dccps_mss_cache) + if (len > READ_ONCE(dp->dccps_mss_cache)) return -EMSGSIZE; lock_sock(sk); @@ -772,6 +772,12 @@ int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto out_discard; } + /* We need to check dccps_mss_cache after socket is locked. */ + if (len > dp->dccps_mss_cache) { + rc = -EMSGSIZE; + goto out_discard; + } + skb_reserve(skb, sk->sk_prot->max_header); rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc != 0) diff --git a/net/dsa/port.c b/net/dsa/port.c index 0ce8fd311c78..2f6195d7b741 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -1727,8 +1727,15 @@ int dsa_port_phylink_create(struct dsa_port *dp) ds->ops->phylink_mac_an_restart) dp->pl_config.legacy_pre_march2020 = true; - if (ds->ops->phylink_get_caps) + if (ds->ops->phylink_get_caps) { ds->ops->phylink_get_caps(ds, dp->index, &dp->pl_config); + } else { + /* For legacy drivers */ + __set_bit(PHY_INTERFACE_MODE_INTERNAL, + dp->pl_config.supported_interfaces); + __set_bit(PHY_INTERFACE_MODE_GMII, + dp->pl_config.supported_interfaces); + } pl = phylink_create(&dp->pl_config, of_fwnode_handle(dp->dn), mode, &dsa_port_phylink_mac_ops); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index b812eb36f0e3..f7426926a104 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -150,7 +150,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, } #endif - if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, sk->sk_mark)) + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, READ_ONCE(sk->sk_mark))) goto errout; if (ext & (1 << (INET_DIAG_CLASS_ID - 1)) || @@ -799,7 +799,7 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) entry.ifindex = sk->sk_bound_dev_if; entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0; if (sk_fullsock(sk)) - entry.mark = sk->sk_mark; + entry.mark = READ_ONCE(sk->sk_mark); else if (sk->sk_state == TCP_NEW_SYN_RECV) entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; else if (sk->sk_state == TCP_TIME_WAIT) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6e70839257f7..6ba1a0fafbaa 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -184,9 +184,9 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, ip_options_build(skb, &opt->opt, daddr, rt); } - skb->priority = sk->sk_priority; + skb->priority = READ_ONCE(sk->sk_priority); if (!skb->mark) - skb->mark = sk->sk_mark; + skb->mark = READ_ONCE(sk->sk_mark); /* Send it out. */ return ip_local_out(net, skb->sk, skb); @@ -528,8 +528,8 @@ packet_routed: skb_shinfo(skb)->gso_segs ?: 1); /* TODO : should we use skb->sk here instead of sk ? */ - skb->priority = sk->sk_priority; - skb->mark = sk->sk_mark; + skb->priority = READ_ONCE(sk->sk_priority); + skb->mark = READ_ONCE(sk->sk_mark); res = ip_local_out(net, sk, skb); rcu_read_unlock(); @@ -1158,10 +1158,15 @@ alloc_new_skb: } copy = datalen - transhdrlen - fraggap - pagedlen; + /* [!] NOTE: copy will be negative if pagedlen>0 + * because then the equation reduces to -fraggap. + */ if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); goto error; + } else if (flags & MSG_SPLICE_PAGES) { + copy = 0; } offset += copy; @@ -1209,6 +1214,10 @@ alloc_new_skb: } else if (flags & MSG_SPLICE_PAGES) { struct msghdr *msg = from; + err = -EIO; + if (WARN_ON_ONCE(copy > msg->msg_iter.count)) + goto error; + err = skb_splice_from_iter(skb, &msg->msg_iter, copy, sk->sk_allocation); if (err < 0) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 8e97d8d4cc9d..d41bce8927b2 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -592,7 +592,7 @@ void __ip_sock_set_tos(struct sock *sk, int val) } if (inet_sk(sk)->tos != val) { inet_sk(sk)->tos = val; - sk->sk_priority = rt_tos2priority(val); + WRITE_ONCE(sk->sk_priority, rt_tos2priority(val)); sk_dst_reset(sk); } } diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 92c02c886fe7..586b1b3e35b8 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -224,7 +224,7 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu) .un.frag.__unused = 0, .un.frag.mtu = htons(mtu), }; - icmph->checksum = ip_compute_csum(icmph, len); + icmph->checksum = csum_fold(skb_checksum(skb, 0, len, 0)); skb_reset_transport_header(skb); niph = skb_push(skb, sizeof(*niph)); diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index f95142e56da0..be5498f5dd31 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -3221,13 +3221,9 @@ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) &rtm_dump_nexthop_cb, &filter); if (err < 0) { if (likely(skb->len)) - goto out; - goto out_err; + err = skb->len; } -out: - err = skb->len; -out_err: cb->seq = net->nexthop.seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); return err; @@ -3367,25 +3363,19 @@ static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb, dd->filter.res_bucket_nh_id != nhge->nh->id) continue; + dd->ctx->bucket_index = bucket_index; err = nh_fill_res_bucket(skb, nh, bucket, bucket_index, RTM_NEWNEXTHOPBUCKET, portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->extack); - if (err < 0) { - if (likely(skb->len)) - goto out; - goto out_err; - } + if (err) + return err; } dd->ctx->done_nh_idx = dd->ctx->nh.idx + 1; - bucket_index = 0; + dd->ctx->bucket_index = 0; -out: - err = skb->len; -out_err: - dd->ctx->bucket_index = bucket_index; - return err; + return 0; } static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb, @@ -3434,13 +3424,9 @@ static int rtm_dump_nexthop_bucket(struct sk_buff *skb, if (err < 0) { if (likely(skb->len)) - goto out; - goto out_err; + err = skb->len; } -out: - err = skb->len; -out_err: cb->seq = net->nexthop.seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); return err; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 7782ff5e6539..cb381f5aa464 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -348,7 +348,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, goto error; skb_reserve(skb, hlen); - skb->priority = sk->sk_priority; + skb->priority = READ_ONCE(sk->sk_priority); skb->mark = sockc->mark; skb->tstamp = sockc->transmit_time; skb_dst_set(skb, &rt->dst); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 98d7e6ba7493..92fede388d52 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -518,7 +518,7 @@ static void __build_flow_key(const struct net *net, struct flowi4 *fl4, const struct inet_sock *inet = inet_sk(sk); oif = sk->sk_bound_dev_if; - mark = sk->sk_mark; + mark = READ_ONCE(sk->sk_mark); tos = ip_sock_rt_tos(sk); scope = ip_sock_rt_scope(sk); prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol; @@ -552,7 +552,7 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; - flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, + flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk) & IPTOS_RT_MASK, ip_sock_rt_scope(sk), inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 069642014636..a59cc4b83861 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -931,9 +931,9 @@ static void tcp_v4_send_ack(const struct sock *sk, ctl_sk = this_cpu_read(ipv4_tcp_sk); sock_net_set(ctl_sk, net); ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? - inet_twsk(sk)->tw_mark : sk->sk_mark; + inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? - inet_twsk(sk)->tw_priority : sk->sk_priority; + inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); transmit_time = tcp_transmit_time(sk); ip_send_unicast_reply(ctl_sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 82f4575f9cd9..99ac5efe244d 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -40,7 +40,7 @@ struct tcp_fastopen_metrics { struct tcp_metrics_block { struct tcp_metrics_block __rcu *tcpm_next; - possible_net_t tcpm_net; + struct net *tcpm_net; struct inetpeer_addr tcpm_saddr; struct inetpeer_addr tcpm_daddr; unsigned long tcpm_stamp; @@ -51,34 +51,38 @@ struct tcp_metrics_block { struct rcu_head rcu_head; }; -static inline struct net *tm_net(struct tcp_metrics_block *tm) +static inline struct net *tm_net(const struct tcp_metrics_block *tm) { - return read_pnet(&tm->tcpm_net); + /* Paired with the WRITE_ONCE() in tcpm_new() */ + return READ_ONCE(tm->tcpm_net); } static bool tcp_metric_locked(struct tcp_metrics_block *tm, enum tcp_metric_index idx) { - return tm->tcpm_lock & (1 << idx); + /* Paired with WRITE_ONCE() in tcpm_suck_dst() */ + return READ_ONCE(tm->tcpm_lock) & (1 << idx); } -static u32 tcp_metric_get(struct tcp_metrics_block *tm, +static u32 tcp_metric_get(const struct tcp_metrics_block *tm, enum tcp_metric_index idx) { - return tm->tcpm_vals[idx]; + /* Paired with WRITE_ONCE() in tcp_metric_set() */ + return READ_ONCE(tm->tcpm_vals[idx]); } static void tcp_metric_set(struct tcp_metrics_block *tm, enum tcp_metric_index idx, u32 val) { - tm->tcpm_vals[idx] = val; + /* Paired with READ_ONCE() in tcp_metric_get() */ + WRITE_ONCE(tm->tcpm_vals[idx], val); } static bool addr_same(const struct inetpeer_addr *a, const struct inetpeer_addr *b) { - return inetpeer_addr_cmp(a, b) == 0; + return (a->family == b->family) && !inetpeer_addr_cmp(a, b); } struct tcpm_hash_bucket { @@ -89,6 +93,7 @@ static struct tcpm_hash_bucket *tcp_metrics_hash __read_mostly; static unsigned int tcp_metrics_hash_log __read_mostly; static DEFINE_SPINLOCK(tcp_metrics_lock); +static DEFINE_SEQLOCK(fastopen_seqlock); static void tcpm_suck_dst(struct tcp_metrics_block *tm, const struct dst_entry *dst, @@ -97,7 +102,7 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, u32 msval; u32 val; - tm->tcpm_stamp = jiffies; + WRITE_ONCE(tm->tcpm_stamp, jiffies); val = 0; if (dst_metric_locked(dst, RTAX_RTT)) @@ -110,30 +115,42 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, val |= 1 << TCP_METRIC_CWND; if (dst_metric_locked(dst, RTAX_REORDERING)) val |= 1 << TCP_METRIC_REORDERING; - tm->tcpm_lock = val; + /* Paired with READ_ONCE() in tcp_metric_locked() */ + WRITE_ONCE(tm->tcpm_lock, val); msval = dst_metric_raw(dst, RTAX_RTT); - tm->tcpm_vals[TCP_METRIC_RTT] = msval * USEC_PER_MSEC; + tcp_metric_set(tm, TCP_METRIC_RTT, msval * USEC_PER_MSEC); msval = dst_metric_raw(dst, RTAX_RTTVAR); - tm->tcpm_vals[TCP_METRIC_RTTVAR] = msval * USEC_PER_MSEC; - tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH); - tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND); - tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); + tcp_metric_set(tm, TCP_METRIC_RTTVAR, msval * USEC_PER_MSEC); + tcp_metric_set(tm, TCP_METRIC_SSTHRESH, + dst_metric_raw(dst, RTAX_SSTHRESH)); + tcp_metric_set(tm, TCP_METRIC_CWND, + dst_metric_raw(dst, RTAX_CWND)); + tcp_metric_set(tm, TCP_METRIC_REORDERING, + dst_metric_raw(dst, RTAX_REORDERING)); if (fastopen_clear) { + write_seqlock(&fastopen_seqlock); tm->tcpm_fastopen.mss = 0; tm->tcpm_fastopen.syn_loss = 0; tm->tcpm_fastopen.try_exp = 0; tm->tcpm_fastopen.cookie.exp = false; tm->tcpm_fastopen.cookie.len = 0; + write_sequnlock(&fastopen_seqlock); } } #define TCP_METRICS_TIMEOUT (60 * 60 * HZ) -static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) +static void tcpm_check_stamp(struct tcp_metrics_block *tm, + const struct dst_entry *dst) { - if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) + unsigned long limit; + + if (!tm) + return; + limit = READ_ONCE(tm->tcpm_stamp) + TCP_METRICS_TIMEOUT; + if (unlikely(time_after(jiffies, limit))) tcpm_suck_dst(tm, dst, false); } @@ -174,20 +191,23 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, oldest = deref_locked(tcp_metrics_hash[hash].chain); for (tm = deref_locked(oldest->tcpm_next); tm; tm = deref_locked(tm->tcpm_next)) { - if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp)) + if (time_before(READ_ONCE(tm->tcpm_stamp), + READ_ONCE(oldest->tcpm_stamp))) oldest = tm; } tm = oldest; } else { - tm = kmalloc(sizeof(*tm), GFP_ATOMIC); + tm = kzalloc(sizeof(*tm), GFP_ATOMIC); if (!tm) goto out_unlock; } - write_pnet(&tm->tcpm_net, net); + /* Paired with the READ_ONCE() in tm_net() */ + WRITE_ONCE(tm->tcpm_net, net); + tm->tcpm_saddr = *saddr; tm->tcpm_daddr = *daddr; - tcpm_suck_dst(tm, dst, true); + tcpm_suck_dst(tm, dst, reclaim); if (likely(!reclaim)) { tm->tcpm_next = tcp_metrics_hash[hash].chain; @@ -434,7 +454,7 @@ void tcp_update_metrics(struct sock *sk) tp->reordering); } } - tm->tcpm_stamp = jiffies; + WRITE_ONCE(tm->tcpm_stamp, jiffies); out_unlock: rcu_read_unlock(); } @@ -539,8 +559,6 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst) return ret; } -static DEFINE_SEQLOCK(fastopen_seqlock); - void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie) { @@ -647,7 +665,7 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, } if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE, - jiffies - tm->tcpm_stamp, + jiffies - READ_ONCE(tm->tcpm_stamp), TCP_METRICS_ATTR_PAD) < 0) goto nla_put_failure; @@ -658,7 +676,7 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, if (!nest) goto nla_put_failure; for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) { - u32 val = tm->tcpm_vals[i]; + u32 val = tcp_metric_get(tm, i); if (!val) continue; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 42a96b3547c9..abfa860367aa 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -114,6 +114,7 @@ #include <net/sock_reuseport.h> #include <net/addrconf.h> #include <net/udp_tunnel.h> +#include <net/gro.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6_stubs.h> #endif @@ -555,10 +556,13 @@ struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb, { const struct iphdr *iph = ip_hdr(skb); struct net *net = dev_net(skb->dev); + int iif, sdif; + + inet_get_iif_sdif(skb, &iif, &sdif); return __udp4_lib_lookup(net, iph->saddr, sport, - iph->daddr, dport, inet_iif(skb), - inet_sdif(skb), net->ipv4.udp_table, NULL); + iph->daddr, dport, iif, + sdif, net->ipv4.udp_table, NULL); } /* Must be called under rcu_read_lock(). diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index f402946da344..0f46b3c2e4ac 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -609,10 +609,13 @@ static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport, { const struct iphdr *iph = skb_gro_network_header(skb); struct net *net = dev_net(skb->dev); + int iif, sdif; + + inet_get_iif_sdif(skb, &iif, &sdif); return __udp4_lib_lookup(net, iph->saddr, sport, - iph->daddr, dport, inet_iif(skb), - inet_sdif(skb), net->ipv4.udp_table, NULL); + iph->daddr, dport, iif, + sdif, net->ipv4.udp_table, NULL); } INDIRECT_CALLABLE_SCOPE diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index cc3d5ad17257..67a3b8f6e72b 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1073,7 +1073,7 @@ static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt, And all this only to mangle msg->im6_msgtype and to set msg->im6_mbz to "mbz" :-) */ - skb_push(skb, -skb_network_offset(pkt)); + __skb_pull(skb, skb_network_offset(pkt)); skb_push(skb, sizeof(*msg)); skb_reset_transport_header(skb); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 18634ebd20a4..a42be96ae209 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -197,7 +197,8 @@ static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, static inline int ndisc_is_useropt(const struct net_device *dev, struct nd_opt_hdr *opt) { - return opt->nd_opt_type == ND_OPT_RDNSS || + return opt->nd_opt_type == ND_OPT_PREFIX_INFO || + opt->nd_opt_type == ND_OPT_RDNSS || opt->nd_opt_type == ND_OPT_DNSSL || opt->nd_opt_type == ND_OPT_CAPTIVE_PORTAL || opt->nd_opt_type == ND_OPT_PREF64 || diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index f804c11e2146..c2c291827a2c 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -120,7 +120,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipcm6_init_sk(&ipc6, np); ipc6.sockc.tsflags = sk->sk_tsflags; - ipc6.sockc.mark = sk->sk_mark; + ipc6.sockc.mark = READ_ONCE(sk->sk_mark); fl6.flowi6_oif = oif; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index ac1cef094c5f..49381f35b623 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -614,7 +614,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb_reserve(skb, hlen); skb->protocol = htons(ETH_P_IPV6); - skb->priority = sk->sk_priority; + skb->priority = READ_ONCE(sk->sk_priority); skb->mark = sockc->mark; skb->tstamp = sockc->transmit_time; @@ -774,12 +774,12 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) */ memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_mark = sk->sk_mark; + fl6.flowi6_mark = READ_ONCE(sk->sk_mark); fl6.flowi6_uid = sk->sk_uid; ipcm6_init(&ipc6); ipc6.sockc.tsflags = sk->sk_tsflags; - ipc6.sockc.mark = sk->sk_mark; + ipc6.sockc.mark = fl6.flowi6_mark; if (sin6) { if (addr_len < SIN6_LEN_RFC2133) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 64e873f5895f..56a55585eb79 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2951,7 +2951,8 @@ void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) if (!oif && skb->dev) oif = l3mdev_master_ifindex(skb->dev); - ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid); + ip6_update_pmtu(skb, sock_net(sk), mtu, oif, READ_ONCE(sk->sk_mark), + sk->sk_uid); dst = __sk_dst_get(sk); if (!dst || !dst->obsolete || @@ -3172,8 +3173,8 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) { - ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, - sk->sk_uid); + ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, + READ_ONCE(sk->sk_mark), sk->sk_uid); } EXPORT_SYMBOL_GPL(ip6_sk_redirect); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4714eb695913..6e86721e1cdb 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -564,8 +564,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); - err = ip6_xmit(sk, skb, fl6, skb->mark ? : sk->sk_mark, opt, - tclass, sk->sk_priority); + err = ip6_xmit(sk, skb, fl6, skb->mark ? : READ_ONCE(sk->sk_mark), + opt, tclass, sk->sk_priority); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -939,7 +939,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 if (sk->sk_state == TCP_TIME_WAIT) mark = inet_twsk(sk)->tw_mark; else - mark = sk->sk_mark; + mark = READ_ONCE(sk->sk_mark); skb_set_delivery_time(buff, tcp_transmit_time(sk), true); } if (txhash) { @@ -1128,7 +1128,8 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, READ_ONCE(req->ts_recent), sk->sk_bound_dev_if, tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr, l3index), - ipv6_get_dsfield(ipv6_hdr(skb)), 0, sk->sk_priority, + ipv6_get_dsfield(ipv6_hdr(skb)), 0, + READ_ONCE(sk->sk_priority), READ_ONCE(tcp_rsk(req)->txhash)); } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b7c972aa09a7..f787e6b8424c 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -51,6 +51,7 @@ #include <net/inet6_hashtables.h> #include <net/busy_poll.h> #include <net/sock_reuseport.h> +#include <net/gro.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -300,10 +301,13 @@ struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, { const struct ipv6hdr *iph = ipv6_hdr(skb); struct net *net = dev_net(skb->dev); + int iif, sdif; + + inet6_get_iif_sdif(skb, &iif, &sdif); return __udp6_lib_lookup(net, &iph->saddr, sport, - &iph->daddr, dport, inet6_iif(skb), - inet6_sdif(skb), net->ipv4.udp_table, NULL); + &iph->daddr, dport, iif, + sdif, net->ipv4.udp_table, NULL); } /* Must be called under rcu_read_lock(). @@ -624,7 +628,7 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (type == NDISC_REDIRECT) { if (tunnel) { ip6_redirect(skb, sock_net(sk), inet6_iif(skb), - sk->sk_mark, sk->sk_uid); + READ_ONCE(sk->sk_mark), sk->sk_uid); } else { ip6_sk_redirect(skb, sk); } @@ -1356,7 +1360,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipcm6_init(&ipc6); ipc6.gso_size = READ_ONCE(up->gso_size); ipc6.sockc.tsflags = sk->sk_tsflags; - ipc6.sockc.mark = sk->sk_mark; + ipc6.sockc.mark = READ_ONCE(sk->sk_mark); /* destination address check */ if (sin6) { diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 09fa7a42cb93..6b95ba241ebe 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -118,10 +118,13 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport, { const struct ipv6hdr *iph = skb_gro_network_header(skb); struct net *net = dev_net(skb->dev); + int iif, sdif; + + inet6_get_iif_sdif(skb, &iif, &sdif); return __udp6_lib_lookup(net, &iph->saddr, sport, - &iph->daddr, dport, inet6_iif(skb), - inet6_sdif(skb), net->ipv4.udp_table, NULL); + &iph->daddr, dport, iif, + sdif, net->ipv4.udp_table, NULL); } INDIRECT_CALLABLE_SCOPE diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index b1623f9c4f92..ff78217f0cb1 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -519,7 +519,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) /* Get and verify the address */ memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_mark = sk->sk_mark; + fl6.flowi6_mark = READ_ONCE(sk->sk_mark); fl6.flowi6_uid = sk->sk_uid; ipcm6_init(&ipc6); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 3317d1cca156..d80658547836 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2335,7 +2335,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); - if (flags & MPTCP_CF_FASTCLOSE) { + if ((flags & MPTCP_CF_FASTCLOSE) && !__mptcp_check_fallback(msk)) { /* be sure to force the tcp_disconnect() path, * to generate the egress reset */ @@ -3328,7 +3328,7 @@ static void mptcp_release_cb(struct sock *sk) if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags)) __mptcp_clean_una_wakeup(sk); - if (unlikely(&msk->cb_flags)) { + if (unlikely(msk->cb_flags)) { /* be sure to set the current sk state before tacking actions * depending on sk_state, that is processing MPTCP_ERROR_REPORT */ diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 37fbe22e2433..ba2a873a4d2e 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -325,7 +325,6 @@ struct mptcp_sock { u32 subflow_id; u32 setsockopt_seq; char ca_name[TCP_CA_NAME_MAX]; - struct mptcp_sock *dl_next; }; #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 63f7a09335c5..a3f1fe810cc9 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -103,7 +103,7 @@ static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, in break; case SO_MARK: if (READ_ONCE(ssk->sk_mark) != sk->sk_mark) { - ssk->sk_mark = sk->sk_mark; + WRITE_ONCE(ssk->sk_mark, sk->sk_mark); sk_dst_reset(ssk); } break; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 9ee3b7abbaf6..94ae7dd01c65 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1793,16 +1793,31 @@ static void subflow_state_change(struct sock *sk) void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) { struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue; - struct mptcp_sock *msk, *next, *head = NULL; - struct request_sock *req; - struct sock *sk; + struct request_sock *req, *head, *tail; + struct mptcp_subflow_context *subflow; + struct sock *sk, *ssk; - /* build a list of all unaccepted mptcp sockets */ + /* Due to lock dependencies no relevant lock can be acquired under rskq_lock. + * Splice the req list, so that accept() can not reach the pending ssk after + * the listener socket is released below. + */ spin_lock_bh(&queue->rskq_lock); - for (req = queue->rskq_accept_head; req; req = req->dl_next) { - struct mptcp_subflow_context *subflow; - struct sock *ssk = req->sk; + head = queue->rskq_accept_head; + tail = queue->rskq_accept_tail; + queue->rskq_accept_head = NULL; + queue->rskq_accept_tail = NULL; + spin_unlock_bh(&queue->rskq_lock); + + if (!head) + return; + /* can't acquire the msk socket lock under the subflow one, + * or will cause ABBA deadlock + */ + release_sock(listener_ssk); + + for (req = head; req; req = req->dl_next) { + ssk = req->sk; if (!sk_is_mptcp(ssk)) continue; @@ -1810,32 +1825,10 @@ void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_s if (!subflow || !subflow->conn) continue; - /* skip if already in list */ sk = subflow->conn; - msk = mptcp_sk(sk); - if (msk->dl_next || msk == head) - continue; - sock_hold(sk); - msk->dl_next = head; - head = msk; - } - spin_unlock_bh(&queue->rskq_lock); - if (!head) - return; - - /* can't acquire the msk socket lock under the subflow one, - * or will cause ABBA deadlock - */ - release_sock(listener_ssk); - - for (msk = head; msk; msk = next) { - sk = (struct sock *)msk; lock_sock_nested(sk, SINGLE_DEPTH_NESTING); - next = msk->dl_next; - msk->dl_next = NULL; - __mptcp_unaccepted_force_close(sk); release_sock(sk); @@ -1859,6 +1852,13 @@ void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_s /* we are still under the listener msk socket lock */ lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING); + + /* restore the listener queue, to let the TCP code clean it up */ + spin_lock_bh(&queue->rskq_lock); + WARN_ON_ONCE(queue->rskq_accept_head); + queue->rskq_accept_head = head; + queue->rskq_accept_tail = tail; + spin_unlock_bh(&queue->rskq_lock); } static int subflow_ulp_init(struct sock *sk) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d3c6ecd1f5a6..c62227ae7746 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -31,7 +31,9 @@ static LIST_HEAD(nf_tables_expressions); static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); static LIST_HEAD(nf_tables_destroy_list); +static LIST_HEAD(nf_tables_gc_list); static DEFINE_SPINLOCK(nf_tables_destroy_list_lock); +static DEFINE_SPINLOCK(nf_tables_gc_list_lock); enum { NFT_VALIDATE_SKIP = 0, @@ -120,6 +122,9 @@ static void nft_validate_state_update(struct nft_table *table, u8 new_validate_s static void nf_tables_trans_destroy_work(struct work_struct *w); static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work); +static void nft_trans_gc_work(struct work_struct *work); +static DECLARE_WORK(trans_gc_work, nft_trans_gc_work); + static void nft_ctx_init(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, @@ -582,10 +587,6 @@ static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, return __nft_trans_set_add(ctx, msg_type, set, NULL); } -static void nft_setelem_data_deactivate(const struct net *net, - const struct nft_set *set, - struct nft_set_elem *elem); - static int nft_mapelem_deactivate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, @@ -5055,6 +5056,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, INIT_LIST_HEAD(&set->bindings); INIT_LIST_HEAD(&set->catchall_list); + refcount_set(&set->refs, 1); set->table = table; write_pnet(&set->net, net); set->ops = ops; @@ -5122,6 +5124,14 @@ static void nft_set_catchall_destroy(const struct nft_ctx *ctx, } } +static void nft_set_put(struct nft_set *set) +{ + if (refcount_dec_and_test(&set->refs)) { + kfree(set->name); + kvfree(set); + } +} + static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) { int i; @@ -5134,8 +5144,7 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) set->ops->destroy(ctx, set); nft_set_catchall_destroy(ctx, set); - kfree(set->name); - kvfree(set); + nft_set_put(set); } static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, @@ -5602,8 +5611,12 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_set_elem *elem) { + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); struct nft_set_dump_args *args; + if (nft_set_elem_expired(ext)) + return 0; + args = container_of(iter, struct nft_set_dump_args, iter); return nf_tables_fill_setelem(args->skb, set, elem, args->reset); } @@ -6274,7 +6287,8 @@ struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (nft_set_elem_active(ext, genmask) && - !nft_set_elem_expired(ext)) + !nft_set_elem_expired(ext) && + !nft_set_elem_is_dead(ext)) return ext; } @@ -6282,29 +6296,6 @@ struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, } EXPORT_SYMBOL_GPL(nft_set_catchall_lookup); -void *nft_set_catchall_gc(const struct nft_set *set) -{ - struct nft_set_elem_catchall *catchall, *next; - struct nft_set_ext *ext; - void *elem = NULL; - - list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { - ext = nft_set_elem_ext(set, catchall->elem); - - if (!nft_set_elem_expired(ext) || - nft_set_elem_mark_busy(ext)) - continue; - - elem = catchall->elem; - list_del_rcu(&catchall->list); - kfree_rcu(catchall, rcu); - break; - } - - return elem; -} -EXPORT_SYMBOL_GPL(nft_set_catchall_gc); - static int nft_setelem_catchall_insert(const struct net *net, struct nft_set *set, const struct nft_set_elem *elem, @@ -6366,7 +6357,6 @@ static void nft_setelem_activate(struct net *net, struct nft_set *set, if (nft_setelem_is_catchall(set, elem)) { nft_set_elem_change_active(net, set, ext); - nft_set_elem_clear_busy(ext); } else { set->ops->activate(net, set, elem); } @@ -6381,8 +6371,7 @@ static int nft_setelem_catchall_deactivate(const struct net *net, list_for_each_entry(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); - if (!nft_is_active(net, ext) || - nft_set_elem_mark_busy(ext)) + if (!nft_is_active(net, ext)) continue; kfree(elem->priv); @@ -6777,7 +6766,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err_elem_free; } - ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; + ext->genmask = nft_genmask_cur(ctx->net); err = nft_setelem_insert(ctx->net, set, &elem, &ext2, flags); if (err) { @@ -6929,9 +6918,9 @@ static void nft_setelem_data_activate(const struct net *net, nft_use_inc_restore(&(*nft_set_ext_obj(ext))->use); } -static void nft_setelem_data_deactivate(const struct net *net, - const struct nft_set *set, - struct nft_set_elem *elem) +void nft_setelem_data_deactivate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); @@ -7095,8 +7084,7 @@ static int nft_set_catchall_flush(const struct nft_ctx *ctx, list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); - if (!nft_set_elem_active(ext, genmask) || - nft_set_elem_mark_busy(ext)) + if (!nft_set_elem_active(ext, genmask)) continue; elem.priv = catchall->elem; @@ -7170,29 +7158,6 @@ static int nf_tables_delsetelem(struct sk_buff *skb, return err; } -void nft_set_gc_batch_release(struct rcu_head *rcu) -{ - struct nft_set_gc_batch *gcb; - unsigned int i; - - gcb = container_of(rcu, struct nft_set_gc_batch, head.rcu); - for (i = 0; i < gcb->head.cnt; i++) - nft_set_elem_destroy(gcb->head.set, gcb->elems[i], true); - kfree(gcb); -} - -struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, - gfp_t gfp) -{ - struct nft_set_gc_batch *gcb; - - gcb = kzalloc(sizeof(*gcb), gfp); - if (gcb == NULL) - return gcb; - gcb->head.set = set; - return gcb; -} - /* * Stateful objects */ @@ -9414,6 +9379,207 @@ void nft_chain_del(struct nft_chain *chain) list_del_rcu(&chain->list); } +static void nft_trans_gc_setelem_remove(struct nft_ctx *ctx, + struct nft_trans_gc *trans) +{ + void **priv = trans->priv; + unsigned int i; + + for (i = 0; i < trans->count; i++) { + struct nft_set_elem elem = { + .priv = priv[i], + }; + + nft_setelem_data_deactivate(ctx->net, trans->set, &elem); + nft_setelem_remove(ctx->net, trans->set, &elem); + } +} + +void nft_trans_gc_destroy(struct nft_trans_gc *trans) +{ + nft_set_put(trans->set); + put_net(trans->net); + kfree(trans); +} + +static void nft_trans_gc_trans_free(struct rcu_head *rcu) +{ + struct nft_set_elem elem = {}; + struct nft_trans_gc *trans; + struct nft_ctx ctx = {}; + unsigned int i; + + trans = container_of(rcu, struct nft_trans_gc, rcu); + ctx.net = read_pnet(&trans->set->net); + + for (i = 0; i < trans->count; i++) { + elem.priv = trans->priv[i]; + if (!nft_setelem_is_catchall(trans->set, &elem)) + atomic_dec(&trans->set->nelems); + + nf_tables_set_elem_destroy(&ctx, trans->set, elem.priv); + } + + nft_trans_gc_destroy(trans); +} + +static bool nft_trans_gc_work_done(struct nft_trans_gc *trans) +{ + struct nftables_pernet *nft_net; + struct nft_ctx ctx = {}; + + nft_net = nft_pernet(trans->net); + + mutex_lock(&nft_net->commit_mutex); + + /* Check for race with transaction, otherwise this batch refers to + * stale objects that might not be there anymore. Skip transaction if + * set has been destroyed from control plane transaction in case gc + * worker loses race. + */ + if (READ_ONCE(nft_net->gc_seq) != trans->seq || trans->set->dead) { + mutex_unlock(&nft_net->commit_mutex); + return false; + } + + ctx.net = trans->net; + ctx.table = trans->set->table; + + nft_trans_gc_setelem_remove(&ctx, trans); + mutex_unlock(&nft_net->commit_mutex); + + return true; +} + +static void nft_trans_gc_work(struct work_struct *work) +{ + struct nft_trans_gc *trans, *next; + LIST_HEAD(trans_gc_list); + + spin_lock(&nf_tables_destroy_list_lock); + list_splice_init(&nf_tables_gc_list, &trans_gc_list); + spin_unlock(&nf_tables_destroy_list_lock); + + list_for_each_entry_safe(trans, next, &trans_gc_list, list) { + list_del(&trans->list); + if (!nft_trans_gc_work_done(trans)) { + nft_trans_gc_destroy(trans); + continue; + } + call_rcu(&trans->rcu, nft_trans_gc_trans_free); + } +} + +struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, + unsigned int gc_seq, gfp_t gfp) +{ + struct net *net = read_pnet(&set->net); + struct nft_trans_gc *trans; + + trans = kzalloc(sizeof(*trans), gfp); + if (!trans) + return NULL; + + refcount_inc(&set->refs); + trans->set = set; + trans->net = get_net(net); + trans->seq = gc_seq; + + return trans; +} + +void nft_trans_gc_elem_add(struct nft_trans_gc *trans, void *priv) +{ + trans->priv[trans->count++] = priv; +} + +static void nft_trans_gc_queue_work(struct nft_trans_gc *trans) +{ + spin_lock(&nf_tables_gc_list_lock); + list_add_tail(&trans->list, &nf_tables_gc_list); + spin_unlock(&nf_tables_gc_list_lock); + + schedule_work(&trans_gc_work); +} + +static int nft_trans_gc_space(struct nft_trans_gc *trans) +{ + return NFT_TRANS_GC_BATCHCOUNT - trans->count; +} + +struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, + unsigned int gc_seq, gfp_t gfp) +{ + if (nft_trans_gc_space(gc)) + return gc; + + nft_trans_gc_queue_work(gc); + + return nft_trans_gc_alloc(gc->set, gc_seq, gfp); +} + +void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) +{ + if (trans->count == 0) { + nft_trans_gc_destroy(trans); + return; + } + + nft_trans_gc_queue_work(trans); +} + +struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp) +{ + if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net))) + return NULL; + + if (nft_trans_gc_space(gc)) + return gc; + + call_rcu(&gc->rcu, nft_trans_gc_trans_free); + + return nft_trans_gc_alloc(gc->set, 0, gfp); +} + +void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) +{ + WARN_ON_ONCE(!lockdep_commit_lock_is_held(trans->net)); + + if (trans->count == 0) { + nft_trans_gc_destroy(trans); + return; + } + + call_rcu(&trans->rcu, nft_trans_gc_trans_free); +} + +struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, + unsigned int gc_seq) +{ + struct nft_set_elem_catchall *catchall; + const struct nft_set *set = gc->set; + struct nft_set_ext *ext; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + + if (!nft_set_elem_expired(ext)) + continue; + if (nft_set_elem_is_dead(ext)) + goto dead_elem; + + nft_set_elem_dead(ext); +dead_elem: + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + return NULL; + + nft_trans_gc_elem_add(gc, catchall->elem); + } + + return gc; +} + static void nf_tables_module_autoload_cleanup(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); @@ -9576,11 +9742,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans, *next; + unsigned int base_seq, gc_seq; LIST_HEAD(set_update_list); struct nft_trans_elem *te; struct nft_chain *chain; struct nft_table *table; - unsigned int base_seq; LIST_HEAD(adl); int err; @@ -9657,6 +9823,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) WRITE_ONCE(nft_net->base_seq, base_seq); + /* Bump gc counter, it becomes odd, this is the busy mark. */ + gc_seq = READ_ONCE(nft_net->gc_seq); + WRITE_ONCE(nft_net->gc_seq, ++gc_seq); + /* step 3. Start new generation, rules_gen_X now in use. */ net->nft.gencursor = nft_gencursor_next(net); @@ -9764,6 +9934,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) break; case NFT_MSG_DELSET: case NFT_MSG_DESTROYSET: + nft_trans_set(trans)->dead = 1; list_del_rcu(&nft_trans_set(trans)->list); nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), trans->msg_type, GFP_KERNEL); @@ -9866,6 +10037,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_commit_notify(net, NETLINK_CB(skb).portid); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); nf_tables_commit_audit_log(&adl, nft_net->base_seq); + + WRITE_ONCE(nft_net->gc_seq, ++gc_seq); nf_tables_commit_release(net); return 0; @@ -10915,6 +11088,7 @@ static int __net_init nf_tables_init_net(struct net *net) INIT_LIST_HEAD(&nft_net->notify_list); mutex_init(&nft_net->commit_mutex); nft_net->base_seq = 1; + nft_net->gc_seq = 0; return 0; } @@ -10943,10 +11117,16 @@ static void __net_exit nf_tables_exit_net(struct net *net) WARN_ON_ONCE(!list_empty(&nft_net->notify_list)); } +static void nf_tables_exit_batch(struct list_head *net_exit_list) +{ + flush_work(&trans_gc_work); +} + static struct pernet_operations nf_tables_net_ops = { .init = nf_tables_init_net, .pre_exit = nf_tables_pre_exit_net, .exit = nf_tables_exit_net, + .exit_batch = nf_tables_exit_batch, .id = &nf_tables_net_id, .size = sizeof(struct nftables_pernet), }; @@ -11018,6 +11198,7 @@ static void __exit nf_tables_module_exit(void) nft_chain_filter_fini(); nft_chain_route_fini(); unregister_pernet_subsys(&nf_tables_net_ops); + cancel_work_sync(&trans_gc_work); cancel_work_sync(&trans_destroy_work); rcu_barrier(); rhltable_destroy(&nft_objname_ht); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 0b73cb0e752f..cef5df846000 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -59,6 +59,8 @@ static inline int nft_rhash_cmp(struct rhashtable_compare_arg *arg, if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen)) return 1; + if (nft_set_elem_is_dead(&he->ext)) + return 1; if (nft_set_elem_expired(&he->ext)) return 1; if (!nft_set_elem_active(&he->ext, x->genmask)) @@ -188,7 +190,6 @@ static void nft_rhash_activate(const struct net *net, const struct nft_set *set, struct nft_rhash_elem *he = elem->priv; nft_set_elem_change_active(net, set, &he->ext); - nft_set_elem_clear_busy(&he->ext); } static bool nft_rhash_flush(const struct net *net, @@ -196,12 +197,9 @@ static bool nft_rhash_flush(const struct net *net, { struct nft_rhash_elem *he = priv; - if (!nft_set_elem_mark_busy(&he->ext) || - !nft_is_active(net, &he->ext)) { - nft_set_elem_change_active(net, set, &he->ext); - return true; - } - return false; + nft_set_elem_change_active(net, set, &he->ext); + + return true; } static void *nft_rhash_deactivate(const struct net *net, @@ -218,9 +216,8 @@ static void *nft_rhash_deactivate(const struct net *net, rcu_read_lock(); he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); - if (he != NULL && - !nft_rhash_flush(net, set, he)) - he = NULL; + if (he) + nft_set_elem_change_active(net, set, &he->ext); rcu_read_unlock(); @@ -252,7 +249,9 @@ static bool nft_rhash_delete(const struct nft_set *set, if (he == NULL) return false; - return rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params) == 0; + nft_set_elem_dead(&he->ext); + + return true; } static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, @@ -278,8 +277,6 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, if (iter->count < iter->skip) goto cont; - if (nft_set_elem_expired(&he->ext)) - goto cont; if (!nft_set_elem_active(&he->ext, iter->genmask)) goto cont; @@ -314,25 +311,48 @@ static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set, static void nft_rhash_gc(struct work_struct *work) { + struct nftables_pernet *nft_net; struct nft_set *set; struct nft_rhash_elem *he; struct nft_rhash *priv; - struct nft_set_gc_batch *gcb = NULL; struct rhashtable_iter hti; + struct nft_trans_gc *gc; + struct net *net; + u32 gc_seq; priv = container_of(work, struct nft_rhash, gc_work.work); set = nft_set_container_of(priv); + net = read_pnet(&set->net); + nft_net = nft_pernet(net); + gc_seq = READ_ONCE(nft_net->gc_seq); + + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); + if (!gc) + goto done; rhashtable_walk_enter(&priv->ht, &hti); rhashtable_walk_start(&hti); while ((he = rhashtable_walk_next(&hti))) { if (IS_ERR(he)) { - if (PTR_ERR(he) != -EAGAIN) - break; + if (PTR_ERR(he) != -EAGAIN) { + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; + } continue; } + /* Ruleset has been updated, try later. */ + if (READ_ONCE(nft_net->gc_seq) != gc_seq) { + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; + } + + if (nft_set_elem_is_dead(&he->ext)) + goto dead_elem; + if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPRESSIONS) && nft_rhash_expr_needs_gc_run(set, &he->ext)) goto needs_gc_run; @@ -340,26 +360,26 @@ static void nft_rhash_gc(struct work_struct *work) if (!nft_set_elem_expired(&he->ext)) continue; needs_gc_run: - if (nft_set_elem_mark_busy(&he->ext)) - continue; + nft_set_elem_dead(&he->ext); +dead_elem: + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + goto try_later; - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (gcb == NULL) - break; - rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params); - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, he); + nft_trans_gc_elem_add(gc, he); } + + gc = nft_trans_gc_catchall(gc, gc_seq); + +try_later: + /* catchall list iteration requires rcu read side lock. */ rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); - he = nft_set_catchall_gc(set); - if (he) { - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (gcb) - nft_set_gc_batch_add(gcb, he); - } - nft_set_gc_batch_complete(gcb); + if (gc) + nft_trans_gc_queue_async_done(gc); + +done: queue_delayed_work(system_power_efficient_wq, &priv->gc_work, nft_set_gc_interval(set)); } @@ -394,7 +414,7 @@ static int nft_rhash_init(const struct nft_set *set, return err; INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rhash_gc); - if (set->flags & NFT_SET_TIMEOUT) + if (set->flags & (NFT_SET_TIMEOUT | NFT_SET_EVAL)) nft_rhash_gc_init(set); return 0; @@ -422,7 +442,6 @@ static void nft_rhash_destroy(const struct nft_ctx *ctx, }; cancel_delayed_work_sync(&priv->gc_work); - rcu_barrier(); rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy, (void *)&rhash_ctx); } diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 49915a2a58eb..a5b8301afe4a 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -566,8 +566,7 @@ next_match: goto out; if (last) { - if (nft_set_elem_expired(&f->mt[b].e->ext) || - (genmask && + if ((genmask && !nft_set_elem_active(&f->mt[b].e->ext, genmask))) goto next_match; @@ -601,8 +600,17 @@ out: static void *nft_pipapo_get(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags) { - return pipapo_get(net, set, (const u8 *)elem->key.val.data, - nft_genmask_cur(net)); + struct nft_pipapo_elem *ret; + + ret = pipapo_get(net, set, (const u8 *)elem->key.val.data, + nft_genmask_cur(net)); + if (IS_ERR(ret)) + return ret; + + if (nft_set_elem_expired(&ret->ext)) + return ERR_PTR(-ENOENT); + + return ret; } /** @@ -1528,16 +1536,34 @@ static void pipapo_drop(struct nft_pipapo_match *m, } } +static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set, + struct nft_pipapo_elem *e) + +{ + struct nft_set_elem elem = { + .priv = e, + }; + + nft_setelem_data_deactivate(net, set, &elem); +} + /** * pipapo_gc() - Drop expired entries from set, destroy start and end elements * @set: nftables API set representation * @m: Matching data */ -static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) +static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m) { + struct nft_set *set = (struct nft_set *) _set; struct nft_pipapo *priv = nft_set_priv(set); + struct net *net = read_pnet(&set->net); int rules_f0, first_rule = 0; struct nft_pipapo_elem *e; + struct nft_trans_gc *gc; + + gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); + if (!gc) + return; while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; @@ -1561,13 +1587,20 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) f--; i--; e = f->mt[rulemap[i].to].e; - if (nft_set_elem_expired(&e->ext) && - !nft_set_elem_mark_busy(&e->ext)) { + + /* synchronous gc never fails, there is no need to set on + * NFT_SET_ELEM_DEAD_BIT. + */ + if (nft_set_elem_expired(&e->ext)) { priv->dirty = true; - pipapo_drop(m, rulemap); - rcu_barrier(); - nft_set_elem_destroy(set, e, true); + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (!gc) + break; + + nft_pipapo_gc_deactivate(net, set, e); + pipapo_drop(m, rulemap); + nft_trans_gc_elem_add(gc, e); /* And check again current first rule, which is now the * first we haven't checked. @@ -1577,11 +1610,11 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) } } - e = nft_set_catchall_gc(set); - if (e) - nft_set_elem_destroy(set, e, true); - - priv->last_gc = jiffies; + gc = nft_trans_gc_catchall(gc, 0); + if (gc) { + nft_trans_gc_queue_sync_done(gc); + priv->last_gc = jiffies; + } } /** @@ -1706,7 +1739,6 @@ static void nft_pipapo_activate(const struct net *net, return; nft_set_elem_change_active(net, set, &e->ext); - nft_set_elem_clear_busy(&e->ext); } /** @@ -2005,8 +2037,6 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, goto cont; e = f->mt[r].e; - if (nft_set_elem_expired(&e->ext)) - goto cont; elem.priv = e; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 8d73fffd2d09..f9d4c8fcbbf8 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -46,6 +46,12 @@ static int nft_rbtree_cmp(const struct nft_set *set, set->klen); } +static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe) +{ + return nft_set_elem_expired(&rbe->ext) || + nft_set_elem_is_dead(&rbe->ext); +} + static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext, unsigned int seq) @@ -80,7 +86,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set continue; } - if (nft_set_elem_expired(&rbe->ext)) + if (nft_rbtree_elem_expired(rbe)) return false; if (nft_rbtree_interval_end(rbe)) { @@ -98,7 +104,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set if (set->flags & NFT_SET_INTERVAL && interval != NULL && nft_set_elem_active(&interval->ext, genmask) && - !nft_set_elem_expired(&interval->ext) && + !nft_rbtree_elem_expired(interval) && nft_rbtree_interval_start(interval)) { *ext = &interval->ext; return true; @@ -215,6 +221,18 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, return rbe; } +static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) +{ + struct nft_set_elem elem = { + .priv = rbe, + }; + + nft_setelem_data_deactivate(net, set, &elem); + rb_erase(&rbe->node, &priv->root); +} + static int nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, struct nft_rbtree_elem *rbe, @@ -222,11 +240,12 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, { struct nft_set *set = (struct nft_set *)__set; struct rb_node *prev = rb_prev(&rbe->node); + struct net *net = read_pnet(&set->net); struct nft_rbtree_elem *rbe_prev; - struct nft_set_gc_batch *gcb; + struct nft_trans_gc *gc; - gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC); - if (!gcb) + gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC); + if (!gc) return -ENOMEM; /* search for end interval coming before this element. @@ -244,17 +263,28 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, if (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); + nft_rbtree_gc_remove(net, set, priv, rbe_prev); - rb_erase(&rbe_prev->node, &priv->root); - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe_prev); + /* There is always room in this trans gc for this element, + * memory allocation never actually happens, hence, the warning + * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT, + * this is synchronous gc which never fails. + */ + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (WARN_ON_ONCE(!gc)) + return -ENOMEM; + + nft_trans_gc_elem_add(gc, rbe_prev); } - rb_erase(&rbe->node, &priv->root); - atomic_dec(&set->nelems); + nft_rbtree_gc_remove(net, set, priv, rbe); + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (WARN_ON_ONCE(!gc)) + return -ENOMEM; + + nft_trans_gc_elem_add(gc, rbe); - nft_set_gc_batch_add(gcb, rbe); - nft_set_gc_batch_complete(gcb); + nft_trans_gc_queue_sync_done(gc); return 0; } @@ -482,7 +512,6 @@ static void nft_rbtree_activate(const struct net *net, struct nft_rbtree_elem *rbe = elem->priv; nft_set_elem_change_active(net, set, &rbe->ext); - nft_set_elem_clear_busy(&rbe->ext); } static bool nft_rbtree_flush(const struct net *net, @@ -490,12 +519,9 @@ static bool nft_rbtree_flush(const struct net *net, { struct nft_rbtree_elem *rbe = priv; - if (!nft_set_elem_mark_busy(&rbe->ext) || - !nft_is_active(net, &rbe->ext)) { - nft_set_elem_change_active(net, set, &rbe->ext); - return true; - } - return false; + nft_set_elem_change_active(net, set, &rbe->ext); + + return true; } static void *nft_rbtree_deactivate(const struct net *net, @@ -552,8 +578,6 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, if (iter->count < iter->skip) goto cont; - if (nft_set_elem_expired(&rbe->ext)) - goto cont; if (!nft_set_elem_active(&rbe->ext, iter->genmask)) goto cont; @@ -572,26 +596,40 @@ cont: static void nft_rbtree_gc(struct work_struct *work) { - struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL; - struct nft_set_gc_batch *gcb = NULL; + struct nft_rbtree_elem *rbe, *rbe_end = NULL; + struct nftables_pernet *nft_net; struct nft_rbtree *priv; + struct nft_trans_gc *gc; struct rb_node *node; struct nft_set *set; + unsigned int gc_seq; struct net *net; - u8 genmask; priv = container_of(work, struct nft_rbtree, gc_work.work); set = nft_set_container_of(priv); net = read_pnet(&set->net); - genmask = nft_genmask_cur(net); + nft_net = nft_pernet(net); + gc_seq = READ_ONCE(nft_net->gc_seq); + + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); + if (!gc) + goto done; write_lock_bh(&priv->lock); write_seqcount_begin(&priv->count); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { + + /* Ruleset has been updated, try later. */ + if (READ_ONCE(nft_net->gc_seq) != gc_seq) { + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; + } + rbe = rb_entry(node, struct nft_rbtree_elem, node); - if (!nft_set_elem_active(&rbe->ext, genmask)) - continue; + if (nft_set_elem_is_dead(&rbe->ext)) + goto dead_elem; /* elements are reversed in the rbtree for historical reasons, * from highest to lowest value, that is why end element is @@ -604,46 +642,36 @@ static void nft_rbtree_gc(struct work_struct *work) if (!nft_set_elem_expired(&rbe->ext)) continue; - if (nft_set_elem_mark_busy(&rbe->ext)) { - rbe_end = NULL; + nft_set_elem_dead(&rbe->ext); + + if (!rbe_end) continue; - } - if (rbe_prev) { - rb_erase(&rbe_prev->node, &priv->root); - rbe_prev = NULL; - } - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (!gcb) - break; + nft_set_elem_dead(&rbe_end->ext); - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe); - rbe_prev = rbe; + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + goto try_later; - if (rbe_end) { - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe_end); - rb_erase(&rbe_end->node, &priv->root); - rbe_end = NULL; - } - node = rb_next(node); - if (!node) - break; + nft_trans_gc_elem_add(gc, rbe_end); + rbe_end = NULL; +dead_elem: + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + goto try_later; + + nft_trans_gc_elem_add(gc, rbe); } - if (rbe_prev) - rb_erase(&rbe_prev->node, &priv->root); + + gc = nft_trans_gc_catchall(gc, gc_seq); + +try_later: write_seqcount_end(&priv->count); write_unlock_bh(&priv->lock); - rbe = nft_set_catchall_gc(set); - if (rbe) { - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (gcb) - nft_set_gc_batch_add(gcb, rbe); - } - nft_set_gc_batch_complete(gcb); - + if (gc) + nft_trans_gc_queue_async_done(gc); +done: queue_delayed_work(system_power_efficient_wq, &priv->gc_work, nft_set_gc_interval(set)); } diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index 84def74698b7..9ed85be79452 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -107,7 +107,7 @@ static void nft_socket_eval(const struct nft_expr *expr, break; case NFT_SOCKET_MARK: if (sk_fullsock(sk)) { - *dest = sk->sk_mark; + *dest = READ_ONCE(sk->sk_mark); } else { regs->verdict.code = NFT_BREAK; return; diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 7013f55f05d1..76e01f292aaf 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -77,7 +77,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && transparent && sk_fullsock(sk)) - pskb->mark = sk->sk_mark; + pskb->mark = READ_ONCE(sk->sk_mark); if (sk != skb->sk) sock_gen_put(sk); @@ -138,7 +138,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && transparent && sk_fullsock(sk)) - pskb->mark = sk->sk_mark; + pskb->mark = READ_ONCE(sk->sk_mark); if (sk != skb->sk) sock_gen_put(sk); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8e3ddec4c3d5..a2935bd18ed9 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -401,18 +401,20 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status) { union tpacket_uhdr h; + /* WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status */ + h.raw = frame; switch (po->tp_version) { case TPACKET_V1: - h.h1->tp_status = status; + WRITE_ONCE(h.h1->tp_status, status); flush_dcache_page(pgv_to_page(&h.h1->tp_status)); break; case TPACKET_V2: - h.h2->tp_status = status; + WRITE_ONCE(h.h2->tp_status, status); flush_dcache_page(pgv_to_page(&h.h2->tp_status)); break; case TPACKET_V3: - h.h3->tp_status = status; + WRITE_ONCE(h.h3->tp_status, status); flush_dcache_page(pgv_to_page(&h.h3->tp_status)); break; default: @@ -429,17 +431,19 @@ static int __packet_get_status(const struct packet_sock *po, void *frame) smp_rmb(); + /* READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status */ + h.raw = frame; switch (po->tp_version) { case TPACKET_V1: flush_dcache_page(pgv_to_page(&h.h1->tp_status)); - return h.h1->tp_status; + return READ_ONCE(h.h1->tp_status); case TPACKET_V2: flush_dcache_page(pgv_to_page(&h.h2->tp_status)); - return h.h2->tp_status; + return READ_ONCE(h.h2->tp_status); case TPACKET_V3: flush_dcache_page(pgv_to_page(&h.h3->tp_status)); - return h.h3->tp_status; + return READ_ONCE(h.h3->tp_status); default: WARN(1, "TPACKET version not supported.\n"); BUG(); @@ -2050,8 +2054,8 @@ retry: skb->protocol = proto; skb->dev = dev; - skb->priority = sk->sk_priority; - skb->mark = sk->sk_mark; + skb->priority = READ_ONCE(sk->sk_priority); + skb->mark = READ_ONCE(sk->sk_mark); skb->tstamp = sockc.transmit_time; skb_setup_tx_timestamp(skb, sockc.tsflags); @@ -2585,8 +2589,8 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb->protocol = proto; skb->dev = dev; - skb->priority = po->sk.sk_priority; - skb->mark = po->sk.sk_mark; + skb->priority = READ_ONCE(po->sk.sk_priority); + skb->mark = READ_ONCE(po->sk.sk_mark); skb->tstamp = sockc->transmit_time; skb_setup_tx_timestamp(skb, sockc->tsflags); skb_zcopy_set_nouarg(skb, ph.raw); @@ -2988,7 +2992,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) goto out_unlock; sockcm_init(&sockc, sk); - sockc.mark = sk->sk_mark; + sockc.mark = READ_ONCE(sk->sk_mark); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) @@ -3061,7 +3065,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->protocol = proto; skb->dev = dev; - skb->priority = sk->sk_priority; + skb->priority = READ_ONCE(sk->sk_priority); skb->mark = sockc.mark; skb->tstamp = sockc.transmit_time; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 8da9d039d964..9f0711da9c95 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -776,7 +776,8 @@ mpls_stack_entry_policy[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1] = { [TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL] = { .type = NLA_U32 }, }; -static const struct nla_policy cfm_opt_policy[TCA_FLOWER_KEY_CFM_OPT_MAX] = { +static const struct nla_policy +cfm_opt_policy[TCA_FLOWER_KEY_CFM_OPT_MAX + 1] = { [TCA_FLOWER_KEY_CFM_MD_LEVEL] = NLA_POLICY_MAX(NLA_U8, FLOW_DIS_CFM_MDL_MAX), [TCA_FLOWER_KEY_CFM_OPCODE] = { .type = NLA_U8 }, @@ -1709,7 +1710,7 @@ static int fl_set_key_cfm(struct nlattr **tb, struct fl_flow_key *mask, struct netlink_ext_ack *extack) { - struct nlattr *nla_cfm_opt[TCA_FLOWER_KEY_CFM_OPT_MAX]; + struct nlattr *nla_cfm_opt[TCA_FLOWER_KEY_CFM_OPT_MAX + 1]; int err; if (!tb[TCA_FLOWER_KEY_CFM]) diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 8641f8059317..c49d6af0e048 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -267,7 +267,6 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, return -ENOBUFS; fnew->id = f->id; - fnew->res = f->res; fnew->ifindex = f->ifindex; fnew->tp = f->tp; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index d0c53724d3e8..1e20bbd687f1 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -513,7 +513,6 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, if (fold) { f->id = fold->id; f->iif = fold->iif; - f->res = fold->res; f->handle = fold->handle; f->tp = fold->tp; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 5abf31e432ca..da4c179a4d41 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -826,7 +826,6 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, new->ifindex = n->ifindex; new->fshift = n->fshift; - new->res = n->res; new->flags = n->flags; RCU_INIT_POINTER(new->ht_down, ht); @@ -1024,18 +1023,62 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return -EINVAL; } + /* At this point, we need to derive the new handle that will be used to + * uniquely map the identity of this table match entry. The + * identity of the entry that we need to construct is 32 bits made of: + * htid(12b):bucketid(8b):node/entryid(12b) + * + * At this point _we have the table(ht)_ in which we will insert this + * entry. We carry the table's id in variable "htid". + * Note that earlier code picked the ht selection either by a) the user + * providing the htid specified via TCA_U32_HASH attribute or b) when + * no such attribute is passed then the root ht, is default to at ID + * 0x[800][00][000]. Rule: the root table has a single bucket with ID 0. + * If OTOH the user passed us the htid, they may also pass a bucketid of + * choice. 0 is fine. For example a user htid is 0x[600][01][000] it is + * indicating hash bucketid of 1. Rule: the entry/node ID _cannot_ be + * passed via the htid, so even if it was non-zero it will be ignored. + * + * We may also have a handle, if the user passed one. The handle also + * carries the same addressing of htid(12b):bucketid(8b):node/entryid(12b). + * Rule: the bucketid on the handle is ignored even if one was passed; + * rather the value on "htid" is always assumed to be the bucketid. + */ if (handle) { + /* Rule: The htid from handle and tableid from htid must match */ if (TC_U32_HTID(handle) && TC_U32_HTID(handle ^ htid)) { NL_SET_ERR_MSG_MOD(extack, "Handle specified hash table address mismatch"); return -EINVAL; } - handle = htid | TC_U32_NODE(handle); - err = idr_alloc_u32(&ht->handle_idr, NULL, &handle, handle, - GFP_KERNEL); - if (err) - return err; - } else + /* Ok, so far we have a valid htid(12b):bucketid(8b) but we + * need to finalize the table entry identification with the last + * part - the node/entryid(12b)). Rule: Nodeid _cannot be 0_ for + * entries. Rule: nodeid of 0 is reserved only for tables(see + * earlier code which processes TC_U32_DIVISOR attribute). + * Rule: The nodeid can only be derived from the handle (and not + * htid). + * Rule: if the handle specified zero for the node id example + * 0x60000000, then pick a new nodeid from the pool of IDs + * this hash table has been allocating from. + * If OTOH it is specified (i.e for example the user passed a + * handle such as 0x60000123), then we use it generate our final + * handle which is used to uniquely identify the match entry. + */ + if (!TC_U32_NODE(handle)) { + handle = gen_new_kid(ht, htid); + } else { + handle = htid | TC_U32_NODE(handle); + err = idr_alloc_u32(&ht->handle_idr, NULL, &handle, + handle, GFP_KERNEL); + if (err) + return err; + } + } else { + /* The user did not give us a handle; lets just generate one + * from the table's pool of nodeids. + */ handle = gen_new_kid(ht, htid); + } if (tb[TCA_U32_SEL] == NULL) { NL_SET_ERR_MSG_MOD(extack, "Selector not specified"); diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index af85a73c4c54..6fdba069f6bf 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -568,7 +568,7 @@ META_COLLECTOR(int_sk_rcvtimeo) *err = -1; return; } - dst->value = sk->sk_rcvtimeo / HZ; + dst->value = READ_ONCE(sk->sk_rcvtimeo) / HZ; } META_COLLECTOR(int_sk_sndtimeo) @@ -579,7 +579,7 @@ META_COLLECTOR(int_sk_sndtimeo) *err = -1; return; } - dst->value = sk->sk_sndtimeo / HZ; + dst->value = READ_ONCE(sk->sk_sndtimeo) / HZ; } META_COLLECTOR(int_sk_sendmsg_off) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 717ae51d94a0..8c9cfff7fd05 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1015,6 +1015,11 @@ static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { TC_FP_PREEMPTIBLE), }; +static struct netlink_range_validation_signed taprio_cycle_time_range = { + .min = 0, + .max = INT_MAX, +}; + static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { [TCA_TAPRIO_ATTR_PRIOMAP] = { .len = sizeof(struct tc_mqprio_qopt) @@ -1023,7 +1028,8 @@ static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] = { .type = NLA_S64 }, [TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] = { .type = NLA_NESTED }, [TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 }, - [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] = { .type = NLA_S64 }, + [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] = + NLA_POLICY_FULL_RANGE_SIGNED(NLA_S64, &taprio_cycle_time_range), [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 }, [TCA_TAPRIO_ATTR_FLAGS] = { .type = NLA_U32 }, [TCA_TAPRIO_ATTR_TXTIME_DELAY] = { .type = NLA_U32 }, @@ -1159,6 +1165,11 @@ static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb, return -EINVAL; } + if (cycle < 0 || cycle > INT_MAX) { + NL_SET_ERR_MSG(extack, "'cycle_time' is too big"); + return -EINVAL; + } + new->cycle_time = cycle; } @@ -1347,7 +1358,7 @@ static void setup_txtime(struct taprio_sched *q, struct sched_gate_list *sched, ktime_t base) { struct sched_entry *entry; - u32 interval = 0; + u64 interval = 0; list_for_each_entry(entry, &sched->entries, list) { entry->next_txtime = ktime_add_ns(base, interval); diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index a7f887d91d89..f5834af5fad5 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -378,8 +378,8 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; sk->sk_protocol = protocol; - WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem)); - WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem)); + WRITE_ONCE(sk->sk_sndbuf, 2 * READ_ONCE(net->smc.sysctl_wmem)); + WRITE_ONCE(sk->sk_rcvbuf, 2 * READ_ONCE(net->smc.sysctl_rmem)); smc = smc_sk(sk); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_WORK(&smc->connect_work, smc_connect_work); @@ -436,24 +436,9 @@ out: return rc; } -static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, - unsigned long mask) -{ - /* options we don't get control via setsockopt for */ - nsk->sk_type = osk->sk_type; - nsk->sk_sndbuf = osk->sk_sndbuf; - nsk->sk_rcvbuf = osk->sk_rcvbuf; - nsk->sk_sndtimeo = osk->sk_sndtimeo; - nsk->sk_rcvtimeo = osk->sk_rcvtimeo; - nsk->sk_mark = osk->sk_mark; - nsk->sk_priority = osk->sk_priority; - nsk->sk_rcvlowat = osk->sk_rcvlowat; - nsk->sk_bound_dev_if = osk->sk_bound_dev_if; - nsk->sk_err = osk->sk_err; - - nsk->sk_flags &= ~mask; - nsk->sk_flags |= osk->sk_flags & mask; -} +/* copy only relevant settings and flags of SOL_SOCKET level from smc to + * clc socket (since smc is not called for these options from net/core) + */ #define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \ (1UL << SOCK_KEEPOPEN) | \ @@ -470,9 +455,55 @@ static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, (1UL << SOCK_NOFCS) | \ (1UL << SOCK_FILTER_LOCKED) | \ (1UL << SOCK_TSTAMP_NEW)) -/* copy only relevant settings and flags of SOL_SOCKET level from smc to - * clc socket (since smc is not called for these options from net/core) - */ + +/* if set, use value set by setsockopt() - else use IPv4 or SMC sysctl value */ +static void smc_adjust_sock_bufsizes(struct sock *nsk, struct sock *osk, + unsigned long mask) +{ + struct net *nnet = sock_net(nsk); + + nsk->sk_userlocks = osk->sk_userlocks; + if (osk->sk_userlocks & SOCK_SNDBUF_LOCK) { + nsk->sk_sndbuf = osk->sk_sndbuf; + } else { + if (mask == SK_FLAGS_SMC_TO_CLC) + WRITE_ONCE(nsk->sk_sndbuf, + READ_ONCE(nnet->ipv4.sysctl_tcp_wmem[1])); + else + WRITE_ONCE(nsk->sk_sndbuf, + 2 * READ_ONCE(nnet->smc.sysctl_wmem)); + } + if (osk->sk_userlocks & SOCK_RCVBUF_LOCK) { + nsk->sk_rcvbuf = osk->sk_rcvbuf; + } else { + if (mask == SK_FLAGS_SMC_TO_CLC) + WRITE_ONCE(nsk->sk_rcvbuf, + READ_ONCE(nnet->ipv4.sysctl_tcp_rmem[1])); + else + WRITE_ONCE(nsk->sk_rcvbuf, + 2 * READ_ONCE(nnet->smc.sysctl_rmem)); + } +} + +static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, + unsigned long mask) +{ + /* options we don't get control via setsockopt for */ + nsk->sk_type = osk->sk_type; + nsk->sk_sndtimeo = osk->sk_sndtimeo; + nsk->sk_rcvtimeo = osk->sk_rcvtimeo; + nsk->sk_mark = READ_ONCE(osk->sk_mark); + nsk->sk_priority = osk->sk_priority; + nsk->sk_rcvlowat = osk->sk_rcvlowat; + nsk->sk_bound_dev_if = osk->sk_bound_dev_if; + nsk->sk_err = osk->sk_err; + + nsk->sk_flags &= ~mask; + nsk->sk_flags |= osk->sk_flags & mask; + + smc_adjust_sock_bufsizes(nsk, osk, mask); +} + static void smc_copy_sock_settings_to_clc(struct smc_sock *smc) { smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC); @@ -2479,8 +2510,6 @@ static void smc_tcp_listen_work(struct work_struct *work) sock_hold(lsk); /* sock_put in smc_listen_work */ INIT_WORK(&new_smc->smc_listen_work, smc_listen_work); smc_copy_sock_settings_to_smc(new_smc); - new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf; - new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf; sock_hold(&new_smc->sk); /* sock_put in passive closing */ if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work)) sock_put(&new_smc->sk); diff --git a/net/smc/smc.h b/net/smc/smc.h index 2eeea4cdc718..1f2b912c43d1 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -161,7 +161,7 @@ struct smc_connection { struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */ struct smc_buf_desc *rmb_desc; /* RMBE descriptor */ - int rmbe_size_short;/* compressed notation */ + int rmbe_size_comp; /* compressed notation */ int rmbe_update_limit; /* lower limit for consumer * cursor update diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index b9b8b07aa702..c90d9e5dda54 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -1007,7 +1007,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, clc->d0.gid = conn->lgr->smcd->ops->get_local_gid(conn->lgr->smcd); clc->d0.token = conn->rmb_desc->token; - clc->d0.dmbe_size = conn->rmbe_size_short; + clc->d0.dmbe_size = conn->rmbe_size_comp; clc->d0.dmbe_idx = 0; memcpy(&clc->d0.linkid, conn->lgr->id, SMC_LGR_ID_SIZE); if (version == SMC_V1) { @@ -1050,7 +1050,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu); break; } - clc->r0.rmbe_size = conn->rmbe_size_short; + clc->r0.rmbe_size = conn->rmbe_size_comp; clc->r0.rmb_dma_addr = conn->rmb_desc->is_vm ? cpu_to_be64((uintptr_t)conn->rmb_desc->cpu_addr) : cpu_to_be64((u64)sg_dma_address diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 3f465faf2b68..6b78075404d7 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -2309,31 +2309,30 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) struct smc_connection *conn = &smc->conn; struct smc_link_group *lgr = conn->lgr; struct list_head *buf_list; - int bufsize, bufsize_short; + int bufsize, bufsize_comp; struct rw_semaphore *lock; /* lock buffer list */ bool is_dgraded = false; - int sk_buf_size; if (is_rmb) /* use socket recv buffer size (w/o overhead) as start value */ - sk_buf_size = smc->sk.sk_rcvbuf; + bufsize = smc->sk.sk_rcvbuf / 2; else /* use socket send buffer size (w/o overhead) as start value */ - sk_buf_size = smc->sk.sk_sndbuf; + bufsize = smc->sk.sk_sndbuf / 2; - for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb); - bufsize_short >= 0; bufsize_short--) { + for (bufsize_comp = smc_compress_bufsize(bufsize, is_smcd, is_rmb); + bufsize_comp >= 0; bufsize_comp--) { if (is_rmb) { lock = &lgr->rmbs_lock; - buf_list = &lgr->rmbs[bufsize_short]; + buf_list = &lgr->rmbs[bufsize_comp]; } else { lock = &lgr->sndbufs_lock; - buf_list = &lgr->sndbufs[bufsize_short]; + buf_list = &lgr->sndbufs[bufsize_comp]; } - bufsize = smc_uncompress_bufsize(bufsize_short); + bufsize = smc_uncompress_bufsize(bufsize_comp); /* check for reusable slot in the link group */ - buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list); + buf_desc = smc_buf_get_slot(bufsize_comp, lock, buf_list); if (buf_desc) { buf_desc->is_dma_need_sync = 0; SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize); @@ -2377,8 +2376,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (is_rmb) { conn->rmb_desc = buf_desc; - conn->rmbe_size_short = bufsize_short; - smc->sk.sk_rcvbuf = bufsize; + conn->rmbe_size_comp = bufsize_comp; + smc->sk.sk_rcvbuf = bufsize * 2; atomic_set(&conn->bytes_to_rcv, 0); conn->rmbe_update_limit = smc_rmb_wnd_update_limit(buf_desc->len); @@ -2386,7 +2385,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */ } else { conn->sndbuf_desc = buf_desc; - smc->sk.sk_sndbuf = bufsize; + smc->sk.sk_sndbuf = bufsize * 2; atomic_set(&conn->sndbuf_space, bufsize); } return 0; diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index b6f79fabb9d3..0b2a957ca5f5 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -21,6 +21,10 @@ static int min_sndbuf = SMC_BUF_MIN_SIZE; static int min_rcvbuf = SMC_BUF_MIN_SIZE; +static int max_sndbuf = INT_MAX / 2; +static int max_rcvbuf = INT_MAX / 2; +static const int net_smc_wmem_init = (64 * 1024); +static const int net_smc_rmem_init = (64 * 1024); static struct ctl_table smc_table[] = { { @@ -53,6 +57,7 @@ static struct ctl_table smc_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_sndbuf, + .extra2 = &max_sndbuf, }, { .procname = "rmem", @@ -61,6 +66,7 @@ static struct ctl_table smc_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, + .extra2 = &max_rcvbuf, }, { } }; @@ -88,8 +94,8 @@ int __net_init smc_sysctl_net_init(struct net *net) net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS; net->smc.sysctl_smcr_testlink_time = SMC_LLC_TESTLINK_DEFAULT_TIME; - WRITE_ONCE(net->smc.sysctl_wmem, READ_ONCE(net->ipv4.sysctl_tcp_wmem[1])); - WRITE_ONCE(net->smc.sysctl_rmem, READ_ONCE(net->ipv4.sysctl_tcp_rmem[1])); + WRITE_ONCE(net->smc.sysctl_wmem, net_smc_wmem_init); + WRITE_ONCE(net->smc.sysctl_rmem, net_smc_rmem_init); return 0; diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 2021fe557e50..529101eb20bd 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -52,6 +52,8 @@ static LIST_HEAD(tls_device_list); static LIST_HEAD(tls_device_down_list); static DEFINE_SPINLOCK(tls_device_lock); +static struct page *dummy_page; + static void tls_device_free_ctx(struct tls_context *ctx) { if (ctx->tx_conf == TLS_HW) { @@ -312,36 +314,33 @@ static int tls_push_record(struct sock *sk, return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags); } -static int tls_device_record_close(struct sock *sk, - struct tls_context *ctx, - struct tls_record_info *record, - struct page_frag *pfrag, - unsigned char record_type) +static void tls_device_record_close(struct sock *sk, + struct tls_context *ctx, + struct tls_record_info *record, + struct page_frag *pfrag, + unsigned char record_type) { struct tls_prot_info *prot = &ctx->prot_info; - int ret; + struct page_frag dummy_tag_frag; /* append tag * device will fill in the tag, we just need to append a placeholder * use socket memory to improve coalescing (re-using a single buffer * increases frag count) - * if we can't allocate memory now, steal some back from data + * if we can't allocate memory now use the dummy page */ - if (likely(skb_page_frag_refill(prot->tag_size, pfrag, - sk->sk_allocation))) { - ret = 0; - tls_append_frag(record, pfrag, prot->tag_size); - } else { - ret = prot->tag_size; - if (record->len <= prot->overhead_size) - return -ENOMEM; + if (unlikely(pfrag->size - pfrag->offset < prot->tag_size) && + !skb_page_frag_refill(prot->tag_size, pfrag, sk->sk_allocation)) { + dummy_tag_frag.page = dummy_page; + dummy_tag_frag.offset = 0; + pfrag = &dummy_tag_frag; } + tls_append_frag(record, pfrag, prot->tag_size); /* fill prepend */ tls_fill_prepend(ctx, skb_frag_address(&record->frags[0]), record->len - prot->overhead_size, record_type); - return ret; } static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx, @@ -541,18 +540,8 @@ last_record: if (done || record->len >= max_open_record_len || (record->num_frags >= MAX_SKB_FRAGS - 1)) { - rc = tls_device_record_close(sk, tls_ctx, record, - pfrag, record_type); - if (rc) { - if (rc > 0) { - size += rc; - } else { - size = orig_size; - destroy_record(record); - ctx->open_record = NULL; - break; - } - } + tls_device_record_close(sk, tls_ctx, record, + pfrag, record_type); rc = tls_push_record(sk, tls_ctx, @@ -1450,14 +1439,26 @@ int __init tls_device_init(void) { int err; - destruct_wq = alloc_workqueue("ktls_device_destruct", 0, 0); - if (!destruct_wq) + dummy_page = alloc_page(GFP_KERNEL); + if (!dummy_page) return -ENOMEM; + destruct_wq = alloc_workqueue("ktls_device_destruct", 0, 0); + if (!destruct_wq) { + err = -ENOMEM; + goto err_free_dummy; + } + err = register_netdevice_notifier(&tls_dev_notifier); if (err) - destroy_workqueue(destruct_wq); + goto err_destroy_wq; + return 0; + +err_destroy_wq: + destroy_workqueue(destruct_wq); +err_free_dummy: + put_page(dummy_page); return err; } @@ -1466,4 +1467,5 @@ void __exit tls_device_cleanup(void) unregister_netdevice_notifier(&tls_dev_notifier); destroy_workqueue(destruct_wq); clean_acked_data_flush(); + put_page(dummy_page); } diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index b6896126bb92..4a8ee2f6badb 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -139,9 +139,6 @@ int tls_push_sg(struct sock *sk, ctx->splicing_pages = true; while (1) { - if (sg_is_last(sg)) - msg.msg_flags = flags; - /* is sending application-limited? */ tcp_rate_check_app_limited(sk); p = sg_page(sg); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 78585217f61a..86930a8ed012 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -790,7 +790,7 @@ static int unix_set_peek_off(struct sock *sk, int val) if (mutex_lock_interruptible(&u->iolock)) return -EINTR; - sk->sk_peek_off = val; + WRITE_ONCE(sk->sk_peek_off, val); mutex_unlock(&u->iolock); return 0; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 0da2e6a2a7ea..8bcf8e293308 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5430,8 +5430,11 @@ nl80211_parse_mbssid_elems(struct wiphy *wiphy, struct nlattr *attrs) if (!wiphy->mbssid_max_interfaces) return ERR_PTR(-EINVAL); - nla_for_each_nested(nl_elems, attrs, rem_elems) + nla_for_each_nested(nl_elems, attrs, rem_elems) { + if (num_elems >= 255) + return ERR_PTR(-EINVAL); num_elems++; + } elems = kzalloc(struct_size(elems, elem, num_elems), GFP_KERNEL); if (!elems) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 8bf00caf5d29..0cf1ce7b6934 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -657,7 +657,7 @@ static int cfg80211_parse_colocated_ap(const struct cfg80211_bss_ies *ies, ret = cfg80211_calc_short_ssid(ies, &ssid_elem, &s_ssid_tmp); if (ret) - return ret; + return 0; for_each_element_id(elem, WLAN_EID_REDUCED_NEIGHBOR_REPORT, ies->data, ies->len) { diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 31dca4ecb2c5..10ea85c03147 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -505,7 +505,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, skb->dev = dev; skb->priority = xs->sk.sk_priority; - skb->mark = xs->sk.sk_mark; + skb->mark = READ_ONCE(xs->sk.sk_mark); skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr; skb->destructor = xsk_destruct_skb; @@ -994,6 +994,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) err = xp_alloc_tx_descs(xs->pool, xs); if (err) { xp_put_pool(xs->pool); + xs->pool = NULL; sockfd_put(sock); goto out_unlock; } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index e7617c9959c3..d6b405782b63 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2250,7 +2250,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, match = xfrm_selector_match(&pol->selector, fl, family); if (match) { - if ((sk->sk_mark & pol->mark.m) != pol->mark.v || + if ((READ_ONCE(sk->sk_mark) & pol->mark.m) != pol->mark.v || pol->if_id != if_id) { pol = NULL; goto out; |