diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-08-05 20:13:21 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-08-05 20:13:21 -0700 |
commit | 47ec5303d73ea344e84f46660fff693c57641386 (patch) | |
tree | a2252debab749de29620c43285295d60c4741119 /net | |
parent | 8186749621ed6b8fc42644c399e8c755a2b6f630 (diff) | |
parent | c1055b76ad00aed0e8b79417080f212d736246b6 (diff) | |
download | lwn-47ec5303d73ea344e84f46660fff693c57641386.tar.gz lwn-47ec5303d73ea344e84f46660fff693c57641386.zip |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from David Miller:
1) Support 6Ghz band in ath11k driver, from Rajkumar Manoharan.
2) Support UDP segmentation in code TSO code, from Eric Dumazet.
3) Allow flashing different flash images in cxgb4 driver, from Vishal
Kulkarni.
4) Add drop frames counter and flow status to tc flower offloading,
from Po Liu.
5) Support n-tuple filters in cxgb4, from Vishal Kulkarni.
6) Various new indirect call avoidance, from Eric Dumazet and Brian
Vazquez.
7) Fix BPF verifier failures on 32-bit pointer arithmetic, from
Yonghong Song.
8) Support querying and setting hardware address of a port function via
devlink, use this in mlx5, from Parav Pandit.
9) Support hw ipsec offload on bonding slaves, from Jarod Wilson.
10) Switch qca8k driver over to phylink, from Jonathan McDowell.
11) In bpftool, show list of processes holding BPF FD references to
maps, programs, links, and btf objects. From Andrii Nakryiko.
12) Several conversions over to generic power management, from Vaibhav
Gupta.
13) Add support for SO_KEEPALIVE et al. to bpf_setsockopt(), from Dmitry
Yakunin.
14) Various https url conversions, from Alexander A. Klimov.
15) Timestamping and PHC support for mscc PHY driver, from Antoine
Tenart.
16) Support bpf iterating over tcp and udp sockets, from Yonghong Song.
17) Support 5GBASE-T i40e NICs, from Aleksandr Loktionov.
18) Add kTLS RX HW offload support to mlx5e, from Tariq Toukan.
19) Fix the ->ndo_start_xmit() return type to be netdev_tx_t in several
drivers. From Luc Van Oostenryck.
20) XDP support for xen-netfront, from Denis Kirjanov.
21) Support receive buffer autotuning in MPTCP, from Florian Westphal.
22) Support EF100 chip in sfc driver, from Edward Cree.
23) Add XDP support to mvpp2 driver, from Matteo Croce.
24) Support MPTCP in sock_diag, from Paolo Abeni.
25) Commonize UDP tunnel offloading code by creating udp_tunnel_nic
infrastructure, from Jakub Kicinski.
26) Several pci_ --> dma_ API conversions, from Christophe JAILLET.
27) Add FLOW_ACTION_POLICE support to mlxsw, from Ido Schimmel.
28) Add SK_LOOKUP bpf program type, from Jakub Sitnicki.
29) Refactor a lot of networking socket option handling code in order to
avoid set_fs() calls, from Christoph Hellwig.
30) Add rfc4884 support to icmp code, from Willem de Bruijn.
31) Support TBF offload in dpaa2-eth driver, from Ioana Ciornei.
32) Support XDP_REDIRECT in qede driver, from Alexander Lobakin.
33) Support PCI relaxed ordering in mlx5 driver, from Aya Levin.
34) Support TCP syncookies in MPTCP, from Flowian Westphal.
35) Fix several tricky cases of PMTU handling wrt. briding, from Stefano
Brivio.
* git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2056 commits)
net: thunderx: initialize VF's mailbox mutex before first usage
usb: hso: remove bogus check for EINPROGRESS
usb: hso: no complaint about kmalloc failure
hso: fix bailout in error case of probe
ip_tunnel_core: Fix build for archs without _HAVE_ARCH_IPV6_CSUM
selftests/net: relax cpu affinity requirement in msg_zerocopy test
mptcp: be careful on subflow creation
selftests: rtnetlink: make kci_test_encap() return sub-test result
selftests: rtnetlink: correct the final return value for the test
net: dsa: sja1105: use detected device id instead of DT one on mismatch
tipc: set ub->ifindex for local ipv6 address
ipv6: add ipv6_dev_find()
net: openvswitch: silence suspicious RCU usage warning
Revert "vxlan: fix tos value before xmit"
ptp: only allow phase values lower than 1 period
farsync: switch from 'pci_' to 'dma_' API
wan: wanxl: switch from 'pci_' to 'dma_' API
hv_netvsc: do not use VF device if link is down
dpaa2-eth: Fix passing zero to 'PTR_ERR' warning
net: macb: Properly handle phylink on at91sam9x
...
Diffstat (limited to 'net')
438 files changed, 14821 insertions, 6173 deletions
diff --git a/net/9p/client.c b/net/9p/client.c index fc1f3635e5dd..09f1ec589b80 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -811,7 +811,7 @@ reterr: * @uodata: source for zero copy write * @inlen: read buffer size * @olen: write buffer size - * @hdrlen: reader header size, This is the size of response protocol data + * @in_hdrlen: reader header size, This is the size of response protocol data * @fmt: protocol format string (see protocol.c) * * Returns request structure (which client must free using p9_tag_remove) diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index b21c3c209815..2885ff9c76f0 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -94,14 +94,15 @@ struct p9_trans_rdma { struct completion cm_done; }; +struct p9_rdma_req; + /** - * p9_rdma_context - Keeps track of in-process WR + * struct p9_rdma_context - Keeps track of in-process WR * * @busa: Bus address to unmap when the WR completes * @req: Keeps track of requests (send) * @rc: Keepts track of replies (receive) */ -struct p9_rdma_req; struct p9_rdma_context { struct ib_cqe cqe; dma_addr_t busa; @@ -112,7 +113,7 @@ struct p9_rdma_context { }; /** - * p9_rdma_opts - Collection of mount options + * struct p9_rdma_opts - Collection of mount options * @port: port of connection * @sq_depth: The requested depth of the SQ. This really doesn't need * to be any deeper than the number of threads used in the client diff --git a/net/Kconfig b/net/Kconfig index d1672280d6a4..3831206977a1 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -455,7 +455,6 @@ config FAILOVER config ETHTOOL_NETLINK bool "Netlink interface for ethtool" default y - depends on PHYLIB=y || PHYLIB=n help An alternative userspace interface for ethtool based on generic netlink. It provides better extensibility and some new features, diff --git a/net/appletalk/atalk_proc.c b/net/appletalk/atalk_proc.c index 550c6ca007cc..9c1241292d1d 100644 --- a/net/appletalk/atalk_proc.c +++ b/net/appletalk/atalk_proc.c @@ -229,6 +229,8 @@ int __init atalk_proc_init(void) sizeof(struct aarp_iter_state), NULL)) goto out; + return 0; + out: remove_proc_subtree("atalk", init_net.proc_net); return -ENOMEM; diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 15787e8c0629..1d48708c5a2e 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1917,8 +1917,6 @@ static const struct proto_ops atalk_dgram_ops = { #endif .listen = sock_no_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .sendmsg = atalk_sendmsg, .recvmsg = atalk_recvmsg, .mmap = sock_no_mmap, diff --git a/net/atm/common.c b/net/atm/common.c index 8575f5d52087..84367b844b14 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -745,7 +745,7 @@ static int check_qos(const struct atm_qos *qos) } int vcc_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct atm_vcc *vcc; unsigned long value; @@ -760,7 +760,7 @@ int vcc_setsockopt(struct socket *sock, int level, int optname, { struct atm_qos qos; - if (copy_from_user(&qos, optval, sizeof(qos))) + if (copy_from_sockptr(&qos, optval, sizeof(qos))) return -EFAULT; error = check_qos(&qos); if (error) @@ -774,7 +774,7 @@ int vcc_setsockopt(struct socket *sock, int level, int optname, return 0; } case SO_SETCLP: - if (get_user(value, (unsigned long __user *)optval)) + if (copy_from_sockptr(&value, optval, sizeof(value))) return -EFAULT; if (value) vcc->atm_options |= ATM_ATMOPT_CLP; @@ -782,13 +782,8 @@ int vcc_setsockopt(struct socket *sock, int level, int optname, vcc->atm_options &= ~ATM_ATMOPT_CLP; return 0; default: - if (level == SOL_SOCKET) - return -EINVAL; - break; - } - if (!vcc->dev || !vcc->dev->ops->setsockopt) return -EINVAL; - return vcc->dev->ops->setsockopt(vcc, level, optname, optval, optlen); + } } int vcc_getsockopt(struct socket *sock, int level, int optname, @@ -826,13 +821,8 @@ int vcc_getsockopt(struct socket *sock, int level, int optname, return copy_to_user(optval, &pvc, sizeof(pvc)) ? -EFAULT : 0; } default: - if (level == SOL_SOCKET) - return -EINVAL; - break; - } - if (!vcc->dev || !vcc->dev->ops->getsockopt) return -EINVAL; - return vcc->dev->ops->getsockopt(vcc, level, optname, optval, len); + } } int register_atmdevice_notifier(struct notifier_block *nb) diff --git a/net/atm/common.h b/net/atm/common.h index 5850649068bb..a1e56e8de698 100644 --- a/net/atm/common.h +++ b/net/atm/common.h @@ -21,7 +21,7 @@ __poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait); int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int vcc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int vcc_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen); + sockptr_t optval, unsigned int optlen); int vcc_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen); void vcc_process_recv_queue(struct atm_vcc *vcc); diff --git a/net/atm/lec_arpc.h b/net/atm/lec_arpc.h index 1205d8792d28..39115fe074c4 100644 --- a/net/atm/lec_arpc.h +++ b/net/atm/lec_arpc.h @@ -44,7 +44,7 @@ struct lec_arp_table { u8 *tlvs; u32 sizeoftlvs; /* * LANE2: Each MAC address can have TLVs - * associated with it. sizeoftlvs tells the + * associated with it. sizeoftlvs tells * the length of the tlvs array */ struct sk_buff_head tx_wait; /* wait queue for outgoing packets */ diff --git a/net/atm/pvc.c b/net/atm/pvc.c index 02bd2a436bdf..53e7d3f39e26 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -63,7 +63,7 @@ static int pvc_connect(struct socket *sock, struct sockaddr *sockaddr, } static int pvc_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int error; diff --git a/net/atm/svc.c b/net/atm/svc.c index ba144d035e3d..4a02bcaad279 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -451,7 +451,7 @@ int svc_change_qos(struct atm_vcc *vcc, struct atm_qos *qos) } static int svc_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct atm_vcc *vcc = ATM_SD(sock); @@ -464,7 +464,7 @@ static int svc_setsockopt(struct socket *sock, int level, int optname, error = -EINVAL; goto out; } - if (copy_from_user(&vcc->sap, optval, optlen)) { + if (copy_from_sockptr(&vcc->sap, optval, optlen)) { error = -EFAULT; goto out; } @@ -475,7 +475,7 @@ static int svc_setsockopt(struct socket *sock, int level, int optname, error = -EINVAL; goto out; } - if (get_user(value, (int __user *)optval)) { + if (copy_from_sockptr(&value, optval, sizeof(int))) { error = -EFAULT; goto out; } diff --git a/net/ax25/Kconfig b/net/ax25/Kconfig index 97d686d115c0..d3a9843a043d 100644 --- a/net/ax25/Kconfig +++ b/net/ax25/Kconfig @@ -8,7 +8,7 @@ menuconfig HAMRADIO bool "Amateur Radio support" help If you want to connect your Linux box to an amateur radio, answer Y - here. You want to read <http://www.tapr.org/> + here. You want to read <https://www.tapr.org/> and more specifically about AX.25 on Linux <http://www.linux-ax25.org/>. @@ -39,11 +39,11 @@ config AX25 Information about where to get supporting software for Linux amateur radio as well as information about how to configure an AX.25 port is contained in the AX25-HOWTO, available from - <http://www.tldp.org/docs.html#howto>. You might also want to + <https://www.tldp.org/docs.html#howto>. You might also want to check out the file <file:Documentation/networking/ax25.rst> in the kernel source. More information about digital amateur radio in general is on the WWW at - <http://www.tapr.org/>. + <https://www.tapr.org/>. To compile this driver as a module, choose M here: the module will be called ax25. @@ -90,7 +90,7 @@ config NETROM <http://www.linux-ax25.org>. You also might want to check out the file <file:Documentation/networking/ax25.rst>. More information about digital amateur radio in general is on the WWW at - <http://www.tapr.org/>. + <https://www.tapr.org/>. To compile this driver as a module, choose M here: the module will be called netrom. @@ -109,7 +109,7 @@ config ROSE <http://www.linux-ax25.org>. You also might want to check out the file <file:Documentation/networking/ax25.rst>. More information about digital amateur radio in general is on the WWW at - <http://www.tapr.org/>. + <https://www.tapr.org/>. To compile this driver as a module, choose M here: the module will be called rose. diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index dec3f35467c9..269ee89d2c2b 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -528,7 +528,7 @@ ax25_cb *ax25_create_cb(void) */ static int ax25_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; ax25_cb *ax25; @@ -543,7 +543,7 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(unsigned int)) return -EINVAL; - if (get_user(opt, (unsigned int __user *)optval)) + if (copy_from_sockptr(&opt, optval, sizeof(unsigned int))) return -EFAULT; lock_sock(sk); @@ -640,7 +640,7 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, memset(devname, 0, sizeof(devname)); - if (copy_from_user(devname, optval, optlen)) { + if (copy_from_sockptr(devname, optval, optlen)) { res = -EFAULT; break; } diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index e87f19c82e8d..a4faf5f904d9 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -134,7 +134,7 @@ static u8 batadv_ring_buffer_avg(const u8 lq_recv[]) * * Return: the originator object corresponding to the passed mac address or NULL * on failure. - * If the object does not exists it is created an initialised. + * If the object does not exist, it is created and initialised. */ static struct batadv_orig_node * batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) @@ -871,7 +871,7 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) } /** - * batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over iterface + * batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over interface * @orig_node: originator which reproadcasted the OGMs directly * @if_outgoing: interface which transmitted the original OGM and received the * direct rebroadcast @@ -1075,10 +1075,10 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, struct batadv_neigh_ifinfo *neigh_ifinfo; u8 total_count; u8 orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own; + unsigned int tq_iface_hop_penalty = BATADV_TQ_MAX_VALUE; unsigned int neigh_rq_inv_cube, neigh_rq_max_cube; unsigned int tq_asym_penalty, inv_asym_penalty; unsigned int combined_tq; - unsigned int tq_iface_penalty; bool ret = false; /* find corresponding one hop neighbor */ @@ -1157,31 +1157,32 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, inv_asym_penalty = BATADV_TQ_MAX_VALUE * neigh_rq_inv_cube; inv_asym_penalty /= neigh_rq_max_cube; tq_asym_penalty = BATADV_TQ_MAX_VALUE - inv_asym_penalty; + tq_iface_hop_penalty -= atomic_read(&if_incoming->hop_penalty); /* penalize if the OGM is forwarded on the same interface. WiFi * interfaces and other half duplex devices suffer from throughput * drops as they can't send and receive at the same time. */ - tq_iface_penalty = BATADV_TQ_MAX_VALUE; if (if_outgoing && if_incoming == if_outgoing && batadv_is_wifi_hardif(if_outgoing)) - tq_iface_penalty = batadv_hop_penalty(BATADV_TQ_MAX_VALUE, - bat_priv); + tq_iface_hop_penalty = batadv_hop_penalty(tq_iface_hop_penalty, + bat_priv); combined_tq = batadv_ogm_packet->tq * tq_own * tq_asym_penalty * - tq_iface_penalty; + tq_iface_hop_penalty; combined_tq /= BATADV_TQ_MAX_VALUE * BATADV_TQ_MAX_VALUE * BATADV_TQ_MAX_VALUE; batadv_ogm_packet->tq = combined_tq; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "bidirectional: orig = %pM neigh = %pM => own_bcast = %2i, real recv = %2i, local tq: %3i, asym_penalty: %3i, iface_penalty: %3i, total tq: %3i, if_incoming = %s, if_outgoing = %s\n", + "bidirectional: orig = %pM neigh = %pM => own_bcast = %2i, real recv = %2i, local tq: %3i, asym_penalty: %3i, iface_hop_penalty: %3i, total tq: %3i, if_incoming = %s, if_outgoing = %s\n", orig_node->orig, orig_neigh_node->orig, total_count, - neigh_rq_count, tq_own, tq_asym_penalty, tq_iface_penalty, - batadv_ogm_packet->tq, if_incoming->net_dev->name, + neigh_rq_count, tq_own, tq_asym_penalty, + tq_iface_hop_penalty, batadv_ogm_packet->tq, + if_incoming->net_dev->name, if_outgoing ? if_outgoing->net_dev->name : "DEFAULT"); /* if link has the minimum required transmission quality @@ -1554,7 +1555,7 @@ static void batadv_iv_ogm_process_reply(struct batadv_ogm_packet *ogm_packet, * batadv_iv_ogm_process() - process an incoming batman iv OGM * @skb: the skb containing the OGM * @ogm_offset: offset to the OGM which should be processed (for aggregates) - * @if_incoming: the interface where this packet was receved + * @if_incoming: the interface where this packet was received */ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, struct batadv_hard_iface *if_incoming) @@ -2288,7 +2289,7 @@ batadv_iv_ogm_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq, * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information - * @single_hardif: Limit dump to this hard interfaace + * @single_hardif: Limit dump to this hard interface */ static void batadv_iv_ogm_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 0bdefa35da98..d35aca0e969a 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -60,7 +60,7 @@ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface) * @neigh: the neighbour for which the throughput has to be obtained * * Return: The throughput towards the given neighbour in multiples of 100kpbs - * (a value of '1' equals to 0.1Mbps, '10' equals 1Mbps, etc). + * (a value of '1' equals 0.1Mbps, '10' equals 1Mbps, etc). */ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) { @@ -183,8 +183,8 @@ void batadv_v_elp_throughput_metric_update(struct work_struct *work) * * Sends a predefined number of unicast wifi packets to a given neighbour in * order to trigger the throughput estimation on this link by the RC algorithm. - * Packets are sent only if there there is not enough payload unicast traffic - * towards this neighbour.. + * Packets are sent only if there is not enough payload unicast traffic towards + * this neighbour.. * * Return: True on success and false in case of error during skb preparation. */ @@ -244,7 +244,7 @@ batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh) * batadv_v_elp_periodic_work() - ELP periodic task per interface * @work: work queue item * - * Emits broadcast ELP message in regular intervals. + * Emits broadcast ELP messages in regular intervals. */ static void batadv_v_elp_periodic_work(struct work_struct *work) { @@ -499,7 +499,7 @@ orig_free: * @skb: the received packet * @if_incoming: the interface this packet was received through * - * Return: NET_RX_SUCCESS and consumes the skb if the packet was peoperly + * Return: NET_RX_SUCCESS and consumes the skb if the packet was properly * processed or NET_RX_DROP in case of failure. */ int batadv_v_elp_packet_recv(struct sk_buff *skb, diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 18028b9f95f0..0f8495b9eeb1 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -47,9 +47,9 @@ * @bat_priv: the bat priv with all the soft interface information * @addr: the address of the originator * - * Return: the orig_node corresponding to the specified address. If such object - * does not exist it is allocated here. In case of allocation failure returns - * NULL. + * Return: the orig_node corresponding to the specified address. If such an + * object does not exist, it is allocated here. In case of allocation failure + * returns NULL. */ struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) @@ -172,7 +172,7 @@ static bool batadv_v_ogm_queue_left(struct sk_buff *skb, * batadv_v_ogm_aggr_list_free - free all elements in an aggregation queue * @hard_iface: the interface holding the aggregation queue * - * Empties the OGMv2 aggregation queue and frees all the skbs it contained. + * Empties the OGMv2 aggregation queue and frees all the skbs it contains. * * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. */ @@ -378,7 +378,7 @@ static void batadv_v_ogm_send(struct work_struct *work) * batadv_v_ogm_aggr_work() - OGM queue periodic task per interface * @work: work queue item * - * Emits aggregated OGM message in regular intervals. + * Emits aggregated OGM messages in regular intervals. */ void batadv_v_ogm_aggr_work(struct work_struct *work) { @@ -399,7 +399,7 @@ void batadv_v_ogm_aggr_work(struct work_struct *work) * batadv_v_ogm_iface_enable() - prepare an interface for B.A.T.M.A.N. V * @hard_iface: the interface to prepare * - * Takes care of scheduling own OGM sending routine for this interface. + * Takes care of scheduling its own OGM sending routine for this interface. * * Return: 0 on success or a negative error code otherwise */ @@ -455,15 +455,17 @@ unlock: * @throughput: the current throughput * * Apply a penalty on the current throughput metric value based on the - * characteristic of the interface where the OGM has been received. The return - * value is computed as follows: + * characteristic of the interface where the OGM has been received. + * + * Initially the per hardif hop penalty is applied to the throughput. After + * that the return value is then computed as follows: * - throughput * 50% if the incoming and outgoing interface are the * same WiFi interface and the throughput is above * 1MBit/s * - throughput if the outgoing interface is the default * interface (i.e. this OGM is processed for the * internal table and not forwarded) - * - throughput * hop penalty otherwise + * - throughput * node hop penalty otherwise * * Return: the penalised throughput metric. */ @@ -472,9 +474,14 @@ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, u32 throughput) { + int if_hop_penalty = atomic_read(&if_incoming->hop_penalty); int hop_penalty = atomic_read(&bat_priv->hop_penalty); int hop_penalty_max = BATADV_TQ_MAX_VALUE; + /* Apply per hardif hop penalty */ + throughput = throughput * (hop_penalty_max - if_hop_penalty) / + hop_penalty_max; + /* Don't apply hop penalty in default originator table. */ if (if_outgoing == BATADV_IF_DEFAULT) return throughput; @@ -847,7 +854,7 @@ batadv_v_ogm_aggr_packet(int buff_pos, int packet_len, * batadv_v_ogm_process() - process an incoming batman v OGM * @skb: the skb containing the OGM * @ogm_offset: offset to the OGM which should be processed (for aggregates) - * @if_incoming: the interface where this packet was receved + * @if_incoming: the interface where this packet was received */ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset, struct batadv_hard_iface *if_incoming) diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 41cc87f06b14..91a04ca373dc 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -992,7 +992,7 @@ static bool batadv_handle_claim(struct batadv_priv *bat_priv, * @hw_dst: the Hardware destination in the ARP Header * @ethhdr: pointer to the Ethernet header of the claim frame * - * checks if it is a claim packet and if its on the same group. + * checks if it is a claim packet and if it's on the same group. * This function also applies the group ID of the sender * if it is in the same mesh. * @@ -1757,7 +1757,7 @@ void batadv_bla_free(struct batadv_priv *bat_priv) * @vid: the VLAN ID of the frame * * Checks if this packet is a loop detect frame which has been sent by us, - * throw an uevent and log the event if that is the case. + * throws an uevent and logs the event if that is the case. * * Return: true if it is a loop detect frame which is to be dropped, false * otherwise. @@ -1815,7 +1815,7 @@ batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb, * * we have to race for a claim * * if the frame is allowed on the LAN * - * in these cases, the skb is further handled by this function + * In these cases, the skb is further handled by this function * * Return: true if handled, otherwise it returns false and the caller shall * further process the skb. diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index b85da4b7a77b..0e6e53e9b5f3 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -666,7 +666,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst, * @vid: VLAN identifier * @packet_subtype: unicast4addr packet subtype to use * - * This function copies the skb with pskb_copy() and is sent as unicast packet + * This function copies the skb with pskb_copy() and is sent as a unicast packet * to each of the selected candidates. * * Return: true if the packet is sent to at least one candidate, false diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 7cad97644d05..9fdbe3068153 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -102,8 +102,8 @@ static int batadv_frag_size_limit(void) * * Caller must hold chain->lock. * - * Return: true if chain is empty and caller can just insert the new fragment - * without searching for the right position. + * Return: true if chain is empty and the caller can just insert the new + * fragment without searching for the right position. */ static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain, u16 seqno) @@ -306,7 +306,7 @@ free: * set *skb to merged packet; 2) Packet is buffered: Return true and set *skb * to NULL; 3) Error: Return false and free skb. * - * Return: true when packet is merged or buffered, false when skb is not not + * Return: true when the packet is merged or buffered, false when skb is not not * used. */ bool batadv_frag_skb_buffer(struct sk_buff **skb, diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 3a256af92784..fa06b51c0144 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -138,10 +138,10 @@ static bool batadv_mutual_parents(const struct net_device *dev1, * @net_dev: the device to check * * If the user creates any virtual device on top of a batman-adv interface, it - * is important to prevent this new interface to be used to create a new mesh - * network (this behaviour would lead to a batman-over-batman configuration). - * This function recursively checks all the fathers of the device passed as - * argument looking for a batman-adv soft interface. + * is important to prevent this new interface from being used to create a new + * mesh network (this behaviour would lead to a batman-over-batman + * configuration). This function recursively checks all the fathers of the + * device passed as argument looking for a batman-adv soft interface. * * Return: true if the device is descendant of a batman-adv mesh interface (or * if it is a batman-adv interface itself), false otherwise @@ -680,8 +680,8 @@ batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface) * @slave: the interface enslaved in another master * @master: the master from which slave has to be removed * - * Invoke ndo_del_slave on master passing slave as argument. In this way slave - * is free'd and master can correctly change its internal state. + * Invoke ndo_del_slave on master passing slave as argument. In this way the + * slave is free'd and the master can correctly change its internal state. * * Return: 0 on success, a negative value representing the error otherwise */ @@ -818,7 +818,7 @@ err: * @soft_iface: soft interface to check * * This function is only using RCU for locking - the result can therefore be - * off when another functions is modifying the list at the same time. The + * off when another function is modifying the list at the same time. The * caller can use the rtnl_lock to make sure that the count is accurate. * * Return: number of connected/enslaved hard interfaces @@ -939,6 +939,8 @@ batadv_hardif_add_interface(struct net_device *net_dev) if (batadv_is_wifi_hardif(hard_iface)) hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS; + atomic_set(&hard_iface->hop_penalty, 0); + batadv_v_hardif_init(hard_iface); batadv_check_known_mac_addr(hard_iface->net_dev); diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index f9884dc56cf3..979864c0fa6b 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -69,7 +69,7 @@ int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) __printf(2, 3); /** - * _batadv_dbg() - Store debug output with(out) ratelimiting + * _batadv_dbg() - Store debug output with(out) rate limiting * @type: type of debug message * @bat_priv: the bat priv with all the soft interface information * @ratelimited: whether output should be rate limited @@ -95,7 +95,7 @@ static inline void _batadv_dbg(int type __always_unused, #endif /** - * batadv_dbg() - Store debug output without ratelimiting + * batadv_dbg() - Store debug output without rate limiting * @type: type of debug message * @bat_priv: the bat priv with all the soft interface information * @arg: format string and variable arguments @@ -104,7 +104,7 @@ static inline void _batadv_dbg(int type __always_unused, _batadv_dbg(type, bat_priv, 0, ## arg) /** - * batadv_dbg_ratelimited() - Store debug output with ratelimiting + * batadv_dbg_ratelimited() - Store debug output with rate limiting * @type: type of debug message * @bat_priv: the bat priv with all the soft interface information * @arg: format string and variable arguments diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index d8a255c85e77..519c08c2cfba 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -666,7 +666,7 @@ unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len) * @vid: the VLAN identifier for which the AP isolation attributed as to be * looked up * - * Return: true if AP isolation is on for the VLAN idenfied by vid, false + * Return: true if AP isolation is on for the VLAN identified by vid, false * otherwise */ bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid) diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 61d8dbe8c954..0393bb9ed3d0 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2020.2" +#define BATADV_SOURCE_VERSION "2020.3" #endif /* B.A.T.M.A.N. parameters */ @@ -308,7 +308,7 @@ static inline bool batadv_has_timed_out(unsigned long timestamp, * @y: value to compare @x against * * It handles overflows/underflows and can correctly check for a predecessor - * unless the variable sequence number has grown by more then + * unless the variable sequence number has grown by more than * 2**(bitwidth(x)-1)-1. * * This means that for a u8 with the maximum value 255, it would think: @@ -330,11 +330,11 @@ static inline bool batadv_has_timed_out(unsigned long timestamp, /** * batadv_seq_after() - Checks if a sequence number x is a successor of y - * @x: potential sucessor of @y + * @x: potential successor of @y * @y: value to compare @x against * * It handles overflows/underflows and can correctly check for a successor - * unless the variable sequence number has grown by more then + * unless the variable sequence number has grown by more than * 2**(bitwidth(x)-1)-1. * * This means that for a u8 with the maximum value 255, it would think: diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 9ebdc1e864b9..bdc4a1fba1c6 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -510,7 +510,7 @@ batadv_mcast_mla_softif_get_ipv6(struct net_device *dev, * the given mcast_list. In general, multicast listeners provided by * your multicast receiving applications run directly on this node. * - * If there is a bridge interface on top of dev, collects from that one + * If there is a bridge interface on top of dev, collect from that one * instead. Just like with IP addresses and routes, multicast listeners * will(/should) register to the bridge interface instead of an * enslaved bat0. @@ -832,8 +832,8 @@ batadv_mcast_bridge_log(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @flags: TVLV flags indicating the new multicast state * - * Whenever the multicast TVLV flags this nodes announces change this notifies - * userspace via the 'mcast' log level. + * Whenever the multicast TVLV flags this node announces change, this function + * should be used to notify userspace about the change. */ static void batadv_mcast_flags_log(struct batadv_priv *bat_priv, u8 flags) { @@ -1244,7 +1244,7 @@ batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv) * @ethhdr: an ethernet header to determine the protocol family from * * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 or - * BATADV_MCAST_WANT_ALL_IPV6 flag, depending on the provided ethhdr, set and + * BATADV_MCAST_WANT_ALL_IPV6 flag, depending on the provided ethhdr, sets and * increases its refcount. */ static struct batadv_orig_node * @@ -1693,7 +1693,7 @@ batadv_mcast_forw_want_rtr(struct batadv_priv *bat_priv, } /** - * batadv_mcast_forw_send() - send packet to any detected multicast recpient + * batadv_mcast_forw_send() - send packet to any detected multicast recipient * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier @@ -1742,7 +1742,8 @@ int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb, * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag of this originator, - * orig, has toggled then this method updates counter and list accordingly. + * orig, has toggled then this method updates the counter and the list + * accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ @@ -1787,7 +1788,7 @@ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv, * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_IPV4 flag of this originator, orig, has - * toggled then this method updates counter and list accordingly. + * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ @@ -1832,7 +1833,7 @@ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv, * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_IPV6 flag of this originator, orig, has - * toggled then this method updates counter and list accordingly. + * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ @@ -1877,7 +1878,7 @@ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_NO_RTR4 flag of this originator, orig, has - * toggled then this method updates counter and list accordingly. + * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ @@ -1922,7 +1923,7 @@ static void batadv_mcast_want_rtr4_update(struct batadv_priv *bat_priv, * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_NO_RTR6 flag of this originator, orig, has - * toggled then this method updates counter and list accordingly. + * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 02ed073f95a9..dc193618a761 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -640,7 +640,7 @@ batadv_netlink_tp_meter_put(struct sk_buff *msg, u32 cookie) * @bat_priv: the bat priv with all the soft interface information * @dst: destination of tp_meter session * @result: reason for tp meter session stop - * @test_time: total time ot the tp_meter session + * @test_time: total time of the tp_meter session * @total_bytes: bytes acked to the receiver * @cookie: cookie of tp_meter session * @@ -826,6 +826,10 @@ static int batadv_netlink_hardif_fill(struct sk_buff *msg, goto nla_put_failure; } + if (nla_put_u8(msg, BATADV_ATTR_HOP_PENALTY, + atomic_read(&hard_iface->hop_penalty))) + goto nla_put_failure; + #ifdef CONFIG_BATMAN_ADV_BATMAN_V if (nla_put_u32(msg, BATADV_ATTR_ELP_INTERVAL, atomic_read(&hard_iface->bat_v.elp_interval))) @@ -920,9 +924,15 @@ static int batadv_netlink_set_hardif(struct sk_buff *skb, { struct batadv_hard_iface *hard_iface = info->user_ptr[1]; struct batadv_priv *bat_priv = info->user_ptr[0]; + struct nlattr *attr; + + if (info->attrs[BATADV_ATTR_HOP_PENALTY]) { + attr = info->attrs[BATADV_ATTR_HOP_PENALTY]; + + atomic_set(&hard_iface->hop_penalty, nla_get_u8(attr)); + } #ifdef CONFIG_BATMAN_ADV_BATMAN_V - struct nlattr *attr; if (info->attrs[BATADV_ATTR_ELP_INTERVAL]) { attr = info->attrs[BATADV_ATTR_ELP_INTERVAL]; diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index b0469d15da0e..48d707850f3e 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -134,7 +134,7 @@ static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, } /** - * batadv_nc_mesh_init() - initialise coding hash table and start house keeping + * batadv_nc_mesh_init() - initialise coding hash table and start housekeeping * @bat_priv: the bat priv with all the soft interface information * * Return: 0 on success or negative error number in case of failure @@ -700,7 +700,7 @@ batadv_nc_process_nc_paths(struct batadv_priv *bat_priv, } /** - * batadv_nc_worker() - periodic task for house keeping related to network + * batadv_nc_worker() - periodic task for housekeeping related to network * coding * @work: kernel work struct */ @@ -1316,7 +1316,7 @@ batadv_nc_path_search(struct batadv_priv *bat_priv, } /** - * batadv_nc_skb_src_search() - Loops through the list of neighoring nodes of + * batadv_nc_skb_src_search() - Loops through the list of neighboring nodes of * the skb's sender (may be equal to the originator). * @bat_priv: the bat priv with all the soft interface information * @skb: data skb to forward @@ -1402,10 +1402,10 @@ static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv, * @neigh_node: next hop to forward packet to * @ethhdr: pointer to the ethernet header inside the skb * - * Loops through list of neighboring nodes the next hop has a good connection to - * (receives OGMs with a sufficient quality). We need to find a neighbor of our - * next hop that potentially sent a packet which our next hop also received - * (overheard) and has stored for later decoding. + * Loops through the list of neighboring nodes the next hop has a good + * connection to (receives OGMs with a sufficient quality). We need to find a + * neighbor of our next hop that potentially sent a packet which our next hop + * also received (overheard) and has stored for later decoding. * * Return: true if the skb was consumed (encoded packet sent) or false otherwise */ diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 5b0c2fffc214..805d8969bdfb 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -325,7 +325,7 @@ void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node) * @if_outgoing: the interface where the payload packet has been received or * the OGM should be sent to * - * Return: the neighbor which should be router for this orig_node/iface. + * Return: the neighbor which should be the router for this orig_node/iface. * * The object is returned with refcounter increased by 1. */ @@ -515,7 +515,7 @@ out: * Looks for and possibly returns a neighbour belonging to this originator list * which is connected through the provided hard interface. * - * Return: neighbor when found. Othwerwise NULL + * Return: neighbor when found. Otherwise NULL */ static struct batadv_neigh_node * batadv_neigh_node_get(const struct batadv_orig_node *orig_node, @@ -620,7 +620,7 @@ batadv_hardif_neigh_get_or_create(struct batadv_hard_iface *hard_iface, * * Looks for and possibly returns a neighbour belonging to this hard interface. * - * Return: neighbor when found. Othwerwise NULL + * Return: neighbor when found. Otherwise NULL */ struct batadv_hardif_neigh_node * batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, @@ -999,7 +999,7 @@ void batadv_originator_free(struct batadv_priv *bat_priv) * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the originator * - * Creates a new originator object and initialise all the generic fields. + * Creates a new originator object and initialises all the generic fields. * The new object is not added to the originator list. * * Return: the newly created object or NULL on failure. diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index d343382e9664..27cdf5e4349a 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -449,7 +449,7 @@ free_skb: * @skb: packet to check * @hdr_size: size of header to pull * - * Check for short header and bad addresses in given packet. + * Checks for short header and bad addresses in the given packet. * * Return: negative value when check fails and 0 otherwise. The negative value * depends on the reason: -ENODATA for bad header, -EBADR for broadcast @@ -1113,7 +1113,7 @@ free_skb: * @recv_if: interface that the skb is received on * * This function does one of the three following things: 1) Forward fragment, if - * the assembled packet will exceed our MTU; 2) Buffer fragment, if we till + * the assembled packet will exceed our MTU; 2) Buffer fragment, if we still * lack further fragments; 3) Merge fragments, if we have all needed parts. * * Return: NET_RX_DROP if the skb is not consumed, NET_RX_SUCCESS otherwise. diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 7f8ade04e08e..d267b94800d6 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -605,8 +605,8 @@ bool batadv_forw_packet_steal(struct batadv_forw_packet *forw_packet, * given hard_iface. If hard_iface is NULL forwarding packets on all hard * interfaces will be claimed. * - * The packets are being moved from the forw_list to the cleanup_list and - * by that allows already running threads to notice the claiming. + * The packets are being moved from the forw_list to the cleanup_list. This + * makes it possible for already running threads to notice the claim. */ static void batadv_forw_packet_list_steal(struct hlist_head *forw_list, diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index f1f1c86f3419..23833a0ba5e6 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -406,7 +406,7 @@ end: * @hdr_size: size of already parsed batman-adv header * @orig_node: originator from which the batman-adv packet was sent * - * Sends a ethernet frame to the receive path of the local @soft_iface. + * Sends an ethernet frame to the receive path of the local @soft_iface. * skb->data has still point to the batman-adv header with the size @hdr_size. * The caller has to have parsed this header already and made sure that at least * @hdr_size bytes are still available for pull in @skb. diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index bd2ac570c42c..db7e3774825b 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -66,7 +66,7 @@ /** * BATADV_TP_MAX_RTO - Maximum sender timeout. If the sender RTO gets beyond - * such amound of milliseconds, the receiver is considered unreachable and the + * such amount of milliseconds, the receiver is considered unreachable and the * connection is killed */ #define BATADV_TP_MAX_RTO 30000 @@ -108,10 +108,10 @@ static u32 batadv_tp_session_cookie(const u8 session[2], u8 icmp_uid) * batadv_tp_cwnd() - compute the new cwnd size * @base: base cwnd size value * @increment: the value to add to base to get the new size - * @min: minumim cwnd value (usually MSS) + * @min: minimum cwnd value (usually MSS) * - * Return the new cwnd size and ensures it does not exceed the Advertised - * Receiver Window size. It is wrap around safe. + * Return the new cwnd size and ensure it does not exceed the Advertised + * Receiver Window size. It is wrapped around safely. * For details refer to Section 3.1 of RFC5681 * * Return: new congestion window size in bytes @@ -254,7 +254,7 @@ static void batadv_tp_batctl_error_notify(enum batadv_tp_meter_reason reason, * @dst: the other endpoint MAC address to look for * * Look for a tp_vars object matching dst as end_point and return it after - * having incremented the refcounter. Return NULL is not found + * having increment the refcounter. Return NULL is not found * * Return: matching tp_vars or NULL when no tp_vars with @dst was found */ @@ -291,7 +291,7 @@ static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv, * @session: session identifier * * Look for a tp_vars object matching dst as end_point, session as tp meter - * session and return it after having incremented the refcounter. Return NULL + * session and return it after having increment the refcounter. Return NULL * is not found * * Return: matching tp_vars or NULL when no tp_vars was found diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index a9635c882fe0..98a0aaaf0d50 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -301,7 +301,7 @@ void batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry) * @vid: VLAN identifier * * Return: the number of originators advertising the given address/data - * (excluding ourself). + * (excluding our self). */ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) @@ -842,7 +842,7 @@ out: * table. In case of success the value is updated with the real amount of * reserved bytes * Allocate the needed amount of memory for the entire TT TVLV and write its - * header made up by one tvlv_tt_data object and a series of tvlv_tt_vlan_data + * header made up of one tvlv_tt_data object and a series of tvlv_tt_vlan_data * objects, one per active VLAN served by the originator node. * * Return: the size of the allocated buffer or 0 in case of failure. @@ -1674,7 +1674,7 @@ out: * the function argument. * If a TT local entry exists for this non-mesh client remove it. * - * The caller must hold orig_node refcount. + * The caller must hold the orig_node refcount. * * Return: true if the new entry has been added, false otherwise */ @@ -1839,7 +1839,7 @@ out: * @bat_priv: the bat priv with all the soft interface information * @tt_global_entry: global translation table entry to be analyzed * - * This functon assumes the caller holds rcu_read_lock(). + * This function assumes the caller holds rcu_read_lock(). * Return: best originator list entry or NULL on errors. */ static struct batadv_tt_orig_list_entry * @@ -1887,7 +1887,7 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv, * @tt_global_entry: global translation table entry to be printed * @seq: debugfs table seq_file struct * - * This functon assumes the caller holds rcu_read_lock(). + * This function assumes the caller holds rcu_read_lock(). */ static void batadv_tt_global_print_entry(struct batadv_priv *bat_priv, diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 0963a43ad996..6a23a566cde1 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -353,8 +353,8 @@ end: * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length * - * Return: success if handler was not found or the return value of the handler - * callback. + * Return: success if the handler was not found or the return value of the + * handler callback. */ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, struct batadv_tvlv_handler *tvlv_handler, diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index d152b8e81f61..ed519efa3c36 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -208,6 +208,12 @@ struct batadv_hard_iface { /** @rcu: struct used for freeing in an RCU-safe manner */ struct rcu_head rcu; + /** + * @hop_penalty: penalty which will be applied to the tq-field + * of an OGM received via this interface + */ + atomic_t hop_penalty; + /** @bat_iv: per hard-interface B.A.T.M.A.N. IV data */ struct batadv_hard_iface_bat_iv bat_iv; @@ -455,8 +461,8 @@ struct batadv_orig_node { spinlock_t tt_buff_lock; /** - * @tt_lock: prevents from updating the table while reading it. Table - * update is made up by two operations (data structure update and + * @tt_lock: avoids concurrent read from and write to the table. Table + * update is made up of two operations (data structure update and * metadata -CRC/TTVN-recalculation) and they have to be executed * atomically in order to avoid another thread to read the * table/metadata between those. @@ -748,7 +754,7 @@ struct batadv_neigh_ifinfo { * struct batadv_bcast_duplist_entry - structure for LAN broadcast suppression */ struct batadv_bcast_duplist_entry { - /** @orig: mac address of orig node orginating the broadcast */ + /** @orig: mac address of orig node originating the broadcast */ u8 orig[ETH_ALEN]; /** @crc: crc32 checksum of broadcast payload */ @@ -1010,7 +1016,7 @@ struct batadv_priv_tt { /** * @commit_lock: prevents from executing a local TT commit while reading - * the local table. The local TT commit is made up by two operations + * the local table. The local TT commit is made up of two operations * (data structure update and metadata -CRC/TTVN- recalculation) and * they have to be executed atomically in order to avoid another thread * to read the table/metadata between those. @@ -1024,7 +1030,7 @@ struct batadv_priv_tt { #ifdef CONFIG_BATMAN_ADV_BLA /** - * struct batadv_priv_bla - per mesh interface bridge loope avoidance data + * struct batadv_priv_bla - per mesh interface bridge loop avoidance data */ struct batadv_priv_bla { /** @num_requests: number of bla requests in flight */ @@ -1718,7 +1724,7 @@ struct batadv_priv { spinlock_t softif_vlan_list_lock; #ifdef CONFIG_BATMAN_ADV_BLA - /** @bla: bridge loope avoidance data */ + /** @bla: bridge loop avoidance data */ struct batadv_priv_bla bla; #endif diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index bb55d92691b0..cff4944d5b66 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -50,6 +50,7 @@ static bool enable_6lowpan; /* We are listening incoming connections via this channel */ static struct l2cap_chan *listen_chan; +static DEFINE_MUTEX(set_lock); struct lowpan_peer { struct list_head list; @@ -1078,12 +1079,14 @@ static void do_enable_set(struct work_struct *work) enable_6lowpan = set_enable->flag; + mutex_lock(&set_lock); if (listen_chan) { l2cap_chan_close(listen_chan, 0); l2cap_chan_put(listen_chan); } listen_chan = bt_6lowpan_listen(); + mutex_unlock(&set_lock); kfree(set_enable); } @@ -1135,11 +1138,13 @@ static ssize_t lowpan_control_write(struct file *fp, if (ret == -EINVAL) return ret; + mutex_lock(&set_lock); if (listen_chan) { l2cap_chan_close(listen_chan, 0); l2cap_chan_put(listen_chan); listen_chan = NULL; } + mutex_unlock(&set_lock); if (conn) { struct lowpan_peer *peer; diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index 1d6d243cdde9..e2497d764e97 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -21,7 +21,7 @@ menuconfig BT It was designed as a replacement for cables and other short-range technologies like IrDA. Bluetooth operates in personal area range that typically extends up to 10 meters. More information about - Bluetooth can be found at <http://www.bluetooth.com/>. + Bluetooth can be found at <https://www.bluetooth.com/>. Linux Bluetooth subsystem consist of several layers: Bluetooth Core diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile index 41dd541a44a5..1c645fba8c49 100644 --- a/net/bluetooth/Makefile +++ b/net/bluetooth/Makefile @@ -14,7 +14,7 @@ bluetooth_6lowpan-y := 6lowpan.o bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \ hci_sock.o hci_sysfs.o l2cap_core.o l2cap_sock.o smp.o lib.o \ - ecdh_helper.o hci_request.o mgmt_util.o + ecdh_helper.o hci_request.o mgmt_util.o mgmt_config.o bluetooth-$(CONFIG_BT_BREDR) += sco.o bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 3fd124927d4d..4ef6a54403aa 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -286,6 +286,9 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (msg->msg_name && bt_sk(sk)->skb_msg_name) bt_sk(sk)->skb_msg_name(skb, msg->msg_name, &msg->msg_namelen); + + if (bt_sk(sk)->skb_put_cmsg) + bt_sk(sk)->skb_put_cmsg(skb, msg, sk); } skb_free_datagram(sk, skb); @@ -453,8 +456,6 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk; __poll_t mask = 0; - BT_DBG("sock %p, sk %p", sock, sk); - poll_wait(file, sk_sleep(sk), wait); if (sk->sk_state == BT_LISTEN) diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c index cfd83c5521ae..d515571b2afb 100644 --- a/net/bluetooth/bnep/sock.c +++ b/net/bluetooth/bnep/sock.c @@ -182,8 +182,6 @@ static const struct proto_ops bnep_sock_ops = { .recvmsg = sock_no_recvmsg, .listen = sock_no_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c index defdd4871919..96d49d9fae96 100644 --- a/net/bluetooth/cmtp/sock.c +++ b/net/bluetooth/cmtp/sock.c @@ -185,8 +185,6 @@ static const struct proto_ops cmtp_sock_ops = { .recvmsg = sock_no_recvmsg, .listen = sock_no_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 307800fd18e6..9832f8445d43 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -789,11 +789,8 @@ static void set_ext_conn_params(struct hci_conn *conn, memset(p, 0, sizeof(*p)); - /* Set window to be the same value as the interval to - * enable continuous scanning. - */ - p->scan_interval = cpu_to_le16(hdev->le_scan_interval); - p->scan_window = p->scan_interval; + p->scan_interval = cpu_to_le16(hdev->le_scan_int_connect); + p->scan_window = cpu_to_le16(hdev->le_scan_window_connect); p->conn_interval_min = cpu_to_le16(conn->le_conn_min_interval); p->conn_interval_max = cpu_to_le16(conn->le_conn_max_interval); p->conn_latency = cpu_to_le16(conn->le_conn_latency); @@ -875,11 +872,8 @@ static void hci_req_add_le_create_conn(struct hci_request *req, memset(&cp, 0, sizeof(cp)); - /* Set window to be the same value as the interval to enable - * continuous scanning. - */ - cp.scan_interval = cpu_to_le16(hdev->le_scan_interval); - cp.scan_window = cp.scan_interval; + cp.scan_interval = cpu_to_le16(hdev->le_scan_int_connect); + cp.scan_window = cpu_to_le16(hdev->le_scan_window_connect); bacpy(&cp.peer_addr, &conn->dst); cp.peer_addr_type = conn->dst_type; @@ -937,7 +931,7 @@ static void hci_req_directed_advertising(struct hci_request *req, * So it is required to remove adv set for handle 0x00. since we use * instance 0 for directed adv. */ - hci_req_add(req, HCI_OP_LE_REMOVE_ADV_SET, sizeof(cp.handle), &cp.handle); + __hci_req_remove_ext_adv_instance(req, cp.handle); hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp); @@ -1009,6 +1003,11 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, struct hci_request req; int err; + /* This ensures that during disable le_scan address resolution + * will not be disabled if it is followed by le_create_conn + */ + bool rpa_le_conn = true; + /* Let's make sure that le is enabled.*/ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { if (lmp_le_capable(hdev)) @@ -1109,7 +1108,7 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, * state. */ if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { - hci_req_add_le_scan_disable(&req); + hci_req_add_le_scan_disable(&req, rpa_le_conn); hci_dev_set_flag(hdev, HCI_LE_SCAN_INTERRUPTED); } @@ -1180,7 +1179,8 @@ static int hci_explicit_conn_params_set(struct hci_dev *hdev, /* This function requires the caller holds hdev->lock */ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, u8 dst_type, u8 sec_level, - u16 conn_timeout) + u16 conn_timeout, + enum conn_reasons conn_reason) { struct hci_conn *conn; @@ -1225,6 +1225,7 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, conn->sec_level = BT_SECURITY_LOW; conn->pending_sec_level = sec_level; conn->conn_timeout = conn_timeout; + conn->conn_reason = conn_reason; hci_update_background_scan(hdev); @@ -1234,7 +1235,8 @@ done: } struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, - u8 sec_level, u8 auth_type) + u8 sec_level, u8 auth_type, + enum conn_reasons conn_reason) { struct hci_conn *acl; @@ -1254,6 +1256,7 @@ struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, hci_conn_hold(acl); + acl->conn_reason = conn_reason; if (acl->state == BT_OPEN || acl->state == BT_CLOSED) { acl->sec_level = BT_SECURITY_LOW; acl->pending_sec_level = sec_level; @@ -1270,7 +1273,8 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, struct hci_conn *acl; struct hci_conn *sco; - acl = hci_connect_acl(hdev, dst, BT_SECURITY_LOW, HCI_AT_NO_BONDING); + acl = hci_connect_acl(hdev, dst, BT_SECURITY_LOW, HCI_AT_NO_BONDING, + CONN_REASON_SCO_CONNECT); if (IS_ERR(acl)) return acl; @@ -1323,6 +1327,23 @@ int hci_conn_check_link_mode(struct hci_conn *conn) return 0; } + /* AES encryption is required for Level 4: + * + * BLUETOOTH CORE SPECIFICATION Version 5.2 | Vol 3, Part C + * page 1319: + * + * 128-bit equivalent strength for link and encryption keys + * required using FIPS approved algorithms (E0 not allowed, + * SAFER+ not allowed, and P-192 not allowed; encryption key + * not shortened) + */ + if (conn->sec_level == BT_SECURITY_FIPS && + !test_bit(HCI_CONN_AES_CCM, &conn->flags)) { + bt_dev_err(conn->hdev, + "Invalid security: Missing AES-CCM usage"); + return 0; + } + if (hci_conn_ssp_enabled(conn) && !test_bit(HCI_CONN_ENCRYPT, &conn->flags)) return 0; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index dbe2d79f233f..68bfe57b6625 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -26,7 +26,6 @@ /* Bluetooth HCI core. */ #include <linux/export.h> -#include <linux/idr.h> #include <linux/rfkill.h> #include <linux/debugfs.h> #include <linux/crypto.h> @@ -606,7 +605,8 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) if (hdev->commands[8] & 0x01) hci_req_add(req, HCI_OP_READ_PAGE_SCAN_ACTIVITY, 0, NULL); - if (hdev->commands[18] & 0x04) + if (hdev->commands[18] & 0x04 && + !test_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks)) hci_req_add(req, HCI_OP_READ_DEF_ERR_DATA_REPORTING, 0, NULL); /* Some older Broadcom based Bluetooth 1.2 controllers do not @@ -763,6 +763,14 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) hci_req_add(req, HCI_OP_LE_CLEAR_RESOLV_LIST, 0, NULL); } + if (hdev->commands[35] & 0x40) { + __le16 rpa_timeout = cpu_to_le16(hdev->rpa_timeout); + + /* Set RPA timeout */ + hci_req_add(req, HCI_OP_LE_SET_RPA_TIMEOUT, 2, + &rpa_timeout); + } + if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) { /* Read LE Maximum Data Length */ hci_req_add(req, HCI_OP_LE_READ_MAX_DATA_LEN, 0, NULL); @@ -851,7 +859,8 @@ static int hci_init4_req(struct hci_request *req, unsigned long opt) /* Set erroneous data reporting if supported to the wideband speech * setting value */ - if (hdev->commands[18] & 0x08) { + if (hdev->commands[18] & 0x08 && + !test_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks)) { bool enabled = hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED); @@ -2982,7 +2991,7 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, adv_instance->remaining_time = timeout; if (duration == 0) - adv_instance->duration = HCI_DEFAULT_ADV_DURATION; + adv_instance->duration = hdev->def_multi_adv_rotation_duration; else adv_instance->duration = duration; @@ -2996,6 +3005,94 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, return 0; } +/* This function requires the caller holds hdev->lock */ +void hci_adv_monitors_clear(struct hci_dev *hdev) +{ + struct adv_monitor *monitor; + int handle; + + idr_for_each_entry(&hdev->adv_monitors_idr, monitor, handle) + hci_free_adv_monitor(monitor); + + idr_destroy(&hdev->adv_monitors_idr); +} + +void hci_free_adv_monitor(struct adv_monitor *monitor) +{ + struct adv_pattern *pattern; + struct adv_pattern *tmp; + + if (!monitor) + return; + + list_for_each_entry_safe(pattern, tmp, &monitor->patterns, list) + kfree(pattern); + + kfree(monitor); +} + +/* This function requires the caller holds hdev->lock */ +int hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) +{ + int min, max, handle; + + if (!monitor) + return -EINVAL; + + min = HCI_MIN_ADV_MONITOR_HANDLE; + max = HCI_MIN_ADV_MONITOR_HANDLE + HCI_MAX_ADV_MONITOR_NUM_HANDLES; + handle = idr_alloc(&hdev->adv_monitors_idr, monitor, min, max, + GFP_KERNEL); + if (handle < 0) + return handle; + + hdev->adv_monitors_cnt++; + monitor->handle = handle; + + hci_update_background_scan(hdev); + + return 0; +} + +static int free_adv_monitor(int id, void *ptr, void *data) +{ + struct hci_dev *hdev = data; + struct adv_monitor *monitor = ptr; + + idr_remove(&hdev->adv_monitors_idr, monitor->handle); + hci_free_adv_monitor(monitor); + + return 0; +} + +/* This function requires the caller holds hdev->lock */ +int hci_remove_adv_monitor(struct hci_dev *hdev, u16 handle) +{ + struct adv_monitor *monitor; + + if (handle) { + monitor = idr_find(&hdev->adv_monitors_idr, handle); + if (!monitor) + return -ENOENT; + + idr_remove(&hdev->adv_monitors_idr, monitor->handle); + hci_free_adv_monitor(monitor); + } else { + /* Remove all monitors if handle is 0. */ + idr_for_each(&hdev->adv_monitors_idr, &free_adv_monitor, hdev); + } + + hci_update_background_scan(hdev); + + return 0; +} + +/* This function requires the caller holds hdev->lock */ +bool hci_is_adv_monitoring(struct hci_dev *hdev) +{ + return !idr_is_empty(&hdev->adv_monitors_idr); +} + struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *bdaddr_list, bdaddr_t *bdaddr, u8 type) { @@ -3023,6 +3120,20 @@ struct bdaddr_list_with_irk *hci_bdaddr_list_lookup_with_irk( return NULL; } +struct bdaddr_list_with_flags * +hci_bdaddr_list_lookup_with_flags(struct list_head *bdaddr_list, + bdaddr_t *bdaddr, u8 type) +{ + struct bdaddr_list_with_flags *b; + + list_for_each_entry(b, bdaddr_list, list) { + if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type) + return b; + } + + return NULL; +} + void hci_bdaddr_list_clear(struct list_head *bdaddr_list) { struct bdaddr_list *b, *n; @@ -3084,6 +3195,30 @@ int hci_bdaddr_list_add_with_irk(struct list_head *list, bdaddr_t *bdaddr, return 0; } +int hci_bdaddr_list_add_with_flags(struct list_head *list, bdaddr_t *bdaddr, + u8 type, u32 flags) +{ + struct bdaddr_list_with_flags *entry; + + if (!bacmp(bdaddr, BDADDR_ANY)) + return -EBADF; + + if (hci_bdaddr_list_lookup(list, bdaddr, type)) + return -EEXIST; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + bacpy(&entry->bdaddr, bdaddr); + entry->bdaddr_type = type; + entry->current_flags = flags; + + list_add(&entry->list, list); + + return 0; +} + int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list *entry; @@ -3123,6 +3258,26 @@ int hci_bdaddr_list_del_with_irk(struct list_head *list, bdaddr_t *bdaddr, return 0; } +int hci_bdaddr_list_del_with_flags(struct list_head *list, bdaddr_t *bdaddr, + u8 type) +{ + struct bdaddr_list_with_flags *entry; + + if (!bacmp(bdaddr, BDADDR_ANY)) { + hci_bdaddr_list_clear(list); + return 0; + } + + entry = hci_bdaddr_list_lookup_with_flags(list, bdaddr, type); + if (!entry) + return -ENOENT; + + list_del(&entry->list); + kfree(entry); + + return 0; +} + /* This function requires the caller holds hdev->lock */ struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) @@ -3145,6 +3300,15 @@ struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, { struct hci_conn_params *param; + switch (addr_type) { + case ADDR_LE_DEV_PUBLIC_RESOLVED: + addr_type = ADDR_LE_DEV_PUBLIC; + break; + case ADDR_LE_DEV_RANDOM_RESOLVED: + addr_type = ADDR_LE_DEV_RANDOM; + break; + } + list_for_each_entry(param, list, action) { if (bacmp(¶m->addr, addr) == 0 && param->addr_type == addr_type) @@ -3289,10 +3453,10 @@ static int hci_suspend_wait_event(struct hci_dev *hdev) WAKE_COND, SUSPEND_NOTIFIER_TIMEOUT); if (ret == 0) { - bt_dev_dbg(hdev, "Timed out waiting for suspend"); + bt_dev_err(hdev, "Timed out waiting for suspend events"); for (i = 0; i < __SUSPEND_NUM_TASKS; ++i) { if (test_bit(i, hdev->suspend_tasks)) - bt_dev_dbg(hdev, "Bit %d is set", i); + bt_dev_err(hdev, "Suspend timeout bit: %d", i); clear_bit(i, hdev->suspend_tasks); } @@ -3360,12 +3524,15 @@ static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action, ret = hci_change_suspend_state(hdev, BT_RUNNING); } - /* If suspend failed, restore it to running */ - if (ret && action == PM_SUSPEND_PREPARE) - hci_change_suspend_state(hdev, BT_RUNNING); - done: - return ret ? notifier_from_errno(-EBUSY) : NOTIFY_STOP; + /* We always allow suspend even if suspend preparation failed and + * attempt to recover in resume. + */ + if (ret) + bt_dev_err(hdev, "Suspend notifier action (%lu) failed: %d", + action, ret); + + return NOTIFY_DONE; } /* Alloc HCI device */ @@ -3397,6 +3564,12 @@ struct hci_dev *hci_alloc_dev(void) hdev->le_adv_max_interval = 0x0800; hdev->le_scan_interval = 0x0060; hdev->le_scan_window = 0x0030; + hdev->le_scan_int_suspend = 0x0400; + hdev->le_scan_window_suspend = 0x0012; + hdev->le_scan_int_discovery = DISCOV_LE_SCAN_INT; + hdev->le_scan_window_discovery = DISCOV_LE_SCAN_WIN; + hdev->le_scan_int_connect = 0x0060; + hdev->le_scan_window_connect = 0x0060; hdev->le_conn_min_interval = 0x0018; hdev->le_conn_max_interval = 0x0028; hdev->le_conn_latency = 0x0000; @@ -3412,6 +3585,8 @@ struct hci_dev *hci_alloc_dev(void) hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M; hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M; hdev->le_num_of_adv_sets = HCI_MAX_ADV_INSTANCES; + hdev->def_multi_adv_rotation_duration = HCI_DEFAULT_ADV_DURATION; + hdev->def_le_autoconnect_timeout = HCI_LE_AUTOCONN_TIMEOUT; hdev->rpa_timeout = HCI_DEFAULT_RPA_TIMEOUT; hdev->discov_interleaved_timeout = DISCOV_INTERLEAVED_TIMEOUT; @@ -3420,13 +3595,17 @@ struct hci_dev *hci_alloc_dev(void) hdev->auth_payload_timeout = DEFAULT_AUTH_PAYLOAD_TIMEOUT; hdev->min_enc_key_size = HCI_MIN_ENC_KEY_SIZE; + /* default 1.28 sec page scan */ + hdev->def_page_scan_type = PAGE_SCAN_TYPE_STANDARD; + hdev->def_page_scan_int = 0x0800; + hdev->def_page_scan_window = 0x0012; + mutex_init(&hdev->lock); mutex_init(&hdev->req_lock); INIT_LIST_HEAD(&hdev->mgmt_pending); INIT_LIST_HEAD(&hdev->blacklist); INIT_LIST_HEAD(&hdev->whitelist); - INIT_LIST_HEAD(&hdev->wakeable); INIT_LIST_HEAD(&hdev->uuids); INIT_LIST_HEAD(&hdev->link_keys); INIT_LIST_HEAD(&hdev->long_term_keys); @@ -3574,6 +3753,8 @@ int hci_register_dev(struct hci_dev *hdev) queue_work(hdev->req_workqueue, &hdev->power_on); + idr_init(&hdev->adv_monitors_idr); + return id; err_wqueue: @@ -3603,9 +3784,10 @@ void hci_unregister_dev(struct hci_dev *hdev) cancel_work_sync(&hdev->power_on); - hci_dev_do_close(hdev); - unregister_pm_notifier(&hdev->suspend_notifier); + cancel_work_sync(&hdev->suspend_prepare); + + hci_dev_do_close(hdev); if (!test_bit(HCI_INIT, &hdev->flags) && !hci_dev_test_flag(hdev, HCI_SETUP) && @@ -3644,6 +3826,7 @@ void hci_unregister_dev(struct hci_dev *hdev) hci_smp_irks_clear(hdev); hci_remote_oob_data_clear(hdev); hci_adv_instances_clear(hdev); + hci_adv_monitors_clear(hdev); hci_bdaddr_list_clear(&hdev->le_white_list); hci_bdaddr_list_clear(&hdev->le_resolv_list); hci_conn_params_clear_all(hdev); @@ -4551,6 +4734,7 @@ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb) if (conn) { /* Send to upper protocol */ + bt_cb(skb)->sco.pkt_status = flags & 0x03; sco_recv_scodata(conn, skb); return; } else { diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index af9d7f2ff8ba..4b7fc430793c 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2299,6 +2299,22 @@ static void cs_le_create_conn(struct hci_dev *hdev, bdaddr_t *peer_addr, if (!conn) return; + /* When using controller based address resolution, then the new + * address types 0x02 and 0x03 are used. These types need to be + * converted back into either public address or random address type + */ + if (use_ll_privacy(hdev) && + hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION)) { + switch (own_address_type) { + case ADDR_LE_DEV_PUBLIC_RESOLVED: + own_address_type = ADDR_LE_DEV_PUBLIC; + break; + case ADDR_LE_DEV_RANDOM_RESOLVED: + own_address_type = ADDR_LE_DEV_RANDOM; + break; + } + } + /* Store the initiator and responder address information which * is needed for SMP. These values will not change during the * lifetime of the connection. @@ -2520,7 +2536,7 @@ static void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb) BT_DBG("%s num_rsp %d", hdev->name, num_rsp); - if (!num_rsp) + if (!num_rsp || skb->len < num_rsp * sizeof(*info) + 1) return; if (hci_dev_test_flag(hdev, HCI_PERIODIC_INQ)) @@ -2700,10 +2716,10 @@ static void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb) */ if (hci_dev_test_flag(hdev, HCI_MGMT) && !hci_dev_test_flag(hdev, HCI_CONNECTABLE) && - !hci_bdaddr_list_lookup(&hdev->whitelist, &ev->bdaddr, - BDADDR_BREDR)) { - hci_reject_conn(hdev, &ev->bdaddr); - return; + !hci_bdaddr_list_lookup_with_flags(&hdev->whitelist, &ev->bdaddr, + BDADDR_BREDR)) { + hci_reject_conn(hdev, &ev->bdaddr); + return; } /* Connection accepted */ @@ -2828,7 +2844,7 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) case HCI_AUTO_CONN_LINK_LOSS: if (ev->reason != HCI_ERROR_CONNECTION_TIMEOUT) break; - /* Fall through */ + fallthrough; case HCI_AUTO_CONN_DIRECT: case HCI_AUTO_CONN_ALWAYS: @@ -3068,27 +3084,23 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); + /* Check link security requirements are met */ + if (!hci_conn_check_link_mode(conn)) + ev->status = HCI_ERROR_AUTH_FAILURE; + if (ev->status && conn->state == BT_CONNECTED) { if (ev->status == HCI_ERROR_PIN_OR_KEY_MISSING) set_bit(HCI_CONN_AUTH_FAILURE, &conn->flags); + /* Notify upper layers so they can cleanup before + * disconnecting. + */ + hci_encrypt_cfm(conn, ev->status); hci_disconnect(conn, HCI_ERROR_AUTH_FAILURE); hci_conn_drop(conn); goto unlock; } - /* In Secure Connections Only mode, do not allow any connections - * that are not encrypted with AES-CCM using a P-256 authenticated - * combination key. - */ - if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && - (!test_bit(HCI_CONN_AES_CCM, &conn->flags) || - conn->key_type != HCI_LK_AUTH_COMBINATION_P256)) { - hci_connect_cfm(conn, HCI_ERROR_AUTH_FAILURE); - hci_conn_drop(conn); - goto unlock; - } - /* Try reading the encryption key size for encrypted ACL links */ if (!ev->status && ev->encrypt && conn->type == ACL_LINK) { struct hci_cp_read_enc_key_size cp; @@ -4166,6 +4178,9 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, struct inquiry_info_with_rssi_and_pscan_mode *info; info = (void *) (skb->data + 1); + if (skb->len < num_rsp * sizeof(*info) + 1) + goto unlock; + for (; num_rsp; num_rsp--, info++) { u32 flags; @@ -4187,6 +4202,9 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, } else { struct inquiry_info_with_rssi *info = (void *) (skb->data + 1); + if (skb->len < num_rsp * sizeof(*info) + 1) + goto unlock; + for (; num_rsp; num_rsp--, info++) { u32 flags; @@ -4207,6 +4225,7 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, } } +unlock: hci_dev_unlock(hdev); } @@ -4327,7 +4346,7 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, if (hci_setup_sync(conn, conn->link->handle)) goto unlock; } - /* fall through */ + fallthrough; default: conn->state = BT_CLOSED; @@ -4382,7 +4401,7 @@ static void hci_extended_inquiry_result_evt(struct hci_dev *hdev, BT_DBG("%s num_rsp %d", hdev->name, num_rsp); - if (!num_rsp) + if (!num_rsp || skb->len < num_rsp * sizeof(*info) + 1) return; if (hci_dev_test_flag(hdev, HCI_PERIODIC_INQ)) @@ -5212,6 +5231,11 @@ static void hci_le_enh_conn_complete_evt(struct hci_dev *hdev, le16_to_cpu(ev->interval), le16_to_cpu(ev->latency), le16_to_cpu(ev->supervision_timeout)); + + if (use_ll_privacy(hdev) && + hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY) && + hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION)) + hci_req_disable_address_resolution(hdev); } static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, struct sk_buff *skb) @@ -5322,7 +5346,7 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, } conn = hci_connect_le(hdev, addr, addr_type, BT_SECURITY_LOW, - HCI_LE_AUTOCONN_TIMEOUT, HCI_ROLE_MASTER, + hdev->def_le_autoconnect_timeout, HCI_ROLE_MASTER, direct_rpa); if (!IS_ERR(conn)) { /* If HCI_AUTO_CONN_EXPLICIT is set, conn is already owned @@ -5456,14 +5480,15 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, /* Passive scanning shouldn't trigger any device found events, * except for devices marked as CONN_REPORT for which we do send - * device found events. + * device found events, or advertisement monitoring requested. */ if (hdev->le_scan_type == LE_SCAN_PASSIVE) { if (type == LE_ADV_DIRECT_IND) return; if (!hci_pend_le_action_lookup(&hdev->pend_le_reports, - bdaddr, bdaddr_type)) + bdaddr, bdaddr_type) && + idr_is_empty(&hdev->adv_monitors_idr)) return; if (type == LE_ADV_NONCONN_IND || type == LE_ADV_SCAN_IND) diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 1fc55685da62..e0269192f2e5 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -34,9 +34,6 @@ #define HCI_REQ_PEND 1 #define HCI_REQ_CANCELED 2 -#define LE_SUSPEND_SCAN_WINDOW 0x0012 -#define LE_SUSPEND_SCAN_INTERVAL 0x0400 - void hci_req_init(struct hci_request *req, struct hci_dev *hdev) { skb_queue_head_init(&req->cmd_q); @@ -366,13 +363,11 @@ void __hci_req_write_fast_connectable(struct hci_request *req, bool enable) /* 160 msec page scan interval */ acp.interval = cpu_to_le16(0x0100); } else { - type = PAGE_SCAN_TYPE_STANDARD; /* default */ - - /* default 1.28 sec page scan */ - acp.interval = cpu_to_le16(0x0800); + type = hdev->def_page_scan_type; + acp.interval = cpu_to_le16(hdev->def_page_scan_int); } - acp.window = cpu_to_le16(0x0012); + acp.window = cpu_to_le16(hdev->def_page_scan_window); if (__cpu_to_le16(hdev->page_scan_interval) != acp.interval || __cpu_to_le16(hdev->page_scan_window) != acp.window) @@ -418,18 +413,22 @@ static void __hci_update_background_scan(struct hci_request *req) */ hci_discovery_filter_clear(hdev); + BT_DBG("%s ADV monitoring is %s", hdev->name, + hci_is_adv_monitoring(hdev) ? "on" : "off"); + if (list_empty(&hdev->pend_le_conns) && - list_empty(&hdev->pend_le_reports)) { + list_empty(&hdev->pend_le_reports) && + !hci_is_adv_monitoring(hdev)) { /* If there is no pending LE connections or devices - * to be scanned for, we should stop the background - * scanning. + * to be scanned for or no ADV monitors, we should stop the + * background scanning. */ /* If controller is not scanning we are done. */ if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) return; - hci_req_add_le_scan_disable(req); + hci_req_add_le_scan_disable(req, false); BT_DBG("%s stopping background scanning", hdev->name); } else { @@ -448,7 +447,7 @@ static void __hci_update_background_scan(struct hci_request *req) * don't miss any advertising (due to duplicates filter). */ if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) - hci_req_add_le_scan_disable(req); + hci_req_add_le_scan_disable(req, false); hci_req_add_le_passive_scan(req); @@ -653,7 +652,7 @@ void __hci_req_update_eir(struct hci_request *req) hci_req_add(req, HCI_OP_WRITE_EIR, sizeof(cp), &cp); } -void hci_req_add_le_scan_disable(struct hci_request *req) +void hci_req_add_le_scan_disable(struct hci_request *req, bool rpa_le_conn) { struct hci_dev *hdev = req->hdev; @@ -676,6 +675,15 @@ void hci_req_add_le_scan_disable(struct hci_request *req) cp.enable = LE_SCAN_DISABLE; hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp); } + + /* Disable address resolution */ + if (use_ll_privacy(hdev) && + hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY) && + hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION) && !rpa_le_conn) { + __u8 enable = 0x00; + + hci_req_add(req, HCI_OP_LE_SET_ADDR_RESOLV_ENABLE, 1, &enable); + } } static void del_from_white_list(struct hci_request *req, bdaddr_t *bdaddr, @@ -689,6 +697,21 @@ static void del_from_white_list(struct hci_request *req, bdaddr_t *bdaddr, bt_dev_dbg(req->hdev, "Remove %pMR (0x%x) from whitelist", &cp.bdaddr, cp.bdaddr_type); hci_req_add(req, HCI_OP_LE_DEL_FROM_WHITE_LIST, sizeof(cp), &cp); + + if (use_ll_privacy(req->hdev)) { + struct smp_irk *irk; + + irk = hci_find_irk_by_addr(req->hdev, bdaddr, bdaddr_type); + if (irk) { + struct hci_cp_le_del_from_resolv_list cp; + + cp.bdaddr_type = bdaddr_type; + bacpy(&cp.bdaddr, bdaddr); + + hci_req_add(req, HCI_OP_LE_DEL_FROM_RESOLV_LIST, + sizeof(cp), &cp); + } + } } /* Adds connection to white list if needed. On error, returns -1. */ @@ -709,13 +732,14 @@ static int add_to_white_list(struct hci_request *req, return -1; /* White list can not be used with RPAs */ - if (!allow_rpa && + if (!allow_rpa && !use_ll_privacy(hdev) && hci_find_irk_by_addr(hdev, ¶ms->addr, params->addr_type)) { return -1; } /* During suspend, only wakeable devices can be in whitelist */ - if (hdev->suspended && !params->wakeable) + if (hdev->suspended && !hci_conn_test_flag(HCI_CONN_FLAG_REMOTE_WAKEUP, + params->current_flags)) return 0; *num_entries += 1; @@ -726,6 +750,28 @@ static int add_to_white_list(struct hci_request *req, cp.bdaddr_type); hci_req_add(req, HCI_OP_LE_ADD_TO_WHITE_LIST, sizeof(cp), &cp); + if (use_ll_privacy(hdev)) { + struct smp_irk *irk; + + irk = hci_find_irk_by_addr(hdev, ¶ms->addr, + params->addr_type); + if (irk) { + struct hci_cp_le_add_to_resolv_list cp; + + cp.bdaddr_type = params->addr_type; + bacpy(&cp.bdaddr, ¶ms->addr); + memcpy(cp.peer_irk, irk->val, 16); + + if (hci_dev_test_flag(hdev, HCI_PRIVACY)) + memcpy(cp.local_irk, hdev->irk, 16); + else + memset(cp.local_irk, 0, 16); + + hci_req_add(req, HCI_OP_LE_ADD_TO_RESOLV_LIST, + sizeof(cp), &cp); + } + } + return 0; } @@ -766,7 +812,7 @@ static u8 update_white_list(struct hci_request *req) } /* White list can not be used with RPAs */ - if (!allow_rpa && + if (!allow_rpa && !use_ll_privacy(hdev) && hci_find_irk_by_addr(hdev, &b->bdaddr, b->bdaddr_type)) { return 0x00; } @@ -798,6 +844,14 @@ static u8 update_white_list(struct hci_request *req) return 0x00; } + /* Once the controller offloading of advertisement monitor is in place, + * the if condition should include the support of MSFT extension + * support. If suspend is ongoing, whitelist should be the default to + * prevent waking by random advertisements. + */ + if (!idr_is_empty(&hdev->adv_monitors_idr) && !hdev->suspended) + return 0x00; + /* Select filter policy to use white list */ return 0x01; } @@ -808,10 +862,24 @@ static bool scan_use_rpa(struct hci_dev *hdev) } static void hci_req_start_scan(struct hci_request *req, u8 type, u16 interval, - u16 window, u8 own_addr_type, u8 filter_policy) + u16 window, u8 own_addr_type, u8 filter_policy, + bool addr_resolv) { struct hci_dev *hdev = req->hdev; + if (hdev->scanning_paused) { + bt_dev_dbg(hdev, "Scanning is paused for suspend"); + return; + } + + if (use_ll_privacy(hdev) && + hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY) && + addr_resolv) { + u8 enable = 0x01; + + hci_req_add(req, HCI_OP_LE_SET_ADDR_RESOLV_ENABLE, 1, &enable); + } + /* Use ext scanning if set ext scan param and ext scan enable is * supported */ @@ -885,12 +953,39 @@ static void hci_req_start_scan(struct hci_request *req, u8 type, u16 interval, } } +/* Returns true if an le connection is in the scanning state */ +static inline bool hci_is_le_conn_scanning(struct hci_dev *hdev) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type == LE_LINK && c->state == BT_CONNECT && + test_bit(HCI_CONN_SCANNING, &c->flags)) { + rcu_read_unlock(); + return true; + } + } + + rcu_read_unlock(); + + return false; +} + +/* Ensure to call hci_req_add_le_scan_disable() first to disable the + * controller based address resolution to be able to reconfigure + * resolving list. + */ void hci_req_add_le_passive_scan(struct hci_request *req) { struct hci_dev *hdev = req->hdev; u8 own_addr_type; u8 filter_policy; u16 window, interval; + /* Background scanning should run with address resolution */ + bool addr_resolv = true; if (hdev->scanning_paused) { bt_dev_dbg(hdev, "Scanning is paused for suspend"); @@ -927,8 +1022,11 @@ void hci_req_add_le_passive_scan(struct hci_request *req) filter_policy |= 0x02; if (hdev->suspended) { - window = LE_SUSPEND_SCAN_WINDOW; - interval = LE_SUSPEND_SCAN_INTERVAL; + window = hdev->le_scan_window_suspend; + interval = hdev->le_scan_int_suspend; + } else if (hci_is_le_conn_scanning(hdev)) { + window = hdev->le_scan_window_connect; + interval = hdev->le_scan_int_connect; } else { window = hdev->le_scan_window; interval = hdev->le_scan_interval; @@ -936,7 +1034,7 @@ void hci_req_add_le_passive_scan(struct hci_request *req) bt_dev_dbg(hdev, "LE passive scan with whitelist = %d", filter_policy); hci_req_start_scan(req, LE_SCAN_PASSIVE, interval, window, - own_addr_type, filter_policy); + own_addr_type, filter_policy, addr_resolv); } static u8 get_adv_instance_scan_rsp_len(struct hci_dev *hdev, u8 instance) @@ -973,15 +1071,19 @@ static void hci_req_clear_event_filter(struct hci_request *req) static void hci_req_set_event_filter(struct hci_request *req) { - struct bdaddr_list *b; + struct bdaddr_list_with_flags *b; struct hci_cp_set_event_filter f; struct hci_dev *hdev = req->hdev; - u8 scan; + u8 scan = SCAN_DISABLED; /* Always clear event filter when starting */ hci_req_clear_event_filter(req); - list_for_each_entry(b, &hdev->wakeable, list) { + list_for_each_entry(b, &hdev->whitelist, list) { + if (!hci_conn_test_flag(HCI_CONN_FLAG_REMOTE_WAKEUP, + b->current_flags)) + continue; + memset(&f, 0, sizeof(f)); bacpy(&f.addr_conn_flt.bdaddr, &b->bdaddr); f.flt_type = HCI_FLT_CONN_SETUP; @@ -990,16 +1092,17 @@ static void hci_req_set_event_filter(struct hci_request *req) bt_dev_dbg(hdev, "Adding event filters for %pMR", &b->bdaddr); hci_req_add(req, HCI_OP_SET_EVENT_FLT, sizeof(f), &f); + scan = SCAN_PAGE; } - scan = !list_empty(&hdev->wakeable) ? SCAN_PAGE : SCAN_DISABLED; hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); } static void hci_req_config_le_suspend_scan(struct hci_request *req) { - /* Can't change params without disabling first */ - hci_req_add_le_scan_disable(req); + /* Before changing params disable scan if enabled */ + if (hci_dev_test_flag(req->hdev, HCI_LE_SCAN)) + hci_req_add_le_scan_disable(req, false); /* Configure params and enable scanning */ hci_req_add_le_passive_scan(req); @@ -1065,8 +1168,9 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next) page_scan = SCAN_DISABLED; hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &page_scan); - /* Disable LE passive scan */ - hci_req_add_le_scan_disable(&req); + /* Disable LE passive scan if enabled */ + if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) + hci_req_add_le_scan_disable(&req, false); /* Mark task needing completion */ set_bit(SUSPEND_SCAN_DISABLE, hdev->suspend_tasks); @@ -1160,13 +1264,8 @@ static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev) void __hci_req_disable_advertising(struct hci_request *req) { if (ext_adv_capable(req->hdev)) { - struct hci_cp_le_set_ext_adv_enable cp; - - cp.enable = 0x00; - /* Disable all sets since we only support one set at the moment */ - cp.num_of_sets = 0x00; + __hci_req_disable_ext_adv_instance(req, 0x00); - hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_ENABLE, sizeof(cp), &cp); } else { u8 enable = 0x00; @@ -1627,6 +1726,28 @@ int hci_req_update_adv_data(struct hci_dev *hdev, u8 instance) return hci_req_run(&req, NULL); } +static void enable_addr_resolution_complete(struct hci_dev *hdev, u8 status, + u16 opcode) +{ + BT_DBG("%s status %u", hdev->name, status); +} + +void hci_req_disable_address_resolution(struct hci_dev *hdev) +{ + struct hci_request req; + __u8 enable = 0x00; + + if (!use_ll_privacy(hdev) && + !hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION)) + return; + + hci_req_init(&req, hdev); + + hci_req_add(&req, HCI_OP_LE_SET_ADDR_RESOLV_ENABLE, 1, &enable); + + hci_req_run(&req, enable_addr_resolution_complete); +} + static void adv_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode) { BT_DBG("%s status %u", hdev->name, status); @@ -1786,8 +1907,6 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) int err; struct adv_info *adv_instance; bool secondary_adv; - /* In ext adv set param interval is 3 octets */ - const u8 adv_interval[3] = { 0x00, 0x08, 0x00 }; if (instance > 0) { adv_instance = hci_find_adv_instance(hdev, instance); @@ -1820,8 +1939,9 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) memset(&cp, 0, sizeof(cp)); - memcpy(cp.min_interval, adv_interval, sizeof(cp.min_interval)); - memcpy(cp.max_interval, adv_interval, sizeof(cp.max_interval)); + /* In ext adv set param interval is 3 octets */ + hci_cpu_to_le24(hdev->le_adv_min_interval, cp.min_interval); + hci_cpu_to_le24(hdev->le_adv_max_interval, cp.max_interval); secondary_adv = (flags & MGMT_ADV_FLAG_SEC_MASK); @@ -1932,13 +2052,59 @@ int __hci_req_enable_ext_advertising(struct hci_request *req, u8 instance) return 0; } +int __hci_req_disable_ext_adv_instance(struct hci_request *req, u8 instance) +{ + struct hci_dev *hdev = req->hdev; + struct hci_cp_le_set_ext_adv_enable *cp; + struct hci_cp_ext_adv_set *adv_set; + u8 data[sizeof(*cp) + sizeof(*adv_set) * 1]; + u8 req_size; + + /* If request specifies an instance that doesn't exist, fail */ + if (instance > 0 && !hci_find_adv_instance(hdev, instance)) + return -EINVAL; + + memset(data, 0, sizeof(data)); + + cp = (void *)data; + adv_set = (void *)cp->data; + + /* Instance 0x00 indicates all advertising instances will be disabled */ + cp->num_of_sets = !!instance; + cp->enable = 0x00; + + adv_set->handle = instance; + + req_size = sizeof(*cp) + sizeof(*adv_set) * cp->num_of_sets; + hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_ENABLE, req_size, data); + + return 0; +} + +int __hci_req_remove_ext_adv_instance(struct hci_request *req, u8 instance) +{ + struct hci_dev *hdev = req->hdev; + + /* If request specifies an instance that doesn't exist, fail */ + if (instance > 0 && !hci_find_adv_instance(hdev, instance)) + return -EINVAL; + + hci_req_add(req, HCI_OP_LE_REMOVE_ADV_SET, sizeof(instance), &instance); + + return 0; +} + int __hci_req_start_ext_adv(struct hci_request *req, u8 instance) { struct hci_dev *hdev = req->hdev; + struct adv_info *adv_instance = hci_find_adv_instance(hdev, instance); int err; - if (hci_dev_test_flag(hdev, HCI_LE_ADV)) - __hci_req_disable_advertising(req); + /* If instance isn't pending, the chip knows about it, and it's safe to + * disable + */ + if (adv_instance && !adv_instance->pending) + __hci_req_disable_ext_adv_instance(req, instance); err = __hci_req_setup_ext_adv_instance(req, instance); if (err < 0) @@ -2086,7 +2252,7 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk, hci_dev_test_flag(hdev, HCI_ADVERTISING)) return; - if (next_instance) + if (next_instance && !ext_adv_capable(hdev)) __hci_req_schedule_adv_instance(req, next_instance->instance, false); } @@ -2128,7 +2294,13 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy, if (use_rpa) { int to; - *own_addr_type = ADDR_LE_DEV_RANDOM; + /* If Controller supports LL Privacy use own address type is + * 0x03 + */ + if (use_ll_privacy(hdev)) + *own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED; + else + *own_addr_type = ADDR_LE_DEV_RANDOM; if (!hci_dev_test_and_clear_flag(hdev, HCI_RPA_EXPIRED) && !bacmp(&hdev->random_addr, &hdev->rpa)) @@ -2547,7 +2719,7 @@ static void bg_scan_update(struct work_struct *work) static int le_scan_disable(struct hci_request *req, unsigned long opt) { - hci_req_add_le_scan_disable(req); + hci_req_add_le_scan_disable(req, false); return 0; } @@ -2645,7 +2817,12 @@ static int le_scan_restart(struct hci_request *req, unsigned long opt) if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) return 0; - hci_req_add_le_scan_disable(req); + if (hdev->scanning_paused) { + bt_dev_dbg(hdev, "Scanning is paused for suspend"); + return 0; + } + + hci_req_add_le_scan_disable(req, false); if (use_ext_scan(hdev)) { struct hci_cp_le_set_ext_scan_enable ext_enable_cp; @@ -2725,6 +2902,8 @@ static int active_scan(struct hci_request *req, unsigned long opt) u8 own_addr_type; /* White list is not used for discovery */ u8 filter_policy = 0x00; + /* Discovery doesn't require controller address resolution */ + bool addr_resolv = false; int err; BT_DBG("%s", hdev->name); @@ -2734,7 +2913,7 @@ static int active_scan(struct hci_request *req, unsigned long opt) * discovery scanning parameters. */ if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) - hci_req_add_le_scan_disable(req); + hci_req_add_le_scan_disable(req, false); /* All active scans will be done with either a resolvable private * address (when privacy feature has been enabled) or non-resolvable @@ -2745,8 +2924,9 @@ static int active_scan(struct hci_request *req, unsigned long opt) if (err < 0) own_addr_type = ADDR_LE_DEV_PUBLIC; - hci_req_start_scan(req, LE_SCAN_ACTIVE, interval, DISCOV_LE_SCAN_WIN, - own_addr_type, filter_policy); + hci_req_start_scan(req, LE_SCAN_ACTIVE, interval, + hdev->le_scan_window_discovery, own_addr_type, + filter_policy, addr_resolv); return 0; } @@ -2793,18 +2973,18 @@ static void start_discovery(struct hci_dev *hdev, u8 *status) * to do BR/EDR inquiry. */ hci_req_sync(hdev, interleaved_discov, - DISCOV_LE_SCAN_INT * 2, HCI_CMD_TIMEOUT, + hdev->le_scan_int_discovery * 2, HCI_CMD_TIMEOUT, status); break; } timeout = msecs_to_jiffies(hdev->discov_interleaved_timeout); - hci_req_sync(hdev, active_scan, DISCOV_LE_SCAN_INT, + hci_req_sync(hdev, active_scan, hdev->le_scan_int_discovery, HCI_CMD_TIMEOUT, status); break; case DISCOV_TYPE_LE: timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT); - hci_req_sync(hdev, active_scan, DISCOV_LE_SCAN_INT, + hci_req_sync(hdev, active_scan, hdev->le_scan_int_discovery, HCI_CMD_TIMEOUT, status); break; default: @@ -2848,14 +3028,14 @@ bool hci_req_stop_discovery(struct hci_request *req) if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { cancel_delayed_work(&hdev->le_scan_disable); - hci_req_add_le_scan_disable(req); + hci_req_add_le_scan_disable(req, false); } ret = true; } else { /* Passive scanning */ if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { - hci_req_add_le_scan_disable(req); + hci_req_add_le_scan_disable(req, false); ret = true; } } diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index 0e81614d235e..6a12e84c66c4 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -65,11 +65,12 @@ void __hci_req_write_fast_connectable(struct hci_request *req, bool enable); void __hci_req_update_name(struct hci_request *req); void __hci_req_update_eir(struct hci_request *req); -void hci_req_add_le_scan_disable(struct hci_request *req); +void hci_req_add_le_scan_disable(struct hci_request *req, bool rpa_le_conn); void hci_req_add_le_passive_scan(struct hci_request *req); void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next); +void hci_req_disable_address_resolution(struct hci_dev *hdev); void hci_req_reenable_advertising(struct hci_dev *hdev); void __hci_req_enable_advertising(struct hci_request *req); void __hci_req_disable_advertising(struct hci_request *req); @@ -86,6 +87,8 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk, int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance); int __hci_req_start_ext_adv(struct hci_request *req, u8 instance); int __hci_req_enable_ext_advertising(struct hci_request *req, u8 instance); +int __hci_req_disable_ext_adv_instance(struct hci_request *req, u8 instance); +int __hci_req_remove_ext_adv_instance(struct hci_request *req, u8 instance); void __hci_req_clear_ext_adv_sets(struct hci_request *req); int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, bool use_rpa, struct adv_info *adv_instance, diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index caf38a8ea6a8..251b9128f530 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -52,7 +52,7 @@ struct hci_pinfo { struct bt_sock bt; struct hci_dev *hdev; struct hci_filter filter; - __u32 cmsg_mask; + __u8 cmsg_mask; unsigned short channel; unsigned long flags; __u32 cookie; @@ -443,8 +443,7 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) case HCI_DEV_SETUP: if (hdev->manufacturer == 0xffff) return NULL; - - /* fall through */ + fallthrough; case HCI_DEV_UP: skb = bt_skb_alloc(HCI_MON_INDEX_INFO_SIZE, GFP_ATOMIC); @@ -1399,7 +1398,7 @@ done: static void hci_sock_cmsg(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { - __u32 mask = hci_pi(sk)->cmsg_mask; + __u8 mask = hci_pi(sk)->cmsg_mask; if (mask & HCI_CMSG_DIR) { int incoming = bt_cb(skb)->incoming; @@ -1842,7 +1841,7 @@ drop: } static int hci_sock_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int len) + sockptr_t optval, unsigned int len) { struct hci_ufilter uf = { .opcode = 0 }; struct sock *sk = sock->sk; @@ -1862,7 +1861,7 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname, switch (optname) { case HCI_DATA_DIR: - if (get_user(opt, (int __user *)optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(opt))) { err = -EFAULT; break; } @@ -1874,7 +1873,7 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname, break; case HCI_TIME_STAMP: - if (get_user(opt, (int __user *)optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(opt))) { err = -EFAULT; break; } @@ -1896,7 +1895,7 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname, } len = min_t(unsigned int, len, sizeof(uf)); - if (copy_from_user(&uf, optval, len)) { + if (copy_from_sockptr(&uf, optval, len)) { err = -EFAULT; break; } diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index 03be6a4baef3..595fb3c9d6c3 100644 --- a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -233,8 +233,6 @@ static const struct proto_ops hidp_sock_ops = { .recvmsg = sock_no_recvmsg, .listen = sock_no_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index fe913a5c754a..ade83e224567 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -666,8 +666,7 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err) l2cap_seq_list_free(&chan->srej_list); l2cap_seq_list_free(&chan->retrans_list); - - /* fall through */ + fallthrough; case L2CAP_MODE_STREAMING: skb_queue_purge(&chan->tx_q); @@ -872,7 +871,8 @@ static inline u8 l2cap_get_auth_type(struct l2cap_chan *chan) else return HCI_AT_NO_BONDING; } - /* fall through */ + fallthrough; + default: switch (chan->sec_level) { case BT_SECURITY_HIGH: @@ -2983,8 +2983,7 @@ static void l2cap_tx_state_wait_f(struct l2cap_chan *chan, break; case L2CAP_EV_RECV_REQSEQ_AND_FBIT: l2cap_process_reqseq(chan, control->reqseq); - - /* Fall through */ + fallthrough; case L2CAP_EV_RECV_FBIT: if (control && control->final) { @@ -3311,7 +3310,7 @@ static inline __u8 l2cap_select_mode(__u8 mode, __u16 remote_feat_mask) case L2CAP_MODE_ERTM: if (l2cap_mode_supported(mode, remote_feat_mask)) return mode; - /* fall through */ + fallthrough; default: return L2CAP_MODE_BASIC; } @@ -3447,7 +3446,7 @@ static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data, size_t data if (__l2cap_efs_supported(chan->conn)) set_bit(FLAG_EFS_ENABLE, &chan->flags); - /* fall through */ + fallthrough; default: chan->mode = l2cap_select_mode(rfc.mode, chan->conn->feat_mask); break; @@ -4539,7 +4538,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, goto done; break; } - /* fall through */ + fallthrough; default: l2cap_chan_set_err(chan, ECONNRESET); @@ -7719,7 +7718,7 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) conn->mtu = hcon->hdev->le_mtu; break; } - /* fall through */ + fallthrough; default: conn->mtu = hcon->hdev->acl_mtu; break; @@ -7841,7 +7840,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, case L2CAP_MODE_STREAMING: if (!disable_ertm) break; - /* fall through */ + fallthrough; default: err = -EOPNOTSUPP; goto done; @@ -7893,11 +7892,13 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, else hcon = hci_connect_le_scan(hdev, dst, dst_type, chan->sec_level, - HCI_LE_CONN_TIMEOUT); + HCI_LE_CONN_TIMEOUT, + CONN_REASON_L2CAP_CHAN); } else { u8 auth_type = l2cap_get_auth_type(chan); - hcon = hci_connect_acl(hdev, dst, chan->sec_level, auth_type); + hcon = hci_connect_acl(hdev, dst, chan->sec_level, auth_type, + CONN_REASON_L2CAP_CHAN); } if (IS_ERR(hcon)) { diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index a995d2c51fa7..e1a3e66b1754 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -284,7 +284,7 @@ static int l2cap_sock_listen(struct socket *sock, int backlog) case L2CAP_MODE_STREAMING: if (!disable_ertm) break; - /* fall through */ + fallthrough; default: err = -EOPNOTSUPP; goto done; @@ -703,7 +703,7 @@ static bool l2cap_valid_mtu(struct l2cap_chan *chan, u16 mtu) } static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct l2cap_chan *chan = l2cap_pi(sk)->chan; @@ -736,7 +736,7 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, opts.txwin_size = chan->tx_win; len = min_t(unsigned int, sizeof(opts), optlen); - if (copy_from_user((char *) &opts, optval, len)) { + if (copy_from_sockptr(&opts, optval, len)) { err = -EFAULT; break; } @@ -760,7 +760,7 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, case L2CAP_MODE_STREAMING: if (!disable_ertm) break; - /* fall through */ + fallthrough; default: err = -EINVAL; break; @@ -782,7 +782,7 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, break; case L2CAP_LM: - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } @@ -859,7 +859,7 @@ static int l2cap_set_mode(struct l2cap_chan *chan, u8 mode) } static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct l2cap_chan *chan = l2cap_pi(sk)->chan; @@ -891,7 +891,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, sec.level = BT_SECURITY_LOW; len = min_t(unsigned int, sizeof(sec), optlen); - if (copy_from_user((char *) &sec, optval, len)) { + if (copy_from_sockptr(&sec, optval, len)) { err = -EFAULT; break; } @@ -939,7 +939,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } @@ -954,7 +954,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; case BT_FLUSHABLE: - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } @@ -990,7 +990,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, pwr.force_active = BT_POWER_FORCE_ACTIVE_ON; len = min_t(unsigned int, sizeof(pwr), optlen); - if (copy_from_user((char *) &pwr, optval, len)) { + if (copy_from_sockptr(&pwr, optval, len)) { err = -EFAULT; break; } @@ -1002,7 +1002,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; case BT_CHANNEL_POLICY: - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } @@ -1050,7 +1050,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (get_user(opt, (u16 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u16))) { err = -EFAULT; break; } @@ -1081,7 +1081,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (get_user(opt, (u8 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u8))) { err = -EFAULT; break; } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 9e8a3cccc6ca..5bbe71002fb9 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -36,9 +36,11 @@ #include "hci_request.h" #include "smp.h" #include "mgmt_util.h" +#include "mgmt_config.h" +#include "msft.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 17 +#define MGMT_REVISION 18 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, @@ -111,6 +113,15 @@ static const u16 mgmt_commands[] = { MGMT_OP_READ_SECURITY_INFO, MGMT_OP_READ_EXP_FEATURES_INFO, MGMT_OP_SET_EXP_FEATURE, + MGMT_OP_READ_DEF_SYSTEM_CONFIG, + MGMT_OP_SET_DEF_SYSTEM_CONFIG, + MGMT_OP_READ_DEF_RUNTIME_CONFIG, + MGMT_OP_SET_DEF_RUNTIME_CONFIG, + MGMT_OP_GET_DEVICE_FLAGS, + MGMT_OP_SET_DEVICE_FLAGS, + MGMT_OP_READ_ADV_MONITOR_FEATURES, + MGMT_OP_ADD_ADV_PATTERNS_MONITOR, + MGMT_OP_REMOVE_ADV_MONITOR, }; static const u16 mgmt_events[] = { @@ -151,6 +162,7 @@ static const u16 mgmt_events[] = { MGMT_EV_EXT_INFO_CHANGED, MGMT_EV_PHY_CONFIGURATION_CHANGED, MGMT_EV_EXP_FEATURE_CHANGED, + MGMT_EV_DEVICE_FLAGS_CHANGED, }; static const u16 mgmt_untrusted_commands[] = { @@ -162,6 +174,8 @@ static const u16 mgmt_untrusted_commands[] = { MGMT_OP_READ_EXT_INFO, MGMT_OP_READ_SECURITY_INFO, MGMT_OP_READ_EXP_FEATURES_INFO, + MGMT_OP_READ_DEF_SYSTEM_CONFIG, + MGMT_OP_READ_DEF_RUNTIME_CONFIG, }; static const u16 mgmt_untrusted_events[] = { @@ -177,6 +191,8 @@ static const u16 mgmt_untrusted_events[] = { MGMT_EV_EXT_INDEX_REMOVED, MGMT_EV_EXT_INFO_CHANGED, MGMT_EV_EXP_FEATURE_CHANGED, + MGMT_EV_ADV_MONITOR_ADDED, + MGMT_EV_ADV_MONITOR_REMOVED, }; #define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000) @@ -779,10 +795,15 @@ static u32 get_supported_settings(struct hci_dev *hdev) if (lmp_le_capable(hdev)) { settings |= MGMT_SETTING_LE; - settings |= MGMT_SETTING_ADVERTISING; settings |= MGMT_SETTING_SECURE_CONN; settings |= MGMT_SETTING_PRIVACY; settings |= MGMT_SETTING_STATIC_ADDRESS; + + /* When the experimental feature for LL Privacy support is + * enabled, then advertising is no longer supported. + */ + if (!hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) + settings |= MGMT_SETTING_ADVERTISING; } if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) || @@ -2915,7 +2936,7 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, if (cp->addr.type == BDADDR_BREDR) { conn = hci_connect_acl(hdev, &cp->addr.bdaddr, sec_level, - auth_type); + auth_type, CONN_REASON_PAIR_DEVICE); } else { u8 addr_type = le_addr_type(cp->addr.type); struct hci_conn_params *p; @@ -2934,9 +2955,9 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, if (p->auto_connect == HCI_AUTO_CONN_EXPLICIT) p->auto_connect = HCI_AUTO_CONN_DISABLED; - conn = hci_connect_le_scan(hdev, &cp->addr.bdaddr, - addr_type, sec_level, - HCI_LE_CONN_TIMEOUT); + conn = hci_connect_le_scan(hdev, &cp->addr.bdaddr, addr_type, + sec_level, HCI_LE_CONN_TIMEOUT, + CONN_REASON_PAIR_DEVICE); } if (IS_ERR(conn)) { @@ -3037,6 +3058,20 @@ static int cancel_pair_device(struct sock *sk, struct hci_dev *hdev, void *data, err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_CANCEL_PAIR_DEVICE, 0, addr, sizeof(*addr)); + + /* Since user doesn't want to proceed with the connection, abort any + * ongoing pairing and then terminate the link if it was created + * because of the pair device action. + */ + if (addr->type == BDADDR_BREDR) + hci_remove_link_key(hdev, &addr->bdaddr); + else + smp_cancel_and_remove_pairing(hdev, &addr->bdaddr, + le_addr_type(addr->type)); + + if (conn->conn_reason == CONN_REASON_PAIR_DEVICE) + hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM); + unlock: hci_dev_unlock(hdev); return err; @@ -3723,12 +3758,25 @@ static const u8 debug_uuid[16] = { }; #endif +/* 671b10b5-42c0-4696-9227-eb28d1b049d6 */ +static const u8 simult_central_periph_uuid[16] = { + 0xd6, 0x49, 0xb0, 0xd1, 0x28, 0xeb, 0x27, 0x92, + 0x96, 0x46, 0xc0, 0x42, 0xb5, 0x10, 0x1b, 0x67, +}; + +/* 15c0a148-c273-11ea-b3de-0242ac130004 */ +static const u8 rpa_resolution_uuid[16] = { + 0x04, 0x00, 0x13, 0xac, 0x42, 0x02, 0xde, 0xb3, + 0xea, 0x11, 0x73, 0xc2, 0x48, 0xa1, 0xc0, 0x15, +}; + static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { - char buf[42]; + char buf[62]; /* Enough space for 3 features */ struct mgmt_rp_read_exp_features_info *rp = (void *)buf; u16 idx = 0; + u32 flags; bt_dev_dbg(hdev, "sock %p", sk); @@ -3736,7 +3784,7 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, #ifdef CONFIG_BT_FEATURE_DEBUG if (!hdev) { - u32 flags = bt_dbg_get() ? BIT(0) : 0; + flags = bt_dbg_get() ? BIT(0) : 0; memcpy(rp->features[idx].uuid, debug_uuid, 16); rp->features[idx].flags = cpu_to_le32(flags); @@ -3744,6 +3792,31 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, } #endif + if (hdev) { + if (test_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks) && + (hdev->le_states[4] & 0x08) && /* Central */ + (hdev->le_states[4] & 0x40) && /* Peripheral */ + (hdev->le_states[3] & 0x10)) /* Simultaneous */ + flags = BIT(0); + else + flags = 0; + + memcpy(rp->features[idx].uuid, simult_central_periph_uuid, 16); + rp->features[idx].flags = cpu_to_le32(flags); + idx++; + } + + if (hdev && use_ll_privacy(hdev)) { + if (hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) + flags = BIT(0) | BIT(1); + else + flags = BIT(1); + + memcpy(rp->features[idx].uuid, rpa_resolution_uuid, 16); + rp->features[idx].flags = cpu_to_le32(flags); + idx++; + } + rp->feature_count = cpu_to_le16(idx); /* After reading the experimental features information, enable @@ -3756,6 +3829,21 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, 0, rp, sizeof(*rp) + (20 * idx)); } +static int exp_ll_privacy_feature_changed(bool enabled, struct hci_dev *hdev, + struct sock *skip) +{ + struct mgmt_ev_exp_feature_changed ev; + + memset(&ev, 0, sizeof(ev)); + memcpy(ev.uuid, rpa_resolution_uuid, 16); + ev.flags = cpu_to_le32((enabled ? BIT(0) : 0) | BIT(1)); + + return mgmt_limited_event(MGMT_EV_EXP_FEATURE_CHANGED, hdev, + &ev, sizeof(ev), + HCI_MGMT_EXP_FEATURE_EVENTS, skip); + +} + #ifdef CONFIG_BT_FEATURE_DEBUG static int exp_debug_feature_changed(bool enabled, struct sock *skip) { @@ -3794,6 +3882,16 @@ static int set_exp_feature(struct sock *sk, struct hci_dev *hdev, } #endif + if (hdev && use_ll_privacy(hdev) && !hdev_is_powered(hdev)) { + bool changed = hci_dev_test_flag(hdev, + HCI_ENABLE_LL_PRIVACY); + + hci_dev_clear_flag(hdev, HCI_ENABLE_LL_PRIVACY); + + if (changed) + exp_ll_privacy_feature_changed(false, hdev, sk); + } + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); return mgmt_cmd_complete(sk, hdev ? hdev->id : MGMT_INDEX_NONE, @@ -3844,11 +3942,401 @@ static int set_exp_feature(struct sock *sk, struct hci_dev *hdev, } #endif + if (!memcmp(cp->uuid, rpa_resolution_uuid, 16)) { + bool val, changed; + int err; + u32 flags; + + /* Command requires to use the controller index */ + if (!hdev) + return mgmt_cmd_status(sk, MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_INDEX); + + /* Changes can only be made when controller is powered down */ + if (hdev_is_powered(hdev)) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_NOT_POWERED); + + /* Parameters are limited to a single octet */ + if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); + + /* Only boolean on/off is supported */ + if (cp->param[0] != 0x00 && cp->param[0] != 0x01) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); + + val = !!cp->param[0]; + + if (val) { + changed = !hci_dev_test_flag(hdev, + HCI_ENABLE_LL_PRIVACY); + hci_dev_set_flag(hdev, HCI_ENABLE_LL_PRIVACY); + hci_dev_clear_flag(hdev, HCI_ADVERTISING); + + /* Enable LL privacy + supported settings changed */ + flags = BIT(0) | BIT(1); + } else { + changed = hci_dev_test_flag(hdev, + HCI_ENABLE_LL_PRIVACY); + hci_dev_clear_flag(hdev, HCI_ENABLE_LL_PRIVACY); + + /* Disable LL privacy + supported settings changed */ + flags = BIT(1); + } + + memcpy(rp.uuid, rpa_resolution_uuid, 16); + rp.flags = cpu_to_le32(flags); + + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); + + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, 0, + &rp, sizeof(rp)); + + if (changed) + exp_ll_privacy_feature_changed(val, hdev, sk); + + return err; + } + return mgmt_cmd_status(sk, hdev ? hdev->id : MGMT_INDEX_NONE, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_NOT_SUPPORTED); } +#define SUPPORTED_DEVICE_FLAGS() ((1U << HCI_CONN_FLAG_MAX) - 1) + +static int get_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, + u16 data_len) +{ + struct mgmt_cp_get_device_flags *cp = data; + struct mgmt_rp_get_device_flags rp; + struct bdaddr_list_with_flags *br_params; + struct hci_conn_params *params; + u32 supported_flags = SUPPORTED_DEVICE_FLAGS(); + u32 current_flags = 0; + u8 status = MGMT_STATUS_INVALID_PARAMS; + + bt_dev_dbg(hdev, "Get device flags %pMR (type 0x%x)\n", + &cp->addr.bdaddr, cp->addr.type); + + hci_dev_lock(hdev); + + if (cp->addr.type == BDADDR_BREDR) { + br_params = hci_bdaddr_list_lookup_with_flags(&hdev->whitelist, + &cp->addr.bdaddr, + cp->addr.type); + if (!br_params) + goto done; + + current_flags = br_params->current_flags; + } else { + params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, + le_addr_type(cp->addr.type)); + + if (!params) + goto done; + + current_flags = params->current_flags; + } + + bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); + rp.addr.type = cp->addr.type; + rp.supported_flags = cpu_to_le32(supported_flags); + rp.current_flags = cpu_to_le32(current_flags); + + status = MGMT_STATUS_SUCCESS; + +done: + hci_dev_unlock(hdev); + + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_DEVICE_FLAGS, status, + &rp, sizeof(rp)); +} + +static void device_flags_changed(struct sock *sk, struct hci_dev *hdev, + bdaddr_t *bdaddr, u8 bdaddr_type, + u32 supported_flags, u32 current_flags) +{ + struct mgmt_ev_device_flags_changed ev; + + bacpy(&ev.addr.bdaddr, bdaddr); + ev.addr.type = bdaddr_type; + ev.supported_flags = cpu_to_le32(supported_flags); + ev.current_flags = cpu_to_le32(current_flags); + + mgmt_event(MGMT_EV_DEVICE_FLAGS_CHANGED, hdev, &ev, sizeof(ev), sk); +} + +static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, + u16 len) +{ + struct mgmt_cp_set_device_flags *cp = data; + struct bdaddr_list_with_flags *br_params; + struct hci_conn_params *params; + u8 status = MGMT_STATUS_INVALID_PARAMS; + u32 supported_flags = SUPPORTED_DEVICE_FLAGS(); + u32 current_flags = __le32_to_cpu(cp->current_flags); + + bt_dev_dbg(hdev, "Set device flags %pMR (type 0x%x) = 0x%x", + &cp->addr.bdaddr, cp->addr.type, + __le32_to_cpu(current_flags)); + + if ((supported_flags | current_flags) != supported_flags) { + bt_dev_warn(hdev, "Bad flag given (0x%x) vs supported (0x%0x)", + current_flags, supported_flags); + goto done; + } + + hci_dev_lock(hdev); + + if (cp->addr.type == BDADDR_BREDR) { + br_params = hci_bdaddr_list_lookup_with_flags(&hdev->whitelist, + &cp->addr.bdaddr, + cp->addr.type); + + if (br_params) { + br_params->current_flags = current_flags; + status = MGMT_STATUS_SUCCESS; + } else { + bt_dev_warn(hdev, "No such BR/EDR device %pMR (0x%x)", + &cp->addr.bdaddr, cp->addr.type); + } + } else { + params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, + le_addr_type(cp->addr.type)); + if (params) { + params->current_flags = current_flags; + status = MGMT_STATUS_SUCCESS; + } else { + bt_dev_warn(hdev, "No such LE device %pMR (0x%x)", + &cp->addr.bdaddr, + le_addr_type(cp->addr.type)); + } + } + +done: + hci_dev_unlock(hdev); + + if (status == MGMT_STATUS_SUCCESS) + device_flags_changed(sk, hdev, &cp->addr.bdaddr, cp->addr.type, + supported_flags, current_flags); + + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_DEVICE_FLAGS, status, + &cp->addr, sizeof(cp->addr)); +} + +static void mgmt_adv_monitor_added(struct sock *sk, struct hci_dev *hdev, + u16 handle) +{ + struct mgmt_ev_adv_monitor_added ev; + + ev.monitor_handle = cpu_to_le16(handle); + + mgmt_event(MGMT_EV_ADV_MONITOR_ADDED, hdev, &ev, sizeof(ev), sk); +} + +static void mgmt_adv_monitor_removed(struct sock *sk, struct hci_dev *hdev, + u16 handle) +{ + struct mgmt_ev_adv_monitor_added ev; + + ev.monitor_handle = cpu_to_le16(handle); + + mgmt_event(MGMT_EV_ADV_MONITOR_REMOVED, hdev, &ev, sizeof(ev), sk); +} + +static int read_adv_mon_features(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + struct adv_monitor *monitor = NULL; + struct mgmt_rp_read_adv_monitor_features *rp = NULL; + int handle; + size_t rp_size = 0; + __u32 supported = 0; + __u16 num_handles = 0; + __u16 handles[HCI_MAX_ADV_MONITOR_NUM_HANDLES]; + + BT_DBG("request for %s", hdev->name); + + hci_dev_lock(hdev); + + if (msft_get_features(hdev) & MSFT_FEATURE_MASK_LE_ADV_MONITOR) + supported |= MGMT_ADV_MONITOR_FEATURE_MASK_OR_PATTERNS; + + idr_for_each_entry(&hdev->adv_monitors_idr, monitor, handle) { + handles[num_handles++] = monitor->handle; + } + + hci_dev_unlock(hdev); + + rp_size = sizeof(*rp) + (num_handles * sizeof(u16)); + rp = kmalloc(rp_size, GFP_KERNEL); + if (!rp) + return -ENOMEM; + + /* Once controller-based monitoring is in place, the enabled_features + * should reflect the use. + */ + rp->supported_features = cpu_to_le32(supported); + rp->enabled_features = 0; + rp->max_num_handles = cpu_to_le16(HCI_MAX_ADV_MONITOR_NUM_HANDLES); + rp->max_num_patterns = HCI_MAX_ADV_MONITOR_NUM_PATTERNS; + rp->num_handles = cpu_to_le16(num_handles); + if (num_handles) + memcpy(&rp->handles, &handles, (num_handles * sizeof(u16))); + + return mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_READ_ADV_MONITOR_FEATURES, + MGMT_STATUS_SUCCESS, rp, rp_size); +} + +static int add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + struct mgmt_cp_add_adv_patterns_monitor *cp = data; + struct mgmt_rp_add_adv_patterns_monitor rp; + struct adv_monitor *m = NULL; + struct adv_pattern *p = NULL; + unsigned int mp_cnt = 0, prev_adv_monitors_cnt; + __u8 cp_ofst = 0, cp_len = 0; + int err, i; + + BT_DBG("request for %s", hdev->name); + + if (len <= sizeof(*cp) || cp->pattern_count == 0) { + err = mgmt_cmd_status(sk, hdev->id, + MGMT_OP_ADD_ADV_PATTERNS_MONITOR, + MGMT_STATUS_INVALID_PARAMS); + goto failed; + } + + m = kmalloc(sizeof(*m), GFP_KERNEL); + if (!m) { + err = -ENOMEM; + goto failed; + } + + INIT_LIST_HEAD(&m->patterns); + m->active = false; + + for (i = 0; i < cp->pattern_count; i++) { + if (++mp_cnt > HCI_MAX_ADV_MONITOR_NUM_PATTERNS) { + err = mgmt_cmd_status(sk, hdev->id, + MGMT_OP_ADD_ADV_PATTERNS_MONITOR, + MGMT_STATUS_INVALID_PARAMS); + goto failed; + } + + cp_ofst = cp->patterns[i].offset; + cp_len = cp->patterns[i].length; + if (cp_ofst >= HCI_MAX_AD_LENGTH || + cp_len > HCI_MAX_AD_LENGTH || + (cp_ofst + cp_len) > HCI_MAX_AD_LENGTH) { + err = mgmt_cmd_status(sk, hdev->id, + MGMT_OP_ADD_ADV_PATTERNS_MONITOR, + MGMT_STATUS_INVALID_PARAMS); + goto failed; + } + + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (!p) { + err = -ENOMEM; + goto failed; + } + + p->ad_type = cp->patterns[i].ad_type; + p->offset = cp->patterns[i].offset; + p->length = cp->patterns[i].length; + memcpy(p->value, cp->patterns[i].value, p->length); + + INIT_LIST_HEAD(&p->list); + list_add(&p->list, &m->patterns); + } + + if (mp_cnt != cp->pattern_count) { + err = mgmt_cmd_status(sk, hdev->id, + MGMT_OP_ADD_ADV_PATTERNS_MONITOR, + MGMT_STATUS_INVALID_PARAMS); + goto failed; + } + + hci_dev_lock(hdev); + + prev_adv_monitors_cnt = hdev->adv_monitors_cnt; + + err = hci_add_adv_monitor(hdev, m); + if (err) { + if (err == -ENOSPC) { + mgmt_cmd_status(sk, hdev->id, + MGMT_OP_ADD_ADV_PATTERNS_MONITOR, + MGMT_STATUS_NO_RESOURCES); + } + goto unlock; + } + + if (hdev->adv_monitors_cnt > prev_adv_monitors_cnt) + mgmt_adv_monitor_added(sk, hdev, m->handle); + + hci_dev_unlock(hdev); + + rp.monitor_handle = cpu_to_le16(m->handle); + + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_ADV_PATTERNS_MONITOR, + MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); + +unlock: + hci_dev_unlock(hdev); + +failed: + hci_free_adv_monitor(m); + return err; +} + +static int remove_adv_monitor(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + struct mgmt_cp_remove_adv_monitor *cp = data; + struct mgmt_rp_remove_adv_monitor rp; + unsigned int prev_adv_monitors_cnt; + u16 handle; + int err; + + BT_DBG("request for %s", hdev->name); + + hci_dev_lock(hdev); + + handle = __le16_to_cpu(cp->monitor_handle); + prev_adv_monitors_cnt = hdev->adv_monitors_cnt; + + err = hci_remove_adv_monitor(hdev, handle); + if (err == -ENOENT) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADV_MONITOR, + MGMT_STATUS_INVALID_INDEX); + goto unlock; + } + + if (hdev->adv_monitors_cnt < prev_adv_monitors_cnt) + mgmt_adv_monitor_removed(sk, hdev, handle); + + hci_dev_unlock(hdev); + + rp.monitor_handle = cp->monitor_handle; + + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_ADV_MONITOR, + MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); + +unlock: + hci_dev_unlock(hdev); + return err; +} + static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { @@ -4147,7 +4635,7 @@ static bool discovery_type_is_valid(struct hci_dev *hdev, uint8_t type, *mgmt_status = mgmt_le_support(hdev); if (*mgmt_status) return false; - /* Intentional fall-through */ + fallthrough; case DISCOV_TYPE_BREDR: *mgmt_status = mgmt_bredr_support(hdev); if (*mgmt_status) @@ -4662,6 +5150,13 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data, return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING, status); + /* Enabling the experimental LL Privay support disables support for + * advertising. + */ + if (hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING, + MGMT_STATUS_NOT_SUPPORTED); + if (cp->val != 0x00 && cp->val != 0x01 && cp->val != 0x02) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING, MGMT_STATUS_INVALID_PARAMS); @@ -4848,7 +5343,7 @@ static int set_scan_params(struct sock *sk, struct hci_dev *hdev, hci_req_init(&req, hdev); - hci_req_add_le_scan_disable(&req); + hci_req_add_le_scan_disable(&req, false); hci_req_add_le_passive_scan(&req); hci_req_run(&req, NULL); @@ -5523,7 +6018,7 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, case MGMT_LTK_P256_DEBUG: authenticated = 0x00; type = SMP_LTK_P256_DEBUG; - /* fall through */ + fallthrough; default: continue; } @@ -5966,7 +6461,9 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, { struct mgmt_cp_add_device *cp = data; u8 auto_conn, addr_type; + struct hci_conn_params *params; int err; + u32 current_flags = 0; bt_dev_dbg(hdev, "sock %p", sk); @@ -5993,8 +6490,9 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, goto unlock; } - err = hci_bdaddr_list_add(&hdev->whitelist, &cp->addr.bdaddr, - cp->addr.type); + err = hci_bdaddr_list_add_with_flags(&hdev->whitelist, + &cp->addr.bdaddr, + cp->addr.type, 0); if (err) goto unlock; @@ -6033,12 +6531,19 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, MGMT_STATUS_FAILED, &cp->addr, sizeof(cp->addr)); goto unlock; + } else { + params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, + addr_type); + if (params) + current_flags = params->current_flags; } hci_update_background_scan(hdev); added: device_added(sk, hdev, &cp->addr.bdaddr, cp->addr.type, cp->action); + device_flags_changed(NULL, hdev, &cp->addr.bdaddr, cp->addr.type, + SUPPORTED_DEVICE_FLAGS(), current_flags); err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, MGMT_STATUS_SUCCESS, &cp->addr, @@ -6724,6 +7229,13 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev, return mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_ADV_FEATURES, MGMT_STATUS_REJECTED); + /* Enabling the experimental LL Privay support disables support for + * advertising. + */ + if (hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING, + MGMT_STATUS_NOT_SUPPORTED); + hci_dev_lock(hdev); rp_len = sizeof(*rp) + hdev->adv_instance_cnt; @@ -6927,6 +7439,13 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, status); + /* Enabling the experimental LL Privay support disables support for + * advertising. + */ + if (hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING, + MGMT_STATUS_NOT_SUPPORTED); + if (cp->instance < 1 || cp->instance > HCI_MAX_ADV_INSTANCES) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, MGMT_STATUS_INVALID_PARAMS); @@ -7091,6 +7610,13 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, bt_dev_dbg(hdev, "sock %p", sk); + /* Enabling the experimental LL Privay support disables support for + * advertising. + */ + if (hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING, + MGMT_STATUS_NOT_SUPPORTED); + hci_dev_lock(hdev); if (cp->instance && !hci_find_adv_instance(hdev, cp->instance)) { @@ -7116,6 +7642,12 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, hci_req_init(&req, hdev); + /* If we use extended advertising, instance is disabled and removed */ + if (ext_adv_capable(hdev)) { + __hci_req_disable_ext_adv_instance(&req, cp->instance); + __hci_req_remove_ext_adv_instance(&req, cp->instance); + } + hci_req_clear_adv_instance(hdev, sk, &req, cp->instance, true); if (list_empty(&hdev->adv_instances)) @@ -7297,6 +7829,20 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { set_exp_feature, MGMT_SET_EXP_FEATURE_SIZE, HCI_MGMT_VAR_LEN | HCI_MGMT_HDEV_OPTIONAL }, + { read_def_system_config, MGMT_READ_DEF_SYSTEM_CONFIG_SIZE, + HCI_MGMT_UNTRUSTED }, + { set_def_system_config, MGMT_SET_DEF_SYSTEM_CONFIG_SIZE, + HCI_MGMT_VAR_LEN }, + { read_def_runtime_config, MGMT_READ_DEF_RUNTIME_CONFIG_SIZE, + HCI_MGMT_UNTRUSTED }, + { set_def_runtime_config, MGMT_SET_DEF_RUNTIME_CONFIG_SIZE, + HCI_MGMT_VAR_LEN }, + { get_device_flags, MGMT_GET_DEVICE_FLAGS_SIZE }, + { set_device_flags, MGMT_SET_DEVICE_FLAGS_SIZE }, + { read_adv_mon_features, MGMT_READ_ADV_MONITOR_FEATURES_SIZE }, + { add_adv_patterns_monitor,MGMT_ADD_ADV_PATTERNS_MONITOR_SIZE, + HCI_MGMT_VAR_LEN }, + { remove_adv_monitor, MGMT_REMOVE_ADV_MONITOR_SIZE }, }; void mgmt_index_added(struct hci_dev *hdev) @@ -8216,8 +8762,11 @@ void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, if (!hci_discovery_active(hdev)) { if (link_type == ACL_LINK) return; - if (link_type == LE_LINK && list_empty(&hdev->pend_le_reports)) + if (link_type == LE_LINK && + list_empty(&hdev->pend_le_reports) && + !hci_is_adv_monitoring(hdev)) { return; + } } if (hdev->discovery.result_filtering) { diff --git a/net/bluetooth/mgmt_config.c b/net/bluetooth/mgmt_config.c new file mode 100644 index 000000000000..b30b571f8caf --- /dev/null +++ b/net/bluetooth/mgmt_config.c @@ -0,0 +1,283 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright (C) 2020 Google Corporation + */ + +#include <net/bluetooth/bluetooth.h> +#include <net/bluetooth/hci_core.h> +#include <net/bluetooth/mgmt.h> + +#include "mgmt_util.h" +#include "mgmt_config.h" + +#define HDEV_PARAM_U16(_param_code_, _param_name_) \ +{ \ + { cpu_to_le16(_param_code_), sizeof(__u16) }, \ + { cpu_to_le16(hdev->_param_name_) } \ +} + +#define HDEV_PARAM_U16_JIFFIES_TO_MSECS(_param_code_, _param_name_) \ +{ \ + { cpu_to_le16(_param_code_), sizeof(__u16) }, \ + { cpu_to_le16(jiffies_to_msecs(hdev->_param_name_)) } \ +} + +int read_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, + u16 data_len) +{ + struct { + struct mgmt_tlv entry; + union { + /* This is a simplification for now since all values + * are 16 bits. In the future, this code may need + * refactoring to account for variable length values + * and properly calculate the required buffer size. + */ + __le16 value; + }; + } __packed params[] = { + /* Please see mgmt-api.txt for documentation of these values */ + HDEV_PARAM_U16(0x0000, def_page_scan_type), + HDEV_PARAM_U16(0x0001, def_page_scan_int), + HDEV_PARAM_U16(0x0002, def_page_scan_window), + HDEV_PARAM_U16(0x0003, def_inq_scan_type), + HDEV_PARAM_U16(0x0004, def_inq_scan_int), + HDEV_PARAM_U16(0x0005, def_inq_scan_window), + HDEV_PARAM_U16(0x0006, def_br_lsto), + HDEV_PARAM_U16(0x0007, def_page_timeout), + HDEV_PARAM_U16(0x0008, sniff_min_interval), + HDEV_PARAM_U16(0x0009, sniff_max_interval), + HDEV_PARAM_U16(0x000a, le_adv_min_interval), + HDEV_PARAM_U16(0x000b, le_adv_max_interval), + HDEV_PARAM_U16(0x000c, def_multi_adv_rotation_duration), + HDEV_PARAM_U16(0x000d, le_scan_interval), + HDEV_PARAM_U16(0x000e, le_scan_window), + HDEV_PARAM_U16(0x000f, le_scan_int_suspend), + HDEV_PARAM_U16(0x0010, le_scan_window_suspend), + HDEV_PARAM_U16(0x0011, le_scan_int_discovery), + HDEV_PARAM_U16(0x0012, le_scan_window_discovery), + HDEV_PARAM_U16(0x0013, le_scan_int_adv_monitor), + HDEV_PARAM_U16(0x0014, le_scan_window_adv_monitor), + HDEV_PARAM_U16(0x0015, le_scan_int_connect), + HDEV_PARAM_U16(0x0016, le_scan_window_connect), + HDEV_PARAM_U16(0x0017, le_conn_min_interval), + HDEV_PARAM_U16(0x0018, le_conn_max_interval), + HDEV_PARAM_U16(0x0019, le_conn_latency), + HDEV_PARAM_U16(0x001a, le_supv_timeout), + HDEV_PARAM_U16_JIFFIES_TO_MSECS(0x001b, + def_le_autoconnect_timeout), + }; + struct mgmt_rp_read_def_system_config *rp = (void *)params; + + bt_dev_dbg(hdev, "sock %p", sk); + + return mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_READ_DEF_SYSTEM_CONFIG, + 0, rp, sizeof(params)); +} + +#define TO_TLV(x) ((struct mgmt_tlv *)(x)) +#define TLV_GET_LE16(tlv) le16_to_cpu(*((__le16 *)(TO_TLV(tlv)->value))) + +int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, + u16 data_len) +{ + u16 buffer_left = data_len; + u8 *buffer = data; + + if (buffer_left < sizeof(struct mgmt_tlv)) { + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_DEF_SYSTEM_CONFIG, + MGMT_STATUS_INVALID_PARAMS); + } + + /* First pass to validate the tlv */ + while (buffer_left >= sizeof(struct mgmt_tlv)) { + const u8 len = TO_TLV(buffer)->length; + const u16 exp_len = sizeof(struct mgmt_tlv) + + len; + const u16 type = le16_to_cpu(TO_TLV(buffer)->type); + + if (buffer_left < exp_len) { + bt_dev_warn(hdev, "invalid len left %d, exp >= %d", + buffer_left, exp_len); + + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_DEF_SYSTEM_CONFIG, + MGMT_STATUS_INVALID_PARAMS); + } + + /* Please see mgmt-api.txt for documentation of these values */ + switch (type) { + case 0x0000: + case 0x0001: + case 0x0002: + case 0x0003: + case 0x0004: + case 0x0005: + case 0x0006: + case 0x0007: + case 0x0008: + case 0x0009: + case 0x000a: + case 0x000b: + case 0x000c: + case 0x000d: + case 0x000e: + case 0x000f: + case 0x0010: + case 0x0011: + case 0x0012: + case 0x0013: + case 0x0014: + case 0x0015: + case 0x0016: + case 0x0017: + case 0x0018: + case 0x0019: + case 0x001a: + case 0x001b: + if (len != sizeof(u16)) { + bt_dev_warn(hdev, "invalid length %d, exp %zu for type %d", + len, sizeof(u16), type); + + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_DEF_SYSTEM_CONFIG, + MGMT_STATUS_INVALID_PARAMS); + } + break; + default: + bt_dev_warn(hdev, "unsupported parameter %u", type); + break; + } + + buffer_left -= exp_len; + buffer += exp_len; + } + + buffer_left = data_len; + buffer = data; + while (buffer_left >= sizeof(struct mgmt_tlv)) { + const u8 len = TO_TLV(buffer)->length; + const u16 exp_len = sizeof(struct mgmt_tlv) + + len; + const u16 type = le16_to_cpu(TO_TLV(buffer)->type); + + switch (type) { + case 0x0000: + hdev->def_page_scan_type = TLV_GET_LE16(buffer); + break; + case 0x0001: + hdev->def_page_scan_int = TLV_GET_LE16(buffer); + break; + case 0x0002: + hdev->def_page_scan_window = TLV_GET_LE16(buffer); + break; + case 0x0003: + hdev->def_inq_scan_type = TLV_GET_LE16(buffer); + break; + case 0x0004: + hdev->def_inq_scan_int = TLV_GET_LE16(buffer); + break; + case 0x0005: + hdev->def_inq_scan_window = TLV_GET_LE16(buffer); + break; + case 0x0006: + hdev->def_br_lsto = TLV_GET_LE16(buffer); + break; + case 0x0007: + hdev->def_page_timeout = TLV_GET_LE16(buffer); + break; + case 0x0008: + hdev->sniff_min_interval = TLV_GET_LE16(buffer); + break; + case 0x0009: + hdev->sniff_max_interval = TLV_GET_LE16(buffer); + break; + case 0x000a: + hdev->le_adv_min_interval = TLV_GET_LE16(buffer); + break; + case 0x000b: + hdev->le_adv_max_interval = TLV_GET_LE16(buffer); + break; + case 0x000c: + hdev->def_multi_adv_rotation_duration = + TLV_GET_LE16(buffer); + break; + case 0x000d: + hdev->le_scan_interval = TLV_GET_LE16(buffer); + break; + case 0x000e: + hdev->le_scan_window = TLV_GET_LE16(buffer); + break; + case 0x000f: + hdev->le_scan_int_suspend = TLV_GET_LE16(buffer); + break; + case 0x0010: + hdev->le_scan_window_suspend = TLV_GET_LE16(buffer); + break; + case 0x0011: + hdev->le_scan_int_discovery = TLV_GET_LE16(buffer); + break; + case 0x00012: + hdev->le_scan_window_discovery = TLV_GET_LE16(buffer); + break; + case 0x00013: + hdev->le_scan_int_adv_monitor = TLV_GET_LE16(buffer); + break; + case 0x00014: + hdev->le_scan_window_adv_monitor = TLV_GET_LE16(buffer); + break; + case 0x00015: + hdev->le_scan_int_connect = TLV_GET_LE16(buffer); + break; + case 0x00016: + hdev->le_scan_window_connect = TLV_GET_LE16(buffer); + break; + case 0x00017: + hdev->le_conn_min_interval = TLV_GET_LE16(buffer); + break; + case 0x00018: + hdev->le_conn_max_interval = TLV_GET_LE16(buffer); + break; + case 0x00019: + hdev->le_conn_latency = TLV_GET_LE16(buffer); + break; + case 0x0001a: + hdev->le_supv_timeout = TLV_GET_LE16(buffer); + break; + case 0x0001b: + hdev->def_le_autoconnect_timeout = + msecs_to_jiffies(TLV_GET_LE16(buffer)); + break; + default: + bt_dev_warn(hdev, "unsupported parameter %u", type); + break; + } + + buffer_left -= exp_len; + buffer += exp_len; + } + + return mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_SET_DEF_SYSTEM_CONFIG, 0, NULL, 0); +} + +int read_def_runtime_config(struct sock *sk, struct hci_dev *hdev, void *data, + u16 data_len) +{ + bt_dev_dbg(hdev, "sock %p", sk); + + return mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_READ_DEF_RUNTIME_CONFIG, 0, NULL, 0); +} + +int set_def_runtime_config(struct sock *sk, struct hci_dev *hdev, void *data, + u16 data_len) +{ + bt_dev_dbg(hdev, "sock %p", sk); + + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEF_SYSTEM_CONFIG, + MGMT_STATUS_INVALID_PARAMS); +} diff --git a/net/bluetooth/mgmt_config.h b/net/bluetooth/mgmt_config.h new file mode 100644 index 000000000000..a4965f107891 --- /dev/null +++ b/net/bluetooth/mgmt_config.h @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright (C) 2020 Google Corporation + */ + +int read_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, + u16 data_len); + +int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, + u16 data_len); + +int read_def_runtime_config(struct sock *sk, struct hci_dev *hdev, void *data, + u16 data_len); + +int set_def_runtime_config(struct sock *sk, struct hci_dev *hdev, void *data, + u16 data_len); diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c index d6c4e6b5ae77..8579bfeb2836 100644 --- a/net/bluetooth/msft.c +++ b/net/bluetooth/msft.c @@ -139,3 +139,10 @@ void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb) bt_dev_dbg(hdev, "MSFT vendor event %u", event); } + +__u64 msft_get_features(struct hci_dev *hdev) +{ + struct msft_data *msft = hdev->msft_data; + + return msft ? msft->features : 0; +} diff --git a/net/bluetooth/msft.h b/net/bluetooth/msft.h index 5aa9130e1f8a..e9c478e890b8 100644 --- a/net/bluetooth/msft.h +++ b/net/bluetooth/msft.h @@ -3,16 +3,25 @@ * Copyright (C) 2020 Google Corporation */ +#define MSFT_FEATURE_MASK_BREDR_RSSI_MONITOR BIT(0) +#define MSFT_FEATURE_MASK_LE_CONN_RSSI_MONITOR BIT(1) +#define MSFT_FEATURE_MASK_LE_ADV_RSSI_MONITOR BIT(2) +#define MSFT_FEATURE_MASK_LE_ADV_MONITOR BIT(3) +#define MSFT_FEATURE_MASK_CURVE_VALIDITY BIT(4) +#define MSFT_FEATURE_MASK_CONCURRENT_ADV_MONITOR BIT(5) + #if IS_ENABLED(CONFIG_BT_MSFTEXT) void msft_do_open(struct hci_dev *hdev); void msft_do_close(struct hci_dev *hdev); void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb); +__u64 msft_get_features(struct hci_dev *hdev); #else static inline void msft_do_open(struct hci_dev *hdev) {} static inline void msft_do_close(struct hci_dev *hdev) {} static inline void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb) {} +static inline __u64 msft_get_features(struct hci_dev *hdev) { return 0; } #endif diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 2e20af317cea..f2bacb464ccf 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -479,7 +479,7 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err) /* if closing a dlc in a session that hasn't been started, * just close and unlink the dlc */ - /* fall through */ + fallthrough; default: rfcomm_dlc_clear_timer(d); diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index df14eebe80da..ae6f80730561 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -218,7 +218,7 @@ static void __rfcomm_sock_close(struct sock *sk) case BT_CONFIG: case BT_CONNECTED: rfcomm_dlc_close(d, 0); - /* fall through */ + fallthrough; default: sock_set_flag(sk, SOCK_ZAPPED); @@ -644,7 +644,8 @@ static int rfcomm_sock_recvmsg(struct socket *sock, struct msghdr *msg, return len; } -static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __user *optval, unsigned int optlen) +static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int err = 0; @@ -656,7 +657,7 @@ static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __u switch (optname) { case RFCOMM_LM: - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } @@ -685,7 +686,8 @@ static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __u return err; } -static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) +static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct bt_security sec; @@ -713,7 +715,7 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, c sec.level = BT_SECURITY_LOW; len = min_t(unsigned int, sizeof(sec), optlen); - if (copy_from_user((char *) &sec, optval, len)) { + if (copy_from_sockptr(&sec, optval, len)) { err = -EFAULT; break; } @@ -732,7 +734,7 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, c break; } - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index c8c3d38cdc7b..dcf7f96ff417 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -66,6 +66,7 @@ struct sco_pinfo { bdaddr_t dst; __u32 flags; __u16 setting; + __u8 cmsg_mask; struct sco_conn *conn; }; @@ -449,6 +450,15 @@ static void sco_sock_close(struct sock *sk) sco_sock_kill(sk); } +static void sco_skb_put_cmsg(struct sk_buff *skb, struct msghdr *msg, + struct sock *sk) +{ + if (sco_pi(sk)->cmsg_mask & SCO_CMSG_PKT_STATUS) + put_cmsg(msg, SOL_BLUETOOTH, BT_SCM_PKT_STATUS, + sizeof(bt_cb(skb)->sco.pkt_status), + &bt_cb(skb)->sco.pkt_status); +} + static void sco_sock_init(struct sock *sk, struct sock *parent) { BT_DBG("sk %p", sk); @@ -457,6 +467,8 @@ static void sco_sock_init(struct sock *sk, struct sock *parent) sk->sk_type = parent->sk_type; bt_sk(sk)->flags = bt_sk(parent)->flags; security_sk_clone(parent, sk); + } else { + bt_sk(sk)->skb_put_cmsg = sco_skb_put_cmsg; } } @@ -791,7 +803,7 @@ static int sco_sock_recvmsg(struct socket *sock, struct msghdr *msg, } static int sco_sock_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int len, err = 0; @@ -810,7 +822,7 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } @@ -831,7 +843,7 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, voice.setting = sco_pi(sk)->setting; len = min_t(unsigned int, sizeof(voice), optlen); - if (copy_from_user((char *)&voice, optval, len)) { + if (copy_from_sockptr(&voice, optval, len)) { err = -EFAULT; break; } @@ -846,6 +858,18 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, sco_pi(sk)->setting = voice.setting; break; + case BT_PKT_STATUS: + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { + err = -EFAULT; + break; + } + + if (opt) + sco_pi(sk)->cmsg_mask |= SCO_CMSG_PKT_STATUS; + else + sco_pi(sk)->cmsg_mask &= SCO_CMSG_PKT_STATUS; + break; + default: err = -ENOPROTOOPT; break; @@ -923,6 +947,7 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, int len, err = 0; struct bt_voice voice; u32 phys; + int pkt_status; BT_DBG("sk %p", sk); @@ -969,6 +994,13 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, err = -EFAULT; break; + case BT_PKT_STATUS: + pkt_status = (sco_pi(sk)->cmsg_mask & SCO_CMSG_PKT_STATUS); + + if (put_user(pkt_status, (int __user *)optval)) + err = -EFAULT; + break; + default: err = -ENOPROTOOPT; break; diff --git a/net/bluetooth/selftest.c b/net/bluetooth/selftest.c index 03e3c89c3046..f71c6fa65fb3 100644 --- a/net/bluetooth/selftest.c +++ b/net/bluetooth/selftest.c @@ -205,7 +205,7 @@ static int __init test_ecdh(void) calltime = ktime_get(); - tfm = crypto_alloc_kpp("ecdh", CRYPTO_ALG_INTERNAL, 0); + tfm = crypto_alloc_kpp("ecdh", 0, 0); if (IS_ERR(tfm)) { BT_ERR("Unable to create ECDH crypto context"); err = PTR_ERR(tfm); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index c2c5ab05fa7e..433227f96c73 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1387,7 +1387,7 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) goto zfree_smp; } - smp->tfm_ecdh = crypto_alloc_kpp("ecdh", CRYPTO_ALG_INTERNAL, 0); + smp->tfm_ecdh = crypto_alloc_kpp("ecdh", 0, 0); if (IS_ERR(smp->tfm_ecdh)) { BT_ERR("Unable to create ECDH crypto context"); goto free_shash; @@ -1654,7 +1654,7 @@ int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) memset(smp->tk, 0, sizeof(smp->tk)); BT_DBG("PassKey: %d", value); put_unaligned_le32(value, smp->tk); - /* Fall Through */ + fallthrough; case MGMT_OP_USER_CONFIRM_REPLY: set_bit(SMP_FLAG_TK_VALID, &smp->flags); break; @@ -3282,7 +3282,7 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) return ERR_CAST(tfm_cmac); } - tfm_ecdh = crypto_alloc_kpp("ecdh", CRYPTO_ALG_INTERNAL, 0); + tfm_ecdh = crypto_alloc_kpp("ecdh", 0, 0); if (IS_ERR(tfm_ecdh)) { BT_ERR("Unable to create ECDH crypto context"); crypto_free_shash(tfm_cmac); @@ -3847,7 +3847,7 @@ int __init bt_selftest_smp(void) return PTR_ERR(tfm_cmac); } - tfm_ecdh = crypto_alloc_kpp("ecdh", CRYPTO_ALG_INTERNAL, 0); + tfm_ecdh = crypto_alloc_kpp("ecdh", 0, 0); if (IS_ERR(tfm_ecdh)) { BT_ERR("Unable to create ECDH crypto context"); crypto_free_shash(tfm_cmac); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index b03c469cd01f..99eb8c6c0fbc 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -327,6 +327,12 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) /* priority is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, priority), + offsetof(struct __sk_buff, ifindex))) + return -EINVAL; + + /* ifindex is allowed */ + + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, ifindex), offsetof(struct __sk_buff, cb))) return -EINVAL; @@ -381,6 +387,7 @@ static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) __skb->mark = skb->mark; __skb->priority = skb->priority; + __skb->ifindex = skb->dev->ifindex; __skb->tstamp = skb->tstamp; memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN); __skb->wire_len = cb->pkt_len; @@ -391,6 +398,8 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { bool is_l2 = false, is_direct_pkt_access = false; + struct net *net = current->nsproxy->net_ns; + struct net_device *dev = net->loopback_dev; u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; struct __sk_buff *ctx = NULL; @@ -432,7 +441,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, kfree(ctx); return -ENOMEM; } - sock_net_set(sk, current->nsproxy->net_ns); + sock_net_set(sk, net); sock_init_data(NULL, sk); skb = build_skb(data, 0); @@ -446,9 +455,37 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); __skb_put(skb, size); - skb->protocol = eth_type_trans(skb, current->nsproxy->net_ns->loopback_dev); + if (ctx && ctx->ifindex > 1) { + dev = dev_get_by_index(net, ctx->ifindex); + if (!dev) { + ret = -ENODEV; + goto out; + } + } + skb->protocol = eth_type_trans(skb, dev); skb_reset_network_header(skb); + switch (skb->protocol) { + case htons(ETH_P_IP): + sk->sk_family = AF_INET; + if (sizeof(struct iphdr) <= skb_headlen(skb)) { + sk->sk_rcv_saddr = ip_hdr(skb)->saddr; + sk->sk_daddr = ip_hdr(skb)->daddr; + } + break; +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + sk->sk_family = AF_INET6; + if (sizeof(struct ipv6hdr) <= skb_headlen(skb)) { + sk->sk_v6_rcv_saddr = ipv6_hdr(skb)->saddr; + sk->sk_v6_daddr = ipv6_hdr(skb)->daddr; + } + break; +#endif + default: + break; + } + if (is_l2) __skb_push(skb, hh_len); if (is_direct_pkt_access) @@ -481,6 +518,8 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, ret = bpf_ctx_finish(kattr, uattr, ctx, sizeof(struct __sk_buff)); out: + if (dev && dev != net->loopback_dev) + dev_put(dev); kfree_skb(skb); bpf_sk_storage_free(sk); kfree(sk); diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig index 84015ef3ee27..73d0b12789f1 100644 --- a/net/bpfilter/Kconfig +++ b/net/bpfilter/Kconfig @@ -9,12 +9,14 @@ menuconfig BPFILTER if BPFILTER config BPFILTER_UMH tristate "bpfilter kernel module with user mode helper" - depends on CC_CAN_LINK_STATIC + depends on CC_CAN_LINK + depends on m || CC_CAN_LINK_STATIC default m help This builds bpfilter kernel module with embedded user mode helper - Note: your toolchain must support building static binaries, since - rootfs isn't mounted at the time when __init functions are called - and do_execv won't be able to find the elf interpreter. + Note: To compile this as built-in, your toolchain must support + building static binaries, since rootfs isn't mounted at the time + when __init functions are called and do_execv won't be able to find + the elf interpreter. endif diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile index f23b53294fba..cdac82b8c53a 100644 --- a/net/bpfilter/Makefile +++ b/net/bpfilter/Makefile @@ -7,10 +7,12 @@ userprogs := bpfilter_umh bpfilter_umh-objs := main.o userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi +ifeq ($(CONFIG_BPFILTER_UMH), y) # builtin bpfilter_umh should be linked with -static # since rootfs isn't mounted at the time of __init # function is called and do_execv won't find elf interpreter userldflags += -static +endif $(obj)/bpfilter_umh_blob.o: $(obj)/bpfilter_umh diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index c3146b2700a0..51a941b56ec3 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -31,47 +31,55 @@ static void __stop_umh(void) shutdown_umh(); } -static int __bpfilter_process_sockopt(struct sock *sk, int optname, - char __user *optval, - unsigned int optlen, bool is_set) +static int bpfilter_send_req(struct mbox_request *req) { - struct mbox_request req; struct mbox_reply reply; loff_t pos = 0; ssize_t n; - int ret = -EFAULT; - req.is_set = is_set; - req.pid = current->pid; - req.cmd = optname; - req.addr = (long __force __user)optval; - req.len = optlen; if (!bpfilter_ops.info.tgid) - goto out; - n = kernel_write(bpfilter_ops.info.pipe_to_umh, &req, sizeof(req), + return -EFAULT; + pos = 0; + n = kernel_write(bpfilter_ops.info.pipe_to_umh, req, sizeof(*req), &pos); - if (n != sizeof(req)) { + if (n != sizeof(*req)) { pr_err("write fail %zd\n", n); - __stop_umh(); - ret = -EFAULT; - goto out; + goto stop; } pos = 0; n = kernel_read(bpfilter_ops.info.pipe_from_umh, &reply, sizeof(reply), &pos); if (n != sizeof(reply)) { pr_err("read fail %zd\n", n); - __stop_umh(); - ret = -EFAULT; - goto out; + goto stop; + } + return reply.status; +stop: + __stop_umh(); + return -EFAULT; +} + +static int bpfilter_process_sockopt(struct sock *sk, int optname, + sockptr_t optval, unsigned int optlen, + bool is_set) +{ + struct mbox_request req = { + .is_set = is_set, + .pid = current->pid, + .cmd = optname, + .addr = (uintptr_t)optval.user, + .len = optlen, + }; + if (uaccess_kernel() || sockptr_is_kernel(optval)) { + pr_err("kernel access not supported\n"); + return -EFAULT; } - ret = reply.status; -out: - return ret; + return bpfilter_send_req(&req); } static int start_umh(void) { + struct mbox_request req = { .pid = current->pid }; int err; /* fork usermode process */ @@ -81,7 +89,7 @@ static int start_umh(void) pr_info("Loaded bpfilter_umh pid %d\n", pid_nr(bpfilter_ops.info.tgid)); /* health check that usermode process started correctly */ - if (__bpfilter_process_sockopt(NULL, 0, NULL, 0, 0) != 0) { + if (bpfilter_send_req(&req) != 0) { shutdown_umh(); return -EFAULT; } @@ -102,7 +110,7 @@ static int __init load_umh(void) mutex_lock(&bpfilter_ops.lock); err = start_umh(); if (!err && IS_ENABLED(CONFIG_INET)) { - bpfilter_ops.sockopt = &__bpfilter_process_sockopt; + bpfilter_ops.sockopt = &bpfilter_process_sockopt; bpfilter_ops.start = &start_umh; } mutex_unlock(&bpfilter_ops.lock); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 8c7b78f8bc23..9a2fb4aa1a10 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -36,6 +36,8 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) const unsigned char *dest; u16 vid = 0; + memset(skb->cb, 0, sizeof(struct br_input_skb_cb)); + rcu_read_lock(); nf_ops = rcu_dereference(nf_br_ops); if (nf_ops && nf_ops->br_dev_xmit_hook(skb)) { diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 4877a0db16c6..9db504baa094 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -349,12 +349,21 @@ void br_fdb_cleanup(struct work_struct *work) */ rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { - unsigned long this_timer; + unsigned long this_timer = f->updated + delay; if (test_bit(BR_FDB_STATIC, &f->flags) || - test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags)) + test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags)) { + if (test_bit(BR_FDB_NOTIFY, &f->flags)) { + if (time_after(this_timer, now)) + work_delay = min(work_delay, + this_timer - now); + else if (!test_and_set_bit(BR_FDB_NOTIFY_INACTIVE, + &f->flags)) + fdb_notify(br, f, RTM_NEWNEIGH, false); + } continue; - this_timer = f->updated + delay; + } + if (time_after(this_timer, now)) { work_delay = min(work_delay, this_timer - now); } else { @@ -556,11 +565,17 @@ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, return ret; } +/* returns true if the fdb was modified */ +static bool __fdb_mark_active(struct net_bridge_fdb_entry *fdb) +{ + return !!(test_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags) && + test_and_clear_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)); +} + void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid, unsigned long flags) { struct net_bridge_fdb_entry *fdb; - bool fdb_modified = false; /* some users want to always flood. */ if (hold_time(br) == 0) @@ -575,6 +590,12 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, source->dev->name, addr, vid); } else { unsigned long now = jiffies; + bool fdb_modified = false; + + if (now != fdb->updated) { + fdb->updated = now; + fdb_modified = __fdb_mark_active(fdb); + } /* fastpath: update of existing entry */ if (unlikely(source != fdb->dst && @@ -587,8 +608,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, clear_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags); } - if (now != fdb->updated) - fdb->updated = now; + if (unlikely(test_bit(BR_FDB_ADDED_BY_USER, &flags))) set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); if (unlikely(fdb_modified)) { @@ -667,6 +687,23 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, &fdb->key.vlan_id)) goto nla_put_failure; + if (test_bit(BR_FDB_NOTIFY, &fdb->flags)) { + struct nlattr *nest = nla_nest_start(skb, NDA_FDB_EXT_ATTRS); + u8 notify_bits = FDB_NOTIFY_BIT; + + if (!nest) + goto nla_put_failure; + if (test_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)) + notify_bits |= FDB_NOTIFY_INACTIVE_BIT; + + if (nla_put_u8(skb, NFEA_ACTIVITY_NOTIFY, notify_bits)) { + nla_nest_cancel(skb, nest); + goto nla_put_failure; + } + + nla_nest_end(skb, nest); + } + nlmsg_end(skb, nlh); return 0; @@ -681,7 +718,9 @@ static inline size_t fdb_nlmsg_size(void) + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ + nla_total_size(sizeof(u32)) /* NDA_MASTER */ + nla_total_size(sizeof(u16)) /* NDA_VLAN */ - + nla_total_size(sizeof(struct nda_cacheinfo)); + + nla_total_size(sizeof(struct nda_cacheinfo)) + + nla_total_size(0) /* NDA_FDB_EXT_ATTRS */ + + nla_total_size(sizeof(u8)); /* NFEA_ACTIVITY_NOTIFY */ } static void fdb_notify(struct net_bridge *br, @@ -791,14 +830,41 @@ errout: return err; } +/* returns true if the fdb is modified */ +static bool fdb_handle_notify(struct net_bridge_fdb_entry *fdb, u8 notify) +{ + bool modified = false; + + /* allow to mark an entry as inactive, usually done on creation */ + if ((notify & FDB_NOTIFY_INACTIVE_BIT) && + !test_and_set_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)) + modified = true; + + if ((notify & FDB_NOTIFY_BIT) && + !test_and_set_bit(BR_FDB_NOTIFY, &fdb->flags)) { + /* enabled activity tracking */ + modified = true; + } else if (!(notify & FDB_NOTIFY_BIT) && + test_and_clear_bit(BR_FDB_NOTIFY, &fdb->flags)) { + /* disabled activity tracking, clear notify state */ + clear_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags); + modified = true; + } + + return modified; +} + /* Update (create or replace) forwarding database entry */ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, - const u8 *addr, u16 state, u16 flags, u16 vid, - u8 ndm_flags) + const u8 *addr, struct ndmsg *ndm, u16 flags, u16 vid, + struct nlattr *nfea_tb[]) { - bool is_sticky = !!(ndm_flags & NTF_STICKY); + bool is_sticky = !!(ndm->ndm_flags & NTF_STICKY); + bool refresh = !nfea_tb[NFEA_DONT_REFRESH]; struct net_bridge_fdb_entry *fdb; + u16 state = ndm->ndm_state; bool modified = false; + u8 notify = 0; /* If the port cannot learn allow only local and static entries */ if (source && !(state & NUD_PERMANENT) && !(state & NUD_NOARP) && @@ -815,6 +881,13 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, if (is_sticky && (state & NUD_PERMANENT)) return -EINVAL; + if (nfea_tb[NFEA_ACTIVITY_NOTIFY]) { + notify = nla_get_u8(nfea_tb[NFEA_ACTIVITY_NOTIFY]); + if ((notify & ~BR_FDB_NOTIFY_SETTABLE_BITS) || + (notify & BR_FDB_NOTIFY_SETTABLE_BITS) == FDB_NOTIFY_INACTIVE_BIT) + return -EINVAL; + } + fdb = br_fdb_find(br, addr, vid); if (fdb == NULL) { if (!(flags & NLM_F_CREATE)) @@ -858,11 +931,15 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, modified = true; } + if (fdb_handle_notify(fdb, notify)) + modified = true; + set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); fdb->used = jiffies; if (modified) { - fdb->updated = jiffies; + if (refresh) + fdb->updated = jiffies; fdb_notify(br, fdb, RTM_NEWNEIGH, true); } @@ -871,7 +948,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, - u16 nlh_flags, u16 vid) + u16 nlh_flags, u16 vid, struct nlattr *nfea_tb[]) { int err = 0; @@ -893,20 +970,25 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, err = br_fdb_external_learn_add(br, p, addr, vid, true); } else { spin_lock_bh(&br->hash_lock); - err = fdb_add_entry(br, p, addr, ndm->ndm_state, - nlh_flags, vid, ndm->ndm_flags); + err = fdb_add_entry(br, p, addr, ndm, nlh_flags, vid, nfea_tb); spin_unlock_bh(&br->hash_lock); } return err; } +static const struct nla_policy br_nda_fdb_pol[NFEA_MAX + 1] = { + [NFEA_ACTIVITY_NOTIFY] = { .type = NLA_U8 }, + [NFEA_DONT_REFRESH] = { .type = NLA_FLAG }, +}; + /* Add new permanent fdb entry with RTM_NEWNEIGH */ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 nlh_flags, struct netlink_ext_ack *extack) { + struct nlattr *nfea_tb[NFEA_MAX + 1], *attr; struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; struct net_bridge_vlan *v; @@ -939,6 +1021,16 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], vg = nbp_vlan_group(p); } + if (tb[NDA_FDB_EXT_ATTRS]) { + attr = tb[NDA_FDB_EXT_ATTRS]; + err = nla_parse_nested(nfea_tb, NFEA_MAX, attr, + br_nda_fdb_pol, extack); + if (err) + return err; + } else { + memset(nfea_tb, 0, sizeof(struct nlattr *) * (NFEA_MAX + 1)); + } + if (vid) { v = br_vlan_find(vg, vid); if (!v || !br_vlan_should_use(v)) { @@ -947,9 +1039,9 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], } /* VID was specified, so use it. */ - err = __br_fdb_add(ndm, br, p, addr, nlh_flags, vid); + err = __br_fdb_add(ndm, br, p, addr, nlh_flags, vid, nfea_tb); } else { - err = __br_fdb_add(ndm, br, p, addr, nlh_flags, 0); + err = __br_fdb_add(ndm, br, p, addr, nlh_flags, 0, nfea_tb); if (err || !vg || !vg->num_vlans) goto out; @@ -960,7 +1052,8 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], list_for_each_entry(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; - err = __br_fdb_add(ndm, br, p, addr, nlh_flags, v->vid); + err = __br_fdb_add(ndm, br, p, addr, nlh_flags, v->vid, + nfea_tb); if (err) goto out; } diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index 90592af9db61..b36689e6e7cb 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -4,6 +4,27 @@ #include "br_private_mrp.h" static const u8 mrp_test_dmac[ETH_ALEN] = { 0x1, 0x15, 0x4e, 0x0, 0x0, 0x1 }; +static const u8 mrp_in_test_dmac[ETH_ALEN] = { 0x1, 0x15, 0x4e, 0x0, 0x0, 0x3 }; + +static bool br_mrp_is_ring_port(struct net_bridge_port *p_port, + struct net_bridge_port *s_port, + struct net_bridge_port *port) +{ + if (port == p_port || + port == s_port) + return true; + + return false; +} + +static bool br_mrp_is_in_port(struct net_bridge_port *i_port, + struct net_bridge_port *port) +{ + if (port == i_port) + return true; + + return false; +} static struct net_bridge_port *br_mrp_get_port(struct net_bridge *br, u32 ifindex) @@ -37,6 +58,22 @@ static struct br_mrp *br_mrp_find_id(struct net_bridge *br, u32 ring_id) return res; } +static struct br_mrp *br_mrp_find_in_id(struct net_bridge *br, u32 in_id) +{ + struct br_mrp *res = NULL; + struct br_mrp *mrp; + + list_for_each_entry_rcu(mrp, &br->mrp_list, list, + lockdep_rtnl_is_held()) { + if (mrp->in_id == in_id) { + res = mrp; + break; + } + } + + return res; +} + static bool br_mrp_unique_ifindex(struct net_bridge *br, u32 ifindex) { struct br_mrp *mrp; @@ -52,6 +89,10 @@ static bool br_mrp_unique_ifindex(struct net_bridge *br, u32 ifindex) p = rtnl_dereference(mrp->s_port); if (p && p->dev->ifindex == ifindex) return false; + + p = rtnl_dereference(mrp->i_port); + if (p && p->dev->ifindex == ifindex) + return false; } return true; @@ -66,7 +107,8 @@ static struct br_mrp *br_mrp_find_port(struct net_bridge *br, list_for_each_entry_rcu(mrp, &br->mrp_list, list, lockdep_rtnl_is_held()) { if (rcu_access_pointer(mrp->p_port) == p || - rcu_access_pointer(mrp->s_port) == p) { + rcu_access_pointer(mrp->s_port) == p || + rcu_access_pointer(mrp->i_port) == p) { res = mrp; break; } @@ -160,6 +202,36 @@ static struct sk_buff *br_mrp_alloc_test_skb(struct br_mrp *mrp, return skb; } +static struct sk_buff *br_mrp_alloc_in_test_skb(struct br_mrp *mrp, + struct net_bridge_port *p, + enum br_mrp_port_role_type port_role) +{ + struct br_mrp_in_test_hdr *hdr = NULL; + struct sk_buff *skb = NULL; + + if (!p) + return NULL; + + skb = br_mrp_skb_alloc(p, p->dev->dev_addr, mrp_in_test_dmac); + if (!skb) + return NULL; + + br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_IN_TEST, sizeof(*hdr)); + hdr = skb_put(skb, sizeof(*hdr)); + + hdr->id = cpu_to_be16(mrp->in_id); + ether_addr_copy(hdr->sa, p->br->dev->dev_addr); + hdr->port_role = cpu_to_be16(port_role); + hdr->state = cpu_to_be16(mrp->in_state); + hdr->transitions = cpu_to_be16(mrp->in_transitions); + hdr->timestamp = cpu_to_be32(jiffies_to_msecs(jiffies)); + + br_mrp_skb_common(skb, mrp); + br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_END, 0x0); + + return skb; +} + /* This function is continuously called in the following cases: * - when node role is MRM, in this case test_monitor is always set to false * because it needs to notify the userspace that the ring is open and needs to @@ -213,7 +285,7 @@ static void br_mrp_test_work_expired(struct work_struct *work) } if (notify_open && !mrp->ring_role_offloaded) - br_mrp_port_open(p->dev, true); + br_mrp_ring_port_open(p->dev, true); } p = rcu_dereference(mrp->s_port); @@ -229,7 +301,7 @@ static void br_mrp_test_work_expired(struct work_struct *work) } if (notify_open && !mrp->ring_role_offloaded) - br_mrp_port_open(p->dev, true); + br_mrp_ring_port_open(p->dev, true); } out: @@ -239,6 +311,83 @@ out: usecs_to_jiffies(mrp->test_interval)); } +/* This function is continuously called when the node has the interconnect role + * MIM. It would generate interconnect test frames and will send them on all 3 + * ports. But will also check if it stop receiving interconnect test frames. + */ +static void br_mrp_in_test_work_expired(struct work_struct *work) +{ + struct delayed_work *del_work = to_delayed_work(work); + struct br_mrp *mrp = container_of(del_work, struct br_mrp, in_test_work); + struct net_bridge_port *p; + bool notify_open = false; + struct sk_buff *skb; + + if (time_before_eq(mrp->in_test_end, jiffies)) + return; + + if (mrp->in_test_count_miss < mrp->in_test_max_miss) { + mrp->in_test_count_miss++; + } else { + /* Notify that the interconnect ring is open only if the + * interconnect ring state is closed, otherwise it would + * continue to notify at every interval. + */ + if (mrp->in_state == BR_MRP_IN_STATE_CLOSED) + notify_open = true; + } + + rcu_read_lock(); + + p = rcu_dereference(mrp->p_port); + if (p) { + skb = br_mrp_alloc_in_test_skb(mrp, p, + BR_MRP_PORT_ROLE_PRIMARY); + if (!skb) + goto out; + + skb_reset_network_header(skb); + dev_queue_xmit(skb); + + if (notify_open && !mrp->in_role_offloaded) + br_mrp_in_port_open(p->dev, true); + } + + p = rcu_dereference(mrp->s_port); + if (p) { + skb = br_mrp_alloc_in_test_skb(mrp, p, + BR_MRP_PORT_ROLE_SECONDARY); + if (!skb) + goto out; + + skb_reset_network_header(skb); + dev_queue_xmit(skb); + + if (notify_open && !mrp->in_role_offloaded) + br_mrp_in_port_open(p->dev, true); + } + + p = rcu_dereference(mrp->i_port); + if (p) { + skb = br_mrp_alloc_in_test_skb(mrp, p, + BR_MRP_PORT_ROLE_INTER); + if (!skb) + goto out; + + skb_reset_network_header(skb); + dev_queue_xmit(skb); + + if (notify_open && !mrp->in_role_offloaded) + br_mrp_in_port_open(p->dev, true); + } + +out: + rcu_read_unlock(); + + queue_delayed_work(system_wq, &mrp->in_test_work, + usecs_to_jiffies(mrp->in_test_interval)); +} + /* Deletes the MRP instance. * note: called under rtnl_lock */ @@ -251,6 +400,10 @@ static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp) cancel_delayed_work_sync(&mrp->test_work); br_mrp_switchdev_send_ring_test(br, mrp, 0, 0, 0, 0); + /* Stop sending MRP_InTest frames if has an interconnect role */ + cancel_delayed_work_sync(&mrp->in_test_work); + br_mrp_switchdev_send_in_test(br, mrp, 0, 0, 0); + br_mrp_switchdev_del(br, mrp); /* Reset the ports */ @@ -278,6 +431,18 @@ static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp) rcu_assign_pointer(mrp->s_port, NULL); } + p = rtnl_dereference(mrp->i_port); + if (p) { + spin_lock_bh(&br->lock); + state = netif_running(br->dev) ? + BR_STATE_FORWARDING : BR_STATE_DISABLED; + p->state = state; + p->flags &= ~BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + br_mrp_port_switchdev_set_state(p, state); + rcu_assign_pointer(mrp->i_port, NULL); + } + list_del_rcu(&mrp->list); kfree_rcu(mrp, rcu); } @@ -329,6 +494,7 @@ int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance) rcu_assign_pointer(mrp->s_port, p); INIT_DELAYED_WORK(&mrp->test_work, br_mrp_test_work_expired); + INIT_DELAYED_WORK(&mrp->in_test_work, br_mrp_in_test_work_expired); list_add_tail_rcu(&mrp->list, &br->mrp_list); err = br_mrp_switchdev_add(br, mrp); @@ -511,6 +677,180 @@ int br_mrp_start_test(struct net_bridge *br, return 0; } +/* Set in state, int state can be only Open or Closed + * note: already called with rtnl_lock + */ +int br_mrp_set_in_state(struct net_bridge *br, struct br_mrp_in_state *state) +{ + struct br_mrp *mrp = br_mrp_find_in_id(br, state->in_id); + + if (!mrp) + return -EINVAL; + + if (mrp->in_state == BR_MRP_IN_STATE_CLOSED && + state->in_state != BR_MRP_IN_STATE_CLOSED) + mrp->in_transitions++; + + mrp->in_state = state->in_state; + + br_mrp_switchdev_set_in_state(br, mrp, state->in_state); + + return 0; +} + +/* Set in role, in role can be only MIM(Media Interconnection Manager) or + * MIC(Media Interconnection Client). + * note: already called with rtnl_lock + */ +int br_mrp_set_in_role(struct net_bridge *br, struct br_mrp_in_role *role) +{ + struct br_mrp *mrp = br_mrp_find_id(br, role->ring_id); + struct net_bridge_port *p; + int err; + + if (!mrp) + return -EINVAL; + + if (!br_mrp_get_port(br, role->i_ifindex)) + return -EINVAL; + + if (role->in_role == BR_MRP_IN_ROLE_DISABLED) { + u8 state; + + /* It is not allowed to disable a port that doesn't exist */ + p = rtnl_dereference(mrp->i_port); + if (!p) + return -EINVAL; + + /* Stop the generating MRP_InTest frames */ + cancel_delayed_work_sync(&mrp->in_test_work); + br_mrp_switchdev_send_in_test(br, mrp, 0, 0, 0); + + /* Remove the port */ + spin_lock_bh(&br->lock); + state = netif_running(br->dev) ? + BR_STATE_FORWARDING : BR_STATE_DISABLED; + p->state = state; + p->flags &= ~BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + br_mrp_port_switchdev_set_state(p, state); + rcu_assign_pointer(mrp->i_port, NULL); + + mrp->in_role = role->in_role; + mrp->in_id = 0; + + return 0; + } + + /* It is not possible to have the same port part of multiple rings */ + if (!br_mrp_unique_ifindex(br, role->i_ifindex)) + return -EINVAL; + + /* It is not allowed to set a different interconnect port if the mrp + * instance has already one. First it needs to be disabled and after + * that set the new port + */ + if (rcu_access_pointer(mrp->i_port)) + return -EINVAL; + + p = br_mrp_get_port(br, role->i_ifindex); + spin_lock_bh(&br->lock); + p->state = BR_STATE_FORWARDING; + p->flags |= BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + rcu_assign_pointer(mrp->i_port, p); + + mrp->in_role = role->in_role; + mrp->in_id = role->in_id; + + /* If there is an error just bailed out */ + err = br_mrp_switchdev_set_in_role(br, mrp, role->in_id, + role->ring_id, role->in_role); + if (err && err != -EOPNOTSUPP) + return err; + + /* Now detect if the HW actually applied the role or not. If the HW + * applied the role it means that the SW will not to do those operations + * anymore. For example if the role is MIM then the HW will notify the + * SW when interconnect ring is open, but if the is not pushed to the HW + * the SW will need to detect when the interconnect ring is open. + */ + mrp->in_role_offloaded = err == -EOPNOTSUPP ? 0 : 1; + + return 0; +} + +/* Start to generate MRP_InTest frames, the frames are generated by + * HW and if it fails, they are generated by the SW. + * note: already called with rtnl_lock + */ +int br_mrp_start_in_test(struct net_bridge *br, + struct br_mrp_start_in_test *in_test) +{ + struct br_mrp *mrp = br_mrp_find_in_id(br, in_test->in_id); + + if (!mrp) + return -EINVAL; + + if (mrp->in_role != BR_MRP_IN_ROLE_MIM) + return -EINVAL; + + /* Try to push it to the HW and if it fails then continue with SW + * implementation and if that also fails then return error. + */ + if (!br_mrp_switchdev_send_in_test(br, mrp, in_test->interval, + in_test->max_miss, in_test->period)) + return 0; + + mrp->in_test_interval = in_test->interval; + mrp->in_test_end = jiffies + usecs_to_jiffies(in_test->period); + mrp->in_test_max_miss = in_test->max_miss; + mrp->in_test_count_miss = 0; + queue_delayed_work(system_wq, &mrp->in_test_work, + usecs_to_jiffies(in_test->interval)); + + return 0; +} + +/* Determin if the frame type is a ring frame */ +static bool br_mrp_ring_frame(struct sk_buff *skb) +{ + const struct br_mrp_tlv_hdr *hdr; + struct br_mrp_tlv_hdr _hdr; + + hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr); + if (!hdr) + return false; + + if (hdr->type == BR_MRP_TLV_HEADER_RING_TEST || + hdr->type == BR_MRP_TLV_HEADER_RING_TOPO || + hdr->type == BR_MRP_TLV_HEADER_RING_LINK_DOWN || + hdr->type == BR_MRP_TLV_HEADER_RING_LINK_UP || + hdr->type == BR_MRP_TLV_HEADER_OPTION) + return true; + + return false; +} + +/* Determin if the frame type is an interconnect frame */ +static bool br_mrp_in_frame(struct sk_buff *skb) +{ + const struct br_mrp_tlv_hdr *hdr; + struct br_mrp_tlv_hdr _hdr; + + hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr); + if (!hdr) + return false; + + if (hdr->type == BR_MRP_TLV_HEADER_IN_TEST || + hdr->type == BR_MRP_TLV_HEADER_IN_TOPO || + hdr->type == BR_MRP_TLV_HEADER_IN_LINK_DOWN || + hdr->type == BR_MRP_TLV_HEADER_IN_LINK_UP) + return true; + + return false; +} + /* Process only MRP Test frame. All the other MRP frames are processed by * userspace application * note: already called with rcu_read_lock @@ -537,7 +877,7 @@ static void br_mrp_mrm_process(struct br_mrp *mrp, struct net_bridge_port *port, * not closed */ if (mrp->ring_state != BR_MRP_RING_STATE_CLOSED) - br_mrp_port_open(port->dev, false); + br_mrp_ring_port_open(port->dev, false); } /* Determin if the test hdr has a better priority than the node */ @@ -591,17 +931,92 @@ static void br_mrp_mra_process(struct br_mrp *mrp, struct net_bridge *br, mrp->test_count_miss = 0; } -/* This will just forward the frame to the other mrp ring port(MRC role) or will - * not do anything. +/* Process only MRP InTest frame. All the other MRP frames are processed by + * userspace application + * note: already called with rcu_read_lock + */ +static bool br_mrp_mim_process(struct br_mrp *mrp, struct net_bridge_port *port, + struct sk_buff *skb) +{ + const struct br_mrp_in_test_hdr *in_hdr; + struct br_mrp_in_test_hdr _in_hdr; + const struct br_mrp_tlv_hdr *hdr; + struct br_mrp_tlv_hdr _hdr; + + /* Each MRP header starts with a version field which is 16 bits. + * Therefore skip the version and get directly the TLV header. + */ + hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr); + if (!hdr) + return false; + + /* The check for InTest frame type was already done */ + in_hdr = skb_header_pointer(skb, sizeof(uint16_t) + sizeof(_hdr), + sizeof(_in_hdr), &_in_hdr); + if (!in_hdr) + return false; + + /* It needs to process only it's own InTest frames. */ + if (mrp->in_id != ntohs(in_hdr->id)) + return false; + + mrp->in_test_count_miss = 0; + + /* Notify the userspace that the ring is closed only when the ring is + * not closed + */ + if (mrp->in_state != BR_MRP_IN_STATE_CLOSED) + br_mrp_in_port_open(port->dev, false); + + return true; +} + +/* Get the MRP frame type + * note: already called with rcu_read_lock + */ +static u8 br_mrp_get_frame_type(struct sk_buff *skb) +{ + const struct br_mrp_tlv_hdr *hdr; + struct br_mrp_tlv_hdr _hdr; + + /* Each MRP header starts with a version field which is 16 bits. + * Therefore skip the version and get directly the TLV header. + */ + hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr); + if (!hdr) + return 0xff; + + return hdr->type; +} + +static bool br_mrp_mrm_behaviour(struct br_mrp *mrp) +{ + if (mrp->ring_role == BR_MRP_RING_ROLE_MRM || + (mrp->ring_role == BR_MRP_RING_ROLE_MRA && !mrp->test_monitor)) + return true; + + return false; +} + +static bool br_mrp_mrc_behaviour(struct br_mrp *mrp) +{ + if (mrp->ring_role == BR_MRP_RING_ROLE_MRC || + (mrp->ring_role == BR_MRP_RING_ROLE_MRA && mrp->test_monitor)) + return true; + + return false; +} + +/* This will just forward the frame to the other mrp ring ports, depending on + * the frame type, ring role and interconnect role * note: already called with rcu_read_lock */ static int br_mrp_rcv(struct net_bridge_port *p, struct sk_buff *skb, struct net_device *dev) { - struct net_device *s_dev, *p_dev, *d_dev; - struct net_bridge_port *p_port, *s_port; + struct net_bridge_port *p_port, *s_port, *i_port = NULL; + struct net_bridge_port *p_dst, *s_dst, *i_dst = NULL; struct net_bridge *br; - struct sk_buff *nskb; struct br_mrp *mrp; /* If port is disabled don't accept any frames */ @@ -616,46 +1031,139 @@ static int br_mrp_rcv(struct net_bridge_port *p, p_port = rcu_dereference(mrp->p_port); if (!p_port) return 0; + p_dst = p_port; s_port = rcu_dereference(mrp->s_port); if (!s_port) return 0; + s_dst = s_port; - /* If the role is MRM then don't forward the frames */ - if (mrp->ring_role == BR_MRP_RING_ROLE_MRM) { - br_mrp_mrm_process(mrp, p, skb); - return 1; - } - - /* If the role is MRA then don't forward the frames if it behaves as - * MRM node + /* If the frame is a ring frame then it is not required to check the + * interconnect role and ports to process or forward the frame */ - if (mrp->ring_role == BR_MRP_RING_ROLE_MRA) { - if (!mrp->test_monitor) { + if (br_mrp_ring_frame(skb)) { + /* If the role is MRM then don't forward the frames */ + if (mrp->ring_role == BR_MRP_RING_ROLE_MRM) { br_mrp_mrm_process(mrp, p, skb); - return 1; + goto no_forward; } - br_mrp_mra_process(mrp, br, p, skb); + /* If the role is MRA then don't forward the frames if it + * behaves as MRM node + */ + if (mrp->ring_role == BR_MRP_RING_ROLE_MRA) { + if (!mrp->test_monitor) { + br_mrp_mrm_process(mrp, p, skb); + goto no_forward; + } + + br_mrp_mra_process(mrp, br, p, skb); + } + + goto forward; } - /* Clone the frame and forward it on the other MRP port */ - nskb = skb_clone(skb, GFP_ATOMIC); - if (!nskb) - return 0; + if (br_mrp_in_frame(skb)) { + u8 in_type = br_mrp_get_frame_type(skb); - p_dev = p_port->dev; - s_dev = s_port->dev; + i_port = rcu_dereference(mrp->i_port); + i_dst = i_port; - if (p_dev == dev) - d_dev = s_dev; - else - d_dev = p_dev; + /* If the ring port is in block state it should not forward + * In_Test frames + */ + if (br_mrp_is_ring_port(p_port, s_port, p) && + p->state == BR_STATE_BLOCKING && + in_type == BR_MRP_TLV_HEADER_IN_TEST) + goto no_forward; + + /* Nodes that behaves as MRM needs to stop forwarding the + * frames in case the ring is closed, otherwise will be a loop. + * In this case the frame is no forward between the ring ports. + */ + if (br_mrp_mrm_behaviour(mrp) && + br_mrp_is_ring_port(p_port, s_port, p) && + (s_port->state != BR_STATE_FORWARDING || + p_port->state != BR_STATE_FORWARDING)) { + p_dst = NULL; + s_dst = NULL; + } + + /* A node that behaves as MRC and doesn't have a interconnect + * role then it should forward all frames between the ring ports + * because it doesn't have an interconnect port + */ + if (br_mrp_mrc_behaviour(mrp) && + mrp->in_role == BR_MRP_IN_ROLE_DISABLED) + goto forward; + + if (mrp->in_role == BR_MRP_IN_ROLE_MIM) { + if (in_type == BR_MRP_TLV_HEADER_IN_TEST) { + /* MIM should not forward it's own InTest + * frames + */ + if (br_mrp_mim_process(mrp, p, skb)) { + goto no_forward; + } else { + if (br_mrp_is_ring_port(p_port, s_port, + p)) + i_dst = NULL; + + if (br_mrp_is_in_port(i_port, p)) + goto no_forward; + } + } else { + /* MIM should forward IntLinkChange and + * IntTopoChange between ring ports but MIM + * should not forward IntLinkChange and + * IntTopoChange if the frame was received at + * the interconnect port + */ + if (br_mrp_is_ring_port(p_port, s_port, p)) + i_dst = NULL; + + if (br_mrp_is_in_port(i_port, p)) + goto no_forward; + } + } + + if (mrp->in_role == BR_MRP_IN_ROLE_MIC) { + /* MIC should forward InTest frames on all ports + * regardless of the received port + */ + if (in_type == BR_MRP_TLV_HEADER_IN_TEST) + goto forward; + + /* MIC should forward IntLinkChange frames only if they + * are received on ring ports to all the ports + */ + if (br_mrp_is_ring_port(p_port, s_port, p) && + (in_type == BR_MRP_TLV_HEADER_IN_LINK_UP || + in_type == BR_MRP_TLV_HEADER_IN_LINK_DOWN)) + goto forward; + + /* Should forward the InTopo frames only between the + * ring ports + */ + if (in_type == BR_MRP_TLV_HEADER_IN_TOPO) { + i_dst = NULL; + goto forward; + } + + /* In all the other cases don't forward the frames */ + goto no_forward; + } + } - nskb->dev = d_dev; - skb_push(nskb, ETH_HLEN); - dev_queue_xmit(nskb); +forward: + if (p_dst) + br_forward(p_dst, skb, true, false); + if (s_dst) + br_forward(s_dst, skb, true, false); + if (i_dst) + br_forward(i_dst, skb, true, false); +no_forward: return 1; } diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c index 34b3a8776991..2a2fdf3500c5 100644 --- a/net/bridge/br_mrp_netlink.c +++ b/net/bridge/br_mrp_netlink.c @@ -14,6 +14,9 @@ static const struct nla_policy br_mrp_policy[IFLA_BRIDGE_MRP_MAX + 1] = { [IFLA_BRIDGE_MRP_RING_STATE] = { .type = NLA_NESTED }, [IFLA_BRIDGE_MRP_RING_ROLE] = { .type = NLA_NESTED }, [IFLA_BRIDGE_MRP_START_TEST] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_IN_ROLE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_IN_STATE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_START_IN_TEST] = { .type = NLA_NESTED }, }; static const struct nla_policy @@ -235,6 +238,121 @@ static int br_mrp_start_test_parse(struct net_bridge *br, struct nlattr *attr, return br_mrp_start_test(br, &test); } +static const struct nla_policy +br_mrp_in_state_policy[IFLA_BRIDGE_MRP_IN_STATE_MAX + 1] = { + [IFLA_BRIDGE_MRP_IN_STATE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_IN_STATE_IN_ID] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_IN_STATE_STATE] = { .type = NLA_U32 }, +}; + +static int br_mrp_in_state_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_IN_STATE_MAX + 1]; + struct br_mrp_in_state state; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_IN_STATE_MAX, attr, + br_mrp_in_state_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_IN_STATE_IN_ID] || + !tb[IFLA_BRIDGE_MRP_IN_STATE_STATE]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing attribute: IN_ID or STATE"); + return -EINVAL; + } + + memset(&state, 0x0, sizeof(state)); + + state.in_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_IN_STATE_IN_ID]); + state.in_state = nla_get_u32(tb[IFLA_BRIDGE_MRP_IN_STATE_STATE]); + + return br_mrp_set_in_state(br, &state); +} + +static const struct nla_policy +br_mrp_in_role_policy[IFLA_BRIDGE_MRP_IN_ROLE_MAX + 1] = { + [IFLA_BRIDGE_MRP_IN_ROLE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_IN_ROLE_RING_ID] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_IN_ROLE_IN_ID] = { .type = NLA_U16 }, + [IFLA_BRIDGE_MRP_IN_ROLE_ROLE] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_IN_ROLE_I_IFINDEX] = { .type = NLA_U32 }, +}; + +static int br_mrp_in_role_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_IN_ROLE_MAX + 1]; + struct br_mrp_in_role role; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_IN_ROLE_MAX, attr, + br_mrp_in_role_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_IN_ROLE_RING_ID] || + !tb[IFLA_BRIDGE_MRP_IN_ROLE_IN_ID] || + !tb[IFLA_BRIDGE_MRP_IN_ROLE_I_IFINDEX] || + !tb[IFLA_BRIDGE_MRP_IN_ROLE_ROLE]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing attribute: RING_ID or ROLE or IN_ID or I_IFINDEX"); + return -EINVAL; + } + + memset(&role, 0x0, sizeof(role)); + + role.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_IN_ROLE_RING_ID]); + role.in_id = nla_get_u16(tb[IFLA_BRIDGE_MRP_IN_ROLE_IN_ID]); + role.i_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_IN_ROLE_I_IFINDEX]); + role.in_role = nla_get_u32(tb[IFLA_BRIDGE_MRP_IN_ROLE_ROLE]); + + return br_mrp_set_in_role(br, &role); +} + +static const struct nla_policy +br_mrp_start_in_test_policy[IFLA_BRIDGE_MRP_START_IN_TEST_MAX + 1] = { + [IFLA_BRIDGE_MRP_START_IN_TEST_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_START_IN_TEST_IN_ID] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_START_IN_TEST_INTERVAL] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_START_IN_TEST_MAX_MISS] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_START_IN_TEST_PERIOD] = { .type = NLA_U32 }, +}; + +static int br_mrp_start_in_test_parse(struct net_bridge *br, + struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_START_IN_TEST_MAX + 1]; + struct br_mrp_start_in_test test; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_START_IN_TEST_MAX, attr, + br_mrp_start_in_test_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_START_IN_TEST_IN_ID] || + !tb[IFLA_BRIDGE_MRP_START_IN_TEST_INTERVAL] || + !tb[IFLA_BRIDGE_MRP_START_IN_TEST_MAX_MISS] || + !tb[IFLA_BRIDGE_MRP_START_IN_TEST_PERIOD]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing attribute: RING_ID or INTERVAL or MAX_MISS or PERIOD"); + return -EINVAL; + } + + memset(&test, 0x0, sizeof(test)); + + test.in_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_IN_TEST_IN_ID]); + test.interval = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_IN_TEST_INTERVAL]); + test.max_miss = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_IN_TEST_MAX_MISS]); + test.period = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_IN_TEST_PERIOD]); + + return br_mrp_start_in_test(br, &test); +} + int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) { @@ -301,10 +419,114 @@ int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, return err; } + if (tb[IFLA_BRIDGE_MRP_IN_STATE]) { + err = br_mrp_in_state_parse(br, tb[IFLA_BRIDGE_MRP_IN_STATE], + extack); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_IN_ROLE]) { + err = br_mrp_in_role_parse(br, tb[IFLA_BRIDGE_MRP_IN_ROLE], + extack); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_START_IN_TEST]) { + err = br_mrp_start_in_test_parse(br, + tb[IFLA_BRIDGE_MRP_START_IN_TEST], + extack); + if (err) + return err; + } + return 0; } -int br_mrp_port_open(struct net_device *dev, u8 loc) +int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br) +{ + struct nlattr *tb, *mrp_tb; + struct br_mrp *mrp; + + mrp_tb = nla_nest_start_noflag(skb, IFLA_BRIDGE_MRP); + if (!mrp_tb) + return -EMSGSIZE; + + list_for_each_entry_rcu(mrp, &br->mrp_list, list) { + struct net_bridge_port *p; + + tb = nla_nest_start_noflag(skb, IFLA_BRIDGE_MRP_INFO); + if (!tb) + goto nla_info_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_RING_ID, + mrp->ring_id)) + goto nla_put_failure; + + p = rcu_dereference(mrp->p_port); + if (p && nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_P_IFINDEX, + p->dev->ifindex)) + goto nla_put_failure; + + p = rcu_dereference(mrp->s_port); + if (p && nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_S_IFINDEX, + p->dev->ifindex)) + goto nla_put_failure; + + p = rcu_dereference(mrp->i_port); + if (p && nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_I_IFINDEX, + p->dev->ifindex)) + goto nla_put_failure; + + if (nla_put_u16(skb, IFLA_BRIDGE_MRP_INFO_PRIO, + mrp->prio)) + goto nla_put_failure; + if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_RING_STATE, + mrp->ring_state)) + goto nla_put_failure; + if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_RING_ROLE, + mrp->ring_role)) + goto nla_put_failure; + if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_TEST_INTERVAL, + mrp->test_interval)) + goto nla_put_failure; + if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_TEST_MAX_MISS, + mrp->test_max_miss)) + goto nla_put_failure; + if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_TEST_MONITOR, + mrp->test_monitor)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_IN_STATE, + mrp->in_state)) + goto nla_put_failure; + if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_IN_ROLE, + mrp->in_role)) + goto nla_put_failure; + if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_IN_TEST_INTERVAL, + mrp->in_test_interval)) + goto nla_put_failure; + if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_IN_TEST_MAX_MISS, + mrp->in_test_max_miss)) + goto nla_put_failure; + + nla_nest_end(skb, tb); + } + nla_nest_end(skb, mrp_tb); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, tb); + +nla_info_failure: + nla_nest_cancel(skb, mrp_tb); + + return -EMSGSIZE; +} + +int br_mrp_ring_port_open(struct net_device *dev, u8 loc) { struct net_bridge_port *p; int err = 0; @@ -325,3 +547,25 @@ int br_mrp_port_open(struct net_device *dev, u8 loc) out: return err; } + +int br_mrp_in_port_open(struct net_device *dev, u8 loc) +{ + struct net_bridge_port *p; + int err = 0; + + p = br_port_get_rcu(dev); + if (!p) { + err = -EINVAL; + goto out; + } + + if (loc) + p->flags |= BR_MRP_LOST_IN_CONT; + else + p->flags &= ~BR_MRP_LOST_IN_CONT; + + br_ifinfo_notify(RTM_NEWLINK, NULL, p); + +out: + return err; +} diff --git a/net/bridge/br_mrp_switchdev.c b/net/bridge/br_mrp_switchdev.c index 0da68a0da4b5..ed547e03ace1 100644 --- a/net/bridge/br_mrp_switchdev.c +++ b/net/bridge/br_mrp_switchdev.c @@ -107,6 +107,68 @@ int br_mrp_switchdev_set_ring_state(struct net_bridge *br, return 0; } +int br_mrp_switchdev_set_in_role(struct net_bridge *br, struct br_mrp *mrp, + u16 in_id, u32 ring_id, + enum br_mrp_in_role_type role) +{ + struct switchdev_obj_in_role_mrp mrp_role = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_IN_ROLE_MRP, + .in_role = role, + .in_id = mrp->in_id, + .ring_id = mrp->ring_id, + .i_port = rtnl_dereference(mrp->i_port)->dev, + }; + int err; + + if (role == BR_MRP_IN_ROLE_DISABLED) + err = switchdev_port_obj_del(br->dev, &mrp_role.obj); + else + err = switchdev_port_obj_add(br->dev, &mrp_role.obj, NULL); + + return err; +} + +int br_mrp_switchdev_set_in_state(struct net_bridge *br, struct br_mrp *mrp, + enum br_mrp_in_state_type state) +{ + struct switchdev_obj_in_state_mrp mrp_state = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_IN_STATE_MRP, + .in_state = state, + .in_id = mrp->in_id, + }; + int err; + + err = switchdev_port_obj_add(br->dev, &mrp_state.obj, NULL); + + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} + +int br_mrp_switchdev_send_in_test(struct net_bridge *br, struct br_mrp *mrp, + u32 interval, u8 max_miss, u32 period) +{ + struct switchdev_obj_in_test_mrp test = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_IN_TEST_MRP, + .interval = interval, + .max_miss = max_miss, + .in_id = mrp->in_id, + .period = period, + }; + int err; + + if (interval == 0) + err = switchdev_port_obj_del(br->dev, &test.obj); + else + err = switchdev_port_obj_add(br->dev, &test.obj, NULL); + + return err; +} + int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, enum br_mrp_port_state_type state) { diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 240e260e3461..147d52596e17 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -152,6 +152,7 @@ static inline size_t br_port_info_size(void) #endif + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_RING_OPEN */ + + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_IN_OPEN */ + 0; } @@ -216,6 +217,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, !!(p->flags & BR_NEIGH_SUPPRESS)) || nla_put_u8(skb, IFLA_BRPORT_MRP_RING_OPEN, !!(p->flags & BR_MRP_LOST_CONT)) || + nla_put_u8(skb, IFLA_BRPORT_MRP_IN_OPEN, + !!(p->flags & BR_MRP_LOST_IN_CONT)) || nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED))) return -EMSGSIZE; @@ -453,6 +456,28 @@ static int br_fill_ifinfo(struct sk_buff *skb, rcu_read_unlock(); if (err) goto nla_put_failure; + + nla_nest_end(skb, af); + } + + if (filter_mask & RTEXT_FILTER_MRP) { + struct nlattr *af; + int err; + + if (!br_mrp_enabled(br) || port) + goto done; + + af = nla_nest_start_noflag(skb, IFLA_AF_SPEC); + if (!af) + goto nla_put_failure; + + rcu_read_lock(); + err = br_mrp_fill_info(skb, br); + rcu_read_unlock(); + + if (err) + goto nla_put_failure; + nla_nest_end(skb, af); } @@ -516,7 +541,8 @@ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_bridge_port *port = br_port_get_rtnl(dev); if (!port && !(filter_mask & RTEXT_FILTER_BRVLAN) && - !(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) + !(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) && + !(filter_mask & RTEXT_FILTER_MRP)) return 0; return br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, nlflags, diff --git a/net/bridge/br_netlink_tunnel.c b/net/bridge/br_netlink_tunnel.c index 162998e2f039..8914290c75d4 100644 --- a/net/bridge/br_netlink_tunnel.c +++ b/net/bridge/br_netlink_tunnel.c @@ -250,6 +250,36 @@ int br_parse_vlan_tunnel_info(struct nlattr *attr, return 0; } +/* send a notification if v_curr can't enter the range and start a new one */ +static void __vlan_tunnel_handle_range(const struct net_bridge_port *p, + struct net_bridge_vlan **v_start, + struct net_bridge_vlan **v_end, + int v_curr, bool curr_change) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + + vg = nbp_vlan_group(p); + if (!vg) + return; + + v = br_vlan_find(vg, v_curr); + + if (!*v_start) + goto out_init; + + if (v && curr_change && br_vlan_can_enter_range(v, *v_end)) { + *v_end = v; + return; + } + + br_vlan_notify(p->br, p, (*v_start)->vid, (*v_end)->vid, RTM_NEWVLAN); +out_init: + /* we start a range only if there are any changes to notify about */ + *v_start = curr_change ? v : NULL; + *v_end = *v_start; +} + int br_process_vlan_tunnel_info(const struct net_bridge *br, const struct net_bridge_port *p, int cmd, struct vtunnel_info *tinfo_curr, @@ -263,6 +293,7 @@ int br_process_vlan_tunnel_info(const struct net_bridge *br, return -EINVAL; memcpy(tinfo_last, tinfo_curr, sizeof(struct vtunnel_info)); } else if (tinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_END) { + struct net_bridge_vlan *v_start = NULL, *v_end = NULL; int t, v; if (!(tinfo_last->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN)) @@ -272,11 +303,24 @@ int br_process_vlan_tunnel_info(const struct net_bridge *br, return -EINVAL; t = tinfo_last->tunid; for (v = tinfo_last->vid; v <= tinfo_curr->vid; v++) { - err = br_vlan_tunnel_info(p, cmd, v, t, changed); + bool curr_change = false; + + err = br_vlan_tunnel_info(p, cmd, v, t, &curr_change); if (err) - return err; + break; t++; + + if (curr_change) + *changed = curr_change; + __vlan_tunnel_handle_range(p, &v_start, &v_end, v, + curr_change); } + if (v_start && v_end) + br_vlan_notify(br, p, v_start->vid, v_end->vid, + RTM_NEWVLAN); + if (err) + return err; + memset(tinfo_last, 0, sizeof(struct vtunnel_info)); memset(tinfo_curr, 0, sizeof(struct vtunnel_info)); } else { @@ -286,6 +330,7 @@ int br_process_vlan_tunnel_info(const struct net_bridge *br, tinfo_curr->tunid, changed); if (err) return err; + br_vlan_notify(br, p, tinfo_curr->vid, 0, RTM_NEWVLAN); memset(tinfo_last, 0, sizeof(struct vtunnel_info)); memset(tinfo_curr, 0, sizeof(struct vtunnel_info)); } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index e0ea6dbbc97e..baa1500f384f 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -48,6 +48,8 @@ enum { /* Path to usermode spanning tree program */ #define BR_STP_PROG "/sbin/bridge-stp" +#define BR_FDB_NOTIFY_SETTABLE_BITS (FDB_NOTIFY_BIT | FDB_NOTIFY_INACTIVE_BIT) + typedef struct bridge_id bridge_id; typedef struct mac_addr mac_addr; typedef __u16 port_id; @@ -184,6 +186,8 @@ enum { BR_FDB_ADDED_BY_USER, BR_FDB_ADDED_BY_EXT_LEARN, BR_FDB_OFFLOADED, + BR_FDB_NOTIFY, + BR_FDB_NOTIFY_INACTIVE }; struct net_bridge_fdb_key { @@ -1196,6 +1200,12 @@ static inline void br_vlan_notify(const struct net_bridge *br, int cmd) { } + +static inline bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *range_end) +{ + return true; +} #endif /* br_vlan_options.c */ @@ -1313,6 +1323,7 @@ int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb); bool br_mrp_enabled(struct net_bridge *br); void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p); +int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br); #else static inline int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *attr, int cmd, @@ -1335,6 +1346,12 @@ static inline void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p) { } + +static inline int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br) +{ + return 0; +} + #endif /* br_netlink.c */ diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h index 315eb37d89f0..af0e9eff6549 100644 --- a/net/bridge/br_private_mrp.h +++ b/net/bridge/br_private_mrp.h @@ -12,8 +12,10 @@ struct br_mrp { struct net_bridge_port __rcu *p_port; struct net_bridge_port __rcu *s_port; + struct net_bridge_port __rcu *i_port; u32 ring_id; + u16 in_id; u16 prio; enum br_mrp_ring_role_type ring_role; @@ -21,6 +23,11 @@ struct br_mrp { enum br_mrp_ring_state_type ring_state; u32 ring_transitions; + enum br_mrp_in_role_type in_role; + u8 in_role_offloaded; + enum br_mrp_in_state_type in_state; + u32 in_transitions; + struct delayed_work test_work; u32 test_interval; unsigned long test_end; @@ -28,6 +35,12 @@ struct br_mrp { u32 test_max_miss; bool test_monitor; + struct delayed_work in_test_work; + u32 in_test_interval; + unsigned long in_test_end; + u32 in_test_count_miss; + u32 in_test_max_miss; + u32 seq_id; struct rcu_head rcu; @@ -44,6 +57,10 @@ int br_mrp_set_ring_state(struct net_bridge *br, struct br_mrp_ring_state *state); int br_mrp_set_ring_role(struct net_bridge *br, struct br_mrp_ring_role *role); int br_mrp_start_test(struct net_bridge *br, struct br_mrp_start_test *test); +int br_mrp_set_in_state(struct net_bridge *br, struct br_mrp_in_state *state); +int br_mrp_set_in_role(struct net_bridge *br, struct br_mrp_in_role *role); +int br_mrp_start_in_test(struct net_bridge *br, + struct br_mrp_start_in_test *test); /* br_mrp_switchdev.c */ int br_mrp_switchdev_add(struct net_bridge *br, struct br_mrp *mrp); @@ -59,8 +76,16 @@ int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, enum br_mrp_port_state_type state); int br_mrp_port_switchdev_set_role(struct net_bridge_port *p, enum br_mrp_port_role_type role); +int br_mrp_switchdev_set_in_role(struct net_bridge *br, struct br_mrp *mrp, + u16 in_id, u32 ring_id, + enum br_mrp_in_role_type role); +int br_mrp_switchdev_set_in_state(struct net_bridge *br, struct br_mrp *mrp, + enum br_mrp_in_state_type state); +int br_mrp_switchdev_send_in_test(struct net_bridge *br, struct br_mrp *mrp, + u32 interval, u8 max_miss, u32 period); /* br_mrp_netlink.c */ -int br_mrp_port_open(struct net_device *dev, u8 loc); +int br_mrp_ring_port_open(struct net_device *dev, u8 loc); +int br_mrp_in_port_open(struct net_device *dev, u8 loc); #endif /* _BR_PRIVATE_MRP_H */ diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index b13b49b9f75c..1641f414d1ba 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1063,14 +1063,13 @@ free_counterstmp: } /* replace the table */ -static int do_replace(struct net *net, const void __user *user, - unsigned int len) +static int do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret, countersize; struct ebt_table_info *newinfo; struct ebt_replace tmp; - if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; if (len != sizeof(tmp) + tmp.entries_size) @@ -1242,9 +1241,8 @@ void ebt_unregister_table(struct net *net, struct ebt_table *table, /* userspace just supplied us with counters */ static int do_update_counters(struct net *net, const char *name, - struct ebt_counter __user *counters, - unsigned int num_counters, - const void __user *user, unsigned int len) + struct ebt_counter __user *counters, + unsigned int num_counters, unsigned int len) { int i, ret; struct ebt_counter *tmp; @@ -1287,19 +1285,18 @@ free_tmp: return ret; } -static int update_counters(struct net *net, const void __user *user, - unsigned int len) +static int update_counters(struct net *net, sockptr_t arg, unsigned int len) { struct ebt_replace hlp; - if (copy_from_user(&hlp, user, sizeof(hlp))) + if (copy_from_sockptr(&hlp, arg, sizeof(hlp))) return -EFAULT; if (len != sizeof(hlp) + hlp.num_counters * sizeof(struct ebt_counter)) return -EINVAL; return do_update_counters(net, hlp.name, hlp.counters, - hlp.num_counters, user, len); + hlp.num_counters, len); } static inline int ebt_obj_to_user(char __user *um, const char *_name, @@ -1451,86 +1448,6 @@ static int copy_everything_to_user(struct ebt_table *t, void __user *user, ebt_entry_to_user, entries, tmp.entries); } -static int do_ebt_set_ctl(struct sock *sk, - int cmd, void __user *user, unsigned int len) -{ - int ret; - struct net *net = sock_net(sk); - - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - switch (cmd) { - case EBT_SO_SET_ENTRIES: - ret = do_replace(net, user, len); - break; - case EBT_SO_SET_COUNTERS: - ret = update_counters(net, user, len); - break; - default: - ret = -EINVAL; - } - return ret; -} - -static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) -{ - int ret; - struct ebt_replace tmp; - struct ebt_table *t; - struct net *net = sock_net(sk); - - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - if (copy_from_user(&tmp, user, sizeof(tmp))) - return -EFAULT; - - tmp.name[sizeof(tmp.name) - 1] = '\0'; - - t = find_table_lock(net, tmp.name, &ret, &ebt_mutex); - if (!t) - return ret; - - switch (cmd) { - case EBT_SO_GET_INFO: - case EBT_SO_GET_INIT_INFO: - if (*len != sizeof(struct ebt_replace)) { - ret = -EINVAL; - mutex_unlock(&ebt_mutex); - break; - } - if (cmd == EBT_SO_GET_INFO) { - tmp.nentries = t->private->nentries; - tmp.entries_size = t->private->entries_size; - tmp.valid_hooks = t->valid_hooks; - } else { - tmp.nentries = t->table->nentries; - tmp.entries_size = t->table->entries_size; - tmp.valid_hooks = t->table->valid_hooks; - } - mutex_unlock(&ebt_mutex); - if (copy_to_user(user, &tmp, *len) != 0) { - ret = -EFAULT; - break; - } - ret = 0; - break; - - case EBT_SO_GET_ENTRIES: - case EBT_SO_GET_INIT_ENTRIES: - ret = copy_everything_to_user(t, user, len, cmd); - mutex_unlock(&ebt_mutex); - break; - - default: - mutex_unlock(&ebt_mutex); - ret = -EINVAL; - } - - return ret; -} - #ifdef CONFIG_COMPAT /* 32 bit-userspace compatibility definitions. */ struct compat_ebt_replace { @@ -1935,7 +1852,7 @@ static int compat_mtw_from_user(const struct compat_ebt_entry_mwt *mwt, size_kern = match_size; module_put(match->me); break; - case EBT_COMPAT_WATCHER: /* fallthrough */ + case EBT_COMPAT_WATCHER: case EBT_COMPAT_TARGET: wt = xt_request_find_target(NFPROTO_BRIDGE, name, mwt->u.revision); @@ -2160,7 +2077,7 @@ static int compat_copy_entries(unsigned char *data, unsigned int size_user, static int compat_copy_ebt_replace_from_user(struct ebt_replace *repl, - void __user *user, unsigned int len) + sockptr_t arg, unsigned int len) { struct compat_ebt_replace tmp; int i; @@ -2168,7 +2085,7 @@ static int compat_copy_ebt_replace_from_user(struct ebt_replace *repl, if (len < sizeof(tmp)) return -EINVAL; - if (copy_from_user(&tmp, user, sizeof(tmp))) + if (copy_from_sockptr(&tmp, arg, sizeof(tmp))) return -EFAULT; if (len != sizeof(tmp) + tmp.entries_size) @@ -2195,8 +2112,7 @@ static int compat_copy_ebt_replace_from_user(struct ebt_replace *repl, return 0; } -static int compat_do_replace(struct net *net, void __user *user, - unsigned int len) +static int compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret, i, countersize, size64; struct ebt_table_info *newinfo; @@ -2204,10 +2120,10 @@ static int compat_do_replace(struct net *net, void __user *user, struct ebt_entries_buf_state state; void *entries_tmp; - ret = compat_copy_ebt_replace_from_user(&tmp, user, len); + ret = compat_copy_ebt_replace_from_user(&tmp, arg, len); if (ret) { /* try real handler in case userland supplied needed padding */ - if (ret == -EINVAL && do_replace(net, user, len) == 0) + if (ret == -EINVAL && do_replace(net, arg, len) == 0) ret = 0; return ret; } @@ -2298,42 +2214,20 @@ out_unlock: goto free_entries; } -static int compat_update_counters(struct net *net, void __user *user, +static int compat_update_counters(struct net *net, sockptr_t arg, unsigned int len) { struct compat_ebt_replace hlp; - if (copy_from_user(&hlp, user, sizeof(hlp))) + if (copy_from_sockptr(&hlp, arg, sizeof(hlp))) return -EFAULT; /* try real handler in case userland supplied needed padding */ if (len != sizeof(hlp) + hlp.num_counters * sizeof(struct ebt_counter)) - return update_counters(net, user, len); + return update_counters(net, arg, len); return do_update_counters(net, hlp.name, compat_ptr(hlp.counters), - hlp.num_counters, user, len); -} - -static int compat_do_ebt_set_ctl(struct sock *sk, - int cmd, void __user *user, unsigned int len) -{ - int ret; - struct net *net = sock_net(sk); - - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - switch (cmd) { - case EBT_SO_SET_ENTRIES: - ret = compat_do_replace(net, user, len); - break; - case EBT_SO_SET_COUNTERS: - ret = compat_update_counters(net, user, len); - break; - default: - ret = -EINVAL; - } - return ret; + hlp.num_counters, len); } static int compat_do_ebt_get_ctl(struct sock *sk, int cmd, @@ -2344,14 +2238,6 @@ static int compat_do_ebt_get_ctl(struct sock *sk, int cmd, struct ebt_table *t; struct net *net = sock_net(sk); - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - /* try real handler in case userland supplied needed padding */ - if ((cmd == EBT_SO_GET_INFO || - cmd == EBT_SO_GET_INIT_INFO) && *len != sizeof(tmp)) - return do_ebt_get_ctl(sk, cmd, user, len); - if (copy_from_user(&tmp, user, sizeof(tmp))) return -EFAULT; @@ -2413,20 +2299,112 @@ static int compat_do_ebt_get_ctl(struct sock *sk, int cmd, } #endif +static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + struct net *net = sock_net(sk); + struct ebt_replace tmp; + struct ebt_table *t; + int ret; + + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + +#ifdef CONFIG_COMPAT + /* try real handler in case userland supplied needed padding */ + if (in_compat_syscall() && + ((cmd != EBT_SO_GET_INFO && cmd != EBT_SO_GET_INIT_INFO) || + *len != sizeof(tmp))) + return compat_do_ebt_get_ctl(sk, cmd, user, len); +#endif + + if (copy_from_user(&tmp, user, sizeof(tmp))) + return -EFAULT; + + tmp.name[sizeof(tmp.name) - 1] = '\0'; + + t = find_table_lock(net, tmp.name, &ret, &ebt_mutex); + if (!t) + return ret; + + switch (cmd) { + case EBT_SO_GET_INFO: + case EBT_SO_GET_INIT_INFO: + if (*len != sizeof(struct ebt_replace)) { + ret = -EINVAL; + mutex_unlock(&ebt_mutex); + break; + } + if (cmd == EBT_SO_GET_INFO) { + tmp.nentries = t->private->nentries; + tmp.entries_size = t->private->entries_size; + tmp.valid_hooks = t->valid_hooks; + } else { + tmp.nentries = t->table->nentries; + tmp.entries_size = t->table->entries_size; + tmp.valid_hooks = t->table->valid_hooks; + } + mutex_unlock(&ebt_mutex); + if (copy_to_user(user, &tmp, *len) != 0) { + ret = -EFAULT; + break; + } + ret = 0; + break; + + case EBT_SO_GET_ENTRIES: + case EBT_SO_GET_INIT_ENTRIES: + ret = copy_everything_to_user(t, user, len, cmd); + mutex_unlock(&ebt_mutex); + break; + + default: + mutex_unlock(&ebt_mutex); + ret = -EINVAL; + } + + return ret; +} + +static int do_ebt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, + unsigned int len) +{ + struct net *net = sock_net(sk); + int ret; + + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case EBT_SO_SET_ENTRIES: +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = compat_do_replace(net, arg, len); + else +#endif + ret = do_replace(net, arg, len); + break; + case EBT_SO_SET_COUNTERS: +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = compat_update_counters(net, arg, len); + else +#endif + ret = update_counters(net, arg, len); + break; + default: + ret = -EINVAL; + } + return ret; +} + static struct nf_sockopt_ops ebt_sockopts = { .pf = PF_INET, .set_optmin = EBT_BASE_CTL, .set_optmax = EBT_SO_SET_MAX + 1, .set = do_ebt_set_ctl, -#ifdef CONFIG_COMPAT - .compat_set = compat_do_ebt_set_ctl, -#endif .get_optmin = EBT_BASE_CTL, .get_optmax = EBT_SO_GET_MAX + 1, .get = do_ebt_get_ctl, -#ifdef CONFIG_COMPAT - .compat_get = compat_do_ebt_get_ctl, -#endif .owner = THIS_MODULE, }; diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index ef14da50a981..3ad0a1df6712 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -669,8 +669,8 @@ out_err: return sent ? : err; } -static int setsockopt(struct socket *sock, - int lvl, int opt, char __user *ov, unsigned int ol) +static int setsockopt(struct socket *sock, int lvl, int opt, sockptr_t ov, + unsigned int ol) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); @@ -685,7 +685,7 @@ static int setsockopt(struct socket *sock, return -EINVAL; if (lvl != SOL_CAIF) goto bad_sol; - if (copy_from_user(&linksel, ov, sizeof(int))) + if (copy_from_sockptr(&linksel, ov, sizeof(int))) return -EINVAL; lock_sock(&(cf_sk->sk)); cf_sk->conn_req.link_selector = linksel; @@ -699,7 +699,7 @@ static int setsockopt(struct socket *sock, return -ENOPROTOOPT; lock_sock(&(cf_sk->sk)); if (ol > sizeof(cf_sk->conn_req.param.data) || - copy_from_user(&cf_sk->conn_req.param.data, ov, ol)) { + copy_from_sockptr(&cf_sk->conn_req.param.data, ov, ol)) { release_sock(&cf_sk->sk); return -EINVAL; } @@ -981,7 +981,6 @@ static const struct proto_ops caif_seqpacket_ops = { .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = setsockopt, - .getsockopt = sock_no_getsockopt, .sendmsg = caif_seqpkt_sendmsg, .recvmsg = caif_seqpkt_recvmsg, .mmap = sock_no_mmap, @@ -1002,7 +1001,6 @@ static const struct proto_ops caif_stream_ops = { .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = setsockopt, - .getsockopt = sock_no_getsockopt, .sendmsg = caif_stream_sendmsg, .recvmsg = caif_stream_recvmsg, .mmap = sock_no_mmap, diff --git a/net/can/af_can.c b/net/can/af_can.c index 128d37a4c2e0..5c06404bdf3e 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -410,6 +410,7 @@ static struct hlist_head *can_rcv_list_find(canid_t *can_id, canid_t *mask, /** * can_rx_register - subscribe CAN frames from a specific interface + * @net: the applicable net namespace * @dev: pointer to netdevice (NULL => subcribe from 'all' CAN devices list) * @can_id: CAN identifier (see description) * @mask: CAN mask (see description) @@ -498,6 +499,7 @@ static void can_rx_delete_receiver(struct rcu_head *rp) /** * can_rx_unregister - unsubscribe CAN frames from a specific interface + * @net: the applicable net namespace * @dev: pointer to netdevice (NULL => unsubscribe from 'all' CAN devices list) * @can_id: CAN identifier * @mask: CAN mask diff --git a/net/can/bcm.c b/net/can/bcm.c index c96fa0f33db3..d14ea12affb1 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1648,8 +1648,6 @@ static const struct proto_ops bcm_ops = { .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .sendmsg = bcm_sendmsg, .recvmsg = bcm_recvmsg, .mmap = sock_no_mmap, diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index f7587428febd..78ff9b3f1d40 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -627,14 +627,14 @@ static int j1939_sk_release(struct socket *sock) return 0; } -static int j1939_sk_setsockopt_flag(struct j1939_sock *jsk, char __user *optval, +static int j1939_sk_setsockopt_flag(struct j1939_sock *jsk, sockptr_t optval, unsigned int optlen, int flag) { int tmp; if (optlen != sizeof(tmp)) return -EINVAL; - if (copy_from_user(&tmp, optval, optlen)) + if (copy_from_sockptr(&tmp, optval, optlen)) return -EFAULT; lock_sock(&jsk->sk); if (tmp) @@ -646,7 +646,7 @@ static int j1939_sk_setsockopt_flag(struct j1939_sock *jsk, char __user *optval, } static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); @@ -658,7 +658,7 @@ static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, switch (optname) { case SO_J1939_FILTER: - if (optval) { + if (!sockptr_is_null(optval)) { struct j1939_filter *f; int c; @@ -670,7 +670,7 @@ static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, return -EINVAL; count = optlen / sizeof(*filters); - filters = memdup_user(optval, optlen); + filters = memdup_sockptr(optval, optlen); if (IS_ERR(filters)) return PTR_ERR(filters); @@ -703,7 +703,7 @@ static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, case SO_J1939_SEND_PRIO: if (optlen != sizeof(tmp)) return -EINVAL; - if (copy_from_user(&tmp, optval, optlen)) + if (copy_from_sockptr(&tmp, optval, optlen)) return -EFAULT; if (tmp < 0 || tmp > 7) return -EDOM; diff --git a/net/can/raw.c b/net/can/raw.c index 59c039d73c6d..94a9405658dc 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -485,7 +485,7 @@ static int raw_getname(struct socket *sock, struct sockaddr *uaddr, } static int raw_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct raw_sock *ro = raw_sk(sk); @@ -511,11 +511,11 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, if (count > 1) { /* filter does not fit into dfilter => alloc space */ - filter = memdup_user(optval, optlen); + filter = memdup_sockptr(optval, optlen); if (IS_ERR(filter)) return PTR_ERR(filter); } else if (count == 1) { - if (copy_from_user(&sfilter, optval, sizeof(sfilter))) + if (copy_from_sockptr(&sfilter, optval, sizeof(sfilter))) return -EFAULT; } @@ -568,7 +568,7 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, if (optlen != sizeof(err_mask)) return -EINVAL; - if (copy_from_user(&err_mask, optval, optlen)) + if (copy_from_sockptr(&err_mask, optval, optlen)) return -EFAULT; err_mask &= CAN_ERR_MASK; @@ -607,7 +607,7 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, if (optlen != sizeof(ro->loopback)) return -EINVAL; - if (copy_from_user(&ro->loopback, optval, optlen)) + if (copy_from_sockptr(&ro->loopback, optval, optlen)) return -EFAULT; break; @@ -616,7 +616,7 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, if (optlen != sizeof(ro->recv_own_msgs)) return -EINVAL; - if (copy_from_user(&ro->recv_own_msgs, optval, optlen)) + if (copy_from_sockptr(&ro->recv_own_msgs, optval, optlen)) return -EFAULT; break; @@ -625,7 +625,7 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, if (optlen != sizeof(ro->fd_frames)) return -EINVAL; - if (copy_from_user(&ro->fd_frames, optval, optlen)) + if (copy_from_sockptr(&ro->fd_frames, optval, optlen)) return -EFAULT; break; @@ -634,7 +634,7 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, if (optlen != sizeof(ro->join_filters)) return -EINVAL; - if (copy_from_user(&ro->join_filters, optval, optlen)) + if (copy_from_sockptr(&ro->join_filters, optval, optlen)) return -EFAULT; break; diff --git a/net/compat.c b/net/compat.c index 77f3a0e98fd0..703acb51c698 100644 --- a/net/compat.c +++ b/net/compat.c @@ -330,120 +330,6 @@ void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm) __scm_destroy(scm); } -/* allocate a 64-bit sock_fprog on the user stack for duration of syscall. */ -struct sock_fprog __user *get_compat_bpf_fprog(char __user *optval) -{ - struct compat_sock_fprog __user *fprog32 = (struct compat_sock_fprog __user *)optval; - struct sock_fprog __user *kfprog = compat_alloc_user_space(sizeof(struct sock_fprog)); - struct compat_sock_fprog f32; - struct sock_fprog f; - - if (copy_from_user(&f32, fprog32, sizeof(*fprog32))) - return NULL; - memset(&f, 0, sizeof(f)); - f.len = f32.len; - f.filter = compat_ptr(f32.filter); - if (copy_to_user(kfprog, &f, sizeof(struct sock_fprog))) - return NULL; - - return kfprog; -} -EXPORT_SYMBOL_GPL(get_compat_bpf_fprog); - -static int do_set_attach_filter(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) -{ - struct sock_fprog __user *kfprog; - - kfprog = get_compat_bpf_fprog(optval); - if (!kfprog) - return -EFAULT; - - return sock_setsockopt(sock, level, optname, (char __user *)kfprog, - sizeof(struct sock_fprog)); -} - -static int compat_sock_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) -{ - if (optname == SO_ATTACH_FILTER || - optname == SO_ATTACH_REUSEPORT_CBPF) - return do_set_attach_filter(sock, level, optname, - optval, optlen); - return sock_setsockopt(sock, level, optname, optval, optlen); -} - -static int __compat_sys_setsockopt(int fd, int level, int optname, - char __user *optval, unsigned int optlen) -{ - int err; - struct socket *sock; - - if (optlen > INT_MAX) - return -EINVAL; - - sock = sockfd_lookup(fd, &err); - if (sock) { - err = security_socket_setsockopt(sock, level, optname); - if (err) { - sockfd_put(sock); - return err; - } - - if (level == SOL_SOCKET) - err = compat_sock_setsockopt(sock, level, - optname, optval, optlen); - else if (sock->ops->compat_setsockopt) - err = sock->ops->compat_setsockopt(sock, level, - optname, optval, optlen); - else - err = sock->ops->setsockopt(sock, level, - optname, optval, optlen); - sockfd_put(sock); - } - return err; -} - -COMPAT_SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, - char __user *, optval, unsigned int, optlen) -{ - return __compat_sys_setsockopt(fd, level, optname, optval, optlen); -} - -static int __compat_sys_getsockopt(int fd, int level, int optname, - char __user *optval, - int __user *optlen) -{ - int err; - struct socket *sock = sockfd_lookup(fd, &err); - - if (sock) { - err = security_socket_getsockopt(sock, level, optname); - if (err) { - sockfd_put(sock); - return err; - } - - if (level == SOL_SOCKET) - err = sock_getsockopt(sock, level, - optname, optval, optlen); - else if (sock->ops->compat_getsockopt) - err = sock->ops->compat_getsockopt(sock, level, - optname, optval, optlen); - else - err = sock->ops->getsockopt(sock, level, - optname, optval, optlen); - sockfd_put(sock); - } - return err; -} - -COMPAT_SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, - char __user *, optval, int __user *, optlen) -{ - return __compat_sys_getsockopt(fd, level, optname, optval, optlen); -} - /* Argument list sizes for compat_sys_socketcall */ #define AL(x) ((x) * sizeof(u32)) static unsigned char nas[21] = { @@ -603,13 +489,11 @@ COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args) ret = __sys_shutdown(a0, a1); break; case SYS_SETSOCKOPT: - ret = __compat_sys_setsockopt(a0, a1, a[2], - compat_ptr(a[3]), a[4]); + ret = __sys_setsockopt(a0, a1, a[2], compat_ptr(a[3]), a[4]); break; case SYS_GETSOCKOPT: - ret = __compat_sys_getsockopt(a0, a1, a[2], - compat_ptr(a[3]), - compat_ptr(a[4])); + ret = __sys_getsockopt(a0, a1, a[2], compat_ptr(a[3]), + compat_ptr(a[4])); break; case SYS_SENDMSG: ret = __compat_sys_sendmsg(a0, compat_ptr(a1), a[2]); diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index d2c4d16dadba..d3377c90a291 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -6,13 +6,12 @@ #include <linux/types.h> #include <linux/spinlock.h> #include <linux/bpf.h> +#include <linux/btf_ids.h> #include <net/bpf_sk_storage.h> #include <net/sock.h> #include <uapi/linux/sock_diag.h> #include <uapi/linux/btf.h> -static atomic_t cache_idx; - #define SK_STORAGE_CREATE_FLAG_MASK \ (BPF_F_NO_PREALLOC | BPF_F_CLONE) @@ -81,6 +80,9 @@ struct bpf_sk_storage_elem { #define SDATA(_SELEM) (&(_SELEM)->sdata) #define BPF_SK_STORAGE_CACHE_SIZE 16 +static DEFINE_SPINLOCK(cache_idx_lock); +static u64 cache_idx_usage_counts[BPF_SK_STORAGE_CACHE_SIZE]; + struct bpf_sk_storage { struct bpf_sk_storage_data __rcu *cache[BPF_SK_STORAGE_CACHE_SIZE]; struct hlist_head list; /* List of bpf_sk_storage_elem */ @@ -512,6 +514,37 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map) return 0; } +static u16 cache_idx_get(void) +{ + u64 min_usage = U64_MAX; + u16 i, res = 0; + + spin_lock(&cache_idx_lock); + + for (i = 0; i < BPF_SK_STORAGE_CACHE_SIZE; i++) { + if (cache_idx_usage_counts[i] < min_usage) { + min_usage = cache_idx_usage_counts[i]; + res = i; + + /* Found a free cache_idx */ + if (!min_usage) + break; + } + } + cache_idx_usage_counts[res]++; + + spin_unlock(&cache_idx_lock); + + return res; +} + +static void cache_idx_free(u16 idx) +{ + spin_lock(&cache_idx_lock); + cache_idx_usage_counts[idx]--; + spin_unlock(&cache_idx_lock); +} + /* Called by __sk_destruct() & bpf_sk_storage_clone() */ void bpf_sk_storage_free(struct sock *sk) { @@ -560,6 +593,8 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) smap = (struct bpf_sk_storage_map *)map; + cache_idx_free(smap->cache_idx); + /* Note that this map might be concurrently cloned from * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone * RCU read section to finish before proceeding. New RCU @@ -673,8 +708,7 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) } smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size; - smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) % - BPF_SK_STORAGE_CACHE_SIZE; + smap->cache_idx = cache_idx_get(); return &smap->map; } @@ -886,6 +920,7 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) return -ENOENT; } +static int sk_storage_map_btf_id; const struct bpf_map_ops sk_storage_map_ops = { .map_alloc_check = bpf_sk_storage_map_alloc_check, .map_alloc = bpf_sk_storage_map_alloc, @@ -895,6 +930,8 @@ const struct bpf_map_ops sk_storage_map_ops = { .map_update_elem = bpf_fd_sk_storage_update_elem, .map_delete_elem = bpf_fd_sk_storage_delete_elem, .map_check_btf = bpf_sk_storage_map_check_btf, + .map_btf_name = "bpf_sk_storage_map", + .map_btf_id = &sk_storage_map_btf_id, }; const struct bpf_func_proto bpf_sk_storage_get_proto = { @@ -907,6 +944,16 @@ const struct bpf_func_proto bpf_sk_storage_get_proto = { .arg4_type = ARG_ANYTHING, }; +const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto = { + .func = bpf_sk_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_CTX, /* context is 'struct sock' */ + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + const struct bpf_func_proto bpf_sk_storage_delete_proto = { .func = bpf_sk_storage_delete, .gpl_only = false, @@ -1181,3 +1228,208 @@ int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, return err; } EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_put); + +struct bpf_iter_seq_sk_storage_map_info { + struct bpf_map *map; + unsigned int bucket_id; + unsigned skip_elems; +}; + +static struct bpf_sk_storage_elem * +bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, + struct bpf_sk_storage_elem *prev_selem) +{ + struct bpf_sk_storage *sk_storage; + struct bpf_sk_storage_elem *selem; + u32 skip_elems = info->skip_elems; + struct bpf_sk_storage_map *smap; + u32 bucket_id = info->bucket_id; + u32 i, count, n_buckets; + struct bucket *b; + + smap = (struct bpf_sk_storage_map *)info->map; + n_buckets = 1U << smap->bucket_log; + if (bucket_id >= n_buckets) + return NULL; + + /* try to find next selem in the same bucket */ + selem = prev_selem; + count = 0; + while (selem) { + selem = hlist_entry_safe(selem->map_node.next, + struct bpf_sk_storage_elem, map_node); + if (!selem) { + /* not found, unlock and go to the next bucket */ + b = &smap->buckets[bucket_id++]; + raw_spin_unlock_bh(&b->lock); + skip_elems = 0; + break; + } + sk_storage = rcu_dereference_raw(selem->sk_storage); + if (sk_storage) { + info->skip_elems = skip_elems + count; + return selem; + } + count++; + } + + for (i = bucket_id; i < (1U << smap->bucket_log); i++) { + b = &smap->buckets[i]; + raw_spin_lock_bh(&b->lock); + count = 0; + hlist_for_each_entry(selem, &b->list, map_node) { + sk_storage = rcu_dereference_raw(selem->sk_storage); + if (sk_storage && count >= skip_elems) { + info->bucket_id = i; + info->skip_elems = count; + return selem; + } + count++; + } + raw_spin_unlock_bh(&b->lock); + skip_elems = 0; + } + + info->bucket_id = i; + info->skip_elems = 0; + return NULL; +} + +static void *bpf_sk_storage_map_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_sk_storage_elem *selem; + + selem = bpf_sk_storage_map_seq_find_next(seq->private, NULL); + if (!selem) + return NULL; + + if (*pos == 0) + ++*pos; + return selem; +} + +static void *bpf_sk_storage_map_seq_next(struct seq_file *seq, void *v, + loff_t *pos) +{ + struct bpf_iter_seq_sk_storage_map_info *info = seq->private; + + ++*pos; + ++info->skip_elems; + return bpf_sk_storage_map_seq_find_next(seq->private, v); +} + +struct bpf_iter__bpf_sk_storage_map { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_map *, map); + __bpf_md_ptr(struct sock *, sk); + __bpf_md_ptr(void *, value); +}; + +DEFINE_BPF_ITER_FUNC(bpf_sk_storage_map, struct bpf_iter_meta *meta, + struct bpf_map *map, struct sock *sk, + void *value) + +static int __bpf_sk_storage_map_seq_show(struct seq_file *seq, + struct bpf_sk_storage_elem *selem) +{ + struct bpf_iter_seq_sk_storage_map_info *info = seq->private; + struct bpf_iter__bpf_sk_storage_map ctx = {}; + struct bpf_sk_storage *sk_storage; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, selem == NULL); + if (prog) { + ctx.meta = &meta; + ctx.map = info->map; + if (selem) { + sk_storage = rcu_dereference_raw(selem->sk_storage); + ctx.sk = sk_storage->sk; + ctx.value = SDATA(selem)->data; + } + ret = bpf_iter_run_prog(prog, &ctx); + } + + return ret; +} + +static int bpf_sk_storage_map_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_sk_storage_map_seq_show(seq, v); +} + +static void bpf_sk_storage_map_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_seq_sk_storage_map_info *info = seq->private; + struct bpf_sk_storage_map *smap; + struct bucket *b; + + if (!v) { + (void)__bpf_sk_storage_map_seq_show(seq, v); + } else { + smap = (struct bpf_sk_storage_map *)info->map; + b = &smap->buckets[info->bucket_id]; + raw_spin_unlock_bh(&b->lock); + } +} + +static int bpf_iter_init_sk_storage_map(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct bpf_iter_seq_sk_storage_map_info *seq_info = priv_data; + + seq_info->map = aux->map; + return 0; +} + +static int bpf_iter_check_map(struct bpf_prog *prog, + struct bpf_iter_aux_info *aux) +{ + struct bpf_map *map = aux->map; + + if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) + return -EINVAL; + + if (prog->aux->max_rdonly_access > map->value_size) + return -EACCES; + + return 0; +} + +static const struct seq_operations bpf_sk_storage_map_seq_ops = { + .start = bpf_sk_storage_map_seq_start, + .next = bpf_sk_storage_map_seq_next, + .stop = bpf_sk_storage_map_seq_stop, + .show = bpf_sk_storage_map_seq_show, +}; + +static const struct bpf_iter_seq_info iter_seq_info = { + .seq_ops = &bpf_sk_storage_map_seq_ops, + .init_seq_private = bpf_iter_init_sk_storage_map, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct bpf_iter_seq_sk_storage_map_info), +}; + +static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { + .target = "bpf_sk_storage_map", + .check_target = bpf_iter_check_map, + .req_linfo = BPF_ITER_LINK_MAP_FD, + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), + PTR_TO_BTF_ID_OR_NULL }, + { offsetof(struct bpf_iter__bpf_sk_storage_map, value), + PTR_TO_RDWR_BUF_OR_NULL }, + }, + .seq_info = &iter_seq_info, +}; + +static int __init bpf_sk_storage_map_iter_init(void) +{ + bpf_sk_storage_map_reg_info.ctx_arg_info[0].btf_id = + btf_sock_ids[BTF_SOCK_TYPE_SOCK]; + return bpf_iter_reg_target(&bpf_sk_storage_map_reg_info); +} +late_initcall(bpf_sk_storage_map_iter_init); diff --git a/net/core/dev.c b/net/core/dev.c index ba4de97b676b..7df6c9617321 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -143,6 +143,7 @@ #include <linux/net_namespace.h> #include <linux/indirect_call_wrapper.h> #include <net/devlink.h> +#include <linux/pm_runtime.h> #include "net-sysfs.h" @@ -1492,8 +1493,13 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) ASSERT_RTNL(); - if (!netif_device_present(dev)) - return -ENODEV; + if (!netif_device_present(dev)) { + /* may be detached because parent is runtime-suspended */ + if (dev->dev.parent) + pm_runtime_resume(dev->dev.parent); + if (!netif_device_present(dev)) + return -ENODEV; + } /* Block netpoll from trying to do any rx path servicing. * If we don't do this there is a chance ndo_poll_controller @@ -3448,10 +3454,9 @@ static netdev_features_t net_mpls_features(struct sk_buff *skb, static netdev_features_t harmonize_features(struct sk_buff *skb, netdev_features_t features) { - int tmp; __be16 type; - type = skb_network_protocol(skb, &tmp); + type = skb_network_protocol(skb, NULL); features = net_mpls_features(skb, features, type); if (skb->ip_summed != CHECKSUM_NONE && @@ -5442,6 +5447,8 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) for (i = 0; i < new->aux->used_map_cnt; i++) { if (dev_map_can_have_prog(new->aux->used_maps[i])) return -EINVAL; + if (cpu_map_prog_allowed(new->aux->used_maps[i])) + return -EINVAL; } } @@ -5460,10 +5467,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) } break; - case XDP_QUERY_PROG: - xdp->prog_id = old ? old->aux->id : 0; - break; - default: ret = -EINVAL; break; @@ -5585,7 +5588,7 @@ void netif_receive_skb_list(struct list_head *head) } EXPORT_SYMBOL(netif_receive_skb_list); -DEFINE_PER_CPU(struct work_struct, flush_works); +static DEFINE_PER_CPU(struct work_struct, flush_works); /* Network device is going away, flush any packets still pending */ static void flush_backlog(struct work_struct *work) @@ -6685,7 +6688,9 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) trace_napi_poll(n, work, weight); } - WARN_ON_ONCE(work > weight); + if (unlikely(work > weight)) + pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n", + n->poll, work, weight); if (likely(work < weight)) goto out_unlock; @@ -8706,182 +8711,489 @@ int dev_change_proto_down_generic(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down_generic); -u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, - enum bpf_netdev_command cmd) +/** + * dev_change_proto_down_reason - proto down reason + * + * @dev: device + * @mask: proto down mask + * @value: proto down value + */ +void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, + u32 value) { - struct netdev_bpf xdp; + int b; - if (!bpf_op) - return 0; + if (!mask) { + dev->proto_down_reason = value; + } else { + for_each_set_bit(b, &mask, 32) { + if (value & (1 << b)) + dev->proto_down_reason |= BIT(b); + else + dev->proto_down_reason &= ~BIT(b); + } + } +} +EXPORT_SYMBOL(dev_change_proto_down_reason); - memset(&xdp, 0, sizeof(xdp)); - xdp.command = cmd; +struct bpf_xdp_link { + struct bpf_link link; + struct net_device *dev; /* protected by rtnl_lock, no refcnt held */ + int flags; +}; - /* Query must always succeed. */ - WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG); +static enum bpf_xdp_mode dev_xdp_mode(u32 flags) +{ + if (flags & XDP_FLAGS_HW_MODE) + return XDP_MODE_HW; + if (flags & XDP_FLAGS_DRV_MODE) + return XDP_MODE_DRV; + return XDP_MODE_SKB; +} - return xdp.prog_id; +static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) +{ + switch (mode) { + case XDP_MODE_SKB: + return generic_xdp_install; + case XDP_MODE_DRV: + case XDP_MODE_HW: + return dev->netdev_ops->ndo_bpf; + default: + return NULL; + }; +} + +static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, + enum bpf_xdp_mode mode) +{ + return dev->xdp_state[mode].link; } -static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, - struct netlink_ext_ack *extack, u32 flags, - struct bpf_prog *prog) +static struct bpf_prog *dev_xdp_prog(struct net_device *dev, + enum bpf_xdp_mode mode) +{ + struct bpf_xdp_link *link = dev_xdp_link(dev, mode); + + if (link) + return link->link.prog; + return dev->xdp_state[mode].prog; +} + +u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) +{ + struct bpf_prog *prog = dev_xdp_prog(dev, mode); + + return prog ? prog->aux->id : 0; +} + +static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode, + struct bpf_xdp_link *link) +{ + dev->xdp_state[mode].link = link; + dev->xdp_state[mode].prog = NULL; +} + +static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode, + struct bpf_prog *prog) +{ + dev->xdp_state[mode].link = NULL; + dev->xdp_state[mode].prog = prog; +} + +static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, + bpf_op_t bpf_op, struct netlink_ext_ack *extack, + u32 flags, struct bpf_prog *prog) { - bool non_hw = !(flags & XDP_FLAGS_HW_MODE); - struct bpf_prog *prev_prog = NULL; struct netdev_bpf xdp; int err; - if (non_hw) { - prev_prog = bpf_prog_by_id(__dev_xdp_query(dev, bpf_op, - XDP_QUERY_PROG)); - if (IS_ERR(prev_prog)) - prev_prog = NULL; - } - memset(&xdp, 0, sizeof(xdp)); - if (flags & XDP_FLAGS_HW_MODE) - xdp.command = XDP_SETUP_PROG_HW; - else - xdp.command = XDP_SETUP_PROG; + xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG; xdp.extack = extack; xdp.flags = flags; xdp.prog = prog; + /* Drivers assume refcnt is already incremented (i.e, prog pointer is + * "moved" into driver), so they don't increment it on their own, but + * they do decrement refcnt when program is detached or replaced. + * Given net_device also owns link/prog, we need to bump refcnt here + * to prevent drivers from underflowing it. + */ + if (prog) + bpf_prog_inc(prog); err = bpf_op(dev, &xdp); - if (!err && non_hw) - bpf_prog_change_xdp(prev_prog, prog); + if (err) { + if (prog) + bpf_prog_put(prog); + return err; + } - if (prev_prog) - bpf_prog_put(prev_prog); + if (mode != XDP_MODE_HW) + bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog); - return err; + return 0; } static void dev_xdp_uninstall(struct net_device *dev) { - struct netdev_bpf xdp; - bpf_op_t ndo_bpf; + struct bpf_xdp_link *link; + struct bpf_prog *prog; + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; - /* Remove generic XDP */ - WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL)); + ASSERT_RTNL(); - /* Remove from the driver */ - ndo_bpf = dev->netdev_ops->ndo_bpf; - if (!ndo_bpf) - return; + for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) { + prog = dev_xdp_prog(dev, mode); + if (!prog) + continue; - memset(&xdp, 0, sizeof(xdp)); - xdp.command = XDP_QUERY_PROG; - WARN_ON(ndo_bpf(dev, &xdp)); - if (xdp.prog_id) - WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, - NULL)); + bpf_op = dev_xdp_bpf_op(dev, mode); + if (!bpf_op) + continue; - /* Remove HW offload */ - memset(&xdp, 0, sizeof(xdp)); - xdp.command = XDP_QUERY_PROG_HW; - if (!ndo_bpf(dev, &xdp) && xdp.prog_id) - WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, - NULL)); + WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); + + /* auto-detach link from net device */ + link = dev_xdp_link(dev, mode); + if (link) + link->dev = NULL; + else + bpf_prog_put(prog); + + dev_xdp_set_link(dev, mode, NULL); + } } -/** - * dev_change_xdp_fd - set or clear a bpf program for a device rx path - * @dev: device - * @extack: netlink extended ack - * @fd: new program fd or negative value to clear - * @expected_fd: old program fd that userspace expects to replace or clear - * @flags: xdp-related flags - * - * Set or clear a bpf program for a device - */ -int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, - int fd, int expected_fd, u32 flags) +static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack, + struct bpf_xdp_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog, u32 flags) { - const struct net_device_ops *ops = dev->netdev_ops; - enum bpf_netdev_command query; - u32 prog_id, expected_id = 0; - bpf_op_t bpf_op, bpf_chk; - struct bpf_prog *prog; - bool offload; + struct bpf_prog *cur_prog; + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; int err; ASSERT_RTNL(); - offload = flags & XDP_FLAGS_HW_MODE; - query = offload ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG; + /* either link or prog attachment, never both */ + if (link && (new_prog || old_prog)) + return -EINVAL; + /* link supports only XDP mode flags */ + if (link && (flags & ~XDP_FLAGS_MODES)) { + NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); + return -EINVAL; + } + /* just one XDP mode bit should be set, zero defaults to SKB mode */ + if (hweight32(flags & XDP_FLAGS_MODES) > 1) { + NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); + return -EINVAL; + } + /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */ + if (old_prog && !(flags & XDP_FLAGS_REPLACE)) { + NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified"); + return -EINVAL; + } - bpf_op = bpf_chk = ops->ndo_bpf; - if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) { - NL_SET_ERR_MSG(extack, "underlying driver does not support XDP in native mode"); - return -EOPNOTSUPP; + mode = dev_xdp_mode(flags); + /* can't replace attached link */ + if (dev_xdp_link(dev, mode)) { + NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link"); + return -EBUSY; } - if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE)) - bpf_op = generic_xdp_install; - if (bpf_op == bpf_chk) - bpf_chk = generic_xdp_install; - - prog_id = __dev_xdp_query(dev, bpf_op, query); - if (flags & XDP_FLAGS_REPLACE) { - if (expected_fd >= 0) { - prog = bpf_prog_get_type_dev(expected_fd, - BPF_PROG_TYPE_XDP, - bpf_op == ops->ndo_bpf); - if (IS_ERR(prog)) - return PTR_ERR(prog); - expected_id = prog->aux->id; - bpf_prog_put(prog); - } - if (prog_id != expected_id) { - NL_SET_ERR_MSG(extack, "Active program does not match expected"); - return -EEXIST; - } + cur_prog = dev_xdp_prog(dev, mode); + /* can't replace attached prog with link */ + if (link && cur_prog) { + NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link"); + return -EBUSY; + } + if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) { + NL_SET_ERR_MSG(extack, "Active program does not match expected"); + return -EEXIST; + } + if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) { + NL_SET_ERR_MSG(extack, "XDP program already attached"); + return -EBUSY; } - if (fd >= 0) { - if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) { - NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time"); - return -EEXIST; - } - if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && prog_id) { - NL_SET_ERR_MSG(extack, "XDP program already attached"); - return -EBUSY; - } + /* put effective new program into new_prog */ + if (link) + new_prog = link->link.prog; - prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, - bpf_op == ops->ndo_bpf); - if (IS_ERR(prog)) - return PTR_ERR(prog); + if (new_prog) { + bool offload = mode == XDP_MODE_HW; + enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB + ? XDP_MODE_DRV : XDP_MODE_SKB; - if (!offload && bpf_prog_is_dev_bound(prog->aux)) { - NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported"); - bpf_prog_put(prog); + if (!offload && dev_xdp_prog(dev, other_mode)) { + NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); + return -EEXIST; + } + if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) { + NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported"); return -EINVAL; } - - if (prog->expected_attach_type == BPF_XDP_DEVMAP) { + if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); - bpf_prog_put(prog); return -EINVAL; } + if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) { + NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device"); + return -EINVAL; + } + } - /* prog->aux->id may be 0 for orphaned device-bound progs */ - if (prog->aux->id && prog->aux->id == prog_id) { - bpf_prog_put(prog); - return 0; + /* don't call drivers if the effective program didn't change */ + if (new_prog != cur_prog) { + bpf_op = dev_xdp_bpf_op(dev, mode); + if (!bpf_op) { + NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode"); + return -EOPNOTSUPP; + } + + err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog); + if (err) + return err; + } + + if (link) + dev_xdp_set_link(dev, mode, link); + else + dev_xdp_set_prog(dev, mode, new_prog); + if (cur_prog) + bpf_prog_put(cur_prog); + + return 0; +} + +static int dev_xdp_attach_link(struct net_device *dev, + struct netlink_ext_ack *extack, + struct bpf_xdp_link *link) +{ + return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags); +} + +static int dev_xdp_detach_link(struct net_device *dev, + struct netlink_ext_ack *extack, + struct bpf_xdp_link *link) +{ + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; + + ASSERT_RTNL(); + + mode = dev_xdp_mode(link->flags); + if (dev_xdp_link(dev, mode) != link) + return -EINVAL; + + bpf_op = dev_xdp_bpf_op(dev, mode); + WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); + dev_xdp_set_link(dev, mode, NULL); + return 0; +} + +static void bpf_xdp_link_release(struct bpf_link *link) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + + rtnl_lock(); + + /* if racing with net_device's tear down, xdp_link->dev might be + * already NULL, in which case link was already auto-detached + */ + if (xdp_link->dev) { + WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); + xdp_link->dev = NULL; + } + + rtnl_unlock(); +} + +static int bpf_xdp_link_detach(struct bpf_link *link) +{ + bpf_xdp_link_release(link); + return 0; +} + +static void bpf_xdp_link_dealloc(struct bpf_link *link) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + + kfree(xdp_link); +} + +static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + u32 ifindex = 0; + + rtnl_lock(); + if (xdp_link->dev) + ifindex = xdp_link->dev->ifindex; + rtnl_unlock(); + + seq_printf(seq, "ifindex:\t%u\n", ifindex); +} + +static int bpf_xdp_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + u32 ifindex = 0; + + rtnl_lock(); + if (xdp_link->dev) + ifindex = xdp_link->dev->ifindex; + rtnl_unlock(); + + info->xdp.ifindex = ifindex; + return 0; +} + +static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; + int err = 0; + + rtnl_lock(); + + /* link might have been auto-released already, so fail */ + if (!xdp_link->dev) { + err = -ENOLINK; + goto out_unlock; + } + + if (old_prog && link->prog != old_prog) { + err = -EPERM; + goto out_unlock; + } + old_prog = link->prog; + if (old_prog == new_prog) { + /* no-op, don't disturb drivers */ + bpf_prog_put(new_prog); + goto out_unlock; + } + + mode = dev_xdp_mode(xdp_link->flags); + bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode); + err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL, + xdp_link->flags, new_prog); + if (err) + goto out_unlock; + + old_prog = xchg(&link->prog, new_prog); + bpf_prog_put(old_prog); + +out_unlock: + rtnl_unlock(); + return err; +} + +static const struct bpf_link_ops bpf_xdp_link_lops = { + .release = bpf_xdp_link_release, + .dealloc = bpf_xdp_link_dealloc, + .detach = bpf_xdp_link_detach, + .show_fdinfo = bpf_xdp_link_show_fdinfo, + .fill_link_info = bpf_xdp_link_fill_link_info, + .update_prog = bpf_xdp_link_update, +}; + +int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_link_primer link_primer; + struct bpf_xdp_link *link; + struct net_device *dev; + int err, fd; + + dev = dev_get_by_index(net, attr->link_create.target_ifindex); + if (!dev) + return -EINVAL; + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto out_put_dev; + } + + bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog); + link->dev = dev; + link->flags = attr->link_create.flags; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + goto out_put_dev; + } + + rtnl_lock(); + err = dev_xdp_attach_link(dev, NULL, link); + rtnl_unlock(); + + if (err) { + bpf_link_cleanup(&link_primer); + goto out_put_dev; + } + + fd = bpf_link_settle(&link_primer); + /* link itself doesn't hold dev's refcnt to not complicate shutdown */ + dev_put(dev); + return fd; + +out_put_dev: + dev_put(dev); + return err; +} + +/** + * dev_change_xdp_fd - set or clear a bpf program for a device rx path + * @dev: device + * @extack: netlink extended ack + * @fd: new program fd or negative value to clear + * @expected_fd: old program fd that userspace expects to replace or clear + * @flags: xdp-related flags + * + * Set or clear a bpf program for a device + */ +int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, + int fd, int expected_fd, u32 flags) +{ + enum bpf_xdp_mode mode = dev_xdp_mode(flags); + struct bpf_prog *new_prog = NULL, *old_prog = NULL; + int err; + + ASSERT_RTNL(); + + if (fd >= 0) { + new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, + mode != XDP_MODE_SKB); + if (IS_ERR(new_prog)) + return PTR_ERR(new_prog); + } + + if (expected_fd >= 0) { + old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP, + mode != XDP_MODE_SKB); + if (IS_ERR(old_prog)) { + err = PTR_ERR(old_prog); + old_prog = NULL; + goto err_out; } - } else { - if (!prog_id) - return 0; - prog = NULL; } - err = dev_xdp_install(dev, bpf_op, extack, flags, prog); - if (err < 0 && prog) - bpf_prog_put(prog); + err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags); +err_out: + if (err && new_prog) + bpf_prog_put(new_prog); + if (old_prog) + bpf_prog_put(old_prog); return err; } diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 547b587c1950..b2cf9b7bb7b8 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -5,6 +5,7 @@ #include <linux/rtnetlink.h> #include <linux/net_tstamp.h> #include <linux/wireless.h> +#include <net/dsa.h> #include <net/wext.h> /* @@ -225,6 +226,26 @@ static int net_hwtstamp_validate(struct ifreq *ifr) return 0; } +static int dev_do_ioctl(struct net_device *dev, + struct ifreq *ifr, unsigned int cmd) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int err = -EOPNOTSUPP; + + err = dsa_ndo_do_ioctl(dev, ifr, cmd); + if (err == 0 || err != -EOPNOTSUPP) + return err; + + if (ops->ndo_do_ioctl) { + if (netif_device_present(dev)) + err = ops->ndo_do_ioctl(dev, ifr, cmd); + else + err = -ENODEV; + } + + return err; +} + /* * Perform the SIOCxIFxxx calls, inside rtnl_lock() */ @@ -323,13 +344,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) cmd == SIOCSHWTSTAMP || cmd == SIOCGHWTSTAMP || cmd == SIOCWANDEV) { - err = -EOPNOTSUPP; - if (ops->ndo_do_ioctl) { - if (netif_device_present(dev)) - err = ops->ndo_do_ioctl(dev, ifr, cmd); - else - err = -ENODEV; - } + err = dev_do_ioctl(dev, ifr, cmd); } else err = -EINVAL; diff --git a/net/core/devlink.c b/net/core/devlink.c index 47f14a2f25fb..e674f0f46dc2 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -85,6 +85,10 @@ EXPORT_SYMBOL(devlink_dpipe_header_ipv6); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr); +static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = { + [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY }, +}; + static LIST_HEAD(devlink_list); /* devlink_mutex @@ -382,19 +386,19 @@ devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) return NULL; } -#define DEVLINK_NL_FLAG_NEED_DEVLINK BIT(0) -#define DEVLINK_NL_FLAG_NEED_PORT BIT(1) -#define DEVLINK_NL_FLAG_NEED_SB BIT(2) +#define DEVLINK_NL_FLAG_NEED_PORT BIT(0) +#define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1) /* The per devlink instance lock is taken by default in the pre-doit * operation, yet several commands do not require this. The global * devlink lock is taken and protects from disruption by user-calls. */ -#define DEVLINK_NL_FLAG_NO_LOCK BIT(3) +#define DEVLINK_NL_FLAG_NO_LOCK BIT(2) static int devlink_nl_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { + struct devlink_port *devlink_port; struct devlink *devlink; int err; @@ -406,27 +410,18 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops, } if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) mutex_lock(&devlink->lock); - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) { - info->user_ptr[0] = devlink; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) { - struct devlink_port *devlink_port; - + info->user_ptr[0] = devlink; + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) { devlink_port = devlink_port_get_from_info(devlink, info); if (IS_ERR(devlink_port)) { err = PTR_ERR(devlink_port); goto unlock; } - info->user_ptr[0] = devlink_port; - } - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_SB) { - struct devlink_sb *devlink_sb; - - devlink_sb = devlink_sb_get_from_info(devlink, info); - if (IS_ERR(devlink_sb)) { - err = PTR_ERR(devlink_sb); - goto unlock; - } - info->user_ptr[1] = devlink_sb; + info->user_ptr[1] = devlink_port; + } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT) { + devlink_port = devlink_port_get_from_info(devlink, info); + if (!IS_ERR(devlink_port)) + info->user_ptr[1] = devlink_port; } return 0; @@ -442,16 +437,8 @@ static void devlink_nl_post_doit(const struct genl_ops *ops, { struct devlink *devlink; - /* When devlink changes netns, it would not be found - * by devlink_get_from_info(). So try if it is stored first. - */ - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) { - devlink = info->user_ptr[0]; - } else { - devlink = devlink_get_from_info(info); - WARN_ON(IS_ERR(devlink)); - } - if (!IS_ERR(devlink) && ~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) + devlink = info->user_ptr[0]; + if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) mutex_unlock(&devlink->lock); mutex_unlock(&devlink_mutex); } @@ -524,8 +511,14 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, { struct devlink_port_attrs *attrs = &devlink_port->attrs; - if (!attrs->set) + if (!devlink_port->attrs_set) return 0; + if (attrs->lanes) { + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_LANES, attrs->lanes)) + return -EMSGSIZE; + } + if (nla_put_u8(msg, DEVLINK_ATTR_PORT_SPLITTABLE, attrs->splittable)) + return -EMSGSIZE; if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour)) return -EMSGSIZE; switch (devlink_port->attrs.flavour) { @@ -563,10 +556,54 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, return 0; } +static int +devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port, + struct netlink_ext_ack *extack) +{ + struct devlink *devlink = port->devlink; + const struct devlink_ops *ops; + struct nlattr *function_attr; + bool empty_nest = true; + int err = 0; + + function_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PORT_FUNCTION); + if (!function_attr) + return -EMSGSIZE; + + ops = devlink->ops; + if (ops->port_function_hw_addr_get) { + int hw_addr_len; + u8 hw_addr[MAX_ADDR_LEN]; + + err = ops->port_function_hw_addr_get(devlink, port, hw_addr, &hw_addr_len, extack); + if (err == -EOPNOTSUPP) { + /* Port function attributes are optional for a port. If port doesn't + * support function attribute, returning -EOPNOTSUPP is not an error. + */ + err = 0; + goto out; + } else if (err) { + goto out; + } + err = nla_put(msg, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, hw_addr_len, hw_addr); + if (err) + goto out; + empty_nest = false; + } + +out: + if (err || empty_nest) + nla_nest_cancel(msg, function_attr); + else + nla_nest_end(msg, function_attr); + return err; +} + static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_port *devlink_port, enum devlink_command cmd, u32 portid, - u32 seq, int flags) + u32 seq, int flags, + struct netlink_ext_ack *extack) { void *hdr; @@ -607,6 +644,8 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, spin_unlock_bh(&devlink_port->type_lock); if (devlink_nl_port_attrs_put(msg, devlink_port)) goto nla_put_failure; + if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack)) + goto nla_put_failure; genlmsg_end(msg, hdr); return 0; @@ -634,7 +673,8 @@ static void devlink_port_notify(struct devlink_port *devlink_port, if (!msg) return; - err = devlink_nl_port_fill(msg, devlink, devlink_port, cmd, 0, 0, 0); + err = devlink_nl_port_fill(msg, devlink, devlink_port, cmd, 0, 0, 0, + NULL); if (err) { nlmsg_free(msg); return; @@ -697,7 +737,7 @@ out: static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = devlink_port->devlink; struct sk_buff *msg; int err; @@ -708,7 +748,8 @@ static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, err = devlink_nl_port_fill(msg, devlink, devlink_port, DEVLINK_CMD_PORT_NEW, - info->snd_portid, info->snd_seq, 0); + info->snd_portid, info->snd_seq, 0, + info->extack); if (err) { nlmsg_free(msg); return err; @@ -740,7 +781,8 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, DEVLINK_CMD_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI); + NLM_F_MULTI, + cb->extack); if (err) { mutex_unlock(&devlink->lock); goto out; @@ -778,10 +820,71 @@ static int devlink_port_type_set(struct devlink *devlink, return -EOPNOTSUPP; } +static int +devlink_port_function_hw_addr_set(struct devlink *devlink, struct devlink_port *port, + const struct nlattr *attr, struct netlink_ext_ack *extack) +{ + const struct devlink_ops *ops; + const u8 *hw_addr; + int hw_addr_len; + int err; + + hw_addr = nla_data(attr); + hw_addr_len = nla_len(attr); + if (hw_addr_len > MAX_ADDR_LEN) { + NL_SET_ERR_MSG_MOD(extack, "Port function hardware address too long"); + return -EINVAL; + } + if (port->type == DEVLINK_PORT_TYPE_ETH) { + if (hw_addr_len != ETH_ALEN) { + NL_SET_ERR_MSG_MOD(extack, "Address must be 6 bytes for Ethernet device"); + return -EINVAL; + } + if (!is_unicast_ether_addr(hw_addr)) { + NL_SET_ERR_MSG_MOD(extack, "Non-unicast hardware address unsupported"); + return -EINVAL; + } + } + + ops = devlink->ops; + if (!ops->port_function_hw_addr_set) { + NL_SET_ERR_MSG_MOD(extack, "Port doesn't support function attributes"); + return -EOPNOTSUPP; + } + + err = ops->port_function_hw_addr_set(devlink, port, hw_addr, hw_addr_len, extack); + if (err) + return err; + + devlink_port_notify(port, DEVLINK_CMD_PORT_NEW); + return 0; +} + +static int +devlink_port_function_set(struct devlink *devlink, struct devlink_port *port, + const struct nlattr *attr, struct netlink_ext_ack *extack) +{ + struct nlattr *tb[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1]; + int err; + + err = nla_parse_nested(tb, DEVLINK_PORT_FUNCTION_ATTR_MAX, attr, + devlink_function_nl_policy, extack); + if (err < 0) { + NL_SET_ERR_MSG_MOD(extack, "Fail to parse port function attributes"); + return err; + } + + attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR]; + if (attr) + err = devlink_port_function_hw_addr_set(devlink, port, attr, extack); + + return err; +} + static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = devlink_port->devlink; int err; @@ -793,6 +896,16 @@ static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb, if (err) return err; } + + if (info->attrs[DEVLINK_ATTR_PORT_FUNCTION]) { + struct nlattr *attr = info->attrs[DEVLINK_ATTR_PORT_FUNCTION]; + struct netlink_ext_ack *extack = info->extack; + + err = devlink_port_function_set(devlink, devlink_port, attr, extack); + if (err) + return err; + } + return 0; } @@ -810,6 +923,7 @@ static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; + struct devlink_port *devlink_port; u32 port_index; u32 count; @@ -817,8 +931,27 @@ static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb, !info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]) return -EINVAL; + devlink_port = devlink_port_get_from_info(devlink, info); port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]); + + if (IS_ERR(devlink_port)) + return -EINVAL; + + if (!devlink_port->attrs.splittable) { + /* Split ports cannot be split. */ + if (devlink_port->attrs.split) + NL_SET_ERR_MSG_MOD(info->extack, "Port cannot be split further"); + else + NL_SET_ERR_MSG_MOD(info->extack, "Port cannot be split"); + return -EINVAL; + } + + if (count < 2 || !is_power_of_2(count) || count > devlink_port->attrs.lanes) { + NL_SET_ERR_MSG_MOD(info->extack, "Invalid split count"); + return -EINVAL; + } + return devlink_port_split(devlink, port_index, count, info->extack); } @@ -886,10 +1019,14 @@ static int devlink_nl_cmd_sb_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; - struct devlink_sb *devlink_sb = info->user_ptr[1]; + struct devlink_sb *devlink_sb; struct sk_buff *msg; int err; + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; @@ -991,11 +1128,15 @@ static int devlink_nl_cmd_sb_pool_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; - struct devlink_sb *devlink_sb = info->user_ptr[1]; + struct devlink_sb *devlink_sb; struct sk_buff *msg; u16 pool_index; int err; + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); + err = devlink_sb_pool_index_get_from_info(devlink_sb, info, &pool_index); if (err) @@ -1102,12 +1243,16 @@ static int devlink_nl_cmd_sb_pool_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; - struct devlink_sb *devlink_sb = info->user_ptr[1]; enum devlink_sb_threshold_type threshold_type; + struct devlink_sb *devlink_sb; u16 pool_index; u32 size; int err; + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); + err = devlink_sb_pool_index_get_from_info(devlink_sb, info, &pool_index); if (err) @@ -1186,13 +1331,17 @@ nla_put_failure: static int devlink_nl_cmd_sb_port_pool_get_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = devlink_port->devlink; - struct devlink_sb *devlink_sb = info->user_ptr[1]; + struct devlink_sb *devlink_sb; struct sk_buff *msg; u16 pool_index; int err; + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); + err = devlink_sb_pool_index_get_from_info(devlink_sb, info, &pool_index); if (err) @@ -1304,12 +1453,17 @@ static int devlink_sb_port_pool_set(struct devlink_port *devlink_port, static int devlink_nl_cmd_sb_port_pool_set_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[0]; - struct devlink_sb *devlink_sb = info->user_ptr[1]; + struct devlink_port *devlink_port = info->user_ptr[1]; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_sb *devlink_sb; u16 pool_index; u32 threshold; int err; + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); + err = devlink_sb_pool_index_get_from_info(devlink_sb, info, &pool_index); if (err) @@ -1391,14 +1545,18 @@ nla_put_failure: static int devlink_nl_cmd_sb_tc_pool_bind_get_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = devlink_port->devlink; - struct devlink_sb *devlink_sb = info->user_ptr[1]; + struct devlink_sb *devlink_sb; struct sk_buff *msg; enum devlink_sb_pool_type pool_type; u16 tc_index; int err; + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); + err = devlink_sb_pool_type_get_from_info(info, &pool_type); if (err) return err; @@ -1540,14 +1698,19 @@ static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port, static int devlink_nl_cmd_sb_tc_pool_bind_set_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[0]; - struct devlink_sb *devlink_sb = info->user_ptr[1]; + struct devlink_port *devlink_port = info->user_ptr[1]; + struct devlink *devlink = info->user_ptr[0]; enum devlink_sb_pool_type pool_type; + struct devlink_sb *devlink_sb; u16 tc_index; u16 pool_index; u32 threshold; int err; + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); + err = devlink_sb_pool_type_get_from_info(info, &pool_type); if (err) return err; @@ -1575,8 +1738,12 @@ static int devlink_nl_cmd_sb_occ_snapshot_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; - struct devlink_sb *devlink_sb = info->user_ptr[1]; const struct devlink_ops *ops = devlink->ops; + struct devlink_sb *devlink_sb; + + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); if (ops->sb_occ_snapshot) return ops->sb_occ_snapshot(devlink, devlink_sb->index); @@ -1587,8 +1754,12 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; - struct devlink_sb *devlink_sb = info->user_ptr[1]; const struct devlink_ops *ops = devlink->ops; + struct devlink_sb *devlink_sb; + + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); if (ops->sb_occ_max_clear) return ops->sb_occ_max_clear(devlink, devlink_sb->index); @@ -2772,7 +2943,7 @@ static void devlink_reload_netns_change(struct devlink *devlink, DEVLINK_CMD_PARAM_NEW); } -static bool devlink_reload_supported(struct devlink *devlink) +static bool devlink_reload_supported(const struct devlink *devlink) { return devlink->ops->reload_down && devlink->ops->reload_up; } @@ -2818,7 +2989,7 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) struct net *dest_net = NULL; int err; - if (!devlink_reload_supported(devlink) || !devlink->reload_enabled) + if (!devlink_reload_supported(devlink)) return -EOPNOTSUPP; err = devlink_resources_validate(devlink, NULL, info); @@ -4388,6 +4559,14 @@ int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn) } EXPORT_SYMBOL_GPL(devlink_info_serial_number_put); +int devlink_info_board_serial_number_put(struct devlink_info_req *req, + const char *bsn) +{ + return nla_put_string(req->msg, DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER, + bsn); +} +EXPORT_SYMBOL_GPL(devlink_info_board_serial_number_put); + static int devlink_info_version_put(struct devlink_info_req *req, int attr, const char *version_name, const char *version_value) @@ -5141,6 +5320,7 @@ struct devlink_health_reporter { void *priv; const struct devlink_health_reporter_ops *ops; struct devlink *devlink; + struct devlink_port *devlink_port; struct devlink_fmsg *dump_fmsg; struct mutex dump_lock; /* lock parallel read/write from dump buffers */ u64 graceful_period; @@ -5163,18 +5343,98 @@ devlink_health_reporter_priv(struct devlink_health_reporter *reporter) EXPORT_SYMBOL_GPL(devlink_health_reporter_priv); static struct devlink_health_reporter * -devlink_health_reporter_find_by_name(struct devlink *devlink, - const char *reporter_name) +__devlink_health_reporter_find_by_name(struct list_head *reporter_list, + struct mutex *list_lock, + const char *reporter_name) { struct devlink_health_reporter *reporter; - lockdep_assert_held(&devlink->reporters_lock); - list_for_each_entry(reporter, &devlink->reporter_list, list) + lockdep_assert_held(list_lock); + list_for_each_entry(reporter, reporter_list, list) if (!strcmp(reporter->ops->name, reporter_name)) return reporter; return NULL; } +static struct devlink_health_reporter * +devlink_health_reporter_find_by_name(struct devlink *devlink, + const char *reporter_name) +{ + return __devlink_health_reporter_find_by_name(&devlink->reporter_list, + &devlink->reporters_lock, + reporter_name); +} + +static struct devlink_health_reporter * +devlink_port_health_reporter_find_by_name(struct devlink_port *devlink_port, + const char *reporter_name) +{ + return __devlink_health_reporter_find_by_name(&devlink_port->reporter_list, + &devlink_port->reporters_lock, + reporter_name); +} + +static struct devlink_health_reporter * +__devlink_health_reporter_create(struct devlink *devlink, + const struct devlink_health_reporter_ops *ops, + u64 graceful_period, void *priv) +{ + struct devlink_health_reporter *reporter; + + if (WARN_ON(graceful_period && !ops->recover)) + return ERR_PTR(-EINVAL); + + reporter = kzalloc(sizeof(*reporter), GFP_KERNEL); + if (!reporter) + return ERR_PTR(-ENOMEM); + + reporter->priv = priv; + reporter->ops = ops; + reporter->devlink = devlink; + reporter->graceful_period = graceful_period; + reporter->auto_recover = !!ops->recover; + reporter->auto_dump = !!ops->dump; + mutex_init(&reporter->dump_lock); + refcount_set(&reporter->refcount, 1); + return reporter; +} + +/** + * devlink_port_health_reporter_create - create devlink health reporter for + * specified port instance + * + * @port: devlink_port which should contain the new reporter + * @ops: ops + * @graceful_period: to avoid recovery loops, in msecs + * @priv: priv + */ +struct devlink_health_reporter * +devlink_port_health_reporter_create(struct devlink_port *port, + const struct devlink_health_reporter_ops *ops, + u64 graceful_period, void *priv) +{ + struct devlink_health_reporter *reporter; + + mutex_lock(&port->reporters_lock); + if (__devlink_health_reporter_find_by_name(&port->reporter_list, + &port->reporters_lock, ops->name)) { + reporter = ERR_PTR(-EEXIST); + goto unlock; + } + + reporter = __devlink_health_reporter_create(port->devlink, ops, + graceful_period, priv); + if (IS_ERR(reporter)) + goto unlock; + + reporter->devlink_port = port; + list_add_tail(&reporter->list, &port->reporter_list); +unlock: + mutex_unlock(&port->reporters_lock); + return reporter; +} +EXPORT_SYMBOL_GPL(devlink_port_health_reporter_create); + /** * devlink_health_reporter_create - create devlink health reporter * @@ -5196,25 +5456,11 @@ devlink_health_reporter_create(struct devlink *devlink, goto unlock; } - if (WARN_ON(graceful_period && !ops->recover)) { - reporter = ERR_PTR(-EINVAL); - goto unlock; - } - - reporter = kzalloc(sizeof(*reporter), GFP_KERNEL); - if (!reporter) { - reporter = ERR_PTR(-ENOMEM); + reporter = __devlink_health_reporter_create(devlink, ops, + graceful_period, priv); + if (IS_ERR(reporter)) goto unlock; - } - reporter->priv = priv; - reporter->ops = ops; - reporter->devlink = devlink; - reporter->graceful_period = graceful_period; - reporter->auto_recover = !!ops->recover; - reporter->auto_dump = !!ops->dump; - mutex_init(&reporter->dump_lock); - refcount_set(&reporter->refcount, 1); list_add_tail(&reporter->list, &devlink->reporter_list); unlock: mutex_unlock(&devlink->reporters_lock); @@ -5222,6 +5468,29 @@ unlock: } EXPORT_SYMBOL_GPL(devlink_health_reporter_create); +static void +devlink_health_reporter_free(struct devlink_health_reporter *reporter) +{ + mutex_destroy(&reporter->dump_lock); + if (reporter->dump_fmsg) + devlink_fmsg_free(reporter->dump_fmsg); + kfree(reporter); +} + +static void +devlink_health_reporter_put(struct devlink_health_reporter *reporter) +{ + if (refcount_dec_and_test(&reporter->refcount)) + devlink_health_reporter_free(reporter); +} + +static void +__devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) +{ + list_del(&reporter->list); + devlink_health_reporter_put(reporter); +} + /** * devlink_health_reporter_destroy - destroy devlink health reporter * @@ -5230,18 +5499,30 @@ EXPORT_SYMBOL_GPL(devlink_health_reporter_create); void devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) { - mutex_lock(&reporter->devlink->reporters_lock); - list_del(&reporter->list); - mutex_unlock(&reporter->devlink->reporters_lock); - while (refcount_read(&reporter->refcount) > 1) - msleep(100); - mutex_destroy(&reporter->dump_lock); - if (reporter->dump_fmsg) - devlink_fmsg_free(reporter->dump_fmsg); - kfree(reporter); + struct mutex *lock = &reporter->devlink->reporters_lock; + + mutex_lock(lock); + __devlink_health_reporter_destroy(reporter); + mutex_unlock(lock); } EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy); +/** + * devlink_port_health_reporter_destroy - destroy devlink port health reporter + * + * @reporter: devlink health reporter to destroy + */ +void +devlink_port_health_reporter_destroy(struct devlink_health_reporter *reporter) +{ + struct mutex *lock = &reporter->devlink_port->reporters_lock; + + mutex_lock(lock); + __devlink_health_reporter_destroy(reporter); + mutex_unlock(lock); +} +EXPORT_SYMBOL_GPL(devlink_port_health_reporter_destroy); + static int devlink_nl_health_reporter_fill(struct sk_buff *msg, struct devlink *devlink, @@ -5259,6 +5540,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg, if (devlink_nl_put_handle(msg, devlink)) goto genlmsg_cancel; + if (reporter->devlink_port) { + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, reporter->devlink_port->index)) + goto genlmsg_cancel; + } reporter_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_HEALTH_REPORTER); if (!reporter_attr) @@ -5466,17 +5751,28 @@ devlink_health_reporter_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) { struct devlink_health_reporter *reporter; + struct devlink_port *devlink_port; char *reporter_name; if (!attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]) return NULL; reporter_name = nla_data(attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]); - mutex_lock(&devlink->reporters_lock); - reporter = devlink_health_reporter_find_by_name(devlink, reporter_name); - if (reporter) - refcount_inc(&reporter->refcount); - mutex_unlock(&devlink->reporters_lock); + devlink_port = devlink_port_get_from_attrs(devlink, attrs); + if (IS_ERR(devlink_port)) { + mutex_lock(&devlink->reporters_lock); + reporter = devlink_health_reporter_find_by_name(devlink, reporter_name); + if (reporter) + refcount_inc(&reporter->refcount); + mutex_unlock(&devlink->reporters_lock); + } else { + mutex_lock(&devlink_port->reporters_lock); + reporter = devlink_port_health_reporter_find_by_name(devlink_port, reporter_name); + if (reporter) + refcount_inc(&reporter->refcount); + mutex_unlock(&devlink_port->reporters_lock); + } + return reporter; } @@ -5508,12 +5804,6 @@ unlock: return NULL; } -static void -devlink_health_reporter_put(struct devlink_health_reporter *reporter) -{ - refcount_dec(&reporter->refcount); -} - void devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, enum devlink_health_reporter_state state) @@ -5570,6 +5860,7 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { struct devlink_health_reporter *reporter; + struct devlink_port *port; struct devlink *devlink; int start = cb->args[0]; int idx = 0; @@ -5600,6 +5891,31 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, } mutex_unlock(&devlink->reporters_lock); } + + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + list_for_each_entry(port, &devlink->port_list, list) { + mutex_lock(&port->reporters_lock); + list_for_each_entry(reporter, &port->reporter_list, list) { + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_health_reporter_fill(msg, devlink, reporter, + DEVLINK_CMD_HEALTH_REPORTER_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + mutex_unlock(&port->reporters_lock); + goto out; + } + idx++; + } + mutex_unlock(&port->reporters_lock); + } + } out: mutex_unlock(&devlink_mutex); @@ -6107,7 +6423,7 @@ static int __devlink_trap_action_set(struct devlink *devlink, } err = devlink->ops->trap_action_set(devlink, trap_item->trap, - trap_action); + trap_action, extack); if (err) return err; @@ -6397,7 +6713,8 @@ static int devlink_trap_group_set(struct devlink *devlink, } policer = policer_item ? policer_item->policer : NULL; - err = devlink->ops->trap_group_set(devlink, group_item->group, policer); + err = devlink->ops->trap_group_set(devlink, group_item->group, policer, + extack); if (err) return err; @@ -6721,6 +7038,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_TRAP_POLICER_ID] = { .type = NLA_U32 }, [DEVLINK_ATTR_TRAP_POLICER_RATE] = { .type = NLA_U64 }, [DEVLINK_ATTR_TRAP_POLICER_BURST] = { .type = NLA_U64 }, + [DEVLINK_ATTR_PORT_FUNCTION] = { .type = NLA_NESTED }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -6729,7 +7047,6 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_get_doit, .dumpit = devlink_nl_cmd_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, { @@ -6752,24 +7069,20 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_port_split_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NO_LOCK, + .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_PORT_UNSPLIT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_port_unsplit_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NO_LOCK, + .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_SB_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_get_doit, .dumpit = devlink_nl_cmd_sb_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NEED_SB, /* can be retrieved by unprivileged users */ }, { @@ -6777,8 +7090,6 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_pool_get_doit, .dumpit = devlink_nl_cmd_sb_pool_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NEED_SB, /* can be retrieved by unprivileged users */ }, { @@ -6786,16 +7097,13 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_pool_set_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NEED_SB, }, { .cmd = DEVLINK_CMD_SB_PORT_POOL_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_port_pool_get_doit, .dumpit = devlink_nl_cmd_sb_port_pool_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT | - DEVLINK_NL_FLAG_NEED_SB, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, /* can be retrieved by unprivileged users */ }, { @@ -6803,16 +7111,14 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_port_pool_set_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT | - DEVLINK_NL_FLAG_NEED_SB, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, }, { .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_tc_pool_bind_get_doit, .dumpit = devlink_nl_cmd_sb_tc_pool_bind_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT | - DEVLINK_NL_FLAG_NEED_SB, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, /* can be retrieved by unprivileged users */ }, { @@ -6820,60 +7126,50 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_tc_pool_bind_set_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT | - DEVLINK_NL_FLAG_NEED_SB, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, }, { .cmd = DEVLINK_CMD_SB_OCC_SNAPSHOT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_occ_snapshot_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NEED_SB, }, { .cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_occ_max_clear_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NEED_SB, }, { .cmd = DEVLINK_CMD_ESWITCH_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_eswitch_get_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NO_LOCK, + .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_ESWITCH_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_eswitch_set_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NO_LOCK, + .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_DPIPE_TABLE_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_dpipe_table_get, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_DPIPE_ENTRIES_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_dpipe_entries_get, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_DPIPE_HEADERS_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_dpipe_headers_get, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, { @@ -6881,20 +7177,17 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_dpipe_table_counters_set, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { .cmd = DEVLINK_CMD_RESOURCE_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_resource_set, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { .cmd = DEVLINK_CMD_RESOURCE_DUMP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_resource_dump, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, { @@ -6902,15 +7195,13 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_reload, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NO_LOCK, + .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_PARAM_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_param_get_doit, .dumpit = devlink_nl_cmd_param_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, { @@ -6918,7 +7209,6 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_param_set_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { .cmd = DEVLINK_CMD_PORT_PARAM_GET, @@ -6941,21 +7231,18 @@ static const struct genl_ops devlink_nl_ops[] = { .doit = devlink_nl_cmd_region_get_doit, .dumpit = devlink_nl_cmd_region_get_dumpit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { .cmd = DEVLINK_CMD_REGION_NEW, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_region_new, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { .cmd = DEVLINK_CMD_REGION_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_region_del, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { .cmd = DEVLINK_CMD_REGION_READ, @@ -6963,14 +7250,12 @@ static const struct genl_ops devlink_nl_ops[] = { GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = devlink_nl_cmd_region_read_dumpit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { .cmd = DEVLINK_CMD_INFO_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_info_get_doit, .dumpit = devlink_nl_cmd_info_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, { @@ -6978,7 +7263,7 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_health_reporter_get_doit, .dumpit = devlink_nl_cmd_health_reporter_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT | DEVLINK_NL_FLAG_NO_LOCK, /* can be retrieved by unprivileged users */ }, @@ -6987,7 +7272,7 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_health_reporter_set_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT | DEVLINK_NL_FLAG_NO_LOCK, }, { @@ -6995,7 +7280,7 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_health_reporter_recover_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT | DEVLINK_NL_FLAG_NO_LOCK, }, { @@ -7003,7 +7288,7 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_health_reporter_diagnose_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT | DEVLINK_NL_FLAG_NO_LOCK, }, { @@ -7012,7 +7297,7 @@ static const struct genl_ops devlink_nl_ops[] = { GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = devlink_nl_cmd_health_reporter_dump_get_dumpit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT | DEVLINK_NL_FLAG_NO_LOCK, }, { @@ -7020,7 +7305,7 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_health_reporter_dump_clear_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT | DEVLINK_NL_FLAG_NO_LOCK, }, { @@ -7028,46 +7313,39 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_flash_update, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { .cmd = DEVLINK_CMD_TRAP_GET, .doit = devlink_nl_cmd_trap_get_doit, .dumpit = devlink_nl_cmd_trap_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_TRAP_SET, .doit = devlink_nl_cmd_trap_set_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { .cmd = DEVLINK_CMD_TRAP_GROUP_GET, .doit = devlink_nl_cmd_trap_group_get_doit, .dumpit = devlink_nl_cmd_trap_group_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_TRAP_GROUP_SET, .doit = devlink_nl_cmd_trap_group_set_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { .cmd = DEVLINK_CMD_TRAP_POLICER_GET, .doit = devlink_nl_cmd_trap_policer_get_doit, .dumpit = devlink_nl_cmd_trap_policer_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_TRAP_POLICER_SET, .doit = devlink_nl_cmd_trap_policer_set_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, }; @@ -7132,9 +7410,9 @@ EXPORT_SYMBOL_GPL(devlink_alloc); */ int devlink_register(struct devlink *devlink, struct device *dev) { - mutex_lock(&devlink_mutex); devlink->dev = dev; devlink->registered = true; + mutex_lock(&devlink_mutex); list_add_tail(&devlink->list, &devlink_list); devlink_notify(devlink, DEVLINK_CMD_NEW); mutex_unlock(&devlink_mutex); @@ -7280,6 +7558,8 @@ int devlink_port_register(struct devlink *devlink, list_add_tail(&devlink_port->list, &devlink->port_list); INIT_LIST_HEAD(&devlink_port->param_list); mutex_unlock(&devlink->lock); + INIT_LIST_HEAD(&devlink_port->reporter_list); + mutex_init(&devlink_port->reporters_lock); INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); devlink_port_type_warn_schedule(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); @@ -7296,6 +7576,8 @@ void devlink_port_unregister(struct devlink_port *devlink_port) { struct devlink *devlink = devlink_port->devlink; + WARN_ON(!list_empty(&devlink_port->reporter_list)); + mutex_destroy(&devlink_port->reporters_lock); devlink_port_type_warn_cancel(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); mutex_lock(&devlink->lock); @@ -7389,24 +7671,20 @@ void devlink_port_type_clear(struct devlink_port *devlink_port) EXPORT_SYMBOL_GPL(devlink_port_type_clear); static int __devlink_port_attrs_set(struct devlink_port *devlink_port, - enum devlink_port_flavour flavour, - const unsigned char *switch_id, - unsigned char switch_id_len) + enum devlink_port_flavour flavour) { struct devlink_port_attrs *attrs = &devlink_port->attrs; if (WARN_ON(devlink_port->registered)) return -EEXIST; - attrs->set = true; + devlink_port->attrs_set = true; attrs->flavour = flavour; - if (switch_id) { - attrs->switch_port = true; - if (WARN_ON(switch_id_len > MAX_PHYS_ITEM_ID_LEN)) - switch_id_len = MAX_PHYS_ITEM_ID_LEN; - memcpy(attrs->switch_id.id, switch_id, switch_id_len); - attrs->switch_id.id_len = switch_id_len; + if (attrs->switch_id.id_len) { + devlink_port->switch_port = true; + if (WARN_ON(attrs->switch_id.id_len > MAX_PHYS_ITEM_ID_LEN)) + attrs->switch_id.id_len = MAX_PHYS_ITEM_ID_LEN; } else { - attrs->switch_port = false; + devlink_port->switch_port = false; } return 0; } @@ -7415,33 +7693,18 @@ static int __devlink_port_attrs_set(struct devlink_port *devlink_port, * devlink_port_attrs_set - Set port attributes * * @devlink_port: devlink port - * @flavour: flavour of the port - * @port_number: number of the port that is facing user, for example - * the front panel port number - * @split: indicates if this is split port - * @split_subport_number: if the port is split, this is the number - * of subport. - * @switch_id: if the port is part of switch, this is buffer with ID, - * otwerwise this is NULL - * @switch_id_len: length of the switch_id buffer + * @attrs: devlink port attrs */ void devlink_port_attrs_set(struct devlink_port *devlink_port, - enum devlink_port_flavour flavour, - u32 port_number, bool split, - u32 split_subport_number, - const unsigned char *switch_id, - unsigned char switch_id_len) + struct devlink_port_attrs *attrs) { - struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; - ret = __devlink_port_attrs_set(devlink_port, flavour, - switch_id, switch_id_len); + devlink_port->attrs = *attrs; + ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); if (ret) return; - attrs->split = split; - attrs->phys.port_number = port_number; - attrs->phys.split_subport_number = split_subport_number; + WARN_ON(attrs->splittable && attrs->split); } EXPORT_SYMBOL_GPL(devlink_port_attrs_set); @@ -7450,20 +7713,14 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_set); * * @devlink_port: devlink port * @pf: associated PF for the devlink port instance - * @switch_id: if the port is part of switch, this is buffer with ID, - * otherwise this is NULL - * @switch_id_len: length of the switch_id buffer */ -void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, - const unsigned char *switch_id, - unsigned char switch_id_len, u16 pf) +void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; ret = __devlink_port_attrs_set(devlink_port, - DEVLINK_PORT_FLAVOUR_PCI_PF, - switch_id, switch_id_len); + DEVLINK_PORT_FLAVOUR_PCI_PF); if (ret) return; @@ -7477,21 +7734,15 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); * @devlink_port: devlink port * @pf: associated PF for the devlink port instance * @vf: associated VF of a PF for the devlink port instance - * @switch_id: if the port is part of switch, this is buffer with ID, - * otherwise this is NULL - * @switch_id_len: length of the switch_id buffer */ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, - const unsigned char *switch_id, - unsigned char switch_id_len, u16 pf, u16 vf) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; ret = __devlink_port_attrs_set(devlink_port, - DEVLINK_PORT_FLAVOUR_PCI_VF, - switch_id, switch_id_len); + DEVLINK_PORT_FLAVOUR_PCI_VF); if (ret) return; attrs->pci_vf.pf = pf; @@ -7505,7 +7756,7 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, struct devlink_port_attrs *attrs = &devlink_port->attrs; int n = 0; - if (!attrs->set) + if (!devlink_port->attrs_set) return -EOPNOTSUPP; switch (attrs->flavour) { @@ -8551,6 +8802,7 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(PTP_GENERAL, CONTROL), DEVLINK_TRAP(FLOW_ACTION_SAMPLE, CONTROL), DEVLINK_TRAP(FLOW_ACTION_TRAP, CONTROL), + DEVLINK_TRAP(EARLY_DROP, DROP), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -8800,7 +9052,8 @@ static void devlink_trap_disable(struct devlink *devlink, if (WARN_ON_ONCE(!trap_item)) return; - devlink->ops->trap_action_set(devlink, trap, DEVLINK_TRAP_ACTION_DROP); + devlink->ops->trap_action_set(devlink, trap, DEVLINK_TRAP_ACTION_DROP, + NULL); trap_item->action = DEVLINK_TRAP_ACTION_DROP; } @@ -9341,7 +9594,7 @@ int devlink_compat_switch_id_get(struct net_device *dev, * any devlink lock as only permanent values are accessed. */ devlink_port = netdev_to_devlink_port(dev); - if (!devlink_port || !devlink_port->attrs.switch_port) + if (!devlink_port || !devlink_port->switch_port) return -EOPNOTSUPP; memcpy(ppid, &devlink_port->attrs.switch_id, sizeof(*ppid)); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index bd7eba9066f8..51678a528f85 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -14,6 +14,20 @@ #include <net/sock.h> #include <net/fib_rules.h> #include <net/ip_tunnels.h> +#include <linux/indirect_call_wrapper.h> + +#ifdef CONFIG_IPV6_MULTIPLE_TABLES +#ifdef CONFIG_IP_MULTIPLE_TABLES +#define INDIRECT_CALL_MT(f, f2, f1, ...) \ + INDIRECT_CALL_INET(f, f2, f1, __VA_ARGS__) +#else +#define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__) +#endif +#elif defined(CONFIG_IP_MULTIPLE_TABLES) +#define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__) +#else +#define INDIRECT_CALL_MT(f, f2, f1, ...) f(__VA_ARGS__) +#endif static const struct fib_kuid_range fib_kuid_range_unset = { KUIDT_INIT(0), @@ -267,7 +281,10 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, uid_gt(fl->flowi_uid, rule->uid_range.end)) goto out; - ret = ops->match(rule, fl, flags); + ret = INDIRECT_CALL_MT(ops->match, + fib6_rule_match, + fib4_rule_match, + rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; } @@ -298,9 +315,15 @@ jumped: } else if (rule->action == FR_ACT_NOP) continue; else - err = ops->action(rule, fl, flags, arg); - - if (!err && ops->suppress && ops->suppress(rule, arg)) + err = INDIRECT_CALL_MT(ops->action, + fib6_rule_action, + fib4_rule_action, + rule, fl, flags, arg); + + if (!err && ops->suppress && INDIRECT_CALL_MT(ops->suppress, + fib6_rule_suppress, + fib4_rule_suppress, + rule, arg)) continue; if (err != -EAGAIN) { diff --git a/net/core/filter.c b/net/core/filter.c index 82e1b5b06167..7124f0fe6974 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -47,6 +47,7 @@ #include <linux/seccomp.h> #include <linux/if_vlan.h> #include <linux/bpf.h> +#include <linux/btf.h> #include <net/sch_generic.h> #include <net/cls_cgroup.h> #include <net/dst_metadata.h> @@ -73,6 +74,31 @@ #include <net/lwtunnel.h> #include <net/ipv6_stubs.h> #include <net/bpf_sk_storage.h> +#include <net/transp_v6.h> +#include <linux/btf_ids.h> + +int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len) +{ + if (in_compat_syscall()) { + struct compat_sock_fprog f32; + + if (len != sizeof(f32)) + return -EINVAL; + if (copy_from_sockptr(&f32, src, sizeof(f32))) + return -EFAULT; + memset(dst, 0, sizeof(*dst)); + dst->len = f32.len; + dst->filter = compat_ptr(f32.filter); + } else { + if (len != sizeof(*dst)) + return -EINVAL; + if (copy_from_sockptr(dst, src, sizeof(*dst))) + return -EFAULT; + } + + return 0; +} +EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user); /** * sk_filter_trim_cap - run a packet through a socket filter @@ -3777,7 +3803,9 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; -static int bpf_skb_output_btf_ids[5]; +BTF_ID_LIST(bpf_skb_output_btf_ids) +BTF_ID(struct, sk_buff) + const struct bpf_func_proto bpf_skb_output_proto = { .func = bpf_skb_event_output, .gpl_only = true, @@ -4171,7 +4199,9 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; -static int bpf_xdp_output_btf_ids[5]; +BTF_ID_LIST(bpf_xdp_output_btf_ids) +BTF_ID(struct, xdp_buff) + const struct bpf_func_proto bpf_xdp_output_proto = { .func = bpf_xdp_event_output, .gpl_only = true, @@ -4289,10 +4319,10 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen, u32 flags) { char devname[IFNAMSIZ]; + int val, valbool; struct net *net; int ifindex; int ret = 0; - int val; if (!sk_fullsock(sk)) return -EINVAL; @@ -4303,6 +4333,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, if (optlen != sizeof(int) && optname != SO_BINDTODEVICE) return -EINVAL; val = *((int *)optval); + valbool = val ? 1 : 0; /* Only some socketops are supported */ switch (optname) { @@ -4361,6 +4392,11 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, } ret = sock_bindtoindex(sk, ifindex, false); break; + case SO_KEEPALIVE: + if (sk->sk_prot->keepalive) + sk->sk_prot->keepalive(sk, valbool); + sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); + break; default: ret = -EINVAL; } @@ -4421,6 +4457,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, ret = tcp_set_congestion_control(sk, name, false, reinit, true); } else { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); if (optlen != sizeof(int)) @@ -4449,6 +4486,33 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, else tp->save_syn = val; break; + case TCP_KEEPIDLE: + ret = tcp_sock_set_keepidle_locked(sk, val); + break; + case TCP_KEEPINTVL: + if (val < 1 || val > MAX_TCP_KEEPINTVL) + ret = -EINVAL; + else + tp->keepalive_intvl = val * HZ; + break; + case TCP_KEEPCNT: + if (val < 1 || val > MAX_TCP_KEEPCNT) + ret = -EINVAL; + else + tp->keepalive_probes = val; + break; + case TCP_SYNCNT: + if (val < 1 || val > MAX_TCP_SYNCNT) + ret = -EINVAL; + else + icsk->icsk_syn_retries = val; + break; + case TCP_USER_TIMEOUT: + if (val < 0) + ret = -EINVAL; + else + icsk->icsk_user_timeout = val; + break; default: ret = -EINVAL; } @@ -6123,6 +6187,7 @@ bool bpf_helper_changes_pkt_data(void *func) } const struct bpf_func_proto bpf_event_output_data_proto __weak; +const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak; static const struct bpf_func_proto * sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) @@ -6155,6 +6220,8 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_curr_proto; #endif + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_cg_sock_proto; default: return bpf_base_func_proto(func_id); } @@ -6858,6 +6925,7 @@ static bool __sock_filter_check_attach_type(int off, case offsetof(struct bpf_sock, priority): switch (attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: goto full_access; default: return false; @@ -9187,6 +9255,189 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = { const struct bpf_prog_ops sk_reuseport_prog_ops = { }; + +DEFINE_STATIC_KEY_FALSE(bpf_sk_lookup_enabled); +EXPORT_SYMBOL(bpf_sk_lookup_enabled); + +BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx, + struct sock *, sk, u64, flags) +{ + if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE | + BPF_SK_LOOKUP_F_NO_REUSEPORT))) + return -EINVAL; + if (unlikely(sk && sk_is_refcounted(sk))) + return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */ + if (unlikely(sk && sk->sk_state == TCP_ESTABLISHED)) + return -ESOCKTNOSUPPORT; /* reject connected sockets */ + + /* Check if socket is suitable for packet L3/L4 protocol */ + if (sk && sk->sk_protocol != ctx->protocol) + return -EPROTOTYPE; + if (sk && sk->sk_family != ctx->family && + (sk->sk_family == AF_INET || ipv6_only_sock(sk))) + return -EAFNOSUPPORT; + + if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE)) + return -EEXIST; + + /* Select socket as lookup result */ + ctx->selected_sk = sk; + ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT; + return 0; +} + +static const struct bpf_func_proto bpf_sk_lookup_assign_proto = { + .func = bpf_sk_lookup_assign, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_SOCKET_OR_NULL, + .arg3_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; + case BPF_FUNC_sk_assign: + return &bpf_sk_lookup_assign_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static bool sk_lookup_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(struct bpf_sk_lookup)) + return false; + if (off % size != 0) + return false; + if (type != BPF_READ) + return false; + + switch (off) { + case offsetof(struct bpf_sk_lookup, sk): + info->reg_type = PTR_TO_SOCKET_OR_NULL; + return size == sizeof(__u64); + + case bpf_ctx_range(struct bpf_sk_lookup, family): + case bpf_ctx_range(struct bpf_sk_lookup, protocol): + case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4): + case bpf_ctx_range(struct bpf_sk_lookup, local_ip4): + case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): + case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): + case bpf_ctx_range(struct bpf_sk_lookup, remote_port): + case bpf_ctx_range(struct bpf_sk_lookup, local_port): + bpf_ctx_record_field_size(info, sizeof(__u32)); + return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32)); + + default: + return false; + } +} + +static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sk_lookup, sk): + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, selected_sk)); + break; + + case offsetof(struct bpf_sk_lookup, family): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + family, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, protocol): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + protocol, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, remote_ip4): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + v4.saddr, 4, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, local_ip4): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + v4.daddr, 4, target_size)); + break; + + case bpf_ctx_range_till(struct bpf_sk_lookup, + remote_ip6[0], remote_ip6[3]): { +#if IS_ENABLED(CONFIG_IPV6) + int off = si->off; + + off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]); + off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, v6.saddr)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + } + case bpf_ctx_range_till(struct bpf_sk_lookup, + local_ip6[0], local_ip6[3]): { +#if IS_ENABLED(CONFIG_IPV6) + int off = si->off; + + off -= offsetof(struct bpf_sk_lookup, local_ip6[0]); + off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, v6.daddr)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + } + case offsetof(struct bpf_sk_lookup, remote_port): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + sport, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, local_port): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + dport, 2, target_size)); + break; + } + + return insn - insn_buf; +} + +const struct bpf_prog_ops sk_lookup_prog_ops = { +}; + +const struct bpf_verifier_ops sk_lookup_verifier_ops = { + .get_func_proto = sk_lookup_func_proto, + .is_valid_access = sk_lookup_is_valid_access, + .convert_ctx_access = sk_lookup_convert_ctx_access, +}; + #endif /* CONFIG_INET */ DEFINE_BPF_DISPATCHER(xdp) @@ -9195,3 +9446,132 @@ void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) { bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog); } + +#ifdef CONFIG_DEBUG_INFO_BTF +BTF_ID_LIST_GLOBAL(btf_sock_ids) +#define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type) +BTF_SOCK_TYPE_xxx +#undef BTF_SOCK_TYPE +#else +u32 btf_sock_ids[MAX_BTF_SOCK_TYPE]; +#endif + +static bool check_arg_btf_id(u32 btf_id, u32 arg) +{ + int i; + + /* only one argument, no need to check arg */ + for (i = 0; i < MAX_BTF_SOCK_TYPE; i++) + if (btf_sock_ids[i] == btf_id) + return true; + return false; +} + +BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) +{ + /* tcp6_sock type is not generated in dwarf and hence btf, + * trigger an explicit type generation here. + */ + BTF_TYPE_EMIT(struct tcp6_sock); + if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && + sk->sk_family == AF_INET6) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { + .func = bpf_skc_to_tcp6_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6], +}; + +BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk) +{ + if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = { + .func = bpf_skc_to_tcp_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP], +}; + +BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk) +{ +#ifdef CONFIG_INET + if (sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT) + return (unsigned long)sk; +#endif + +#if IS_BUILTIN(CONFIG_IPV6) + if (sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT) + return (unsigned long)sk; +#endif + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = { + .func = bpf_skc_to_tcp_timewait_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW], +}; + +BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk) +{ +#ifdef CONFIG_INET + if (sk->sk_prot == &tcp_prot && sk->sk_state == TCP_NEW_SYN_RECV) + return (unsigned long)sk; +#endif + +#if IS_BUILTIN(CONFIG_IPV6) + if (sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV) + return (unsigned long)sk; +#endif + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = { + .func = bpf_skc_to_tcp_request_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ], +}; + +BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk) +{ + /* udp6_sock type is not generated in dwarf and hence btf, + * trigger an explicit type generation here. + */ + BTF_TYPE_EMIT(struct udp6_sock); + if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_UDP && + sk->sk_type == SOCK_DGRAM && sk->sk_family == AF_INET6) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = { + .func = bpf_skc_to_udp6_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6], +}; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 142a8824f0a8..29806eb765cf 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -383,6 +383,23 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, } EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); +void skb_flow_dissect_hash(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct flow_dissector_key_hash *key; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_HASH)) + return; + + key = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_HASH, + target_container); + + key->hash = skb_get_hash_raw(skb); +} +EXPORT_SYMBOL(skb_flow_dissect_hash); + static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, struct flow_dissector *flow_dissector, diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 2076219b8ba5..d4474c812b64 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -430,7 +430,7 @@ EXPORT_SYMBOL(flow_indr_dev_unregister); static void flow_block_indr_init(struct flow_block_cb *flow_block, struct flow_block_offload *bo, - struct net_device *dev, void *data, + struct net_device *dev, struct Qdisc *sch, void *data, void *cb_priv, void (*cleanup)(struct flow_block_cb *block_cb)) { @@ -438,6 +438,7 @@ static void flow_block_indr_init(struct flow_block_cb *flow_block, flow_block->indr.data = data; flow_block->indr.cb_priv = cb_priv; flow_block->indr.dev = dev; + flow_block->indr.sch = sch; flow_block->indr.cleanup = cleanup; } @@ -445,7 +446,8 @@ struct flow_block_cb *flow_indr_block_cb_alloc(flow_setup_cb_t *cb, void *cb_ident, void *cb_priv, void (*release)(void *cb_priv), struct flow_block_offload *bo, - struct net_device *dev, void *data, + struct net_device *dev, + struct Qdisc *sch, void *data, void *indr_cb_priv, void (*cleanup)(struct flow_block_cb *block_cb)) { @@ -455,7 +457,7 @@ struct flow_block_cb *flow_indr_block_cb_alloc(flow_setup_cb_t *cb, if (IS_ERR(block_cb)) goto out; - flow_block_indr_init(block_cb, bo, dev, data, indr_cb_priv, cleanup); + flow_block_indr_init(block_cb, bo, dev, sch, data, indr_cb_priv, cleanup); list_add(&block_cb->indr.list, &flow_block_indr_list); out: @@ -463,7 +465,7 @@ out: } EXPORT_SYMBOL(flow_indr_block_cb_alloc); -int flow_indr_dev_setup_offload(struct net_device *dev, +int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch, enum tc_setup_type type, void *data, struct flow_block_offload *bo, void (*cleanup)(struct flow_block_cb *block_cb)) @@ -472,7 +474,7 @@ int flow_indr_dev_setup_offload(struct net_device *dev, mutex_lock(&flow_indr_block_lock); list_for_each_entry(this, &flow_block_indr_dev_list, list) - this->cb(dev, this->cb_priv, type, bo, data, cleanup); + this->cb(dev, sch, this->cb_priv, type, bo, data, cleanup); mutex_unlock(&flow_indr_block_lock); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index ef6b5a8f629c..8e39e28b0a8d 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1783,6 +1783,7 @@ const struct nla_policy nda_policy[NDA_MAX+1] = { [NDA_MASTER] = { .type = NLA_U32 }, [NDA_PROTOCOL] = { .type = NLA_U8 }, [NDA_NH_ID] = { .type = NLA_U32 }, + [NDA_FDB_EXT_ATTRS] = { .type = NLA_NESTED }, }; static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 85a4b0101f76..68e0682450c6 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1000,6 +1000,16 @@ static size_t rtnl_prop_list_size(const struct net_device *dev) return size; } +static size_t rtnl_proto_down_size(const struct net_device *dev) +{ + size_t size = nla_total_size(1); + + if (dev->proto_down_reason) + size += nla_total_size(0) + nla_total_size(4); + + return size; +} + static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { @@ -1041,7 +1051,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(4) /* IFLA_NEW_NETNSID */ + nla_total_size(4) /* IFLA_NEW_IFINDEX */ - + nla_total_size(1) /* IFLA_PROTO_DOWN */ + + rtnl_proto_down_size(dev) /* proto down */ + nla_total_size(4) /* IFLA_TARGET_NETNSID */ + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ @@ -1416,13 +1426,12 @@ static u32 rtnl_xdp_prog_skb(struct net_device *dev) static u32 rtnl_xdp_prog_drv(struct net_device *dev) { - return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG); + return dev_xdp_prog_id(dev, XDP_MODE_DRV); } static u32 rtnl_xdp_prog_hw(struct net_device *dev) { - return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, - XDP_QUERY_PROG_HW); + return dev_xdp_prog_id(dev, XDP_MODE_HW); } static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev, @@ -1658,6 +1667,35 @@ nest_cancel: return ret; } +static int rtnl_fill_proto_down(struct sk_buff *skb, + const struct net_device *dev) +{ + struct nlattr *pr; + u32 preason; + + if (nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) + goto nla_put_failure; + + preason = dev->proto_down_reason; + if (!preason) + return 0; + + pr = nla_nest_start(skb, IFLA_PROTO_DOWN_REASON); + if (!pr) + return -EMSGSIZE; + + if (nla_put_u32(skb, IFLA_PROTO_DOWN_REASON_VALUE, preason)) { + nla_nest_cancel(skb, pr); + goto nla_put_failure; + } + + nla_nest_end(skb, pr); + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, @@ -1708,13 +1746,15 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_up_count) + atomic_read(&dev->carrier_down_count)) || - nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down) || nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, atomic_read(&dev->carrier_up_count)) || nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, atomic_read(&dev->carrier_down_count))) goto nla_put_failure; + if (rtnl_fill_proto_down(skb, dev)) + goto nla_put_failure; + if (event != IFLA_EVENT_NONE) { if (nla_put_u32(skb, IFLA_EVENT, event)) goto nla_put_failure; @@ -1834,6 +1874,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_ALT_IFNAME] = { .type = NLA_STRING, .len = ALTIFNAMSIZ - 1 }, [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT }, + [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2483,6 +2524,67 @@ static int do_set_master(struct net_device *dev, int ifindex, return 0; } +static const struct nla_policy ifla_proto_down_reason_policy[IFLA_PROTO_DOWN_REASON_VALUE + 1] = { + [IFLA_PROTO_DOWN_REASON_MASK] = { .type = NLA_U32 }, + [IFLA_PROTO_DOWN_REASON_VALUE] = { .type = NLA_U32 }, +}; + +static int do_set_proto_down(struct net_device *dev, + struct nlattr *nl_proto_down, + struct nlattr *nl_proto_down_reason, + struct netlink_ext_ack *extack) +{ + struct nlattr *pdreason[IFLA_PROTO_DOWN_REASON_MAX + 1]; + const struct net_device_ops *ops = dev->netdev_ops; + unsigned long mask = 0; + u32 value; + bool proto_down; + int err; + + if (!ops->ndo_change_proto_down) { + NL_SET_ERR_MSG(extack, "Protodown not supported by device"); + return -EOPNOTSUPP; + } + + if (nl_proto_down_reason) { + err = nla_parse_nested_deprecated(pdreason, + IFLA_PROTO_DOWN_REASON_MAX, + nl_proto_down_reason, + ifla_proto_down_reason_policy, + NULL); + if (err < 0) + return err; + + if (!pdreason[IFLA_PROTO_DOWN_REASON_VALUE]) { + NL_SET_ERR_MSG(extack, "Invalid protodown reason value"); + return -EINVAL; + } + + value = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_VALUE]); + + if (pdreason[IFLA_PROTO_DOWN_REASON_MASK]) + mask = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_MASK]); + + dev_change_proto_down_reason(dev, mask, value); + } + + if (nl_proto_down) { + proto_down = nla_get_u8(nl_proto_down); + + /* Dont turn off protodown if there are active reasons */ + if (!proto_down && dev->proto_down_reason) { + NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons"); + return -EBUSY; + } + err = dev_change_proto_down(dev, + proto_down); + if (err) + return err; + } + + return 0; +} + #define DO_SETLINK_MODIFIED 0x01 /* notify flag means notify + modified. */ #define DO_SETLINK_NOTIFY 0x03 @@ -2771,9 +2873,9 @@ static int do_setlink(const struct sk_buff *skb, } err = 0; - if (tb[IFLA_PROTO_DOWN]) { - err = dev_change_proto_down(dev, - nla_get_u8(tb[IFLA_PROTO_DOWN])); + if (tb[IFLA_PROTO_DOWN] || tb[IFLA_PROTO_DOWN_REASON]) { + err = do_set_proto_down(dev, tb[IFLA_PROTO_DOWN], + tb[IFLA_PROTO_DOWN_REASON], extack); if (err) goto errout; status |= DO_SETLINK_NOTIFY; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b8afefe6f6b6..2828f6d5ba89 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3758,7 +3758,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, int err = -ENOMEM; int i = 0; int pos; - int dummy; if (list_skb && !list_skb->head_frag && skb_headlen(list_skb) && (skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY)) { @@ -3780,7 +3779,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, } __skb_push(head_skb, doffset); - proto = skb_network_protocol(head_skb, &dummy); + proto = skb_network_protocol(head_skb, NULL); if (unlikely(!proto)) return ERR_PTR(-EINVAL); @@ -4413,7 +4412,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) * at the moment even if they are anonymous). */ if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && - __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL) + !__pskb_pull_tail(skb, __skb_pagelen(skb))) return -ENOMEM; /* Easy case. Most of packets will go this way. */ @@ -4692,7 +4691,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && sk->sk_protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM) { - skb = tcp_get_timestamping_opt_stats(sk); + skb = tcp_get_timestamping_opt_stats(sk, orig_skb); opt_stats = true; } else #endif diff --git a/net/core/sock.c b/net/core/sock.c index 8ccdcdaaa673..49cd5ffe673e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -113,6 +113,7 @@ #include <linux/static_key.h> #include <linux/memcontrol.h> #include <linux/prefetch.h> +#include <linux/compat.h> #include <linux/uaccess.h> @@ -360,7 +361,8 @@ static int sock_get_timeout(long timeo, void *optval, bool old_timeval) return sizeof(tv); } -static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval) +static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, + bool old_timeval) { struct __kernel_sock_timeval tv; @@ -370,7 +372,7 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool if (optlen < sizeof(tv32)) return -EINVAL; - if (copy_from_user(&tv32, optval, sizeof(tv32))) + if (copy_from_sockptr(&tv32, optval, sizeof(tv32))) return -EFAULT; tv.tv_sec = tv32.tv_sec; tv.tv_usec = tv32.tv_usec; @@ -379,14 +381,14 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool if (optlen < sizeof(old_tv)) return -EINVAL; - if (copy_from_user(&old_tv, optval, sizeof(old_tv))) + if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) return -EFAULT; tv.tv_sec = old_tv.tv_sec; tv.tv_usec = old_tv.tv_usec; } else { if (optlen < sizeof(tv)) return -EINVAL; - if (copy_from_user(&tv, optval, sizeof(tv))) + if (copy_from_sockptr(&tv, optval, sizeof(tv))) return -EFAULT; } if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) @@ -608,8 +610,7 @@ int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) } EXPORT_SYMBOL(sock_bindtoindex); -static int sock_setbindtodevice(struct sock *sk, char __user *optval, - int optlen) +static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES @@ -631,7 +632,7 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, memset(devname, 0, sizeof(devname)); ret = -EFAULT; - if (copy_from_user(devname, optval, optlen)) + if (copy_from_sockptr(devname, optval, optlen)) goto out; index = 0; @@ -695,15 +696,6 @@ out: return ret; } -static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit, - int valbool) -{ - if (valbool) - sock_set_flag(sk, bit); - else - sock_reset_flag(sk, bit); -} - bool sk_mc_loop(struct sock *sk) { if (dev_recursion_level()) @@ -834,7 +826,7 @@ EXPORT_SYMBOL(sock_set_rcvbuf); */ int sock_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock_txtime sk_txtime; struct sock *sk = sock->sk; @@ -853,7 +845,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; valbool = val ? 1 : 0; @@ -966,7 +958,7 @@ set_sndbuf: ret = -EINVAL; /* 1003.1g */ break; } - if (copy_from_user(&ling, optval, sizeof(ling))) { + if (copy_from_sockptr(&ling, optval, sizeof(ling))) { ret = -EFAULT; break; } @@ -1060,60 +1052,52 @@ set_sndbuf: case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: - ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD); + ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, + optlen, optname == SO_RCVTIMEO_OLD); break; case SO_SNDTIMEO_OLD: case SO_SNDTIMEO_NEW: - ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD); + ret = sock_set_timeout(&sk->sk_sndtimeo, optval, + optlen, optname == SO_SNDTIMEO_OLD); break; - case SO_ATTACH_FILTER: - ret = -EINVAL; - if (optlen == sizeof(struct sock_fprog)) { - struct sock_fprog fprog; - - ret = -EFAULT; - if (copy_from_user(&fprog, optval, sizeof(fprog))) - break; + case SO_ATTACH_FILTER: { + struct sock_fprog fprog; + ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); + if (!ret) ret = sk_attach_filter(&fprog, sk); - } break; - + } case SO_ATTACH_BPF: ret = -EINVAL; if (optlen == sizeof(u32)) { u32 ufd; ret = -EFAULT; - if (copy_from_user(&ufd, optval, sizeof(ufd))) + if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break; ret = sk_attach_bpf(ufd, sk); } break; - case SO_ATTACH_REUSEPORT_CBPF: - ret = -EINVAL; - if (optlen == sizeof(struct sock_fprog)) { - struct sock_fprog fprog; - - ret = -EFAULT; - if (copy_from_user(&fprog, optval, sizeof(fprog))) - break; + case SO_ATTACH_REUSEPORT_CBPF: { + struct sock_fprog fprog; + ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); + if (!ret) ret = sk_reuseport_attach_filter(&fprog, sk); - } break; - + } case SO_ATTACH_REUSEPORT_EBPF: ret = -EINVAL; if (optlen == sizeof(u32)) { u32 ufd; ret = -EFAULT; - if (copy_from_user(&ufd, optval, sizeof(ufd))) + if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break; ret = sk_reuseport_attach_bpf(ufd, sk); @@ -1193,7 +1177,7 @@ set_sndbuf: if (sizeof(ulval) != sizeof(val) && optlen >= sizeof(ulval) && - get_user(ulval, (unsigned long __user *)optval)) { + copy_from_sockptr(&ulval, optval, sizeof(ulval))) { ret = -EFAULT; break; } @@ -1236,7 +1220,7 @@ set_sndbuf: if (optlen != sizeof(struct sock_txtime)) { ret = -EINVAL; break; - } else if (copy_from_user(&sk_txtime, optval, + } else if (copy_from_sockptr(&sk_txtime, optval, sizeof(struct sock_txtime))) { ret = -EFAULT; break; @@ -2802,20 +2786,6 @@ int sock_no_shutdown(struct socket *sock, int how) } EXPORT_SYMBOL(sock_no_shutdown); -int sock_no_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) -{ - return -EOPNOTSUPP; -} -EXPORT_SYMBOL(sock_no_setsockopt); - -int sock_no_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) -{ - return -EOPNOTSUPP; -} -EXPORT_SYMBOL(sock_no_getsockopt); - int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) { return -EOPNOTSUPP; @@ -3243,20 +3213,6 @@ int sock_common_getsockopt(struct socket *sock, int level, int optname, } EXPORT_SYMBOL(sock_common_getsockopt); -#ifdef CONFIG_COMPAT -int compat_sock_common_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) -{ - struct sock *sk = sock->sk; - - if (sk->sk_prot->compat_getsockopt != NULL) - return sk->sk_prot->compat_getsockopt(sk, level, optname, - optval, optlen); - return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); -} -EXPORT_SYMBOL(compat_sock_common_getsockopt); -#endif - int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { @@ -3276,7 +3232,7 @@ EXPORT_SYMBOL(sock_common_recvmsg); * Set socket options on an inet socket. */ int sock_common_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; @@ -3284,20 +3240,6 @@ int sock_common_setsockopt(struct socket *sock, int level, int optname, } EXPORT_SYMBOL(sock_common_setsockopt); -#ifdef CONFIG_COMPAT -int compat_sock_common_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) -{ - struct sock *sk = sock->sk; - - if (sk->sk_prot->compat_setsockopt != NULL) - return sk->sk_prot->compat_setsockopt(sk, level, optname, - optval, optlen); - return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); -} -EXPORT_SYMBOL(compat_sock_common_setsockopt); -#endif - void sk_common_release(struct sock *sk) { if (sk->sk_prot->destroy) @@ -3596,6 +3538,7 @@ int sock_load_diag_module(int family, int protocol) #ifdef CONFIG_INET if (family == AF_INET && protocol != IPPROTO_RAW && + protocol < MAX_INET_PROTOS && !rcu_access_pointer(inet_protos[protocol])) return -ENOENT; #endif diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 0971f17e8e54..119f52a99dc1 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -681,6 +681,7 @@ const struct bpf_func_proto bpf_msg_redirect_map_proto = { .arg4_type = ARG_ANYTHING, }; +static int sock_map_btf_id; const struct bpf_map_ops sock_map_ops = { .map_alloc = sock_map_alloc, .map_free = sock_map_free, @@ -691,9 +692,11 @@ const struct bpf_map_ops sock_map_ops = { .map_lookup_elem = sock_map_lookup, .map_release_uref = sock_map_release_progs, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_stab", + .map_btf_id = &sock_map_btf_id, }; -struct bpf_htab_elem { +struct bpf_shtab_elem { struct rcu_head rcu; u32 hash; struct sock *sk; @@ -701,14 +704,14 @@ struct bpf_htab_elem { u8 key[]; }; -struct bpf_htab_bucket { +struct bpf_shtab_bucket { struct hlist_head head; raw_spinlock_t lock; }; -struct bpf_htab { +struct bpf_shtab { struct bpf_map map; - struct bpf_htab_bucket *buckets; + struct bpf_shtab_bucket *buckets; u32 buckets_num; u32 elem_size; struct sk_psock_progs progs; @@ -720,17 +723,17 @@ static inline u32 sock_hash_bucket_hash(const void *key, u32 len) return jhash(key, len, 0); } -static struct bpf_htab_bucket *sock_hash_select_bucket(struct bpf_htab *htab, - u32 hash) +static struct bpf_shtab_bucket *sock_hash_select_bucket(struct bpf_shtab *htab, + u32 hash) { return &htab->buckets[hash & (htab->buckets_num - 1)]; } -static struct bpf_htab_elem * +static struct bpf_shtab_elem * sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, u32 key_size) { - struct bpf_htab_elem *elem; + struct bpf_shtab_elem *elem; hlist_for_each_entry_rcu(elem, head, node) { if (elem->hash == hash && @@ -743,10 +746,10 @@ sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 key_size = map->key_size, hash; - struct bpf_htab_bucket *bucket; - struct bpf_htab_elem *elem; + struct bpf_shtab_bucket *bucket; + struct bpf_shtab_elem *elem; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -757,8 +760,8 @@ static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) return elem ? elem->sk : NULL; } -static void sock_hash_free_elem(struct bpf_htab *htab, - struct bpf_htab_elem *elem) +static void sock_hash_free_elem(struct bpf_shtab *htab, + struct bpf_shtab_elem *elem) { atomic_dec(&htab->count); kfree_rcu(elem, rcu); @@ -767,9 +770,9 @@ static void sock_hash_free_elem(struct bpf_htab *htab, static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk, void *link_raw) { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct bpf_htab_elem *elem_probe, *elem = link_raw; - struct bpf_htab_bucket *bucket; + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); + struct bpf_shtab_elem *elem_probe, *elem = link_raw; + struct bpf_shtab_bucket *bucket; WARN_ON_ONCE(!rcu_read_lock_held()); bucket = sock_hash_select_bucket(htab, elem->hash); @@ -791,10 +794,10 @@ static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk, static int sock_hash_delete_elem(struct bpf_map *map, void *key) { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 hash, key_size = map->key_size; - struct bpf_htab_bucket *bucket; - struct bpf_htab_elem *elem; + struct bpf_shtab_bucket *bucket; + struct bpf_shtab_elem *elem; int ret = -ENOENT; hash = sock_hash_bucket_hash(key, key_size); @@ -812,12 +815,12 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key) return ret; } -static struct bpf_htab_elem *sock_hash_alloc_elem(struct bpf_htab *htab, - void *key, u32 key_size, - u32 hash, struct sock *sk, - struct bpf_htab_elem *old) +static struct bpf_shtab_elem *sock_hash_alloc_elem(struct bpf_shtab *htab, + void *key, u32 key_size, + u32 hash, struct sock *sk, + struct bpf_shtab_elem *old) { - struct bpf_htab_elem *new; + struct bpf_shtab_elem *new; if (atomic_inc_return(&htab->count) > htab->map.max_entries) { if (!old) { @@ -841,10 +844,10 @@ static struct bpf_htab_elem *sock_hash_alloc_elem(struct bpf_htab *htab, static int sock_hash_update_common(struct bpf_map *map, void *key, struct sock *sk, u64 flags) { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 key_size = map->key_size, hash; - struct bpf_htab_elem *elem, *elem_new; - struct bpf_htab_bucket *bucket; + struct bpf_shtab_elem *elem, *elem_new; + struct bpf_shtab_bucket *bucket; struct sk_psock_link *link; struct sk_psock *psock; int ret; @@ -954,8 +957,8 @@ out: static int sock_hash_get_next_key(struct bpf_map *map, void *key, void *key_next) { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct bpf_htab_elem *elem, *elem_next; + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); + struct bpf_shtab_elem *elem, *elem_next; u32 hash, key_size = map->key_size; struct hlist_head *head; int i = 0; @@ -969,7 +972,7 @@ static int sock_hash_get_next_key(struct bpf_map *map, void *key, goto find_first_elem; elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&elem->node)), - struct bpf_htab_elem, node); + struct bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); return 0; @@ -981,7 +984,7 @@ find_first_elem: for (; i < htab->buckets_num; i++) { head = &sock_hash_select_bucket(htab, i)->head; elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), - struct bpf_htab_elem, node); + struct bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); return 0; @@ -993,7 +996,7 @@ find_first_elem: static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) { - struct bpf_htab *htab; + struct bpf_shtab *htab; int i, err; u64 cost; @@ -1015,15 +1018,15 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) bpf_map_init_from_attr(&htab->map, attr); htab->buckets_num = roundup_pow_of_two(htab->map.max_entries); - htab->elem_size = sizeof(struct bpf_htab_elem) + + htab->elem_size = sizeof(struct bpf_shtab_elem) + round_up(htab->map.key_size, 8); if (htab->buckets_num == 0 || - htab->buckets_num > U32_MAX / sizeof(struct bpf_htab_bucket)) { + htab->buckets_num > U32_MAX / sizeof(struct bpf_shtab_bucket)) { err = -EINVAL; goto free_htab; } - cost = (u64) htab->buckets_num * sizeof(struct bpf_htab_bucket) + + cost = (u64) htab->buckets_num * sizeof(struct bpf_shtab_bucket) + (u64) htab->elem_size * htab->map.max_entries; if (cost >= U32_MAX - PAGE_SIZE) { err = -EINVAL; @@ -1034,7 +1037,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) goto free_htab; htab->buckets = bpf_map_area_alloc(htab->buckets_num * - sizeof(struct bpf_htab_bucket), + sizeof(struct bpf_shtab_bucket), htab->map.numa_node); if (!htab->buckets) { bpf_map_charge_finish(&htab->map.memory); @@ -1055,10 +1058,10 @@ free_htab: static void sock_hash_free(struct bpf_map *map) { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct bpf_htab_bucket *bucket; + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); + struct bpf_shtab_bucket *bucket; struct hlist_head unlink_list; - struct bpf_htab_elem *elem; + struct bpf_shtab_elem *elem; struct hlist_node *node; int i; @@ -1134,7 +1137,7 @@ static void *sock_hash_lookup(struct bpf_map *map, void *key) static void sock_hash_release_progs(struct bpf_map *map) { - psock_progs_drop(&container_of(map, struct bpf_htab, map)->progs); + psock_progs_drop(&container_of(map, struct bpf_shtab, map)->progs); } BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops, @@ -1214,6 +1217,7 @@ const struct bpf_func_proto bpf_msg_redirect_hash_proto = { .arg4_type = ARG_ANYTHING, }; +static int sock_hash_map_btf_id; const struct bpf_map_ops sock_hash_ops = { .map_alloc = sock_hash_alloc, .map_free = sock_hash_free, @@ -1224,6 +1228,8 @@ const struct bpf_map_ops sock_hash_ops = { .map_lookup_elem_sys_only = sock_hash_lookup_sys, .map_release_uref = sock_hash_release_progs, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_shtab", + .map_btf_id = &sock_hash_map_btf_id, }; static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) @@ -1232,7 +1238,7 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) case BPF_MAP_TYPE_SOCKMAP: return &container_of(map, struct bpf_stab, map)->progs; case BPF_MAP_TYPE_SOCKHASH: - return &container_of(map, struct bpf_htab, map)->progs; + return &container_of(map, struct bpf_shtab, map)->progs; default: break; } diff --git a/net/core/tso.c b/net/core/tso.c index d4d5c077ad72..4148f6d48953 100644 --- a/net/core/tso.c +++ b/net/core/tso.c @@ -6,18 +6,17 @@ #include <asm/unaligned.h> /* Calculate expected number of TX descriptors */ -int tso_count_descs(struct sk_buff *skb) +int tso_count_descs(const struct sk_buff *skb) { /* The Marvell Way */ return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags; } EXPORT_SYMBOL(tso_count_descs); -void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso, +void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso, int size, bool is_last) { - struct tcphdr *tcph; - int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + int hdr_len = skb_transport_offset(skb) + tso->tlen; int mac_hdr_len = skb_network_offset(skb); memcpy(hdr, skb->data, hdr_len); @@ -30,23 +29,31 @@ void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso, } else { struct ipv6hdr *iph = (void *)(hdr + mac_hdr_len); - iph->payload_len = htons(size + tcp_hdrlen(skb)); + iph->payload_len = htons(size + tso->tlen); } - tcph = (struct tcphdr *)(hdr + skb_transport_offset(skb)); - put_unaligned_be32(tso->tcp_seq, &tcph->seq); + hdr += skb_transport_offset(skb); + if (tso->tlen != sizeof(struct udphdr)) { + struct tcphdr *tcph = (struct tcphdr *)hdr; - if (!is_last) { - /* Clear all special flags for not last packet */ - tcph->psh = 0; - tcph->fin = 0; - tcph->rst = 0; + put_unaligned_be32(tso->tcp_seq, &tcph->seq); + + if (!is_last) { + /* Clear all special flags for not last packet */ + tcph->psh = 0; + tcph->fin = 0; + tcph->rst = 0; + } + } else { + struct udphdr *uh = (struct udphdr *)hdr; + + uh->len = htons(sizeof(*uh) + size); } } EXPORT_SYMBOL(tso_build_hdr); -void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size) +void tso_build_data(const struct sk_buff *skb, struct tso_t *tso, int size) { - tso->tcp_seq += size; + tso->tcp_seq += size; /* not worth avoiding this operation for UDP */ tso->size -= size; tso->data += size; @@ -62,12 +69,14 @@ void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size) } EXPORT_SYMBOL(tso_build_data); -void tso_start(struct sk_buff *skb, struct tso_t *tso) +int tso_start(struct sk_buff *skb, struct tso_t *tso) { - int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + int tlen = skb_is_gso_tcp(skb) ? tcp_hdrlen(skb) : sizeof(struct udphdr); + int hdr_len = skb_transport_offset(skb) + tlen; + tso->tlen = tlen; tso->ip_id = ntohs(ip_hdr(skb)->id); - tso->tcp_seq = ntohl(tcp_hdr(skb)->seq); + tso->tcp_seq = (tlen != sizeof(struct udphdr)) ? ntohl(tcp_hdr(skb)->seq) : 0; tso->next_frag_idx = 0; tso->ipv6 = vlan_get_protocol(skb) == htons(ETH_P_IPV6); @@ -83,5 +92,6 @@ void tso_start(struct sk_buff *skb, struct tso_t *tso) tso->data = skb_frag_address(frag); tso->next_frag_idx++; } + return hdr_len; } EXPORT_SYMBOL(tso_start); diff --git a/net/core/xdp.c b/net/core/xdp.c index 3c45f99e26d5..48aba933a5a8 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -400,15 +400,6 @@ void __xdp_release_frame(void *data, struct xdp_mem_info *mem) } EXPORT_SYMBOL_GPL(__xdp_release_frame); -int xdp_attachment_query(struct xdp_attachment_info *info, - struct netdev_bpf *bpf) -{ - bpf->prog_id = info->prog ? info->prog->aux->id : 0; - bpf->prog_flags = info->prog ? info->flags : 0; - return 0; -} -EXPORT_SYMBOL_GPL(xdp_attachment_query); - bool xdp_attachment_flags_ok(struct xdp_attachment_info *info, struct netdev_bpf *bpf) { diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index d2a4553bcf39..84dde5a2066e 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1736,7 +1736,7 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device *netdev; struct dcbmsg *dcb = nlmsg_data(nlh); struct nlattr *tb[DCB_ATTR_MAX + 1]; - u32 portid = skb ? NETLINK_CB(skb).portid : 0; + u32 portid = NETLINK_CB(skb).portid; int ret = -EINVAL; struct sk_buff *reply_skb; struct nlmsghdr *reply_nlh = NULL; diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig index 51ac2631fb48..0c7d2f66ba27 100644 --- a/net/dccp/Kconfig +++ b/net/dccp/Kconfig @@ -5,7 +5,7 @@ menuconfig IP_DCCP help Datagram Congestion Control Protocol (RFC 4340) - From http://www.ietf.org/rfc/rfc4340.txt: + From https://www.ietf.org/rfc/rfc4340.txt: The Datagram Congestion Control Protocol (DCCP) is a transport protocol that implements bidirectional, unicast connections of diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index 4d7771f36eff..a3eeb84d16f9 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig @@ -26,13 +26,13 @@ config IP_DCCP_CCID3 relatively smooth sending rate is of importance. CCID-3 is further described in RFC 4342, - http://www.ietf.org/rfc/rfc4342.txt + https://www.ietf.org/rfc/rfc4342.txt The TFRC congestion control algorithms were initially described in RFC 5348. This text was extracted from RFC 4340 (sec. 10.2), - http://www.ietf.org/rfc/rfc4340.txt + https://www.ietf.org/rfc/rfc4340.txt If in doubt, say N. diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 9ef9bee9610f..aef72f6a2829 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -7,7 +7,7 @@ * An implementation of the DCCP protocol * * This code has been developed by the University of Waikato WAND - * research group. For further information please see http://www.wand.net.nz/ + * research group. For further information please see https://www.wand.net.nz/ * * This code also uses code from Lulea University, rereleased as GPL by its * authors: diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 081c195e7f7d..02e0fc9f6334 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -6,7 +6,7 @@ * An implementation of the DCCP protocol * * This code has been developed by the University of Waikato WAND - * research group. For further information please see http://www.wand.net.nz/ + * research group. For further information please see https://www.wand.net.nz/ * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz * * This code also uses code from Lulea University, rereleased as GPL by its diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 2d41bb036271..af08e2df7108 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -6,7 +6,7 @@ * An implementation of the DCCP protocol * * This code has been developed by the University of Waikato WAND - * research group. For further information please see http://www.wand.net.nz/ + * research group. For further information please see https://www.wand.net.nz/ * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz * * This code also uses code from Lulea University, rereleased as GPL by its @@ -365,6 +365,7 @@ void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) /** * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against + * @h: The non-empty RX history object */ static inline struct tfrc_rx_hist_entry * tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h) @@ -374,6 +375,7 @@ static inline struct tfrc_rx_hist_entry * /** * tfrc_rx_hist_rtt_prev_s - previously suitable (wrt rtt_last_s) RTT-sampling entry + * @h: The non-empty RX history object */ static inline struct tfrc_rx_hist_entry * tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h) diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index a157d874840b..159cc9326eab 100644 --- a/net/dccp/ccids/lib/packet_history.h +++ b/net/dccp/ccids/lib/packet_history.h @@ -6,7 +6,7 @@ * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand. * * This code has been developed by the University of Waikato WAND - * research group. For further information please see http://www.wand.net.nz/ + * research group. For further information please see https://www.wand.net.nz/ * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz * * This code also uses code from Lulea University, rereleased as GPL by its diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 7dce4f6c7025..9cc9d1ee6cdb 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -295,13 +295,7 @@ int dccp_disconnect(struct sock *sk, int flags); int dccp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int dccp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); -#ifdef CONFIG_COMPAT -int compat_dccp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen); -int compat_dccp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); -#endif + sockptr_t optval, unsigned int optlen); int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg); int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 9c3b5e056234..afc071ea1271 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -165,6 +165,8 @@ static const struct { /** * dccp_feat_index - Hash function to map feature number into array position + * @feat_num: feature to hash, one of %dccp_feature_numbers + * * Returns consecutive array index or -1 if the feature is not understood. */ static int dccp_feat_index(u8 feat_num) @@ -567,6 +569,8 @@ cloning_failed: /** * dccp_feat_valid_nn_length - Enforce length constraints on NN options + * @feat_num: feature to return length of, one of %dccp_feature_numbers + * * Length is between 0 and %DCCP_OPTVAL_MAXLEN. Used for outgoing packets only, * incoming options are accepted as long as their values are valid. */ @@ -1429,6 +1433,8 @@ int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq, /** * dccp_feat_init - Seed feature negotiation with host-specific defaults + * @sk: Socket to initialize. + * * This initialises global defaults, depending on the value of the sysctls. * These can later be overridden by registering changes via setsockopt calls. * The last link in the chain is finalise_settings, to make sure that between diff --git a/net/dccp/input.c b/net/dccp/input.c index 6dce68a55964..bd9cfdb67436 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -715,6 +715,7 @@ EXPORT_SYMBOL_GPL(dccp_rcv_state_process); /** * dccp_sample_rtt - Validate and finalise computation of RTT sample + * @sk: socket structure * @delta: number of microseconds between packet and acknowledgment * * The routine is kept generic to work in different contexts. It should be diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index d19557c6d04b..9c28c8251125 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -694,6 +694,8 @@ EXPORT_SYMBOL_GPL(dccp_v4_do_rcv); /** * dccp_invalid_packet - check for malformed packets + * @skb: Packet to validate + * * Implements RFC 4340, 8.5: Step 1: Check header basics * Packets that fail these checks are ignored and do not receive Resets. */ @@ -911,10 +913,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv4_af_ops = { .getsockopt = ip_getsockopt, .addr2sockaddr = inet_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in), -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_ip_setsockopt, - .compat_getsockopt = compat_ip_getsockopt, -#endif }; static int dccp_v4_init_sock(struct sock *sk) @@ -961,10 +959,6 @@ static struct proto dccp_v4_prot = { .rsk_prot = &dccp_request_sock_ops, .twsk_prot = &dccp_timewait_sock_ops, .h.hashinfo = &dccp_hashinfo, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_dccp_setsockopt, - .compat_getsockopt = compat_dccp_getsockopt, -#endif }; static const struct net_protocol dccp_v4_protocol = { @@ -997,10 +991,6 @@ static const struct proto_ops inet_dccp_ops = { .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, -#endif }; static struct inet_protosw dccp_v4_protosw = { diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 650187d68851..ef4ab28cfde0 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -970,10 +970,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops = { .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_ipv6_setsockopt, - .compat_getsockopt = compat_ipv6_getsockopt, -#endif }; /* @@ -990,10 +986,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = { .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_ipv6_setsockopt, - .compat_getsockopt = compat_ipv6_getsockopt, -#endif }; /* NOTE: A lot of things set to zero explicitly by call to @@ -1049,10 +1041,6 @@ static struct proto dccp_v6_prot = { .rsk_prot = &dccp6_request_sock_ops, .twsk_prot = &dccp6_timewait_sock_ops, .h.hashinfo = &dccp_hashinfo, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_dccp_setsockopt, - .compat_getsockopt = compat_dccp_getsockopt, -#endif }; static const struct inet6_protocol dccp_v6_protocol = { @@ -1083,8 +1071,6 @@ static const struct proto_ops inet6_dccp_ops = { .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, #endif }; diff --git a/net/dccp/options.c b/net/dccp/options.c index 9fed0ae21e63..51aaba7a5d45 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -43,6 +43,7 @@ u64 dccp_decode_value_var(const u8 *bf, const u8 len) * dccp_parse_options - Parse DCCP options present in @skb * @sk: client|server|listening dccp socket (when @dreq != NULL) * @dreq: request socket to use during connection setup, or NULL + * @skb: frame to parse */ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, struct sk_buff *skb) @@ -471,6 +472,8 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) /** * dccp_insert_option_mandatory - Mandatory option (5.8.2) + * @skb: frame into which to insert option + * * Note that since we are using skb_push, this function needs to be called * _after_ inserting the option it is supposed to influence (stack order). */ @@ -486,6 +489,7 @@ int dccp_insert_option_mandatory(struct sk_buff *skb) /** * dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb + * @skb: frame to insert feature negotiation option into * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R * @feat: one out of %dccp_feature_numbers * @val: NN value or SP array (preferred element first) to copy diff --git a/net/dccp/proto.c b/net/dccp/proto.c index c13b6609474b..d148ab1530e5 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -375,6 +375,15 @@ int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) goto out; switch (cmd) { + case SIOCOUTQ: { + int amount = sk_wmem_alloc_get(sk); + /* Using sk_wmem_alloc here because sk_wmem_queued is not used by DCCP and + * always 0, comparably to UDP. + */ + + rc = put_user(amount, (int __user *)arg); + } + break; case SIOCINQ: { struct sk_buff *skb; unsigned long amount = 0; @@ -402,7 +411,7 @@ out: EXPORT_SYMBOL_GPL(dccp_ioctl); static int dccp_setsockopt_service(struct sock *sk, const __be32 service, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct dccp_sock *dp = dccp_sk(sk); struct dccp_service_list *sl = NULL; @@ -417,9 +426,8 @@ static int dccp_setsockopt_service(struct sock *sk, const __be32 service, return -ENOMEM; sl->dccpsl_nr = optlen / sizeof(u32) - 1; - if (copy_from_user(sl->dccpsl_list, - optval + sizeof(service), - optlen - sizeof(service)) || + if (copy_from_sockptr_offset(sl->dccpsl_list, optval, + sizeof(service), optlen - sizeof(service)) || dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) { kfree(sl); return -EFAULT; @@ -473,7 +481,7 @@ static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) } static int dccp_setsockopt_ccid(struct sock *sk, int type, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { u8 *val; int rc = 0; @@ -481,7 +489,7 @@ static int dccp_setsockopt_ccid(struct sock *sk, int type, if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) return -EINVAL; - val = memdup_user(optval, optlen); + val = memdup_sockptr(optval, optlen); if (IS_ERR(val)) return PTR_ERR(val); @@ -498,7 +506,7 @@ static int dccp_setsockopt_ccid(struct sock *sk, int type, } static int do_dccp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct dccp_sock *dp = dccp_sk(sk); int val, err = 0; @@ -520,7 +528,7 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, if (optlen < (int)sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; if (optname == DCCP_SOCKOPT_SERVICE) @@ -563,8 +571,8 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, return err; } -int dccp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) +int dccp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen) { if (level != SOL_DCCP) return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level, @@ -575,19 +583,6 @@ int dccp_setsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL_GPL(dccp_setsockopt); -#ifdef CONFIG_COMPAT -int compat_dccp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) -{ - if (level != SOL_DCCP) - return inet_csk_compat_setsockopt(sk, level, optname, - optval, optlen); - return do_dccp_setsockopt(sk, level, optname, optval, optlen); -} - -EXPORT_SYMBOL_GPL(compat_dccp_setsockopt); -#endif - static int dccp_getsockopt_service(struct sock *sk, int len, __be32 __user *optval, int __user *optlen) @@ -696,19 +691,6 @@ int dccp_getsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL_GPL(dccp_getsockopt); -#ifdef CONFIG_COMPAT -int compat_dccp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - if (level != SOL_DCCP) - return inet_csk_compat_getsockopt(sk, level, optname, - optval, optlen); - return do_dccp_getsockopt(sk, level, optname, optval, optlen); -} - -EXPORT_SYMBOL_GPL(compat_dccp_getsockopt); -#endif - static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) { struct cmsghdr *cmsg; diff --git a/net/dccp/timer.c b/net/dccp/timer.c index c0b3672637c4..0e06dfc32273 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -216,6 +216,8 @@ out: /** * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface + * @data: Socket to act on + * * See the comments above %ccid_dequeueing_decision for supported modes. */ static void dccp_write_xmitlet(unsigned long data) diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 0a46ea3bddd5..3b53d766789d 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -150,7 +150,8 @@ static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE]; static struct hlist_head dn_wild_sk; static atomic_long_t decnet_memory_allocated; -static int __dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen, int flags); +static int __dn_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen, int flags); static int __dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen, int flags); static struct hlist_head *dn_find_list(struct sock *sk) @@ -1320,7 +1321,8 @@ out: return err; } -static int dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) +static int dn_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int err; @@ -1338,7 +1340,8 @@ static int dn_setsockopt(struct socket *sock, int level, int optname, char __use return err; } -static int __dn_setsockopt(struct socket *sock, int level,int optname, char __user *optval, unsigned int optlen, int flags) +static int __dn_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen, int flags) { struct sock *sk = sock->sk; struct dn_scp *scp = DN_SK(sk); @@ -1354,13 +1357,13 @@ static int __dn_setsockopt(struct socket *sock, int level,int optname, char __us } u; int err; - if (optlen && !optval) + if (optlen && sockptr_is_null(optval)) return -EINVAL; if (optlen > sizeof(u)) return -EINVAL; - if (copy_from_user(&u, optval, optlen)) + if (copy_from_sockptr(&u, optval, optlen)) return -EFAULT; switch (optname) { @@ -2134,14 +2137,11 @@ static struct sock *dn_socket_get_next(struct seq_file *seq, struct dn_iter_state *state = seq->private; n = sk_next(n); -try_again: - if (n) - goto out; - if (++state->bucket >= DN_SK_HASH_SIZE) - goto out; - n = sk_head(&dn_sk_hash[state->bucket]); - goto try_again; -out: + while (!n) { + if (++state->bucket >= DN_SK_HASH_SIZE) + break; + n = sk_head(&dn_sk_hash[state->bucket]); + } return n; } diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 65abcf1b3210..15d42353f1a3 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -462,7 +462,9 @@ int dn_dev_ioctl(unsigned int cmd, void __user *arg) switch (cmd) { case SIOCGIFADDR: *((__le16 *)sdn->sdn_nodeaddr) = ifa->ifa_local; - goto rarok; + if (copy_to_user(arg, ifr, DN_IFREQ_SIZE)) + ret = -EFAULT; + break; case SIOCSIFADDR: if (!ifa) { @@ -485,10 +487,6 @@ done: rtnl_unlock(); return ret; -rarok: - if (copy_to_user(arg, ifr, DN_IFREQ_SIZE)) - ret = -EFAULT; - goto done; } struct net_device *dn_dev_get_default(void) diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 06b9983325cc..4cac31d22a50 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -494,6 +494,8 @@ static int dn_return_long(struct sk_buff *skb) /** * dn_route_rx_packet - Try and find a route for an incoming packet + * @net: The applicable net namespace + * @sk: Socket packet transmitted on * @skb: The packet to find a route for * * Returns: result of input function if route is found, error code otherwise @@ -670,7 +672,7 @@ int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type if (decnet_debug_level & 1) printk(KERN_DEBUG "dn_route_rcv: got 0x%02x from %s [%d %d %d]\n", - (int)flags, (dev) ? dev->name : "???", len, skb->len, + (int)flags, dev->name, len, skb->len, padlen); if (flags & DN_RT_PKT_CNTL) { diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index dc705769acc9..26a9193df783 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -6,7 +6,7 @@ * * DECnet Routing Message Grabulator * - * (C) 2000 ChyGwyn Limited - http://www.chygwyn.com/ + * (C) 2000 ChyGwyn Limited - https://www.chygwyn.com/ * * Author: Steven Whitehouse <steve@chygwyn.com> */ diff --git a/net/devres.c b/net/devres.c index 57a6a88d11f6..1f9be2133787 100644 --- a/net/devres.c +++ b/net/devres.c @@ -39,7 +39,7 @@ struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, } EXPORT_SYMBOL(devm_alloc_etherdev_mqs); -static void devm_netdev_release(struct device *dev, void *this) +static void devm_unregister_netdev(struct device *dev, void *this) { struct net_device_devres *res = this; @@ -77,7 +77,7 @@ int devm_register_netdev(struct device *dev, struct net_device *ndev) netdev_devres_match, ndev))) return -EINVAL; - dr = devres_alloc(devm_netdev_release, sizeof(*dr), GFP_KERNEL); + dr = devres_alloc(devm_unregister_netdev, sizeof(*dr), GFP_KERNEL); if (!dr) return -ENOMEM; diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index d5bc6ac599ef..1f9b9b11008c 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -86,6 +86,13 @@ config NET_DSA_TAG_KSZ Say Y if you want to enable support for tagging frames for the Microchip 8795/9477/9893 families of switches. +config NET_DSA_TAG_RTL4_A + tristate "Tag driver for Realtek 4 byte protocol A tags" + help + Say Y or M if you want to enable support for tagging frames for the + Realtek switches with 4 byte protocol A tags, sich as found in + the Realtek RTL8366RB. + config NET_DSA_TAG_OCELOT tristate "Tag driver for Ocelot family of switches" select PACKING diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 108486cfdeef..4f47b2025ff5 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o obj-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o obj-$(CONFIG_NET_DSA_TAG_GSWIP) += tag_gswip.o obj-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o +obj-$(CONFIG_NET_DSA_TAG_RTL4_A) += tag_rtl4_a.o obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o obj-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o obj-$(CONFIG_NET_DSA_TAG_OCELOT) += tag_ocelot.o diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 076908fdd29b..c0ffc7a2b65f 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -261,10 +261,15 @@ static int dsa_port_setup(struct dsa_port *dp) struct devlink_port *dlp = &dp->devlink_port; bool dsa_port_link_registered = false; bool devlink_port_registered = false; + struct devlink_port_attrs attrs = {}; struct devlink *dl = ds->devlink; bool dsa_port_enabled = false; int err = 0; + attrs.phys.port_number = dp->index; + memcpy(attrs.switch_id.id, id, len); + attrs.switch_id.id_len = len; + if (dp->setup) return 0; @@ -274,8 +279,8 @@ static int dsa_port_setup(struct dsa_port *dp) break; case DSA_PORT_TYPE_CPU: memset(dlp, 0, sizeof(*dlp)); - devlink_port_attrs_set(dlp, DEVLINK_PORT_FLAVOUR_CPU, - dp->index, false, 0, id, len); + attrs.flavour = DEVLINK_PORT_FLAVOUR_CPU; + devlink_port_attrs_set(dlp, &attrs); err = devlink_port_register(dl, dlp, dp->index); if (err) break; @@ -294,8 +299,8 @@ static int dsa_port_setup(struct dsa_port *dp) break; case DSA_PORT_TYPE_DSA: memset(dlp, 0, sizeof(*dlp)); - devlink_port_attrs_set(dlp, DEVLINK_PORT_FLAVOUR_DSA, - dp->index, false, 0, id, len); + attrs.flavour = DEVLINK_PORT_FLAVOUR_DSA; + devlink_port_attrs_set(dlp, &attrs); err = devlink_port_register(dl, dlp, dp->index); if (err) break; @@ -314,8 +319,8 @@ static int dsa_port_setup(struct dsa_port *dp) break; case DSA_PORT_TYPE_USER: memset(dlp, 0, sizeof(*dlp)); - devlink_port_attrs_set(dlp, DEVLINK_PORT_FLAVOUR_PHYSICAL, - dp->index, false, 0, id, len); + attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; + devlink_port_attrs_set(dlp, &attrs); err = devlink_port_register(dl, dlp, dp->index); if (err) break; @@ -722,8 +727,12 @@ static int dsa_switch_parse_ports_of(struct dsa_switch *ds, ports = of_get_child_by_name(dn, "ports"); if (!ports) { - dev_err(ds->dev, "no ports child node found\n"); - return -EINVAL; + /* The second possibility is "ethernet-ports" */ + ports = of_get_child_by_name(dn, "ethernet-ports"); + if (!ports) { + dev_err(ds->dev, "no ports child node found\n"); + return -EINVAL; + } } for_each_available_child_of_node(ports, port) { diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index adecf73bd608..1653e3377cb3 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -77,7 +77,7 @@ struct dsa_slave_priv { struct sk_buff * (*xmit)(struct sk_buff *skb, struct net_device *dev); - struct pcpu_sw_netstats *stats64; + struct pcpu_sw_netstats __percpu *stats64; struct gro_cells gcells; diff --git a/net/dsa/master.c b/net/dsa/master.c index 480a61460c23..61615ebc70e9 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -186,17 +186,6 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, } } -static int dsa_master_get_phys_port_name(struct net_device *dev, - char *name, size_t len) -{ - struct dsa_port *cpu_dp = dev->dsa_ptr; - - if (snprintf(name, len, "p%d", cpu_dp->index) >= len) - return -EINVAL; - - return 0; -} - static int dsa_master_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct dsa_port *cpu_dp = dev->dsa_ptr; @@ -220,12 +209,16 @@ static int dsa_master_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) break; } - if (cpu_dp->orig_ndo_ops && cpu_dp->orig_ndo_ops->ndo_do_ioctl) - err = cpu_dp->orig_ndo_ops->ndo_do_ioctl(dev, ifr, cmd); + if (dev->netdev_ops->ndo_do_ioctl) + err = dev->netdev_ops->ndo_do_ioctl(dev, ifr, cmd); return err; } +static const struct dsa_netdevice_ops dsa_netdev_ops = { + .ndo_do_ioctl = dsa_master_ioctl, +}; + static int dsa_master_ethtool_setup(struct net_device *dev) { struct dsa_port *cpu_dp = dev->dsa_ptr; @@ -260,38 +253,10 @@ static void dsa_master_ethtool_teardown(struct net_device *dev) cpu_dp->orig_ethtool_ops = NULL; } -static int dsa_master_ndo_setup(struct net_device *dev) +static void dsa_netdev_ops_set(struct net_device *dev, + const struct dsa_netdevice_ops *ops) { - struct dsa_port *cpu_dp = dev->dsa_ptr; - struct dsa_switch *ds = cpu_dp->ds; - struct net_device_ops *ops; - - if (dev->netdev_ops->ndo_get_phys_port_name) - return 0; - - ops = devm_kzalloc(ds->dev, sizeof(*ops), GFP_KERNEL); - if (!ops) - return -ENOMEM; - - cpu_dp->orig_ndo_ops = dev->netdev_ops; - if (cpu_dp->orig_ndo_ops) - memcpy(ops, cpu_dp->orig_ndo_ops, sizeof(*ops)); - - ops->ndo_get_phys_port_name = dsa_master_get_phys_port_name; - ops->ndo_do_ioctl = dsa_master_ioctl; - - dev->netdev_ops = ops; - - return 0; -} - -static void dsa_master_ndo_teardown(struct net_device *dev) -{ - struct dsa_port *cpu_dp = dev->dsa_ptr; - - if (cpu_dp->orig_ndo_ops) - dev->netdev_ops = cpu_dp->orig_ndo_ops; - cpu_dp->orig_ndo_ops = NULL; + dev->dsa_ptr->netdev_ops = ops; } static ssize_t tagging_show(struct device *d, struct device_attribute *attr, @@ -353,9 +318,7 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) if (ret) return ret; - ret = dsa_master_ndo_setup(dev); - if (ret) - goto out_err_ethtool_teardown; + dsa_netdev_ops_set(dev, &dsa_netdev_ops); ret = sysfs_create_group(&dev->dev.kobj, &dsa_group); if (ret) @@ -364,8 +327,7 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) return ret; out_err_ndo_teardown: - dsa_master_ndo_teardown(dev); -out_err_ethtool_teardown: + dsa_netdev_ops_set(dev, NULL); dsa_master_ethtool_teardown(dev); return ret; } @@ -373,7 +335,7 @@ out_err_ethtool_teardown: void dsa_master_teardown(struct net_device *dev) { sysfs_remove_group(&dev->dev.kobj, &dsa_group); - dsa_master_ndo_teardown(dev); + dsa_netdev_ops_set(dev, NULL); dsa_master_ethtool_teardown(dev); dsa_master_reset_mtu(dev); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 4c7f086a047b..41d60eeefdbd 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1754,11 +1754,8 @@ int dsa_slave_create(struct dsa_port *port) eth_hw_addr_inherit(slave_dev, master); slave_dev->priv_flags |= IFF_NO_QUEUE; slave_dev->netdev_ops = &dsa_slave_netdev_ops; - slave_dev->min_mtu = 0; if (ds->ops->port_max_mtu) slave_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index); - else - slave_dev->max_mtu = ETH_MAX_MTU; SET_NETDEV_DEVTYPE(slave_dev, &dsa_type); netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one, @@ -1795,7 +1792,8 @@ int dsa_slave_create(struct dsa_port *port) ret = dsa_slave_phy_setup(slave_dev); if (ret) { - netdev_err(master, "error %d setting up slave phy\n", ret); + netdev_err(master, "error %d setting up slave PHY for %s\n", + ret, slave_dev->name); goto out_gcells; } diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 90d055c4df9e..bd1a3158d79a 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -156,8 +156,9 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb, { struct dsa_port *dp = dsa_slave_to_port(dev); struct sk_buff *nskb; - u16 *tag; + __be16 *tag; u8 *addr; + u16 val; nskb = ksz_common_xmit(skb, dev, KSZ9477_INGRESS_TAG_LEN); if (!nskb) @@ -167,12 +168,12 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb, tag = skb_put(nskb, KSZ9477_INGRESS_TAG_LEN); addr = skb_mac_header(nskb); - *tag = BIT(dp->index); + val = BIT(dp->index); if (is_link_local_ether_addr(addr)) - *tag |= KSZ9477_TAIL_TAG_OVERRIDE; + val |= KSZ9477_TAIL_TAG_OVERRIDE; - *tag = cpu_to_be16(*tag); + *tag = cpu_to_be16(val); return nskb; } diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index eb0e7a32e53d..ccfb6f641bbf 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -55,7 +55,8 @@ static int lan9303_xmit_use_arl(struct dsa_port *dp, u8 *dest_addr) static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); - u16 *lan9303_tag; + __be16 *lan9303_tag; + u16 tag; /* insert a special VLAN tag between the MAC addresses * and the current ethertype field. @@ -72,12 +73,12 @@ static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) /* make room between MACs and Ether-Type */ memmove(skb->data, skb->data + LAN9303_TAG_LEN, 2 * ETH_ALEN); - lan9303_tag = (u16 *)(skb->data + 2 * ETH_ALEN); + lan9303_tag = (__be16 *)(skb->data + 2 * ETH_ALEN); + tag = lan9303_xmit_use_arl(dp, skb->data) ? + LAN9303_TAG_TX_USE_ALR : + dp->index | LAN9303_TAG_TX_STP_OVERRIDE; lan9303_tag[0] = htons(ETH_P_8021Q); - lan9303_tag[1] = lan9303_xmit_use_arl(dp, skb->data) ? - LAN9303_TAG_TX_USE_ALR : - dp->index | LAN9303_TAG_TX_STP_OVERRIDE; - lan9303_tag[1] = htons(lan9303_tag[1]); + lan9303_tag[1] = htons(tag); return skb; } @@ -85,7 +86,7 @@ static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - u16 *lan9303_tag; + __be16 *lan9303_tag; u16 lan9303_tag1; unsigned int source_port; @@ -101,7 +102,7 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, * ^ * ->data */ - lan9303_tag = (u16 *)(skb->data - 2); + lan9303_tag = (__be16 *)(skb->data - 2); if (lan9303_tag[0] != htons(ETH_P_8021Q)) { dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid VLAN marker\n"); diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index d6619edd53e5..f602fc758d68 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -67,8 +67,9 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { + u16 hdr; int port; - __be16 *phdr, hdr; + __be16 *phdr; unsigned char *dest = eth_hdr(skb)->h_dest; bool is_multicast_skb = is_multicast_ether_addr(dest) && !is_broadcast_ether_addr(dest); diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index b0c98ee4e13b..42f327c06dca 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -137,11 +137,10 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb, struct net_device *netdev) { struct dsa_port *dp = dsa_slave_to_port(netdev); - u64 bypass, dest, src, qos_class, rew_op; struct dsa_switch *ds = dp->ds; - int port = dp->index; struct ocelot *ocelot = ds->priv; - struct ocelot_port *ocelot_port = ocelot->ports[port]; + struct ocelot_port *ocelot_port; + u64 qos_class, rew_op; u8 *injection; if (unlikely(skb_cow_head(skb, OCELOT_TAG_LEN) < 0)) { @@ -149,19 +148,15 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb, return NULL; } - injection = skb_push(skb, OCELOT_TAG_LEN); + ocelot_port = ocelot->ports[dp->index]; - memset(injection, 0, OCELOT_TAG_LEN); + injection = skb_push(skb, OCELOT_TAG_LEN); - /* Set the source port as the CPU port module and not the NPI port */ - src = ocelot->num_phys_ports; - dest = BIT(port); - bypass = true; + memcpy(injection, ocelot_port->xmit_template, OCELOT_TAG_LEN); + /* Fix up the fields which are not statically determined + * in the template + */ qos_class = skb->priority; - - packing(injection, &bypass, 127, 127, OCELOT_TAG_LEN, PACK, 0); - packing(injection, &dest, 68, 56, OCELOT_TAG_LEN, PACK, 0); - packing(injection, &src, 46, 43, OCELOT_TAG_LEN, PACK, 0); packing(injection, &qos_class, 19, 17, OCELOT_TAG_LEN, PACK, 0); if (ocelot->ptp && (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) { diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 70db7c909f74..7066f5e697d7 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -31,7 +31,8 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); - u16 *phdr, hdr; + __be16 *phdr; + u16 hdr; if (skb_cow_head(skb, QCA_HDR_LEN) < 0) return NULL; @@ -39,7 +40,7 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) skb_push(skb, QCA_HDR_LEN); memmove(skb->data, skb->data + QCA_HDR_LEN, 2 * ETH_ALEN); - phdr = (u16 *)(skb->data + 2 * ETH_ALEN); + phdr = (__be16 *)(skb->data + 2 * ETH_ALEN); /* Set the version field, and set destination port information */ hdr = QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S | @@ -54,8 +55,9 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { u8 ver; + u16 hdr; int port; - __be16 *phdr, hdr; + __be16 *phdr; if (unlikely(!pskb_may_pull(skb, QCA_HDR_LEN))) return NULL; diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c new file mode 100644 index 000000000000..7b63010fa87b --- /dev/null +++ b/net/dsa/tag_rtl4_a.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Handler for Realtek 4 byte DSA switch tags + * Currently only supports protocol "A" found in RTL8366RB + * Copyright (c) 2020 Linus Walleij <linus.walleij@linaro.org> + * + * This "proprietary tag" header looks like so: + * + * ------------------------------------------------- + * | MAC DA | MAC SA | 0x8899 | 2 bytes tag | Type | + * ------------------------------------------------- + * + * The 2 bytes tag form a 16 bit big endian word. The exact + * meaning has been guessed from packet dumps from ingress + * frames, as no working egress traffic has been available + * we do not know the format of the egress tags or if they + * are even supported. + */ + +#include <linux/etherdevice.h> +#include <linux/bits.h> + +#include "dsa_priv.h" + +#define RTL4_A_HDR_LEN 4 +#define RTL4_A_ETHERTYPE 0x8899 +#define RTL4_A_PROTOCOL_SHIFT 12 +/* + * 0x1 = Realtek Remote Control protocol (RRCP) + * 0x2/0x3 seems to be used for loopback testing + * 0x9 = RTL8306 DSA protocol + * 0xa = RTL8366RB DSA protocol + */ +#define RTL4_A_PROTOCOL_RTL8366RB 0xa + +static struct sk_buff *rtl4a_tag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + /* + * Just let it pass thru, we don't know if it is possible + * to tag a frame with the 0x8899 ethertype and direct it + * to a specific port, all attempts at reverse-engineering have + * ended up with the frames getting dropped. + * + * The VLAN set-up needs to restrict the frames to the right port. + * + * If you have documentation on the tagging format for RTL8366RB + * (tag type A) then please contribute. + */ + return skb; +} + +static struct sk_buff *rtl4a_tag_rcv(struct sk_buff *skb, + struct net_device *dev, + struct packet_type *pt) +{ + u16 protport; + __be16 *p; + u16 etype; + u8 *tag; + u8 prot; + u8 port; + + if (unlikely(!pskb_may_pull(skb, RTL4_A_HDR_LEN))) + return NULL; + + /* The RTL4 header has its own custom Ethertype 0x8899 and that + * starts right at the beginning of the packet, after the src + * ethernet addr. Apparantly skb->data always points 2 bytes in, + * behind the Ethertype. + */ + tag = skb->data - 2; + p = (__be16 *)tag; + etype = ntohs(*p); + if (etype != RTL4_A_ETHERTYPE) { + /* Not custom, just pass through */ + netdev_dbg(dev, "non-realtek ethertype 0x%04x\n", etype); + return skb; + } + p = (__be16 *)(tag + 2); + protport = ntohs(*p); + /* The 4 upper bits are the protocol */ + prot = (protport >> RTL4_A_PROTOCOL_SHIFT) & 0x0f; + if (prot != RTL4_A_PROTOCOL_RTL8366RB) { + netdev_err(dev, "unknown realtek protocol 0x%01x\n", prot); + return NULL; + } + port = protport & 0xff; + + skb->dev = dsa_master_find_slave(dev, 0, port); + if (!skb->dev) { + netdev_dbg(dev, "could not find slave for port %d\n", port); + return NULL; + } + + /* Remove RTL4 tag and recalculate checksum */ + skb_pull_rcsum(skb, RTL4_A_HDR_LEN); + + /* Move ethernet DA and SA in front of the data */ + memmove(skb->data - ETH_HLEN, + skb->data - ETH_HLEN - RTL4_A_HDR_LEN, + 2 * ETH_ALEN); + + skb->offload_fwd_mark = 1; + + return skb; +} + +static int rtl4a_tag_flow_dissect(const struct sk_buff *skb, __be16 *proto, + int *offset) +{ + *offset = RTL4_A_HDR_LEN; + /* Skip past the tag and fetch the encapsulated Ethertype */ + *proto = ((__be16 *)skb->data)[1]; + + return 0; +} + +static const struct dsa_device_ops rtl4a_netdev_ops = { + .name = "rtl4a", + .proto = DSA_TAG_PROTO_RTL4_A, + .xmit = rtl4a_tag_xmit, + .rcv = rtl4a_tag_rcv, + .flow_dissect = rtl4a_tag_flow_dissect, + .overhead = RTL4_A_HDR_LEN, +}; +module_dsa_tag_driver(rtl4a_netdev_ops); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RTL4_A); diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 0c2b94f20499..7a849ff22dad 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -6,4 +6,5 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ - channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o + channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ + tunnels.o diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index 7194956aa09e..888f6e101f34 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -58,6 +58,7 @@ int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[ETHTOOL_A_CABLE_TEST_MAX + 1]; struct ethnl_req_info req_info = {}; + const struct ethtool_phy_ops *ops; struct net_device *dev; int ret; @@ -81,11 +82,17 @@ int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info) } rtnl_lock(); + ops = ethtool_phy_ops; + if (!ops || !ops->start_cable_test) { + ret = -EOPNOTSUPP; + goto out_rtnl; + } + ret = ethnl_ops_begin(dev); if (ret < 0) goto out_rtnl; - ret = phy_start_cable_test(dev->phydev, info->extack); + ret = ops->start_cable_test(dev->phydev, info->extack); ethnl_ops_complete(dev); @@ -308,6 +315,7 @@ int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1]; struct ethnl_req_info req_info = {}; + const struct ethtool_phy_ops *ops; struct phy_tdr_config cfg; struct net_device *dev; int ret; @@ -337,11 +345,17 @@ int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) goto out_dev_put; rtnl_lock(); + ops = ethtool_phy_ops; + if (!ops || !ops->start_cable_test_tdr) { + ret = -EOPNOTSUPP; + goto out_rtnl; + } + ret = ethnl_ops_begin(dev); if (ret < 0) goto out_rtnl; - ret = phy_start_cable_test_tdr(dev->phydev, info->extack, &cfg); + ret = ops->start_cable_test_tdr(dev->phydev, info->extack, &cfg); ethnl_ops_complete(dev); diff --git a/net/ethtool/common.c b/net/ethtool/common.c index aaecfc916a4d..ed19573fccd7 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-only +#include <linux/ethtool_netlink.h> #include <linux/net_tstamp.h> #include <linux/phy.h> +#include <linux/rtnetlink.h> #include "common.h" @@ -175,6 +177,21 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { __DEFINE_LINK_MODE_NAME(400000, DR8, Full), __DEFINE_LINK_MODE_NAME(400000, CR8, Full), __DEFINE_SPECIAL_MODE_NAME(FEC_LLRS, "LLRS"), + __DEFINE_LINK_MODE_NAME(100000, KR, Full), + __DEFINE_LINK_MODE_NAME(100000, SR, Full), + __DEFINE_LINK_MODE_NAME(100000, LR_ER_FR, Full), + __DEFINE_LINK_MODE_NAME(100000, DR, Full), + __DEFINE_LINK_MODE_NAME(100000, CR, Full), + __DEFINE_LINK_MODE_NAME(200000, KR2, Full), + __DEFINE_LINK_MODE_NAME(200000, SR2, Full), + __DEFINE_LINK_MODE_NAME(200000, LR2_ER2_FR2, Full), + __DEFINE_LINK_MODE_NAME(200000, DR2, Full), + __DEFINE_LINK_MODE_NAME(200000, CR2, Full), + __DEFINE_LINK_MODE_NAME(400000, KR4, Full), + __DEFINE_LINK_MODE_NAME(400000, SR4, Full), + __DEFINE_LINK_MODE_NAME(400000, LR4_ER4_FR4, Full), + __DEFINE_LINK_MODE_NAME(400000, DR4, Full), + __DEFINE_LINK_MODE_NAME(400000, CR4, Full), }; static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); @@ -256,6 +273,14 @@ const char ts_rx_filter_names[][ETH_GSTRING_LEN] = { }; static_assert(ARRAY_SIZE(ts_rx_filter_names) == __HWTSTAMP_FILTER_CNT); +const char udp_tunnel_type_names[][ETH_GSTRING_LEN] = { + [ETHTOOL_UDP_TUNNEL_TYPE_VXLAN] = "vxlan", + [ETHTOOL_UDP_TUNNEL_TYPE_GENEVE] = "geneve", + [ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE] = "vxlan-gpe", +}; +static_assert(ARRAY_SIZE(udp_tunnel_type_names) == + __ETHTOOL_UDP_TUNNEL_TYPE_CNT); + /* return false if legacy contained non-0 deprecated fields * maxtxpkt/maxrxpkt. rest of ksettings always updated */ @@ -373,3 +398,13 @@ int __ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info) return 0; } + +const struct ethtool_phy_ops *ethtool_phy_ops; + +void ethtool_set_ethtool_phy_ops(const struct ethtool_phy_ops *ops) +{ + rtnl_lock(); + ethtool_phy_ops = ops; + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(ethtool_set_ethtool_phy_ops); diff --git a/net/ethtool/common.h b/net/ethtool/common.h index a62f68ccc43a..3d9251c95a8b 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -28,6 +28,7 @@ extern const char wol_mode_names[][ETH_GSTRING_LEN]; extern const char sof_timestamping_names[][ETH_GSTRING_LEN]; extern const char ts_tx_type_names[][ETH_GSTRING_LEN]; extern const char ts_rx_filter_names[][ETH_GSTRING_LEN]; +extern const char udp_tunnel_type_names[][ETH_GSTRING_LEN]; int __ethtool_get_link(struct net_device *dev); @@ -37,4 +38,6 @@ bool convert_legacy_settings_to_link_ksettings( int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max); int __ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info); +extern const struct ethtool_phy_ops *ethtool_phy_ops; + #endif /* _ETHTOOL_COMMON_H */ diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 21d5fc0f6bb3..441794e0034f 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -135,6 +135,7 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr) static int __ethtool_get_sset_count(struct net_device *dev, int sset) { + const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops; const struct ethtool_ops *ops = dev->ethtool_ops; if (sset == ETH_SS_FEATURES) @@ -150,8 +151,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset) return ARRAY_SIZE(phy_tunable_strings); if (sset == ETH_SS_PHY_STATS && dev->phydev && - !ops->get_ethtool_phy_stats) - return phy_ethtool_get_sset_count(dev->phydev); + !ops->get_ethtool_phy_stats && + phy_ops && phy_ops->get_sset_count) + return phy_ops->get_sset_count(dev->phydev); if (sset == ETH_SS_LINK_MODES) return __ETHTOOL_LINK_MODE_MASK_NBITS; @@ -165,6 +167,7 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset) static void __ethtool_get_strings(struct net_device *dev, u32 stringset, u8 *data) { + const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops; const struct ethtool_ops *ops = dev->ethtool_ops; if (stringset == ETH_SS_FEATURES) @@ -178,8 +181,9 @@ static void __ethtool_get_strings(struct net_device *dev, else if (stringset == ETH_SS_PHY_TUNABLES) memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings)); else if (stringset == ETH_SS_PHY_STATS && dev->phydev && - !ops->get_ethtool_phy_stats) - phy_ethtool_get_strings(dev->phydev, data); + !ops->get_ethtool_phy_stats && phy_ops && + phy_ops->get_strings) + phy_ops->get_strings(dev->phydev, data); else if (stringset == ETH_SS_LINK_MODES) memcpy(data, link_mode_names, __ETHTOOL_LINK_MODE_MASK_NBITS * ETH_GSTRING_LEN); @@ -1918,7 +1922,7 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) if (copy_to_user(useraddr, &stats, sizeof(stats))) goto out; useraddr += sizeof(stats); - if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64))) + if (n_stats && copy_to_user(useraddr, data, array_size(n_stats, sizeof(u64)))) goto out; ret = 0; @@ -1929,6 +1933,7 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) { + const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops; const struct ethtool_ops *ops = dev->ethtool_ops; struct phy_device *phydev = dev->phydev; struct ethtool_stats stats; @@ -1938,8 +1943,9 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) if (!phydev && (!ops->get_ethtool_phy_stats || !ops->get_sset_count)) return -EOPNOTSUPP; - if (dev->phydev && !ops->get_ethtool_phy_stats) - n_stats = phy_ethtool_get_sset_count(dev->phydev); + if (dev->phydev && !ops->get_ethtool_phy_stats && + phy_ops && phy_ops->get_sset_count) + n_stats = phy_ops->get_sset_count(dev->phydev); else n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS); if (n_stats < 0) @@ -1958,8 +1964,9 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) if (!data) return -ENOMEM; - if (dev->phydev && !ops->get_ethtool_phy_stats) { - ret = phy_ethtool_get_stats(dev->phydev, &stats, data); + if (dev->phydev && !ops->get_ethtool_phy_stats && + phy_ops && phy_ops->get_stats) { + ret = phy_ops->get_stats(dev->phydev, &stats, data); if (ret < 0) goto out; } else { @@ -1973,7 +1980,7 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) if (copy_to_user(useraddr, &stats, sizeof(stats))) goto out; useraddr += sizeof(stats); - if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64))) + if (n_stats && copy_to_user(useraddr, data, array_size(n_stats, sizeof(u64)))) goto out; ret = 0; diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index fd4f3e58c6f6..7044a2853886 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -257,6 +257,21 @@ static const struct link_mode_info link_mode_params[] = { __DEFINE_LINK_MODE_PARAMS(400000, DR8, Full), __DEFINE_LINK_MODE_PARAMS(400000, CR8, Full), __DEFINE_SPECIAL_MODE_PARAMS(FEC_LLRS), + __DEFINE_LINK_MODE_PARAMS(100000, KR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, LR_ER_FR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, DR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, SR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, LR2_ER2_FR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, DR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, CR2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, LR4_ER4_FR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, DR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, CR4, Full), }; static const struct nla_policy @@ -406,8 +421,7 @@ int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info) ret = __ethtool_get_link_ksettings(dev, &ksettings); if (ret < 0) { - if (info) - GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); + GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); goto out_ops; } diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c index afe5ac8a0f00..4834091ec24c 100644 --- a/net/ethtool/linkstate.c +++ b/net/ethtool/linkstate.c @@ -9,10 +9,12 @@ struct linkstate_req_info { }; struct linkstate_reply_data { - struct ethnl_reply_data base; - int link; - int sqi; - int sqi_max; + struct ethnl_reply_data base; + int link; + int sqi; + int sqi_max; + bool link_ext_state_provided; + struct ethtool_link_ext_state_info ethtool_link_ext_state_info; }; #define LINKSTATE_REPDATA(__reply_base) \ @@ -25,6 +27,8 @@ linkstate_get_policy[ETHTOOL_A_LINKSTATE_MAX + 1] = { [ETHTOOL_A_LINKSTATE_LINK] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKSTATE_SQI] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKSTATE_SQI_MAX] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKSTATE_EXT_STATE] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKSTATE_EXT_SUBSTATE] = { .type = NLA_REJECT }, }; static int linkstate_get_sqi(struct net_device *dev) @@ -61,6 +65,23 @@ static int linkstate_get_sqi_max(struct net_device *dev) mutex_unlock(&phydev->lock); return ret; +}; + +static int linkstate_get_link_ext_state(struct net_device *dev, + struct linkstate_reply_data *data) +{ + int err; + + if (!dev->ethtool_ops->get_link_ext_state) + return -EOPNOTSUPP; + + err = dev->ethtool_ops->get_link_ext_state(dev, &data->ethtool_link_ext_state_info); + if (err) + return err; + + data->link_ext_state_provided = true; + + return 0; } static int linkstate_prepare_data(const struct ethnl_req_info *req_base, @@ -86,6 +107,12 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, goto out; data->sqi_max = ret; + if (dev->flags & IFF_UP) { + ret = linkstate_get_link_ext_state(dev, data); + if (ret < 0 && ret != -EOPNOTSUPP && ret != -ENODATA) + goto out; + } + ret = 0; out: ethnl_ops_complete(dev); @@ -107,6 +134,12 @@ static int linkstate_reply_size(const struct ethnl_req_info *req_base, if (data->sqi_max != -EOPNOTSUPP) len += nla_total_size(sizeof(u32)); + if (data->link_ext_state_provided) + len += nla_total_size(sizeof(u8)); /* LINKSTATE_EXT_STATE */ + + if (data->ethtool_link_ext_state_info.__link_ext_substate) + len += nla_total_size(sizeof(u8)); /* LINKSTATE_EXT_SUBSTATE */ + return len; } @@ -128,6 +161,17 @@ static int linkstate_fill_reply(struct sk_buff *skb, nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX, data->sqi_max)) return -EMSGSIZE; + if (data->link_ext_state_provided) { + if (nla_put_u8(skb, ETHTOOL_A_LINKSTATE_EXT_STATE, + data->ethtool_link_ext_state_info.link_ext_state)) + return -EMSGSIZE; + + if (data->ethtool_link_ext_state_info.__link_ext_substate && + nla_put_u8(skb, ETHTOOL_A_LINKSTATE_EXT_SUBSTATE, + data->ethtool_link_ext_state_info.__link_ext_substate)) + return -EMSGSIZE; + } + return 0; } diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index dd8a1c1dc07d..5c2072765be7 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -181,6 +181,12 @@ err: return NULL; } +void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd) +{ + return genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + ðtool_genl_family, 0, cmd); +} + void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd) { return genlmsg_put(skb, 0, ++ethnl_bcast_seq, ðtool_genl_family, 0, @@ -848,6 +854,12 @@ static const struct genl_ops ethtool_genl_ops[] = { .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_act_cable_test_tdr, }, + { + .cmd = ETHTOOL_MSG_TUNNEL_INFO_GET, + .doit = ethnl_tunnel_info_doit, + .start = ethnl_tunnel_info_start, + .dumpit = ethnl_tunnel_info_dumpit, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 9a96b6e90dc2..e2085005caac 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -19,6 +19,7 @@ int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev, struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd, u16 hdr_attrtype, struct genl_info *info, void **ehdrp); +void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd); void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd); int ethnl_multicast(struct sk_buff *skb, struct net_device *dev); @@ -361,5 +362,8 @@ int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info); int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info); +int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info); +int ethnl_tunnel_info_start(struct netlink_callback *cb); +int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); #endif /* _NET_ETHTOOL_NETLINK_H */ diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index 0eed4e4909ab..82707b662fe4 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -75,6 +75,11 @@ static const struct strset_info info_template[] = { .count = __HWTSTAMP_FILTER_CNT, .strings = ts_rx_filter_names, }, + [ETH_SS_UDP_TUNNEL_TYPES] = { + .per_dev = false, + .count = __ETHTOOL_UDP_TUNNEL_TYPE_CNT, + .strings = udp_tunnel_type_names, + }, }; struct strset_req_info { @@ -209,13 +214,15 @@ static void strset_cleanup_data(struct ethnl_reply_data *reply_base) static int strset_prepare_set(struct strset_info *info, struct net_device *dev, unsigned int id, bool counts_only) { + const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops; const struct ethtool_ops *ops = dev->ethtool_ops; void *strings; int count, ret; if (id == ETH_SS_PHY_STATS && dev->phydev && - !ops->get_ethtool_phy_stats) - ret = phy_ethtool_get_sset_count(dev->phydev); + !ops->get_ethtool_phy_stats && phy_ops && + phy_ops->get_sset_count) + ret = phy_ops->get_sset_count(dev->phydev); else if (ops->get_sset_count && ops->get_strings) ret = ops->get_sset_count(dev, id); else @@ -231,8 +238,9 @@ static int strset_prepare_set(struct strset_info *info, struct net_device *dev, if (!strings) return -ENOMEM; if (id == ETH_SS_PHY_STATS && dev->phydev && - !ops->get_ethtool_phy_stats) - phy_ethtool_get_strings(dev->phydev, strings); + !ops->get_ethtool_phy_stats && phy_ops && + phy_ops->get_strings) + phy_ops->get_strings(dev->phydev, strings); else ops->get_strings(dev, id, strings); info->strings = strings; diff --git a/net/ethtool/tunnels.c b/net/ethtool/tunnels.c new file mode 100644 index 000000000000..84f23289475b --- /dev/null +++ b/net/ethtool/tunnels.c @@ -0,0 +1,312 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/ethtool_netlink.h> +#include <net/udp_tunnel.h> +#include <net/vxlan.h> + +#include "bitset.h" +#include "common.h" +#include "netlink.h" + +static const struct nla_policy +ethtool_tunnel_info_policy[ETHTOOL_A_TUNNEL_INFO_MAX + 1] = { + [ETHTOOL_A_TUNNEL_INFO_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_TUNNEL_INFO_HEADER] = { .type = NLA_NESTED }, +}; + +static_assert(ETHTOOL_UDP_TUNNEL_TYPE_VXLAN == ilog2(UDP_TUNNEL_TYPE_VXLAN)); +static_assert(ETHTOOL_UDP_TUNNEL_TYPE_GENEVE == ilog2(UDP_TUNNEL_TYPE_GENEVE)); +static_assert(ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE == + ilog2(UDP_TUNNEL_TYPE_VXLAN_GPE)); + +static ssize_t ethnl_udp_table_reply_size(unsigned int types, bool compact) +{ + ssize_t size; + + size = ethnl_bitset32_size(&types, NULL, __ETHTOOL_UDP_TUNNEL_TYPE_CNT, + udp_tunnel_type_names, compact); + if (size < 0) + return size; + + return size + + nla_total_size(0) + /* _UDP_TABLE */ + nla_total_size(sizeof(u32)); /* _UDP_TABLE_SIZE */ +} + +static ssize_t +ethnl_tunnel_info_reply_size(const struct ethnl_req_info *req_base, + struct netlink_ext_ack *extack) +{ + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct udp_tunnel_nic_info *info; + unsigned int i; + ssize_t ret; + size_t size; + + info = req_base->dev->udp_tunnel_nic_info; + if (!info) { + NL_SET_ERR_MSG(extack, + "device does not report tunnel offload info"); + return -EOPNOTSUPP; + } + + size = nla_total_size(0); /* _INFO_UDP_PORTS */ + + for (i = 0; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { + if (!info->tables[i].n_entries) + break; + + ret = ethnl_udp_table_reply_size(info->tables[i].tunnel_types, + compact); + if (ret < 0) + return ret; + size += ret; + + size += udp_tunnel_nic_dump_size(req_base->dev, i); + } + + if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN) { + ret = ethnl_udp_table_reply_size(0, compact); + if (ret < 0) + return ret; + size += ret; + + size += nla_total_size(0) + /* _TABLE_ENTRY */ + nla_total_size(sizeof(__be16)) + /* _ENTRY_PORT */ + nla_total_size(sizeof(u32)); /* _ENTRY_TYPE */ + } + + return size; +} + +static int +ethnl_tunnel_info_fill_reply(const struct ethnl_req_info *req_base, + struct sk_buff *skb) +{ + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct udp_tunnel_nic_info *info; + struct nlattr *ports, *table, *entry; + unsigned int i; + + info = req_base->dev->udp_tunnel_nic_info; + if (!info) + return -EOPNOTSUPP; + + ports = nla_nest_start(skb, ETHTOOL_A_TUNNEL_INFO_UDP_PORTS); + if (!ports) + return -EMSGSIZE; + + for (i = 0; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { + if (!info->tables[i].n_entries) + break; + + table = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE); + if (!table) + goto err_cancel_ports; + + if (nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, + info->tables[i].n_entries)) + goto err_cancel_table; + + if (ethnl_put_bitset32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES, + &info->tables[i].tunnel_types, NULL, + __ETHTOOL_UDP_TUNNEL_TYPE_CNT, + udp_tunnel_type_names, compact)) + goto err_cancel_table; + + if (udp_tunnel_nic_dump_write(req_base->dev, i, skb)) + goto err_cancel_table; + + nla_nest_end(skb, table); + } + + if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN) { + u32 zero = 0; + + table = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE); + if (!table) + goto err_cancel_ports; + + if (nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, 1)) + goto err_cancel_table; + + if (ethnl_put_bitset32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES, + &zero, NULL, + __ETHTOOL_UDP_TUNNEL_TYPE_CNT, + udp_tunnel_type_names, compact)) + goto err_cancel_table; + + entry = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY); + + if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, + htons(IANA_VXLAN_UDP_PORT)) || + nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, + ilog2(UDP_TUNNEL_TYPE_VXLAN))) + goto err_cancel_entry; + + nla_nest_end(skb, entry); + nla_nest_end(skb, table); + } + + nla_nest_end(skb, ports); + + return 0; + +err_cancel_entry: + nla_nest_cancel(skb, entry); +err_cancel_table: + nla_nest_cancel(skb, table); +err_cancel_ports: + nla_nest_cancel(skb, ports); + return -EMSGSIZE; +} + +static int +ethnl_tunnel_info_req_parse(struct ethnl_req_info *req_info, + const struct nlmsghdr *nlhdr, struct net *net, + struct netlink_ext_ack *extack, bool require_dev) +{ + struct nlattr *tb[ETHTOOL_A_TUNNEL_INFO_MAX + 1]; + int ret; + + ret = nlmsg_parse(nlhdr, GENL_HDRLEN, tb, ETHTOOL_A_TUNNEL_INFO_MAX, + ethtool_tunnel_info_policy, extack); + if (ret < 0) + return ret; + + return ethnl_parse_header_dev_get(req_info, + tb[ETHTOOL_A_TUNNEL_INFO_HEADER], + net, extack, require_dev); +} + +int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct ethnl_req_info req_info = {}; + struct sk_buff *rskb; + void *reply_payload; + int reply_len; + int ret; + + ret = ethnl_tunnel_info_req_parse(&req_info, info->nlhdr, + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + + rtnl_lock(); + ret = ethnl_tunnel_info_reply_size(&req_info, info->extack); + if (ret < 0) + goto err_unlock_rtnl; + reply_len = ret + ethnl_reply_header_size(); + + rskb = ethnl_reply_init(reply_len, req_info.dev, + ETHTOOL_MSG_TUNNEL_INFO_GET, + ETHTOOL_A_TUNNEL_INFO_HEADER, + info, &reply_payload); + if (!rskb) { + ret = -ENOMEM; + goto err_unlock_rtnl; + } + + ret = ethnl_tunnel_info_fill_reply(&req_info, rskb); + if (ret) + goto err_free_msg; + rtnl_unlock(); + dev_put(req_info.dev); + genlmsg_end(rskb, reply_payload); + + return genlmsg_reply(rskb, info); + +err_free_msg: + nlmsg_free(rskb); +err_unlock_rtnl: + rtnl_unlock(); + dev_put(req_info.dev); + return ret; +} + +struct ethnl_tunnel_info_dump_ctx { + struct ethnl_req_info req_info; + int pos_hash; + int pos_idx; +}; + +int ethnl_tunnel_info_start(struct netlink_callback *cb) +{ + struct ethnl_tunnel_info_dump_ctx *ctx = (void *)cb->ctx; + int ret; + + BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); + + memset(ctx, 0, sizeof(*ctx)); + + ret = ethnl_tunnel_info_req_parse(&ctx->req_info, cb->nlh, + sock_net(cb->skb->sk), cb->extack, + false); + if (ctx->req_info.dev) { + dev_put(ctx->req_info.dev); + ctx->req_info.dev = NULL; + } + + return ret; +} + +int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ethnl_tunnel_info_dump_ctx *ctx = (void *)cb->ctx; + struct net *net = sock_net(skb->sk); + int s_idx = ctx->pos_idx; + int h, idx = 0; + int ret = 0; + void *ehdr; + + rtnl_lock(); + cb->seq = net->dev_base_seq; + for (h = ctx->pos_hash; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + struct hlist_head *head; + struct net_device *dev; + + head = &net->dev_index_head[h]; + idx = 0; + hlist_for_each_entry(dev, head, index_hlist) { + if (idx < s_idx) + goto cont; + + ehdr = ethnl_dump_put(skb, cb, + ETHTOOL_MSG_TUNNEL_INFO_GET); + if (!ehdr) { + ret = -EMSGSIZE; + goto out; + } + + ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_TUNNEL_INFO_HEADER); + if (ret < 0) { + genlmsg_cancel(skb, ehdr); + goto out; + } + + ctx->req_info.dev = dev; + ret = ethnl_tunnel_info_fill_reply(&ctx->req_info, skb); + ctx->req_info.dev = NULL; + if (ret < 0) { + genlmsg_cancel(skb, ehdr); + if (ret == -EOPNOTSUPP) + goto cont; + goto out; + } + genlmsg_end(skb, ehdr); +cont: + idx++; + } + } +out: + rtnl_unlock(); + + ctx->pos_hash = h; + ctx->pos_idx = idx; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + + if (ret == -EMSGSIZE && skb->len) + return skb->len; + return ret; +} diff --git a/net/hsr/Kconfig b/net/hsr/Kconfig index 8095b034e76e..1b048c17b6c8 100644 --- a/net/hsr/Kconfig +++ b/net/hsr/Kconfig @@ -4,24 +4,35 @@ # config HSR - tristate "High-availability Seamless Redundancy (HSR)" + tristate "High-availability Seamless Redundancy (HSR & PRP)" help + This enables IEC 62439 defined High-availability Seamless + Redundancy (HSR) and Parallel Redundancy Protocol (PRP). + If you say Y here, then your Linux box will be able to act as a - DANH ("Doubly attached node implementing HSR"). For this to work, - your Linux box needs (at least) two physical Ethernet interfaces, - and it must be connected as a node in a ring network together with - other HSR capable nodes. + DANH ("Doubly attached node implementing HSR") or DANP ("Doubly + attached node implementing PRP"). For this to work, your Linux box + needs (at least) two physical Ethernet interfaces. + + For DANH, it must be connected as a node in a ring network together + with other HSR capable nodes. All Ethernet frames sent over the HSR + device will be sent in both directions on the ring (over both slave + ports), giving a redundant, instant fail-over network. Each HSR node + in the ring acts like a bridge for HSR frames, but filters frames + that have been forwarded earlier. - All Ethernet frames sent over the hsr device will be sent in both - directions on the ring (over both slave ports), giving a redundant, - instant fail-over network. Each HSR node in the ring acts like a - bridge for HSR frames, but filters frames that have been forwarded - earlier. + For DANP, it must be connected as a node connecting to two + separate networks over the two slave interfaces. Like HSR, Ethernet + frames sent over the PRP device will be sent to both networks giving + a redundant, instant fail-over network. Unlike HSR, PRP networks + can have Singly Attached Nodes (SAN) such as PC, printer, bridges + etc and will be able to communicate with DANP nodes. This code is a "best effort" to comply with the HSR standard as described in IEC 62439-3:2010 (HSRv0) and IEC 62439-3:2012 (HSRv1), - but no compliancy tests have been made. Use iproute2 to select - the version you desire. + and PRP standard described in IEC 62439-4:2012 (PRP), but no + compliancy tests have been made. Use iproute2 to select the protocol + you would like to use. You need to perform any and all necessary tests yourself before relying on this code in a safety critical system! diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c index 9787ef11ca71..7e11a6c35bc3 100644 --- a/net/hsr/hsr_debugfs.c +++ b/net/hsr/hsr_debugfs.c @@ -1,5 +1,5 @@ /* - * hsr_debugfs code + * debugfs code for HSR & PRP * Copyright (C) 2019 Texas Instruments Incorporated * * Author(s): @@ -22,12 +22,6 @@ static struct dentry *hsr_debugfs_root_dir; -static void print_mac_address(struct seq_file *sfp, unsigned char *mac) -{ - seq_printf(sfp, "%02x:%02x:%02x:%02x:%02x:%02x:", - mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); -} - /* hsr_node_table_show - Formats and prints node_table entries */ static int hsr_node_table_show(struct seq_file *sfp, void *data) @@ -35,20 +29,32 @@ hsr_node_table_show(struct seq_file *sfp, void *data) struct hsr_priv *priv = (struct hsr_priv *)sfp->private; struct hsr_node *node; - seq_puts(sfp, "Node Table entries\n"); - seq_puts(sfp, "MAC-Address-A, MAC-Address-B, time_in[A], "); - seq_puts(sfp, "time_in[B], Address-B port\n"); + seq_printf(sfp, "Node Table entries for (%s) device\n", + (priv->prot_version == PRP_V1 ? "PRP" : "HSR")); + seq_puts(sfp, "MAC-Address-A, MAC-Address-B, time_in[A], "); + seq_puts(sfp, "time_in[B], Address-B port, "); + if (priv->prot_version == PRP_V1) + seq_puts(sfp, "SAN-A, SAN-B, DAN-P\n"); + else + seq_puts(sfp, "DAN-H\n"); + rcu_read_lock(); list_for_each_entry_rcu(node, &priv->node_db, mac_list) { /* skip self node */ if (hsr_addr_is_self(priv, node->macaddress_A)) continue; - print_mac_address(sfp, &node->macaddress_A[0]); - seq_puts(sfp, " "); - print_mac_address(sfp, &node->macaddress_B[0]); - seq_printf(sfp, "0x%lx, ", node->time_in[HSR_PT_SLAVE_A]); - seq_printf(sfp, "0x%lx ", node->time_in[HSR_PT_SLAVE_B]); - seq_printf(sfp, "0x%x\n", node->addr_B_port); + seq_printf(sfp, "%pM ", &node->macaddress_A[0]); + seq_printf(sfp, "%pM ", &node->macaddress_B[0]); + seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_A]); + seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_B]); + seq_printf(sfp, "%14x, ", node->addr_B_port); + + if (priv->prot_version == PRP_V1) + seq_printf(sfp, "%5x, %5x, %5x\n", + node->san_a, node->san_b, + (node->san_a == 0 && node->san_b == 0)); + else + seq_printf(sfp, "%5x\n", 1); } rcu_read_unlock(); return 0; @@ -57,7 +63,8 @@ hsr_node_table_show(struct seq_file *sfp, void *data) /* hsr_node_table_open - Open the node_table file * * Description: - * This routine opens a debugfs file node_table of specific hsr device + * This routine opens a debugfs file node_table of specific hsr + * or prp device */ static int hsr_node_table_open(struct inode *inode, struct file *filp) diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index a6f4e9f65b14..ab953a1a0d6c 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -3,9 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se - * * This file contains device methods for creating, using and destroying - * virtual HSR devices. + * virtual HSR or PRP devices. */ #include <linux/netdevice.h> @@ -210,7 +209,7 @@ static netdev_features_t hsr_fix_features(struct net_device *dev, return hsr_features_recompute(hsr, features); } -static int hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) { struct hsr_priv *hsr = netdev_priv(dev); struct hsr_port *master; @@ -231,66 +230,103 @@ static const struct header_ops hsr_header_ops = { .parse = eth_header_parse, }; -static void send_hsr_supervision_frame(struct hsr_port *master, - u8 type, u8 hsr_ver) +static struct sk_buff *hsr_init_skb(struct hsr_port *master, u16 proto) { + struct hsr_priv *hsr = master->hsr; struct sk_buff *skb; int hlen, tlen; - struct hsr_tag *hsr_tag; - struct hsr_sup_tag *hsr_stag; - struct hsr_sup_payload *hsr_sp; - unsigned long irqflags; hlen = LL_RESERVED_SPACE(master->dev); tlen = master->dev->needed_tailroom; + /* skb size is same for PRP/HSR frames, only difference + * being, for PRP it is a trailer and for HSR it is a + * header + */ skb = dev_alloc_skb(sizeof(struct hsr_tag) + sizeof(struct hsr_sup_tag) + sizeof(struct hsr_sup_payload) + hlen + tlen); if (!skb) - return; + return skb; skb_reserve(skb, hlen); - skb->dev = master->dev; - skb->protocol = htons(hsr_ver ? ETH_P_HSR : ETH_P_PRP); + skb->protocol = htons(proto); skb->priority = TC_PRIO_CONTROL; - if (dev_hard_header(skb, skb->dev, (hsr_ver ? ETH_P_HSR : ETH_P_PRP), - master->hsr->sup_multicast_addr, + if (dev_hard_header(skb, skb->dev, proto, + hsr->sup_multicast_addr, skb->dev->dev_addr, skb->len) <= 0) goto out; + skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_transport_header(skb); - if (hsr_ver > 0) { + return skb; +out: + kfree_skb(skb); + + return NULL; +} + +static void send_hsr_supervision_frame(struct hsr_port *master, + unsigned long *interval) +{ + struct hsr_priv *hsr = master->hsr; + __u8 type = HSR_TLV_LIFE_CHECK; + struct hsr_tag *hsr_tag = NULL; + struct hsr_sup_payload *hsr_sp; + struct hsr_sup_tag *hsr_stag; + unsigned long irqflags; + struct sk_buff *skb; + u16 proto; + + *interval = msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL); + if (hsr->announce_count < 3 && hsr->prot_version == 0) { + type = HSR_TLV_ANNOUNCE; + *interval = msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL); + hsr->announce_count++; + } + + if (!hsr->prot_version) + proto = ETH_P_PRP; + else + proto = ETH_P_HSR; + + skb = hsr_init_skb(master, proto); + if (!skb) { + WARN_ONCE(1, "HSR: Could not send supervision frame\n"); + return; + } + + if (hsr->prot_version > 0) { hsr_tag = skb_put(skb, sizeof(struct hsr_tag)); hsr_tag->encap_proto = htons(ETH_P_PRP); set_hsr_tag_LSDU_size(hsr_tag, HSR_V1_SUP_LSDUSIZE); } hsr_stag = skb_put(skb, sizeof(struct hsr_sup_tag)); - set_hsr_stag_path(hsr_stag, (hsr_ver ? 0x0 : 0xf)); - set_hsr_stag_HSR_ver(hsr_stag, hsr_ver); + set_hsr_stag_path(hsr_stag, (hsr->prot_version ? 0x0 : 0xf)); + set_hsr_stag_HSR_ver(hsr_stag, hsr->prot_version); /* From HSRv1 on we have separate supervision sequence numbers. */ spin_lock_irqsave(&master->hsr->seqnr_lock, irqflags); - if (hsr_ver > 0) { - hsr_stag->sequence_nr = htons(master->hsr->sup_sequence_nr); - hsr_tag->sequence_nr = htons(master->hsr->sequence_nr); - master->hsr->sup_sequence_nr++; - master->hsr->sequence_nr++; + if (hsr->prot_version > 0) { + hsr_stag->sequence_nr = htons(hsr->sup_sequence_nr); + hsr->sup_sequence_nr++; + hsr_tag->sequence_nr = htons(hsr->sequence_nr); + hsr->sequence_nr++; } else { - hsr_stag->sequence_nr = htons(master->hsr->sequence_nr); - master->hsr->sequence_nr++; + hsr_stag->sequence_nr = htons(hsr->sequence_nr); + hsr->sequence_nr++; } spin_unlock_irqrestore(&master->hsr->seqnr_lock, irqflags); hsr_stag->HSR_TLV_type = type; /* TODO: Why 12 in HSRv0? */ - hsr_stag->HSR_TLV_length = - hsr_ver ? sizeof(struct hsr_sup_payload) : 12; + hsr_stag->HSR_TLV_length = hsr->prot_version ? + sizeof(struct hsr_sup_payload) : 12; /* Payload: MacAddressA */ hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload)); @@ -300,11 +336,57 @@ static void send_hsr_supervision_frame(struct hsr_port *master, return; hsr_forward_skb(skb, master); + return; +} -out: - WARN_ONCE(1, "HSR: Could not send supervision frame\n"); - kfree_skb(skb); +static void send_prp_supervision_frame(struct hsr_port *master, + unsigned long *interval) +{ + struct hsr_priv *hsr = master->hsr; + struct hsr_sup_payload *hsr_sp; + struct hsr_sup_tag *hsr_stag; + unsigned long irqflags; + struct sk_buff *skb; + struct prp_rct *rct; + u8 *tail; + + skb = hsr_init_skb(master, ETH_P_PRP); + if (!skb) { + WARN_ONCE(1, "PRP: Could not send supervision frame\n"); + return; + } + + *interval = msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL); + hsr_stag = skb_put(skb, sizeof(struct hsr_sup_tag)); + set_hsr_stag_path(hsr_stag, (hsr->prot_version ? 0x0 : 0xf)); + set_hsr_stag_HSR_ver(hsr_stag, (hsr->prot_version ? 1 : 0)); + + /* From HSRv1 on we have separate supervision sequence numbers. */ + spin_lock_irqsave(&master->hsr->seqnr_lock, irqflags); + hsr_stag->sequence_nr = htons(hsr->sup_sequence_nr); + hsr->sup_sequence_nr++; + hsr_stag->HSR_TLV_type = PRP_TLV_LIFE_CHECK_DD; + hsr_stag->HSR_TLV_length = sizeof(struct hsr_sup_payload); + + /* Payload: MacAddressA */ + hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload)); + ether_addr_copy(hsr_sp->macaddress_A, master->dev->dev_addr); + + if (skb_put_padto(skb, ETH_ZLEN + HSR_HLEN)) { + spin_unlock_irqrestore(&master->hsr->seqnr_lock, irqflags); + return; + } + + tail = skb_tail_pointer(skb) - HSR_HLEN; + rct = (struct prp_rct *)tail; + rct->PRP_suffix = htons(ETH_P_PRP); + set_prp_LSDU_size(rct, HSR_V1_SUP_LSDUSIZE); + rct->sequence_nr = htons(hsr->sequence_nr); + hsr->sequence_nr++; + spin_unlock_irqrestore(&master->hsr->seqnr_lock, irqflags); + + hsr_forward_skb(skb, master); } /* Announce (supervision frame) timer function @@ -319,19 +401,7 @@ static void hsr_announce(struct timer_list *t) rcu_read_lock(); master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); - - if (hsr->announce_count < 3 && hsr->prot_version == 0) { - send_hsr_supervision_frame(master, HSR_TLV_ANNOUNCE, - hsr->prot_version); - hsr->announce_count++; - - interval = msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL); - } else { - send_hsr_supervision_frame(master, HSR_TLV_LIFE_CHECK, - hsr->prot_version); - - interval = msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL); - } + hsr->proto_ops->send_sv_frame(master, &interval); if (is_admin_up(master->dev)) mod_timer(&hsr->announce_timer, jiffies + interval); @@ -368,6 +438,24 @@ static struct device_type hsr_type = { .name = "hsr", }; +static struct hsr_proto_ops hsr_ops = { + .send_sv_frame = send_hsr_supervision_frame, + .create_tagged_frame = hsr_create_tagged_frame, + .get_untagged_frame = hsr_get_untagged_frame, + .fill_frame_info = hsr_fill_frame_info, + .invalid_dan_ingress_frame = hsr_invalid_dan_ingress_frame, +}; + +static struct hsr_proto_ops prp_ops = { + .send_sv_frame = send_prp_supervision_frame, + .create_tagged_frame = prp_create_tagged_frame, + .get_untagged_frame = prp_get_untagged_frame, + .drop_frame = prp_drop_frame, + .fill_frame_info = prp_fill_frame_info, + .handle_san_frame = prp_handle_san_frame, + .update_san_info = prp_update_san_info, +}; + void hsr_dev_setup(struct net_device *dev) { eth_hw_addr_random(dev); @@ -427,6 +515,17 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], ether_addr_copy(hsr_dev->dev_addr, slave[0]->dev_addr); + /* initialize protocol specific functions */ + if (protocol_version == PRP_V1) { + /* For PRP, lan_id has most significant 3 bits holding + * the net_id of PRP_LAN_ID + */ + hsr->net_id = PRP_LAN_ID << 1; + hsr->proto_ops = &prp_ops; + } else { + hsr->proto_ops = &hsr_ops; + } + /* Make sure we recognize frames from ourselves in hsr_rcv() */ res = hsr_create_self_node(hsr, hsr_dev->dev_addr, slave[1]->dev_addr); diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h index b8f9262ed101..868373822ee4 100644 --- a/net/hsr/hsr_device.h +++ b/net/hsr/hsr_device.h @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_DEVICE_H diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index 1ea17752fffc..cadfccd7876e 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * Frame router for HSR and PRP. */ #include "hsr_forward.h" @@ -15,18 +17,6 @@ struct hsr_node; -struct hsr_frame_info { - struct sk_buff *skb_std; - struct sk_buff *skb_hsr; - struct hsr_port *port_rcv; - struct hsr_node *node_src; - u16 sequence_nr; - bool is_supervision; - bool is_vlan; - bool is_local_dest; - bool is_local_exclusive; -}; - /* The uses I can see for these HSR supervision frames are: * 1) Use the frames that are sent after node initialization ("HSR_TLV.Type = * 22") to reset any sequence_nr counters belonging to that node. Useful if @@ -74,7 +64,9 @@ static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb) } if (hsr_sup_tag->HSR_TLV_type != HSR_TLV_ANNOUNCE && - hsr_sup_tag->HSR_TLV_type != HSR_TLV_LIFE_CHECK) + hsr_sup_tag->HSR_TLV_type != HSR_TLV_LIFE_CHECK && + hsr_sup_tag->HSR_TLV_type != PRP_TLV_LIFE_CHECK_DD && + hsr_sup_tag->HSR_TLV_type != PRP_TLV_LIFE_CHECK_DA) return false; if (hsr_sup_tag->HSR_TLV_length != 12 && hsr_sup_tag->HSR_TLV_length != sizeof(struct hsr_sup_payload)) @@ -83,8 +75,8 @@ static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb) return true; } -static struct sk_buff *create_stripped_skb(struct sk_buff *skb_in, - struct hsr_frame_info *frame) +static struct sk_buff *create_stripped_skb_hsr(struct sk_buff *skb_in, + struct hsr_frame_info *frame) { struct sk_buff *skb; int copylen; @@ -112,38 +104,123 @@ static struct sk_buff *create_stripped_skb(struct sk_buff *skb_in, return skb; } -static struct sk_buff *frame_get_stripped_skb(struct hsr_frame_info *frame, - struct hsr_port *port) +struct sk_buff *hsr_get_untagged_frame(struct hsr_frame_info *frame, + struct hsr_port *port) { - if (!frame->skb_std) - frame->skb_std = create_stripped_skb(frame->skb_hsr, frame); + if (!frame->skb_std) { + if (frame->skb_hsr) { + frame->skb_std = + create_stripped_skb_hsr(frame->skb_hsr, frame); + } else { + /* Unexpected */ + WARN_ONCE(1, "%s:%d: Unexpected frame received (port_src %s)\n", + __FILE__, __LINE__, port->dev->name); + return NULL; + } + } + return skb_clone(frame->skb_std, GFP_ATOMIC); } +struct sk_buff *prp_get_untagged_frame(struct hsr_frame_info *frame, + struct hsr_port *port) +{ + if (!frame->skb_std) { + if (frame->skb_prp) { + /* trim the skb by len - HSR_HLEN to exclude RCT */ + skb_trim(frame->skb_prp, + frame->skb_prp->len - HSR_HLEN); + frame->skb_std = + __pskb_copy(frame->skb_prp, + skb_headroom(frame->skb_prp), + GFP_ATOMIC); + } else { + /* Unexpected */ + WARN_ONCE(1, "%s:%d: Unexpected frame received (port_src %s)\n", + __FILE__, __LINE__, port->dev->name); + return NULL; + } + } + + return skb_clone(frame->skb_std, GFP_ATOMIC); +} + +static void prp_set_lan_id(struct prp_rct *trailer, + struct hsr_port *port) +{ + int lane_id; + + if (port->type == HSR_PT_SLAVE_A) + lane_id = 0; + else + lane_id = 1; + + /* Add net_id in the upper 3 bits of lane_id */ + lane_id |= port->hsr->net_id; + set_prp_lan_id(trailer, lane_id); +} + +/* Tailroom for PRP rct should have been created before calling this */ +static struct sk_buff *prp_fill_rct(struct sk_buff *skb, + struct hsr_frame_info *frame, + struct hsr_port *port) +{ + struct prp_rct *trailer; + int min_size = ETH_ZLEN; + int lsdu_size; + + if (!skb) + return skb; + + if (frame->is_vlan) + min_size = VLAN_ETH_ZLEN; + + if (skb_put_padto(skb, min_size)) + return NULL; + + trailer = (struct prp_rct *)skb_put(skb, HSR_HLEN); + lsdu_size = skb->len - 14; + if (frame->is_vlan) + lsdu_size -= 4; + prp_set_lan_id(trailer, port); + set_prp_LSDU_size(trailer, lsdu_size); + trailer->sequence_nr = htons(frame->sequence_nr); + trailer->PRP_suffix = htons(ETH_P_PRP); + + return skb; +} + +static void hsr_set_path_id(struct hsr_ethhdr *hsr_ethhdr, + struct hsr_port *port) +{ + int path_id; + + if (port->type == HSR_PT_SLAVE_A) + path_id = 0; + else + path_id = 1; + + set_hsr_tag_path(&hsr_ethhdr->hsr_tag, path_id); +} + static struct sk_buff *hsr_fill_tag(struct sk_buff *skb, struct hsr_frame_info *frame, struct hsr_port *port, u8 proto_version) { struct hsr_ethhdr *hsr_ethhdr; - int lane_id; int lsdu_size; /* pad to minimum packet size which is 60 + 6 (HSR tag) */ if (skb_put_padto(skb, ETH_ZLEN + HSR_HLEN)) return NULL; - if (port->type == HSR_PT_SLAVE_A) - lane_id = 0; - else - lane_id = 1; - lsdu_size = skb->len - 14; if (frame->is_vlan) lsdu_size -= 4; hsr_ethhdr = (struct hsr_ethhdr *)skb_mac_header(skb); - set_hsr_tag_path(&hsr_ethhdr->hsr_tag, lane_id); + hsr_set_path_id(hsr_ethhdr, port); set_hsr_tag_LSDU_size(&hsr_ethhdr->hsr_tag, lsdu_size); hsr_ethhdr->hsr_tag.sequence_nr = htons(frame->sequence_nr); hsr_ethhdr->hsr_tag.encap_proto = hsr_ethhdr->ethhdr.h_proto; @@ -153,16 +230,28 @@ static struct sk_buff *hsr_fill_tag(struct sk_buff *skb, return skb; } -static struct sk_buff *create_tagged_skb(struct sk_buff *skb_o, - struct hsr_frame_info *frame, - struct hsr_port *port) +/* If the original frame was an HSR tagged frame, just clone it to be sent + * unchanged. Otherwise, create a private frame especially tagged for 'port'. + */ +struct sk_buff *hsr_create_tagged_frame(struct hsr_frame_info *frame, + struct hsr_port *port) { - int movelen; unsigned char *dst, *src; struct sk_buff *skb; + int movelen; + + if (frame->skb_hsr) { + struct hsr_ethhdr *hsr_ethhdr = + (struct hsr_ethhdr *)skb_mac_header(frame->skb_hsr); + + /* set the lane id properly */ + hsr_set_path_id(hsr_ethhdr, port); + return skb_clone(frame->skb_hsr, GFP_ATOMIC); + } /* Create the new skb with enough headroom to fit the HSR tag */ - skb = __pskb_copy(skb_o, skb_headroom(skb_o) + HSR_HLEN, GFP_ATOMIC); + skb = __pskb_copy(frame->skb_std, + skb_headroom(frame->skb_std) + HSR_HLEN, GFP_ATOMIC); if (!skb) return NULL; skb_reset_mac_header(skb); @@ -185,21 +274,29 @@ static struct sk_buff *create_tagged_skb(struct sk_buff *skb_o, return hsr_fill_tag(skb, frame, port, port->hsr->prot_version); } -/* If the original frame was an HSR tagged frame, just clone it to be sent - * unchanged. Otherwise, create a private frame especially tagged for 'port'. - */ -static struct sk_buff *frame_get_tagged_skb(struct hsr_frame_info *frame, - struct hsr_port *port) +struct sk_buff *prp_create_tagged_frame(struct hsr_frame_info *frame, + struct hsr_port *port) { - if (frame->skb_hsr) - return skb_clone(frame->skb_hsr, GFP_ATOMIC); + struct sk_buff *skb; - if (port->type != HSR_PT_SLAVE_A && port->type != HSR_PT_SLAVE_B) { - WARN_ONCE(1, "HSR: Bug: trying to create a tagged frame for a non-ring port"); - return NULL; + if (frame->skb_prp) { + struct prp_rct *trailer = skb_get_PRP_rct(frame->skb_prp); + + if (trailer) { + prp_set_lan_id(trailer, port); + } else { + WARN_ONCE(!trailer, "errored PRP skb"); + return NULL; + } + return skb_clone(frame->skb_prp, GFP_ATOMIC); } - return create_tagged_skb(frame->skb_std, frame, port); + skb = skb_copy_expand(frame->skb_std, 0, + skb_tailroom(frame->skb_std) + HSR_HLEN, + GFP_ATOMIC); + prp_fill_rct(skb, frame, port); + + return skb; } static void hsr_deliver_master(struct sk_buff *skb, struct net_device *dev, @@ -236,9 +333,18 @@ static int hsr_xmit(struct sk_buff *skb, struct hsr_port *port, return dev_queue_xmit(skb); } +bool prp_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port) +{ + return ((frame->port_rcv->type == HSR_PT_SLAVE_A && + port->type == HSR_PT_SLAVE_B) || + (frame->port_rcv->type == HSR_PT_SLAVE_B && + port->type == HSR_PT_SLAVE_A)); +} + /* Forward the frame through all devices except: * - Back through the receiving device * - If it's a HSR frame: through a device where it has passed before + * - if it's a PRP frame: through another PRP slave device (no bridge) * - To the local HSR master only if the frame is directly addressed to it, or * a non-supervision multicast or broadcast frame. * @@ -253,6 +359,7 @@ static void hsr_forward_do(struct hsr_frame_info *frame) struct sk_buff *skb; hsr_for_each_port(frame->port_rcv->hsr, port) { + struct hsr_priv *hsr = port->hsr; /* Don't send frame back the way it came */ if (port == frame->port_rcv) continue; @@ -265,24 +372,33 @@ static void hsr_forward_do(struct hsr_frame_info *frame) if (port->type != HSR_PT_MASTER && frame->is_local_exclusive) continue; - /* Don't send frame over port where it has been sent before */ - if (hsr_register_frame_out(port, frame->node_src, + /* Don't send frame over port where it has been sent before. + * Also fro SAN, this shouldn't be done. + */ + if (!frame->is_from_san && + hsr_register_frame_out(port, frame->node_src, frame->sequence_nr)) continue; if (frame->is_supervision && port->type == HSR_PT_MASTER) { - hsr_handle_sup_frame(frame->skb_hsr, - frame->node_src, - frame->port_rcv); + hsr_handle_sup_frame(frame); continue; } + /* Check if frame is to be dropped. Eg. for PRP no forward + * between ports. + */ + if (hsr->proto_ops->drop_frame && + hsr->proto_ops->drop_frame(frame, port)) + continue; + if (port->type != HSR_PT_MASTER) - skb = frame_get_tagged_skb(frame, port); + skb = hsr->proto_ops->create_tagged_frame(frame, port); else - skb = frame_get_stripped_skb(frame, port); + skb = hsr->proto_ops->get_untagged_frame(frame, port); + if (!skb) { - /* FIXME: Record the dropped frame? */ + frame->port_rcv->dev->stats.rx_dropped++; continue; } @@ -313,40 +429,95 @@ static void check_local_dest(struct hsr_priv *hsr, struct sk_buff *skb, } } -static int hsr_fill_frame_info(struct hsr_frame_info *frame, - struct sk_buff *skb, struct hsr_port *port) +static void handle_std_frame(struct sk_buff *skb, + struct hsr_frame_info *frame) { - struct ethhdr *ethhdr; + struct hsr_port *port = frame->port_rcv; + struct hsr_priv *hsr = port->hsr; unsigned long irqflags; + frame->skb_hsr = NULL; + frame->skb_prp = NULL; + frame->skb_std = skb; + + if (port->type != HSR_PT_MASTER) { + frame->is_from_san = true; + } else { + /* Sequence nr for the master node */ + spin_lock_irqsave(&hsr->seqnr_lock, irqflags); + frame->sequence_nr = hsr->sequence_nr; + hsr->sequence_nr++; + spin_unlock_irqrestore(&hsr->seqnr_lock, irqflags); + } +} + +void hsr_fill_frame_info(__be16 proto, struct sk_buff *skb, + struct hsr_frame_info *frame) +{ + if (proto == htons(ETH_P_PRP) || + proto == htons(ETH_P_HSR)) { + /* HSR tagged frame :- Data or Supervision */ + frame->skb_std = NULL; + frame->skb_prp = NULL; + frame->skb_hsr = skb; + frame->sequence_nr = hsr_get_skb_sequence_nr(skb); + return; + } + + /* Standard frame or PRP from master port */ + handle_std_frame(skb, frame); +} + +void prp_fill_frame_info(__be16 proto, struct sk_buff *skb, + struct hsr_frame_info *frame) +{ + /* Supervision frame */ + struct prp_rct *rct = skb_get_PRP_rct(skb); + + if (rct && + prp_check_lsdu_size(skb, rct, frame->is_supervision)) { + frame->skb_hsr = NULL; + frame->skb_std = NULL; + frame->skb_prp = skb; + frame->sequence_nr = prp_get_skb_sequence_nr(rct); + return; + } + handle_std_frame(skb, frame); +} + +static int fill_frame_info(struct hsr_frame_info *frame, + struct sk_buff *skb, struct hsr_port *port) +{ + struct hsr_priv *hsr = port->hsr; + struct hsr_vlan_ethhdr *vlan_hdr; + struct ethhdr *ethhdr; + __be16 proto; + + memset(frame, 0, sizeof(*frame)); frame->is_supervision = is_supervision_frame(port->hsr, skb); - frame->node_src = hsr_get_node(port, skb, frame->is_supervision); + frame->node_src = hsr_get_node(port, &hsr->node_db, skb, + frame->is_supervision, + port->type); if (!frame->node_src) return -1; /* Unknown node and !is_supervision, or no mem */ ethhdr = (struct ethhdr *)skb_mac_header(skb); frame->is_vlan = false; - if (ethhdr->h_proto == htons(ETH_P_8021Q)) { + proto = ethhdr->h_proto; + + if (proto == htons(ETH_P_8021Q)) frame->is_vlan = true; + + if (frame->is_vlan) { + vlan_hdr = (struct hsr_vlan_ethhdr *)ethhdr; + proto = vlan_hdr->vlanhdr.h_vlan_encapsulated_proto; /* FIXME: */ netdev_warn_once(skb->dev, "VLAN not yet supported"); } - if (ethhdr->h_proto == htons(ETH_P_PRP) || - ethhdr->h_proto == htons(ETH_P_HSR)) { - frame->skb_std = NULL; - frame->skb_hsr = skb; - frame->sequence_nr = hsr_get_skb_sequence_nr(skb); - } else { - frame->skb_std = skb; - frame->skb_hsr = NULL; - /* Sequence nr for the master node */ - spin_lock_irqsave(&port->hsr->seqnr_lock, irqflags); - frame->sequence_nr = port->hsr->sequence_nr; - port->hsr->sequence_nr++; - spin_unlock_irqrestore(&port->hsr->seqnr_lock, irqflags); - } + frame->is_from_san = false; frame->port_rcv = port; + hsr->proto_ops->fill_frame_info(proto, skb, frame); check_local_dest(port->hsr, skb, frame); return 0; @@ -363,8 +534,9 @@ void hsr_forward_skb(struct sk_buff *skb, struct hsr_port *port) goto out_drop; } - if (hsr_fill_frame_info(&frame, skb, port) < 0) + if (fill_frame_info(&frame, skb, port) < 0) goto out_drop; + hsr_register_frame_in(frame.node_src, port, frame.sequence_nr); hsr_forward_do(&frame); /* Gets called for ingress frames as well as egress from master port. @@ -375,10 +547,9 @@ void hsr_forward_skb(struct sk_buff *skb, struct hsr_port *port) port->dev->stats.tx_bytes += skb->len; } - if (frame.skb_hsr) - kfree_skb(frame.skb_hsr); - if (frame.skb_std) - kfree_skb(frame.skb_std); + kfree_skb(frame.skb_hsr); + kfree_skb(frame.skb_prp); + kfree_skb(frame.skb_std); return; out_drop: diff --git a/net/hsr/hsr_forward.h b/net/hsr/hsr_forward.h index 51a69295566c..618140d484ad 100644 --- a/net/hsr/hsr_forward.h +++ b/net/hsr/hsr_forward.h @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_FORWARD_H @@ -12,5 +14,17 @@ #include "hsr_main.h" void hsr_forward_skb(struct sk_buff *skb, struct hsr_port *port); - +struct sk_buff *prp_create_tagged_frame(struct hsr_frame_info *frame, + struct hsr_port *port); +struct sk_buff *hsr_create_tagged_frame(struct hsr_frame_info *frame, + struct hsr_port *port); +struct sk_buff *hsr_get_untagged_frame(struct hsr_frame_info *frame, + struct hsr_port *port); +struct sk_buff *prp_get_untagged_frame(struct hsr_frame_info *frame, + struct hsr_port *port); +bool prp_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port); +void prp_fill_frame_info(__be16 proto, struct sk_buff *skb, + struct hsr_frame_info *frame); +void hsr_fill_frame_info(__be16 proto, struct sk_buff *skb, + struct hsr_frame_info *frame); #endif /* __HSR_FORWARD_H */ diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 530de24b1fb5..5c97de459905 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -8,6 +8,7 @@ * interface. A frame is identified by its source MAC address and its HSR * sequence number. This code keeps track of senders and their sequence numbers * to allow filtering of duplicate frames, and to detect HSR ring errors. + * Same code handles filtering of duplicates for PRP as well. */ #include <linux/if_ether.h> @@ -126,6 +127,19 @@ void hsr_del_nodes(struct list_head *node_db) kfree(node); } +void prp_handle_san_frame(bool san, enum hsr_port_type port, + struct hsr_node *node) +{ + /* Mark if the SAN node is over LAN_A or LAN_B */ + if (port == HSR_PT_SLAVE_A) { + node->san_a = true; + return; + } + + if (port == HSR_PT_SLAVE_B) + node->san_b = true; +} + /* Allocate an hsr_node and add it to node_db. 'addr' is the node's address_A; * seq_out is used to initialize filtering of outgoing duplicate frames * originating from the newly added node. @@ -133,7 +147,8 @@ void hsr_del_nodes(struct list_head *node_db) static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, struct list_head *node_db, unsigned char addr[], - u16 seq_out) + u16 seq_out, bool san, + enum hsr_port_type rx_port) { struct hsr_node *new_node, *node; unsigned long now; @@ -154,6 +169,9 @@ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, for (i = 0; i < HSR_PT_PORTS; i++) new_node->seq_out[i] = seq_out; + if (san && hsr->proto_ops->handle_san_frame) + hsr->proto_ops->handle_san_frame(san, rx_port, new_node); + spin_lock_bh(&hsr->list_lock); list_for_each_entry_rcu(node, node_db, mac_list, lockdep_is_held(&hsr->list_lock)) { @@ -171,15 +189,26 @@ out: return node; } +void prp_update_san_info(struct hsr_node *node, bool is_sup) +{ + if (!is_sup) + return; + + node->san_a = false; + node->san_b = false; +} + /* Get the hsr_node from which 'skb' was sent. */ -struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb, - bool is_sup) +struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db, + struct sk_buff *skb, bool is_sup, + enum hsr_port_type rx_port) { - struct list_head *node_db = &port->hsr->node_db; struct hsr_priv *hsr = port->hsr; struct hsr_node *node; struct ethhdr *ethhdr; + struct prp_rct *rct; + bool san = false; u16 seq_out; if (!skb_mac_header_was_set(skb)) @@ -188,14 +217,21 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb, ethhdr = (struct ethhdr *)skb_mac_header(skb); list_for_each_entry_rcu(node, node_db, mac_list) { - if (ether_addr_equal(node->macaddress_A, ethhdr->h_source)) + if (ether_addr_equal(node->macaddress_A, ethhdr->h_source)) { + if (hsr->proto_ops->update_san_info) + hsr->proto_ops->update_san_info(node, is_sup); return node; - if (ether_addr_equal(node->macaddress_B, ethhdr->h_source)) + } + if (ether_addr_equal(node->macaddress_B, ethhdr->h_source)) { + if (hsr->proto_ops->update_san_info) + hsr->proto_ops->update_san_info(node, is_sup); return node; + } } - /* Everyone may create a node entry, connected node to a HSR device. */ - + /* Everyone may create a node entry, connected node to a HSR/PRP + * device. + */ if (ethhdr->h_proto == htons(ETH_P_PRP) || ethhdr->h_proto == htons(ETH_P_HSR)) { /* Use the existing sequence_nr from the tag as starting point @@ -203,31 +239,47 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb, */ seq_out = hsr_get_skb_sequence_nr(skb) - 1; } else { - /* this is called also for frames from master port and - * so warn only for non master ports - */ - if (port->type != HSR_PT_MASTER) - WARN_ONCE(1, "%s: Non-HSR frame\n", __func__); - seq_out = HSR_SEQNR_START; + rct = skb_get_PRP_rct(skb); + if (rct && prp_check_lsdu_size(skb, rct, is_sup)) { + seq_out = prp_get_skb_sequence_nr(rct); + } else { + if (rx_port != HSR_PT_MASTER) + san = true; + seq_out = HSR_SEQNR_START; + } } - return hsr_add_node(hsr, node_db, ethhdr->h_source, seq_out); + return hsr_add_node(hsr, node_db, ethhdr->h_source, seq_out, + san, rx_port); } /* Use the Supervision frame's info about an eventual macaddress_B for merging * nodes that has previously had their macaddress_B registered as a separate * node. */ -void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, - struct hsr_port *port_rcv) +void hsr_handle_sup_frame(struct hsr_frame_info *frame) { + struct hsr_node *node_curr = frame->node_src; + struct hsr_port *port_rcv = frame->port_rcv; struct hsr_priv *hsr = port_rcv->hsr; struct hsr_sup_payload *hsr_sp; struct hsr_node *node_real; + struct sk_buff *skb = NULL; struct list_head *node_db; struct ethhdr *ethhdr; int i; + /* Here either frame->skb_hsr or frame->skb_prp should be + * valid as supervision frame always will have protocol + * header info. + */ + if (frame->skb_hsr) + skb = frame->skb_hsr; + else if (frame->skb_prp) + skb = frame->skb_prp; + if (!skb) + return; + ethhdr = (struct ethhdr *)skb_mac_header(skb); /* Leave the ethernet header. */ @@ -248,7 +300,8 @@ void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, if (!node_real) /* No frame received from AddrA of this node yet */ node_real = hsr_add_node(hsr, node_db, hsr_sp->macaddress_A, - HSR_SEQNR_START - 1); + HSR_SEQNR_START - 1, true, + port_rcv->type); if (!node_real) goto done; /* No mem */ if (node_real == node_curr) @@ -274,7 +327,11 @@ void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, kfree_rcu(node_curr, rcu_head); done: - skb_push(skb, sizeof(struct hsrv1_ethhdr_sp)); + /* PRP uses v0 header */ + if (ethhdr->h_proto == htons(ETH_P_HSR)) + skb_push(skb, sizeof(struct hsrv1_ethhdr_sp)); + else + skb_push(skb, sizeof(struct hsrv0_ethhdr_sp)); } /* 'skb' is a frame meant for this host, that is to be passed to upper layers. diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index 0f0fa12b4329..86b43f539f2c 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_FRAMEREG_H @@ -12,12 +14,26 @@ struct hsr_node; +struct hsr_frame_info { + struct sk_buff *skb_std; + struct sk_buff *skb_hsr; + struct sk_buff *skb_prp; + struct hsr_port *port_rcv; + struct hsr_node *node_src; + u16 sequence_nr; + bool is_supervision; + bool is_vlan; + bool is_local_dest; + bool is_local_exclusive; + bool is_from_san; +}; + void hsr_del_self_node(struct hsr_priv *hsr); void hsr_del_nodes(struct list_head *node_db); -struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb, - bool is_sup); -void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, - struct hsr_port *port); +struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db, + struct sk_buff *skb, bool is_sup, + enum hsr_port_type rx_port); +void hsr_handle_sup_frame(struct hsr_frame_info *frame); bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr); void hsr_addr_subst_source(struct hsr_node *node, struct sk_buff *skb); @@ -47,6 +63,10 @@ int hsr_get_node_data(struct hsr_priv *hsr, int *if2_age, u16 *if2_seq); +void prp_handle_san_frame(bool san, enum hsr_port_type port, + struct hsr_node *node); +void prp_update_san_info(struct hsr_node *node, bool is_sup); + struct hsr_node { struct list_head mac_list; unsigned char macaddress_A[ETH_ALEN]; @@ -55,6 +75,9 @@ struct hsr_node { enum hsr_port_type addr_B_port; unsigned long time_in[HSR_PT_PORTS]; bool time_in_stale[HSR_PT_PORTS]; + /* if the node is a SAN */ + bool san_a; + bool san_b; u16 seq_out[HSR_PT_PORTS]; struct rcu_head rcu_head; }; diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 144da15f0a81..2fd1976e5b1c 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * Event handling for HSR and PRP devices. */ #include <linux/netdevice.h> diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index f74193465bf5..7dc92ce5a134 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_PRIVATE_H @@ -10,6 +12,7 @@ #include <linux/netdevice.h> #include <linux/list.h> +#include <linux/if_vlan.h> /* Time constants as specified in the HSR specification (IEC-62439-3 2010) * Table 8. @@ -33,6 +36,10 @@ #define HSR_TLV_ANNOUNCE 22 #define HSR_TLV_LIFE_CHECK 23 +/* PRP V1 life check for Duplicate discard */ +#define PRP_TLV_LIFE_CHECK_DD 20 +/* PRP V1 life check for Duplicate Accept */ +#define PRP_TLV_LIFE_CHECK_DA 21 /* HSR Tag. * As defined in IEC-62439-3:2010, the HSR tag is really { ethertype = 0x88FB, @@ -80,7 +87,12 @@ struct hsr_ethhdr { struct hsr_tag hsr_tag; } __packed; -/* HSR Supervision Frame data types. +struct hsr_vlan_ethhdr { + struct vlan_ethhdr vlanhdr; + struct hsr_tag hsr_tag; +} __packed; + +/* HSR/PRP Supervision Frame data types. * Field names as defined in the IEC:2010 standard for HSR. */ struct hsr_sup_tag { @@ -124,6 +136,34 @@ enum hsr_port_type { HSR_PT_PORTS, /* This must be the last item in the enum */ }; +/* PRP Redunancy Control Trailor (RCT). + * As defined in IEC-62439-4:2012, the PRP RCT is really { sequence Nr, + * Lan indentifier (LanId), LSDU_size and PRP_suffix = 0x88FB }. + * + * Field names as defined in the IEC:2012 standard for PRP. + */ +struct prp_rct { + __be16 sequence_nr; + __be16 lan_id_and_LSDU_size; + __be16 PRP_suffix; +} __packed; + +static inline u16 get_prp_LSDU_size(struct prp_rct *rct) +{ + return ntohs(rct->lan_id_and_LSDU_size) & 0x0FFF; +} + +static inline void set_prp_lan_id(struct prp_rct *rct, u16 lan_id) +{ + rct->lan_id_and_LSDU_size = htons((ntohs(rct->lan_id_and_LSDU_size) & + 0x0FFF) | (lan_id << 12)); +} +static inline void set_prp_LSDU_size(struct prp_rct *rct, u16 LSDU_size) +{ + rct->lan_id_and_LSDU_size = htons((ntohs(rct->lan_id_and_LSDU_size) & + 0xF000) | (LSDU_size & 0x0FFF)); +} + struct hsr_port { struct list_head port_list; struct net_device *dev; @@ -131,6 +171,32 @@ struct hsr_port { enum hsr_port_type type; }; +/* used by driver internally to differentiate various protocols */ +enum hsr_version { + HSR_V0 = 0, + HSR_V1, + PRP_V1, +}; + +struct hsr_frame_info; +struct hsr_node; + +struct hsr_proto_ops { + /* format and send supervision frame */ + void (*send_sv_frame)(struct hsr_port *port, unsigned long *interval); + void (*handle_san_frame)(bool san, enum hsr_port_type port, + struct hsr_node *node); + bool (*drop_frame)(struct hsr_frame_info *frame, struct hsr_port *port); + struct sk_buff * (*get_untagged_frame)(struct hsr_frame_info *frame, + struct hsr_port *port); + struct sk_buff * (*create_tagged_frame)(struct hsr_frame_info *frame, + struct hsr_port *port); + void (*fill_frame_info)(__be16 proto, struct sk_buff *skb, + struct hsr_frame_info *frame); + bool (*invalid_dan_ingress_frame)(__be16 protocol); + void (*update_san_info)(struct hsr_node *node, bool is_sup); +}; + struct hsr_priv { struct rcu_head rcu_head; struct list_head ports; @@ -141,9 +207,16 @@ struct hsr_priv { int announce_count; u16 sequence_nr; u16 sup_sequence_nr; /* For HSRv1 separate seq_nr for supervision */ - u8 prot_version; /* Indicate if HSRv0 or HSRv1. */ + enum hsr_version prot_version; /* Indicate if HSRv0, HSRv1 or PRPv1 */ spinlock_t seqnr_lock; /* locking for sequence_nr */ spinlock_t list_lock; /* locking for node list */ + struct hsr_proto_ops *proto_ops; +#define PRP_LAN_ID 0x5 /* 0x1010 for A and 0x1011 for B. Bit 0 is set + * based on SLAVE_A or SLAVE_B + */ + u8 net_id; /* for PRP, it occupies most significant 3 bits + * of lan_id + */ unsigned char sup_multicast_addr[ETH_ALEN]; #ifdef CONFIG_DEBUG_FS struct dentry *node_tbl_root; @@ -164,6 +237,49 @@ static inline u16 hsr_get_skb_sequence_nr(struct sk_buff *skb) return ntohs(hsr_ethhdr->hsr_tag.sequence_nr); } +static inline struct prp_rct *skb_get_PRP_rct(struct sk_buff *skb) +{ + unsigned char *tail = skb_tail_pointer(skb) - HSR_HLEN; + + struct prp_rct *rct = (struct prp_rct *)tail; + + if (rct->PRP_suffix == htons(ETH_P_PRP)) + return rct; + + return NULL; +} + +/* Assume caller has confirmed this skb is PRP suffixed */ +static inline u16 prp_get_skb_sequence_nr(struct prp_rct *rct) +{ + return ntohs(rct->sequence_nr); +} + +static inline u16 get_prp_lan_id(struct prp_rct *rct) +{ + return ntohs(rct->lan_id_and_LSDU_size) >> 12; +} + +/* assume there is a valid rct */ +static inline bool prp_check_lsdu_size(struct sk_buff *skb, + struct prp_rct *rct, + bool is_sup) +{ + struct ethhdr *ethhdr; + int expected_lsdu_size; + + if (is_sup) { + expected_lsdu_size = HSR_V1_SUP_LSDUSIZE; + } else { + ethhdr = (struct ethhdr *)skb_mac_header(skb); + expected_lsdu_size = skb->len - 14; + if (ethhdr->h_proto == htons(ETH_P_8021Q)) + expected_lsdu_size -= 4; + } + + return (expected_lsdu_size == get_prp_LSDU_size(rct)); +} + #if IS_ENABLED(CONFIG_DEBUG_FS) void hsr_debugfs_rename(struct net_device *dev); void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev); diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 6e14b7d22639..06c3cd988760 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -4,7 +4,7 @@ * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * - * Routines for handling Netlink messages for HSR. + * Routines for handling Netlink messages for HSR and PRP. */ #include "hsr_netlink.h" @@ -22,6 +22,7 @@ static const struct nla_policy hsr_policy[IFLA_HSR_MAX + 1] = { [IFLA_HSR_VERSION] = { .type = NLA_U8 }, [IFLA_HSR_SUPERVISION_ADDR] = { .len = ETH_ALEN }, [IFLA_HSR_SEQ_NR] = { .type = NLA_U16 }, + [IFLA_HSR_PROTOCOL] = { .type = NLA_U8 }, }; /* Here, it seems a netdevice has already been allocated for us, and the @@ -31,8 +32,10 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { + enum hsr_version proto_version; + unsigned char multicast_spec; + u8 proto = HSR_PROTOCOL_HSR; struct net_device *link[2]; - unsigned char multicast_spec, hsr_version; if (!data) { NL_SET_ERR_MSG_MOD(extack, "No slave devices specified"); @@ -69,18 +72,34 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, else multicast_spec = nla_get_u8(data[IFLA_HSR_MULTICAST_SPEC]); + if (data[IFLA_HSR_PROTOCOL]) + proto = nla_get_u8(data[IFLA_HSR_PROTOCOL]); + + if (proto >= HSR_PROTOCOL_MAX) { + NL_SET_ERR_MSG_MOD(extack, "Unsupported protocol\n"); + return -EINVAL; + } + if (!data[IFLA_HSR_VERSION]) { - hsr_version = 0; + proto_version = HSR_V0; } else { - hsr_version = nla_get_u8(data[IFLA_HSR_VERSION]); - if (hsr_version > 1) { + if (proto == HSR_PROTOCOL_PRP) { + NL_SET_ERR_MSG_MOD(extack, "PRP version unsupported\n"); + return -EINVAL; + } + + proto_version = nla_get_u8(data[IFLA_HSR_VERSION]); + if (proto_version > HSR_V1) { NL_SET_ERR_MSG_MOD(extack, - "Only versions 0..1 are supported"); + "Only HSR version 0/1 supported\n"); return -EINVAL; } } - return hsr_dev_finalize(dev, link, multicast_spec, hsr_version, extack); + if (proto == HSR_PROTOCOL_PRP) + proto_version = PRP_V1; + + return hsr_dev_finalize(dev, link, multicast_spec, proto_version, extack); } static void hsr_dellink(struct net_device *dev, struct list_head *head) @@ -102,6 +121,7 @@ static void hsr_dellink(struct net_device *dev, struct list_head *head) static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct hsr_priv *hsr = netdev_priv(dev); + u8 proto = HSR_PROTOCOL_HSR; struct hsr_port *port; port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); @@ -120,6 +140,10 @@ static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev) hsr->sup_multicast_addr) || nla_put_u16(skb, IFLA_HSR_SEQ_NR, hsr->sequence_nr)) goto nla_put_failure; + if (hsr->prot_version == PRP_V1) + proto = HSR_PROTOCOL_PRP; + if (nla_put_u8(skb, IFLA_HSR_PROTOCOL, proto)) + goto nla_put_failure; return 0; diff --git a/net/hsr/hsr_netlink.h b/net/hsr/hsr_netlink.h index 1121bb192a18..501552d9753b 100644 --- a/net/hsr/hsr_netlink.h +++ b/net/hsr/hsr_netlink.h @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_NETLINK_H diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index 25b6ffba26cd..36d5fcf09c61 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * Frame handler other utility functions for HSR and PRP. */ #include "hsr_slave.h" @@ -14,12 +16,22 @@ #include "hsr_forward.h" #include "hsr_framereg.h" +bool hsr_invalid_dan_ingress_frame(__be16 protocol) +{ + return (protocol != htons(ETH_P_PRP) && protocol != htons(ETH_P_HSR)); +} + static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct hsr_port *port; + struct hsr_priv *hsr; __be16 protocol; + /* Packets from dev_loopback_xmit() do not have L2 header, bail out */ + if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) + return RX_HANDLER_PASS; + if (!skb_mac_header_was_set(skb)) { WARN_ONCE(1, "%s: skb invalid", __func__); return RX_HANDLER_PASS; @@ -28,6 +40,7 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) port = hsr_port_get_rcu(skb->dev); if (!port) goto finish_pass; + hsr = port->hsr; if (hsr_addr_is_self(port->hsr, eth_hdr(skb)->h_source)) { /* Directly kill frames sent by ourselves */ @@ -35,12 +48,23 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) goto finish_consume; } + /* For HSR, only tagged frames are expected, but for PRP + * there could be non tagged frames as well from Single + * attached nodes (SANs). + */ protocol = eth_hdr(skb)->h_proto; - if (protocol != htons(ETH_P_PRP) && protocol != htons(ETH_P_HSR)) + if (hsr->proto_ops->invalid_dan_ingress_frame && + hsr->proto_ops->invalid_dan_ingress_frame(protocol)) goto finish_pass; skb_push(skb, ETH_HLEN); + if (skb_mac_header(skb) != skb->data) { + WARN_ONCE(1, "%s:%d: Malformed frame at source port %s)\n", + __func__, __LINE__, port->dev->name); + goto finish_consume; + } + hsr_forward_skb(skb, port); finish_consume: diff --git a/net/hsr/hsr_slave.h b/net/hsr/hsr_slave.h index 8953ea279ce9..edc4612bb009 100644 --- a/net/hsr/hsr_slave.h +++ b/net/hsr/hsr_slave.h @@ -2,6 +2,8 @@ /* Copyright 2011-2014 Autronica Fire and Security AS * * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_SLAVE_H @@ -30,4 +32,6 @@ static inline struct hsr_port *hsr_port_get_rcu(const struct net_device *dev) rcu_dereference(dev->rx_handler_data) : NULL; } +bool hsr_invalid_dan_ingress_frame(__be16 protocol); + #endif /* __HSR_SLAVE_H */ diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index d93d4531aa9b..a45a0401adc5 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -382,7 +382,7 @@ static int raw_getsockopt(struct sock *sk, int level, int optname, } static int raw_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { return -EOPNOTSUPP; } @@ -423,10 +423,6 @@ static const struct proto_ops ieee802154_raw_ops = { .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, -#endif }; /* DGRAM Sockets (802.15.4 dataframes) */ @@ -876,7 +872,7 @@ static int dgram_getsockopt(struct sock *sk, int level, int optname, } static int dgram_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct dgram_sock *ro = dgram_sk(sk); struct net *net = sock_net(sk); @@ -886,7 +882,7 @@ static int dgram_setsockopt(struct sock *sk, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; lock_sock(sk); @@ -986,10 +982,6 @@ static const struct proto_ops ieee802154_dgram_ops = { .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, -#endif }; /* Create a socket. Initialise the socket, blank the addresses diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index e64e59b536d3..60db5a6487cc 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -10,7 +10,7 @@ config IP_MULTICAST intend to participate in the MBONE, a high bandwidth network on top of the Internet which carries audio and video broadcasts. More information about the MBONE is on the WWW at - <http://www.savetz.com/mbone/>. For most people, it's safe to say N. + <https://www.savetz.com/mbone/>. For most people, it's safe to say N. config IP_ADVANCED_ROUTER bool "IP: advanced router" @@ -73,7 +73,7 @@ config IP_MULTIPLE_TABLES If you need more information, see the Linux Advanced Routing and Traffic Control documentation at - <http://lartc.org/howto/lartc.rpdb.html> + <https://lartc.org/howto/lartc.rpdb.html> If unsure, say N. @@ -280,7 +280,7 @@ config SYN_COOKIES continue to connect, even when your machine is under attack. There is no need for the legitimate users to change their TCP/IP software; SYN cookies work transparently to them. For technical information - about SYN cookies, check out <http://cr.yp.to/syncookies.html>. + about SYN cookies, check out <https://cr.yp.to/syncookies.html>. If you are SYN flooded, the source address reported by the kernel is likely to have been forged by the attacker; it is only reported as @@ -525,7 +525,7 @@ config TCP_CONG_HSTCP A modification to TCP's congestion control mechanism for use with large congestion windows. A table indicates how much to increase the congestion window by when an ACK is received. - For more detail see http://www.icir.org/floyd/hstcp.html + For more detail see https://www.icir.org/floyd/hstcp.html config TCP_CONG_HYBLA tristate "TCP-Hybla congestion control algorithm" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 9e1a186a3671..5b77a46885b9 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -14,7 +14,7 @@ obj-y := route.o inetpeer.o protocol.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \ - metrics.o netlink.o nexthop.o + metrics.o netlink.o nexthop.o udp_tunnel_stub.o obj-$(CONFIG_BPFILTER) += bpfilter/ @@ -29,6 +29,7 @@ gre-y := gre_demux.o obj-$(CONFIG_NET_FOU) += fou.o obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o obj-$(CONFIG_NET_IPGRE) += ip_gre.o +udp_tunnel-y := udp_tunnel_core.o udp_tunnel_nic.o obj-$(CONFIG_NET_UDP_TUNNEL) += udp_tunnel.o obj-$(CONFIG_NET_IPVTI) += ip_vti.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 02aa5cb3a4fd..4307503a6f0b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -411,6 +411,9 @@ int inet_release(struct socket *sock) if (sk) { long timeout; + if (!sk->sk_kern_sock) + BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk); + /* Applications forget to leave groups before exiting */ ip_mc_drop_socket(sk); @@ -1040,8 +1043,6 @@ const struct proto_ops inet_stream_ops = { .sendpage_locked = tcp_sendpage_locked, .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, .compat_ioctl = inet_compat_ioctl, #endif .set_rcvlowat = tcp_set_rcvlowat, @@ -1070,8 +1071,6 @@ const struct proto_ops inet_dgram_ops = { .sendpage = inet_sendpage, .set_peek_off = sk_set_peek_off, #ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, .compat_ioctl = inet_compat_ioctl, #endif }; @@ -1102,8 +1101,6 @@ static const struct proto_ops inet_sockraw_ops = { .mmap = sock_no_mmap, .sendpage = inet_sendpage, #ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, .compat_ioctl = inet_compat_ioctl, #endif }; @@ -1432,10 +1429,6 @@ static struct sk_buff *ipip_gso_segment(struct sk_buff *skb, return inet_gso_segment(skb, features); } -INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *, - struct sk_buff *)); -INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *, - struct sk_buff *)); struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) { const struct net_offload *ops; @@ -1608,8 +1601,6 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) return -EINVAL; } -INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *, int)); -INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int)); int inet_gro_complete(struct sk_buff *skb, int nhoff) { __be16 newlen = htons(skb->len - nhoff); diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c index 9063c6767d34..545b2640f019 100644 --- a/net/ipv4/bpfilter/sockopt.c +++ b/net/ipv4/bpfilter/sockopt.c @@ -21,8 +21,7 @@ void bpfilter_umh_cleanup(struct umd_info *info) } EXPORT_SYMBOL_GPL(bpfilter_umh_cleanup); -static int bpfilter_mbox_request(struct sock *sk, int optname, - char __user *optval, +static int bpfilter_mbox_request(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen, bool is_set) { int err; @@ -52,20 +51,23 @@ out: return err; } -int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval, +int bpfilter_ip_set_sockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { return bpfilter_mbox_request(sk, optname, optval, optlen, true); } -int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, - int __user *optlen) +int bpfilter_ip_get_sockopt(struct sock *sk, int optname, + char __user *user_optval, int __user *optlen) { - int len; + sockptr_t optval; + int err, len; if (get_user(len, optlen)) return -EFAULT; - + err = init_user_sockptr(&optval, user_optval, len); + if (err) + return err; return bpfilter_mbox_request(sk, optname, optval, len, false); } diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index a23094b050f8..2eb71579f4d2 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -10,9 +10,9 @@ * * The CIPSO draft specification can be found in the kernel's Documentation * directory as well as the following URL: - * http://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt + * https://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt * The FIPS-188 specification can be found at the following URL: - * http://www.itl.nist.gov/fipspubs/fip188.htm + * https://www.itl.nist.gov/fipspubs/fip188.htm * * Author: Paul Moore <paul.moore@hp.com> */ @@ -283,7 +283,7 @@ static int cipso_v4_cache_check(const unsigned char *key, /** * cipso_v4_cache_add - Add an entry to the CIPSO cache - * @skb: the packet + * @cipso_ptr: pointer to CIPSO IP option * @secattr: the packet's security attributes * * Description: @@ -1535,6 +1535,7 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb) /** * cipso_v4_validate - Validate a CIPSO option + * @skb: the packet * @option: the start of the option, on error it is set to point to the error * * Description: @@ -2066,7 +2067,7 @@ void cipso_v4_sock_delattr(struct sock *sk) /** * cipso_v4_req_delattr - Delete the CIPSO option from a request socket - * @reg: the request socket + * @req: the request socket * * Description: * Removes the CIPSO option from a request socket, if present. @@ -2158,6 +2159,7 @@ int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) /** * cipso_v4_skbuff_setattr - Set the CIPSO option on a packet * @skb: the packet + * @doi_def: the DOI structure * @secattr: the security attributes * * Description: diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index f99e3bac5cab..ce54a30c2ef1 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -29,6 +29,7 @@ #include <net/ip_fib.h> #include <net/nexthop.h> #include <net/fib_rules.h> +#include <linux/indirect_call_wrapper.h> struct fib4_rule { struct fib_rule common; @@ -103,8 +104,9 @@ int __fib_lookup(struct net *net, struct flowi4 *flp, } EXPORT_SYMBOL_GPL(__fib_lookup); -static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, - int flags, struct fib_lookup_arg *arg) +INDIRECT_CALLABLE_SCOPE int fib4_rule_action(struct fib_rule *rule, + struct flowi *flp, int flags, + struct fib_lookup_arg *arg) { int err = -EAGAIN; struct fib_table *tbl; @@ -138,7 +140,8 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, return err; } -static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) +INDIRECT_CALLABLE_SCOPE bool fib4_rule_suppress(struct fib_rule *rule, + struct fib_lookup_arg *arg) { struct fib_result *result = (struct fib_result *) arg->result; struct net_device *dev = NULL; @@ -169,7 +172,8 @@ suppress_route: return true; } -static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) +INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule, + struct flowi *fl, int flags) { struct fib4_rule *r = (struct fib4_rule *) rule; struct flowi4 *fl4 = &fl->u.ip4; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 3c65f71d0e82..c89b46fec153 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -13,7 +13,7 @@ * * An experimental study of compression methods for dynamic tries * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. - * http://www.csc.kth.se/~snilsson/software/dyntrie2/ + * https://www.csc.kth.se/~snilsson/software/dyntrie2/ * * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 2e6d1b7a7bc9..e0a246575887 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -15,12 +15,12 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, netdev_features_t features) { int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); + bool need_csum, need_recompute_csum, gso_partial; struct sk_buff *segs = ERR_PTR(-EINVAL); u16 mac_offset = skb->mac_header; __be16 protocol = skb->protocol; u16 mac_len = skb->mac_len; int gre_offset, outer_hlen; - bool need_csum, gso_partial; if (!skb->encapsulation) goto out; @@ -41,6 +41,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, skb->protocol = skb->inner_protocol; need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM); + need_recompute_csum = skb->csum_not_inet; skb->encap_hdr_csum = need_csum; features &= skb->dev->hw_enc_features; @@ -98,7 +99,15 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, } *(pcsum + 1) = 0; - *pcsum = gso_make_checksum(skb, 0); + if (need_recompute_csum && !skb_is_gso(skb)) { + __wsum csum; + + csum = skb_checksum(skb, gre_offset, + skb->len - gre_offset, 0); + *pcsum = csum_fold(csum); + } else { + *pcsum = gso_make_checksum(skb, 0); + } } while ((skb = skb->next)); out: return segs; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index e30515f89802..cf36f955bfe6 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1116,6 +1116,65 @@ error: goto drop; } +static bool ip_icmp_error_rfc4884_validate(const struct sk_buff *skb, int off) +{ + struct icmp_extobj_hdr *objh, _objh; + struct icmp_ext_hdr *exth, _exth; + u16 olen; + + exth = skb_header_pointer(skb, off, sizeof(_exth), &_exth); + if (!exth) + return false; + if (exth->version != 2) + return true; + + if (exth->checksum && + csum_fold(skb_checksum(skb, off, skb->len - off, 0))) + return false; + + off += sizeof(_exth); + while (off < skb->len) { + objh = skb_header_pointer(skb, off, sizeof(_objh), &_objh); + if (!objh) + return false; + + olen = ntohs(objh->length); + if (olen < sizeof(_objh)) + return false; + + off += olen; + if (off > skb->len) + return false; + } + + return true; +} + +void ip_icmp_error_rfc4884(const struct sk_buff *skb, + struct sock_ee_data_rfc4884 *out, + int thlen, int off) +{ + int hlen; + + /* original datagram headers: end of icmph to payload (skb->data) */ + hlen = -skb_transport_offset(skb) - thlen; + + /* per rfc 4884: minimal datagram length of 128 bytes */ + if (off < 128 || off < hlen) + return; + + /* kernel has stripped headers: return payload offset in bytes */ + off -= hlen; + if (off + sizeof(struct icmp_ext_hdr) > skb->len) + return; + + out->len = off; + + if (!ip_icmp_error_rfc4884_validate(skb, off)) + out->flags |= SO_EE_RFC4884_FLAG_INVALID; +} +EXPORT_SYMBOL_GPL(ip_icmp_error_rfc4884); + int icmp_err(struct sk_buff *skb, u32 info) { struct iphdr *iph = (struct iphdr *)skb->data; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index afaf582a5aa9..d1a3913eebe0 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -648,20 +648,19 @@ no_route: EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); /* Decide when to expire the request and when to resend SYN-ACK */ -static inline void syn_ack_recalc(struct request_sock *req, const int thresh, - const int max_retries, - const u8 rskq_defer_accept, - int *expire, int *resend) +static void syn_ack_recalc(struct request_sock *req, + const int max_syn_ack_retries, + const u8 rskq_defer_accept, + int *expire, int *resend) { if (!rskq_defer_accept) { - *expire = req->num_timeout >= thresh; + *expire = req->num_timeout >= max_syn_ack_retries; *resend = 1; return; } - *expire = req->num_timeout >= thresh && - (!inet_rsk(req)->acked || req->num_timeout >= max_retries); - /* - * Do not resend while waiting for data after ACK, + *expire = req->num_timeout >= max_syn_ack_retries && + (!inet_rsk(req)->acked || req->num_timeout >= rskq_defer_accept); + /* Do not resend while waiting for data after ACK, * start to resend on end of deferring period to give * last chance for data or ACK to create established socket. */ @@ -720,15 +719,12 @@ static void reqsk_timer_handler(struct timer_list *t) struct net *net = sock_net(sk_listener); struct inet_connection_sock *icsk = inet_csk(sk_listener); struct request_sock_queue *queue = &icsk->icsk_accept_queue; - int qlen, expire = 0, resend = 0; - int max_retries, thresh; - u8 defer_accept; + int max_syn_ack_retries, qlen, expire = 0, resend = 0; if (inet_sk_state_load(sk_listener) != TCP_LISTEN) goto drop; - max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries; - thresh = max_retries; + max_syn_ack_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries; /* Normally all the openreqs are young and become mature * (i.e. converted to established socket) for first timeout. * If synack was not acknowledged for 1 second, it means @@ -750,17 +746,14 @@ static void reqsk_timer_handler(struct timer_list *t) if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) { int young = reqsk_queue_len_young(queue) << 1; - while (thresh > 2) { + while (max_syn_ack_retries > 2) { if (qlen < young) break; - thresh--; + max_syn_ack_retries--; young <<= 1; } } - defer_accept = READ_ONCE(queue->rskq_defer_accept); - if (defer_accept) - max_retries = defer_accept; - syn_ack_recalc(req, thresh, max_retries, defer_accept, + syn_ack_recalc(req, max_syn_ack_retries, READ_ONCE(queue->rskq_defer_accept), &expire, &resend); req->rsk_ops->syn_ack_timeout(req); if (!expire && @@ -1064,34 +1057,6 @@ void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) } EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); -#ifdef CONFIG_COMPAT -int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - - if (icsk->icsk_af_ops->compat_getsockopt) - return icsk->icsk_af_ops->compat_getsockopt(sk, level, optname, - optval, optlen); - return icsk->icsk_af_ops->getsockopt(sk, level, optname, - optval, optlen); -} -EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt); - -int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - - if (icsk->icsk_af_ops->compat_setsockopt) - return icsk->icsk_af_ops->compat_setsockopt(sk, level, optname, - optval, optlen); - return icsk->icsk_af_ops->setsockopt(sk, level, optname, - optval, optlen); -} -EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt); -#endif - static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl) { const struct inet_sock *inet = inet_sk(sk); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 125f4f8a36b4..4a98dd736270 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -52,6 +52,11 @@ static DEFINE_MUTEX(inet_diag_table_mutex); static const struct inet_diag_handler *inet_diag_lock_handler(int proto) { + if (proto < 0 || proto >= IPPROTO_MAX) { + mutex_lock(&inet_diag_table_mutex); + return ERR_PTR(-ENOENT); + } + if (!inet_diag_table[proto]) sock_load_diag_module(AF_INET, proto); @@ -181,6 +186,28 @@ errout: } EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill); +static void inet_diag_parse_attrs(const struct nlmsghdr *nlh, int hdrlen, + struct nlattr **req_nlas) +{ + struct nlattr *nla; + int remaining; + + nlmsg_for_each_attr(nla, nlh, hdrlen, remaining) { + int type = nla_type(nla); + + if (type < __INET_DIAG_REQ_MAX) + req_nlas[type] = nla; + } +} + +static int inet_diag_get_protocol(const struct inet_diag_req_v2 *req, + const struct inet_diag_dump_data *data) +{ + if (data->req_nlas[INET_DIAG_REQ_PROTOCOL]) + return nla_get_u32(data->req_nlas[INET_DIAG_REQ_PROTOCOL]); + return req->sdiag_protocol; +} + #define MAX_DUMP_ALLOC_SIZE (KMALLOC_MAX_SIZE - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, @@ -198,7 +225,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, void *info = NULL; cb_data = cb->data; - handler = inet_diag_table[req->sdiag_protocol]; + handler = inet_diag_table[inet_diag_get_protocol(req, cb_data)]; BUG_ON(!handler); nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, @@ -539,20 +566,25 @@ EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk); static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb, const struct nlmsghdr *nlh, + int hdrlen, const struct inet_diag_req_v2 *req) { const struct inet_diag_handler *handler; - int err; + struct inet_diag_dump_data dump_data; + int err, protocol; - handler = inet_diag_lock_handler(req->sdiag_protocol); + memset(&dump_data, 0, sizeof(dump_data)); + inet_diag_parse_attrs(nlh, hdrlen, dump_data.req_nlas); + protocol = inet_diag_get_protocol(req, &dump_data); + + handler = inet_diag_lock_handler(protocol); if (IS_ERR(handler)) { err = PTR_ERR(handler); } else if (cmd == SOCK_DIAG_BY_FAMILY) { - struct inet_diag_dump_data empty_dump_data = {}; struct netlink_callback cb = { .nlh = nlh, .skb = in_skb, - .data = &empty_dump_data, + .data = &dump_data, }; err = handler->dump_one(&cb, req); } else if (cmd == SOCK_DESTROY && handler->destroy) { @@ -1103,13 +1135,16 @@ EXPORT_SYMBOL_GPL(inet_diag_dump_icsk); static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { + struct inet_diag_dump_data *cb_data = cb->data; const struct inet_diag_handler *handler; u32 prev_min_dump_alloc; - int err = 0; + int protocol, err = 0; + + protocol = inet_diag_get_protocol(r, cb_data); again: prev_min_dump_alloc = cb->min_dump_alloc; - handler = inet_diag_lock_handler(r->sdiag_protocol); + handler = inet_diag_lock_handler(protocol); if (!IS_ERR(handler)) handler->dump(skb, cb, r); else @@ -1139,19 +1174,13 @@ static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen) struct inet_diag_dump_data *cb_data; struct sk_buff *skb = cb->skb; struct nlattr *nla; - int rem, err; + int err; cb_data = kzalloc(sizeof(*cb_data), GFP_KERNEL); if (!cb_data) return -ENOMEM; - nla_for_each_attr(nla, nlmsg_attrdata(nlh, hdrlen), - nlmsg_attrlen(nlh, hdrlen), rem) { - int type = nla_type(nla); - - if (type < __INET_DIAG_REQ_MAX) - cb_data->req_nlas[type] = nla; - } + inet_diag_parse_attrs(nlh, hdrlen, cb_data->req_nlas); nla = cb_data->inet_diag_nla_bc; if (nla) { @@ -1237,7 +1266,8 @@ static int inet_diag_get_exact_compat(struct sk_buff *in_skb, req.idiag_states = rc->idiag_states; req.id = rc->id; - return inet_diag_cmd_exact(SOCK_DIAG_BY_FAMILY, in_skb, nlh, &req); + return inet_diag_cmd_exact(SOCK_DIAG_BY_FAMILY, in_skb, nlh, + sizeof(struct inet_diag_req), &req); } static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) @@ -1279,7 +1309,8 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h) return netlink_dump_start(net->diag_nlsk, skb, h, &c); } - return inet_diag_cmd_exact(h->nlmsg_type, skb, h, nlmsg_data(h)); + return inet_diag_cmd_exact(h->nlmsg_type, skb, h, hdrlen, + nlmsg_data(h)); } static diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 2bbaaf0c7176..4eb4cd8d20dd 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -246,6 +246,21 @@ static inline int compute_score(struct sock *sk, struct net *net, return score; } +static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk, + struct sk_buff *skb, int doff, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned short hnum) +{ + struct sock *reuse_sk = NULL; + u32 phash; + + if (sk->sk_reuseport) { + phash = inet_ehashfn(net, daddr, hnum, saddr, sport); + reuse_sk = reuseport_select_sock(sk, phash, skb, doff); + } + return reuse_sk; +} + /* * Here are some nice properties to exploit here. The BSD API * does not allow a listening sock to specify the remote port nor the @@ -265,21 +280,17 @@ static struct sock *inet_lhash2_lookup(struct net *net, struct inet_connection_sock *icsk; struct sock *sk, *result = NULL; int score, hiscore = 0; - u32 phash = 0; inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { sk = (struct sock *)icsk; score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif); if (score > hiscore) { - if (sk->sk_reuseport) { - phash = inet_ehashfn(net, daddr, hnum, - saddr, sport); - result = reuseport_select_sock(sk, phash, - skb, doff); - if (result) - return result; - } + result = lookup_reuseport(net, sk, skb, doff, + saddr, sport, daddr, hnum); + if (result) + return result; + result = sk; hiscore = score; } @@ -288,6 +299,29 @@ static struct sock *inet_lhash2_lookup(struct net *net, return result; } +static inline struct sock *inet_lookup_run_bpf(struct net *net, + struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, + __be32 saddr, __be16 sport, + __be32 daddr, u16 hnum) +{ + struct sock *sk, *reuse_sk; + bool no_reuseport; + + if (hashinfo != &tcp_hashinfo) + return NULL; /* only TCP is supported */ + + no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP, + saddr, sport, daddr, hnum, &sk); + if (no_reuseport || IS_ERR_OR_NULL(sk)) + return sk; + + reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); + if (reuse_sk) + sk = reuse_sk; + return sk; +} + struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -299,6 +333,14 @@ struct sock *__inet_lookup_listener(struct net *net, struct sock *result = NULL; unsigned int hash2; + /* Lookup redirect from BPF */ + if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { + result = inet_lookup_run_bpf(net, hashinfo, skb, doff, + saddr, sport, daddr, hnum); + if (result) + goto done; + } + hash2 = ipv4_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index ddaa01ec2bce..948747aac4e2 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -519,15 +519,20 @@ void ip_options_undo(struct ip_options *opt) } } -static struct ip_options_rcu *ip_options_get_alloc(const int optlen) +int ip_options_get(struct net *net, struct ip_options_rcu **optp, + sockptr_t data, int optlen) { - return kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3), + struct ip_options_rcu *opt; + + opt = kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3), GFP_KERNEL); -} + if (!opt) + return -ENOMEM; + if (optlen && copy_from_sockptr(opt->opt.__data, data, optlen)) { + kfree(opt); + return -EFAULT; + } -static int ip_options_get_finish(struct net *net, struct ip_options_rcu **optp, - struct ip_options_rcu *opt, int optlen) -{ while (optlen & 3) opt->opt.__data[optlen++] = IPOPT_END; opt->opt.optlen = optlen; @@ -540,32 +545,6 @@ static int ip_options_get_finish(struct net *net, struct ip_options_rcu **optp, return 0; } -int ip_options_get_from_user(struct net *net, struct ip_options_rcu **optp, - unsigned char __user *data, int optlen) -{ - struct ip_options_rcu *opt = ip_options_get_alloc(optlen); - - if (!opt) - return -ENOMEM; - if (optlen && copy_from_user(opt->opt.__data, data, optlen)) { - kfree(opt); - return -EFAULT; - } - return ip_options_get_finish(net, optp, opt, optlen); -} - -int ip_options_get(struct net *net, struct ip_options_rcu **optp, - unsigned char *data, int optlen) -{ - struct ip_options_rcu *opt = ip_options_get_alloc(optlen); - - if (!opt) - return -ENOMEM; - if (optlen) - memcpy(opt->opt.__data, data, optlen); - return ip_options_get_finish(net, optp, opt, optlen); -} - void ip_forward_options(struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 17206677d503..61f802d5350c 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -539,6 +539,12 @@ no_route: } EXPORT_SYMBOL(__ip_queue_xmit); +int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) +{ + return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos); +} +EXPORT_SYMBOL(ip_queue_xmit); + static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) { to->pkt_type = from->pkt_type; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 84ec3703c909..d2c223554ff7 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -280,7 +280,8 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, err = cmsg->cmsg_len - sizeof(struct cmsghdr); /* Our caller is responsible for freeing ipc->opt */ - err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg), + err = ip_options_get(net, &ipc->opt, + KERNEL_SOCKPTR(CMSG_DATA(cmsg)), err < 40 ? err : 40); if (err) return err; @@ -389,6 +390,18 @@ int ip_ra_control(struct sock *sk, unsigned char on, return 0; } +static void ipv4_icmp_error_rfc4884(const struct sk_buff *skb, + struct sock_ee_data_rfc4884 *out) +{ + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + case ICMP_TIME_EXCEEDED: + case ICMP_PARAMETERPROB: + ip_icmp_error_rfc4884(skb, out, sizeof(struct icmphdr), + icmp_hdr(skb)->un.reserved[1] * 4); + } +} + void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) { @@ -411,6 +424,9 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, serr->port = port; if (skb_pull(skb, payload - skb->data)) { + if (inet_sk(sk)->recverr_rfc4884) + ipv4_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884); + skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb) == 0) return; @@ -679,20 +695,48 @@ Eaddrnotavail: return -EADDRNOTAVAIL; } +static int copy_group_source_from_sockptr(struct group_source_req *greqs, + sockptr_t optval, int optlen) +{ + if (in_compat_syscall()) { + struct compat_group_source_req gr32; + + if (optlen != sizeof(gr32)) + return -EINVAL; + if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) + return -EFAULT; + greqs->gsr_interface = gr32.gsr_interface; + greqs->gsr_group = gr32.gsr_group; + greqs->gsr_source = gr32.gsr_source; + } else { + if (optlen != sizeof(*greqs)) + return -EINVAL; + if (copy_from_sockptr(greqs, optval, sizeof(*greqs))) + return -EFAULT; + } + + return 0; +} + static int do_mcast_group_source(struct sock *sk, int optname, - struct group_source_req *greqs) + sockptr_t optval, int optlen) { + struct group_source_req greqs; struct ip_mreq_source mreqs; struct sockaddr_in *psin; int omode, add, err; - if (greqs->gsr_group.ss_family != AF_INET || - greqs->gsr_source.ss_family != AF_INET) + err = copy_group_source_from_sockptr(&greqs, optval, optlen); + if (err) + return err; + + if (greqs.gsr_group.ss_family != AF_INET || + greqs.gsr_source.ss_family != AF_INET) return -EADDRNOTAVAIL; - psin = (struct sockaddr_in *)&greqs->gsr_group; + psin = (struct sockaddr_in *)&greqs.gsr_group; mreqs.imr_multiaddr = psin->sin_addr.s_addr; - psin = (struct sockaddr_in *)&greqs->gsr_source; + psin = (struct sockaddr_in *)&greqs.gsr_source; mreqs.imr_sourceaddr = psin->sin_addr.s_addr; mreqs.imr_interface = 0; /* use index for mc_source */ @@ -705,25 +749,145 @@ static int do_mcast_group_source(struct sock *sk, int optname, } else if (optname == MCAST_JOIN_SOURCE_GROUP) { struct ip_mreqn mreq; - psin = (struct sockaddr_in *)&greqs->gsr_group; + psin = (struct sockaddr_in *)&greqs.gsr_group; mreq.imr_multiaddr = psin->sin_addr; mreq.imr_address.s_addr = 0; - mreq.imr_ifindex = greqs->gsr_interface; + mreq.imr_ifindex = greqs.gsr_interface; err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); if (err && err != -EADDRINUSE) return err; - greqs->gsr_interface = mreq.imr_ifindex; + greqs.gsr_interface = mreq.imr_ifindex; omode = MCAST_INCLUDE; add = 1; } else /* MCAST_LEAVE_SOURCE_GROUP */ { omode = MCAST_INCLUDE; add = 0; } - return ip_mc_source(add, omode, sk, &mreqs, greqs->gsr_interface); + return ip_mc_source(add, omode, sk, &mreqs, greqs.gsr_interface); +} + +static int ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) +{ + struct group_filter *gsf = NULL; + int err; + + if (optlen < GROUP_FILTER_SIZE(0)) + return -EINVAL; + if (optlen > sysctl_optmem_max) + return -ENOBUFS; + + gsf = memdup_sockptr(optval, optlen); + if (IS_ERR(gsf)) + return PTR_ERR(gsf); + + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + err = -ENOBUFS; + if (gsf->gf_numsrc >= 0x1ffffff || + gsf->gf_numsrc > sock_net(sk)->ipv4.sysctl_igmp_max_msf) + goto out_free_gsf; + + err = -EINVAL; + if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) + goto out_free_gsf; + + err = set_mcast_msfilter(sk, gsf->gf_interface, gsf->gf_numsrc, + gsf->gf_fmode, &gsf->gf_group, gsf->gf_slist); +out_free_gsf: + kfree(gsf); + return err; +} + +static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, + int optlen) +{ + const int size0 = offsetof(struct compat_group_filter, gf_slist); + struct compat_group_filter *gf32; + unsigned int n; + void *p; + int err; + + if (optlen < size0) + return -EINVAL; + if (optlen > sysctl_optmem_max - 4) + return -ENOBUFS; + + p = kmalloc(optlen + 4, GFP_KERNEL); + if (!p) + return -ENOMEM; + gf32 = p + 4; /* we want ->gf_group and ->gf_slist aligned */ + + err = -EFAULT; + if (copy_from_sockptr(gf32, optval, optlen)) + goto out_free_gsf; + + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + n = gf32->gf_numsrc; + err = -ENOBUFS; + if (n >= 0x1ffffff) + goto out_free_gsf; + + err = -EINVAL; + if (offsetof(struct compat_group_filter, gf_slist[n]) > optlen) + goto out_free_gsf; + + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + err = -ENOBUFS; + if (n > sock_net(sk)->ipv4.sysctl_igmp_max_msf) + goto out_free_gsf; + err = set_mcast_msfilter(sk, gf32->gf_interface, n, gf32->gf_fmode, + &gf32->gf_group, gf32->gf_slist); +out_free_gsf: + kfree(p); + return err; } -static int do_ip_setsockopt(struct sock *sk, int level, - int optname, char __user *optval, unsigned int optlen) +static int ip_mcast_join_leave(struct sock *sk, int optname, + sockptr_t optval, int optlen) +{ + struct ip_mreqn mreq = { }; + struct sockaddr_in *psin; + struct group_req greq; + + if (optlen < sizeof(struct group_req)) + return -EINVAL; + if (copy_from_sockptr(&greq, optval, sizeof(greq))) + return -EFAULT; + + psin = (struct sockaddr_in *)&greq.gr_group; + if (psin->sin_family != AF_INET) + return -EINVAL; + mreq.imr_multiaddr = psin->sin_addr; + mreq.imr_ifindex = greq.gr_interface; + if (optname == MCAST_JOIN_GROUP) + return ip_mc_join_group(sk, &mreq); + return ip_mc_leave_group(sk, &mreq); +} + +static int compat_ip_mcast_join_leave(struct sock *sk, int optname, + sockptr_t optval, int optlen) +{ + struct compat_group_req greq; + struct ip_mreqn mreq = { }; + struct sockaddr_in *psin; + + if (optlen < sizeof(struct compat_group_req)) + return -EINVAL; + if (copy_from_sockptr(&greq, optval, sizeof(greq))) + return -EFAULT; + + psin = (struct sockaddr_in *)&greq.gr_group; + if (psin->sin_family != AF_INET) + return -EINVAL; + mreq.imr_multiaddr = psin->sin_addr; + mreq.imr_ifindex = greq.gr_interface; + + if (optname == MCAST_JOIN_GROUP) + return ip_mc_join_group(sk, &mreq); + return ip_mc_leave_group(sk, &mreq); +} + +static int do_ip_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); @@ -755,13 +919,14 @@ static int do_ip_setsockopt(struct sock *sk, int level, case IP_RECVORIGDSTADDR: case IP_CHECKSUM: case IP_RECVFRAGSIZE: + case IP_RECVERR_RFC4884: if (optlen >= sizeof(int)) { - if (get_user(val, (int __user *) optval)) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; } else if (optlen >= sizeof(char)) { unsigned char ucval; - if (get_user(ucval, (unsigned char __user *) optval)) + if (copy_from_sockptr(&ucval, optval, sizeof(ucval))) return -EFAULT; val = (int) ucval; } @@ -786,8 +951,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, if (optlen > 40) goto e_inval; - err = ip_options_get_from_user(sock_net(sk), &opt, - optval, optlen); + err = ip_options_get(sock_net(sk), &opt, optval, optlen); if (err) break; old = rcu_dereference_protected(inet->inet_opt, @@ -914,6 +1078,11 @@ static int do_ip_setsockopt(struct sock *sk, int level, if (!val) skb_queue_purge(&sk->sk_error_queue); break; + case IP_RECVERR_RFC4884: + if (val < 0 || val > 1) + goto e_inval; + inet->recverr_rfc4884 = !!val; + break; case IP_MULTICAST_TTL: if (sk->sk_type == SOCK_STREAM) goto e_inval; @@ -980,17 +1149,17 @@ static int do_ip_setsockopt(struct sock *sk, int level, err = -EFAULT; if (optlen >= sizeof(struct ip_mreqn)) { - if (copy_from_user(&mreq, optval, sizeof(mreq))) + if (copy_from_sockptr(&mreq, optval, sizeof(mreq))) break; } else { memset(&mreq, 0, sizeof(mreq)); if (optlen >= sizeof(struct ip_mreq)) { - if (copy_from_user(&mreq, optval, - sizeof(struct ip_mreq))) + if (copy_from_sockptr(&mreq, optval, + sizeof(struct ip_mreq))) break; } else if (optlen >= sizeof(struct in_addr)) { - if (copy_from_user(&mreq.imr_address, optval, - sizeof(struct in_addr))) + if (copy_from_sockptr(&mreq.imr_address, optval, + sizeof(struct in_addr))) break; } } @@ -1042,11 +1211,12 @@ static int do_ip_setsockopt(struct sock *sk, int level, goto e_inval; err = -EFAULT; if (optlen >= sizeof(struct ip_mreqn)) { - if (copy_from_user(&mreq, optval, sizeof(mreq))) + if (copy_from_sockptr(&mreq, optval, sizeof(mreq))) break; } else { memset(&mreq, 0, sizeof(mreq)); - if (copy_from_user(&mreq, optval, sizeof(struct ip_mreq))) + if (copy_from_sockptr(&mreq, optval, + sizeof(struct ip_mreq))) break; } @@ -1066,7 +1236,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, err = -ENOBUFS; break; } - msf = memdup_user(optval, optlen); + msf = memdup_sockptr(optval, optlen); if (IS_ERR(msf)) { err = PTR_ERR(msf); break; @@ -1097,7 +1267,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, if (optlen != sizeof(struct ip_mreq_source)) goto e_inval; - if (copy_from_user(&mreqs, optval, sizeof(mreqs))) { + if (copy_from_sockptr(&mreqs, optval, sizeof(mreqs))) { err = -EFAULT; break; } @@ -1127,77 +1297,24 @@ static int do_ip_setsockopt(struct sock *sk, int level, } case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: - { - struct group_req greq; - struct sockaddr_in *psin; - struct ip_mreqn mreq; - - if (optlen < sizeof(struct group_req)) - goto e_inval; - err = -EFAULT; - if (copy_from_user(&greq, optval, sizeof(greq))) - break; - psin = (struct sockaddr_in *)&greq.gr_group; - if (psin->sin_family != AF_INET) - goto e_inval; - memset(&mreq, 0, sizeof(mreq)); - mreq.imr_multiaddr = psin->sin_addr; - mreq.imr_ifindex = greq.gr_interface; - - if (optname == MCAST_JOIN_GROUP) - err = ip_mc_join_group(sk, &mreq); + if (in_compat_syscall()) + err = compat_ip_mcast_join_leave(sk, optname, optval, + optlen); else - err = ip_mc_leave_group(sk, &mreq); + err = ip_mcast_join_leave(sk, optname, optval, optlen); break; - } case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: - { - struct group_source_req greqs; - - if (optlen != sizeof(struct group_source_req)) - goto e_inval; - if (copy_from_user(&greqs, optval, sizeof(greqs))) { - err = -EFAULT; - break; - } - err = do_mcast_group_source(sk, optname, &greqs); + err = do_mcast_group_source(sk, optname, optval, optlen); break; - } case MCAST_MSFILTER: - { - struct group_filter *gsf = NULL; - - if (optlen < GROUP_FILTER_SIZE(0)) - goto e_inval; - if (optlen > sysctl_optmem_max) { - err = -ENOBUFS; - break; - } - gsf = memdup_user(optval, optlen); - if (IS_ERR(gsf)) { - err = PTR_ERR(gsf); - break; - } - /* numsrc >= (4G-140)/128 overflow in 32 bits */ - if (gsf->gf_numsrc >= 0x1ffffff || - gsf->gf_numsrc > net->ipv4.sysctl_igmp_max_msf) { - err = -ENOBUFS; - goto mc_msf_out; - } - if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) { - err = -EINVAL; - goto mc_msf_out; - } - err = set_mcast_msfilter(sk, gsf->gf_interface, - gsf->gf_numsrc, gsf->gf_fmode, - &gsf->gf_group, gsf->gf_slist); -mc_msf_out: - kfree(gsf); + if (in_compat_syscall()) + err = compat_ip_set_mcast_msfilter(sk, optval, optlen); + else + err = ip_set_mcast_msfilter(sk, optval, optlen); break; - } case IP_MULTICAST_ALL: if (optlen < 1) goto e_inval; @@ -1296,8 +1413,8 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) skb_dst_drop(skb); } -int ip_setsockopt(struct sock *sk, int level, - int optname, char __user *optval, unsigned int optlen) +int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen) { int err; @@ -1322,138 +1439,6 @@ int ip_setsockopt(struct sock *sk, int level, } EXPORT_SYMBOL(ip_setsockopt); -#ifdef CONFIG_COMPAT -int compat_ip_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) -{ - int err; - - if (level != SOL_IP) - return -ENOPROTOOPT; - - switch (optname) { - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - { - struct compat_group_req __user *gr32 = (void __user *)optval; - struct group_req greq; - struct sockaddr_in *psin = (struct sockaddr_in *)&greq.gr_group; - struct ip_mreqn mreq; - - if (optlen < sizeof(struct compat_group_req)) - return -EINVAL; - - if (get_user(greq.gr_interface, &gr32->gr_interface) || - copy_from_user(&greq.gr_group, &gr32->gr_group, - sizeof(greq.gr_group))) - return -EFAULT; - - if (psin->sin_family != AF_INET) - return -EINVAL; - - memset(&mreq, 0, sizeof(mreq)); - mreq.imr_multiaddr = psin->sin_addr; - mreq.imr_ifindex = greq.gr_interface; - - rtnl_lock(); - lock_sock(sk); - if (optname == MCAST_JOIN_GROUP) - err = ip_mc_join_group(sk, &mreq); - else - err = ip_mc_leave_group(sk, &mreq); - release_sock(sk); - rtnl_unlock(); - return err; - } - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - { - struct compat_group_source_req __user *gsr32 = (void __user *)optval; - struct group_source_req greqs; - - if (optlen != sizeof(struct compat_group_source_req)) - return -EINVAL; - - if (get_user(greqs.gsr_interface, &gsr32->gsr_interface) || - copy_from_user(&greqs.gsr_group, &gsr32->gsr_group, - sizeof(greqs.gsr_group)) || - copy_from_user(&greqs.gsr_source, &gsr32->gsr_source, - sizeof(greqs.gsr_source))) - return -EFAULT; - - rtnl_lock(); - lock_sock(sk); - err = do_mcast_group_source(sk, optname, &greqs); - release_sock(sk); - rtnl_unlock(); - return err; - } - case MCAST_MSFILTER: - { - const int size0 = offsetof(struct compat_group_filter, gf_slist); - struct compat_group_filter *gf32; - unsigned int n; - void *p; - - if (optlen < size0) - return -EINVAL; - if (optlen > sysctl_optmem_max - 4) - return -ENOBUFS; - - p = kmalloc(optlen + 4, GFP_KERNEL); - if (!p) - return -ENOMEM; - gf32 = p + 4; /* we want ->gf_group and ->gf_slist aligned */ - if (copy_from_user(gf32, optval, optlen)) { - err = -EFAULT; - goto mc_msf_out; - } - - n = gf32->gf_numsrc; - /* numsrc >= (4G-140)/128 overflow in 32 bits */ - if (n >= 0x1ffffff) { - err = -ENOBUFS; - goto mc_msf_out; - } - if (offsetof(struct compat_group_filter, gf_slist[n]) > optlen) { - err = -EINVAL; - goto mc_msf_out; - } - - rtnl_lock(); - lock_sock(sk); - /* numsrc >= (4G-140)/128 overflow in 32 bits */ - if (n > sock_net(sk)->ipv4.sysctl_igmp_max_msf) - err = -ENOBUFS; - else - err = set_mcast_msfilter(sk, gf32->gf_interface, - n, gf32->gf_fmode, - &gf32->gf_group, gf32->gf_slist); - release_sock(sk); - rtnl_unlock(); -mc_msf_out: - kfree(p); - return err; - } - } - - err = do_ip_setsockopt(sk, level, optname, optval, optlen); -#ifdef CONFIG_NETFILTER - /* we need to exclude all possible ENOPROTOOPTs except default case */ - if (err == -ENOPROTOOPT && optname != IP_HDRINCL && - optname != IP_IPSEC_POLICY && - optname != IP_XFRM_POLICY && - !ip_mroute_opt(optname)) - err = compat_nf_setsockopt(sk, PF_INET, optname, optval, - optlen); -#endif - return err; -} -EXPORT_SYMBOL(compat_ip_setsockopt); -#endif - /* * Get the options. Note for future reference. The GET of IP options gets * the _received_ ones. The set sets the _sent_ ones. @@ -1469,8 +1454,67 @@ static bool getsockopt_needs_rtnl(int optname) return false; } +static int ip_get_mcast_msfilter(struct sock *sk, void __user *optval, + int __user *optlen, int len) +{ + const int size0 = offsetof(struct group_filter, gf_slist); + struct group_filter __user *p = optval; + struct group_filter gsf; + int num; + int err; + + if (len < size0) + return -EINVAL; + if (copy_from_user(&gsf, p, size0)) + return -EFAULT; + + num = gsf.gf_numsrc; + err = ip_mc_gsfget(sk, &gsf, p->gf_slist); + if (err) + return err; + if (gsf.gf_numsrc < num) + num = gsf.gf_numsrc; + if (put_user(GROUP_FILTER_SIZE(num), optlen) || + copy_to_user(p, &gsf, size0)) + return -EFAULT; + return 0; +} + +static int compat_ip_get_mcast_msfilter(struct sock *sk, void __user *optval, + int __user *optlen, int len) +{ + const int size0 = offsetof(struct compat_group_filter, gf_slist); + struct compat_group_filter __user *p = optval; + struct compat_group_filter gf32; + struct group_filter gf; + int num; + int err; + + if (len < size0) + return -EINVAL; + if (copy_from_user(&gf32, p, size0)) + return -EFAULT; + + gf.gf_interface = gf32.gf_interface; + gf.gf_fmode = gf32.gf_fmode; + num = gf.gf_numsrc = gf32.gf_numsrc; + gf.gf_group = gf32.gf_group; + + err = ip_mc_gsfget(sk, &gf, p->gf_slist); + if (err) + return err; + if (gf.gf_numsrc < num) + num = gf.gf_numsrc; + len = GROUP_FILTER_SIZE(num) - (sizeof(gf) - sizeof(gf32)); + if (put_user(len, optlen) || + put_user(gf.gf_fmode, &p->gf_fmode) || + put_user(gf.gf_numsrc, &p->gf_numsrc)) + return -EFAULT; + return 0; +} + static int do_ip_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen, unsigned int flags) + char __user *optval, int __user *optlen) { struct inet_sock *inet = inet_sk(sk); bool needs_rtnl = getsockopt_needs_rtnl(optname); @@ -1588,6 +1632,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_RECVERR: val = inet->recverr; break; + case IP_RECVERR_RFC4884: + val = inet->recverr_rfc4884; + break; case IP_MULTICAST_TTL: val = inet->mc_ttl; break; @@ -1627,31 +1674,12 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, goto out; } case MCAST_MSFILTER: - { - struct group_filter __user *p = (void __user *)optval; - struct group_filter gsf; - const int size0 = offsetof(struct group_filter, gf_slist); - int num; - - if (len < size0) { - err = -EINVAL; - goto out; - } - if (copy_from_user(&gsf, p, size0)) { - err = -EFAULT; - goto out; - } - num = gsf.gf_numsrc; - err = ip_mc_gsfget(sk, &gsf, p->gf_slist); - if (err) - goto out; - if (gsf.gf_numsrc < num) - num = gsf.gf_numsrc; - if (put_user(GROUP_FILTER_SIZE(num), optlen) || - copy_to_user(p, &gsf, size0)) - err = -EFAULT; + if (in_compat_syscall()) + err = compat_ip_get_mcast_msfilter(sk, optval, optlen, + len); + else + err = ip_get_mcast_msfilter(sk, optval, optlen, len); goto out; - } case IP_MULTICAST_ALL: val = inet->mc_all; break; @@ -1667,7 +1695,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, msg.msg_control_is_user = true; msg.msg_control_user = optval; msg.msg_controllen = len; - msg.msg_flags = flags; + msg.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0; if (inet->cmsg_flags & IP_CMSG_PKTINFO) { struct in_pktinfo info; @@ -1731,7 +1759,8 @@ int ip_getsockopt(struct sock *sk, int level, { int err; - err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0); + err = do_ip_getsockopt(sk, level, optname, optval, optlen); + #if IS_ENABLED(CONFIG_BPFILTER_UMH) if (optname >= BPFILTER_IPT_SO_GET_INFO && optname < BPFILTER_IPT_GET_MAX) @@ -1755,79 +1784,3 @@ int ip_getsockopt(struct sock *sk, int level, return err; } EXPORT_SYMBOL(ip_getsockopt); - -#ifdef CONFIG_COMPAT -int compat_ip_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - int err; - - if (optname == MCAST_MSFILTER) { - const int size0 = offsetof(struct compat_group_filter, gf_slist); - struct compat_group_filter __user *p = (void __user *)optval; - struct compat_group_filter gf32; - struct group_filter gf; - int ulen, err; - int num; - - if (level != SOL_IP) - return -EOPNOTSUPP; - - if (get_user(ulen, optlen)) - return -EFAULT; - - if (ulen < size0) - return -EINVAL; - - if (copy_from_user(&gf32, p, size0)) - return -EFAULT; - - gf.gf_interface = gf32.gf_interface; - gf.gf_fmode = gf32.gf_fmode; - num = gf.gf_numsrc = gf32.gf_numsrc; - gf.gf_group = gf32.gf_group; - - rtnl_lock(); - lock_sock(sk); - err = ip_mc_gsfget(sk, &gf, p->gf_slist); - release_sock(sk); - rtnl_unlock(); - if (err) - return err; - if (gf.gf_numsrc < num) - num = gf.gf_numsrc; - ulen = GROUP_FILTER_SIZE(num) - (sizeof(gf) - sizeof(gf32)); - if (put_user(ulen, optlen) || - put_user(gf.gf_fmode, &p->gf_fmode) || - put_user(gf.gf_numsrc, &p->gf_numsrc)) - return -EFAULT; - return 0; - } - - err = do_ip_getsockopt(sk, level, optname, optval, optlen, - MSG_CMSG_COMPAT); - -#if IS_ENABLED(CONFIG_BPFILTER_UMH) - if (optname >= BPFILTER_IPT_SO_GET_INFO && - optname < BPFILTER_IPT_GET_MAX) - err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen); -#endif -#ifdef CONFIG_NETFILTER - /* we need to exclude all possible ENOPROTOOPTs except default case */ - if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && - !ip_mroute_opt(optname)) { - int len; - - if (get_user(len, optlen)) - return -EFAULT; - - err = compat_nf_getsockopt(sk, PF_INET, optname, optval, &len); - if (err >= 0) - err = put_user(len, optlen); - return err; - } -#endif - return err; -} -EXPORT_SYMBOL(compat_ip_getsockopt); -#endif diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index f8b419e2475c..75c6013ff9a4 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -25,6 +25,7 @@ #include <net/protocol.h> #include <net/ip_tunnels.h> #include <net/ip6_tunnel.h> +#include <net/ip6_checksum.h> #include <net/arp.h> #include <net/checksum.h> #include <net/dsfield.h> @@ -184,6 +185,250 @@ int iptunnel_handle_offloads(struct sk_buff *skb, } EXPORT_SYMBOL_GPL(iptunnel_handle_offloads); +/** + * iptunnel_pmtud_build_icmp() - Build ICMP error message for PMTUD + * @skb: Original packet with L2 header + * @mtu: MTU value for ICMP error + * + * Return: length on success, negative error code if message couldn't be built. + */ +static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu) +{ + const struct iphdr *iph = ip_hdr(skb); + struct icmphdr *icmph; + struct iphdr *niph; + struct ethhdr eh; + int len, err; + + if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr))) + return -EINVAL; + + skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN); + pskb_pull(skb, ETH_HLEN); + skb_reset_network_header(skb); + + err = pskb_trim(skb, 576 - sizeof(*niph) - sizeof(*icmph)); + if (err) + return err; + + len = skb->len + sizeof(*icmph); + err = skb_cow(skb, sizeof(*niph) + sizeof(*icmph) + ETH_HLEN); + if (err) + return err; + + icmph = skb_push(skb, sizeof(*icmph)); + *icmph = (struct icmphdr) { + .type = ICMP_DEST_UNREACH, + .code = ICMP_FRAG_NEEDED, + .checksum = 0, + .un.frag.__unused = 0, + .un.frag.mtu = ntohs(mtu), + }; + icmph->checksum = ip_compute_csum(icmph, len); + skb_reset_transport_header(skb); + + niph = skb_push(skb, sizeof(*niph)); + *niph = (struct iphdr) { + .ihl = sizeof(*niph) / 4u, + .version = 4, + .tos = 0, + .tot_len = htons(len + sizeof(*niph)), + .id = 0, + .frag_off = htons(IP_DF), + .ttl = iph->ttl, + .protocol = IPPROTO_ICMP, + .saddr = iph->daddr, + .daddr = iph->saddr, + }; + ip_send_check(niph); + skb_reset_network_header(skb); + + skb->ip_summed = CHECKSUM_NONE; + + eth_header(skb, skb->dev, htons(eh.h_proto), eh.h_source, eh.h_dest, 0); + skb_reset_mac_header(skb); + + return skb->len; +} + +/** + * iptunnel_pmtud_check_icmp() - Trigger ICMP reply if needed and allowed + * @skb: Buffer being sent by encapsulation, L2 headers expected + * @mtu: Network MTU for path + * + * Return: 0 for no ICMP reply, length if built, negative value on error. + */ +static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu) +{ + const struct icmphdr *icmph = icmp_hdr(skb); + const struct iphdr *iph = ip_hdr(skb); + + if (mtu <= 576 || iph->frag_off != htons(IP_DF)) + return 0; + + if (ipv4_is_lbcast(iph->daddr) || ipv4_is_multicast(iph->daddr) || + ipv4_is_zeronet(iph->saddr) || ipv4_is_loopback(iph->saddr) || + ipv4_is_lbcast(iph->saddr) || ipv4_is_multicast(iph->saddr)) + return 0; + + if (iph->protocol == IPPROTO_ICMP && icmp_is_err(icmph->type)) + return 0; + + return iptunnel_pmtud_build_icmp(skb, mtu); +} + +#if IS_ENABLED(CONFIG_IPV6) +/** + * iptunnel_pmtud_build_icmpv6() - Build ICMPv6 error message for PMTUD + * @skb: Original packet with L2 header + * @mtu: MTU value for ICMPv6 error + * + * Return: length on success, negative error code if message couldn't be built. + */ +static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct icmp6hdr *icmp6h; + struct ipv6hdr *nip6h; + struct ethhdr eh; + int len, err; + __wsum csum; + + if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr))) + return -EINVAL; + + skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN); + pskb_pull(skb, ETH_HLEN); + skb_reset_network_header(skb); + + err = pskb_trim(skb, IPV6_MIN_MTU - sizeof(*nip6h) - sizeof(*icmp6h)); + if (err) + return err; + + len = skb->len + sizeof(*icmp6h); + err = skb_cow(skb, sizeof(*nip6h) + sizeof(*icmp6h) + ETH_HLEN); + if (err) + return err; + + icmp6h = skb_push(skb, sizeof(*icmp6h)); + *icmp6h = (struct icmp6hdr) { + .icmp6_type = ICMPV6_PKT_TOOBIG, + .icmp6_code = 0, + .icmp6_cksum = 0, + .icmp6_mtu = htonl(mtu), + }; + skb_reset_transport_header(skb); + + nip6h = skb_push(skb, sizeof(*nip6h)); + *nip6h = (struct ipv6hdr) { + .priority = 0, + .version = 6, + .flow_lbl = { 0 }, + .payload_len = htons(len), + .nexthdr = IPPROTO_ICMPV6, + .hop_limit = ip6h->hop_limit, + .saddr = ip6h->daddr, + .daddr = ip6h->saddr, + }; + skb_reset_network_header(skb); + + csum = csum_partial(icmp6h, len, 0); + icmp6h->icmp6_cksum = csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr, len, + IPPROTO_ICMPV6, csum); + + skb->ip_summed = CHECKSUM_NONE; + + eth_header(skb, skb->dev, htons(eh.h_proto), eh.h_source, eh.h_dest, 0); + skb_reset_mac_header(skb); + + return skb->len; +} + +/** + * iptunnel_pmtud_check_icmpv6() - Trigger ICMPv6 reply if needed and allowed + * @skb: Buffer being sent by encapsulation, L2 headers expected + * @mtu: Network MTU for path + * + * Return: 0 for no ICMPv6 reply, length if built, negative value on error. + */ +static int iptunnel_pmtud_check_icmpv6(struct sk_buff *skb, int mtu) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + int stype = ipv6_addr_type(&ip6h->saddr); + u8 proto = ip6h->nexthdr; + __be16 frag_off; + int offset; + + if (mtu <= IPV6_MIN_MTU) + return 0; + + if (stype == IPV6_ADDR_ANY || stype == IPV6_ADDR_MULTICAST || + stype == IPV6_ADDR_LOOPBACK) + return 0; + + offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &proto, + &frag_off); + if (offset < 0 || (frag_off & htons(~0x7))) + return 0; + + if (proto == IPPROTO_ICMPV6) { + struct icmp6hdr *icmp6h; + + if (!pskb_may_pull(skb, skb_network_header(skb) + + offset + 1 - skb->data)) + return 0; + + icmp6h = (struct icmp6hdr *)(skb_network_header(skb) + offset); + if (icmpv6_is_err(icmp6h->icmp6_type) || + icmp6h->icmp6_type == NDISC_REDIRECT) + return 0; + } + + return iptunnel_pmtud_build_icmpv6(skb, mtu); +} +#endif /* IS_ENABLED(CONFIG_IPV6) */ + +/** + * skb_tunnel_check_pmtu() - Check, update PMTU and trigger ICMP reply as needed + * @skb: Buffer being sent by encapsulation, L2 headers expected + * @encap_dst: Destination for tunnel encapsulation (outer IP) + * @headroom: Encapsulation header size, bytes + * @reply: Build matching ICMP or ICMPv6 message as a result + * + * L2 tunnel implementations that can carry IP and can be directly bridged + * (currently UDP tunnels) can't always rely on IP forwarding paths to handle + * PMTU discovery. In the bridged case, ICMP or ICMPv6 messages need to be built + * based on payload and sent back by the encapsulation itself. + * + * For routable interfaces, we just need to update the PMTU for the destination. + * + * Return: 0 if ICMP error not needed, length if built, negative value on error + */ +int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, + int headroom, bool reply) +{ + u32 mtu = dst_mtu(encap_dst) - headroom; + + if ((skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) || + (!skb_is_gso(skb) && (skb->len - skb_mac_header_len(skb)) <= mtu)) + return 0; + + skb_dst_update_pmtu_no_confirm(skb, mtu); + + if (!reply || skb->pkt_type == PACKET_HOST) + return 0; + + if (skb->protocol == htons(ETH_P_IP)) + return iptunnel_pmtud_check_icmp(skb, mtu); + +#if IS_ENABLED(CONFIG_IPV6) + if (skb->protocol == htons(ETH_P_IPV6)) + return iptunnel_pmtud_check_icmpv6(skb, mtu); +#endif + return 0; +} +EXPORT_SYMBOL(skb_tunnel_check_pmtu); + /* Often modified stats are per cpu, other are shared (netdev->stats) */ void ip_tunnel_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *tot) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 460ca1099e8a..49daaed89764 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -91,32 +91,6 @@ static int vti_rcv_proto(struct sk_buff *skb) return vti_rcv(skb, 0, false); } -static int vti_rcv_tunnel(struct sk_buff *skb) -{ - struct ip_tunnel_net *itn = net_generic(dev_net(skb->dev), vti_net_id); - const struct iphdr *iph = ip_hdr(skb); - struct ip_tunnel *tunnel; - - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, - iph->saddr, iph->daddr, 0); - if (tunnel) { - struct tnl_ptk_info tpi = { - .proto = htons(ETH_P_IP), - }; - - if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) - goto drop; - if (iptunnel_pull_header(skb, 0, tpi.proto, false)) - goto drop; - return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, false); - } - - return -EINVAL; -drop: - kfree_skb(skb); - return 0; -} - static int vti_rcv_cb(struct sk_buff *skb, int err) { unsigned short family; @@ -244,12 +218,15 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, } dst_hold(dst); - dst = xfrm_lookup(tunnel->net, dst, fl, NULL, 0); + dst = xfrm_lookup_route(tunnel->net, dst, fl, NULL, 0); if (IS_ERR(dst)) { dev->stats.tx_carrier_errors++; goto tx_error_icmp; } + if (dst->flags & DST_XFRM_QUEUE) + goto queued; + if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) { dev->stats.tx_carrier_errors++; dst_release(dst); @@ -281,6 +258,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } +queued: skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev))); skb_dst_set(skb, dst); skb->dev = skb_dst(skb)->dev; @@ -496,11 +474,29 @@ static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = { .priority = 100, }; -static struct xfrm_tunnel ipip_handler __read_mostly = { +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) +static int vti_rcv_tunnel(struct sk_buff *skb) +{ + XFRM_SPI_SKB_CB(skb)->family = AF_INET; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + + return vti_input(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr, 0, false); +} + +static struct xfrm_tunnel vti_ipip_handler __read_mostly = { + .handler = vti_rcv_tunnel, + .cb_handler = vti_rcv_cb, + .err_handler = vti4_err, + .priority = 0, +}; + +static struct xfrm_tunnel vti_ipip6_handler __read_mostly = { .handler = vti_rcv_tunnel, + .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 0, }; +#endif static int __net_init vti_init_net(struct net *net) { @@ -670,10 +666,17 @@ static int __init vti_init(void) if (err < 0) goto xfrm_proto_comp_failed; +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) msg = "ipip tunnel"; - err = xfrm4_tunnel_register(&ipip_handler, AF_INET); + err = xfrm4_tunnel_register(&vti_ipip_handler, AF_INET); + if (err < 0) + goto xfrm_tunnel_ipip_failed; +#if IS_ENABLED(CONFIG_IPV6) + err = xfrm4_tunnel_register(&vti_ipip6_handler, AF_INET6); if (err < 0) - goto xfrm_tunnel_failed; + goto xfrm_tunnel_ipip6_failed; +#endif +#endif msg = "netlink interface"; err = rtnl_link_register(&vti_link_ops); @@ -683,8 +686,14 @@ static int __init vti_init(void) return err; rtnl_link_failed: - xfrm4_tunnel_deregister(&ipip_handler, AF_INET); -xfrm_tunnel_failed: +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) +#if IS_ENABLED(CONFIG_IPV6) + xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6); +xfrm_tunnel_ipip6_failed: +#endif + xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET); +xfrm_tunnel_ipip_failed: +#endif xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); xfrm_proto_comp_failed: xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); @@ -700,7 +709,12 @@ pernet_dev_failed: static void __exit vti_fini(void) { rtnl_link_unregister(&vti_link_ops); - xfrm4_tunnel_deregister(&ipip_handler, AF_INET); +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) +#if IS_ENABLED(CONFIG_IPV6) + xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6); +#endif + xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET); +#endif xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 59bfa3825810..b42683212c65 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -72,6 +72,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) t->props.flags = x->props.flags; t->props.extra_flags = x->props.extra_flags; memcpy(&t->mark, &x->mark, sizeof(t->mark)); + t->if_id = x->if_id; if (xfrm_init_state(t)) goto error; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index f5c7a58844a4..876fd6ff1ff9 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -636,7 +636,10 @@ static int call_ipmr_mfc_entry_notifiers(struct net *net, /** * vif_delete - Delete a VIF entry + * @mrt: Table to delete from + * @vifi: VIF identifier to delete * @notify: Set to 1, if the caller is a notifier_call + * @head: if unregistering the VIF, place it on this queue */ static int vif_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head) @@ -1338,7 +1341,7 @@ static void mrtsock_destruct(struct sock *sk) * MOSPF/PIM router set up we can clean this up. */ -int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, +int ip_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { struct net *net = sock_net(sk); @@ -1410,7 +1413,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, ret = -EINVAL; break; } - if (copy_from_user(&vif, optval, sizeof(vif))) { + if (copy_from_sockptr(&vif, optval, sizeof(vif))) { ret = -EFAULT; break; } @@ -1438,7 +1441,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, ret = -EINVAL; break; } - if (copy_from_user(&mfc, optval, sizeof(mfc))) { + if (copy_from_sockptr(&mfc, optval, sizeof(mfc))) { ret = -EFAULT; break; } @@ -1456,7 +1459,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, ret = -EINVAL; break; } - if (get_user(val, (int __user *)optval)) { + if (copy_from_sockptr(&val, optval, sizeof(val))) { ret = -EFAULT; break; } @@ -1468,7 +1471,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, ret = -EINVAL; break; } - if (get_user(val, (int __user *)optval)) { + if (copy_from_sockptr(&val, optval, sizeof(val))) { ret = -EFAULT; break; } @@ -1483,7 +1486,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, ret = -EINVAL; break; } - if (get_user(val, (int __user *)optval)) { + if (copy_from_sockptr(&val, optval, sizeof(val))) { ret = -EFAULT; break; } @@ -1505,7 +1508,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, ret = -EINVAL; break; } - if (get_user(uval, (u32 __user *)optval)) { + if (copy_from_sockptr(&uval, optval, sizeof(uval))) { ret = -EFAULT; break; } diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index b167f4a5b684..d1e04d2b5170 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -787,8 +787,7 @@ static int compat_table_info(const struct xt_table_info *info, } #endif -static int get_info(struct net *net, void __user *user, - const int *len, int compat) +static int get_info(struct net *net, void __user *user, const int *len) { char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; @@ -802,7 +801,7 @@ static int get_info(struct net *net, void __user *user, name[XT_TABLE_MAXNAMELEN-1] = '\0'; #ifdef CONFIG_COMPAT - if (compat) + if (in_compat_syscall()) xt_compat_lock(NFPROTO_ARP); #endif t = xt_request_find_table_lock(net, NFPROTO_ARP, name); @@ -812,7 +811,7 @@ static int get_info(struct net *net, void __user *user, #ifdef CONFIG_COMPAT struct xt_table_info tmp; - if (compat) { + if (in_compat_syscall()) { ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(NFPROTO_ARP); private = &tmp; @@ -837,7 +836,7 @@ static int get_info(struct net *net, void __user *user, } else ret = PTR_ERR(t); #ifdef CONFIG_COMPAT - if (compat) + if (in_compat_syscall()) xt_compat_unlock(NFPROTO_ARP); #endif return ret; @@ -948,8 +947,7 @@ static int __do_replace(struct net *net, const char *name, return ret; } -static int do_replace(struct net *net, const void __user *user, - unsigned int len) +static int do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct arpt_replace tmp; @@ -957,7 +955,7 @@ static int do_replace(struct net *net, const void __user *user, void *loc_cpu_entry; struct arpt_entry *iter; - if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ @@ -973,8 +971,8 @@ static int do_replace(struct net *net, const void __user *user, return -ENOMEM; loc_cpu_entry = newinfo->entries; - if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), - tmp.size) != 0) { + if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), + tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } @@ -997,8 +995,7 @@ static int do_replace(struct net *net, const void __user *user, return ret; } -static int do_add_counters(struct net *net, const void __user *user, - unsigned int len, int compat) +static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len) { unsigned int i; struct xt_counters_info tmp; @@ -1009,7 +1006,7 @@ static int do_add_counters(struct net *net, const void __user *user, struct arpt_entry *iter; unsigned int addend; - paddc = xt_copy_counters_from_user(user, len, &tmp, compat); + paddc = xt_copy_counters(arg, len, &tmp); if (IS_ERR(paddc)) return PTR_ERR(paddc); @@ -1246,8 +1243,7 @@ out_unlock: return ret; } -static int compat_do_replace(struct net *net, void __user *user, - unsigned int len) +static int compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct compat_arpt_replace tmp; @@ -1255,7 +1251,7 @@ static int compat_do_replace(struct net *net, void __user *user, void *loc_cpu_entry; struct arpt_entry *iter; - if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ @@ -1271,7 +1267,8 @@ static int compat_do_replace(struct net *net, void __user *user, return -ENOMEM; loc_cpu_entry = newinfo->entries; - if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { + if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), + tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } @@ -1294,30 +1291,6 @@ static int compat_do_replace(struct net *net, void __user *user, return ret; } -static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, - unsigned int len) -{ - int ret; - - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - switch (cmd) { - case ARPT_SO_SET_REPLACE: - ret = compat_do_replace(sock_net(sk), user, len); - break; - - case ARPT_SO_SET_ADD_COUNTERS: - ret = do_add_counters(sock_net(sk), user, len, 1); - break; - - default: - ret = -EINVAL; - } - - return ret; -} - static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr, compat_uint_t *size, struct xt_counters *counters, @@ -1425,32 +1398,10 @@ static int compat_get_entries(struct net *net, xt_compat_unlock(NFPROTO_ARP); return ret; } - -static int do_arpt_get_ctl(struct sock *, int, void __user *, int *); - -static int compat_do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, - int *len) -{ - int ret; - - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - switch (cmd) { - case ARPT_SO_GET_INFO: - ret = get_info(sock_net(sk), user, len, 1); - break; - case ARPT_SO_GET_ENTRIES: - ret = compat_get_entries(sock_net(sk), user, len); - break; - default: - ret = do_arpt_get_ctl(sk, cmd, user, len); - } - return ret; -} #endif -static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +static int do_arpt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, + unsigned int len) { int ret; @@ -1459,11 +1410,16 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned switch (cmd) { case ARPT_SO_SET_REPLACE: - ret = do_replace(sock_net(sk), user, len); +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = compat_do_replace(sock_net(sk), arg, len); + else +#endif + ret = do_replace(sock_net(sk), arg, len); break; case ARPT_SO_SET_ADD_COUNTERS: - ret = do_add_counters(sock_net(sk), user, len, 0); + ret = do_add_counters(sock_net(sk), arg, len); break; default: @@ -1482,11 +1438,16 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len switch (cmd) { case ARPT_SO_GET_INFO: - ret = get_info(sock_net(sk), user, len, 0); + ret = get_info(sock_net(sk), user, len); break; case ARPT_SO_GET_ENTRIES: - ret = get_entries(sock_net(sk), user, len); +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = compat_get_entries(sock_net(sk), user, len); + else +#endif + ret = get_entries(sock_net(sk), user, len); break; case ARPT_SO_GET_REVISION_TARGET: { @@ -1610,15 +1571,9 @@ static struct nf_sockopt_ops arpt_sockopts = { .set_optmin = ARPT_BASE_CTL, .set_optmax = ARPT_SO_SET_MAX+1, .set = do_arpt_set_ctl, -#ifdef CONFIG_COMPAT - .compat_set = compat_do_arpt_set_ctl, -#endif .get_optmin = ARPT_BASE_CTL, .get_optmax = ARPT_SO_GET_MAX+1, .get = do_arpt_get_ctl, -#ifdef CONFIG_COMPAT - .compat_get = compat_do_arpt_get_ctl, -#endif .owner = THIS_MODULE, }; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 5bf9fa06aee0..f15bc21d7301 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -944,8 +944,7 @@ static int compat_table_info(const struct xt_table_info *info, } #endif -static int get_info(struct net *net, void __user *user, - const int *len, int compat) +static int get_info(struct net *net, void __user *user, const int *len) { char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; @@ -959,7 +958,7 @@ static int get_info(struct net *net, void __user *user, name[XT_TABLE_MAXNAMELEN-1] = '\0'; #ifdef CONFIG_COMPAT - if (compat) + if (in_compat_syscall()) xt_compat_lock(AF_INET); #endif t = xt_request_find_table_lock(net, AF_INET, name); @@ -969,7 +968,7 @@ static int get_info(struct net *net, void __user *user, #ifdef CONFIG_COMPAT struct xt_table_info tmp; - if (compat) { + if (in_compat_syscall()) { ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(AF_INET); private = &tmp; @@ -995,7 +994,7 @@ static int get_info(struct net *net, void __user *user, } else ret = PTR_ERR(t); #ifdef CONFIG_COMPAT - if (compat) + if (in_compat_syscall()) xt_compat_unlock(AF_INET); #endif return ret; @@ -1103,7 +1102,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, } static int -do_replace(struct net *net, const void __user *user, unsigned int len) +do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct ipt_replace tmp; @@ -1111,7 +1110,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len) void *loc_cpu_entry; struct ipt_entry *iter; - if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ @@ -1127,8 +1126,8 @@ do_replace(struct net *net, const void __user *user, unsigned int len) return -ENOMEM; loc_cpu_entry = newinfo->entries; - if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), - tmp.size) != 0) { + if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), + tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } @@ -1152,8 +1151,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len) } static int -do_add_counters(struct net *net, const void __user *user, - unsigned int len, int compat) +do_add_counters(struct net *net, sockptr_t arg, unsigned int len) { unsigned int i; struct xt_counters_info tmp; @@ -1164,7 +1162,7 @@ do_add_counters(struct net *net, const void __user *user, struct ipt_entry *iter; unsigned int addend; - paddc = xt_copy_counters_from_user(user, len, &tmp, compat); + paddc = xt_copy_counters(arg, len, &tmp); if (IS_ERR(paddc)) return PTR_ERR(paddc); @@ -1486,7 +1484,7 @@ out_unlock: } static int -compat_do_replace(struct net *net, void __user *user, unsigned int len) +compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct compat_ipt_replace tmp; @@ -1494,7 +1492,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) void *loc_cpu_entry; struct ipt_entry *iter; - if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ @@ -1510,8 +1508,8 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return -ENOMEM; loc_cpu_entry = newinfo->entries; - if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), - tmp.size) != 0) { + if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), + tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } @@ -1534,31 +1532,6 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return ret; } -static int -compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, - unsigned int len) -{ - int ret; - - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - switch (cmd) { - case IPT_SO_SET_REPLACE: - ret = compat_do_replace(sock_net(sk), user, len); - break; - - case IPT_SO_SET_ADD_COUNTERS: - ret = do_add_counters(sock_net(sk), user, len, 1); - break; - - default: - ret = -EINVAL; - } - - return ret; -} - struct compat_ipt_get_entries { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; @@ -1634,33 +1607,10 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, xt_compat_unlock(AF_INET); return ret; } - -static int do_ipt_get_ctl(struct sock *, int, void __user *, int *); - -static int -compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) -{ - int ret; - - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - switch (cmd) { - case IPT_SO_GET_INFO: - ret = get_info(sock_net(sk), user, len, 1); - break; - case IPT_SO_GET_ENTRIES: - ret = compat_get_entries(sock_net(sk), user, len); - break; - default: - ret = do_ipt_get_ctl(sk, cmd, user, len); - } - return ret; -} #endif static int -do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +do_ipt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len) { int ret; @@ -1669,11 +1619,16 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) switch (cmd) { case IPT_SO_SET_REPLACE: - ret = do_replace(sock_net(sk), user, len); +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = compat_do_replace(sock_net(sk), arg, len); + else +#endif + ret = do_replace(sock_net(sk), arg, len); break; case IPT_SO_SET_ADD_COUNTERS: - ret = do_add_counters(sock_net(sk), user, len, 0); + ret = do_add_counters(sock_net(sk), arg, len); break; default: @@ -1693,11 +1648,16 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) switch (cmd) { case IPT_SO_GET_INFO: - ret = get_info(sock_net(sk), user, len, 0); + ret = get_info(sock_net(sk), user, len); break; case IPT_SO_GET_ENTRIES: - ret = get_entries(sock_net(sk), user, len); +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = compat_get_entries(sock_net(sk), user, len); + else +#endif + ret = get_entries(sock_net(sk), user, len); break; case IPT_SO_GET_REVISION_MATCH: @@ -1886,15 +1846,9 @@ static struct nf_sockopt_ops ipt_sockopts = { .set_optmin = IPT_BASE_CTL, .set_optmax = IPT_SO_SET_MAX+1, .set = do_ipt_set_ctl, -#ifdef CONFIG_COMPAT - .compat_set = compat_do_ipt_set_ctl, -#endif .get_optmin = IPT_BASE_CTL, .get_optmax = IPT_SO_GET_MAX+1, .get = do_ipt_get_ctl, -#ifdef CONFIG_COMPAT - .compat_get = compat_do_ipt_get_ctl, -#endif .owner = THIS_MODULE, }; diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index f8755a4ae9d4..a8b980ad11d4 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -3,7 +3,7 @@ * (C) 2003-2004 by Harald Welte <laforge@netfilter.org> * based on ideas of Fabio Olive Leite <olive@unixforge.org> * - * Development of this code funded by SuSE Linux AG, http://www.suse.com/ + * Development of this code funded by SuSE Linux AG, https://www.suse.com/ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 2361fdac2c43..9dcfa4e461b6 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -96,6 +96,21 @@ void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, } EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put); +static int nf_reject_fill_skb_dst(struct sk_buff *skb_in) +{ + struct dst_entry *dst = NULL; + struct flowi fl; + + memset(&fl, 0, sizeof(struct flowi)); + fl.u.ip4.daddr = ip_hdr(skb_in)->saddr; + nf_ip_route(dev_net(skb_in->dev), &dst, &fl, false); + if (!dst) + return -1; + + skb_dst_set(skb_in, dst); + return 0; +} + /* Send RST reply */ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) { @@ -109,6 +124,9 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) if (!oth) return; + if (hook == NF_INET_PRE_ROUTING && nf_reject_fill_skb_dst(oldskb)) + return; + if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) return; @@ -175,6 +193,9 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook) if (iph->frag_off & htons(IP_OFFSET)) return; + if (hook == NF_INET_PRE_ROUTING && nf_reject_fill_skb_dst(skb_in)) + return; + if (skb_csum_unnecessary(skb_in) || !nf_reject_verify_csum(proto)) { icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); return; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 75545a829a2b..1074df726ec0 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -292,6 +292,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY), SNMP_MIB_ITEM("TcpTimeoutRehash", LINUX_MIB_TCPTIMEOUTREHASH), SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH), + SNMP_MIB_ITEM("TCPDSACKRecvSegs", LINUX_MIB_TCPDSACKRECVSEGS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 47665919048f..6fd4330287c2 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -809,11 +809,11 @@ static int raw_sk_init(struct sock *sk) return 0; } -static int raw_seticmpfilter(struct sock *sk, char __user *optval, int optlen) +static int raw_seticmpfilter(struct sock *sk, sockptr_t optval, int optlen) { if (optlen > sizeof(struct icmp_filter)) optlen = sizeof(struct icmp_filter); - if (copy_from_user(&raw_sk(sk)->filter, optval, optlen)) + if (copy_from_sockptr(&raw_sk(sk)->filter, optval, optlen)) return -EFAULT; return 0; } @@ -838,7 +838,7 @@ out: return ret; } static int do_raw_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { if (optname == ICMP_FILTER) { if (inet_sk(sk)->inet_num != IPPROTO_ICMP) @@ -850,23 +850,13 @@ static int do_raw_setsockopt(struct sock *sk, int level, int optname, } static int raw_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { if (level != SOL_RAW) return ip_setsockopt(sk, level, optname, optval, optlen); return do_raw_setsockopt(sk, level, optname, optval, optlen); } -#ifdef CONFIG_COMPAT -static int compat_raw_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) -{ - if (level != SOL_RAW) - return compat_ip_setsockopt(sk, level, optname, optval, optlen); - return do_raw_setsockopt(sk, level, optname, optval, optlen); -} -#endif - static int do_raw_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -887,16 +877,6 @@ static int raw_getsockopt(struct sock *sk, int level, int optname, return do_raw_getsockopt(sk, level, optname, optval, optlen); } -#ifdef CONFIG_COMPAT -static int compat_raw_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - if (level != SOL_RAW) - return compat_ip_getsockopt(sk, level, optname, optval, optlen); - return do_raw_getsockopt(sk, level, optname, optval, optlen); -} -#endif - static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg) { switch (cmd) { @@ -980,8 +960,6 @@ struct proto raw_prot = { .usersize = sizeof_field(struct raw_sock, filter), .h.raw_hash = &raw_v4_hashinfo, #ifdef CONFIG_COMPAT - .compat_setsockopt = compat_raw_setsockopt, - .compat_getsockopt = compat_raw_getsockopt, .compat_ioctl = compat_raw_ioctl, #endif .diag_destroy = raw_abort, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a01efa062f6b..8ca6bcab7b03 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1050,6 +1050,11 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct flowi4 fl4; ip_rt_build_flow_key(&fl4, sk, skb); + + /* Don't make lookup fail for bridged encapsulations */ + if (skb && netif_is_any_bridge_port(skb->dev)) + fl4.flowi4_oif = 0; + __ip_rt_update_pmtu(rt, &fl4, mtu); } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 9a4f6b16c9bc..f0794f0232ba 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -212,6 +212,12 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, refcount_set(&req->rsk_refcnt, 1); tcp_sk(child)->tsoffset = tsoff; sock_rps_save_rxhash(child, skb); + + if (rsk_drop_req(req)) { + refcount_set(&req->rsk_refcnt, 2); + return child; + } + if (inet_csk_reqsk_queue_add(sk, req, child)) return child; @@ -276,6 +282,40 @@ bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt, } EXPORT_SYMBOL(cookie_ecn_ok); +struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, + struct sock *sk, + struct sk_buff *skb) +{ + struct request_sock *req; + +#ifdef CONFIG_MPTCP + struct tcp_request_sock *treq; + + if (sk_is_mptcp(sk)) + ops = &mptcp_subflow_request_sock_ops; +#endif + + req = inet_reqsk_alloc(ops, sk, false); + if (!req) + return NULL; + +#if IS_ENABLED(CONFIG_MPTCP) + treq = tcp_rsk(req); + treq->is_mptcp = sk_is_mptcp(sk); + if (treq->is_mptcp) { + int err = mptcp_subflow_init_cookie_req(req, sk, skb); + + if (err) { + reqsk_free(req); + return NULL; + } + } +#endif + + return req; +} +EXPORT_SYMBOL_GPL(cookie_tcp_reqsk_alloc); + /* On input, sk is a listener. * Output is listener if incoming packet would not create a child * NULL if memory could not be allocated. @@ -326,7 +366,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = inet_reqsk_alloc(&tcp_request_sock_ops, sk, false); /* for safety */ + req = cookie_tcp_reqsk_alloc(&tcp_request_sock_ops, sk, skb); if (!req) goto out; @@ -350,9 +390,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) treq->snt_synack = 0; treq->tfo_listener = false; - if (IS_ENABLED(CONFIG_MPTCP)) - treq->is_mptcp = 0; - if (IS_ENABLED(CONFIG_SMC)) ireq->smc_ok = 0; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6f0caf9a866d..c06d2bfd2ec4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2764,7 +2764,7 @@ static inline bool tcp_can_repair_sock(const struct sock *sk) (sk->sk_state != TCP_LISTEN); } -static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len) +static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len) { struct tcp_repair_window opt; @@ -2774,7 +2774,7 @@ static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int l if (len != sizeof(opt)) return -EINVAL; - if (copy_from_user(&opt, optbuf, sizeof(opt))) + if (copy_from_sockptr(&opt, optbuf, sizeof(opt))) return -EFAULT; if (opt.max_window < opt.snd_wnd) @@ -2796,17 +2796,18 @@ static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int l return 0; } -static int tcp_repair_options_est(struct sock *sk, - struct tcp_repair_opt __user *optbuf, unsigned int len) +static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf, + unsigned int len) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_repair_opt opt; + size_t offset = 0; while (len >= sizeof(opt)) { - if (copy_from_user(&opt, optbuf, sizeof(opt))) + if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt))) return -EFAULT; - optbuf++; + offset += sizeof(opt); len -= sizeof(opt); switch (opt.opt_code) { @@ -2960,7 +2961,7 @@ void tcp_sock_set_user_timeout(struct sock *sk, u32 val) } EXPORT_SYMBOL(tcp_sock_set_user_timeout); -static int __tcp_sock_set_keepidle(struct sock *sk, int val) +int tcp_sock_set_keepidle_locked(struct sock *sk, int val) { struct tcp_sock *tp = tcp_sk(sk); @@ -2987,7 +2988,7 @@ int tcp_sock_set_keepidle(struct sock *sk, int val) int err; lock_sock(sk); - err = __tcp_sock_set_keepidle(sk, val); + err = tcp_sock_set_keepidle_locked(sk, val); release_sock(sk); return err; } @@ -3020,8 +3021,8 @@ EXPORT_SYMBOL(tcp_sock_set_keepcnt); /* * Socket option code for TCP. */ -static int do_tcp_setsockopt(struct sock *sk, int level, - int optname, char __user *optval, unsigned int optlen) +static int do_tcp_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -3037,7 +3038,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (optlen < 1) return -EINVAL; - val = strncpy_from_user(name, optval, + val = strncpy_from_sockptr(name, optval, min_t(long, TCP_CA_NAME_MAX-1, optlen)); if (val < 0) return -EFAULT; @@ -3056,7 +3057,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (optlen < 1) return -EINVAL; - val = strncpy_from_user(name, optval, + val = strncpy_from_sockptr(name, optval, min_t(long, TCP_ULP_NAME_MAX - 1, optlen)); if (val < 0) @@ -3079,7 +3080,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, optlen != TCP_FASTOPEN_KEY_BUF_LENGTH) return -EINVAL; - if (copy_from_user(key, optval, optlen)) + if (copy_from_sockptr(key, optval, optlen)) return -EFAULT; if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH) @@ -3095,7 +3096,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); @@ -3174,9 +3175,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (!tp->repair) err = -EINVAL; else if (sk->sk_state == TCP_ESTABLISHED) - err = tcp_repair_options_est(sk, - (struct tcp_repair_opt __user *)optval, - optlen); + err = tcp_repair_options_est(sk, optval, optlen); else err = -EPERM; break; @@ -3186,7 +3185,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_KEEPIDLE: - err = __tcp_sock_set_keepidle(sk, val); + err = tcp_sock_set_keepidle_locked(sk, val); break; case TCP_KEEPINTVL: if (val < 1 || val > MAX_TCP_KEEPINTVL) @@ -3325,7 +3324,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, return err; } -int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, +int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -3337,18 +3336,6 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, } EXPORT_SYMBOL(tcp_setsockopt); -#ifdef CONFIG_COMPAT -int compat_tcp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) -{ - if (level != SOL_TCP) - return inet_csk_compat_setsockopt(sk, level, optname, - optval, optlen); - return do_tcp_setsockopt(sk, level, optname, optval, optlen); -} -EXPORT_SYMBOL(compat_tcp_setsockopt); -#endif - static void tcp_get_info_chrono_stats(const struct tcp_sock *tp, struct tcp_info *info) { @@ -3514,10 +3501,12 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */ nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */ nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */ 0; } -struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) +struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, + const struct sk_buff *orig_skb) { const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *stats; @@ -3571,6 +3560,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash); nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT, max_t(int, 0, tp->write_seq - tp->snd_nxt)); + nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns, + TCP_NLA_PAD); return stats; } @@ -3896,18 +3887,6 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, } EXPORT_SYMBOL(tcp_getsockopt); -#ifdef CONFIG_COMPAT -int compat_tcp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - if (level != SOL_TCP) - return inet_csk_compat_getsockopt(sk, level, optname, - optval, optlen); - return do_tcp_getsockopt(sk, level, optname, optval, optlen); -} -EXPORT_SYMBOL(compat_tcp_getsockopt); -#endif - #ifdef CONFIG_TCP_MD5SIG static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool); static DEFINE_MUTEX(tcp_md5sig_mutex); diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index bfdfbb972c57..349069d6cd0a 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -2,7 +2,7 @@ /* * Sally Floyd's High Speed TCP (RFC 3649) congestion control * - * See http://www.icir.org/floyd/hstcp.html + * See https://www.icir.org/floyd/hstcp.html * * John Heffner <jheffner@psc.edu> */ diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 88e1f011afe0..55adcfcf96fe 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -4,7 +4,7 @@ * R.N.Shorten, D.J.Leith: * "H-TCP: TCP for high-speed and long-distance networks" * Proc. PFLDnet, Argonne, 2004. - * http://www.hamilton.ie/net/htcp3.pdf + * https://www.hamilton.ie/net/htcp3.pdf */ #include <linux/mm.h> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 518f04355fbf..184ea556f50e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -518,7 +518,7 @@ EXPORT_SYMBOL(tcp_initialize_rcv_mss); * * The algorithm for RTT estimation w/o timestamps is based on * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. - * <http://public.lanl.gov/radiant/pubs.html#DRS> + * <https://public.lanl.gov/radiant/pubs.html#DRS> * * More detail on this code can be found at * <http://staff.psc.edu/jheffner/>, @@ -871,12 +871,41 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } +struct tcp_sacktag_state { + /* Timestamps for earliest and latest never-retransmitted segment + * that was SACKed. RTO needs the earliest RTT to stay conservative, + * but congestion control should still get an accurate delay signal. + */ + u64 first_sackt; + u64 last_sackt; + u32 reord; + u32 sack_delivered; + int flag; + unsigned int mss_now; + struct rate_sample *rate; +}; + /* Take a notice that peer is sending D-SACKs */ -static void tcp_dsack_seen(struct tcp_sock *tp) +static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq, + u32 end_seq, struct tcp_sacktag_state *state) { + u32 seq_len, dup_segs = 1; + + if (before(start_seq, end_seq)) { + seq_len = end_seq - start_seq; + if (seq_len > tp->mss_cache) + dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache); + } + tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; tp->rack.dsack_seen = 1; - tp->dsack_dups++; + tp->dsack_dups += dup_segs; + + state->flag |= FLAG_DSACKING_ACK; + /* A spurious retransmission is delivered */ + state->sack_delivered += dup_segs; + + return dup_segs; } /* It's reordering when higher sequence was delivered (i.e. sacked) before @@ -962,6 +991,15 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) } } +/* Updates the delivered and delivered_ce counts */ +static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, + bool ece_ack) +{ + tp->delivered += delivered; + if (ece_ack) + tp->delivered_ce += delivered; +} + /* This procedure tags the retransmission queue when SACKs arrive. * * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). @@ -1094,52 +1132,38 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, - u32 prior_snd_una) + u32 prior_snd_una, struct tcp_sacktag_state *state) { struct tcp_sock *tp = tcp_sk(sk); u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); - bool dup_sack = false; + u32 dup_segs; if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { - dup_sack = true; - tcp_dsack_seen(tp); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV); } else if (num_sacks > 1) { u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq); u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq); - if (!after(end_seq_0, end_seq_1) && - !before(start_seq_0, start_seq_1)) { - dup_sack = true; - tcp_dsack_seen(tp); - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPDSACKOFORECV); - } + if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1)) + return false; + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV); + } else { + return false; } + dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state); + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs); + /* D-SACK for already forgotten data... Do dumb counting. */ - if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 && + if (tp->undo_marker && tp->undo_retrans > 0 && !after(end_seq_0, prior_snd_una) && after(end_seq_0, tp->undo_marker)) - tp->undo_retrans--; + tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs); - return dup_sack; + return true; } -struct tcp_sacktag_state { - u32 reord; - /* Timestamps for earliest and latest never-retransmitted segment - * that was SACKed. RTO needs the earliest RTT to stay conservative, - * but congestion control should still get an accurate delay signal. - */ - u64 first_sackt; - u64 last_sackt; - struct rate_sample *rate; - int flag; - unsigned int mss_now; -}; - /* Check if skb is fully within the SACK block. In presence of GSO skbs, * the incoming SACK may not exactly match but we can find smaller MSS * aligned portion of it that matches. Therefore we might need to fragment @@ -1258,7 +1282,8 @@ static u8 tcp_sacktag_one(struct sock *sk, sacked |= TCPCB_SACKED_ACKED; state->flag |= FLAG_DATA_SACKED; tp->sacked_out += pcount; - tp->delivered += pcount; /* Out-of-order packets delivered */ + /* Out-of-order packets delivered */ + state->sack_delivered += pcount; /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ if (tp->lost_skb_hint && @@ -1681,11 +1706,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, tcp_highest_sack_reset(sk); found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, - num_sacks, prior_snd_una); - if (found_dup_sack) { - state->flag |= FLAG_DSACKING_ACK; - tp->delivered++; /* A spurious retransmission is delivered */ - } + num_sacks, prior_snd_una, state); /* Eliminate too old ACKs, but take into * account more or less fresh ones, they can @@ -1893,7 +1914,7 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend) /* Emulate SACKs for SACKless connection: account for a new dupack. */ -static void tcp_add_reno_sack(struct sock *sk, int num_dupack) +static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack) { if (num_dupack) { struct tcp_sock *tp = tcp_sk(sk); @@ -1904,20 +1925,21 @@ static void tcp_add_reno_sack(struct sock *sk, int num_dupack) tcp_check_reno_reordering(sk, 0); delivered = tp->sacked_out - prior_sacked; if (delivered > 0) - tp->delivered += delivered; + tcp_count_delivered(tp, delivered, ece_ack); tcp_verify_left_out(tp); } } /* Account for ACK, ACKing some data in Reno Recovery phase. */ -static void tcp_remove_reno_sacks(struct sock *sk, int acked) +static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack) { struct tcp_sock *tp = tcp_sk(sk); if (acked > 0) { /* One ACK acked hole. The rest eat duplicate ACKs. */ - tp->delivered += max_t(int, acked - tp->sacked_out, 1); + tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1), + ece_ack); if (acked - 1 >= tp->sacked_out) tp->sacked_out = 0; else @@ -2697,7 +2719,7 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, * delivered. Lower inflight to clock out (re)tranmissions. */ if (after(tp->snd_nxt, tp->high_seq) && num_dupack) - tcp_add_reno_sack(sk, num_dupack); + tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE); else if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); } @@ -2779,6 +2801,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int fast_rexmit = 0, flag = *ack_flag; + bool ece_ack = flag & FLAG_ECE; bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) && tcp_force_fast_retransmit(sk)); @@ -2787,7 +2810,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, /* Now state machine starts. * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ - if (flag & FLAG_ECE) + if (ece_ack) tp->prior_ssthresh = 0; /* B. In all the states check for reneging SACKs. */ @@ -2828,7 +2851,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, case TCP_CA_Recovery: if (!(flag & FLAG_SND_UNA_ADVANCED)) { if (tcp_is_reno(tp)) - tcp_add_reno_sack(sk, num_dupack); + tcp_add_reno_sack(sk, num_dupack, ece_ack); } else { if (tcp_try_undo_partial(sk, prior_snd_una)) return; @@ -2853,7 +2876,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); - tcp_add_reno_sack(sk, num_dupack); + tcp_add_reno_sack(sk, num_dupack, ece_ack); } if (icsk->icsk_ca_state <= TCP_CA_Disorder) @@ -2877,7 +2900,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, } /* Otherwise enter Recovery state */ - tcp_enter_recovery(sk, (flag & FLAG_ECE)); + tcp_enter_recovery(sk, ece_ack); fast_rexmit = 1; } @@ -2927,6 +2950,8 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag, u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { + if (!delta) + delta = 1; seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ); ca_rtt_us = seq_rtt_us; } @@ -3053,7 +3078,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, */ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, u32 prior_snd_una, - struct tcp_sacktag_state *sack) + struct tcp_sacktag_state *sack, bool ece_ack) { const struct inet_connection_sock *icsk = inet_csk(sk); u64 first_ackt, last_ackt; @@ -3078,8 +3103,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, u8 sacked = scb->sacked; u32 acked_pcount; - tcp_ack_tstamp(sk, skb, prior_snd_una); - /* Determine how many packets and what bytes were acked, tso and else */ if (after(scb->end_seq, tp->snd_una)) { if (tcp_skb_pcount(skb) == 1 || @@ -3114,7 +3137,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (sacked & TCPCB_SACKED_ACKED) { tp->sacked_out -= acked_pcount; } else if (tcp_is_sack(tp)) { - tp->delivered += acked_pcount; + tcp_count_delivered(tp, acked_pcount, ece_ack); if (!tcp_skb_spurious_retrans(tp, skb)) tcp_rack_advance(tp, sacked, scb->end_seq, tcp_skb_timestamp_us(skb)); @@ -3143,6 +3166,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (!fully_acked) break; + tcp_ack_tstamp(sk, skb, prior_snd_una); + next = skb_rb_next(skb); if (unlikely(skb == tp->retransmit_skb_hint)) tp->retransmit_skb_hint = NULL; @@ -3158,8 +3183,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una))) tp->snd_up = tp->snd_una; - if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) - flag |= FLAG_SACK_RENEGING; + if (skb) { + tcp_ack_tstamp(sk, skb, prior_snd_una); + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) + flag |= FLAG_SACK_RENEGING; + } if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) { seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); @@ -3191,7 +3219,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, } if (tcp_is_reno(tp)) { - tcp_remove_reno_sacks(sk, pkts_acked); + tcp_remove_reno_sacks(sk, pkts_acked, ece_ack); /* If any of the cumulatively ACKed segments was * retransmitted, non-SACK case cannot confirm that @@ -3559,10 +3587,9 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) delivered = tp->delivered - prior_delivered; NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); - if (flag & FLAG_ECE) { - tp->delivered_ce += delivered; + if (flag & FLAG_ECE) NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); - } + return delivered; } @@ -3586,6 +3613,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) sack_state.first_sackt = 0; sack_state.rate = &rs; + sack_state.sack_delivered = 0; /* We very likely will need to access rtx queue. */ prefetch(sk->tcp_rtx_queue.rb_node); @@ -3661,6 +3689,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) ack_ev_flags |= CA_ACK_ECE; } + if (sack_state.sack_delivered) + tcp_count_delivered(tp, sack_state.sack_delivered, + flag & FLAG_ECE); + if (flag & FLAG_WIN_UPDATE) ack_ev_flags |= CA_ACK_WIN_UPDATE; @@ -3686,7 +3718,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state); + flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state, + flag & FLAG_ECE); tcp_rack_update_reo_wnd(sk, &rs); @@ -4427,7 +4460,6 @@ static void tcp_sack_remove(struct tcp_sock *tp) /** * tcp_try_coalesce - try to merge skb to prior one * @sk: socket - * @dest: destination queue * @to: prior buffer * @from: buffer to add in queue * @fragstolen: pointer to boolean @@ -6489,7 +6521,6 @@ static void tcp_openreq_init(struct request_sock *req, struct inet_request_sock *ireq = inet_rsk(req); req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */ - req->cookie_ts = 0; tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tcp_rsk(req)->snt_synack = 0; @@ -6644,6 +6675,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (!req) goto drop; + req->syncookie = want_cookie; tcp_rsk(req)->af_specific = af_ops; tcp_rsk(req)->ts_off = 0; #if IS_ENABLED(CONFIG_MPTCP) @@ -6671,9 +6703,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, af_ops->init_req(req, sk, skb); - if (IS_ENABLED(CONFIG_MPTCP) && want_cookie) - tcp_rsk(req)->is_mptcp = 0; - if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; @@ -6709,7 +6738,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (want_cookie) { isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); - req->cookie_ts = tmp_opt.tstamp_ok; if (!tmp_opt.tstamp_ok) inet_rsk(req)->ecn_ok = 0; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 04bfcbbfee83..5084333b5ab6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -76,6 +76,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/inetdevice.h> +#include <linux/btf_ids.h> #include <crypto/hash.h> #include <linux/scatterlist.h> @@ -1194,7 +1195,7 @@ static void tcp_clear_md5_list(struct sock *sk) } static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, - char __user *optval, int optlen) + sockptr_t optval, int optlen) { struct tcp_md5sig cmd; struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; @@ -1205,7 +1206,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, if (optlen < sizeof(cmd)) return -EINVAL; - if (copy_from_user(&cmd, optval, sizeof(cmd))) + if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) return -EFAULT; if (sin->sin_family != AF_INET) @@ -2134,10 +2135,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = { .getsockopt = ip_getsockopt, .addr2sockaddr = inet_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in), -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_ip_setsockopt, - .compat_getsockopt = compat_ip_getsockopt, -#endif .mtu_reduced = tcp_v4_mtu_reduced, }; EXPORT_SYMBOL(ipv4_specific); @@ -2223,13 +2220,18 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); */ static void *listening_get_next(struct seq_file *seq, void *cur) { - struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); + struct tcp_seq_afinfo *afinfo; struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); struct inet_listen_hashbucket *ilb; struct hlist_nulls_node *node; struct sock *sk = cur; + if (st->bpf_seq_afinfo) + afinfo = st->bpf_seq_afinfo; + else + afinfo = PDE_DATA(file_inode(seq->file)); + if (!sk) { get_head: ilb = &tcp_hashinfo.listening_hash[st->bucket]; @@ -2247,7 +2249,8 @@ get_sk: sk_nulls_for_each_from(sk, node) { if (!net_eq(sock_net(sk), net)) continue; - if (sk->sk_family == afinfo->family) + if (afinfo->family == AF_UNSPEC || + sk->sk_family == afinfo->family) return sk; } spin_unlock(&ilb->lock); @@ -2284,11 +2287,16 @@ static inline bool empty_bucket(const struct tcp_iter_state *st) */ static void *established_get_first(struct seq_file *seq) { - struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); + struct tcp_seq_afinfo *afinfo; struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); void *rc = NULL; + if (st->bpf_seq_afinfo) + afinfo = st->bpf_seq_afinfo; + else + afinfo = PDE_DATA(file_inode(seq->file)); + st->offset = 0; for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { struct sock *sk; @@ -2301,7 +2309,8 @@ static void *established_get_first(struct seq_file *seq) spin_lock_bh(lock); sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { - if (sk->sk_family != afinfo->family || + if ((afinfo->family != AF_UNSPEC && + sk->sk_family != afinfo->family) || !net_eq(sock_net(sk), net)) { continue; } @@ -2316,19 +2325,25 @@ out: static void *established_get_next(struct seq_file *seq, void *cur) { - struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); + struct tcp_seq_afinfo *afinfo; struct sock *sk = cur; struct hlist_nulls_node *node; struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); + if (st->bpf_seq_afinfo) + afinfo = st->bpf_seq_afinfo; + else + afinfo = PDE_DATA(file_inode(seq->file)); + ++st->num; ++st->offset; sk = sk_nulls_next(sk); sk_nulls_for_each_from(sk, node) { - if (sk->sk_family == afinfo->family && + if ((afinfo->family == AF_UNSPEC || + sk->sk_family == afinfo->family) && net_eq(sock_net(sk), net)) return sk; } @@ -2607,6 +2622,74 @@ out: return 0; } +#ifdef CONFIG_BPF_SYSCALL +struct bpf_iter__tcp { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct sock_common *, sk_common); + uid_t uid __aligned(8); +}; + +static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, + struct sock_common *sk_common, uid_t uid) +{ + struct bpf_iter__tcp ctx; + + meta->seq_num--; /* skip SEQ_START_TOKEN */ + ctx.meta = meta; + ctx.sk_common = sk_common; + ctx.uid = uid; + return bpf_iter_run_prog(prog, &ctx); +} + +static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + struct sock *sk = v; + uid_t uid; + + if (v == SEQ_START_TOKEN) + return 0; + + if (sk->sk_state == TCP_TIME_WAIT) { + uid = 0; + } else if (sk->sk_state == TCP_NEW_SYN_RECV) { + const struct request_sock *req = v; + + uid = from_kuid_munged(seq_user_ns(seq), + sock_i_uid(req->rsk_listener)); + } else { + uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); + } + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + return tcp_prog_seq_show(prog, &meta, v, uid); +} + +static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + if (!v) { + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog) + (void)tcp_prog_seq_show(prog, &meta, v, 0); + } + + tcp_seq_stop(seq, v); +} + +static const struct seq_operations bpf_iter_tcp_seq_ops = { + .show = bpf_iter_tcp_seq_show, + .start = tcp_seq_start, + .next = tcp_seq_next, + .stop = bpf_iter_tcp_seq_stop, +}; +#endif + static const struct seq_operations tcp4_seq_ops = { .show = tcp4_seq_show, .start = tcp_seq_start, @@ -2687,10 +2770,6 @@ struct proto tcp_prot = { .rsk_prot = &tcp_request_sock_ops, .h.hashinfo = &tcp_hashinfo, .no_autobind = true, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_tcp_setsockopt, - .compat_getsockopt = compat_tcp_getsockopt, -#endif .diag_destroy = tcp_abort, }; EXPORT_SYMBOL(tcp_prot); @@ -2838,8 +2917,68 @@ static struct pernet_operations __net_initdata tcp_sk_ops = { .exit_batch = tcp_sk_exit_batch, }; +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, + struct sock_common *sk_common, uid_t uid) + +static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) +{ + struct tcp_iter_state *st = priv_data; + struct tcp_seq_afinfo *afinfo; + int ret; + + afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN); + if (!afinfo) + return -ENOMEM; + + afinfo->family = AF_UNSPEC; + st->bpf_seq_afinfo = afinfo; + ret = bpf_iter_init_seq_net(priv_data, aux); + if (ret) + kfree(afinfo); + return ret; +} + +static void bpf_iter_fini_tcp(void *priv_data) +{ + struct tcp_iter_state *st = priv_data; + + kfree(st->bpf_seq_afinfo); + bpf_iter_fini_seq_net(priv_data); +} + +static const struct bpf_iter_seq_info tcp_seq_info = { + .seq_ops = &bpf_iter_tcp_seq_ops, + .init_seq_private = bpf_iter_init_tcp, + .fini_seq_private = bpf_iter_fini_tcp, + .seq_priv_size = sizeof(struct tcp_iter_state), +}; + +static struct bpf_iter_reg tcp_reg_info = { + .target = "tcp", + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__tcp, sk_common), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &tcp_seq_info, +}; + +static void __init bpf_iter_register(void) +{ + tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; + if (bpf_iter_reg_target(&tcp_reg_info)) + pr_warn("Warning: could not register bpf iterator tcp\n"); +} + +#endif + void __init tcp_v4_init(void) { if (register_pernet_subsys(&tcp_sk_ops)) panic("Failed to create the TCP control socket.\n"); + +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + bpf_iter_register(); +#endif } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0bc05d68cd74..85ff417bda7f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1066,6 +1066,10 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); } +INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); +INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); +INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)); + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. @@ -1209,7 +1213,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, } #endif - icsk->icsk_af_ops->send_check(sk, skb); + INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check, + tcp_v6_send_check, tcp_v4_send_check, + sk, skb); if (likely(tcb->tcp_flags & TCPHDR_ACK)) tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt); @@ -1237,7 +1243,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tcp_add_tx_delay(skb, tp); - err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); + err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit, + inet6_csk_xmit, ip_queue_xmit, + sk, skb, &inet->cork.fl); if (unlikely(err > 0)) { tcp_enter_cwr(sk); @@ -3332,6 +3340,8 @@ int tcp_send_synack(struct sock *sk) * sk: listener socket * dst: dst entry attached to the SYNACK * req: request_sock pointer + * foc: cookie for tcp fast open + * synack_type: Type of synback to prepare * * Allocate one skb and build a SYNACK packet. * @dst is consumed : Caller should not use it again. @@ -3383,7 +3393,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, memset(&opts, 0, sizeof(opts)); now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES - if (unlikely(req->cookie_ts)) + if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok)) skb->skb_mstamp_ns = cookie_init_timestamp(req, now); else #endif diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index ada046f425d2..0c08c420fbc2 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -314,7 +314,7 @@ out: /** * tcp_delack_timer() - The TCP delayed ACK timeout handler - * @data: Pointer to the current socket. (gets casted to struct sock *) + * @t: Pointer to the timer. (gets casted to struct sock *) * * This function gets (indirectly) called when the kernel timer for a TCP packet * of this socket expires. Calls tcp_delack_timer_handler() to do the actual work. diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 50a9a6e2c4cd..cd50a61c9976 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -7,7 +7,7 @@ * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks." * IEEE Journal on Selected Areas in Communication, * Feb. 2003. - * See http://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf + * See https://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf */ #include <linux/mm.h> diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index c4b2ccbeba04..e44aaf41a138 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -110,6 +110,33 @@ drop: return 0; } +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) +static int tunnel4_rcv_cb(struct sk_buff *skb, u8 proto, int err) +{ + struct xfrm_tunnel __rcu *head; + struct xfrm_tunnel *handler; + int ret; + + head = (proto == IPPROTO_IPIP) ? tunnel4_handlers : tunnel64_handlers; + + for_each_tunnel_rcu(head, handler) { + if (handler->cb_handler) { + ret = handler->cb_handler(skb, err); + if (ret <= 0) + return ret; + } + } + + return 0; +} + +static const struct xfrm_input_afinfo tunnel4_input_afinfo = { + .family = AF_INET, + .is_ipip = true, + .callback = tunnel4_rcv_cb, +}; +#endif + #if IS_ENABLED(CONFIG_IPV6) static int tunnel64_rcv(struct sk_buff *skb) { @@ -231,6 +258,18 @@ static int __init tunnel4_init(void) goto err; } #endif +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) + if (xfrm_input_register_afinfo(&tunnel4_input_afinfo)) { + inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); +#if IS_ENABLED(CONFIG_IPV6) + inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6); +#endif +#if IS_ENABLED(CONFIG_MPLS) + inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS); +#endif + goto err; + } +#endif return 0; err: @@ -240,6 +279,10 @@ err: static void __exit tunnel4_fini(void) { +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) + if (xfrm_input_unregister_afinfo(&tunnel4_input_afinfo)) + pr_err("tunnel4 close: can't remove input afinfo\n"); +#endif #if IS_ENABLED(CONFIG_MPLS) if (inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS)) pr_err("tunnelmpls4 close: can't remove protocol\n"); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4077d589b72e..e88efba07551 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -106,6 +106,7 @@ #include <net/xfrm.h> #include <trace/events/udp.h> #include <linux/static_key.h> +#include <linux/btf_ids.h> #include <trace/events/skb.h> #include <net/busy_poll.h> #include "udp_impl.h" @@ -408,6 +409,22 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr, udp_ehash_secret + net_hash_mix(net)); } +static struct sock *lookup_reuseport(struct net *net, struct sock *sk, + struct sk_buff *skb, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned short hnum) +{ + struct sock *reuse_sk = NULL; + u32 hash; + + if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) { + hash = udp_ehashfn(net, daddr, hnum, saddr, sport); + reuse_sk = reuseport_select_sock(sk, hash, skb, + sizeof(struct udphdr)); + } + return reuse_sk; +} + /* called with rcu_read_lock() */ static struct sock *udp4_lib_lookup2(struct net *net, __be32 saddr, __be16 sport, @@ -416,9 +433,8 @@ static struct sock *udp4_lib_lookup2(struct net *net, struct udp_hslot *hslot2, struct sk_buff *skb) { - struct sock *sk, *result, *reuseport_result; + struct sock *sk, *result; int score, badness; - u32 hash = 0; result = NULL; badness = 0; @@ -426,25 +442,42 @@ static struct sock *udp4_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { - reuseport_result = NULL; - - if (sk->sk_reuseport && - sk->sk_state != TCP_ESTABLISHED) { - hash = udp_ehashfn(net, daddr, hnum, - saddr, sport); - reuseport_result = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); - if (reuseport_result && !reuseport_has_conns(sk, false)) - return reuseport_result; - } + result = lookup_reuseport(net, sk, skb, + saddr, sport, daddr, hnum); + /* Fall back to scoring if group has connections */ + if (result && !reuseport_has_conns(sk, false)) + return result; - result = reuseport_result ? : sk; + result = result ? : sk; badness = score; } } return result; } +static struct sock *udp4_lookup_run_bpf(struct net *net, + struct udp_table *udptable, + struct sk_buff *skb, + __be32 saddr, __be16 sport, + __be32 daddr, u16 hnum) +{ + struct sock *sk, *reuse_sk; + bool no_reuseport; + + if (udptable != &udp_table) + return NULL; /* only UDP is supported */ + + no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_UDP, + saddr, sport, daddr, hnum, &sk); + if (no_reuseport || IS_ERR_OR_NULL(sk)) + return sk; + + reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); + if (reuse_sk) + sk = reuse_sk; + return sk; +} + /* UDP is nearly always wildcards out the wazoo, it makes no sense to try * harder than this. -DaveM */ @@ -452,27 +485,45 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif, int sdif, struct udp_table *udptable, struct sk_buff *skb) { - struct sock *result; unsigned short hnum = ntohs(dport); unsigned int hash2, slot2; struct udp_hslot *hslot2; + struct sock *result, *sk; hash2 = ipv4_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; + /* Lookup connected or non-wildcard socket */ result = udp4_lib_lookup2(net, saddr, sport, daddr, hnum, dif, sdif, hslot2, skb); - if (!result) { - hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; - - result = udp4_lib_lookup2(net, saddr, sport, - htonl(INADDR_ANY), hnum, dif, sdif, - hslot2, skb); + if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED) + goto done; + + /* Lookup redirect from BPF */ + if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { + sk = udp4_lookup_run_bpf(net, udptable, skb, + saddr, sport, daddr, hnum); + if (sk) { + result = sk; + goto done; + } } + + /* Got non-wildcard socket or error on first lookup */ + if (result) + goto done; + + /* Lookup wildcard sockets */ + hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); + slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; + + result = udp4_lib_lookup2(net, saddr, sport, + htonl(INADDR_ANY), hnum, dif, sdif, + hslot2, skb); +done: if (IS_ERR(result)) return NULL; return result; @@ -2535,7 +2586,7 @@ void udp_destroy_sock(struct sock *sk) * Socket option code for UDP */ int udp_lib_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen, + sockptr_t optval, unsigned int optlen, int (*push_pending_frames)(struct sock *)) { struct udp_sock *up = udp_sk(sk); @@ -2546,7 +2597,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; valbool = val ? 1 : 0; @@ -2650,26 +2701,16 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, } EXPORT_SYMBOL(udp_lib_setsockopt); -int udp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) +int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen) { if (level == SOL_UDP || level == SOL_UDPLITE) - return udp_lib_setsockopt(sk, level, optname, optval, optlen, + return udp_lib_setsockopt(sk, level, optname, + optval, optlen, udp_push_pending_frames); return ip_setsockopt(sk, level, optname, optval, optlen); } -#ifdef CONFIG_COMPAT -int compat_udp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) -{ - if (level == SOL_UDP || level == SOL_UDPLITE) - return udp_lib_setsockopt(sk, level, optname, optval, optlen, - udp_push_pending_frames); - return compat_ip_setsockopt(sk, level, optname, optval, optlen); -} -#endif - int udp_lib_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -2735,20 +2776,11 @@ int udp_getsockopt(struct sock *sk, int level, int optname, return ip_getsockopt(sk, level, optname, optval, optlen); } -#ifdef CONFIG_COMPAT -int compat_udp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - if (level == SOL_UDP || level == SOL_UDPLITE) - return udp_lib_getsockopt(sk, level, optname, optval, optlen); - return compat_ip_getsockopt(sk, level, optname, optval, optlen); -} -#endif /** * udp_poll - wait for a UDP event. - * @file - file struct - * @sock - socket - * @wait - poll table + * @file: - file struct + * @sock: - socket + * @wait: - poll table * * This is same as datagram poll, except for the special case of * blocking sockets. If application is using a blocking fd @@ -2815,10 +2847,6 @@ struct proto udp_prot = { .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp_sock), .h.udp_table = &udp_table, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_udp_setsockopt, - .compat_getsockopt = compat_udp_getsockopt, -#endif .diag_destroy = udp_abort, }; EXPORT_SYMBOL(udp_prot); @@ -2829,10 +2857,15 @@ EXPORT_SYMBOL(udp_prot); static struct sock *udp_get_first(struct seq_file *seq, int start) { struct sock *sk; - struct udp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); + struct udp_seq_afinfo *afinfo; struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); + if (state->bpf_seq_afinfo) + afinfo = state->bpf_seq_afinfo; + else + afinfo = PDE_DATA(file_inode(seq->file)); + for (state->bucket = start; state->bucket <= afinfo->udp_table->mask; ++state->bucket) { struct udp_hslot *hslot = &afinfo->udp_table->hash[state->bucket]; @@ -2844,7 +2877,8 @@ static struct sock *udp_get_first(struct seq_file *seq, int start) sk_for_each(sk, &hslot->head) { if (!net_eq(sock_net(sk), net)) continue; - if (sk->sk_family == afinfo->family) + if (afinfo->family == AF_UNSPEC || + sk->sk_family == afinfo->family) goto found; } spin_unlock_bh(&hslot->lock); @@ -2856,13 +2890,20 @@ found: static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) { - struct udp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); + struct udp_seq_afinfo *afinfo; struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); + if (state->bpf_seq_afinfo) + afinfo = state->bpf_seq_afinfo; + else + afinfo = PDE_DATA(file_inode(seq->file)); + do { sk = sk_next(sk); - } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != afinfo->family)); + } while (sk && (!net_eq(sock_net(sk), net) || + (afinfo->family != AF_UNSPEC && + sk->sk_family != afinfo->family))); if (!sk) { if (state->bucket <= afinfo->udp_table->mask) @@ -2907,9 +2948,14 @@ EXPORT_SYMBOL(udp_seq_next); void udp_seq_stop(struct seq_file *seq, void *v) { - struct udp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); + struct udp_seq_afinfo *afinfo; struct udp_iter_state *state = seq->private; + if (state->bpf_seq_afinfo) + afinfo = state->bpf_seq_afinfo; + else + afinfo = PDE_DATA(file_inode(seq->file)); + if (state->bucket <= afinfo->udp_table->mask) spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock); } @@ -2953,6 +2999,67 @@ int udp4_seq_show(struct seq_file *seq, void *v) return 0; } +#ifdef CONFIG_BPF_SYSCALL +struct bpf_iter__udp { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct udp_sock *, udp_sk); + uid_t uid __aligned(8); + int bucket __aligned(8); +}; + +static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, + struct udp_sock *udp_sk, uid_t uid, int bucket) +{ + struct bpf_iter__udp ctx; + + meta->seq_num--; /* skip SEQ_START_TOKEN */ + ctx.meta = meta; + ctx.udp_sk = udp_sk; + ctx.uid = uid; + ctx.bucket = bucket; + return bpf_iter_run_prog(prog, &ctx); +} + +static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v) +{ + struct udp_iter_state *state = seq->private; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + struct sock *sk = v; + uid_t uid; + + if (v == SEQ_START_TOKEN) + return 0; + + uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); + meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + return udp_prog_seq_show(prog, &meta, v, uid, state->bucket); +} + +static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + if (!v) { + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog) + (void)udp_prog_seq_show(prog, &meta, v, 0, 0); + } + + udp_seq_stop(seq, v); +} + +static const struct seq_operations bpf_iter_udp_seq_ops = { + .start = udp_seq_start, + .next = udp_seq_next, + .stop = bpf_iter_udp_seq_stop, + .show = bpf_iter_udp_seq_show, +}; +#endif + const struct seq_operations udp_seq_ops = { .start = udp_seq_start, .next = udp_seq_next, @@ -3070,6 +3177,62 @@ static struct pernet_operations __net_initdata udp_sysctl_ops = { .init = udp_sysctl_init, }; +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta, + struct udp_sock *udp_sk, uid_t uid, int bucket) + +static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux) +{ + struct udp_iter_state *st = priv_data; + struct udp_seq_afinfo *afinfo; + int ret; + + afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN); + if (!afinfo) + return -ENOMEM; + + afinfo->family = AF_UNSPEC; + afinfo->udp_table = &udp_table; + st->bpf_seq_afinfo = afinfo; + ret = bpf_iter_init_seq_net(priv_data, aux); + if (ret) + kfree(afinfo); + return ret; +} + +static void bpf_iter_fini_udp(void *priv_data) +{ + struct udp_iter_state *st = priv_data; + + kfree(st->bpf_seq_afinfo); + bpf_iter_fini_seq_net(priv_data); +} + +static const struct bpf_iter_seq_info udp_seq_info = { + .seq_ops = &bpf_iter_udp_seq_ops, + .init_seq_private = bpf_iter_init_udp, + .fini_seq_private = bpf_iter_fini_udp, + .seq_priv_size = sizeof(struct udp_iter_state), +}; + +static struct bpf_iter_reg udp_reg_info = { + .target = "udp", + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__udp, udp_sk), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &udp_seq_info, +}; + +static void __init bpf_iter_register(void) +{ + udp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UDP]; + if (bpf_iter_reg_target(&udp_reg_info)) + pr_warn("Warning: could not register bpf iterator udp\n"); +} +#endif + void __init udp_init(void) { unsigned long limit; @@ -3095,4 +3258,8 @@ void __init udp_init(void) if (register_pernet_subsys(&udp_sysctl_ops)) panic("UDP: failed to init sysctl parameters.\n"); + +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + bpf_iter_register(); +#endif } diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index 6b2fa77eeb1c..2878d8285caf 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -12,17 +12,11 @@ int __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); int udp_v4_get_port(struct sock *sk, unsigned short snum); void udp_v4_rehash(struct sock *sk); -int udp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); +int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen); int udp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); -#ifdef CONFIG_COMPAT -int compat_udp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); -int compat_udp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen); -#endif int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len); int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel_core.c index 3eecba0874aa..3eecba0874aa 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel_core.c diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c new file mode 100644 index 000000000000..69962165c0e8 --- /dev/null +++ b/net/ipv4/udp_tunnel_nic.c @@ -0,0 +1,897 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (c) 2020 Facebook Inc. + +#include <linux/ethtool_netlink.h> +#include <linux/netdevice.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <net/udp_tunnel.h> +#include <net/vxlan.h> + +enum udp_tunnel_nic_table_entry_flags { + UDP_TUNNEL_NIC_ENTRY_ADD = BIT(0), + UDP_TUNNEL_NIC_ENTRY_DEL = BIT(1), + UDP_TUNNEL_NIC_ENTRY_OP_FAIL = BIT(2), + UDP_TUNNEL_NIC_ENTRY_FROZEN = BIT(3), +}; + +struct udp_tunnel_nic_table_entry { + __be16 port; + u8 type; + u8 use_cnt; + u8 flags; + u8 hw_priv; +}; + +/** + * struct udp_tunnel_nic - UDP tunnel port offload state + * @work: async work for talking to hardware from process context + * @dev: netdev pointer + * @need_sync: at least one port start changed + * @need_replay: space was freed, we need a replay of all ports + * @work_pending: @work is currently scheduled + * @n_tables: number of tables under @entries + * @missed: bitmap of tables which overflown + * @entries: table of tables of ports currently offloaded + */ +struct udp_tunnel_nic { + struct work_struct work; + + struct net_device *dev; + + u8 need_sync:1; + u8 need_replay:1; + u8 work_pending:1; + + unsigned int n_tables; + unsigned long missed; + struct udp_tunnel_nic_table_entry **entries; +}; + +/* We ensure all work structs are done using driver state, but not the code. + * We need a workqueue we can flush before module gets removed. + */ +static struct workqueue_struct *udp_tunnel_nic_workqueue; + +static const char *udp_tunnel_nic_tunnel_type_name(unsigned int type) +{ + switch (type) { + case UDP_TUNNEL_TYPE_VXLAN: + return "vxlan"; + case UDP_TUNNEL_TYPE_GENEVE: + return "geneve"; + case UDP_TUNNEL_TYPE_VXLAN_GPE: + return "vxlan-gpe"; + default: + return "unknown"; + } +} + +static bool +udp_tunnel_nic_entry_is_free(struct udp_tunnel_nic_table_entry *entry) +{ + return entry->use_cnt == 0 && !entry->flags; +} + +static bool +udp_tunnel_nic_entry_is_present(struct udp_tunnel_nic_table_entry *entry) +{ + return entry->use_cnt && !(entry->flags & ~UDP_TUNNEL_NIC_ENTRY_FROZEN); +} + +static bool +udp_tunnel_nic_entry_is_frozen(struct udp_tunnel_nic_table_entry *entry) +{ + return entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN; +} + +static void +udp_tunnel_nic_entry_freeze_used(struct udp_tunnel_nic_table_entry *entry) +{ + if (!udp_tunnel_nic_entry_is_free(entry)) + entry->flags |= UDP_TUNNEL_NIC_ENTRY_FROZEN; +} + +static void +udp_tunnel_nic_entry_unfreeze(struct udp_tunnel_nic_table_entry *entry) +{ + entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_FROZEN; +} + +static bool +udp_tunnel_nic_entry_is_queued(struct udp_tunnel_nic_table_entry *entry) +{ + return entry->flags & (UDP_TUNNEL_NIC_ENTRY_ADD | + UDP_TUNNEL_NIC_ENTRY_DEL); +} + +static void +udp_tunnel_nic_entry_queue(struct udp_tunnel_nic *utn, + struct udp_tunnel_nic_table_entry *entry, + unsigned int flag) +{ + entry->flags |= flag; + utn->need_sync = 1; +} + +static void +udp_tunnel_nic_ti_from_entry(struct udp_tunnel_nic_table_entry *entry, + struct udp_tunnel_info *ti) +{ + memset(ti, 0, sizeof(*ti)); + ti->port = entry->port; + ti->type = entry->type; + ti->hw_priv = entry->hw_priv; +} + +static bool +udp_tunnel_nic_is_empty(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + unsigned int i, j; + + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) + if (!udp_tunnel_nic_entry_is_free(&utn->entries[i][j])) + return false; + return true; +} + +static bool +udp_tunnel_nic_should_replay(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_table_info *table; + unsigned int i, j; + + if (!utn->missed) + return false; + + for (i = 0; i < utn->n_tables; i++) { + table = &dev->udp_tunnel_nic_info->tables[i]; + if (!test_bit(i, &utn->missed)) + continue; + + for (j = 0; j < table->n_entries; j++) + if (udp_tunnel_nic_entry_is_free(&utn->entries[i][j])) + return true; + } + + return false; +} + +static void +__udp_tunnel_nic_get_port(struct net_device *dev, unsigned int table, + unsigned int idx, struct udp_tunnel_info *ti) +{ + struct udp_tunnel_nic_table_entry *entry; + struct udp_tunnel_nic *utn; + + utn = dev->udp_tunnel_nic; + entry = &utn->entries[table][idx]; + + if (entry->use_cnt) + udp_tunnel_nic_ti_from_entry(entry, ti); +} + +static void +__udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table, + unsigned int idx, u8 priv) +{ + dev->udp_tunnel_nic->entries[table][idx].hw_priv = priv; +} + +static void +udp_tunnel_nic_entry_update_done(struct udp_tunnel_nic_table_entry *entry, + int err) +{ + bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL; + + WARN_ON_ONCE(entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD && + entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL); + + if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD && + (!err || (err == -EEXIST && dodgy))) + entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_ADD; + + if (entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL && + (!err || (err == -ENOENT && dodgy))) + entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_DEL; + + if (!err) + entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_OP_FAIL; + else + entry->flags |= UDP_TUNNEL_NIC_ENTRY_OP_FAIL; +} + +static void +udp_tunnel_nic_device_sync_one(struct net_device *dev, + struct udp_tunnel_nic *utn, + unsigned int table, unsigned int idx) +{ + struct udp_tunnel_nic_table_entry *entry; + struct udp_tunnel_info ti; + int err; + + entry = &utn->entries[table][idx]; + if (!udp_tunnel_nic_entry_is_queued(entry)) + return; + + udp_tunnel_nic_ti_from_entry(entry, &ti); + if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD) + err = dev->udp_tunnel_nic_info->set_port(dev, table, idx, &ti); + else + err = dev->udp_tunnel_nic_info->unset_port(dev, table, idx, + &ti); + udp_tunnel_nic_entry_update_done(entry, err); + + if (err) + netdev_warn(dev, + "UDP tunnel port sync failed port %d type %s: %d\n", + be16_to_cpu(entry->port), + udp_tunnel_nic_tunnel_type_name(entry->type), + err); +} + +static void +udp_tunnel_nic_device_sync_by_port(struct net_device *dev, + struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + unsigned int i, j; + + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) + udp_tunnel_nic_device_sync_one(dev, utn, i, j); +} + +static void +udp_tunnel_nic_device_sync_by_table(struct net_device *dev, + struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + unsigned int i, j; + int err; + + for (i = 0; i < utn->n_tables; i++) { + /* Find something that needs sync in this table */ + for (j = 0; j < info->tables[i].n_entries; j++) + if (udp_tunnel_nic_entry_is_queued(&utn->entries[i][j])) + break; + if (j == info->tables[i].n_entries) + continue; + + err = info->sync_table(dev, i); + if (err) + netdev_warn(dev, "UDP tunnel port sync failed for table %d: %d\n", + i, err); + + for (j = 0; j < info->tables[i].n_entries; j++) { + struct udp_tunnel_nic_table_entry *entry; + + entry = &utn->entries[i][j]; + if (udp_tunnel_nic_entry_is_queued(entry)) + udp_tunnel_nic_entry_update_done(entry, err); + } + } +} + +static void +__udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + if (!utn->need_sync) + return; + + if (dev->udp_tunnel_nic_info->sync_table) + udp_tunnel_nic_device_sync_by_table(dev, utn); + else + udp_tunnel_nic_device_sync_by_port(dev, utn); + + utn->need_sync = 0; + /* Can't replay directly here, in case we come from the tunnel driver's + * notification - trying to replay may deadlock inside tunnel driver. + */ + utn->need_replay = udp_tunnel_nic_should_replay(dev, utn); +} + +static void +udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + bool may_sleep; + + if (!utn->need_sync) + return; + + /* Drivers which sleep in the callback need to update from + * the workqueue, if we come from the tunnel driver's notification. + */ + may_sleep = info->flags & UDP_TUNNEL_NIC_INFO_MAY_SLEEP; + if (!may_sleep) + __udp_tunnel_nic_device_sync(dev, utn); + if (may_sleep || utn->need_replay) { + queue_work(udp_tunnel_nic_workqueue, &utn->work); + utn->work_pending = 1; + } +} + +static bool +udp_tunnel_nic_table_is_capable(const struct udp_tunnel_nic_table_info *table, + struct udp_tunnel_info *ti) +{ + return table->tunnel_types & ti->type; +} + +static bool +udp_tunnel_nic_is_capable(struct net_device *dev, struct udp_tunnel_nic *utn, + struct udp_tunnel_info *ti) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + unsigned int i; + + /* Special case IPv4-only NICs */ + if (info->flags & UDP_TUNNEL_NIC_INFO_IPV4_ONLY && + ti->sa_family != AF_INET) + return false; + + for (i = 0; i < utn->n_tables; i++) + if (udp_tunnel_nic_table_is_capable(&info->tables[i], ti)) + return true; + return false; +} + +static int +udp_tunnel_nic_has_collision(struct net_device *dev, struct udp_tunnel_nic *utn, + struct udp_tunnel_info *ti) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic_table_entry *entry; + unsigned int i, j; + + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) { + entry = &utn->entries[i][j]; + + if (!udp_tunnel_nic_entry_is_free(entry) && + entry->port == ti->port && + entry->type != ti->type) { + __set_bit(i, &utn->missed); + return true; + } + } + return false; +} + +static void +udp_tunnel_nic_entry_adj(struct udp_tunnel_nic *utn, + unsigned int table, unsigned int idx, int use_cnt_adj) +{ + struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx]; + bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL; + unsigned int from, to; + + /* If not going from used to unused or vice versa - all done. + * For dodgy entries make sure we try to sync again (queue the entry). + */ + entry->use_cnt += use_cnt_adj; + if (!dodgy && !entry->use_cnt == !(entry->use_cnt - use_cnt_adj)) + return; + + /* Cancel the op before it was sent to the device, if possible, + * otherwise we'd need to take special care to issue commands + * in the same order the ports arrived. + */ + if (use_cnt_adj < 0) { + from = UDP_TUNNEL_NIC_ENTRY_ADD; + to = UDP_TUNNEL_NIC_ENTRY_DEL; + } else { + from = UDP_TUNNEL_NIC_ENTRY_DEL; + to = UDP_TUNNEL_NIC_ENTRY_ADD; + } + + if (entry->flags & from) { + entry->flags &= ~from; + if (!dodgy) + return; + } + + udp_tunnel_nic_entry_queue(utn, entry, to); +} + +static bool +udp_tunnel_nic_entry_try_adj(struct udp_tunnel_nic *utn, + unsigned int table, unsigned int idx, + struct udp_tunnel_info *ti, int use_cnt_adj) +{ + struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx]; + + if (udp_tunnel_nic_entry_is_free(entry) || + entry->port != ti->port || + entry->type != ti->type) + return false; + + if (udp_tunnel_nic_entry_is_frozen(entry)) + return true; + + udp_tunnel_nic_entry_adj(utn, table, idx, use_cnt_adj); + return true; +} + +/* Try to find existing matching entry and adjust its use count, instead of + * adding a new one. Returns true if entry was found. In case of delete the + * entry may have gotten removed in the process, in which case it will be + * queued for removal. + */ +static bool +udp_tunnel_nic_try_existing(struct net_device *dev, struct udp_tunnel_nic *utn, + struct udp_tunnel_info *ti, int use_cnt_adj) +{ + const struct udp_tunnel_nic_table_info *table; + unsigned int i, j; + + for (i = 0; i < utn->n_tables; i++) { + table = &dev->udp_tunnel_nic_info->tables[i]; + if (!udp_tunnel_nic_table_is_capable(table, ti)) + continue; + + for (j = 0; j < table->n_entries; j++) + if (udp_tunnel_nic_entry_try_adj(utn, i, j, ti, + use_cnt_adj)) + return true; + } + + return false; +} + +static bool +udp_tunnel_nic_add_existing(struct net_device *dev, struct udp_tunnel_nic *utn, + struct udp_tunnel_info *ti) +{ + return udp_tunnel_nic_try_existing(dev, utn, ti, +1); +} + +static bool +udp_tunnel_nic_del_existing(struct net_device *dev, struct udp_tunnel_nic *utn, + struct udp_tunnel_info *ti) +{ + return udp_tunnel_nic_try_existing(dev, utn, ti, -1); +} + +static bool +udp_tunnel_nic_add_new(struct net_device *dev, struct udp_tunnel_nic *utn, + struct udp_tunnel_info *ti) +{ + const struct udp_tunnel_nic_table_info *table; + unsigned int i, j; + + for (i = 0; i < utn->n_tables; i++) { + table = &dev->udp_tunnel_nic_info->tables[i]; + if (!udp_tunnel_nic_table_is_capable(table, ti)) + continue; + + for (j = 0; j < table->n_entries; j++) { + struct udp_tunnel_nic_table_entry *entry; + + entry = &utn->entries[i][j]; + if (!udp_tunnel_nic_entry_is_free(entry)) + continue; + + entry->port = ti->port; + entry->type = ti->type; + entry->use_cnt = 1; + udp_tunnel_nic_entry_queue(utn, entry, + UDP_TUNNEL_NIC_ENTRY_ADD); + return true; + } + + /* The different table may still fit this port in, but there + * are no devices currently which have multiple tables accepting + * the same tunnel type, and false positives are okay. + */ + __set_bit(i, &utn->missed); + } + + return false; +} + +static void +__udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic *utn; + + utn = dev->udp_tunnel_nic; + if (!utn) + return; + if (!netif_running(dev) && info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY) + return; + if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN && + ti->port == htons(IANA_VXLAN_UDP_PORT)) { + if (ti->type != UDP_TUNNEL_TYPE_VXLAN) + netdev_warn(dev, "device assumes port 4789 will be used by vxlan tunnels\n"); + return; + } + + if (!udp_tunnel_nic_is_capable(dev, utn, ti)) + return; + + /* It may happen that a tunnel of one type is removed and different + * tunnel type tries to reuse its port before the device was informed. + * Rely on utn->missed to re-add this port later. + */ + if (udp_tunnel_nic_has_collision(dev, utn, ti)) + return; + + if (!udp_tunnel_nic_add_existing(dev, utn, ti)) + udp_tunnel_nic_add_new(dev, utn, ti); + + udp_tunnel_nic_device_sync(dev, utn); +} + +static void +__udp_tunnel_nic_del_port(struct net_device *dev, struct udp_tunnel_info *ti) +{ + struct udp_tunnel_nic *utn; + + utn = dev->udp_tunnel_nic; + if (!utn) + return; + + if (!udp_tunnel_nic_is_capable(dev, utn, ti)) + return; + + udp_tunnel_nic_del_existing(dev, utn, ti); + + udp_tunnel_nic_device_sync(dev, utn); +} + +static void __udp_tunnel_nic_reset_ntf(struct net_device *dev) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic *utn; + unsigned int i, j; + + ASSERT_RTNL(); + + utn = dev->udp_tunnel_nic; + if (!utn) + return; + + utn->need_sync = false; + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) { + struct udp_tunnel_nic_table_entry *entry; + + entry = &utn->entries[i][j]; + + entry->flags &= ~(UDP_TUNNEL_NIC_ENTRY_DEL | + UDP_TUNNEL_NIC_ENTRY_OP_FAIL); + /* We don't release rtnl across ops */ + WARN_ON(entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN); + if (!entry->use_cnt) + continue; + + udp_tunnel_nic_entry_queue(utn, entry, + UDP_TUNNEL_NIC_ENTRY_ADD); + } + + __udp_tunnel_nic_device_sync(dev, utn); +} + +static size_t +__udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic *utn; + unsigned int j; + size_t size; + + utn = dev->udp_tunnel_nic; + if (!utn) + return 0; + + size = 0; + for (j = 0; j < info->tables[table].n_entries; j++) { + if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) + continue; + + size += nla_total_size(0) + /* _TABLE_ENTRY */ + nla_total_size(sizeof(__be16)) + /* _ENTRY_PORT */ + nla_total_size(sizeof(u32)); /* _ENTRY_TYPE */ + } + + return size; +} + +static int +__udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table, + struct sk_buff *skb) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic *utn; + struct nlattr *nest; + unsigned int j; + + utn = dev->udp_tunnel_nic; + if (!utn) + return 0; + + for (j = 0; j < info->tables[table].n_entries; j++) { + if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) + continue; + + nest = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY); + + if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, + utn->entries[table][j].port) || + nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, + ilog2(utn->entries[table][j].type))) + goto err_cancel; + + nla_nest_end(skb, nest); + } + + return 0; + +err_cancel: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = { + .get_port = __udp_tunnel_nic_get_port, + .set_port_priv = __udp_tunnel_nic_set_port_priv, + .add_port = __udp_tunnel_nic_add_port, + .del_port = __udp_tunnel_nic_del_port, + .reset_ntf = __udp_tunnel_nic_reset_ntf, + .dump_size = __udp_tunnel_nic_dump_size, + .dump_write = __udp_tunnel_nic_dump_write, +}; + +static void +udp_tunnel_nic_flush(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + unsigned int i, j; + + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) { + int adj_cnt = -utn->entries[i][j].use_cnt; + + if (adj_cnt) + udp_tunnel_nic_entry_adj(utn, i, j, adj_cnt); + } + + __udp_tunnel_nic_device_sync(dev, utn); + + for (i = 0; i < utn->n_tables; i++) + memset(utn->entries[i], 0, array_size(info->tables[i].n_entries, + sizeof(**utn->entries))); + WARN_ON(utn->need_sync); + utn->need_replay = 0; +} + +static void +udp_tunnel_nic_replay(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + unsigned int i, j; + + /* Freeze all the ports we are already tracking so that the replay + * does not double up the refcount. + */ + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) + udp_tunnel_nic_entry_freeze_used(&utn->entries[i][j]); + utn->missed = 0; + utn->need_replay = 0; + + udp_tunnel_get_rx_info(dev); + + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) + udp_tunnel_nic_entry_unfreeze(&utn->entries[i][j]); +} + +static void udp_tunnel_nic_device_sync_work(struct work_struct *work) +{ + struct udp_tunnel_nic *utn = + container_of(work, struct udp_tunnel_nic, work); + + rtnl_lock(); + utn->work_pending = 0; + __udp_tunnel_nic_device_sync(utn->dev, utn); + + if (utn->need_replay) + udp_tunnel_nic_replay(utn->dev, utn); + rtnl_unlock(); +} + +static struct udp_tunnel_nic * +udp_tunnel_nic_alloc(const struct udp_tunnel_nic_info *info, + unsigned int n_tables) +{ + struct udp_tunnel_nic *utn; + unsigned int i; + + utn = kzalloc(sizeof(*utn), GFP_KERNEL); + if (!utn) + return NULL; + utn->n_tables = n_tables; + INIT_WORK(&utn->work, udp_tunnel_nic_device_sync_work); + + utn->entries = kmalloc_array(n_tables, sizeof(void *), GFP_KERNEL); + if (!utn->entries) + goto err_free_utn; + + for (i = 0; i < n_tables; i++) { + utn->entries[i] = kcalloc(info->tables[i].n_entries, + sizeof(*utn->entries[i]), GFP_KERNEL); + if (!utn->entries[i]) + goto err_free_prev_entries; + } + + return utn; + +err_free_prev_entries: + while (i--) + kfree(utn->entries[i]); + kfree(utn->entries); +err_free_utn: + kfree(utn); + return NULL; +} + +static int udp_tunnel_nic_register(struct net_device *dev) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic *utn; + unsigned int n_tables, i; + + BUILD_BUG_ON(sizeof(utn->missed) * BITS_PER_BYTE < + UDP_TUNNEL_NIC_MAX_TABLES); + + if (WARN_ON(!info->set_port != !info->unset_port) || + WARN_ON(!info->set_port == !info->sync_table) || + WARN_ON(!info->tables[0].n_entries)) + return -EINVAL; + + n_tables = 1; + for (i = 1; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { + if (!info->tables[i].n_entries) + continue; + + n_tables++; + if (WARN_ON(!info->tables[i - 1].n_entries)) + return -EINVAL; + } + + utn = udp_tunnel_nic_alloc(info, n_tables); + if (!utn) + return -ENOMEM; + + utn->dev = dev; + dev_hold(dev); + dev->udp_tunnel_nic = utn; + + if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) + udp_tunnel_get_rx_info(dev); + + return 0; +} + +static void +udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + unsigned int i; + + /* Flush before we check work, so we don't waste time adding entries + * from the work which we will boot immediately. + */ + udp_tunnel_nic_flush(dev, utn); + + /* Wait for the work to be done using the state, netdev core will + * retry unregister until we give up our reference on this device. + */ + if (utn->work_pending) + return; + + for (i = 0; i < utn->n_tables; i++) + kfree(utn->entries[i]); + kfree(utn->entries); + kfree(utn); + dev->udp_tunnel_nic = NULL; + dev_put(dev); +} + +static int +udp_tunnel_nic_netdevice_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + const struct udp_tunnel_nic_info *info; + struct udp_tunnel_nic *utn; + + info = dev->udp_tunnel_nic_info; + if (!info) + return NOTIFY_DONE; + + if (event == NETDEV_REGISTER) { + int err; + + err = udp_tunnel_nic_register(dev); + if (err) + netdev_WARN(dev, "failed to register for UDP tunnel offloads: %d", err); + return notifier_from_errno(err); + } + /* All other events will need the udp_tunnel_nic state */ + utn = dev->udp_tunnel_nic; + if (!utn) + return NOTIFY_DONE; + + if (event == NETDEV_UNREGISTER) { + udp_tunnel_nic_unregister(dev, utn); + return NOTIFY_OK; + } + + /* All other events only matter if NIC has to be programmed open */ + if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) + return NOTIFY_DONE; + + if (event == NETDEV_UP) { + WARN_ON(!udp_tunnel_nic_is_empty(dev, utn)); + udp_tunnel_get_rx_info(dev); + return NOTIFY_OK; + } + if (event == NETDEV_GOING_DOWN) { + udp_tunnel_nic_flush(dev, utn); + return NOTIFY_OK; + } + + return NOTIFY_DONE; +} + +static struct notifier_block udp_tunnel_nic_notifier_block __read_mostly = { + .notifier_call = udp_tunnel_nic_netdevice_event, +}; + +static int __init udp_tunnel_nic_init_module(void) +{ + int err; + + udp_tunnel_nic_workqueue = alloc_workqueue("udp_tunnel_nic", 0, 0); + if (!udp_tunnel_nic_workqueue) + return -ENOMEM; + + rtnl_lock(); + udp_tunnel_nic_ops = &__udp_tunnel_nic_ops; + rtnl_unlock(); + + err = register_netdevice_notifier(&udp_tunnel_nic_notifier_block); + if (err) + goto err_unset_ops; + + return 0; + +err_unset_ops: + rtnl_lock(); + udp_tunnel_nic_ops = NULL; + rtnl_unlock(); + destroy_workqueue(udp_tunnel_nic_workqueue); + return err; +} +late_initcall(udp_tunnel_nic_init_module); + +static void __exit udp_tunnel_nic_cleanup_module(void) +{ + unregister_netdevice_notifier(&udp_tunnel_nic_notifier_block); + + rtnl_lock(); + udp_tunnel_nic_ops = NULL; + rtnl_unlock(); + + destroy_workqueue(udp_tunnel_nic_workqueue); +} +module_exit(udp_tunnel_nic_cleanup_module); + +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/udp_tunnel_stub.c b/net/ipv4/udp_tunnel_stub.c new file mode 100644 index 000000000000..c4b2888f5fef --- /dev/null +++ b/net/ipv4/udp_tunnel_stub.c @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (c) 2020 Facebook Inc. + +#include <net/udp_tunnel.h> + +const struct udp_tunnel_nic_ops *udp_tunnel_nic_ops; +EXPORT_SYMBOL_GPL(udp_tunnel_nic_ops); diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 5936d66d1ce2..bd8773b49e72 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -56,10 +56,6 @@ struct proto udplite_prot = { .sysctl_mem = sysctl_udp_mem, .obj_size = sizeof(struct udp_sock), .h.udp_table = &udplite_table, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_udp_setsockopt, - .compat_getsockopt = compat_udp_getsockopt, -#endif }; EXPORT_SYMBOL(udplite_prot); diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index f4f19e89af5e..76bff79d6fed 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -14,7 +14,7 @@ menuconfig IPV6 <https://en.wikipedia.org/wiki/IPv6>. For specific information about IPv6 under Linux, see Documentation/networking/ipv6.rst and read the HOWTO at - <http://www.tldp.org/HOWTO/Linux+IPv6-HOWTO/> + <https://www.tldp.org/HOWTO/Linux+IPv6-HOWTO/> To compile this protocol support as a module, choose M here: the module will be called ipv6. diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 840bfdb3d7bd..8e761b8c47c6 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -163,7 +163,7 @@ static void addrconf_leave_anycast(struct inet6_ifaddr *ifp); static void addrconf_type_change(struct net_device *dev, unsigned long event); -static int addrconf_ifdown(struct net_device *dev, int how); +static int addrconf_ifdown(struct net_device *dev, bool unregister); static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, int plen, @@ -1983,6 +1983,45 @@ int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev) } EXPORT_SYMBOL(ipv6_chk_prefix); +/** + * ipv6_dev_find - find the first device with a given source address. + * @net: the net namespace + * @addr: the source address + * + * The caller should be protected by RCU, or RTNL. + */ +struct net_device *ipv6_dev_find(struct net *net, const struct in6_addr *addr) +{ + unsigned int hash = inet6_addr_hash(net, addr); + struct inet6_ifaddr *ifp, *result = NULL; + struct net_device *dev = NULL; + + rcu_read_lock(); + hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) { + if (net_eq(dev_net(ifp->idev->dev), net) && + ipv6_addr_equal(&ifp->addr, addr)) { + result = ifp; + break; + } + } + + if (!result) { + struct rt6_info *rt; + + rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); + if (rt) { + dev = rt->dst.dev; + ip6_rt_put(rt); + } + } else { + dev = result->idev->dev; + } + rcu_read_unlock(); + + return dev; +} +EXPORT_SYMBOL(ipv6_dev_find); + struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *addr, struct net_device *dev, int strict) { @@ -3630,7 +3669,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, * an L3 master device (e.g., VRF) */ if (info->upper_dev && netif_is_l3_master(info->upper_dev)) - addrconf_ifdown(dev, 0); + addrconf_ifdown(dev, false); } return NOTIFY_OK; @@ -3663,9 +3702,9 @@ static bool addr_is_local(const struct in6_addr *addr) (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } -static int addrconf_ifdown(struct net_device *dev, int how) +static int addrconf_ifdown(struct net_device *dev, bool unregister) { - unsigned long event = how ? NETDEV_UNREGISTER : NETDEV_DOWN; + unsigned long event = unregister ? NETDEV_UNREGISTER : NETDEV_DOWN; struct net *net = dev_net(dev); struct inet6_dev *idev; struct inet6_ifaddr *ifa, *tmp; @@ -3684,7 +3723,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) * Step 1: remove reference to ipv6 device from parent device. * Do not dev_put! */ - if (how) { + if (unregister) { idev->dead = 1; /* protected by rtnl_lock */ @@ -3698,7 +3737,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) /* combine the user config with event to determine if permanent * addresses are to be removed from address hash table */ - if (!how && !idev->cnf.disable_ipv6) { + if (!unregister && !idev->cnf.disable_ipv6) { /* aggregate the system setting and interface setting */ int _keep_addr = net->ipv6.devconf_all->keep_addr_on_down; @@ -3736,7 +3775,7 @@ restart: addrconf_del_rs_timer(idev); /* Step 2: clear flags for stateless addrconf */ - if (!how) + if (!unregister) idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY); /* Step 3: clear tempaddr list */ @@ -3806,7 +3845,7 @@ restart: write_unlock_bh(&idev->lock); /* Step 5: Discard anycast and multicast list */ - if (how) { + if (unregister) { ipv6_ac_destroy_dev(idev); ipv6_mc_destroy_dev(idev); } else { @@ -3816,7 +3855,7 @@ restart: idev->tstamp = jiffies; /* Last: Shot the device (if unregistered) */ - if (how) { + if (unregister) { addrconf_sysctl_unregister(idev); neigh_parms_release(&nd_tbl, idev->nd_parms); neigh_ifdown(&nd_tbl, dev); @@ -4038,7 +4077,7 @@ static void addrconf_dad_work(struct work_struct *w) in6_ifa_hold(ifp); addrconf_dad_stop(ifp, 1); if (disable_ipv6) - addrconf_ifdown(idev->dev, 0); + addrconf_ifdown(idev->dev, false); goto out; } @@ -7187,9 +7226,9 @@ void addrconf_cleanup(void) for_each_netdev(&init_net, dev) { if (__in6_dev_get(dev) == NULL) continue; - addrconf_ifdown(dev, 1); + addrconf_ifdown(dev, true); } - addrconf_ifdown(init_net.loopback_dev, 2); + addrconf_ifdown(init_net.loopback_dev, true); /* * Check hash table. diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b304b882e031..0306509ab063 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -688,8 +688,6 @@ const struct proto_ops inet6_stream_ops = { .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, #endif .set_rcvlowat = tcp_set_rcvlowat, }; @@ -717,8 +715,6 @@ const struct proto_ops inet6_dgram_ops = { .set_peek_off = sk_set_peek_off, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, #endif }; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 390bedde21a5..cc8ad7ddecda 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -19,6 +19,7 @@ #include <linux/route.h> #include <linux/slab.h> #include <linux/export.h> +#include <linux/icmp.h> #include <net/ipv6.h> #include <net/ndisc.h> @@ -284,6 +285,17 @@ int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *uaddr, } EXPORT_SYMBOL_GPL(ip6_datagram_connect_v6_only); +static void ipv6_icmp_error_rfc4884(const struct sk_buff *skb, + struct sock_ee_data_rfc4884 *out) +{ + switch (icmp6_hdr(skb)->icmp6_type) { + case ICMPV6_TIME_EXCEED: + case ICMPV6_DEST_UNREACH: + ip_icmp_error_rfc4884(skb, out, sizeof(struct icmp6hdr), + icmp6_hdr(skb)->icmp6_datagram_len * 8); + } +} + void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) { @@ -313,6 +325,10 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, serr->port = port; __skb_pull(skb, payload - skb->data); + + if (inet6_sk(sk)->recverr_rfc4884) + ipv6_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884); + skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb)) diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 5a8bbcdcaf2b..374105e4394f 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -580,7 +580,7 @@ looped_back: hdr->segments_left--; i = n - hdr->segments_left; - buf = kzalloc(ipv6_rpl_srh_alloc_size(n + 1) * 2, GFP_ATOMIC); + buf = kcalloc(struct_size(hdr, segments.addr, n + 2), 2, GFP_ATOMIC); if (unlikely(!buf)) { kfree_skb(skb); return -1; @@ -1232,7 +1232,6 @@ static void ipv6_renew_option(int renewtype, * @opt: original options * @newtype: option type to replace in @opt * @newopt: new option of type @newtype to replace (user-mem) - * @newoptlen: length of @newopt * * Returns a new set of options which is a copy of @opt with the * option type @newtype replaced with @newopt. diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index fafe556d21e0..8f9a83314de7 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -13,6 +13,7 @@ #include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/export.h> +#include <linux/indirect_call_wrapper.h> #include <net/fib_rules.h> #include <net/ipv6.h> @@ -111,11 +112,13 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, } else { struct rt6_info *rt; - rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, skb, flags); + rt = pol_lookup_func(lookup, + net, net->ipv6.fib6_local_tbl, fl6, skb, flags); if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN) return &rt->dst; ip6_rt_put_flags(rt, flags); - rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); + rt = pol_lookup_func(lookup, + net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error != -EAGAIN) return &rt->dst; ip6_rt_put_flags(rt, flags); @@ -226,7 +229,8 @@ static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp, goto out; } - rt = lookup(net, table, flp6, arg->lookup_data, flags); + rt = pol_lookup_func(lookup, + net, table, flp6, arg->lookup_data, flags); if (rt != net->ipv6.ip6_null_entry) { err = fib6_rule_saddr(net, rule, flags, flp6, ip6_dst_idev(&rt->dst)->dev); @@ -252,8 +256,9 @@ out: return err; } -static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, - int flags, struct fib_lookup_arg *arg) +INDIRECT_CALLABLE_SCOPE int fib6_rule_action(struct fib_rule *rule, + struct flowi *flp, int flags, + struct fib_lookup_arg *arg) { if (arg->lookup_ptr == fib6_table_lookup) return fib6_rule_action_alt(rule, flp, flags, arg); @@ -261,7 +266,8 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, return __fib6_rule_action(rule, flp, flags, arg); } -static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) +INDIRECT_CALLABLE_SCOPE bool fib6_rule_suppress(struct fib_rule *rule, + struct fib_lookup_arg *arg) { struct fib6_result *res = arg->result; struct rt6_info *rt = res->rt6; @@ -293,7 +299,8 @@ suppress_route: return true; } -static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) +INDIRECT_CALLABLE_SCOPE int fib6_rule_match(struct fib_rule *rule, + struct flowi *fl, int flags) { struct fib6_rule *r = (struct fib6_rule *) rule; struct flowi6 *fl6 = &fl->u.ip6; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 9df8737ae0d3..a4e4912ad607 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -439,8 +439,8 @@ static int icmp6_iif(const struct sk_buff *skb) /* * Send an ICMP message in response to a packet in error */ -static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, - const struct in6_addr *force_saddr) +void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, + const struct in6_addr *force_saddr) { struct inet6_dev *idev = NULL; struct ipv6hdr *hdr = ipv6_hdr(skb); @@ -625,6 +625,7 @@ out: out_bh_enable: local_bh_enable(); } +EXPORT_SYMBOL(icmp6_send); /* Slightly more convenient version of icmp6_send. */ diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index fbe9d4295eac..2d3add9e6116 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -21,6 +21,8 @@ #include <net/ip.h> #include <net/sock_reuseport.h> +extern struct inet_hashinfo tcp_hashinfo; + u32 inet6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, const struct in6_addr *faddr, const __be16 fport) @@ -111,6 +113,23 @@ static inline int compute_score(struct sock *sk, struct net *net, return score; } +static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + __be16 sport, + const struct in6_addr *daddr, + unsigned short hnum) +{ + struct sock *reuse_sk = NULL; + u32 phash; + + if (sk->sk_reuseport) { + phash = inet6_ehashfn(net, daddr, hnum, saddr, sport); + reuse_sk = reuseport_select_sock(sk, phash, skb, doff); + } + return reuse_sk; +} + /* called with rcu_read_lock() */ static struct sock *inet6_lhash2_lookup(struct net *net, struct inet_listen_hashbucket *ilb2, @@ -123,21 +142,17 @@ static struct sock *inet6_lhash2_lookup(struct net *net, struct inet_connection_sock *icsk; struct sock *sk, *result = NULL; int score, hiscore = 0; - u32 phash = 0; inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { sk = (struct sock *)icsk; score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif); if (score > hiscore) { - if (sk->sk_reuseport) { - phash = inet6_ehashfn(net, daddr, hnum, - saddr, sport); - result = reuseport_select_sock(sk, phash, - skb, doff); - if (result) - return result; - } + result = lookup_reuseport(net, sk, skb, doff, + saddr, sport, daddr, hnum); + if (result) + return result; + result = sk; hiscore = score; } @@ -146,6 +161,31 @@ static struct sock *inet6_lhash2_lookup(struct net *net, return result; } +static inline struct sock *inet6_lookup_run_bpf(struct net *net, + struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + const __be16 sport, + const struct in6_addr *daddr, + const u16 hnum) +{ + struct sock *sk, *reuse_sk; + bool no_reuseport; + + if (hashinfo != &tcp_hashinfo) + return NULL; /* only TCP is supported */ + + no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_TCP, + saddr, sport, daddr, hnum, &sk); + if (no_reuseport || IS_ERR_OR_NULL(sk)) + return sk; + + reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); + if (reuse_sk) + sk = reuse_sk; + return sk; +} + struct sock *inet6_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -157,6 +197,14 @@ struct sock *inet6_lookup_listener(struct net *net, struct sock *result = NULL; unsigned int hash2; + /* Lookup redirect from BPF */ + if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { + result = inet6_lookup_run_bpf(net, hashinfo, skb, doff, + saddr, sport, daddr, hnum); + if (result) + goto done; + } + hash2 = ipv6_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 49ee89bbcba0..25a90f3f705c 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -314,7 +314,8 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, { struct rt6_info *rt; - rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); + rt = pol_lookup_func(lookup, + net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error == -EAGAIN) { ip6_rt_put_flags(rt, flags); rt = net->ipv6.ip6_null_entry; diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 73bb047e6037..aa673a6a7e43 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -371,7 +371,7 @@ static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger, unsigned lo static struct ip6_flowlabel * fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, - char __user *optval, int optlen, int *err_p) + sockptr_t optval, int optlen, int *err_p) { struct ip6_flowlabel *fl = NULL; int olen; @@ -401,7 +401,8 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, memset(fl->opt, 0, sizeof(*fl->opt)); fl->opt->tot_len = sizeof(*fl->opt) + olen; err = -EFAULT; - if (copy_from_user(fl->opt+1, optval+CMSG_ALIGN(sizeof(*freq)), olen)) + if (copy_from_sockptr_offset(fl->opt + 1, optval, + CMSG_ALIGN(sizeof(*freq)), olen)) goto done; msg.msg_controllen = olen; @@ -533,187 +534,212 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, return -ENOENT; } -int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) +#define socklist_dereference(__sflp) \ + rcu_dereference_protected(__sflp, lockdep_is_held(&ip6_sk_fl_lock)) + +static int ipv6_flowlabel_put(struct sock *sk, struct in6_flowlabel_req *freq) { - int err; - struct net *net = sock_net(sk); struct ipv6_pinfo *np = inet6_sk(sk); - struct in6_flowlabel_req freq; - struct ipv6_fl_socklist *sfl1 = NULL; - struct ipv6_fl_socklist *sfl; struct ipv6_fl_socklist __rcu **sflp; - struct ip6_flowlabel *fl, *fl1 = NULL; + struct ipv6_fl_socklist *sfl; + if (freq->flr_flags & IPV6_FL_F_REFLECT) { + if (sk->sk_protocol != IPPROTO_TCP) + return -ENOPROTOOPT; + if (!np->repflow) + return -ESRCH; + np->flow_label = 0; + np->repflow = 0; + return 0; + } - if (optlen < sizeof(freq)) - return -EINVAL; + spin_lock_bh(&ip6_sk_fl_lock); + for (sflp = &np->ipv6_fl_list; + (sfl = socklist_dereference(*sflp)) != NULL; + sflp = &sfl->next) { + if (sfl->fl->label == freq->flr_label) + goto found; + } + spin_unlock_bh(&ip6_sk_fl_lock); + return -ESRCH; +found: + if (freq->flr_label == (np->flow_label & IPV6_FLOWLABEL_MASK)) + np->flow_label &= ~IPV6_FLOWLABEL_MASK; + *sflp = sfl->next; + spin_unlock_bh(&ip6_sk_fl_lock); + fl_release(sfl->fl); + kfree_rcu(sfl, rcu); + return 0; +} - if (copy_from_user(&freq, optval, sizeof(freq))) - return -EFAULT; +static int ipv6_flowlabel_renew(struct sock *sk, struct in6_flowlabel_req *freq) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); + struct ipv6_fl_socklist *sfl; + int err; - switch (freq.flr_action) { - case IPV6_FL_A_PUT: - if (freq.flr_flags & IPV6_FL_F_REFLECT) { - if (sk->sk_protocol != IPPROTO_TCP) - return -ENOPROTOOPT; - if (!np->repflow) - return -ESRCH; - np->flow_label = 0; - np->repflow = 0; - return 0; - } - spin_lock_bh(&ip6_sk_fl_lock); - for (sflp = &np->ipv6_fl_list; - (sfl = rcu_dereference_protected(*sflp, - lockdep_is_held(&ip6_sk_fl_lock))) != NULL; - sflp = &sfl->next) { - if (sfl->fl->label == freq.flr_label) { - if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK)) - np->flow_label &= ~IPV6_FLOWLABEL_MASK; - *sflp = sfl->next; - spin_unlock_bh(&ip6_sk_fl_lock); - fl_release(sfl->fl); - kfree_rcu(sfl, rcu); - return 0; - } + rcu_read_lock_bh(); + for_each_sk_fl_rcu(np, sfl) { + if (sfl->fl->label == freq->flr_label) { + err = fl6_renew(sfl->fl, freq->flr_linger, + freq->flr_expires); + rcu_read_unlock_bh(); + return err; } - spin_unlock_bh(&ip6_sk_fl_lock); - return -ESRCH; + } + rcu_read_unlock_bh(); - case IPV6_FL_A_RENEW: - rcu_read_lock_bh(); - for_each_sk_fl_rcu(np, sfl) { - if (sfl->fl->label == freq.flr_label) { - err = fl6_renew(sfl->fl, freq.flr_linger, freq.flr_expires); - rcu_read_unlock_bh(); - return err; - } - } - rcu_read_unlock_bh(); + if (freq->flr_share == IPV6_FL_S_NONE && + ns_capable(net->user_ns, CAP_NET_ADMIN)) { + struct ip6_flowlabel *fl = fl_lookup(net, freq->flr_label); - if (freq.flr_share == IPV6_FL_S_NONE && - ns_capable(net->user_ns, CAP_NET_ADMIN)) { - fl = fl_lookup(net, freq.flr_label); - if (fl) { - err = fl6_renew(fl, freq.flr_linger, freq.flr_expires); - fl_release(fl); - return err; - } + if (fl) { + err = fl6_renew(fl, freq->flr_linger, + freq->flr_expires); + fl_release(fl); + return err; } - return -ESRCH; - - case IPV6_FL_A_GET: - if (freq.flr_flags & IPV6_FL_F_REFLECT) { - struct net *net = sock_net(sk); - if (net->ipv6.sysctl.flowlabel_consistency) { - net_info_ratelimited("Can not set IPV6_FL_F_REFLECT if flowlabel_consistency sysctl is enable\n"); - return -EPERM; - } + } + return -ESRCH; +} - if (sk->sk_protocol != IPPROTO_TCP) - return -ENOPROTOOPT; +static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq, + sockptr_t optval, int optlen) +{ + struct ipv6_fl_socklist *sfl, *sfl1 = NULL; + struct ip6_flowlabel *fl, *fl1 = NULL; + struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); + int err; - np->repflow = 1; - return 0; + if (freq->flr_flags & IPV6_FL_F_REFLECT) { + if (net->ipv6.sysctl.flowlabel_consistency) { + net_info_ratelimited("Can not set IPV6_FL_F_REFLECT if flowlabel_consistency sysctl is enable\n"); + return -EPERM; } - if (freq.flr_label & ~IPV6_FLOWLABEL_MASK) - return -EINVAL; + if (sk->sk_protocol != IPPROTO_TCP) + return -ENOPROTOOPT; + np->repflow = 1; + return 0; + } - if (net->ipv6.sysctl.flowlabel_state_ranges && - (freq.flr_label & IPV6_FLOWLABEL_STATELESS_FLAG)) - return -ERANGE; + if (freq->flr_label & ~IPV6_FLOWLABEL_MASK) + return -EINVAL; + if (net->ipv6.sysctl.flowlabel_state_ranges && + (freq->flr_label & IPV6_FLOWLABEL_STATELESS_FLAG)) + return -ERANGE; - fl = fl_create(net, sk, &freq, optval, optlen, &err); - if (!fl) - return err; - sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL); + fl = fl_create(net, sk, freq, optval, optlen, &err); + if (!fl) + return err; - if (freq.flr_label) { - err = -EEXIST; - rcu_read_lock_bh(); - for_each_sk_fl_rcu(np, sfl) { - if (sfl->fl->label == freq.flr_label) { - if (freq.flr_flags&IPV6_FL_F_EXCL) { - rcu_read_unlock_bh(); - goto done; - } - fl1 = sfl->fl; - if (!atomic_inc_not_zero(&fl1->users)) - fl1 = NULL; - break; + sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL); + + if (freq->flr_label) { + err = -EEXIST; + rcu_read_lock_bh(); + for_each_sk_fl_rcu(np, sfl) { + if (sfl->fl->label == freq->flr_label) { + if (freq->flr_flags & IPV6_FL_F_EXCL) { + rcu_read_unlock_bh(); + goto done; } + fl1 = sfl->fl; + if (!atomic_inc_not_zero(&fl1->users)) + fl1 = NULL; + break; } - rcu_read_unlock_bh(); + } + rcu_read_unlock_bh(); - if (!fl1) - fl1 = fl_lookup(net, freq.flr_label); - if (fl1) { + if (!fl1) + fl1 = fl_lookup(net, freq->flr_label); + if (fl1) { recheck: - err = -EEXIST; - if (freq.flr_flags&IPV6_FL_F_EXCL) - goto release; - err = -EPERM; - if (fl1->share == IPV6_FL_S_EXCL || - fl1->share != fl->share || - ((fl1->share == IPV6_FL_S_PROCESS) && - (fl1->owner.pid != fl->owner.pid)) || - ((fl1->share == IPV6_FL_S_USER) && - !uid_eq(fl1->owner.uid, fl->owner.uid))) - goto release; - - err = -ENOMEM; - if (!sfl1) - goto release; - if (fl->linger > fl1->linger) - fl1->linger = fl->linger; - if ((long)(fl->expires - fl1->expires) > 0) - fl1->expires = fl->expires; - fl_link(np, sfl1, fl1); - fl_free(fl); - return 0; + err = -EEXIST; + if (freq->flr_flags&IPV6_FL_F_EXCL) + goto release; + err = -EPERM; + if (fl1->share == IPV6_FL_S_EXCL || + fl1->share != fl->share || + ((fl1->share == IPV6_FL_S_PROCESS) && + (fl1->owner.pid != fl->owner.pid)) || + ((fl1->share == IPV6_FL_S_USER) && + !uid_eq(fl1->owner.uid, fl->owner.uid))) + goto release; + + err = -ENOMEM; + if (!sfl1) + goto release; + if (fl->linger > fl1->linger) + fl1->linger = fl->linger; + if ((long)(fl->expires - fl1->expires) > 0) + fl1->expires = fl->expires; + fl_link(np, sfl1, fl1); + fl_free(fl); + return 0; release: - fl_release(fl1); - goto done; - } - } - err = -ENOENT; - if (!(freq.flr_flags&IPV6_FL_F_CREATE)) - goto done; - - err = -ENOMEM; - if (!sfl1) + fl_release(fl1); goto done; + } + } + err = -ENOENT; + if (!(freq->flr_flags & IPV6_FL_F_CREATE)) + goto done; - err = mem_check(sk); - if (err != 0) - goto done; + err = -ENOMEM; + if (!sfl1) + goto done; - fl1 = fl_intern(net, fl, freq.flr_label); - if (fl1) - goto recheck; + err = mem_check(sk); + if (err != 0) + goto done; - if (!freq.flr_label) { - if (copy_to_user(&((struct in6_flowlabel_req __user *) optval)->flr_label, - &fl->label, sizeof(fl->label))) { - /* Intentionally ignore fault. */ - } - } + fl1 = fl_intern(net, fl, freq->flr_label); + if (fl1) + goto recheck; - fl_link(np, sfl1, fl); - return 0; + if (!freq->flr_label) { + size_t offset = offsetof(struct in6_flowlabel_req, flr_label); - default: - return -EINVAL; + if (copy_to_sockptr_offset(optval, offset, &fl->label, + sizeof(fl->label))) { + /* Intentionally ignore fault. */ + } } + fl_link(np, sfl1, fl); + return 0; done: fl_free(fl); kfree(sfl1); return err; } +int ipv6_flowlabel_opt(struct sock *sk, sockptr_t optval, int optlen) +{ + struct in6_flowlabel_req freq; + + if (optlen < sizeof(freq)) + return -EINVAL; + if (copy_from_sockptr(&freq, optval, sizeof(freq))) + return -EFAULT; + + switch (freq.flr_action) { + case IPV6_FL_A_PUT: + return ipv6_flowlabel_put(sk, &freq); + case IPV6_FL_A_RENEW: + return ipv6_flowlabel_renew(sk, &freq); + case IPV6_FL_A_GET: + return ipv6_flowlabel_get(sk, &freq, optval, optlen); + default: + return -EINVAL; + } +} + #ifdef CONFIG_PROC_FS struct ip6fl_iter_state { diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c index e0086758b6ee..70c8c2f36c98 100644 --- a/net/ipv6/ip6_icmp.c +++ b/net/ipv6/ip6_icmp.c @@ -9,6 +9,8 @@ #if IS_ENABLED(CONFIG_IPV6) +#if !IS_BUILTIN(CONFIG_IPV6) + static ip6_icmp_send_t __rcu *ip6_icmp_send; int inet6_register_icmp_sender(ip6_icmp_send_t *fn) @@ -37,14 +39,12 @@ void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) rcu_read_lock(); send = rcu_dereference(ip6_icmp_send); - - if (!send) - goto out; - send(skb, type, code, info, NULL); -out: + if (send) + send(skb, type, code, info, NULL); rcu_read_unlock(); } EXPORT_SYMBOL(icmpv6_send); +#endif #if IS_ENABLED(CONFIG_NF_NAT) #include <net/netfilter/nf_conntrack.h> diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 7fbb44736a34..a80f90bf3ae7 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -13,6 +13,8 @@ #include <net/protocol.h> #include <net/ipv6.h> #include <net/inet_common.h> +#include <net/tcp.h> +#include <net/udp.h> #include "ip6_offload.h" @@ -177,10 +179,6 @@ static int ipv6_exthdrs_len(struct ipv6hdr *iph, return len; } -INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp6_gro_receive(struct list_head *, - struct sk_buff *)); -INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp6_gro_receive(struct list_head *, - struct sk_buff *)); INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, struct sk_buff *skb) { @@ -319,8 +317,6 @@ static struct sk_buff *ip4ip6_gro_receive(struct list_head *head, return inet_gro_receive(head, skb); } -INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *, int)); -INDIRECT_CALLABLE_DECLARE(int udp6_gro_complete(struct sk_buff *, int)); INDIRECT_CALLABLE_SCOPE int ipv6_gro_complete(struct sk_buff *skb, int nhoff) { const struct net_offload *ops; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 8a8c2d0cfcc8..c78e67d7747f 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1118,6 +1118,7 @@ out_err_release: /** * ip6_dst_lookup - perform route lookup on flow + * @net: Network namespace to perform lookup in * @sk: socket which provides route info * @dst: pointer to dst_entry * for result * @fl6: flow to lookup @@ -1136,6 +1137,7 @@ EXPORT_SYMBOL_GPL(ip6_dst_lookup); /** * ip6_dst_lookup_flow - perform route lookup on flow with ipsec + * @net: Network namespace to perform lookup in * @sk: socket which provides route info * @fl6: flow to lookup * @final_dst: final destination address for ipsec lookup @@ -1202,11 +1204,11 @@ EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow); * @skb: Packet for which lookup is done * @dev: Tunnel device * @net: Network namespace of tunnel device - * @sk: Socket which provides route info + * @sock: Socket which provides route info * @saddr: Memory to store the src ip address * @info: Tunnel information * @protocol: IP protocol - * @use_cahce: Flag to enable cache usage + * @use_cache: Flag to enable cache usage * This function performs a route lookup on a tunnel * * It returns a valid dst pointer and stores src address to be used in diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index a18c378ca5f4..f635914f42ec 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -124,8 +124,12 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) return &dev->stats; } +#define for_each_ip6_tunnel_rcu(start) \ + for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) + /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses + * @net: network namespace * @link: ifindex of underlying interface * @remote: the address of the tunnel exit-point * @local: the address of the tunnel entry-point @@ -136,9 +140,6 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) * else %NULL **/ -#define for_each_ip6_tunnel_rcu(start) \ - for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) - static struct ip6_tnl * ip6_tnl_lookup(struct net *net, int link, const struct in6_addr *remote, const struct in6_addr *local) @@ -302,8 +303,8 @@ out: /** * ip6_tnl_create - create a new tunnel + * @net: network namespace * @p: tunnel parameters - * @pt: pointer to new tunnel * * Description: * Create tunnel matching given parameters. @@ -351,6 +352,7 @@ failed: /** * ip6_tnl_locate - find or create tunnel matching given parameters + * @net: network namespace * @p: tunnel parameters * @create: != 0 if allowed to create new tunnel if no match found * diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 0d964160a9dd..fac01b80a104 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -491,13 +491,16 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) } dst_hold(dst); - dst = xfrm_lookup(t->net, dst, fl, NULL, 0); + dst = xfrm_lookup_route(t->net, dst, fl, NULL, 0); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; goto tx_err_link_failure; } + if (dst->flags & DST_XFRM_QUEUE) + goto queued; + x = dst->xfrm; if (!vti6_state_check(x, &t->parms.raddr, &t->parms.laddr)) goto tx_err_link_failure; @@ -533,6 +536,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) goto tx_err_dst_release; } +queued: skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); skb_dst_set(skb, dst); skb->dev = skb_dst(skb)->dev; @@ -1219,6 +1223,33 @@ static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = { .priority = 100, }; +#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) +static int vti6_rcv_tunnel(struct sk_buff *skb) +{ + const xfrm_address_t *saddr; + __be32 spi; + + saddr = (const xfrm_address_t *)&ipv6_hdr(skb)->saddr; + spi = xfrm6_tunnel_spi_lookup(dev_net(skb->dev), saddr); + + return vti6_input_proto(skb, IPPROTO_IPV6, spi, 0); +} + +static struct xfrm6_tunnel vti_ipv6_handler __read_mostly = { + .handler = vti6_rcv_tunnel, + .cb_handler = vti6_rcv_cb, + .err_handler = vti6_err, + .priority = 0, +}; + +static struct xfrm6_tunnel vti_ip6ip_handler __read_mostly = { + .handler = vti6_rcv_tunnel, + .cb_handler = vti6_rcv_cb, + .err_handler = vti6_err, + .priority = 0, +}; +#endif + /** * vti6_tunnel_init - register protocol and reserve needed resources * @@ -1244,6 +1275,15 @@ static int __init vti6_tunnel_init(void) err = xfrm6_protocol_register(&vti_ipcomp6_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; +#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) + msg = "ipv6 tunnel"; + err = xfrm6_tunnel_register(&vti_ipv6_handler, AF_INET6); + if (err < 0) + goto vti_tunnel_ipv6_failed; + err = xfrm6_tunnel_register(&vti_ip6ip_handler, AF_INET); + if (err < 0) + goto vti_tunnel_ip6ip_failed; +#endif msg = "netlink interface"; err = rtnl_link_register(&vti6_link_ops); @@ -1253,6 +1293,12 @@ static int __init vti6_tunnel_init(void) return 0; rtnl_link_failed: +#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) + err = xfrm6_tunnel_deregister(&vti_ip6ip_handler, AF_INET); +vti_tunnel_ip6ip_failed: + err = xfrm6_tunnel_deregister(&vti_ipv6_handler, AF_INET6); +vti_tunnel_ipv6_failed: +#endif xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP); xfrm_proto_comp_failed: xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); @@ -1271,6 +1317,10 @@ pernet_dev_failed: static void __exit vti6_tunnel_cleanup(void) { rtnl_link_unregister(&vti6_link_ops); +#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) + xfrm6_tunnel_deregister(&vti_ip6ip_handler, AF_INET); + xfrm6_tunnel_deregister(&vti_ipv6_handler, AF_INET6); +#endif xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP); xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 1f4d20e97c07..06b0d2c329b9 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1629,7 +1629,8 @@ EXPORT_SYMBOL(mroute6_is_socket); * MOSPF/PIM router set up we can clean this up. */ -int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen) +int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, + unsigned int optlen) { int ret, parent = 0; struct mif6ctl vif; @@ -1665,7 +1666,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns case MRT6_ADD_MIF: if (optlen < sizeof(vif)) return -EINVAL; - if (copy_from_user(&vif, optval, sizeof(vif))) + if (copy_from_sockptr(&vif, optval, sizeof(vif))) return -EFAULT; if (vif.mif6c_mifi >= MAXMIFS) return -ENFILE; @@ -1678,7 +1679,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns case MRT6_DEL_MIF: if (optlen < sizeof(mifi_t)) return -EINVAL; - if (copy_from_user(&mifi, optval, sizeof(mifi_t))) + if (copy_from_sockptr(&mifi, optval, sizeof(mifi_t))) return -EFAULT; rtnl_lock(); ret = mif6_delete(mrt, mifi, 0, NULL); @@ -1697,7 +1698,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns case MRT6_DEL_MFC_PROXY: if (optlen < sizeof(mfc)) return -EINVAL; - if (copy_from_user(&mfc, optval, sizeof(mfc))) + if (copy_from_sockptr(&mfc, optval, sizeof(mfc))) return -EFAULT; if (parent == 0) parent = mfc.mf6cc_parent; @@ -1718,7 +1719,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns if (optlen != sizeof(flags)) return -EINVAL; - if (get_user(flags, (int __user *)optval)) + if (copy_from_sockptr(&flags, optval, sizeof(flags))) return -EFAULT; rtnl_lock(); mroute_clean_tables(mrt, flags); @@ -1735,7 +1736,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns if (optlen != sizeof(v)) return -EINVAL; - if (get_user(v, (int __user *)optval)) + if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; mrt->mroute_do_assert = v; return 0; @@ -1748,7 +1749,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns if (optlen != sizeof(v)) return -EINVAL; - if (get_user(v, (int __user *)optval)) + if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; v = !!v; rtnl_lock(); @@ -1769,7 +1770,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns if (optlen != sizeof(u32)) return -EINVAL; - if (get_user(v, (u32 __user *)optval)) + if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; /* "pim6reg%u" should not exceed 16 bytes (IFNAMSIZ) */ if (v != RT_TABLE_DEFAULT && v >= 100000000) diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 99668bfebd85..daef890460b7 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -91,6 +91,7 @@ static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) t->props.mode = x->props.mode; memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr)); memcpy(&t->mark, &x->mark, sizeof(t->mark)); + t->if_id = x->if_id; if (xfrm_init_state(t)) goto error; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 76f9e41859a2..43a894bf9a1b 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -136,13 +136,42 @@ static bool setsockopt_needs_rtnl(int optname) return false; } +static int copy_group_source_from_sockptr(struct group_source_req *greqs, + sockptr_t optval, int optlen) +{ + if (in_compat_syscall()) { + struct compat_group_source_req gr32; + + if (optlen < sizeof(gr32)) + return -EINVAL; + if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) + return -EFAULT; + greqs->gsr_interface = gr32.gsr_interface; + greqs->gsr_group = gr32.gsr_group; + greqs->gsr_source = gr32.gsr_source; + } else { + if (optlen < sizeof(*greqs)) + return -EINVAL; + if (copy_from_sockptr(greqs, optval, sizeof(*greqs))) + return -EFAULT; + } + + return 0; +} + static int do_ipv6_mcast_group_source(struct sock *sk, int optname, - struct group_source_req *greqs) + sockptr_t optval, int optlen) { + struct group_source_req greqs; int omode, add; + int ret; + + ret = copy_group_source_from_sockptr(&greqs, optval, optlen); + if (ret) + return ret; - if (greqs->gsr_group.ss_family != AF_INET6 || - greqs->gsr_source.ss_family != AF_INET6) + if (greqs.gsr_group.ss_family != AF_INET6 || + greqs.gsr_source.ss_family != AF_INET6) return -EADDRNOTAVAIL; if (optname == MCAST_BLOCK_SOURCE) { @@ -155,8 +184,8 @@ static int do_ipv6_mcast_group_source(struct sock *sk, int optname, struct sockaddr_in6 *psin6; int retv; - psin6 = (struct sockaddr_in6 *)&greqs->gsr_group; - retv = ipv6_sock_mc_join_ssm(sk, greqs->gsr_interface, + psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; + retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface, &psin6->sin6_addr, MCAST_INCLUDE); /* prior join w/ different source is ok */ @@ -168,11 +197,200 @@ static int do_ipv6_mcast_group_source(struct sock *sk, int optname, omode = MCAST_INCLUDE; add = 0; } - return ip6_mc_source(add, omode, sk, greqs); + return ip6_mc_source(add, omode, sk, &greqs); +} + +static int ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, + int optlen) +{ + struct group_filter *gsf; + int ret; + + if (optlen < GROUP_FILTER_SIZE(0)) + return -EINVAL; + if (optlen > sysctl_optmem_max) + return -ENOBUFS; + + gsf = memdup_sockptr(optval, optlen); + if (IS_ERR(gsf)) + return PTR_ERR(gsf); + + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + ret = -ENOBUFS; + if (gsf->gf_numsrc >= 0x1ffffffU || + gsf->gf_numsrc > sysctl_mld_max_msf) + goto out_free_gsf; + + ret = -EINVAL; + if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) + goto out_free_gsf; + + ret = ip6_mc_msfilter(sk, gsf, gsf->gf_slist); +out_free_gsf: + kfree(gsf); + return ret; +} + +static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, + int optlen) +{ + const int size0 = offsetof(struct compat_group_filter, gf_slist); + struct compat_group_filter *gf32; + void *p; + int ret; + int n; + + if (optlen < size0) + return -EINVAL; + if (optlen > sysctl_optmem_max - 4) + return -ENOBUFS; + + p = kmalloc(optlen + 4, GFP_KERNEL); + if (!p) + return -ENOMEM; + + gf32 = p + 4; /* we want ->gf_group and ->gf_slist aligned */ + ret = -EFAULT; + if (copy_from_sockptr(gf32, optval, optlen)) + goto out_free_p; + + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + ret = -ENOBUFS; + n = gf32->gf_numsrc; + if (n >= 0x1ffffffU || n > sysctl_mld_max_msf) + goto out_free_p; + + ret = -EINVAL; + if (offsetof(struct compat_group_filter, gf_slist[n]) > optlen) + goto out_free_p; + + ret = ip6_mc_msfilter(sk, &(struct group_filter){ + .gf_interface = gf32->gf_interface, + .gf_group = gf32->gf_group, + .gf_fmode = gf32->gf_fmode, + .gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist); + +out_free_p: + kfree(p); + return ret; +} + +static int ipv6_mcast_join_leave(struct sock *sk, int optname, + sockptr_t optval, int optlen) +{ + struct sockaddr_in6 *psin6; + struct group_req greq; + + if (optlen < sizeof(greq)) + return -EINVAL; + if (copy_from_sockptr(&greq, optval, sizeof(greq))) + return -EFAULT; + + if (greq.gr_group.ss_family != AF_INET6) + return -EADDRNOTAVAIL; + psin6 = (struct sockaddr_in6 *)&greq.gr_group; + if (optname == MCAST_JOIN_GROUP) + return ipv6_sock_mc_join(sk, greq.gr_interface, + &psin6->sin6_addr); + return ipv6_sock_mc_drop(sk, greq.gr_interface, &psin6->sin6_addr); +} + +static int compat_ipv6_mcast_join_leave(struct sock *sk, int optname, + sockptr_t optval, int optlen) +{ + struct compat_group_req gr32; + struct sockaddr_in6 *psin6; + + if (optlen < sizeof(gr32)) + return -EINVAL; + if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) + return -EFAULT; + + if (gr32.gr_group.ss_family != AF_INET6) + return -EADDRNOTAVAIL; + psin6 = (struct sockaddr_in6 *)&gr32.gr_group; + if (optname == MCAST_JOIN_GROUP) + return ipv6_sock_mc_join(sk, gr32.gr_interface, + &psin6->sin6_addr); + return ipv6_sock_mc_drop(sk, gr32.gr_interface, &psin6->sin6_addr); +} + +static int ipv6_set_opt_hdr(struct sock *sk, int optname, sockptr_t optval, + int optlen) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_opt_hdr *new = NULL; + struct net *net = sock_net(sk); + struct ipv6_txoptions *opt; + int err; + + /* hop-by-hop / destination options are privileged option */ + if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW)) + return -EPERM; + + /* remove any sticky options header with a zero option + * length, per RFC3542. + */ + if (optlen > 0) { + if (sockptr_is_null(optval)) + return -EINVAL; + if (optlen < sizeof(struct ipv6_opt_hdr) || + optlen & 0x7 || + optlen > 8 * 255) + return -EINVAL; + + new = memdup_sockptr(optval, optlen); + if (IS_ERR(new)) + return PTR_ERR(new); + if (unlikely(ipv6_optlen(new) > optlen)) { + kfree(new); + return -EINVAL; + } + } + + opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); + opt = ipv6_renew_options(sk, opt, optname, new); + kfree(new); + if (IS_ERR(opt)) + return PTR_ERR(opt); + + /* routing header option needs extra check */ + err = -EINVAL; + if (optname == IPV6_RTHDR && opt && opt->srcrt) { + struct ipv6_rt_hdr *rthdr = opt->srcrt; + switch (rthdr->type) { +#if IS_ENABLED(CONFIG_IPV6_MIP6) + case IPV6_SRCRT_TYPE_2: + if (rthdr->hdrlen != 2 || rthdr->segments_left != 1) + goto sticky_done; + break; +#endif + case IPV6_SRCRT_TYPE_4: + { + struct ipv6_sr_hdr *srh = + (struct ipv6_sr_hdr *)opt->srcrt; + + if (!seg6_validate_srh(srh, optlen, false)) + goto sticky_done; + break; + } + default: + goto sticky_done; + } + } + + err = 0; + opt = ipv6_update_options(sk, opt); +sticky_done: + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } + return err; } static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); @@ -180,11 +398,11 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, int retv = -ENOPROTOOPT; bool needs_rtnl = setsockopt_needs_rtnl(optname); - if (!optval) + if (sockptr_is_null(optval)) val = 0; else { if (optlen >= sizeof(int)) { - if (get_user(val, (int __user *) optval)) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; } else val = 0; @@ -436,82 +654,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: - { - struct ipv6_txoptions *opt; - struct ipv6_opt_hdr *new = NULL; - - /* hop-by-hop / destination options are privileged option */ - retv = -EPERM; - if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW)) - break; - - /* remove any sticky options header with a zero option - * length, per RFC3542. - */ - if (optlen == 0) - optval = NULL; - else if (!optval) - goto e_inval; - else if (optlen < sizeof(struct ipv6_opt_hdr) || - optlen & 0x7 || optlen > 8 * 255) - goto e_inval; - else { - new = memdup_user(optval, optlen); - if (IS_ERR(new)) { - retv = PTR_ERR(new); - break; - } - if (unlikely(ipv6_optlen(new) > optlen)) { - kfree(new); - goto e_inval; - } - } - - opt = rcu_dereference_protected(np->opt, - lockdep_sock_is_held(sk)); - opt = ipv6_renew_options(sk, opt, optname, new); - kfree(new); - if (IS_ERR(opt)) { - retv = PTR_ERR(opt); - break; - } - - /* routing header option needs extra check */ - retv = -EINVAL; - if (optname == IPV6_RTHDR && opt && opt->srcrt) { - struct ipv6_rt_hdr *rthdr = opt->srcrt; - switch (rthdr->type) { -#if IS_ENABLED(CONFIG_IPV6_MIP6) - case IPV6_SRCRT_TYPE_2: - if (rthdr->hdrlen != 2 || - rthdr->segments_left != 1) - goto sticky_done; - - break; -#endif - case IPV6_SRCRT_TYPE_4: - { - struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *) - opt->srcrt; - - if (!seg6_validate_srh(srh, optlen, false)) - goto sticky_done; - break; - } - default: - goto sticky_done; - } - } - - retv = 0; - opt = ipv6_update_options(sk, opt); -sticky_done: - if (opt) { - atomic_sub(opt->tot_len, &sk->sk_omem_alloc); - txopt_put(opt); - } + retv = ipv6_set_opt_hdr(sk, optname, optval, optlen); break; - } case IPV6_PKTINFO: { @@ -519,12 +663,13 @@ sticky_done: if (optlen == 0) goto e_inval; - else if (optlen < sizeof(struct in6_pktinfo) || !optval) + else if (optlen < sizeof(struct in6_pktinfo) || + sockptr_is_null(optval)) goto e_inval; - if (copy_from_user(&pkt, optval, sizeof(struct in6_pktinfo))) { - retv = -EFAULT; - break; + if (copy_from_sockptr(&pkt, optval, sizeof(pkt))) { + retv = -EFAULT; + break; } if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex)) goto e_inval; @@ -565,7 +710,7 @@ sticky_done: refcount_set(&opt->refcnt, 1); opt->tot_len = sizeof(*opt) + optlen; retv = -EFAULT; - if (copy_from_user(opt+1, optval, optlen)) + if (copy_from_sockptr(opt + 1, optval, optlen)) goto done; msg.msg_controllen = optlen; @@ -687,7 +832,7 @@ done: break; retv = -EFAULT; - if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq))) + if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq))) break; if (optname == IPV6_ADD_MEMBERSHIP) @@ -705,7 +850,7 @@ done: goto e_inval; retv = -EFAULT; - if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq))) + if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq))) break; if (optname == IPV6_JOIN_ANYCAST) @@ -723,77 +868,26 @@ done: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: - { - struct group_req greq; - struct sockaddr_in6 *psin6; - - if (optlen < sizeof(struct group_req)) - goto e_inval; - - retv = -EFAULT; - if (copy_from_user(&greq, optval, sizeof(struct group_req))) - break; - if (greq.gr_group.ss_family != AF_INET6) { - retv = -EADDRNOTAVAIL; - break; - } - psin6 = (struct sockaddr_in6 *)&greq.gr_group; - if (optname == MCAST_JOIN_GROUP) - retv = ipv6_sock_mc_join(sk, greq.gr_interface, - &psin6->sin6_addr); + if (in_compat_syscall()) + retv = compat_ipv6_mcast_join_leave(sk, optname, optval, + optlen); else - retv = ipv6_sock_mc_drop(sk, greq.gr_interface, - &psin6->sin6_addr); + retv = ipv6_mcast_join_leave(sk, optname, optval, + optlen); break; - } case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: - { - struct group_source_req greqs; - - if (optlen < sizeof(struct group_source_req)) - goto e_inval; - if (copy_from_user(&greqs, optval, sizeof(greqs))) { - retv = -EFAULT; - break; - } - retv = do_ipv6_mcast_group_source(sk, optname, &greqs); + retv = do_ipv6_mcast_group_source(sk, optname, optval, optlen); break; - } case MCAST_MSFILTER: - { - struct group_filter *gsf; - - if (optlen < GROUP_FILTER_SIZE(0)) - goto e_inval; - if (optlen > sysctl_optmem_max) { - retv = -ENOBUFS; - break; - } - gsf = memdup_user(optval, optlen); - if (IS_ERR(gsf)) { - retv = PTR_ERR(gsf); - break; - } - /* numsrc >= (4G-140)/128 overflow in 32 bits */ - if (gsf->gf_numsrc >= 0x1ffffffU || - gsf->gf_numsrc > sysctl_mld_max_msf) { - kfree(gsf); - retv = -ENOBUFS; - break; - } - if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) { - kfree(gsf); - retv = -EINVAL; - break; - } - retv = ip6_mc_msfilter(sk, gsf, gsf->gf_slist); - kfree(gsf); - + if (in_compat_syscall()) + retv = compat_ipv6_set_mcast_msfilter(sk, optval, + optlen); + else + retv = ipv6_set_mcast_msfilter(sk, optval, optlen); break; - } case IPV6_ROUTER_ALERT: if (optlen < sizeof(int)) goto e_inval; @@ -872,6 +966,14 @@ done: np->rxopt.bits.recvfragsize = valbool; retv = 0; break; + case IPV6_RECVERR_RFC4884: + if (optlen < sizeof(int)) + goto e_inval; + if (val < 0 || val > 1) + goto e_inval; + np->recverr_rfc4884 = valbool; + retv = 0; + break; } release_sock(sk); @@ -887,8 +989,8 @@ e_inval: return -EINVAL; } -int ipv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) +int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen) { int err; @@ -909,140 +1011,6 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, } EXPORT_SYMBOL(ipv6_setsockopt); -#ifdef CONFIG_COMPAT -int compat_ipv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) -{ - int err; - - if (level == SOL_IP && sk->sk_type != SOCK_RAW) { - if (udp_prot.compat_setsockopt != NULL) - return udp_prot.compat_setsockopt(sk, level, optname, - optval, optlen); - return udp_prot.setsockopt(sk, level, optname, optval, optlen); - } - - if (level != SOL_IPV6) - return -ENOPROTOOPT; - - switch (optname) { - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - { - struct compat_group_req __user *gr32 = (void __user *)optval; - struct group_req greq; - struct sockaddr_in6 *psin6 = (struct sockaddr_in6 *)&greq.gr_group; - - if (optlen < sizeof(struct compat_group_req)) - return -EINVAL; - - if (get_user(greq.gr_interface, &gr32->gr_interface) || - copy_from_user(&greq.gr_group, &gr32->gr_group, - sizeof(greq.gr_group))) - return -EFAULT; - - if (greq.gr_group.ss_family != AF_INET6) - return -EADDRNOTAVAIL; - - rtnl_lock(); - lock_sock(sk); - if (optname == MCAST_JOIN_GROUP) - err = ipv6_sock_mc_join(sk, greq.gr_interface, - &psin6->sin6_addr); - else - err = ipv6_sock_mc_drop(sk, greq.gr_interface, - &psin6->sin6_addr); - release_sock(sk); - rtnl_unlock(); - return err; - } - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - { - struct compat_group_source_req __user *gsr32 = (void __user *)optval; - struct group_source_req greqs; - - if (optlen < sizeof(struct compat_group_source_req)) - return -EINVAL; - - if (get_user(greqs.gsr_interface, &gsr32->gsr_interface) || - copy_from_user(&greqs.gsr_group, &gsr32->gsr_group, - sizeof(greqs.gsr_group)) || - copy_from_user(&greqs.gsr_source, &gsr32->gsr_source, - sizeof(greqs.gsr_source))) - return -EFAULT; - - rtnl_lock(); - lock_sock(sk); - err = do_ipv6_mcast_group_source(sk, optname, &greqs); - release_sock(sk); - rtnl_unlock(); - return err; - } - case MCAST_MSFILTER: - { - const int size0 = offsetof(struct compat_group_filter, gf_slist); - struct compat_group_filter *gf32; - void *p; - int n; - - if (optlen < size0) - return -EINVAL; - if (optlen > sysctl_optmem_max - 4) - return -ENOBUFS; - - p = kmalloc(optlen + 4, GFP_KERNEL); - if (!p) - return -ENOMEM; - - gf32 = p + 4; /* we want ->gf_group and ->gf_slist aligned */ - if (copy_from_user(gf32, optval, optlen)) { - err = -EFAULT; - goto mc_msf_out; - } - - n = gf32->gf_numsrc; - /* numsrc >= (4G-140)/128 overflow in 32 bits */ - if (n >= 0x1ffffffU || - n > sysctl_mld_max_msf) { - err = -ENOBUFS; - goto mc_msf_out; - } - if (offsetof(struct compat_group_filter, gf_slist[n]) > optlen) { - err = -EINVAL; - goto mc_msf_out; - } - - rtnl_lock(); - lock_sock(sk); - err = ip6_mc_msfilter(sk, &(struct group_filter){ - .gf_interface = gf32->gf_interface, - .gf_group = gf32->gf_group, - .gf_fmode = gf32->gf_fmode, - .gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist); - release_sock(sk); - rtnl_unlock(); -mc_msf_out: - kfree(p); - return err; - } - } - - err = do_ipv6_setsockopt(sk, level, optname, optval, optlen); -#ifdef CONFIG_NETFILTER - /* we need to exclude all possible ENOPROTOOPTs except default case */ - if (err == -ENOPROTOOPT && optname != IPV6_IPSEC_POLICY && - optname != IPV6_XFRM_POLICY) - err = compat_nf_setsockopt(sk, PF_INET6, optname, optval, - optlen); -#endif - return err; -} -EXPORT_SYMBOL(compat_ipv6_setsockopt); -#endif - static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, int optname, char __user *optval, int len) { @@ -1077,6 +1045,75 @@ static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, return len; } +static int ipv6_get_msfilter(struct sock *sk, void __user *optval, + int __user *optlen, int len) +{ + const int size0 = offsetof(struct group_filter, gf_slist); + struct group_filter __user *p = optval; + struct group_filter gsf; + int num; + int err; + + if (len < size0) + return -EINVAL; + if (copy_from_user(&gsf, p, size0)) + return -EFAULT; + if (gsf.gf_group.ss_family != AF_INET6) + return -EADDRNOTAVAIL; + num = gsf.gf_numsrc; + lock_sock(sk); + err = ip6_mc_msfget(sk, &gsf, p->gf_slist); + if (!err) { + if (num > gsf.gf_numsrc) + num = gsf.gf_numsrc; + if (put_user(GROUP_FILTER_SIZE(num), optlen) || + copy_to_user(p, &gsf, size0)) + err = -EFAULT; + } + release_sock(sk); + return err; +} + +static int compat_ipv6_get_msfilter(struct sock *sk, void __user *optval, + int __user *optlen) +{ + const int size0 = offsetof(struct compat_group_filter, gf_slist); + struct compat_group_filter __user *p = optval; + struct compat_group_filter gf32; + struct group_filter gf; + int len, err; + int num; + + if (get_user(len, optlen)) + return -EFAULT; + if (len < size0) + return -EINVAL; + + if (copy_from_user(&gf32, p, size0)) + return -EFAULT; + gf.gf_interface = gf32.gf_interface; + gf.gf_fmode = gf32.gf_fmode; + num = gf.gf_numsrc = gf32.gf_numsrc; + gf.gf_group = gf32.gf_group; + + if (gf.gf_group.ss_family != AF_INET6) + return -EADDRNOTAVAIL; + + lock_sock(sk); + err = ip6_mc_msfget(sk, &gf, p->gf_slist); + release_sock(sk); + if (err) + return err; + if (num > gf.gf_numsrc) + num = gf.gf_numsrc; + len = GROUP_FILTER_SIZE(num) - (sizeof(gf)-sizeof(gf32)); + if (put_user(len, optlen) || + put_user(gf.gf_fmode, &p->gf_fmode) || + put_user(gf.gf_numsrc, &p->gf_numsrc)) + return -EFAULT; + return 0; +} + static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen, unsigned int flags) { @@ -1100,33 +1137,9 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = sk->sk_family; break; case MCAST_MSFILTER: - { - struct group_filter __user *p = (void __user *)optval; - struct group_filter gsf; - const int size0 = offsetof(struct group_filter, gf_slist); - int num; - int err; - - if (len < size0) - return -EINVAL; - if (copy_from_user(&gsf, p, size0)) - return -EFAULT; - if (gsf.gf_group.ss_family != AF_INET6) - return -EADDRNOTAVAIL; - num = gsf.gf_numsrc; - lock_sock(sk); - err = ip6_mc_msfget(sk, &gsf, p->gf_slist); - if (!err) { - if (num > gsf.gf_numsrc) - num = gsf.gf_numsrc; - if (put_user(GROUP_FILTER_SIZE(num), optlen) || - copy_to_user(p, &gsf, size0)) - err = -EFAULT; - } - release_sock(sk); - return err; - } - + if (in_compat_syscall()) + return compat_ipv6_get_msfilter(sk, optval, optlen); + return ipv6_get_msfilter(sk, optval, optlen, len); case IPV6_2292PKTOPTIONS: { struct msghdr msg; @@ -1435,6 +1448,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = np->rtalert_isolate; break; + case IPV6_RECVERR_RFC4884: + val = np->recverr_rfc4884; + break; + default: return -ENOPROTOOPT; } @@ -1474,78 +1491,3 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname, return err; } EXPORT_SYMBOL(ipv6_getsockopt); - -#ifdef CONFIG_COMPAT -int compat_ipv6_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - int err; - - if (level == SOL_IP && sk->sk_type != SOCK_RAW) { - if (udp_prot.compat_getsockopt != NULL) - return udp_prot.compat_getsockopt(sk, level, optname, - optval, optlen); - return udp_prot.getsockopt(sk, level, optname, optval, optlen); - } - - if (level != SOL_IPV6) - return -ENOPROTOOPT; - - if (optname == MCAST_MSFILTER) { - const int size0 = offsetof(struct compat_group_filter, gf_slist); - struct compat_group_filter __user *p = (void __user *)optval; - struct compat_group_filter gf32; - struct group_filter gf; - int ulen, err; - int num; - - if (get_user(ulen, optlen)) - return -EFAULT; - - if (ulen < size0) - return -EINVAL; - - if (copy_from_user(&gf32, p, size0)) - return -EFAULT; - - gf.gf_interface = gf32.gf_interface; - gf.gf_fmode = gf32.gf_fmode; - num = gf.gf_numsrc = gf32.gf_numsrc; - gf.gf_group = gf32.gf_group; - - if (gf.gf_group.ss_family != AF_INET6) - return -EADDRNOTAVAIL; - lock_sock(sk); - err = ip6_mc_msfget(sk, &gf, p->gf_slist); - release_sock(sk); - if (err) - return err; - if (num > gf.gf_numsrc) - num = gf.gf_numsrc; - ulen = GROUP_FILTER_SIZE(num) - (sizeof(gf)-sizeof(gf32)); - if (put_user(ulen, optlen) || - put_user(gf.gf_fmode, &p->gf_fmode) || - put_user(gf.gf_numsrc, &p->gf_numsrc)) - return -EFAULT; - return 0; - } - - err = do_ipv6_getsockopt(sk, level, optname, optval, optlen, - MSG_CMSG_COMPAT); -#ifdef CONFIG_NETFILTER - /* we need to exclude all possible ENOPROTOOPTs except default case */ - if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) { - int len; - - if (get_user(len, optlen)) - return -EFAULT; - - err = compat_nf_getsockopt(sk, PF_INET6, optname, optval, &len); - if (err >= 0) - err = put_user(len, optlen); - } -#endif - return err; -} -EXPORT_SYMBOL(compat_ipv6_getsockopt); -#endif diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index e96a431549bc..2e2119bfcf13 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -960,8 +960,7 @@ static int compat_table_info(const struct xt_table_info *info, } #endif -static int get_info(struct net *net, void __user *user, - const int *len, int compat) +static int get_info(struct net *net, void __user *user, const int *len) { char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; @@ -975,7 +974,7 @@ static int get_info(struct net *net, void __user *user, name[XT_TABLE_MAXNAMELEN-1] = '\0'; #ifdef CONFIG_COMPAT - if (compat) + if (in_compat_syscall()) xt_compat_lock(AF_INET6); #endif t = xt_request_find_table_lock(net, AF_INET6, name); @@ -985,7 +984,7 @@ static int get_info(struct net *net, void __user *user, #ifdef CONFIG_COMPAT struct xt_table_info tmp; - if (compat) { + if (in_compat_syscall()) { ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(AF_INET6); private = &tmp; @@ -1011,7 +1010,7 @@ static int get_info(struct net *net, void __user *user, } else ret = PTR_ERR(t); #ifdef CONFIG_COMPAT - if (compat) + if (in_compat_syscall()) xt_compat_unlock(AF_INET6); #endif return ret; @@ -1120,7 +1119,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, } static int -do_replace(struct net *net, const void __user *user, unsigned int len) +do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct ip6t_replace tmp; @@ -1128,7 +1127,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len) void *loc_cpu_entry; struct ip6t_entry *iter; - if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ @@ -1144,8 +1143,8 @@ do_replace(struct net *net, const void __user *user, unsigned int len) return -ENOMEM; loc_cpu_entry = newinfo->entries; - if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), - tmp.size) != 0) { + if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), + tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } @@ -1169,8 +1168,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len) } static int -do_add_counters(struct net *net, const void __user *user, unsigned int len, - int compat) +do_add_counters(struct net *net, sockptr_t arg, unsigned int len) { unsigned int i; struct xt_counters_info tmp; @@ -1181,7 +1179,7 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len, struct ip6t_entry *iter; unsigned int addend; - paddc = xt_copy_counters_from_user(user, len, &tmp, compat); + paddc = xt_copy_counters(arg, len, &tmp); if (IS_ERR(paddc)) return PTR_ERR(paddc); t = xt_find_table_lock(net, AF_INET6, tmp.name); @@ -1495,7 +1493,7 @@ out_unlock: } static int -compat_do_replace(struct net *net, void __user *user, unsigned int len) +compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct compat_ip6t_replace tmp; @@ -1503,7 +1501,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) void *loc_cpu_entry; struct ip6t_entry *iter; - if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ @@ -1519,8 +1517,8 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return -ENOMEM; loc_cpu_entry = newinfo->entries; - if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), - tmp.size) != 0) { + if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), + tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } @@ -1543,31 +1541,6 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return ret; } -static int -compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, - unsigned int len) -{ - int ret; - - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - switch (cmd) { - case IP6T_SO_SET_REPLACE: - ret = compat_do_replace(sock_net(sk), user, len); - break; - - case IP6T_SO_SET_ADD_COUNTERS: - ret = do_add_counters(sock_net(sk), user, len, 1); - break; - - default: - ret = -EINVAL; - } - - return ret; -} - struct compat_ip6t_get_entries { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; @@ -1643,33 +1616,10 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, xt_compat_unlock(AF_INET6); return ret; } - -static int do_ip6t_get_ctl(struct sock *, int, void __user *, int *); - -static int -compat_do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) -{ - int ret; - - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - switch (cmd) { - case IP6T_SO_GET_INFO: - ret = get_info(sock_net(sk), user, len, 1); - break; - case IP6T_SO_GET_ENTRIES: - ret = compat_get_entries(sock_net(sk), user, len); - break; - default: - ret = do_ip6t_get_ctl(sk, cmd, user, len); - } - return ret; -} #endif static int -do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +do_ip6t_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len) { int ret; @@ -1678,11 +1628,16 @@ do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) switch (cmd) { case IP6T_SO_SET_REPLACE: - ret = do_replace(sock_net(sk), user, len); +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = compat_do_replace(sock_net(sk), arg, len); + else +#endif + ret = do_replace(sock_net(sk), arg, len); break; case IP6T_SO_SET_ADD_COUNTERS: - ret = do_add_counters(sock_net(sk), user, len, 0); + ret = do_add_counters(sock_net(sk), arg, len); break; default: @@ -1702,11 +1657,16 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) switch (cmd) { case IP6T_SO_GET_INFO: - ret = get_info(sock_net(sk), user, len, 0); + ret = get_info(sock_net(sk), user, len); break; case IP6T_SO_GET_ENTRIES: - ret = get_entries(sock_net(sk), user, len); +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = compat_get_entries(sock_net(sk), user, len); + else +#endif + ret = get_entries(sock_net(sk), user, len); break; case IP6T_SO_GET_REVISION_MATCH: @@ -1897,15 +1857,9 @@ static struct nf_sockopt_ops ip6t_sockopts = { .set_optmin = IP6T_BASE_CTL, .set_optmax = IP6T_SO_SET_MAX+1, .set = do_ip6t_set_ctl, -#ifdef CONFIG_COMPAT - .compat_set = compat_do_ip6t_set_ctl, -#endif .get_optmin = IP6T_BASE_CTL, .get_optmax = IP6T_SO_GET_MAX+1, .get = do_ip6t_get_ctl, -#ifdef CONFIG_COMPAT - .compat_get = compat_do_ip6t_get_ctl, -#endif .owner = THIS_MODULE, }; diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c index 4e15a14435e4..70da2f2ce064 100644 --- a/net/ipv6/netfilter/ip6t_ah.c +++ b/net/ipv6/netfilter/ip6t_ah.c @@ -74,8 +74,7 @@ static bool ah_mt6(const struct sk_buff *skb, struct xt_action_param *par) ahinfo->hdrres, ah->reserved, !(ahinfo->hdrres && ah->reserved)); - return (ah != NULL) && - spi_match(ahinfo->spis[0], ahinfo->spis[1], + return spi_match(ahinfo->spis[0], ahinfo->spis[1], ntohl(ah->spi), !!(ahinfo->invflags & IP6T_AH_INV_SPI)) && (!ahinfo->hdrlen || diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c index fb91eeee4a1e..3aad6439386b 100644 --- a/net/ipv6/netfilter/ip6t_frag.c +++ b/net/ipv6/netfilter/ip6t_frag.c @@ -85,8 +85,7 @@ frag_mt6(const struct sk_buff *skb, struct xt_action_param *par) !((fraginfo->flags & IP6T_FRAG_NMF) && (ntohs(fh->frag_off) & IP6_MF))); - return (fh != NULL) && - id_match(fraginfo->ids[0], fraginfo->ids[1], + return id_match(fraginfo->ids[0], fraginfo->ids[1], ntohl(fh->identification), !!(fraginfo->invflags & IP6T_FRAG_INV_IDS)) && !((fraginfo->flags & IP6T_FRAG_RES) && diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c index 467b2a86031b..e7a3fb9355ee 100644 --- a/net/ipv6/netfilter/ip6t_hbh.c +++ b/net/ipv6/netfilter/ip6t_hbh.c @@ -86,8 +86,7 @@ hbh_mt6(const struct sk_buff *skb, struct xt_action_param *par) ((optinfo->hdrlen == hdrlen) ^ !!(optinfo->invflags & IP6T_OPTS_INV_LEN)))); - ret = (oh != NULL) && - (!(optinfo->flags & IP6T_OPTS_LEN) || + ret = (!(optinfo->flags & IP6T_OPTS_LEN) || ((optinfo->hdrlen == hdrlen) ^ !!(optinfo->invflags & IP6T_OPTS_INV_LEN))); diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c index f633dc84ca3f..733c83d38b30 100644 --- a/net/ipv6/netfilter/ip6t_rt.c +++ b/net/ipv6/netfilter/ip6t_rt.c @@ -89,8 +89,7 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) !((rtinfo->flags & IP6T_RT_RES) && (((const struct rt0_hdr *)rh)->reserved))); - ret = (rh != NULL) && - (segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1], + ret = (segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1], rh->segments_left, !!(rtinfo->invflags & IP6T_RT_INV_SGS))) && (!(rtinfo->flags & IP6T_RT_LEN) || diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 5fae66f66671..4aef6baaa55e 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -126,6 +126,21 @@ void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, } EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_put); +static int nf_reject6_fill_skb_dst(struct sk_buff *skb_in) +{ + struct dst_entry *dst = NULL; + struct flowi fl; + + memset(&fl, 0, sizeof(struct flowi)); + fl.u.ip6.daddr = ipv6_hdr(skb_in)->saddr; + nf_ip6_route(dev_net(skb_in->dev), &dst, &fl, false); + if (!dst) + return -1; + + skb_dst_set(skb_in, dst); + return 0; +} + void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) { struct net_device *br_indev __maybe_unused; @@ -154,6 +169,14 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) fl6.daddr = oip6h->saddr; fl6.fl6_sport = otcph->dest; fl6.fl6_dport = otcph->source; + + if (hook == NF_INET_PRE_ROUTING) { + nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false); + if (!dst) + return; + skb_dst_set(oldskb, dst); + } + fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev); fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark); security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6)); @@ -245,6 +268,9 @@ void nf_send_unreach6(struct net *net, struct sk_buff *skb_in, if (hooknum == NF_INET_LOCAL_OUT && skb_in->dev == NULL) skb_in->dev = net->loopback_dev; + if (hooknum == NF_INET_PRE_ROUTING && nf_reject6_fill_skb_dst(skb_in)) + return; + icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0); } EXPORT_SYMBOL_GPL(nf_send_unreach6); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 98ac32b49d8c..6caa062f68e7 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -114,6 +114,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); ipcm6_init_sk(&ipc6, np); + ipc6.sockc.mark = sk->sk_mark; fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr, false); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 8ef5a7b30524..874f01cd7aec 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -972,13 +972,13 @@ do_confirm: } static int rawv6_seticmpfilter(struct sock *sk, int level, int optname, - char __user *optval, int optlen) + sockptr_t optval, int optlen) { switch (optname) { case ICMPV6_FILTER: if (optlen > sizeof(struct icmp6_filter)) optlen = sizeof(struct icmp6_filter); - if (copy_from_user(&raw6_sk(sk)->filter, optval, optlen)) + if (copy_from_sockptr(&raw6_sk(sk)->filter, optval, optlen)) return -EFAULT; return 0; default: @@ -1015,12 +1015,12 @@ static int rawv6_geticmpfilter(struct sock *sk, int level, int optname, static int do_rawv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct raw6_sock *rp = raw6_sk(sk); int val; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (optname) { @@ -1062,7 +1062,7 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname, } static int rawv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { switch (level) { case SOL_RAW: @@ -1084,30 +1084,6 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname, return do_rawv6_setsockopt(sk, level, optname, optval, optlen); } -#ifdef CONFIG_COMPAT -static int compat_rawv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) -{ - switch (level) { - case SOL_RAW: - break; - case SOL_ICMPV6: - if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) - return -EOPNOTSUPP; - return rawv6_seticmpfilter(sk, level, optname, optval, optlen); - case SOL_IPV6: - if (optname == IPV6_CHECKSUM || - optname == IPV6_HDRINCL) - break; - fallthrough; - default: - return compat_ipv6_setsockopt(sk, level, optname, - optval, optlen); - } - return do_rawv6_setsockopt(sk, level, optname, optval, optlen); -} -#endif - static int do_rawv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -1169,30 +1145,6 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname, return do_rawv6_getsockopt(sk, level, optname, optval, optlen); } -#ifdef CONFIG_COMPAT -static int compat_rawv6_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - switch (level) { - case SOL_RAW: - break; - case SOL_ICMPV6: - if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) - return -EOPNOTSUPP; - return rawv6_geticmpfilter(sk, level, optname, optval, optlen); - case SOL_IPV6: - if (optname == IPV6_CHECKSUM || - optname == IPV6_HDRINCL) - break; - fallthrough; - default: - return compat_ipv6_getsockopt(sk, level, optname, - optval, optlen); - } - return do_rawv6_getsockopt(sk, level, optname, optval, optlen); -} -#endif - static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg) { switch (cmd) { @@ -1297,8 +1249,6 @@ struct proto rawv6_prot = { .usersize = sizeof_field(struct raw6_sock, filter), .h.raw_hash = &raw_v6_hashinfo, #ifdef CONFIG_COMPAT - .compat_setsockopt = compat_rawv6_setsockopt, - .compat_getsockopt = compat_rawv6_getsockopt, .compat_ioctl = compat_rawv6_ioctl, #endif .diag_destroy = raw_abort, @@ -1378,8 +1328,6 @@ const struct proto_ops inet6_sockraw_ops = { .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, #endif }; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 4c36bd0c7930..5e7e25e2523a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -61,6 +61,7 @@ #include <net/l3mdev.h> #include <net/ip.h> #include <linux/uaccess.h> +#include <linux/btf_ids.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> @@ -1210,7 +1211,7 @@ fallback: return nrt; } -static struct rt6_info *ip6_pol_route_lookup(struct net *net, +INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, @@ -2277,7 +2278,7 @@ out: } EXPORT_SYMBOL_GPL(ip6_pol_route); -static struct rt6_info *ip6_pol_route_input(struct net *net, +INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, @@ -2468,7 +2469,7 @@ void ip6_route_input(struct sk_buff *skb) &fl6, skb, flags)); } -static struct rt6_info *ip6_pol_route_output(struct net *net, +INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, @@ -2915,7 +2916,7 @@ struct ip6rd_flowi { struct in6_addr gateway; }; -static struct rt6_info *__ip6_route_redirect(struct net *net, +INDIRECT_CALLABLE_SCOPE struct rt6_info *__ip6_route_redirect(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, @@ -6423,21 +6424,29 @@ void __init ip6_route_init_special_entries(void) #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt) -static const struct bpf_iter_reg ipv6_route_reg_info = { - .target = "ipv6_route", +BTF_ID_LIST(btf_fib6_info_id) +BTF_ID(struct, fib6_info) + +static const struct bpf_iter_seq_info ipv6_route_seq_info = { .seq_ops = &ipv6_route_seq_ops, .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct ipv6_route_iter), +}; + +static struct bpf_iter_reg ipv6_route_reg_info = { + .target = "ipv6_route", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__ipv6_route, rt), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &ipv6_route_seq_info, }; static int __init bpf_iter_register(void) { + ipv6_route_reg_info.ctx_arg_info[0].btf_id = *btf_fib6_info_id; return bpf_iter_reg_target(&ipv6_route_reg_info); } diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index c3ececd7cfc1..5fdf3ebb953f 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -136,8 +136,7 @@ static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, oldhdr = ipv6_hdr(skb); - buf = kzalloc(ipv6_rpl_srh_alloc_size(srh->segments_left - 1) * 2, - GFP_ATOMIC); + buf = kcalloc(struct_size(srh, segments.addr, srh->segments_left), 2, GFP_ATOMIC); if (!buf) return -ENOMEM; diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index e0e9f48ab14f..897fa59c47de 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -27,6 +27,23 @@ #include <net/seg6_hmac.h> #endif +static size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo) +{ + int head = 0; + + switch (tuninfo->mode) { + case SEG6_IPTUN_MODE_INLINE: + break; + case SEG6_IPTUN_MODE_ENCAP: + head = sizeof(struct ipv6hdr); + break; + case SEG6_IPTUN_MODE_L2ENCAP: + return 0; + } + + return ((tuninfo->srh->hdrlen + 1) << 3) + head; +} + struct seg6_lwt { struct dst_cache cache; struct seg6_iptunnel_encap tuninfo[]; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 13235a012388..e796a64be308 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -170,7 +170,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk, false); + req = cookie_tcp_reqsk_alloc(&tcp6_request_sock_ops, sk, skb); if (!req) goto out; @@ -178,9 +178,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) treq = tcp_rsk(req); treq->tfo_listener = false; - if (IS_ENABLED(CONFIG_MPTCP)) - treq->is_mptcp = 0; - if (security_inet_conn_request(sk, skb, req)) goto out_free; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f67d45ff00b4..305870a72352 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -567,7 +567,7 @@ static struct tcp_md5sig_key *tcp_v6_md5_lookup(const struct sock *sk, } static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, - char __user *optval, int optlen) + sockptr_t optval, int optlen) { struct tcp_md5sig cmd; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr; @@ -577,7 +577,7 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, if (optlen < sizeof(cmd)) return -EINVAL; - if (copy_from_user(&cmd, optval, sizeof(cmd))) + if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) return -EFAULT; if (sin6->sin6_family != AF_INET6) @@ -1811,6 +1811,13 @@ static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_destructor = tcp_twsk_destructor, }; +INDIRECT_CALLABLE_SCOPE void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + + __tcp_v6_send_check(skb, &np->saddr, &sk->sk_v6_daddr); +} + const struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, @@ -1824,10 +1831,6 @@ const struct inet_connection_sock_af_ops ipv6_specific = { .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_ipv6_setsockopt, - .compat_getsockopt = compat_ipv6_getsockopt, -#endif .mtu_reduced = tcp_v6_mtu_reduced, }; @@ -1854,10 +1857,6 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = { .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_ipv6_setsockopt, - .compat_getsockopt = compat_ipv6_getsockopt, -#endif .mtu_reduced = tcp_v4_mtu_reduced, }; @@ -2115,10 +2114,6 @@ struct proto tcpv6_prot = { .rsk_prot = &tcp6_request_sock_ops, .h.hashinfo = &tcp_hashinfo, .no_autobind = true, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_tcp_setsockopt, - .compat_getsockopt = compat_tcp_getsockopt, -#endif .diag_destroy = tcp_abort, }; EXPORT_SYMBOL_GPL(tcpv6_prot); diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index 06c02ebe6b9b..00e8d8b1c9a7 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -155,6 +155,33 @@ drop: return 0; } +#if IS_ENABLED(CONFIG_INET6_XFRM_TUNNEL) +static int tunnel6_rcv_cb(struct sk_buff *skb, u8 proto, int err) +{ + struct xfrm6_tunnel __rcu *head; + struct xfrm6_tunnel *handler; + int ret; + + head = (proto == IPPROTO_IPV6) ? tunnel6_handlers : tunnel46_handlers; + + for_each_tunnel_rcu(head, handler) { + if (handler->cb_handler) { + ret = handler->cb_handler(skb, err); + if (ret <= 0) + return ret; + } + } + + return 0; +} + +static const struct xfrm_input_afinfo tunnel6_input_afinfo = { + .family = AF_INET6, + .is_ipip = true, + .callback = tunnel6_rcv_cb, +}; +#endif + static int tunnel46_rcv(struct sk_buff *skb) { struct xfrm6_tunnel *handler; @@ -245,11 +272,25 @@ static int __init tunnel6_init(void) inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP); return -EAGAIN; } +#if IS_ENABLED(CONFIG_INET6_XFRM_TUNNEL) + if (xfrm_input_register_afinfo(&tunnel6_input_afinfo)) { + pr_err("%s: can't add input afinfo\n", __func__); + inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6); + inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP); + if (xfrm6_tunnel_mpls_supported()) + inet6_del_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS); + return -EAGAIN; + } +#endif return 0; } static void __exit tunnel6_fini(void) { +#if IS_ENABLED(CONFIG_INET6_XFRM_TUNNEL) + if (xfrm_input_unregister_afinfo(&tunnel6_input_afinfo)) + pr_err("%s: can't remove input afinfo\n", __func__); +#endif if (inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP)) pr_err("%s: can't remove protocol\n", __func__); if (inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6)) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index a8d74f44056a..29d9691359b9 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -141,6 +141,24 @@ static int compute_score(struct sock *sk, struct net *net, return score; } +static struct sock *lookup_reuseport(struct net *net, struct sock *sk, + struct sk_buff *skb, + const struct in6_addr *saddr, + __be16 sport, + const struct in6_addr *daddr, + unsigned int hnum) +{ + struct sock *reuse_sk = NULL; + u32 hash; + + if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) { + hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); + reuse_sk = reuseport_select_sock(sk, hash, skb, + sizeof(struct udphdr)); + } + return reuse_sk; +} + /* called with rcu_read_lock() */ static struct sock *udp6_lib_lookup2(struct net *net, const struct in6_addr *saddr, __be16 sport, @@ -148,9 +166,8 @@ static struct sock *udp6_lib_lookup2(struct net *net, int dif, int sdif, struct udp_hslot *hslot2, struct sk_buff *skb) { - struct sock *sk, *result, *reuseport_result; + struct sock *sk, *result; int score, badness; - u32 hash = 0; result = NULL; badness = -1; @@ -158,26 +175,44 @@ static struct sock *udp6_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { - reuseport_result = NULL; + result = lookup_reuseport(net, sk, skb, + saddr, sport, daddr, hnum); + /* Fall back to scoring if group has connections */ + if (result && !reuseport_has_conns(sk, false)) + return result; - if (sk->sk_reuseport && - sk->sk_state != TCP_ESTABLISHED) { - hash = udp6_ehashfn(net, daddr, hnum, - saddr, sport); - - reuseport_result = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); - if (reuseport_result && !reuseport_has_conns(sk, false)) - return reuseport_result; - } - - result = reuseport_result ? : sk; + result = result ? : sk; badness = score; } } return result; } +static inline struct sock *udp6_lookup_run_bpf(struct net *net, + struct udp_table *udptable, + struct sk_buff *skb, + const struct in6_addr *saddr, + __be16 sport, + const struct in6_addr *daddr, + u16 hnum) +{ + struct sock *sk, *reuse_sk; + bool no_reuseport; + + if (udptable != &udp_table) + return NULL; /* only UDP is supported */ + + no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_UDP, + saddr, sport, daddr, hnum, &sk); + if (no_reuseport || IS_ERR_OR_NULL(sk)) + return sk; + + reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); + if (reuse_sk) + sk = reuse_sk; + return sk; +} + /* rcu_read_lock() must be held */ struct sock *__udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, @@ -188,25 +223,42 @@ struct sock *__udp6_lib_lookup(struct net *net, unsigned short hnum = ntohs(dport); unsigned int hash2, slot2; struct udp_hslot *hslot2; - struct sock *result; + struct sock *result, *sk; hash2 = ipv6_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; + /* Lookup connected or non-wildcard sockets */ result = udp6_lib_lookup2(net, saddr, sport, daddr, hnum, dif, sdif, hslot2, skb); - if (!result) { - hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); - slot2 = hash2 & udptable->mask; + if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED) + goto done; + + /* Lookup redirect from BPF */ + if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { + sk = udp6_lookup_run_bpf(net, udptable, skb, + saddr, sport, daddr, hnum); + if (sk) { + result = sk; + goto done; + } + } - hslot2 = &udptable->hash2[slot2]; + /* Got non-wildcard socket or error on first lookup */ + if (result) + goto done; - result = udp6_lib_lookup2(net, saddr, sport, - &in6addr_any, hnum, dif, sdif, - hslot2, skb); - } + /* Lookup wildcard sockets */ + hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); + slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; + + result = udp6_lib_lookup2(net, saddr, sport, + &in6addr_any, hnum, dif, sdif, + hslot2, skb); +done: if (IS_ERR(result)) return NULL; return result; @@ -1062,6 +1114,9 @@ static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr, * @sk: socket we are sending on * @skb: sk_buff containing the filled-in UDP header * (checksum field must be zeroed out) + * @saddr: source address + * @daddr: destination address + * @len: length of packet */ static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, const struct in6_addr *saddr, @@ -1561,26 +1616,16 @@ void udpv6_destroy_sock(struct sock *sk) /* * Socket option code for UDP */ -int udpv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) +int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen) { if (level == SOL_UDP || level == SOL_UDPLITE) - return udp_lib_setsockopt(sk, level, optname, optval, optlen, + return udp_lib_setsockopt(sk, level, optname, + optval, optlen, udp_v6_push_pending_frames); return ipv6_setsockopt(sk, level, optname, optval, optlen); } -#ifdef CONFIG_COMPAT -int compat_udpv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) -{ - if (level == SOL_UDP || level == SOL_UDPLITE) - return udp_lib_setsockopt(sk, level, optname, optval, optlen, - udp_v6_push_pending_frames); - return compat_ipv6_setsockopt(sk, level, optname, optval, optlen); -} -#endif - int udpv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -1589,16 +1634,6 @@ int udpv6_getsockopt(struct sock *sk, int level, int optname, return ipv6_getsockopt(sk, level, optname, optval, optlen); } -#ifdef CONFIG_COMPAT -int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - if (level == SOL_UDP || level == SOL_UDPLITE) - return udp_lib_getsockopt(sk, level, optname, optval, optlen); - return compat_ipv6_getsockopt(sk, level, optname, optval, optlen); -} -#endif - /* thinking of making this const? Don't. * early_demux can change based on sysctl. */ @@ -1681,10 +1716,6 @@ struct proto udpv6_prot = { .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp6_sock), .h.udp_table = &udp_table, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_udpv6_setsockopt, - .compat_getsockopt = compat_udpv6_getsockopt, -#endif .diag_destroy = udp_abort, }; diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index 20e324b6f358..b2fcc46c1630 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -17,14 +17,8 @@ void udp_v6_rehash(struct sock *sk); int udpv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); -int udpv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); -#ifdef CONFIG_COMPAT -int compat_udpv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); -int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen); -#endif +int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen); int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len); diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index bf7a7acd39b1..fbb700d3f437 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -52,10 +52,6 @@ struct proto udplitev6_prot = { .sysctl_mem = sysctl_udp_mem, .obj_size = sizeof(struct udp6_sock), .h.udp_table = &udplite_table, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_udpv6_setsockopt, - .compat_getsockopt = compat_udpv6_getsockopt, -#endif }; static struct inet_protosw udplite6_protosw = { diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index ee0add15497d..6ee9851ac7c6 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1494,7 +1494,7 @@ static int iucv_sock_release(struct socket *sock) /* getsockopt and setsockopt */ static int iucv_sock_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct iucv_sock *iucv = iucv_sk(sk); @@ -1507,7 +1507,7 @@ static int iucv_sock_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *) optval)) + if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; rc = 0; diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 56fac24a627a..56dad9565bc9 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1265,7 +1265,7 @@ static void kcm_recv_enable(struct kcm_sock *kcm) } static int kcm_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct kcm_sock *kcm = kcm_sk(sock->sk); int val, valbool; @@ -1277,8 +1277,8 @@ static int kcm_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) - return -EINVAL; + if (copy_from_sockptr(&val, optval, sizeof(int))) + return -EFAULT; valbool = val ? 1 : 0; diff --git a/net/key/af_key.c b/net/key/af_key.c index a915bc86620a..c12dbc51ef5f 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3741,8 +3741,6 @@ static const struct proto_ops pfkey_ops = { .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 6434d17e6e8e..701fc72ad9f4 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* - * L2TP core. +/* L2TP core. * * Copyright (c) 2008,2009,2010 Katalix Systems Ltd * @@ -94,7 +93,7 @@ struct l2tp_skb_cb { unsigned long expires; }; -#define L2TP_SKB_CB(skb) ((struct l2tp_skb_cb *) &skb->cb[sizeof(struct inet_skb_parm)]) +#define L2TP_SKB_CB(skb) ((struct l2tp_skb_cb *)&(skb)->cb[sizeof(struct inet_skb_parm)]) static struct workqueue_struct *l2tp_wq; @@ -102,8 +101,10 @@ static struct workqueue_struct *l2tp_wq; static unsigned int l2tp_net_id; struct l2tp_net { struct list_head l2tp_tunnel_list; + /* Lock for write access to l2tp_tunnel_list */ spinlock_t l2tp_tunnel_list_lock; struct hlist_head l2tp_session_hlist[L2TP_HASH_SIZE_2]; + /* Lock for write access to l2tp_session_hlist */ spinlock_t l2tp_session_hlist_lock; }; @@ -134,7 +135,6 @@ static inline struct hlist_head * l2tp_session_id_hash_2(struct l2tp_net *pn, u32 session_id) { return &pn->l2tp_session_hlist[hash_32(session_id, L2TP_HASH_BITS_2)]; - } /* Session hash list. @@ -149,12 +149,51 @@ l2tp_session_id_hash(struct l2tp_tunnel *tunnel, u32 session_id) return &tunnel->session_hlist[hash_32(session_id, L2TP_HASH_BITS)]; } -void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) +static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) { sock_put(tunnel->sock); /* the tunnel is freed in the socket destructor */ } -EXPORT_SYMBOL(l2tp_tunnel_free); + +static void l2tp_session_free(struct l2tp_session *session) +{ + struct l2tp_tunnel *tunnel = session->tunnel; + + if (tunnel) { + if (WARN_ON(tunnel->magic != L2TP_TUNNEL_MAGIC)) + goto out; + l2tp_tunnel_dec_refcount(tunnel); + } + +out: + kfree(session); +} + +void l2tp_tunnel_inc_refcount(struct l2tp_tunnel *tunnel) +{ + refcount_inc(&tunnel->ref_count); +} +EXPORT_SYMBOL_GPL(l2tp_tunnel_inc_refcount); + +void l2tp_tunnel_dec_refcount(struct l2tp_tunnel *tunnel) +{ + if (refcount_dec_and_test(&tunnel->ref_count)) + l2tp_tunnel_free(tunnel); +} +EXPORT_SYMBOL_GPL(l2tp_tunnel_dec_refcount); + +void l2tp_session_inc_refcount(struct l2tp_session *session) +{ + refcount_inc(&session->ref_count); +} +EXPORT_SYMBOL_GPL(l2tp_session_inc_refcount); + +void l2tp_session_dec_refcount(struct l2tp_session *session) +{ + if (refcount_dec_and_test(&session->ref_count)) + l2tp_session_free(session); +} +EXPORT_SYMBOL_GPL(l2tp_session_dec_refcount); /* Lookup a tunnel. A new reference is held on the returned tunnel. */ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id) @@ -412,7 +451,7 @@ static void l2tp_recv_dequeue_skb(struct l2tp_session *session, struct sk_buff * } /* call private receive handler */ - if (session->recv_skb != NULL) + if (session->recv_skb) (*session->recv_skb)(session, skb, L2TP_SKB_CB(skb)->length); else kfree_skb(skb); @@ -621,8 +660,8 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, int length) { struct l2tp_tunnel *tunnel = session->tunnel; + u32 ns = 0, nr = 0; int offset; - u32 ns, nr; /* Parse and check optional cookie */ if (session->peer_cookie_len > 0) { @@ -644,13 +683,12 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, * the control of the LNS. If no sequence numbers present but * we were expecting them, discard frame. */ - ns = nr = 0; L2TP_SKB_CB(skb)->has_seq = 0; if (tunnel->version == L2TP_HDR_VER_2) { if (hdrflags & L2TP_HDRFLAG_S) { - ns = ntohs(*(__be16 *) ptr); + ns = ntohs(*(__be16 *)ptr); ptr += 2; - nr = ntohs(*(__be16 *) ptr); + nr = ntohs(*(__be16 *)ptr); ptr += 2; /* Store L2TP info in the skb */ @@ -662,7 +700,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, session->name, ns, nr, session->nr); } } else if (session->l2specific_type == L2TP_L2SPECTYPE_DEFAULT) { - u32 l2h = ntohl(*(__be32 *) ptr); + u32 l2h = ntohl(*(__be32 *)ptr); if (l2h & 0x40000000) { ns = l2h & 0x00ffffff; @@ -679,11 +717,11 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, } if (L2TP_SKB_CB(skb)->has_seq) { - /* Received a packet with sequence numbers. If we're the LNS, + /* Received a packet with sequence numbers. If we're the LAC, * check if we sre sending sequence numbers and if not, * configure it so. */ - if ((!session->lns_mode) && (!session->send_seq)) { + if (!session->lns_mode && !session->send_seq) { l2tp_info(session, L2TP_MSG_SEQ, "%s: requested to enable seq numbers by LNS\n", session->name); @@ -707,7 +745,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, * If we're the LNS and we're sending sequence numbers, the * LAC is broken. Discard the frame. */ - if ((!session->lns_mode) && (session->send_seq)) { + if (!session->lns_mode && session->send_seq) { l2tp_info(session, L2TP_MSG_SEQ, "%s: requested to disable seq numbers by LNS\n", session->name); @@ -770,20 +808,21 @@ discard: atomic_long_inc(&session->stats.rx_errors); kfree_skb(skb); } -EXPORT_SYMBOL(l2tp_recv_common); +EXPORT_SYMBOL_GPL(l2tp_recv_common); /* Drop skbs from the session's reorder_q */ -static int l2tp_session_queue_purge(struct l2tp_session *session) +static void l2tp_session_queue_purge(struct l2tp_session *session) { struct sk_buff *skb = NULL; - BUG_ON(!session); - BUG_ON(session->magic != L2TP_SESSION_MAGIC); + + if (WARN_ON(session->magic != L2TP_SESSION_MAGIC)) + return; + while ((skb = skb_dequeue(&session->reorder_q))) { atomic_long_inc(&session->stats.rx_errors); kfree_skb(skb); } - return 0; } /* Internal UDP receive frame. Do the real work of receiving an L2TP data frame @@ -825,10 +864,11 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) } /* Point to L2TP header */ - optr = ptr = skb->data; + optr = skb->data; + ptr = skb->data; /* Get L2TP header flags */ - hdrflags = ntohs(*(__be16 *) ptr); + hdrflags = ntohs(*(__be16 *)ptr); /* Check protocol version */ version = hdrflags & L2TP_HDR_VER_MASK; @@ -859,14 +899,14 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) ptr += 2; /* Extract tunnel and session ID */ - tunnel_id = ntohs(*(__be16 *) ptr); + tunnel_id = ntohs(*(__be16 *)ptr); ptr += 2; - session_id = ntohs(*(__be16 *) ptr); + session_id = ntohs(*(__be16 *)ptr); ptr += 2; } else { ptr += 2; /* skip reserved bits */ tunnel_id = tunnel->tunnel_id; - session_id = ntohl(*(__be32 *) ptr); + session_id = ntohl(*(__be32 *)ptr); ptr += 4; } @@ -910,7 +950,7 @@ int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) struct l2tp_tunnel *tunnel; tunnel = rcu_dereference_sk_user_data(sk); - if (tunnel == NULL) + if (!tunnel) goto pass_up; l2tp_dbg(tunnel, L2TP_MSG_DATA, "%s: received %d bytes\n", @@ -971,13 +1011,13 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf) */ if (tunnel->encap == L2TP_ENCAPTYPE_UDP) { u16 flags = L2TP_HDR_VER_3; - *((__be16 *) bufp) = htons(flags); + *((__be16 *)bufp) = htons(flags); bufp += 2; - *((__be16 *) bufp) = 0; + *((__be16 *)bufp) = 0; bufp += 2; } - *((__be32 *) bufp) = htonl(session->peer_session_id); + *((__be32 *)bufp) = htonl(session->peer_session_id); bufp += 4; if (session->cookie_len) { memcpy(bufp, &session->cookie[0], session->cookie_len); @@ -1076,7 +1116,10 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len } /* Setup L2TP header */ - session->build_header(session, __skb_push(skb, hdr_len)); + if (tunnel->version == L2TP_HDR_VER_2) + l2tp_build_l2tpv2_header(session, __skb_push(skb, hdr_len)); + else + l2tp_build_l2tpv3_header(session, __skb_push(skb, hdr_len)); /* Reset skb netfilter state */ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -1121,8 +1164,8 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len &sk->sk_v6_daddr, udp_len); else #endif - udp_set_csum(sk->sk_no_check_tx, skb, inet->inet_saddr, - inet->inet_daddr, udp_len); + udp_set_csum(sk->sk_no_check_tx, skb, inet->inet_saddr, + inet->inet_daddr, udp_len); break; case L2TP_ENCAPTYPE_IP: @@ -1149,7 +1192,7 @@ static void l2tp_tunnel_destruct(struct sock *sk) { struct l2tp_tunnel *tunnel = l2tp_tunnel(sk); - if (tunnel == NULL) + if (!tunnel) goto end; l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: closing...\n", tunnel->name); @@ -1179,6 +1222,30 @@ end: return; } +/* Remove an l2tp session from l2tp_core's hash lists. */ +static void l2tp_session_unhash(struct l2tp_session *session) +{ + struct l2tp_tunnel *tunnel = session->tunnel; + + /* Remove the session from core hashes */ + if (tunnel) { + /* Remove from the per-tunnel hash */ + write_lock_bh(&tunnel->hlist_lock); + hlist_del_init(&session->hlist); + write_unlock_bh(&tunnel->hlist_lock); + + /* For L2TPv3 we have a per-net hash: remove from there, too */ + if (tunnel->version != L2TP_HDR_VER_2) { + struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net); + + spin_lock_bh(&pn->l2tp_session_hlist_lock); + hlist_del_init_rcu(&session->global_hlist); + spin_unlock_bh(&pn->l2tp_session_hlist_lock); + synchronize_rcu(); + } + } +} + /* When the tunnel is closed, all the attached sessions need to go too. */ static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) @@ -1188,8 +1255,6 @@ static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) struct hlist_node *tmp; struct l2tp_session *session; - BUG_ON(tunnel == NULL); - l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: closing all sessions...\n", tunnel->name); @@ -1210,10 +1275,10 @@ again: write_unlock_bh(&tunnel->hlist_lock); - __l2tp_session_unhash(session); + l2tp_session_unhash(session); l2tp_session_queue_purge(session); - if (session->session_close != NULL) + if (session->session_close) (*session->session_close)(session); l2tp_session_dec_refcount(session); @@ -1284,10 +1349,10 @@ static void l2tp_tunnel_del_work(struct work_struct *work) * exit hook. */ static int l2tp_tunnel_sock_create(struct net *net, - u32 tunnel_id, - u32 peer_tunnel_id, - struct l2tp_tunnel_cfg *cfg, - struct socket **sockp) + u32 tunnel_id, + u32 peer_tunnel_id, + struct l2tp_tunnel_cfg *cfg, + struct socket **sockp) { int err = -EINVAL; struct socket *sock = NULL; @@ -1305,9 +1370,9 @@ static int l2tp_tunnel_sock_create(struct net *net, memcpy(&udp_conf.peer_ip6, cfg->peer_ip6, sizeof(udp_conf.peer_ip6)); udp_conf.use_udp6_tx_checksums = - ! cfg->udp6_zero_tx_checksums; + !cfg->udp6_zero_tx_checksums; udp_conf.use_udp6_rx_checksums = - ! cfg->udp6_zero_rx_checksums; + !cfg->udp6_zero_rx_checksums; } else #endif { @@ -1332,7 +1397,7 @@ static int l2tp_tunnel_sock_create(struct net *net, struct sockaddr_l2tpip6 ip6_addr = {0}; err = sock_create_kern(net, AF_INET6, SOCK_DGRAM, - IPPROTO_L2TP, &sock); + IPPROTO_L2TP, &sock); if (err < 0) goto out; @@ -1340,7 +1405,7 @@ static int l2tp_tunnel_sock_create(struct net *net, memcpy(&ip6_addr.l2tp_addr, cfg->local_ip6, sizeof(ip6_addr.l2tp_addr)); ip6_addr.l2tp_conn_id = tunnel_id; - err = kernel_bind(sock, (struct sockaddr *) &ip6_addr, + err = kernel_bind(sock, (struct sockaddr *)&ip6_addr, sizeof(ip6_addr)); if (err < 0) goto out; @@ -1350,7 +1415,7 @@ static int l2tp_tunnel_sock_create(struct net *net, sizeof(ip6_addr.l2tp_addr)); ip6_addr.l2tp_conn_id = peer_tunnel_id; err = kernel_connect(sock, - (struct sockaddr *) &ip6_addr, + (struct sockaddr *)&ip6_addr, sizeof(ip6_addr), 0); if (err < 0) goto out; @@ -1360,14 +1425,14 @@ static int l2tp_tunnel_sock_create(struct net *net, struct sockaddr_l2tpip ip_addr = {0}; err = sock_create_kern(net, AF_INET, SOCK_DGRAM, - IPPROTO_L2TP, &sock); + IPPROTO_L2TP, &sock); if (err < 0) goto out; ip_addr.l2tp_family = AF_INET; ip_addr.l2tp_addr = cfg->local_ip; ip_addr.l2tp_conn_id = tunnel_id; - err = kernel_bind(sock, (struct sockaddr *) &ip_addr, + err = kernel_bind(sock, (struct sockaddr *)&ip_addr, sizeof(ip_addr)); if (err < 0) goto out; @@ -1375,7 +1440,7 @@ static int l2tp_tunnel_sock_create(struct net *net, ip_addr.l2tp_family = AF_INET; ip_addr.l2tp_addr = cfg->peer_ip; ip_addr.l2tp_conn_id = peer_tunnel_id; - err = kernel_connect(sock, (struct sockaddr *) &ip_addr, + err = kernel_connect(sock, (struct sockaddr *)&ip_addr, sizeof(ip_addr), 0); if (err < 0) goto out; @@ -1388,7 +1453,7 @@ static int l2tp_tunnel_sock_create(struct net *net, out: *sockp = sock; - if ((err < 0) && sock) { + if (err < 0 && sock) { kernel_sock_shutdown(sock, SHUT_RDWR); sock_release(sock); *sockp = NULL; @@ -1399,17 +1464,18 @@ out: static struct lock_class_key l2tp_socket_class; -int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp) +int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, + struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp) { struct l2tp_tunnel *tunnel = NULL; int err; enum l2tp_encap_type encap = L2TP_ENCAPTYPE_UDP; - if (cfg != NULL) + if (cfg) encap = cfg->encap; - tunnel = kzalloc(sizeof(struct l2tp_tunnel), GFP_KERNEL); - if (tunnel == NULL) { + tunnel = kzalloc(sizeof(*tunnel), GFP_KERNEL); + if (!tunnel) { err = -ENOMEM; goto err; } @@ -1424,7 +1490,7 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 rwlock_init(&tunnel->hlist_lock); tunnel->acpt_newsess = true; - if (cfg != NULL) + if (cfg) tunnel->debug = cfg->debug; tunnel->encap = encap; @@ -1557,67 +1623,17 @@ void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) } EXPORT_SYMBOL_GPL(l2tp_tunnel_delete); -/* Really kill the session. - */ -void l2tp_session_free(struct l2tp_session *session) -{ - struct l2tp_tunnel *tunnel = session->tunnel; - - BUG_ON(refcount_read(&session->ref_count) != 0); - - if (tunnel) { - BUG_ON(tunnel->magic != L2TP_TUNNEL_MAGIC); - l2tp_tunnel_dec_refcount(tunnel); - } - - kfree(session); -} -EXPORT_SYMBOL_GPL(l2tp_session_free); - -/* Remove an l2tp session from l2tp_core's hash lists. - * Provides a tidyup interface for pseudowire code which can't just route all - * shutdown via. l2tp_session_delete and a pseudowire-specific session_close - * callback. - */ -void __l2tp_session_unhash(struct l2tp_session *session) -{ - struct l2tp_tunnel *tunnel = session->tunnel; - - /* Remove the session from core hashes */ - if (tunnel) { - /* Remove from the per-tunnel hash */ - write_lock_bh(&tunnel->hlist_lock); - hlist_del_init(&session->hlist); - write_unlock_bh(&tunnel->hlist_lock); - - /* For L2TPv3 we have a per-net hash: remove from there, too */ - if (tunnel->version != L2TP_HDR_VER_2) { - struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net); - spin_lock_bh(&pn->l2tp_session_hlist_lock); - hlist_del_init_rcu(&session->global_hlist); - spin_unlock_bh(&pn->l2tp_session_hlist_lock); - synchronize_rcu(); - } - } -} -EXPORT_SYMBOL_GPL(__l2tp_session_unhash); - -/* This function is used by the netlink SESSION_DELETE command and by - pseudowire modules. - */ -int l2tp_session_delete(struct l2tp_session *session) +void l2tp_session_delete(struct l2tp_session *session) { if (test_and_set_bit(0, &session->dead)) - return 0; + return; - __l2tp_session_unhash(session); + l2tp_session_unhash(session); l2tp_session_queue_purge(session); - if (session->session_close != NULL) + if (session->session_close) (*session->session_close)(session); l2tp_session_dec_refcount(session); - - return 0; } EXPORT_SYMBOL_GPL(l2tp_session_delete); @@ -1636,16 +1652,16 @@ void l2tp_session_set_header_len(struct l2tp_session *session, int version) if (session->tunnel->encap == L2TP_ENCAPTYPE_UDP) session->hdr_len += 4; } - } EXPORT_SYMBOL_GPL(l2tp_session_set_header_len); -struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg) +struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, + u32 peer_session_id, struct l2tp_session_cfg *cfg) { struct l2tp_session *session; - session = kzalloc(sizeof(struct l2tp_session) + priv_size, GFP_KERNEL); - if (session != NULL) { + session = kzalloc(sizeof(*session) + priv_size, GFP_KERNEL); + if (session) { session->magic = L2TP_SESSION_MAGIC; session->tunnel = tunnel; @@ -1687,11 +1703,6 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn memcpy(&session->peer_cookie[0], &cfg->peer_cookie[0], cfg->peer_cookie_len); } - if (tunnel->version == L2TP_HDR_VER_2) - session->build_header = l2tp_build_l2tpv2_header; - else - session->build_header = l2tp_build_l2tpv3_header; - l2tp_session_set_header_len(session, tunnel->version); refcount_set(&session->ref_count, 1); diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 10cf7c3dcbb3..3468d6b177a0 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* - * L2TP internal definitions. +/* L2TP internal definitions. * * Copyright (c) 2008,2009 Katalix Systems Ltd */ @@ -16,17 +15,17 @@ #include <net/xfrm.h> #endif -/* Just some random numbers */ +/* Random numbers used for internal consistency checks of tunnel and session structures */ #define L2TP_TUNNEL_MAGIC 0x42114DDA #define L2TP_SESSION_MAGIC 0x0C04EB7D -/* Per tunnel, session hash table size */ +/* Per tunnel session hash table size */ #define L2TP_HASH_BITS 4 -#define L2TP_HASH_SIZE (1 << L2TP_HASH_BITS) +#define L2TP_HASH_SIZE BIT(L2TP_HASH_BITS) -/* System-wide, session hash table size */ +/* System-wide session hash table size */ #define L2TP_HASH_BITS_2 8 -#define L2TP_HASH_SIZE_2 (1 << L2TP_HASH_BITS_2) +#define L2TP_HASH_SIZE_2 BIT(L2TP_HASH_BITS_2) struct sk_buff; @@ -44,37 +43,34 @@ struct l2tp_stats { struct l2tp_tunnel; -/* Describes a session. Contains information to determine incoming - * packets and transmit outgoing ones. - */ +/* L2TP session configuration */ struct l2tp_session_cfg { enum l2tp_pwtype pw_type; - unsigned int recv_seq:1; /* expect receive packets with - * sequence numbers? */ - unsigned int send_seq:1; /* send packets with sequence - * numbers? */ - unsigned int lns_mode:1; /* behave as LNS? LAC enables - * sequence numbers under - * control of LNS. */ - int debug; /* bitmask of debug message - * categories */ + unsigned int recv_seq:1; /* expect receive packets with sequence numbers? */ + unsigned int send_seq:1; /* send packets with sequence numbers? */ + unsigned int lns_mode:1; /* behave as LNS? + * LAC enables sequence numbers under LNS control. + */ + int debug; /* bitmask of debug message categories */ u16 l2specific_type; /* Layer 2 specific type */ u8 cookie[8]; /* optional cookie */ int cookie_len; /* 0, 4 or 8 bytes */ u8 peer_cookie[8]; /* peer's cookie */ int peer_cookie_len; /* 0, 4 or 8 bytes */ - int reorder_timeout; /* configured reorder timeout - * (in jiffies) */ + int reorder_timeout; /* configured reorder timeout (in jiffies) */ char *ifname; }; +/* Represents a session (pseudowire) instance. + * Tracks runtime state including cookies, dataplane packet sequencing, and IO statistics. + * Is linked into a per-tunnel session hashlist; and in the case of an L2TPv3 session into + * an additional per-net ("global") hashlist. + */ struct l2tp_session { - int magic; /* should be - * L2TP_SESSION_MAGIC */ + int magic; /* should be L2TP_SESSION_MAGIC */ long dead; - struct l2tp_tunnel *tunnel; /* back pointer to tunnel - * context */ + struct l2tp_tunnel *tunnel; /* back pointer to tunnel context */ u32 session_id; u32 peer_session_id; u8 cookie[8]; @@ -89,42 +85,53 @@ struct l2tp_session { u32 nr_max; /* max NR. Depends on tunnel */ u32 nr_window_size; /* NR window size */ u32 nr_oos; /* NR of last OOS packet */ - int nr_oos_count; /* For OOS recovery */ + int nr_oos_count; /* for OOS recovery */ int nr_oos_count_max; - struct hlist_node hlist; /* Hash list node */ + struct hlist_node hlist; /* hash list node */ refcount_t ref_count; char name[32]; /* for logging */ char ifname[IFNAMSIZ]; - unsigned int recv_seq:1; /* expect receive packets with - * sequence numbers? */ - unsigned int send_seq:1; /* send packets with sequence - * numbers? */ - unsigned int lns_mode:1; /* behave as LNS? LAC enables - * sequence numbers under - * control of LNS. */ - int debug; /* bitmask of debug message - * categories */ - int reorder_timeout; /* configured reorder timeout - * (in jiffies) */ + unsigned int recv_seq:1; /* expect receive packets with sequence numbers? */ + unsigned int send_seq:1; /* send packets with sequence numbers? */ + unsigned int lns_mode:1; /* behave as LNS? + * LAC enables sequence numbers under LNS control. + */ + int debug; /* bitmask of debug message categories */ + int reorder_timeout; /* configured reorder timeout (in jiffies) */ int reorder_skip; /* set if skip to next nr */ enum l2tp_pwtype pwtype; struct l2tp_stats stats; - struct hlist_node global_hlist; /* Global hash list node */ + struct hlist_node global_hlist; /* global hash list node */ - int (*build_header)(struct l2tp_session *session, void *buf); + /* Session receive handler for data packets. + * Each pseudowire implementation should implement this callback in order to + * handle incoming packets. Packets are passed to the pseudowire handler after + * reordering, if data sequence numbers are enabled for the session. + */ void (*recv_skb)(struct l2tp_session *session, struct sk_buff *skb, int data_len); + + /* Session close handler. + * Each pseudowire implementation may implement this callback in order to carry + * out pseudowire-specific shutdown actions. + * The callback is called by core after unhashing the session and purging its + * reorder queue. + */ void (*session_close)(struct l2tp_session *session); + + /* Session show handler. + * Pseudowire-specific implementation of debugfs session rendering. + * The callback is called by l2tp_debugfs.c after rendering core session + * information. + */ void (*show)(struct seq_file *m, void *priv); - u8 priv[]; /* private data */ + + u8 priv[]; /* private data */ }; -/* Describes the tunnel. It contains info to track all the associated - * sessions so incoming packets can be sorted out - */ +/* L2TP tunnel configuration */ struct l2tp_tunnel_cfg { - int debug; /* bitmask of debug message - * categories */ + int debug; /* bitmask of debug message categories */ enum l2tp_encap_type encap; /* Used only for kernel-created sockets */ @@ -141,6 +148,12 @@ struct l2tp_tunnel_cfg { udp6_zero_rx_checksums:1; }; +/* Represents a tunnel instance. + * Tracks runtime state including IO statistics. + * Holds the tunnel socket (either passed from userspace or directly created by the kernel). + * Maintains a hashlist of sessions belonging to the tunnel instance. + * Is linked into a per-net list of tunnels. + */ struct l2tp_tunnel { int magic; /* Should be L2TP_TUNNEL_MAGIC */ @@ -148,40 +161,51 @@ struct l2tp_tunnel { struct rcu_head rcu; rwlock_t hlist_lock; /* protect session_hlist */ - bool acpt_newsess; /* Indicates whether this - * tunnel accepts new sessions. - * Protected by hlist_lock. + bool acpt_newsess; /* indicates whether this tunnel accepts + * new sessions. Protected by hlist_lock. */ struct hlist_head session_hlist[L2TP_HASH_SIZE]; - /* hashed list of sessions, - * hashed by id */ + /* hashed list of sessions, hashed by id */ u32 tunnel_id; u32 peer_tunnel_id; int version; /* 2=>L2TPv2, 3=>L2TPv3 */ char name[20]; /* for logging */ - int debug; /* bitmask of debug message - * categories */ + int debug; /* bitmask of debug message categories */ enum l2tp_encap_type encap; struct l2tp_stats stats; - struct list_head list; /* Keep a list of all tunnels */ + struct list_head list; /* list node on per-namespace list of tunnels */ struct net *l2tp_net; /* the net we belong to */ refcount_t ref_count; - void (*old_sk_destruct)(struct sock *); - struct sock *sock; /* Parent socket */ - int fd; /* Parent fd, if tunnel socket - * was created by userspace */ + void (*old_sk_destruct)(struct sock *sk); + struct sock *sock; /* parent socket */ + int fd; /* parent fd, if tunnel socket was created + * by userspace + */ struct work_struct del_work; }; +/* Pseudowire ops callbacks for use with the l2tp genetlink interface */ struct l2tp_nl_cmd_ops { + /* The pseudowire session create callback is responsible for creating a session + * instance for a specific pseudowire type. + * It must call l2tp_session_create and l2tp_session_register to register the + * session instance, as well as carry out any pseudowire-specific initialisation. + * It must return >= 0 on success, or an appropriate negative errno value on failure. + */ int (*session_create)(struct net *net, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg); - int (*session_delete)(struct l2tp_session *session); + + /* The pseudowire session delete callback is responsible for initiating the deletion + * of a session instance. + * It must call l2tp_session_delete, as well as carry out any pseudowire-specific + * teardown actions. + */ + void (*session_delete)(struct l2tp_session *session); }; static inline void *l2tp_session_priv(struct l2tp_session *session) @@ -189,73 +213,68 @@ static inline void *l2tp_session_priv(struct l2tp_session *session) return &session->priv[0]; } +/* Tunnel and session refcounts */ +void l2tp_tunnel_inc_refcount(struct l2tp_tunnel *tunnel); +void l2tp_tunnel_dec_refcount(struct l2tp_tunnel *tunnel); +void l2tp_session_inc_refcount(struct l2tp_session *session); +void l2tp_session_dec_refcount(struct l2tp_session *session); + +/* Tunnel and session lookup. + * These functions take a reference on the instances they return, so + * the caller must ensure that the reference is dropped appropriately. + */ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id); struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth); struct l2tp_session *l2tp_tunnel_get_session(struct l2tp_tunnel *tunnel, u32 session_id); -void l2tp_tunnel_free(struct l2tp_tunnel *tunnel); - struct l2tp_session *l2tp_session_get(const struct net *net, u32 session_id); struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth); struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, const char *ifname); +/* Tunnel and session lifetime management. + * Creation of a new instance is a two-step process: create, then register. + * Destruction is triggered using the *_delete functions, and completes asynchronously. + */ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp); int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, struct l2tp_tunnel_cfg *cfg); - void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); + struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg); int l2tp_session_register(struct l2tp_session *session, struct l2tp_tunnel *tunnel); +void l2tp_session_delete(struct l2tp_session *session); -void __l2tp_session_unhash(struct l2tp_session *session); -int l2tp_session_delete(struct l2tp_session *session); -void l2tp_session_free(struct l2tp_session *session); +/* Receive path helpers. If data sequencing is enabled for the session these + * functions handle queuing and reordering prior to passing packets to the + * pseudowire code to be passed to userspace. + */ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, int length); int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb); -void l2tp_session_set_header_len(struct l2tp_session *session, int version); +/* Transmit path helpers for sending packets over the tunnel socket. */ +void l2tp_session_set_header_len(struct l2tp_session *session, int version); int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len); -int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, - const struct l2tp_nl_cmd_ops *ops); -void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type); -int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg); - -static inline void l2tp_tunnel_inc_refcount(struct l2tp_tunnel *tunnel) -{ - refcount_inc(&tunnel->ref_count); -} - -static inline void l2tp_tunnel_dec_refcount(struct l2tp_tunnel *tunnel) -{ - if (refcount_dec_and_test(&tunnel->ref_count)) - l2tp_tunnel_free(tunnel); -} - -/* Session reference counts. Incremented when code obtains a reference - * to a session. +/* Pseudowire management. + * Pseudowires should register with l2tp core on module init, and unregister + * on module exit. */ -static inline void l2tp_session_inc_refcount(struct l2tp_session *session) -{ - refcount_inc(&session->ref_count); -} +int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops); +void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type); -static inline void l2tp_session_dec_refcount(struct l2tp_session *session) -{ - if (refcount_dec_and_test(&session->ref_count)) - l2tp_session_free(session); -} +/* IOCTL helper for IP encap modules. */ +int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg); static inline int l2tp_get_l2specific_len(struct l2tp_session *session) { diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index 35bb4f3bdbe0..96cb9601c21b 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* - * L2TP subsystem debugfs +/* L2TP subsystem debugfs * * Copyright (c) 2010 Katalix Systems Ltd */ @@ -59,11 +58,10 @@ static void l2tp_dfs_next_session(struct l2tp_dfs_seq_data *pd) pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx); pd->session_idx++; - if (pd->session == NULL) { + if (!pd->session) { pd->session_idx = 0; l2tp_dfs_next_tunnel(pd); } - } static void *l2tp_dfs_seq_start(struct seq_file *m, loff_t *offs) @@ -74,23 +72,25 @@ static void *l2tp_dfs_seq_start(struct seq_file *m, loff_t *offs) if (!pos) goto out; - BUG_ON(m->private == NULL); + if (WARN_ON(!m->private)) { + pd = NULL; + goto out; + } pd = m->private; - if (pd->tunnel == NULL) + if (!pd->tunnel) l2tp_dfs_next_tunnel(pd); else l2tp_dfs_next_session(pd); /* NULL tunnel and session indicates end of list */ - if ((pd->tunnel == NULL) && (pd->session == NULL)) + if (!pd->tunnel && !pd->session) pd = NULL; out: return pd; } - static void *l2tp_dfs_seq_next(struct seq_file *m, void *v, loff_t *pos) { (*pos)++; @@ -148,11 +148,13 @@ static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v) const struct ipv6_pinfo *np = inet6_sk(tunnel->sock); seq_printf(m, " from %pI6c to %pI6c\n", - &np->saddr, &tunnel->sock->sk_v6_daddr); - } else + &np->saddr, &tunnel->sock->sk_v6_daddr); + } #endif - seq_printf(m, " from %pI4 to %pI4\n", - &inet->inet_saddr, &inet->inet_daddr); + if (tunnel->sock->sk_family == AF_INET) + seq_printf(m, " from %pI4 to %pI4\n", + &inet->inet_saddr, &inet->inet_daddr); + if (tunnel->encap == L2TP_ENCAPTYPE_UDP) seq_printf(m, " source port %hu, dest port %hu\n", ntohs(inet->inet_sport), ntohs(inet->inet_dport)); @@ -202,7 +204,7 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v) seq_printf(m, "%02x%02x%02x%02x", session->cookie[4], session->cookie[5], session->cookie[6], session->cookie[7]); - seq_printf(m, "\n"); + seq_puts(m, "\n"); } if (session->peer_cookie_len) { seq_printf(m, " peer cookie %02x%02x%02x%02x", @@ -212,7 +214,7 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v) seq_printf(m, "%02x%02x%02x%02x", session->peer_cookie[4], session->peer_cookie[5], session->peer_cookie[6], session->peer_cookie[7]); - seq_printf(m, "\n"); + seq_puts(m, "\n"); } seq_printf(m, " %hu/%hu tx %ld/%ld/%ld rx %ld/%ld/%ld\n", @@ -224,7 +226,7 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v) atomic_long_read(&session->stats.rx_bytes), atomic_long_read(&session->stats.rx_errors)); - if (session->show != NULL) + if (session->show) session->show(m, session); } @@ -271,7 +273,7 @@ static int l2tp_dfs_seq_open(struct inode *inode, struct file *file) int rc = -ENOMEM; pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (pd == NULL) + if (!pd) goto out; /* Derive the network namespace from the pid opening the diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index fd5ac2788e45..7ed2b4eced94 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* - * L2TPv3 ethernet pseudowire driver +/* L2TPv3 ethernet pseudowire driver * * Copyright (c) 2008,2009,2010 Katalix Systems Ltd */ @@ -51,7 +50,6 @@ struct l2tp_eth_sess { struct net_device __rcu *dev; }; - static int l2tp_eth_dev_init(struct net_device *dev) { eth_hw_addr_random(dev); @@ -73,7 +71,7 @@ static void l2tp_eth_dev_uninit(struct net_device *dev) */ } -static int l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev) { struct l2tp_eth *priv = netdev_priv(dev); struct l2tp_session *session = priv->session; @@ -94,13 +92,12 @@ static void l2tp_eth_get_stats64(struct net_device *dev, { struct l2tp_eth *priv = netdev_priv(dev); - stats->tx_bytes = (unsigned long) atomic_long_read(&priv->tx_bytes); - stats->tx_packets = (unsigned long) atomic_long_read(&priv->tx_packets); - stats->tx_dropped = (unsigned long) atomic_long_read(&priv->tx_dropped); - stats->rx_bytes = (unsigned long) atomic_long_read(&priv->rx_bytes); - stats->rx_packets = (unsigned long) atomic_long_read(&priv->rx_packets); - stats->rx_errors = (unsigned long) atomic_long_read(&priv->rx_errors); - + stats->tx_bytes = (unsigned long)atomic_long_read(&priv->tx_bytes); + stats->tx_packets = (unsigned long)atomic_long_read(&priv->tx_packets); + stats->tx_dropped = (unsigned long)atomic_long_read(&priv->tx_dropped); + stats->rx_bytes = (unsigned long)atomic_long_read(&priv->rx_bytes); + stats->rx_packets = (unsigned long)atomic_long_read(&priv->rx_packets); + stats->rx_errors = (unsigned long)atomic_long_read(&priv->rx_errors); } static const struct net_device_ops l2tp_eth_netdev_ops = { @@ -348,13 +345,11 @@ err: return rc; } - static const struct l2tp_nl_cmd_ops l2tp_eth_nl_cmd_ops = { .session_create = l2tp_eth_create, .session_delete = l2tp_session_delete, }; - static int __init l2tp_eth_init(void) { int err = 0; diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 955662a6dee7..df2a35b5714a 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* - * L2TPv3 IP encapsulation support +/* L2TPv3 IP encapsulation support * * Copyright (c) 2008,2009,2010 Katalix Systems Ltd */ @@ -125,8 +124,9 @@ static int l2tp_ip_recv(struct sk_buff *skb) goto discard; /* Point to L2TP header */ - optr = ptr = skb->data; - session_id = ntohl(*((__be32 *) ptr)); + optr = skb->data; + ptr = skb->data; + session_id = ntohl(*((__be32 *)ptr)); ptr += 4; /* RFC3931: L2TP/IP packets have the first 4 bytes containing @@ -154,7 +154,8 @@ static int l2tp_ip_recv(struct sk_buff *skb) goto discard_sess; /* Point to L2TP header */ - optr = ptr = skb->data; + optr = skb->data; + ptr = skb->data; ptr += 4; pr_debug("%s: ip recv\n", tunnel->name); print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length); @@ -176,7 +177,7 @@ pass_up: if ((skb->data[0] & 0xc0) != 0xc0) goto discard; - tunnel_id = ntohl(*(__be32 *) &skb->data[4]); + tunnel_id = ntohl(*(__be32 *)&skb->data[4]); iph = (struct iphdr *)skb_network_header(skb); read_lock_bh(&l2tp_ip_lock); @@ -260,7 +261,7 @@ static void l2tp_ip_destroy_sock(struct sock *sk) static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); - struct sockaddr_l2tpip *addr = (struct sockaddr_l2tpip *) uaddr; + struct sockaddr_l2tpip *addr = (struct sockaddr_l2tpip *)uaddr; struct net *net = sock_net(sk); int ret; int chk_addr_ret; @@ -285,8 +286,10 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) goto out; - if (addr->l2tp_addr.s_addr) - inet->inet_rcv_saddr = inet->inet_saddr = addr->l2tp_addr.s_addr; + if (addr->l2tp_addr.s_addr) { + inet->inet_rcv_saddr = addr->l2tp_addr.s_addr; + inet->inet_saddr = addr->l2tp_addr.s_addr; + } if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) inet->inet_saddr = 0; /* Use device */ @@ -316,7 +319,7 @@ out: static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { - struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *) uaddr; + struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *)uaddr; int rc; if (addr_len < sizeof(*lsa)) @@ -375,6 +378,7 @@ static int l2tp_ip_getname(struct socket *sock, struct sockaddr *uaddr, lsa->l2tp_addr.s_addr = inet->inet_daddr; } else { __be32 addr = inet->inet_rcv_saddr; + if (!addr) addr = inet->inet_saddr; lsa->l2tp_conn_id = lsk->conn_id; @@ -422,6 +426,7 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) /* Get and verify the address. */ if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_l2tpip *, lip, msg->msg_name); + rc = -EINVAL; if (msg->msg_namelen < sizeof(*lip)) goto out; @@ -456,7 +461,7 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) skb_reset_transport_header(skb); /* Insert 0 session_id */ - *((__be32 *) skb_put(skb, 4)) = 0; + *((__be32 *)skb_put(skb, 4)) = 0; /* Copy user data into skb */ rc = memcpy_from_msg(skb_put(skb, len), msg, len); @@ -467,10 +472,10 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl4 = &inet->cork.fl.u.ip4; if (connected) - rt = (struct rtable *) __sk_dst_check(sk, 0); + rt = (struct rtable *)__sk_dst_check(sk, 0); rcu_read_lock(); - if (rt == NULL) { + if (!rt) { const struct ip_options_rcu *inet_opt; inet_opt = rcu_dereference(inet->inet_opt); @@ -592,7 +597,7 @@ int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg) return put_user(amount, (int __user *)arg); } -EXPORT_SYMBOL(l2tp_ioctl); +EXPORT_SYMBOL_GPL(l2tp_ioctl); static struct proto l2tp_ip_prot = { .name = "L2TP/IP", @@ -612,10 +617,6 @@ static struct proto l2tp_ip_prot = { .hash = l2tp_ip_hash, .unhash = l2tp_ip_unhash, .obj_size = sizeof(struct l2tp_ip_sock), -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_ip_setsockopt, - .compat_getsockopt = compat_ip_getsockopt, -#endif }; static const struct proto_ops l2tp_ip_ops = { @@ -638,10 +639,6 @@ static const struct proto_ops l2tp_ip_ops = { .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, -#endif }; static struct inet_protosw l2tp_ip_protosw = { diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 526ed2c24dd5..bc757bc7e264 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* - * L2TPv3 IP encapsulation support for IPv6 +/* L2TPv3 IP encapsulation support for IPv6 * * Copyright (c) 2012 Katalix Systems Ltd */ @@ -38,7 +37,8 @@ struct l2tp_ip6_sock { u32 peer_conn_id; /* ipv6_pinfo has to be the last member of l2tp_ip6_sock, see - inet6_sk_generic */ + * inet6_sk_generic + */ struct ipv6_pinfo inet6; }; @@ -137,8 +137,9 @@ static int l2tp_ip6_recv(struct sk_buff *skb) goto discard; /* Point to L2TP header */ - optr = ptr = skb->data; - session_id = ntohl(*((__be32 *) ptr)); + optr = skb->data; + ptr = skb->data; + session_id = ntohl(*((__be32 *)ptr)); ptr += 4; /* RFC3931: L2TP/IP packets have the first 4 bytes containing @@ -166,7 +167,8 @@ static int l2tp_ip6_recv(struct sk_buff *skb) goto discard_sess; /* Point to L2TP header */ - optr = ptr = skb->data; + optr = skb->data; + ptr = skb->data; ptr += 4; pr_debug("%s: ip recv\n", tunnel->name); print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length); @@ -188,7 +190,7 @@ pass_up: if ((skb->data[0] & 0xc0) != 0xc0) goto discard; - tunnel_id = ntohl(*(__be32 *) &skb->data[4]); + tunnel_id = ntohl(*(__be32 *)&skb->data[4]); iph = ipv6_hdr(skb); read_lock_bh(&l2tp_ip6_lock); @@ -276,7 +278,7 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); - struct sockaddr_l2tpip6 *addr = (struct sockaddr_l2tpip6 *) uaddr; + struct sockaddr_l2tpip6 *addr = (struct sockaddr_l2tpip6 *)uaddr; struct net *net = sock_net(sk); __be32 v4addr = 0; int bound_dev_if; @@ -375,8 +377,8 @@ out_unlock: static int l2tp_ip6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { - struct sockaddr_l2tpip6 *lsa = (struct sockaddr_l2tpip6 *) uaddr; - struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; + struct sockaddr_l2tpip6 *lsa = (struct sockaddr_l2tpip6 *)uaddr; + struct sockaddr_in6 *usin = (struct sockaddr_in6 *)uaddr; struct in6_addr *daddr; int addr_type; int rc; @@ -486,7 +488,7 @@ static int l2tp_ip6_push_pending_frames(struct sock *sk) int err = 0; skb = skb_peek(&sk->sk_write_queue); - if (skb == NULL) + if (!skb) goto out; transhdr = (__be32 *)skb_transport_header(skb); @@ -519,7 +521,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int err; /* Rough check on arithmetic overflow, - better check is made in ip6_append_data(). + * better check is made in ip6_append_data(). */ if (len > INT_MAX) return -EMSGSIZE; @@ -528,9 +530,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; - /* - * Get and verify the address. - */ + /* Get and verify the address */ memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_mark = sk->sk_mark; @@ -548,15 +548,14 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) daddr = &lsa->l2tp_addr; if (np->sndflow) { fl6.flowlabel = lsa->l2tp_flowinfo & IPV6_FLOWINFO_MASK; - if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { + if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; } } - /* - * Otherwise it will be difficult to maintain + /* Otherwise it will be difficult to maintain * sk->sk_dst_cache. */ if (sk->sk_state == TCP_ESTABLISHED && @@ -594,7 +593,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (IS_ERR(flowlabel)) return -EINVAL; } - if (!(opt->opt_nflen|opt->opt_flen)) + if (!(opt->opt_nflen | opt->opt_flen)) opt = NULL; } @@ -745,10 +744,6 @@ static struct proto l2tp_ip6_prot = { .hash = l2tp_ip6_hash, .unhash = l2tp_ip6_unhash, .obj_size = sizeof(struct l2tp_ip6_sock), -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_ipv6_setsockopt, - .compat_getsockopt = compat_ipv6_getsockopt, -#endif }; static const struct proto_ops l2tp_ip6_ops = { @@ -773,8 +768,6 @@ static const struct proto_ops l2tp_ip6_ops = { .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, #endif }; diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index ebb381c3f1b9..def78eebca4c 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* - * L2TP netlink layer, for management +/* L2TP netlink layer, for management * * Copyright (c) 2008,2009,2010 Katalix Systems Ltd * @@ -27,7 +26,6 @@ #include "l2tp_core.h" - static struct genl_family l2tp_nl_family; static const struct genl_multicast_group l2tp_multicast_group[] = { @@ -157,81 +155,82 @@ static int l2tp_session_notify(struct genl_family *family, return ret; } +static int l2tp_nl_cmd_tunnel_create_get_addr(struct nlattr **attrs, struct l2tp_tunnel_cfg *cfg) +{ + if (attrs[L2TP_ATTR_UDP_SPORT]) + cfg->local_udp_port = nla_get_u16(attrs[L2TP_ATTR_UDP_SPORT]); + if (attrs[L2TP_ATTR_UDP_DPORT]) + cfg->peer_udp_port = nla_get_u16(attrs[L2TP_ATTR_UDP_DPORT]); + cfg->use_udp_checksums = nla_get_flag(attrs[L2TP_ATTR_UDP_CSUM]); + + /* Must have either AF_INET or AF_INET6 address for source and destination */ +#if IS_ENABLED(CONFIG_IPV6) + if (attrs[L2TP_ATTR_IP6_SADDR] && attrs[L2TP_ATTR_IP6_DADDR]) { + cfg->local_ip6 = nla_data(attrs[L2TP_ATTR_IP6_SADDR]); + cfg->peer_ip6 = nla_data(attrs[L2TP_ATTR_IP6_DADDR]); + cfg->udp6_zero_tx_checksums = nla_get_flag(attrs[L2TP_ATTR_UDP_ZERO_CSUM6_TX]); + cfg->udp6_zero_rx_checksums = nla_get_flag(attrs[L2TP_ATTR_UDP_ZERO_CSUM6_RX]); + return 0; + } +#endif + if (attrs[L2TP_ATTR_IP_SADDR] && attrs[L2TP_ATTR_IP_DADDR]) { + cfg->local_ip.s_addr = nla_get_in_addr(attrs[L2TP_ATTR_IP_SADDR]); + cfg->peer_ip.s_addr = nla_get_in_addr(attrs[L2TP_ATTR_IP_DADDR]); + return 0; + } + return -EINVAL; +} + static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info) { u32 tunnel_id; u32 peer_tunnel_id; int proto_version; - int fd; + int fd = -1; int ret = 0; struct l2tp_tunnel_cfg cfg = { 0, }; struct l2tp_tunnel *tunnel; struct net *net = genl_info_net(info); + struct nlattr **attrs = info->attrs; - if (!info->attrs[L2TP_ATTR_CONN_ID]) { + if (!attrs[L2TP_ATTR_CONN_ID]) { ret = -EINVAL; goto out; } - tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); + tunnel_id = nla_get_u32(attrs[L2TP_ATTR_CONN_ID]); - if (!info->attrs[L2TP_ATTR_PEER_CONN_ID]) { + if (!attrs[L2TP_ATTR_PEER_CONN_ID]) { ret = -EINVAL; goto out; } - peer_tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_PEER_CONN_ID]); + peer_tunnel_id = nla_get_u32(attrs[L2TP_ATTR_PEER_CONN_ID]); - if (!info->attrs[L2TP_ATTR_PROTO_VERSION]) { + if (!attrs[L2TP_ATTR_PROTO_VERSION]) { ret = -EINVAL; goto out; } - proto_version = nla_get_u8(info->attrs[L2TP_ATTR_PROTO_VERSION]); + proto_version = nla_get_u8(attrs[L2TP_ATTR_PROTO_VERSION]); - if (!info->attrs[L2TP_ATTR_ENCAP_TYPE]) { + if (!attrs[L2TP_ATTR_ENCAP_TYPE]) { ret = -EINVAL; goto out; } - cfg.encap = nla_get_u16(info->attrs[L2TP_ATTR_ENCAP_TYPE]); - - fd = -1; - if (info->attrs[L2TP_ATTR_FD]) { - fd = nla_get_u32(info->attrs[L2TP_ATTR_FD]); + cfg.encap = nla_get_u16(attrs[L2TP_ATTR_ENCAP_TYPE]); + + /* Managed tunnels take the tunnel socket from userspace. + * Unmanaged tunnels must call out the source and destination addresses + * for the kernel to create the tunnel socket itself. + */ + if (attrs[L2TP_ATTR_FD]) { + fd = nla_get_u32(attrs[L2TP_ATTR_FD]); } else { -#if IS_ENABLED(CONFIG_IPV6) - if (info->attrs[L2TP_ATTR_IP6_SADDR] && - info->attrs[L2TP_ATTR_IP6_DADDR]) { - cfg.local_ip6 = nla_data( - info->attrs[L2TP_ATTR_IP6_SADDR]); - cfg.peer_ip6 = nla_data( - info->attrs[L2TP_ATTR_IP6_DADDR]); - } else -#endif - if (info->attrs[L2TP_ATTR_IP_SADDR] && - info->attrs[L2TP_ATTR_IP_DADDR]) { - cfg.local_ip.s_addr = nla_get_in_addr( - info->attrs[L2TP_ATTR_IP_SADDR]); - cfg.peer_ip.s_addr = nla_get_in_addr( - info->attrs[L2TP_ATTR_IP_DADDR]); - } else { - ret = -EINVAL; + ret = l2tp_nl_cmd_tunnel_create_get_addr(attrs, &cfg); + if (ret < 0) goto out; - } - if (info->attrs[L2TP_ATTR_UDP_SPORT]) - cfg.local_udp_port = nla_get_u16(info->attrs[L2TP_ATTR_UDP_SPORT]); - if (info->attrs[L2TP_ATTR_UDP_DPORT]) - cfg.peer_udp_port = nla_get_u16(info->attrs[L2TP_ATTR_UDP_DPORT]); - cfg.use_udp_checksums = nla_get_flag( - info->attrs[L2TP_ATTR_UDP_CSUM]); - -#if IS_ENABLED(CONFIG_IPV6) - cfg.udp6_zero_tx_checksums = nla_get_flag( - info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_TX]); - cfg.udp6_zero_rx_checksums = nla_get_flag( - info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_RX]); -#endif } - if (info->attrs[L2TP_ATTR_DEBUG]) - cfg.debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]); + if (attrs[L2TP_ATTR_DEBUG]) + cfg.debug = nla_get_u32(attrs[L2TP_ATTR_DEBUG]); ret = -EINVAL; switch (cfg.encap) { @@ -320,16 +319,79 @@ out: return ret; } +#if IS_ENABLED(CONFIG_IPV6) +static int l2tp_nl_tunnel_send_addr6(struct sk_buff *skb, struct sock *sk, + enum l2tp_encap_type encap) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + + switch (encap) { + case L2TP_ENCAPTYPE_UDP: + if (udp_get_no_check6_tx(sk) && + nla_put_flag(skb, L2TP_ATTR_UDP_ZERO_CSUM6_TX)) + return -1; + if (udp_get_no_check6_rx(sk) && + nla_put_flag(skb, L2TP_ATTR_UDP_ZERO_CSUM6_RX)) + return -1; + if (nla_put_u16(skb, L2TP_ATTR_UDP_SPORT, ntohs(inet->inet_sport)) || + nla_put_u16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport))) + return -1; + fallthrough; + case L2TP_ENCAPTYPE_IP: + if (nla_put_in6_addr(skb, L2TP_ATTR_IP6_SADDR, &np->saddr) || + nla_put_in6_addr(skb, L2TP_ATTR_IP6_DADDR, &sk->sk_v6_daddr)) + return -1; + break; + } + return 0; +} +#endif + +static int l2tp_nl_tunnel_send_addr4(struct sk_buff *skb, struct sock *sk, + enum l2tp_encap_type encap) +{ + struct inet_sock *inet = inet_sk(sk); + + switch (encap) { + case L2TP_ENCAPTYPE_UDP: + if (nla_put_u8(skb, L2TP_ATTR_UDP_CSUM, !sk->sk_no_check_tx) || + nla_put_u16(skb, L2TP_ATTR_UDP_SPORT, ntohs(inet->inet_sport)) || + nla_put_u16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport))) + return -1; + fallthrough; + case L2TP_ENCAPTYPE_IP: + if (nla_put_in_addr(skb, L2TP_ATTR_IP_SADDR, inet->inet_saddr) || + nla_put_in_addr(skb, L2TP_ATTR_IP_DADDR, inet->inet_daddr)) + return -1; + break; + } + + return 0; +} + +/* Append attributes for the tunnel address, handling the different attribute types + * used for different tunnel encapsulation and AF_INET v.s. AF_INET6. + */ +static int l2tp_nl_tunnel_send_addr(struct sk_buff *skb, struct l2tp_tunnel *tunnel) +{ + struct sock *sk = tunnel->sock; + + if (!sk) + return 0; + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + return l2tp_nl_tunnel_send_addr6(skb, sk, tunnel->encap); +#endif + return l2tp_nl_tunnel_send_addr4(skb, sk, tunnel->encap); +} + static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int flags, struct l2tp_tunnel *tunnel, u8 cmd) { void *hdr; struct nlattr *nest; - struct sock *sk = NULL; - struct inet_sock *inet; -#if IS_ENABLED(CONFIG_IPV6) - struct ipv6_pinfo *np = NULL; -#endif hdr = genlmsg_put(skb, portid, seq, &l2tp_nl_family, flags, cmd); if (!hdr) @@ -343,7 +405,7 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla goto nla_put_failure; nest = nla_nest_start_noflag(skb, L2TP_ATTR_STATS); - if (nest == NULL) + if (!nest) goto nla_put_failure; if (nla_put_u64_64bit(skb, L2TP_ATTR_TX_PACKETS, @@ -373,58 +435,9 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla goto nla_put_failure; nla_nest_end(skb, nest); - sk = tunnel->sock; - if (!sk) - goto out; - -#if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) - np = inet6_sk(sk); -#endif - - inet = inet_sk(sk); - - switch (tunnel->encap) { - case L2TP_ENCAPTYPE_UDP: - switch (sk->sk_family) { - case AF_INET: - if (nla_put_u8(skb, L2TP_ATTR_UDP_CSUM, !sk->sk_no_check_tx)) - goto nla_put_failure; - break; -#if IS_ENABLED(CONFIG_IPV6) - case AF_INET6: - if (udp_get_no_check6_tx(sk) && - nla_put_flag(skb, L2TP_ATTR_UDP_ZERO_CSUM6_TX)) - goto nla_put_failure; - if (udp_get_no_check6_rx(sk) && - nla_put_flag(skb, L2TP_ATTR_UDP_ZERO_CSUM6_RX)) - goto nla_put_failure; - break; -#endif - } - if (nla_put_u16(skb, L2TP_ATTR_UDP_SPORT, ntohs(inet->inet_sport)) || - nla_put_u16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport))) - goto nla_put_failure; - /* fall through */ - case L2TP_ENCAPTYPE_IP: -#if IS_ENABLED(CONFIG_IPV6) - if (np) { - if (nla_put_in6_addr(skb, L2TP_ATTR_IP6_SADDR, - &np->saddr) || - nla_put_in6_addr(skb, L2TP_ATTR_IP6_DADDR, - &sk->sk_v6_daddr)) - goto nla_put_failure; - } else -#endif - if (nla_put_in_addr(skb, L2TP_ATTR_IP_SADDR, - inet->inet_saddr) || - nla_put_in_addr(skb, L2TP_ATTR_IP_DADDR, - inet->inet_daddr)) - goto nla_put_failure; - break; - } + if (l2tp_nl_tunnel_send_addr(skb, tunnel)) + goto nla_put_failure; -out: genlmsg_end(skb, hdr); return 0; @@ -485,7 +498,7 @@ static int l2tp_nl_cmd_tunnel_dump(struct sk_buff *skb, struct netlink_callback for (;;) { tunnel = l2tp_tunnel_get_nth(net, ti); - if (tunnel == NULL) + if (!tunnel) goto out; if (l2tp_nl_tunnel_send(skb, NETLINK_CB(cb->skb).portid, @@ -570,6 +583,7 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf if (info->attrs[L2TP_ATTR_COOKIE]) { u16 len = nla_len(info->attrs[L2TP_ATTR_COOKIE]); + if (len > 8) { ret = -EINVAL; goto out_tunnel; @@ -579,6 +593,7 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf } if (info->attrs[L2TP_ATTR_PEER_COOKIE]) { u16 len = nla_len(info->attrs[L2TP_ATTR_PEER_COOKIE]); + if (len > 8) { ret = -EINVAL; goto out_tunnel; @@ -606,14 +621,13 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf cfg.reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]); #ifdef CONFIG_MODULES - if (l2tp_nl_cmd_ops[cfg.pw_type] == NULL) { + if (!l2tp_nl_cmd_ops[cfg.pw_type]) { genl_unlock(); request_module("net-l2tp-type-%u", cfg.pw_type); genl_lock(); } #endif - if ((l2tp_nl_cmd_ops[cfg.pw_type] == NULL) || - (l2tp_nl_cmd_ops[cfg.pw_type]->session_create == NULL)) { + if (!l2tp_nl_cmd_ops[cfg.pw_type] || !l2tp_nl_cmd_ops[cfg.pw_type]->session_create) { ret = -EPROTONOSUPPORT; goto out_tunnel; } @@ -645,7 +659,7 @@ static int l2tp_nl_cmd_session_delete(struct sk_buff *skb, struct genl_info *inf u16 pw_type; session = l2tp_nl_session_get(info); - if (session == NULL) { + if (!session) { ret = -ENODEV; goto out; } @@ -656,7 +670,7 @@ static int l2tp_nl_cmd_session_delete(struct sk_buff *skb, struct genl_info *inf pw_type = session->pwtype; if (pw_type < __L2TP_PWTYPE_MAX) if (l2tp_nl_cmd_ops[pw_type] && l2tp_nl_cmd_ops[pw_type]->session_delete) - ret = (*l2tp_nl_cmd_ops[pw_type]->session_delete)(session); + l2tp_nl_cmd_ops[pw_type]->session_delete(session); l2tp_session_dec_refcount(session); @@ -670,7 +684,7 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf struct l2tp_session *session; session = l2tp_nl_session_get(info); - if (session == NULL) { + if (!session) { ret = -ENODEV; goto out; } @@ -715,8 +729,7 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl if (nla_put_u32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id) || nla_put_u32(skb, L2TP_ATTR_SESSION_ID, session->session_id) || nla_put_u32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id) || - nla_put_u32(skb, L2TP_ATTR_PEER_SESSION_ID, - session->peer_session_id) || + nla_put_u32(skb, L2TP_ATTR_PEER_SESSION_ID, session->peer_session_id) || nla_put_u32(skb, L2TP_ATTR_DEBUG, session->debug) || nla_put_u16(skb, L2TP_ATTR_PW_TYPE, session->pwtype)) goto nla_put_failure; @@ -724,11 +737,9 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl if ((session->ifname[0] && nla_put_string(skb, L2TP_ATTR_IFNAME, session->ifname)) || (session->cookie_len && - nla_put(skb, L2TP_ATTR_COOKIE, session->cookie_len, - &session->cookie[0])) || + nla_put(skb, L2TP_ATTR_COOKIE, session->cookie_len, session->cookie)) || (session->peer_cookie_len && - nla_put(skb, L2TP_ATTR_PEER_COOKIE, session->peer_cookie_len, - &session->peer_cookie[0])) || + nla_put(skb, L2TP_ATTR_PEER_COOKIE, session->peer_cookie_len, session->peer_cookie)) || nla_put_u8(skb, L2TP_ATTR_RECV_SEQ, session->recv_seq) || nla_put_u8(skb, L2TP_ATTR_SEND_SEQ, session->send_seq) || nla_put_u8(skb, L2TP_ATTR_LNS_MODE, session->lns_mode) || @@ -740,7 +751,7 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl goto nla_put_failure; nest = nla_nest_start_noflag(skb, L2TP_ATTR_STATS); - if (nest == NULL) + if (!nest) goto nla_put_failure; if (nla_put_u64_64bit(skb, L2TP_ATTR_TX_PACKETS, @@ -785,7 +796,7 @@ static int l2tp_nl_cmd_session_get(struct sk_buff *skb, struct genl_info *info) int ret; session = l2tp_nl_session_get(info); - if (session == NULL) { + if (!session) { ret = -ENODEV; goto err; } @@ -824,14 +835,14 @@ static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback int si = cb->args[1]; for (;;) { - if (tunnel == NULL) { + if (!tunnel) { tunnel = l2tp_tunnel_get_nth(net, ti); - if (tunnel == NULL) + if (!tunnel) goto out; } session = l2tp_session_get_nth(tunnel, si); - if (session == NULL) { + if (!session) { ti++; l2tp_tunnel_dec_refcount(tunnel); tunnel = NULL; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index c54cb59593ef..13c3153b40d6 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -117,8 +117,7 @@ struct pppol2tp_session { int owner; /* pid that opened the socket */ struct mutex sk_lock; /* Protects .sk */ - struct sock __rcu *sk; /* Pointer to the session - * PPPoX socket */ + struct sock __rcu *sk; /* Pointer to the session PPPoX socket */ struct sock *__sk; /* Copy of .sk, for cleanup */ struct rcu_head rcu; /* For asynchronous release */ }; @@ -155,17 +154,20 @@ static inline struct l2tp_session *pppol2tp_sock_to_session(struct sock *sk) { struct l2tp_session *session; - if (sk == NULL) + if (!sk) return NULL; sock_hold(sk); session = (struct l2tp_session *)(sk->sk_user_data); - if (session == NULL) { + if (!session) { + sock_put(sk); + goto out; + } + if (WARN_ON(session->magic != L2TP_SESSION_MAGIC)) { + session = NULL; sock_put(sk); goto out; } - - BUG_ON(session->magic != L2TP_SESSION_MAGIC); out: return session; @@ -218,7 +220,7 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int */ rcu_read_lock(); sk = rcu_dereference(ps->sk); - if (sk == NULL) + if (!sk) goto no_sock; /* If the first two bytes are 0xFF03, consider that it is the PPP's @@ -286,7 +288,7 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, /* Get session and tunnel contexts */ error = -EBADF; session = pppol2tp_sock_to_session(sk); - if (session == NULL) + if (!session) goto error; tunnel = session->tunnel; @@ -351,7 +353,7 @@ error: */ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) { - struct sock *sk = (struct sock *) chan->private; + struct sock *sk = (struct sock *)chan->private; struct l2tp_session *session; struct l2tp_tunnel *tunnel; int uhlen, headroom; @@ -361,7 +363,7 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) /* Get session and tunnel contexts from the socket */ session = pppol2tp_sock_to_session(sk); - if (session == NULL) + if (!session) goto abort; tunnel = session->tunnel; @@ -420,7 +422,8 @@ static void pppol2tp_session_destruct(struct sock *sk) if (session) { sk->sk_user_data = NULL; - BUG_ON(session->magic != L2TP_SESSION_MAGIC); + if (WARN_ON(session->magic != L2TP_SESSION_MAGIC)) + return; l2tp_session_dec_refcount(session); } } @@ -704,7 +707,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, * tunnel id. */ if (!info.session_id && !info.peer_session_id) { - if (tunnel == NULL) { + if (!tunnel) { struct l2tp_tunnel_cfg tcfg = { .encap = L2TP_ENCAPTYPE_UDP, .debug = 0, @@ -739,11 +742,11 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, } else { /* Error if we can't find the tunnel */ error = -ENOENT; - if (tunnel == NULL) + if (!tunnel) goto end; /* Error if socket is not prepped */ - if (tunnel->sock == NULL) + if (!tunnel->sock) goto end; } @@ -803,8 +806,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, * the internal context for use by ioctl() and sockopt() * handlers. */ - if ((session->session_id == 0) && - (session->peer_session_id == 0)) { + if (session->session_id == 0 && session->peer_session_id == 0) { error = 0; goto out_no_ppp; } @@ -912,22 +914,23 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, struct pppol2tp_session *pls; error = -ENOTCONN; - if (sk == NULL) + if (!sk) goto end; if (!(sk->sk_state & PPPOX_CONNECTED)) goto end; error = -EBADF; session = pppol2tp_sock_to_session(sk); - if (session == NULL) + if (!session) goto end; pls = l2tp_session_priv(session); tunnel = session->tunnel; inet = inet_sk(tunnel->sock); - if ((tunnel->version == 2) && (tunnel->sock->sk_family == AF_INET)) { + if (tunnel->version == 2 && tunnel->sock->sk_family == AF_INET) { struct sockaddr_pppol2tp sp; + len = sizeof(sp); memset(&sp, 0, len); sp.sa_family = AF_PPPOX; @@ -943,8 +946,7 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, sp.pppol2tp.addr.sin_addr.s_addr = inet->inet_daddr; memcpy(uaddr, &sp, len); #if IS_ENABLED(CONFIG_IPV6) - } else if ((tunnel->version == 2) && - (tunnel->sock->sk_family == AF_INET6)) { + } else if (tunnel->version == 2 && tunnel->sock->sk_family == AF_INET6) { struct sockaddr_pppol2tpin6 sp; len = sizeof(sp); @@ -962,8 +964,7 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, memcpy(&sp.pppol2tp.addr.sin6_addr, &tunnel->sock->sk_v6_daddr, sizeof(tunnel->sock->sk_v6_daddr)); memcpy(uaddr, &sp, len); - } else if ((tunnel->version == 3) && - (tunnel->sock->sk_family == AF_INET6)) { + } else if (tunnel->version == 3 && tunnel->sock->sk_family == AF_INET6) { struct sockaddr_pppol2tpv3in6 sp; len = sizeof(sp); @@ -984,6 +985,7 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, #endif } else if (tunnel->version == 3) { struct sockaddr_pppol2tpv3 sp; + len = sizeof(sp); memset(&sp, 0, len); sp.sa_family = AF_PPPOX; @@ -1178,7 +1180,7 @@ static int pppol2tp_session_setsockopt(struct sock *sk, switch (optname) { case PPPOL2TP_SO_RECVSEQ: - if ((val != 0) && (val != 1)) { + if (val != 0 && val != 1) { err = -EINVAL; break; } @@ -1189,7 +1191,7 @@ static int pppol2tp_session_setsockopt(struct sock *sk, break; case PPPOL2TP_SO_SENDSEQ: - if ((val != 0) && (val != 1)) { + if (val != 0 && val != 1) { err = -EINVAL; break; } @@ -1207,7 +1209,7 @@ static int pppol2tp_session_setsockopt(struct sock *sk, break; case PPPOL2TP_SO_LNSMODE: - if ((val != 0) && (val != 1)) { + if (val != 0 && val != 1) { err = -EINVAL; break; } @@ -1244,7 +1246,7 @@ static int pppol2tp_session_setsockopt(struct sock *sk, * session or the special tunnel type. */ static int pppol2tp_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct l2tp_session *session; @@ -1258,23 +1260,22 @@ static int pppol2tp_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; err = -ENOTCONN; - if (sk->sk_user_data == NULL) + if (!sk->sk_user_data) goto end; /* Get session context from the socket */ err = -EBADF; session = pppol2tp_sock_to_session(sk); - if (session == NULL) + if (!session) goto end; /* Special case: if session_id == 0x0000, treat as operation on tunnel */ - if ((session->session_id == 0) && - (session->peer_session_id == 0)) { + if (session->session_id == 0 && session->peer_session_id == 0) { tunnel = session->tunnel; err = pppol2tp_tunnel_setsockopt(sk, tunnel, optname, val); } else { @@ -1343,7 +1344,7 @@ static int pppol2tp_session_getsockopt(struct sock *sk, break; case PPPOL2TP_SO_REORDERTO: - *val = (int) jiffies_to_msecs(session->reorder_timeout); + *val = (int)jiffies_to_msecs(session->reorder_timeout); l2tp_info(session, L2TP_MSG_CONTROL, "%s: get reorder_timeout=%d\n", session->name, *val); break; @@ -1381,18 +1382,17 @@ static int pppol2tp_getsockopt(struct socket *sock, int level, int optname, return -EINVAL; err = -ENOTCONN; - if (sk->sk_user_data == NULL) + if (!sk->sk_user_data) goto end; /* Get the session context */ err = -EBADF; session = pppol2tp_sock_to_session(sk); - if (session == NULL) + if (!session) goto end; /* Special case: if session_id == 0x0000, treat as operation on tunnel */ - if ((session->session_id == 0) && - (session->peer_session_id == 0)) { + if (session->session_id == 0 && session->peer_session_id == 0) { tunnel = session->tunnel; err = pppol2tp_tunnel_getsockopt(sk, tunnel, optname, &val); if (err) @@ -1407,7 +1407,7 @@ static int pppol2tp_getsockopt(struct socket *sock, int level, int optname, if (put_user(len, optlen)) goto end_put_sess; - if (copy_to_user((void __user *) optval, &val, len)) + if (copy_to_user((void __user *)optval, &val, len)) goto end_put_sess; err = 0; @@ -1463,7 +1463,7 @@ static void pppol2tp_next_session(struct net *net, struct pppol2tp_seq_data *pd) pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx); pd->session_idx++; - if (pd->session == NULL) { + if (!pd->session) { pd->session_idx = 0; pppol2tp_next_tunnel(net, pd); } @@ -1478,17 +1478,21 @@ static void *pppol2tp_seq_start(struct seq_file *m, loff_t *offs) if (!pos) goto out; - BUG_ON(m->private == NULL); + if (WARN_ON(!m->private)) { + pd = NULL; + goto out; + } + pd = m->private; net = seq_file_net(m); - if (pd->tunnel == NULL) + if (!pd->tunnel) pppol2tp_next_tunnel(net, pd); else pppol2tp_next_session(net, pd); /* NULL tunnel and session indicates end of list */ - if ((pd->tunnel == NULL) && (pd->session == NULL)) + if (!pd->tunnel && !pd->session) pd = NULL; out: @@ -1551,6 +1555,7 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) if (tunnel->sock) { struct inet_sock *inet = inet_sk(tunnel->sock); + ip = ntohl(inet->inet_saddr); port = ntohs(inet->inet_sport); } @@ -1564,8 +1569,7 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) user_data_ok = 'N'; } - seq_printf(m, " SESSION '%s' %08X/%d %04X/%04X -> " - "%04X/%04X %d %c\n", + seq_printf(m, " SESSION '%s' %08X/%d %04X/%04X -> %04X/%04X %d %c\n", session->name, ip, port, tunnel->tunnel_id, session->session_id, @@ -1604,8 +1608,7 @@ static int pppol2tp_seq_show(struct seq_file *m, void *v) seq_puts(m, "PPPoL2TP driver info, " PPPOL2TP_DRV_VERSION "\n"); seq_puts(m, "TUNNEL name, user-data-ok session-count\n"); seq_puts(m, " debug tx-pkts/bytes/errs rx-pkts/bytes/errs\n"); - seq_puts(m, " SESSION name, addr/port src-tid/sid " - "dest-tid/sid state user-data-ok\n"); + seq_puts(m, " SESSION name, addr/port src-tid/sid dest-tid/sid state user-data-ok\n"); seq_puts(m, " mtu/mru/rcvseq/sendseq/lns debug reorderto\n"); seq_puts(m, " nr/ns tx-pkts/bytes/errs rx-pkts/bytes/errs\n"); goto out; @@ -1638,7 +1641,7 @@ static __net_init int pppol2tp_init_net(struct net *net) int err = 0; pde = proc_create_net("pppol2tp", 0444, net->proc_net, - &pppol2tp_seq_ops, sizeof(struct pppol2tp_seq_data)); + &pppol2tp_seq_ops, sizeof(struct pppol2tp_seq_data)); if (!pde) { err = -ENOMEM; goto out; diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index f35899d45a9a..e71ca5aec684 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -9,6 +9,99 @@ #include <net/fib_rules.h> #include <net/l3mdev.h> +static DEFINE_SPINLOCK(l3mdev_lock); + +struct l3mdev_handler { + lookup_by_table_id_t dev_lookup; +}; + +static struct l3mdev_handler l3mdev_handlers[L3MDEV_TYPE_MAX + 1]; + +static int l3mdev_check_type(enum l3mdev_type l3type) +{ + if (l3type <= L3MDEV_TYPE_UNSPEC || l3type > L3MDEV_TYPE_MAX) + return -EINVAL; + + return 0; +} + +int l3mdev_table_lookup_register(enum l3mdev_type l3type, + lookup_by_table_id_t fn) +{ + struct l3mdev_handler *hdlr; + int res; + + res = l3mdev_check_type(l3type); + if (res) + return res; + + hdlr = &l3mdev_handlers[l3type]; + + spin_lock(&l3mdev_lock); + + if (hdlr->dev_lookup) { + res = -EBUSY; + goto unlock; + } + + hdlr->dev_lookup = fn; + res = 0; + +unlock: + spin_unlock(&l3mdev_lock); + + return res; +} +EXPORT_SYMBOL_GPL(l3mdev_table_lookup_register); + +void l3mdev_table_lookup_unregister(enum l3mdev_type l3type, + lookup_by_table_id_t fn) +{ + struct l3mdev_handler *hdlr; + + if (l3mdev_check_type(l3type)) + return; + + hdlr = &l3mdev_handlers[l3type]; + + spin_lock(&l3mdev_lock); + + if (hdlr->dev_lookup == fn) + hdlr->dev_lookup = NULL; + + spin_unlock(&l3mdev_lock); +} +EXPORT_SYMBOL_GPL(l3mdev_table_lookup_unregister); + +int l3mdev_ifindex_lookup_by_table_id(enum l3mdev_type l3type, + struct net *net, u32 table_id) +{ + lookup_by_table_id_t lookup; + struct l3mdev_handler *hdlr; + int ifindex = -EINVAL; + int res; + + res = l3mdev_check_type(l3type); + if (res) + return res; + + hdlr = &l3mdev_handlers[l3type]; + + spin_lock(&l3mdev_lock); + + lookup = hdlr->dev_lookup; + if (!lookup) + goto unlock; + + ifindex = lookup(net, table_id); + +unlock: + spin_unlock(&l3mdev_lock); + + return ifindex; +} +EXPORT_SYMBOL_GPL(l3mdev_ifindex_lookup_by_table_id); + /** * l3mdev_master_ifindex - get index of L3 master device * @dev: targeted interface diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 6e53e43c1907..7180979114e4 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -984,7 +984,6 @@ out: * llc_ui_getname - return the address info of a socket * @sock: Socket to get address of. * @uaddr: Address structure to return information. - * @uaddrlen: Length of address structure. * @peer: Does user want local or remote address information. * * Return the address information of a socket. @@ -1054,7 +1053,7 @@ static int llc_ui_ioctl(struct socket *sock, unsigned int cmd, * Set various connection specific parameters. */ static int llc_ui_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); @@ -1064,7 +1063,7 @@ static int llc_ui_setsockopt(struct socket *sock, int level, int optname, lock_sock(sk); if (unlikely(level != SOL_LLC || optlen != sizeof(int))) goto out; - rc = get_user(opt, (int __user *)optval); + rc = copy_from_sockptr(&opt, optval, sizeof(opt)); if (rc) goto out; rc = -EINVAL; diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 7b620acaca9e..1144cda2a0fc 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -284,8 +284,8 @@ out:; /** * llc_conn_remove_acked_pdus - Removes acknowledged pdus from tx queue * @sk: active connection - * nr: NR - * how_many_unacked: size of pdu_unack_q after removing acked pdus + * @nr: NR + * @how_many_unacked: size of pdu_unack_q after removing acked pdus * * Removes acknowledged pdus from transmit queue (pdu_unack_q). Returns * the number of pdus that removed from queue. @@ -906,6 +906,7 @@ static void llc_sk_init(struct sock *sk) /** * llc_sk_alloc - Allocates LLC sock + * @net: network namespace * @family: upper layer protocol family * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * @@ -951,7 +952,7 @@ void llc_sk_stop_all_timers(struct sock *sk, bool sync) /** * llc_sk_free - Frees a LLC socket - * @sk - socket to free + * @sk: - socket to free * * Frees a LLC socket */ diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index 82cb93f66b9b..c309b72a5877 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c @@ -144,6 +144,7 @@ static inline int llc_fixup_skb(struct sk_buff *skb) * @skb: received pdu * @dev: device that receive pdu * @pt: packet type + * @orig_dev: the original receive net device * * When the system receives a 802.2 frame this function is called. It * checks SAP and connection of received pdu and passes frame to diff --git a/net/llc/llc_pdu.c b/net/llc/llc_pdu.c index 2e6cb79196bb..792d195c8bae 100644 --- a/net/llc/llc_pdu.c +++ b/net/llc/llc_pdu.c @@ -25,7 +25,7 @@ void llc_pdu_set_cmd_rsp(struct sk_buff *skb, u8 pdu_type) /** * pdu_set_pf_bit - sets poll/final bit in LLC header - * @pdu_frame: input frame that p/f bit must be set into it. + * @skb: Frame to set bit in * @bit_value: poll/final bit (0 or 1). * * This function sets poll/final bit in LLC header (based on type of PDU). diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index be419062e19a..6805ce43a055 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -37,6 +37,7 @@ static int llc_mac_header_len(unsigned short devtype) /** * llc_alloc_frame - allocates sk_buff for frame + * @sk: socket to allocate frame to * @dev: network device this skb will be sent over * @type: pdu type to allocate * @data_size: data size to allocate @@ -273,6 +274,7 @@ void llc_build_and_send_xid_pkt(struct llc_sap *sap, struct sk_buff *skb, * llc_sap_rcv - sends received pdus to the sap state machine * @sap: current sap component structure. * @skb: received frame. + * @sk: socket to associate to frame * * Sends received pdus to the sap state machine. */ @@ -379,6 +381,7 @@ static void llc_do_mcast(struct llc_sap *sap, struct sk_buff *skb, * llc_sap_mcast - Deliver multicast PDU's to all matching datagram sockets. * @sap: SAP * @laddr: address of local LLC (MAC + SAP) + * @skb: PDU to deliver * * Search socket list of the SAP and finds connections with same sap. * Deliver clone to each. diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 7f245e9f114c..313ba97acae3 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -477,7 +477,7 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, size_t len) { u16 capab, tid, timeout, ba_policy, buf_size, start_seq_num; - struct ieee802_11_elems elems = { 0 }; + struct ieee802_11_elems elems = { }; u8 dialog_token; int ies_len; diff --git a/net/mac80211/airtime.c b/net/mac80211/airtime.c index 9fc2968856c0..366f76c9003d 100644 --- a/net/mac80211/airtime.c +++ b/net/mac80211/airtime.c @@ -551,7 +551,7 @@ EXPORT_SYMBOL_GPL(ieee80211_calc_tx_airtime); u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *pubsta, - int len) + int len, bool ampdu) { struct ieee80211_supported_band *sband; struct ieee80211_chanctx_conf *conf; @@ -572,10 +572,26 @@ u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw, if (pubsta) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); - - return ieee80211_calc_tx_airtime_rate(hw, - &sta->tx_stats.last_rate, - band, len); + struct ieee80211_tx_rate *rate = &sta->tx_stats.last_rate; + u32 airtime; + + if (!(rate->flags & (IEEE80211_TX_RC_VHT_MCS | + IEEE80211_TX_RC_MCS))) + ampdu = false; + + /* + * Assume that HT/VHT transmission on any AC except VO will + * use aggregation. Since we don't have reliable reporting + * of aggregation length, assume an average of 16. + * This will not be very accurate, but much better than simply + * assuming un-aggregated tx. + */ + airtime = ieee80211_calc_tx_airtime_rate(hw, rate, band, + ampdu ? len * 16 : len); + if (ampdu) + airtime /= 16; + + return airtime; } if (!conf) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 1079a07e43e4..87fddd84c621 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -608,12 +608,12 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, case WLAN_CIPHER_SUITE_BIP_CMAC_256: BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != offsetof(typeof(kseq), aes_cmac)); - /* fall through */ + fallthrough; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != offsetof(typeof(kseq), aes_gmac)); - /* fall through */ + fallthrough; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != @@ -991,9 +991,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, BSS_CHANGED_SSID | BSS_CHANGED_P2P_PS | BSS_CHANGED_TXPOWER | - BSS_CHANGED_TWT | - BSS_CHANGED_HE_OBSS_PD | - BSS_CHANGED_HE_BSS_COLOR; + BSS_CHANGED_TWT; int i, err; int prev_beacon_int; @@ -1019,6 +1017,10 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, sdata->vif.bss_conf.frame_time_rts_th = le32_get_bits(params->he_oper->he_oper_params, IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK); + changed |= BSS_CHANGED_HE_OBSS_PD; + + if (params->he_bss_color.enabled) + changed |= BSS_CHANGED_HE_BSS_COLOR; } mutex_lock(&local->mtx); @@ -2126,6 +2128,11 @@ static int ieee80211_update_mesh_config(struct wiphy *wiphy, if (_chg_mesh_attr(NL80211_MESHCONF_CONNECTED_TO_GATE, mask)) conf->dot11MeshConnectedToMeshGate = nconf->dot11MeshConnectedToMeshGate; + if (_chg_mesh_attr(NL80211_MESHCONF_NOLEARN, mask)) + conf->dot11MeshNolearn = nconf->dot11MeshNolearn; + if (_chg_mesh_attr(NL80211_MESHCONF_CONNECTED_TO_AS, mask)) + conf->dot11MeshConnectedToAuthServer = + nconf->dot11MeshConnectedToAuthServer; ieee80211_mbss_info_change_notify(sdata, BSS_CHANGED_BEACON); return 0; } @@ -2337,7 +2344,7 @@ static int ieee80211_scan(struct wiphy *wiphy, * for now fall through to allow scanning only when * beaconing hasn't been configured yet */ - /* fall through */ + fallthrough; case NL80211_IFTYPE_AP: /* * If the scan has been forced (and the driver supports @@ -3584,7 +3591,7 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, } local_bh_disable(); - ieee80211_xmit(sdata, sta, skb, 0); + ieee80211_xmit(sdata, sta, skb); local_bh_enable(); ret = 0; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index e6e192f53e4e..bdc0f29dc6cd 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -313,9 +313,14 @@ void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, lockdep_assert_held(&local->chanctx_mtx); - /* don't optimize 5MHz, 10MHz, and radar_enabled confs */ + /* don't optimize non-20MHz based and radar_enabled confs */ if (ctx->conf.def.width == NL80211_CHAN_WIDTH_5 || ctx->conf.def.width == NL80211_CHAN_WIDTH_10 || + ctx->conf.def.width == NL80211_CHAN_WIDTH_1 || + ctx->conf.def.width == NL80211_CHAN_WIDTH_2 || + ctx->conf.def.width == NL80211_CHAN_WIDTH_4 || + ctx->conf.def.width == NL80211_CHAN_WIDTH_8 || + ctx->conf.def.width == NL80211_CHAN_WIDTH_16 || ctx->conf.radar_enabled) { ctx->conf.min_def = ctx->conf.def; return; @@ -743,7 +748,7 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, default: WARN_ONCE(1, "Invalid SMPS mode %d\n", sdata->smps_mode); - /* fall through */ + fallthrough; case IEEE80211_SMPS_OFF: needed_static = sdata->needed_rx_chains; needed_dynamic = sdata->needed_rx_chains; diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index d7e955127d5c..fe8a7a87e513 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -638,6 +638,9 @@ IEEE80211_IF_FILE(dot11MeshAwakeWindowDuration, u.mesh.mshcfg.dot11MeshAwakeWindowDuration, DEC); IEEE80211_IF_FILE(dot11MeshConnectedToMeshGate, u.mesh.mshcfg.dot11MeshConnectedToMeshGate, DEC); +IEEE80211_IF_FILE(dot11MeshNolearn, u.mesh.mshcfg.dot11MeshNolearn, DEC); +IEEE80211_IF_FILE(dot11MeshConnectedToAuthServer, + u.mesh.mshcfg.dot11MeshConnectedToAuthServer, DEC); #endif #define DEBUGFS_ADD_MODE(name, mode) \ @@ -762,6 +765,8 @@ static void add_mesh_config(struct ieee80211_sub_if_data *sdata) MESHPARAMS_ADD(power_mode); MESHPARAMS_ADD(dot11MeshAwakeWindowDuration); MESHPARAMS_ADD(dot11MeshConnectedToMeshGate); + MESHPARAMS_ADD(dot11MeshNolearn); + MESHPARAMS_ADD(dot11MeshConnectedToAuthServer); #undef MESHPARAMS_ADD } #endif diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index de69fc9c4f07..41d495d73d3a 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -12,12 +12,11 @@ #include "ieee80211_i.h" #include "trace.h" -static inline bool check_sdata_in_driver(struct ieee80211_sub_if_data *sdata) -{ - return !WARN(!(sdata->flags & IEEE80211_SDATA_IN_DRIVER), - "%s: Failed check-sdata-in-driver check, flags: 0x%x\n", - sdata->dev ? sdata->dev->name : sdata->name, sdata->flags); -} +#define check_sdata_in_driver(sdata) ({ \ + !WARN_ONCE(!(sdata->flags & IEEE80211_SDATA_IN_DRIVER), \ + "%s: Failed check-sdata-in-driver check, flags: 0x%x\n", \ + sdata->dev ? sdata->dev->name : sdata->name, sdata->flags); \ +}) static inline struct ieee80211_sub_if_data * get_bss_sdata(struct ieee80211_sub_if_data *sdata) diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index e32906202575..3d62a80b5790 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -250,7 +250,7 @@ bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata, switch (sdata->vif.bss_conf.chandef.width) { default: WARN_ON_ONCE(1); - /* fall through */ + fallthrough; case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: bw = IEEE80211_STA_RX_BW_20; @@ -517,7 +517,7 @@ int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, case IEEE80211_SMPS_AUTOMATIC: case IEEE80211_SMPS_NUM_MODES: WARN_ON(1); - /* fall through */ + fallthrough; case IEEE80211_SMPS_OFF: action_frame->u.action.u.ht_smps.smps_control = WLAN_HT_SMPS_CONTROL_DISABLED; diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 81d26fef41e9..53632c2f5217 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -791,7 +791,7 @@ ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata, case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_20_NOHT: sta_flags |= IEEE80211_STA_DISABLE_HT; - /* fall through */ + fallthrough; case NL80211_CHAN_WIDTH_20: sta_flags |= IEEE80211_STA_DISABLE_40MHZ; break; @@ -1401,7 +1401,7 @@ ieee80211_ibss_setup_scan_channels(struct wiphy *wiphy, break; case NL80211_CHAN_WIDTH_80P80: cf2 = chandef->center_freq2; - /* fall through */ + fallthrough; case NL80211_CHAN_WIDTH_80: width = 80; break; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index ec1a71ac65f2..0b1eaec6649f 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -164,7 +164,6 @@ typedef unsigned __bitwise ieee80211_tx_result; #define TX_DROP ((__force ieee80211_tx_result) 1u) #define TX_QUEUED ((__force ieee80211_tx_result) 2u) -#define IEEE80211_TX_NO_SEQNO BIT(0) #define IEEE80211_TX_UNICAST BIT(1) #define IEEE80211_TX_PS_BUFFERED BIT(2) @@ -218,7 +217,7 @@ enum ieee80211_rx_flags { }; struct ieee80211_rx_data { - struct napi_struct *napi; + struct list_head *list; struct sk_buff *skb; struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; @@ -1967,12 +1966,11 @@ void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata, void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata, bool bss_notify, bool enable_qos); void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, struct sk_buff *skb, - u32 txdata_flags); + struct sta_info *sta, struct sk_buff *skb); void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, - enum nl80211_band band, u32 txdata_flags); + enum nl80211_band band); /* sta_out needs to be checked for ERR_PTR() before using */ int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, @@ -1982,10 +1980,10 @@ int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, static inline void ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, - enum nl80211_band band, u32 txdata_flags) + enum nl80211_band band) { rcu_read_lock(); - __ieee80211_tx_skb_tid_band(sdata, skb, tid, band, txdata_flags); + __ieee80211_tx_skb_tid_band(sdata, skb, tid, band); rcu_read_unlock(); } @@ -2003,7 +2001,7 @@ static inline void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata, } __ieee80211_tx_skb_tid_band(sdata, skb, tid, - chanctx_conf->def.chan->band, 0); + chanctx_conf->def.chan->band); rcu_read_unlock(); } @@ -2290,7 +2288,7 @@ extern const struct ethtool_ops ieee80211_ethtool_ops; u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *pubsta, - int len); + int len, bool ampdu); #ifdef CONFIG_MAC80211_NOINLINE #define debug_noinline noinline #else diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index f900c84fb40f..9740ae8fa697 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -978,7 +978,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, case NL80211_IFTYPE_P2P_DEVICE: /* relies on synchronize_rcu() below */ RCU_INIT_POINTER(local->p2p_sdata, NULL); - /* fall through */ + fallthrough; default: cancel_work_sync(&sdata->work); /* @@ -1048,7 +1048,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)) break; - /* fall through */ + fallthrough; default: if (going_down) drv_remove_interface(local, sdata); @@ -1183,17 +1183,24 @@ static u16 ieee80211_monitor_select_queue(struct net_device *dev, { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr; - struct ieee80211_radiotap_header *rtap = (void *)skb->data; + int len_rthdr; if (local->hw.queues < IEEE80211_NUM_ACS) return 0; - if (skb->len < 4 || - skb->len < le16_to_cpu(rtap->it_len) + 2 /* frame control */) + /* reset flags and info before parsing radiotap header */ + memset(info, 0, sizeof(*info)); + + if (!ieee80211_parse_tx_radiotap(skb, dev)) return 0; /* doesn't matter, frame will be dropped */ - hdr = (void *)((u8 *)skb->data + le16_to_cpu(rtap->it_len)); + len_rthdr = ieee80211_get_radiotap_len(skb->data); + hdr = (struct ieee80211_hdr *)(skb->data + len_rthdr); + if (skb->len < len_rthdr + 2 || + skb->len < len_rthdr + ieee80211_hdrlen(hdr->frame_control)) + return 0; /* doesn't matter, frame will be dropped */ return ieee80211_select_queue_80211(sdata, skb, hdr); } @@ -1497,7 +1504,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, type = NL80211_IFTYPE_AP; sdata->vif.type = type; sdata->vif.p2p = true; - /* fall through */ + fallthrough; case NL80211_IFTYPE_AP: skb_queue_head_init(&sdata->u.ap.ps.bc_buf); INIT_LIST_HEAD(&sdata->u.ap.vlans); @@ -1507,7 +1514,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, type = NL80211_IFTYPE_STATION; sdata->vif.type = type; sdata->vif.p2p = true; - /* fall through */ + fallthrough; case NL80211_IFTYPE_STATION: sdata->vif.bss_conf.bssid = sdata->u.mgd.bssid; ieee80211_sta_setup_sdata(sdata); @@ -1703,7 +1710,7 @@ static void ieee80211_assign_perm_addr(struct ieee80211_local *local, goto out_unlock; } } - /* fall through */ + fallthrough; default: /* assign a new address if possible -- try n_addresses first */ for (i = 0; i < local->hw.wiphy->n_addresses; i++) { diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 8f403c1bb908..9c2888004878 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -225,7 +225,7 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) */ if (sdata->hw_80211_encap) return -EINVAL; - /* Fall through */ + fallthrough; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index e88beb3ff6db..7ecd801a943b 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -260,6 +260,7 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata, bool is_connected_to_gate = ifmsh->num_gates > 0 || ifmsh->mshcfg.dot11MeshGateAnnouncementProtocol || ifmsh->mshcfg.dot11MeshConnectedToMeshGate; + bool is_connected_to_as = ifmsh->mshcfg.dot11MeshConnectedToAuthServer; if (skb_tailroom(skb) < 2 + meshconf_len) return -ENOMEM; @@ -284,7 +285,9 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata, /* Mesh Formation Info - number of neighbors */ neighbors = atomic_read(&ifmsh->estab_plinks); neighbors = min_t(int, neighbors, IEEE80211_MAX_MESH_PEERINGS); - *pos++ = (neighbors << 1) | is_connected_to_gate; + *pos++ = (is_connected_to_as << 7) | + (neighbors << 1) | + is_connected_to_gate; /* Mesh capability */ *pos = 0x00; *pos |= ifmsh->mshcfg.dot11MeshForwarding ? @@ -1107,10 +1110,10 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata, switch (sdata->vif.bss_conf.chandef.width) { case NL80211_CHAN_WIDTH_20_NOHT: sta_flags |= IEEE80211_STA_DISABLE_HT; - /* fall through */ + fallthrough; case NL80211_CHAN_WIDTH_20: sta_flags |= IEEE80211_STA_DISABLE_40MHZ; - /* fall through */ + fallthrough; case NL80211_CHAN_WIDTH_40: sta_flags |= IEEE80211_STA_DISABLE_VHT; break; diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 02cde0fd08fe..bec23d2eee7a 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -1173,6 +1173,40 @@ int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata, } /** + * mesh_nexthop_lookup_nolearn - try to set next hop without path discovery + * @skb: 802.11 frame to be sent + * @sdata: network subif the frame will be sent through + * + * Check if the meshDA (addr3) of a unicast frame is a direct neighbor. + * And if so, set the RA (addr1) to it to transmit to this node directly, + * avoiding PREQ/PREP path discovery. + * + * Returns: 0 if the next hop was found and -ENOENT otherwise. + */ +static int mesh_nexthop_lookup_nolearn(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; + struct sta_info *sta; + + if (is_multicast_ether_addr(hdr->addr1)) + return -ENOENT; + + rcu_read_lock(); + sta = sta_info_get(sdata, hdr->addr3); + + if (!sta || sta->mesh->plink_state != NL80211_PLINK_ESTAB) { + rcu_read_unlock(); + return -ENOENT; + } + rcu_read_unlock(); + + memcpy(hdr->addr1, hdr->addr3, ETH_ALEN); + memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); + return 0; +} + +/** * mesh_nexthop_lookup - put the appropriate next hop on a mesh frame. Calling * this function is considered "using" the associated mpath, so preempt a path * refresh if this mpath expires soon. @@ -1185,11 +1219,16 @@ int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata, int mesh_nexthop_lookup(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct mesh_path *mpath; struct sta_info *next_hop; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; u8 *target_addr = hdr->addr3; + if (ifmsh->mshcfg.dot11MeshNolearn && + !mesh_nexthop_lookup_nolearn(sdata, skb)) + return 0; + mpath = mesh_path_lookup(sdata, target_addr); if (!mpath || !(mpath->flags & MESH_PATH_ACTIVE)) return -ENOENT; @@ -1266,7 +1305,7 @@ void mesh_path_tx_root_frame(struct ieee80211_sub_if_data *sdata) break; case IEEE80211_PROACTIVE_PREQ_WITH_PREP: flags |= IEEE80211_PREQ_PROACTIVE_PREP_FLAG; - /* fall through */ + fallthrough; case IEEE80211_PROACTIVE_PREQ_NO_PREP: interval = ifmsh->mshcfg.dot11MeshHWMPactivePathToRootTimeout; target_flags |= IEEE80211_PREQ_TO_FLAG | diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index aca608ae313f..48f31ac9233c 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -72,7 +72,6 @@ static void mesh_table_free(struct mesh_table *tbl) } /** - * * mesh_path_assign_nexthop - update mesh path next hop * * @mpath: mesh path to update @@ -140,7 +139,6 @@ static void prepare_for_gate(struct sk_buff *skb, char *dst_addr, } /** - * * mesh_path_move_to_queue - Move or copy frames from one mpath queue to another * * This function is used to transfer or copy frames from an unresolved mpath to @@ -152,7 +150,7 @@ static void prepare_for_gate(struct sk_buff *skb, char *dst_addr, * * The gate mpath must be an active mpath with a valid mpath->next_hop. * - * @mpath: An active mpath the frames will be sent to (i.e. the gate) + * @gate_mpath: An active mpath the frames will be sent to (i.e. the gate) * @from_mpath: The failed mpath * @copy: When true, copy all the frames to the new mpath queue. When false, * move them. diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 798e4b6b383f..15f2fc658f70 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -699,7 +699,7 @@ void mesh_plink_timer(struct timer_list *t) break; } reason = WLAN_REASON_MESH_MAX_RETRIES; - /* fall through */ + fallthrough; case NL80211_PLINK_CNF_RCVD: /* confirm timer */ if (!reason) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index b2a9d47cf86d..ac870309b911 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -533,7 +533,7 @@ static void ieee80211_add_ht_ie(struct ieee80211_sub_if_data *sdata, case IEEE80211_SMPS_AUTOMATIC: case IEEE80211_SMPS_NUM_MODES: WARN_ON(1); - /* fall through */ + fallthrough; case IEEE80211_SMPS_OFF: cap |= WLAN_HT_CAP_SM_PS_DISABLED << IEEE80211_HT_CAP_SM_PS_SHIFT; @@ -1529,7 +1529,7 @@ ieee80211_find_80211h_pwr_constr(struct ieee80211_sub_if_data *sdata, switch (channel->band) { default: WARN_ON_ONCE(1); - /* fall through */ + fallthrough; case NL80211_BAND_2GHZ: case NL80211_BAND_60GHZ: chan_increment = 1; @@ -2988,7 +2988,10 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); if (auth_alg == WLAN_AUTH_SAE && - status_code == WLAN_STATUS_ANTI_CLOG_REQUIRED) + (status_code == WLAN_STATUS_ANTI_CLOG_REQUIRED || + (auth_transaction == 1 && + (status_code == WLAN_STATUS_SAE_HASH_TO_ELEMENT || + status_code == WLAN_STATUS_SAE_PK)))) return; sdata_info(sdata, "%pM denied authentication (status %d)\n", @@ -3460,10 +3463,12 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, bss_conf->he_bss_color.partial = le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR); - bss_conf->he_bss_color.disabled = - le32_get_bits(elems->he_operation->he_oper_params, - IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED); - changed |= BSS_CHANGED_HE_BSS_COLOR; + bss_conf->he_bss_color.enabled = + !le32_get_bits(elems->he_operation->he_oper_params, + IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED); + + if (bss_conf->he_bss_color.enabled) + changed |= BSS_CHANGED_HE_BSS_COLOR; bss_conf->htc_trig_based_pkt_ext = le32_get_bits(elems->he_operation->he_oper_params, @@ -4558,6 +4563,9 @@ static void ieee80211_sta_bcn_mon_timer(struct timer_list *t) if (sdata->vif.csa_active && !ifmgd->csa_waiting_bcn) return; + if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER) + return; + sdata->u.mgd.connection_loss = false; ieee80211_queue_work(&sdata->local->hw, &sdata->u.mgd.beacon_connection_loss_work); diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index db3b8bf75656..f470d1a7ce9b 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -264,7 +264,7 @@ static void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc, if (roc->mgmt_tx_cookie) { if (!WARN_ON(!roc->frame)) { ieee80211_tx_skb_tid_band(roc->sdata, roc->frame, 7, - roc->chan->band, 0); + roc->chan->band); roc->frame = NULL; } } else { @@ -808,13 +808,13 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, if (!sdata->vif.bss_conf.ibss_joined) need_offchan = true; #ifdef CONFIG_MAC80211_MESH - /* fall through */ + fallthrough; case NL80211_IFTYPE_MESH_POINT: if (ieee80211_vif_is_mesh(&sdata->vif) && !sdata->u.mesh.mesh_id_len) need_offchan = true; #endif - /* fall through */ + fallthrough; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_P2P_GO: diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 5c5af4b5fc08..836cde516a18 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2578,8 +2578,8 @@ static void ieee80211_deliver_skb_to_local_stack(struct sk_buff *skb, memset(skb->cb, 0, sizeof(skb->cb)); /* deliver to local stack */ - if (rx->napi) - napi_gro_receive(rx->napi, skb); + if (rx->list) + list_add_tail(&skb->list, rx->list); else netif_receive_skb(skb); } @@ -3591,7 +3591,7 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) } __ieee80211_tx_skb_tid_band(rx->sdata, nskb, 7, - status->band, 0); + status->band); } dev_kfree_skb(rx->skb); return RX_QUEUED; @@ -3736,7 +3736,7 @@ static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx, I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop); if (rx->sta) rx->sta->rx_stats.dropped++; - /* fall through */ + fallthrough; case RX_CONTINUE: { struct ieee80211_rate *rate = NULL; struct ieee80211_supported_band *sband; @@ -3869,7 +3869,6 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) /* This is OK -- must be QoS data frame */ .security_idx = tid, .seqno_idx = tid, - .napi = NULL, /* must be NULL to not have races */ }; struct tid_ampdu_rx *tid_agg_rx; @@ -4479,8 +4478,8 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, /* deliver to local stack */ skb->protocol = eth_type_trans(skb, fast_rx->dev); memset(skb->cb, 0, sizeof(skb->cb)); - if (rx->napi) - napi_gro_receive(rx->napi, skb); + if (rx->list) + list_add_tail(&skb->list, rx->list); else netif_receive_skb(skb); @@ -4547,7 +4546,7 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx, static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, struct sk_buff *skb, - struct napi_struct *napi) + struct list_head *list) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata; @@ -4562,7 +4561,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, memset(&rx, 0, sizeof(rx)); rx.skb = skb; rx.local = local; - rx.napi = napi; + rx.list = list; if (ieee80211_is_data(fc) || ieee80211_is_mgmt(fc)) I802_DEBUG_INC(local->dot11ReceivedFragmentCount); @@ -4670,8 +4669,8 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, * This is the receive path handler. It is called by a low level driver when an * 802.11 MPDU is received from the hardware. */ -void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, - struct sk_buff *skb, struct napi_struct *napi) +void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, + struct sk_buff *skb, struct list_head *list) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_rate *rate = NULL; @@ -4752,7 +4751,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, break; default: WARN_ON_ONCE(1); - /* fall through */ + fallthrough; case RX_ENC_LEGACY: if (WARN_ON(status->rate_idx >= sband->n_bitrates)) goto drop; @@ -4763,36 +4762,53 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, status->rx_flags = 0; /* - * key references and virtual interfaces are protected using RCU - * and this requires that we are in a read-side RCU section during - * receive processing - */ - rcu_read_lock(); - - /* * Frames with failed FCS/PLCP checksum are not returned, * all other frames are returned without radiotap header * if it was previously present. * Also, frames with less than 16 bytes are dropped. */ skb = ieee80211_rx_monitor(local, skb, rate); - if (!skb) { - rcu_read_unlock(); + if (!skb) return; - } ieee80211_tpt_led_trig_rx(local, ((struct ieee80211_hdr *)skb->data)->frame_control, skb->len); - __ieee80211_rx_handle_packet(hw, pubsta, skb, napi); - - rcu_read_unlock(); + __ieee80211_rx_handle_packet(hw, pubsta, skb, list); return; drop: kfree_skb(skb); } +EXPORT_SYMBOL(ieee80211_rx_list); + +void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, + struct sk_buff *skb, struct napi_struct *napi) +{ + struct sk_buff *tmp; + LIST_HEAD(list); + + + /* + * key references and virtual interfaces are protected using RCU + * and this requires that we are in a read-side RCU section during + * receive processing + */ + rcu_read_lock(); + ieee80211_rx_list(hw, pubsta, skb, &list); + rcu_read_unlock(); + + if (!napi) { + netif_receive_skb_list(&list); + return; + } + + list_for_each_entry_safe(skb, tmp, &list, list) { + skb_list_del_init(skb); + napi_gro_receive(napi, skb); + } +} EXPORT_SYMBOL(ieee80211_rx_napi); /* This is a version of the rx handler that can be called from hard irq diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index ad90bbe57457..5ac2785cdc7b 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -591,7 +591,6 @@ static void ieee80211_send_scan_probe_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel) { struct sk_buff *skb; - u32 txdata_flags = 0; skb = ieee80211_build_probe_req(sdata, src, dst, ratemask, channel, ssid, ssid_len, @@ -600,15 +599,15 @@ static void ieee80211_send_scan_probe_req(struct ieee80211_sub_if_data *sdata, if (skb) { if (flags & IEEE80211_PROBE_FLAG_RANDOM_SN) { struct ieee80211_hdr *hdr = (void *)skb->data; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u16 sn = get_random_u32(); - txdata_flags |= IEEE80211_TX_NO_SEQNO; + info->control.flags |= IEEE80211_TX_CTRL_NO_SEQNO; hdr->seq_ctrl = cpu_to_le16(IEEE80211_SN_TO_SEQ(sn)); } IEEE80211_SKB_CB(skb)->flags |= tx_flags; - ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band, - txdata_flags); + ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band); } } @@ -913,6 +912,7 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, case NL80211_BSS_CHAN_WIDTH_10: local->scan_chandef.width = NL80211_CHAN_WIDTH_10; break; + default: case NL80211_BSS_CHAN_WIDTH_20: /* If scanning on oper channel, use whatever channel-type * is currently in use. diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index af4cc5fb678e..f2840d1d95cf 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1050,7 +1050,7 @@ static void __sta_info_destroy_part2(struct sta_info *sta) might_sleep(); lockdep_assert_held(&local->sta_mtx); - while (sta->sta_state == IEEE80211_STA_AUTHORIZED) { + if (sta->sta_state == IEEE80211_STA_AUTHORIZED) { ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC); WARN_ON_ONCE(ret); } @@ -1455,7 +1455,7 @@ static void ieee80211_send_null_response(struct sta_info *sta, int tid, } info->band = chanctx_conf->def.chan->band; - ieee80211_xmit(sdata, sta, skb, 0); + ieee80211_xmit(sdata, sta, skb); rcu_read_unlock(); } @@ -2424,7 +2424,8 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, BIT_ULL(NL80211_STA_INFO_LOCAL_PM) | BIT_ULL(NL80211_STA_INFO_PEER_PM) | BIT_ULL(NL80211_STA_INFO_NONPEER_PM) | - BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_GATE); + BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_GATE) | + BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_AS); sinfo->llid = sta->mesh->llid; sinfo->plid = sta->mesh->plid; @@ -2437,6 +2438,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, sinfo->peer_pm = sta->mesh->peer_pm; sinfo->nonpeer_pm = sta->mesh->nonpeer_pm; sinfo->connected_to_gate = sta->mesh->connected_to_gate; + sinfo->connected_to_as = sta->mesh->connected_to_as; #endif } diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 49728047dfad..9d398c9daa4c 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -385,6 +385,7 @@ DECLARE_EWMA(mesh_tx_rate_avg, 8, 16) * @processed_beacon: set to true after peer rates and capabilities are * processed * @connected_to_gate: true if mesh STA has a path to a mesh gate + * @connected_to_as: true if mesh STA has a path to a authentication server * @fail_avg: moving percentage of failed MSDUs * @tx_rate_avg: moving average of tx bitrate */ @@ -404,6 +405,7 @@ struct mesh_sta { bool processed_beacon; bool connected_to_gate; + bool connected_to_as; enum nl80211_plink_state plink_state; u32 plink_timeout; diff --git a/net/mac80211/status.c b/net/mac80211/status.c index cbc40b358ba2..adb1d30ce06e 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -799,7 +799,6 @@ static int ieee80211_tx_get_rates(struct ieee80211_hw *hw, struct ieee80211_tx_info *info, int *retry_count) { - int rates_idx = -1; int count = -1; int i; @@ -821,13 +820,12 @@ static int ieee80211_tx_get_rates(struct ieee80211_hw *hw, count += info->status.rates[i].count; } - rates_idx = i - 1; if (count < 0) count = 0; *retry_count = count; - return rates_idx; + return i - 1; } void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 4b0cff4a07bd..e01e4daeb8cd 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -239,7 +239,7 @@ static enum ieee80211_ac_numbers ieee80211_ac_from_wmm(int ac) switch (ac) { default: WARN_ON_ONCE(1); - /* fall through */ + fallthrough; case 0: return IEEE80211_AC_BE; case 1: @@ -952,7 +952,7 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev, set_sta_flag(sta, WLAN_STA_TDLS_INITIATOR); sta->sta.tdls_initiator = false; } - /* fall-through */ + fallthrough; case WLAN_TDLS_SETUP_CONFIRM: case WLAN_TDLS_DISCOVERY_REQUEST: initiator = true; @@ -967,7 +967,7 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev, clear_sta_flag(sta, WLAN_STA_TDLS_INITIATOR); sta->sta.tdls_initiator = true; } - /* fall-through */ + fallthrough; case WLAN_PUB_ACTION_TDLS_DISCOVER_RES: initiator = false; break; @@ -1222,7 +1222,7 @@ int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev, * by the AP. */ drv_mgd_protect_tdls_discover(sdata->local, sdata); - /* fall-through */ + fallthrough; case WLAN_TDLS_SETUP_CONFIRM: case WLAN_PUB_ACTION_TDLS_DISCOVER_RES: /* no special handling */ diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 1b4709694d2a..50ab5b9d8eab 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -22,7 +22,8 @@ #define LOCAL_PR_ARG __entry->wiphy_name #define STA_ENTRY __array(char, sta_addr, ETH_ALEN) -#define STA_ASSIGN (sta ? memcpy(__entry->sta_addr, sta->addr, ETH_ALEN) : memset(__entry->sta_addr, 0, ETH_ALEN)) +#define STA_ASSIGN (sta ? memcpy(__entry->sta_addr, sta->addr, ETH_ALEN) : \ + eth_zero_addr(__entry->sta_addr)) #define STA_NAMED_ASSIGN(s) memcpy(__entry->sta_addr, (s)->addr, ETH_ALEN) #define STA_PR_FMT " sta:%pM" #define STA_PR_ARG __entry->sta_addr diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 3529d1368068..dca01d7e6e3e 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -166,6 +166,7 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, if (r->flags & IEEE80211_RATE_MANDATORY_A) mrate = r->bitrate; break; + case NL80211_BAND_S1GHZ: case NL80211_BAND_60GHZ: /* TODO, for now fall through */ case NUM_NL80211_BANDS: @@ -824,6 +825,9 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) if (ieee80211_is_qos_nullfunc(hdr->frame_control)) return TX_CONTINUE; + if (info->control.flags & IEEE80211_TX_CTRL_NO_SEQNO) + return TX_CONTINUE; + /* * Anything but QoS data that has a sequence number field * (is long enough) gets a sequence number from the global @@ -832,8 +836,6 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) */ if (!ieee80211_is_data_qos(hdr->frame_control) || is_multicast_ether_addr(hdr->addr1)) { - if (tx->flags & IEEE80211_TX_NO_SEQNO) - return TX_CONTINUE; /* driver should assign sequence number */ info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ; /* for pure STA mode without beacons, we can do it */ @@ -1739,7 +1741,7 @@ static bool __ieee80211_tx(struct ieee80211_local *local, case NL80211_IFTYPE_AP_VLAN: sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); - /* fall through */ + fallthrough; default: vif = &sdata->vif; break; @@ -1890,7 +1892,7 @@ EXPORT_SYMBOL(ieee80211_tx_prepare_skb); */ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb, - bool txpending, u32 txdata_flags) + bool txpending) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_data tx; @@ -1908,8 +1910,6 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, led_len = skb->len; res_prepare = ieee80211_tx_prepare(sdata, &tx, sta, skb); - tx.flags |= txdata_flags; - if (unlikely(res_prepare == TX_DROP)) { ieee80211_free_txskb(&local->hw, skb); return true; @@ -1977,8 +1977,7 @@ static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata, } void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, struct sk_buff *skb, - u32 txdata_flags) + struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); @@ -2013,12 +2012,13 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, } ieee80211_set_qos_hdr(sdata, skb); - ieee80211_tx(sdata, sta, skb, false, txdata_flags); + ieee80211_tx(sdata, sta, skb, false); } -static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local, - struct sk_buff *skb) +bool ieee80211_parse_tx_radiotap(struct sk_buff *skb, + struct net_device *dev) { + struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); struct ieee80211_radiotap_iterator iterator; struct ieee80211_radiotap_header *rthdr = (struct ieee80211_radiotap_header *) skb->data; @@ -2037,6 +2037,18 @@ static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local, u8 vht_mcs = 0, vht_nss = 0; int i; + /* check for not even having the fixed radiotap header part */ + if (unlikely(skb->len < sizeof(struct ieee80211_radiotap_header))) + return false; /* too short to be possibly valid */ + + /* is it a header version we can trust to find length from? */ + if (unlikely(rthdr->it_version)) + return false; /* only version 0 is supported */ + + /* does the skb contain enough to deliver on the alleged length? */ + if (unlikely(skb->len < ieee80211_get_radiotap_len(skb->data))) + return false; /* skb too short for claimed rt header extent */ + info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | IEEE80211_TX_CTL_DONTFRAG; @@ -2084,6 +2096,8 @@ static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local, txflags = get_unaligned_le16(iterator.this_arg); if (txflags & IEEE80211_RADIOTAP_F_TX_NOACK) info->flags |= IEEE80211_TX_CTL_NO_ACK; + if (txflags & IEEE80211_RADIOTAP_F_TX_NOSEQNO) + info->control.flags |= IEEE80211_TX_CTRL_NO_SEQNO; break; case IEEE80211_RADIOTAP_RATE: @@ -2188,13 +2202,6 @@ static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local, local->hw.max_rate_tries); } - /* - * remove the radiotap header - * iterator->_max_length was sanity-checked against - * skb->len by iterator init - */ - skb_pull(skb, iterator._max_length); - return true; } @@ -2203,8 +2210,6 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, { struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); struct ieee80211_chanctx_conf *chanctx_conf; - struct ieee80211_radiotap_header *prthdr = - (struct ieee80211_radiotap_header *)skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr; struct ieee80211_sub_if_data *tmp_sdata, *sdata; @@ -2212,21 +2217,17 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, u16 len_rthdr; int hdrlen; - /* check for not even having the fixed radiotap header part */ - if (unlikely(skb->len < sizeof(struct ieee80211_radiotap_header))) - goto fail; /* too short to be possibly valid */ + memset(info, 0, sizeof(*info)); + info->flags = IEEE80211_TX_CTL_REQ_TX_STATUS | + IEEE80211_TX_CTL_INJECTED; - /* is it a header version we can trust to find length from? */ - if (unlikely(prthdr->it_version)) - goto fail; /* only version 0 is supported */ + /* Sanity-check and process the injection radiotap header */ + if (!ieee80211_parse_tx_radiotap(skb, dev)) + goto fail; - /* then there must be a radiotap header with a length we can use */ + /* we now know there is a radiotap header with a length we can use */ len_rthdr = ieee80211_get_radiotap_len(skb->data); - /* does the skb contain enough to deliver on the alleged length? */ - if (unlikely(skb->len < len_rthdr)) - goto fail; /* skb too short for claimed rt header extent */ - /* * fix up the pointers accounting for the radiotap * header still being in there. We are being given @@ -2272,11 +2273,6 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, skb->priority = *p & IEEE80211_QOS_CTL_TAG1D_MASK; } - memset(info, 0, sizeof(*info)); - - info->flags = IEEE80211_TX_CTL_REQ_TX_STATUS | - IEEE80211_TX_CTL_INJECTED; - rcu_read_lock(); /* @@ -2342,11 +2338,10 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, info->band = chandef->chan->band; - /* process and remove the injection radiotap header */ - if (!ieee80211_parse_tx_radiotap(local, skb)) - goto fail_rcu; + /* remove the injection radiotap header */ + skb_pull(skb, len_rthdr); - ieee80211_xmit(sdata, NULL, skb, 0); + ieee80211_xmit(sdata, NULL, skb); rcu_read_unlock(); return NETDEV_TX_OK; @@ -2382,7 +2377,7 @@ int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, } else if (sdata->wdev.use_4addr) { return -ENOLINK; } - /* fall through */ + fallthrough; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_OCB: case NL80211_IFTYPE_ADHOC: @@ -2552,7 +2547,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, band = chanctx_conf->def.chan->band; if (sdata->wdev.use_4addr) break; - /* fall through */ + fallthrough; case NL80211_IFTYPE_AP: if (sdata->vif.type == NL80211_IFTYPE_AP) chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); @@ -2999,7 +2994,7 @@ void ieee80211_check_fast_xmit(struct sta_info *sta) build.hdr_len = 30; break; } - /* fall through */ + fallthrough; case NL80211_IFTYPE_AP: fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS); /* DA BSSID SA */ @@ -3618,7 +3613,7 @@ begin: tx.skb = skb; tx.sdata = vif_to_sdata(info->control.vif); - if (txq->sta) { + if (txq->sta && !(info->flags & IEEE80211_TX_CTL_INJECTED)) { tx.sta = container_of(txq->sta, struct sta_info, sta); /* * Drop unicast frames to unauthorised stations unless they are @@ -3710,7 +3705,7 @@ begin: case NL80211_IFTYPE_AP_VLAN: tx.sdata = container_of(tx.sdata->bss, struct ieee80211_sub_if_data, u.ap); - /* fall through */ + fallthrough; default: vif = &tx.sdata->vif; break; @@ -3721,10 +3716,11 @@ encap_out: if (vif && wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) { + bool ampdu = txq->ac != IEEE80211_AC_VO; u32 airtime; airtime = ieee80211_calc_expected_tx_airtime(hw, vif, txq->sta, - skb->len); + skb->len, ampdu); if (airtime) { airtime = ieee80211_info_set_tx_time_est(info, airtime); ieee80211_sta_update_pending_airtime(local, tx.sta, @@ -3950,6 +3946,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, if (local->ops->wake_tx_queue) { u16 queue = __ieee80211_select_queue(sdata, sta, skb); skb_set_queue_mapping(skb, queue); + skb_get_hash(skb); } if (sta) { @@ -4008,7 +4005,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, ieee80211_tx_stats(dev, skb->len); - ieee80211_xmit(sdata, sta, skb, 0); + ieee80211_xmit(sdata, sta, skb); } goto out; out_free: @@ -4049,7 +4046,7 @@ static bool ieee80211_multicast_to_unicast(struct sk_buff *skb, return false; if (sdata->wdev.use_4addr) return false; - /* fall through */ + fallthrough; case NL80211_IFTYPE_AP: /* check runtime toggle for this bss */ if (!sdata->bss->multicast_to_unicast) @@ -4375,7 +4372,7 @@ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local, return true; } info->band = chanctx_conf->def.chan->band; - result = ieee80211_tx(sdata, NULL, skb, true, 0); + result = ieee80211_tx(sdata, NULL, skb, true); } else if (info->control.flags & IEEE80211_TX_CTRL_HW_80211_ENCAP) { if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) { dev_kfree_skb(skb); @@ -5334,7 +5331,7 @@ EXPORT_SYMBOL(ieee80211_unreserve_tid); void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, - enum nl80211_band band, u32 txdata_flags) + enum nl80211_band band) { int ac = ieee80211_ac_from_tid(tid); @@ -5351,7 +5348,7 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, */ local_bh_disable(); IEEE80211_SKB_CB(skb)->band = band; - ieee80211_xmit(sdata, NULL, skb, txdata_flags); + ieee80211_xmit(sdata, NULL, skb); local_bh_enable(); } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index dd9f5c7a1ade..c8504ffc71a1 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -2347,10 +2347,10 @@ int ieee80211_reconfig(struct ieee80211_local *local) case NL80211_IFTYPE_ADHOC: if (sdata->vif.bss_conf.ibss_joined) WARN_ON(drv_join_ibss(local, sdata)); - /* fall through */ + fallthrough; default: ieee80211_reconfig_stations(sdata); - /* fall through */ + fallthrough; case NL80211_IFTYPE_AP: /* AP stations are handled later */ for (i = 0; i < IEEE80211_NUM_ACS; i++) drv_conf_tx(local, sdata, i, @@ -2399,7 +2399,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) break; case NL80211_IFTYPE_ADHOC: changed |= BSS_CHANGED_IBSS; - /* fall through */ + fallthrough; case NL80211_IFTYPE_AP: changed |= BSS_CHANGED_SSID | BSS_CHANGED_P2P_PS; @@ -2414,8 +2414,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) if (rcu_access_pointer(sdata->u.ap.beacon)) drv_start_ap(local, sdata); } - - /* fall through */ + fallthrough; case NL80211_IFTYPE_MESH_POINT: if (sdata->vif.bss_conf.enable_beacon) { changed |= BSS_CHANGED_BEACON | @@ -2889,7 +2888,7 @@ void ieee80211_ie_build_he_6ghz_cap(struct ieee80211_sub_if_data *sdata, case IEEE80211_SMPS_AUTOMATIC: case IEEE80211_SMPS_NUM_MODES: WARN_ON(1); - /* fall through */ + fallthrough; case IEEE80211_SMPS_OFF: cap |= u16_encode_bits(WLAN_HT_CAP_SM_PS_DISABLED, IEEE80211_HE_6GHZ_CAP_SM_PS); @@ -3200,7 +3199,7 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info, break; case 0x01: support_80_80 = false; - /* fall through */ + fallthrough; case 0x02: case 0x03: ccf1 = ccfs2; @@ -3570,7 +3569,7 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, break; default: WARN_ON(1); - /* fall through */ + fallthrough; case RX_ENC_LEGACY: { struct ieee80211_supported_band *sband; int shift = 0; @@ -3734,6 +3733,11 @@ u32 ieee80211_chandef_downgrade(struct cfg80211_chan_def *c) c->width = NL80211_CHAN_WIDTH_20_NOHT; ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT; break; + case NL80211_CHAN_WIDTH_1: + case NL80211_CHAN_WIDTH_2: + case NL80211_CHAN_WIDTH_4: + case NL80211_CHAN_WIDTH_8: + case NL80211_CHAN_WIDTH_16: case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: WARN_ON_ONCE(1); diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c index 72920d82928c..2fb99325135a 100644 --- a/net/mac80211/wme.c +++ b/net/mac80211/wme.c @@ -198,7 +198,7 @@ u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, sta = rcu_dereference(sdata->u.vlan.sta); if (sta) break; - /* fall through */ + fallthrough; case NL80211_IFTYPE_AP: ra = skb->data; break; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index fd30ea61336e..6fdd0c9f865a 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1584,21 +1584,10 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, unsigned int flags; if (event == NETDEV_REGISTER) { + mdev = mpls_add_dev(dev); + if (IS_ERR(mdev)) + return notifier_from_errno(PTR_ERR(mdev)); - /* For now just support Ethernet, IPGRE, IP6GRE, SIT and - * IPIP devices - */ - if (dev->type == ARPHRD_ETHER || - dev->type == ARPHRD_LOOPBACK || - dev->type == ARPHRD_IPGRE || - dev->type == ARPHRD_IP6GRE || - dev->type == ARPHRD_SIT || - dev->type == ARPHRD_TUNNEL || - dev->type == ARPHRD_TUNNEL6) { - mdev = mpls_add_dev(dev); - if (IS_ERR(mdev)) - return notifier_from_errno(PTR_ERR(mdev)); - } return NOTIFY_OK; } diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig index a9ed3bf1d93f..698bc3525160 100644 --- a/net/mptcp/Kconfig +++ b/net/mptcp/Kconfig @@ -13,17 +13,29 @@ config MPTCP if MPTCP +config INET_MPTCP_DIAG + depends on INET_DIAG + def_tristate INET_DIAG + config MPTCP_IPV6 bool "MPTCP: IPv6 support for Multipath TCP" select IPV6 default y -config MPTCP_HMAC_TEST - bool "Tests for MPTCP HMAC implementation" +endif + +config MPTCP_KUNIT_TESTS + tristate "This builds the MPTCP KUnit tests" if !KUNIT_ALL_TESTS + select MPTCP + depends on KUNIT + default KUNIT_ALL_TESTS help - This option enable boot time self-test for the HMAC implementation - used by the MPTCP code + Currently covers the MPTCP crypto and token helpers. + Only useful for kernel devs running KUnit test harness and are not + for inclusion into a production build. - Say N if you are unsure. + For more information on KUnit and unit tests in general please refer + to the KUnit documentation in Documentation/dev-tools/kunit/. + + If unsure, say N. -endif diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index baa0640527c7..a611968be4d7 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -3,3 +3,10 @@ obj-$(CONFIG_MPTCP) += mptcp.o mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ mib.o pm_netlink.o + +obj-$(CONFIG_SYN_COOKIES) += syncookies.o +obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o + +mptcp_crypto_test-objs := crypto_test.o +mptcp_token_test-objs := token_test.o +obj-$(CONFIG_MPTCP_KUNIT_TESTS) += mptcp_crypto_test.o mptcp_token_test.o diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c index 82bd2b54d741..05d398d3fde4 100644 --- a/net/mptcp/crypto.c +++ b/net/mptcp/crypto.c @@ -78,65 +78,6 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac) sha256(input, SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE, hmac); } -#ifdef CONFIG_MPTCP_HMAC_TEST -struct test_cast { - char *key; - char *msg; - char *result; -}; - -/* we can't reuse RFC 4231 test vectors, as we have constraint on the - * input and key size. - */ -static struct test_cast tests[] = { - { - .key = "0b0b0b0b0b0b0b0b", - .msg = "48692054", - .result = "8385e24fb4235ac37556b6b886db106284a1da671699f46db1f235ec622dcafa", - }, - { - .key = "aaaaaaaaaaaaaaaa", - .msg = "dddddddd", - .result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492984e1eb71aff9022f71046e9", - }, - { - .key = "0102030405060708", - .msg = "cdcdcdcd", - .result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6f23b4d8c4da736a5dbbc6e7d", - }, -}; - -static int __init test_mptcp_crypto(void) -{ - char hmac[32], hmac_hex[65]; - u32 nonce1, nonce2; - u64 key1, key2; - u8 msg[8]; - int i, j; - - for (i = 0; i < ARRAY_SIZE(tests); ++i) { - /* mptcp hmap will convert to be before computing the hmac */ - key1 = be64_to_cpu(*((__be64 *)&tests[i].key[0])); - key2 = be64_to_cpu(*((__be64 *)&tests[i].key[8])); - nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0])); - nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4])); - - put_unaligned_be32(nonce1, &msg[0]); - put_unaligned_be32(nonce2, &msg[4]); - - mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); - for (j = 0; j < 32; ++j) - sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff); - hmac_hex[64] = 0; - - if (memcmp(hmac_hex, tests[i].result, 64)) - pr_err("test %d failed, got %s expected %s", i, - hmac_hex, tests[i].result); - else - pr_info("test %d [ ok ]", i); - } - return 0; -} - -late_initcall(test_mptcp_crypto); +#if IS_MODULE(CONFIG_MPTCP_KUNIT_TESTS) +EXPORT_SYMBOL_GPL(mptcp_crypto_hmac_sha); #endif diff --git a/net/mptcp/crypto_test.c b/net/mptcp/crypto_test.c new file mode 100644 index 000000000000..017248dea038 --- /dev/null +++ b/net/mptcp/crypto_test.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <kunit/test.h> + +#include "protocol.h" + +struct test_case { + char *key; + char *msg; + char *result; +}; + +/* we can't reuse RFC 4231 test vectors, as we have constraint on the + * input and key size. + */ +static struct test_case tests[] = { + { + .key = "0b0b0b0b0b0b0b0b", + .msg = "48692054", + .result = "8385e24fb4235ac37556b6b886db106284a1da671699f46db1f235ec622dcafa", + }, + { + .key = "aaaaaaaaaaaaaaaa", + .msg = "dddddddd", + .result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492984e1eb71aff9022f71046e9", + }, + { + .key = "0102030405060708", + .msg = "cdcdcdcd", + .result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6f23b4d8c4da736a5dbbc6e7d", + }, +}; + +static void mptcp_crypto_test_basic(struct kunit *test) +{ + char hmac[32], hmac_hex[65]; + u32 nonce1, nonce2; + u64 key1, key2; + u8 msg[8]; + int i, j; + + for (i = 0; i < ARRAY_SIZE(tests); ++i) { + /* mptcp hmap will convert to be before computing the hmac */ + key1 = be64_to_cpu(*((__be64 *)&tests[i].key[0])); + key2 = be64_to_cpu(*((__be64 *)&tests[i].key[8])); + nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0])); + nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4])); + + put_unaligned_be32(nonce1, &msg[0]); + put_unaligned_be32(nonce2, &msg[4]); + + mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); + for (j = 0; j < 32; ++j) + sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff); + hmac_hex[64] = 0; + + KUNIT_EXPECT_STREQ(test, &hmac_hex[0], tests[i].result); + } +} + +static struct kunit_case mptcp_crypto_test_cases[] = { + KUNIT_CASE(mptcp_crypto_test_basic), + {} +}; + +static struct kunit_suite mptcp_crypto_suite = { + .name = "mptcp-crypto", + .test_cases = mptcp_crypto_test_cases, +}; + +kunit_test_suite(mptcp_crypto_suite); + +MODULE_LICENSE("GPL"); diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 8e39585d37f3..54b888f94009 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -112,6 +112,7 @@ static struct pernet_operations mptcp_pernet_ops = { void __init mptcp_init(void) { + mptcp_join_cookie_init(); mptcp_proto_init(); if (register_pernet_subsys(&mptcp_pernet_ops) < 0) diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c new file mode 100644 index 000000000000..5f390a97f556 --- /dev/null +++ b/net/mptcp/mptcp_diag.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 +/* MPTCP socket monitoring support + * + * Copyright (c) 2020 Red Hat + * + * Author: Paolo Abeni <pabeni@redhat.com> + */ + +#include <linux/kernel.h> +#include <linux/net.h> +#include <linux/inet_diag.h> +#include <net/netlink.h> +#include <uapi/linux/mptcp.h> +#include "protocol.h" + +static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + const struct inet_diag_req_v2 *req, + struct nlattr *bc, bool net_admin) +{ + if (!inet_diag_bc_sk(bc, sk)) + return 0; + + return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, req, NLM_F_MULTI, + net_admin); +} + +static int mptcp_diag_dump_one(struct netlink_callback *cb, + const struct inet_diag_req_v2 *req) +{ + struct sk_buff *in_skb = cb->skb; + struct mptcp_sock *msk = NULL; + struct sk_buff *rep; + int err = -ENOENT; + struct net *net; + struct sock *sk; + + net = sock_net(in_skb->sk); + msk = mptcp_token_get_sock(req->id.idiag_cookie[0]); + if (!msk) + goto out_nosk; + + err = -ENOMEM; + sk = (struct sock *)msk; + rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) + + inet_diag_msg_attrs_size() + + nla_total_size(sizeof(struct mptcp_info)) + + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64, + GFP_KERNEL); + if (!rep) + goto out; + + err = inet_sk_diag_fill(sk, inet_csk(sk), rep, cb, req, 0, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + kfree_skb(rep); + goto out; + } + err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid, + MSG_DONTWAIT); + if (err > 0) + err = 0; +out: + sock_put(sk); + +out_nosk: + return err; +} + +static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, + const struct inet_diag_req_v2 *r) +{ + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); + struct net *net = sock_net(skb->sk); + struct inet_diag_dump_data *cb_data; + struct mptcp_sock *msk; + struct nlattr *bc; + + cb_data = cb->data; + bc = cb_data->inet_diag_nla_bc; + + while ((msk = mptcp_token_iter_next(net, &cb->args[0], &cb->args[1])) != + NULL) { + struct inet_sock *inet = (struct inet_sock *)msk; + struct sock *sk = (struct sock *)msk; + int ret = 0; + + if (!(r->idiag_states & (1 << sk->sk_state))) + goto next; + if (r->sdiag_family != AF_UNSPEC && + sk->sk_family != r->sdiag_family) + goto next; + if (r->id.idiag_sport != inet->inet_sport && + r->id.idiag_sport) + goto next; + if (r->id.idiag_dport != inet->inet_dport && + r->id.idiag_dport) + goto next; + + ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin); +next: + sock_put(sk); + if (ret < 0) { + /* will retry on the same position */ + cb->args[1]--; + break; + } + cond_resched(); + } +} + +static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + void *_info) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_info *info = _info; + u32 flags = 0; + bool slow; + u8 val; + + r->idiag_rqueue = sk_rmem_alloc_get(sk); + r->idiag_wqueue = sk_wmem_alloc_get(sk); + if (!info) + return; + + slow = lock_sock_fast(sk); + info->mptcpi_subflows = READ_ONCE(msk->pm.subflows); + info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); + info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); + info->mptcpi_subflows_max = READ_ONCE(msk->pm.subflows_max); + val = READ_ONCE(msk->pm.add_addr_signal_max); + info->mptcpi_add_addr_signal_max = val; + val = READ_ONCE(msk->pm.add_addr_accept_max); + info->mptcpi_add_addr_accepted_max = val; + if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) + flags |= MPTCP_INFO_FLAG_FALLBACK; + if (READ_ONCE(msk->can_ack)) + flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED; + info->mptcpi_flags = flags; + info->mptcpi_token = READ_ONCE(msk->token); + info->mptcpi_write_seq = READ_ONCE(msk->write_seq); + info->mptcpi_snd_una = atomic64_read(&msk->snd_una); + info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); + unlock_sock_fast(sk, slow); +} + +static const struct inet_diag_handler mptcp_diag_handler = { + .dump = mptcp_diag_dump, + .dump_one = mptcp_diag_dump_one, + .idiag_get_info = mptcp_diag_get_info, + .idiag_type = IPPROTO_MPTCP, + .idiag_info_size = sizeof(struct mptcp_info), +}; + +static int __init mptcp_diag_init(void) +{ + return inet_diag_register(&mptcp_diag_handler); +} + +static void __exit mptcp_diag_exit(void) +{ + inet_diag_unregister(&mptcp_diag_handler); +} + +module_init(mptcp_diag_init); +module_exit(mptcp_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-262 /* AF_INET - IPPROTO_MPTCP */); diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 8f940be42f98..7fa822b55c34 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -451,6 +451,8 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, struct sk_buff *skb, struct mptcp_ext *ext) { + u64 data_fin_tx_seq = READ_ONCE(mptcp_sk(subflow->conn)->write_seq); + if (!ext->use_map || !skb->len) { /* RFC6824 requires a DSS mapping with specific values * if DATA_FIN is set but no data payload is mapped @@ -458,10 +460,13 @@ static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, ext->data_fin = 1; ext->use_map = 1; ext->dsn64 = 1; - ext->data_seq = subflow->data_fin_tx_seq; + /* The write_seq value has already been incremented, so + * the actual sequence number for the DATA_FIN is one less. + */ + ext->data_seq = data_fin_tx_seq - 1; ext->subflow_seq = 0; ext->data_len = 1; - } else if (ext->data_seq + ext->data_len == subflow->data_fin_tx_seq) { + } else if (ext->data_seq + ext->data_len == data_fin_tx_seq) { /* If there's an existing DSS mapping and it is the * final mapping, DATA_FIN consumes 1 additional byte of * mapping space. @@ -477,22 +482,17 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); unsigned int dss_size = 0; + u64 snd_data_fin_enable; struct mptcp_ext *mpext; - struct mptcp_sock *msk; unsigned int ack_size; bool ret = false; - u8 tcp_fin; - if (skb) { - mpext = mptcp_get_ext(skb); - tcp_fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; - } else { - mpext = NULL; - tcp_fin = 0; - } + mpext = skb ? mptcp_get_ext(skb) : NULL; + snd_data_fin_enable = READ_ONCE(msk->snd_data_fin_enable); - if (!skb || (mpext && mpext->use_map) || tcp_fin) { + if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) { unsigned int map_size; map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; @@ -502,7 +502,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, if (mpext) opts->ext_copy = *mpext; - if (skb && tcp_fin && subflow->data_fin_tx_enable) + if (skb && snd_data_fin_enable) mptcp_write_data_fin(subflow, skb, &opts->ext_copy); ret = true; } @@ -511,7 +511,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, * if the first subflow may have the already the remote key handy */ opts->ext_copy.use_ack = 0; - msk = mptcp_sk(subflow->conn); if (!READ_ONCE(msk->can_ack)) { *size = ALIGN(dss_size, 4); return ret; @@ -624,6 +623,9 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, opts->suboptions = 0; + if (unlikely(mptcp_check_fallback(sk))) + return false; + if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts)) ret = true; else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining, @@ -706,6 +708,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, * additional ack. */ subflow->fully_established = 1; + WRITE_ONCE(msk->fully_established, true); goto fully_established; } @@ -714,15 +717,14 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, */ if (!mp_opt->mp_capable) { subflow->mp_capable = 0; - tcp_sk(sk)->is_mptcp = 0; + pr_fallback(msk); + __mptcp_do_fallback(msk); return false; } if (unlikely(!READ_ONCE(msk->pm.server_side))) pr_warn_once("bogus mpc option on established client sk"); - subflow->fully_established = 1; - subflow->remote_key = mp_opt->sndr_key; - subflow->can_ack = 1; + mptcp_subflow_fully_established(subflow, mp_opt); fully_established: if (likely(subflow->pm_notified)) @@ -780,6 +782,22 @@ static void update_una(struct mptcp_sock *msk, } } +bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq) +{ + /* Skip if DATA_FIN was already received. + * If updating simultaneously with the recvmsg loop, values + * should match. If they mismatch, the peer is misbehaving and + * we will prefer the most recent information. + */ + if (READ_ONCE(msk->rcv_data_fin) || !READ_ONCE(msk->first)) + return false; + + WRITE_ONCE(msk->rcv_data_fin_seq, data_fin_seq); + WRITE_ONCE(msk->rcv_data_fin, 1); + + return true; +} + static bool add_addr_hmac_valid(struct mptcp_sock *msk, struct mptcp_options_received *mp_opt) { @@ -814,6 +832,9 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, struct mptcp_options_received mp_opt; struct mptcp_ext *mpext; + if (__mptcp_check_fallback(msk)) + return; + mptcp_get_options(skb, &mp_opt); if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) return; @@ -847,6 +868,20 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, if (mp_opt.use_ack) update_una(msk, &mp_opt); + /* Zero-data-length packets are dropped by the caller and not + * propagated to the MPTCP layer, so the skb extension does not + * need to be allocated or populated. DATA_FIN information, if + * present, needs to be updated here before the skb is freed. + */ + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { + if (mp_opt.data_fin && mp_opt.data_len == 1 && + mptcp_update_rcv_data_fin(msk, mp_opt.data_seq) && + schedule_work(&msk->work)) + sock_hold(subflow->conn); + + return; + } + mpext = skb_ext_add(skb, SKB_EXT_MPTCP); if (!mpext) return; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 977d9c8b1453..a8ad20559aaa 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -10,8 +10,6 @@ #include <net/mptcp.h> #include "protocol.h" -static struct workqueue_struct *pm_wq; - /* path manager command handlers */ int mptcp_pm_announce_addr(struct mptcp_sock *msk, @@ -78,7 +76,7 @@ static bool mptcp_pm_schedule_work(struct mptcp_sock *msk, return false; msk->pm.status |= BIT(new_status); - if (queue_work(pm_wq, &msk->pm.work)) + if (schedule_work(&msk->work)) sock_hold((struct sock *)msk); return true; } @@ -181,35 +179,6 @@ int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) return mptcp_pm_nl_get_local_id(msk, skc); } -static void pm_worker(struct work_struct *work) -{ - struct mptcp_pm_data *pm = container_of(work, struct mptcp_pm_data, - work); - struct mptcp_sock *msk = container_of(pm, struct mptcp_sock, pm); - struct sock *sk = (struct sock *)msk; - - lock_sock(sk); - spin_lock_bh(&msk->pm.lock); - - pr_debug("msk=%p status=%x", msk, pm->status); - if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { - pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); - mptcp_pm_nl_add_addr_received(msk); - } - if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { - pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); - mptcp_pm_nl_fully_established(msk); - } - if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { - pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); - mptcp_pm_nl_subflow_established(msk); - } - - spin_unlock_bh(&msk->pm.lock); - release_sock(sk); - sock_put(sk); -} - void mptcp_pm_data_init(struct mptcp_sock *msk) { msk->pm.add_addr_signaled = 0; @@ -223,22 +192,11 @@ void mptcp_pm_data_init(struct mptcp_sock *msk) msk->pm.status = 0; spin_lock_init(&msk->pm.lock); - INIT_WORK(&msk->pm.work, pm_worker); mptcp_pm_nl_data_init(msk); } -void mptcp_pm_close(struct mptcp_sock *msk) -{ - if (cancel_work_sync(&msk->pm.work)) - sock_put((struct sock *)msk); -} - -void mptcp_pm_init(void) +void __init mptcp_pm_init(void) { - pm_wq = alloc_workqueue("pm_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8); - if (!pm_wq) - panic("Failed to allocate workqueue"); - mptcp_pm_nl_init(); } diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index b78edf237ba0..c8820c4156e6 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -851,7 +851,7 @@ static struct pernet_operations mptcp_pm_pernet_ops = { .size = sizeof(struct pm_nl_pernet), }; -void mptcp_pm_nl_init(void) +void __init mptcp_pm_nl_init(void) { if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0) panic("Failed to register MPTCP PM pernet subsystem.\n"); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index c0abe738e7d3..8c1d1a595701 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -16,6 +16,7 @@ #include <net/inet_hashtables.h> #include <net/protocol.h> #include <net/tcp.h> +#include <net/tcp_states.h> #if IS_ENABLED(CONFIG_MPTCP_IPV6) #include <net/transp_v6.h> #endif @@ -52,18 +53,10 @@ static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) return msk->subflow; } -static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk) -{ - return msk->first && !sk_is_mptcp(msk->first); -} - -static struct socket *mptcp_is_tcpsk(struct sock *sk) +static bool mptcp_is_tcpsk(struct sock *sk) { struct socket *sock = sk->sk_socket; - if (sock->sk != sk) - return NULL; - if (unlikely(sk->sk_prot == &tcp_prot)) { /* we are being invoked after mptcp_accept() has * accepted a non-mp-capable flow: sk is a tcp_sk, @@ -73,59 +66,37 @@ static struct socket *mptcp_is_tcpsk(struct sock *sk) * bypass mptcp. */ sock->ops = &inet_stream_ops; - return sock; + return true; #if IS_ENABLED(CONFIG_MPTCP_IPV6) } else if (unlikely(sk->sk_prot == &tcpv6_prot)) { sock->ops = &inet6_stream_ops; - return sock; + return true; #endif } - return NULL; + return false; } -static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk) +static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk) { - struct socket *sock; - sock_owned_by_me((const struct sock *)msk); - sock = mptcp_is_tcpsk((struct sock *)msk); - if (unlikely(sock)) - return sock; - - if (likely(!__mptcp_needs_tcp_fallback(msk))) + if (likely(!__mptcp_check_fallback(msk))) return NULL; - return msk->subflow; -} - -static bool __mptcp_can_create_subflow(const struct mptcp_sock *msk) -{ - return !msk->first; + return msk->first; } -static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state) +static int __mptcp_socket_create(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; struct socket *ssock; int err; - ssock = __mptcp_tcp_fallback(msk); - if (unlikely(ssock)) - return ssock; - - ssock = __mptcp_nmpc_socket(msk); - if (ssock) - goto set_state; - - if (!__mptcp_can_create_subflow(msk)) - return ERR_PTR(-EINVAL); - err = mptcp_subflow_create_socket(sk, &ssock); if (err) - return ERR_PTR(err); + return err; msk->first = ssock->sk; msk->subflow = ssock; @@ -133,10 +104,12 @@ static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state) list_add(&subflow->node, &msk->conn_list); subflow->request_mptcp = 1; -set_state: - if (state != MPTCP_SAME_STATE) - inet_sk_state_store(sk, state); - return ssock; + /* accept() will wait on first subflow sk_wq, and we always wakes up + * via msk->sk_socket + */ + RCU_INIT_POINTER(msk->first->sk_wq, &sk->sk_socket->wq); + + return 0; } static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, @@ -170,6 +143,14 @@ static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, MPTCP_SKB_CB(skb)->offset = offset; } +static void mptcp_stop_timer(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + sk_stop_timer(sk, &icsk->icsk_retransmit_timer); + mptcp_sk(sk)->timer_ival = 0; +} + /* both sockets must be locked */ static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk, struct sock *ssk) @@ -191,6 +172,139 @@ static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk, return mptcp_subflow_data_available(ssk); } +static void mptcp_check_data_fin_ack(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (__mptcp_check_fallback(msk)) + return; + + /* Look for an acknowledged DATA_FIN */ + if (((1 << sk->sk_state) & + (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) && + msk->write_seq == atomic64_read(&msk->snd_una)) { + mptcp_stop_timer(sk); + + WRITE_ONCE(msk->snd_data_fin_enable, 0); + + switch (sk->sk_state) { + case TCP_FIN_WAIT1: + inet_sk_state_store(sk, TCP_FIN_WAIT2); + sk->sk_state_change(sk); + break; + case TCP_CLOSING: + fallthrough; + case TCP_LAST_ACK: + inet_sk_state_store(sk, TCP_CLOSE); + sk->sk_state_change(sk); + break; + } + + if (sk->sk_shutdown == SHUTDOWN_MASK || + sk->sk_state == TCP_CLOSE) + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + else + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + } +} + +static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (READ_ONCE(msk->rcv_data_fin) && + ((1 << sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) { + u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq); + + if (msk->ack_seq == rcv_data_fin_seq) { + if (seq) + *seq = rcv_data_fin_seq; + + return true; + } + } + + return false; +} + +static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk) +{ + long tout = ssk && inet_csk(ssk)->icsk_pending ? + inet_csk(ssk)->icsk_timeout - jiffies : 0; + + if (tout <= 0) + tout = mptcp_sk(sk)->timer_ival; + mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN; +} + +static void mptcp_check_data_fin(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + u64 rcv_data_fin_seq; + + if (__mptcp_check_fallback(msk) || !msk->first) + return; + + /* Need to ack a DATA_FIN received from a peer while this side + * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2. + * msk->rcv_data_fin was set when parsing the incoming options + * at the subflow level and the msk lock was not held, so this + * is the first opportunity to act on the DATA_FIN and change + * the msk state. + * + * If we are caught up to the sequence number of the incoming + * DATA_FIN, send the DATA_ACK now and do state transition. If + * not caught up, do nothing and let the recv code send DATA_ACK + * when catching up. + */ + + if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) { + struct mptcp_subflow_context *subflow; + + msk->ack_seq++; + WRITE_ONCE(msk->rcv_data_fin, 0); + + sk->sk_shutdown |= RCV_SHUTDOWN; + smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ + set_bit(MPTCP_DATA_READY, &msk->flags); + + switch (sk->sk_state) { + case TCP_ESTABLISHED: + inet_sk_state_store(sk, TCP_CLOSE_WAIT); + break; + case TCP_FIN_WAIT1: + inet_sk_state_store(sk, TCP_CLOSING); + break; + case TCP_FIN_WAIT2: + inet_sk_state_store(sk, TCP_CLOSE); + // @@ Close subflows now? + break; + default: + /* Other states not expected */ + WARN_ON_ONCE(1); + break; + } + + mptcp_set_timeout(sk, NULL); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(ssk); + tcp_send_ack(ssk); + release_sock(ssk); + } + + sk->sk_state_change(sk); + + if (sk->sk_shutdown == SHUTDOWN_MASK || + sk->sk_state == TCP_CLOSE) + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + else + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + } +} + static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, struct sock *ssk, unsigned int *bytes) @@ -207,13 +321,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, return false; } - if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int rcvbuf = max(ssk->sk_rcvbuf, sk->sk_rcvbuf); - - if (rcvbuf > sk->sk_rcvbuf) - sk->sk_rcvbuf = rcvbuf; - } - tp = tcp_sk(ssk); do { u32 map_remaining, offset; @@ -229,6 +336,15 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, if (!skb) break; + if (__mptcp_check_fallback(msk)) { + /* if we are running under the workqueue, TCP could have + * collapsed skbs between dummy map creation and now + * be sure to adjust the size + */ + map_remaining = skb->len; + subflow->map_data_len = skb->len; + } + offset = seq - TCP_SKB_CB(skb)->seq; fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (fin) { @@ -265,6 +381,15 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, *bytes = moved; + /* If the moves have caught up with the DATA_FIN sequence number + * it's time to ack the DATA_FIN and change socket state, but + * this is not a good place to change state. Let the workqueue + * do it. + */ + if (mptcp_pending_data_fin(sk, NULL) && + schedule_work(&msk->work)) + sock_hold(sk); + return done; } @@ -329,16 +454,6 @@ static void __mptcp_flush_join_list(struct mptcp_sock *msk) spin_unlock_bh(&msk->join_list_lock); } -static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk) -{ - long tout = ssk && inet_csk(ssk)->icsk_pending ? - inet_csk(ssk)->icsk_timeout - jiffies : 0; - - if (tout <= 0) - tout = mptcp_sk(sk)->timer_ival; - mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN; -} - static bool mptcp_timer_pending(struct sock *sk) { return timer_pending(&inet_csk(sk)->icsk_retransmit_timer); @@ -360,7 +475,8 @@ void mptcp_data_acked(struct sock *sk) { mptcp_reset_timer(sk); - if (!sk_stream_is_writeable(sk) && + if ((!sk_stream_is_writeable(sk) || + (inet_sk_state_load(sk) != TCP_ESTABLISHED)) && schedule_work(&mptcp_sk(sk)->work)) sock_hold(sk); } @@ -395,14 +511,6 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk) } } -static void mptcp_stop_timer(struct sock *sk) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - - sk_stop_timer(sk, &icsk->icsk_retransmit_timer); - mptcp_sk(sk)->timer_ival = 0; -} - static bool mptcp_ext_cache_refill(struct mptcp_sock *msk) { const struct sock *sk = (const struct sock *)msk; @@ -466,8 +574,15 @@ static void mptcp_clean_una(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; - u64 snd_una = atomic64_read(&msk->snd_una); bool cleaned = false; + u64 snd_una; + + /* on fallback we just need to ignore snd_una, as this is really + * plain TCP + */ + if (__mptcp_check_fallback(msk)) + atomic64_set(&msk->snd_una, msk->write_seq); + snd_una = atomic64_read(&msk->snd_una); list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) @@ -479,15 +594,20 @@ static void mptcp_clean_una(struct sock *sk) dfrag = mptcp_rtx_head(sk); if (dfrag && after64(snd_una, dfrag->data_seq)) { - u64 delta = dfrag->data_seq + dfrag->data_len - snd_una; + u64 delta = snd_una - dfrag->data_seq; + + if (WARN_ON_ONCE(delta > dfrag->data_len)) + goto out; dfrag->data_seq += delta; + dfrag->offset += delta; dfrag->data_len -= delta; dfrag_uncharge(sk, delta); cleaned = true; } +out: if (cleaned) { sk_mem_reclaim_partial(sk); @@ -673,7 +793,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, out: if (!retransmission) pfrag->offset += frag_truesize; - *write_seq += ret; + WRITE_ONCE(*write_seq, *write_seq + ret); mptcp_subflow_ctx(ssk)->rel_write_seq += ret; return ret; @@ -740,7 +860,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int mss_now = 0, size_goal = 0, ret = 0; struct mptcp_sock *msk = mptcp_sk(sk); struct page_frag *pfrag; - struct socket *ssock; size_t copied = 0; struct sock *ssk; bool tx_ok; @@ -759,19 +878,15 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto out; } -fallback: - ssock = __mptcp_tcp_fallback(msk); - if (unlikely(ssock)) { - release_sock(sk); - pr_debug("fallback passthrough"); - ret = sock_sendmsg(ssock, msg); - return ret >= 0 ? ret + copied : (copied ? copied : ret); - } - pfrag = sk_page_frag(sk); restart: mptcp_clean_una(sk); + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { + ret = -EPIPE; + goto out; + } + wait_for_sndbuf: __mptcp_flush_join_list(msk); ssk = mptcp_subflow_get_send(msk); @@ -819,17 +934,6 @@ wait_for_sndbuf: } break; } - if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) { - /* Can happen for passive sockets: - * 3WHS negotiated MPTCP, but first packet after is - * plain TCP (e.g. due to middlebox filtering unknown - * options). - * - * Fall back to TCP. - */ - release_sock(ssk); - goto fallback; - } copied += ret; @@ -880,7 +984,6 @@ wait_for_sndbuf: mptcp_set_timeout(sk, ssk); if (copied) { - ret = copied; tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, size_goal); @@ -893,7 +996,7 @@ wait_for_sndbuf: release_sock(ssk); out: release_sock(sk); - return ret; + return copied ? : ret; } static void mptcp_wait_data(struct sock *sk, long *timeo) @@ -949,6 +1052,100 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, return copied; } +/* receive buffer autotuning. See tcp_rcv_space_adjust for more information. + * + * Only difference: Use highest rtt estimate of the subflows in use. + */ +static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + u32 time, advmss = 1; + u64 rtt_us, mstamp; + + sock_owned_by_me(sk); + + if (copied <= 0) + return; + + msk->rcvq_space.copied += copied; + + mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC); + time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time); + + rtt_us = msk->rcvq_space.rtt_us; + if (rtt_us && time < (rtt_us >> 3)) + return; + + rtt_us = 0; + mptcp_for_each_subflow(msk, subflow) { + const struct tcp_sock *tp; + u64 sf_rtt_us; + u32 sf_advmss; + + tp = tcp_sk(mptcp_subflow_tcp_sock(subflow)); + + sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us); + sf_advmss = READ_ONCE(tp->advmss); + + rtt_us = max(sf_rtt_us, rtt_us); + advmss = max(sf_advmss, advmss); + } + + msk->rcvq_space.rtt_us = rtt_us; + if (time < (rtt_us >> 3) || rtt_us == 0) + return; + + if (msk->rcvq_space.copied <= msk->rcvq_space.space) + goto new_measure; + + if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && + !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { + int rcvmem, rcvbuf; + u64 rcvwin, grow; + + rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; + + grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space); + + do_div(grow, msk->rcvq_space.space); + rcvwin += (grow << 1); + + rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER); + while (tcp_win_from_space(sk, rcvmem) < advmss) + rcvmem += 128; + + do_div(rcvwin, advmss); + rcvbuf = min_t(u64, rcvwin * rcvmem, + sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); + + if (rcvbuf > sk->sk_rcvbuf) { + u32 window_clamp; + + window_clamp = tcp_win_from_space(sk, rcvbuf); + WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); + + /* Make subflows follow along. If we do not do this, we + * get drops at subflow level if skbs can't be moved to + * the mptcp rx queue fast enough (announced rcv_win can + * exceed ssk->sk_rcvbuf). + */ + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk; + + ssk = mptcp_subflow_tcp_sock(subflow); + WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf); + tcp_sk(ssk)->window_clamp = window_clamp; + } + } + } + + msk->rcvq_space.space = msk->rcvq_space.copied; +new_measure: + msk->rcvq_space.copied = 0; + msk->rcvq_space.time = mstamp; +} + static bool __mptcp_move_skbs(struct mptcp_sock *msk) { unsigned int moved = 0; @@ -972,7 +1169,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *ssock; int copied = 0; int target; long timeo; @@ -981,16 +1177,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return -EOPNOTSUPP; lock_sock(sk); - ssock = __mptcp_tcp_fallback(msk); - if (unlikely(ssock)) { -fallback: - release_sock(sk); - pr_debug("fallback-read subflow=%p", - mptcp_subflow_ctx(ssock->sk)); - copied = sock_recvmsg(ssock, msg, flags); - return copied; - } - timeo = sock_rcvtimeo(sk, nonblock); len = min_t(size_t, len, INT_MAX); @@ -1056,9 +1242,6 @@ fallback: pr_debug("block timeout %ld", timeo); mptcp_wait_data(sk, &timeo); - ssock = __mptcp_tcp_fallback(msk); - if (unlikely(ssock)) - goto fallback; } if (skb_queue_empty(&sk->sk_receive_queue)) { @@ -1075,6 +1258,8 @@ fallback: set_bit(MPTCP_DATA_READY, &msk->flags); } out_err: + mptcp_rcv_space_adjust(msk, copied); + release_sock(sk); return copied; } @@ -1083,7 +1268,7 @@ static void mptcp_retransmit_handler(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - if (atomic64_read(&msk->snd_una) == msk->write_seq) { + if (atomic64_read(&msk->snd_una) == READ_ONCE(msk->write_seq)) { mptcp_stop_timer(sk); } else { set_bit(MPTCP_WORK_RTX, &msk->flags); @@ -1172,6 +1357,29 @@ static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) return 0; } +static void pm_work(struct mptcp_sock *msk) +{ + struct mptcp_pm_data *pm = &msk->pm; + + spin_lock_bh(&msk->pm.lock); + + pr_debug("msk=%p status=%x", msk, pm->status); + if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { + pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); + mptcp_pm_nl_add_addr_received(msk); + } + if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { + pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); + mptcp_pm_nl_fully_established(msk); + } + if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { + pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); + mptcp_pm_nl_subflow_established(msk); + } + + spin_unlock_bh(&msk->pm.lock); +} + static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); @@ -1185,12 +1393,18 @@ static void mptcp_worker(struct work_struct *work) lock_sock(sk); mptcp_clean_una(sk); + mptcp_check_data_fin_ack(sk); __mptcp_flush_join_list(msk); __mptcp_move_skbs(msk); + if (msk->pm.status) + pm_work(msk); + if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) mptcp_check_for_eof(msk); + mptcp_check_data_fin(sk); + if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) goto unlock; @@ -1283,7 +1497,12 @@ static int mptcp_init_sock(struct sock *sk) if (ret) return ret; + ret = __mptcp_socket_create(mptcp_sk(sk)); + if (ret) + return ret; + sk_sockets_allocated_inc(sk); + sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2]; return 0; @@ -1308,8 +1527,7 @@ static void mptcp_cancel_work(struct sock *sk) sock_put(sk); } -static void mptcp_subflow_shutdown(struct sock *ssk, int how, - bool data_fin_tx_enable, u64 data_fin_tx_seq) +static void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) { lock_sock(ssk); @@ -1322,36 +1540,84 @@ static void mptcp_subflow_shutdown(struct sock *ssk, int how, tcp_disconnect(ssk, O_NONBLOCK); break; default: - if (data_fin_tx_enable) { - struct mptcp_subflow_context *subflow; - - subflow = mptcp_subflow_ctx(ssk); - subflow->data_fin_tx_seq = data_fin_tx_seq; - subflow->data_fin_tx_enable = 1; + if (__mptcp_check_fallback(mptcp_sk(sk))) { + pr_debug("Fallback"); + ssk->sk_shutdown |= how; + tcp_shutdown(ssk, how); + } else { + pr_debug("Sending DATA_FIN on subflow %p", ssk); + mptcp_set_timeout(sk, ssk); + tcp_send_ack(ssk); } - - ssk->sk_shutdown |= how; - tcp_shutdown(ssk, how); break; } - /* Wake up anyone sleeping in poll. */ - ssk->sk_state_change(ssk); release_sock(ssk); } -/* Called with msk lock held, releases such lock before returning */ +static const unsigned char new_state[16] = { + /* current state: new state: action: */ + [0 /* (Invalid) */] = TCP_CLOSE, + [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, + [TCP_SYN_SENT] = TCP_CLOSE, + [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, + [TCP_FIN_WAIT1] = TCP_FIN_WAIT1, + [TCP_FIN_WAIT2] = TCP_FIN_WAIT2, + [TCP_TIME_WAIT] = TCP_CLOSE, /* should not happen ! */ + [TCP_CLOSE] = TCP_CLOSE, + [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, + [TCP_LAST_ACK] = TCP_LAST_ACK, + [TCP_LISTEN] = TCP_CLOSE, + [TCP_CLOSING] = TCP_CLOSING, + [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ +}; + +static int mptcp_close_state(struct sock *sk) +{ + int next = (int)new_state[sk->sk_state]; + int ns = next & TCP_STATE_MASK; + + inet_sk_state_store(sk, ns); + + return next & TCP_ACTION_FIN; +} + static void mptcp_close(struct sock *sk, long timeout) { struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); LIST_HEAD(conn_list); - u64 data_fin_tx_seq; lock_sock(sk); + sk->sk_shutdown = SHUTDOWN_MASK; + + if (sk->sk_state == TCP_LISTEN) { + inet_sk_state_store(sk, TCP_CLOSE); + goto cleanup; + } else if (sk->sk_state == TCP_CLOSE) { + goto cleanup; + } + + if (__mptcp_check_fallback(msk)) { + goto update_state; + } else if (mptcp_close_state(sk)) { + pr_debug("Sending DATA_FIN sk=%p", sk); + WRITE_ONCE(msk->write_seq, msk->write_seq + 1); + WRITE_ONCE(msk->snd_data_fin_enable, 1); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + + mptcp_subflow_shutdown(sk, tcp_sk, SHUTDOWN_MASK); + } + } + + sk_stream_wait_close(sk, timeout); +update_state: inet_sk_state_store(sk, TCP_CLOSE); +cleanup: /* be sure to always acquire the join list lock, to sync vs * mptcp_finish_join(). */ @@ -1360,22 +1626,16 @@ static void mptcp_close(struct sock *sk, long timeout) spin_unlock_bh(&msk->join_list_lock); list_splice_init(&msk->conn_list, &conn_list); - data_fin_tx_seq = msk->write_seq; - __mptcp_clear_xmit(sk); release_sock(sk); list_for_each_entry_safe(subflow, tmp, &conn_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - - subflow->data_fin_tx_seq = data_fin_tx_seq; - subflow->data_fin_tx_enable = 1; __mptcp_close_ssk(sk, ssk, subflow, timeout); } mptcp_cancel_work(sk); - mptcp_pm_close(msk); __skb_queue_purge(&sk->sk_receive_queue); @@ -1447,20 +1707,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk->local_key = subflow_req->local_key; msk->token = subflow_req->token; msk->subflow = NULL; - - if (unlikely(mptcp_token_new_accept(subflow_req->token, nsk))) { - nsk->sk_state = TCP_CLOSE; - bh_unlock_sock(nsk); - - /* we can't call into mptcp_close() here - possible BH context - * free the sock directly. - * sk_clone_lock() sets nsk refcnt to two, hence call sk_free() - * too. - */ - sk_common_release(nsk); - sk_free(nsk); - return NULL; - } + WRITE_ONCE(msk->fully_established, false); msk->write_seq = subflow_req->idsn + 1; atomic64_set(&msk->snd_una, msk->write_seq); @@ -1482,6 +1729,22 @@ struct sock *mptcp_sk_clone(const struct sock *sk, return nsk; } +void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) +{ + const struct tcp_sock *tp = tcp_sk(ssk); + + msk->rcvq_space.copied = 0; + msk->rcvq_space.rtt_us = 0; + + msk->rcvq_space.time = tp->tcp_mstamp; + + /* initial rcv_space offering made to peer */ + msk->rcvq_space.space = min_t(u32, tp->rcv_wnd, + TCP_INIT_CWND * tp->advmss); + if (msk->rcvq_space.space == 0) + msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; +} + static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, bool kern) { @@ -1501,7 +1764,6 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, return NULL; pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk)); - if (sk_is_mptcp(newsk)) { struct mptcp_subflow_context *subflow; struct sock *new_mptcp_sock; @@ -1529,8 +1791,8 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, newsk = new_mptcp_sock; mptcp_copy_inaddrs(newsk, ssk); list_add(&subflow->node, &msk->conn_list); - inet_sk_state_store(newsk, TCP_ESTABLISHED); + mptcp_rcv_space_init(msk, ssk); bh_unlock_sock(new_mptcp_sock); __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK); @@ -1547,21 +1809,82 @@ static void mptcp_destroy(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - mptcp_token_destroy(msk->token); + mptcp_token_destroy(msk); if (msk->cached_ext) __skb_ext_put(msk->cached_ext); sk_sockets_allocated_dec(sk); } +static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct sock *sk = (struct sock *)msk; + struct socket *ssock; + int ret; + + switch (optname) { + case SO_REUSEPORT: + case SO_REUSEADDR: + lock_sock(sk); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + release_sock(sk); + return -EINVAL; + } + + ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen); + if (ret == 0) { + if (optname == SO_REUSEPORT) + sk->sk_reuseport = ssock->sk->sk_reuseport; + else if (optname == SO_REUSEADDR) + sk->sk_reuse = ssock->sk->sk_reuse; + } + release_sock(sk); + return ret; + } + + return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen); +} + +static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct sock *sk = (struct sock *)msk; + int ret = -EOPNOTSUPP; + struct socket *ssock; + + switch (optname) { + case IPV6_V6ONLY: + lock_sock(sk); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + release_sock(sk); + return -EINVAL; + } + + ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen); + if (ret == 0) + sk->sk_ipv6only = ssock->sk->sk_ipv6only; + + release_sock(sk); + break; + } + + return ret; +} + static int mptcp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *ssock; + struct sock *ssk; pr_debug("msk=%p", msk); + if (level == SOL_SOCKET) + return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen); + /* @@ the meaning of setsockopt() when the socket is connected and * there are multiple subflows is not yet defined. It is up to the * MPTCP-level socket to configure the subflows until the subflow @@ -1569,11 +1892,13 @@ static int mptcp_setsockopt(struct sock *sk, int level, int optname, * to the one remaining subflow. */ lock_sock(sk); - ssock = __mptcp_tcp_fallback(msk); + ssk = __mptcp_tcp_fallback(msk); release_sock(sk); - if (ssock) - return tcp_setsockopt(ssock->sk, level, optname, optval, - optlen); + if (ssk) + return tcp_setsockopt(ssk, level, optname, optval, optlen); + + if (level == SOL_IPV6) + return mptcp_setsockopt_v6(msk, optname, optval, optlen); return -EOPNOTSUPP; } @@ -1582,7 +1907,7 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *option) { struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *ssock; + struct sock *ssk; pr_debug("msk=%p", msk); @@ -1593,11 +1918,10 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname, * to the one remaining subflow. */ lock_sock(sk); - ssock = __mptcp_tcp_fallback(msk); + ssk = __mptcp_tcp_fallback(msk); release_sock(sk); - if (ssock) - return tcp_getsockopt(ssock->sk, level, optname, optval, - option); + if (ssk) + return tcp_getsockopt(ssk, level, optname, optval, option); return -EOPNOTSUPP; } @@ -1636,6 +1960,20 @@ static void mptcp_release_cb(struct sock *sk) } } +static int mptcp_hash(struct sock *sk) +{ + /* should never be called, + * we hash the TCP subflows not the master socket + */ + WARN_ON_ONCE(1); + return 0; +} + +static void mptcp_unhash(struct sock *sk) +{ + /* called from sk_common_release(), but nothing to do here */ +} + static int mptcp_get_port(struct sock *sk, unsigned short snum) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -1660,32 +1998,26 @@ void mptcp_finish_connect(struct sock *ssk) sk = subflow->conn; msk = mptcp_sk(sk); - if (!subflow->mp_capable) { - MPTCP_INC_STATS(sock_net(sk), - MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); - return; - } - pr_debug("msk=%p, token=%u", sk, subflow->token); mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq); ack_seq++; subflow->map_seq = ack_seq; subflow->map_subflow_seq = 1; - subflow->rel_write_seq = 1; /* the socket is not connected yet, no msk/subflow ops can access/race * accessing the field below */ WRITE_ONCE(msk->remote_key, subflow->remote_key); WRITE_ONCE(msk->local_key, subflow->local_key); - WRITE_ONCE(msk->token, subflow->token); WRITE_ONCE(msk->write_seq, subflow->idsn + 1); WRITE_ONCE(msk->ack_seq, ack_seq); WRITE_ONCE(msk->can_ack, 1); atomic64_set(&msk->snd_una, msk->write_seq); mptcp_pm_new_connection(msk, 0); + + mptcp_rcv_space_init(msk, ssk); } static void mptcp_sock_graft(struct sock *sk, struct socket *parent) @@ -1708,7 +2040,7 @@ bool mptcp_finish_join(struct sock *sk) pr_debug("msk=%p, subflow=%p", msk, subflow); /* mptcp socket already closing? */ - if (inet_sk_state_load(parent) != TCP_ESTABLISHED) + if (!mptcp_is_fully_established(parent)) return false; if (!msk->pm.server_side) @@ -1761,8 +2093,8 @@ static struct proto mptcp_prot = { .sendmsg = mptcp_sendmsg, .recvmsg = mptcp_recvmsg, .release_cb = mptcp_release_cb, - .hash = inet_hash, - .unhash = inet_unhash, + .hash = mptcp_hash, + .unhash = mptcp_unhash, .get_port = mptcp_get_port, .sockets_allocated = &mptcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, @@ -1771,6 +2103,7 @@ static struct proto mptcp_prot = { .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_mem = sysctl_tcp_mem, .obj_size = sizeof(struct mptcp_sock), + .slab_flags = SLAB_TYPESAFE_BY_RCU, .no_autobind = true, }; @@ -1781,9 +2114,9 @@ static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) int err; lock_sock(sock->sk); - ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); - if (IS_ERR(ssock)) { - err = PTR_ERR(ssock); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + err = -EINVAL; goto unlock; } @@ -1796,10 +2129,18 @@ unlock: return err; } +static void mptcp_subflow_early_fallback(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow) +{ + subflow->request_mptcp = 0; + __mptcp_do_fallback(msk); +} + static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct mptcp_subflow_context *subflow; struct socket *ssock; int err; @@ -1812,19 +2153,24 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, goto do_connect; } - ssock = __mptcp_socket_create(msk, TCP_SYN_SENT); - if (IS_ERR(ssock)) { - err = PTR_ERR(ssock); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + err = -EINVAL; goto unlock; } + mptcp_token_destroy(msk); + inet_sk_state_store(sock->sk, TCP_SYN_SENT); + subflow = mptcp_subflow_ctx(ssock->sk); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of * TCP option space. */ if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) - mptcp_subflow_ctx(ssock->sk)->request_mptcp = 0; + mptcp_subflow_early_fallback(msk, subflow); #endif + if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) + mptcp_subflow_early_fallback(msk, subflow); do_connect: err = ssock->ops->connect(ssock, uaddr, addr_len, flags); @@ -1843,42 +2189,6 @@ unlock: return err; } -static int mptcp_v4_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) -{ - if (sock->sk->sk_prot == &tcp_prot) { - /* we are being invoked from __sys_accept4, after - * mptcp_accept() has just accepted a non-mp-capable - * flow: sk is a tcp_sk, not an mptcp one. - * - * Hand the socket over to tcp so all further socket ops - * bypass mptcp. - */ - sock->ops = &inet_stream_ops; - } - - return inet_getname(sock, uaddr, peer); -} - -#if IS_ENABLED(CONFIG_MPTCP_IPV6) -static int mptcp_v6_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) -{ - if (sock->sk->sk_prot == &tcpv6_prot) { - /* we are being invoked from __sys_accept4 after - * mptcp_accept() has accepted a non-mp-capable - * subflow: sk is a tcp_sk, not mptcp. - * - * Hand the socket over to tcp so all further - * socket ops bypass mptcp. - */ - sock->ops = &inet6_stream_ops; - } - - return inet6_getname(sock, uaddr, peer); -} -#endif - static int mptcp_listen(struct socket *sock, int backlog) { struct mptcp_sock *msk = mptcp_sk(sock->sk); @@ -1888,12 +2198,14 @@ static int mptcp_listen(struct socket *sock, int backlog) pr_debug("msk=%p", msk); lock_sock(sock->sk); - ssock = __mptcp_socket_create(msk, TCP_LISTEN); - if (IS_ERR(ssock)) { - err = PTR_ERR(ssock); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + err = -EINVAL; goto unlock; } + mptcp_token_destroy(msk); + inet_sk_state_store(sock->sk, TCP_LISTEN); sock_set_flag(sock->sk, SOCK_RCU_FREE); err = ssock->ops->listen(ssock, backlog); @@ -1906,15 +2218,6 @@ unlock: return err; } -static bool is_tcp_proto(const struct proto *p) -{ -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - return p == &tcp_prot || p == &tcpv6_prot; -#else - return p == &tcp_prot; -#endif -} - static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { @@ -1932,11 +2235,12 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (!ssock) goto unlock_fail; + clear_bit(MPTCP_DATA_READY, &msk->flags); sock_hold(ssock->sk); release_sock(sock->sk); err = ssock->ops->accept(sock, newsock, flags, kern); - if (err == 0 && !is_tcp_proto(newsock->sk->sk_prot)) { + if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) { struct mptcp_sock *msk = mptcp_sk(newsock->sk); struct mptcp_subflow_context *subflow; @@ -1944,7 +2248,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, * This is needed so NOSPACE flag can be set from tcp stack. */ __mptcp_flush_join_list(msk); - list_for_each_entry(subflow, &msk->conn_list, node) { + mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (!ssk->sk_socket) @@ -1952,6 +2256,8 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, } } + if (inet_csk_listen_poll(ssock->sk)) + set_bit(MPTCP_DATA_READY, &msk->flags); sock_put(ssock->sk); return err; @@ -1960,39 +2266,36 @@ unlock_fail: return -EINVAL; } +static __poll_t mptcp_check_readable(struct mptcp_sock *msk) +{ + return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM : + 0; +} + static __poll_t mptcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { struct sock *sk = sock->sk; struct mptcp_sock *msk; - struct socket *ssock; __poll_t mask = 0; + int state; msk = mptcp_sk(sk); - lock_sock(sk); - ssock = __mptcp_tcp_fallback(msk); - if (!ssock) - ssock = __mptcp_nmpc_socket(msk); - if (ssock) { - mask = ssock->ops->poll(file, ssock, wait); - release_sock(sk); - return mask; - } - - release_sock(sk); sock_poll_wait(file, sock, wait); - lock_sock(sk); - if (test_bit(MPTCP_DATA_READY, &msk->flags)) - mask = EPOLLIN | EPOLLRDNORM; - if (sk_stream_is_writeable(sk) && - test_bit(MPTCP_SEND_SPACE, &msk->flags)) - mask |= EPOLLOUT | EPOLLWRNORM; + state = inet_sk_state_load(sk); + if (state == TCP_LISTEN) + return mptcp_check_readable(msk); + + if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { + mask |= mptcp_check_readable(msk); + if (sk_stream_is_writeable(sk) && + test_bit(MPTCP_SEND_SPACE, &msk->flags)) + mask |= EPOLLOUT | EPOLLWRNORM; + } if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - release_sock(sk); - return mask; } @@ -2000,23 +2303,13 @@ static int mptcp_shutdown(struct socket *sock, int how) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct mptcp_subflow_context *subflow; - struct socket *ssock; int ret = 0; pr_debug("sk=%p, how=%d", msk, how); lock_sock(sock->sk); - ssock = __mptcp_tcp_fallback(msk); - if (ssock) { - release_sock(sock->sk); - return inet_shutdown(ssock, how); - } - - if (how == SHUT_WR || how == SHUT_RDWR) - inet_sk_state_store(sock->sk, TCP_FIN_WAIT1); how++; - if ((how & ~SHUTDOWN_MASK) || !how) { ret = -EINVAL; goto out_unlock; @@ -2030,13 +2323,36 @@ static int mptcp_shutdown(struct socket *sock, int how) sock->state = SS_CONNECTED; } - __mptcp_flush_join_list(msk); - mptcp_for_each_subflow(msk, subflow) { - struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + /* If we've already sent a FIN, or it's a closed state, skip this. */ + if (__mptcp_check_fallback(msk)) { + if (how == SHUT_WR || how == SHUT_RDWR) + inet_sk_state_store(sock->sk, TCP_FIN_WAIT1); - mptcp_subflow_shutdown(tcp_sk, how, 1, msk->write_seq); + mptcp_for_each_subflow(msk, subflow) { + struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + + mptcp_subflow_shutdown(sock->sk, tcp_sk, how); + } + } else if ((how & SEND_SHUTDOWN) && + ((1 << sock->sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_SYN_SENT | + TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) && + mptcp_close_state(sock->sk)) { + __mptcp_flush_join_list(msk); + + WRITE_ONCE(msk->write_seq, msk->write_seq + 1); + WRITE_ONCE(msk->snd_data_fin_enable, 1); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + + mptcp_subflow_shutdown(sock->sk, tcp_sk, how); + } } + /* Wake up anyone sleeping in poll. */ + sock->sk->sk_state_change(sock->sk); + out_unlock: release_sock(sock->sk); @@ -2051,7 +2367,7 @@ static const struct proto_ops mptcp_stream_ops = { .connect = mptcp_stream_connect, .socketpair = sock_no_socketpair, .accept = mptcp_stream_accept, - .getname = mptcp_v4_getname, + .getname = inet_getname, .poll = mptcp_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, @@ -2063,10 +2379,6 @@ static const struct proto_ops mptcp_stream_ops = { .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = inet_sendpage, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, -#endif }; static struct inet_protosw mptcp_protosw = { @@ -2077,7 +2389,7 @@ static struct inet_protosw mptcp_protosw = { .flags = INET_PROTOSW_ICSK, }; -void mptcp_proto_init(void) +void __init mptcp_proto_init(void) { mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; @@ -2086,6 +2398,7 @@ void mptcp_proto_init(void) mptcp_subflow_init(); mptcp_pm_init(); + mptcp_token_init(); if (proto_register(&mptcp_prot, 1) != 0) panic("Failed to register MPTCP proto.\n"); @@ -2104,7 +2417,7 @@ static const struct proto_ops mptcp_v6_stream_ops = { .connect = mptcp_stream_connect, .socketpair = sock_no_socketpair, .accept = mptcp_stream_accept, - .getname = mptcp_v6_getname, + .getname = inet6_getname, .poll = mptcp_poll, .ioctl = inet6_ioctl, .gettstamp = sock_gettstamp, @@ -2118,8 +2431,6 @@ static const struct proto_ops mptcp_v6_stream_ops = { .sendpage = inet_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, #endif }; @@ -2139,7 +2450,7 @@ static struct inet_protosw mptcp_v6_protosw = { .flags = INET_PROTOSW_ICSK, }; -int mptcp_proto_v6_init(void) +int __init mptcp_proto_v6_init(void) { int err; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index c6eeaf3e8dcb..60b27d44c184 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -89,6 +89,7 @@ #define MPTCP_SEND_SPACE 1 #define MPTCP_WORK_RTX 2 #define MPTCP_WORK_EOF 3 +#define MPTCP_FALLBACK_DONE 4 struct mptcp_options_received { u64 sndr_key; @@ -173,8 +174,6 @@ struct mptcp_pm_data { u8 local_addr_max; u8 subflows_max; u8 status; - - struct work_struct work; }; struct mptcp_data_frag { @@ -194,11 +193,15 @@ struct mptcp_sock { u64 remote_key; u64 write_seq; u64 ack_seq; + u64 rcv_data_fin_seq; atomic64_t snd_una; unsigned long timer_ival; u32 token; unsigned long flags; bool can_ack; + bool fully_established; + bool rcv_data_fin; + bool snd_data_fin_enable; spinlock_t join_list_lock; struct work_struct work; struct list_head conn_list; @@ -208,6 +211,12 @@ struct mptcp_sock { struct socket *subflow; /* outgoing connect/listener/!mp_capable */ struct sock *first; struct mptcp_pm_data pm; + struct { + u32 space; /* bytes copied in last measurement window */ + u32 copied; /* bytes copied in this measurement window */ + u64 time; /* start time of measurement window */ + u64 rtt_us; /* last maximum rtt of subflows */ + } rcvq_space; }; #define mptcp_for_each_subflow(__msk, __subflow) \ @@ -250,6 +259,7 @@ struct mptcp_subflow_request_sock { u32 local_nonce; u32 remote_nonce; struct mptcp_sock *msk; + struct hlist_nulls_node token_node; }; static inline struct mptcp_subflow_request_sock * @@ -284,10 +294,8 @@ struct mptcp_subflow_context { backup : 1, data_avail : 1, rx_eof : 1, - data_fin_tx_enable : 1, use_64bit_ack : 1, /* Set when we received a 64-bit DSN */ can_ack : 1; /* only after processing the remote a key */ - u64 data_fin_tx_seq; u32 remote_nonce; u64 thmac; u32 local_nonce; @@ -336,8 +344,10 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) } int mptcp_is_enabled(struct net *net); +void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, + struct mptcp_options_received *mp_opt); bool mptcp_subflow_data_available(struct sock *sk); -void mptcp_subflow_init(void); +void __init mptcp_subflow_init(void); /* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, int ifindex, @@ -355,14 +365,9 @@ static inline void mptcp_subflow_tcp_fallback(struct sock *sk, inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops; } -extern const struct inet_connection_sock_af_ops ipv4_specific; +void __init mptcp_proto_init(void); #if IS_ENABLED(CONFIG_MPTCP_IPV6) -extern const struct inet_connection_sock_af_ops ipv6_specific; -#endif - -void mptcp_proto_init(void); -#if IS_ENABLED(CONFIG_MPTCP_IPV6) -int mptcp_proto_v6_init(void); +int __init mptcp_proto_v6_init(void); #endif struct sock *mptcp_sk_clone(const struct sock *sk, @@ -372,36 +377,41 @@ void mptcp_get_options(const struct sk_buff *skb, struct mptcp_options_received *mp_opt); void mptcp_finish_connect(struct sock *sk); +static inline bool mptcp_is_fully_established(struct sock *sk) +{ + return inet_sk_state_load(sk) == TCP_ESTABLISHED && + READ_ONCE(mptcp_sk(sk)->fully_established); +} +void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); void mptcp_data_acked(struct sock *sk); void mptcp_subflow_eof(struct sock *sk); +bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq); + +void __init mptcp_token_init(void); +static inline void mptcp_token_init_request(struct request_sock *req) +{ + mptcp_subflow_rsk(req)->token_node.pprev = NULL; +} int mptcp_token_new_request(struct request_sock *req); -void mptcp_token_destroy_request(u32 token); +void mptcp_token_destroy_request(struct request_sock *req); int mptcp_token_new_connect(struct sock *sk); -int mptcp_token_new_accept(u32 token, struct sock *conn); +void mptcp_token_accept(struct mptcp_subflow_request_sock *r, + struct mptcp_sock *msk); +bool mptcp_token_exists(u32 token); struct mptcp_sock *mptcp_token_get_sock(u32 token); -void mptcp_token_destroy(u32 token); +struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot, + long *s_num); +void mptcp_token_destroy(struct mptcp_sock *msk); void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn); -static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) -{ - /* we might consider a faster version that computes the key as a - * hash of some information available in the MPTCP socket. Use - * random data at the moment, as it's probably the safest option - * in case multiple sockets are opened in different namespaces at - * the same time. - */ - get_random_bytes(key, sizeof(u64)); - mptcp_crypto_key_sha(*key, token, idsn); -} void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac); -void mptcp_pm_init(void); +void __init mptcp_pm_init(void); void mptcp_pm_data_init(struct mptcp_sock *msk); -void mptcp_pm_close(struct mptcp_sock *msk); void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side); void mptcp_pm_fully_established(struct mptcp_sock *msk); bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); @@ -433,7 +443,7 @@ bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_addr_info *saddr); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); -void mptcp_pm_nl_init(void); +void __init mptcp_pm_nl_init(void); void mptcp_pm_nl_data_init(struct mptcp_sock *msk); void mptcp_pm_nl_fully_established(struct mptcp_sock *msk); void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk); @@ -454,4 +464,66 @@ static inline bool before64(__u64 seq1, __u64 seq2) void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops); +static inline bool __mptcp_check_fallback(struct mptcp_sock *msk) +{ + return test_bit(MPTCP_FALLBACK_DONE, &msk->flags); +} + +static inline bool mptcp_check_fallback(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + + return __mptcp_check_fallback(msk); +} + +static inline void __mptcp_do_fallback(struct mptcp_sock *msk) +{ + if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) { + pr_debug("TCP fallback already done (msk=%p)", msk); + return; + } + set_bit(MPTCP_FALLBACK_DONE, &msk->flags); +} + +static inline void mptcp_do_fallback(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + + __mptcp_do_fallback(msk); +} + +#define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)", __func__, a) + +static inline bool subflow_simultaneous_connect(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct sock *parent = subflow->conn; + + return sk->sk_state == TCP_ESTABLISHED && + !mptcp_sk(parent)->pm.server_side && + !subflow->conn_finished; +} + +#ifdef CONFIG_SYN_COOKIES +void subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req, + struct sk_buff *skb); +bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req, + struct sk_buff *skb); +void __init mptcp_join_cookie_init(void); +#else +static inline void +subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req, + struct sk_buff *skb) {} +static inline bool +mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req, + struct sk_buff *skb) +{ + return false; +} + +static inline void mptcp_join_cookie_init(void) {} +#endif + #endif /* __MPTCP_PROTOCOL_H */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 3838a0b3a21f..96f4f2fe50ad 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -29,40 +29,6 @@ static void SUBFLOW_REQ_INC_STATS(struct request_sock *req, MPTCP_INC_STATS(sock_net(req_to_sk(req)), field); } -static int subflow_rebuild_header(struct sock *sk) -{ - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - int local_id, err = 0; - - if (subflow->request_mptcp && !subflow->token) { - pr_debug("subflow=%p", sk); - err = mptcp_token_new_connect(sk); - } else if (subflow->request_join && !subflow->local_nonce) { - struct mptcp_sock *msk = (struct mptcp_sock *)subflow->conn; - - pr_debug("subflow=%p", sk); - - do { - get_random_bytes(&subflow->local_nonce, sizeof(u32)); - } while (!subflow->local_nonce); - - if (subflow->local_id) - goto out; - - local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)sk); - if (local_id < 0) - return -EINVAL; - - subflow->local_id = local_id; - } - -out: - if (err) - return err; - - return subflow->icsk_af_ops->rebuild_header(sk); -} - static void subflow_req_destructor(struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); @@ -72,8 +38,7 @@ static void subflow_req_destructor(struct request_sock *req) if (subflow_req->msk) sock_put((struct sock *)subflow_req->msk); - if (subflow_req->mp_capable) - mptcp_token_destroy_request(subflow_req->token); + mptcp_token_destroy_request(req); tcp_request_sock_ops.destructor(req); } @@ -88,6 +53,12 @@ static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2, mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); } +static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk) +{ + return mptcp_is_fully_established((void *)msk) && + READ_ONCE(msk->pm.accept_subflow); +} + /* validate received token and create truncated hmac and nonce for SYN-ACK */ static struct mptcp_sock *subflow_token_join_request(struct request_sock *req, const struct sk_buff *skb) @@ -120,30 +91,43 @@ static struct mptcp_sock *subflow_token_join_request(struct request_sock *req, return msk; } -static void subflow_init_req(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb) +static int __subflow_init_req(struct request_sock *req, const struct sock *sk_listener) { - struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); - struct mptcp_options_received mp_opt; - - pr_debug("subflow_req=%p, listener=%p", subflow_req, listener); - - mptcp_get_options(skb, &mp_opt); subflow_req->mp_capable = 0; subflow_req->mp_join = 0; subflow_req->msk = NULL; + mptcp_token_init_request(req); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of * TCP option space. */ if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) - return; + return -EINVAL; #endif + return 0; +} + +static void subflow_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) +{ + struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + struct mptcp_options_received mp_opt; + int ret; + + pr_debug("subflow_req=%p, listener=%p", subflow_req, listener); + + ret = __subflow_init_req(req, sk_listener); + if (ret) + return; + + mptcp_get_options(skb, &mp_opt); + if (mp_opt.mp_capable) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); @@ -154,13 +138,33 @@ static void subflow_init_req(struct request_sock *req, } if (mp_opt.mp_capable && listener->request_mptcp) { - int err; + int err, retries = 4; + + subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; +again: + do { + get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key)); + } while (subflow_req->local_key == 0); + + if (unlikely(req->syncookie)) { + mptcp_crypto_key_sha(subflow_req->local_key, + &subflow_req->token, + &subflow_req->idsn); + if (mptcp_token_exists(subflow_req->token)) { + if (retries-- > 0) + goto again; + } else { + subflow_req->mp_capable = 1; + } + return; + } err = mptcp_token_new_request(req); if (err == 0) subflow_req->mp_capable = 1; + else if (retries-- > 0) + goto again; - subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; } else if (mp_opt.mp_join && listener->request_mptcp) { subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; subflow_req->mp_join = 1; @@ -169,11 +173,60 @@ static void subflow_init_req(struct request_sock *req, subflow_req->token = mp_opt.token; subflow_req->remote_nonce = mp_opt.nonce; subflow_req->msk = subflow_token_join_request(req, skb); + + if (unlikely(req->syncookie) && subflow_req->msk) { + if (mptcp_can_accept_new_subflow(subflow_req->msk)) + subflow_init_req_cookie_join_save(subflow_req, skb); + } + pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token, subflow_req->remote_nonce, subflow_req->msk); } } +int mptcp_subflow_init_cookie_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) +{ + struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + struct mptcp_options_received mp_opt; + int err; + + err = __subflow_init_req(req, sk_listener); + if (err) + return err; + + mptcp_get_options(skb, &mp_opt); + + if (mp_opt.mp_capable && mp_opt.mp_join) + return -EINVAL; + + if (mp_opt.mp_capable && listener->request_mptcp) { + if (mp_opt.sndr_key == 0) + return -EINVAL; + + subflow_req->local_key = mp_opt.rcvr_key; + err = mptcp_token_new_request(req); + if (err) + return err; + + subflow_req->mp_capable = 1; + subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1; + } else if (mp_opt.mp_join && listener->request_mptcp) { + if (!mptcp_token_join_cookie_init_state(subflow_req, skb)) + return -EINVAL; + + if (mptcp_can_accept_new_subflow(subflow_req->msk)) + subflow_req->mp_join = 1; + + subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1; + } + + return 0; +} +EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req); + static void subflow_v4_init_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) @@ -222,7 +275,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_options_received mp_opt; struct sock *parent = subflow->conn; - struct tcp_sock *tp = tcp_sk(sk); subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); @@ -235,46 +287,40 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) if (subflow->conn_finished) return; + subflow->rel_write_seq = 1; subflow->conn_finished = 1; + subflow->ssn_offset = TCP_SKB_CB(skb)->seq; + pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset); mptcp_get_options(skb, &mp_opt); - if (subflow->request_mptcp && mp_opt.mp_capable) { + if (subflow->request_mptcp) { + if (!mp_opt.mp_capable) { + MPTCP_INC_STATS(sock_net(sk), + MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); + mptcp_do_fallback(sk); + pr_fallback(mptcp_sk(subflow->conn)); + goto fallback; + } + subflow->mp_capable = 1; subflow->can_ack = 1; subflow->remote_key = mp_opt.sndr_key; pr_debug("subflow=%p, remote_key=%llu", subflow, subflow->remote_key); - } else if (subflow->request_join && mp_opt.mp_join) { - subflow->mp_join = 1; + mptcp_finish_connect(sk); + } else if (subflow->request_join) { + u8 hmac[SHA256_DIGEST_SIZE]; + + if (!mp_opt.mp_join) + goto do_reset; + subflow->thmac = mp_opt.thmac; subflow->remote_nonce = mp_opt.nonce; pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow, subflow->thmac, subflow->remote_nonce); - } else if (subflow->request_mptcp) { - tp->is_mptcp = 0; - } - if (!tp->is_mptcp) - return; - - if (subflow->mp_capable) { - pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk), - subflow->remote_key); - mptcp_finish_connect(sk); - - if (skb) { - pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq); - subflow->ssn_offset = TCP_SKB_CB(skb)->seq; - } - } else if (subflow->mp_join) { - u8 hmac[SHA256_DIGEST_SIZE]; - - pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", - subflow, subflow->thmac, - subflow->remote_nonce); if (!subflow_thmac_valid(subflow)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC); - subflow->mp_join = 0; goto do_reset; } @@ -282,24 +328,26 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->local_nonce, subflow->remote_nonce, hmac); - memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN); - if (skb) - subflow->ssn_offset = TCP_SKB_CB(skb)->seq; - if (!mptcp_finish_join(sk)) goto do_reset; + subflow->mp_join = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); - } else { -do_reset: - tcp_send_active_reset(sk, GFP_ATOMIC); - tcp_done(sk); + } else if (mptcp_check_fallback(sk)) { +fallback: + mptcp_rcv_space_init(mptcp_sk(parent), sk); } + return; + +do_reset: + tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_done(sk); } -static struct request_sock_ops subflow_request_sock_ops; +struct request_sock_ops mptcp_subflow_request_sock_ops; +EXPORT_SYMBOL_GPL(mptcp_subflow_request_sock_ops); static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops; static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) @@ -312,7 +360,7 @@ static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) goto drop; - return tcp_conn_request(&subflow_request_sock_ops, + return tcp_conn_request(&mptcp_subflow_request_sock_ops, &subflow_request_sock_ipv4_ops, sk, skb); drop: @@ -337,7 +385,7 @@ static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (!ipv6_unicast_destination(skb)) goto drop; - return tcp_conn_request(&subflow_request_sock_ops, + return tcp_conn_request(&mptcp_subflow_request_sock_ops, &subflow_request_sock_ipv6_ops, sk, skb); drop: @@ -386,7 +434,7 @@ static void mptcp_sock_destruct(struct sock *sk) sock_orphan(sk); } - mptcp_token_destroy(mptcp_sk(sk)->token); + mptcp_token_destroy(mptcp_sk(sk)); inet_sock_destruct(sk); } @@ -421,6 +469,17 @@ static void subflow_drop_ctx(struct sock *ssk) kfree_rcu(ctx, rcu); } +void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, + struct mptcp_options_received *mp_opt) +{ + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + + subflow->remote_key = mp_opt->sndr_key; + subflow->fully_established = 1; + subflow->can_ack = 1; + WRITE_ONCE(msk->fully_established, true); +} + static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, @@ -444,7 +503,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, /* hopefully temporary handling for MP_JOIN+syncookie */ subflow_req = mptcp_subflow_rsk(req); - fallback_is_fatal = subflow_req->mp_join; + fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join; fallback = !tcp_rsk(req)->is_mptcp; if (fallback) goto create_child; @@ -472,6 +531,7 @@ create_msk: } else if (subflow_req->mp_join) { mptcp_get_options(skb, &mp_opt); if (!mp_opt.mp_join || + !mptcp_can_accept_new_subflow(subflow_req->msk) || !subflow_hmac_valid(req, &mp_opt)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); fallback = true; @@ -500,20 +560,25 @@ create_child: } if (ctx->mp_capable) { + /* this can't race with mptcp_close(), as the msk is + * not yet exposted to user-space + */ + inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED); + /* new mpc subflow takes ownership of the newly * created mptcp socket */ new_msk->sk_destruct = mptcp_sock_destruct; mptcp_pm_new_connection(mptcp_sk(new_msk), 1); + mptcp_token_accept(subflow_req, mptcp_sk(new_msk)); ctx->conn = new_msk; new_msk = NULL; /* with OoO packets we can reach here without ingress * mpc option */ - ctx->remote_key = mp_opt.sndr_key; - ctx->fully_established = mp_opt.mp_capable; - ctx->can_ack = mp_opt.mp_capable; + if (mp_opt.mp_capable) + mptcp_subflow_fully_established(ctx, &mp_opt); } else if (ctx->mp_join) { struct mptcp_sock *owner; @@ -548,9 +613,9 @@ out: dispose_child: subflow_drop_ctx(child); tcp_rsk(req)->drop_req = true; - tcp_send_active_reset(child, GFP_ATOMIC); inet_csk_prepare_for_destroy_sock(child); tcp_done(child); + req->rsk_ops->send_reset(sk, skb); /* The last child reference will be released by the caller */ return child; @@ -562,7 +627,8 @@ enum mapping_status { MAPPING_OK, MAPPING_INVALID, MAPPING_EMPTY, - MAPPING_DATA_FIN + MAPPING_DATA_FIN, + MAPPING_DUMMY }; static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq) @@ -614,7 +680,8 @@ static bool validate_mapping(struct sock *ssk, struct sk_buff *skb) return true; } -static enum mapping_status get_mapping_status(struct sock *ssk) +static enum mapping_status get_mapping_status(struct sock *ssk, + struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_ext *mpext; @@ -626,6 +693,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk) if (!skb) return MAPPING_EMPTY; + if (mptcp_check_fallback(ssk)) + return MAPPING_DUMMY; + mpext = mptcp_get_ext(skb); if (!mpext || !mpext->use_map) { if (!subflow->map_valid && !skb->len) { @@ -661,7 +731,8 @@ static enum mapping_status get_mapping_status(struct sock *ssk) if (mpext->data_fin == 1) { if (data_len == 1) { - pr_debug("DATA_FIN with no payload"); + mptcp_update_rcv_data_fin(msk, mpext->data_seq); + pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq); if (subflow->map_valid) { /* A DATA_FIN might arrive in a DSS * option before the previous mapping @@ -673,6 +744,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk) } else { return MAPPING_DATA_FIN; } + } else { + mptcp_update_rcv_data_fin(msk, mpext->data_seq + data_len); + pr_debug("DATA_FIN with mapping seq=%llu", mpext->data_seq + data_len); } /* Adjust for DATA_FIN using 1 byte of sequence space */ @@ -761,12 +835,22 @@ static bool subflow_check_data_avail(struct sock *ssk) u64 ack_seq; u64 old_ack; - status = get_mapping_status(ssk); + status = get_mapping_status(ssk, msk); pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status); if (status == MAPPING_INVALID) { ssk->sk_err = EBADMSG; goto fatal; } + if (status == MAPPING_DUMMY) { + __mptcp_do_fallback(msk); + skb = skb_peek(&ssk->sk_receive_queue); + subflow->map_valid = 1; + subflow->map_seq = READ_ONCE(msk->ack_seq); + subflow->map_data_len = skb->len; + subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - + subflow->ssn_offset; + return true; + } if (status != MAPPING_OK) return false; @@ -889,15 +973,20 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space) static void subflow_data_ready(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + u16 state = 1 << inet_sk_state_load(sk); struct sock *parent = subflow->conn; + struct mptcp_sock *msk; - if (!subflow->mp_capable && !subflow->mp_join) { - subflow->tcp_data_ready(sk); - + msk = mptcp_sk(parent); + if (state & TCPF_LISTEN) { + set_bit(MPTCP_DATA_READY, &msk->flags); parent->sk_data_ready(parent); return; } + WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable && + !subflow->mp_join && !(state & TCPF_CLOSE)); + if (mptcp_subflow_data_available(sk)) mptcp_data_ready(parent, sk); } @@ -974,19 +1063,34 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex, struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_subflow_context *subflow; struct sockaddr_storage addr; + int local_id = loc->id; struct socket *sf; + struct sock *ssk; u32 remote_token; int addrlen; int err; - if (sk->sk_state != TCP_ESTABLISHED) + if (!mptcp_is_fully_established(sk)) return -ENOTCONN; err = mptcp_subflow_create_socket(sk, &sf); if (err) return err; - subflow = mptcp_subflow_ctx(sf->sk); + ssk = sf->sk; + subflow = mptcp_subflow_ctx(ssk); + do { + get_random_bytes(&subflow->local_nonce, sizeof(u32)); + } while (!subflow->local_nonce); + + if (!local_id) { + err = mptcp_pm_get_local_id(msk, (struct sock_common *)ssk); + if (err < 0) + goto failed; + + local_id = err; + } + subflow->remote_key = msk->remote_key; subflow->local_key = msk->local_key; subflow->token = msk->token; @@ -997,15 +1101,16 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex, if (loc->family == AF_INET6) addrlen = sizeof(struct sockaddr_in6); #endif - sf->sk->sk_bound_dev_if = ifindex; + ssk->sk_bound_dev_if = ifindex; err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen); if (err) goto failed; mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL); - pr_debug("msk=%p remote_token=%u", msk, remote_token); + pr_debug("msk=%p remote_token=%u local_id=%d", msk, remote_token, + local_id); subflow->remote_token = remote_token; - subflow->local_id = loc->id; + subflow->local_id = local_id; subflow->request_join = 1; subflow->request_bkup = 1; mptcp_info2sockaddr(remote, &addr); @@ -1032,6 +1137,12 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) struct socket *sf; int err; + /* un-accepted server sockets can reach here - on bad configuration + * bail early to avoid greater trouble later + */ + if (unlikely(!sk->sk_socket)) + return -EINVAL; + err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP, &sf); if (err) @@ -1118,14 +1229,26 @@ static void subflow_state_change(struct sock *sk) __subflow_state_change(sk); + if (subflow_simultaneous_connect(sk)) { + mptcp_do_fallback(sk); + mptcp_rcv_space_init(mptcp_sk(parent), sk); + pr_fallback(mptcp_sk(parent)); + subflow->conn_finished = 1; + if (inet_sk_state_load(parent) == TCP_SYN_SENT) { + inet_sk_state_store(parent, TCP_ESTABLISHED); + parent->sk_state_change(parent); + } + } + /* as recvmsg() does not acquire the subflow socket for ssk selection * a fin packet carrying a DSS can be unnoticed if we don't trigger * the data available machinery here. */ - if (subflow->mp_capable && mptcp_subflow_data_available(sk)) + if (mptcp_subflow_data_available(sk)) mptcp_data_ready(parent, sk); - if (!(parent->sk_shutdown & RCV_SHUTDOWN) && + if (__mptcp_check_fallback(mptcp_sk(parent)) && + !(parent->sk_shutdown & RCV_SHUTDOWN) && !subflow->rx_eof && subflow_is_done(sk)) { subflow->rx_eof = 1; mptcp_subflow_eof(parent); @@ -1255,10 +1378,10 @@ static int subflow_ops_init(struct request_sock_ops *subflow_ops) return 0; } -void mptcp_subflow_init(void) +void __init mptcp_subflow_init(void) { - subflow_request_sock_ops = tcp_request_sock_ops; - if (subflow_ops_init(&subflow_request_sock_ops) != 0) + mptcp_subflow_request_sock_ops = tcp_request_sock_ops; + if (subflow_ops_init(&mptcp_subflow_request_sock_ops) != 0) panic("MPTCP: failed to init subflow request sock ops\n"); subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; @@ -1268,7 +1391,6 @@ void mptcp_subflow_init(void) subflow_specific.conn_request = subflow_v4_conn_request; subflow_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_specific.sk_rx_dst_set = subflow_finish_connect; - subflow_specific.rebuild_header = subflow_rebuild_header; #if IS_ENABLED(CONFIG_MPTCP_IPV6) subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; @@ -1278,7 +1400,6 @@ void mptcp_subflow_init(void) subflow_v6_specific.conn_request = subflow_v6_conn_request; subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect; - subflow_v6_specific.rebuild_header = subflow_rebuild_header; subflow_v6m_specific = subflow_v6_specific; subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit; diff --git a/net/mptcp/syncookies.c b/net/mptcp/syncookies.c new file mode 100644 index 000000000000..abe0fd099746 --- /dev/null +++ b/net/mptcp/syncookies.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/skbuff.h> + +#include "protocol.h" + +/* Syncookies do not work for JOIN requests. + * + * Unlike MP_CAPABLE, where the ACK cookie contains the needed MPTCP + * options to reconstruct the initial syn state, MP_JOIN does not contain + * the token to obtain the mptcp socket nor the server-generated nonce + * that was used in the cookie SYN/ACK response. + * + * Keep a small best effort state table to store the syn/synack data, + * indexed by skb hash. + * + * A MP_JOIN SYN packet handled by syn cookies is only stored if the 32bit + * token matches a known mptcp connection that can still accept more subflows. + * + * There is no timeout handling -- state is only re-constructed + * when the TCP ACK passed the cookie validation check. + */ + +struct join_entry { + u32 token; + u32 remote_nonce; + u32 local_nonce; + u8 join_id; + u8 local_id; + u8 backup; + u8 valid; +}; + +#define COOKIE_JOIN_SLOTS 1024 + +static struct join_entry join_entries[COOKIE_JOIN_SLOTS] __cacheline_aligned_in_smp; +static spinlock_t join_entry_locks[COOKIE_JOIN_SLOTS] __cacheline_aligned_in_smp; + +static u32 mptcp_join_entry_hash(struct sk_buff *skb, struct net *net) +{ + u32 i = skb_get_hash(skb) ^ net_hash_mix(net); + + return i % ARRAY_SIZE(join_entries); +} + +static void mptcp_join_store_state(struct join_entry *entry, + const struct mptcp_subflow_request_sock *subflow_req) +{ + entry->token = subflow_req->token; + entry->remote_nonce = subflow_req->remote_nonce; + entry->local_nonce = subflow_req->local_nonce; + entry->backup = subflow_req->backup; + entry->join_id = subflow_req->remote_id; + entry->local_id = subflow_req->local_id; + entry->valid = 1; +} + +void subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req, + struct sk_buff *skb) +{ + struct net *net = read_pnet(&subflow_req->sk.req.ireq_net); + u32 i = mptcp_join_entry_hash(skb, net); + + /* No use in waiting if other cpu is already using this slot -- + * would overwrite the data that got stored. + */ + spin_lock_bh(&join_entry_locks[i]); + mptcp_join_store_state(&join_entries[i], subflow_req); + spin_unlock_bh(&join_entry_locks[i]); +} + +/* Called for a cookie-ack with MP_JOIN option present. + * Look up the saved state based on skb hash & check token matches msk + * in same netns. + * + * Caller will check msk can still accept another subflow. The hmac + * present in the cookie ACK mptcp option space will be checked later. + */ +bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req, + struct sk_buff *skb) +{ + struct net *net = read_pnet(&subflow_req->sk.req.ireq_net); + u32 i = mptcp_join_entry_hash(skb, net); + struct mptcp_sock *msk; + struct join_entry *e; + + e = &join_entries[i]; + + spin_lock_bh(&join_entry_locks[i]); + + if (e->valid == 0) { + spin_unlock_bh(&join_entry_locks[i]); + return false; + } + + e->valid = 0; + + msk = mptcp_token_get_sock(e->token); + if (!msk) { + spin_unlock_bh(&join_entry_locks[i]); + return false; + } + + /* If this fails, the token got re-used in the mean time by another + * mptcp socket in a different netns, i.e. entry is outdated. + */ + if (!net_eq(sock_net((struct sock *)msk), net)) + goto err_put; + + subflow_req->remote_nonce = e->remote_nonce; + subflow_req->local_nonce = e->local_nonce; + subflow_req->backup = e->backup; + subflow_req->remote_id = e->join_id; + subflow_req->token = e->token; + subflow_req->msk = msk; + spin_unlock_bh(&join_entry_locks[i]); + return true; + +err_put: + spin_unlock_bh(&join_entry_locks[i]); + sock_put((struct sock *)msk); + return false; +} + +void __init mptcp_join_cookie_init(void) +{ + int i; + + for (i = 0; i < COOKIE_JOIN_SLOTS; i++) + spin_lock_init(&join_entry_locks[i]); +} diff --git a/net/mptcp/token.c b/net/mptcp/token.c index 33352dd99d4d..8b47c4bb1c6b 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -24,7 +24,7 @@ #include <linux/kernel.h> #include <linux/module.h> -#include <linux/radix-tree.h> +#include <linux/memblock.h> #include <linux/ip.h> #include <linux/tcp.h> #include <net/sock.h> @@ -33,10 +33,67 @@ #include <net/mptcp.h> #include "protocol.h" -static RADIX_TREE(token_tree, GFP_ATOMIC); -static RADIX_TREE(token_req_tree, GFP_ATOMIC); -static DEFINE_SPINLOCK(token_tree_lock); -static int token_used __read_mostly; +#define TOKEN_MAX_RETRIES 4 +#define TOKEN_MAX_CHAIN_LEN 4 + +struct token_bucket { + spinlock_t lock; + int chain_len; + struct hlist_nulls_head req_chain; + struct hlist_nulls_head msk_chain; +}; + +static struct token_bucket *token_hash __read_mostly; +static unsigned int token_mask __read_mostly; + +static struct token_bucket *token_bucket(u32 token) +{ + return &token_hash[token & token_mask]; +} + +/* called with bucket lock held */ +static struct mptcp_subflow_request_sock * +__token_lookup_req(struct token_bucket *t, u32 token) +{ + struct mptcp_subflow_request_sock *req; + struct hlist_nulls_node *pos; + + hlist_nulls_for_each_entry_rcu(req, pos, &t->req_chain, token_node) + if (req->token == token) + return req; + return NULL; +} + +/* called with bucket lock held */ +static struct mptcp_sock * +__token_lookup_msk(struct token_bucket *t, u32 token) +{ + struct hlist_nulls_node *pos; + struct sock *sk; + + sk_nulls_for_each_rcu(sk, pos, &t->msk_chain) + if (mptcp_sk(sk)->token == token) + return mptcp_sk(sk); + return NULL; +} + +static bool __token_bucket_busy(struct token_bucket *t, u32 token) +{ + return !token || t->chain_len >= TOKEN_MAX_CHAIN_LEN || + __token_lookup_req(t, token) || __token_lookup_msk(t, token); +} + +static void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) +{ + /* we might consider a faster version that computes the key as a + * hash of some information available in the MPTCP socket. Use + * random data at the moment, as it's probably the safest option + * in case multiple sockets are opened in different namespaces at + * the same time. + */ + get_random_bytes(key, sizeof(u64)); + mptcp_crypto_key_sha(*key, token, idsn); +} /** * mptcp_token_new_request - create new key/idsn/token for subflow_request @@ -52,30 +109,28 @@ static int token_used __read_mostly; int mptcp_token_new_request(struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); - int err; - - while (1) { - u32 token; - - mptcp_crypto_key_gen_sha(&subflow_req->local_key, - &subflow_req->token, - &subflow_req->idsn); - pr_debug("req=%p local_key=%llu, token=%u, idsn=%llu\n", - req, subflow_req->local_key, subflow_req->token, - subflow_req->idsn); - - token = subflow_req->token; - spin_lock_bh(&token_tree_lock); - if (!radix_tree_lookup(&token_req_tree, token) && - !radix_tree_lookup(&token_tree, token)) - break; - spin_unlock_bh(&token_tree_lock); + struct token_bucket *bucket; + u32 token; + + mptcp_crypto_key_sha(subflow_req->local_key, + &subflow_req->token, + &subflow_req->idsn); + pr_debug("req=%p local_key=%llu, token=%u, idsn=%llu\n", + req, subflow_req->local_key, subflow_req->token, + subflow_req->idsn); + + token = subflow_req->token; + bucket = token_bucket(token); + spin_lock_bh(&bucket->lock); + if (__token_bucket_busy(bucket, token)) { + spin_unlock_bh(&bucket->lock); + return -EBUSY; } - err = radix_tree_insert(&token_req_tree, - subflow_req->token, &token_used); - spin_unlock_bh(&token_tree_lock); - return err; + hlist_nulls_add_head_rcu(&subflow_req->token_node, &bucket->req_chain); + bucket->chain_len++; + spin_unlock_bh(&bucket->lock); + return 0; } /** @@ -97,48 +152,82 @@ int mptcp_token_new_request(struct request_sock *req) int mptcp_token_new_connect(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - struct sock *mptcp_sock = subflow->conn; - int err; - - while (1) { - u32 token; + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + int retries = TOKEN_MAX_RETRIES; + struct token_bucket *bucket; - mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token, - &subflow->idsn); + pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n", + sk, subflow->local_key, subflow->token, subflow->idsn); - pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n", - sk, subflow->local_key, subflow->token, subflow->idsn); +again: + mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token, + &subflow->idsn); - token = subflow->token; - spin_lock_bh(&token_tree_lock); - if (!radix_tree_lookup(&token_req_tree, token) && - !radix_tree_lookup(&token_tree, token)) - break; - spin_unlock_bh(&token_tree_lock); + bucket = token_bucket(subflow->token); + spin_lock_bh(&bucket->lock); + if (__token_bucket_busy(bucket, subflow->token)) { + spin_unlock_bh(&bucket->lock); + if (!--retries) + return -EBUSY; + goto again; } - err = radix_tree_insert(&token_tree, subflow->token, mptcp_sock); - spin_unlock_bh(&token_tree_lock); - return err; + WRITE_ONCE(msk->token, subflow->token); + __sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain); + bucket->chain_len++; + spin_unlock_bh(&bucket->lock); + return 0; } /** - * mptcp_token_new_accept - insert token for later processing - * @token: the token to insert to the tree - * @conn: the just cloned socket linked to the new connection + * mptcp_token_accept - replace a req sk with full sock in token hash + * @req: the request socket to be removed + * @msk: the just cloned socket linked to the new connection * * Called when a SYN packet creates a new logical connection, i.e. * is not a join request. */ -int mptcp_token_new_accept(u32 token, struct sock *conn) +void mptcp_token_accept(struct mptcp_subflow_request_sock *req, + struct mptcp_sock *msk) { - int err; + struct mptcp_subflow_request_sock *pos; + struct token_bucket *bucket; - spin_lock_bh(&token_tree_lock); - err = radix_tree_insert(&token_tree, token, conn); - spin_unlock_bh(&token_tree_lock); + bucket = token_bucket(req->token); + spin_lock_bh(&bucket->lock); - return err; + /* pedantic lookup check for the moved token */ + pos = __token_lookup_req(bucket, req->token); + if (!WARN_ON_ONCE(pos != req)) + hlist_nulls_del_init_rcu(&req->token_node); + __sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain); + spin_unlock_bh(&bucket->lock); +} + +bool mptcp_token_exists(u32 token) +{ + struct hlist_nulls_node *pos; + struct token_bucket *bucket; + struct mptcp_sock *msk; + struct sock *sk; + + rcu_read_lock(); + bucket = token_bucket(token); + +again: + sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) { + msk = mptcp_sk(sk); + if (READ_ONCE(msk->token) == token) + goto found; + } + if (get_nulls_value(pos) != (token & token_mask)) + goto again; + + rcu_read_unlock(); + return false; +found: + rcu_read_unlock(); + return true; } /** @@ -152,45 +241,171 @@ int mptcp_token_new_accept(u32 token, struct sock *conn) */ struct mptcp_sock *mptcp_token_get_sock(u32 token) { - struct sock *conn; - - spin_lock_bh(&token_tree_lock); - conn = radix_tree_lookup(&token_tree, token); - if (conn) { - /* token still reserved? */ - if (conn == (struct sock *)&token_used) - conn = NULL; - else - sock_hold(conn); + struct hlist_nulls_node *pos; + struct token_bucket *bucket; + struct mptcp_sock *msk; + struct sock *sk; + + rcu_read_lock(); + bucket = token_bucket(token); + +again: + sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) { + msk = mptcp_sk(sk); + if (READ_ONCE(msk->token) != token) + continue; + if (!refcount_inc_not_zero(&sk->sk_refcnt)) + goto not_found; + if (READ_ONCE(msk->token) != token) { + sock_put(sk); + goto again; + } + goto found; + } + if (get_nulls_value(pos) != (token & token_mask)) + goto again; + +not_found: + msk = NULL; + +found: + rcu_read_unlock(); + return msk; +} +EXPORT_SYMBOL_GPL(mptcp_token_get_sock); + +/** + * mptcp_token_iter_next - iterate over the token container from given pos + * @net: namespace to be iterated + * @s_slot: start slot number + * @s_num: start number inside the given lock + * + * This function returns the first mptcp connection structure found inside the + * token container starting from the specified position, or NULL. + * + * On successful iteration, the iterator is move to the next position and the + * the acquires a reference to the returned socket. + */ +struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot, + long *s_num) +{ + struct mptcp_sock *ret = NULL; + struct hlist_nulls_node *pos; + int slot, num; + + for (slot = *s_slot; slot <= token_mask; *s_num = 0, slot++) { + struct token_bucket *bucket = &token_hash[slot]; + struct sock *sk; + + num = 0; + + if (hlist_nulls_empty(&bucket->msk_chain)) + continue; + + rcu_read_lock(); + sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) { + ++num; + if (!net_eq(sock_net(sk), net)) + continue; + + if (num <= *s_num) + continue; + + if (!refcount_inc_not_zero(&sk->sk_refcnt)) + continue; + + if (!net_eq(sock_net(sk), net)) { + sock_put(sk); + continue; + } + + ret = mptcp_sk(sk); + rcu_read_unlock(); + goto out; + } + rcu_read_unlock(); } - spin_unlock_bh(&token_tree_lock); - return mptcp_sk(conn); +out: + *s_slot = slot; + *s_num = num; + return ret; } +EXPORT_SYMBOL_GPL(mptcp_token_iter_next); /** * mptcp_token_destroy_request - remove mptcp connection/token - * @token: token of mptcp connection to remove + * @req: mptcp request socket dropping the token * - * Remove not-yet-fully-established incoming connection identified - * by @token. + * Remove the token associated to @req. */ -void mptcp_token_destroy_request(u32 token) +void mptcp_token_destroy_request(struct request_sock *req) { - spin_lock_bh(&token_tree_lock); - radix_tree_delete(&token_req_tree, token); - spin_unlock_bh(&token_tree_lock); + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + struct mptcp_subflow_request_sock *pos; + struct token_bucket *bucket; + + if (hlist_nulls_unhashed(&subflow_req->token_node)) + return; + + bucket = token_bucket(subflow_req->token); + spin_lock_bh(&bucket->lock); + pos = __token_lookup_req(bucket, subflow_req->token); + if (!WARN_ON_ONCE(pos != subflow_req)) { + hlist_nulls_del_init_rcu(&pos->token_node); + bucket->chain_len--; + } + spin_unlock_bh(&bucket->lock); } /** * mptcp_token_destroy - remove mptcp connection/token - * @token: token of mptcp connection to remove + * @msk: mptcp connection dropping the token * - * Remove the connection identified by @token. + * Remove the token associated to @msk */ -void mptcp_token_destroy(u32 token) +void mptcp_token_destroy(struct mptcp_sock *msk) { - spin_lock_bh(&token_tree_lock); - radix_tree_delete(&token_tree, token); - spin_unlock_bh(&token_tree_lock); + struct token_bucket *bucket; + struct mptcp_sock *pos; + + if (sk_unhashed((struct sock *)msk)) + return; + + bucket = token_bucket(msk->token); + spin_lock_bh(&bucket->lock); + pos = __token_lookup_msk(bucket, msk->token); + if (!WARN_ON_ONCE(pos != msk)) { + __sk_nulls_del_node_init_rcu((struct sock *)pos); + bucket->chain_len--; + } + spin_unlock_bh(&bucket->lock); } + +void __init mptcp_token_init(void) +{ + int i; + + token_hash = alloc_large_system_hash("MPTCP token", + sizeof(struct token_bucket), + 0, + 20,/* one slot per 1MB of memory */ + HASH_ZERO, + NULL, + &token_mask, + 0, + 64 * 1024); + for (i = 0; i < token_mask + 1; ++i) { + INIT_HLIST_NULLS_HEAD(&token_hash[i].req_chain, i); + INIT_HLIST_NULLS_HEAD(&token_hash[i].msk_chain, i); + spin_lock_init(&token_hash[i].lock); + } +} + +#if IS_MODULE(CONFIG_MPTCP_KUNIT_TESTS) +EXPORT_SYMBOL_GPL(mptcp_token_new_request); +EXPORT_SYMBOL_GPL(mptcp_token_new_connect); +EXPORT_SYMBOL_GPL(mptcp_token_accept); +EXPORT_SYMBOL_GPL(mptcp_token_destroy_request); +EXPORT_SYMBOL_GPL(mptcp_token_destroy); +#endif diff --git a/net/mptcp/token_test.c b/net/mptcp/token_test.c new file mode 100644 index 000000000000..e1bd6f0a0676 --- /dev/null +++ b/net/mptcp/token_test.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <kunit/test.h> + +#include "protocol.h" + +static struct mptcp_subflow_request_sock *build_req_sock(struct kunit *test) +{ + struct mptcp_subflow_request_sock *req; + + req = kunit_kzalloc(test, sizeof(struct mptcp_subflow_request_sock), + GFP_USER); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, req); + mptcp_token_init_request((struct request_sock *)req); + return req; +} + +static void mptcp_token_test_req_basic(struct kunit *test) +{ + struct mptcp_subflow_request_sock *req = build_req_sock(test); + struct mptcp_sock *null_msk = NULL; + + KUNIT_ASSERT_EQ(test, 0, + mptcp_token_new_request((struct request_sock *)req)); + KUNIT_EXPECT_NE(test, 0, (int)req->token); + KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(req->token)); + + /* cleanup */ + mptcp_token_destroy_request((struct request_sock *)req); +} + +static struct inet_connection_sock *build_icsk(struct kunit *test) +{ + struct inet_connection_sock *icsk; + + icsk = kunit_kzalloc(test, sizeof(struct inet_connection_sock), + GFP_USER); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, icsk); + return icsk; +} + +static struct mptcp_subflow_context *build_ctx(struct kunit *test) +{ + struct mptcp_subflow_context *ctx; + + ctx = kunit_kzalloc(test, sizeof(struct mptcp_subflow_context), + GFP_USER); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, ctx); + return ctx; +} + +static struct mptcp_sock *build_msk(struct kunit *test) +{ + struct mptcp_sock *msk; + + msk = kunit_kzalloc(test, sizeof(struct mptcp_sock), GFP_USER); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, msk); + refcount_set(&((struct sock *)msk)->sk_refcnt, 1); + return msk; +} + +static void mptcp_token_test_msk_basic(struct kunit *test) +{ + struct inet_connection_sock *icsk = build_icsk(test); + struct mptcp_subflow_context *ctx = build_ctx(test); + struct mptcp_sock *msk = build_msk(test); + struct mptcp_sock *null_msk = NULL; + struct sock *sk; + + rcu_assign_pointer(icsk->icsk_ulp_data, ctx); + ctx->conn = (struct sock *)msk; + sk = (struct sock *)msk; + + KUNIT_ASSERT_EQ(test, 0, + mptcp_token_new_connect((struct sock *)icsk)); + KUNIT_EXPECT_NE(test, 0, (int)ctx->token); + KUNIT_EXPECT_EQ(test, ctx->token, msk->token); + KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(ctx->token)); + KUNIT_EXPECT_EQ(test, 2, (int)refcount_read(&sk->sk_refcnt)); + + mptcp_token_destroy(msk); + KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(ctx->token)); +} + +static void mptcp_token_test_accept(struct kunit *test) +{ + struct mptcp_subflow_request_sock *req = build_req_sock(test); + struct mptcp_sock *msk = build_msk(test); + + KUNIT_ASSERT_EQ(test, 0, + mptcp_token_new_request((struct request_sock *)req)); + msk->token = req->token; + mptcp_token_accept(req, msk); + KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(msk->token)); + + /* this is now a no-op */ + mptcp_token_destroy_request((struct request_sock *)req); + KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(msk->token)); + + /* cleanup */ + mptcp_token_destroy(msk); +} + +static void mptcp_token_test_destroyed(struct kunit *test) +{ + struct mptcp_subflow_request_sock *req = build_req_sock(test); + struct mptcp_sock *msk = build_msk(test); + struct mptcp_sock *null_msk = NULL; + struct sock *sk; + + sk = (struct sock *)msk; + + KUNIT_ASSERT_EQ(test, 0, + mptcp_token_new_request((struct request_sock *)req)); + msk->token = req->token; + mptcp_token_accept(req, msk); + + /* simulate race on removal */ + refcount_set(&sk->sk_refcnt, 0); + KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(msk->token)); + + /* cleanup */ + mptcp_token_destroy(msk); +} + +static struct kunit_case mptcp_token_test_cases[] = { + KUNIT_CASE(mptcp_token_test_req_basic), + KUNIT_CASE(mptcp_token_test_msk_basic), + KUNIT_CASE(mptcp_token_test_accept), + KUNIT_CASE(mptcp_token_test_destroyed), + {} +}; + +static struct kunit_suite mptcp_token_suite = { + .name = "mptcp-token", + .test_cases = mptcp_token_test_cases, +}; + +kunit_test_suite(mptcp_token_suite); + +MODULE_LICENSE("GPL"); diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index a94bb59793f0..5b1f4ec66dd9 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -471,7 +471,7 @@ static int ncsi_rsp_handler_sma(struct ncsi_request *nr) memcpy(&ncf->addrs[index], cmd->mac, ETH_ALEN); } else { clear_bit(cmd->index - 1, bitmap); - memset(&ncf->addrs[index], 0, ETH_ALEN); + eth_zero_addr(&ncf->addrs[index]); } spin_unlock_irqrestore(&nc->lock, flags); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 0ffe2b8723c4..25313c29d799 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -447,7 +447,7 @@ config NF_TABLES replace the existing {ip,ip6,arp,eb}_tables infrastructure. It provides a pseudo-state machine with an extensible instruction-set (also known as expressions) that the userspace 'nft' utility - (http://www.netfilter.org/projects/nftables) uses to build the + (https://www.netfilter.org/projects/nftables) uses to build the rule-set. It also comes with the generic set infrastructure that allows you to construct mappings between matchings and actions for performance lookups. diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 56621d6bfd29..920b7c4331f0 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1644,7 +1644,7 @@ dump_last: goto next_set; if (set->variant->uref) set->variant->uref(set, cb, true); - /* fall through */ + fallthrough; default: ret = set->variant->list(set, skb, cb); if (!cb->args[IPSET_CB_ARG0]) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 02f2f636798d..a90b8eac16ac 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -807,6 +807,31 @@ static void ip_vs_conn_rcu_free(struct rcu_head *head) kmem_cache_free(ip_vs_conn_cachep, cp); } +/* Try to delete connection while not holding reference */ +static void ip_vs_conn_del(struct ip_vs_conn *cp) +{ + if (del_timer(&cp->timer)) { + /* Drop cp->control chain too */ + if (cp->control) + cp->timeout = 0; + ip_vs_conn_expire(&cp->timer); + } +} + +/* Try to delete connection while holding reference */ +static void ip_vs_conn_del_put(struct ip_vs_conn *cp) +{ + if (del_timer(&cp->timer)) { + /* Drop cp->control chain too */ + if (cp->control) + cp->timeout = 0; + __ip_vs_conn_put(cp); + ip_vs_conn_expire(&cp->timer); + } else { + __ip_vs_conn_put(cp); + } +} + static void ip_vs_conn_expire(struct timer_list *t) { struct ip_vs_conn *cp = from_timer(cp, t, timer); @@ -827,14 +852,17 @@ static void ip_vs_conn_expire(struct timer_list *t) /* does anybody control me? */ if (ct) { + bool has_ref = !cp->timeout && __ip_vs_conn_get(ct); + ip_vs_control_del(cp); /* Drop CTL or non-assured TPL if not used anymore */ - if (!cp->timeout && !atomic_read(&ct->n_control) && + if (has_ref && !atomic_read(&ct->n_control) && (!(ct->flags & IP_VS_CONN_F_TEMPLATE) || !(ct->state & IP_VS_CTPL_S_ASSURED))) { IP_VS_DBG(4, "drop controlling connection\n"); - ct->timeout = 0; - ip_vs_conn_expire_now(ct); + ip_vs_conn_del_put(ct); + } else if (has_ref) { + __ip_vs_conn_put(ct); } } @@ -1317,8 +1345,7 @@ try_drop: drop: IP_VS_DBG(4, "drop connection\n"); - cp->timeout = 0; - ip_vs_conn_expire_now(cp); + ip_vs_conn_del(cp); } cond_resched_rcu(); } @@ -1341,19 +1368,15 @@ flush_again: hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { if (cp->ipvs != ipvs) continue; - /* As timers are expired in LIFO order, restart - * the timer of controlling connection first, so - * that it is expired after us. - */ + if (atomic_read(&cp->n_control)) + continue; cp_c = cp->control; - /* cp->control is valid only with reference to cp */ - if (cp_c && __ip_vs_conn_get(cp)) { + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_del(cp); + if (cp_c && !atomic_read(&cp_c->n_control)) { IP_VS_DBG(4, "del controlling connection\n"); - ip_vs_conn_expire_now(cp_c); - __ip_vs_conn_put(cp); + ip_vs_conn_del(cp_c); } - IP_VS_DBG(4, "del connection\n"); - ip_vs_conn_expire_now(cp); } cond_resched_rcu(); } @@ -1366,6 +1389,45 @@ flush_again: goto flush_again; } } + +#ifdef CONFIG_SYSCTL +void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) +{ + int idx; + struct ip_vs_conn *cp, *cp_c; + struct ip_vs_dest *dest; + + rcu_read_lock(); + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { + if (cp->ipvs != ipvs) + continue; + + dest = cp->dest; + if (!dest || (dest->flags & IP_VS_DEST_F_AVAILABLE)) + continue; + + if (atomic_read(&cp->n_control)) + continue; + + cp_c = cp->control; + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_del(cp); + if (cp_c && !atomic_read(&cp_c->n_control)) { + IP_VS_DBG(4, "del controlling connection\n"); + ip_vs_conn_del(cp_c); + } + } + cond_resched_rcu(); + + /* netns clean up started, abort delayed work */ + if (!ipvs->enable) + break; + } + rcu_read_unlock(); +} +#endif + /* * per netns init and exit */ diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index aa6a603a2425..e3668a6e54e4 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -694,16 +694,10 @@ static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) return ipvs->sysctl_nat_icmp_send; } -static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) -{ - return ipvs->sysctl_expire_nodest_conn; -} - #else static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; } static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return 0; } -static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; } #endif @@ -2066,14 +2060,14 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int conn_reuse_mode = sysctl_conn_reuse_mode(ipvs); if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) { - bool uses_ct = false, resched = false; + bool old_ct = false, resched = false; if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest && unlikely(!atomic_read(&cp->dest->weight))) { resched = true; - uses_ct = ip_vs_conn_uses_conntrack(cp, skb); + old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); } else if (is_new_conn_expected(cp, conn_reuse_mode)) { - uses_ct = ip_vs_conn_uses_conntrack(cp, skb); + old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); if (!atomic_read(&cp->n_control)) { resched = true; } else { @@ -2081,50 +2075,51 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int * that uses conntrack while it is still * referenced by controlled connection(s). */ - resched = !uses_ct; + resched = !old_ct; } } if (resched) { + if (!old_ct) + cp->flags &= ~IP_VS_CONN_F_NFCT; if (!atomic_read(&cp->n_control)) ip_vs_conn_expire_now(cp); __ip_vs_conn_put(cp); - if (uses_ct) + if (old_ct) return NF_DROP; cp = NULL; } } - if (unlikely(!cp)) { - int v; - - if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph)) - return v; - } - - IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet"); - /* Check the server status */ - if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { + if (cp && cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { /* the destination server is not available */ + if (sysctl_expire_nodest_conn(ipvs)) { + bool old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); - __u32 flags = cp->flags; - - /* when timer already started, silently drop the packet.*/ - if (timer_pending(&cp->timer)) - __ip_vs_conn_put(cp); - else - ip_vs_conn_put(cp); + if (!old_ct) + cp->flags &= ~IP_VS_CONN_F_NFCT; - if (sysctl_expire_nodest_conn(ipvs) && - !(flags & IP_VS_CONN_F_ONE_PACKET)) { - /* try to expire the connection immediately */ ip_vs_conn_expire_now(cp); + __ip_vs_conn_put(cp); + if (old_ct) + return NF_DROP; + cp = NULL; + } else { + __ip_vs_conn_put(cp); + return NF_DROP; } + } - return NF_DROP; + if (unlikely(!cp)) { + int v; + + if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph)) + return v; } + IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet"); + ip_vs_in_stats(cp, skb); ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); if (cp->packet_xmit) @@ -2256,7 +2251,7 @@ ip_vs_forward_icmp_v6(void *priv, struct sk_buff *skb, #endif -static const struct nf_hook_ops ip_vs_ops[] = { +static const struct nf_hook_ops ip_vs_ops4[] = { /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_reply4, @@ -2302,7 +2297,10 @@ static const struct nf_hook_ops ip_vs_ops[] = { .hooknum = NF_INET_FORWARD, .priority = 100, }, +}; + #ifdef CONFIG_IP_VS_IPV6 +static const struct nf_hook_ops ip_vs_ops6[] = { /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_reply6, @@ -2348,8 +2346,64 @@ static const struct nf_hook_ops ip_vs_ops[] = { .hooknum = NF_INET_FORWARD, .priority = 100, }, -#endif }; +#endif + +int ip_vs_register_hooks(struct netns_ipvs *ipvs, unsigned int af) +{ + const struct nf_hook_ops *ops; + unsigned int count; + unsigned int afmask; + int ret = 0; + + if (af == AF_INET6) { +#ifdef CONFIG_IP_VS_IPV6 + ops = ip_vs_ops6; + count = ARRAY_SIZE(ip_vs_ops6); + afmask = 2; +#else + return -EINVAL; +#endif + } else { + ops = ip_vs_ops4; + count = ARRAY_SIZE(ip_vs_ops4); + afmask = 1; + } + + if (!(ipvs->hooks_afmask & afmask)) { + ret = nf_register_net_hooks(ipvs->net, ops, count); + if (ret >= 0) + ipvs->hooks_afmask |= afmask; + } + return ret; +} + +void ip_vs_unregister_hooks(struct netns_ipvs *ipvs, unsigned int af) +{ + const struct nf_hook_ops *ops; + unsigned int count; + unsigned int afmask; + + if (af == AF_INET6) { +#ifdef CONFIG_IP_VS_IPV6 + ops = ip_vs_ops6; + count = ARRAY_SIZE(ip_vs_ops6); + afmask = 2; +#else + return; +#endif + } else { + ops = ip_vs_ops4; + count = ARRAY_SIZE(ip_vs_ops4); + afmask = 1; + } + + if (ipvs->hooks_afmask & afmask) { + nf_unregister_net_hooks(ipvs->net, ops, count); + ipvs->hooks_afmask &= ~afmask; + } +} + /* * Initialize IP Virtual Server netns mem. */ @@ -2425,19 +2479,6 @@ static void __net_exit __ip_vs_cleanup_batch(struct list_head *net_list) } } -static int __net_init __ip_vs_dev_init(struct net *net) -{ - int ret; - - ret = nf_register_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); - if (ret < 0) - goto hook_fail; - return 0; - -hook_fail: - return ret; -} - static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list) { struct netns_ipvs *ipvs; @@ -2446,7 +2487,8 @@ static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list) EnterFunction(2); list_for_each_entry(net, net_list, exit_list) { ipvs = net_ipvs(net); - nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + ip_vs_unregister_hooks(ipvs, AF_INET); + ip_vs_unregister_hooks(ipvs, AF_INET6); ipvs->enable = 0; /* Disable packet reception */ smp_wmb(); ip_vs_sync_net_cleanup(ipvs); @@ -2462,7 +2504,6 @@ static struct pernet_operations ipvs_core_ops = { }; static struct pernet_operations ipvs_core_dev_ops = { - .init = __ip_vs_dev_init, .exit_batch = __ip_vs_dev_cleanup_batch, }; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 412656c34f20..678c5b14841c 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -210,6 +210,17 @@ static void update_defense_level(struct netns_ipvs *ipvs) local_bh_enable(); } +/* Handler for delayed work for expiring no + * destination connections + */ +static void expire_nodest_conn_handler(struct work_struct *work) +{ + struct netns_ipvs *ipvs; + + ipvs = container_of(work, struct netns_ipvs, + expire_nodest_conn_work.work); + ip_vs_expire_nodest_conn_flush(ipvs); +} /* * Timer for checking the defense @@ -224,7 +235,8 @@ static void defense_work_handler(struct work_struct *work) update_defense_level(ipvs); if (atomic_read(&ipvs->dropentry)) ip_vs_random_dropentry(ipvs); - schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD); + queue_delayed_work(system_long_wq, &ipvs->defense_work, + DEFENSE_TIMER_PERIOD); } #endif @@ -1163,6 +1175,12 @@ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest, list_add(&dest->t_list, &ipvs->dest_trash); dest->idle_start = 0; spin_unlock_bh(&ipvs->dest_trash_lock); + + /* Queue up delayed work to expire all no destination connections. + * No-op when CONFIG_SYSCTL is disabled. + */ + if (!cleanup) + ip_vs_enqueue_expire_nodest_conns(ipvs); } @@ -1272,6 +1290,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_scheduler *sched = NULL; struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; + int ret_hooks = -1; /* increase the module use count */ if (!ip_vs_use_count_inc()) @@ -1313,6 +1332,14 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, } #endif + if ((u->af == AF_INET && !ipvs->num_services) || + (u->af == AF_INET6 && !ipvs->num_services6)) { + ret = ip_vs_register_hooks(ipvs, u->af); + if (ret < 0) + goto out_err; + ret_hooks = ret; + } + svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL); if (svc == NULL) { IP_VS_DBG(1, "%s(): no memory\n", __func__); @@ -1374,6 +1401,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) ipvs->num_services++; + else if (svc->af == AF_INET6) + ipvs->num_services6++; /* Hash the service into the service table */ ip_vs_svc_hash(svc); @@ -1385,6 +1414,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, out_err: + if (ret_hooks >= 0) + ip_vs_unregister_hooks(ipvs, u->af); if (svc != NULL) { ip_vs_unbind_scheduler(svc, sched); ip_vs_service_free(svc); @@ -1500,9 +1531,15 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) struct ip_vs_pe *old_pe; struct netns_ipvs *ipvs = svc->ipvs; - /* Count only IPv4 services for old get/setsockopt interface */ - if (svc->af == AF_INET) + if (svc->af == AF_INET) { ipvs->num_services--; + if (!ipvs->num_services) + ip_vs_unregister_hooks(ipvs, svc->af); + } else if (svc->af == AF_INET6) { + ipvs->num_services6--; + if (!ipvs->num_services6) + ip_vs_unregister_hooks(ipvs, svc->af); + } ip_vs_stop_estimator(svc->ipvs, &svc->stats); @@ -2414,7 +2451,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, } static int -do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len) { struct net *net = sock_net(sk); int ret; @@ -2438,7 +2475,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) return -EINVAL; } - if (copy_from_user(arg, user, len) != 0) + if (copy_from_sockptr(arg, ptr, len) != 0) return -EFAULT; /* Handle daemons since they have another lock */ @@ -4063,7 +4100,12 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) ipvs->sysctl_tbl = tbl; /* Schedule defense work */ INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); - schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD); + queue_delayed_work(system_long_wq, &ipvs->defense_work, + DEFENSE_TIMER_PERIOD); + + /* Init delayed work for expiring no dest conn */ + INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work, + expire_nodest_conn_handler); return 0; } @@ -4072,6 +4114,7 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { struct net *net = ipvs->net; + cancel_delayed_work_sync(&ipvs->expire_nodest_conn_work); cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f33d72c5b06e..e38b60fc183e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1006,7 +1006,7 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) * * @skb: skb that causes the clash * @h: tuplehash of the clashing entry already in table - * @hash_reply: hash slot for reply direction + * @reply_hash: hash slot for reply direction * * A conntrack entry can be inserted to the connection tracking table * if there is no existing entry with an identical tuple. @@ -1344,18 +1344,6 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct) return false; } -#define DAY (86400 * HZ) - -/* Set an arbitrary timeout large enough not to ever expire, this save - * us a check for the IPS_OFFLOAD_BIT from the packet path via - * nf_ct_is_expired(). - */ -static void nf_ct_offload_timeout(struct nf_conn *ct) -{ - if (nf_ct_expires(ct) < DAY / 2) - ct->timeout = nfct_time_stamp + DAY; -} - static void gc_worker(struct work_struct *work) { unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u); diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c index 573cb4481481..e697a824b001 100644 --- a/net/netfilter/nf_conntrack_h323_asn1.c +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -257,15 +257,15 @@ static unsigned int get_uint(struct bitstr *bs, int b) case 4: v |= *bs->cur++; v <<= 8; - /* fall through */ + fallthrough; case 3: v |= *bs->cur++; v <<= 8; - /* fall through */ + fallthrough; case 2: v |= *bs->cur++; v <<= 8; - /* fall through */ + fallthrough; case 1: v |= *bs->cur++; break; diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index a0560d175a7f..95f79980348c 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -610,7 +610,7 @@ void nf_ct_netns_put(struct net *net, uint8_t nfproto) switch (nfproto) { case NFPROTO_BRIDGE: nf_ct_netns_do_put(net, NFPROTO_BRIDGE); - /* fall through */ + fallthrough; case NFPROTO_INET: nf_ct_netns_do_put(net, NFPROTO_IPV4); nf_ct_netns_do_put(net, NFPROTO_IPV6); diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 1926fd56df56..6892e497781c 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -900,7 +900,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, return -NF_REPEAT; return NF_DROP; } - /* Fall through */ + fallthrough; case TCP_CONNTRACK_IGNORE: /* Ignored packets: * diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 6a26299cb064..a604f43e3e6b 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -60,7 +60,7 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, ntohs(tuple->src.u.tcp.port), ntohs(tuple->dst.u.tcp.port)); break; - case IPPROTO_UDPLITE: /* fallthrough */ + case IPPROTO_UDPLITE: case IPPROTO_UDP: seq_printf(s, "sport=%hu dport=%hu ", ntohs(tuple->src.u.udp.port), diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index b1eb5272b379..4f7a567c536e 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -243,6 +243,8 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) return err; } + nf_ct_offload_timeout(flow->ct); + if (nf_flowtable_hw_offload(flow_table)) { __set_bit(NF_FLOW_HW, &flow->flags); nf_flow_offload_add(flow_table, flow); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 5fff1e040168..2a6993fa40d7 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -964,7 +964,7 @@ static int nf_flow_table_indr_offload_cmd(struct flow_block_offload *bo, nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable, extack); - return flow_indr_dev_setup_offload(dev, TC_SETUP_FT, flowtable, bo, + return flow_indr_dev_setup_offload(dev, NULL, TC_SETUP_FT, flowtable, bo, nf_flow_table_indr_cleanup); } diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index bfc555fcbc72..ea923f8cf9c4 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -408,7 +408,7 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, static const unsigned int max_attempts = 128; switch (tuple->dst.protonum) { - case IPPROTO_ICMP: /* fallthrough */ + case IPPROTO_ICMP: case IPPROTO_ICMPV6: /* id is same for either direction... */ keyptr = &tuple->src.u.icmp.id; @@ -442,11 +442,11 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, } goto find_free_id; #endif - case IPPROTO_UDP: /* fallthrough */ - case IPPROTO_UDPLITE: /* fallthrough */ - case IPPROTO_TCP: /* fallthrough */ - case IPPROTO_SCTP: /* fallthrough */ - case IPPROTO_DCCP: /* fallthrough */ + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + case IPPROTO_TCP: + case IPPROTO_SCTP: + case IPPROTO_DCCP: if (maniptype == NF_NAT_MANIP_SRC) keyptr = &tuple->src.u.all; else diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c index 46cb3786e0ec..34afcd03b6f6 100644 --- a/net/netfilter/nf_sockopt.c +++ b/net/netfilter/nf_sockopt.c @@ -89,78 +89,32 @@ out: return ops; } -/* Call get/setsockopt() */ -static int nf_sockopt(struct sock *sk, u_int8_t pf, int val, - char __user *opt, int *len, int get) +int nf_setsockopt(struct sock *sk, u_int8_t pf, int val, sockptr_t opt, + unsigned int len) { struct nf_sockopt_ops *ops; int ret; - ops = nf_sockopt_find(sk, pf, val, get); + ops = nf_sockopt_find(sk, pf, val, 0); if (IS_ERR(ops)) return PTR_ERR(ops); - - if (get) - ret = ops->get(sk, val, opt, len); - else - ret = ops->set(sk, val, opt, *len); - + ret = ops->set(sk, val, opt, len); module_put(ops->owner); return ret; } - -int nf_setsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, - unsigned int len) -{ - return nf_sockopt(sk, pf, val, opt, &len, 0); -} EXPORT_SYMBOL(nf_setsockopt); int nf_getsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, int *len) { - return nf_sockopt(sk, pf, val, opt, len, 1); -} -EXPORT_SYMBOL(nf_getsockopt); - -#ifdef CONFIG_COMPAT -static int compat_nf_sockopt(struct sock *sk, u_int8_t pf, int val, - char __user *opt, int *len, int get) -{ struct nf_sockopt_ops *ops; int ret; - ops = nf_sockopt_find(sk, pf, val, get); + ops = nf_sockopt_find(sk, pf, val, 1); if (IS_ERR(ops)) return PTR_ERR(ops); - - if (get) { - if (ops->compat_get) - ret = ops->compat_get(sk, val, opt, len); - else - ret = ops->get(sk, val, opt, len); - } else { - if (ops->compat_set) - ret = ops->compat_set(sk, val, opt, *len); - else - ret = ops->set(sk, val, opt, *len); - } - + ret = ops->get(sk, val, opt, len); module_put(ops->owner); return ret; } - -int compat_nf_setsockopt(struct sock *sk, u_int8_t pf, - int val, char __user *opt, unsigned int len) -{ - return compat_nf_sockopt(sk, pf, val, opt, &len, 0); -} -EXPORT_SYMBOL(compat_nf_setsockopt); - -int compat_nf_getsockopt(struct sock *sk, u_int8_t pf, - int val, char __user *opt, int *len) -{ - return compat_nf_sockopt(sk, pf, val, opt, len, 1); -} -EXPORT_SYMBOL(compat_nf_getsockopt); -#endif +EXPORT_SYMBOL(nf_getsockopt); diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index ebcdc8e54476..9cca35d22927 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -704,8 +704,7 @@ ipv4_synproxy_hook(void *priv, struct sk_buff *skb, nf_ct_seqadj_init(ct, ctinfo, 0); synproxy->tsoff = 0; this_cpu_inc(snet->stats->conn_reopened); - - /* fall through */ + fallthrough; case TCP_CONNTRACK_SYN_SENT: if (!synproxy_parse_options(skb, thoff, th, &opts)) return NF_DROP; @@ -1128,8 +1127,7 @@ ipv6_synproxy_hook(void *priv, struct sk_buff *skb, nf_ct_seqadj_init(ct, ctinfo, 0); synproxy->tsoff = 0; this_cpu_inc(snet->stats->conn_reopened); - - /* fall through */ + fallthrough; case TCP_CONNTRACK_SYN_SENT: if (!synproxy_parse_options(skb, thoff, th, &opts)) return NF_DROP; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2b3862ea0505..d878e34e3354 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -269,9 +269,15 @@ static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) if (trans == NULL) return ERR_PTR(-ENOMEM); - if (msg_type == NFT_MSG_NEWCHAIN) + if (msg_type == NFT_MSG_NEWCHAIN) { nft_activate_next(ctx->net, ctx->chain); + if (ctx->nla[NFTA_CHAIN_ID]) { + nft_trans_chain_id(trans) = + ntohl(nla_get_be32(ctx->nla[NFTA_CHAIN_ID])); + } + } + list_add_tail(&trans->list, &ctx->net->nft.commit_list); return trans; } @@ -1049,6 +1055,9 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, chain)) continue; + if (nft_chain_is_bound(chain)) + continue; + ctx->chain = chain; err = nft_delrule_by_chain(ctx); @@ -1091,6 +1100,9 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, chain)) continue; + if (nft_chain_is_bound(chain)) + continue; + ctx->chain = chain; err = nft_delchain(ctx); @@ -1273,6 +1285,7 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { .len = NFT_MODULE_AUTOLOAD_LIMIT }, [NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED }, [NFTA_CHAIN_FLAGS] = { .type = NLA_U32 }, + [NFTA_CHAIN_ID] = { .type = NLA_U32 }, }; static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = { @@ -1405,13 +1418,12 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, lockdep_commit_lock_is_held(net)); if (nft_dump_stats(skb, stats)) goto nla_put_failure; - - if ((chain->flags & NFT_CHAIN_HW_OFFLOAD) && - nla_put_be32(skb, NFTA_CHAIN_FLAGS, - htonl(NFT_CHAIN_HW_OFFLOAD))) - goto nla_put_failure; } + if (chain->flags && + nla_put_be32(skb, NFTA_CHAIN_FLAGS, htonl(chain->flags))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use))) goto nla_put_failure; @@ -1625,7 +1637,7 @@ static void nf_tables_chain_free_chain_rules(struct nft_chain *chain) kvfree(chain->rules_next); } -static void nf_tables_chain_destroy(struct nft_ctx *ctx) +void nf_tables_chain_destroy(struct nft_ctx *ctx) { struct nft_chain *chain = ctx->chain; struct nft_hook *hook, *next; @@ -1907,7 +1919,7 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, nft_basechain_hook_init(&basechain->ops, family, hook, chain); } - chain->flags |= NFT_BASE_CHAIN | flags; + chain->flags |= NFT_CHAIN_BASE | flags; basechain->policy = NF_ACCEPT; if (chain->flags & NFT_CHAIN_HW_OFFLOAD && nft_chain_offload_priority(basechain) < 0) @@ -1918,6 +1930,22 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, return 0; } +static int nft_chain_add(struct nft_table *table, struct nft_chain *chain) +{ + int err; + + err = rhltable_insert_key(&table->chains_ht, chain->name, + &chain->rhlhead, nft_chain_ht_params); + if (err) + return err; + + list_add_tail_rcu(&chain->list, &table->chains); + + return 0; +} + +static u64 chain_id; + static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, u8 policy, u32 flags) { @@ -1926,6 +1954,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, struct nft_base_chain *basechain; struct nft_stats __percpu *stats; struct net *net = ctx->net; + char name[NFT_NAME_MAXLEN]; struct nft_trans *trans; struct nft_chain *chain; struct nft_rule **rules; @@ -1937,6 +1966,9 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (nla[NFTA_CHAIN_HOOK]) { struct nft_chain_hook hook; + if (flags & NFT_CHAIN_BINDING) + return -EOPNOTSUPP; + err = nft_chain_parse_hook(net, nla, &hook, family, true); if (err < 0) return err; @@ -1966,16 +1998,33 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, return err; } } else { + if (flags & NFT_CHAIN_BASE) + return -EINVAL; + if (flags & NFT_CHAIN_HW_OFFLOAD) + return -EOPNOTSUPP; + chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (chain == NULL) return -ENOMEM; + + chain->flags = flags; } ctx->chain = chain; INIT_LIST_HEAD(&chain->rules); chain->handle = nf_tables_alloc_handle(table); chain->table = table; - chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL); + + if (nla[NFTA_CHAIN_NAME]) { + chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL); + } else { + if (!(flags & NFT_CHAIN_BINDING)) + return -EINVAL; + + snprintf(name, sizeof(name), "__chain%llu", ++chain_id); + chain->name = kstrdup(name, GFP_KERNEL); + } + if (!chain->name) { err = -ENOMEM; goto err1; @@ -1995,16 +2044,9 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (err < 0) goto err1; - err = rhltable_insert_key(&table->chains_ht, chain->name, - &chain->rhlhead, nft_chain_ht_params); - if (err) - goto err2; - trans = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN); if (IS_ERR(trans)) { err = PTR_ERR(trans); - rhltable_remove(&table->chains_ht, &chain->rhlhead, - nft_chain_ht_params); goto err2; } @@ -2012,8 +2054,13 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (nft_is_base_chain(chain)) nft_trans_chain_policy(trans) = policy; + err = nft_chain_add(table, chain); + if (err < 0) { + nft_trans_destroy(trans); + goto err2; + } + table->use++; - list_add_tail_rcu(&chain->list, &table->chains); return 0; err2: @@ -2061,7 +2108,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, if (nla[NFTA_CHAIN_HOOK]) { if (!nft_is_base_chain(chain)) - return -EBUSY; + return -EEXIST; err = nft_chain_parse_hook(ctx->net, nla, &hook, ctx->family, false); @@ -2071,21 +2118,21 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, basechain = nft_base_chain(chain); if (basechain->type != hook.type) { nft_chain_release_hook(&hook); - return -EBUSY; + return -EEXIST; } if (ctx->family == NFPROTO_NETDEV) { if (!nft_hook_list_equal(&basechain->hook_list, &hook.list)) { nft_chain_release_hook(&hook); - return -EBUSY; + return -EEXIST; } } else { ops = &basechain->ops; if (ops->hooknum != hook.num || ops->priority != hook.priority) { nft_chain_release_hook(&hook); - return -EBUSY; + return -EEXIST; } } nft_chain_release_hook(&hook); @@ -2157,6 +2204,22 @@ err: return err; } +static struct nft_chain *nft_chain_lookup_byid(const struct net *net, + const struct nlattr *nla) +{ + u32 id = ntohl(nla_get_be32(nla)); + struct nft_trans *trans; + + list_for_each_entry(trans, &net->nft.commit_list, list) { + struct nft_chain *chain = trans->ctx.chain; + + if (trans->msg_type == NFT_MSG_NEWCHAIN && + id == nft_trans_chain_id(trans)) + return chain; + } + return ERR_PTR(-ENOENT); +} + static int nf_tables_newchain(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -2165,9 +2228,9 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; + struct nft_chain *chain = NULL; const struct nlattr *attr; struct nft_table *table; - struct nft_chain *chain; u8 policy = NF_ACCEPT; struct nft_ctx ctx; u64 handle = 0; @@ -2192,7 +2255,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, return PTR_ERR(chain); } attr = nla[NFTA_CHAIN_HANDLE]; - } else { + } else if (nla[NFTA_CHAIN_NAME]) { chain = nft_chain_lookup(net, table, attr, genmask); if (IS_ERR(chain)) { if (PTR_ERR(chain) != -ENOENT) { @@ -2201,6 +2264,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, } chain = NULL; } + } else if (!nla[NFTA_CHAIN_ID]) { + return -EINVAL; } if (nla[NFTA_CHAIN_POLICY]) { @@ -2231,6 +2296,9 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, else if (chain) flags = chain->flags; + if (flags & ~NFT_CHAIN_FLAGS) + return -EOPNOTSUPP; + nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); if (chain != NULL) { @@ -2241,7 +2309,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - flags |= chain->flags & NFT_BASE_CHAIN; + flags |= chain->flags & NFT_CHAIN_BASE; return nf_tables_updchain(&ctx, genmask, policy, flags); } @@ -2318,7 +2386,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, /** * nft_register_expr - register nf_tables expr type - * @ops: expr type + * @type: expr type * * Registers the expr type for use with nf_tables. Returns zero on * success or a negative errno code otherwise. @@ -2337,7 +2405,7 @@ EXPORT_SYMBOL_GPL(nft_register_expr); /** * nft_unregister_expr - unregister nf_tables expr type - * @ops: expr type + * @type: expr type * * Unregisters the expr typefor use with nf_tables. */ @@ -2452,6 +2520,7 @@ nla_put_failure: struct nft_expr_info { const struct nft_expr_ops *ops; + const struct nlattr *attr; struct nlattr *tb[NFT_EXPR_MAXATTR + 1]; }; @@ -2499,7 +2568,9 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx, } else ops = type->ops; + info->attr = nla; info->ops = ops; + return 0; err1: @@ -2635,6 +2706,7 @@ static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { .len = NFT_USERDATA_MAXLEN }, [NFTA_RULE_ID] = { .type = NLA_U32 }, [NFTA_RULE_POSITION_ID] = { .type = NLA_U32 }, + [NFTA_RULE_CHAIN_ID] = { .type = NLA_U32 }, }; static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, @@ -2961,8 +3033,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, kfree(rule); } -static void nf_tables_rule_release(const struct nft_ctx *ctx, - struct nft_rule *rule) +void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule) { nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE); nf_tables_rule_destroy(ctx, rule); @@ -3053,10 +3124,24 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, return PTR_ERR(table); } - chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); - if (IS_ERR(chain)) { - NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); - return PTR_ERR(chain); + if (nla[NFTA_RULE_CHAIN]) { + chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], + genmask); + if (IS_ERR(chain)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); + return PTR_ERR(chain); + } + if (nft_chain_is_bound(chain)) + return -EOPNOTSUPP; + + } else if (nla[NFTA_RULE_CHAIN_ID]) { + chain = nft_chain_lookup_byid(net, nla[NFTA_RULE_CHAIN_ID]); + if (IS_ERR(chain)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN_ID]); + return PTR_ERR(chain); + } + } else { + return -EINVAL; } if (nla[NFTA_RULE_HANDLE]) { @@ -3155,8 +3240,10 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, expr = nft_expr_first(rule); for (i = 0; i < n; i++) { err = nf_tables_newexpr(&ctx, &info[i], expr); - if (err < 0) + if (err < 0) { + NL_SET_BAD_ATTR(extack, info[i].attr); goto err2; + } if (info[i].ops->validate) nft_validate_state_update(net, NFT_VALIDATE_NEED); @@ -3268,6 +3355,8 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); } + if (nft_chain_is_bound(chain)) + return -EOPNOTSUPP; } nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); @@ -4326,7 +4415,7 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, case NFT_TRANS_ABORT: case NFT_TRANS_RELEASE: set->use--; - /* fall through */ + fallthrough; default: nf_tables_unbind_set(ctx, set, binding, phase == NFT_TRANS_COMMIT); @@ -5220,10 +5309,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^ nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) || nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^ - nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) { - err = -EBUSY; + nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) goto err_element_clash; - } if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) && memcmp(nft_set_ext_data(ext), @@ -5231,7 +5318,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) && nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF) && *nft_set_ext_obj(ext) != *nft_set_ext_obj(ext2))) - err = -EBUSY; + goto err_element_clash; else if (!(nlmsg_flags & NLM_F_EXCL)) err = 0; } else if (err == -ENOTEMPTY) { @@ -5328,11 +5415,24 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, */ void nft_data_hold(const struct nft_data *data, enum nft_data_types type) { + struct nft_chain *chain; + struct nft_rule *rule; + if (type == NFT_DATA_VERDICT) { switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: - data->verdict.chain->use++; + chain = data->verdict.chain; + chain->use++; + + if (!nft_chain_is_bound(chain)) + break; + + chain->table->use++; + list_for_each_entry(rule, &chain->rules, list) + chain->use++; + + nft_chain_add(chain->table, chain); break; } } @@ -5545,7 +5645,7 @@ struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, /** * nft_register_obj- register nf_tables stateful object type - * @obj: object type + * @obj_type: object type * * Registers the object type for use with nf_tables. Returns zero on * success or a negative errno code otherwise. @@ -5564,7 +5664,7 @@ EXPORT_SYMBOL_GPL(nft_register_obj); /** * nft_unregister_obj - unregister nf_tables object type - * @obj: object type + * @obj_type: object type * * Unregisters the object type for use with nf_tables. */ @@ -6243,7 +6343,7 @@ void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, case NFT_TRANS_ABORT: case NFT_TRANS_RELEASE: flowtable->use--; - /* fall through */ + fallthrough; default: return; } @@ -6405,7 +6505,7 @@ static int nft_register_flowtable_net_hooks(struct net *net, list_for_each_entry(hook2, &ft->hook_list, list) { if (hook->ops.dev == hook2->ops.dev && hook->ops.pf == hook2->ops.pf) { - err = -EBUSY; + err = -EEXIST; goto err_unregister_net_hooks; } } @@ -7264,7 +7364,7 @@ static int nf_tables_validate(struct net *net) break; case NFT_VALIDATE_NEED: nft_validate_state_update(net, NFT_VALIDATE_DO); - /* fall through */ + fallthrough; case NFT_VALIDATE_DO: list_for_each_entry(table, &net->nft.tables, list) { if (nft_table_validate(net, table) < 0) @@ -7402,6 +7502,12 @@ static void nf_tables_trans_destroy_work(struct work_struct *w) } } +void nf_tables_trans_destroy_flush_work(void) +{ + flush_work(&trans_destroy_work); +} +EXPORT_SYMBOL_GPL(nf_tables_trans_destroy_flush_work); + static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain) { struct nft_rule *rule; @@ -7524,7 +7630,7 @@ static void nft_obj_del(struct nft_object *obj) list_del_rcu(&obj->list); } -static void nft_chain_del(struct nft_chain *chain) +void nft_chain_del(struct nft_chain *chain) { struct nft_table *table = chain->table; @@ -7584,9 +7690,9 @@ static void nf_tables_commit_release(struct net *net) spin_unlock(&nf_tables_destroy_list_lock); nf_tables_module_autoload_cleanup(net); - mutex_unlock(&net->nft.commit_mutex); - schedule_work(&trans_destroy_work); + + mutex_unlock(&net->nft.commit_mutex); } static int nf_tables_commit(struct net *net, struct sk_buff *skb) @@ -7875,6 +7981,10 @@ static int __nf_tables_abort(struct net *net, bool autoload) kfree(nft_trans_chain_name(trans)); nft_trans_destroy(trans); } else { + if (nft_chain_is_bound(trans->ctx.chain)) { + nft_trans_destroy(trans); + break; + } trans->ctx.table->use--; nft_chain_del(trans->ctx.chain); nf_tables_unregister_hook(trans->ctx.net, @@ -8304,6 +8414,7 @@ static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = { [NFTA_VERDICT_CODE] = { .type = NLA_U32 }, [NFTA_VERDICT_CHAIN] = { .type = NLA_STRING, .len = NFT_CHAIN_MAXNAMELEN - 1 }, + [NFTA_VERDICT_CHAIN_ID] = { .type = NLA_U32 }, }; static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, @@ -8333,17 +8444,26 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, default: return -EINVAL; } - /* fall through */ + fallthrough; case NFT_CONTINUE: case NFT_BREAK: case NFT_RETURN: break; case NFT_JUMP: case NFT_GOTO: - if (!tb[NFTA_VERDICT_CHAIN]) + if (tb[NFTA_VERDICT_CHAIN]) { + chain = nft_chain_lookup(ctx->net, ctx->table, + tb[NFTA_VERDICT_CHAIN], + genmask); + } else if (tb[NFTA_VERDICT_CHAIN_ID]) { + chain = nft_chain_lookup_byid(ctx->net, + tb[NFTA_VERDICT_CHAIN_ID]); + if (IS_ERR(chain)) + return PTR_ERR(chain); + } else { return -EINVAL; - chain = nft_chain_lookup(ctx->net, ctx->table, - tb[NFTA_VERDICT_CHAIN], genmask); + } + if (IS_ERR(chain)) return PTR_ERR(chain); if (nft_is_base_chain(chain)) @@ -8361,10 +8481,23 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, static void nft_verdict_uninit(const struct nft_data *data) { + struct nft_chain *chain; + struct nft_rule *rule; + switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: - data->verdict.chain->use--; + chain = data->verdict.chain; + chain->use--; + + if (!nft_chain_is_bound(chain)) + break; + + chain->table->use--; + list_for_each_entry(rule, &chain->rules, list) + chain->use--; + + nft_chain_del(chain); break; } } diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 96c74c4c7176..587897a2498b 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -213,7 +213,7 @@ next_rule: jumpstack[stackptr].chain = chain; jumpstack[stackptr].rules = rules + 1; stackptr++; - /* fall through */ + fallthrough; case NFT_GOTO: nft_trace_packet(&info, chain, rule, NFT_TRACETYPE_RULE); diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index c7cf1cde46de..9ef37c1b7b3b 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -312,7 +312,7 @@ static int nft_indr_block_offload_cmd(struct nft_base_chain *basechain, nft_flow_block_offload_init(&bo, dev_net(dev), cmd, basechain, &extack); - err = flow_indr_dev_setup_offload(dev, TC_SETUP_BLOCK, basechain, &bo, + err = flow_indr_dev_setup_offload(dev, NULL, TC_SETUP_BLOCK, basechain, &bo, nft_indr_block_cleanup); if (err < 0) return err; diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 5827117f2635..5bfec829c12f 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * (C) 2011 Pablo Neira Ayuso <pablo@netfilter.org> - * (C) 2011 Intra2net AG <http://www.intra2net.com> + * (C) 2011 Intra2net AG <https://www.intra2net.com> */ #include <linux/init.h> #include <linux/module.h> diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index da915c224a82..89a381f7f945 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -451,7 +451,7 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, case IPPROTO_TCP: timeouts = nf_tcp_pernet(net)->timeouts; break; - case IPPROTO_UDP: /* fallthrough */ + case IPPROTO_UDP: case IPPROTO_UDPLITE: timeouts = nf_udp_pernet(net)->timeouts; break; diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 8a28c127effc..16f4d84599ac 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -43,7 +43,7 @@ void nft_cmp_eval(const struct nft_expr *expr, case NFT_CMP_LT: if (d == 0) goto mismatch; - /* fall through */ + fallthrough; case NFT_CMP_LTE: if (d > 0) goto mismatch; @@ -51,7 +51,7 @@ void nft_cmp_eval(const struct nft_expr *expr, case NFT_CMP_GT: if (d == 0) goto mismatch; - /* fall through */ + fallthrough; case NFT_CMP_GTE: if (d < 0) goto mismatch; diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index aa1a066cb74b..6428856ccbec 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -27,6 +27,8 @@ struct nft_xt_match_priv { void *info; }; +static refcount_t nft_compat_pending_destroy = REFCOUNT_INIT(1); + static int nft_compat_chain_validate_dependency(const struct nft_ctx *ctx, const char *tablename) { @@ -236,6 +238,15 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv); + /* xtables matches or targets can have side effects, e.g. + * creation/destruction of /proc files. + * The xt ->destroy functions are run asynchronously from + * work queue. If we have pending invocations we thus + * need to wait for those to finish. + */ + if (refcount_read(&nft_compat_pending_destroy) > 1) + nf_tables_trans_destroy_flush_work(); + ret = xt_check_target(&par, size, proto, inv); if (ret < 0) return ret; @@ -247,6 +258,13 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return 0; } +static void __nft_mt_tg_destroy(struct module *me, const struct nft_expr *expr) +{ + refcount_dec(&nft_compat_pending_destroy); + module_put(me); + kfree(expr->ops); +} + static void nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { @@ -262,8 +280,7 @@ nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) if (par.target->destroy != NULL) par.target->destroy(&par); - module_put(me); - kfree(expr->ops); + __nft_mt_tg_destroy(me, expr); } static int nft_extension_dump_info(struct sk_buff *skb, int attr, @@ -494,8 +511,7 @@ __nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr, if (par.match->destroy != NULL) par.match->destroy(&par); - module_put(me); - kfree(expr->ops); + __nft_mt_tg_destroy(me, expr); } static void @@ -700,6 +716,14 @@ static const struct nfnetlink_subsystem nfnl_compat_subsys = { static struct nft_expr_type nft_match_type; +static void nft_mt_tg_deactivate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + enum nft_trans_phase phase) +{ + if (phase == NFT_TRANS_COMMIT) + refcount_inc(&nft_compat_pending_destroy); +} + static const struct nft_expr_ops * nft_match_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -738,6 +762,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, ops->type = &nft_match_type; ops->eval = nft_match_eval; ops->init = nft_match_init; + ops->deactivate = nft_mt_tg_deactivate, ops->destroy = nft_match_destroy; ops->dump = nft_match_dump; ops->validate = nft_match_validate; @@ -828,6 +853,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, ops->size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize)); ops->init = nft_target_init; ops->destroy = nft_target_destroy; + ops->deactivate = nft_mt_tg_deactivate, ops->dump = nft_target_dump; ops->validate = nft_target_validate; ops->data = target; @@ -891,6 +917,8 @@ static void __exit nft_compat_module_exit(void) nfnetlink_subsys_unregister(&nfnl_compat_subsys); nft_unregister_expr(&nft_target_type); nft_unregister_expr(&nft_match_type); + + WARN_ON_ONCE(refcount_read(&nft_compat_pending_destroy) != 1); } MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFT_COMPAT); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 77258af1fce0..322bd674963e 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -129,7 +129,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr, return; } #endif - case NFT_CT_BYTES: /* fallthrough */ + case NFT_CT_BYTES: case NFT_CT_PKTS: { const struct nf_conn_acct *acct = nf_conn_acct_find(ct); u64 count = 0; @@ -1013,8 +1013,8 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, help6 = nf_conntrack_helper_try_module_get(name, family, priv->l4proto); break; - case NFPROTO_NETDEV: /* fallthrough */ - case NFPROTO_BRIDGE: /* same */ + case NFPROTO_NETDEV: + case NFPROTO_BRIDGE: case NFPROTO_INET: help4 = nf_conntrack_helper_try_module_get(name, NFPROTO_IPV4, priv->l4proto); diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index cfac0964f48d..4dfdaeaf09a5 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -32,7 +32,7 @@ int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, unsigned int hooks; switch (priv->result) { - case NFT_FIB_RESULT_OIF: /* fallthrough */ + case NFT_FIB_RESULT_OIF: case NFT_FIB_RESULT_OIFNAME: hooks = (1 << NF_INET_PRE_ROUTING); break; diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index c7f0ef73d939..c63eb3b17178 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -54,6 +54,23 @@ static int nft_immediate_init(const struct nft_ctx *ctx, if (err < 0) goto err1; + if (priv->dreg == NFT_REG_VERDICT) { + struct nft_chain *chain = priv->data.verdict.chain; + + switch (priv->data.verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + if (nft_chain_is_bound(chain)) { + err = -EBUSY; + goto err1; + } + chain->bound = true; + break; + default: + break; + } + } + return 0; err1: @@ -81,6 +98,39 @@ static void nft_immediate_deactivate(const struct nft_ctx *ctx, return nft_data_release(&priv->data, nft_dreg_to_type(priv->dreg)); } +static void nft_immediate_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + const struct nft_data *data = &priv->data; + struct nft_rule *rule, *n; + struct nft_ctx chain_ctx; + struct nft_chain *chain; + + if (priv->dreg != NFT_REG_VERDICT) + return; + + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + chain = data->verdict.chain; + + if (!nft_chain_is_bound(chain)) + break; + + chain_ctx = *ctx; + chain_ctx.chain = chain; + + list_for_each_entry_safe(rule, n, &chain->rules, list) + nf_tables_rule_release(&chain_ctx, rule); + + nf_tables_chain_destroy(&chain_ctx); + break; + default: + break; + } +} + static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); @@ -170,6 +220,7 @@ static const struct nft_expr_ops nft_imm_ops = { .init = nft_immediate_init, .activate = nft_immediate_activate, .deactivate = nft_immediate_deactivate, + .destroy = nft_immediate_destroy, .dump = nft_immediate_dump, .validate = nft_immediate_validate, .offload = nft_immediate_offload, diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 951b6e87ed5d..7bc6537f3ccb 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -253,7 +253,7 @@ static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest, return false; break; case NFT_META_IIFGROUP: - if (!nft_meta_store_ifgroup(dest, nft_out(pkt))) + if (!nft_meta_store_ifgroup(dest, nft_in(pkt))) return false; break; case NFT_META_OIFGROUP: diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index a7de3a58f553..ed7cb9f747f6 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -467,7 +467,7 @@ static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt, case IPPROTO_UDP: if (!nft_payload_udp_checksum(skb, pkt->xt.thoff)) return -1; - /* Fall through. */ + fallthrough; case IPPROTO_UDPLITE: *l4csum_offset = offsetof(struct udphdr, check); break; diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index 86eafbb0fdd0..61fb7e8afbf0 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -30,7 +30,8 @@ int nft_reject_validate(const struct nft_ctx *ctx, return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | - (1 << NF_INET_LOCAL_OUT)); + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_PRE_ROUTING)); } EXPORT_SYMBOL_GPL(nft_reject_validate); diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 8c04388296b0..9944523f5c2c 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -312,7 +312,7 @@ * Jay Ligatti, Josh Kuhn, and Chris Gage. * Proceedings of the IEEE International Conference on Computer * Communication Networks (ICCCN), August 2010. - * http://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf + * https://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf * * [Rottenstreich 2010] * Worst-Case TCAM Rule Expansion @@ -325,7 +325,7 @@ * Kirill Kogan, Sergey Nikolenko, Ori Rottenstreich, William Culhane, * and Patrick Eugster. * Proceedings of the 2014 ACM conference on SIGCOMM, August 2014. - * http://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf + * https://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf */ #include <linux/kernel.h> @@ -401,7 +401,7 @@ int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst, * nft_pipapo_lookup() - Lookup function * @net: Network namespace * @set: nftables API set representation - * @elem: nftables API element representation containing key data + * @key: nftables API element representation containing key data * @ext: nftables API extension pointer, filled with matching reference * * For more details, see DOC: Theory of Operation. @@ -1075,7 +1075,7 @@ out: * @m: Matching data, including mapping table * @map: Table of rule maps: array of first rule and amount of rules * in next field a given rule maps to, for each field - * @ext: For last field, nft_set_ext pointer matching rules map to + * @e: For last field, nft_set_ext pointer matching rules map to */ static void pipapo_map(struct nft_pipapo_match *m, union nft_pipapo_map_bucket map[NFT_PIPAPO_MAX_FIELDS], @@ -1099,7 +1099,7 @@ static void pipapo_map(struct nft_pipapo_match *m, /** * pipapo_realloc_scratch() - Reallocate scratch maps for partial match results * @clone: Copy of matching data with pending insertions and deletions - * @bsize_max Maximum bucket size, scratch maps cover two buckets + * @bsize_max: Maximum bucket size, scratch maps cover two buckets * * Return: 0 on success, -ENOMEM on failure. */ @@ -1249,8 +1249,6 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, if (err) return err; - this_cpu_write(nft_pipapo_scratch_index, false); - m->bsize_max = bsize_max; } else { put_cpu_ptr(m->scratch); @@ -1449,7 +1447,7 @@ static void pipapo_unmap(union nft_pipapo_map_bucket *mt, int rules, /** * pipapo_drop() - Delete entry from lookup and mapping tables, given rule map * @m: Matching data - * @rulemap Table of rule maps, arrays of first rule and amount of rules + * @rulemap: Table of rule maps, arrays of first rule and amount of rules * in next field a given entry maps to, for each field * * For each rule in lookup table buckets mapping to this set of rules, drop diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c index 51b454d8fa9c..cedf47ab3c6f 100644 --- a/net/netfilter/utils.c +++ b/net/netfilter/utils.c @@ -25,7 +25,7 @@ __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, skb->ip_summed = CHECKSUM_UNNECESSARY; break; } - /* fall through */ + fallthrough; case CHECKSUM_NONE: if (protocol != IPPROTO_TCP && protocol != IPPROTO_UDP) skb->csum = 0; @@ -51,7 +51,7 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, case CHECKSUM_COMPLETE: if (len == skb->len - dataoff) return nf_ip_checksum(skb, hook, dataoff, protocol); - /* fall through */ + fallthrough; case CHECKSUM_NONE: skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol, skb->len - dataoff, 0); @@ -79,7 +79,7 @@ __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, skb->ip_summed = CHECKSUM_UNNECESSARY; break; } - /* fall through */ + fallthrough; case CHECKSUM_NONE: skb->csum = ~csum_unfold( csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, @@ -106,7 +106,7 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, case CHECKSUM_COMPLETE: if (len == skb->len - dataoff) return nf_ip6_checksum(skb, hook, dataoff, protocol); - /* fall through */ + fallthrough; case CHECKSUM_NONE: hsum = skb_checksum(skb, 0, dataoff, 0); skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr, diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 9ad8f3ff66f5..af22dbe85e2c 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1028,34 +1028,34 @@ int xt_check_target(struct xt_tgchk_param *par, EXPORT_SYMBOL_GPL(xt_check_target); /** - * xt_copy_counters_from_user - copy counters and metadata from userspace + * xt_copy_counters - copy counters and metadata from a sockptr_t * - * @user: src pointer to userspace memory + * @arg: src sockptr * @len: alleged size of userspace memory * @info: where to store the xt_counters_info metadata - * @compat: true if we setsockopt call is done by 32bit task on 64bit kernel * * Copies counter meta data from @user and stores it in @info. * * vmallocs memory to hold the counters, then copies the counter data * from @user to the new memory and returns a pointer to it. * - * If @compat is true, @info gets converted automatically to the 64bit - * representation. + * If called from a compat syscall, @info gets converted automatically to the + * 64bit representation. * * The metadata associated with the counters is stored in @info. * * Return: returns pointer that caller has to test via IS_ERR(). * If IS_ERR is false, caller has to vfree the pointer. */ -void *xt_copy_counters_from_user(const void __user *user, unsigned int len, - struct xt_counters_info *info, bool compat) +void *xt_copy_counters(sockptr_t arg, unsigned int len, + struct xt_counters_info *info) { + size_t offset; void *mem; u64 size; #ifdef CONFIG_COMPAT - if (compat) { + if (in_compat_syscall()) { /* structures only differ in size due to alignment */ struct compat_xt_counters_info compat_tmp; @@ -1063,12 +1063,12 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len, return ERR_PTR(-EINVAL); len -= sizeof(compat_tmp); - if (copy_from_user(&compat_tmp, user, sizeof(compat_tmp)) != 0) + if (copy_from_sockptr(&compat_tmp, arg, sizeof(compat_tmp)) != 0) return ERR_PTR(-EFAULT); memcpy(info->name, compat_tmp.name, sizeof(info->name) - 1); info->num_counters = compat_tmp.num_counters; - user += sizeof(compat_tmp); + offset = sizeof(compat_tmp); } else #endif { @@ -1076,10 +1076,10 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len, return ERR_PTR(-EINVAL); len -= sizeof(*info); - if (copy_from_user(info, user, sizeof(*info)) != 0) + if (copy_from_sockptr(info, arg, sizeof(*info)) != 0) return ERR_PTR(-EFAULT); - user += sizeof(*info); + offset = sizeof(*info); } info->name[sizeof(info->name) - 1] = '\0'; @@ -1093,13 +1093,13 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len, if (!mem) return ERR_PTR(-ENOMEM); - if (copy_from_user(mem, user, len) == 0) + if (copy_from_sockptr_offset(mem, arg, offset, len) == 0) return mem; vfree(mem); return ERR_PTR(-EFAULT); } -EXPORT_SYMBOL_GPL(xt_copy_counters_from_user); +EXPORT_SYMBOL_GPL(xt_copy_counters); #ifdef CONFIG_COMPAT int xt_compat_target_offset(const struct xt_target *target) @@ -1572,7 +1572,7 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos, trav->curr = trav->curr->next; if (trav->curr != trav->head) break; - /* fall through */ + fallthrough; default: return NULL; } diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c index a5c8b653476a..76acecf3e757 100644 --- a/net/netfilter/xt_CONNSECMARK.c +++ b/net/netfilter/xt_CONNSECMARK.c @@ -6,7 +6,7 @@ * with the SECMARK target and state match. * * Based somewhat on CONNMARK: - * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com> + * Copyright (C) 2002,2004 MARA Systems AB <https://www.marasystems.com> * by Henrik Nordstrom <hno@marasystems.com> * * (C) 2006,2008 Red Hat, Inc., James Morris <jmorris@redhat.com> diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index eec2f3a88d73..e5ebc0810675 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -2,7 +2,7 @@ /* * xt_connmark - Netfilter module to operate on connection marks * - * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com> + * Copyright (C) 2002,2004 MARA Systems AB <https://www.marasystems.com> * by Henrik Nordstrom <hno@marasystems.com> * Copyright © CC Computer Consultants GmbH, 2007 - 2008 * Jan Engelhardt <jengelh@medozas.de> diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c index 5aab6df74e0f..a97c2259bbc8 100644 --- a/net/netfilter/xt_nfacct.c +++ b/net/netfilter/xt_nfacct.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * (C) 2011 Pablo Neira Ayuso <pablo@netfilter.org> - * (C) 2011 Intra2net AG <http://www.intra2net.com> + * (C) 2011 Intra2net AG <https://www.intra2net.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index 67cb98489415..6aa12d0f54e2 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -5,7 +5,7 @@ * based on ipt_time by Fabrice MARIE <fabrice@netfilter.org> * This is a module which is used for time matching * It is using some modified code from dietlibc (localtime() function) - * that you can find at http://www.fefe.de/dietlibc/ + * that you can find at https://www.fefe.de/dietlibc/ * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from gnu.org/gpl. */ diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c index a1f2320ecc16..d07de2c0fbc7 100644 --- a/net/netlabel/netlabel_domainhash.c +++ b/net/netlabel/netlabel_domainhash.c @@ -92,7 +92,7 @@ static void netlbl_domhsh_free_entry(struct rcu_head *entry) /** * netlbl_domhsh_hash - Hashing function for the domain hash table - * @domain: the domain name to hash + * @key: the domain name to hash * * Description: * This is the hashing function for the domain hash table, it returns the diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 4f2c3b14ddbf..b5f30d7d30d0 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -60,6 +60,7 @@ #include <linux/genetlink.h> #include <linux/net_namespace.h> #include <linux/nospec.h> +#include <linux/btf_ids.h> #include <net/net_namespace.h> #include <net/netns/generic.h> @@ -1620,7 +1621,7 @@ static void netlink_update_socket_mc(struct netlink_sock *nlk, } static int netlink_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); @@ -1631,7 +1632,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, return -ENOPROTOOPT; if (optlen >= sizeof(int) && - get_user(val, (unsigned int __user *)optval)) + copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (optname) { @@ -2803,21 +2804,29 @@ static const struct rhashtable_params netlink_rhashtable_params = { }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) -static const struct bpf_iter_reg netlink_reg_info = { - .target = "netlink", +BTF_ID_LIST(btf_netlink_sock_id) +BTF_ID(struct, netlink_sock) + +static const struct bpf_iter_seq_info netlink_seq_info = { .seq_ops = &netlink_seq_ops, .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct nl_seq_iter), +}; + +static struct bpf_iter_reg netlink_reg_info = { + .target = "netlink", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__netlink, sk), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &netlink_seq_info, }; static int __init bpf_iter_register(void) { + netlink_reg_info.ctx_arg_info[0].btf_id = *btf_netlink_sock_id; return bpf_iter_reg_target(&netlink_reg_info); } #endif diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index f90ef6934b8f..6d16e1ab1a8a 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -294,7 +294,7 @@ void nr_destroy_socket(struct sock *sk) */ static int nr_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); @@ -306,7 +306,7 @@ static int nr_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(unsigned int)) return -EINVAL; - if (get_user(opt, (unsigned int __user *)optval)) + if (copy_from_sockptr(&opt, optval, sizeof(unsigned int))) return -EFAULT; switch (optname) { diff --git a/net/nfc/core.c b/net/nfc/core.c index c5f9c3ee82f8..eb377f87bcae 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -704,7 +704,6 @@ EXPORT_SYMBOL(nfc_tm_deactivated); * nfc_alloc_send_skb - allocate a skb for data exchange responses * * @size: size to allocate - * @gfp: gfp flags */ struct sk_buff *nfc_alloc_send_skb(struct nfc_dev *dev, struct sock *sk, unsigned int flags, unsigned int size, @@ -749,7 +748,7 @@ EXPORT_SYMBOL(nfc_alloc_recv_skb); * * @dev: The nfc device that found the targets * @targets: array of nfc targets found - * @ntargets: targets array size + * @n_targets: targets array size * * The device driver must call this function when one or many nfc targets * are found. After calling this function, the device driver must stop diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 28604414dec1..d257ed3b732a 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -218,7 +218,7 @@ error: } static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); @@ -241,7 +241,7 @@ static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname, break; } - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } @@ -263,7 +263,7 @@ static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname, break; } - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } @@ -921,8 +921,6 @@ static const struct proto_ops llcp_rawsock_ops = { .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .sendmsg = sock_no_sendmsg, .recvmsg = llcp_sock_recvmsg, .mmap = sock_no_mmap, diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 78ea8c94dcba..741da8f81c2b 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -1182,7 +1182,7 @@ EXPORT_SYMBOL(nci_free_device); /** * nci_register_device - register a nci device in the nfc subsystem * - * @dev: The nci device to register + * @ndev: The nci device to register */ int nci_register_device(struct nci_dev *ndev) { @@ -1249,7 +1249,7 @@ EXPORT_SYMBOL(nci_register_device); /** * nci_unregister_device - unregister a nci device in the nfc subsystem * - * @dev: The nci device to unregister + * @ndev: The nci device to unregister */ void nci_unregister_device(struct nci_dev *ndev) { diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index ba5ffd3badd3..b2061b6746ea 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -276,8 +276,6 @@ static const struct proto_ops rawsock_ops = { .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .sendmsg = rawsock_sendmsg, .recvmsg = rawsock_recvmsg, .mmap = sock_no_mmap, @@ -296,8 +294,6 @@ static const struct proto_ops rawsock_raw_ops = { .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .sendmsg = sock_no_sendmsg, .recvmsg = rawsock_recvmsg, .mmap = sock_no_mmap, diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 4340f25fe390..98d393e70de3 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -276,10 +276,6 @@ void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) ovs_ct_update_key(skb, NULL, key, false, false); } -#define IN6_ADDR_INITIALIZER(ADDR) \ - { (ADDR).s6_addr32[0], (ADDR).s6_addr32[1], \ - (ADDR).s6_addr32[2], (ADDR).s6_addr32[3] } - int ovs_ct_put_key(const struct sw_flow_key *swkey, const struct sw_flow_key *output, struct sk_buff *skb) { @@ -301,24 +297,30 @@ int ovs_ct_put_key(const struct sw_flow_key *swkey, if (swkey->ct_orig_proto) { if (swkey->eth.type == htons(ETH_P_IP)) { - struct ovs_key_ct_tuple_ipv4 orig = { - output->ipv4.ct_orig.src, - output->ipv4.ct_orig.dst, - output->ct.orig_tp.src, - output->ct.orig_tp.dst, - output->ct_orig_proto, - }; + struct ovs_key_ct_tuple_ipv4 orig; + + memset(&orig, 0, sizeof(orig)); + orig.ipv4_src = output->ipv4.ct_orig.src; + orig.ipv4_dst = output->ipv4.ct_orig.dst; + orig.src_port = output->ct.orig_tp.src; + orig.dst_port = output->ct.orig_tp.dst; + orig.ipv4_proto = output->ct_orig_proto; + if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4, sizeof(orig), &orig)) return -EMSGSIZE; } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - struct ovs_key_ct_tuple_ipv6 orig = { - IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.src), - IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.dst), - output->ct.orig_tp.src, - output->ct.orig_tp.dst, - output->ct_orig_proto, - }; + struct ovs_key_ct_tuple_ipv6 orig; + + memset(&orig, 0, sizeof(orig)); + memcpy(orig.ipv6_src, output->ipv6.ct_orig.src.s6_addr32, + sizeof(orig.ipv6_src)); + memcpy(orig.ipv6_dst, output->ipv6.ct_orig.dst.s6_addr32, + sizeof(orig.ipv6_dst)); + orig.src_port = output->ct.orig_tp.src; + orig.dst_port = output->ct.orig_tp.dst; + orig.ipv6_proto = output->ct_orig_proto; + if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6, sizeof(orig), &orig)) return -EMSGSIZE; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 94b024534987..42f8cc70bb2c 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -130,6 +130,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, const struct dp_upcall_info *, uint32_t cutlen); +static void ovs_dp_masks_rebalance(struct work_struct *work); + /* Must be called with rcu_read_lock or ovs_mutex. */ const char *ovs_dp_name(const struct datapath *dp) { @@ -223,13 +225,14 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) struct dp_stats_percpu *stats; u64 *stats_counter; u32 n_mask_hit; + u32 n_cache_hit; int error; stats = this_cpu_ptr(dp->stats_percpu); /* Look up flow. */ flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb), - &n_mask_hit); + &n_mask_hit, &n_cache_hit); if (unlikely(!flow)) { struct dp_upcall_info upcall; @@ -260,6 +263,7 @@ out: u64_stats_update_begin(&stats->syncp); (*stats_counter)++; stats->n_mask_hit += n_mask_hit; + stats->n_cache_hit += n_cache_hit; u64_stats_update_end(&stats->syncp); } @@ -697,6 +701,7 @@ static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats, stats->n_missed += local_stats.n_missed; stats->n_lost += local_stats.n_lost; mega_stats->n_mask_hit += local_stats.n_mask_hit; + mega_stats->n_cache_hit += local_stats.n_cache_hit; } } @@ -1493,6 +1498,7 @@ static size_t ovs_dp_cmd_msg_size(void) msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_stats)); msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_megaflow_stats)); msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */ + msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_MASKS_CACHE_SIZE */ return msgsize; } @@ -1530,6 +1536,10 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features)) goto nla_put_failure; + if (nla_put_u32(skb, OVS_DP_ATTR_MASKS_CACHE_SIZE, + ovs_flow_tbl_masks_cache_size(&dp->table))) + goto nla_put_failure; + genlmsg_end(skb, ovs_header); return 0; @@ -1594,6 +1604,16 @@ static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) #endif } + if (a[OVS_DP_ATTR_MASKS_CACHE_SIZE]) { + int err; + u32 cache_size; + + cache_size = nla_get_u32(a[OVS_DP_ATTR_MASKS_CACHE_SIZE]); + err = ovs_flow_tbl_masks_cache_resize(&dp->table, cache_size); + if (err) + return err; + } + dp->user_features = user_features; if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) @@ -1882,6 +1902,8 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 }, [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 }, + [OVS_DP_ATTR_MASKS_CACHE_SIZE] = NLA_POLICY_RANGE(NLA_U32, 0, + PCPU_MIN_UNIT_SIZE / sizeof(struct mask_cache_entry)), }; static const struct genl_ops dp_datapath_genl_ops[] = { @@ -2338,6 +2360,23 @@ out: return skb->len; } +static void ovs_dp_masks_rebalance(struct work_struct *work) +{ + struct ovs_net *ovs_net = container_of(work, struct ovs_net, + masks_rebalance.work); + struct datapath *dp; + + ovs_lock(); + + list_for_each_entry(dp, &ovs_net->dps, list_node) + ovs_flow_masks_rebalance(&dp->table); + + ovs_unlock(); + + schedule_delayed_work(&ovs_net->masks_rebalance, + msecs_to_jiffies(DP_MASKS_REBALANCE_INTERVAL)); +} + static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) }, @@ -2432,6 +2471,9 @@ static int __net_init ovs_init_net(struct net *net) INIT_LIST_HEAD(&ovs_net->dps); INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq); + INIT_DELAYED_WORK(&ovs_net->masks_rebalance, ovs_dp_masks_rebalance); + schedule_delayed_work(&ovs_net->masks_rebalance, + msecs_to_jiffies(DP_MASKS_REBALANCE_INTERVAL)); return ovs_ct_init(net); } @@ -2486,6 +2528,7 @@ static void __net_exit ovs_exit_net(struct net *dnet) ovs_unlock(); + cancel_delayed_work_sync(&ovs_net->masks_rebalance); cancel_work_sync(&ovs_net->dp_notify_work); } diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 2016dd107939..38f7d3e66ca6 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -20,8 +20,9 @@ #include "meter.h" #include "vport-internal_dev.h" -#define DP_MAX_PORTS USHRT_MAX -#define DP_VPORT_HASH_BUCKETS 1024 +#define DP_MAX_PORTS USHRT_MAX +#define DP_VPORT_HASH_BUCKETS 1024 +#define DP_MASKS_REBALANCE_INTERVAL 4000 /** * struct dp_stats_percpu - per-cpu packet processing statistics for a given @@ -37,12 +38,15 @@ * @n_mask_hit: Number of masks looked up for flow match. * @n_mask_hit / (@n_hit + @n_missed) will be the average masks looked * up per packet. + * @n_cache_hit: The number of received packets that had their mask found using + * the mask cache. */ struct dp_stats_percpu { u64 n_hit; u64 n_missed; u64 n_lost; u64 n_mask_hit; + u64 n_cache_hit; struct u64_stats_sync syncp; }; @@ -131,6 +135,7 @@ struct dp_upcall_info { struct ovs_net { struct list_head dps; struct work_struct dp_notify_work; + struct delayed_work masks_rebalance; #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) struct ovs_ct_limit_info *ct_limit_info; #endif diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 9d375e74b607..03942c30d83e 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -890,6 +890,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, if (static_branch_unlikely(&tc_recirc_sharing_support)) { tc_ext = skb_ext_find(skb, TC_SKB_EXT); key->recirc_id = tc_ext ? tc_ext->chain : 0; + OVS_CB(skb)->mru = tc_ext ? tc_ext->mru : 0; } else { key->recirc_id = 0; } diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 79252d4887ff..9d3e50c4d29f 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1763,11 +1763,11 @@ static void mask_set_nlattr(struct nlattr *attr, u8 val) * does not include any don't care bit. * @net: Used to determine per-namespace field support. * @match: receives the extracted flow match information. - * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute + * @nla_key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute * sequence. The fields should of the packet that triggered the creation * of this flow. - * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink - * attribute specifies the mask field of the wildcarded flow. + * @nla_mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* + * Netlink attribute specifies the mask field of the wildcarded flow. * @log: Boolean to allow kernel error logging. Normally true, but when * probing for feature compatibility this should be passed in as false to * suppress unnecessary error logging. diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 2398d7238300..8c12675cbb67 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -29,6 +29,7 @@ #include <linux/icmp.h> #include <linux/icmpv6.h> #include <linux/rculist.h> +#include <linux/sort.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ndisc.h> @@ -37,8 +38,8 @@ #define MASK_ARRAY_SIZE_MIN 16 #define REHASH_INTERVAL (10 * 60 * HZ) +#define MC_DEFAULT_HASH_ENTRIES 256 #define MC_HASH_SHIFT 8 -#define MC_HASH_ENTRIES (1u << MC_HASH_SHIFT) #define MC_HASH_SEGS ((sizeof(uint32_t) * 8) / MC_HASH_SHIFT) static struct kmem_cache *flow_cache; @@ -169,16 +170,70 @@ static struct table_instance *table_instance_alloc(int new_size) return ti; } +static void __mask_array_destroy(struct mask_array *ma) +{ + free_percpu(ma->masks_usage_cntr); + kfree(ma); +} + +static void mask_array_rcu_cb(struct rcu_head *rcu) +{ + struct mask_array *ma = container_of(rcu, struct mask_array, rcu); + + __mask_array_destroy(ma); +} + +static void tbl_mask_array_reset_counters(struct mask_array *ma) +{ + int i, cpu; + + /* As the per CPU counters are not atomic we can not go ahead and + * reset them from another CPU. To be able to still have an approximate + * zero based counter we store the value at reset, and subtract it + * later when processing. + */ + for (i = 0; i < ma->max; i++) { + ma->masks_usage_zero_cntr[i] = 0; + + for_each_possible_cpu(cpu) { + u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr, + cpu); + unsigned int start; + u64 counter; + + do { + start = u64_stats_fetch_begin_irq(&ma->syncp); + counter = usage_counters[i]; + } while (u64_stats_fetch_retry_irq(&ma->syncp, start)); + + ma->masks_usage_zero_cntr[i] += counter; + } + } +} + static struct mask_array *tbl_mask_array_alloc(int size) { struct mask_array *new; size = max(MASK_ARRAY_SIZE_MIN, size); new = kzalloc(sizeof(struct mask_array) + - sizeof(struct sw_flow_mask *) * size, GFP_KERNEL); + sizeof(struct sw_flow_mask *) * size + + sizeof(u64) * size, GFP_KERNEL); if (!new) return NULL; + new->masks_usage_zero_cntr = (u64 *)((u8 *)new + + sizeof(struct mask_array) + + sizeof(struct sw_flow_mask *) * + size); + + new->masks_usage_cntr = __alloc_percpu(sizeof(u64) * size, + __alignof__(u64)); + if (!new->masks_usage_cntr) { + kfree(new); + return NULL; + } + new->count = 0; new->max = size; @@ -202,10 +257,10 @@ static int tbl_mask_array_realloc(struct flow_table *tbl, int size) if (ovsl_dereference(old->masks[i])) new->masks[new->count++] = old->masks[i]; } + call_rcu(&old->rcu, mask_array_rcu_cb); } rcu_assign_pointer(tbl->mask_array, new); - kfree_rcu(old, rcu); return 0; } @@ -223,6 +278,11 @@ static int tbl_mask_array_add_mask(struct flow_table *tbl, return err; ma = ovsl_dereference(tbl->mask_array); + } else { + /* On every add or delete we need to reset the counters so + * every new mask gets a fair chance of being prioritized. + */ + tbl_mask_array_reset_counters(ma); } BUG_ON(ovsl_dereference(ma->masks[ma_count])); @@ -260,6 +320,9 @@ found: if (ma->max >= (MASK_ARRAY_SIZE_MIN * 2) && ma_count <= (ma->max / 3)) tbl_mask_array_realloc(tbl, ma->max / 2); + else + tbl_mask_array_reset_counters(ma); + } /* Remove 'mask' from the mask list, if it is not needed any more. */ @@ -278,15 +341,79 @@ static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask) } } +static void __mask_cache_destroy(struct mask_cache *mc) +{ + free_percpu(mc->mask_cache); + kfree(mc); +} + +static void mask_cache_rcu_cb(struct rcu_head *rcu) +{ + struct mask_cache *mc = container_of(rcu, struct mask_cache, rcu); + + __mask_cache_destroy(mc); +} + +static struct mask_cache *tbl_mask_cache_alloc(u32 size) +{ + struct mask_cache_entry __percpu *cache = NULL; + struct mask_cache *new; + + /* Only allow size to be 0, or a power of 2, and does not exceed + * percpu allocation size. + */ + if ((!is_power_of_2(size) && size != 0) || + (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE) + return NULL; + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + new->cache_size = size; + if (new->cache_size > 0) { + cache = __alloc_percpu(array_size(sizeof(struct mask_cache_entry), + new->cache_size), + __alignof__(struct mask_cache_entry)); + if (!cache) { + kfree(new); + return NULL; + } + } + + new->mask_cache = cache; + return new; +} +int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size) +{ + struct mask_cache *mc = rcu_dereference(table->mask_cache); + struct mask_cache *new; + + if (size == mc->cache_size) + return 0; + + if ((!is_power_of_2(size) && size != 0) || + (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE) + return -EINVAL; + + new = tbl_mask_cache_alloc(size); + if (!new) + return -ENOMEM; + + rcu_assign_pointer(table->mask_cache, new); + call_rcu(&mc->rcu, mask_cache_rcu_cb); + + return 0; +} + int ovs_flow_tbl_init(struct flow_table *table) { struct table_instance *ti, *ufid_ti; + struct mask_cache *mc; struct mask_array *ma; - table->mask_cache = __alloc_percpu(sizeof(struct mask_cache_entry) * - MC_HASH_ENTRIES, - __alignof__(struct mask_cache_entry)); - if (!table->mask_cache) + mc = tbl_mask_cache_alloc(MC_DEFAULT_HASH_ENTRIES); + if (!mc) return -ENOMEM; ma = tbl_mask_array_alloc(MASK_ARRAY_SIZE_MIN); @@ -304,6 +431,7 @@ int ovs_flow_tbl_init(struct flow_table *table) rcu_assign_pointer(table->ti, ti); rcu_assign_pointer(table->ufid_ti, ufid_ti); rcu_assign_pointer(table->mask_array, ma); + rcu_assign_pointer(table->mask_cache, mc); table->last_rehash = jiffies; table->count = 0; table->ufid_count = 0; @@ -312,9 +440,9 @@ int ovs_flow_tbl_init(struct flow_table *table) free_ti: __table_instance_destroy(ti); free_mask_array: - kfree(ma); + __mask_array_destroy(ma); free_mask_cache: - free_percpu(table->mask_cache); + __mask_cache_destroy(mc); return -ENOMEM; } @@ -390,9 +518,11 @@ void ovs_flow_tbl_destroy(struct flow_table *table) { struct table_instance *ti = rcu_dereference_raw(table->ti); struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti); + struct mask_cache *mc = rcu_dereference_raw(table->mask_cache); + struct mask_array *ma = rcu_dereference_raw(table->mask_array); - free_percpu(table->mask_cache); - kfree_rcu(rcu_dereference_raw(table->mask_array), rcu); + call_rcu(&mc->rcu, mask_cache_rcu_cb); + call_rcu(&ma->rcu, mask_array_rcu_cb); table_instance_destroy(table, ti, ufid_ti, false); } @@ -604,8 +734,10 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, struct mask_array *ma, const struct sw_flow_key *key, u32 *n_mask_hit, + u32 *n_cache_hit, u32 *index) { + u64 *usage_counters = this_cpu_ptr(ma->masks_usage_cntr); struct sw_flow *flow; struct sw_flow_mask *mask; int i; @@ -614,8 +746,13 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, mask = rcu_dereference_ovsl(ma->masks[*index]); if (mask) { flow = masked_flow_lookup(ti, key, mask, n_mask_hit); - if (flow) + if (flow) { + u64_stats_update_begin(&ma->syncp); + usage_counters[*index]++; + u64_stats_update_end(&ma->syncp); + (*n_cache_hit)++; return flow; + } } } @@ -631,6 +768,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, flow = masked_flow_lookup(ti, key, mask, n_mask_hit); if (flow) { /* Found */ *index = i; + u64_stats_update_begin(&ma->syncp); + usage_counters[*index]++; + u64_stats_update_end(&ma->syncp); return flow; } } @@ -648,8 +788,10 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, const struct sw_flow_key *key, u32 skb_hash, - u32 *n_mask_hit) + u32 *n_mask_hit, + u32 *n_cache_hit) { + struct mask_cache *mc = rcu_dereference(tbl->mask_cache); struct mask_array *ma = rcu_dereference(tbl->mask_array); struct table_instance *ti = rcu_dereference(tbl->ti); struct mask_cache_entry *entries, *ce; @@ -658,10 +800,13 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, int seg; *n_mask_hit = 0; - if (unlikely(!skb_hash)) { + *n_cache_hit = 0; + if (unlikely(!skb_hash || mc->cache_size == 0)) { u32 mask_index = 0; + u32 cache = 0; - return flow_lookup(tbl, ti, ma, key, n_mask_hit, &mask_index); + return flow_lookup(tbl, ti, ma, key, n_mask_hit, &cache, + &mask_index); } /* Pre and post recirulation flows usually have the same skb_hash @@ -672,17 +817,17 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, ce = NULL; hash = skb_hash; - entries = this_cpu_ptr(tbl->mask_cache); + entries = this_cpu_ptr(mc->mask_cache); /* Find the cache entry 'ce' to operate on. */ for (seg = 0; seg < MC_HASH_SEGS; seg++) { - int index = hash & (MC_HASH_ENTRIES - 1); + int index = hash & (mc->cache_size - 1); struct mask_cache_entry *e; e = &entries[index]; if (e->skb_hash == skb_hash) { flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, - &e->mask_index); + n_cache_hit, &e->mask_index); if (!flow) e->skb_hash = 0; return flow; @@ -695,10 +840,12 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, } /* Cache miss, do full lookup. */ - flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &ce->mask_index); + flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, n_cache_hit, + &ce->mask_index); if (flow) ce->skb_hash = skb_hash; + *n_cache_hit = 0; return flow; } @@ -708,9 +855,10 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array); u32 __always_unused n_mask_hit; + u32 __always_unused n_cache_hit; u32 index = 0; - return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &index); + return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index); } struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, @@ -787,6 +935,13 @@ int ovs_flow_tbl_num_masks(const struct flow_table *table) return READ_ONCE(ma->count); } +u32 ovs_flow_tbl_masks_cache_size(const struct flow_table *table) +{ + struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache); + + return READ_ONCE(mc->cache_size); +} + static struct table_instance *table_instance_expand(struct table_instance *ti, bool ufid) { @@ -934,6 +1089,98 @@ int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, return 0; } +static int compare_mask_and_count(const void *a, const void *b) +{ + const struct mask_count *mc_a = a; + const struct mask_count *mc_b = b; + + return (s64)mc_b->counter - (s64)mc_a->counter; +} + +/* Must be called with OVS mutex held. */ +void ovs_flow_masks_rebalance(struct flow_table *table) +{ + struct mask_array *ma = rcu_dereference_ovsl(table->mask_array); + struct mask_count *masks_and_count; + struct mask_array *new; + int masks_entries = 0; + int i; + + /* Build array of all current entries with use counters. */ + masks_and_count = kmalloc_array(ma->max, sizeof(*masks_and_count), + GFP_KERNEL); + if (!masks_and_count) + return; + + for (i = 0; i < ma->max; i++) { + struct sw_flow_mask *mask; + unsigned int start; + int cpu; + + mask = rcu_dereference_ovsl(ma->masks[i]); + if (unlikely(!mask)) + break; + + masks_and_count[i].index = i; + masks_and_count[i].counter = 0; + + for_each_possible_cpu(cpu) { + u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr, + cpu); + u64 counter; + + do { + start = u64_stats_fetch_begin_irq(&ma->syncp); + counter = usage_counters[i]; + } while (u64_stats_fetch_retry_irq(&ma->syncp, start)); + + masks_and_count[i].counter += counter; + } + + /* Subtract the zero count value. */ + masks_and_count[i].counter -= ma->masks_usage_zero_cntr[i]; + + /* Rather than calling tbl_mask_array_reset_counters() + * below when no change is needed, do it inline here. + */ + ma->masks_usage_zero_cntr[i] += masks_and_count[i].counter; + } + + if (i == 0) + goto free_mask_entries; + + /* Sort the entries */ + masks_entries = i; + sort(masks_and_count, masks_entries, sizeof(*masks_and_count), + compare_mask_and_count, NULL); + + /* If the order is the same, nothing to do... */ + for (i = 0; i < masks_entries; i++) { + if (i != masks_and_count[i].index) + break; + } + if (i == masks_entries) + goto free_mask_entries; + + /* Rebuilt the new list in order of usage. */ + new = tbl_mask_array_alloc(ma->max); + if (!new) + goto free_mask_entries; + + for (i = 0; i < masks_entries; i++) { + int index = masks_and_count[i].index; + + if (ovsl_dereference(ma->masks[index])) + new->masks[new->count++] = ma->masks[index]; + } + + rcu_assign_pointer(table->mask_array, new); + call_rcu(&ma->rcu, mask_array_rcu_cb); + +free_mask_entries: + kfree(masks_and_count); +} + /* Initializes the flow module. * Returns zero if successful or a negative error code. */ int ovs_flow_init(void) diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 8a5cea6ae111..74ce48fecba9 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -27,9 +27,23 @@ struct mask_cache_entry { u32 mask_index; }; +struct mask_cache { + struct rcu_head rcu; + u32 cache_size; /* Must be ^2 value. */ + struct mask_cache_entry __percpu *mask_cache; +}; + +struct mask_count { + int index; + u64 counter; +}; + struct mask_array { struct rcu_head rcu; int count, max; + u64 __percpu *masks_usage_cntr; + u64 *masks_usage_zero_cntr; + struct u64_stats_sync syncp; struct sw_flow_mask __rcu *masks[]; }; @@ -45,7 +59,7 @@ struct table_instance { struct flow_table { struct table_instance __rcu *ti; struct table_instance __rcu *ufid_ti; - struct mask_cache_entry __percpu *mask_cache; + struct mask_cache __rcu *mask_cache; struct mask_array __rcu *mask_array; unsigned long last_rehash; unsigned int count; @@ -69,12 +83,15 @@ int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, const struct sw_flow_mask *mask); void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow); int ovs_flow_tbl_num_masks(const struct flow_table *table); +u32 ovs_flow_tbl_masks_cache_size(const struct flow_table *table); +int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size); struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table, u32 *bucket, u32 *idx); struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *, const struct sw_flow_key *, u32 skb_hash, - u32 *n_mask_hit); + u32 *n_mask_hit, + u32 *n_cache_hit); struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, const struct sw_flow_key *); struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, @@ -86,4 +103,7 @@ bool ovs_flow_cmp(const struct sw_flow *, const struct sw_flow_match *); void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, bool full, const struct sw_flow_mask *mask); + +void ovs_flow_masks_rebalance(struct flow_table *table); + #endif /* flow_table.h */ diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 47febb4504f0..0d44c5c013fa 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -87,6 +87,7 @@ EXPORT_SYMBOL_GPL(ovs_vport_ops_unregister); /** * ovs_vport_locate - find a port that has already been created * + * @net: network namespace * @name: name of port to find * * Must be called with ovs or RCU read lock. @@ -418,7 +419,7 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) * * @vport: vport that received the packet * @skb: skb that was received - * @tun_key: tunnel (if any) that carried packet + * @tun_info: tunnel (if any) that carried packet * * Must be called with rcu_read_lock. The packet cannot be shared and * skb->data should point to the Ethernet header. diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 29bd405adbbd..0b8160d1a6e0 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -593,6 +593,7 @@ static void init_prb_bdqc(struct packet_sock *po, req_u->req3.tp_block_size); p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov); p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; + rwlock_init(&p1->blk_fill_in_prog_lock); p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv); prb_init_ft_ops(p1, req_u); @@ -659,10 +660,9 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t) * */ if (BLOCK_NUM_PKTS(pbd)) { - while (atomic_read(&pkc->blk_fill_in_prog)) { - /* Waiting for skb_copy_bits to finish... */ - cpu_relax(); - } + /* Waiting for skb_copy_bits to finish... */ + write_lock(&pkc->blk_fill_in_prog_lock); + write_unlock(&pkc->blk_fill_in_prog_lock); } if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) { @@ -921,10 +921,9 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc, * the timer-handler already handled this case. */ if (!(status & TP_STATUS_BLK_TMO)) { - while (atomic_read(&pkc->blk_fill_in_prog)) { - /* Waiting for skb_copy_bits to finish... */ - cpu_relax(); - } + /* Waiting for skb_copy_bits to finish... */ + write_lock(&pkc->blk_fill_in_prog_lock); + write_unlock(&pkc->blk_fill_in_prog_lock); } prb_close_block(pkc, pbd, po, status); return; @@ -944,7 +943,8 @@ static int prb_queue_frozen(struct tpacket_kbdq_core *pkc) static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); - atomic_dec(&pkc->blk_fill_in_prog); + + read_unlock(&pkc->blk_fill_in_prog_lock); } static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc, @@ -998,7 +998,7 @@ static void prb_fill_curr_block(char *curr, pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len); BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len); BLOCK_NUM_PKTS(pbd) += 1; - atomic_inc(&pkc->blk_fill_in_prog); + read_lock(&pkc->blk_fill_in_prog_lock); prb_run_all_ft_ops(pkc, ppd); } @@ -1536,7 +1536,7 @@ static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new) } } -static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data, +static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data, unsigned int len) { struct bpf_prog *new; @@ -1545,10 +1545,10 @@ static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data, if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) return -EPERM; - if (len != sizeof(fprog)) - return -EINVAL; - if (copy_from_user(&fprog, data, len)) - return -EFAULT; + + ret = copy_bpf_fprog_from_user(&fprog, data, len); + if (ret) + return ret; ret = bpf_prog_create_from_user(&new, &fprog, NULL, false); if (ret) @@ -1558,7 +1558,7 @@ static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data, return 0; } -static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data, +static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data, unsigned int len) { struct bpf_prog *new; @@ -1568,7 +1568,7 @@ static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data, return -EPERM; if (len != sizeof(fd)) return -EINVAL; - if (copy_from_user(&fd, data, len)) + if (copy_from_sockptr(&fd, data, len)) return -EFAULT; new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); @@ -1579,7 +1579,7 @@ static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data, return 0; } -static int fanout_set_data(struct packet_sock *po, char __user *data, +static int fanout_set_data(struct packet_sock *po, sockptr_t data, unsigned int len) { switch (po->fanout->type) { @@ -3652,7 +3652,8 @@ static void packet_flush_mclist(struct sock *sk) } static int -packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) +packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, + unsigned int optlen) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); @@ -3672,7 +3673,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv return -EINVAL; if (len > sizeof(mreq)) len = sizeof(mreq); - if (copy_from_user(&mreq, optval, len)) + if (copy_from_sockptr(&mreq, optval, len)) return -EFAULT; if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address))) return -EINVAL; @@ -3703,7 +3704,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen < len) { ret = -EINVAL; } else { - if (copy_from_user(&req_u.req, optval, len)) + if (copy_from_sockptr(&req_u.req, optval, len)) ret = -EFAULT; else ret = packet_set_ring(sk, &req_u, 0, @@ -3718,7 +3719,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; pkt_sk(sk)->copy_thresh = val; @@ -3730,7 +3731,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (val) { case TPACKET_V1: @@ -3756,7 +3757,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (val > INT_MAX) return -EINVAL; @@ -3776,7 +3777,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); @@ -3795,7 +3796,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen < sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); @@ -3809,7 +3810,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen < sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); @@ -3825,7 +3826,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv return -EINVAL; if (optlen < sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); @@ -3844,7 +3845,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; po->tp_tstamp = val; @@ -3856,7 +3857,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; return fanout_add(sk, val & 0xffff, val >> 16); @@ -3874,7 +3875,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (val < 0 || val > 1) return -EINVAL; @@ -3888,7 +3889,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); @@ -3907,7 +3908,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; po->xmit = val ? packet_direct_xmit : dev_queue_xmit; @@ -4040,28 +4041,6 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, return 0; } - -#ifdef CONFIG_COMPAT -static int compat_packet_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) -{ - struct packet_sock *po = pkt_sk(sock->sk); - - if (level != SOL_PACKET) - return -ENOPROTOOPT; - - if (optname == PACKET_FANOUT_DATA && - po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) { - optval = (char __user *)get_compat_bpf_fprog(optval); - if (!optval) - return -EFAULT; - optlen = sizeof(struct sock_fprog); - } - - return packet_setsockopt(sock, level, optname, optval, optlen); -} -#endif - static int packet_notifier(struct notifier_block *this, unsigned long msg, void *ptr) { @@ -4293,7 +4272,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, struct packet_ring_buffer *rb; struct sk_buff_head *rb_queue; __be16 num; - int err = -EINVAL; + int err; /* Added to avoid minimal code churn */ struct tpacket_req *req = &req_u->req; @@ -4525,8 +4504,6 @@ static const struct proto_ops packet_ops_spkt = { .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .sendmsg = packet_sendmsg_spkt, .recvmsg = packet_recvmsg, .mmap = sock_no_mmap, @@ -4549,9 +4526,6 @@ static const struct proto_ops packet_ops = { .shutdown = sock_no_shutdown, .setsockopt = packet_setsockopt, .getsockopt = packet_getsockopt, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_packet_setsockopt, -#endif .sendmsg = packet_sendmsg, .recvmsg = packet_recvmsg, .mmap = packet_mmap, diff --git a/net/packet/internal.h b/net/packet/internal.h index 907f4cd2a718..fd41ecb7f605 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -39,7 +39,7 @@ struct tpacket_kbdq_core { char *nxt_offset; struct sk_buff *skb; - atomic_t blk_fill_in_prog; + rwlock_t blk_fill_in_prog_lock; /* Default is set to 8ms */ #define DEFAULT_PRB_RETIRE_TOV (8) diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 4577e43cb777..e47d09aca4af 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -975,7 +975,7 @@ static int pep_init(struct sock *sk) } static int pep_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct pep_sock *pn = pep_sk(sk); int val = 0, err = 0; @@ -983,7 +983,7 @@ static int pep_setsockopt(struct sock *sk, int level, int optname, if (level != SOL_PNPIPE) return -ENOPROTOOPT; if (optlen >= sizeof(int)) { - if (get_user(val, (int __user *) optval)) + if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; } diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 76d499f6af9a..2599235d592e 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -439,12 +439,6 @@ const struct proto_ops phonet_dgram_ops = { .ioctl = pn_socket_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, -#ifdef CONFIG_COMPAT - .compat_setsockopt = sock_no_setsockopt, - .compat_getsockopt = sock_no_getsockopt, -#endif .sendmsg = pn_socket_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, @@ -466,10 +460,6 @@ const struct proto_ops phonet_stream_ops = { .shutdown = sock_no_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, -#endif .sendmsg = pn_socket_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 300a104b9a0f..b4c0db0b7d31 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -1209,8 +1209,6 @@ static const struct proto_ops qrtr_proto_ops = { .gettstamp = sock_gettstamp, .poll = datagram_poll, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .release = qrtr_release, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 1a5bf3fa4578..b239120dd9ca 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -290,8 +290,7 @@ static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return 0; } -static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, - int len) +static int rds_cancel_sent_to(struct rds_sock *rs, sockptr_t optval, int len) { struct sockaddr_in6 sin6; struct sockaddr_in sin; @@ -308,14 +307,15 @@ static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, goto out; } else if (len < sizeof(struct sockaddr_in6)) { /* Assume IPv4 */ - if (copy_from_user(&sin, optval, sizeof(struct sockaddr_in))) { + if (copy_from_sockptr(&sin, optval, + sizeof(struct sockaddr_in))) { ret = -EFAULT; goto out; } ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr); sin6.sin6_port = sin.sin_port; } else { - if (copy_from_user(&sin6, optval, + if (copy_from_sockptr(&sin6, optval, sizeof(struct sockaddr_in6))) { ret = -EFAULT; goto out; @@ -327,21 +327,20 @@ out: return ret; } -static int rds_set_bool_option(unsigned char *optvar, char __user *optval, +static int rds_set_bool_option(unsigned char *optvar, sockptr_t optval, int optlen) { int value; if (optlen < sizeof(int)) return -EINVAL; - if (get_user(value, (int __user *) optval)) + if (copy_from_sockptr(&value, optval, sizeof(int))) return -EFAULT; *optvar = !!value; return 0; } -static int rds_cong_monitor(struct rds_sock *rs, char __user *optval, - int optlen) +static int rds_cong_monitor(struct rds_sock *rs, sockptr_t optval, int optlen) { int ret; @@ -358,8 +357,7 @@ static int rds_cong_monitor(struct rds_sock *rs, char __user *optval, return ret; } -static int rds_set_transport(struct rds_sock *rs, char __user *optval, - int optlen) +static int rds_set_transport(struct rds_sock *rs, sockptr_t optval, int optlen) { int t_type; @@ -369,7 +367,7 @@ static int rds_set_transport(struct rds_sock *rs, char __user *optval, if (optlen != sizeof(int)) return -EINVAL; - if (copy_from_user(&t_type, (int __user *)optval, sizeof(t_type))) + if (copy_from_sockptr(&t_type, optval, sizeof(t_type))) return -EFAULT; if (t_type < 0 || t_type >= RDS_TRANS_COUNT) @@ -380,7 +378,7 @@ static int rds_set_transport(struct rds_sock *rs, char __user *optval, return rs->rs_transport ? 0 : -ENOPROTOOPT; } -static int rds_enable_recvtstamp(struct sock *sk, char __user *optval, +static int rds_enable_recvtstamp(struct sock *sk, sockptr_t optval, int optlen, int optname) { int val, valbool; @@ -388,7 +386,7 @@ static int rds_enable_recvtstamp(struct sock *sk, char __user *optval, if (optlen != sizeof(int)) return -EFAULT; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; valbool = val ? 1 : 0; @@ -404,7 +402,7 @@ static int rds_enable_recvtstamp(struct sock *sk, char __user *optval, return 0; } -static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval, +static int rds_recv_track_latency(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_rx_trace_so trace; @@ -413,7 +411,7 @@ static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval, if (optlen != sizeof(struct rds_rx_trace_so)) return -EFAULT; - if (copy_from_user(&trace, optval, sizeof(trace))) + if (copy_from_sockptr(&trace, optval, sizeof(trace))) return -EFAULT; if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX) @@ -432,7 +430,7 @@ static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval, } static int rds_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct rds_sock *rs = rds_sk_to_rs(sock->sk); int ret; diff --git a/net/rds/rdma.c b/net/rds/rdma.c index a7ae11846cd7..ccdd304eae0a 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -353,21 +353,20 @@ out: return ret; } -int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen) +int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_get_mr_args args; if (optlen != sizeof(struct rds_get_mr_args)) return -EINVAL; - if (copy_from_user(&args, (struct rds_get_mr_args __user *)optval, - sizeof(struct rds_get_mr_args))) + if (copy_from_sockptr(&args, optval, sizeof(struct rds_get_mr_args))) return -EFAULT; return __rds_rdma_map(rs, &args, NULL, NULL, NULL); } -int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen) +int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_get_mr_for_dest_args args; struct rds_get_mr_args new_args; @@ -375,7 +374,7 @@ int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen) if (optlen != sizeof(struct rds_get_mr_for_dest_args)) return -EINVAL; - if (copy_from_user(&args, (struct rds_get_mr_for_dest_args __user *)optval, + if (copy_from_sockptr(&args, optval, sizeof(struct rds_get_mr_for_dest_args))) return -EFAULT; @@ -394,7 +393,7 @@ int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen) /* * Free the MR indicated by the given R_Key */ -int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen) +int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_free_mr_args args; struct rds_mr *mr; @@ -403,8 +402,7 @@ int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen) if (optlen != sizeof(struct rds_free_mr_args)) return -EINVAL; - if (copy_from_user(&args, (struct rds_free_mr_args __user *)optval, - sizeof(struct rds_free_mr_args))) + if (copy_from_sockptr(&args, optval, sizeof(struct rds_free_mr_args))) return -EFAULT; /* Special case - a null cookie means flush all unused MRs */ diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h index bfafd4a6d827..ca4c3a667091 100644 --- a/net/rds/rdma_transport.h +++ b/net/rds/rdma_transport.h @@ -13,7 +13,7 @@ /* Below reject reason is for legacy interoperability issue with non-linux * RDS endpoints where older version incompatibility is conveyed via value 1. - * For future version(s), proper encoded reject reason should be be used. + * For future version(s), proper encoded reject reason should be used. */ #define RDS_RDMA_REJ_INCOMPAT 1 diff --git a/net/rds/rds.h b/net/rds/rds.h index 106e862996b9..d35d1fc39807 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -924,9 +924,9 @@ int rds_send_pong(struct rds_conn_path *cp, __be16 dport); /* rdma.c */ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); -int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen); -int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen); -int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen); +int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen); +int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int optlen); +int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen); void rds_rdma_drop_keys(struct rds_sock *rs); int rds_rdma_extra_size(struct rds_rdma_args *args, struct rds_iov_vector *iov); diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index ce85656ac9c1..cf7d974e0f61 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -365,7 +365,7 @@ void rose_destroy_socket(struct sock *sk) */ static int rose_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); @@ -377,7 +377,7 @@ static int rose_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(opt, (int __user *)optval)) + if (copy_from_sockptr(&opt, optval, sizeof(int))) return -EFAULT; switch (optname) { diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 394189b81849..e6725a6de015 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -267,7 +267,7 @@ static int rxrpc_listen(struct socket *sock, int backlog) * @gfp: The allocation constraints * @notify_rx: Where to send notifications instead of socket queue * @upgrade: Request service upgrade for call - * @intr: The call is interruptible + * @interruptibility: The call is interruptible, or can be canceled. * @debug_id: The debug ID for tracing to be assigned to the call * * Allow a kernel service to begin a call on the nominated socket. This just @@ -588,7 +588,7 @@ EXPORT_SYMBOL(rxrpc_sock_set_min_security_level); * set RxRPC socket options */ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); unsigned int min_sec_level; @@ -639,8 +639,8 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, ret = -EISCONN; if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; - ret = get_user(min_sec_level, - (unsigned int __user *) optval); + ret = copy_from_sockptr(&min_sec_level, optval, + sizeof(unsigned int)); if (ret < 0) goto error; ret = -EINVAL; @@ -658,7 +658,7 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, if (rx->sk.sk_state != RXRPC_SERVER_BOUND2) goto error; ret = -EFAULT; - if (copy_from_user(service_upgrade, optval, + if (copy_from_sockptr(service_upgrade, optval, sizeof(service_upgrade)) != 0) goto error; ret = -EINVAL; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 9a2139ebd67d..6d29a3603a3e 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -909,8 +909,8 @@ extern const struct rxrpc_security rxrpc_no_security; extern struct key_type key_type_rxrpc; extern struct key_type key_type_rxrpc_s; -int rxrpc_request_key(struct rxrpc_sock *, char __user *, int); -int rxrpc_server_keyring(struct rxrpc_sock *, char __user *, int); +int rxrpc_request_key(struct rxrpc_sock *, sockptr_t , int); +int rxrpc_server_keyring(struct rxrpc_sock *, sockptr_t, int); int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time64_t, u32); diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 0c98313dd7a8..94c3df392651 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -896,7 +896,7 @@ static void rxrpc_describe(const struct key *key, struct seq_file *m) /* * grab the security key for a socket */ -int rxrpc_request_key(struct rxrpc_sock *rx, char __user *optval, int optlen) +int rxrpc_request_key(struct rxrpc_sock *rx, sockptr_t optval, int optlen) { struct key *key; char *description; @@ -906,7 +906,7 @@ int rxrpc_request_key(struct rxrpc_sock *rx, char __user *optval, int optlen) if (optlen <= 0 || optlen > PAGE_SIZE - 1) return -EINVAL; - description = memdup_user_nul(optval, optlen); + description = memdup_sockptr_nul(optval, optlen); if (IS_ERR(description)) return PTR_ERR(description); @@ -926,8 +926,7 @@ int rxrpc_request_key(struct rxrpc_sock *rx, char __user *optval, int optlen) /* * grab the security keyring for a server socket */ -int rxrpc_server_keyring(struct rxrpc_sock *rx, char __user *optval, - int optlen) +int rxrpc_server_keyring(struct rxrpc_sock *rx, sockptr_t optval, int optlen) { struct key *key; char *description; @@ -937,7 +936,7 @@ int rxrpc_server_keyring(struct rxrpc_sock *rx, char __user *optval, if (optlen <= 0 || optlen > PAGE_SIZE - 1) return -EINVAL; - description = memdup_user_nul(optval, optlen); + description = memdup_sockptr_nul(optval, optlen); if (IS_ERR(description)) return PTR_ERR(description); diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 84badf00647e..a3b37d88800e 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -468,6 +468,9 @@ choice config DEFAULT_FQ_CODEL bool "Fair Queue Controlled Delay" if NET_SCH_FQ_CODEL + config DEFAULT_FQ_PIE + bool "Flow Queue Proportional Integral controller Enhanced" if NET_SCH_FQ_PIE + config DEFAULT_SFQ bool "Stochastic Fair Queue" if NET_SCH_SFQ @@ -480,6 +483,7 @@ config DEFAULT_NET_SCH default "pfifo_fast" if DEFAULT_PFIFO_FAST default "fq" if DEFAULT_FQ default "fq_codel" if DEFAULT_FQ_CODEL + default "fq_pie" if DEFAULT_FQ_PIE default "sfq" if DEFAULT_SFQ default "pfifo_fast" endif diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 8ac7eb0a8309..063d8aaf2900 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1059,14 +1059,13 @@ err: return err; } -void tcf_action_update_stats(struct tc_action *a, u64 bytes, u32 packets, - bool drop, bool hw) +void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets, + u64 drops, bool hw) { if (a->cpu_bstats) { _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); - if (drop) - this_cpu_ptr(a->cpu_qstats)->drops += packets; + this_cpu_ptr(a->cpu_qstats)->drops += drops; if (hw) _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), @@ -1075,8 +1074,7 @@ void tcf_action_update_stats(struct tc_action *a, u64 bytes, u32 packets, } _bstats_update(&a->tcfa_bstats, bytes, packets); - if (drop) - a->tcfa_qstats.drops += packets; + a->tcfa_qstats.drops += drops; if (hw) _bstats_update(&a->tcfa_bstats_hw, bytes, packets); } @@ -1475,7 +1473,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_ROOT_MAX + 1]; - u32 portid = skb ? NETLINK_CB(skb).portid : 0; + u32 portid = NETLINK_CB(skb).portid; int ret = 0, ovr = 0; if ((n->nlmsg_type != RTM_GETACTION) && diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index c60674cf25c4..f5826e457679 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -598,7 +598,8 @@ again: if (!tcf_csum_ipv6(skb, update_flags)) goto drop; break; - case cpu_to_be16(ETH_P_8021AD): /* fall through */ + case cpu_to_be16(ETH_P_8021AD): + fallthrough; case cpu_to_be16(ETH_P_8021Q): if (skb_vlan_tag_present(skb) && !orig_vlan_tag_present) { protocol = skb->protocol; diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 6ed1652d1e26..e6ad42b11835 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -706,8 +706,10 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb, if (err && err != -EINPROGRESS) goto out_free; - if (!err) + if (!err) { *defrag = true; + cb.mru = IPCB(skb)->frag_max_size; + } } else { /* NFPROTO_IPV6 */ #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; @@ -717,8 +719,10 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb, if (err && err != -EINPROGRESS) goto out_free; - if (!err) + if (!err) { *defrag = true; + cb.mru = IP6CB(skb)->frag_max_size; + } #else err = -EOPNOTSUPP; goto out_free; @@ -792,7 +796,7 @@ static int ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, } } /* Non-ICMP, fall thru to initialize if needed. */ - /* fall through */ + fallthrough; case IP_CT_NEW: /* Seen it before? This can happen for loopback, retrans, * or local packets. @@ -1464,12 +1468,12 @@ static int tcf_ct_search(struct net *net, struct tc_action **a, u32 index) return tcf_idr_search(tn, a, index); } -static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets, - u64 lastuse, bool hw) +static void tcf_stats_update(struct tc_action *a, u64 bytes, u64 packets, + u64 drops, u64 lastuse, bool hw) { struct tcf_ct *c = to_ct(a); - tcf_action_update_stats(a, bytes, packets, false, hw); + tcf_action_update_stats(a, bytes, packets, drops, hw); c->tcf_tm.lastuse = max_t(u64, c->tcf_tm.lastuse, lastuse); } diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 416065772719..410e3bbfb9ca 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -171,14 +171,15 @@ static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a, return action; } -static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets, - u64 lastuse, bool hw) +static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u64 packets, + u64 drops, u64 lastuse, bool hw) { struct tcf_gact *gact = to_gact(a); int action = READ_ONCE(gact->tcf_action); struct tcf_t *tm = &gact->tcf_tm; - tcf_action_update_stats(a, bytes, packets, action == TC_ACT_SHOT, hw); + tcf_action_update_stats(a, bytes, packets, + action == TC_ACT_SHOT ? packets : drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index 323ae7f6315d..1fb8d428d2c1 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -578,13 +578,13 @@ static int tcf_gate_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static void tcf_gate_stats_update(struct tc_action *a, u64 bytes, u32 packets, - u64 lastuse, bool hw) +static void tcf_gate_stats_update(struct tc_action *a, u64 bytes, u64 packets, + u64 drops, u64 lastuse, bool hw) { struct tcf_gate *gact = to_gate(a); struct tcf_t *tm = &gact->tcf_tm; - tcf_action_update_stats(a, bytes, packets, false, hw); + tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 83dd82fc9f40..b2705318993b 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -312,13 +312,13 @@ out: return retval; } -static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets, - u64 lastuse, bool hw) +static void tcf_stats_update(struct tc_action *a, u64 bytes, u64 packets, + u64 drops, u64 lastuse, bool hw) { struct tcf_mirred *m = to_mirred(a); struct tcf_t *tm = &m->tcf_tm; - tcf_action_update_stats(a, bytes, packets, false, hw); + tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index d41d6200d9de..c158bfed86d5 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -409,13 +409,13 @@ done: return p->tcf_action; } -static void tcf_pedit_stats_update(struct tc_action *a, u64 bytes, u32 packets, - u64 lastuse, bool hw) +static void tcf_pedit_stats_update(struct tc_action *a, u64 bytes, u64 packets, + u64 drops, u64 lastuse, bool hw) { struct tcf_pedit *d = to_pedit(a); struct tcf_t *tm = &d->tcf_tm; - tcf_action_update_stats(a, bytes, packets, false, hw); + tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } @@ -436,8 +436,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, return -ENOBUFS; spin_lock_bh(&p->tcf_lock); - memcpy(opt->keys, p->tcfp_keys, - p->tcfp_nkeys * sizeof(struct tc_pedit_key)); + memcpy(opt->keys, p->tcfp_keys, flex_array_size(opt, keys, p->tcfp_nkeys)); opt->index = p->tcf_index; opt->nkeys = p->tcfp_nkeys; opt->flags = p->tcfp_flags; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 8b7a0ac96c51..0b431d493768 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -288,13 +288,13 @@ static void tcf_police_cleanup(struct tc_action *a) } static void tcf_police_stats_update(struct tc_action *a, - u64 bytes, u32 packets, + u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_police *police = to_police(a); struct tcf_t *tm = &police->tcf_tm; - tcf_action_update_stats(a, bytes, packets, false, hw); + tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index b2b3faa57294..d0652386c6e2 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -74,12 +74,13 @@ err: } static void tcf_skbedit_stats_update(struct tc_action *a, u64 bytes, - u32 packets, u64 lastuse, bool hw) + u64 packets, u64 drops, + u64 lastuse, bool hw) { struct tcf_skbedit *d = to_skbedit(a); struct tcf_t *tm = &d->tcf_tm; - tcf_action_update_stats(a, bytes, packets, false, hw); + tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index c91d3958fcbb..a5ff9f68ab02 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -302,13 +302,13 @@ static int tcf_vlan_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static void tcf_vlan_stats_update(struct tc_action *a, u64 bytes, u32 packets, - u64 lastuse, bool hw) +static void tcf_vlan_stats_update(struct tc_action *a, u64 bytes, u64 packets, + u64 drops, u64 lastuse, bool hw) { struct tcf_vlan *v = to_vlan(a); struct tcf_t *tm = &v->tcf_tm; - tcf_action_update_stats(a, bytes, packets, false, hw); + tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 4619cb3cb0a8..41a55c6cbeb8 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -621,7 +621,7 @@ static int tcf_block_setup(struct tcf_block *block, struct flow_block_offload *bo); static void tcf_block_offload_init(struct flow_block_offload *bo, - struct net_device *dev, + struct net_device *dev, struct Qdisc *sch, enum flow_block_command command, enum flow_block_binder_type binder_type, struct flow_block *flow_block, @@ -633,6 +633,7 @@ static void tcf_block_offload_init(struct flow_block_offload *bo, bo->block = flow_block; bo->block_shared = shared; bo->extack = extack; + bo->sch = sch; INIT_LIST_HEAD(&bo->cb_list); } @@ -643,10 +644,11 @@ static void tc_block_indr_cleanup(struct flow_block_cb *block_cb) { struct tcf_block *block = block_cb->indr.data; struct net_device *dev = block_cb->indr.dev; + struct Qdisc *sch = block_cb->indr.sch; struct netlink_ext_ack extack = {}; struct flow_block_offload bo; - tcf_block_offload_init(&bo, dev, FLOW_BLOCK_UNBIND, + tcf_block_offload_init(&bo, dev, sch, FLOW_BLOCK_UNBIND, block_cb->indr.binder_type, &block->flow_block, tcf_block_shared(block), &extack); @@ -665,14 +667,14 @@ static bool tcf_block_offload_in_use(struct tcf_block *block) } static int tcf_block_offload_cmd(struct tcf_block *block, - struct net_device *dev, + struct net_device *dev, struct Qdisc *sch, struct tcf_block_ext_info *ei, enum flow_block_command command, struct netlink_ext_ack *extack) { struct flow_block_offload bo = {}; - tcf_block_offload_init(&bo, dev, command, ei->binder_type, + tcf_block_offload_init(&bo, dev, sch, command, ei->binder_type, &block->flow_block, tcf_block_shared(block), extack); @@ -689,7 +691,7 @@ static int tcf_block_offload_cmd(struct tcf_block *block, return tcf_block_setup(block, &bo); } - flow_indr_dev_setup_offload(dev, TC_SETUP_BLOCK, block, &bo, + flow_indr_dev_setup_offload(dev, sch, TC_SETUP_BLOCK, block, &bo, tc_block_indr_cleanup); tcf_block_setup(block, &bo); @@ -716,7 +718,7 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, goto err_unlock; } - err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_BIND, extack); + err = tcf_block_offload_cmd(block, dev, q, ei, FLOW_BLOCK_BIND, extack); if (err == -EOPNOTSUPP) goto no_offload_dev_inc; if (err) @@ -743,7 +745,7 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, int err; down_write(&block->cb_lock); - err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_UNBIND, NULL); + err = tcf_block_offload_cmd(block, dev, q, ei, FLOW_BLOCK_UNBIND, NULL); if (err == -EOPNOTSUPP) goto no_offload_dev_dec; up_write(&block->cb_lock); @@ -1627,6 +1629,7 @@ int tcf_classify_ingress(struct sk_buff *skb, if (WARN_ON_ONCE(!ext)) return TC_ACT_SHOT; ext->chain = last_executed_chain; + ext->mru = qdisc_skb_cb(skb)->mru; } return ret; @@ -3659,9 +3662,11 @@ int tc_setup_flow_action(struct flow_action *flow_action, tcf_sample_get_group(entry, act); } else if (is_tcf_police(act)) { entry->id = FLOW_ACTION_POLICE; - entry->police.burst = tcf_police_tcfp_burst(act); + entry->police.burst = tcf_police_burst(act); entry->police.rate_bytes_ps = tcf_police_rate_bytes_ps(act); + entry->police.mtu = tcf_police_tcfp_mtu(act); + entry->police.index = act->tcfa_index; } else if (is_tcf_ct(act)) { entry->id = FLOW_ACTION_CT; entry->ct.action = tcf_ct_action(act); @@ -3745,6 +3750,119 @@ unsigned int tcf_exts_num_actions(struct tcf_exts *exts) } EXPORT_SYMBOL(tcf_exts_num_actions); +#ifdef CONFIG_NET_CLS_ACT +static int tcf_qevent_parse_block_index(struct nlattr *block_index_attr, + u32 *p_block_index, + struct netlink_ext_ack *extack) +{ + *p_block_index = nla_get_u32(block_index_attr); + if (!*p_block_index) { + NL_SET_ERR_MSG(extack, "Block number may not be zero"); + return -EINVAL; + } + + return 0; +} + +int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch, + enum flow_block_binder_type binder_type, + struct nlattr *block_index_attr, + struct netlink_ext_ack *extack) +{ + u32 block_index; + int err; + + if (!block_index_attr) + return 0; + + err = tcf_qevent_parse_block_index(block_index_attr, &block_index, extack); + if (err) + return err; + + if (!block_index) + return 0; + + qe->info.binder_type = binder_type; + qe->info.chain_head_change = tcf_chain_head_change_dflt; + qe->info.chain_head_change_priv = &qe->filter_chain; + qe->info.block_index = block_index; + + return tcf_block_get_ext(&qe->block, sch, &qe->info, extack); +} +EXPORT_SYMBOL(tcf_qevent_init); + +void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch) +{ + if (qe->info.block_index) + tcf_block_put_ext(qe->block, sch, &qe->info); +} +EXPORT_SYMBOL(tcf_qevent_destroy); + +int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr, + struct netlink_ext_ack *extack) +{ + u32 block_index; + int err; + + if (!block_index_attr) + return 0; + + err = tcf_qevent_parse_block_index(block_index_attr, &block_index, extack); + if (err) + return err; + + /* Bounce newly-configured block or change in block. */ + if (block_index != qe->info.block_index) { + NL_SET_ERR_MSG(extack, "Change of blocks is not supported"); + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL(tcf_qevent_validate_change); + +struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb, + struct sk_buff **to_free, int *ret) +{ + struct tcf_result cl_res; + struct tcf_proto *fl; + + if (!qe->info.block_index) + return skb; + + fl = rcu_dereference_bh(qe->filter_chain); + + switch (tcf_classify(skb, fl, &cl_res, false)) { + case TC_ACT_SHOT: + qdisc_qstats_drop(sch); + __qdisc_drop(skb, to_free); + *ret = __NET_XMIT_BYPASS; + return NULL; + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + __qdisc_drop(skb, to_free); + *ret = __NET_XMIT_STOLEN; + return NULL; + case TC_ACT_REDIRECT: + skb_do_redirect(skb); + *ret = __NET_XMIT_STOLEN; + return NULL; + } + + return skb; +} +EXPORT_SYMBOL(tcf_qevent_handle); + +int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe) +{ + if (!qe->info.block_index) + return 0; + return nla_put_u32(skb, attr_name, qe->info.block_index); +} +EXPORT_SYMBOL(tcf_qevent_dump); +#endif + static __net_init int tcf_net_init(struct net *net) { struct tcf_net *tn = net_generic(net, tcf_net_id); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index e30bd969fc48..a4f7ef1de7e7 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -64,6 +64,7 @@ struct fl_flow_key { }; } tp_range; struct flow_dissector_key_ct ct; + struct flow_dissector_key_hash hash; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -318,6 +319,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, skb_flow_dissect_ct(skb, &mask->dissector, &skb_key, fl_ct_info_to_flower_map, ARRAY_SIZE(fl_ct_info_to_flower_map)); + skb_flow_dissect_hash(skb, &mask->dissector, &skb_key); skb_flow_dissect(skb, &mask->dissector, &skb_key, 0); f = fl_mask_lookup(mask, &skb_key); @@ -491,6 +493,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f, tcf_exts_stats_update(&f->exts, cls_flower.stats.bytes, cls_flower.stats.pkts, + cls_flower.stats.drops, cls_flower.stats.lastused, cls_flower.stats.used_hw_stats, cls_flower.stats.used_hw_stats_valid); @@ -694,6 +697,9 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_CT_LABELS_MASK] = { .type = NLA_BINARY, .len = 128 / BITS_PER_BYTE }, [TCA_FLOWER_FLAGS] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_HASH] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_HASH_MASK] = { .type = NLA_U32 }, + }; static const struct nla_policy @@ -1625,6 +1631,10 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_ip(tb, true, &key->enc_ip, &mask->enc_ip); + fl_set_key_val(tb, &key->hash.hash, TCA_FLOWER_KEY_HASH, + &mask->hash.hash, TCA_FLOWER_KEY_HASH_MASK, + sizeof(key->hash.hash)); + if (tb[TCA_FLOWER_KEY_ENC_OPTS]) { ret = fl_set_enc_opt(tb, key, mask, extack); if (ret) @@ -1739,6 +1749,8 @@ static void fl_init_dissector(struct flow_dissector *dissector, FLOW_DISSECTOR_KEY_ENC_OPTS, enc_opts); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_CT, ct); + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_HASH, hash); skb_flow_dissector_init(dissector, keys, cnt); } @@ -2959,6 +2971,11 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags)) goto nla_put_failure; + if (fl_dump_key_val(skb, &key->hash.hash, TCA_FLOWER_KEY_HASH, + &mask->hash.hash, TCA_FLOWER_KEY_HASH_MASK, + sizeof(key->hash.hash))) + goto nla_put_failure; + return 0; nla_put_failure: diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 8d39dbcf1746..cafb84480bab 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -338,7 +338,8 @@ static void mall_stats_hw_filter(struct tcf_proto *tp, tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false, true); tcf_exts_stats_update(&head->exts, cls_mall.stats.bytes, - cls_mall.stats.pkts, cls_mall.stats.lastused, + cls_mall.stats.pkts, cls_mall.stats.drops, + cls_mall.stats.lastused, cls_mall.stats.used_hw_stats, cls_mall.stats.used_hw_stats_valid); } diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 61e95029c18f..78bec347b8b6 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -533,7 +533,7 @@ tcindex_change(struct net *net, struct sk_buff *in_skb, pr_debug("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p," "p %p,r %p,*arg %p\n", - tp, handle, tca, arg, opt, p, r, arg ? *arg : NULL); + tp, handle, tca, arg, opt, p, r, *arg); if (!opt) return 0; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index e15ff335953d..7b69ab1993ba 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -796,9 +796,7 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, struct tc_u32_sel *s = &n->sel; struct tc_u_knode *new; - new = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), - GFP_KERNEL); - + new = kzalloc(struct_size(new, sel.keys, s->nkeys), GFP_KERNEL); if (!new) return NULL; @@ -854,9 +852,6 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, u32 htid, flags = 0; size_t sel_size; int err; -#ifdef CONFIG_CLS_U32_PERF - size_t size; -#endif if (!opt) { if (handle) { @@ -1024,15 +1019,15 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, goto erridr; } - n = kzalloc(offsetof(typeof(*n), sel) + sel_size, GFP_KERNEL); + n = kzalloc(struct_size(n, sel.keys, s->nkeys), GFP_KERNEL); if (n == NULL) { err = -ENOBUFS; goto erridr; } #ifdef CONFIG_CLS_U32_PERF - size = sizeof(struct tc_u32_pcnt) + s->nkeys * sizeof(u64); - n->pf = __alloc_percpu(size, __alignof__(struct tc_u32_pcnt)); + n->pf = __alloc_percpu(struct_size(n->pf, kcnts, s->nkeys), + __alignof__(struct tc_u32_pcnt)); if (!n->pf) { err = -ENOBUFS; goto errfree; @@ -1296,8 +1291,7 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh, int cpu; #endif - if (nla_put(skb, TCA_U32_SEL, - sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key), + if (nla_put(skb, TCA_U32_SEL, struct_size(&n->sel, keys, n->sel.nkeys), &n->sel)) goto nla_put_failure; @@ -1347,9 +1341,7 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh, goto nla_put_failure; } #ifdef CONFIG_CLS_U32_PERF - gpf = kzalloc(sizeof(struct tc_u32_pcnt) + - n->sel.nkeys * sizeof(u64), - GFP_KERNEL); + gpf = kzalloc(struct_size(gpf, kcnts, n->sel.nkeys), GFP_KERNEL); if (!gpf) goto nla_put_failure; @@ -1363,9 +1355,7 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh, gpf->kcnts[i] += pf->kcnts[i]; } - if (nla_put_64bit(skb, TCA_U32_PCNT, - sizeof(struct tc_u32_pcnt) + - n->sel.nkeys * sizeof(u64), + if (nla_put_64bit(skb, TCA_U32_PCNT, struct_size(gpf, kcnts, n->sel.nkeys), gpf, TCA_U32_PAD)) { kfree(gpf); goto nla_put_failure; diff --git a/net/sched/em_canid.c b/net/sched/em_canid.c index b9a94fdf9397..5ea84decec19 100644 --- a/net/sched/em_canid.c +++ b/net/sched/em_canid.c @@ -40,6 +40,7 @@ struct canid_match { /** * em_canid_get_id() - Extracts Can ID out of the sk_buff structure. + * @skb: buffer to extract Can ID from */ static canid_t em_canid_get_id(struct sk_buff *skb) { diff --git a/net/sched/ematch.c b/net/sched/ematch.c index dd3b8c11a2e0..f885bea5b452 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -389,7 +389,6 @@ EXPORT_SYMBOL(tcf_em_tree_validate); /** * tcf_em_tree_destroy - destroy an ematch tree * - * @tp: classifier kind handle * @tree: ematch tree to be deleted * * This functions destroys an ematch tree previously created by @@ -425,7 +424,7 @@ EXPORT_SYMBOL(tcf_em_tree_destroy); * tcf_em_tree_dump - dump ematch tree into a rtnl message * * @skb: skb holding the rtnl message - * @t: ematch tree to be dumped + * @tree: ematch tree to be dumped * @tlv: TLV type to be used to encapsulate the tree * * This function dumps a ematch tree into a rtnl message. It is valid to diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 9a3449b56bd6..2a76a2f5ed88 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -267,7 +267,8 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) root->handle == handle) return root; - hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) { + hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle, + lockdep_rtnl_is_held()) { if (q->handle == handle) return q; } @@ -1093,8 +1094,7 @@ skip: int err; /* Only support running class lockless if parent is lockless */ - if (new && (new->flags & TCQ_F_NOLOCK) && - parent && !(parent->flags & TCQ_F_NOLOCK)) + if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK)) qdisc_clear_nolock(new); if (!cops || !cops->graft) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index c7b587897585..0618b63f87c4 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -312,8 +312,8 @@ static const u8 precedence[] = { }; static const u8 diffserv8[] = { - 2, 5, 1, 2, 4, 2, 2, 2, - 0, 2, 1, 2, 1, 2, 1, 2, + 2, 0, 1, 2, 4, 2, 2, 2, + 1, 2, 1, 2, 1, 2, 1, 2, 5, 2, 4, 2, 4, 2, 4, 2, 3, 2, 3, 2, 3, 2, 3, 2, 6, 2, 3, 2, 3, 2, 3, 2, @@ -323,7 +323,7 @@ static const u8 diffserv8[] = { }; static const u8 diffserv4[] = { - 0, 2, 0, 0, 2, 0, 0, 0, + 0, 1, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, @@ -334,7 +334,7 @@ static const u8 diffserv4[] = { }; static const u8 diffserv3[] = { - 0, 0, 0, 0, 2, 0, 0, 0, + 0, 1, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index ce4519358106..53d45e029c36 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -250,7 +250,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) case TC_ACT_STOLEN: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ + fallthrough; case TC_ACT_SHOT: return NULL; case TC_ACT_RECLASSIFY: diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 07a2b0b35495..dde564670ad8 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -324,7 +324,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, case TC_ACT_STOLEN: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ + fallthrough; case TC_ACT_SHOT: return NULL; } diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index a87e9159338c..c1e84d1eeaba 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -397,7 +397,7 @@ static struct ets_class *ets_classify(struct sk_buff *skb, struct Qdisc *sch, case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ + fallthrough; case TC_ACT_SHOT: return NULL; } diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 985d5208f563..bbd5f8753600 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -99,7 +99,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ + fallthrough; case TC_ACT_SHOT: return 0; } diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index 4d307f17e084..4dda15588cf4 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -102,7 +102,7 @@ static unsigned int fq_pie_classify(struct sk_buff *skb, struct Qdisc *sch, case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ + fallthrough; case TC_ACT_SHOT: return 0; } diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 92ad4115e473..d1902fca9844 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1136,7 +1136,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) case TC_ACT_STOLEN: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ + fallthrough; case TC_ACT_SHOT: return NULL; } diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 6feab225b4ba..cd70dbcbd72f 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -239,7 +239,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, case TC_ACT_STOLEN: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ + fallthrough; case TC_ACT_SHOT: return NULL; } diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 1330ad224931..5c27b4270b90 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -43,7 +43,7 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ + fallthrough; case TC_ACT_SHOT: return NULL; } diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 647941702f9f..3eabb871a1d5 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -46,7 +46,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ + fallthrough; case TC_ACT_SHOT: return NULL; } diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 0b05ac7c848e..6335230a971e 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -699,7 +699,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, case TC_ACT_STOLEN: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ + fallthrough; case TC_ACT_SHOT: return NULL; } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 555a1b9e467f..deac82f3ad7b 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -46,6 +46,8 @@ struct red_sched_data { struct red_vars vars; struct red_stats stats; struct Qdisc *qdisc; + struct tcf_qevent qe_early_drop; + struct tcf_qevent qe_mark; }; #define TC_RED_SUPPORTED_FLAGS (TC_RED_HISTORIC_FLAGS | TC_RED_NODROP) @@ -92,6 +94,9 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (INET_ECN_set_ce(skb)) { q->stats.prob_mark++; + skb = tcf_qevent_handle(&q->qe_mark, sch, skb, to_free, &ret); + if (!skb) + return NET_XMIT_CN | ret; } else if (!red_use_nodrop(q)) { q->stats.prob_drop++; goto congestion_drop; @@ -109,6 +114,9 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (INET_ECN_set_ce(skb)) { q->stats.forced_mark++; + skb = tcf_qevent_handle(&q->qe_mark, sch, skb, to_free, &ret); + if (!skb) + return NET_XMIT_CN | ret; } else if (!red_use_nodrop(q)) { q->stats.forced_drop++; goto congestion_drop; @@ -129,6 +137,10 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, return ret; congestion_drop: + skb = tcf_qevent_handle(&q->qe_early_drop, sch, skb, to_free, &ret); + if (!skb) + return NET_XMIT_CN | ret; + qdisc_drop(skb, sch, to_free); return NET_XMIT_CN; } @@ -202,6 +214,8 @@ static void red_destroy(struct Qdisc *sch) { struct red_sched_data *q = qdisc_priv(sch); + tcf_qevent_destroy(&q->qe_mark, sch); + tcf_qevent_destroy(&q->qe_early_drop, sch); del_timer_sync(&q->adapt_timer); red_offload(sch, false); qdisc_put(q->qdisc); @@ -213,14 +227,15 @@ static const struct nla_policy red_policy[TCA_RED_MAX + 1] = { [TCA_RED_STAB] = { .len = RED_STAB_SIZE }, [TCA_RED_MAX_P] = { .type = NLA_U32 }, [TCA_RED_FLAGS] = NLA_POLICY_BITFIELD32(TC_RED_SUPPORTED_FLAGS), + [TCA_RED_EARLY_DROP_BLOCK] = { .type = NLA_U32 }, + [TCA_RED_MARK_BLOCK] = { .type = NLA_U32 }, }; -static int red_change(struct Qdisc *sch, struct nlattr *opt, - struct netlink_ext_ack *extack) +static int __red_change(struct Qdisc *sch, struct nlattr **tb, + struct netlink_ext_ack *extack) { struct Qdisc *old_child = NULL, *child = NULL; struct red_sched_data *q = qdisc_priv(sch); - struct nlattr *tb[TCA_RED_MAX + 1]; struct nla_bitfield32 flags_bf; struct tc_red_qopt *ctl; unsigned char userbits; @@ -228,14 +243,6 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, int err; u32 max_P; - if (opt == NULL) - return -EINVAL; - - err = nla_parse_nested_deprecated(tb, TCA_RED_MAX, opt, red_policy, - NULL); - if (err < 0) - return err; - if (tb[TCA_RED_PARMS] == NULL || tb[TCA_RED_STAB] == NULL) return -EINVAL; @@ -323,11 +330,74 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct red_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_RED_MAX + 1]; + int err; q->qdisc = &noop_qdisc; q->sch = sch; timer_setup(&q->adapt_timer, red_adaptative_timer, 0); - return red_change(sch, opt, extack); + + if (!opt) + return -EINVAL; + + err = nla_parse_nested_deprecated(tb, TCA_RED_MAX, opt, red_policy, + extack); + if (err < 0) + return err; + + err = __red_change(sch, tb, extack); + if (err) + return err; + + err = tcf_qevent_init(&q->qe_early_drop, sch, + FLOW_BLOCK_BINDER_TYPE_RED_EARLY_DROP, + tb[TCA_RED_EARLY_DROP_BLOCK], extack); + if (err) + goto err_early_drop_init; + + err = tcf_qevent_init(&q->qe_mark, sch, + FLOW_BLOCK_BINDER_TYPE_RED_MARK, + tb[TCA_RED_MARK_BLOCK], extack); + if (err) + goto err_mark_init; + + return 0; + +err_mark_init: + tcf_qevent_destroy(&q->qe_early_drop, sch); +err_early_drop_init: + del_timer_sync(&q->adapt_timer); + red_offload(sch, false); + qdisc_put(q->qdisc); + return err; +} + +static int red_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct red_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_RED_MAX + 1]; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested_deprecated(tb, TCA_RED_MAX, opt, red_policy, + extack); + if (err < 0) + return err; + + err = tcf_qevent_validate_change(&q->qe_early_drop, + tb[TCA_RED_EARLY_DROP_BLOCK], extack); + if (err) + return err; + + err = tcf_qevent_validate_change(&q->qe_mark, + tb[TCA_RED_MARK_BLOCK], extack); + if (err) + return err; + + return __red_change(sch, tb, extack); } static int red_dump_offload_stats(struct Qdisc *sch) @@ -371,7 +441,9 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) || nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P) || nla_put_bitfield32(skb, TCA_RED_FLAGS, - q->flags, TC_RED_SUPPORTED_FLAGS)) + q->flags, TC_RED_SUPPORTED_FLAGS) || + tcf_qevent_dump(skb, TCA_RED_MARK_BLOCK, &q->qe_mark) || + tcf_qevent_dump(skb, TCA_RED_EARLY_DROP_BLOCK, &q->qe_early_drop)) goto nla_put_failure; return nla_nest_end(skb, opts); diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 4074c50ac3d7..da047a37a3bf 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -265,7 +265,7 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl, case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ + fallthrough; case TC_ACT_SHOT: return false; } diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 15f400dcb400..bca2be57d9fc 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -186,7 +186,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ + fallthrough; case TC_ACT_SHOT: return 0; } diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index b1eb12d33b9a..e981992634dd 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1108,11 +1108,10 @@ static void setup_txtime(struct taprio_sched *q, static struct tc_taprio_qopt_offload *taprio_offload_alloc(int num_entries) { - size_t size = sizeof(struct tc_taprio_sched_entry) * num_entries + - sizeof(struct __tc_taprio_qopt_offload); struct __tc_taprio_qopt_offload *__offload; - __offload = kzalloc(size, GFP_KERNEL); + __offload = kzalloc(struct_size(__offload, offload.entries, num_entries), + GFP_KERNEL); if (!__offload) return NULL; diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index ccfa0ab3e7f4..aea2a982984d 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -1033,8 +1033,6 @@ static const struct proto_ops inet6_seqpacket_ops = { .mmap = sock_no_mmap, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, #endif }; @@ -1089,10 +1087,6 @@ static struct sctp_af sctp_af_inet6 = { .net_header_len = sizeof(struct ipv6hdr), .sockaddr_len = sizeof(struct sockaddr_in6), .ip_options_len = sctp_v6_ip_options_len, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_ipv6_setsockopt, - .compat_getsockopt = compat_ipv6_getsockopt, -#endif }; static struct sctp_pf sctp_pf_inet6 = { diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index cde29f3c7fb3..d19db22262fd 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1036,10 +1036,6 @@ static const struct proto_ops inet_seqpacket_ops = { .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, -#endif }; /* Registration with AF_INET family. */ @@ -1093,10 +1089,6 @@ static struct sctp_af sctp_af_inet = { .net_header_len = sizeof(struct iphdr), .sockaddr_len = sizeof(struct sockaddr_in), .ip_options_len = sctp_v4_ip_options_len, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_ip_setsockopt, - .compat_getsockopt = compat_ip_getsockopt, -#endif }; struct sctp_pf *sctp_get_pf_specific(sa_family_t family) @@ -1375,15 +1367,15 @@ static struct pernet_operations sctp_ctrlsock_ops = { /* Initialize the universe into something sensible. */ static __init int sctp_init(void) { - int i; - int status = -EINVAL; - unsigned long goal; - unsigned long limit; unsigned long nr_pages = totalram_pages(); + unsigned long limit; + unsigned long goal; + int max_entry_order; + int num_entries; int max_share; + int status; int order; - int num_entries; - int max_entry_order; + int i; sock_skb_cb_check_size(sizeof(struct sctp_ulpevent)); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d57e1a002ffc..ec1fba1fbe71 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -979,9 +979,8 @@ int sctp_asconf_mgmt(struct sctp_sock *sp, struct sctp_sockaddr_entry *addrw) * * Returns 0 if ok, <0 errno code on error. */ -static int sctp_setsockopt_bindx_kernel(struct sock *sk, - struct sockaddr *addrs, int addrs_size, - int op) +static int sctp_setsockopt_bindx(struct sock *sk, struct sockaddr *addrs, + int addrs_size, int op) { int err; int addrcnt = 0; @@ -991,7 +990,7 @@ static int sctp_setsockopt_bindx_kernel(struct sock *sk, struct sctp_af *af; pr_debug("%s: sk:%p addrs:%p addrs_size:%d opt:%d\n", - __func__, sk, addrs, addrs_size, op); + __func__, sk, addr_buf, addrs_size, op); if (unlikely(addrs_size <= 0)) return -EINVAL; @@ -1037,29 +1036,13 @@ static int sctp_setsockopt_bindx_kernel(struct sock *sk, } } -static int sctp_setsockopt_bindx(struct sock *sk, - struct sockaddr __user *addrs, - int addrs_size, int op) -{ - struct sockaddr *kaddrs; - int err; - - kaddrs = memdup_user(addrs, addrs_size); - if (IS_ERR(kaddrs)) - return PTR_ERR(kaddrs); - err = sctp_setsockopt_bindx_kernel(sk, kaddrs, addrs_size, op); - kfree(kaddrs); - return err; -} - static int sctp_bind_add(struct sock *sk, struct sockaddr *addrs, int addrlen) { int err; lock_sock(sk); - err = sctp_setsockopt_bindx_kernel(sk, addrs, addrlen, - SCTP_BINDX_ADD_ADDR); + err = sctp_setsockopt_bindx(sk, addrs, addrlen, SCTP_BINDX_ADD_ADDR); release_sock(sk); return err; } @@ -1303,36 +1286,29 @@ out_free: * it. * * sk The sk of the socket - * addrs The pointer to the addresses in user land + * addrs The pointer to the addresses * addrssize Size of the addrs buffer * * Returns >=0 if ok, <0 errno code on error. */ -static int __sctp_setsockopt_connectx(struct sock *sk, - struct sockaddr __user *addrs, - int addrs_size, - sctp_assoc_t *assoc_id) +static int __sctp_setsockopt_connectx(struct sock *sk, struct sockaddr *kaddrs, + int addrs_size, sctp_assoc_t *assoc_id) { - struct sockaddr *kaddrs; int err = 0, flags = 0; pr_debug("%s: sk:%p addrs:%p addrs_size:%d\n", - __func__, sk, addrs, addrs_size); + __func__, sk, kaddrs, addrs_size); /* make sure the 1st addr's sa_family is accessible later */ if (unlikely(addrs_size < sizeof(sa_family_t))) return -EINVAL; - kaddrs = memdup_user(addrs, addrs_size); - if (IS_ERR(kaddrs)) - return PTR_ERR(kaddrs); - /* Allow security module to validate connectx addresses. */ err = security_sctp_bind_connect(sk, SCTP_SOCKOPT_CONNECTX, (struct sockaddr *)kaddrs, addrs_size); if (err) - goto out_free; + return err; /* in-kernel sockets don't generally have a file allocated to them * if all they do is call sock_create_kern(). @@ -1340,12 +1316,7 @@ static int __sctp_setsockopt_connectx(struct sock *sk, if (sk->sk_socket->file) flags = sk->sk_socket->file->f_flags; - err = __sctp_connect(sk, kaddrs, addrs_size, flags, assoc_id); - -out_free: - kfree(kaddrs); - - return err; + return __sctp_connect(sk, kaddrs, addrs_size, flags, assoc_id); } /* @@ -1353,10 +1324,10 @@ out_free: * to the option that doesn't provide association id. */ static int sctp_setsockopt_connectx_old(struct sock *sk, - struct sockaddr __user *addrs, + struct sockaddr *kaddrs, int addrs_size) { - return __sctp_setsockopt_connectx(sk, addrs, addrs_size, NULL); + return __sctp_setsockopt_connectx(sk, kaddrs, addrs_size, NULL); } /* @@ -1366,13 +1337,13 @@ static int sctp_setsockopt_connectx_old(struct sock *sk, * always positive. */ static int sctp_setsockopt_connectx(struct sock *sk, - struct sockaddr __user *addrs, + struct sockaddr *kaddrs, int addrs_size) { sctp_assoc_t assoc_id = 0; int err = 0; - err = __sctp_setsockopt_connectx(sk, addrs, addrs_size, &assoc_id); + err = __sctp_setsockopt_connectx(sk, kaddrs, addrs_size, &assoc_id); if (err) return err; @@ -1402,6 +1373,7 @@ static int sctp_getsockopt_connectx3(struct sock *sk, int len, { struct sctp_getaddrs_old param; sctp_assoc_t assoc_id = 0; + struct sockaddr *kaddrs; int err = 0; #ifdef CONFIG_COMPAT @@ -1425,9 +1397,12 @@ static int sctp_getsockopt_connectx3(struct sock *sk, int len, return -EFAULT; } - err = __sctp_setsockopt_connectx(sk, (struct sockaddr __user *) - param.addrs, param.addr_num, - &assoc_id); + kaddrs = memdup_user(param.addrs, param.addr_num); + if (IS_ERR(kaddrs)) + return PTR_ERR(kaddrs); + + err = __sctp_setsockopt_connectx(sk, kaddrs, param.addr_num, &assoc_id); + kfree(kaddrs); if (err == 0 || err == -EINPROGRESS) { if (copy_to_user(optval, &assoc_id, sizeof(assoc_id))) return -EFAULT; @@ -2209,28 +2184,18 @@ out: * exceeds the current PMTU size, the message will NOT be sent and * instead a error will be indicated to the user. */ -static int sctp_setsockopt_disable_fragments(struct sock *sk, - char __user *optval, +static int sctp_setsockopt_disable_fragments(struct sock *sk, int *val, unsigned int optlen) { - int val; - if (optlen < sizeof(int)) return -EINVAL; - - if (get_user(val, (int __user *)optval)) - return -EFAULT; - - sctp_sk(sk)->disable_fragments = (val == 0) ? 0 : 1; - + sctp_sk(sk)->disable_fragments = (*val == 0) ? 0 : 1; return 0; } -static int sctp_setsockopt_events(struct sock *sk, char __user *optval, +static int sctp_setsockopt_events(struct sock *sk, __u8 *sn_type, unsigned int optlen) { - struct sctp_event_subscribe subscribe; - __u8 *sn_type = (__u8 *)&subscribe; struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; int i; @@ -2238,9 +2203,6 @@ static int sctp_setsockopt_events(struct sock *sk, char __user *optval, if (optlen > sizeof(struct sctp_event_subscribe)) return -EINVAL; - if (copy_from_user(&subscribe, optval, optlen)) - return -EFAULT; - for (i = 0; i < optlen; i++) sctp_ulpevent_type_set(&sp->subscribe, SCTP_SN_TYPE_BASE + i, sn_type[i]); @@ -2280,7 +2242,7 @@ static int sctp_setsockopt_events(struct sock *sk, char __user *optval, * integer defining the number of seconds of idle time before an * association is closed. */ -static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, +static int sctp_setsockopt_autoclose(struct sock *sk, u32 *optval, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); @@ -2291,9 +2253,8 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, return -EOPNOTSUPP; if (optlen != sizeof(int)) return -EINVAL; - if (copy_from_user(&sp->autoclose, optval, optlen)) - return -EFAULT; + sp->autoclose = *optval; if (sp->autoclose > net->sctp.max_autoclose) sp->autoclose = net->sctp.max_autoclose; @@ -2628,48 +2589,42 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, } static int sctp_setsockopt_peer_addr_params(struct sock *sk, - char __user *optval, + struct sctp_paddrparams *params, unsigned int optlen) { - struct sctp_paddrparams params; struct sctp_transport *trans = NULL; struct sctp_association *asoc = NULL; struct sctp_sock *sp = sctp_sk(sk); int error; int hb_change, pmtud_change, sackdelay_change; - if (optlen == sizeof(params)) { - if (copy_from_user(¶ms, optval, optlen)) - return -EFAULT; - } else if (optlen == ALIGN(offsetof(struct sctp_paddrparams, + if (optlen == ALIGN(offsetof(struct sctp_paddrparams, spp_ipv6_flowlabel), 4)) { - if (copy_from_user(¶ms, optval, optlen)) - return -EFAULT; - if (params.spp_flags & (SPP_DSCP | SPP_IPV6_FLOWLABEL)) + if (params->spp_flags & (SPP_DSCP | SPP_IPV6_FLOWLABEL)) return -EINVAL; - } else { + } else if (optlen != sizeof(*params)) { return -EINVAL; } /* Validate flags and value parameters. */ - hb_change = params.spp_flags & SPP_HB; - pmtud_change = params.spp_flags & SPP_PMTUD; - sackdelay_change = params.spp_flags & SPP_SACKDELAY; + hb_change = params->spp_flags & SPP_HB; + pmtud_change = params->spp_flags & SPP_PMTUD; + sackdelay_change = params->spp_flags & SPP_SACKDELAY; if (hb_change == SPP_HB || pmtud_change == SPP_PMTUD || sackdelay_change == SPP_SACKDELAY || - params.spp_sackdelay > 500 || - (params.spp_pathmtu && - params.spp_pathmtu < SCTP_DEFAULT_MINSEGMENT)) + params->spp_sackdelay > 500 || + (params->spp_pathmtu && + params->spp_pathmtu < SCTP_DEFAULT_MINSEGMENT)) return -EINVAL; /* If an address other than INADDR_ANY is specified, and * no transport is found, then the request is invalid. */ - if (!sctp_is_any(sk, (union sctp_addr *)¶ms.spp_address)) { - trans = sctp_addr_id2transport(sk, ¶ms.spp_address, - params.spp_assoc_id); + if (!sctp_is_any(sk, (union sctp_addr *)¶ms->spp_address)) { + trans = sctp_addr_id2transport(sk, ¶ms->spp_address, + params->spp_assoc_id); if (!trans) return -EINVAL; } @@ -2678,19 +2633,19 @@ static int sctp_setsockopt_peer_addr_params(struct sock *sk, * socket is a one to many style socket, and an association * was not found, then the id was invalid. */ - asoc = sctp_id2assoc(sk, params.spp_assoc_id); - if (!asoc && params.spp_assoc_id != SCTP_FUTURE_ASSOC && + asoc = sctp_id2assoc(sk, params->spp_assoc_id); + if (!asoc && params->spp_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; /* Heartbeat demand can only be sent on a transport or * association, but not a socket. */ - if (params.spp_flags & SPP_HB_DEMAND && !trans && !asoc) + if (params->spp_flags & SPP_HB_DEMAND && !trans && !asoc) return -EINVAL; /* Process parameters. */ - error = sctp_apply_peer_addr_params(¶ms, trans, asoc, sp, + error = sctp_apply_peer_addr_params(params, trans, asoc, sp, hb_change, pmtud_change, sackdelay_change); @@ -2703,7 +2658,7 @@ static int sctp_setsockopt_peer_addr_params(struct sock *sk, if (!trans && asoc) { list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) { - sctp_apply_peer_addr_params(¶ms, trans, asoc, sp, + sctp_apply_peer_addr_params(params, trans, asoc, sp, hb_change, pmtud_change, sackdelay_change); } @@ -2794,83 +2749,86 @@ static void sctp_apply_asoc_delayed_ack(struct sctp_sack_info *params, * timer to expire. The default value for this is 2, setting this * value to 1 will disable the delayed sack algorithm. */ - -static int sctp_setsockopt_delayed_ack(struct sock *sk, - char __user *optval, unsigned int optlen) +static int __sctp_setsockopt_delayed_ack(struct sock *sk, + struct sctp_sack_info *params) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; - struct sctp_sack_info params; - - if (optlen == sizeof(struct sctp_sack_info)) { - if (copy_from_user(¶ms, optval, optlen)) - return -EFAULT; - - if (params.sack_delay == 0 && params.sack_freq == 0) - return 0; - } else if (optlen == sizeof(struct sctp_assoc_value)) { - pr_warn_ratelimited(DEPRECATED - "%s (pid %d) " - "Use of struct sctp_assoc_value in delayed_ack socket option.\n" - "Use struct sctp_sack_info instead\n", - current->comm, task_pid_nr(current)); - if (copy_from_user(¶ms, optval, optlen)) - return -EFAULT; - - if (params.sack_delay == 0) - params.sack_freq = 1; - else - params.sack_freq = 0; - } else - return -EINVAL; /* Validate value parameter. */ - if (params.sack_delay > 500) + if (params->sack_delay > 500) return -EINVAL; /* Get association, if sack_assoc_id != SCTP_FUTURE_ASSOC and the * socket is a one to many style socket, and an association * was not found, then the id was invalid. */ - asoc = sctp_id2assoc(sk, params.sack_assoc_id); - if (!asoc && params.sack_assoc_id > SCTP_ALL_ASSOC && + asoc = sctp_id2assoc(sk, params->sack_assoc_id); + if (!asoc && params->sack_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { - sctp_apply_asoc_delayed_ack(¶ms, asoc); + sctp_apply_asoc_delayed_ack(params, asoc); return 0; } if (sctp_style(sk, TCP)) - params.sack_assoc_id = SCTP_FUTURE_ASSOC; + params->sack_assoc_id = SCTP_FUTURE_ASSOC; - if (params.sack_assoc_id == SCTP_FUTURE_ASSOC || - params.sack_assoc_id == SCTP_ALL_ASSOC) { - if (params.sack_delay) { - sp->sackdelay = params.sack_delay; + if (params->sack_assoc_id == SCTP_FUTURE_ASSOC || + params->sack_assoc_id == SCTP_ALL_ASSOC) { + if (params->sack_delay) { + sp->sackdelay = params->sack_delay; sp->param_flags = sctp_spp_sackdelay_enable(sp->param_flags); } - if (params.sack_freq == 1) { + if (params->sack_freq == 1) { sp->param_flags = sctp_spp_sackdelay_disable(sp->param_flags); - } else if (params.sack_freq > 1) { - sp->sackfreq = params.sack_freq; + } else if (params->sack_freq > 1) { + sp->sackfreq = params->sack_freq; sp->param_flags = sctp_spp_sackdelay_enable(sp->param_flags); } } - if (params.sack_assoc_id == SCTP_CURRENT_ASSOC || - params.sack_assoc_id == SCTP_ALL_ASSOC) + if (params->sack_assoc_id == SCTP_CURRENT_ASSOC || + params->sack_assoc_id == SCTP_ALL_ASSOC) list_for_each_entry(asoc, &sp->ep->asocs, asocs) - sctp_apply_asoc_delayed_ack(¶ms, asoc); + sctp_apply_asoc_delayed_ack(params, asoc); return 0; } +static int sctp_setsockopt_delayed_ack(struct sock *sk, + struct sctp_sack_info *params, + unsigned int optlen) +{ + if (optlen == sizeof(struct sctp_assoc_value)) { + struct sctp_assoc_value *v = (struct sctp_assoc_value *)params; + struct sctp_sack_info p; + + pr_warn_ratelimited(DEPRECATED + "%s (pid %d) " + "Use of struct sctp_assoc_value in delayed_ack socket option.\n" + "Use struct sctp_sack_info instead\n", + current->comm, task_pid_nr(current)); + + p.sack_assoc_id = v->assoc_id; + p.sack_delay = v->assoc_value; + p.sack_freq = v->assoc_value ? 0 : 1; + return __sctp_setsockopt_delayed_ack(sk, &p); + } + + if (optlen != sizeof(struct sctp_sack_info)) + return -EINVAL; + if (params->sack_delay == 0 && params->sack_freq == 0) + return 0; + return __sctp_setsockopt_delayed_ack(sk, params); +} + /* 7.1.3 Initialization Parameters (SCTP_INITMSG) * * Applications can specify protocol parameters for the default association @@ -2882,24 +2840,22 @@ static int sctp_setsockopt_delayed_ack(struct sock *sk, * by the change). With TCP-style sockets, this option is inherited by * sockets derived from a listener socket. */ -static int sctp_setsockopt_initmsg(struct sock *sk, char __user *optval, unsigned int optlen) +static int sctp_setsockopt_initmsg(struct sock *sk, struct sctp_initmsg *sinit, + unsigned int optlen) { - struct sctp_initmsg sinit; struct sctp_sock *sp = sctp_sk(sk); if (optlen != sizeof(struct sctp_initmsg)) return -EINVAL; - if (copy_from_user(&sinit, optval, optlen)) - return -EFAULT; - if (sinit.sinit_num_ostreams) - sp->initmsg.sinit_num_ostreams = sinit.sinit_num_ostreams; - if (sinit.sinit_max_instreams) - sp->initmsg.sinit_max_instreams = sinit.sinit_max_instreams; - if (sinit.sinit_max_attempts) - sp->initmsg.sinit_max_attempts = sinit.sinit_max_attempts; - if (sinit.sinit_max_init_timeo) - sp->initmsg.sinit_max_init_timeo = sinit.sinit_max_init_timeo; + if (sinit->sinit_num_ostreams) + sp->initmsg.sinit_num_ostreams = sinit->sinit_num_ostreams; + if (sinit->sinit_max_instreams) + sp->initmsg.sinit_max_instreams = sinit->sinit_max_instreams; + if (sinit->sinit_max_attempts) + sp->initmsg.sinit_max_attempts = sinit->sinit_max_attempts; + if (sinit->sinit_max_init_timeo) + sp->initmsg.sinit_max_init_timeo = sinit->sinit_max_init_timeo; return 0; } @@ -2919,57 +2875,54 @@ static int sctp_setsockopt_initmsg(struct sock *sk, char __user *optval, unsigne * to this call if the caller is using the UDP model. */ static int sctp_setsockopt_default_send_param(struct sock *sk, - char __user *optval, + struct sctp_sndrcvinfo *info, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; - struct sctp_sndrcvinfo info; - if (optlen != sizeof(info)) + if (optlen != sizeof(*info)) return -EINVAL; - if (copy_from_user(&info, optval, optlen)) - return -EFAULT; - if (info.sinfo_flags & + if (info->sinfo_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | SCTP_ABORT | SCTP_EOF)) return -EINVAL; - asoc = sctp_id2assoc(sk, info.sinfo_assoc_id); - if (!asoc && info.sinfo_assoc_id > SCTP_ALL_ASSOC && + asoc = sctp_id2assoc(sk, info->sinfo_assoc_id); + if (!asoc && info->sinfo_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { - asoc->default_stream = info.sinfo_stream; - asoc->default_flags = info.sinfo_flags; - asoc->default_ppid = info.sinfo_ppid; - asoc->default_context = info.sinfo_context; - asoc->default_timetolive = info.sinfo_timetolive; + asoc->default_stream = info->sinfo_stream; + asoc->default_flags = info->sinfo_flags; + asoc->default_ppid = info->sinfo_ppid; + asoc->default_context = info->sinfo_context; + asoc->default_timetolive = info->sinfo_timetolive; return 0; } if (sctp_style(sk, TCP)) - info.sinfo_assoc_id = SCTP_FUTURE_ASSOC; + info->sinfo_assoc_id = SCTP_FUTURE_ASSOC; - if (info.sinfo_assoc_id == SCTP_FUTURE_ASSOC || - info.sinfo_assoc_id == SCTP_ALL_ASSOC) { - sp->default_stream = info.sinfo_stream; - sp->default_flags = info.sinfo_flags; - sp->default_ppid = info.sinfo_ppid; - sp->default_context = info.sinfo_context; - sp->default_timetolive = info.sinfo_timetolive; + if (info->sinfo_assoc_id == SCTP_FUTURE_ASSOC || + info->sinfo_assoc_id == SCTP_ALL_ASSOC) { + sp->default_stream = info->sinfo_stream; + sp->default_flags = info->sinfo_flags; + sp->default_ppid = info->sinfo_ppid; + sp->default_context = info->sinfo_context; + sp->default_timetolive = info->sinfo_timetolive; } - if (info.sinfo_assoc_id == SCTP_CURRENT_ASSOC || - info.sinfo_assoc_id == SCTP_ALL_ASSOC) { + if (info->sinfo_assoc_id == SCTP_CURRENT_ASSOC || + info->sinfo_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &sp->ep->asocs, asocs) { - asoc->default_stream = info.sinfo_stream; - asoc->default_flags = info.sinfo_flags; - asoc->default_ppid = info.sinfo_ppid; - asoc->default_context = info.sinfo_context; - asoc->default_timetolive = info.sinfo_timetolive; + asoc->default_stream = info->sinfo_stream; + asoc->default_flags = info->sinfo_flags; + asoc->default_ppid = info->sinfo_ppid; + asoc->default_context = info->sinfo_context; + asoc->default_timetolive = info->sinfo_timetolive; } } @@ -2980,54 +2933,51 @@ static int sctp_setsockopt_default_send_param(struct sock *sk, * (SCTP_DEFAULT_SNDINFO) */ static int sctp_setsockopt_default_sndinfo(struct sock *sk, - char __user *optval, + struct sctp_sndinfo *info, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; - struct sctp_sndinfo info; - if (optlen != sizeof(info)) + if (optlen != sizeof(*info)) return -EINVAL; - if (copy_from_user(&info, optval, optlen)) - return -EFAULT; - if (info.snd_flags & + if (info->snd_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | SCTP_ABORT | SCTP_EOF)) return -EINVAL; - asoc = sctp_id2assoc(sk, info.snd_assoc_id); - if (!asoc && info.snd_assoc_id > SCTP_ALL_ASSOC && + asoc = sctp_id2assoc(sk, info->snd_assoc_id); + if (!asoc && info->snd_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { - asoc->default_stream = info.snd_sid; - asoc->default_flags = info.snd_flags; - asoc->default_ppid = info.snd_ppid; - asoc->default_context = info.snd_context; + asoc->default_stream = info->snd_sid; + asoc->default_flags = info->snd_flags; + asoc->default_ppid = info->snd_ppid; + asoc->default_context = info->snd_context; return 0; } if (sctp_style(sk, TCP)) - info.snd_assoc_id = SCTP_FUTURE_ASSOC; + info->snd_assoc_id = SCTP_FUTURE_ASSOC; - if (info.snd_assoc_id == SCTP_FUTURE_ASSOC || - info.snd_assoc_id == SCTP_ALL_ASSOC) { - sp->default_stream = info.snd_sid; - sp->default_flags = info.snd_flags; - sp->default_ppid = info.snd_ppid; - sp->default_context = info.snd_context; + if (info->snd_assoc_id == SCTP_FUTURE_ASSOC || + info->snd_assoc_id == SCTP_ALL_ASSOC) { + sp->default_stream = info->snd_sid; + sp->default_flags = info->snd_flags; + sp->default_ppid = info->snd_ppid; + sp->default_context = info->snd_context; } - if (info.snd_assoc_id == SCTP_CURRENT_ASSOC || - info.snd_assoc_id == SCTP_ALL_ASSOC) { + if (info->snd_assoc_id == SCTP_CURRENT_ASSOC || + info->snd_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &sp->ep->asocs, asocs) { - asoc->default_stream = info.snd_sid; - asoc->default_flags = info.snd_flags; - asoc->default_ppid = info.snd_ppid; - asoc->default_context = info.snd_context; + asoc->default_stream = info->snd_sid; + asoc->default_flags = info->snd_flags; + asoc->default_ppid = info->snd_ppid; + asoc->default_context = info->snd_context; } } @@ -3040,10 +2990,9 @@ static int sctp_setsockopt_default_sndinfo(struct sock *sk, * the association primary. The enclosed address must be one of the * association peer's addresses. */ -static int sctp_setsockopt_primary_addr(struct sock *sk, char __user *optval, +static int sctp_setsockopt_primary_addr(struct sock *sk, struct sctp_prim *prim, unsigned int optlen) { - struct sctp_prim prim; struct sctp_transport *trans; struct sctp_af *af; int err; @@ -3051,21 +3000,18 @@ static int sctp_setsockopt_primary_addr(struct sock *sk, char __user *optval, if (optlen != sizeof(struct sctp_prim)) return -EINVAL; - if (copy_from_user(&prim, optval, sizeof(struct sctp_prim))) - return -EFAULT; - /* Allow security module to validate address but need address len. */ - af = sctp_get_af_specific(prim.ssp_addr.ss_family); + af = sctp_get_af_specific(prim->ssp_addr.ss_family); if (!af) return -EINVAL; err = security_sctp_bind_connect(sk, SCTP_PRIMARY_ADDR, - (struct sockaddr *)&prim.ssp_addr, + (struct sockaddr *)&prim->ssp_addr, af->sockaddr_len); if (err) return err; - trans = sctp_addr_id2transport(sk, &prim.ssp_addr, prim.ssp_assoc_id); + trans = sctp_addr_id2transport(sk, &prim->ssp_addr, prim->ssp_assoc_id); if (!trans) return -EINVAL; @@ -3082,17 +3028,12 @@ static int sctp_setsockopt_primary_addr(struct sock *sk, char __user *optval, * introduced, at the cost of more packets in the network. Expects an * integer boolean flag. */ -static int sctp_setsockopt_nodelay(struct sock *sk, char __user *optval, +static int sctp_setsockopt_nodelay(struct sock *sk, int *val, unsigned int optlen) { - int val; - if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) - return -EFAULT; - - sctp_sk(sk)->nodelay = (val == 0) ? 0 : 1; + sctp_sk(sk)->nodelay = (*val == 0) ? 0 : 1; return 0; } @@ -3108,9 +3049,10 @@ static int sctp_setsockopt_nodelay(struct sock *sk, char __user *optval, * be changed. * */ -static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user *optval, unsigned int optlen) +static int sctp_setsockopt_rtoinfo(struct sock *sk, + struct sctp_rtoinfo *rtoinfo, + unsigned int optlen) { - struct sctp_rtoinfo rtoinfo; struct sctp_association *asoc; unsigned long rto_min, rto_max; struct sctp_sock *sp = sctp_sk(sk); @@ -3118,18 +3060,15 @@ static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user *optval, unsigne if (optlen != sizeof (struct sctp_rtoinfo)) return -EINVAL; - if (copy_from_user(&rtoinfo, optval, optlen)) - return -EFAULT; - - asoc = sctp_id2assoc(sk, rtoinfo.srto_assoc_id); + asoc = sctp_id2assoc(sk, rtoinfo->srto_assoc_id); /* Set the values to the specific association */ - if (!asoc && rtoinfo.srto_assoc_id != SCTP_FUTURE_ASSOC && + if (!asoc && rtoinfo->srto_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; - rto_max = rtoinfo.srto_max; - rto_min = rtoinfo.srto_min; + rto_max = rtoinfo->srto_max; + rto_min = rtoinfo->srto_min; if (rto_max) rto_max = asoc ? msecs_to_jiffies(rto_max) : rto_max; @@ -3145,17 +3084,17 @@ static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user *optval, unsigne return -EINVAL; if (asoc) { - if (rtoinfo.srto_initial != 0) + if (rtoinfo->srto_initial != 0) asoc->rto_initial = - msecs_to_jiffies(rtoinfo.srto_initial); + msecs_to_jiffies(rtoinfo->srto_initial); asoc->rto_max = rto_max; asoc->rto_min = rto_min; } else { /* If there is no association or the association-id = 0 * set the values to the endpoint. */ - if (rtoinfo.srto_initial != 0) - sp->rtoinfo.srto_initial = rtoinfo.srto_initial; + if (rtoinfo->srto_initial != 0) + sp->rtoinfo.srto_initial = rtoinfo->srto_initial; sp->rtoinfo.srto_max = rto_max; sp->rtoinfo.srto_min = rto_min; } @@ -3174,26 +3113,25 @@ static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user *optval, unsigne * See [SCTP] for more information. * */ -static int sctp_setsockopt_associnfo(struct sock *sk, char __user *optval, unsigned int optlen) +static int sctp_setsockopt_associnfo(struct sock *sk, + struct sctp_assocparams *assocparams, + unsigned int optlen) { - struct sctp_assocparams assocparams; struct sctp_association *asoc; if (optlen != sizeof(struct sctp_assocparams)) return -EINVAL; - if (copy_from_user(&assocparams, optval, optlen)) - return -EFAULT; - asoc = sctp_id2assoc(sk, assocparams.sasoc_assoc_id); + asoc = sctp_id2assoc(sk, assocparams->sasoc_assoc_id); - if (!asoc && assocparams.sasoc_assoc_id != SCTP_FUTURE_ASSOC && + if (!asoc && assocparams->sasoc_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; /* Set the values to the specific association */ if (asoc) { - if (assocparams.sasoc_asocmaxrxt != 0) { + if (assocparams->sasoc_asocmaxrxt != 0) { __u32 path_sum = 0; int paths = 0; struct sctp_transport *peer_addr; @@ -3210,24 +3148,25 @@ static int sctp_setsockopt_associnfo(struct sock *sk, char __user *optval, unsig * then one path. */ if (paths > 1 && - assocparams.sasoc_asocmaxrxt > path_sum) + assocparams->sasoc_asocmaxrxt > path_sum) return -EINVAL; - asoc->max_retrans = assocparams.sasoc_asocmaxrxt; + asoc->max_retrans = assocparams->sasoc_asocmaxrxt; } - if (assocparams.sasoc_cookie_life != 0) - asoc->cookie_life = ms_to_ktime(assocparams.sasoc_cookie_life); + if (assocparams->sasoc_cookie_life != 0) + asoc->cookie_life = + ms_to_ktime(assocparams->sasoc_cookie_life); } else { /* Set the values to the endpoint */ struct sctp_sock *sp = sctp_sk(sk); - if (assocparams.sasoc_asocmaxrxt != 0) + if (assocparams->sasoc_asocmaxrxt != 0) sp->assocparams.sasoc_asocmaxrxt = - assocparams.sasoc_asocmaxrxt; - if (assocparams.sasoc_cookie_life != 0) + assocparams->sasoc_asocmaxrxt; + if (assocparams->sasoc_cookie_life != 0) sp->assocparams.sasoc_cookie_life = - assocparams.sasoc_cookie_life; + assocparams->sasoc_cookie_life; } return 0; } @@ -3242,16 +3181,14 @@ static int sctp_setsockopt_associnfo(struct sock *sk, char __user *optval, unsig * addresses and a user will receive both PF_INET6 and PF_INET type * addresses on the socket. */ -static int sctp_setsockopt_mappedv4(struct sock *sk, char __user *optval, unsigned int optlen) +static int sctp_setsockopt_mappedv4(struct sock *sk, int *val, + unsigned int optlen) { - int val; struct sctp_sock *sp = sctp_sk(sk); if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) - return -EFAULT; - if (val) + if (*val) sp->v4mapped = 1; else sp->v4mapped = 0; @@ -3286,11 +3223,13 @@ static int sctp_setsockopt_mappedv4(struct sock *sk, char __user *optval, unsign * changed (effecting future associations only). * assoc_value: This parameter specifies the maximum size in bytes. */ -static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned int optlen) +static int sctp_setsockopt_maxseg(struct sock *sk, + struct sctp_assoc_value *params, + unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); - struct sctp_assoc_value params; struct sctp_association *asoc; + sctp_assoc_t assoc_id; int val; if (optlen == sizeof(int)) { @@ -3299,19 +3238,17 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned "Use of int in maxseg socket option.\n" "Use struct sctp_assoc_value instead\n", current->comm, task_pid_nr(current)); - if (copy_from_user(&val, optval, optlen)) - return -EFAULT; - params.assoc_id = SCTP_FUTURE_ASSOC; + assoc_id = SCTP_FUTURE_ASSOC; + val = *(int *)params; } else if (optlen == sizeof(struct sctp_assoc_value)) { - if (copy_from_user(¶ms, optval, optlen)) - return -EFAULT; - val = params.assoc_value; + assoc_id = params->assoc_id; + val = params->assoc_value; } else { return -EINVAL; } - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + asoc = sctp_id2assoc(sk, assoc_id); + if (!asoc && assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; @@ -3346,12 +3283,12 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned * locally bound addresses. The following structure is used to make a * set primary request: */ -static int sctp_setsockopt_peer_primary_addr(struct sock *sk, char __user *optval, +static int sctp_setsockopt_peer_primary_addr(struct sock *sk, + struct sctp_setpeerprim *prim, unsigned int optlen) { struct sctp_sock *sp; struct sctp_association *asoc = NULL; - struct sctp_setpeerprim prim; struct sctp_chunk *chunk; struct sctp_af *af; int err; @@ -3364,10 +3301,7 @@ static int sctp_setsockopt_peer_primary_addr(struct sock *sk, char __user *optva if (optlen != sizeof(struct sctp_setpeerprim)) return -EINVAL; - if (copy_from_user(&prim, optval, optlen)) - return -EFAULT; - - asoc = sctp_id2assoc(sk, prim.sspp_assoc_id); + asoc = sctp_id2assoc(sk, prim->sspp_assoc_id); if (!asoc) return -EINVAL; @@ -3380,26 +3314,26 @@ static int sctp_setsockopt_peer_primary_addr(struct sock *sk, char __user *optva if (!sctp_state(asoc, ESTABLISHED)) return -ENOTCONN; - af = sctp_get_af_specific(prim.sspp_addr.ss_family); + af = sctp_get_af_specific(prim->sspp_addr.ss_family); if (!af) return -EINVAL; - if (!af->addr_valid((union sctp_addr *)&prim.sspp_addr, sp, NULL)) + if (!af->addr_valid((union sctp_addr *)&prim->sspp_addr, sp, NULL)) return -EADDRNOTAVAIL; - if (!sctp_assoc_lookup_laddr(asoc, (union sctp_addr *)&prim.sspp_addr)) + if (!sctp_assoc_lookup_laddr(asoc, (union sctp_addr *)&prim->sspp_addr)) return -EADDRNOTAVAIL; /* Allow security module to validate address. */ err = security_sctp_bind_connect(sk, SCTP_SET_PEER_PRIMARY_ADDR, - (struct sockaddr *)&prim.sspp_addr, + (struct sockaddr *)&prim->sspp_addr, af->sockaddr_len); if (err) return err; /* Create an ASCONF chunk with SET_PRIMARY parameter */ chunk = sctp_make_asconf_set_prim(asoc, - (union sctp_addr *)&prim.sspp_addr); + (union sctp_addr *)&prim->sspp_addr); if (!chunk) return -ENOMEM; @@ -3410,17 +3344,14 @@ static int sctp_setsockopt_peer_primary_addr(struct sock *sk, char __user *optva return err; } -static int sctp_setsockopt_adaptation_layer(struct sock *sk, char __user *optval, +static int sctp_setsockopt_adaptation_layer(struct sock *sk, + struct sctp_setadaptation *adapt, unsigned int optlen) { - struct sctp_setadaptation adaptation; - if (optlen != sizeof(struct sctp_setadaptation)) return -EINVAL; - if (copy_from_user(&adaptation, optval, optlen)) - return -EFAULT; - sctp_sk(sk)->adaptation_ind = adaptation.ssb_adaptation_ind; + sctp_sk(sk)->adaptation_ind = adapt->ssb_adaptation_ind; return 0; } @@ -3439,40 +3370,38 @@ static int sctp_setsockopt_adaptation_layer(struct sock *sk, char __user *optval * received messages from the peer and does not effect the value that is * saved with outbound messages. */ -static int sctp_setsockopt_context(struct sock *sk, char __user *optval, +static int sctp_setsockopt_context(struct sock *sk, + struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); - struct sctp_assoc_value params; struct sctp_association *asoc; if (optlen != sizeof(struct sctp_assoc_value)) return -EINVAL; - if (copy_from_user(¶ms, optval, optlen)) - return -EFAULT; - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id > SCTP_ALL_ASSOC && + asoc = sctp_id2assoc(sk, params->assoc_id); + if (!asoc && params->assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { - asoc->default_rcv_context = params.assoc_value; + asoc->default_rcv_context = params->assoc_value; return 0; } if (sctp_style(sk, TCP)) - params.assoc_id = SCTP_FUTURE_ASSOC; + params->assoc_id = SCTP_FUTURE_ASSOC; - if (params.assoc_id == SCTP_FUTURE_ASSOC || - params.assoc_id == SCTP_ALL_ASSOC) - sp->default_rcv_context = params.assoc_value; + if (params->assoc_id == SCTP_FUTURE_ASSOC || + params->assoc_id == SCTP_ALL_ASSOC) + sp->default_rcv_context = params->assoc_value; - if (params.assoc_id == SCTP_CURRENT_ASSOC || - params.assoc_id == SCTP_ALL_ASSOC) + if (params->assoc_id == SCTP_CURRENT_ASSOC || + params->assoc_id == SCTP_ALL_ASSOC) list_for_each_entry(asoc, &sp->ep->asocs, asocs) - asoc->default_rcv_context = params.assoc_value; + asoc->default_rcv_context = params->assoc_value; return 0; } @@ -3501,18 +3430,13 @@ static int sctp_setsockopt_context(struct sock *sk, char __user *optval, * application using the one to many model may become confused and act * incorrectly. */ -static int sctp_setsockopt_fragment_interleave(struct sock *sk, - char __user *optval, +static int sctp_setsockopt_fragment_interleave(struct sock *sk, int *val, unsigned int optlen) { - int val; - if (optlen != sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) - return -EFAULT; - sctp_sk(sk)->frag_interleave = !!val; + sctp_sk(sk)->frag_interleave = !!*val; if (!sctp_sk(sk)->frag_interleave) sctp_sk(sk)->ep->intl_enable = 0; @@ -3537,24 +3461,19 @@ static int sctp_setsockopt_fragment_interleave(struct sock *sk, * call as long as the user provided buffer is large enough to hold the * message. */ -static int sctp_setsockopt_partial_delivery_point(struct sock *sk, - char __user *optval, +static int sctp_setsockopt_partial_delivery_point(struct sock *sk, u32 *val, unsigned int optlen) { - u32 val; - if (optlen != sizeof(u32)) return -EINVAL; - if (get_user(val, (int __user *)optval)) - return -EFAULT; /* Note: We double the receive buffer from what the user sets * it to be, also initial rwnd is based on rcvbuf/2. */ - if (val > (sk->sk_rcvbuf >> 1)) + if (*val > (sk->sk_rcvbuf >> 1)) return -EINVAL; - sctp_sk(sk)->pd_point = val; + sctp_sk(sk)->pd_point = *val; return 0; /* is this the right error code? */ } @@ -3571,12 +3490,13 @@ static int sctp_setsockopt_partial_delivery_point(struct sock *sk, * future associations inheriting the socket value. */ static int sctp_setsockopt_maxburst(struct sock *sk, - char __user *optval, + struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); - struct sctp_assoc_value params; struct sctp_association *asoc; + sctp_assoc_t assoc_id; + u32 assoc_value; if (optlen == sizeof(int)) { pr_warn_ratelimited(DEPRECATED @@ -3584,37 +3504,33 @@ static int sctp_setsockopt_maxburst(struct sock *sk, "Use of int in max_burst socket option deprecated.\n" "Use struct sctp_assoc_value instead\n", current->comm, task_pid_nr(current)); - if (copy_from_user(¶ms.assoc_value, optval, optlen)) - return -EFAULT; - params.assoc_id = SCTP_FUTURE_ASSOC; + assoc_id = SCTP_FUTURE_ASSOC; + assoc_value = *((int *)params); } else if (optlen == sizeof(struct sctp_assoc_value)) { - if (copy_from_user(¶ms, optval, optlen)) - return -EFAULT; + assoc_id = params->assoc_id; + assoc_value = params->assoc_value; } else return -EINVAL; - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id > SCTP_ALL_ASSOC && - sctp_style(sk, UDP)) + asoc = sctp_id2assoc(sk, assoc_id); + if (!asoc && assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { - asoc->max_burst = params.assoc_value; + asoc->max_burst = assoc_value; return 0; } if (sctp_style(sk, TCP)) - params.assoc_id = SCTP_FUTURE_ASSOC; + assoc_id = SCTP_FUTURE_ASSOC; - if (params.assoc_id == SCTP_FUTURE_ASSOC || - params.assoc_id == SCTP_ALL_ASSOC) - sp->max_burst = params.assoc_value; + if (assoc_id == SCTP_FUTURE_ASSOC || assoc_id == SCTP_ALL_ASSOC) + sp->max_burst = assoc_value; - if (params.assoc_id == SCTP_CURRENT_ASSOC || - params.assoc_id == SCTP_ALL_ASSOC) + if (assoc_id == SCTP_CURRENT_ASSOC || assoc_id == SCTP_ALL_ASSOC) list_for_each_entry(asoc, &sp->ep->asocs, asocs) - asoc->max_burst = params.assoc_value; + asoc->max_burst = assoc_value; return 0; } @@ -3627,21 +3543,18 @@ static int sctp_setsockopt_maxburst(struct sock *sk, * will only effect future associations on the socket. */ static int sctp_setsockopt_auth_chunk(struct sock *sk, - char __user *optval, + struct sctp_authchunk *val, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; - struct sctp_authchunk val; if (!ep->auth_enable) return -EACCES; if (optlen != sizeof(struct sctp_authchunk)) return -EINVAL; - if (copy_from_user(&val, optval, optlen)) - return -EFAULT; - switch (val.sauth_chunk) { + switch (val->sauth_chunk) { case SCTP_CID_INIT: case SCTP_CID_INIT_ACK: case SCTP_CID_SHUTDOWN_COMPLETE: @@ -3650,7 +3563,7 @@ static int sctp_setsockopt_auth_chunk(struct sock *sk, } /* add this chunk id to the endpoint */ - return sctp_auth_ep_add_chunkid(ep, val.sauth_chunk); + return sctp_auth_ep_add_chunkid(ep, val->sauth_chunk); } /* @@ -3660,13 +3573,11 @@ static int sctp_setsockopt_auth_chunk(struct sock *sk, * endpoint requires the peer to use. */ static int sctp_setsockopt_hmac_ident(struct sock *sk, - char __user *optval, + struct sctp_hmacalgo *hmacs, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; - struct sctp_hmacalgo *hmacs; u32 idents; - int err; if (!ep->auth_enable) return -EACCES; @@ -3676,21 +3587,12 @@ static int sctp_setsockopt_hmac_ident(struct sock *sk, optlen = min_t(unsigned int, optlen, sizeof(struct sctp_hmacalgo) + SCTP_AUTH_NUM_HMACS * sizeof(u16)); - hmacs = memdup_user(optval, optlen); - if (IS_ERR(hmacs)) - return PTR_ERR(hmacs); - idents = hmacs->shmac_num_idents; if (idents == 0 || idents > SCTP_AUTH_NUM_HMACS || - (idents * sizeof(u16)) > (optlen - sizeof(struct sctp_hmacalgo))) { - err = -EINVAL; - goto out; - } + (idents * sizeof(u16)) > (optlen - sizeof(struct sctp_hmacalgo))) + return -EINVAL; - err = sctp_auth_ep_set_hmacs(ep, hmacs); -out: - kfree(hmacs); - return err; + return sctp_auth_ep_set_hmacs(ep, hmacs); } /* @@ -3700,11 +3602,10 @@ out: * association shared key. */ static int sctp_setsockopt_auth_key(struct sock *sk, - char __user *optval, + struct sctp_authkey *authkey, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; - struct sctp_authkey *authkey; struct sctp_association *asoc; int ret = -EINVAL; @@ -3715,10 +3616,6 @@ static int sctp_setsockopt_auth_key(struct sock *sk, */ optlen = min_t(unsigned int, optlen, USHRT_MAX + sizeof(*authkey)); - authkey = memdup_user(optval, optlen); - if (IS_ERR(authkey)) - return PTR_ERR(authkey); - if (authkey->sca_keylength > optlen - sizeof(*authkey)) goto out; @@ -3755,7 +3652,7 @@ static int sctp_setsockopt_auth_key(struct sock *sk, } out: - kzfree(authkey); + memzero_explicit(authkey, optlen); return ret; } @@ -3766,42 +3663,39 @@ out: * the association shared key. */ static int sctp_setsockopt_active_key(struct sock *sk, - char __user *optval, + struct sctp_authkeyid *val, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_association *asoc; - struct sctp_authkeyid val; int ret = 0; if (optlen != sizeof(struct sctp_authkeyid)) return -EINVAL; - if (copy_from_user(&val, optval, optlen)) - return -EFAULT; - asoc = sctp_id2assoc(sk, val.scact_assoc_id); - if (!asoc && val.scact_assoc_id > SCTP_ALL_ASSOC && + asoc = sctp_id2assoc(sk, val->scact_assoc_id); + if (!asoc && val->scact_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) - return sctp_auth_set_active_key(ep, asoc, val.scact_keynumber); + return sctp_auth_set_active_key(ep, asoc, val->scact_keynumber); if (sctp_style(sk, TCP)) - val.scact_assoc_id = SCTP_FUTURE_ASSOC; + val->scact_assoc_id = SCTP_FUTURE_ASSOC; - if (val.scact_assoc_id == SCTP_FUTURE_ASSOC || - val.scact_assoc_id == SCTP_ALL_ASSOC) { - ret = sctp_auth_set_active_key(ep, asoc, val.scact_keynumber); + if (val->scact_assoc_id == SCTP_FUTURE_ASSOC || + val->scact_assoc_id == SCTP_ALL_ASSOC) { + ret = sctp_auth_set_active_key(ep, asoc, val->scact_keynumber); if (ret) return ret; } - if (val.scact_assoc_id == SCTP_CURRENT_ASSOC || - val.scact_assoc_id == SCTP_ALL_ASSOC) { + if (val->scact_assoc_id == SCTP_CURRENT_ASSOC || + val->scact_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &ep->asocs, asocs) { int res = sctp_auth_set_active_key(ep, asoc, - val.scact_keynumber); + val->scact_keynumber); if (res && !ret) ret = res; @@ -3817,42 +3711,39 @@ static int sctp_setsockopt_active_key(struct sock *sk, * This set option will delete a shared secret key from use. */ static int sctp_setsockopt_del_key(struct sock *sk, - char __user *optval, + struct sctp_authkeyid *val, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_association *asoc; - struct sctp_authkeyid val; int ret = 0; if (optlen != sizeof(struct sctp_authkeyid)) return -EINVAL; - if (copy_from_user(&val, optval, optlen)) - return -EFAULT; - asoc = sctp_id2assoc(sk, val.scact_assoc_id); - if (!asoc && val.scact_assoc_id > SCTP_ALL_ASSOC && + asoc = sctp_id2assoc(sk, val->scact_assoc_id); + if (!asoc && val->scact_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) - return sctp_auth_del_key_id(ep, asoc, val.scact_keynumber); + return sctp_auth_del_key_id(ep, asoc, val->scact_keynumber); if (sctp_style(sk, TCP)) - val.scact_assoc_id = SCTP_FUTURE_ASSOC; + val->scact_assoc_id = SCTP_FUTURE_ASSOC; - if (val.scact_assoc_id == SCTP_FUTURE_ASSOC || - val.scact_assoc_id == SCTP_ALL_ASSOC) { - ret = sctp_auth_del_key_id(ep, asoc, val.scact_keynumber); + if (val->scact_assoc_id == SCTP_FUTURE_ASSOC || + val->scact_assoc_id == SCTP_ALL_ASSOC) { + ret = sctp_auth_del_key_id(ep, asoc, val->scact_keynumber); if (ret) return ret; } - if (val.scact_assoc_id == SCTP_CURRENT_ASSOC || - val.scact_assoc_id == SCTP_ALL_ASSOC) { + if (val->scact_assoc_id == SCTP_CURRENT_ASSOC || + val->scact_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &ep->asocs, asocs) { int res = sctp_auth_del_key_id(ep, asoc, - val.scact_keynumber); + val->scact_keynumber); if (res && !ret) ret = res; @@ -3867,42 +3758,40 @@ static int sctp_setsockopt_del_key(struct sock *sk, * * This set option will deactivate a shared secret key. */ -static int sctp_setsockopt_deactivate_key(struct sock *sk, char __user *optval, +static int sctp_setsockopt_deactivate_key(struct sock *sk, + struct sctp_authkeyid *val, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_association *asoc; - struct sctp_authkeyid val; int ret = 0; if (optlen != sizeof(struct sctp_authkeyid)) return -EINVAL; - if (copy_from_user(&val, optval, optlen)) - return -EFAULT; - asoc = sctp_id2assoc(sk, val.scact_assoc_id); - if (!asoc && val.scact_assoc_id > SCTP_ALL_ASSOC && + asoc = sctp_id2assoc(sk, val->scact_assoc_id); + if (!asoc && val->scact_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) - return sctp_auth_deact_key_id(ep, asoc, val.scact_keynumber); + return sctp_auth_deact_key_id(ep, asoc, val->scact_keynumber); if (sctp_style(sk, TCP)) - val.scact_assoc_id = SCTP_FUTURE_ASSOC; + val->scact_assoc_id = SCTP_FUTURE_ASSOC; - if (val.scact_assoc_id == SCTP_FUTURE_ASSOC || - val.scact_assoc_id == SCTP_ALL_ASSOC) { - ret = sctp_auth_deact_key_id(ep, asoc, val.scact_keynumber); + if (val->scact_assoc_id == SCTP_FUTURE_ASSOC || + val->scact_assoc_id == SCTP_ALL_ASSOC) { + ret = sctp_auth_deact_key_id(ep, asoc, val->scact_keynumber); if (ret) return ret; } - if (val.scact_assoc_id == SCTP_CURRENT_ASSOC || - val.scact_assoc_id == SCTP_ALL_ASSOC) { + if (val->scact_assoc_id == SCTP_CURRENT_ASSOC || + val->scact_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &ep->asocs, asocs) { int res = sctp_auth_deact_key_id(ep, asoc, - val.scact_keynumber); + val->scact_keynumber); if (res && !ret) ret = res; @@ -3926,26 +3815,23 @@ static int sctp_setsockopt_deactivate_key(struct sock *sk, char __user *optval, * Note. In this implementation, socket operation overrides default parameter * being set by sysctl as well as FreeBSD implementation */ -static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval, +static int sctp_setsockopt_auto_asconf(struct sock *sk, int *val, unsigned int optlen) { - int val; struct sctp_sock *sp = sctp_sk(sk); if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) - return -EFAULT; - if (!sctp_is_ep_boundall(sk) && val) + if (!sctp_is_ep_boundall(sk) && *val) return -EINVAL; - if ((val && sp->do_auto_asconf) || (!val && !sp->do_auto_asconf)) + if ((*val && sp->do_auto_asconf) || (!*val && !sp->do_auto_asconf)) return 0; spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock); - if (val == 0 && sp->do_auto_asconf) { + if (*val == 0 && sp->do_auto_asconf) { list_del(&sp->auto_asconf_list); sp->do_auto_asconf = 0; - } else if (val && !sp->do_auto_asconf) { + } else if (*val && !sp->do_auto_asconf) { list_add_tail(&sp->auto_asconf_list, &sock_net(sk)->sctp.auto_asconf_splist); sp->do_auto_asconf = 1; @@ -3962,176 +3848,154 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval, * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt */ static int sctp_setsockopt_paddr_thresholds(struct sock *sk, - char __user *optval, + struct sctp_paddrthlds_v2 *val, unsigned int optlen, bool v2) { - struct sctp_paddrthlds_v2 val; struct sctp_transport *trans; struct sctp_association *asoc; int len; - len = v2 ? sizeof(val) : sizeof(struct sctp_paddrthlds); + len = v2 ? sizeof(*val) : sizeof(struct sctp_paddrthlds); if (optlen < len) return -EINVAL; - if (copy_from_user(&val, optval, len)) - return -EFAULT; - if (v2 && val.spt_pathpfthld > val.spt_pathcpthld) + if (v2 && val->spt_pathpfthld > val->spt_pathcpthld) return -EINVAL; - if (!sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) { - trans = sctp_addr_id2transport(sk, &val.spt_address, - val.spt_assoc_id); + if (!sctp_is_any(sk, (const union sctp_addr *)&val->spt_address)) { + trans = sctp_addr_id2transport(sk, &val->spt_address, + val->spt_assoc_id); if (!trans) return -ENOENT; - if (val.spt_pathmaxrxt) - trans->pathmaxrxt = val.spt_pathmaxrxt; + if (val->spt_pathmaxrxt) + trans->pathmaxrxt = val->spt_pathmaxrxt; if (v2) - trans->ps_retrans = val.spt_pathcpthld; - trans->pf_retrans = val.spt_pathpfthld; + trans->ps_retrans = val->spt_pathcpthld; + trans->pf_retrans = val->spt_pathpfthld; return 0; } - asoc = sctp_id2assoc(sk, val.spt_assoc_id); - if (!asoc && val.spt_assoc_id != SCTP_FUTURE_ASSOC && + asoc = sctp_id2assoc(sk, val->spt_assoc_id); + if (!asoc && val->spt_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) { - if (val.spt_pathmaxrxt) - trans->pathmaxrxt = val.spt_pathmaxrxt; + if (val->spt_pathmaxrxt) + trans->pathmaxrxt = val->spt_pathmaxrxt; if (v2) - trans->ps_retrans = val.spt_pathcpthld; - trans->pf_retrans = val.spt_pathpfthld; + trans->ps_retrans = val->spt_pathcpthld; + trans->pf_retrans = val->spt_pathpfthld; } - if (val.spt_pathmaxrxt) - asoc->pathmaxrxt = val.spt_pathmaxrxt; + if (val->spt_pathmaxrxt) + asoc->pathmaxrxt = val->spt_pathmaxrxt; if (v2) - asoc->ps_retrans = val.spt_pathcpthld; - asoc->pf_retrans = val.spt_pathpfthld; + asoc->ps_retrans = val->spt_pathcpthld; + asoc->pf_retrans = val->spt_pathpfthld; } else { struct sctp_sock *sp = sctp_sk(sk); - if (val.spt_pathmaxrxt) - sp->pathmaxrxt = val.spt_pathmaxrxt; + if (val->spt_pathmaxrxt) + sp->pathmaxrxt = val->spt_pathmaxrxt; if (v2) - sp->ps_retrans = val.spt_pathcpthld; - sp->pf_retrans = val.spt_pathpfthld; + sp->ps_retrans = val->spt_pathcpthld; + sp->pf_retrans = val->spt_pathpfthld; } return 0; } -static int sctp_setsockopt_recvrcvinfo(struct sock *sk, - char __user *optval, +static int sctp_setsockopt_recvrcvinfo(struct sock *sk, int *val, unsigned int optlen) { - int val; - if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *) optval)) - return -EFAULT; - sctp_sk(sk)->recvrcvinfo = (val == 0) ? 0 : 1; + sctp_sk(sk)->recvrcvinfo = (*val == 0) ? 0 : 1; return 0; } -static int sctp_setsockopt_recvnxtinfo(struct sock *sk, - char __user *optval, +static int sctp_setsockopt_recvnxtinfo(struct sock *sk, int *val, unsigned int optlen) { - int val; - if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *) optval)) - return -EFAULT; - sctp_sk(sk)->recvnxtinfo = (val == 0) ? 0 : 1; + sctp_sk(sk)->recvnxtinfo = (*val == 0) ? 0 : 1; return 0; } static int sctp_setsockopt_pr_supported(struct sock *sk, - char __user *optval, + struct sctp_assoc_value *params, unsigned int optlen) { - struct sctp_assoc_value params; struct sctp_association *asoc; - if (optlen != sizeof(params)) + if (optlen != sizeof(*params)) return -EINVAL; - if (copy_from_user(¶ms, optval, optlen)) - return -EFAULT; - - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + asoc = sctp_id2assoc(sk, params->assoc_id); + if (!asoc && params->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; - sctp_sk(sk)->ep->prsctp_enable = !!params.assoc_value; + sctp_sk(sk)->ep->prsctp_enable = !!params->assoc_value; return 0; } static int sctp_setsockopt_default_prinfo(struct sock *sk, - char __user *optval, + struct sctp_default_prinfo *info, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); - struct sctp_default_prinfo info; struct sctp_association *asoc; int retval = -EINVAL; - if (optlen != sizeof(info)) + if (optlen != sizeof(*info)) goto out; - if (copy_from_user(&info, optval, sizeof(info))) { - retval = -EFAULT; + if (info->pr_policy & ~SCTP_PR_SCTP_MASK) goto out; - } - if (info.pr_policy & ~SCTP_PR_SCTP_MASK) - goto out; + if (info->pr_policy == SCTP_PR_SCTP_NONE) + info->pr_value = 0; - if (info.pr_policy == SCTP_PR_SCTP_NONE) - info.pr_value = 0; - - asoc = sctp_id2assoc(sk, info.pr_assoc_id); - if (!asoc && info.pr_assoc_id > SCTP_ALL_ASSOC && + asoc = sctp_id2assoc(sk, info->pr_assoc_id); + if (!asoc && info->pr_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) goto out; retval = 0; if (asoc) { - SCTP_PR_SET_POLICY(asoc->default_flags, info.pr_policy); - asoc->default_timetolive = info.pr_value; + SCTP_PR_SET_POLICY(asoc->default_flags, info->pr_policy); + asoc->default_timetolive = info->pr_value; goto out; } if (sctp_style(sk, TCP)) - info.pr_assoc_id = SCTP_FUTURE_ASSOC; + info->pr_assoc_id = SCTP_FUTURE_ASSOC; - if (info.pr_assoc_id == SCTP_FUTURE_ASSOC || - info.pr_assoc_id == SCTP_ALL_ASSOC) { - SCTP_PR_SET_POLICY(sp->default_flags, info.pr_policy); - sp->default_timetolive = info.pr_value; + if (info->pr_assoc_id == SCTP_FUTURE_ASSOC || + info->pr_assoc_id == SCTP_ALL_ASSOC) { + SCTP_PR_SET_POLICY(sp->default_flags, info->pr_policy); + sp->default_timetolive = info->pr_value; } - if (info.pr_assoc_id == SCTP_CURRENT_ASSOC || - info.pr_assoc_id == SCTP_ALL_ASSOC) { + if (info->pr_assoc_id == SCTP_CURRENT_ASSOC || + info->pr_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &sp->ep->asocs, asocs) { - SCTP_PR_SET_POLICY(asoc->default_flags, info.pr_policy); - asoc->default_timetolive = info.pr_value; + SCTP_PR_SET_POLICY(asoc->default_flags, + info->pr_policy); + asoc->default_timetolive = info->pr_value; } } @@ -4140,27 +4004,21 @@ out: } static int sctp_setsockopt_reconfig_supported(struct sock *sk, - char __user *optval, + struct sctp_assoc_value *params, unsigned int optlen) { - struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EINVAL; - if (optlen != sizeof(params)) - goto out; - - if (copy_from_user(¶ms, optval, optlen)) { - retval = -EFAULT; + if (optlen != sizeof(*params)) goto out; - } - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + asoc = sctp_id2assoc(sk, params->assoc_id); + if (!asoc && params->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) goto out; - sctp_sk(sk)->ep->reconf_enable = !!params.assoc_value; + sctp_sk(sk)->ep->reconf_enable = !!params->assoc_value; retval = 0; @@ -4169,60 +4027,52 @@ out: } static int sctp_setsockopt_enable_strreset(struct sock *sk, - char __user *optval, + struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; - struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EINVAL; - if (optlen != sizeof(params)) + if (optlen != sizeof(*params)) goto out; - if (copy_from_user(¶ms, optval, optlen)) { - retval = -EFAULT; - goto out; - } - - if (params.assoc_value & (~SCTP_ENABLE_STRRESET_MASK)) + if (params->assoc_value & (~SCTP_ENABLE_STRRESET_MASK)) goto out; - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id > SCTP_ALL_ASSOC && + asoc = sctp_id2assoc(sk, params->assoc_id); + if (!asoc && params->assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) goto out; retval = 0; if (asoc) { - asoc->strreset_enable = params.assoc_value; + asoc->strreset_enable = params->assoc_value; goto out; } if (sctp_style(sk, TCP)) - params.assoc_id = SCTP_FUTURE_ASSOC; + params->assoc_id = SCTP_FUTURE_ASSOC; - if (params.assoc_id == SCTP_FUTURE_ASSOC || - params.assoc_id == SCTP_ALL_ASSOC) - ep->strreset_enable = params.assoc_value; + if (params->assoc_id == SCTP_FUTURE_ASSOC || + params->assoc_id == SCTP_ALL_ASSOC) + ep->strreset_enable = params->assoc_value; - if (params.assoc_id == SCTP_CURRENT_ASSOC || - params.assoc_id == SCTP_ALL_ASSOC) + if (params->assoc_id == SCTP_CURRENT_ASSOC || + params->assoc_id == SCTP_ALL_ASSOC) list_for_each_entry(asoc, &ep->asocs, asocs) - asoc->strreset_enable = params.assoc_value; + asoc->strreset_enable = params->assoc_value; out: return retval; } static int sctp_setsockopt_reset_streams(struct sock *sk, - char __user *optval, + struct sctp_reset_streams *params, unsigned int optlen) { - struct sctp_reset_streams *params; struct sctp_association *asoc; - int retval = -EINVAL; if (optlen < sizeof(*params)) return -EINVAL; @@ -4230,116 +4080,82 @@ static int sctp_setsockopt_reset_streams(struct sock *sk, optlen = min_t(unsigned int, optlen, USHRT_MAX + sizeof(__u16) * sizeof(*params)); - params = memdup_user(optval, optlen); - if (IS_ERR(params)) - return PTR_ERR(params); - if (params->srs_number_streams * sizeof(__u16) > optlen - sizeof(*params)) - goto out; + return -EINVAL; asoc = sctp_id2assoc(sk, params->srs_assoc_id); if (!asoc) - goto out; - - retval = sctp_send_reset_streams(asoc, params); + return -EINVAL; -out: - kfree(params); - return retval; + return sctp_send_reset_streams(asoc, params); } -static int sctp_setsockopt_reset_assoc(struct sock *sk, - char __user *optval, +static int sctp_setsockopt_reset_assoc(struct sock *sk, sctp_assoc_t *associd, unsigned int optlen) { struct sctp_association *asoc; - sctp_assoc_t associd; - int retval = -EINVAL; - if (optlen != sizeof(associd)) - goto out; - - if (copy_from_user(&associd, optval, optlen)) { - retval = -EFAULT; - goto out; - } + if (optlen != sizeof(*associd)) + return -EINVAL; - asoc = sctp_id2assoc(sk, associd); + asoc = sctp_id2assoc(sk, *associd); if (!asoc) - goto out; - - retval = sctp_send_reset_assoc(asoc); + return -EINVAL; -out: - return retval; + return sctp_send_reset_assoc(asoc); } static int sctp_setsockopt_add_streams(struct sock *sk, - char __user *optval, + struct sctp_add_streams *params, unsigned int optlen) { struct sctp_association *asoc; - struct sctp_add_streams params; - int retval = -EINVAL; - if (optlen != sizeof(params)) - goto out; - - if (copy_from_user(¶ms, optval, optlen)) { - retval = -EFAULT; - goto out; - } + if (optlen != sizeof(*params)) + return -EINVAL; - asoc = sctp_id2assoc(sk, params.sas_assoc_id); + asoc = sctp_id2assoc(sk, params->sas_assoc_id); if (!asoc) - goto out; - - retval = sctp_send_add_streams(asoc, ¶ms); + return -EINVAL; -out: - return retval; + return sctp_send_add_streams(asoc, params); } static int sctp_setsockopt_scheduler(struct sock *sk, - char __user *optval, + struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; - struct sctp_assoc_value params; int retval = 0; - if (optlen < sizeof(params)) + if (optlen < sizeof(*params)) return -EINVAL; - optlen = sizeof(params); - if (copy_from_user(¶ms, optval, optlen)) - return -EFAULT; - - if (params.assoc_value > SCTP_SS_MAX) + if (params->assoc_value > SCTP_SS_MAX) return -EINVAL; - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id > SCTP_ALL_ASSOC && + asoc = sctp_id2assoc(sk, params->assoc_id); + if (!asoc && params->assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) - return sctp_sched_set_sched(asoc, params.assoc_value); + return sctp_sched_set_sched(asoc, params->assoc_value); if (sctp_style(sk, TCP)) - params.assoc_id = SCTP_FUTURE_ASSOC; + params->assoc_id = SCTP_FUTURE_ASSOC; - if (params.assoc_id == SCTP_FUTURE_ASSOC || - params.assoc_id == SCTP_ALL_ASSOC) - sp->default_ss = params.assoc_value; + if (params->assoc_id == SCTP_FUTURE_ASSOC || + params->assoc_id == SCTP_ALL_ASSOC) + sp->default_ss = params->assoc_value; - if (params.assoc_id == SCTP_CURRENT_ASSOC || - params.assoc_id == SCTP_ALL_ASSOC) { + if (params->assoc_id == SCTP_CURRENT_ASSOC || + params->assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &sp->ep->asocs, asocs) { int ret = sctp_sched_set_sched(asoc, - params.assoc_value); + params->assoc_value); if (ret && !retval) retval = ret; @@ -4350,38 +4166,32 @@ static int sctp_setsockopt_scheduler(struct sock *sk, } static int sctp_setsockopt_scheduler_value(struct sock *sk, - char __user *optval, + struct sctp_stream_value *params, unsigned int optlen) { - struct sctp_stream_value params; struct sctp_association *asoc; int retval = -EINVAL; - if (optlen < sizeof(params)) - goto out; - - optlen = sizeof(params); - if (copy_from_user(¶ms, optval, optlen)) { - retval = -EFAULT; + if (optlen < sizeof(*params)) goto out; - } - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id != SCTP_CURRENT_ASSOC && + asoc = sctp_id2assoc(sk, params->assoc_id); + if (!asoc && params->assoc_id != SCTP_CURRENT_ASSOC && sctp_style(sk, UDP)) goto out; if (asoc) { - retval = sctp_sched_set_value(asoc, params.stream_id, - params.stream_value, GFP_KERNEL); + retval = sctp_sched_set_value(asoc, params->stream_id, + params->stream_value, GFP_KERNEL); goto out; } retval = 0; list_for_each_entry(asoc, &sctp_sk(sk)->ep->asocs, asocs) { - int ret = sctp_sched_set_value(asoc, params.stream_id, - params.stream_value, GFP_KERNEL); + int ret = sctp_sched_set_value(asoc, params->stream_id, + params->stream_value, + GFP_KERNEL); if (ret && !retval) /* try to return the 1st error. */ retval = ret; } @@ -4391,46 +4201,30 @@ out: } static int sctp_setsockopt_interleaving_supported(struct sock *sk, - char __user *optval, + struct sctp_assoc_value *p, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); - struct sctp_assoc_value params; struct sctp_association *asoc; - int retval = -EINVAL; - if (optlen < sizeof(params)) - goto out; - - optlen = sizeof(params); - if (copy_from_user(¶ms, optval, optlen)) { - retval = -EFAULT; - goto out; - } + if (optlen < sizeof(*p)) + return -EINVAL; - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && - sctp_style(sk, UDP)) - goto out; + asoc = sctp_id2assoc(sk, p->assoc_id); + if (!asoc && p->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) + return -EINVAL; if (!sock_net(sk)->sctp.intl_enable || !sp->frag_interleave) { - retval = -EPERM; - goto out; + return -EPERM; } - sp->ep->intl_enable = !!params.assoc_value; - - retval = 0; - -out: - return retval; + sp->ep->intl_enable = !!p->assoc_value; + return 0; } -static int sctp_setsockopt_reuse_port(struct sock *sk, char __user *optval, +static int sctp_setsockopt_reuse_port(struct sock *sk, int *val, unsigned int optlen) { - int val; - if (!sctp_style(sk, TCP)) return -EOPNOTSUPP; @@ -4440,10 +4234,7 @@ static int sctp_setsockopt_reuse_port(struct sock *sk, char __user *optval, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) - return -EFAULT; - - sctp_sk(sk)->reuse = !!val; + sctp_sk(sk)->reuse = !!*val; return 0; } @@ -4469,45 +4260,40 @@ static int sctp_assoc_ulpevent_type_set(struct sctp_event *param, return 0; } -static int sctp_setsockopt_event(struct sock *sk, char __user *optval, +static int sctp_setsockopt_event(struct sock *sk, struct sctp_event *param, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; - struct sctp_event param; int retval = 0; - if (optlen < sizeof(param)) + if (optlen < sizeof(*param)) return -EINVAL; - optlen = sizeof(param); - if (copy_from_user(¶m, optval, optlen)) - return -EFAULT; - - if (param.se_type < SCTP_SN_TYPE_BASE || - param.se_type > SCTP_SN_TYPE_MAX) + if (param->se_type < SCTP_SN_TYPE_BASE || + param->se_type > SCTP_SN_TYPE_MAX) return -EINVAL; - asoc = sctp_id2assoc(sk, param.se_assoc_id); - if (!asoc && param.se_assoc_id > SCTP_ALL_ASSOC && + asoc = sctp_id2assoc(sk, param->se_assoc_id); + if (!asoc && param->se_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) - return sctp_assoc_ulpevent_type_set(¶m, asoc); + return sctp_assoc_ulpevent_type_set(param, asoc); if (sctp_style(sk, TCP)) - param.se_assoc_id = SCTP_FUTURE_ASSOC; + param->se_assoc_id = SCTP_FUTURE_ASSOC; - if (param.se_assoc_id == SCTP_FUTURE_ASSOC || - param.se_assoc_id == SCTP_ALL_ASSOC) + if (param->se_assoc_id == SCTP_FUTURE_ASSOC || + param->se_assoc_id == SCTP_ALL_ASSOC) sctp_ulpevent_type_set(&sp->subscribe, - param.se_type, param.se_on); + param->se_type, param->se_on); - if (param.se_assoc_id == SCTP_CURRENT_ASSOC || - param.se_assoc_id == SCTP_ALL_ASSOC) { + if (param->se_assoc_id == SCTP_CURRENT_ASSOC || + param->se_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &sp->ep->asocs, asocs) { - int ret = sctp_assoc_ulpevent_type_set(¶m, asoc); + int ret = sctp_assoc_ulpevent_type_set(param, asoc); if (ret && !retval) retval = ret; @@ -4518,29 +4304,23 @@ static int sctp_setsockopt_event(struct sock *sk, char __user *optval, } static int sctp_setsockopt_asconf_supported(struct sock *sk, - char __user *optval, + struct sctp_assoc_value *params, unsigned int optlen) { - struct sctp_assoc_value params; struct sctp_association *asoc; struct sctp_endpoint *ep; int retval = -EINVAL; - if (optlen != sizeof(params)) + if (optlen != sizeof(*params)) goto out; - if (copy_from_user(¶ms, optval, optlen)) { - retval = -EFAULT; - goto out; - } - - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + asoc = sctp_id2assoc(sk, params->assoc_id); + if (!asoc && params->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) goto out; ep = sctp_sk(sk)->ep; - ep->asconf_enable = !!params.assoc_value; + ep->asconf_enable = !!params->assoc_value; if (ep->asconf_enable && ep->auth_enable) { sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF); @@ -4554,29 +4334,23 @@ out: } static int sctp_setsockopt_auth_supported(struct sock *sk, - char __user *optval, + struct sctp_assoc_value *params, unsigned int optlen) { - struct sctp_assoc_value params; struct sctp_association *asoc; struct sctp_endpoint *ep; int retval = -EINVAL; - if (optlen != sizeof(params)) - goto out; - - if (copy_from_user(¶ms, optval, optlen)) { - retval = -EFAULT; + if (optlen != sizeof(*params)) goto out; - } - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + asoc = sctp_id2assoc(sk, params->assoc_id); + if (!asoc && params->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) goto out; ep = sctp_sk(sk)->ep; - if (params.assoc_value) { + if (params->assoc_value) { retval = sctp_auth_init(ep, GFP_KERNEL); if (retval) goto out; @@ -4586,7 +4360,7 @@ static int sctp_setsockopt_auth_supported(struct sock *sk, } } - ep->auth_enable = !!params.assoc_value; + ep->auth_enable = !!params->assoc_value; retval = 0; out: @@ -4594,27 +4368,21 @@ out: } static int sctp_setsockopt_ecn_supported(struct sock *sk, - char __user *optval, + struct sctp_assoc_value *params, unsigned int optlen) { - struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EINVAL; - if (optlen != sizeof(params)) - goto out; - - if (copy_from_user(¶ms, optval, optlen)) { - retval = -EFAULT; + if (optlen != sizeof(*params)) goto out; - } - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + asoc = sctp_id2assoc(sk, params->assoc_id); + if (!asoc && params->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) goto out; - sctp_sk(sk)->ep->ecn_enable = !!params.assoc_value; + sctp_sk(sk)->ep->ecn_enable = !!params->assoc_value; retval = 0; out: @@ -4622,33 +4390,27 @@ out: } static int sctp_setsockopt_pf_expose(struct sock *sk, - char __user *optval, + struct sctp_assoc_value *params, unsigned int optlen) { - struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EINVAL; - if (optlen != sizeof(params)) + if (optlen != sizeof(*params)) goto out; - if (copy_from_user(¶ms, optval, optlen)) { - retval = -EFAULT; - goto out; - } - - if (params.assoc_value > SCTP_PF_EXPOSE_MAX) + if (params->assoc_value > SCTP_PF_EXPOSE_MAX) goto out; - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + asoc = sctp_id2assoc(sk, params->assoc_id); + if (!asoc && params->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) goto out; if (asoc) - asoc->pf_expose = params.assoc_value; + asoc->pf_expose = params->assoc_value; else - sctp_sk(sk)->pf_expose = params.assoc_value; + sctp_sk(sk)->pf_expose = params->assoc_value; retval = 0; out: @@ -4675,8 +4437,9 @@ out: * optlen - the size of the buffer. */ static int sctp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { + void *kopt = NULL; int retval = 0; pr_debug("%s: sk:%p, optname:%d\n", __func__, sk, optname); @@ -4689,8 +4452,14 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, */ if (level != SOL_SCTP) { struct sctp_af *af = sctp_sk(sk)->pf->af; - retval = af->setsockopt(sk, level, optname, optval, optlen); - goto out_nounlock; + + return af->setsockopt(sk, level, optname, optval, optlen); + } + + if (optlen > 0) { + kopt = memdup_sockptr(optval, optlen); + if (IS_ERR(kopt)) + return PTR_ERR(kopt); } lock_sock(sk); @@ -4698,179 +4467,174 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, switch (optname) { case SCTP_SOCKOPT_BINDX_ADD: /* 'optlen' is the size of the addresses buffer. */ - retval = sctp_setsockopt_bindx(sk, (struct sockaddr __user *)optval, - optlen, SCTP_BINDX_ADD_ADDR); + retval = sctp_setsockopt_bindx(sk, kopt, optlen, + SCTP_BINDX_ADD_ADDR); break; case SCTP_SOCKOPT_BINDX_REM: /* 'optlen' is the size of the addresses buffer. */ - retval = sctp_setsockopt_bindx(sk, (struct sockaddr __user *)optval, - optlen, SCTP_BINDX_REM_ADDR); + retval = sctp_setsockopt_bindx(sk, kopt, optlen, + SCTP_BINDX_REM_ADDR); break; case SCTP_SOCKOPT_CONNECTX_OLD: /* 'optlen' is the size of the addresses buffer. */ - retval = sctp_setsockopt_connectx_old(sk, - (struct sockaddr __user *)optval, - optlen); + retval = sctp_setsockopt_connectx_old(sk, kopt, optlen); break; case SCTP_SOCKOPT_CONNECTX: /* 'optlen' is the size of the addresses buffer. */ - retval = sctp_setsockopt_connectx(sk, - (struct sockaddr __user *)optval, - optlen); + retval = sctp_setsockopt_connectx(sk, kopt, optlen); break; case SCTP_DISABLE_FRAGMENTS: - retval = sctp_setsockopt_disable_fragments(sk, optval, optlen); + retval = sctp_setsockopt_disable_fragments(sk, kopt, optlen); break; case SCTP_EVENTS: - retval = sctp_setsockopt_events(sk, optval, optlen); + retval = sctp_setsockopt_events(sk, kopt, optlen); break; case SCTP_AUTOCLOSE: - retval = sctp_setsockopt_autoclose(sk, optval, optlen); + retval = sctp_setsockopt_autoclose(sk, kopt, optlen); break; case SCTP_PEER_ADDR_PARAMS: - retval = sctp_setsockopt_peer_addr_params(sk, optval, optlen); + retval = sctp_setsockopt_peer_addr_params(sk, kopt, optlen); break; case SCTP_DELAYED_SACK: - retval = sctp_setsockopt_delayed_ack(sk, optval, optlen); + retval = sctp_setsockopt_delayed_ack(sk, kopt, optlen); break; case SCTP_PARTIAL_DELIVERY_POINT: - retval = sctp_setsockopt_partial_delivery_point(sk, optval, optlen); + retval = sctp_setsockopt_partial_delivery_point(sk, kopt, optlen); break; case SCTP_INITMSG: - retval = sctp_setsockopt_initmsg(sk, optval, optlen); + retval = sctp_setsockopt_initmsg(sk, kopt, optlen); break; case SCTP_DEFAULT_SEND_PARAM: - retval = sctp_setsockopt_default_send_param(sk, optval, - optlen); + retval = sctp_setsockopt_default_send_param(sk, kopt, optlen); break; case SCTP_DEFAULT_SNDINFO: - retval = sctp_setsockopt_default_sndinfo(sk, optval, optlen); + retval = sctp_setsockopt_default_sndinfo(sk, kopt, optlen); break; case SCTP_PRIMARY_ADDR: - retval = sctp_setsockopt_primary_addr(sk, optval, optlen); + retval = sctp_setsockopt_primary_addr(sk, kopt, optlen); break; case SCTP_SET_PEER_PRIMARY_ADDR: - retval = sctp_setsockopt_peer_primary_addr(sk, optval, optlen); + retval = sctp_setsockopt_peer_primary_addr(sk, kopt, optlen); break; case SCTP_NODELAY: - retval = sctp_setsockopt_nodelay(sk, optval, optlen); + retval = sctp_setsockopt_nodelay(sk, kopt, optlen); break; case SCTP_RTOINFO: - retval = sctp_setsockopt_rtoinfo(sk, optval, optlen); + retval = sctp_setsockopt_rtoinfo(sk, kopt, optlen); break; case SCTP_ASSOCINFO: - retval = sctp_setsockopt_associnfo(sk, optval, optlen); + retval = sctp_setsockopt_associnfo(sk, kopt, optlen); break; case SCTP_I_WANT_MAPPED_V4_ADDR: - retval = sctp_setsockopt_mappedv4(sk, optval, optlen); + retval = sctp_setsockopt_mappedv4(sk, kopt, optlen); break; case SCTP_MAXSEG: - retval = sctp_setsockopt_maxseg(sk, optval, optlen); + retval = sctp_setsockopt_maxseg(sk, kopt, optlen); break; case SCTP_ADAPTATION_LAYER: - retval = sctp_setsockopt_adaptation_layer(sk, optval, optlen); + retval = sctp_setsockopt_adaptation_layer(sk, kopt, optlen); break; case SCTP_CONTEXT: - retval = sctp_setsockopt_context(sk, optval, optlen); + retval = sctp_setsockopt_context(sk, kopt, optlen); break; case SCTP_FRAGMENT_INTERLEAVE: - retval = sctp_setsockopt_fragment_interleave(sk, optval, optlen); + retval = sctp_setsockopt_fragment_interleave(sk, kopt, optlen); break; case SCTP_MAX_BURST: - retval = sctp_setsockopt_maxburst(sk, optval, optlen); + retval = sctp_setsockopt_maxburst(sk, kopt, optlen); break; case SCTP_AUTH_CHUNK: - retval = sctp_setsockopt_auth_chunk(sk, optval, optlen); + retval = sctp_setsockopt_auth_chunk(sk, kopt, optlen); break; case SCTP_HMAC_IDENT: - retval = sctp_setsockopt_hmac_ident(sk, optval, optlen); + retval = sctp_setsockopt_hmac_ident(sk, kopt, optlen); break; case SCTP_AUTH_KEY: - retval = sctp_setsockopt_auth_key(sk, optval, optlen); + retval = sctp_setsockopt_auth_key(sk, kopt, optlen); break; case SCTP_AUTH_ACTIVE_KEY: - retval = sctp_setsockopt_active_key(sk, optval, optlen); + retval = sctp_setsockopt_active_key(sk, kopt, optlen); break; case SCTP_AUTH_DELETE_KEY: - retval = sctp_setsockopt_del_key(sk, optval, optlen); + retval = sctp_setsockopt_del_key(sk, kopt, optlen); break; case SCTP_AUTH_DEACTIVATE_KEY: - retval = sctp_setsockopt_deactivate_key(sk, optval, optlen); + retval = sctp_setsockopt_deactivate_key(sk, kopt, optlen); break; case SCTP_AUTO_ASCONF: - retval = sctp_setsockopt_auto_asconf(sk, optval, optlen); + retval = sctp_setsockopt_auto_asconf(sk, kopt, optlen); break; case SCTP_PEER_ADDR_THLDS: - retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen, + retval = sctp_setsockopt_paddr_thresholds(sk, kopt, optlen, false); break; case SCTP_PEER_ADDR_THLDS_V2: - retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen, + retval = sctp_setsockopt_paddr_thresholds(sk, kopt, optlen, true); break; case SCTP_RECVRCVINFO: - retval = sctp_setsockopt_recvrcvinfo(sk, optval, optlen); + retval = sctp_setsockopt_recvrcvinfo(sk, kopt, optlen); break; case SCTP_RECVNXTINFO: - retval = sctp_setsockopt_recvnxtinfo(sk, optval, optlen); + retval = sctp_setsockopt_recvnxtinfo(sk, kopt, optlen); break; case SCTP_PR_SUPPORTED: - retval = sctp_setsockopt_pr_supported(sk, optval, optlen); + retval = sctp_setsockopt_pr_supported(sk, kopt, optlen); break; case SCTP_DEFAULT_PRINFO: - retval = sctp_setsockopt_default_prinfo(sk, optval, optlen); + retval = sctp_setsockopt_default_prinfo(sk, kopt, optlen); break; case SCTP_RECONFIG_SUPPORTED: - retval = sctp_setsockopt_reconfig_supported(sk, optval, optlen); + retval = sctp_setsockopt_reconfig_supported(sk, kopt, optlen); break; case SCTP_ENABLE_STREAM_RESET: - retval = sctp_setsockopt_enable_strreset(sk, optval, optlen); + retval = sctp_setsockopt_enable_strreset(sk, kopt, optlen); break; case SCTP_RESET_STREAMS: - retval = sctp_setsockopt_reset_streams(sk, optval, optlen); + retval = sctp_setsockopt_reset_streams(sk, kopt, optlen); break; case SCTP_RESET_ASSOC: - retval = sctp_setsockopt_reset_assoc(sk, optval, optlen); + retval = sctp_setsockopt_reset_assoc(sk, kopt, optlen); break; case SCTP_ADD_STREAMS: - retval = sctp_setsockopt_add_streams(sk, optval, optlen); + retval = sctp_setsockopt_add_streams(sk, kopt, optlen); break; case SCTP_STREAM_SCHEDULER: - retval = sctp_setsockopt_scheduler(sk, optval, optlen); + retval = sctp_setsockopt_scheduler(sk, kopt, optlen); break; case SCTP_STREAM_SCHEDULER_VALUE: - retval = sctp_setsockopt_scheduler_value(sk, optval, optlen); + retval = sctp_setsockopt_scheduler_value(sk, kopt, optlen); break; case SCTP_INTERLEAVING_SUPPORTED: - retval = sctp_setsockopt_interleaving_supported(sk, optval, + retval = sctp_setsockopt_interleaving_supported(sk, kopt, optlen); break; case SCTP_REUSE_PORT: - retval = sctp_setsockopt_reuse_port(sk, optval, optlen); + retval = sctp_setsockopt_reuse_port(sk, kopt, optlen); break; case SCTP_EVENT: - retval = sctp_setsockopt_event(sk, optval, optlen); + retval = sctp_setsockopt_event(sk, kopt, optlen); break; case SCTP_ASCONF_SUPPORTED: - retval = sctp_setsockopt_asconf_supported(sk, optval, optlen); + retval = sctp_setsockopt_asconf_supported(sk, kopt, optlen); break; case SCTP_AUTH_SUPPORTED: - retval = sctp_setsockopt_auth_supported(sk, optval, optlen); + retval = sctp_setsockopt_auth_supported(sk, kopt, optlen); break; case SCTP_ECN_SUPPORTED: - retval = sctp_setsockopt_ecn_supported(sk, optval, optlen); + retval = sctp_setsockopt_ecn_supported(sk, kopt, optlen); break; case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE: - retval = sctp_setsockopt_pf_expose(sk, optval, optlen); + retval = sctp_setsockopt_pf_expose(sk, kopt, optlen); break; default: retval = -ENOPROTOOPT; @@ -4878,8 +4642,7 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, } release_sock(sk); - -out_nounlock: + kfree(kopt); return retval; } diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 1163d51196da..e7649bbc2b87 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -719,8 +719,11 @@ static int smc_connect_ism(struct smc_sock *smc, } /* Create send and receive buffers */ - if (smc_buf_create(smc, true)) - return smc_connect_abort(smc, SMC_CLC_DECL_MEM, + rc = smc_buf_create(smc, true); + if (rc) + return smc_connect_abort(smc, (rc == -ENOSPC) ? + SMC_CLC_DECL_MAX_DMB : + SMC_CLC_DECL_MEM, ini->cln_first_contact); smc_conn_save_peer_info(smc, aclc); @@ -1200,12 +1203,14 @@ static int smc_listen_ism_init(struct smc_sock *new_smc, } /* Create send and receive buffers */ - if (smc_buf_create(new_smc, true)) { + rc = smc_buf_create(new_smc, true); + if (rc) { if (ini->cln_first_contact == SMC_FIRST_CONTACT) smc_lgr_cleanup_early(&new_smc->conn); else smc_conn_free(&new_smc->conn); - return SMC_CLC_DECL_MEM; + return (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : + SMC_CLC_DECL_MEM; } return 0; @@ -1735,7 +1740,7 @@ out: } static int smc_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -1746,8 +1751,11 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, /* generic setsockopts reaching us here always apply to the * CLC socket */ - rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname, - optval, optlen); + if (unlikely(!smc->clcsock->ops->setsockopt)) + rc = -EOPNOTSUPP; + else + rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname, + optval, optlen); if (smc->clcsock->sk->sk_err) { sk->sk_err = smc->clcsock->sk->sk_err; sk->sk_error_report(sk); @@ -1755,7 +1763,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; lock_sock(sk); @@ -1812,6 +1820,8 @@ static int smc_getsockopt(struct socket *sock, int level, int optname, smc = smc_sk(sock->sk); /* socket options apply to the CLC socket */ + if (unlikely(!smc->clcsock->ops->getsockopt)) + return -EOPNOTSUPP; return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, optval, optlen); } diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 76c2b150d040..cf7b45306f4e 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -48,6 +48,7 @@ #define SMC_CLC_DECL_NOACTLINK 0x030a0000 /* no active smc-r link in lgr */ #define SMC_CLC_DECL_NOSRVLINK 0x030b0000 /* SMC-R link from srv not found */ #define SMC_CLC_DECL_VERSMISMAT 0x030c0000 /* SMC version mismatch */ +#define SMC_CLC_DECL_MAX_DMB 0x030d0000 /* SMC-D DMB limit exceeded */ #define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ #define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */ #define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index f82a2e599917..b42fa3b00d00 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1614,7 +1614,7 @@ static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, rc = smc_ism_register_dmb(lgr, bufsize, buf_desc); if (rc) { kfree(buf_desc); - return ERR_PTR(-EAGAIN); + return (rc == -ENOMEM) ? ERR_PTR(-EAGAIN) : ERR_PTR(rc); } buf_desc->pages = virt_to_page(buf_desc->cpu_addr); /* CDC header stored in buf. So, pretend it was smaller */ @@ -1688,7 +1688,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) } if (IS_ERR(buf_desc)) - return -ENOMEM; + return PTR_ERR(buf_desc); if (!is_smcd) { if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) { diff --git a/net/socket.c b/net/socket.c index 976426d03f09..aff52e81653c 100644 --- a/net/socket.c +++ b/net/socket.c @@ -586,15 +586,6 @@ struct socket *sock_alloc(void) } EXPORT_SYMBOL(sock_alloc); -/** - * sock_release - close a socket - * @sock: socket to close - * - * The socket is released from the protocol stack if it has a release - * callback, and the inode is then released if the socket is bound to - * an inode not a file. - */ - static void __sock_release(struct socket *sock, struct inode *inode) { if (sock->ops) { @@ -620,6 +611,14 @@ static void __sock_release(struct socket *sock, struct inode *inode) sock->file = NULL; } +/** + * sock_release - close a socket + * @sock: socket to close + * + * The socket is released from the protocol stack if it has a release + * callback, and the inode is then released if the socket is bound to + * an inode not a file. + */ void sock_release(struct socket *sock) { __sock_release(sock, NULL); @@ -2080,15 +2079,25 @@ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size, return __sys_recvfrom(fd, ubuf, size, flags, NULL, NULL); } +static bool sock_use_custom_sol_socket(const struct socket *sock) +{ + const struct sock *sk = sock->sk; + + /* Use sock->ops->setsockopt() for MPTCP */ + return IS_ENABLED(CONFIG_MPTCP) && + sk->sk_protocol == IPPROTO_MPTCP && + sk->sk_type == SOCK_STREAM && + (sk->sk_family == AF_INET || sk->sk_family == AF_INET6); +} + /* * Set a socket option. Because we don't know the option lengths we have * to pass the user mode parameter for the protocols to sort out. */ - -static int __sys_setsockopt(int fd, int level, int optname, - char __user *optval, int optlen) +int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval, + int optlen) { - mm_segment_t oldfs = get_fs(); + sockptr_t optval; char *kernel_optval = NULL; int err, fput_needed; struct socket *sock; @@ -2096,44 +2105,41 @@ static int __sys_setsockopt(int fd, int level, int optname, if (optlen < 0) return -EINVAL; - sock = sockfd_lookup_light(fd, &err, &fput_needed); - if (sock != NULL) { - err = security_socket_setsockopt(sock, level, optname); - if (err) - goto out_put; + err = init_user_sockptr(&optval, user_optval, optlen); + if (err) + return err; - err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, - &optname, optval, &optlen, - &kernel_optval); + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (!sock) + return err; - if (err < 0) { - goto out_put; - } else if (err > 0) { - err = 0; - goto out_put; - } + err = security_socket_setsockopt(sock, level, optname); + if (err) + goto out_put; - if (kernel_optval) { - set_fs(KERNEL_DS); - optval = (char __user __force *)kernel_optval; - } + if (!in_compat_syscall()) + err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, &optname, + user_optval, &optlen, + &kernel_optval); + if (err < 0) + goto out_put; + if (err > 0) { + err = 0; + goto out_put; + } - if (level == SOL_SOCKET) - err = - sock_setsockopt(sock, level, optname, optval, + if (kernel_optval) + optval = KERNEL_SOCKPTR(kernel_optval); + if (level == SOL_SOCKET && !sock_use_custom_sol_socket(sock)) + err = sock_setsockopt(sock, level, optname, optval, optlen); + else if (unlikely(!sock->ops->setsockopt)) + err = -EOPNOTSUPP; + else + err = sock->ops->setsockopt(sock, level, optname, optval, optlen); - else - err = - sock->ops->setsockopt(sock, level, optname, optval, - optlen); - - if (kernel_optval) { - set_fs(oldfs); - kfree(kernel_optval); - } + kfree(kernel_optval); out_put: - fput_light(sock->file, fput_needed); - } + fput_light(sock->file, fput_needed); return err; } @@ -2147,37 +2153,38 @@ SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, * Get a socket option. Because we don't know the option lengths we have * to pass a user mode parameter for the protocols to sort out. */ - -static int __sys_getsockopt(int fd, int level, int optname, - char __user *optval, int __user *optlen) +int __sys_getsockopt(int fd, int level, int optname, char __user *optval, + int __user *optlen) { int err, fput_needed; struct socket *sock; int max_optlen; sock = sockfd_lookup_light(fd, &err, &fput_needed); - if (sock != NULL) { - err = security_socket_getsockopt(sock, level, optname); - if (err) - goto out_put; + if (!sock) + return err; + err = security_socket_getsockopt(sock, level, optname); + if (err) + goto out_put; + + if (!in_compat_syscall()) max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen); - if (level == SOL_SOCKET) - err = - sock_getsockopt(sock, level, optname, optval, + if (level == SOL_SOCKET) + err = sock_getsockopt(sock, level, optname, optval, optlen); + else if (unlikely(!sock->ops->getsockopt)) + err = -EOPNOTSUPP; + else + err = sock->ops->getsockopt(sock, level, optname, optval, optlen); - else - err = - sock->ops->getsockopt(sock, level, optname, optval, - optlen); + if (!in_compat_syscall()) err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname, - optval, optlen, - max_optlen, err); + optval, optlen, max_optlen, + err); out_put: - fput_light(sock->file, fput_needed); - } + fput_light(sock->file, fput_needed); return err; } diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index f25604d68337..865f3e037425 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -304,8 +304,8 @@ static int switchdev_port_obj_add_defer(struct net_device *dev, * switchdev_port_obj_add - Add port object * * @dev: port device - * @id: object ID * @obj: object to add + * @extack: netlink extended ack * * Use a 2-phase prepare-commit transaction model to ensure * system is not left in a partially updated state due to @@ -357,7 +357,6 @@ static int switchdev_port_obj_del_defer(struct net_device *dev, * switchdev_port_obj_del - Delete port object * * @dev: port device - * @id: object ID * @obj: object to delete * * rtnl_lock must be held and must not be in atomic section, diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 383f87bc1061..940d176e0e87 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -250,8 +250,8 @@ static void tipc_bcast_select_xmit_method(struct net *net, int dests, * Consumes the buffer chain. * Returns 0 if success, otherwise errno: -EHOSTUNREACH,-EMSGSIZE */ -static int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts, - u16 *cong_link_cnt) +int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts, + u16 *cong_link_cnt) { struct tipc_link *l = tipc_bc_sndlink(net); struct sk_buff_head xmitq; @@ -752,7 +752,7 @@ void tipc_nlist_purge(struct tipc_nlist *nl) nl->local = false; } -u32 tipc_bcast_get_broadcast_mode(struct net *net) +u32 tipc_bcast_get_mode(struct net *net) { struct tipc_bc_base *bb = tipc_bc_base(net); diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index 4240c95188b1..2d9352dc7b0e 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -90,6 +90,8 @@ void tipc_bcast_toggle_rcast(struct net *net, bool supp); int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, struct tipc_mc_method *method, struct tipc_nlist *dests, u16 *cong_link_cnt); +int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts, + u16 *cong_link_cnt); int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb); void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, struct tipc_msg *hdr); @@ -101,7 +103,7 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg, int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]); int tipc_bclink_reset_stats(struct net *net, struct tipc_link *l); -u32 tipc_bcast_get_broadcast_mode(struct net *net); +u32 tipc_bcast_get_mode(struct net *net); u32 tipc_bcast_get_broadcast_ratio(struct net *net); void tipc_mcast_filter_msg(struct net *net, struct sk_buff_head *defq, diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index e366ec9a7e4d..808b147df7d5 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -595,7 +595,7 @@ void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, /** * tipc_l2_rcv_msg - handle incoming TIPC message from an interface - * @buf: the received packet + * @skb: the received message * @dev: the net device that the packet was received on * @pt: the packet_type structure which was used to register this handler * @orig_dev: the original receive net device in case the device is a bond diff --git a/net/tipc/discover.c b/net/tipc/discover.c index bfe43da127c0..d4ecacddb40c 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -74,7 +74,7 @@ struct tipc_discoverer { /** * tipc_disc_init_msg - initialize a link setup message * @net: the applicable net namespace - * @type: message type (request or response) + * @mtyp: message type (request or response) * @b: ptr to bearer issuing message */ static void tipc_disc_init_msg(struct net *net, struct sk_buff *skb, @@ -339,7 +339,7 @@ exit: * @net: the applicable net namespace * @b: ptr to bearer issuing requests * @dest: destination address for request messages - * @dest_domain: network domain to which links can be established + * @skb: pointer to created frame * * Returns 0 if successful, otherwise -errno. */ @@ -393,7 +393,6 @@ void tipc_disc_delete(struct tipc_discoverer *d) * tipc_disc_reset - reset object to send periodic link setup requests * @net: the applicable net namespace * @b: ptr to bearer issuing requests - * @dest_domain: network domain to which links can be established */ void tipc_disc_reset(struct net *net, struct tipc_bearer *b) { diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c index 8b0bb600602d..c68019697cfe 100644 --- a/net/tipc/eth_media.c +++ b/net/tipc/eth_media.c @@ -62,12 +62,10 @@ static int tipc_eth_raw2addr(struct tipc_bearer *b, struct tipc_media_addr *addr, char *msg) { - char bcast_mac[ETH_ALEN] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; - memset(addr, 0, sizeof(*addr)); ether_addr_copy(addr->value, msg); addr->media_id = TIPC_MEDIA_TYPE_ETH; - addr->broadcast = !memcmp(addr->value, bcast_mac, ETH_ALEN); + addr->broadcast = is_broadcast_ether_addr(addr->value); return 0; } diff --git a/net/tipc/link.c b/net/tipc/link.c index d40f8e5b7683..107578122973 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -445,7 +445,7 @@ u32 tipc_link_state(struct tipc_link *l) /** * tipc_link_create - create a new link - * @n: pointer to associated node + * @net: pointer to associated network namespace * @if_name: associated interface name * @bearer_id: id (index) of associated bearer * @tolerance: link tolerance to be used by link @@ -530,7 +530,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, /** * tipc_link_bc_create - create new link to be used for broadcast - * @n: pointer to associated node + * @net: pointer to associated network namespace * @mtu: mtu to be used initially if no peers * @window: send window to be used * @inputq: queue to put messages ready for delivery @@ -989,7 +989,7 @@ void tipc_link_reset(struct tipc_link *l) /** * tipc_link_xmit(): enqueue buffer list according to queue situation - * @link: link to use + * @l: link to use * @list: chain of buffers containing message * @xmitq: returned list of packets to be sent by caller * @@ -1396,12 +1396,12 @@ u16 tipc_get_gap_ack_blks(struct tipc_gap_ack_blks **ga, struct tipc_link *l, p = (struct tipc_gap_ack_blks *)msg_data(hdr); sz = ntohs(p->len); /* Sanity check */ - if (sz == tipc_gap_ack_blks_sz(p->ugack_cnt + p->bgack_cnt)) { + if (sz == struct_size(p, gacks, p->ugack_cnt + p->bgack_cnt)) { /* Good, check if the desired type exists */ if ((uc && p->ugack_cnt) || (!uc && p->bgack_cnt)) goto ok; /* Backward compatible: peer might not support bc, but uc? */ - } else if (uc && sz == tipc_gap_ack_blks_sz(p->ugack_cnt)) { + } else if (uc && sz == struct_size(p, gacks, p->ugack_cnt)) { if (p->ugack_cnt) { p->bgack_cnt = 0; goto ok; @@ -1483,7 +1483,7 @@ static u16 tipc_build_gap_ack_blks(struct tipc_link *l, struct tipc_msg *hdr) __tipc_build_gap_ack_blks(ga, l, ga->bgack_cnt) : 0; /* Total len */ - len = tipc_gap_ack_blks_sz(ga->bgack_cnt + ga->ugack_cnt); + len = struct_size(ga, gacks, ga->bgack_cnt + ga->ugack_cnt); ga->len = htons(len); return len; } @@ -1532,7 +1532,7 @@ static int tipc_link_advance_transmq(struct tipc_link *l, struct tipc_link *r, gacks = &ga->gacks[ga->bgack_cnt]; } else if (ga) { /* Copy the Gap ACKs, bc part, for later renewal if needed */ - this_ga = kmemdup(ga, tipc_gap_ack_blks_sz(ga->bgack_cnt), + this_ga = kmemdup(ga, struct_size(ga, gacks, ga->bgack_cnt), GFP_ATOMIC); if (likely(this_ga)) { this_ga->start_index = 0; @@ -2755,7 +2755,7 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg, void *hdr; struct nlattr *attrs; struct nlattr *prop; - u32 bc_mode = tipc_bcast_get_broadcast_mode(net); + u32 bc_mode = tipc_bcast_get_mode(net); u32 bc_ratio = tipc_bcast_get_broadcast_ratio(net); if (!bcl) diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 01b64869a173..848fae674532 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -202,7 +202,7 @@ err: /** * tipc_msg_append(): Append data to tail of an existing buffer queue - * @hdr: header to be used + * @_hdr: header to be used * @m: the data to be appended * @mss: max allowable size of buffer * @dlen: size of data to be appended diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 58660d56bc83..1016e96db5c4 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -189,11 +189,9 @@ struct tipc_gap_ack_blks { struct tipc_gap_ack gacks[]; }; -#define tipc_gap_ack_blks_sz(n) (sizeof(struct tipc_gap_ack_blks) + \ - sizeof(struct tipc_gap_ack) * (n)) - #define MAX_GAP_ACK_BLKS 128 -#define MAX_GAP_ACK_BLKS_SZ tipc_gap_ack_blks_sz(MAX_GAP_ACK_BLKS) +#define MAX_GAP_ACK_BLKS_SZ (sizeof(struct tipc_gap_ack_blks) + \ + sizeof(struct tipc_gap_ack) * MAX_GAP_ACK_BLKS) static inline struct tipc_msg *buf_msg(struct sk_buff *skb) { @@ -438,6 +436,36 @@ static inline void msg_set_errcode(struct tipc_msg *m, u32 err) msg_set_bits(m, 1, 25, 0xf, err); } +static inline void msg_set_bulk(struct tipc_msg *m) +{ + msg_set_bits(m, 1, 28, 0x1, 1); +} + +static inline u32 msg_is_bulk(struct tipc_msg *m) +{ + return msg_bits(m, 1, 28, 0x1); +} + +static inline void msg_set_last_bulk(struct tipc_msg *m) +{ + msg_set_bits(m, 1, 27, 0x1, 1); +} + +static inline u32 msg_is_last_bulk(struct tipc_msg *m) +{ + return msg_bits(m, 1, 27, 0x1); +} + +static inline void msg_set_non_legacy(struct tipc_msg *m) +{ + msg_set_bits(m, 1, 26, 0x1, 1); +} + +static inline u32 msg_is_legacy(struct tipc_msg *m) +{ + return !msg_bits(m, 1, 26, 0x1); +} + static inline u32 msg_reroute_cnt(struct tipc_msg *m) { return msg_bits(m, 1, 21, 0xf); @@ -567,6 +595,16 @@ static inline void msg_set_origport(struct tipc_msg *m, u32 p) msg_set_word(m, 4, p); } +static inline u16 msg_named_seqno(struct tipc_msg *m) +{ + return msg_bits(m, 4, 0, 0xffff); +} + +static inline void msg_set_named_seqno(struct tipc_msg *m, u16 n) +{ + msg_set_bits(m, 4, 0, 0xffff, n); +} + static inline u32 msg_destport(struct tipc_msg *m) { return msg_word(m, 5); diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 5feaf3b67380..2f9c148f17e2 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -102,7 +102,8 @@ struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ) pr_warn("Publication distribution failure\n"); return NULL; } - + msg_set_named_seqno(buf_msg(skb), nt->snd_nxt++); + msg_set_non_legacy(buf_msg(skb)); item = (struct distr_item *)msg_data(buf_msg(skb)); publ_to_item(item, publ); return skb; @@ -114,8 +115,8 @@ struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ) struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ) { struct name_table *nt = tipc_name_table(net); - struct sk_buff *buf; struct distr_item *item; + struct sk_buff *skb; write_lock_bh(&nt->cluster_scope_lock); list_del(&publ->binding_node); @@ -123,15 +124,16 @@ struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ) if (publ->scope == TIPC_NODE_SCOPE) return NULL; - buf = named_prepare_buf(net, WITHDRAWAL, ITEM_SIZE, 0); - if (!buf) { + skb = named_prepare_buf(net, WITHDRAWAL, ITEM_SIZE, 0); + if (!skb) { pr_warn("Withdrawal distribution failure\n"); return NULL; } - - item = (struct distr_item *)msg_data(buf_msg(buf)); + msg_set_named_seqno(buf_msg(skb), nt->snd_nxt++); + msg_set_non_legacy(buf_msg(skb)); + item = (struct distr_item *)msg_data(buf_msg(skb)); publ_to_item(item, publ); - return buf; + return skb; } /** @@ -141,7 +143,7 @@ struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ) * @pls: linked list of publication items to be packed into buffer chain */ static void named_distribute(struct net *net, struct sk_buff_head *list, - u32 dnode, struct list_head *pls) + u32 dnode, struct list_head *pls, u16 seqno) { struct publication *publ; struct sk_buff *skb = NULL; @@ -149,6 +151,7 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0, false) - INT_H_SIZE) / ITEM_SIZE) * ITEM_SIZE; u32 msg_rem = msg_dsz; + struct tipc_msg *hdr; list_for_each_entry(publ, pls, binding_node) { /* Prepare next buffer: */ @@ -159,8 +162,11 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, pr_warn("Bulk publication failure\n"); return; } - msg_set_bc_ack_invalid(buf_msg(skb), true); - item = (struct distr_item *)msg_data(buf_msg(skb)); + hdr = buf_msg(skb); + msg_set_bc_ack_invalid(hdr, true); + msg_set_bulk(hdr); + msg_set_non_legacy(hdr); + item = (struct distr_item *)msg_data(hdr); } /* Pack publication into message: */ @@ -176,24 +182,35 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, } } if (skb) { - msg_set_size(buf_msg(skb), INT_H_SIZE + (msg_dsz - msg_rem)); + hdr = buf_msg(skb); + msg_set_size(hdr, INT_H_SIZE + (msg_dsz - msg_rem)); skb_trim(skb, INT_H_SIZE + (msg_dsz - msg_rem)); __skb_queue_tail(list, skb); } + hdr = buf_msg(skb_peek_tail(list)); + msg_set_last_bulk(hdr); + msg_set_named_seqno(hdr, seqno); } /** * tipc_named_node_up - tell specified node about all publications by this node */ -void tipc_named_node_up(struct net *net, u32 dnode) +void tipc_named_node_up(struct net *net, u32 dnode, u16 capabilities) { struct name_table *nt = tipc_name_table(net); + struct tipc_net *tn = tipc_net(net); struct sk_buff_head head; + u16 seqno; __skb_queue_head_init(&head); + spin_lock_bh(&tn->nametbl_lock); + if (!(capabilities & TIPC_NAMED_BCAST)) + nt->rc_dests++; + seqno = nt->snd_nxt; + spin_unlock_bh(&tn->nametbl_lock); read_lock_bh(&nt->cluster_scope_lock); - named_distribute(net, &head, dnode, &nt->cluster_scope); + named_distribute(net, &head, dnode, &nt->cluster_scope, seqno); tipc_node_xmit(net, &head, dnode, 0); read_unlock_bh(&nt->cluster_scope_lock); } @@ -245,13 +262,21 @@ static void tipc_dist_queue_purge(struct net *net, u32 addr) spin_unlock_bh(&tn->nametbl_lock); } -void tipc_publ_notify(struct net *net, struct list_head *nsub_list, u32 addr) +void tipc_publ_notify(struct net *net, struct list_head *nsub_list, + u32 addr, u16 capabilities) { + struct name_table *nt = tipc_name_table(net); + struct tipc_net *tn = tipc_net(net); + struct publication *publ, *tmp; list_for_each_entry_safe(publ, tmp, nsub_list, binding_node) tipc_publ_purge(net, publ, addr); tipc_dist_queue_purge(net, addr); + spin_lock_bh(&tn->nametbl_lock); + if (!(capabilities & TIPC_NAMED_BCAST)) + nt->rc_dests--; + spin_unlock_bh(&tn->nametbl_lock); } /** @@ -295,29 +320,62 @@ static bool tipc_update_nametbl(struct net *net, struct distr_item *i, return false; } +static struct sk_buff *tipc_named_dequeue(struct sk_buff_head *namedq, + u16 *rcv_nxt, bool *open) +{ + struct sk_buff *skb, *tmp; + struct tipc_msg *hdr; + u16 seqno; + + skb_queue_walk_safe(namedq, skb, tmp) { + skb_linearize(skb); + hdr = buf_msg(skb); + seqno = msg_named_seqno(hdr); + if (msg_is_last_bulk(hdr)) { + *rcv_nxt = seqno; + *open = true; + } + + if (msg_is_bulk(hdr) || msg_is_legacy(hdr)) { + __skb_unlink(skb, namedq); + return skb; + } + + if (*open && (*rcv_nxt == seqno)) { + (*rcv_nxt)++; + __skb_unlink(skb, namedq); + return skb; + } + + if (less(seqno, *rcv_nxt)) { + __skb_unlink(skb, namedq); + kfree_skb(skb); + continue; + } + } + return NULL; +} + /** * tipc_named_rcv - process name table update messages sent by another node */ -void tipc_named_rcv(struct net *net, struct sk_buff_head *inputq) +void tipc_named_rcv(struct net *net, struct sk_buff_head *namedq, + u16 *rcv_nxt, bool *open) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_msg *msg; + struct tipc_net *tn = tipc_net(net); struct distr_item *item; - uint count; - u32 node; + struct tipc_msg *hdr; struct sk_buff *skb; - int mtype; + u32 count, node; spin_lock_bh(&tn->nametbl_lock); - for (skb = skb_dequeue(inputq); skb; skb = skb_dequeue(inputq)) { - skb_linearize(skb); - msg = buf_msg(skb); - mtype = msg_type(msg); - item = (struct distr_item *)msg_data(msg); - count = msg_data_sz(msg) / ITEM_SIZE; - node = msg_orignode(msg); + while ((skb = tipc_named_dequeue(namedq, rcv_nxt, open))) { + hdr = buf_msg(skb); + node = msg_orignode(hdr); + item = (struct distr_item *)msg_data(hdr); + count = msg_data_sz(hdr) / ITEM_SIZE; while (count--) { - tipc_update_nametbl(net, item, node, mtype); + tipc_update_nametbl(net, item, node, msg_type(hdr)); item++; } kfree_skb(skb); @@ -345,6 +403,6 @@ void tipc_named_reinit(struct net *net) publ->node = self; list_for_each_entry_rcu(publ, &nt->cluster_scope, binding_node) publ->node = self; - + nt->rc_dests = 0; spin_unlock_bh(&tn->nametbl_lock); } diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h index 63fc73e0fa6c..092323158f06 100644 --- a/net/tipc/name_distr.h +++ b/net/tipc/name_distr.h @@ -67,11 +67,14 @@ struct distr_item { __be32 key; }; +void tipc_named_bcast(struct net *net, struct sk_buff *skb); struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ); struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ); -void tipc_named_node_up(struct net *net, u32 dnode); -void tipc_named_rcv(struct net *net, struct sk_buff_head *msg_queue); +void tipc_named_node_up(struct net *net, u32 dnode, u16 capabilities); +void tipc_named_rcv(struct net *net, struct sk_buff_head *namedq, + u16 *rcv_nxt, bool *open); void tipc_named_reinit(struct net *net); -void tipc_publ_notify(struct net *net, struct list_head *nsub_list, u32 addr); +void tipc_publ_notify(struct net *net, struct list_head *nsub_list, + u32 addr, u16 capabilities); #endif diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 359b2bc888cf..2ac33d32edc2 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -729,6 +729,7 @@ struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, struct tipc_net *tn = tipc_net(net); struct publication *p = NULL; struct sk_buff *skb = NULL; + u32 rc_dests; spin_lock_bh(&tn->nametbl_lock); @@ -743,12 +744,14 @@ struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, nt->local_publ_count++; skb = tipc_named_publish(net, p); } + rc_dests = nt->rc_dests; exit: spin_unlock_bh(&tn->nametbl_lock); if (skb) - tipc_node_broadcast(net, skb); + tipc_node_broadcast(net, skb, rc_dests); return p; + } /** @@ -762,6 +765,7 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 self = tipc_own_addr(net); struct sk_buff *skb = NULL; struct publication *p; + u32 rc_dests; spin_lock_bh(&tn->nametbl_lock); @@ -775,10 +779,11 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, pr_err("Failed to remove local publication {%u,%u,%u}/%u\n", type, lower, upper, key); } + rc_dests = nt->rc_dests; spin_unlock_bh(&tn->nametbl_lock); if (skb) { - tipc_node_broadcast(net, skb); + tipc_node_broadcast(net, skb, rc_dests); return 1; } return 0; diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index 728bc7016c38..8064e1986e2c 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -106,6 +106,8 @@ struct name_table { struct list_head cluster_scope; rwlock_t cluster_scope_lock; u32 local_publ_count; + u32 rc_dests; + u32 snd_nxt; }; int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb); diff --git a/net/tipc/node.c b/net/tipc/node.c index a4c2816c3746..4edcee3088da 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -75,6 +75,8 @@ struct tipc_bclink_entry { struct sk_buff_head arrvq; struct sk_buff_head inputq2; struct sk_buff_head namedq; + u16 named_rcv_nxt; + bool named_open; }; /** @@ -396,10 +398,10 @@ static void tipc_node_write_unlock(struct tipc_node *n) write_unlock_bh(&n->lock); if (flags & TIPC_NOTIFY_NODE_DOWN) - tipc_publ_notify(net, publ_list, addr); + tipc_publ_notify(net, publ_list, addr, n->capabilities); if (flags & TIPC_NOTIFY_NODE_UP) - tipc_named_node_up(net, addr); + tipc_named_node_up(net, addr, n->capabilities); if (flags & TIPC_NOTIFY_LINK_UP) { tipc_mon_peer_up(net, addr, bearer_id); @@ -1483,6 +1485,7 @@ static void node_lost_contact(struct tipc_node *n, /* Clean up broadcast state */ tipc_bcast_remove_peer(n->net, n->bc_entry.link); + __skb_queue_purge(&n->bc_entry.namedq); /* Abort any ongoing link failover */ for (i = 0; i < MAX_BEARERS; i++) { @@ -1512,7 +1515,7 @@ static void node_lost_contact(struct tipc_node *n, * tipc_node_get_linkname - get the name of a link * * @bearer_id: id of the bearer - * @node: peer node address + * @addr: peer node address * @linkname: link name output buffer * * Returns 0 on success @@ -1729,12 +1732,23 @@ int tipc_node_distr_xmit(struct net *net, struct sk_buff_head *xmitq) return 0; } -void tipc_node_broadcast(struct net *net, struct sk_buff *skb) +void tipc_node_broadcast(struct net *net, struct sk_buff *skb, int rc_dests) { + struct sk_buff_head xmitq; struct sk_buff *txskb; struct tipc_node *n; + u16 dummy; u32 dst; + /* Use broadcast if all nodes support it */ + if (!rc_dests && tipc_bcast_get_mode(net) != BCLINK_MODE_RCAST) { + __skb_queue_head_init(&xmitq); + __skb_queue_tail(&xmitq, skb); + tipc_bcast_xmit(net, &xmitq, &dummy); + return; + } + + /* Otherwise use legacy replicast method */ rcu_read_lock(); list_for_each_entry_rcu(n, tipc_nodes(net), list) { dst = n->addr; @@ -1749,7 +1763,6 @@ void tipc_node_broadcast(struct net *net, struct sk_buff *skb) tipc_node_xmit_skb(net, txskb, dst, 0); } rcu_read_unlock(); - kfree_skb(skb); } @@ -1844,7 +1857,9 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id /* Handle NAME_DISTRIBUTOR messages sent from 1.7 nodes */ if (!skb_queue_empty(&n->bc_entry.namedq)) - tipc_named_rcv(net, &n->bc_entry.namedq); + tipc_named_rcv(net, &n->bc_entry.namedq, + &n->bc_entry.named_rcv_nxt, + &n->bc_entry.named_open); /* If reassembly or retransmission failure => reset all links to peer */ if (rc & TIPC_LINK_DOWN_EVT) @@ -2007,7 +2022,7 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, * tipc_rcv - process TIPC packets/messages arriving from off-node * @net: the applicable net namespace * @skb: TIPC packet - * @bearer: pointer to bearer message arrived on + * @b: pointer to bearer message arrived on * * Invoked with no locks held. Bearer pointer must point to a valid bearer * structure (i.e. cannot be NULL), but bearer can be inactive. @@ -2114,7 +2129,9 @@ rcv: tipc_node_link_down(n, bearer_id, false); if (unlikely(!skb_queue_empty(&n->bc_entry.namedq))) - tipc_named_rcv(net, &n->bc_entry.namedq); + tipc_named_rcv(net, &n->bc_entry.namedq, + &n->bc_entry.named_rcv_nxt, + &n->bc_entry.named_open); if (unlikely(!skb_queue_empty(&n->bc_entry.inputq1))) tipc_node_mcast_rcv(n); diff --git a/net/tipc/node.h b/net/tipc/node.h index a6803b449a2c..9f6f13f1604f 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -55,7 +55,8 @@ enum { TIPC_MCAST_RBCTL = (1 << 7), TIPC_GAP_ACK_BLOCK = (1 << 8), TIPC_TUNNEL_ENHANCED = (1 << 9), - TIPC_NAGLE = (1 << 10) + TIPC_NAGLE = (1 << 10), + TIPC_NAMED_BCAST = (1 << 11) }; #define TIPC_NODE_CAPABILITIES (TIPC_SYN_BIT | \ @@ -68,7 +69,8 @@ enum { TIPC_MCAST_RBCTL | \ TIPC_GAP_ACK_BLOCK | \ TIPC_TUNNEL_ENHANCED | \ - TIPC_NAGLE) + TIPC_NAGLE | \ + TIPC_NAMED_BCAST) #define INVALID_BEARER_ID -1 @@ -101,7 +103,7 @@ int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dest, u32 selector); void tipc_node_subscribe(struct net *net, struct list_head *subscr, u32 addr); void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr); -void tipc_node_broadcast(struct net *net, struct sk_buff *skb); +void tipc_node_broadcast(struct net *net, struct sk_buff *skb, int rc_dests); int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port); void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port); int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel, bool connected); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index a94f38333698..07419f36116a 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -711,7 +711,6 @@ exit: * tipc_getname - get port ID of socket or peer socket * @sock: socket structure * @uaddr: area for returned socket address - * @uaddr_len: area for returned length of socket address * @peer: 0 = own ID, 1 = current peer ID, 2 = current/former peer ID * * Returns 0 on success, errno otherwise @@ -1053,7 +1052,7 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, /** * tipc_send_group_bcast - send message to all members in communication group - * @sk: socket structure + * @sock: socket structure * @m: message to send * @dlen: total length of message data * @timeout: timeout to wait for wakeup @@ -1673,7 +1672,7 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port, /** * tipc_sk_set_orig_addr - capture sender's address for received message * @m: descriptor for message info - * @hdr: received message header + * @skb: received message * * Note: Address is not captured if not requested by receiver. */ @@ -2095,7 +2094,6 @@ static void tipc_write_space(struct sock *sk) /** * tipc_data_ready - wake up threads to indicate messages have been received * @sk: socket - * @len: the length of messages */ static void tipc_data_ready(struct sock *sk) { @@ -2677,7 +2675,7 @@ static int tipc_wait_for_accept(struct socket *sock, long timeo) /** * tipc_accept - wait for connection request * @sock: listening socket - * @newsock: new socket that is to be connected + * @new_sock: new socket that is to be connected * @flags: file-related flags associated with socket * * Returns 0 on success, errno otherwise @@ -3105,7 +3103,7 @@ static int tipc_sk_leave(struct tipc_sock *tsk) * Returns 0 on success, errno otherwise */ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, - char __user *ov, unsigned int ol) + sockptr_t ov, unsigned int ol) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); @@ -3126,17 +3124,17 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, case TIPC_NODELAY: if (ol < sizeof(value)) return -EINVAL; - if (get_user(value, (u32 __user *)ov)) + if (copy_from_sockptr(&value, ov, sizeof(u32))) return -EFAULT; break; case TIPC_GROUP_JOIN: if (ol < sizeof(mreq)) return -EINVAL; - if (copy_from_user(&mreq, ov, sizeof(mreq))) + if (copy_from_sockptr(&mreq, ov, sizeof(mreq))) return -EFAULT; break; default: - if (ov || ol) + if (!sockptr_is_null(ov) || ol) return -EINVAL; } diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 28a283f26a8d..53f0de0676b7 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -565,7 +565,7 @@ msg_full: /** * tipc_parse_udp_addr - build udp media address from netlink data - * @nlattr: netlink attribute containing sockaddr storage aligned address + * @nla: netlink attribute containing sockaddr storage aligned address * @addr: tipc media address to fill with address, port and protocol type * @scope_id: IPv6 scope id pointer, not NULL indicates it's required */ @@ -738,6 +738,13 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, b->mtu = b->media->mtu; #if IS_ENABLED(CONFIG_IPV6) } else if (local.proto == htons(ETH_P_IPV6)) { + struct net_device *dev; + + dev = ipv6_dev_find(net, &local.ipv6); + if (!dev) { + err = -ENODEV; + goto err; + } udp_conf.family = AF_INET6; udp_conf.use_udp6_tx_checksums = true; udp_conf.use_udp6_rx_checksums = true; @@ -745,6 +752,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, udp_conf.local_ip6 = in6addr_any; else udp_conf.local_ip6 = local.ipv6; + ub->ifindex = dev->ifindex; b->mtu = 1280; #endif } else { diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 0e55f8365ce2..18fa6067bb7f 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -690,15 +690,55 @@ static void tls_device_resync_rx(struct tls_context *tls_ctx, TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXDEVICERESYNC); } +static bool +tls_device_rx_resync_async(struct tls_offload_resync_async *resync_async, + s64 resync_req, u32 *seq) +{ + u32 is_async = resync_req & RESYNC_REQ_ASYNC; + u32 req_seq = resync_req >> 32; + u32 req_end = req_seq + ((resync_req >> 16) & 0xffff); + + if (is_async) { + /* asynchronous stage: log all headers seq such that + * req_seq <= seq <= end_seq, and wait for real resync request + */ + if (between(*seq, req_seq, req_end) && + resync_async->loglen < TLS_DEVICE_RESYNC_ASYNC_LOGMAX) + resync_async->log[resync_async->loglen++] = *seq; + + return false; + } + + /* synchronous stage: check against the logged entries and + * proceed to check the next entries if no match was found + */ + while (resync_async->loglen) { + if (req_seq == resync_async->log[resync_async->loglen - 1] && + atomic64_try_cmpxchg(&resync_async->req, + &resync_req, 0)) { + resync_async->loglen = 0; + *seq = req_seq; + return true; + } + resync_async->loglen--; + } + + if (req_seq == *seq && + atomic64_try_cmpxchg(&resync_async->req, + &resync_req, 0)) + return true; + + return false; +} + void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx; - bool is_req_pending, is_force_resync; u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; + u32 sock_data, is_req_pending; struct tls_prot_info *prot; s64 resync_req; - u32 sock_data; u32 req_seq; if (tls_ctx->rx_conf != TLS_HW) @@ -713,11 +753,9 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) resync_req = atomic64_read(&rx_ctx->resync_req); req_seq = resync_req >> 32; seq += TLS_HEADER_SIZE - 1; - is_req_pending = resync_req & RESYNC_REQ; - is_force_resync = resync_req & RESYNC_REQ_FORCE; + is_req_pending = resync_req; - if (likely(!is_req_pending) || - (!is_force_resync && req_seq != seq) || + if (likely(!is_req_pending) || req_seq != seq || !atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) return; break; @@ -739,6 +777,16 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) seq += rcd_len; tls_bigint_increment(rcd_sn, prot->rec_seq_size); break; + case TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ_ASYNC: + resync_req = atomic64_read(&rx_ctx->resync_async->req); + is_req_pending = resync_req; + if (likely(!is_req_pending)) + return; + + if (!tls_device_rx_resync_async(rx_ctx->resync_async, + resync_req, &seq)) + return; + break; } tls_device_resync_rx(tls_ctx, sk, seq, rcd_sn); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index ec10041c6b7d..bbc52b088d29 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -450,7 +450,7 @@ static int tls_getsockopt(struct sock *sk, int level, int optname, return do_tls_getsockopt(sk, optname, optval, optlen); } -static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, +static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, unsigned int optlen, int tx) { struct tls_crypto_info *crypto_info; @@ -460,7 +460,7 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, int rc = 0; int conf; - if (!optval || (optlen < sizeof(*crypto_info))) { + if (sockptr_is_null(optval) || (optlen < sizeof(*crypto_info))) { rc = -EINVAL; goto out; } @@ -479,7 +479,7 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, goto out; } - rc = copy_from_user(crypto_info, optval, sizeof(*crypto_info)); + rc = copy_from_sockptr(crypto_info, optval, sizeof(*crypto_info)); if (rc) { rc = -EFAULT; goto err_crypto_info; @@ -522,8 +522,9 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, goto err_crypto_info; } - rc = copy_from_user(crypto_info + 1, optval + sizeof(*crypto_info), - optlen - sizeof(*crypto_info)); + rc = copy_from_sockptr_offset(crypto_info + 1, optval, + sizeof(*crypto_info), + optlen - sizeof(*crypto_info)); if (rc) { rc = -EFAULT; goto err_crypto_info; @@ -579,8 +580,8 @@ out: return rc; } -static int do_tls_setsockopt(struct sock *sk, int optname, - char __user *optval, unsigned int optlen) +static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, + unsigned int optlen) { int rc = 0; @@ -600,7 +601,7 @@ static int do_tls_setsockopt(struct sock *sk, int optname, } static int tls_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct tls_context *ctx = tls_get_ctx(sk); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3385a7a0b231..181ea6fb56a6 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -714,8 +714,6 @@ static const struct proto_ops unix_stream_ops = { #endif .listen = unix_listen, .shutdown = unix_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .sendmsg = unix_stream_sendmsg, .recvmsg = unix_stream_recvmsg, .mmap = sock_no_mmap, @@ -741,8 +739,6 @@ static const struct proto_ops unix_dgram_ops = { #endif .listen = sock_no_listen, .shutdown = unix_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .sendmsg = unix_dgram_sendmsg, .recvmsg = unix_dgram_recvmsg, .mmap = sock_no_mmap, @@ -767,8 +763,6 @@ static const struct proto_ops unix_seqpacket_ops = { #endif .listen = unix_listen, .shutdown = unix_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .sendmsg = unix_seqpacket_sendmsg, .recvmsg = unix_seqpacket_recvmsg, .mmap = sock_no_mmap, diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 626bf9044418..27bbcfad9c17 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1202,8 +1202,6 @@ static const struct proto_ops vsock_dgram_ops = { .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = vsock_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .sendmsg = vsock_dgram_sendmsg, .recvmsg = vsock_dgram_recvmsg, .mmap = sock_no_mmap, @@ -1519,7 +1517,7 @@ static void vsock_update_buffer_size(struct vsock_sock *vsk, static int vsock_stream_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, + sockptr_t optval, unsigned int optlen) { int err; @@ -1537,7 +1535,7 @@ static int vsock_stream_setsockopt(struct socket *sock, err = -EINVAL; \ goto exit; \ } \ - if (copy_from_user(&_v, optval, sizeof(_v)) != 0) { \ + if (copy_from_sockptr(&_v, optval, sizeof(_v)) != 0) { \ err = -EFAULT; \ goto exit; \ } \ diff --git a/net/wireless/chan.c b/net/wireless/chan.c index cddf92c5d09e..90f0f82cd9ca 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -153,6 +153,11 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) control_freq = chandef->chan->center_freq; switch (chandef->width) { + case NL80211_CHAN_WIDTH_1: + case NL80211_CHAN_WIDTH_2: + case NL80211_CHAN_WIDTH_4: + case NL80211_CHAN_WIDTH_8: + case NL80211_CHAN_WIDTH_16: case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_20: @@ -263,6 +268,21 @@ static int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c) int width; switch (c->width) { + case NL80211_CHAN_WIDTH_1: + width = 1; + break; + case NL80211_CHAN_WIDTH_2: + width = 2; + break; + case NL80211_CHAN_WIDTH_4: + width = 4; + break; + case NL80211_CHAN_WIDTH_8: + width = 8; + break; + case NL80211_CHAN_WIDTH_16: + width = 16; + break; case NL80211_CHAN_WIDTH_5: width = 5; break; @@ -911,6 +931,21 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, control_freq = chandef->chan->center_freq; switch (chandef->width) { + case NL80211_CHAN_WIDTH_1: + width = 1; + break; + case NL80211_CHAN_WIDTH_2: + width = 2; + break; + case NL80211_CHAN_WIDTH_4: + width = 4; + break; + case NL80211_CHAN_WIDTH_8: + width = 8; + break; + case NL80211_CHAN_WIDTH_16: + width = 16; + break; case NL80211_CHAN_WIDTH_5: width = 5; break; diff --git a/net/wireless/core.c b/net/wireless/core.c index c623d9bf5096..1971d7e6eb55 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -803,10 +803,11 @@ int wiphy_register(struct wiphy *wiphy) if (WARN_ON(!sband->n_channels)) return -EINVAL; /* - * on 60GHz band, there are no legacy rates, so + * on 60GHz or sub-1Ghz band, there are no legacy rates, so * n_bitrates is 0 */ - if (WARN_ON(band != NL80211_BAND_60GHZ && + if (WARN_ON((band != NL80211_BAND_60GHZ && + band != NL80211_BAND_S1GHZ) && !sband->n_bitrates)) return -EINVAL; diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c index eac5aa1419fc..e4e363138279 100644 --- a/net/wireless/mesh.c +++ b/net/wireless/mesh.c @@ -78,6 +78,7 @@ const struct mesh_config default_mesh_config = { .power_mode = NL80211_MESH_POWER_ACTIVE, .dot11MeshAwakeWindowDuration = MESH_DEFAULT_AWAKE_WINDOW, .plink_timeout = MESH_DEFAULT_PLINK_TIMEOUT, + .dot11MeshNolearn = false, }; const struct mesh_setup default_mesh_setup = { diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7fbca0854265..814e23d3ce7c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4713,8 +4713,8 @@ static int nl80211_parse_he_bss_color(struct nlattr *attrs, he_bss_color->color = nla_get_u8(tb[NL80211_HE_BSS_COLOR_ATTR_COLOR]); - he_bss_color->disabled = - nla_get_flag(tb[NL80211_HE_BSS_COLOR_ATTR_DISABLED]); + he_bss_color->enabled = + !nla_get_flag(tb[NL80211_HE_BSS_COLOR_ATTR_DISABLED]); he_bss_color->partial = nla_get_flag(tb[NL80211_HE_BSS_COLOR_ATTR_PARTIAL]); @@ -5395,6 +5395,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO(PEER_PM, peer_pm, u32); PUT_SINFO(NONPEER_PM, nonpeer_pm, u32); PUT_SINFO(CONNECTED_TO_GATE, connected_to_gate, u8); + PUT_SINFO(CONNECTED_TO_AS, connected_to_as, u8); if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_BSS_PARAM)) { bss_param = nla_nest_start_noflag(msg, @@ -6885,7 +6886,11 @@ static int nl80211_get_mesh_config(struct sk_buff *skb, nla_put_u32(msg, NL80211_MESHCONF_PLINK_TIMEOUT, cur_params.plink_timeout) || nla_put_u8(msg, NL80211_MESHCONF_CONNECTED_TO_GATE, - cur_params.dot11MeshConnectedToMeshGate)) + cur_params.dot11MeshConnectedToMeshGate) || + nla_put_u8(msg, NL80211_MESHCONF_NOLEARN, + cur_params.dot11MeshNolearn) || + nla_put_u8(msg, NL80211_MESHCONF_CONNECTED_TO_AS, + cur_params.dot11MeshConnectedToAuthServer)) goto nla_put_failure; nla_nest_end(msg, pinfoattr); genlmsg_end(msg, hdr); @@ -6943,6 +6948,8 @@ nl80211_meshconf_params_policy[NL80211_MESHCONF_ATTR_MAX+1] = { [NL80211_MESHCONF_AWAKE_WINDOW] = { .type = NLA_U16 }, [NL80211_MESHCONF_PLINK_TIMEOUT] = { .type = NLA_U32 }, [NL80211_MESHCONF_CONNECTED_TO_GATE] = NLA_POLICY_RANGE(NLA_U8, 0, 1), + [NL80211_MESHCONF_NOLEARN] = NLA_POLICY_RANGE(NLA_U8, 0, 1), + [NL80211_MESHCONF_CONNECTED_TO_AS] = NLA_POLICY_RANGE(NLA_U8, 0, 1), }; static const struct nla_policy @@ -7055,6 +7062,9 @@ do { \ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConnectedToMeshGate, mask, NL80211_MESHCONF_CONNECTED_TO_GATE, nla_get_u8); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConnectedToAuthServer, mask, + NL80211_MESHCONF_CONNECTED_TO_AS, + nla_get_u8); /* * Check HT operation mode based on * IEEE 802.11-2016 9.4.2.57 HT Operation element. @@ -7094,6 +7104,8 @@ do { \ NL80211_MESHCONF_AWAKE_WINDOW, nla_get_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, plink_timeout, mask, NL80211_MESHCONF_PLINK_TIMEOUT, nla_get_u32); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshNolearn, mask, + NL80211_MESHCONF_NOLEARN, nla_get_u8); if (mask_out) *mask_out = mask; @@ -7774,10 +7786,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->scan) return -EOPNOTSUPP; - if (rdev->scan_req || rdev->scan_msg) { - err = -EBUSY; - goto unlock; - } + if (rdev->scan_req || rdev->scan_msg) + return -EBUSY; if (info->attrs[NL80211_ATTR_SCAN_FREQ_KHZ]) { if (!wiphy_ext_feature_isset(wiphy, @@ -7790,10 +7800,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) if (scan_freqs) { n_channels = validate_scan_freqs(scan_freqs); - if (!n_channels) { - err = -EINVAL; - goto unlock; - } + if (!n_channels) + return -EINVAL; } else { n_channels = ieee80211_get_num_supported_channels(wiphy); } @@ -7802,29 +7810,23 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], tmp) n_ssids++; - if (n_ssids > wiphy->max_scan_ssids) { - err = -EINVAL; - goto unlock; - } + if (n_ssids > wiphy->max_scan_ssids) + return -EINVAL; if (info->attrs[NL80211_ATTR_IE]) ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); else ie_len = 0; - if (ie_len > wiphy->max_scan_ie_len) { - err = -EINVAL; - goto unlock; - } + if (ie_len > wiphy->max_scan_ie_len) + return -EINVAL; request = kzalloc(sizeof(*request) + sizeof(*request->ssids) * n_ssids + sizeof(*request->channels) * n_channels + ie_len, GFP_KERNEL); - if (!request) { - err = -ENOMEM; - goto unlock; - } + if (!request) + return -ENOMEM; if (n_ssids) request->ssids = (void *)&request->channels[n_channels]; @@ -8003,17 +8005,19 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) rdev->scan_req = request; err = rdev_scan(rdev, request); - if (!err) { - nl80211_send_scan_start(rdev, wdev); - if (wdev->netdev) - dev_hold(wdev->netdev); - } else { + if (err) + goto out_free; + + nl80211_send_scan_start(rdev, wdev); + if (wdev->netdev) + dev_hold(wdev->netdev); + + return 0; + out_free: - rdev->scan_req = NULL; - kfree(request); - } + rdev->scan_req = NULL; + kfree(request); - unlock: return err; } @@ -9438,7 +9442,9 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, if (nla_len(info->attrs[NL80211_ATTR_PMK]) != WLAN_PMK_LEN) return -EINVAL; if (!wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK)) + NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK) && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK)) return -EINVAL; settings->psk = nla_data(info->attrs[NL80211_ATTR_PMK]); } @@ -10394,8 +10400,7 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) memcpy(dev->ieee80211_ptr->disconnect_bssid, connect.bssid, ETH_ALEN); else - memset(dev->ieee80211_ptr->disconnect_bssid, - 0, ETH_ALEN); + eth_zero_addr(dev->ieee80211_ptr->disconnect_bssid); } wdev_unlock(dev->ieee80211_ptr); @@ -13479,7 +13484,7 @@ static int nl80211_vendor_cmd_dump(struct sk_buff *skb, if (err == -ENOBUFS || err == -ENOENT) { genlmsg_cancel(skb, hdr); break; - } else if (err) { + } else if (err <= 0) { genlmsg_cancel(skb, hdr); goto out; } diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 0d74a31ef0ab..35b8847a2f6d 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2384,7 +2384,7 @@ static void reg_set_request_processed(void) /** * reg_process_hint_core - process core regulatory requests - * @pending_request: a pending core regulatory request + * @core_request: a pending core regulatory request * * The wireless subsystem can use this function to process * a regulatory request issued by the regulatory core. @@ -2493,6 +2493,7 @@ __reg_process_hint_driver(struct regulatory_request *driver_request) /** * reg_process_hint_driver - process driver regulatory requests + * @wiphy: the wireless device for the regulatory request * @driver_request: a pending driver regulatory request * * The wireless subsystem can use this function to process @@ -2593,6 +2594,7 @@ __reg_process_hint_country_ie(struct wiphy *wiphy, /** * reg_process_hint_country_ie - process regulatory requests from country IEs + * @wiphy: the wireless device for the regulatory request * @country_ie_request: a regulatory request from a country IE * * The wireless subsystem can use this function to process diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 74ea4cfb39fb..e67a74488bbe 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -712,6 +712,16 @@ void cfg80211_bss_expire(struct cfg80211_registered_device *rdev) __cfg80211_bss_expire(rdev, jiffies - IEEE80211_SCAN_RESULT_EXPIRE); } +void cfg80211_bss_flush(struct wiphy *wiphy) +{ + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + + spin_lock_bh(&rdev->bss_lock); + __cfg80211_bss_expire(rdev, jiffies); + spin_unlock_bh(&rdev->bss_lock); +} +EXPORT_SYMBOL(cfg80211_bss_flush); + const struct element * cfg80211_find_elem_match(u8 eid, const u8 *ies, unsigned int len, const u8 *match, unsigned int match_len, diff --git a/net/wireless/trace.h b/net/wireless/trace.h index b23cab016521..6e218a0acd4e 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -68,7 +68,8 @@ __field(u16, ht_opmode) \ __field(u32, dot11MeshHWMPactivePathToRootTimeout) \ __field(u16, dot11MeshHWMProotInterval) \ - __field(u16, dot11MeshHWMPconfirmationInterval) + __field(u16, dot11MeshHWMPconfirmationInterval) \ + __field(bool, dot11MeshNolearn) #define MESH_CFG_ASSIGN \ do { \ __entry->dot11MeshRetryTimeout = conf->dot11MeshRetryTimeout; \ @@ -109,6 +110,7 @@ conf->dot11MeshHWMProotInterval; \ __entry->dot11MeshHWMPconfirmationInterval = \ conf->dot11MeshHWMPconfirmationInterval; \ + __entry->dot11MeshNolearn = conf->dot11MeshNolearn; \ } while (0) #define CHAN_ENTRY __field(enum nl80211_band, band) \ diff --git a/net/wireless/util.c b/net/wireless/util.c index 4d3b76f94f55..26a977343c3b 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -102,6 +102,8 @@ u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) if (chan < 7) return MHZ_TO_KHZ(56160 + chan * 2160); break; + case NL80211_BAND_S1GHZ: + return 902000 + chan * 500; default: ; } @@ -210,6 +212,12 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband) WARN_ON(!sband->ht_cap.ht_supported); WARN_ON((sband->ht_cap.mcs.rx_mask[0] & 0x1e) != 0x1e); break; + case NL80211_BAND_S1GHZ: + /* Figure 9-589bd: 3 means unsupported, so != 3 means at least + * mandatory is ok. + */ + WARN_ON((sband->s1g_cap.nss_mcs[0] & 0x3) == 0x3); + break; case NUM_NL80211_BANDS: default: WARN_ON(1); diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index cac9e28d852b..aa918d7ff6bd 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -220,7 +220,6 @@ EXPORT_WEXT_HANDLER(cfg80211_wext_giwrange); /** * cfg80211_wext_freq - get wext frequency for non-"auto" - * @dev: the net device * @freq: the wext freq encoding * * Returns a frequency, or a negative error code, or 0 for auto. diff --git a/net/x25/Kconfig b/net/x25/Kconfig index e3ed23245a82..68729aa3a5d5 100644 --- a/net/x25/Kconfig +++ b/net/x25/Kconfig @@ -17,7 +17,7 @@ config X25 if you want that) and the lower level data link layer protocol LAPB (say Y to "LAPB Data Link Driver" below if you want that). - You can read more about X.25 at <http://www.sangoma.com/tutorials/x25/> and + You can read more about X.25 at <https://www.sangoma.com/tutorials/x25/> and <http://docwiki.cisco.com/wiki/X.25>. Information about X.25 for Linux is contained in the files <file:Documentation/networking/x25.rst> and diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index d5b09bbff375..0bbb283f23c9 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -431,7 +431,7 @@ void x25_destroy_socket_from_timer(struct sock *sk) */ static int x25_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { int opt; struct sock *sk = sock->sk; @@ -445,7 +445,7 @@ static int x25_setsockopt(struct socket *sock, int level, int optname, goto out; rc = -EFAULT; - if (get_user(opt, (int __user *)optval)) + if (copy_from_sockptr(&opt, optval, sizeof(int))) goto out; if (opt) diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c index 7d02532aad0d..fdae054b7dc1 100644 --- a/net/x25/x25_link.c +++ b/net/x25/x25_link.c @@ -270,7 +270,7 @@ void x25_link_device_up(struct net_device *dev) /** * __x25_remove_neigh - remove neighbour from x25_neigh_list - * @nb - neigh to remove + * @nb: - neigh to remove * * Remove neighbour from x25_neigh_list. If it was there. * Caller must hold x25_neigh_list_lock. diff --git a/net/x25/x25_route.c b/net/x25/x25_route.c index b8e94d58d0f1..00e46c9a5280 100644 --- a/net/x25/x25_route.c +++ b/net/x25/x25_route.c @@ -142,7 +142,7 @@ struct net_device *x25_dev_get(char *devname) /** * x25_get_route - Find a route given an X.25 address. - * @addr - address to find a route for + * @addr: - address to find a route for * * Find a route given an X.25 address. */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 3700266229f6..c3231620d210 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -123,7 +123,7 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) addr = xp_get_handle(xskb); err = xskq_prod_reserve_desc(xs->rx, addr, len); if (err) { - xs->rx_dropped++; + xs->rx_queue_full++; return err; } @@ -274,8 +274,10 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) rcu_read_lock(); list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { - if (!xskq_cons_peek_desc(xs->tx, desc, umem)) + if (!xskq_cons_peek_desc(xs->tx, desc, umem)) { + xs->tx->queue_empty_descs++; continue; + } /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed @@ -387,6 +389,8 @@ static int xsk_generic_xmit(struct sock *sk) sent_frame = true; } + xs->tx->queue_empty_descs++; + out: if (sent_frame) sk->sk_write_space(sk); @@ -698,7 +702,7 @@ struct xdp_umem_reg_v1 { }; static int xsk_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); @@ -716,7 +720,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(entries)) return -EINVAL; - if (copy_from_user(&entries, optval, sizeof(entries))) + if (copy_from_sockptr(&entries, optval, sizeof(entries))) return -EFAULT; mutex_lock(&xs->mutex); @@ -743,7 +747,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, else if (optlen < sizeof(mr)) mr_size = sizeof(struct xdp_umem_reg_v1); - if (copy_from_user(&mr, optval, mr_size)) + if (copy_from_sockptr(&mr, optval, mr_size)) return -EFAULT; mutex_lock(&xs->mutex); @@ -770,7 +774,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, struct xsk_queue **q; int entries; - if (copy_from_user(&entries, optval, sizeof(entries))) + if (copy_from_sockptr(&entries, optval, sizeof(entries))) return -EFAULT; mutex_lock(&xs->mutex); @@ -812,6 +816,12 @@ static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring) ring->desc = offsetof(struct xdp_umem_ring, desc); } +struct xdp_statistics_v1 { + __u64 rx_dropped; + __u64 rx_invalid_descs; + __u64 tx_invalid_descs; +}; + static int xsk_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { @@ -830,20 +840,36 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname, switch (optname) { case XDP_STATISTICS: { - struct xdp_statistics stats; + struct xdp_statistics stats = {}; + bool extra_stats = true; + size_t stats_size; - if (len < sizeof(stats)) + if (len < sizeof(struct xdp_statistics_v1)) { return -EINVAL; + } else if (len < sizeof(stats)) { + extra_stats = false; + stats_size = sizeof(struct xdp_statistics_v1); + } else { + stats_size = sizeof(stats); + } mutex_lock(&xs->mutex); stats.rx_dropped = xs->rx_dropped; + if (extra_stats) { + stats.rx_ring_full = xs->rx_queue_full; + stats.rx_fill_ring_empty_descs = + xs->umem ? xskq_nb_queue_empty_descs(xs->umem->fq) : 0; + stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx); + } else { + stats.rx_dropped += xs->rx_queue_full; + } stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx); stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx); mutex_unlock(&xs->mutex); - if (copy_to_user(optval, &stats, sizeof(stats))) + if (copy_to_user(optval, &stats, stats_size)) return -EFAULT; - if (put_user(sizeof(stats), optlen)) + if (put_user(stats_size, optlen)) return -EFAULT; return 0; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 08b80669f649..a2044c245215 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -189,6 +189,7 @@ static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) for (;;) { if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) { + pool->fq->queue_empty_descs++; xp_release(xskb); return NULL; } diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index 0163b26aaf63..21e9c2d123ee 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -76,6 +76,19 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) return err; } +static int xsk_diag_put_stats(const struct xdp_sock *xs, struct sk_buff *nlskb) +{ + struct xdp_diag_stats du = {}; + + du.n_rx_dropped = xs->rx_dropped; + du.n_rx_invalid = xskq_nb_invalid_descs(xs->rx); + du.n_rx_full = xs->rx_queue_full; + du.n_fill_ring_empty = xs->umem ? xskq_nb_queue_empty_descs(xs->umem->fq) : 0; + du.n_tx_invalid = xskq_nb_invalid_descs(xs->tx); + du.n_tx_ring_empty = xskq_nb_queue_empty_descs(xs->tx); + return nla_put(nlskb, XDP_DIAG_STATS, sizeof(du), &du); +} + static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, struct xdp_diag_req *req, struct user_namespace *user_ns, @@ -118,6 +131,10 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, sock_diag_put_meminfo(sk, nlskb, XDP_DIAG_MEMINFO)) goto out_nlmsg_trim; + if ((req->xdiag_show & XDP_SHOW_STATS) && + xsk_diag_put_stats(xs, nlskb)) + goto out_nlmsg_trim; + mutex_unlock(&xs->mutex); nlmsg_end(nlskb, nlh); return 0; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 5b5d24d2dd37..bf42cfd74b89 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -38,6 +38,7 @@ struct xsk_queue { u32 cached_cons; struct xdp_ring *ring; u64 invalid_descs; + u64 queue_empty_descs; }; /* The structure of the shared state of the rings are the same as the @@ -354,6 +355,11 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) return q ? q->invalid_descs : 0; } +static inline u64 xskq_nb_queue_empty_descs(struct xsk_queue *q) +{ + return q ? q->queue_empty_descs : 0; +} + struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q_ops); diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 1dc7208c71ba..8367adbbe9df 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -254,6 +254,7 @@ void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, spin_unlock_bh(&map->lock); } +static int xsk_map_btf_id; const struct bpf_map_ops xsk_map_ops = { .map_alloc = xsk_map_alloc, .map_free = xsk_map_free, @@ -264,4 +265,6 @@ const struct bpf_map_ops xsk_map_ops = { .map_update_elem = xsk_map_update_elem, .map_delete_elem = xsk_map_delete_elem, .map_check_btf = map_check_no_btf, + .map_btf_name = "xsk_map", + .map_btf_id = &xsk_map_btf_id, }; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 626096bd0d29..edf11893dbe8 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -106,6 +106,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur struct sk_buff *skb2, *nskb, *pskb = NULL; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); + struct net_device *dev = skb->dev; struct sec_path *sp; if (!xo || (xo->flags & XFRM_XMIT)) @@ -119,6 +120,10 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur if (xo->flags & XFRM_GRO || x->xso.flags & XFRM_OFFLOAD_INBOUND) return skb; + /* This skb was already validated on the upper/virtual dev */ + if ((x->xso.dev != dev) && (x->xso.real_dev == dev)) + return skb; + local_irq_save(flags); sd = this_cpu_ptr(&softnet_data); err = !skb_queue_empty(&sd->xfrm_backlog); @@ -131,25 +136,20 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur xo->flags |= XFRM_XMIT; - if (skb_is_gso(skb)) { - struct net_device *dev = skb->dev; - - if (unlikely(x->xso.dev != dev)) { - struct sk_buff *segs; + if (skb_is_gso(skb) && unlikely(x->xso.dev != dev)) { + struct sk_buff *segs; - /* Packet got rerouted, fixup features and segment it. */ - esp_features = esp_features & ~(NETIF_F_HW_ESP - | NETIF_F_GSO_ESP); + /* Packet got rerouted, fixup features and segment it. */ + esp_features = esp_features & ~(NETIF_F_HW_ESP | NETIF_F_GSO_ESP); - segs = skb_gso_segment(skb, esp_features); - if (IS_ERR(segs)) { - kfree_skb(skb); - atomic_long_inc(&dev->tx_dropped); - return NULL; - } else { - consume_skb(skb); - skb = segs; - } + segs = skb_gso_segment(skb, esp_features); + if (IS_ERR(segs)) { + kfree_skb(skb); + atomic_long_inc(&dev->tx_dropped); + return NULL; + } else { + consume_skb(skb); + skb = segs; } } @@ -261,6 +261,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, } xso->dev = dev; + xso->real_dev = dev; xso->num_exthdrs = 1; xso->flags = xuo->flags; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index bd984ff17c2d..37456d022cfa 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -42,7 +42,7 @@ struct xfrm_trans_cb { #define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0])) static DEFINE_SPINLOCK(xfrm_input_afinfo_lock); -static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[AF_INET6 + 1]; +static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[2][AF_INET6 + 1]; static struct gro_cells gro_cells; static struct net_device xfrm_napi_dev; @@ -53,14 +53,14 @@ int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo) { int err = 0; - if (WARN_ON(afinfo->family >= ARRAY_SIZE(xfrm_input_afinfo))) + if (WARN_ON(afinfo->family > AF_INET6)) return -EAFNOSUPPORT; spin_lock_bh(&xfrm_input_afinfo_lock); - if (unlikely(xfrm_input_afinfo[afinfo->family] != NULL)) + if (unlikely(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family])) err = -EEXIST; else - rcu_assign_pointer(xfrm_input_afinfo[afinfo->family], afinfo); + rcu_assign_pointer(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family], afinfo); spin_unlock_bh(&xfrm_input_afinfo_lock); return err; } @@ -71,11 +71,11 @@ int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo) int err = 0; spin_lock_bh(&xfrm_input_afinfo_lock); - if (likely(xfrm_input_afinfo[afinfo->family] != NULL)) { - if (unlikely(xfrm_input_afinfo[afinfo->family] != afinfo)) + if (likely(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family])) { + if (unlikely(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family] != afinfo)) err = -EINVAL; else - RCU_INIT_POINTER(xfrm_input_afinfo[afinfo->family], NULL); + RCU_INIT_POINTER(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family], NULL); } spin_unlock_bh(&xfrm_input_afinfo_lock); synchronize_rcu(); @@ -83,15 +83,15 @@ int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo) } EXPORT_SYMBOL(xfrm_input_unregister_afinfo); -static const struct xfrm_input_afinfo *xfrm_input_get_afinfo(unsigned int family) +static const struct xfrm_input_afinfo *xfrm_input_get_afinfo(u8 family, bool is_ipip) { const struct xfrm_input_afinfo *afinfo; - if (WARN_ON_ONCE(family >= ARRAY_SIZE(xfrm_input_afinfo))) + if (WARN_ON_ONCE(family > AF_INET6)) return NULL; rcu_read_lock(); - afinfo = rcu_dereference(xfrm_input_afinfo[family]); + afinfo = rcu_dereference(xfrm_input_afinfo[is_ipip][family]); if (unlikely(!afinfo)) rcu_read_unlock(); return afinfo; @@ -100,9 +100,11 @@ static const struct xfrm_input_afinfo *xfrm_input_get_afinfo(unsigned int family static int xfrm_rcv_cb(struct sk_buff *skb, unsigned int family, u8 protocol, int err) { + bool is_ipip = (protocol == IPPROTO_IPIP || protocol == IPPROTO_IPV6); + const struct xfrm_input_afinfo *afinfo; int ret; - const struct xfrm_input_afinfo *afinfo = xfrm_input_get_afinfo(family); + afinfo = xfrm_input_get_afinfo(family, is_ipip); if (!afinfo) return -EAFNOSUPPORT; diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index b615729812e5..eb8181987620 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -48,21 +48,30 @@ static int xfrmi_dev_init(struct net_device *dev); static void xfrmi_dev_setup(struct net_device *dev); static struct rtnl_link_ops xfrmi_link_ops __read_mostly; static unsigned int xfrmi_net_id __read_mostly; +static const struct net_device_ops xfrmi_netdev_ops; + +#define XFRMI_HASH_BITS 8 +#define XFRMI_HASH_SIZE BIT(XFRMI_HASH_BITS) struct xfrmi_net { /* lists for storing interfaces in use */ - struct xfrm_if __rcu *xfrmi[1]; + struct xfrm_if __rcu *xfrmi[XFRMI_HASH_SIZE]; }; #define for_each_xfrmi_rcu(start, xi) \ for (xi = rcu_dereference(start); xi; xi = rcu_dereference(xi->next)) +static u32 xfrmi_hash(u32 if_id) +{ + return hash_32(if_id, XFRMI_HASH_BITS); +} + static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); struct xfrm_if *xi; - for_each_xfrmi_rcu(xfrmn->xfrmi[0], xi) { + for_each_xfrmi_rcu(xfrmn->xfrmi[xfrmi_hash(x->if_id)], xi) { if (x->if_id == xi->p.if_id && (xi->dev->flags & IFF_UP)) return xi; @@ -74,8 +83,7 @@ static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x) static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb, unsigned short family) { - struct xfrmi_net *xfrmn; - struct xfrm_if *xi; + struct net_device *dev; int ifindex = 0; if (!secpath_exists(skb) || !skb->dev) @@ -89,23 +97,26 @@ static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb, ifindex = inet_sdif(skb); break; } - if (!ifindex) - ifindex = skb->dev->ifindex; - xfrmn = net_generic(xs_net(xfrm_input_state(skb)), xfrmi_net_id); + if (ifindex) { + struct net *net = xs_net(xfrm_input_state(skb)); - for_each_xfrmi_rcu(xfrmn->xfrmi[0], xi) { - if (ifindex == xi->dev->ifindex && - (xi->dev->flags & IFF_UP)) - return xi; + dev = dev_get_by_index_rcu(net, ifindex); + } else { + dev = skb->dev; } - return NULL; + if (!dev || !(dev->flags & IFF_UP)) + return NULL; + if (dev->netdev_ops != &xfrmi_netdev_ops) + return NULL; + + return netdev_priv(dev); } static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi) { - struct xfrm_if __rcu **xip = &xfrmn->xfrmi[0]; + struct xfrm_if __rcu **xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; rcu_assign_pointer(xi->next , rtnl_dereference(*xip)); rcu_assign_pointer(*xip, xi); @@ -116,7 +127,7 @@ static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi) struct xfrm_if __rcu **xip; struct xfrm_if *iter; - for (xip = &xfrmn->xfrmi[0]; + for (xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; (iter = rtnl_dereference(*xip)) != NULL; xip = &iter->next) { if (xi == iter) { @@ -160,7 +171,7 @@ static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p) struct xfrm_if *xi; struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); - for (xip = &xfrmn->xfrmi[0]; + for (xip = &xfrmn->xfrmi[xfrmi_hash(p->if_id)]; (xi = rtnl_dereference(*xip)) != NULL; xip = &xi->next) if (xi->p.if_id == p->if_id) @@ -760,11 +771,14 @@ static void __net_exit xfrmi_exit_batch_net(struct list_head *net_exit_list) struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); struct xfrm_if __rcu **xip; struct xfrm_if *xi; + int i; - for (xip = &xfrmn->xfrmi[0]; - (xi = rtnl_dereference(*xip)) != NULL; - xip = &xi->next) - unregister_netdevice_queue(xi->dev, &list); + for (i = 0; i < XFRMI_HASH_SIZE; i++) { + for (xip = &xfrmn->xfrmi[i]; + (xi = rtnl_dereference(*xip)) != NULL; + xip = &xi->next) + unregister_netdevice_queue(xi->dev, &list); + } } unregister_netdevice_many(&list); rtnl_unlock(); @@ -800,6 +814,33 @@ static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = { .priority = 10, }; +#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) +static int xfrmi6_rcv_tunnel(struct sk_buff *skb) +{ + const xfrm_address_t *saddr; + __be32 spi; + + saddr = (const xfrm_address_t *)&ipv6_hdr(skb)->saddr; + spi = xfrm6_tunnel_spi_lookup(dev_net(skb->dev), saddr); + + return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); +} + +static struct xfrm6_tunnel xfrmi_ipv6_handler __read_mostly = { + .handler = xfrmi6_rcv_tunnel, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi6_err, + .priority = -1, +}; + +static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = { + .handler = xfrmi6_rcv_tunnel, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi6_err, + .priority = -1, +}; +#endif + static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = { .handler = xfrm4_rcv, .input_handler = xfrm_input, @@ -824,6 +865,27 @@ static struct xfrm4_protocol xfrmi_ipcomp4_protocol __read_mostly = { .priority = 10, }; +#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) +static int xfrmi4_rcv_tunnel(struct sk_buff *skb) +{ + return xfrm4_rcv_spi(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr); +} + +static struct xfrm_tunnel xfrmi_ipip_handler __read_mostly = { + .handler = xfrmi4_rcv_tunnel, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi4_err, + .priority = -1, +}; + +static struct xfrm_tunnel xfrmi_ipip6_handler __read_mostly = { + .handler = xfrmi4_rcv_tunnel, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi4_err, + .priority = -1, +}; +#endif + static int __init xfrmi4_init(void) { int err; @@ -837,9 +899,23 @@ static int __init xfrmi4_init(void) err = xfrm4_protocol_register(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; +#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) + err = xfrm4_tunnel_register(&xfrmi_ipip_handler, AF_INET); + if (err < 0) + goto xfrm_tunnel_ipip_failed; + err = xfrm4_tunnel_register(&xfrmi_ipip6_handler, AF_INET6); + if (err < 0) + goto xfrm_tunnel_ipip6_failed; +#endif return 0; +#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) +xfrm_tunnel_ipip6_failed: + xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); +xfrm_tunnel_ipip_failed: + xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); +#endif xfrm_proto_comp_failed: xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); xfrm_proto_ah_failed: @@ -850,6 +926,10 @@ xfrm_proto_esp_failed: static void xfrmi4_fini(void) { +#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) + xfrm4_tunnel_deregister(&xfrmi_ipip6_handler, AF_INET6); + xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); +#endif xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); @@ -868,9 +948,23 @@ static int __init xfrmi6_init(void) err = xfrm6_protocol_register(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; +#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) + err = xfrm6_tunnel_register(&xfrmi_ipv6_handler, AF_INET6); + if (err < 0) + goto xfrm_tunnel_ipv6_failed; + err = xfrm6_tunnel_register(&xfrmi_ip6ip_handler, AF_INET); + if (err < 0) + goto xfrm_tunnel_ip6ip_failed; +#endif return 0; +#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) +xfrm_tunnel_ip6ip_failed: + xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); +xfrm_tunnel_ipv6_failed: + xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); +#endif xfrm_proto_comp_failed: xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); xfrm_proto_ah_failed: @@ -881,6 +975,10 @@ xfrm_proto_esp_failed: static void xfrmi6_fini(void) { +#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) + xfrm6_tunnel_deregister(&xfrmi_ip6ip_handler, AF_INET); + xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); +#endif xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 19c5e0fa3f44..042ea9b40c7b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2751,6 +2751,7 @@ static void xfrm_policy_queue_process(struct timer_list *t) struct xfrm_policy_queue *pq = &pol->polq; struct flowi fl; struct sk_buff_head list; + __u32 skb_mark; spin_lock(&pq->hold_queue.lock); skb = skb_peek(&pq->hold_queue); @@ -2760,7 +2761,12 @@ static void xfrm_policy_queue_process(struct timer_list *t) } dst = skb_dst(skb); sk = skb->sk; + + /* Fixup the mark to support VTI. */ + skb_mark = skb->mark; + skb->mark = pol->mark.v; xfrm_decode_session(skb, &fl, dst->ops->family); + skb->mark = skb_mark; spin_unlock(&pq->hold_queue.lock); dst_hold(xfrm_dst_path(dst)); @@ -2792,7 +2798,12 @@ static void xfrm_policy_queue_process(struct timer_list *t) while (!skb_queue_empty(&list)) { skb = __skb_dequeue(&list); + /* Fixup the mark to support VTI. */ + skb_mark = skb->mark; + skb->mark = pol->mark.v; xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family); + skb->mark = skb_mark; + dst_hold(xfrm_dst_path(skb_dst(skb))); dst = xfrm_lookup(net, xfrm_dst_path(skb_dst(skb)), &fl, skb->sk, 0); if (IS_ERR(dst)) { diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 98943f8d01aa..c6a4338a0d08 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -89,7 +89,8 @@ static int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) if (x->type->flags & XFRM_TYPE_REPLAY_PROT) { XFRM_SKB_CB(skb)->seq.output.low = ++x->replay.oseq; XFRM_SKB_CB(skb)->seq.output.hi = 0; - if (unlikely(x->replay.oseq == 0)) { + if (unlikely(x->replay.oseq == 0) && + !(x->props.extra_flags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP)) { x->replay.oseq--; xfrm_audit_state_replay_overflow(x, skb); err = -EOVERFLOW; @@ -168,7 +169,8 @@ static int xfrm_replay_overflow_bmp(struct xfrm_state *x, struct sk_buff *skb) if (x->type->flags & XFRM_TYPE_REPLAY_PROT) { XFRM_SKB_CB(skb)->seq.output.low = ++replay_esn->oseq; XFRM_SKB_CB(skb)->seq.output.hi = 0; - if (unlikely(replay_esn->oseq == 0)) { + if (unlikely(replay_esn->oseq == 0) && + !(x->props.extra_flags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP)) { replay_esn->oseq--; xfrm_audit_state_replay_overflow(x, skb); err = -EOVERFLOW; @@ -572,7 +574,8 @@ static int xfrm_replay_overflow_offload(struct xfrm_state *x, struct sk_buff *sk XFRM_SKB_CB(skb)->seq.output.hi = 0; xo->seq.hi = 0; - if (unlikely(oseq < x->replay.oseq)) { + if (unlikely(oseq < x->replay.oseq) && + !(x->props.extra_flags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP)) { xfrm_audit_state_replay_overflow(x, skb); err = -EOVERFLOW; @@ -611,7 +614,8 @@ static int xfrm_replay_overflow_offload_bmp(struct xfrm_state *x, struct sk_buff XFRM_SKB_CB(skb)->seq.output.hi = 0; xo->seq.hi = 0; - if (unlikely(oseq < replay_esn->oseq)) { + if (unlikely(oseq < replay_esn->oseq) && + !(x->props.extra_flags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP)) { xfrm_audit_state_replay_overflow(x, skb); err = -EOVERFLOW; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 8be2d926acc2..69520ad3d83b 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2264,7 +2264,7 @@ static bool km_is_alive(const struct km_event *c) return is_alive; } -int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen) +int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval, int optlen) { int err; u8 *data; @@ -2274,7 +2274,7 @@ int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen if (in_compat_syscall()) return -EOPNOTSUPP; - if (!optval && !optlen) { + if (sockptr_is_null(optval) && !optlen) { xfrm_sk_policy_insert(sk, XFRM_POLICY_IN, NULL); xfrm_sk_policy_insert(sk, XFRM_POLICY_OUT, NULL); __sk_dst_reset(sk); @@ -2284,7 +2284,7 @@ int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen if (optlen <= 0 || optlen > PAGE_SIZE) return -EMSGSIZE; - data = memdup_user(optval, optlen); + data = memdup_sockptr(optval, optlen); if (IS_ERR(data)) return PTR_ERR(data); |