diff options
Diffstat (limited to 'net')
413 files changed, 18986 insertions, 6498 deletions
diff --git a/net/8021q/Makefile b/net/8021q/Makefile index 9b703454b93e..e05d4d7aab35 100644 --- a/net/8021q/Makefile +++ b/net/8021q/Makefile @@ -9,4 +9,3 @@ obj-$(CONFIG_VLAN_8021Q) += 8021q.o 8021q-$(CONFIG_VLAN_8021Q_GVRP) += vlan_gvrp.o 8021q-$(CONFIG_VLAN_8021Q_MVRP) += vlan_mvrp.o 8021q-$(CONFIG_PROC_FS) += vlanproc.o - diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 73a65789271b..5e9950453955 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -647,13 +647,14 @@ out: return err; } -static struct sk_buff **vlan_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *vlan_gro_receive(struct list_head *head, + struct sk_buff *skb) { - struct sk_buff *p, **pp = NULL; - struct vlan_hdr *vhdr; - unsigned int hlen, off_vlan; const struct packet_offload *ptype; + unsigned int hlen, off_vlan; + struct sk_buff *pp = NULL; + struct vlan_hdr *vhdr; + struct sk_buff *p; __be16 type; int flush = 1; @@ -675,7 +676,7 @@ static struct sk_buff **vlan_gro_receive(struct sk_buff **head, flush = 0; - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { struct vlan_hdr *vhdr2; if (!NAPI_GRO_CB(p)->same_flow) @@ -693,7 +694,7 @@ static struct sk_buff **vlan_gro_receive(struct sk_buff **head, out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_flush_final(skb, pp, flush); return pp; } diff --git a/net/9p/client.c b/net/9p/client.c index 18c5271910dc..5c1343195292 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -225,7 +225,8 @@ static int parse_opts(char *opts, struct p9_client *clnt) } free_and_return: - v9fs_put_trans(clnt->trans_mod); + if (ret) + v9fs_put_trans(clnt->trans_mod); kfree(tmp_options); return ret; } diff --git a/net/Kconfig b/net/Kconfig index f738a6f27665..228dfa382eec 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -12,7 +12,7 @@ menuconfig NET The reason is that some programs need kernel networking support even when running on a stand-alone machine that isn't connected to 
any other computer. - + If you are upgrading from an older kernel, you should consider updating your networking tools too because changes in the kernel and the tools often go hand in hand. The tools are diff --git a/net/Makefile b/net/Makefile index 13ec0d5415c7..bdaf53925acd 100644 --- a/net/Makefile +++ b/net/Makefile @@ -20,11 +20,7 @@ obj-$(CONFIG_TLS) += tls/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX) += unix/ obj-$(CONFIG_NET) += ipv6/ -ifneq ($(CC_CAN_LINK),y) -$(warning CC cannot link executables. Skipping bpfilter.) -else obj-$(CONFIG_BPFILTER) += bpfilter/ -endif obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ obj-$(CONFIG_BRIDGE) += bridge/ diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 55fdba05d7d9..9b6bc5abe946 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1869,7 +1869,7 @@ static const struct proto_ops atalk_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = atalk_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = atalk_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = atalk_compat_ioctl, diff --git a/net/atm/common.c b/net/atm/common.c index ff5748b2190f..a7a68e509628 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -647,11 +647,16 @@ out: return error; } -__poll_t vcc_poll_mask(struct socket *sock, __poll_t events) +__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - struct atm_vcc *vcc = ATM_SD(sock); - __poll_t mask = 0; + struct atm_vcc *vcc; + __poll_t mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; + + vcc = ATM_SD(sock); /* exceptional events */ if (sk->sk_err) diff --git a/net/atm/common.h b/net/atm/common.h index 526796ad230f..5850649068bb 100644 --- a/net/atm/common.h +++ b/net/atm/common.h @@ -17,7 +17,7 @@ int vcc_connect(struct socket *sock, int itf, short vpi, int vci); int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); int 
vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len); -__poll_t vcc_poll_mask(struct socket *sock, __poll_t events); +__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait); int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int vcc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int vcc_setsockopt(struct socket *sock, int level, int optname, diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c index b93cc0f18292..46d6cd9a36ae 100644 --- a/net/atm/mpoa_proc.c +++ b/net/atm/mpoa_proc.c @@ -307,9 +307,3 @@ void mpc_proc_clean(void) } #endif /* CONFIG_PROC_FS */ - - - - - - diff --git a/net/atm/pvc.c b/net/atm/pvc.c index 9f75092fe778..2cb10af16afc 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -113,7 +113,7 @@ static const struct proto_ops pvc_proto_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pvc_getname, - .poll_mask = vcc_poll_mask, + .poll = vcc_poll, .ioctl = vcc_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = vcc_compat_ioctl, diff --git a/net/atm/svc.c b/net/atm/svc.c index 53f4ad7087b1..2f91b766ac42 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -636,7 +636,7 @@ static const struct proto_ops svc_proto_ops = { .socketpair = sock_no_socketpair, .accept = svc_accept, .getname = svc_getname, - .poll_mask = vcc_poll_mask, + .poll = vcc_poll, .ioctl = svc_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = svc_compat_ioctl, diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index d1d2442ce573..c603d33d5410 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1941,7 +1941,7 @@ static const struct proto_ops ax25_proto_ops = { .socketpair = sock_no_socketpair, .accept = ax25_accept, .getname = ax25_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = ax25_ioctl, .listen = ax25_listen, .shutdown = ax25_shutdown, diff --git a/net/ax25/ax25_addr.c b/net/ax25/ax25_addr.c index ac2542b7be88..a14cfa736b63 100644 --- 
a/net/ax25/ax25_addr.c +++ b/net/ax25/ax25_addr.c @@ -304,4 +304,3 @@ void ax25_digi_invert(const ax25_digi *in, ax25_digi *out) } } } - diff --git a/net/ax25/ax25_ds_in.c b/net/ax25/ax25_ds_in.c index 891596e74278..488fc2d7085a 100644 --- a/net/ax25/ax25_ds_in.c +++ b/net/ax25/ax25_ds_in.c @@ -299,4 +299,3 @@ int ax25_ds_frame_in(ax25_cb *ax25, struct sk_buff *skb, int type) return queued; } - diff --git a/net/ax25/ax25_ds_subr.c b/net/ax25/ax25_ds_subr.c index 28827e81ba2b..bc0329f43013 100644 --- a/net/ax25/ax25_ds_subr.c +++ b/net/ax25/ax25_ds_subr.c @@ -205,4 +205,3 @@ void ax25_dama_off(ax25_cb *ax25) ax25->condition &= ~AX25_COND_DAMA_MODE; ax25_dev_dama_off(ax25->ax25_dev); } - diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c index 183b1c583d56..70417e9b932d 100644 --- a/net/ax25/ax25_ip.c +++ b/net/ax25/ax25_ip.c @@ -249,4 +249,3 @@ const struct header_ops ax25_header_ops = { EXPORT_SYMBOL(ax25_header_ops); EXPORT_SYMBOL(ax25_ip_xmit); - diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c index b11a5f466fcc..3e5afc8dc93e 100644 --- a/net/ax25/ax25_out.c +++ b/net/ax25/ax25_out.c @@ -394,4 +394,3 @@ int ax25_check_iframes_acked(ax25_cb *ax25, unsigned short nr) } return 0; } - diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index de8034d80623..361116f77cb9 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -24,7 +24,6 @@ config BATMAN_ADV depends on NET select CRC16 select LIBCRC32C - default n help B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is a routing protocol for multi-hop ad-hoc mesh networks. The @@ -33,7 +32,7 @@ config BATMAN_ADV tools. config BATMAN_ADV_BATMAN_V - bool "B.A.T.M.A.N. V protocol (experimental)" + bool "B.A.T.M.A.N. 
V protocol" depends on BATMAN_ADV && !(CFG80211=m && BATMAN_ADV=y) default y help @@ -60,7 +59,7 @@ config BATMAN_ADV_BLA config BATMAN_ADV_DAT bool "Distributed ARP Table" depends on BATMAN_ADV && INET - default n + default y help This option enables DAT (Distributed ARP Table), a DHT based mechanism that increases ARP reliability on sparse wireless @@ -70,7 +69,6 @@ config BATMAN_ADV_DAT config BATMAN_ADV_NC bool "Network Coding" depends on BATMAN_ADV - default n help This option enables network coding, a mechanism that aims to increase the overall network throughput by fusing multiple @@ -84,7 +82,6 @@ config BATMAN_ADV_NC config BATMAN_ADV_MCAST bool "Multicast optimisation" depends on BATMAN_ADV && INET && !(BRIDGE=m && BATMAN_ADV=y) - default n help This option enables the multicast optimisation which aims to reduce the air overhead while improving the reliability of @@ -94,7 +91,6 @@ config BATMAN_ADV_DEBUGFS bool "batman-adv debugfs entries" depends on BATMAN_ADV depends on DEBUG_FS - default n help Enable this to export routing related debug tables via debugfs. 
The information for each soft-interface and used hard-interface can be diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index be09a9883825..73bf6a93a3cf 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -2732,7 +2732,7 @@ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, { struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_neigh_node *router; - struct batadv_gw_node *curr_gw; + struct batadv_gw_node *curr_gw = NULL; int ret = 0; void *hdr; @@ -2780,6 +2780,8 @@ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, ret = 0; out: + if (curr_gw) + batadv_gw_node_put(curr_gw); if (router_ifinfo) batadv_neigh_ifinfo_put(router_ifinfo); if (router) diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h index 317cafd302cf..3dc6a7a43eb7 100644 --- a/net/batman-adv/bat_iv_ogm.h +++ b/net/batman-adv/bat_iv_ogm.h @@ -16,11 +16,11 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ -#ifndef _BATMAN_ADV_BATADV_IV_OGM_H_ -#define _BATMAN_ADV_BATADV_IV_OGM_H_ +#ifndef _NET_BATMAN_ADV_BAT_IV_OGM_H_ +#define _NET_BATMAN_ADV_BAT_IV_OGM_H_ #include "main.h" int batadv_iv_init(void); -#endif /* _BATMAN_ADV_BATADV_IV_OGM_H_ */ +#endif /* _NET_BATMAN_ADV_BAT_IV_OGM_H_ */ diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index ec93337ee259..6baec4e68898 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -927,7 +927,7 @@ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, { struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_neigh_node *router; - struct batadv_gw_node *curr_gw; + struct batadv_gw_node *curr_gw = NULL; int ret = 0; void *hdr; @@ -995,6 +995,8 @@ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, ret = 0; out: + if (curr_gw) + batadv_gw_node_put(curr_gw); if (router_ifinfo) batadv_neigh_ifinfo_put(router_ifinfo); if (router) diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h index ed36c5e79fde..e5be14c908c6 100644 --- a/net/batman-adv/bat_v_ogm.h +++ b/net/batman-adv/bat_v_ogm.h @@ -16,8 +16,8 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ -#ifndef _BATMAN_ADV_BATADV_V_OGM_H_ -#define _BATMAN_ADV_BATADV_V_OGM_H_ +#ifndef _NET_BATMAN_ADV_BAT_V_OGM_H_ +#define _NET_BATMAN_ADV_BAT_V_OGM_H_ #include "main.h" @@ -34,4 +34,4 @@ void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface); int batadv_v_ogm_packet_recv(struct sk_buff *skb, struct batadv_hard_iface *if_incoming); -#endif /* _BATMAN_ADV_BATADV_V_OGM_H_ */ +#endif /* _NET_BATMAN_ADV_BAT_V_OGM_H_ */ diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 4229b01ac7b5..3cb82378300b 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -19,6 +19,7 @@ #include "debugfs.h" #include "main.h" +#include <linux/dcache.h> #include <linux/debugfs.h> #include <linux/err.h> #include <linux/errno.h> @@ -117,7 +118,7 @@ static int batadv_bla_backbone_table_open(struct inode *inode, #ifdef CONFIG_BATMAN_ADV_DAT /** - * batadv_dat_cache_open() - Prepare file handler for reads from dat_chache + * batadv_dat_cache_open() - Prepare file handler for reads from dat_cache * @inode: inode which was opened * @file: file handle to be initialized * @@ -344,6 +345,25 @@ out: } /** + * batadv_debugfs_rename_hardif() - Fix debugfs path for renamed hardif + * @hard_iface: hard interface which was renamed + */ +void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface) +{ + const char *name = hard_iface->net_dev->name; + struct dentry *dir; + struct dentry *d; + + dir = hard_iface->debug_dir; + if (!dir) + return; + + d = debugfs_rename(dir->d_parent, dir, dir->d_parent, name); + if (!d) + pr_err("Can't rename debugfs dir to %s\n", name); +} + +/** * batadv_debugfs_del_hardif() - delete the base directory for a hard interface * in debugfs. * @hard_iface: hard interface which is deleted. 
@@ -414,6 +434,26 @@ out: } /** + * batadv_debugfs_rename_meshif() - Fix debugfs path for renamed softif + * @dev: net_device which was renamed + */ +void batadv_debugfs_rename_meshif(struct net_device *dev) +{ + struct batadv_priv *bat_priv = netdev_priv(dev); + const char *name = dev->name; + struct dentry *dir; + struct dentry *d; + + dir = bat_priv->debug_dir; + if (!dir) + return; + + d = debugfs_rename(dir->d_parent, dir, dir->d_parent, name); + if (!d) + pr_err("Can't rename debugfs dir to %s\n", name); +} + +/** * batadv_debugfs_del_meshif() - Remove interface dependent debugfs entries * @dev: netdev struct of the soft interface */ diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 37b069698b04..08a592ffbee5 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -30,8 +30,10 @@ struct net_device; void batadv_debugfs_init(void); void batadv_debugfs_destroy(void); int batadv_debugfs_add_meshif(struct net_device *dev); +void batadv_debugfs_rename_meshif(struct net_device *dev); void batadv_debugfs_del_meshif(struct net_device *dev); int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface); +void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface); void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface); #else @@ -49,6 +51,10 @@ static inline int batadv_debugfs_add_meshif(struct net_device *dev) return 0; } +static inline void batadv_debugfs_rename_meshif(struct net_device *dev) +{ +} + static inline void batadv_debugfs_del_meshif(struct net_device *dev) { } @@ -60,6 +66,11 @@ int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface) } static inline +void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface) +{ +} + +static inline void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface) { } diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index c405d15befd6..2f0d42f2f913 100644 --- a/net/batman-adv/hard-interface.c +++ 
b/net/batman-adv/hard-interface.c @@ -989,6 +989,32 @@ void batadv_hardif_remove_interfaces(void) rtnl_unlock(); } +/** + * batadv_hard_if_event_softif() - Handle events for soft interfaces + * @event: NETDEV_* event to handle + * @net_dev: net_device which generated an event + * + * Return: NOTIFY_* result + */ +static int batadv_hard_if_event_softif(unsigned long event, + struct net_device *net_dev) +{ + struct batadv_priv *bat_priv; + + switch (event) { + case NETDEV_REGISTER: + batadv_sysfs_add_meshif(net_dev); + bat_priv = netdev_priv(net_dev); + batadv_softif_create_vlan(bat_priv, BATADV_NO_FLAGS); + break; + case NETDEV_CHANGENAME: + batadv_debugfs_rename_meshif(net_dev); + break; + } + + return NOTIFY_DONE; +} + static int batadv_hard_if_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -997,12 +1023,8 @@ static int batadv_hard_if_event(struct notifier_block *this, struct batadv_hard_iface *primary_if = NULL; struct batadv_priv *bat_priv; - if (batadv_softif_is_valid(net_dev) && event == NETDEV_REGISTER) { - batadv_sysfs_add_meshif(net_dev); - bat_priv = netdev_priv(net_dev); - batadv_softif_create_vlan(bat_priv, BATADV_NO_FLAGS); - return NOTIFY_DONE; - } + if (batadv_softif_is_valid(net_dev)) + return batadv_hard_if_event_softif(event, net_dev); hard_iface = batadv_hardif_get_by_netdev(net_dev); if (!hard_iface && (event == NETDEV_REGISTER || @@ -1051,6 +1073,9 @@ static int batadv_hard_if_event(struct notifier_block *this, if (batadv_is_wifi_hardif(hard_iface)) hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS; break; + case NETDEV_CHANGENAME: + batadv_debugfs_rename_hardif(hard_iface); + break; default: break; } diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 716e5b43acfa..1d295da3e342 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1339,7 +1339,11 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, return false; } -static void _batadv_purge_orig(struct 
batadv_priv *bat_priv) +/** + * batadv_purge_orig_ref() - Purge all outdated originators + * @bat_priv: the bat priv with all the soft interface information + */ +void batadv_purge_orig_ref(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_node *node_tmp; @@ -1385,21 +1389,12 @@ static void batadv_purge_orig(struct work_struct *work) delayed_work = to_delayed_work(work); bat_priv = container_of(delayed_work, struct batadv_priv, orig_work); - _batadv_purge_orig(bat_priv); + batadv_purge_orig_ref(bat_priv); queue_delayed_work(batadv_event_workqueue, &bat_priv->orig_work, msecs_to_jiffies(BATADV_ORIG_WORK_PERIOD)); } -/** - * batadv_purge_orig_ref() - Purge all outdated originators - * @bat_priv: the bat priv with all the soft interface information - */ -void batadv_purge_orig_ref(struct batadv_priv *bat_priv) -{ - _batadv_purge_orig(bat_priv); -} - #ifdef CONFIG_BATMAN_ADV_DEBUGFS /** diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 3986551397ca..12a2b7d21376 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1705,7 +1705,9 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, ether_addr_copy(common->addr, tt_addr); common->vid = vid; - common->flags = flags; + if (!is_multicast_ether_addr(common->addr)) + common->flags = flags & (~BATADV_TT_SYNC_MASK); + tt_global_entry->roam_at = 0; /* node must store current time in case of roaming. 
This is * needed to purge this entry out on timeout (if nobody claims @@ -1768,7 +1770,8 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, * TT_CLIENT_TEMP, therefore they have to be copied in the * client entry */ - common->flags |= flags & (~BATADV_TT_SYNC_MASK); + if (!is_multicast_ether_addr(common->addr)) + common->flags |= flags & (~BATADV_TT_SYNC_MASK); /* If there is the BATADV_TT_CLIENT_ROAM flag set, there is only * one originator left in the list and we previously received a diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 360357f83f20..343d304851a5 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -43,12 +43,13 @@ struct seq_file; #ifdef CONFIG_BATMAN_ADV_DAT /** - * batadv_dat_addr_t - it is the type used for all DHT addresses. If it is - * changed, BATADV_DAT_ADDR_MAX is changed as well. + * typedef batadv_dat_addr_t - type used for all DHT addresses + * + * If it is changed, BATADV_DAT_ADDR_MAX is changed as well. * * *Please be careful: batadv_dat_addr_t must be UNSIGNED* */ -#define batadv_dat_addr_t u16 +typedef u16 batadv_dat_addr_t; #endif /* CONFIG_BATMAN_ADV_DAT */ diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 510ab4f55df5..3264e1873219 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -437,13 +437,16 @@ static inline __poll_t bt_accept_poll(struct sock *parent) return 0; } -__poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events) +__poll_t bt_sock_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask = 0; BT_DBG("sock %p, sk %p", sock, sk); + poll_wait(file, sk_sleep(sk), wait); + if (sk->sk_state == BT_LISTEN) return bt_accept_poll(sk); @@ -475,7 +478,7 @@ __poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events) return mask; } -EXPORT_SYMBOL(bt_sock_poll_mask); +EXPORT_SYMBOL(bt_sock_poll); int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned 
long arg) { diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index d6c099861538..1506e1632394 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1975,7 +1975,7 @@ static const struct proto_ops hci_sock_ops = { .sendmsg = hci_sock_sendmsg, .recvmsg = hci_sock_recvmsg, .ioctl = hci_sock_ioctl, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = hci_sock_setsockopt, diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 742a190034e6..686bdc6b35b0 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1653,7 +1653,7 @@ static const struct proto_ops l2cap_sock_ops = { .getname = l2cap_sock_getname, .sendmsg = l2cap_sock_sendmsg, .recvmsg = l2cap_sock_recvmsg, - .poll_mask = bt_sock_poll_mask, + .poll = bt_sock_poll, .ioctl = bt_sock_ioctl, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 1cf57622473a..d606e9212291 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -1049,7 +1049,7 @@ static const struct proto_ops rfcomm_sock_ops = { .setsockopt = rfcomm_sock_setsockopt, .getsockopt = rfcomm_sock_getsockopt, .ioctl = rfcomm_sock_ioctl, - .poll_mask = bt_sock_poll_mask, + .poll = bt_sock_poll, .socketpair = sock_no_socketpair, .mmap = sock_no_mmap }; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index d60dbc61d170..413b8ee49fec 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -1197,7 +1197,7 @@ static const struct proto_ops sco_sock_ops = { .getname = sco_sock_getname, .sendmsg = sco_sock_sendmsg, .recvmsg = sco_sock_recvmsg, - .poll_mask = bt_sock_poll_mask, + .poll = bt_sock_poll, .ioctl = bt_sock_ioctl, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 68c3578343b4..22a78eedf4b1 100644 --- a/net/bpf/test_run.c +++ 
b/net/bpf/test_run.c @@ -96,6 +96,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; u32 retval, duration; + int hh_len = ETH_HLEN; struct sk_buff *skb; void *data; int ret; @@ -131,12 +132,22 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, skb_reset_network_header(skb); if (is_l2) - __skb_push(skb, ETH_HLEN); + __skb_push(skb, hh_len); if (is_direct_pkt_access) bpf_compute_data_pointers(skb); retval = bpf_test_run(prog, skb, repeat, &duration); - if (!is_l2) - __skb_push(skb, ETH_HLEN); + if (!is_l2) { + if (skb_headroom(skb) < hh_len) { + int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); + + if (pskb_expand_head(skb, nhead, 0, GFP_USER)) { + kfree_skb(skb); + return -ENOMEM; + } + } + memset(__skb_push(skb, hh_len), 0, hh_len); + } + size = skb->len; /* bpf program can never convert linear skb to non-linear */ if (WARN_ON_ONCE(skb_is_nonlinear(skb))) diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig index a948b072c28f..e558b46596c4 100644 --- a/net/bpfilter/Kconfig +++ b/net/bpfilter/Kconfig @@ -1,6 +1,5 @@ menuconfig BPFILTER bool "BPF based packet filtering framework (BPFILTER)" - default n depends on NET && BPF && INET help This builds experimental bpfilter framework that is aiming to @@ -9,8 +8,8 @@ menuconfig BPFILTER if BPFILTER config BPFILTER_UMH tristate "bpfilter kernel module with user mode helper" + depends on $(success,$(srctree)/scripts/cc-can-link.sh $(CC)) default m help This builds bpfilter kernel module with embedded user mode helper endif - diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile index 051dc18b8ccb..39c6980b5d99 100644 --- a/net/bpfilter/Makefile +++ b/net/bpfilter/Makefile @@ -15,20 +15,7 @@ ifeq ($(CONFIG_BPFILTER_UMH), y) HOSTLDFLAGS += -static endif -# a bit of elf magic to convert bpfilter_umh binary into a binary blob -# inside bpfilter_umh.o elf file referenced by -# 
_binary_net_bpfilter_bpfilter_umh_start symbol -# which bpfilter_kern.c passes further into umh blob loader at run-time -quiet_cmd_copy_umh = GEN $@ - cmd_copy_umh = echo ':' > $(obj)/.bpfilter_umh.o.cmd; \ - $(OBJCOPY) -I binary \ - `LC_ALL=C $(OBJDUMP) -f net/bpfilter/bpfilter_umh \ - |awk -F' |,' '/file format/{print "-O",$$NF} \ - /^architecture:/{print "-B",$$2}'` \ - --rename-section .data=.init.rodata $< $@ - -$(obj)/bpfilter_umh.o: $(obj)/bpfilter_umh - $(call cmd,copy_umh) +$(obj)/bpfilter_umh_blob.o: $(obj)/bpfilter_umh obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o -bpfilter-objs += bpfilter_kern.o bpfilter_umh.o +bpfilter-objs += bpfilter_kern.o bpfilter_umh_blob.o diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index 09522573f611..f0fc182d3db7 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -10,11 +10,8 @@ #include <linux/file.h> #include "msgfmt.h" -#define UMH_start _binary_net_bpfilter_bpfilter_umh_start -#define UMH_end _binary_net_bpfilter_bpfilter_umh_end - -extern char UMH_start; -extern char UMH_end; +extern char bpfilter_umh_start; +extern char bpfilter_umh_end; static struct umh_info info; /* since ip_getsockopt() can run in parallel, serialize access to umh */ @@ -93,7 +90,9 @@ static int __init load_umh(void) int err; /* fork usermode process */ - err = fork_usermode_blob(&UMH_start, &UMH_end - &UMH_start, &info); + err = fork_usermode_blob(&bpfilter_umh_start, + &bpfilter_umh_end - &bpfilter_umh_start, + &info); if (err) return err; pr_info("Loaded bpfilter_umh pid %d\n", info.pid); diff --git a/net/bpfilter/bpfilter_umh_blob.S b/net/bpfilter/bpfilter_umh_blob.S new file mode 100644 index 000000000000..40311d10d2f2 --- /dev/null +++ b/net/bpfilter/bpfilter_umh_blob.S @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + .section .init.rodata, "a" + .global bpfilter_umh_start +bpfilter_umh_start: + .incbin "net/bpfilter/bpfilter_umh" + .global bpfilter_umh_end +bpfilter_umh_end: diff 
--git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 9019f326fe81..5372e2042adf 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -142,7 +142,20 @@ static int deliver_clone(const struct net_bridge_port *prev, void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, bool local_rcv, bool local_orig) { - if (to && should_deliver(to, skb)) { + if (unlikely(!to)) + goto out; + + /* redirect to backup link if the destination port is down */ + if (rcu_access_pointer(to->backup_port) && !netif_carrier_ok(to->dev)) { + struct net_bridge_port *backup_port; + + backup_port = rcu_dereference(to->backup_port); + if (unlikely(!backup_port)) + goto out; + to = backup_port; + } + + if (should_deliver(to, skb)) { if (local_rcv) deliver_clone(to, skb, local_orig); else @@ -150,6 +163,7 @@ void br_forward(const struct net_bridge_port *to, return; } +out: if (!local_rcv) kfree_skb(skb); } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 05e42d86882d..0363f1bdc401 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -26,6 +26,7 @@ #include <net/sock.h> #include <linux/if_vlan.h> #include <net/switchdev.h> +#include <net/net_namespace.h> #include "br_private.h" @@ -169,6 +170,58 @@ void br_manage_promisc(struct net_bridge *br) } } +int nbp_backup_change(struct net_bridge_port *p, + struct net_device *backup_dev) +{ + struct net_bridge_port *old_backup = rtnl_dereference(p->backup_port); + struct net_bridge_port *backup_p = NULL; + + ASSERT_RTNL(); + + if (backup_dev) { + if (!br_port_exists(backup_dev)) + return -ENOENT; + + backup_p = br_port_get_rtnl(backup_dev); + if (backup_p->br != p->br) + return -EINVAL; + } + + if (p == backup_p) + return -EINVAL; + + if (old_backup == backup_p) + return 0; + + /* if the backup link is already set, clear it */ + if (old_backup) + old_backup->backup_redirected_cnt--; + + if (backup_p) + backup_p->backup_redirected_cnt++; + rcu_assign_pointer(p->backup_port, backup_p); + + return 0; +} 
+ +static void nbp_backup_clear(struct net_bridge_port *p) +{ + nbp_backup_change(p, NULL); + if (p->backup_redirected_cnt) { + struct net_bridge_port *cur_p; + + list_for_each_entry(cur_p, &p->br->port_list, list) { + struct net_bridge_port *backup_p; + + backup_p = rtnl_dereference(cur_p->backup_port); + if (backup_p == p) + nbp_backup_change(cur_p, NULL); + } + } + + WARN_ON(rcu_access_pointer(p->backup_port) || p->backup_redirected_cnt); +} + static void nbp_update_port_count(struct net_bridge *br) { struct net_bridge_port *p; @@ -204,11 +257,19 @@ static void release_nbp(struct kobject *kobj) kfree(p); } +static void brport_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid) +{ + struct net_bridge_port *p = kobj_to_brport(kobj); + + net_ns_get_ownership(dev_net(p->dev), uid, gid); +} + static struct kobj_type brport_ktype = { #ifdef CONFIG_SYSFS .sysfs_ops = &brport_sysfs_ops, #endif .release = release_nbp, + .get_ownership = brport_get_ownership, }; static void destroy_nbp(struct net_bridge_port *p) @@ -286,6 +347,7 @@ static void del_nbp(struct net_bridge_port *p) nbp_vlan_flush(p); br_fdb_delete_by_port(br, p, 0, 1); switchdev_deferred_process(); + nbp_backup_clear(p); nbp_update_port_count(br); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 9f5eb05b0373..ec2b58a09f76 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -169,13 +169,15 @@ static inline size_t br_nlmsg_size(struct net_device *dev, u32 filter_mask) + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(br_port_info_size()) /* IFLA_PROTINFO */ + nla_total_size(br_get_link_af_size_filtered(dev, - filter_mask)); /* IFLA_AF_SPEC */ + filter_mask)) /* IFLA_AF_SPEC */ + + nla_total_size(4); /* IFLA_BRPORT_BACKUP_PORT */ } static int br_port_fill_attrs(struct sk_buff *skb, const struct net_bridge_port *p) { u8 mode = !!(p->flags & BR_HAIRPIN_MODE); + struct net_bridge_port *backup_p; u64 timerval; if (nla_put_u8(skb, IFLA_BRPORT_STATE, p->state) || 
@@ -237,6 +239,14 @@ static int br_port_fill_attrs(struct sk_buff *skb, return -EMSGSIZE; #endif + /* we might be called only with br->lock */ + rcu_read_lock(); + backup_p = rcu_dereference(p->backup_port); + if (backup_p) + nla_put_u32(skb, IFLA_BRPORT_BACKUP_PORT, + backup_p->dev->ifindex); + rcu_read_unlock(); + return 0; } @@ -663,6 +673,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 }, [IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 }, [IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 }, + [IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 }, }; /* Change the state of the port and notify spanning tree */ @@ -817,6 +828,23 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) if (err) return err; + if (tb[IFLA_BRPORT_BACKUP_PORT]) { + struct net_device *backup_dev = NULL; + u32 backup_ifindex; + + backup_ifindex = nla_get_u32(tb[IFLA_BRPORT_BACKUP_PORT]); + if (backup_ifindex) { + backup_dev = __dev_get_by_index(dev_net(p->dev), + backup_ifindex); + if (!backup_dev) + return -ENOENT; + } + + err = nbp_backup_change(p, backup_dev); + if (err) + return err; + } + br_port_flags_change(p, old_flags ^ p->flags); return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 5216a524b537..11ed2029985f 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -237,6 +237,7 @@ struct net_bridge_port { #ifdef CONFIG_BRIDGE_VLAN_FILTERING struct net_bridge_vlan_group __rcu *vlgrp; #endif + struct net_bridge_port __rcu *backup_port; /* STP */ u8 priority; @@ -281,8 +282,11 @@ struct net_bridge_port { int offload_fwd_mark; #endif u16 group_fwd_mask; + u16 backup_redirected_cnt; }; +#define kobj_to_brport(obj) container_of(obj, struct net_bridge_port, kobj) + #define br_auto_port(p) ((p)->flags & BR_AUTO_MASK) #define br_promisc_port(p) ((p)->flags & BR_PROMISC) @@ -595,6 +599,7 @@ netdev_features_t br_features_recompute(struct net_bridge *br, netdev_features_t 
features); void br_port_flags_change(struct net_bridge_port *port, unsigned long mask); void br_manage_promisc(struct net_bridge *br); +int nbp_backup_change(struct net_bridge_port *p, struct net_device *backup_dev); /* br_input.c */ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb); diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index f99c5bf5c906..7c87a2fe5248 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -25,6 +25,15 @@ struct brport_attribute { struct attribute attr; ssize_t (*show)(struct net_bridge_port *, char *); int (*store)(struct net_bridge_port *, unsigned long); + int (*store_raw)(struct net_bridge_port *, char *); +}; + +#define BRPORT_ATTR_RAW(_name, _mode, _show, _store) \ +const struct brport_attribute brport_attr_##_name = { \ + .attr = {.name = __stringify(_name), \ + .mode = _mode }, \ + .show = _show, \ + .store_raw = _store, \ }; #define BRPORT_ATTR(_name, _mode, _show, _store) \ @@ -182,6 +191,38 @@ static int store_group_fwd_mask(struct net_bridge_port *p, static BRPORT_ATTR(group_fwd_mask, 0644, show_group_fwd_mask, store_group_fwd_mask); +static ssize_t show_backup_port(struct net_bridge_port *p, char *buf) +{ + struct net_bridge_port *backup_p; + int ret = 0; + + rcu_read_lock(); + backup_p = rcu_dereference(p->backup_port); + if (backup_p) + ret = sprintf(buf, "%s\n", backup_p->dev->name); + rcu_read_unlock(); + + return ret; +} + +static int store_backup_port(struct net_bridge_port *p, char *buf) +{ + struct net_device *backup_dev = NULL; + char *nl = strchr(buf, '\n'); + + if (nl) + *nl = '\0'; + + if (strlen(buf) > 0) { + backup_dev = __dev_get_by_name(dev_net(p->dev), buf); + if (!backup_dev) + return -ENOENT; + } + + return nbp_backup_change(p, backup_dev); +} +static BRPORT_ATTR_RAW(backup_port, 0644, show_backup_port, store_backup_port); + BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE); BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD); 
BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK); @@ -245,17 +286,17 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_group_fwd_mask, &brport_attr_neigh_suppress, &brport_attr_isolated, + &brport_attr_backup_port, NULL }; #define to_brport_attr(_at) container_of(_at, struct brport_attribute, attr) -#define to_brport(obj) container_of(obj, struct net_bridge_port, kobj) static ssize_t brport_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct brport_attribute *brport_attr = to_brport_attr(attr); - struct net_bridge_port *p = to_brport(kobj); + struct net_bridge_port *p = kobj_to_brport(kobj); if (!brport_attr->show) return -EINVAL; @@ -268,29 +309,48 @@ static ssize_t brport_store(struct kobject *kobj, const char *buf, size_t count) { struct brport_attribute *brport_attr = to_brport_attr(attr); - struct net_bridge_port *p = to_brport(kobj); + struct net_bridge_port *p = kobj_to_brport(kobj); ssize_t ret = -EINVAL; - char *endp; unsigned long val; + char *endp; if (!ns_capable(dev_net(p->dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; - val = simple_strtoul(buf, &endp, 0); - if (endp != buf) { - if (!rtnl_trylock()) - return restart_syscall(); - if (p->dev && p->br && brport_attr->store) { - spin_lock_bh(&p->br->lock); - ret = brport_attr->store(p, val); - spin_unlock_bh(&p->br->lock); - if (!ret) { - br_ifinfo_notify(RTM_NEWLINK, NULL, p); - ret = count; - } + if (!rtnl_trylock()) + return restart_syscall(); + + if (!p->dev || !p->br) + goto out_unlock; + + if (brport_attr->store_raw) { + char *buf_copy; + + buf_copy = kstrndup(buf, count, GFP_KERNEL); + if (!buf_copy) { + ret = -ENOMEM; + goto out_unlock; } - rtnl_unlock(); + spin_lock_bh(&p->br->lock); + ret = brport_attr->store_raw(p, buf_copy); + spin_unlock_bh(&p->br->lock); + kfree(buf_copy); + } else if (brport_attr->store) { + val = simple_strtoul(buf, &endp, 0); + if (endp == buf) + goto out_unlock; + spin_lock_bh(&p->br->lock); + ret = brport_attr->store(p, val); + 
spin_unlock_bh(&p->br->lock); } + + if (!ret) { + br_ifinfo_notify(RTM_NEWLINK, NULL, p); + ret = count; + } +out_unlock: + rtnl_unlock(); + return ret; } diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index 6de981270566..08cbed7d940e 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -89,8 +89,7 @@ static void nft_reject_br_send_v4_tcp_reset(struct net *net, niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, net->ipv4.sysctl_ip_default_ttl); nf_reject_ip_tcphdr_put(nskb, oldskb, oth); - niph->ttl = net->ipv4.sysctl_ip_default_ttl; - niph->tot_len = htons(nskb->len); + niph->tot_len = htons(nskb->len); ip_send_check(niph); nft_reject_br_push_etherhdr(oldskb, nskb); diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index e0adcd123f48..711d7156efd8 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -131,8 +131,10 @@ static void caif_flow_cb(struct sk_buff *skb) caifd = caif_get(skb->dev); WARN_ON(caifd == NULL); - if (caifd == NULL) + if (!caifd) { + rcu_read_unlock(); return; + } caifd_hold(caifd); rcu_read_unlock(); diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index c7991867d622..a6fb1b3bcad9 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -934,11 +934,15 @@ static int caif_release(struct socket *sock) } /* Copied from af_unix.c:unix_poll(), added CAIF tx_flow handling */ -static __poll_t caif_poll_mask(struct socket *sock, __poll_t events) +static __poll_t caif_poll(struct file *file, + struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; + __poll_t mask; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); - __poll_t mask = 0; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; /* exceptional events? 
*/ if (sk->sk_err) @@ -972,7 +976,7 @@ static const struct proto_ops caif_seqpacket_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = caif_poll_mask, + .poll = caif_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -993,7 +997,7 @@ static const struct proto_ops caif_stream_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = caif_poll_mask, + .poll = caif_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/can/bcm.c b/net/can/bcm.c index 9393f25df08d..0af8f0db892a 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1660,7 +1660,7 @@ static const struct proto_ops bcm_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/can/raw.c b/net/can/raw.c index fd7e2f49ea6a..1051eee82581 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -843,7 +843,7 @@ static const struct proto_ops raw_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = raw_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/core/datagram.c b/net/core/datagram.c index f19bf3dc2bd6..9938952c5c78 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -819,8 +819,9 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); /** * datagram_poll - generic datagram poll + * @file: file struct * @sock: socket - * @events to wait for + * @wait: poll table * * Datagram poll: Again totally generic. 
This also handles * sequenced packet sockets providing the socket receive queue @@ -830,10 +831,14 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); * and you use a different write policy from sock_writeable() * then please supply your own write_space callback. */ -__poll_t datagram_poll_mask(struct socket *sock, __poll_t events) +__poll_t datagram_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; - __poll_t mask = 0; + __poll_t mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; /* exceptional events? */ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) @@ -866,4 +871,4 @@ __poll_t datagram_poll_mask(struct socket *sock, __poll_t events) return mask; } -EXPORT_SYMBOL(datagram_poll_mask); +EXPORT_SYMBOL(datagram_poll); diff --git a/net/core/dev.c b/net/core/dev.c index a5aa1c7444e6..87c42c8249ae 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -149,7 +149,6 @@ #include "net-sysfs.h" -/* Instead of increasing this, you should create a hash table. */ #define MAX_GRO_SKBS 8 /* This should be increased if a protocol with a bigger head is added. 
*/ @@ -2068,11 +2067,13 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; int i; + /* walk through the TCs and see if it falls into any of them */ for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { if ((txq - tc->offset) < tc->count) return i; } + /* didn't find it, just return -1 to indicate no match */ return -1; } @@ -2081,6 +2082,10 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) EXPORT_SYMBOL(netdev_txq_to_tc); #ifdef CONFIG_XPS +struct static_key xps_needed __read_mostly; +EXPORT_SYMBOL(xps_needed); +struct static_key xps_rxqs_needed __read_mostly; +EXPORT_SYMBOL(xps_rxqs_needed); static DEFINE_MUTEX(xps_map_mutex); #define xmap_dereference(P) \ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) @@ -2092,7 +2097,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps, int pos; if (dev_maps) - map = xmap_dereference(dev_maps->cpu_map[tci]); + map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) return false; @@ -2105,7 +2110,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps, break; } - RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL); + RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); kfree_rcu(map, rcu); return false; } @@ -2135,33 +2140,68 @@ static bool remove_xps_queue_cpu(struct net_device *dev, return active; } +static void clean_xps_maps(struct net_device *dev, const unsigned long *mask, + struct xps_dev_maps *dev_maps, unsigned int nr_ids, + u16 offset, u16 count, bool is_rxqs_map) +{ + bool active = false; + int i, j; + + for (j = -1; j = netif_attrmask_next(j, mask, nr_ids), + j < nr_ids;) + active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, + count); + if (!active) { + if (is_rxqs_map) { + RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); + } else { + RCU_INIT_POINTER(dev->xps_cpus_map, NULL); + + for (i = offset + (count - 1); count--; i--) + netdev_queue_numa_node_write( + netdev_get_tx_queue(dev, i), + NUMA_NO_NODE); + } + 
kfree_rcu(dev_maps, rcu); + } +} + static void netif_reset_xps_queues(struct net_device *dev, u16 offset, u16 count) { + const unsigned long *possible_mask = NULL; struct xps_dev_maps *dev_maps; - int cpu, i; - bool active = false; + unsigned int nr_ids; + + if (!static_key_false(&xps_needed)) + return; mutex_lock(&xps_map_mutex); - dev_maps = xmap_dereference(dev->xps_maps); + if (static_key_false(&xps_rxqs_needed)) { + dev_maps = xmap_dereference(dev->xps_rxqs_map); + if (dev_maps) { + nr_ids = dev->num_rx_queues; + clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, + offset, count, true); + } + } + + dev_maps = xmap_dereference(dev->xps_cpus_map); if (!dev_maps) goto out_no_maps; - for_each_possible_cpu(cpu) - active |= remove_xps_queue_cpu(dev, dev_maps, cpu, - offset, count); - - if (!active) { - RCU_INIT_POINTER(dev->xps_maps, NULL); - kfree_rcu(dev_maps, rcu); - } - - for (i = offset + (count - 1); count--; i--) - netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i), - NUMA_NO_NODE); + if (num_possible_cpus() > 1) + possible_mask = cpumask_bits(cpu_possible_mask); + nr_ids = nr_cpu_ids; + clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count, + false); out_no_maps: + if (static_key_enabled(&xps_rxqs_needed)) + static_key_slow_dec(&xps_rxqs_needed); + + static_key_slow_dec(&xps_needed); mutex_unlock(&xps_map_mutex); } @@ -2170,8 +2210,8 @@ static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) netif_reset_xps_queues(dev, index, dev->num_tx_queues - index); } -static struct xps_map *expand_xps_map(struct xps_map *map, - int cpu, u16 index) +static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, + u16 index, bool is_rxqs_map) { struct xps_map *new_map; int alloc_len = XPS_MIN_MAP_ALLOC; @@ -2183,7 +2223,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map, return map; } - /* Need to add queue to this CPU's existing map */ + /* Need to add tx-queue to this CPU's/rx-queue's existing map */ if 
(map) { if (pos < map->alloc_len) return map; @@ -2191,9 +2231,14 @@ static struct xps_map *expand_xps_map(struct xps_map *map, alloc_len = map->alloc_len * 2; } - /* Need to allocate new map to store queue on this CPU's map */ - new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, - cpu_to_node(cpu)); + /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's + * map + */ + if (is_rxqs_map) + new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL); + else + new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, + cpu_to_node(attr_index)); if (!new_map) return NULL; @@ -2205,32 +2250,52 @@ static struct xps_map *expand_xps_map(struct xps_map *map, return new_map; } -int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, - u16 index) +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, + u16 index, bool is_rxqs_map) { + const unsigned long *online_mask = NULL, *possible_mask = NULL; struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; - int i, cpu, tci, numa_node_id = -2; + int i, j, tci, numa_node_id = -2; int maps_sz, num_tc = 1, tc = 0; struct xps_map *map, *new_map; bool active = false; + unsigned int nr_ids; if (dev->num_tc) { + /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; + if (num_tc < 0) + return -EINVAL; + + /* If queue belongs to subordinate dev use its map */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? 
: dev; + tc = netdev_txq_to_tc(dev, index); if (tc < 0) return -EINVAL; } - maps_sz = XPS_DEV_MAPS_SIZE(num_tc); - if (maps_sz < L1_CACHE_BYTES) - maps_sz = L1_CACHE_BYTES; - mutex_lock(&xps_map_mutex); + if (is_rxqs_map) { + maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues); + dev_maps = xmap_dereference(dev->xps_rxqs_map); + nr_ids = dev->num_rx_queues; + } else { + maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); + if (num_possible_cpus() > 1) { + online_mask = cpumask_bits(cpu_online_mask); + possible_mask = cpumask_bits(cpu_possible_mask); + } + dev_maps = xmap_dereference(dev->xps_cpus_map); + nr_ids = nr_cpu_ids; + } - dev_maps = xmap_dereference(dev->xps_maps); + if (maps_sz < L1_CACHE_BYTES) + maps_sz = L1_CACHE_BYTES; /* allocate memory for queue storage */ - for_each_cpu_and(cpu, cpu_online_mask, mask) { + for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids), + j < nr_ids;) { if (!new_dev_maps) new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); if (!new_dev_maps) { @@ -2238,73 +2303,85 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, return -ENOMEM; } - tci = cpu * num_tc + tc; - map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) : + tci = j * num_tc + tc; + map = dev_maps ? 
xmap_dereference(dev_maps->attr_map[tci]) : NULL; - map = expand_xps_map(map, cpu, index); + map = expand_xps_map(map, j, index, is_rxqs_map); if (!map) goto error; - RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } if (!new_dev_maps) goto out_no_new_maps; - for_each_possible_cpu(cpu) { + static_key_slow_inc(&xps_needed); + if (is_rxqs_map) + static_key_slow_inc(&xps_rxqs_needed); + + for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), + j < nr_ids;) { /* copy maps belonging to foreign traffic classes */ - for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) { + for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) { /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->cpu_map[tci]); - RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + map = xmap_dereference(dev_maps->attr_map[tci]); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } /* We need to explicitly update tci as prevous loop * could break out early if dev_maps is NULL. 
*/ - tci = cpu * num_tc + tc; + tci = j * num_tc + tc; - if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) { - /* add queue to CPU maps */ + if (netif_attr_test_mask(j, mask, nr_ids) && + netif_attr_test_online(j, online_mask, nr_ids)) { + /* add tx-queue to CPU/rx-queue maps */ int pos = 0; - map = xmap_dereference(new_dev_maps->cpu_map[tci]); + map = xmap_dereference(new_dev_maps->attr_map[tci]); while ((pos < map->len) && (map->queues[pos] != index)) pos++; if (pos == map->len) map->queues[map->len++] = index; #ifdef CONFIG_NUMA - if (numa_node_id == -2) - numa_node_id = cpu_to_node(cpu); - else if (numa_node_id != cpu_to_node(cpu)) - numa_node_id = -1; + if (!is_rxqs_map) { + if (numa_node_id == -2) + numa_node_id = cpu_to_node(j); + else if (numa_node_id != cpu_to_node(j)) + numa_node_id = -1; + } #endif } else if (dev_maps) { /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->cpu_map[tci]); - RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + map = xmap_dereference(dev_maps->attr_map[tci]); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } /* copy maps belonging to foreign traffic classes */ for (i = num_tc - tc, tci++; dev_maps && --i; tci++) { /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->cpu_map[tci]); - RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + map = xmap_dereference(dev_maps->attr_map[tci]); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } } - rcu_assign_pointer(dev->xps_maps, new_dev_maps); + if (is_rxqs_map) + rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps); + else + rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps); /* Cleanup old maps */ if (!dev_maps) goto out_no_old_maps; - for_each_possible_cpu(cpu) { - for (i = num_tc, tci = cpu * num_tc; i--; tci++) { - new_map = xmap_dereference(new_dev_maps->cpu_map[tci]); - map = xmap_dereference(dev_maps->cpu_map[tci]); + for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), + j 
< nr_ids;) { + for (i = num_tc, tci = j * num_tc; i--; tci++) { + new_map = xmap_dereference(new_dev_maps->attr_map[tci]); + map = xmap_dereference(dev_maps->attr_map[tci]); if (map && map != new_map) kfree_rcu(map, rcu); } @@ -2317,19 +2394,23 @@ out_no_old_maps: active = true; out_no_new_maps: - /* update Tx queue numa node */ - netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), - (numa_node_id >= 0) ? numa_node_id : - NUMA_NO_NODE); + if (!is_rxqs_map) { + /* update Tx queue numa node */ + netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), + (numa_node_id >= 0) ? + numa_node_id : NUMA_NO_NODE); + } if (!dev_maps) goto out_no_maps; - /* removes queue from unused CPUs */ - for_each_possible_cpu(cpu) { - for (i = tc, tci = cpu * num_tc; i--; tci++) + /* removes tx-queue from unused CPUs/rx-queues */ + for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), + j < nr_ids;) { + for (i = tc, tci = j * num_tc; i--; tci++) active |= remove_xps_queue(dev_maps, tci, index); - if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu)) + if (!netif_attr_test_mask(j, mask, nr_ids) || + !netif_attr_test_online(j, online_mask, nr_ids)) active |= remove_xps_queue(dev_maps, tci, index); for (i = num_tc - tc, tci++; --i; tci++) active |= remove_xps_queue(dev_maps, tci, index); @@ -2337,7 +2418,10 @@ out_no_new_maps: /* free map if not active */ if (!active) { - RCU_INIT_POINTER(dev->xps_maps, NULL); + if (is_rxqs_map) + RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); + else + RCU_INIT_POINTER(dev->xps_cpus_map, NULL); kfree_rcu(dev_maps, rcu); } @@ -2347,11 +2431,12 @@ out_no_maps: return 0; error: /* remove any maps that we added */ - for_each_possible_cpu(cpu) { - for (i = num_tc, tci = cpu * num_tc; i--; tci++) { - new_map = xmap_dereference(new_dev_maps->cpu_map[tci]); + for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), + j < nr_ids;) { + for (i = num_tc, tci = j * num_tc; i--; tci++) { + new_map = 
xmap_dereference(new_dev_maps->attr_map[tci]); map = dev_maps ? - xmap_dereference(dev_maps->cpu_map[tci]) : + xmap_dereference(dev_maps->attr_map[tci]) : NULL; if (new_map && new_map != map) kfree(new_map); @@ -2363,14 +2448,34 @@ error: kfree(new_dev_maps); return -ENOMEM; } + +int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, + u16 index) +{ + return __netif_set_xps_queue(dev, cpumask_bits(mask), index, false); +} EXPORT_SYMBOL(netif_set_xps_queue); #endif +static void netdev_unbind_all_sb_channels(struct net_device *dev) +{ + struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; + + /* Unbind any subordinate channels */ + while (txq-- != &dev->_tx[0]) { + if (txq->sb_dev) + netdev_unbind_sb_channel(dev, txq->sb_dev); + } +} + void netdev_reset_tc(struct net_device *dev) { #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, 0); #endif + netdev_unbind_all_sb_channels(dev); + + /* Reset TC configuration of device */ dev->num_tc = 0; memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); @@ -2399,11 +2504,77 @@ int netdev_set_num_tc(struct net_device *dev, u8 num_tc) #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, 0); #endif + netdev_unbind_all_sb_channels(dev); + dev->num_tc = num_tc; return 0; } EXPORT_SYMBOL(netdev_set_num_tc); +void netdev_unbind_sb_channel(struct net_device *dev, + struct net_device *sb_dev) +{ + struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; + +#ifdef CONFIG_XPS + netif_reset_xps_queues_gt(sb_dev, 0); +#endif + memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq)); + memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map)); + + while (txq-- != &dev->_tx[0]) { + if (txq->sb_dev == sb_dev) + txq->sb_dev = NULL; + } +} +EXPORT_SYMBOL(netdev_unbind_sb_channel); + +int netdev_bind_sb_channel_queue(struct net_device *dev, + struct net_device *sb_dev, + u8 tc, u16 count, u16 offset) +{ + /* Make certain the sb_dev and dev are already configured */ + if 
(sb_dev->num_tc >= 0 || tc >= dev->num_tc) + return -EINVAL; + + /* We cannot hand out queues we don't have */ + if ((offset + count) > dev->real_num_tx_queues) + return -EINVAL; + + /* Record the mapping */ + sb_dev->tc_to_txq[tc].count = count; + sb_dev->tc_to_txq[tc].offset = offset; + + /* Provide a way for Tx queue to find the tc_to_txq map or + * XPS map for itself. + */ + while (count--) + netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev; + + return 0; +} +EXPORT_SYMBOL(netdev_bind_sb_channel_queue); + +int netdev_set_sb_channel(struct net_device *dev, u16 channel) +{ + /* Do not use a multiqueue device to represent a subordinate channel */ + if (netif_is_multiqueue(dev)) + return -ENODEV; + + /* We allow channels 1 - 32767 to be used for subordinate channels. + * Channel 0 is meant to be "native" mode and used only to represent + * the main root device. We allow writing 0 to reset the device back + * to normal mode after being used as a subordinate channel. + */ + if (channel > S16_MAX) + return -EINVAL; + + dev->num_tc = -channel; + + return 0; +} +EXPORT_SYMBOL(netdev_set_sb_channel); + /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater than real_num_tx_queues stale skbs on the qdisc must be flushed. @@ -2615,24 +2786,26 @@ EXPORT_SYMBOL(netif_device_attach); * Returns a Tx hash based on the given packet descriptor a Tx queues' number * to be used as a distribution range. 
*/ -static u16 skb_tx_hash(const struct net_device *dev, struct sk_buff *skb) +static u16 skb_tx_hash(const struct net_device *dev, + const struct net_device *sb_dev, + struct sk_buff *skb) { u32 hash; u16 qoffset = 0; u16 qcount = dev->real_num_tx_queues; + if (dev->num_tc) { + u8 tc = netdev_get_prio_tc_map(dev, skb->priority); + + qoffset = sb_dev->tc_to_txq[tc].offset; + qcount = sb_dev->tc_to_txq[tc].count; + } + if (skb_rx_queue_recorded(skb)) { hash = skb_get_rx_queue(skb); while (unlikely(hash >= qcount)) hash -= qcount; - return hash; - } - - if (dev->num_tc) { - u8 tc = netdev_get_prio_tc_map(dev, skb->priority); - - qoffset = dev->tc_to_txq[tc].offset; - qcount = dev->tc_to_txq[tc].count; + return hash + qoffset; } return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; @@ -3376,32 +3549,64 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) } #endif /* CONFIG_NET_EGRESS */ -static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +#ifdef CONFIG_XPS +static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, + struct xps_dev_maps *dev_maps, unsigned int tci) +{ + struct xps_map *map; + int queue_index = -1; + + if (dev->num_tc) { + tci *= dev->num_tc; + tci += netdev_get_prio_tc_map(dev, skb->priority); + } + + map = rcu_dereference(dev_maps->attr_map[tci]); + if (map) { + if (map->len == 1) + queue_index = map->queues[0]; + else + queue_index = map->queues[reciprocal_scale( + skb_get_hash(skb), map->len)]; + if (unlikely(queue_index >= dev->real_num_tx_queues)) + queue_index = -1; + } + return queue_index; +} +#endif + +static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, + struct sk_buff *skb) { #ifdef CONFIG_XPS struct xps_dev_maps *dev_maps; - struct xps_map *map; + struct sock *sk = skb->sk; int queue_index = -1; + if (!static_key_false(&xps_needed)) + return -1; + rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_maps); + if 
(!static_key_false(&xps_rxqs_needed)) + goto get_cpus_map; + + dev_maps = rcu_dereference(sb_dev->xps_rxqs_map); if (dev_maps) { - unsigned int tci = skb->sender_cpu - 1; + int tci = sk_rx_queue_get(sk); - if (dev->num_tc) { - tci *= dev->num_tc; - tci += netdev_get_prio_tc_map(dev, skb->priority); - } + if (tci >= 0 && tci < dev->num_rx_queues) + queue_index = __get_xps_queue_idx(dev, skb, dev_maps, + tci); + } - map = rcu_dereference(dev_maps->cpu_map[tci]); - if (map) { - if (map->len == 1) - queue_index = map->queues[0]; - else - queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), - map->len)]; - if (unlikely(queue_index >= dev->real_num_tx_queues)) - queue_index = -1; +get_cpus_map: + if (queue_index < 0) { + dev_maps = rcu_dereference(sb_dev->xps_cpus_map); + if (dev_maps) { + unsigned int tci = skb->sender_cpu - 1; + + queue_index = __get_xps_queue_idx(dev, skb, dev_maps, + tci); } } rcu_read_unlock(); @@ -3412,17 +3617,36 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) #endif } -static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) +u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev, + select_queue_fallback_t fallback) +{ + return 0; +} +EXPORT_SYMBOL(dev_pick_tx_zero); + +u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev, + select_queue_fallback_t fallback) +{ + return (u16)raw_smp_processor_id() % dev->real_num_tx_queues; +} +EXPORT_SYMBOL(dev_pick_tx_cpu_id); + +static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) { struct sock *sk = skb->sk; int queue_index = sk_tx_queue_get(sk); + sb_dev = sb_dev ? 
: dev; + if (queue_index < 0 || skb->ooo_okay || queue_index >= dev->real_num_tx_queues) { - int new_index = get_xps_queue(dev, skb); + int new_index = get_xps_queue(dev, sb_dev, skb); if (new_index < 0) - new_index = skb_tx_hash(dev, skb); + new_index = skb_tx_hash(dev, sb_dev, skb); if (queue_index != new_index && sk && sk_fullsock(sk) && @@ -3437,7 +3661,7 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) struct netdev_queue *netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + struct net_device *sb_dev) { int queue_index = 0; @@ -3452,10 +3676,10 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_select_queue) - queue_index = ops->ndo_select_queue(dev, skb, accel_priv, + queue_index = ops->ndo_select_queue(dev, skb, sb_dev, __netdev_pick_tx); else - queue_index = __netdev_pick_tx(dev, skb); + queue_index = __netdev_pick_tx(dev, skb, sb_dev); queue_index = netdev_cap_txqueue(dev, queue_index); } @@ -3467,7 +3691,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, /** * __dev_queue_xmit - transmit a buffer * @skb: buffer to transmit - * @accel_priv: private data used for L2 forwarding offload + * @sb_dev: suboordinate device used for L2 forwarding offload * * Queue a buffer for transmission to a network device. The caller must * have set the device and priority and built the buffer before calling @@ -3490,7 +3714,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, * the BH enable code must have IRQs enabled so that it will not deadlock. 
* --BLG */ -static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) +static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) { struct net_device *dev = skb->dev; struct netdev_queue *txq; @@ -3529,7 +3753,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) else skb_dst_force(skb); - txq = netdev_pick_tx(dev, skb, accel_priv); + txq = netdev_pick_tx(dev, skb, sb_dev); q = rcu_dereference_bh(txq->qdisc); trace_net_dev_queue(skb); @@ -3603,9 +3827,9 @@ int dev_queue_xmit(struct sk_buff *skb) } EXPORT_SYMBOL(dev_queue_xmit); -int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) +int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev) { - return __dev_queue_xmit(skb, accel_priv); + return __dev_queue_xmit(skb, sb_dev); } EXPORT_SYMBOL(dev_queue_xmit_accel); @@ -4494,7 +4718,8 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, return 0; } -static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) +static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc, + struct packet_type **ppt_prev) { struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; @@ -4624,8 +4849,7 @@ skip_classify: if (pt_prev) { if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) goto drop; - else - ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + *ppt_prev = pt_prev; } else { drop: if (!deliver_exact) @@ -4643,6 +4867,18 @@ out: return ret; } +static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc) +{ + struct net_device *orig_dev = skb->dev; + struct packet_type *pt_prev = NULL; + int ret; + + ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev); + if (pt_prev) + ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + return ret; +} + /** * netif_receive_skb_core - special purpose version of netif_receive_skb * @skb: buffer to process @@ -4663,13 +4899,72 @@ int netif_receive_skb_core(struct sk_buff *skb) int ret; 
rcu_read_lock(); - ret = __netif_receive_skb_core(skb, false); + ret = __netif_receive_skb_one_core(skb, false); rcu_read_unlock(); return ret; } EXPORT_SYMBOL(netif_receive_skb_core); +static inline void __netif_receive_skb_list_ptype(struct list_head *head, + struct packet_type *pt_prev, + struct net_device *orig_dev) +{ + struct sk_buff *skb, *next; + + if (!pt_prev) + return; + if (list_empty(head)) + return; + if (pt_prev->list_func != NULL) + pt_prev->list_func(head, pt_prev, orig_dev); + else + list_for_each_entry_safe(skb, next, head, list) + pt_prev->func(skb, skb->dev, pt_prev, orig_dev); +} + +static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc) +{ + /* Fast-path assumptions: + * - There is no RX handler. + * - Only one packet_type matches. + * If either of these fails, we will end up doing some per-packet + * processing in-line, then handling the 'last ptype' for the whole + * sublist. This can't cause out-of-order delivery to any single ptype, + * because the 'last ptype' must be constant across the sublist, and all + * other ptypes are handled per-packet. 
+ */ + /* Current (common) ptype of sublist */ + struct packet_type *pt_curr = NULL; + /* Current (common) orig_dev of sublist */ + struct net_device *od_curr = NULL; + struct list_head sublist; + struct sk_buff *skb, *next; + + INIT_LIST_HEAD(&sublist); + list_for_each_entry_safe(skb, next, head, list) { + struct net_device *orig_dev = skb->dev; + struct packet_type *pt_prev = NULL; + + list_del(&skb->list); + __netif_receive_skb_core(skb, pfmemalloc, &pt_prev); + if (!pt_prev) + continue; + if (pt_curr != pt_prev || od_curr != orig_dev) { + /* dispatch old sublist */ + __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); + /* start new sublist */ + INIT_LIST_HEAD(&sublist); + pt_curr = pt_prev; + od_curr = orig_dev; + } + list_add_tail(&skb->list, &sublist); + } + + /* dispatch final sublist */ + __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); +} + static int __netif_receive_skb(struct sk_buff *skb) { int ret; @@ -4687,14 +4982,44 @@ static int __netif_receive_skb(struct sk_buff *skb) * context down to all allocation sites. */ noreclaim_flag = memalloc_noreclaim_save(); - ret = __netif_receive_skb_core(skb, true); + ret = __netif_receive_skb_one_core(skb, true); memalloc_noreclaim_restore(noreclaim_flag); } else - ret = __netif_receive_skb_core(skb, false); + ret = __netif_receive_skb_one_core(skb, false); return ret; } +static void __netif_receive_skb_list(struct list_head *head) +{ + unsigned long noreclaim_flag = 0; + struct sk_buff *skb, *next; + bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? 
*/ + + list_for_each_entry_safe(skb, next, head, list) { + if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) { + struct list_head sublist; + + /* Handle the previous sublist */ + list_cut_before(&sublist, head, &skb->list); + if (!list_empty(&sublist)) + __netif_receive_skb_list_core(&sublist, pfmemalloc); + pfmemalloc = !pfmemalloc; + /* See comments in __netif_receive_skb */ + if (pfmemalloc) + noreclaim_flag = memalloc_noreclaim_save(); + else + memalloc_noreclaim_restore(noreclaim_flag); + } + } + /* Handle the remaining sublist */ + if (!list_empty(head)) + __netif_receive_skb_list_core(head, pfmemalloc); + /* Restore pflags */ + if (pfmemalloc) + memalloc_noreclaim_restore(noreclaim_flag); +} + static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) { struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); @@ -4717,7 +5042,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) break; case XDP_QUERY_PROG: - xdp->prog_attached = !!old; xdp->prog_id = old ? 
old->aux->id : 0; break; @@ -4769,6 +5093,55 @@ static int netif_receive_skb_internal(struct sk_buff *skb) return ret; } +static void netif_receive_skb_list_internal(struct list_head *head) +{ + struct bpf_prog *xdp_prog = NULL; + struct sk_buff *skb, *next; + struct list_head sublist; + + INIT_LIST_HEAD(&sublist); + list_for_each_entry_safe(skb, next, head, list) { + net_timestamp_check(netdev_tstamp_prequeue, skb); + list_del(&skb->list); + if (!skb_defer_rx_timestamp(skb)) + list_add_tail(&skb->list, &sublist); + } + list_splice_init(&sublist, head); + + if (static_branch_unlikely(&generic_xdp_needed_key)) { + preempt_disable(); + rcu_read_lock(); + list_for_each_entry_safe(skb, next, head, list) { + xdp_prog = rcu_dereference(skb->dev->xdp_prog); + list_del(&skb->list); + if (do_xdp_generic(xdp_prog, skb) == XDP_PASS) + list_add_tail(&skb->list, &sublist); + } + rcu_read_unlock(); + preempt_enable(); + /* Put passed packets back on main list */ + list_splice_init(&sublist, head); + } + + rcu_read_lock(); +#ifdef CONFIG_RPS + if (static_key_false(&rps_needed)) { + list_for_each_entry_safe(skb, next, head, list) { + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu = get_rps_cpu(skb->dev, skb, &rflow); + + if (cpu >= 0) { + /* Will be handled, remove from list */ + list_del(&skb->list); + enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + } + } + } +#endif + __netif_receive_skb_list(head); + rcu_read_unlock(); +} + /** * netif_receive_skb - process receive buffer from network * @skb: buffer to process @@ -4792,6 +5165,28 @@ int netif_receive_skb(struct sk_buff *skb) } EXPORT_SYMBOL(netif_receive_skb); +/** + * netif_receive_skb_list - process many receive buffers from network + * @head: list of skbs to process. + * + * Since return value of netif_receive_skb() is normally ignored, and + * wouldn't be meaningful for a list, this function returns void. + * + * This function may only be called from softirq context and interrupts + * should be enabled. 
+ */ +void netif_receive_skb_list(struct list_head *head) +{ + struct sk_buff *skb; + + if (list_empty(head)) + return; + list_for_each_entry(skb, head, list) + trace_netif_receive_skb_list_entry(skb); + netif_receive_skb_list_internal(head); +} +EXPORT_SYMBOL(netif_receive_skb_list); + DEFINE_PER_CPU(struct work_struct, flush_works); /* Network device is going away, flush any packets still pending */ @@ -4875,42 +5270,50 @@ out: return netif_receive_skb_internal(skb); } -/* napi->gro_list contains packets ordered by age. - * youngest packets at the head of it. - * Complete skbs in reverse order to reduce latencies. - */ -void napi_gro_flush(struct napi_struct *napi, bool flush_old) +static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, + bool flush_old) { - struct sk_buff *skb, *prev = NULL; - - /* scan list and build reverse chain */ - for (skb = napi->gro_list; skb != NULL; skb = skb->next) { - skb->prev = prev; - prev = skb; - } - - for (skb = prev; skb; skb = prev) { - skb->next = NULL; + struct list_head *head = &napi->gro_hash[index].list; + struct sk_buff *skb, *p; + list_for_each_entry_safe_reverse(skb, p, head, list) { if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) return; - - prev = skb->prev; + list_del(&skb->list); + skb->next = NULL; napi_gro_complete(skb); - napi->gro_count--; + napi->gro_hash[index].count--; } - napi->gro_list = NULL; + if (!napi->gro_hash[index].count) + __clear_bit(index, &napi->gro_bitmask); +} + +/* napi->gro_hash[].list contains packets ordered by age. + * youngest packets at the head of it. + * Complete skbs in reverse order to reduce latencies. 
+ */ +void napi_gro_flush(struct napi_struct *napi, bool flush_old) +{ + u32 i; + + for (i = 0; i < GRO_HASH_BUCKETS; i++) { + if (test_bit(i, &napi->gro_bitmask)) + __napi_gro_flush_chain(napi, i, flush_old); + } } EXPORT_SYMBOL(napi_gro_flush); -static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) +static struct list_head *gro_list_prepare(struct napi_struct *napi, + struct sk_buff *skb) { - struct sk_buff *p; unsigned int maclen = skb->dev->hard_header_len; u32 hash = skb_get_hash_raw(skb); + struct list_head *head; + struct sk_buff *p; - for (p = napi->gro_list; p; p = p->next) { + head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list; + list_for_each_entry(p, head, list) { unsigned long diffs; NAPI_GRO_CB(p)->flush = 0; @@ -4933,6 +5336,8 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) maclen); NAPI_GRO_CB(p)->same_flow = !diffs; } + + return head; } static void skb_gro_reset_offset(struct sk_buff *skb) @@ -4975,20 +5380,41 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow) } } +static void gro_flush_oldest(struct list_head *head) +{ + struct sk_buff *oldest; + + oldest = list_last_entry(head, struct sk_buff, list); + + /* We are called with head length >= MAX_GRO_SKBS, so this is + * impossible. + */ + if (WARN_ON_ONCE(!oldest)) + return; + + /* Do not adjust napi->gro_hash[].count, caller is adding a new + * SKB to the chain. 
+ */ + list_del(&oldest->list); + napi_gro_complete(oldest); +} + static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { - struct sk_buff **pp = NULL; + u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); + struct list_head *head = &offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; - struct list_head *head = &offload_base; - int same_flow; + struct list_head *gro_head; + struct sk_buff *pp = NULL; enum gro_result ret; + int same_flow; int grow; if (netif_elide_gro(skb->dev)) goto normal; - gro_list_prepare(napi, skb); + gro_head = gro_list_prepare(napi, skb); rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { @@ -5022,7 +5448,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->csum_valid = 0; } - pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); + pp = ptype->callbacks.gro_receive(gro_head, skb); break; } rcu_read_unlock(); @@ -5039,12 +5465,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff ret = NAPI_GRO_CB(skb)->free ? 
GRO_MERGED_FREE : GRO_MERGED; if (pp) { - struct sk_buff *nskb = *pp; - - *pp = nskb->next; - nskb->next = NULL; - napi_gro_complete(nskb); - napi->gro_count--; + list_del(&pp->list); + pp->next = NULL; + napi_gro_complete(pp); + napi->gro_hash[hash].count--; } if (same_flow) @@ -5053,26 +5477,16 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (NAPI_GRO_CB(skb)->flush) goto normal; - if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { - struct sk_buff *nskb = napi->gro_list; - - /* locate the end of the list to select the 'oldest' flow */ - while (nskb->next) { - pp = &nskb->next; - nskb = *pp; - } - *pp = NULL; - nskb->next = NULL; - napi_gro_complete(nskb); + if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) { + gro_flush_oldest(gro_head); } else { - napi->gro_count++; + napi->gro_hash[hash].count++; } NAPI_GRO_CB(skb)->count = 1; NAPI_GRO_CB(skb)->age = jiffies; NAPI_GRO_CB(skb)->last = skb; skb_shinfo(skb)->gso_size = skb_gro_len(skb); - skb->next = napi->gro_list; - napi->gro_list = skb; + list_add(&skb->list, gro_head); ret = GRO_HELD; pull: @@ -5080,6 +5494,13 @@ pull: if (grow > 0) gro_pull_from_frag0(skb, grow); ok: + if (napi->gro_hash[hash].count) { + if (!test_bit(hash, &napi->gro_bitmask)) + __set_bit(hash, &napi->gro_bitmask); + } else if (test_bit(hash, &napi->gro_bitmask)) { + __clear_bit(hash, &napi->gro_bitmask); + } + return ret; normal: @@ -5478,7 +5899,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) NAPIF_STATE_IN_BUSY_POLL))) return false; - if (n->gro_list) { + if (n->gro_bitmask) { unsigned long timeout = 0; if (work_done) @@ -5687,21 +6108,31 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) /* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we do not react to a device IRQ. 
*/ - if (napi->gro_list && !napi_disable_pending(napi) && + if (napi->gro_bitmask && !napi_disable_pending(napi) && !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) __napi_schedule_irqoff(napi); return HRTIMER_NORESTART; } +static void init_gro_hash(struct napi_struct *napi) +{ + int i; + + for (i = 0; i < GRO_HASH_BUCKETS; i++) { + INIT_LIST_HEAD(&napi->gro_hash[i].list); + napi->gro_hash[i].count = 0; + } + napi->gro_bitmask = 0; +} + void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { INIT_LIST_HEAD(&napi->poll_list); hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; - napi->gro_count = 0; - napi->gro_list = NULL; + init_gro_hash(napi); napi->skb = NULL; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) @@ -5734,6 +6165,19 @@ void napi_disable(struct napi_struct *n) } EXPORT_SYMBOL(napi_disable); +static void flush_gro_hash(struct napi_struct *napi) +{ + int i; + + for (i = 0; i < GRO_HASH_BUCKETS; i++) { + struct sk_buff *skb, *n; + + list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list) + kfree_skb(skb); + napi->gro_hash[i].count = 0; + } +} + /* Must be called in process context */ void netif_napi_del(struct napi_struct *napi) { @@ -5743,9 +6187,8 @@ void netif_napi_del(struct napi_struct *napi) list_del_init(&napi->dev_list); napi_free_frags(napi); - kfree_skb_list(napi->gro_list); - napi->gro_list = NULL; - napi->gro_count = 0; + flush_gro_hash(napi); + napi->gro_bitmask = 0; } EXPORT_SYMBOL(netif_napi_del); @@ -5787,7 +6230,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) goto out_unlock; } - if (n->gro_list) { + if (n->gro_bitmask) { /* flush too old packets * If HZ < 1000, flush all packets. 
*/ @@ -7276,23 +7719,21 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); -void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, - struct netdev_bpf *xdp) +u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, + enum bpf_netdev_command cmd) { - memset(xdp, 0, sizeof(*xdp)); - xdp->command = XDP_QUERY_PROG; + struct netdev_bpf xdp; - /* Query must always succeed. */ - WARN_ON(bpf_op(dev, xdp) < 0); -} + if (!bpf_op) + return 0; -static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op) -{ - struct netdev_bpf xdp; + memset(&xdp, 0, sizeof(xdp)); + xdp.command = cmd; - __dev_xdp_query(dev, bpf_op, &xdp); + /* Query must always succeed. */ + WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG); - return xdp.prog_attached; + return xdp.prog_id; } static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, @@ -7326,12 +7767,19 @@ static void dev_xdp_uninstall(struct net_device *dev) if (!ndo_bpf) return; - __dev_xdp_query(dev, ndo_bpf, &xdp); - if (xdp.prog_attached == XDP_ATTACHED_NONE) - return; + memset(&xdp, 0, sizeof(xdp)); + xdp.command = XDP_QUERY_PROG; + WARN_ON(ndo_bpf(dev, &xdp)); + if (xdp.prog_id) + WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, + NULL)); - /* Program removal should always succeed */ - WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL)); + /* Remove HW offload */ + memset(&xdp, 0, sizeof(xdp)); + xdp.command = XDP_QUERY_PROG_HW; + if (!ndo_bpf(dev, &xdp) && xdp.prog_id) + WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, + NULL)); } /** @@ -7347,12 +7795,15 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags) { const struct net_device_ops *ops = dev->netdev_ops; + enum bpf_netdev_command query; struct bpf_prog *prog = NULL; bpf_op_t bpf_op, bpf_chk; int err; ASSERT_RTNL(); + query = flags & XDP_FLAGS_HW_MODE ? 
XDP_QUERY_PROG_HW : XDP_QUERY_PROG; + bpf_op = bpf_chk = ops->ndo_bpf; if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) return -EOPNOTSUPP; @@ -7362,10 +7813,11 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, bpf_chk = generic_xdp_install; if (fd >= 0) { - if (bpf_chk && __dev_xdp_attached(dev, bpf_chk)) + if (__dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG) || + __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG_HW)) return -EEXIST; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && - __dev_xdp_attached(dev, bpf_op)) + __dev_xdp_query(dev, bpf_op, query)) return -EBUSY; prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, @@ -8834,6 +9286,9 @@ static struct hlist_head * __net_init netdev_create_hash(void) /* Initialize per network namespace state */ static int __net_init netdev_init(struct net *net) { + BUILD_BUG_ON(GRO_HASH_BUCKETS > + 8 * FIELD_SIZEOF(struct napi_struct, gro_bitmask)); + if (net != &init_net) INIT_LIST_HEAD(&net->dev_base_head); @@ -9104,6 +9559,7 @@ static int __init net_dev_init(void) sd->cpu = i; #endif + init_gro_hash(&sd->backlog); sd->backlog.poll = process_backlog; sd->backlog.weight = weight_p; } diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index a04e1e88bf3a..90e8aa36881e 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -284,19 +284,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) case SIOCSIFTXQLEN: if (ifr->ifr_qlen < 0) return -EINVAL; - if (dev->tx_queue_len ^ ifr->ifr_qlen) { - unsigned int orig_len = dev->tx_queue_len; - - dev->tx_queue_len = ifr->ifr_qlen; - err = call_netdevice_notifiers( - NETDEV_CHANGE_TX_QUEUE_LEN, dev); - err = notifier_to_errno(err); - if (err) { - dev->tx_queue_len = orig_len; - return err; - } - } - return 0; + return dev_change_tx_queue_len(dev, ifr->ifr_qlen); case SIOCSIFNAME: ifr->ifr_newname[IFNAMSIZ-1] = '\0'; diff --git a/net/core/devlink.c b/net/core/devlink.c index 22099705cc41..65fc366a78a4 100644 --- 
a/net/core/devlink.c +++ b/net/core/devlink.c @@ -326,6 +326,57 @@ devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb, pool_type, p_tc_index); } +struct devlink_region { + struct devlink *devlink; + struct list_head list; + const char *name; + struct list_head snapshot_list; + u32 max_snapshots; + u32 cur_snapshots; + u64 size; +}; + +struct devlink_snapshot { + struct list_head list; + struct devlink_region *region; + devlink_snapshot_data_dest_t *data_destructor; + u64 data_len; + u8 *data; + u32 id; +}; + +static struct devlink_region * +devlink_region_get_by_name(struct devlink *devlink, const char *region_name) +{ + struct devlink_region *region; + + list_for_each_entry(region, &devlink->region_list, list) + if (!strcmp(region->name, region_name)) + return region; + + return NULL; +} + +static struct devlink_snapshot * +devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) +{ + struct devlink_snapshot *snapshot; + + list_for_each_entry(snapshot, &region->snapshot_list, list) + if (snapshot->id == id) + return snapshot; + + return NULL; +} + +static void devlink_region_snapshot_del(struct devlink_snapshot *snapshot) +{ + snapshot->region->cur_snapshots--; + list_del(&snapshot->list); + (*snapshot->data_destructor)(snapshot->data); + kfree(snapshot); +} + #define DEVLINK_NL_FLAG_NEED_DEVLINK BIT(0) #define DEVLINK_NL_FLAG_NEED_PORT BIT(1) #define DEVLINK_NL_FLAG_NEED_SB BIT(2) @@ -2604,6 +2655,919 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) return devlink->ops->reload(devlink, info->extack); } +static const struct devlink_param devlink_param_generic[] = { + { + .id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET, + .name = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_NAME, + .type = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_MAX_MACS, + .name = DEVLINK_PARAM_GENERIC_MAX_MACS_NAME, + .type = DEVLINK_PARAM_GENERIC_MAX_MACS_TYPE, + }, + { + .id = 
DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV, + .name = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT, + .name = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME, + .type = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE, + }, +}; + +static int devlink_param_generic_verify(const struct devlink_param *param) +{ + /* verify it match generic parameter by id and name */ + if (param->id > DEVLINK_PARAM_GENERIC_ID_MAX) + return -EINVAL; + if (strcmp(param->name, devlink_param_generic[param->id].name)) + return -ENOENT; + + WARN_ON(param->type != devlink_param_generic[param->id].type); + + return 0; +} + +static int devlink_param_driver_verify(const struct devlink_param *param) +{ + int i; + + if (param->id <= DEVLINK_PARAM_GENERIC_ID_MAX) + return -EINVAL; + /* verify no such name in generic params */ + for (i = 0; i <= DEVLINK_PARAM_GENERIC_ID_MAX; i++) + if (!strcmp(param->name, devlink_param_generic[i].name)) + return -EEXIST; + + return 0; +} + +static struct devlink_param_item * +devlink_param_find_by_name(struct list_head *param_list, + const char *param_name) +{ + struct devlink_param_item *param_item; + + list_for_each_entry(param_item, param_list, list) + if (!strcmp(param_item->param->name, param_name)) + return param_item; + return NULL; +} + +static struct devlink_param_item * +devlink_param_find_by_id(struct list_head *param_list, u32 param_id) +{ + struct devlink_param_item *param_item; + + list_for_each_entry(param_item, param_list, list) + if (param_item->param->id == param_id) + return param_item; + return NULL; +} + +static bool +devlink_param_cmode_is_supported(const struct devlink_param *param, + enum devlink_param_cmode cmode) +{ + return test_bit(cmode, &param->supported_cmodes); +} + +static int devlink_param_get(struct devlink *devlink, + const struct devlink_param *param, + struct devlink_param_gset_ctx *ctx) +{ + if (!param->get) + return -EOPNOTSUPP; + return 
param->get(devlink, param->id, ctx); +} + +static int devlink_param_set(struct devlink *devlink, + const struct devlink_param *param, + struct devlink_param_gset_ctx *ctx) +{ + if (!param->set) + return -EOPNOTSUPP; + return param->set(devlink, param->id, ctx); +} + +static int +devlink_param_type_to_nla_type(enum devlink_param_type param_type) +{ + switch (param_type) { + case DEVLINK_PARAM_TYPE_U8: + return NLA_U8; + case DEVLINK_PARAM_TYPE_U16: + return NLA_U16; + case DEVLINK_PARAM_TYPE_U32: + return NLA_U32; + case DEVLINK_PARAM_TYPE_STRING: + return NLA_STRING; + case DEVLINK_PARAM_TYPE_BOOL: + return NLA_FLAG; + default: + return -EINVAL; + } +} + +static int +devlink_nl_param_value_fill_one(struct sk_buff *msg, + enum devlink_param_type type, + enum devlink_param_cmode cmode, + union devlink_param_value val) +{ + struct nlattr *param_value_attr; + + param_value_attr = nla_nest_start(msg, DEVLINK_ATTR_PARAM_VALUE); + if (!param_value_attr) + goto nla_put_failure; + + if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_CMODE, cmode)) + goto value_nest_cancel; + + switch (type) { + case DEVLINK_PARAM_TYPE_U8: + if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu8)) + goto value_nest_cancel; + break; + case DEVLINK_PARAM_TYPE_U16: + if (nla_put_u16(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu16)) + goto value_nest_cancel; + break; + case DEVLINK_PARAM_TYPE_U32: + if (nla_put_u32(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu32)) + goto value_nest_cancel; + break; + case DEVLINK_PARAM_TYPE_STRING: + if (nla_put_string(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, + val.vstr)) + goto value_nest_cancel; + break; + case DEVLINK_PARAM_TYPE_BOOL: + if (val.vbool && + nla_put_flag(msg, DEVLINK_ATTR_PARAM_VALUE_DATA)) + goto value_nest_cancel; + break; + } + + nla_nest_end(msg, param_value_attr); + return 0; + +value_nest_cancel: + nla_nest_cancel(msg, param_value_attr); +nla_put_failure: + return -EMSGSIZE; +} + +static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink 
*devlink, + struct devlink_param_item *param_item, + enum devlink_command cmd, + u32 portid, u32 seq, int flags) +{ + union devlink_param_value param_value[DEVLINK_PARAM_CMODE_MAX + 1]; + const struct devlink_param *param = param_item->param; + struct devlink_param_gset_ctx ctx; + struct nlattr *param_values_list; + struct nlattr *param_attr; + int nla_type; + void *hdr; + int err; + int i; + + /* Get value from driver part to driverinit configuration mode */ + for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) { + if (!devlink_param_cmode_is_supported(param, i)) + continue; + if (i == DEVLINK_PARAM_CMODE_DRIVERINIT) { + if (!param_item->driverinit_value_valid) + return -EOPNOTSUPP; + param_value[i] = param_item->driverinit_value; + } else { + ctx.cmode = i; + err = devlink_param_get(devlink, param, &ctx); + if (err) + return err; + param_value[i] = ctx.val; + } + } + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto genlmsg_cancel; + param_attr = nla_nest_start(msg, DEVLINK_ATTR_PARAM); + if (!param_attr) + goto genlmsg_cancel; + if (nla_put_string(msg, DEVLINK_ATTR_PARAM_NAME, param->name)) + goto param_nest_cancel; + if (param->generic && nla_put_flag(msg, DEVLINK_ATTR_PARAM_GENERIC)) + goto param_nest_cancel; + + nla_type = devlink_param_type_to_nla_type(param->type); + if (nla_type < 0) + goto param_nest_cancel; + if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, nla_type)) + goto param_nest_cancel; + + param_values_list = nla_nest_start(msg, DEVLINK_ATTR_PARAM_VALUES_LIST); + if (!param_values_list) + goto param_nest_cancel; + + for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) { + if (!devlink_param_cmode_is_supported(param, i)) + continue; + err = devlink_nl_param_value_fill_one(msg, param->type, + i, param_value[i]); + if (err) + goto values_list_nest_cancel; + } + + nla_nest_end(msg, param_values_list); + nla_nest_end(msg, param_attr); + genlmsg_end(msg, hdr); + 
return 0; + +values_list_nest_cancel: + nla_nest_end(msg, param_values_list); +param_nest_cancel: + nla_nest_cancel(msg, param_attr); +genlmsg_cancel: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void devlink_param_notify(struct devlink *devlink, + struct devlink_param_item *param_item, + enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + err = devlink_nl_param_fill(msg, devlink, param_item, cmd, 0, 0, 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink_param_item *param_item; + struct devlink *devlink; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + mutex_lock(&devlink->lock); + list_for_each_entry(param_item, &devlink->param_list, list) { + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_param_fill(msg, devlink, param_item, + DEVLINK_CMD_PARAM_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + mutex_unlock(&devlink->lock); + goto out; + } + idx++; + } + mutex_unlock(&devlink->lock); + } +out: + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int +devlink_param_type_get_from_info(struct genl_info *info, + enum devlink_param_type *param_type) +{ + if (!info->attrs[DEVLINK_ATTR_PARAM_TYPE]) + return -EINVAL; + + switch (nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE])) { + case NLA_U8: + *param_type = DEVLINK_PARAM_TYPE_U8; + break; + case NLA_U16: + *param_type = DEVLINK_PARAM_TYPE_U16; + break; + case NLA_U32: + 
*param_type = DEVLINK_PARAM_TYPE_U32; + break; + case NLA_STRING: + *param_type = DEVLINK_PARAM_TYPE_STRING; + break; + case NLA_FLAG: + *param_type = DEVLINK_PARAM_TYPE_BOOL; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int +devlink_param_value_get_from_info(const struct devlink_param *param, + struct genl_info *info, + union devlink_param_value *value) +{ + if (param->type != DEVLINK_PARAM_TYPE_BOOL && + !info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) + return -EINVAL; + + switch (param->type) { + case DEVLINK_PARAM_TYPE_U8: + value->vu8 = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + break; + case DEVLINK_PARAM_TYPE_U16: + value->vu16 = nla_get_u16(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + break; + case DEVLINK_PARAM_TYPE_U32: + value->vu32 = nla_get_u32(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + break; + case DEVLINK_PARAM_TYPE_STRING: + if (nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) > + DEVLINK_PARAM_MAX_STRING_VALUE) + return -EINVAL; + value->vstr = nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + break; + case DEVLINK_PARAM_TYPE_BOOL: + value->vbool = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA] ? 
+ true : false; + break; + } + return 0; +} + +static struct devlink_param_item * +devlink_param_get_from_info(struct devlink *devlink, + struct genl_info *info) +{ + char *param_name; + + if (!info->attrs[DEVLINK_ATTR_PARAM_NAME]) + return NULL; + + param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]); + return devlink_param_find_by_name(&devlink->param_list, param_name); +} + +static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_param_item *param_item; + struct sk_buff *msg; + int err; + + param_item = devlink_param_get_from_info(devlink, info); + if (!param_item) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_param_fill(msg, devlink, param_item, + DEVLINK_CMD_PARAM_GET, + info->snd_portid, info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + enum devlink_param_type param_type; + struct devlink_param_gset_ctx ctx; + enum devlink_param_cmode cmode; + struct devlink_param_item *param_item; + const struct devlink_param *param; + union devlink_param_value value; + int err = 0; + + param_item = devlink_param_get_from_info(devlink, info); + if (!param_item) + return -EINVAL; + param = param_item->param; + err = devlink_param_type_get_from_info(info, &param_type); + if (err) + return err; + if (param_type != param->type) + return -EINVAL; + err = devlink_param_value_get_from_info(param, info, &value); + if (err) + return err; + if (param->validate) { + err = param->validate(devlink, param->id, value, info->extack); + if (err) + return err; + } + + if (!info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]) + return -EINVAL; + cmode = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]); + if 
(!devlink_param_cmode_is_supported(param, cmode)) + return -EOPNOTSUPP; + + if (cmode == DEVLINK_PARAM_CMODE_DRIVERINIT) { + param_item->driverinit_value = value; + param_item->driverinit_value_valid = true; + } else { + if (!param->set) + return -EOPNOTSUPP; + ctx.val = value; + ctx.cmode = cmode; + err = devlink_param_set(devlink, param, &ctx); + if (err) + return err; + } + + devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); + return 0; +} + +static int devlink_param_register_one(struct devlink *devlink, + const struct devlink_param *param) +{ + struct devlink_param_item *param_item; + + if (devlink_param_find_by_name(&devlink->param_list, + param->name)) + return -EEXIST; + + if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT)) + WARN_ON(param->get || param->set); + else + WARN_ON(!param->get || !param->set); + + param_item = kzalloc(sizeof(*param_item), GFP_KERNEL); + if (!param_item) + return -ENOMEM; + param_item->param = param; + + list_add_tail(&param_item->list, &devlink->param_list); + devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); + return 0; +} + +static void devlink_param_unregister_one(struct devlink *devlink, + const struct devlink_param *param) +{ + struct devlink_param_item *param_item; + + param_item = devlink_param_find_by_name(&devlink->param_list, + param->name); + WARN_ON(!param_item); + devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_DEL); + list_del(&param_item->list); + kfree(param_item); +} + +static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg, + struct devlink *devlink, + struct devlink_snapshot *snapshot) +{ + struct nlattr *snap_attr; + int err; + + snap_attr = nla_nest_start(msg, DEVLINK_ATTR_REGION_SNAPSHOT); + if (!snap_attr) + return -EINVAL; + + err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, snapshot->id); + if (err) + goto nla_put_failure; + + nla_nest_end(msg, snap_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(msg, snap_attr); + return err; 
+} + +static int devlink_nl_region_snapshots_id_put(struct sk_buff *msg, + struct devlink *devlink, + struct devlink_region *region) +{ + struct devlink_snapshot *snapshot; + struct nlattr *snapshots_attr; + int err; + + snapshots_attr = nla_nest_start(msg, DEVLINK_ATTR_REGION_SNAPSHOTS); + if (!snapshots_attr) + return -EINVAL; + + list_for_each_entry(snapshot, &region->snapshot_list, list) { + err = devlink_nl_region_snapshot_id_put(msg, devlink, snapshot); + if (err) + goto nla_put_failure; + } + + nla_nest_end(msg, snapshots_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(msg, snapshots_attr); + return err; +} + +static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink, + enum devlink_command cmd, u32 portid, + u32 seq, int flags, + struct devlink_region *region) +{ + void *hdr; + int err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + err = devlink_nl_put_handle(msg, devlink); + if (err) + goto nla_put_failure; + + err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->name); + if (err) + goto nla_put_failure; + + err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE, + region->size, + DEVLINK_ATTR_PAD); + if (err) + goto nla_put_failure; + + err = devlink_nl_region_snapshots_id_put(msg, devlink, region); + if (err) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return err; +} + +static void devlink_nl_region_notify(struct devlink_region *region, + struct devlink_snapshot *snapshot, + enum devlink_command cmd) +{ + struct devlink *devlink = region->devlink; + struct sk_buff *msg; + void *hdr; + int err; + + WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd); + if (!hdr) + goto out_free_msg; + + err = devlink_nl_put_handle(msg, devlink); + if (err)
+ goto out_cancel_msg; + + err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, + region->name); + if (err) + goto out_cancel_msg; + + if (snapshot) { + err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, + snapshot->id); + if (err) + goto out_cancel_msg; + } else { + err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE, + region->size, DEVLINK_ATTR_PAD); + if (err) + goto out_cancel_msg; + } + genlmsg_end(msg, hdr); + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + + return; + +out_cancel_msg: + genlmsg_cancel(msg, hdr); +out_free_msg: + nlmsg_free(msg); +} + +static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_region *region; + const char *region_name; + struct sk_buff *msg; + int err; + + if (!info->attrs[DEVLINK_ATTR_REGION_NAME]) + return -EINVAL; + + region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); + region = devlink_region_get_by_name(devlink, region_name); + if (!region) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_region_fill(msg, devlink, DEVLINK_CMD_REGION_GET, + info->snd_portid, info->snd_seq, 0, + region); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink_region *region; + struct devlink *devlink; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + + mutex_lock(&devlink->lock); + list_for_each_entry(region, &devlink->region_list, list) { + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_region_fill(msg, devlink, + DEVLINK_CMD_REGION_GET, + 
NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, region); + if (err) { + mutex_unlock(&devlink->lock); + goto out; + } + idx++; + } + mutex_unlock(&devlink->lock); + } +out: + mutex_unlock(&devlink_mutex); + cb->args[0] = idx; + return msg->len; +} + +static int devlink_nl_cmd_region_del(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_snapshot *snapshot; + struct devlink_region *region; + const char *region_name; + u32 snapshot_id; + + if (!info->attrs[DEVLINK_ATTR_REGION_NAME] || + !info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) + return -EINVAL; + + region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); + snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]); + + region = devlink_region_get_by_name(devlink, region_name); + if (!region) + return -EINVAL; + + snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); + if (!snapshot) + return -EINVAL; + + devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL); + devlink_region_snapshot_del(snapshot); + return 0; +} + +static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg, + struct devlink *devlink, + u8 *chunk, u32 chunk_size, + u64 addr) +{ + struct nlattr *chunk_attr; + int err; + + chunk_attr = nla_nest_start(msg, DEVLINK_ATTR_REGION_CHUNK); + if (!chunk_attr) + return -EINVAL; + + err = nla_put(msg, DEVLINK_ATTR_REGION_CHUNK_DATA, chunk_size, chunk); + if (err) + goto nla_put_failure; + + err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_CHUNK_ADDR, addr, + DEVLINK_ATTR_PAD); + if (err) + goto nla_put_failure; + + nla_nest_end(msg, chunk_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(msg, chunk_attr); + return err; +} + +#define DEVLINK_REGION_READ_CHUNK_SIZE 256 + +static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, + struct devlink *devlink, + struct devlink_region *region, + struct nlattr **attrs, + u64 start_offset, + u64 end_offset, + bool 
dump, + u64 *new_offset) +{ + struct devlink_snapshot *snapshot; + u64 curr_offset = start_offset; + u32 snapshot_id; + int err = 0; + + *new_offset = start_offset; + + snapshot_id = nla_get_u32(attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]); + snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); + if (!snapshot) + return -EINVAL; + + if (end_offset > snapshot->data_len || dump) + end_offset = snapshot->data_len; + + while (curr_offset < end_offset) { + u32 data_size; + u8 *data; + + if (end_offset - curr_offset < DEVLINK_REGION_READ_CHUNK_SIZE) + data_size = end_offset - curr_offset; + else + data_size = DEVLINK_REGION_READ_CHUNK_SIZE; + + data = &snapshot->data[curr_offset]; + err = devlink_nl_cmd_region_read_chunk_fill(skb, devlink, + data, data_size, + curr_offset); + if (err) + break; + + curr_offset += data_size; + } + *new_offset = curr_offset; + + return err; +} + +static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + u64 ret_offset, start_offset, end_offset = 0; + struct nlattr *attrs[DEVLINK_ATTR_MAX + 1]; + const struct genl_ops *ops = cb->data; + struct devlink_region *region; + struct nlattr *chunks_attr; + const char *region_name; + struct devlink *devlink; + bool dump = true; + void *hdr; + int err; + + start_offset = *((u64 *)&cb->args[0]); + + err = nlmsg_parse(cb->nlh, GENL_HDRLEN + devlink_nl_family.hdrsize, + attrs, DEVLINK_ATTR_MAX, ops->policy, NULL); + if (err) + goto out; + + devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); + if (IS_ERR(devlink)) + goto out; + + mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); + + if (!attrs[DEVLINK_ATTR_REGION_NAME] || + !attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) + goto out_unlock; + + region_name = nla_data(attrs[DEVLINK_ATTR_REGION_NAME]); + region = devlink_region_get_by_name(devlink, region_name); + if (!region) + goto out_unlock; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &devlink_nl_family, 
NLM_F_ACK | NLM_F_MULTI, + DEVLINK_CMD_REGION_READ); + if (!hdr) + goto out_unlock; + + err = devlink_nl_put_handle(skb, devlink); + if (err) + goto nla_put_failure; + + err = nla_put_string(skb, DEVLINK_ATTR_REGION_NAME, region_name); + if (err) + goto nla_put_failure; + + chunks_attr = nla_nest_start(skb, DEVLINK_ATTR_REGION_CHUNKS); + if (!chunks_attr) + goto nla_put_failure; + + if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && + attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) { + if (!start_offset) + start_offset = + nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); + + end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); + end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]); + dump = false; + } + + err = devlink_nl_region_read_snapshot_fill(skb, devlink, + region, attrs, + start_offset, + end_offset, dump, + &ret_offset); + + if (err && err != -EMSGSIZE) + goto nla_put_failure; + + /* Check if there was any progress done to prevent infinite loop */ + if (ret_offset == start_offset) + goto nla_put_failure; + + *((u64 *)&cb->args[0]) = ret_offset; + + nla_nest_end(skb, chunks_attr); + genlmsg_end(skb, hdr); + mutex_unlock(&devlink->lock); + mutex_unlock(&devlink_mutex); + + return skb->len; + +nla_put_failure: + genlmsg_cancel(skb, hdr); +out_unlock: + mutex_unlock(&devlink->lock); + mutex_unlock(&devlink_mutex); +out: + return 0; +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -2624,6 +3588,11 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 }, [DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64}, [DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64}, + [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_PARAM_TYPE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 }, + 
[DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -2807,6 +3776,43 @@ static const struct genl_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | DEVLINK_NL_FLAG_NO_LOCK, }, + { + .cmd = DEVLINK_CMD_PARAM_GET, + .doit = devlink_nl_cmd_param_get_doit, + .dumpit = devlink_nl_cmd_param_get_dumpit, + .policy = devlink_nl_policy, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_PARAM_SET, + .doit = devlink_nl_cmd_param_set_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_REGION_GET, + .doit = devlink_nl_cmd_region_get_doit, + .dumpit = devlink_nl_cmd_region_get_dumpit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_REGION_DEL, + .doit = devlink_nl_cmd_region_del, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_REGION_READ, + .dumpit = devlink_nl_cmd_region_read_dumpit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { @@ -2845,6 +3851,8 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); INIT_LIST_HEAD(&devlink->resource_list); + INIT_LIST_HEAD(&devlink->param_list); + INIT_LIST_HEAD(&devlink->region_list); mutex_init(&devlink->lock); return devlink; } @@ -3434,6 +4442,320 @@ out: } EXPORT_SYMBOL_GPL(devlink_resource_occ_get_unregister); +/** + * devlink_params_register - register configuration parameters + * + * @devlink: devlink + * 
@params: configuration parameters array + * @params_count: number of parameters provided + * + * Register the configuration parameters supported by the driver. + */ +int devlink_params_register(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count) +{ + const struct devlink_param *param = params; + int i; + int err; + + mutex_lock(&devlink->lock); + for (i = 0; i < params_count; i++, param++) { + if (!param || !param->name || !param->supported_cmodes) { + err = -EINVAL; + goto rollback; + } + if (param->generic) { + err = devlink_param_generic_verify(param); + if (err) + goto rollback; + } else { + err = devlink_param_driver_verify(param); + if (err) + goto rollback; + } + err = devlink_param_register_one(devlink, param); + if (err) + goto rollback; + } + + mutex_unlock(&devlink->lock); + return 0; + +rollback: + if (!i) + goto unlock; + for (param--; i > 0; i--, param--) + devlink_param_unregister_one(devlink, param); +unlock: + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_params_register); + +/** + * devlink_params_unregister - unregister configuration parameters + * @devlink: devlink + * @params: configuration parameters to unregister + * @params_count: number of parameters provided + */ +void devlink_params_unregister(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count) +{ + const struct devlink_param *param = params; + int i; + + mutex_lock(&devlink->lock); + for (i = 0; i < params_count; i++, param++) + devlink_param_unregister_one(devlink, param); + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devlink_params_unregister); + +/** + * devlink_param_driverinit_value_get - get configuration parameter + * value for driver initializing + * + * @devlink: devlink + * @param_id: parameter ID + * @init_val: value of parameter in driverinit configuration mode + * + * This function should be used by the driver to get driverinit + * configuration for initialization after 
reload command. + */ +int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, + union devlink_param_value *init_val) +{ + struct devlink_param_item *param_item; + + if (!devlink->ops || !devlink->ops->reload) + return -EOPNOTSUPP; + + param_item = devlink_param_find_by_id(&devlink->param_list, param_id); + if (!param_item) + return -EINVAL; + + if (!param_item->driverinit_value_valid || + !devlink_param_cmode_is_supported(param_item->param, + DEVLINK_PARAM_CMODE_DRIVERINIT)) + return -EOPNOTSUPP; + + *init_val = param_item->driverinit_value; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get); + +/** + * devlink_param_driverinit_value_set - set value of configuration + * parameter for driverinit + * configuration mode + * + * @devlink: devlink + * @param_id: parameter ID + * @init_val: value of parameter to set for driverinit configuration mode + * + * This function should be used by the driver to set driverinit + * configuration mode default value. + */ +int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, + union devlink_param_value init_val) +{ + struct devlink_param_item *param_item; + + param_item = devlink_param_find_by_id(&devlink->param_list, param_id); + if (!param_item) + return -EINVAL; + + if (!devlink_param_cmode_is_supported(param_item->param, + DEVLINK_PARAM_CMODE_DRIVERINIT)) + return -EOPNOTSUPP; + + param_item->driverinit_value = init_val; + param_item->driverinit_value_valid = true; + + devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); + return 0; +} +EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set); + +/** + * devlink_param_value_changed - notify devlink on a parameter's value + * change. Should be called by the driver + * right after the change. + * + * @devlink: devlink + * @param_id: parameter ID + * + * This function should be used by the driver to notify devlink on value + * change, excluding driverinit configuration mode. 
+ * For driverinit configuration mode driver should use the function + * devlink_param_driverinit_value_set() instead. + */ +void devlink_param_value_changed(struct devlink *devlink, u32 param_id) +{ + struct devlink_param_item *param_item; + + param_item = devlink_param_find_by_id(&devlink->param_list, param_id); + WARN_ON(!param_item); + + devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); +} +EXPORT_SYMBOL_GPL(devlink_param_value_changed); + +/** + * devlink_region_create - create a new address region + * + * @devlink: devlink + * @region_name: region name + * @region_max_snapshots: Maximum supported number of snapshots for region + * @region_size: size of region + */ +struct devlink_region *devlink_region_create(struct devlink *devlink, + const char *region_name, + u32 region_max_snapshots, + u64 region_size) +{ + struct devlink_region *region; + int err = 0; + + mutex_lock(&devlink->lock); + + if (devlink_region_get_by_name(devlink, region_name)) { + err = -EEXIST; + goto unlock; + } + + region = kzalloc(sizeof(*region), GFP_KERNEL); + if (!region) { + err = -ENOMEM; + goto unlock; + } + + region->devlink = devlink; + region->max_snapshots = region_max_snapshots; + region->name = region_name; + region->size = region_size; + INIT_LIST_HEAD(&region->snapshot_list); + list_add_tail(&region->list, &devlink->region_list); + devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); + + mutex_unlock(&devlink->lock); + return region; + +unlock: + mutex_unlock(&devlink->lock); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(devlink_region_create); + +/** + * devlink_region_destroy - destroy address region + * + * @region: devlink region to destroy + */ +void devlink_region_destroy(struct devlink_region *region) +{ + struct devlink *devlink = region->devlink; + struct devlink_snapshot *snapshot, *ts; + + mutex_lock(&devlink->lock); + + /* Free all snapshots of region */ + list_for_each_entry_safe(snapshot, ts, &region->snapshot_list, list) +
devlink_region_snapshot_del(snapshot); + + list_del(&region->list); + + devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL); + mutex_unlock(&devlink->lock); + kfree(region); +} +EXPORT_SYMBOL_GPL(devlink_region_destroy); + +/** + * devlink_region_shapshot_id_get - get snapshot ID + * + * This callback should be called when adding a new snapshot, + * Driver should use the same id for multiple snapshots taken + * on multiple regions at the same time/by the same trigger. + * + * @devlink: devlink + */ +u32 devlink_region_shapshot_id_get(struct devlink *devlink) +{ + u32 id; + + mutex_lock(&devlink->lock); + id = ++devlink->snapshot_id; + mutex_unlock(&devlink->lock); + + return id; +} +EXPORT_SYMBOL_GPL(devlink_region_shapshot_id_get); + +/** + * devlink_region_snapshot_create - create a new snapshot + * This will add a new snapshot of a region. The snapshot + * will be stored on the region struct and can be accessed + * from devlink. This is useful for future analyses of snapshots. + * Multiple snapshots can be created on a region. + * The @snapshot_id should be obtained using the getter function.
+ * + * @devlink_region: devlink region of the snapshot + * @data_len: size of snapshot data + * @data: snapshot data + * @snapshot_id: snapshot id to be created + * @data_destructor: pointer to destructor function to free data + */ +int devlink_region_snapshot_create(struct devlink_region *region, u64 data_len, + u8 *data, u32 snapshot_id, + devlink_snapshot_data_dest_t *data_destructor) +{ + struct devlink *devlink = region->devlink; + struct devlink_snapshot *snapshot; + int err; + + mutex_lock(&devlink->lock); + + /* check if region can hold one more snapshot */ + if (region->cur_snapshots == region->max_snapshots) { + err = -ENOMEM; + goto unlock; + } + + if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { + err = -EEXIST; + goto unlock; + } + + snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL); + if (!snapshot) { + err = -ENOMEM; + goto unlock; + } + + snapshot->id = snapshot_id; + snapshot->region = region; + snapshot->data = data; + snapshot->data_len = data_len; + snapshot->data_destructor = data_destructor; + + list_add_tail(&snapshot->list, &region->snapshot_list); + + region->cur_snapshots++; + + devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW); + mutex_unlock(&devlink->lock); + return 0; + +unlock: + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_region_snapshot_create); + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index e677a20180cf..c9993c6c2fd4 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -111,6 +111,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload", [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record", [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload", + [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload", }; static const char diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index
126ffc5bc630..f64aa13811ea 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -416,6 +416,14 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops, if (rule->mark && r->mark != rule->mark) continue; + if (rule->suppress_ifgroup != -1 && + r->suppress_ifgroup != rule->suppress_ifgroup) + continue; + + if (rule->suppress_prefixlen != -1 && + r->suppress_prefixlen != rule->suppress_prefixlen) + continue; + if (rule->mark_mask && r->mark_mask != rule->mark_mask) continue; @@ -436,6 +444,9 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops, if (rule->ip_proto && r->ip_proto != rule->ip_proto) continue; + if (rule->proto && r->proto != rule->proto) + continue; + if (fib_rule_port_range_set(&rule->sport_range) && !fib_rule_port_range_compare(&r->sport_range, &rule->sport_range)) @@ -645,6 +656,73 @@ errout: return err; } +static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, + struct nlattr **tb, struct fib_rule *rule) +{ + struct fib_rule *r; + + list_for_each_entry(r, &ops->rules_list, list) { + if (r->action != rule->action) + continue; + + if (r->table != rule->table) + continue; + + if (r->pref != rule->pref) + continue; + + if (memcmp(r->iifname, rule->iifname, IFNAMSIZ)) + continue; + + if (memcmp(r->oifname, rule->oifname, IFNAMSIZ)) + continue; + + if (r->mark != rule->mark) + continue; + + if (r->suppress_ifgroup != rule->suppress_ifgroup) + continue; + + if (r->suppress_prefixlen != rule->suppress_prefixlen) + continue; + + if (r->mark_mask != rule->mark_mask) + continue; + + if (r->tun_id != rule->tun_id) + continue; + + if (r->fr_net != rule->fr_net) + continue; + + if (r->l3mdev != rule->l3mdev) + continue; + + if (!uid_eq(r->uid_range.start, rule->uid_range.start) || + !uid_eq(r->uid_range.end, rule->uid_range.end)) + continue; + + if (r->ip_proto != rule->ip_proto) + continue; + + if (r->proto != rule->proto) + continue; + + if (!fib_rule_port_range_compare(&r->sport_range, + &rule->sport_range)) + 
continue; + + if (!fib_rule_port_range_compare(&r->dport_range, + &rule->dport_range)) + continue; + + if (!ops->compare(r, frh, tb)) + continue; + return 1; + } + return 0; +} + int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -679,7 +757,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; if ((nlh->nlmsg_flags & NLM_F_EXCL) && - rule_find(ops, frh, tb, rule, user_priority)) { + rule_exists(ops, frh, tb, rule)) { err = -EEXIST; goto errout_free; } diff --git a/net/core/filter.c b/net/core/filter.c index e7f12e9f598c..104d560946da 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -459,11 +459,21 @@ static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) (!unaligned_ok && offset >= 0 && offset + ip_align >= 0 && offset + ip_align % size == 0))) { + bool ldx_off_ok = offset <= S16_MAX; + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); - *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian); - *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_D, - offset); + *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, + size, 2 + endian + (!ldx_off_ok * 2)); + if (ldx_off_ok) { + *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, + BPF_REG_D, offset); + } else { + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset); + *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, + BPF_REG_TMP, 0); + } if (endian) *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8); *insn++ = BPF_JMP_A(8); @@ -1762,6 +1772,37 @@ static const struct bpf_func_proto bpf_skb_pull_data_proto = { .arg2_type = ARG_ANYTHING, }; +static inline int sk_skb_try_make_writable(struct sk_buff *skb, + unsigned int write_len) +{ + int err = __bpf_try_make_writable(skb, write_len); + + bpf_compute_data_end_sk_skb(skb); + return err; +} + +BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, 
skb, u32, len) +{ + /* Idea is the following: should the needed direct read/write + * test fail during runtime, we can pull in more data and redo + * again, since implicitly, we invalidate previous checks here. + * + * Or, since we know how much we need to make read/writeable, + * this can be done once at the program beginning for direct + * access case. By this we overcome limitations of only current + * headroom being accessible. + */ + return sk_skb_try_make_writable(skb, len ? : skb_headlen(skb)); +} + +static const struct bpf_func_proto sk_skb_pull_data_proto = { + .func = sk_skb_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { @@ -2779,7 +2820,8 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) static u32 __bpf_skb_max_len(const struct sk_buff *skb) { - return skb->dev->mtu + skb->dev->hard_header_len; + return skb->dev ? 
skb->dev->mtu + skb->dev->hard_header_len : + SKB_MAX_ALLOC; } static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) @@ -2863,8 +2905,8 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) return __skb_trim_rcsum(skb, new_len); } -BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, - u64, flags) +static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len, + u64 flags) { u32 max_len = __bpf_skb_max_len(skb); u32 min_len = __bpf_skb_min_len(skb); @@ -2900,6 +2942,13 @@ BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, if (!ret && skb_is_gso(skb)) skb_gso_reset(skb); } + return ret; +} + +BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, + u64, flags) +{ + int ret = __bpf_skb_change_tail(skb, new_len, flags); bpf_compute_data_pointers(skb); return ret; @@ -2914,9 +2963,27 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, +BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) { + int ret = __bpf_skb_change_tail(skb, new_len, flags); + + bpf_compute_data_end_sk_skb(skb); + return ret; +} + +static const struct bpf_func_proto sk_skb_change_tail_proto = { + .func = sk_skb_change_tail, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, + u64 flags) +{ u32 max_len = __bpf_skb_max_len(skb); u32 new_len = skb->len + head_room; int ret; @@ -2941,8 +3008,16 @@ BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, skb_reset_mac_header(skb); } + return ret; +} + +BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, + u64, flags) +{ + int ret = __bpf_skb_change_head(skb, head_room, flags); + bpf_compute_data_pointers(skb); - return 0; + 
return ret; } static const struct bpf_func_proto bpf_skb_change_head_proto = { @@ -2954,6 +3029,23 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room, + u64, flags) +{ + int ret = __bpf_skb_change_head(skb, head_room, flags); + + bpf_compute_data_end_sk_skb(skb); + return ret; +} + +static const struct bpf_func_proto sk_skb_change_head_proto = { + .func = sk_skb_change_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) { return xdp_data_meta_unsupported(xdp) ? 0 : @@ -3046,12 +3138,16 @@ static int __bpf_tx_xdp(struct net_device *dev, u32 index) { struct xdp_frame *xdpf; - int sent; + int err, sent; if (!dev->netdev_ops->ndo_xdp_xmit) { return -EOPNOTSUPP; } + err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); + if (unlikely(err)) + return err; + xdpf = convert_to_xdp_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; @@ -3285,7 +3381,8 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, goto err; } - if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) + err = xdp_ok_fwd_dev(fwd, skb->len); + if (unlikely(err)) goto err; skb->dev = fwd; @@ -3582,7 +3679,7 @@ BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, if (unlikely(size > IP_TUNNEL_OPTS_MAX)) return -ENOMEM; - ip_tunnel_info_opts_set(info, from, size); + ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT); return 0; } @@ -4073,8 +4170,9 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, memcpy(params->smac, dev->dev_addr, ETH_ALEN); params->h_vlan_TCI = 0; params->h_vlan_proto = 0; + params->ifindex = dev->ifindex; - return dev->ifindex; + return 0; } #endif @@ -4098,7 +4196,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, /* 
verify forwarding is enabled on this interface */ in_dev = __in_dev_get_rcu(dev); if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) - return 0; + return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { fl4.flowi4_iif = 1; @@ -4123,7 +4221,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, tb = fib_get_table(net, tbid); if (unlikely(!tb)) - return 0; + return BPF_FIB_LKUP_RET_NOT_FWDED; err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); } else { @@ -4135,8 +4233,20 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); } - if (err || res.type != RTN_UNICAST) - return 0; + if (err) { + /* map fib lookup errors to RTN_ type */ + if (err == -EINVAL) + return BPF_FIB_LKUP_RET_BLACKHOLE; + if (err == -EHOSTUNREACH) + return BPF_FIB_LKUP_RET_UNREACHABLE; + if (err == -EACCES) + return BPF_FIB_LKUP_RET_PROHIBIT; + + return BPF_FIB_LKUP_RET_NOT_FWDED; + } + + if (res.type != RTN_UNICAST) + return BPF_FIB_LKUP_RET_NOT_FWDED; if (res.fi->fib_nhs > 1) fib_select_path(net, &res, &fl4, NULL); @@ -4144,19 +4254,16 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (check_mtu) { mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); if (params->tot_len > mtu) - return 0; + return BPF_FIB_LKUP_RET_FRAG_NEEDED; } nh = &res.fi->fib_nh[res.nh_sel]; /* do not handle lwt encaps right now */ if (nh->nh_lwtstate) - return 0; + return BPF_FIB_LKUP_RET_UNSUPP_LWT; dev = nh->nh_dev; - if (unlikely(!dev)) - return 0; - if (nh->nh_gw) params->ipv4_dst = nh->nh_gw; @@ -4166,10 +4273,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, * rcu_read_lock_bh is not needed here */ neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); - if (neigh) - return bpf_fib_set_fwd_params(params, neigh, dev); + if (!neigh) + return BPF_FIB_LKUP_RET_NO_NEIGH; - return 0; + return 
bpf_fib_set_fwd_params(params, neigh, dev); } #endif @@ -4190,7 +4297,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, /* link local addresses are never forwarded */ if (rt6_need_strict(dst) || rt6_need_strict(src)) - return 0; + return BPF_FIB_LKUP_RET_NOT_FWDED; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) @@ -4198,7 +4305,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, idev = __in6_dev_get_safely(dev); if (unlikely(!idev || !net->ipv6.devconf_all->forwarding)) - return 0; + return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { fl6.flowi6_iif = 1; @@ -4225,7 +4332,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, tb = ipv6_stub->fib6_get_table(net, tbid); if (unlikely(!tb)) - return 0; + return BPF_FIB_LKUP_RET_NOT_FWDED; f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict); } else { @@ -4238,11 +4345,23 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, } if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry)) - return 0; + return BPF_FIB_LKUP_RET_NOT_FWDED; + + if (unlikely(f6i->fib6_flags & RTF_REJECT)) { + switch (f6i->fib6_type) { + case RTN_BLACKHOLE: + return BPF_FIB_LKUP_RET_BLACKHOLE; + case RTN_UNREACHABLE: + return BPF_FIB_LKUP_RET_UNREACHABLE; + case RTN_PROHIBIT: + return BPF_FIB_LKUP_RET_PROHIBIT; + default: + return BPF_FIB_LKUP_RET_NOT_FWDED; + } + } - if (unlikely(f6i->fib6_flags & RTF_REJECT || - f6i->fib6_type != RTN_UNICAST)) - return 0; + if (f6i->fib6_type != RTN_UNICAST) + return BPF_FIB_LKUP_RET_NOT_FWDED; if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0) f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6, @@ -4252,11 +4371,11 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (check_mtu) { mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src); if (params->tot_len > mtu) - return 0; + return 
BPF_FIB_LKUP_RET_FRAG_NEEDED; } if (f6i->fib6_nh.nh_lwtstate) - return 0; + return BPF_FIB_LKUP_RET_UNSUPP_LWT; if (f6i->fib6_flags & RTF_GATEWAY) *dst = f6i->fib6_nh.nh_gw; @@ -4270,10 +4389,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, */ neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128, ndisc_hashfn, dst, dev); - if (neigh) - return bpf_fib_set_fwd_params(params, neigh, dev); + if (!neigh) + return BPF_FIB_LKUP_RET_NO_NEIGH; - return 0; + return bpf_fib_set_fwd_params(params, neigh, dev); } #endif @@ -4315,7 +4434,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, struct bpf_fib_lookup *, params, int, plen, u32, flags) { struct net *net = dev_net(skb->dev); - int index = -EAFNOSUPPORT; + int rc = -EAFNOSUPPORT; if (plen < sizeof(*params)) return -EINVAL; @@ -4326,25 +4445,25 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: - index = bpf_ipv4_fib_lookup(net, params, flags, false); + rc = bpf_ipv4_fib_lookup(net, params, flags, false); break; #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - index = bpf_ipv6_fib_lookup(net, params, flags, false); + rc = bpf_ipv6_fib_lookup(net, params, flags, false); break; #endif } - if (index > 0) { + if (!rc) { struct net_device *dev; - dev = dev_get_by_index_rcu(net, index); + dev = dev_get_by_index_rcu(net, params->ifindex); if (!is_skb_forwardable(dev, skb)) - index = 0; + rc = BPF_FIB_LKUP_RET_FRAG_NEEDED; } - return index; + return rc; } static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { @@ -4417,10 +4536,10 @@ static const struct bpf_func_proto bpf_lwt_push_encap_proto = { .arg4_type = ARG_CONST_SIZE }; +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, const void *, from, u32, len) { -#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); void *srh_tlvs, 
*srh_end, *ptr; @@ -4446,9 +4565,6 @@ BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, memcpy(skb->data + offset, from, len); return 0; -#else /* CONFIG_IPV6_SEG6_BPF */ - return -EOPNOTSUPP; -#endif } static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { @@ -4464,7 +4580,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, u32, action, void *, param, u32, param_len) { -#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); struct ipv6_sr_hdr *srh; @@ -4512,9 +4627,6 @@ BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, default: return -EINVAL; } -#else /* CONFIG_IPV6_SEG6_BPF */ - return -EOPNOTSUPP; -#endif } static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { @@ -4530,7 +4642,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, s32, len) { -#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); void *srh_end, *srh_tlvs, *ptr; @@ -4574,9 +4685,6 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, srh_state->hdrlen += len; srh_state->valid = 0; return 0; -#else /* CONFIG_IPV6_SEG6_BPF */ - return -EOPNOTSUPP; -#endif } static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { @@ -4587,6 +4695,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; +#endif /* CONFIG_IPV6_SEG6_BPF */ bool bpf_helper_changes_pkt_data(void *func) { @@ -4595,9 +4704,12 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_skb_store_bytes || func == bpf_skb_change_proto || func == bpf_skb_change_head || + func == sk_skb_change_head || func == bpf_skb_change_tail || + func == sk_skb_change_tail || func == bpf_skb_adjust_room || func == 
bpf_skb_pull_data || + func == sk_skb_pull_data || func == bpf_clone_redirect || func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || @@ -4605,11 +4717,12 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_xdp_adjust_meta || func == bpf_msg_pull_data || func == bpf_xdp_adjust_tail || - func == bpf_lwt_push_encap || +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) func == bpf_lwt_seg6_store_bytes || func == bpf_lwt_seg6_adjust_srh || - func == bpf_lwt_seg6_action - ) + func == bpf_lwt_seg6_action || +#endif + func == bpf_lwt_push_encap) return true; return false; @@ -4638,6 +4751,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_trace_printk: if (capable(CAP_SYS_ADMIN)) return bpf_get_trace_printk_proto(); + /* else: fall through */ default: return NULL; } @@ -4849,11 +4963,11 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_pull_data: - return &bpf_skb_pull_data_proto; + return &sk_skb_pull_data_proto; case BPF_FUNC_skb_change_tail: - return &bpf_skb_change_tail_proto; + return &sk_skb_change_tail_proto; case BPF_FUNC_skb_change_head: - return &bpf_skb_change_head_proto; + return &sk_skb_change_head_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: @@ -4944,12 +5058,14 @@ static const struct bpf_func_proto * lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) case BPF_FUNC_lwt_seg6_store_bytes: return &bpf_lwt_seg6_store_bytes_proto; case BPF_FUNC_lwt_seg6_action: return &bpf_lwt_seg6_action_proto; case BPF_FUNC_lwt_seg6_adjust_srh: return &bpf_lwt_seg6_adjust_srh_proto; +#endif default: return lwt_out_func_proto(func_id, prog); } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 53f96e4f7bf5..08a5184f4b34 100644 --- a/net/core/flow_dissector.c +++ 
b/net/core/flow_dissector.c @@ -152,7 +152,9 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL) && !dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_ENC_PORTS)) + FLOW_DISSECTOR_KEY_ENC_PORTS) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IP)) return; info = skb_tunnel_info(skb); @@ -212,6 +214,16 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, tp->src = key->tp_src; tp->dst = key->tp_dst; } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP)) { + struct flow_dissector_key_ip *ip; + + ip = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IP, + target_container); + ip->tos = key->tos; + ip->ttl = key->ttl; + } } EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); @@ -589,7 +601,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; enum flow_dissect_ret fdret; - bool skip_vlan = false; + enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; int num_hdrs = 0; u8 ip_proto = 0; bool ret; @@ -748,14 +760,14 @@ proto_again: } case htons(ETH_P_8021AD): case htons(ETH_P_8021Q): { - const struct vlan_hdr *vlan; + const struct vlan_hdr *vlan = NULL; struct vlan_hdr _vlan; - bool vlan_tag_present = skb && skb_vlan_tag_present(skb); + __be16 saved_vlan_tpid = proto; - if (vlan_tag_present) + if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX && + skb && skb_vlan_tag_present(skb)) { proto = skb->protocol; - - if (!vlan_tag_present || eth_type_vlan(skb->protocol)) { + } else { vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); if (!vlan) { @@ -765,20 +777,23 @@ proto_again: proto = vlan->h_vlan_encapsulated_proto; nhoff += sizeof(*vlan); - if (skip_vlan) { - fdret = FLOW_DISSECT_RET_PROTO_AGAIN; - break; - } } - skip_vlan = true; - if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_VLAN)) { + if (dissector_vlan == 
FLOW_DISSECTOR_KEY_MAX) { + dissector_vlan = FLOW_DISSECTOR_KEY_VLAN; + } else if (dissector_vlan == FLOW_DISSECTOR_KEY_VLAN) { + dissector_vlan = FLOW_DISSECTOR_KEY_CVLAN; + } else { + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + } + + if (dissector_uses_key(flow_dissector, dissector_vlan)) { key_vlan = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_VLAN, + dissector_vlan, target_container); - if (vlan_tag_present) { + if (!vlan) { key_vlan->vlan_id = skb_vlan_tag_get_id(skb); key_vlan->vlan_priority = (skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT); @@ -789,6 +804,7 @@ proto_again: (ntohs(vlan->h_vlan_TCI) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } + key_vlan->vlan_tpid = saved_vlan_tpid; } fdret = FLOW_DISSECT_RET_PROTO_AGAIN; diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index b2b2323bdc84..188d693cb251 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -77,8 +77,20 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type, d->lock = lock; spin_lock_bh(lock); } - if (d->tail) - return gnet_stats_copy(d, type, NULL, 0, padattr); + if (d->tail) { + int ret = gnet_stats_copy(d, type, NULL, 0, padattr); + + /* The initial attribute added in gnet_stats_copy() may be + * preceded by a padding attribute, in which case d->tail will + * end up pointing at the padding instead of the real attribute. + * Fix this so gnet_stats_finish_copy() adjusts the length of + * the right attribute. 
+ */ + if (ret == 0 && d->tail->nla_type == padattr) + d->tail = (struct nlattr *)((char *)d->tail + + NLA_ALIGN(d->tail->nla_len)); + return ret; + } return 0; } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8e3fda9e725c..aa19d86937af 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1148,7 +1148,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, neigh->nud_state = new; err = 0; notify = old & NUD_VALID; - if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && + if (((old & (NUD_INCOMPLETE | NUD_PROBE)) || + (flags & NEIGH_UPDATE_F_ADMIN)) && (new & NUD_FAILED)) { neigh_invalidate(neigh); notify = 1; @@ -3273,4 +3274,3 @@ static int __init neigh_init(void) } subsys_initcall(neigh_init); - diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index bb7e80f4ced3..0a95bcf64cdc 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -905,11 +905,20 @@ static const void *rx_queue_namespace(struct kobject *kobj) return ns; } +static void rx_queue_get_ownership(struct kobject *kobj, + kuid_t *uid, kgid_t *gid) +{ + const struct net *net = rx_queue_namespace(kobj); + + net_ns_get_ownership(net, uid, gid); +} + static struct kobj_type rx_queue_ktype __ro_after_init = { .sysfs_ops = &rx_queue_sysfs_ops, .release = rx_queue_release, .default_attrs = rx_queue_default_attrs, - .namespace = rx_queue_namespace + .namespace = rx_queue_namespace, + .get_ownership = rx_queue_get_ownership, }; static int rx_queue_add_kobject(struct net_device *dev, int index) @@ -1047,13 +1056,30 @@ static ssize_t traffic_class_show(struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; - int index = get_netdev_queue_index(queue); - int tc = netdev_txq_to_tc(dev, index); + int index; + int tc; + + if (!netif_is_multiqueue(dev)) + return -ENOENT; + index = get_netdev_queue_index(queue); + + /* If queue belongs to subordinate dev use its TC mapping */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? 
: dev; + + tc = netdev_txq_to_tc(dev, index); if (tc < 0) return -EINVAL; - return sprintf(buf, "%u\n", tc); + /* We can report the traffic class one of two ways: + * Subordinate device traffic classes are reported with the traffic + * class first, and then the subordinate class so for example TC0 on + * subordinate device 2 will be reported as "0-2". If the queue + * belongs to the root device it will be reported with just the + * traffic class, so just "0" for TC 0 for example. + */ + return dev->num_tc < 0 ? sprintf(buf, "%u%d\n", tc, dev->num_tc) : + sprintf(buf, "%u\n", tc); } #ifdef CONFIG_XPS @@ -1070,6 +1096,9 @@ static ssize_t tx_maxrate_store(struct netdev_queue *queue, int err, index = get_netdev_queue_index(queue); u32 rate = 0; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + err = kstrtou32(buf, 10, &rate); if (err < 0) return err; @@ -1214,10 +1243,20 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, cpumask_var_t mask; unsigned long index; + if (!netif_is_multiqueue(dev)) + return -ENOENT; + index = get_netdev_queue_index(queue); if (dev->num_tc) { + /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; + if (num_tc < 0) + return -EINVAL; + + /* If queue belongs to subordinate dev use its map */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? 
: dev; + tc = netdev_txq_to_tc(dev, index); if (tc < 0) return -EINVAL; @@ -1227,13 +1266,13 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, return -ENOMEM; rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_maps); + dev_maps = rcu_dereference(dev->xps_cpus_map); if (dev_maps) { for_each_possible_cpu(cpu) { int i, tci = cpu * num_tc + tc; struct xps_map *map; - map = rcu_dereference(dev_maps->cpu_map[tci]); + map = rcu_dereference(dev_maps->attr_map[tci]); if (!map) continue; @@ -1260,6 +1299,9 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, cpumask_var_t mask; int err; + if (!netif_is_multiqueue(dev)) + return -ENOENT; + if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -1283,6 +1325,88 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init = __ATTR_RW(xps_cpus); + +static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) +{ + struct net_device *dev = queue->dev; + struct xps_dev_maps *dev_maps; + unsigned long *mask, index; + int j, len, num_tc = 1, tc = 0; + + index = get_netdev_queue_index(queue); + + if (dev->num_tc) { + num_tc = dev->num_tc; + tc = netdev_txq_to_tc(dev, index); + if (tc < 0) + return -EINVAL; + } + mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long), + GFP_KERNEL); + if (!mask) + return -ENOMEM; + + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_rxqs_map); + if (!dev_maps) + goto out_no_maps; + + for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues), + j < dev->num_rx_queues;) { + int i, tci = j * num_tc + tc; + struct xps_map *map; + + map = rcu_dereference(dev_maps->attr_map[tci]); + if (!map) + continue; + + for (i = map->len; i--;) { + if (map->queues[i] == index) { + set_bit(j, mask); + break; + } + } + } +out_no_maps: + rcu_read_unlock(); + + len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues); + kfree(mask); + + return len < PAGE_SIZE ? 
len : -EINVAL; +} + +static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, + size_t len) +{ + struct net_device *dev = queue->dev; + struct net *net = dev_net(dev); + unsigned long *mask, index; + int err; + + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long), + GFP_KERNEL); + if (!mask) + return -ENOMEM; + + index = get_netdev_queue_index(queue); + + err = bitmap_parse(buf, len, mask, dev->num_rx_queues); + if (err) { + kfree(mask); + return err; + } + + err = __netif_set_xps_queue(dev, mask, index, true); + kfree(mask); + return err ? : len; +} + +static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init + = __ATTR_RW(xps_rxqs); #endif /* CONFIG_XPS */ static struct attribute *netdev_queue_default_attrs[] __ro_after_init = { @@ -1290,6 +1414,7 @@ static struct attribute *netdev_queue_default_attrs[] __ro_after_init = { &queue_traffic_class.attr, #ifdef CONFIG_XPS &xps_cpus_attribute.attr, + &xps_rxqs_attribute.attr, &queue_tx_maxrate.attr, #endif NULL @@ -1315,11 +1440,20 @@ static const void *netdev_queue_namespace(struct kobject *kobj) return ns; } +static void netdev_queue_get_ownership(struct kobject *kobj, + kuid_t *uid, kgid_t *gid) +{ + const struct net *net = netdev_queue_namespace(kobj); + + net_ns_get_ownership(net, uid, gid); +} + static struct kobj_type netdev_queue_ktype __ro_after_init = { .sysfs_ops = &netdev_queue_sysfs_ops, .release = netdev_queue_release, .default_attrs = netdev_queue_default_attrs, .namespace = netdev_queue_namespace, + .get_ownership = netdev_queue_get_ownership, }; static int netdev_queue_add_kobject(struct net_device *dev, int index) @@ -1509,6 +1643,14 @@ static const void *net_namespace(struct device *d) return dev_net(dev); } +static void net_get_ownership(struct device *d, kuid_t *uid, kgid_t *gid) +{ + struct net_device *dev = to_net_dev(d); + const struct net *net = dev_net(dev); + + 
net_ns_get_ownership(net, uid, gid); +} + static struct class net_class __ro_after_init = { .name = "net", .dev_release = netdev_release, @@ -1516,6 +1658,7 @@ static struct class net_class __ro_after_init = { .dev_uevent = netdev_uevent, .ns_type = &net_ns_type_operations, .namespace = net_namespace, + .get_ownership = net_get_ownership, }; #ifdef CONFIG_OF_NET diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index a11e03f920d3..738871af5efa 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -17,6 +17,7 @@ #include <linux/user_namespace.h> #include <linux/net_namespace.h> #include <linux/sched/task.h> +#include <linux/uidgid.h> #include <net/sock.h> #include <net/netlink.h> @@ -448,6 +449,33 @@ dec_ucounts: return net; } +/** + * net_ns_get_ownership - get sysfs ownership data for @net + * @net: network namespace in question (can be NULL) + * @uid: kernel user ID for sysfs objects + * @gid: kernel group ID for sysfs objects + * + * Returns the uid/gid pair of root in the user namespace associated with the + * given network namespace. 
+ */ +void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid) +{ + if (net) { + kuid_t ns_root_uid = make_kuid(net->user_ns, 0); + kgid_t ns_root_gid = make_kgid(net->user_ns, 0); + + if (uid_valid(ns_root_uid)) + *uid = ns_root_uid; + + if (gid_valid(ns_root_gid)) + *gid = ns_root_gid; + } else { + *uid = GLOBAL_ROOT_UID; + *gid = GLOBAL_ROOT_GID; + } +} +EXPORT_SYMBOL_GPL(net_ns_get_ownership); + static void unhash_nsid(struct net *net, struct net *last) { struct net *tmp; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 68bf07206744..43a932cb609b 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -269,7 +269,7 @@ static void __page_pool_empty_ring(struct page_pool *pool) struct page *page; /* Empty recycle ring */ - while ((page = ptr_ring_consume(&pool->ring))) { + while ((page = ptr_ring_consume_bh(&pool->ring))) { /* Verify the refcnt invariant of cached pages */ if (!(page_ref_count(page) == 1)) pr_crit("%s() page_pool refcnt %d violation\n", diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 6d37dbf0aa64..7f6938405fa1 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -1265,7 +1265,7 @@ static ssize_t pktgen_if_write(struct file *file, buf[len] = 0; if (strcmp(buf, pkt_dev->dst_min) != 0) { memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min)); - strncpy(pkt_dev->dst_min, buf, len); + strcpy(pkt_dev->dst_min, buf); pkt_dev->daddr_min = in_aton(pkt_dev->dst_min); pkt_dev->cur_daddr = pkt_dev->daddr_min; } @@ -1280,14 +1280,12 @@ static ssize_t pktgen_if_write(struct file *file, if (len < 0) return len; - if (copy_from_user(buf, &user_buffer[i], len)) return -EFAULT; - buf[len] = 0; if (strcmp(buf, pkt_dev->dst_max) != 0) { memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max)); - strncpy(pkt_dev->dst_max, buf, len); + strcpy(pkt_dev->dst_max, buf); pkt_dev->daddr_max = in_aton(pkt_dev->dst_max); pkt_dev->cur_daddr = pkt_dev->daddr_max; } @@ -1396,7 +1394,7 @@ static ssize_t pktgen_if_write(struct file 
*file, buf[len] = 0; if (strcmp(buf, pkt_dev->src_min) != 0) { memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min)); - strncpy(pkt_dev->src_min, buf, len); + strcpy(pkt_dev->src_min, buf); pkt_dev->saddr_min = in_aton(pkt_dev->src_min); pkt_dev->cur_saddr = pkt_dev->saddr_min; } @@ -1416,7 +1414,7 @@ static ssize_t pktgen_if_write(struct file *file, buf[len] = 0; if (strcmp(buf, pkt_dev->src_max) != 0) { memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max)); - strncpy(pkt_dev->src_max, buf, len); + strcpy(pkt_dev->src_max, buf); pkt_dev->saddr_max = in_aton(pkt_dev->src_max); pkt_dev->cur_saddr = pkt_dev->saddr_max; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 5ef61222fdef..92b6fa5d5f6e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -964,7 +964,8 @@ static size_t rtnl_xdp_size(void) { size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ nla_total_size(1) + /* XDP_ATTACHED */ - nla_total_size(4); /* XDP_PROG_ID */ + nla_total_size(4) + /* XDP_PROG_ID (or 1st mode) */ + nla_total_size(4); /* XDP_<mode>_PROG_ID */ return xdp_size; } @@ -1353,27 +1354,51 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) return 0; } -static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) +static u32 rtnl_xdp_prog_skb(struct net_device *dev) { - const struct net_device_ops *ops = dev->netdev_ops; const struct bpf_prog *generic_xdp_prog; - struct netdev_bpf xdp; ASSERT_RTNL(); - *prog_id = 0; generic_xdp_prog = rtnl_dereference(dev->xdp_prog); - if (generic_xdp_prog) { - *prog_id = generic_xdp_prog->aux->id; - return XDP_ATTACHED_SKB; - } - if (!ops->ndo_bpf) - return XDP_ATTACHED_NONE; + if (!generic_xdp_prog) + return 0; + return generic_xdp_prog->aux->id; +} + +static u32 rtnl_xdp_prog_drv(struct net_device *dev) +{ + return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG); +} + +static u32 rtnl_xdp_prog_hw(struct net_device *dev) +{ + return __dev_xdp_query(dev, 
dev->netdev_ops->ndo_bpf, + XDP_QUERY_PROG_HW); +} + +static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev, + u32 *prog_id, u8 *mode, u8 tgt_mode, u32 attr, + u32 (*get_prog_id)(struct net_device *dev)) +{ + u32 curr_id; + int err; + + curr_id = get_prog_id(dev); + if (!curr_id) + return 0; - __dev_xdp_query(dev, ops->ndo_bpf, &xdp); - *prog_id = xdp.prog_id; + *prog_id = curr_id; + err = nla_put_u32(skb, attr, curr_id); + if (err) + return err; + + if (*mode != XDP_ATTACHED_NONE) + *mode = XDP_ATTACHED_MULTI; + else + *mode = tgt_mode; - return xdp.prog_attached; + return 0; } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) @@ -1381,17 +1406,32 @@ static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) struct nlattr *xdp; u32 prog_id; int err; + u8 mode; xdp = nla_nest_start(skb, IFLA_XDP); if (!xdp) return -EMSGSIZE; - err = nla_put_u8(skb, IFLA_XDP_ATTACHED, - rtnl_xdp_attached_mode(dev, &prog_id)); + prog_id = 0; + mode = XDP_ATTACHED_NONE; + err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB, + IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb); + if (err) + goto err_cancel; + err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_DRV, + IFLA_XDP_DRV_PROG_ID, rtnl_xdp_prog_drv); + if (err) + goto err_cancel; + err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_HW, + IFLA_XDP_HW_PROG_ID, rtnl_xdp_prog_hw); + if (err) + goto err_cancel; + + err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode); if (err) goto err_cancel; - if (prog_id) { + if (prog_id && mode != XDP_ATTACHED_MULTI) { err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id); if (err) goto err_cancel; @@ -2759,9 +2799,12 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) return err; } - dev->rtnl_link_state = RTNL_LINK_INITIALIZED; - - __dev_notify_flags(dev, old_flags, ~0U); + if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) { + __dev_notify_flags(dev, old_flags, 0U); + } else { + 
dev->rtnl_link_state = RTNL_LINK_INITIALIZED; + __dev_notify_flags(dev, old_flags, ~0U); + } return 0; } EXPORT_SYMBOL(rtnl_configure_link); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c642304f178c..266b954f763e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -858,6 +858,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) n->cloned = 1; n->nohdr = 0; n->peeked = 0; + C(pfmemalloc); n->destructor = NULL; C(tail); C(end); @@ -3719,6 +3720,7 @@ normal: net_warn_ratelimited( "skb_segment: too many frags: %u %u\n", pos, mss); + err = -EINVAL; goto err; } @@ -3752,11 +3754,10 @@ skip_fraglist: perform_csum_check: if (!csum) { - if (skb_has_shared_frag(nskb)) { - err = __skb_linearize(nskb); - if (err) - goto err; - } + if (skb_has_shared_frag(nskb) && + __skb_linearize(nskb)) + goto err; + if (!nskb->remcsum_offload) nskb->ip_summed = CHECKSUM_NONE; SKB_GSO_CB(nskb)->csum = @@ -3815,14 +3816,14 @@ err: } EXPORT_SYMBOL_GPL(skb_segment); -int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) +int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) { struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); unsigned int offset = skb_gro_offset(skb); unsigned int headlen = skb_headlen(skb); unsigned int len = skb_gro_len(skb); - struct sk_buff *lp, *p = *head; unsigned int delta_truesize; + struct sk_buff *lp; if (unlikely(p->len + len >= 65536)) return -E2BIG; @@ -4898,7 +4899,6 @@ EXPORT_SYMBOL(skb_try_coalesce); */ void skb_scrub_packet(struct sk_buff *skb, bool xnet) { - skb->tstamp = 0; skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; skb->ignore_df = 0; @@ -4911,8 +4911,8 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) return; ipvs_reset(skb); - skb_orphan(skb); skb->mark = 0; + skb->tstamp = 0; } EXPORT_SYMBOL_GPL(skb_scrub_packet); @@ -5276,8 +5276,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, if (npages >= 1 << order) { page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) 
| __GFP_COMP | - __GFP_NOWARN | - __GFP_NORETRY, + __GFP_NOWARN, order); if (page) goto fill_page; diff --git a/net/core/sock.c b/net/core/sock.c index bcc41829a16d..9c6ebbdfebf3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -91,6 +91,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <asm/unaligned.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/errqueue.h> @@ -697,6 +698,7 @@ EXPORT_SYMBOL(sk_mc_loop); int sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { + struct sock_txtime sk_txtime; struct sock *sk = sock->sk; int val; int valbool; @@ -1070,6 +1072,26 @@ set_rcvbuf: } break; + case SO_TXTIME: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + } else if (optlen != sizeof(struct sock_txtime)) { + ret = -EINVAL; + } else if (copy_from_user(&sk_txtime, optval, + sizeof(struct sock_txtime))) { + ret = -EFAULT; + } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { + ret = -EINVAL; + } else { + sock_valbool_flag(sk, SOCK_TXTIME, true); + sk->sk_clockid = sk_txtime.clockid; + sk->sk_txtime_deadline_mode = + !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); + sk->sk_txtime_report_errors = + !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); + } + break; + default: ret = -ENOPROTOOPT; break; @@ -1115,6 +1137,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, u64 val64; struct linger ling; struct timeval tm; + struct sock_txtime txtime; } v; int lv = sizeof(int); @@ -1403,6 +1426,15 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sock_flag(sk, SOCK_ZEROCOPY); break; + case SO_TXTIME: + lv = sizeof(v.txtime); + v.txtime.clockid = sk->sk_clockid; + v.txtime.flags |= sk->sk_txtime_deadline_mode ? + SOF_TXTIME_DEADLINE_MODE : 0; + v.txtime.flags |= sk->sk_txtime_report_errors ? + SOF_TXTIME_REPORT_ERRORS : 0; + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). 
@@ -2137,6 +2169,13 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; sockc->tsflags |= tsflags; break; + case SCM_TXTIME: + if (!sock_flag(sk, SOCK_TXTIME)) + return -EINVAL; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) + return -EINVAL; + sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); + break; /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ case SCM_RIGHTS: case SCM_CREDENTIALS: @@ -2277,9 +2316,9 @@ int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, pfrag->offset += use; sge = sg + sg_curr - 1; - if (sg_curr > first_coalesce && sg_page(sg) == pfrag->page && - sg->offset + sg->length == orig_offset) { - sg->length += use; + if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page && + sge->offset + sge->length == orig_offset) { + sge->length += use; } else { sge = sg + sg_curr; sg_unmark_end(sge); @@ -2401,9 +2440,10 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { struct proto *prot = sk->sk_prot; long allocated = sk_memory_allocated_add(sk, amt); + bool charged = true; if (mem_cgroup_sockets_enabled && sk->sk_memcg && - !mem_cgroup_charge_skmem(sk->sk_memcg, amt)) + !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt))) goto suppress_allocation; /* Under limit. 
*/ @@ -2461,7 +2501,8 @@ suppress_allocation: return 1; } - trace_sock_exceed_buf_limit(sk, prot, allocated); + if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) + trace_sock_exceed_buf_limit(sk, prot, allocated, kind); sk_memory_allocated_sub(sk, amt); @@ -2818,6 +2859,8 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_pacing_rate = ~0U; sk->sk_pacing_shift = 10; sk->sk_incoming_cpu = -1; + + sk_rx_queue_clear(sk); /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) @@ -3243,7 +3286,8 @@ static int req_prot_init(const struct proto *prot) rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, rsk_prot->obj_size, 0, - prot->slab_flags, NULL); + SLAB_ACCOUNT | prot->slab_flags, + NULL); if (!rsk_prot->slab) { pr_crit("%s: Can't create request sock SLAB cache!\n", @@ -3258,7 +3302,8 @@ int proto_register(struct proto *prot, int alloc_slab) if (alloc_slab) { prot->slab = kmem_cache_create_usercopy(prot->name, prot->obj_size, 0, - SLAB_HWCACHE_ALIGN | prot->slab_flags, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | + prot->slab_flags, prot->useroffset, prot->usersize, NULL); @@ -3281,6 +3326,7 @@ int proto_register(struct proto *prot, int alloc_slab) kmem_cache_create(prot->twsk_prot->twsk_slab_name, prot->twsk_prot->twsk_obj_size, 0, + SLAB_ACCOUNT | prot->slab_flags, NULL); if (prot->twsk_prot->twsk_slab == NULL) diff --git a/net/core/xdp.c b/net/core/xdp.c index 31c58719b5a9..57285383ed00 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -3,8 +3,11 @@ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. * Released under terms in GPL version 2. See COPYING. 
*/ +#include <linux/bpf.h> +#include <linux/filter.h> #include <linux/types.h> #include <linux/mm.h> +#include <linux/netdevice.h> #include <linux/slab.h> #include <linux/idr.h> #include <linux/rhashtable.h> @@ -370,3 +373,34 @@ void xdp_return_buff(struct xdp_buff *xdp) __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle); } EXPORT_SYMBOL_GPL(xdp_return_buff); + +int xdp_attachment_query(struct xdp_attachment_info *info, + struct netdev_bpf *bpf) +{ + bpf->prog_id = info->prog ? info->prog->aux->id : 0; + bpf->prog_flags = info->prog ? info->flags : 0; + return 0; +} +EXPORT_SYMBOL_GPL(xdp_attachment_query); + +bool xdp_attachment_flags_ok(struct xdp_attachment_info *info, + struct netdev_bpf *bpf) +{ + if (info->prog && (bpf->flags ^ info->flags) & XDP_FLAGS_MODES) { + NL_SET_ERR_MSG(bpf->extack, + "program loaded with different flags"); + return false; + } + return true; +} +EXPORT_SYMBOL_GPL(xdp_attachment_flags_ok); + +void xdp_attachment_setup(struct xdp_attachment_info *info, + struct netdev_bpf *bpf) +{ + if (info->prog) + bpf_prog_put(info->prog); + info->prog = bpf->prog; + info->flags = bpf->flags; +} +EXPORT_SYMBOL_GPL(xdp_attachment_setup); diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 8b5ba6dffac7..12877a1514e7 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -600,7 +600,7 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk, { struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); struct dccp_sock *dp = dccp_sk(sk); - ktime_t now = ktime_get_real(); + ktime_t now = ktime_get(); s64 delta = 0; switch (fbtype) { @@ -625,15 +625,14 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk, case CCID3_FBACK_PERIODIC: delta = ktime_us_delta(now, hc->rx_tstamp_last_feedback); if (delta <= 0) - DCCP_BUG("delta (%ld) <= 0", (long)delta); - else - hc->rx_x_recv = scaled_div32(hc->rx_bytes_recv, delta); + delta = 1; + hc->rx_x_recv = scaled_div32(hc->rx_bytes_recv, delta); break; default: return; } - 
ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta, + ccid3_pr_debug("Interval %lldusec, X_recv=%u, 1/p=%u\n", delta, hc->rx_x_recv, hc->rx_pinv); hc->rx_tstamp_last_feedback = now; @@ -680,7 +679,8 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) static u32 ccid3_first_li(struct sock *sk) { struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - u32 x_recv, p, delta; + u32 x_recv, p; + s64 delta; u64 fval; if (hc->rx_rtt == 0) { @@ -688,7 +688,9 @@ static u32 ccid3_first_li(struct sock *sk) hc->rx_rtt = DCCP_FALLBACK_RTT; } - delta = ktime_to_us(net_timedelta(hc->rx_tstamp_last_feedback)); + delta = ktime_us_delta(ktime_get(), hc->rx_tstamp_last_feedback); + if (delta <= 0) + delta = 1; x_recv = scaled_div32(hc->rx_bytes_recv, delta); if (x_recv == 0) { /* would also trigger divide-by-zero */ DCCP_WARN("X_recv==0\n"); diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 0ea2ee56ac1b..f91e3816806b 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -316,7 +316,8 @@ int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); void dccp_shutdown(struct sock *sk, int how); int inet_dccp_listen(struct socket *sock, int backlog); -__poll_t dccp_poll_mask(struct socket *sock, __poll_t events); +__poll_t dccp_poll(struct file *file, struct socket *sock, + poll_table *wait); int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); void dccp_req_err(struct sock *sk, u64 seq); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index a9e478cd3787..b08feb219b44 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -984,7 +984,7 @@ static const struct proto_ops inet_dccp_ops = { .accept = inet_accept, .getname = inet_getname, /* FIXME: work on tcp_poll to rename it to inet_csk_poll */ - .poll_mask = dccp_poll_mask, + .poll = dccp_poll, .ioctl = inet_ioctl, /* FIXME: work on inet_listen to rename it to sock_common_listen */ .listen = inet_dccp_listen, diff --git 
a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 17fc4e0166ba..6344f1b18a6a 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1070,7 +1070,7 @@ static const struct proto_ops inet6_dccp_ops = { .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet6_getname, - .poll_mask = dccp_poll_mask, + .poll = dccp_poll, .ioctl = inet6_ioctl, .listen = inet_dccp_listen, .shutdown = inet_shutdown, diff --git a/net/dccp/proto.c b/net/dccp/proto.c index ca21c1c76da0..0d56e36a6db7 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -312,11 +312,20 @@ int dccp_disconnect(struct sock *sk, int flags) EXPORT_SYMBOL_GPL(dccp_disconnect); -__poll_t dccp_poll_mask(struct socket *sock, __poll_t events) +/* + * Wait for a DCCP event. + * + * Note that we don't need to lock the socket, as the upper poll layers + * take care of normal races (between the test and the event) and we don't + * go look at any of the socket buffers directly. + */ +__poll_t dccp_poll(struct file *file, struct socket *sock, + poll_table *wait) { __poll_t mask; struct sock *sk = sock->sk; + sock_poll_wait(file, sk_sleep(sk), wait); if (sk->sk_state == DCCP_LISTEN) return inet_csk_listen_poll(sk); @@ -358,7 +367,7 @@ __poll_t dccp_poll_mask(struct socket *sock, __poll_t events) return mask; } -EXPORT_SYMBOL_GPL(dccp_poll_mask); +EXPORT_SYMBOL_GPL(dccp_poll); int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) { diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig index f3393e154f0f..dcc74956badd 100644 --- a/net/decnet/Kconfig +++ b/net/decnet/Kconfig @@ -40,4 +40,3 @@ config DECNET_ROUTER to work. See <file:Documentation/networking/decnet.txt> for more information. 
- diff --git a/net/decnet/Makefile b/net/decnet/Makefile index 9e38122d942b..07b38e441b2d 100644 --- a/net/decnet/Makefile +++ b/net/decnet/Makefile @@ -8,4 +8,3 @@ decnet-$(CONFIG_DECNET_ROUTER) += dn_fib.o dn_rules.o dn_table.o decnet-y += sysctl_net_decnet.o obj-$(CONFIG_NETFILTER) += netfilter/ - diff --git a/net/decnet/TODO b/net/decnet/TODO index ebb5ac69d128..358e9eb49016 100644 --- a/net/decnet/TODO +++ b/net/decnet/TODO @@ -16,14 +16,14 @@ Steve's quick list of things that need finishing off: o Verify errors etc. against POSIX 1003.1g (draft) - o Using send/recvmsg() to get at connect/disconnect data (POSIX 1003.1g) + o Using send/recvmsg() to get at connect/disconnect data (POSIX 1003.1g) [maybe this should be done at socket level... the control data in the send/recvmsg() calls should simply be a vector of set/getsockopt() calls] o check MSG_CTRUNC is set where it should be. - o Find all the commonality between DECnet and IPv4 routing code and extract + o Find all the commonality between DECnet and IPv4 routing code and extract it into a small library of routines. [probably a project for 2.7.xx] o Add perfect socket hashing - an idea suggested by Paul Koning. 
Currently @@ -38,4 +38,3 @@ Steve's quick list of things that need finishing off: o DECnet sendpages() function o AIO for DECnet - diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 9a686d890bfa..7d6ff983ba2c 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1207,11 +1207,11 @@ static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int peer) } -static __poll_t dn_poll_mask(struct socket *sock, __poll_t events) +static __poll_t dn_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct dn_scp *scp = DN_SK(sk); - __poll_t mask = datagram_poll_mask(sock, events); + __poll_t mask = datagram_poll(file, sock, wait); if (!skb_queue_empty(&scp->other_receive_queue)) mask |= EPOLLRDBAND; @@ -2331,7 +2331,7 @@ static const struct proto_ops dn_proto_ops = { .socketpair = sock_no_socketpair, .accept = dn_accept, .getname = dn_getname, - .poll_mask = dn_poll_mask, + .poll = dn_poll, .ioctl = dn_ioctl, .listen = dn_listen, .shutdown = dn_shutdown, diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c index fce94cbd4378..f78fe58eafc8 100644 --- a/net/decnet/dn_fib.c +++ b/net/decnet/dn_fib.c @@ -797,5 +797,3 @@ void __init dn_fib_init(void) rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_DELROUTE, dn_fib_rtm_delroute, NULL, 0); } - - diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index 1b2120645730..2fb5e055ba25 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -491,6 +491,7 @@ static void dn_nsp_disc_conf(struct sock *sk, struct sk_buff *skb) break; case DN_RUN: sk->sk_shutdown |= SHUTDOWN_MASK; + /* fall through */ case DN_CC: scp->state = DN_CN; } @@ -911,4 +912,3 @@ free_out: return NET_RX_SUCCESS; } - diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c index 56a52a004c56..a1779de6bd9c 100644 --- a/net/decnet/dn_nsp_out.c +++ b/net/decnet/dn_nsp_out.c @@ -701,4 +701,3 @@ void dn_nsp_send_conninit(struct sock *sk, unsigned char msgflg) 
dn_nsp_send(skb); } - diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index e74765024d88..3107a2e24e6b 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1925,4 +1925,3 @@ void __exit dn_route_cleanup(void) remove_proc_entry("decnet_cache", init_net.proc_net); dst_entries_destroy(&dn_dst_ops); } - diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c index 72236695db3d..4a4e3c17740c 100644 --- a/net/decnet/dn_rules.c +++ b/net/decnet/dn_rules.c @@ -256,5 +256,3 @@ void __exit dn_fib_rules_cleanup(void) rtnl_unlock(); rcu_barrier(); } - - diff --git a/net/decnet/netfilter/Makefile b/net/decnet/netfilter/Makefile index 255c1ae9daeb..b579e52130aa 100644 --- a/net/decnet/netfilter/Makefile +++ b/net/decnet/netfilter/Makefile @@ -3,4 +3,3 @@ # obj-$(CONFIG_DECNET_NF_GRABULATOR) += dn_rtmsg.o - diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index ab395e55cd78..a4faacadd8a8 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -158,4 +158,3 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_DNRTMSG); module_init(dn_rtmsg_init); module_exit(dn_rtmsg_fini); - diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index 40c851693f77..7f4534828f6c 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -86,35 +86,39 @@ dns_resolver_preparse(struct key_preparsed_payload *prep) opt++; kdebug("options: '%s'", opt); do { + int opt_len, opt_nlen; const char *eq; - int opt_len, opt_nlen, opt_vlen, tmp; + char optval[128]; next_opt = memchr(opt, '#', end - opt) ?: end; opt_len = next_opt - opt; - if (opt_len <= 0 || opt_len > 128) { + if (opt_len <= 0 || opt_len > sizeof(optval)) { pr_warn_ratelimited("Invalid option length (%d) for dns_resolver key\n", opt_len); return -EINVAL; } - eq = memchr(opt, '=', opt_len) ?: end; - opt_nlen = eq - opt; - eq++; - opt_vlen = next_opt - eq; /* will be -1 if no value */ + eq = memchr(opt, '=', opt_len); + if (eq) { + 
opt_nlen = eq - opt; + eq++; + memcpy(optval, eq, next_opt - eq); + optval[next_opt - eq] = '\0'; + } else { + opt_nlen = opt_len; + optval[0] = '\0'; + } - tmp = opt_vlen >= 0 ? opt_vlen : 0; - kdebug("option '%*.*s' val '%*.*s'", - opt_nlen, opt_nlen, opt, tmp, tmp, eq); + kdebug("option '%*.*s' val '%s'", + opt_nlen, opt_nlen, opt, optval); /* see if it's an error number representing a DNS error * that's to be recorded as the result in this key */ if (opt_nlen == sizeof(DNS_ERRORNO_OPTION) - 1 && memcmp(opt, DNS_ERRORNO_OPTION, opt_nlen) == 0) { kdebug("dns error number option"); - if (opt_vlen <= 0) - goto bad_option_value; - ret = kstrtoul(eq, 10, &derrno); + ret = kstrtoul(optval, 10, &derrno); if (ret < 0) goto bad_option_value; @@ -316,4 +320,3 @@ static void __exit exit_dns_resolver(void) module_init(init_dns_resolver) module_exit(exit_dns_resolver) MODULE_LICENSE("GPL"); - diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index dc5d9af3dc80..a1917025e155 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -775,6 +775,20 @@ struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n) if (!ds) return NULL; + /* We avoid allocating memory outside dsa_switch + * if it is not needed. 
+ */ + if (n <= sizeof(ds->_bitmap) * 8) { + ds->bitmap = &ds->_bitmap; + } else { + ds->bitmap = devm_kcalloc(dev, + BITS_TO_LONGS(n), + sizeof(unsigned long), + GFP_KERNEL); + if (unlikely(!ds->bitmap)) + return NULL; + } + ds->dev = dev; ds->num_ports = n; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 1e3b6a6d8a40..71536c435132 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -900,7 +900,7 @@ static int dsa_slave_setup_tc_block(struct net_device *dev, switch (f->command) { case TC_BLOCK_BIND: - return tcf_block_cb_register(f->block, cb, dev, dev); + return tcf_block_cb_register(f->block, cb, dev, dev, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, cb, dev); return 0; diff --git a/net/dsa/switch.c b/net/dsa/switch.c index b93511726069..142b294d3446 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -136,21 +136,20 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds, { const struct switchdev_obj_port_mdb *mdb = info->mdb; struct switchdev_trans *trans = info->trans; - DECLARE_BITMAP(group, ds->num_ports); int port; /* Build a mask of Multicast group members */ - bitmap_zero(group, ds->num_ports); + bitmap_zero(ds->bitmap, ds->num_ports); if (ds->index == info->sw_index) - set_bit(info->port, group); + set_bit(info->port, ds->bitmap); for (port = 0; port < ds->num_ports; port++) if (dsa_is_dsa_port(ds, port)) - set_bit(port, group); + set_bit(port, ds->bitmap); if (switchdev_trans_ph_prepare(trans)) - return dsa_switch_mdb_prepare_bitmap(ds, mdb, group); + return dsa_switch_mdb_prepare_bitmap(ds, mdb, ds->bitmap); - dsa_switch_mdb_add_bitmap(ds, mdb, group); + dsa_switch_mdb_add_bitmap(ds, mdb, ds->bitmap); return 0; } @@ -204,21 +203,20 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds, { const struct switchdev_obj_port_vlan *vlan = info->vlan; struct switchdev_trans *trans = info->trans; - DECLARE_BITMAP(members, ds->num_ports); int port; /* Build a mask of VLAN members */ - bitmap_zero(members, ds->num_ports); + 
bitmap_zero(ds->bitmap, ds->num_ports); if (ds->index == info->sw_index) - set_bit(info->port, members); + set_bit(info->port, ds->bitmap); for (port = 0; port < ds->num_ports; port++) if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)) - set_bit(port, members); + set_bit(port, ds->bitmap); if (switchdev_trans_ph_prepare(trans)) - return dsa_switch_vlan_prepare_bitmap(ds, vlan, members); + return dsa_switch_vlan_prepare_bitmap(ds, vlan, ds->bitmap); - dsa_switch_vlan_add_bitmap(ds, vlan, members); + dsa_switch_vlan_add_bitmap(ds, vlan, ds->bitmap); return 0; } diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index ee28440f57c5..fd8faa0dfa61 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -427,13 +427,13 @@ ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len) } EXPORT_SYMBOL(sysfs_format_mac); -struct sk_buff **eth_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb) { - struct sk_buff *p, **pp = NULL; - struct ethhdr *eh, *eh2; - unsigned int hlen, off_eth; const struct packet_offload *ptype; + unsigned int hlen, off_eth; + struct sk_buff *pp = NULL; + struct ethhdr *eh, *eh2; + struct sk_buff *p; __be16 type; int flush = 1; @@ -448,7 +448,7 @@ struct sk_buff **eth_gro_receive(struct sk_buff **head, flush = 0; - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 275449b0d633..3297e7fa9945 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -90,12 +90,18 @@ static int lowpan_neigh_construct(struct net_device *dev, struct neighbour *n) return 0; } +static int lowpan_get_iflink(const struct net_device *dev) +{ + return lowpan_802154_dev(dev)->wdev->ifindex; +} + static const struct net_device_ops lowpan_netdev_ops = { .ndo_init = lowpan_dev_init, .ndo_start_xmit = 
lowpan_xmit, .ndo_open = lowpan_open, .ndo_stop = lowpan_stop, .ndo_neigh_construct = lowpan_neigh_construct, + .ndo_get_iflink = lowpan_get_iflink, }; static void lowpan_setup(struct net_device *ldev) diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 2cc224106b69..ec7a5da56129 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -25,7 +25,7 @@ #include <net/ieee802154_netdev.h> #include <net/6lowpan.h> -#include <net/ipv6.h> +#include <net/ipv6_frag.h> #include <net/inet_frag.h> #include "6lowpan_i.h" diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index cb7176cd4cd6..fe225d9a1877 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -400,4 +400,3 @@ module_exit(wpan_phy_class_exit); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("IEEE 802.15.4 configuration interface"); MODULE_AUTHOR("Dmitry Eremin-Solenikov"); - diff --git a/net/ieee802154/nl_policy.c b/net/ieee802154/nl_policy.c index 35c432668454..78f6f1233194 100644 --- a/net/ieee802154/nl_policy.c +++ b/net/ieee802154/nl_policy.c @@ -75,4 +75,3 @@ const struct nla_policy ieee802154_policy[IEEE802154_ATTR_MAX + 1] = { [IEEE802154_ATTR_LLSEC_DEV_OVERRIDE] = { .type = NLA_U8, }, [IEEE802154_ATTR_LLSEC_DEV_KEY_MODE] = { .type = NLA_U8, }, }; - diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index a0768d2759b8..a60658c85a9a 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -423,7 +423,7 @@ static const struct proto_ops ieee802154_raw_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = ieee802154_sock_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -969,7 +969,7 @@ static const struct proto_ops ieee802154_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll 
= datagram_poll, .ioctl = ieee802154_sock_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 80dad301361d..32cae39cdff6 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -430,7 +430,7 @@ config INET_DIAG Support for INET (TCP, DCCP, etc) socket monitoring interface used by native Linux tools such as ss. ss is included in iproute2, currently downloadable at: - + http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2 If unsure, say Y. @@ -600,7 +600,7 @@ config TCP_CONG_VENO distinguishing to circumvent the difficult judgment of the packet loss type. TCP Veno cuts down less congestion window in response to random loss packets. - See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186> + See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186> config TCP_CONG_YEAH tristate "YeAH TCP" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index eec9569ffa5c..7446b98661d8 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -43,7 +43,7 @@ obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o obj-$(CONFIG_IP_PNP) += ipconfig.o obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/ -obj-$(CONFIG_INET_DIAG) += inet_diag.o +obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 15e125558c76..f2a0a3bab6b5 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -229,6 +229,7 @@ int inet_listen(struct socket *sock, int backlog) err = inet_csk_listen_start(sk, backlog); if (err) goto out; + tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); } sk->sk_max_ack_backlog = backlog; err = 0; @@ -986,7 +987,7 @@ const struct proto_ops inet_stream_ops = { .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = 
inet_getname, - .poll_mask = tcp_poll_mask, + .poll = tcp_poll, .ioctl = inet_ioctl, .listen = inet_listen, .shutdown = inet_shutdown, @@ -1021,7 +1022,7 @@ const struct proto_ops inet_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = inet_getname, - .poll_mask = udp_poll_mask, + .poll = udp_poll, .ioctl = inet_ioctl, .listen = sock_no_listen, .shutdown = inet_shutdown, @@ -1042,7 +1043,7 @@ EXPORT_SYMBOL(inet_dgram_ops); /* * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without - * udp_poll_mask + * udp_poll */ static const struct proto_ops inet_sockraw_ops = { .family = PF_INET, @@ -1053,7 +1054,7 @@ static const struct proto_ops inet_sockraw_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = inet_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = inet_ioctl, .listen = sock_no_listen, .shutdown = inet_shutdown, @@ -1384,12 +1385,12 @@ out: } EXPORT_SYMBOL(inet_gso_segment); -struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb) +struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) { const struct net_offload *ops; - struct sk_buff **pp = NULL; - struct sk_buff *p; + struct sk_buff *pp = NULL; const struct iphdr *iph; + struct sk_buff *p; unsigned int hlen; unsigned int off; unsigned int id; @@ -1425,7 +1426,7 @@ struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb) flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); id >>= 16; - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { struct iphdr *iph2; u16 flush_id; @@ -1505,8 +1506,8 @@ out: } EXPORT_SYMBOL(inet_gro_receive); -static struct sk_buff **ipip_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *ipip_gro_receive(struct list_head *head, + struct sk_buff *skb) { if (NAPI_GRO_CB(skb)->encap_mark) { NAPI_GRO_CB(skb)->flush = 1; @@ -1882,6 +1883,7 @@ 
fs_initcall(ipv4_offload_init); static struct packet_type ip_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IP), .func = ip_rcv, + .list_func = ip_list_rcv, }; static int __init inet_init(void) diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile index ce262d76cc48..e9e42f99725e 100644 --- a/net/ipv4/bpfilter/Makefile +++ b/net/ipv4/bpfilter/Makefile @@ -1,2 +1 @@ obj-$(CONFIG_BPFILTER) += sockopt.o - diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 133589d693a9..58834a10c0be 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -28,8 +28,8 @@ #include <linux/spinlock.h> #include <net/udp.h> -static struct sk_buff **esp4_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *esp4_gro_receive(struct list_head *head, + struct sk_buff *skb) { int offset = skb_gro_offset(skb); struct xfrm_offload *xo; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index b21833651394..e46cdd310e5f 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -300,6 +300,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) { struct flowi4 fl4 = { .flowi4_iif = LOOPBACK_IFINDEX, + .flowi4_oif = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, .flowi4_tos = RT_TOS(ip_hdr(skb)->tos), .flowi4_scope = scope, diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 1540db65241a..500a59906b87 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -224,14 +224,14 @@ drop: return 0; } -static struct sk_buff **fou_gro_receive(struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *fou_gro_receive(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) { - const struct net_offload *ops; - struct sk_buff **pp = NULL; u8 proto = fou_from_sock(sk)->protocol; const struct net_offload **offloads; + const struct net_offload *ops; + struct sk_buff *pp = NULL; /* We can clear the encap_mark for FOU as we are 
essentially doing * one of two possible things. We are either adding an L4 tunnel @@ -305,13 +305,13 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, return guehdr; } -static struct sk_buff **gue_gro_receive(struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *gue_gro_receive(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) { const struct net_offload **offloads; const struct net_offload *ops; - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct sk_buff *p; struct guehdr *guehdr; size_t len, optlen, hdrlen, off; @@ -397,7 +397,7 @@ static struct sk_buff **gue_gro_receive(struct sock *sk, skb_gro_pull(skb, hdrlen); - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { const struct guehdr *guehdr2; if (!NAPI_GRO_CB(p)->same_flow) @@ -448,9 +448,7 @@ next_proto: out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; - skb_gro_remcsum_cleanup(skb, &grc); - skb->remcsum_offload = 0; + skb_gro_flush_final_remcsum(skb, pp, flush, &grc); return pp; } diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 1859c473b21a..6c63524f598a 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -108,10 +108,10 @@ out: return segs; } -static struct sk_buff **gre_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *gre_gro_receive(struct list_head *head, + struct sk_buff *skb) { - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct sk_buff *p; const struct gre_base_hdr *greh; unsigned int hlen, grehlen; @@ -182,7 +182,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, null_compute_pseudo); } - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { const struct gre_base_hdr *greh2; if (!NAPI_GRO_CB(p)->same_flow) @@ -223,7 +223,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush 
|= flush; + skb_gro_flush_final(skb, pp, flush); return pp; } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 1617604c9284..695979b7ef6d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -429,14 +429,11 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) icmp_param->data.icmph.checksum = 0; + ipcm_init(&ipc); inet->tos = ip_hdr(skb)->tos; sk->sk_mark = mark; daddr = ipc.addr = ip_hdr(skb)->saddr; saddr = fib_compute_spec_dst(skb); - ipc.opt = NULL; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; if (icmp_param->replyopts.opt.opt.optlen) { ipc.opt = &icmp_param->replyopts.opt; @@ -710,11 +707,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) icmp_param.offset = skb_network_offset(skb_in); inet_sk(sk)->tos = tos; sk->sk_mark = mark; + ipcm_init(&ipc); ipc.addr = iph->saddr; ipc.opt = &icmp_param.replyopts.opt; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, type, code, &icmp_param); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 85b617b655bc..bae9096821be 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1200,13 +1200,13 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im) spin_lock_bh(&im->lock); if (pmc) { im->interface = pmc->interface; - im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; - im->sfmode = pmc->sfmode; - if (pmc->sfmode == MCAST_INCLUDE) { + if (im->sfmode == MCAST_INCLUDE) { im->tomb = pmc->tomb; im->sources = pmc->sources; for (psf = im->sources; psf; psf = psf->sf_next) - psf->sf_crcount = im->crcount; + psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + } else { + im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; } in_dev_put(pmc->interface); kfree(pmc); @@ -1316,7 +1316,13 @@ static void igmp_group_added(struct ip_mc_list *im) } /* else, v3 */ - im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + /* Based on RFC3376 5.1, for newly added 
INCLUDE SSM, we should + * not send filter-mode change record as the mode should be from + * IN() to IN(A). + */ + if (im->sfmode == MCAST_EXCLUDE) + im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + igmp_ifc_event(in_dev); #endif } @@ -1381,8 +1387,7 @@ static void ip_mc_hash_remove(struct in_device *in_dev, /* * A socket has joined a multicast group on device dev. */ - -void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) +void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, unsigned int mode) { struct ip_mc_list *im; #ifdef CONFIG_IP_MULTICAST @@ -1394,7 +1399,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) for_each_pmc_rtnl(in_dev, im) { if (im->multiaddr == addr) { im->users++; - ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0); + ip_mc_add_src(in_dev, &addr, mode, 0, NULL, 0); goto out; } } @@ -1408,8 +1413,8 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) in_dev_hold(in_dev); im->multiaddr = addr; /* initial mode is (EX, empty) */ - im->sfmode = MCAST_EXCLUDE; - im->sfcount[MCAST_EXCLUDE] = 1; + im->sfmode = mode; + im->sfcount[mode] = 1; refcount_set(&im->refcnt, 1); spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST @@ -1432,6 +1437,11 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) out: return; } + +void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) +{ + __ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE); +} EXPORT_SYMBOL(ip_mc_inc_group); static int ip_mc_check_iphdr(struct sk_buff *skb) @@ -2130,8 +2140,8 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc) /* Join a multicast group */ - -int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) +static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr, + unsigned int mode) { __be32 addr = imr->imr_multiaddr.s_addr; struct ip_mc_socklist *iml, *i; @@ -2172,15 +2182,30 @@ int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) memcpy(&iml->multi, imr, sizeof(*imr)); iml->next_rcu = inet->mc_list; 
iml->sflist = NULL; - iml->sfmode = MCAST_EXCLUDE; + iml->sfmode = mode; rcu_assign_pointer(inet->mc_list, iml); - ip_mc_inc_group(in_dev, addr); + __ip_mc_inc_group(in_dev, addr, mode); err = 0; done: return err; } + +/* Join ASM (Any-Source Multicast) group + */ +int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) +{ + return __ip_mc_join_group(sk, imr, MCAST_EXCLUDE); +} EXPORT_SYMBOL(ip_mc_join_group); +/* Join SSM (Source-Specific Multicast) group + */ +int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr, + unsigned int mode) +{ + return __ip_mc_join_group(sk, imr, mode); +} + static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, struct in_device *in_dev) { diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 316518f87294..d3162baca9f1 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -91,7 +91,7 @@ static void inet_frags_free_cb(void *ptr, void *arg) void inet_frags_exit_net(struct netns_frags *nf) { - nf->low_thresh = 0; /* prevent creation of new frags */ + nf->high_thresh = 0; /* prevent creation of new frags */ rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL); } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 2d8efeecf619..c8ca5d8f0f75 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -587,6 +587,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, goto err_free_skb; key = &tun_info->key; + if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) + goto err_free_rt; md = ip_tunnel_info_opts(tun_info); if (!md) goto err_free_rt; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 7582713dd18f..3196cf58f418 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -307,7 +307,8 @@ drop: return true; } -static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +static int ip_rcv_finish_core(struct net *net, struct sock *sk, + struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); int 
(*edemux)(struct sk_buff *skb); @@ -315,13 +316,6 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) struct rtable *rt; int err; - /* if ingress device is enslaved to an L3 master device pass the - * skb to its handler for processing - */ - skb = l3mdev_ip_rcv(skb); - if (!skb) - return NET_RX_SUCCESS; - if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk && @@ -393,7 +387,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) goto drop; } - return dst_input(skb); + return NET_RX_SUCCESS; drop: kfree_skb(skb); @@ -405,13 +399,29 @@ drop_error: goto drop; } +static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + int ret; + + /* if ingress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip_rcv(skb); + if (!skb) + return NET_RX_SUCCESS; + + ret = ip_rcv_finish_core(net, sk, skb); + if (ret != NET_RX_DROP) + ret = dst_input(skb); + return ret; +} + /* * Main IP Receive routine. */ -int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) { const struct iphdr *iph; - struct net *net; u32 len; /* When the interface is in promisc. mode, drop all the crap @@ -421,7 +431,6 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, goto drop; - net = dev_net(dev); __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); @@ -489,9 +498,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, /* Must drop socket now because of tproxy. 
*/ skb_orphan(skb); - return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, - net, NULL, skb, dev, NULL, - ip_rcv_finish); + return skb; csum_error: __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); @@ -500,5 +507,113 @@ inhdr_error: drop: kfree_skb(skb); out: - return NET_RX_DROP; + return NULL; +} + +/* + * IP receive entry point + */ +int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, + struct net_device *orig_dev) +{ + struct net *net = dev_net(dev); + + skb = ip_rcv_core(skb, net); + if (skb == NULL) + return NET_RX_DROP; + return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, + net, NULL, skb, dev, NULL, + ip_rcv_finish); +} + +static void ip_sublist_rcv_finish(struct list_head *head) +{ + struct sk_buff *skb, *next; + + list_for_each_entry_safe(skb, next, head, list) { + list_del(&skb->list); + /* Handle ip{6}_forward case, as sch_direct_xmit have + * another kind of SKB-list usage (see validate_xmit_skb_list) + */ + skb->next = NULL; + dst_input(skb); + } +} + +static void ip_list_rcv_finish(struct net *net, struct sock *sk, + struct list_head *head) +{ + struct dst_entry *curr_dst = NULL; + struct sk_buff *skb, *next; + struct list_head sublist; + + INIT_LIST_HEAD(&sublist); + list_for_each_entry_safe(skb, next, head, list) { + struct dst_entry *dst; + + list_del(&skb->list); + /* if ingress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip_rcv(skb); + if (!skb) + continue; + if (ip_rcv_finish_core(net, sk, skb) == NET_RX_DROP) + continue; + + dst = skb_dst(skb); + if (curr_dst != dst) { + /* dispatch old sublist */ + if (!list_empty(&sublist)) + ip_sublist_rcv_finish(&sublist); + /* start new sublist */ + INIT_LIST_HEAD(&sublist); + curr_dst = dst; + } + list_add_tail(&skb->list, &sublist); + } + /* dispatch final sublist */ + ip_sublist_rcv_finish(&sublist); +} + +static void ip_sublist_rcv(struct list_head *head, struct net_device *dev, + struct net *net) +{ + 
NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, + head, dev, NULL, ip_rcv_finish); + ip_list_rcv_finish(net, NULL, head); +} + +/* Receive a list of IP packets */ +void ip_list_rcv(struct list_head *head, struct packet_type *pt, + struct net_device *orig_dev) +{ + struct net_device *curr_dev = NULL; + struct net *curr_net = NULL; + struct sk_buff *skb, *next; + struct list_head sublist; + + INIT_LIST_HEAD(&sublist); + list_for_each_entry_safe(skb, next, head, list) { + struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); + + list_del(&skb->list); + skb = ip_rcv_core(skb, net); + if (skb == NULL) + continue; + + if (curr_dev != dev || curr_net != net) { + /* dispatch old sublist */ + if (!list_empty(&sublist)) + ip_sublist_rcv(&sublist, curr_dev, curr_net); + /* start new sublist */ + INIT_LIST_HEAD(&sublist); + curr_dev = dev; + curr_net = net; + } + list_add_tail(&skb->list, &sublist); + } + /* dispatch final sublist */ + ip_sublist_rcv(&sublist, curr_dev, curr_net); } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b3308e9d9762..9c4e72e9c60a 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -423,7 +423,8 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) } /* Note: skb->sk can be different from sk, in case of tunnels */ -int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) +int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, + __u8 tos) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); @@ -462,7 +463,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) inet->inet_dport, inet->inet_sport, sk->sk_protocol, - RT_CONN_FLAGS(sk), + RT_CONN_FLAGS_TOS(sk, tos), sk->sk_bound_dev_if); if (IS_ERR(rt)) goto no_route; @@ -478,7 +479,7 @@ packet_routed: skb_push(skb, sizeof(struct iphdr) + (inet_opt ? 
inet_opt->opt.optlen : 0)); skb_reset_network_header(skb); iph = ip_hdr(skb); - *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); + *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff)); if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df) iph->frag_off = htons(IP_DF); else @@ -511,7 +512,7 @@ no_route: kfree_skb(skb); return -EHOSTUNREACH; } -EXPORT_SYMBOL(ip_queue_xmit); +EXPORT_SYMBOL(__ip_queue_xmit); static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) { @@ -523,6 +524,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->dev = from->dev; to->mark = from->mark; + skb_copy_hash(to, from); + /* Copy the flags to each fragment. */ IPCB(to)->flags = IPCB(from)->flags; @@ -1145,14 +1148,15 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, cork->fragsize = ip_sk_use_pmtu(sk) ? dst_mtu(&rt->dst) : rt->dst.dev->mtu; - cork->gso_size = sk->sk_type == SOCK_DGRAM && - sk->sk_protocol == IPPROTO_UDP ? ipc->gso_size : 0; + cork->gso_size = ipc->gso_size; cork->dst = &rt->dst; cork->length = 0; cork->ttl = ipc->ttl; cork->tos = ipc->tos; cork->priority = ipc->priority; - cork->tx_flags = ipc->tx_flags; + cork->transmit_time = ipc->sockc.transmit_time; + cork->tx_flags = 0; + sock_tx_timestamp(sk, ipc->sockc.tsflags, &cork->tx_flags); return 0; } @@ -1413,6 +1417,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, skb->priority = (cork->tos != -1) ? 
cork->priority: sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = cork->transmit_time; /* * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec * on dst refcount @@ -1545,11 +1550,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt)) return; + ipcm_init(&ipc); ipc.addr = daddr; - ipc.opt = NULL; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; if (replyopts.opt.opt.optlen) { ipc.opt = &replyopts.opt; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index fc32fdbeefa6..c0fe5ad996f2 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -150,15 +150,18 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) { struct sockaddr_in sin; const struct iphdr *iph = ip_hdr(skb); - __be16 *ports = (__be16 *)skb_transport_header(skb); + __be16 *ports; + int end; - if (skb_transport_offset(skb) + 4 > (int)skb->len) + end = skb_transport_offset(skb) + 4; + if (end > 0 && !pskb_may_pull(skb, end)) return; /* All current transport protocols have the port numbers in the * first four bytes of the transport header and this function is * written with this assumption in mind. 
*/ + ports = (__be16 *)skb_transport_header(skb); sin.sin_family = AF_INET; sin.sin_addr.s_addr = iph->daddr; @@ -984,7 +987,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr; mreq.imr_address.s_addr = mreqs.imr_interface; mreq.imr_ifindex = 0; - err = ip_mc_join_group(sk, &mreq); + err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); if (err && err != -EADDRINUSE) break; omode = MCAST_INCLUDE; @@ -1061,7 +1064,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, mreq.imr_multiaddr = psin->sin_addr; mreq.imr_address.s_addr = 0; mreq.imr_ifindex = greqs.gsr_interface; - err = ip_mc_join_group(sk, &mreq); + err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); if (err && err != -EADDRINUSE) break; greqs.gsr_interface = mreq.imr_ifindex; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 82f914122f1b..5660adcf7a04 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1052,7 +1052,7 @@ static int ipmr_cache_report(struct mr_table *mrt, struct sk_buff *skb; int ret; - if (assert == IGMPMSG_WHOLEPKT) + if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) skb = skb_realloc_headroom(pkt, sizeof(struct iphdr)); else skb = alloc_skb(128, GFP_ATOMIC); @@ -1060,7 +1060,7 @@ static int ipmr_cache_report(struct mr_table *mrt, if (!skb) return -ENOBUFS; - if (assert == IGMPMSG_WHOLEPKT) { + if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) { /* Ugly, but we have no choice with this interface. * Duplicate old header, fix ihl, length etc. 
* And all this only to mangle msg->im_msgtype and @@ -1071,9 +1071,12 @@ static int ipmr_cache_report(struct mr_table *mrt, skb_reset_transport_header(skb); msg = (struct igmpmsg *)skb_network_header(skb); memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr)); - msg->im_msgtype = IGMPMSG_WHOLEPKT; + msg->im_msgtype = assert; msg->im_mbz = 0; - msg->im_vif = mrt->mroute_reg_vif_num; + if (assert == IGMPMSG_WRVIFWHOLE) + msg->im_vif = vifi; + else + msg->im_vif = mrt->mroute_reg_vif_num; ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + sizeof(struct iphdr)); @@ -1372,6 +1375,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, struct mr_table *mrt; struct vifctl vif; struct mfcctl mfc; + bool do_wrvifwhole; u32 uval; /* There's one exception to the lock - MRT_DONE which needs to unlock */ @@ -1502,10 +1506,12 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, break; } + do_wrvifwhole = (val == IGMPMSG_WRVIFWHOLE); val = !!val; if (val != mrt->mroute_do_pim) { mrt->mroute_do_pim = val; mrt->mroute_do_assert = val; + mrt->mroute_do_wrvifwhole = do_wrvifwhole; } break; case MRT_TABLE: @@ -1983,6 +1989,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, MFC_ASSERT_THRESH)) { c->_c.mfc_un.res.last_assert = jiffies; ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF); + if (mrt->mroute_do_wrvifwhole) + ipmr_cache_report(mrt, skb, true_vifi, + IGMPMSG_WRVIFWHOLE); } goto dont_forward; } @@ -2659,7 +2668,9 @@ static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb) mrt->mroute_reg_vif_num) || nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT, mrt->mroute_do_assert) || - nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim)) + nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim) || + nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE, + mrt->mroute_do_wrvifwhole)) return false; return true; diff --git 
a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index e6774ccb7731..8d2e5dc9a827 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -98,59 +98,6 @@ int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) } EXPORT_SYMBOL_GPL(nf_ip_reroute); -__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, u_int8_t protocol) -{ - const struct iphdr *iph = ip_hdr(skb); - __sum16 csum = 0; - - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) - break; - if ((protocol == 0 && !csum_fold(skb->csum)) || - !csum_tcpudp_magic(iph->saddr, iph->daddr, - skb->len - dataoff, protocol, - skb->csum)) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - break; - } - /* fall through */ - case CHECKSUM_NONE: - if (protocol == 0) - skb->csum = 0; - else - skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, - skb->len - dataoff, - protocol, 0); - csum = __skb_checksum_complete(skb); - } - return csum; -} -EXPORT_SYMBOL(nf_ip_checksum); - -__sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, unsigned int len, - u_int8_t protocol) -{ - const struct iphdr *iph = ip_hdr(skb); - __sum16 csum = 0; - - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (len == skb->len - dataoff) - return nf_ip_checksum(skb, hook, dataoff, protocol); - /* fall through */ - case CHECKSUM_NONE: - skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol, - skb->len - dataoff, 0); - skb->ip_summed = CHECKSUM_NONE; - return __skb_checksum_complete_head(skb, dataoff + len); - } - return csum; -} -EXPORT_SYMBOL_GPL(nf_ip_checksum_partial); - int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict __always_unused) { diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index bbfc356cb1b5..d9504adc47b3 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -9,22 +9,6 @@ config 
NF_DEFRAG_IPV4 tristate default n -config NF_CONNTRACK_IPV4 - tristate "IPv4 connection tracking support (required for NAT)" - depends on NF_CONNTRACK - default m if NETFILTER_ADVANCED=n - select NF_DEFRAG_IPV4 - ---help--- - Connection tracking keeps a record of what packets have passed - through your machine, in order to figure out how they are related - into connections. - - This is IPv4 support on Layer 3 independent connection tracking. - Layer 3 independent connection tracking is experimental scheme - which generalize ip_conntrack to support other layer 3 protocols. - - To compile it as a module, choose M here. If unsure, say N. - config NF_SOCKET_IPV4 tristate "IPv4 socket lookup support" help @@ -112,7 +96,7 @@ config NF_REJECT_IPV4 config NF_NAT_IPV4 tristate "IPv4 NAT" - depends on NF_CONNTRACK_IPV4 + depends on NF_CONNTRACK default m if NETFILTER_ADVANCED=n select NF_NAT help @@ -279,7 +263,7 @@ config IP_NF_TARGET_SYNPROXY # NAT + specific targets: nf_conntrack config IP_NF_NAT tristate "iptables NAT support" - depends on NF_CONNTRACK_IPV4 + depends on NF_CONNTRACK default m if NETFILTER_ADVANCED=n select NF_NAT select NF_NAT_IPV4 @@ -340,7 +324,7 @@ config IP_NF_MANGLE config IP_NF_TARGET_CLUSTERIP tristate "CLUSTERIP target support" depends on IP_NF_MANGLE - depends on NF_CONNTRACK_IPV4 + depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select NF_CONNTRACK_MARK select NETFILTER_FAMILY_ARP diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 8394c17c269f..367993adf4d3 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -3,12 +3,6 @@ # Makefile for the netfilter modules on top of IPv4. 
# -# objects for l3 independent conntrack -nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o - -# connection tracking -obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o - nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index ca0dad90803a..e77872c93c20 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1898,6 +1898,7 @@ static struct xt_match ipt_builtin_mt[] __read_mostly = { .checkentry = icmp_checkentry, .proto = IPPROTO_ICMP, .family = NFPROTO_IPV4, + .me = THIS_MODULE, }, }; diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c deleted file mode 100644 index 9db988f9a4d7..000000000000 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ /dev/null @@ -1,472 +0,0 @@ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * (C) 2006-2012 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include <linux/types.h> -#include <linux/ip.h> -#include <linux/netfilter.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/icmp.h> -#include <linux/sysctl.h> -#include <net/route.h> -#include <net/ip.h> - -#include <linux/netfilter_ipv4.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_helper.h> -#include <net/netfilter/nf_conntrack_l4proto.h> -#include <net/netfilter/nf_conntrack_l3proto.h> -#include <net/netfilter/nf_conntrack_zones.h> -#include <net/netfilter/nf_conntrack_core.h> -#include <net/netfilter/nf_conntrack_seqadj.h> -#include <net/netfilter/ipv4/nf_conntrack_ipv4.h> -#include <net/netfilter/nf_nat_helper.h> -#include <net/netfilter/ipv4/nf_defrag_ipv4.h> -#include <net/netfilter/nf_log.h> - -static int conntrack4_net_id __read_mostly; -static DEFINE_MUTEX(register_ipv4_hooks); - -struct conntrack4_net { - unsigned int users; -}; - -static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, - struct nf_conntrack_tuple *tuple) -{ - const __be32 *ap; - __be32 _addrs[2]; - ap = skb_header_pointer(skb, nhoff + offsetof(struct iphdr, saddr), - sizeof(u_int32_t) * 2, _addrs); - if (ap == NULL) - return false; - - tuple->src.u3.ip = ap[0]; - tuple->dst.u3.ip = ap[1]; - - return true; -} - -static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - tuple->src.u3.ip = orig->dst.u3.ip; - tuple->dst.u3.ip = orig->src.u3.ip; - - return true; -} - -static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, - unsigned int *dataoff, u_int8_t *protonum) -{ - const struct iphdr *iph; - struct iphdr _iph; - - iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); - if (iph == NULL) - return -NF_ACCEPT; - - /* Conntrack defragments packets, we might still see fragments - * inside ICMP packets though. 
*/ - if (iph->frag_off & htons(IP_OFFSET)) - return -NF_ACCEPT; - - *dataoff = nhoff + (iph->ihl << 2); - *protonum = iph->protocol; - - /* Check bogus IP headers */ - if (*dataoff > skb->len) { - pr_debug("nf_conntrack_ipv4: bogus IPv4 packet: " - "nhoff %u, ihl %u, skblen %u\n", - nhoff, iph->ihl << 2, skb->len); - return -NF_ACCEPT; - } - - return NF_ACCEPT; -} - -static unsigned int ipv4_helper(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - const struct nf_conn_help *help; - const struct nf_conntrack_helper *helper; - - /* This is where we call the helper: as the packet goes out. */ - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return NF_ACCEPT; - - help = nfct_help(ct); - if (!help) - return NF_ACCEPT; - - /* rcu_read_lock()ed by nf_hook_thresh */ - helper = rcu_dereference(help->helper); - if (!helper) - return NF_ACCEPT; - - return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), - ct, ctinfo); -} - -static unsigned int ipv4_confirm(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - goto out; - - /* adjust seqs for loopback traffic only in outgoing direction */ - if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && - !nf_is_loopback_packet(skb)) { - if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) { - NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); - return NF_DROP; - } - } -out: - /* We've seen it coming out the other side: confirm it */ - return nf_conntrack_confirm(skb); -} - -static unsigned int ipv4_conntrack_in(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return nf_conntrack_in(state->net, PF_INET, state->hook, skb); -} - -static unsigned int ipv4_conntrack_local(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - 
if (ip_is_fragment(ip_hdr(skb))) { /* IP_NODEFRAG setsockopt set */ - enum ip_conntrack_info ctinfo; - struct nf_conn *tmpl; - - tmpl = nf_ct_get(skb, &ctinfo); - if (tmpl && nf_ct_is_template(tmpl)) { - /* when skipping ct, clear templates to avoid fooling - * later targets/matches - */ - skb->_nfct = 0; - nf_ct_put(tmpl); - } - return NF_ACCEPT; - } - - return nf_conntrack_in(state->net, PF_INET, state->hook, skb); -} - -/* Connection tracking may drop packets, but never alters them, so - make it the first hook. */ -static const struct nf_hook_ops ipv4_conntrack_ops[] = { - { - .hook = ipv4_conntrack_in, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP_PRI_CONNTRACK, - }, - { - .hook = ipv4_conntrack_local, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP_PRI_CONNTRACK, - }, - { - .hook = ipv4_helper, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_HELPER, - }, - { - .hook = ipv4_confirm, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM, - }, - { - .hook = ipv4_helper, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_HELPER, - }, - { - .hook = ipv4_confirm, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM, - }, -}; - -/* Fast function for those who don't want to parse /proc (and I don't - blame them). */ -/* Reversing the socket's dst/src point of view gives us the reply - mapping. 
*/ -static int -getorigdst(struct sock *sk, int optval, void __user *user, int *len) -{ - const struct inet_sock *inet = inet_sk(sk); - const struct nf_conntrack_tuple_hash *h; - struct nf_conntrack_tuple tuple; - - memset(&tuple, 0, sizeof(tuple)); - - lock_sock(sk); - tuple.src.u3.ip = inet->inet_rcv_saddr; - tuple.src.u.tcp.port = inet->inet_sport; - tuple.dst.u3.ip = inet->inet_daddr; - tuple.dst.u.tcp.port = inet->inet_dport; - tuple.src.l3num = PF_INET; - tuple.dst.protonum = sk->sk_protocol; - release_sock(sk); - - /* We only do TCP and SCTP at the moment: is there a better way? */ - if (tuple.dst.protonum != IPPROTO_TCP && - tuple.dst.protonum != IPPROTO_SCTP) { - pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n"); - return -ENOPROTOOPT; - } - - if ((unsigned int) *len < sizeof(struct sockaddr_in)) { - pr_debug("SO_ORIGINAL_DST: len %d not %zu\n", - *len, sizeof(struct sockaddr_in)); - return -EINVAL; - } - - h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); - if (h) { - struct sockaddr_in sin; - struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - - sin.sin_family = AF_INET; - sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.u.tcp.port; - sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.u3.ip; - memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); - - pr_debug("SO_ORIGINAL_DST: %pI4 %u\n", - &sin.sin_addr.s_addr, ntohs(sin.sin_port)); - nf_ct_put(ct); - if (copy_to_user(user, &sin, sizeof(sin)) != 0) - return -EFAULT; - else - return 0; - } - pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n", - &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port), - &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port)); - return -ENOENT; -} - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - -#include <linux/netfilter/nfnetlink.h> -#include <linux/netfilter/nfnetlink_conntrack.h> - -static int ipv4_tuple_to_nlattr(struct sk_buff *skb, - const struct nf_conntrack_tuple *tuple) -{ - if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) || - 
nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -1; -} - -static const struct nla_policy ipv4_nla_policy[CTA_IP_MAX+1] = { - [CTA_IP_V4_SRC] = { .type = NLA_U32 }, - [CTA_IP_V4_DST] = { .type = NLA_U32 }, -}; - -static int ipv4_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t) -{ - if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST]) - return -EINVAL; - - t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]); - t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]); - - return 0; -} -#endif - -static struct nf_sockopt_ops so_getorigdst = { - .pf = PF_INET, - .get_optmin = SO_ORIGINAL_DST, - .get_optmax = SO_ORIGINAL_DST+1, - .get = getorigdst, - .owner = THIS_MODULE, -}; - -static int ipv4_hooks_register(struct net *net) -{ - struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id); - int err = 0; - - mutex_lock(&register_ipv4_hooks); - - cnet->users++; - if (cnet->users > 1) - goto out_unlock; - - err = nf_defrag_ipv4_enable(net); - if (err) { - cnet->users = 0; - goto out_unlock; - } - - err = nf_register_net_hooks(net, ipv4_conntrack_ops, - ARRAY_SIZE(ipv4_conntrack_ops)); - - if (err) - cnet->users = 0; - out_unlock: - mutex_unlock(&register_ipv4_hooks); - return err; -} - -static void ipv4_hooks_unregister(struct net *net) -{ - struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id); - - mutex_lock(&register_ipv4_hooks); - if (cnet->users && (--cnet->users == 0)) - nf_unregister_net_hooks(net, ipv4_conntrack_ops, - ARRAY_SIZE(ipv4_conntrack_ops)); - mutex_unlock(&register_ipv4_hooks); -} - -const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { - .l3proto = PF_INET, - .pkt_to_tuple = ipv4_pkt_to_tuple, - .invert_tuple = ipv4_invert_tuple, - .get_l4proto = ipv4_get_l4proto, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .tuple_to_nlattr = ipv4_tuple_to_nlattr, - .nlattr_to_tuple = ipv4_nlattr_to_tuple, - .nla_policy = ipv4_nla_policy, - .nla_size = NLA_ALIGN(NLA_HDRLEN +
sizeof(u32)) + /* CTA_IP_V4_SRC */ - NLA_ALIGN(NLA_HDRLEN + sizeof(u32)), /* CTA_IP_V4_DST */ -#endif - .net_ns_get = ipv4_hooks_register, - .net_ns_put = ipv4_hooks_unregister, - .me = THIS_MODULE, -}; - -module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, - &nf_conntrack_htable_size, 0600); - -MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); -MODULE_ALIAS("ip_conntrack"); -MODULE_LICENSE("GPL"); - -static const struct nf_conntrack_l4proto * const builtin_l4proto4[] = { - &nf_conntrack_l4proto_tcp4, - &nf_conntrack_l4proto_udp4, - &nf_conntrack_l4proto_icmp, -#ifdef CONFIG_NF_CT_PROTO_DCCP - &nf_conntrack_l4proto_dccp4, -#endif -#ifdef CONFIG_NF_CT_PROTO_SCTP - &nf_conntrack_l4proto_sctp4, -#endif -#ifdef CONFIG_NF_CT_PROTO_UDPLITE - &nf_conntrack_l4proto_udplite4, -#endif -}; - -static int ipv4_net_init(struct net *net) -{ - return nf_ct_l4proto_pernet_register(net, builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); -} - -static void ipv4_net_exit(struct net *net) -{ - nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); -} - -static struct pernet_operations ipv4_net_ops = { - .init = ipv4_net_init, - .exit = ipv4_net_exit, - .id = &conntrack4_net_id, - .size = sizeof(struct conntrack4_net), -}; - -static int __init nf_conntrack_l3proto_ipv4_init(void) -{ - int ret = 0; - - need_conntrack(); - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - if (WARN_ON(nla_policy_len(ipv4_nla_policy, CTA_IP_MAX + 1) != - nf_conntrack_l3proto_ipv4.nla_size)) - return -EINVAL; -#endif - ret = nf_register_sockopt(&so_getorigdst); - if (ret < 0) { - pr_err("Unable to register netfilter socket option\n"); - return ret; - } - - ret = register_pernet_subsys(&ipv4_net_ops); - if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register pernet ops\n"); - goto cleanup_sockopt; - } - - ret = nf_ct_l4proto_register(builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); - if (ret < 0) - goto cleanup_pernet; - - ret = 
nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4); - if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n"); - goto cleanup_l4proto; - } - - return ret; -cleanup_l4proto: - nf_ct_l4proto_unregister(builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); - cleanup_pernet: - unregister_pernet_subsys(&ipv4_net_ops); - cleanup_sockopt: - nf_unregister_sockopt(&so_getorigdst); - return ret; -} - -static void __exit nf_conntrack_l3proto_ipv4_fini(void) -{ - synchronize_net(); - nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4); - nf_ct_l4proto_unregister(builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); - unregister_pernet_subsys(&ipv4_net_ops); - nf_unregister_sockopt(&so_getorigdst); -} - -module_init(nf_conntrack_l3proto_ipv4_init); -module_exit(nf_conntrack_l3proto_ipv4_fini); diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 4388de0e5380..1e6f28c97d3a 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -35,7 +35,7 @@ static const struct nf_loginfo default_loginfo = { }; /* One level of recursion won't kill us */ -static void dump_ipv4_packet(struct nf_log_buf *m, +static void dump_ipv4_packet(struct net *net, struct nf_log_buf *m, const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int iphoff) { @@ -183,7 +183,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m, /* Max length: 3+maxlen */ if (!iphoff) { /* Only recurse once. 
*/ nf_log_buf_add(m, "["); - dump_ipv4_packet(m, info, skb, + dump_ipv4_packet(net, m, info, skb, iphoff + ih->ihl*4+sizeof(_icmph)); nf_log_buf_add(m, "] "); } @@ -251,7 +251,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m, /* Max length: 15 "UID=4294967295 " */ if ((logflags & NF_LOG_UID) && !iphoff) - nf_log_dump_sk_uid_gid(m, skb->sk); + nf_log_dump_sk_uid_gid(net, m, skb->sk); /* Max length: 16 "MARK=0xFFFFFFFF " */ if (!iphoff && skb->mark) @@ -333,7 +333,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf, if (in != NULL) dump_ipv4_mac_header(m, loginfo, skb); - dump_ipv4_packet(m, loginfo, skb, 0); + dump_ipv4_packet(net, m, loginfo, skb, 0); nf_log_buf_close(m); } diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c index 805e83ec3ad9..164714104965 100644 --- a/net/ipv4/netfilter/nf_tproxy_ipv4.c +++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c @@ -37,7 +37,7 @@ nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb, * to a listener socket if there's one */ struct sock *sk2; - sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, + sk2 = nf_tproxy_get_sock_v4(net, skb, iph->protocol, iph->saddr, laddr ? laddr : iph->daddr, hp->source, lport ? 
lport : hp->dest, skb->dev, NF_TPROXY_LOOKUP_LISTENER); @@ -71,7 +71,7 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) EXPORT_SYMBOL_GPL(nf_tproxy_laddr4); struct sock * -nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, +nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, const u8 protocol, const __be32 saddr, const __be32 daddr, const __be16 sport, const __be16 dport, @@ -79,16 +79,21 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, const enum nf_tproxy_lookup_t lookup_type) { struct sock *sk; - struct tcphdr *tcph; switch (protocol) { - case IPPROTO_TCP: + case IPPROTO_TCP: { + struct tcphdr _hdr, *hp; + + hp = skb_header_pointer(skb, ip_hdrlen(skb), + sizeof(struct tcphdr), &_hdr); + if (hp == NULL) + return NULL; + switch (lookup_type) { case NF_TPROXY_LOOKUP_LISTENER: - tcph = hp; sk = inet_lookup_listener(net, &tcp_hashinfo, skb, ip_hdrlen(skb) + - __tcp_hdrlen(tcph), + __tcp_hdrlen(hp), saddr, sport, daddr, dport, in->ifindex, 0); @@ -110,6 +115,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, BUG(); } break; + } case IPPROTO_UDP: sk = udp4_lib_lookup(net, saddr, sport, daddr, dport, in->ifindex); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 2ed64bca54e3..b54c964ad925 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -739,13 +739,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) /* no remote port */ } - ipc.sockc.tsflags = sk->sk_tsflags; - ipc.addr = inet->inet_saddr; - ipc.opt = NULL; - ipc.oif = sk->sk_bound_dev_if; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; + ipcm_init_sk(&ipc, inet); if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); @@ -769,8 +763,6 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } - sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); - saddr = ipc.addr; ipc.addr = faddr = daddr; diff --git 
a/net/ipv4/proc.c b/net/ipv4/proc.c index 77350c1256ce..b46e4cf9a55a 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -287,6 +287,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED), SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE), SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED), + SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP), + SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index abb3c9490c55..33df4d76db2d 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -381,6 +381,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = sockc->transmit_time; skb_dst_set(skb, &rt->dst); *rtp = NULL; @@ -561,13 +562,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) daddr = inet->inet_daddr; } - ipc.sockc.tsflags = sk->sk_tsflags; - ipc.addr = inet->inet_saddr; - ipc.opt = NULL; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; - ipc.oif = sk->sk_bound_dev_if; + ipcm_init_sk(&ipc, inet); if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); @@ -670,8 +665,6 @@ back_from_confirm: &rt, msg->msg_flags, &ipc.sockc); else { - sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); - if (!ipc.addr) ipc.addr = fl4.daddr; lock_sock(sk); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d06247ba08b2..5fa335fd3852 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -189,8 +189,9 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write, if (write && ret == 0) { low = make_kgid(user_ns, urange[0]); high = make_kgid(user_ns, urange[1]); - if (!gid_valid(low) || !gid_valid(high) || - (urange[1] < urange[0]) || gid_lt(high, low)) { + if (!gid_valid(low) || !gid_valid(high)) + return -EINVAL; + if (urange[1] < urange[0] || 
gid_lt(high, low)) { low = make_kgid(&init_user_ns, 1); high = make_kgid(&init_user_ns, 0); } @@ -265,8 +266,9 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write, ipv4.sysctl_tcp_fastopen); struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; struct tcp_fastopen_context *ctxt; - int ret; u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */ + __le32 key[4]; + int ret, i; tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL); if (!tbl.data) @@ -275,11 +277,14 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write, rcu_read_lock(); ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctxt) - memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); + memcpy(key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); else - memset(user_key, 0, sizeof(user_key)); + memset(key, 0, sizeof(key)); rcu_read_unlock(); + for (i = 0; i < ARRAY_SIZE(key); i++) + user_key[i] = le32_to_cpu(key[i]); + snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x", user_key[0], user_key[1], user_key[2], user_key[3]); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); @@ -290,13 +295,17 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write, ret = -EINVAL; goto bad_key; } - tcp_fastopen_reset_cipher(net, NULL, user_key, + + for (i = 0; i < ARRAY_SIZE(user_key); i++) + key[i] = cpu_to_le32(user_key[i]); + + tcp_fastopen_reset_cipher(net, NULL, key, TCP_FASTOPEN_KEY_LENGTH); } bad_key: pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n", - user_key[0], user_key[1], user_key[2], user_key[3], + user_key[0], user_key[1], user_key[2], user_key[3], (char *)tbl.data, ret); kfree(tbl.data); return ret; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 47c45d5be9f9..514aaac1626f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -494,21 +494,32 @@ static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, } /* - * Socket is not locked. 
We are protected from async events by poll logic and - * correct handling of state changes made by other threads is impossible in - * any case. + * Wait for a TCP event. + * + * Note that we don't need to lock the socket, as the upper poll layers + * take care of normal races (between the test and the event) and we don't + * go look at any of the socket buffers directly. */ -__poll_t tcp_poll_mask(struct socket *sock, __poll_t events) +__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) { + __poll_t mask; struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); - __poll_t mask = 0; int state; + sock_poll_wait(file, sk_sleep(sk), wait); + state = inet_sk_state_load(sk); if (state == TCP_LISTEN) return inet_csk_listen_poll(sk); + /* Socket is not locked. We are protected from async events + * by poll logic and correct handling of state changes + * made by other threads is impossible in any case. + */ + + mask = 0; + /* * EPOLLHUP is certainly not done right. But poll() doesn't * have a notion of HUP in just one direction, and for a @@ -589,7 +600,7 @@ __poll_t tcp_poll_mask(struct socket *sock, __poll_t events) return mask; } -EXPORT_SYMBOL(tcp_poll_mask); +EXPORT_SYMBOL(tcp_poll); int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) { @@ -806,8 +817,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, * This occurs when user tries to read * from never connected socket. */ - if (!sock_flag(sk, SOCK_DONE)) - ret = -ENOTCONN; + ret = -ENOTCONN; break; } if (!timeo) { @@ -1230,7 +1240,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) /* 'common' sending to sendq */ } - sockc.tsflags = sk->sk_tsflags; + sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) { @@ -1264,9 +1274,6 @@ restart: int linear; new_segment: - /* Allocate new segment. If the interface is SG, - * allocate skb fitting to single page. 
- */ if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; @@ -1987,7 +1994,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, * shouldn't happen. */ if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), - "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n", + "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n", *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags)) break; @@ -2002,7 +2009,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; WARN(!(flags & MSG_PEEK), - "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n", + "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n", *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags); } @@ -2031,13 +2038,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, break; if (sk->sk_state == TCP_CLOSE) { - if (!sock_flag(sk, SOCK_DONE)) { - /* This occurs when user tries to read - * from never connected socket. - */ - copied = -ENOTCONN; - break; - } + /* This occurs when user tries to read + * from never connected socket. 
+ */ + copied = -ENOTCONN; break; } @@ -2551,6 +2555,8 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); + tp->copied_seq = tp->rcv_nxt; + tp->urg_data = 0; tcp_write_queue_purge(sk); tcp_fastopen_active_disable_ofo_check(sk); skb_rbtree_purge(&tp->out_of_order_queue); @@ -2811,14 +2817,17 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_REPAIR: if (!tcp_can_repair_sock(sk)) err = -EPERM; - else if (val == 1) { + else if (val == TCP_REPAIR_ON) { tp->repair = 1; sk->sk_reuse = SK_FORCE_REUSE; tp->repair_queue = TCP_NO_QUEUE; - } else if (val == 0) { + } else if (val == TCP_REPAIR_OFF) { tp->repair = 0; sk->sk_reuse = SK_NO_REUSE; tcp_send_window_probe(sk); + } else if (val == TCP_REPAIR_OFF_NO_WP) { + tp->repair = 0; + sk->sk_reuse = SK_NO_REUSE; } else err = -EINVAL; @@ -2980,7 +2989,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (val < 0) err = -EINVAL; else - icsk->icsk_user_timeout = msecs_to_jiffies(val); + icsk->icsk_user_timeout = val; break; case TCP_FASTOPEN: @@ -3436,7 +3445,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_USER_TIMEOUT: - val = jiffies_to_msecs(icsk->icsk_user_timeout); + val = icsk->icsk_user_timeout; break; case TCP_FASTOPEN: @@ -3710,8 +3719,7 @@ int tcp_abort(struct sock *sk, int err) struct request_sock *req = inet_reqsk(sk); local_bh_disable(); - inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, - req); + inet_csk_reqsk_queue_drop(req->rsk_listener, req); local_bh_enable(); return 0; } diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 5f5e5936760e..8b637f9f23a2 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -55,7 +55,6 @@ struct dctcp { u32 dctcp_alpha; u32 next_seq; u32 ce_state; - u32 delayed_ack_reserved; u32 loss_cwnd; }; @@ -96,7 +95,6 @@ static void dctcp_init(struct sock *sk) ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); - 
ca->delayed_ack_reserved = 0; ca->loss_cwnd = 0; ca->ce_state = 0; @@ -131,23 +129,14 @@ static void dctcp_ce_state_0_to_1(struct sock *sk) struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); - /* State has changed from CE=0 to CE=1 and delayed - * ACK has not sent yet. - */ - if (!ca->ce_state && ca->delayed_ack_reserved) { - u32 tmp_rcv_nxt; - - /* Save current rcv_nxt. */ - tmp_rcv_nxt = tp->rcv_nxt; - - /* Generate previous ack with CE=0. */ - tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; - tp->rcv_nxt = ca->prior_rcv_nxt; - - tcp_send_ack(sk); - - /* Recover current rcv_nxt. */ - tp->rcv_nxt = tmp_rcv_nxt; + if (!ca->ce_state) { + /* State has changed from CE=0 to CE=1, force an immediate + * ACK to reflect the new CE state. If an ACK was delayed, + * send that first to reflect the prior CE state. + */ + if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) + __tcp_send_ack(sk, ca->prior_rcv_nxt); + tcp_enter_quickack_mode(sk, 1); } ca->prior_rcv_nxt = tp->rcv_nxt; @@ -161,23 +150,14 @@ static void dctcp_ce_state_1_to_0(struct sock *sk) struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); - /* State has changed from CE=1 to CE=0 and delayed - * ACK has not sent yet. - */ - if (ca->ce_state && ca->delayed_ack_reserved) { - u32 tmp_rcv_nxt; - - /* Save current rcv_nxt. */ - tmp_rcv_nxt = tp->rcv_nxt; - - /* Generate previous ack with CE=1. */ - tp->ecn_flags |= TCP_ECN_DEMAND_CWR; - tp->rcv_nxt = ca->prior_rcv_nxt; - - tcp_send_ack(sk); - - /* Recover current rcv_nxt. */ - tp->rcv_nxt = tmp_rcv_nxt; + if (ca->ce_state) { + /* State has changed from CE=1 to CE=0, force an immediate + * ACK to reflect the new CE state. If an ACK was delayed, + * send that first to reflect the prior CE state. 
+ */ + if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) + __tcp_send_ack(sk, ca->prior_rcv_nxt); + tcp_enter_quickack_mode(sk, 1); } ca->prior_rcv_nxt = tp->rcv_nxt; @@ -248,25 +228,6 @@ static void dctcp_state(struct sock *sk, u8 new_state) } } -static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev) -{ - struct dctcp *ca = inet_csk_ca(sk); - - switch (ev) { - case CA_EVENT_DELAYED_ACK: - if (!ca->delayed_ack_reserved) - ca->delayed_ack_reserved = 1; - break; - case CA_EVENT_NON_DELAYED_ACK: - if (ca->delayed_ack_reserved) - ca->delayed_ack_reserved = 0; - break; - default: - /* Don't care for the rest. */ - break; - } -} - static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) { switch (ev) { @@ -276,10 +237,6 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) case CA_EVENT_ECN_NO_CE: dctcp_ce_state_1_to_0(sk); break; - case CA_EVENT_DELAYED_ACK: - case CA_EVENT_NON_DELAYED_ACK: - dctcp_update_ack_reserved(sk, ev); - break; default: /* Don't care for the rest. 
*/ break; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 76ca88f63b70..d51fa358b2b1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -78,6 +78,7 @@ #include <linux/errqueue.h> #include <trace/events/tcp.h> #include <linux/static_key.h> +#include <net/busy_poll.h> int sysctl_tcp_max_orphans __read_mostly = NR_FILE; @@ -215,7 +216,7 @@ static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks) icsk->icsk_ack.quick = quickacks; } -static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) +void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -223,6 +224,7 @@ static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) icsk->icsk_ack.pingpong = 0; icsk->icsk_ack.ato = TCP_ATO_MIN; } +EXPORT_SYMBOL(tcp_enter_quickack_mode); /* Send ACKs quickly, if "quick" count is not exhausted * and the session is not interactive. @@ -265,7 +267,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) * it is probably a retransmit. 
*/ if (tp->ecn_flags & TCP_ECN_SEEN) - tcp_enter_quickack_mode(sk, 1); + tcp_enter_quickack_mode(sk, 2); break; case INET_ECN_CE: if (tcp_ca_needs_ecn(sk)) @@ -273,7 +275,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { /* Better not delay acks, sender can have a very low cwnd */ - tcp_enter_quickack_mode(sk, 1); + tcp_enter_quickack_mode(sk, 2); tp->ecn_flags |= TCP_ECN_DEMAND_CWR; } tp->ecn_flags |= TCP_ECN_SEEN; @@ -3184,6 +3186,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (tcp_is_reno(tp)) { tcp_remove_reno_sacks(sk, pkts_acked); + + /* If any of the cumulatively ACKed segments was + * retransmitted, non-SACK case cannot confirm that + * progress was due to original transmission due to + * lack of TCPCB_SACKED_ACKED bits even if some of + * the packets may have been never retransmitted. + */ + if (flag & FLAG_RETRANS_DATA_ACKED) + flag &= ~FLAG_ORIG_SACK_ACKED; } else { int delta; @@ -3452,7 +3463,7 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) static void tcp_store_ts_recent(struct tcp_sock *tp) { tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; - tp->rx_opt.ts_recent_stamp = get_seconds(); + tp->rx_opt.ts_recent_stamp = ktime_get_seconds(); } static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) @@ -4333,6 +4344,11 @@ static bool tcp_try_coalesce(struct sock *sk, if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) return false; +#ifdef CONFIG_TLS_DEVICE + if (from->decrypted != to->decrypted) + return false; +#endif + if (!skb_try_coalesce(to, from, fragstolen, &delta)) return false; @@ -4351,6 +4367,23 @@ static bool tcp_try_coalesce(struct sock *sk, return true; } +static bool tcp_ooo_try_coalesce(struct sock *sk, + struct sk_buff *to, + struct sk_buff *from, + bool *fragstolen) +{ + bool res = tcp_try_coalesce(sk, to, from, fragstolen); + + /* In case tcp_drop() is called later, update to->gso_segs */ + if 
(res) { + u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) + + max_t(u16, 1, skb_shinfo(from)->gso_segs); + + skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF); + } + return res; +} + static void tcp_drop(struct sock *sk, struct sk_buff *skb) { sk_drops_add(sk, skb); @@ -4474,8 +4507,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) /* In the typical case, we are adding an skb to the end of the list. * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. */ - if (tcp_try_coalesce(sk, tp->ooo_last_skb, - skb, &fragstolen)) { + if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb, + skb, &fragstolen)) { coalesce_done: tcp_grow_window(sk, skb); kfree_skb_partial(skb, fragstolen); @@ -4503,7 +4536,7 @@ coalesce_done: /* All the bits are present. Drop. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - __kfree_skb(skb); + tcp_drop(sk, skb); skb = NULL; tcp_dsack_set(sk, seq, end_seq); goto add_sack; @@ -4522,11 +4555,11 @@ coalesce_done: TCP_SKB_CB(skb1)->end_seq); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - __kfree_skb(skb1); + tcp_drop(sk, skb1); goto merge_right; } - } else if (tcp_try_coalesce(sk, skb1, - skb, &fragstolen)) { + } else if (tcp_ooo_try_coalesce(sk, skb1, + skb, &fragstolen)) { goto coalesce_done; } p = &parent->rb_right; @@ -4611,8 +4644,10 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) skb->data_len = data_len; skb->len = size; - if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) + if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); goto err_free; + } err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); if (err) @@ -4668,18 +4703,21 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) * Out of sequence packets to the out_of_order_queue. 
*/ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { - if (tcp_receive_window(tp) == 0) + if (tcp_receive_window(tp) == 0) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; + } /* Ok. In sequence. In window. */ queue_and_out: if (skb_queue_len(&sk->sk_receive_queue) == 0) sk_forced_mem_schedule(sk, skb->truesize); - else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) + else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); goto drop; + } eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); - tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); if (skb->len) tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) @@ -4735,8 +4773,10 @@ drop: /* If window is closed, drop tail of packet. But after * remembering D-SACK for its head made in previous line. */ - if (!tcp_receive_window(tp)) + if (!tcp_receive_window(tp)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; + } goto queue_and_out; } @@ -4854,6 +4894,9 @@ restart: break; memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); +#ifdef CONFIG_TLS_DEVICE + nskb->decrypted = skb->decrypted; +#endif TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; if (list) __skb_queue_before(list, skb, nskb); @@ -4881,6 +4924,10 @@ restart: skb == tail || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) goto end; +#ifdef CONFIG_TLS_DEVICE + if (skb->decrypted != nskb->decrypted) + goto end; +#endif } } } @@ -4895,6 +4942,7 @@ end: static void tcp_collapse_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + u32 range_truesize, sum_tiny = 0; struct sk_buff *skb, *head; u32 start, end; @@ -4906,6 +4954,7 @@ new_range: } start = TCP_SKB_CB(skb)->seq; end = TCP_SKB_CB(skb)->end_seq; + range_truesize = skb->truesize; for (head = skb;;) { skb = skb_rb_next(skb); @@ -4916,11 +4965,20 @@ new_range: if (!skb || after(TCP_SKB_CB(skb)->seq, end) || before(TCP_SKB_CB(skb)->end_seq, start)) { - 
tcp_collapse(sk, NULL, &tp->out_of_order_queue, - head, skb, start, end); + /* Do not attempt collapsing tiny skbs */ + if (range_truesize != head->truesize || + end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) { + tcp_collapse(sk, NULL, &tp->out_of_order_queue, + head, skb, start, end); + } else { + sum_tiny += range_truesize; + if (sum_tiny > sk->sk_rcvbuf >> 3) + return; + } goto new_range; } + range_truesize += skb->truesize; if (unlikely(before(TCP_SKB_CB(skb)->seq, start))) start = TCP_SKB_CB(skb)->seq; if (after(TCP_SKB_CB(skb)->end_seq, end)) @@ -4935,6 +4993,7 @@ new_range: * 2) not add too big latencies if thousands of packets sit there. * (But if application shrinks SO_RCVBUF, we could still end up * freeing whole queue here) + * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks. * * Return true if queue has shrunk. */ @@ -4942,20 +5001,26 @@ static bool tcp_prune_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct rb_node *node, *prev; + int goal; if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) return false; NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); + goal = sk->sk_rcvbuf >> 3; node = &tp->ooo_last_skb->rbnode; do { prev = rb_prev(node); rb_erase(node, &tp->out_of_order_queue); + goal -= rb_to_skb(node)->truesize; tcp_drop(sk, rb_to_skb(node)); - sk_mem_reclaim(sk); - if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && - !tcp_under_memory_pressure(sk)) - break; + if (!prev || goal <= 0) { + sk_mem_reclaim(sk); + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && + !tcp_under_memory_pressure(sk)) + break; + goal = sk->sk_rcvbuf >> 3; + } node = prev; } while (node); tp->ooo_last_skb = rb_to_skb(prev); @@ -4990,6 +5055,9 @@ static int tcp_prune_queue(struct sock *sk) else if (tcp_under_memory_pressure(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) + return 0; + tcp_collapse_ofo_queue(sk); if (!skb_queue_empty(&sk->sk_receive_queue)) 
tcp_collapse(sk, &sk->sk_receive_queue, NULL, @@ -5584,6 +5652,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) if (skb) { icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); security_inet_conn_established(sk, skb); + sk_mark_napi_id(sk, skb); } tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); @@ -6412,6 +6481,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->txhash = net_tx_rndhash(); tcp_openreq_init_rwin(req, sk, dst); + sk_rx_queue_set(req_to_sk(req), skb); if (!want_cookie) { tcp_reqsk_record_syn(sk, req, skb); fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bea17f1e8302..9e041fa5c545 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -155,12 +155,26 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) and use initial timestamp retrieved from peer table. */ if (tcptw->tw_ts_recent_stamp && - (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { - tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; - if (tp->write_seq == 0) - tp->write_seq = 1; - tp->rx_opt.ts_recent = tcptw->tw_ts_recent; - tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; + (!twp || (reuse && time_after32(ktime_get_seconds(), + tcptw->tw_ts_recent_stamp)))) { + /* In case of repair and re-using TIME-WAIT sockets we still + * want to be sure that it is safe as above but honor the + * sequence numbers and time stamps set as part of the repair + * process. + * + * Without this check re-using a TIME-WAIT socket with TCP + * repair would accumulate a -1 on the repair assigned + * sequence number. The first time it is reused the sequence + * is -1, the second time -2, etc. This fixes that issue + * without appearing to create any others. 
+ */ + if (likely(!tp->repair)) { + tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; + if (tp->write_seq == 0) + tp->write_seq = 1; + tp->rx_opt.ts_recent = tcptw->tw_ts_recent; + tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; + } sock_hold(sktw); return 1; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1dda1341a223..75ef332a7caf 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -144,7 +144,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, tw->tw_substate = TCP_TIME_WAIT; tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (tmp_opt.saw_tstamp) { - tcptw->tw_ts_recent_stamp = get_seconds(); + tcptw->tw_ts_recent_stamp = ktime_get_seconds(); tcptw->tw_ts_recent = tmp_opt.rcv_tsval; } @@ -189,7 +189,7 @@ kill: if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; - tcptw->tw_ts_recent_stamp = get_seconds(); + tcptw->tw_ts_recent_stamp = ktime_get_seconds(); } inet_twsk_put(tw); @@ -449,119 +449,122 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, struct sk_buff *skb) { struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); + const struct inet_request_sock *ireq = inet_rsk(req); + struct tcp_request_sock *treq = tcp_rsk(req); + struct inet_connection_sock *newicsk; + struct tcp_sock *oldtp, *newtp; - if (newsk) { - const struct inet_request_sock *ireq = inet_rsk(req); - struct tcp_request_sock *treq = tcp_rsk(req); - struct inet_connection_sock *newicsk = inet_csk(newsk); - struct tcp_sock *newtp = tcp_sk(newsk); - struct tcp_sock *oldtp = tcp_sk(sk); - - smc_check_reset_syn_req(oldtp, req, newtp); - - /* Now setup tcp_sock */ - newtp->pred_flags = 0; - - newtp->rcv_wup = newtp->copied_seq = - newtp->rcv_nxt = treq->rcv_isn + 1; - newtp->segs_in = 1; - - newtp->snd_sml = newtp->snd_una = - newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; - - INIT_LIST_HEAD(&newtp->tsq_node); - INIT_LIST_HEAD(&newtp->tsorted_sent_queue); - - tcp_init_wl(newtp, 
treq->rcv_isn); - - newtp->srtt_us = 0; - newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); - minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); - newicsk->icsk_rto = TCP_TIMEOUT_INIT; - newicsk->icsk_ack.lrcvtime = tcp_jiffies32; - - newtp->packets_out = 0; - newtp->retrans_out = 0; - newtp->sacked_out = 0; - newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; - newtp->tlp_high_seq = 0; - newtp->lsndtime = tcp_jiffies32; - newsk->sk_txhash = treq->txhash; - newtp->last_oow_ack_time = 0; - newtp->total_retrans = req->num_retrans; - - /* So many TCP implementations out there (incorrectly) count the - * initial SYN frame in their delayed-ACK and congestion control - * algorithms that we must have the following bandaid to talk - * efficiently to them. -DaveM - */ - newtp->snd_cwnd = TCP_INIT_CWND; - newtp->snd_cwnd_cnt = 0; - - /* There's a bubble in the pipe until at least the first ACK. */ - newtp->app_limited = ~0U; - - tcp_init_xmit_timers(newsk); - newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; - - newtp->rx_opt.saw_tstamp = 0; - - newtp->rx_opt.dsack = 0; - newtp->rx_opt.num_sacks = 0; - - newtp->urg_data = 0; - - if (sock_flag(newsk, SOCK_KEEPOPEN)) - inet_csk_reset_keepalive_timer(newsk, - keepalive_time_when(newtp)); - - newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; - newtp->rx_opt.sack_ok = ireq->sack_ok; - newtp->window_clamp = req->rsk_window_clamp; - newtp->rcv_ssthresh = req->rsk_rcv_wnd; - newtp->rcv_wnd = req->rsk_rcv_wnd; - newtp->rx_opt.wscale_ok = ireq->wscale_ok; - if (newtp->rx_opt.wscale_ok) { - newtp->rx_opt.snd_wscale = ireq->snd_wscale; - newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; - } else { - newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; - newtp->window_clamp = min(newtp->window_clamp, 65535U); - } - newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) << - newtp->rx_opt.snd_wscale); - newtp->max_window = newtp->snd_wnd; - - if (newtp->rx_opt.tstamp_ok) { - newtp->rx_opt.ts_recent = req->ts_recent; - newtp->rx_opt.ts_recent_stamp = 
get_seconds(); - newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; - } else { - newtp->rx_opt.ts_recent_stamp = 0; - newtp->tcp_header_len = sizeof(struct tcphdr); - } - newtp->tsoffset = treq->ts_off; + if (!newsk) + return NULL; + + newicsk = inet_csk(newsk); + newtp = tcp_sk(newsk); + oldtp = tcp_sk(sk); + + smc_check_reset_syn_req(oldtp, req, newtp); + + /* Now setup tcp_sock */ + newtp->pred_flags = 0; + + newtp->rcv_wup = newtp->copied_seq = + newtp->rcv_nxt = treq->rcv_isn + 1; + newtp->segs_in = 1; + + newtp->snd_sml = newtp->snd_una = + newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; + + INIT_LIST_HEAD(&newtp->tsq_node); + INIT_LIST_HEAD(&newtp->tsorted_sent_queue); + + tcp_init_wl(newtp, treq->rcv_isn); + + newtp->srtt_us = 0; + newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); + minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); + newicsk->icsk_rto = TCP_TIMEOUT_INIT; + newicsk->icsk_ack.lrcvtime = tcp_jiffies32; + + newtp->packets_out = 0; + newtp->retrans_out = 0; + newtp->sacked_out = 0; + newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + newtp->tlp_high_seq = 0; + newtp->lsndtime = tcp_jiffies32; + newsk->sk_txhash = treq->txhash; + newtp->last_oow_ack_time = 0; + newtp->total_retrans = req->num_retrans; + + /* So many TCP implementations out there (incorrectly) count the + * initial SYN frame in their delayed-ACK and congestion control + * algorithms that we must have the following bandaid to talk + * efficiently to them. -DaveM + */ + newtp->snd_cwnd = TCP_INIT_CWND; + newtp->snd_cwnd_cnt = 0; + + /* There's a bubble in the pipe until at least the first ACK. 
*/ + newtp->app_limited = ~0U; + + tcp_init_xmit_timers(newsk); + newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; + + newtp->rx_opt.saw_tstamp = 0; + + newtp->rx_opt.dsack = 0; + newtp->rx_opt.num_sacks = 0; + + newtp->urg_data = 0; + + if (sock_flag(newsk, SOCK_KEEPOPEN)) + inet_csk_reset_keepalive_timer(newsk, + keepalive_time_when(newtp)); + + newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; + newtp->rx_opt.sack_ok = ireq->sack_ok; + newtp->window_clamp = req->rsk_window_clamp; + newtp->rcv_ssthresh = req->rsk_rcv_wnd; + newtp->rcv_wnd = req->rsk_rcv_wnd; + newtp->rx_opt.wscale_ok = ireq->wscale_ok; + if (newtp->rx_opt.wscale_ok) { + newtp->rx_opt.snd_wscale = ireq->snd_wscale; + newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; + } else { + newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; + newtp->window_clamp = min(newtp->window_clamp, 65535U); + } + newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale; + newtp->max_window = newtp->snd_wnd; + + if (newtp->rx_opt.tstamp_ok) { + newtp->rx_opt.ts_recent = req->ts_recent; + newtp->rx_opt.ts_recent_stamp = ktime_get_seconds(); + newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + } else { + newtp->rx_opt.ts_recent_stamp = 0; + newtp->tcp_header_len = sizeof(struct tcphdr); + } + newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG - newtp->md5sig_info = NULL; /*XXX*/ - if (newtp->af_specific->md5_lookup(sk, newsk)) - newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; + newtp->md5sig_info = NULL; /*XXX*/ + if (newtp->af_specific->md5_lookup(sk, newsk)) + newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; #endif - if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) - newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; - newtp->rx_opt.mss_clamp = req->mss; - tcp_ecn_openreq_child(newtp, req); - newtp->fastopen_req = NULL; - newtp->fastopen_rsk = NULL; - newtp->syn_data_acked = 0; - newtp->rack.mstamp = 0; - newtp->rack.advanced = 0; - 
newtp->rack.reo_wnd_steps = 1; - newtp->rack.last_delivered = 0; - newtp->rack.reo_wnd_persist = 0; - newtp->rack.dsack_seen = 0; - - __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); - } + if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) + newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; + newtp->rx_opt.mss_clamp = req->mss; + tcp_ecn_openreq_child(newtp, req); + newtp->fastopen_req = NULL; + newtp->fastopen_rsk = NULL; + newtp->syn_data_acked = 0; + newtp->rack.mstamp = 0; + newtp->rack.advanced = 0; + newtp->rack.reo_wnd_steps = 1; + newtp->rack.last_delivered = 0; + newtp->rack.reo_wnd_persist = 0; + newtp->rack.dsack_seen = 0; + + __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); + return newsk; } EXPORT_SYMBOL(tcp_create_openreq_child); @@ -600,7 +603,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * it can be estimated (approximately) * from another data. */ - tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout); + tmp_opt.ts_recent_stamp = ktime_get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout); paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 8cc7c3487330..870b0a335061 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -180,9 +180,9 @@ out: return segs; } -struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) +struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) { - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct sk_buff *p; struct tcphdr *th; struct tcphdr *th2; @@ -220,7 +220,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) len = skb_gro_len(skb); flags = tcp_flag_word(th); - for (; (p = *head); head = &p->next) { + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; @@ -233,7 +233,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct 
sk_buff *skb) goto found; } - + p = NULL; goto out_check_final; found: @@ -262,8 +262,11 @@ found: flush |= (len - 1) >= mss; flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); +#ifdef CONFIG_TLS_DEVICE + flush |= p->decrypted ^ skb->decrypted; +#endif - if (flush || skb_gro_receive(head, skb)) { + if (flush || skb_gro_receive(p, skb)) { mss = 1; goto out_check_final; } @@ -277,7 +280,7 @@ out_check_final: TCP_FLAG_FIN)); if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) - pp = head; + pp = p; out: NAPI_GRO_CB(skb)->flush |= (flush != 0); @@ -302,7 +305,7 @@ int tcp_gro_complete(struct sk_buff *skb) } EXPORT_SYMBOL(tcp_gro_complete); -static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) +static struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb) { /* Don't bother verifying checksum if we're going to flush anyway. */ if (!NAPI_GRO_CB(skb)->flush && diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f8f6129160dd..490df62f26d4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -160,7 +160,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp, } /* Account for an ACK we sent. */ -static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) +static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, + u32 rcv_nxt) { struct tcp_sock *tp = tcp_sk(sk); @@ -171,6 +172,9 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) __sock_put(sk); } + + if (unlikely(rcv_nxt != tp->rcv_nxt)) + return; /* Special ACK sent by DCTCP to reflect ECN */ tcp_dec_quickack_mode(sk, pkts); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } @@ -1009,8 +1013,8 @@ static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb) * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. 
*/ -static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, - gfp_t gfp_mask) +static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, + int clone_it, gfp_t gfp_mask, u32 rcv_nxt) { const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet; @@ -1086,7 +1090,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, th->source = inet->inet_sport; th->dest = inet->inet_dport; th->seq = htonl(tcb->seq); - th->ack_seq = htonl(tp->rcv_nxt); + th->ack_seq = htonl(rcv_nxt); *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->tcp_flags); @@ -1127,7 +1131,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, icsk->icsk_af_ops->send_check(sk, skb); if (likely(tcb->tcp_flags & TCPHDR_ACK)) - tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); + tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt); if (skb->len != tcp_header_size) { tcp_event_data_sent(tp, sk); @@ -1164,6 +1168,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, return err; } +static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, + gfp_t gfp_mask) +{ + return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask, + tcp_sk(sk)->rcv_nxt); +} + /* This routine just queues the buffer for sending. * * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, @@ -3509,8 +3520,6 @@ void tcp_send_delayed_ack(struct sock *sk) int ato = icsk->icsk_ack.ato; unsigned long timeout; - tcp_ca_event(sk, CA_EVENT_DELAYED_ACK); - if (ato > TCP_DELACK_MIN) { const struct tcp_sock *tp = tcp_sk(sk); int max_ato = HZ / 2; @@ -3559,7 +3568,7 @@ void tcp_send_delayed_ack(struct sock *sk) } /* This routine sends an ack and also updates the window. 
*/ -void tcp_send_ack(struct sock *sk) +void __tcp_send_ack(struct sock *sk, u32 rcv_nxt) { struct sk_buff *buff; @@ -3567,8 +3576,6 @@ void tcp_send_ack(struct sock *sk) if (sk->sk_state == TCP_CLOSE) return; - tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK); - /* We are not putting this on the write queue, so * tcp_transmit_skb() will set the ownership to this * sock. @@ -3594,9 +3601,14 @@ void tcp_send_ack(struct sock *sk) skb_set_tcp_pure_ack(buff); /* Send it off, this clears delayed acks for us. */ - tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0); + __tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt); +} +EXPORT_SYMBOL_GPL(__tcp_send_ack); + +void tcp_send_ack(struct sock *sk) +{ + __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt); } -EXPORT_SYMBOL_GPL(tcp_send_ack); /* This routine sends a packet with an out of date sequence * number. It assumes the other end will try to ack it. diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index c61240e43923..4dff40dad4dc 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -146,6 +146,10 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, rs->prior_mstamp); /* ack phase */ rs->interval_us = max(snd_us, ack_us); + /* Record both segment send and ack receive intervals */ + rs->snd_interval_us = snd_us; + rs->rcv_interval_us = ack_us; + /* Normally we expect interval_us >= min-rtt. 
* Note that rate may still be over-estimated when a spuriously * retransmistted skb was first (s)acked because "interval_us" diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 3b3611729928..7fdf222a0bdf 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,6 +22,35 @@ #include <linux/gfp.h> #include <net/tcp.h> +static u32 tcp_retransmit_stamp(const struct sock *sk) +{ + u32 start_ts = tcp_sk(sk)->retrans_stamp; + + if (unlikely(!start_ts)) { + struct sk_buff *head = tcp_rtx_queue_head(sk); + + if (!head) + return 0; + start_ts = tcp_skb_timestamp(head); + } + return start_ts; +} + +static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + u32 elapsed, start_ts; + + start_ts = tcp_retransmit_stamp(sk); + if (!icsk->icsk_user_timeout || !start_ts) + return icsk->icsk_rto; + elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts; + if (elapsed >= icsk->icsk_user_timeout) + return 1; /* user timeout has passed; fire ASAP */ + else + return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(icsk->icsk_user_timeout - elapsed)); +} + /** * tcp_write_err() - close socket and save error info * @sk: The socket the error has appeared on. 
@@ -166,14 +195,9 @@ static bool retransmits_timed_out(struct sock *sk, if (!inet_csk(sk)->icsk_retransmits) return false; - start_ts = tcp_sk(sk)->retrans_stamp; - if (unlikely(!start_ts)) { - struct sk_buff *head = tcp_rtx_queue_head(sk); - - if (!head) - return false; - start_ts = tcp_skb_timestamp(head); - } + start_ts = tcp_retransmit_stamp(sk); + if (!start_ts) + return false; if (likely(timeout == 0)) { linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); @@ -183,8 +207,9 @@ static bool retransmits_timed_out(struct sock *sk, else timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + timeout = jiffies_to_msecs(timeout); } - return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= jiffies_to_msecs(timeout); + return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= timeout; } /* A write timeout has occurred. Process the after effects. */ @@ -337,8 +362,7 @@ static void tcp_probe_timer(struct sock *sk) if (!start_ts) skb->skb_mstamp = tp->tcp_mstamp; else if (icsk->icsk_user_timeout && - (s32)(tcp_time_stamp(tp) - start_ts) > - jiffies_to_msecs(icsk->icsk_user_timeout)) + (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout) goto abort; max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; @@ -535,7 +559,8 @@ out_reset_timer: /* Use normal (exponential) backoff */ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); } - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + tcp_clamp_rto_to_user_timeout(sk), TCP_RTO_MAX); if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0)) __sk_dst_reset(sk); @@ -672,7 +697,7 @@ static void tcp_keepalive_timer (struct timer_list *t) * to determine when to timeout instead. 
*/ if ((icsk->icsk_user_timeout != 0 && - elapsed >= icsk->icsk_user_timeout && + elapsed >= msecs_to_jiffies(icsk->icsk_user_timeout) && icsk->icsk_probes_out > 0) || (icsk->icsk_user_timeout == 0 && icsk->icsk_probes_out >= keepalive_probes(tp))) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 9bb27df4dac5..060e841dde40 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -926,11 +926,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */ return -EOPNOTSUPP; - ipc.opt = NULL; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; - getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; fl4 = &inet->cork.fl.u.ip4; @@ -977,9 +972,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) connected = 1; } - ipc.sockc.tsflags = sk->sk_tsflags; - ipc.addr = inet->inet_saddr; - ipc.oif = sk->sk_bound_dev_if; + ipcm_init_sk(&ipc, inet); ipc.gso_size = up->gso_size; if (msg->msg_controllen) { @@ -1027,8 +1020,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) saddr = ipc.addr; ipc.addr = faddr = daddr; - sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); - if (ipc.opt && ipc.opt->opt.srr) { if (!daddr) { err = -EINVAL; @@ -2591,7 +2582,7 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname, * udp_poll - wait for a UDP event. * @file - file struct * @sock - socket - * @events - events to wait for + * @wait - poll table * * This is same as datagram poll, except for the special case of * blocking sockets. If application is using a blocking fd @@ -2600,23 +2591,23 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname, * but then block when reading it. Add special case code * to work around these arguably broken applications. 
*/ -__poll_t udp_poll_mask(struct socket *sock, __poll_t events) +__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait) { - __poll_t mask = datagram_poll_mask(sock, events); + __poll_t mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; if (!skb_queue_empty(&udp_sk(sk)->reader_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Check for false positives due to checksum errors */ - if ((mask & EPOLLRDNORM) && !(sock->file->f_flags & O_NONBLOCK) && + if ((mask & EPOLLRDNORM) && !(file->f_flags & O_NONBLOCK) && !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1) mask &= ~(EPOLLIN | EPOLLRDNORM); return mask; } -EXPORT_SYMBOL(udp_poll_mask); +EXPORT_SYMBOL(udp_poll); int udp_abort(struct sock *sk, int err) { diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 92dc9e5a7ff3..0c0522b79b43 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -343,10 +343,11 @@ out: return segs; } -struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, - struct udphdr *uh, udp_lookup_t lookup) +struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, + struct udphdr *uh, udp_lookup_t lookup) { - struct sk_buff *p, **pp = NULL; + struct sk_buff *pp = NULL; + struct sk_buff *p; struct udphdr *uh2; unsigned int off = skb_gro_offset(skb); int flush = 1; @@ -371,7 +372,7 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, unflush: flush = 0; - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; @@ -394,13 +395,13 @@ unflush: out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_flush_final(skb, pp, flush); return pp; } EXPORT_SYMBOL(udp_gro_receive); -static struct sk_buff **udp4_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *udp4_gro_receive(struct list_head *head, + struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); 
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 0eff75525da1..613282c65a10 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -15,7 +15,7 @@ menuconfig IPV6 Documentation/networking/ipv6.txt and read the HOWTO at <http://www.tldp.org/HOWTO/Linux+IPv6-HOWTO/> - To compile this protocol support as a module, choose M here: the + To compile this protocol support as a module, choose M here: the module will be called ipv6. if IPV6 @@ -108,6 +108,7 @@ config IPV6_MIP6 config IPV6_ILA tristate "IPv6: Identifier Locator Addressing (ILA)" depends on NETFILTER + select DST_CACHE select LWTUNNEL ---help--- Support for IPv6 Identifier Locator Addressing (ILA). diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c134286d6a41..2fac4ad74867 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -385,8 +385,6 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) if (ndev->cnf.stable_secret.initialized) ndev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; - else - ndev->cnf.addr_gen_mode = ipv6_devconf_dflt.addr_gen_mode; ndev->cnf.mtu6 = dev->mtu; ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); @@ -2374,7 +2372,8 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, continue; if ((rt->fib6_flags & noflags) != 0) continue; - fib6_info_hold(rt); + if (!fib6_info_hold_safe(rt)) + continue; break; } out: @@ -4528,6 +4527,7 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, u32 flags) { struct fib6_info *f6i; + u32 prio; f6i = addrconf_get_prefix_route(&ifp->addr, ifp->prefix_len, @@ -4536,13 +4536,15 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp, if (!f6i) return -ENOENT; - if (f6i->fib6_metric != ifp->rt_priority) { + prio = ifp->rt_priority ? 
: IP6_RT_PRIO_ADDRCONF; + if (f6i->fib6_metric != prio) { + /* delete old one */ + ip6_del_rt(dev_net(ifp->idev->dev), f6i); + /* add new one */ addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->rt_priority, ifp->idev->dev, expires, flags, GFP_KERNEL); - /* delete old one */ - ip6_del_rt(dev_net(ifp->idev->dev), f6i); } else { if (!expires) fib6_clean_expires(f6i); @@ -5207,7 +5209,9 @@ static inline size_t inet6_ifla6_size(void) + nla_total_size(DEVCONF_MAX * 4) /* IFLA_INET6_CONF */ + nla_total_size(IPSTATS_MIB_MAX * 8) /* IFLA_INET6_STATS */ + nla_total_size(ICMP6_MIB_MAX * 8) /* IFLA_INET6_ICMP6STATS */ - + nla_total_size(sizeof(struct in6_addr)); /* IFLA_INET6_TOKEN */ + + nla_total_size(sizeof(struct in6_addr)) /* IFLA_INET6_TOKEN */ + + nla_total_size(1) /* IFLA_INET6_ADDR_GEN_MODE */ + + 0; } static inline size_t inet6_if_nlmsg_size(void) @@ -5889,32 +5893,31 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, loff_t *ppos) { int ret = 0; - int new_val; + u32 new_val; struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1; struct net *net = (struct net *)ctl->extra2; + struct ctl_table tmp = { + .data = &new_val, + .maxlen = sizeof(new_val), + .mode = ctl->mode, + }; if (!rtnl_trylock()) return restart_syscall(); - ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + new_val = *((u32 *)ctl->data); - if (write) { - new_val = *((int *)ctl->data); + ret = proc_douintvec(&tmp, write, buffer, lenp, ppos); + if (ret != 0) + goto out; + if (write) { if (check_addr_gen_mode(new_val) < 0) { ret = -EINVAL; goto out; } - /* request for default */ - if (&net->ipv6.devconf_dflt->addr_gen_mode == ctl->data) { - ipv6_devconf_dflt.addr_gen_mode = new_val; - - /* request for individual net device */ - } else { - if (!idev) - goto out; - + if (idev) { if (check_stable_privacy(idev, net, new_val) < 0) { ret = -EINVAL; goto out; @@ -5924,7 +5927,21 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, 
idev->cnf.addr_gen_mode = new_val; addrconf_dev_config(idev->dev); } + } else if (&net->ipv6.devconf_all->addr_gen_mode == ctl->data) { + struct net_device *dev; + + net->ipv6.devconf_dflt->addr_gen_mode = new_val; + for_each_netdev(net, dev) { + idev = __in6_dev_get(dev); + if (idev && + idev->cnf.addr_gen_mode != new_val) { + idev->cnf.addr_gen_mode = new_val; + addrconf_dev_config(idev->dev); + } + } } + + *((u32 *)ctl->data) = new_val; } out: diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 74f2a261e8df..c9535354149f 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -570,7 +570,7 @@ const struct proto_ops inet6_stream_ops = { .socketpair = sock_no_socketpair, /* a do nothing */ .accept = inet_accept, /* ok */ .getname = inet6_getname, - .poll_mask = tcp_poll_mask, /* ok */ + .poll = tcp_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ .listen = inet_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ @@ -603,7 +603,7 @@ const struct proto_ops inet6_dgram_ops = { .socketpair = sock_no_socketpair, /* a do nothing */ .accept = sock_no_accept, /* a do nothing */ .getname = inet6_getname, - .poll_mask = udp_poll_mask, /* ok */ + .poll = udp_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ .listen = sock_no_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ @@ -764,6 +764,7 @@ EXPORT_SYMBOL_GPL(ipv6_opt_accepted); static struct packet_type ipv6_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IPV6), .func = ipv6_rcv, + .list_func = ipv6_list_rcv, }; static int __init ipv6_packet_init(void) diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index 1323b9679cf7..1c0bb9fb76e6 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -799,8 +799,7 @@ static int calipso_opt_update(struct sock *sk, struct ipv6_opt_hdr *hop) { struct ipv6_txoptions *old = txopt_get(inet6_sk(sk)), *txopts; - txopts = ipv6_renew_options_kern(sk, old, IPV6_HOPOPTS, - hop, hop ? 
ipv6_optlen(hop) : 0); + txopts = ipv6_renew_options(sk, old, IPV6_HOPOPTS, hop); txopt_put(old); if (IS_ERR(txopts)) return PTR_ERR(txopts); @@ -1222,8 +1221,7 @@ static int calipso_req_setattr(struct request_sock *req, if (IS_ERR(new)) return PTR_ERR(new); - txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, - new, new ? ipv6_optlen(new) : 0); + txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new); kfree(new); @@ -1260,8 +1258,7 @@ static void calipso_req_delattr(struct request_sock *req) if (calipso_opt_del(req_inet->ipv6_opt->hopopt, &new)) return; /* Nothing to do */ - txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, - new, new ? ipv6_optlen(new) : 0); + txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new); if (!IS_ERR(txopts)) { txopts = xchg(&req_inet->ipv6_opt, txopts); diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 2ee08b6a86a4..5a094f58fe8a 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -700,13 +700,16 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, } if (np->rxopt.bits.rxorigdstaddr) { struct sockaddr_in6 sin6; - __be16 *ports = (__be16 *) skb_transport_header(skb); + __be16 *ports; + int end; - if (skb_transport_offset(skb) + 4 <= (int)skb->len) { + end = skb_transport_offset(skb) + 4; + if (end <= 0 || pskb_may_pull(skb, end)) { /* All current transport protocols have the port numbers in the * first four bytes of the transport header and this function is * written with this assumption in mind. 
*/ + ports = (__be16 *)skb_transport_header(skb); sin6.sin6_family = AF_INET6; sin6.sin6_addr = ipv6_hdr(skb)->daddr; @@ -736,7 +739,7 @@ EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl); int ip6_datagram_send_ctl(struct net *net, struct sock *sk, struct msghdr *msg, struct flowi6 *fl6, - struct ipcm6_cookie *ipc6, struct sockcm_cookie *sockc) + struct ipcm6_cookie *ipc6) { struct in6_pktinfo *src_info; struct cmsghdr *cmsg; @@ -755,7 +758,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, } if (cmsg->cmsg_level == SOL_SOCKET) { - err = __sock_cmsg_send(sk, msg, cmsg, sockc); + err = __sock_cmsg_send(sk, msg, cmsg, &ipc6->sockc); if (err) return err; continue; diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 96af267835c3..6177e2171171 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -49,8 +49,8 @@ static __u16 esp6_nexthdr_esp_offset(struct ipv6hdr *ipv6_hdr, int nhlen) return 0; } -static struct sk_buff **esp6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *esp6_gro_receive(struct list_head *head, + struct sk_buff *skb) { int offset = skb_gro_offset(skb); struct xfrm_offload *xo; diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 5bc2bf3733ab..20291c2036fc 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -1015,29 +1015,21 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) } EXPORT_SYMBOL_GPL(ipv6_dup_options); -static int ipv6_renew_option(void *ohdr, - struct ipv6_opt_hdr __user *newopt, int newoptlen, - int inherit, - struct ipv6_opt_hdr **hdr, - char **p) +static void ipv6_renew_option(int renewtype, + struct ipv6_opt_hdr **dest, + struct ipv6_opt_hdr *old, + struct ipv6_opt_hdr *new, + int newtype, char **p) { - if (inherit) { - if (ohdr) { - memcpy(*p, ohdr, ipv6_optlen((struct ipv6_opt_hdr *)ohdr)); - *hdr = (struct ipv6_opt_hdr *)*p; - *p += CMSG_ALIGN(ipv6_optlen(*hdr)); - } - } else { - if (newopt) { - if (copy_from_user(*p, newopt, 
newoptlen)) - return -EFAULT; - *hdr = (struct ipv6_opt_hdr *)*p; - if (ipv6_optlen(*hdr) > newoptlen) - return -EINVAL; - *p += CMSG_ALIGN(newoptlen); - } - } - return 0; + struct ipv6_opt_hdr *src; + + src = (renewtype == newtype ? new : old); + if (!src) + return; + + memcpy(*p, src, ipv6_optlen(src)); + *dest = (struct ipv6_opt_hdr *)*p; + *p += CMSG_ALIGN(ipv6_optlen(*dest)); } /** @@ -1063,13 +1055,11 @@ static int ipv6_renew_option(void *ohdr, */ struct ipv6_txoptions * ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, - int newtype, - struct ipv6_opt_hdr __user *newopt, int newoptlen) + int newtype, struct ipv6_opt_hdr *newopt) { int tot_len = 0; char *p; struct ipv6_txoptions *opt2; - int err; if (opt) { if (newtype != IPV6_HOPOPTS && opt->hopopt) @@ -1082,8 +1072,8 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst1opt)); } - if (newopt && newoptlen) - tot_len += CMSG_ALIGN(newoptlen); + if (newopt) + tot_len += CMSG_ALIGN(ipv6_optlen(newopt)); if (!tot_len) return NULL; @@ -1098,29 +1088,19 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, opt2->tot_len = tot_len; p = (char *)(opt2 + 1); - err = ipv6_renew_option(opt ? opt->hopopt : NULL, newopt, newoptlen, - newtype != IPV6_HOPOPTS, - &opt2->hopopt, &p); - if (err) - goto out; - - err = ipv6_renew_option(opt ? opt->dst0opt : NULL, newopt, newoptlen, - newtype != IPV6_RTHDRDSTOPTS, - &opt2->dst0opt, &p); - if (err) - goto out; - - err = ipv6_renew_option(opt ? opt->srcrt : NULL, newopt, newoptlen, - newtype != IPV6_RTHDR, - (struct ipv6_opt_hdr **)&opt2->srcrt, &p); - if (err) - goto out; - - err = ipv6_renew_option(opt ? opt->dst1opt : NULL, newopt, newoptlen, - newtype != IPV6_DSTOPTS, - &opt2->dst1opt, &p); - if (err) - goto out; + ipv6_renew_option(IPV6_HOPOPTS, &opt2->hopopt, + (opt ? opt->hopopt : NULL), + newopt, newtype, &p); + ipv6_renew_option(IPV6_RTHDRDSTOPTS, &opt2->dst0opt, + (opt ? 
opt->dst0opt : NULL), + newopt, newtype, &p); + ipv6_renew_option(IPV6_RTHDR, + (struct ipv6_opt_hdr **)&opt2->srcrt, + (opt ? (struct ipv6_opt_hdr *)opt->srcrt : NULL), + newopt, newtype, &p); + ipv6_renew_option(IPV6_DSTOPTS, &opt2->dst1opt, + (opt ? opt->dst1opt : NULL), + newopt, newtype, &p); opt2->opt_nflen = (opt2->hopopt ? ipv6_optlen(opt2->hopopt) : 0) + (opt2->dst0opt ? ipv6_optlen(opt2->dst0opt) : 0) + @@ -1128,37 +1108,6 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, opt2->opt_flen = (opt2->dst1opt ? ipv6_optlen(opt2->dst1opt) : 0); return opt2; -out: - sock_kfree_s(sk, opt2, opt2->tot_len); - return ERR_PTR(err); -} - -/** - * ipv6_renew_options_kern - replace a specific ext hdr with a new one. - * - * @sk: sock from which to allocate memory - * @opt: original options - * @newtype: option type to replace in @opt - * @newopt: new option of type @newtype to replace (kernel-mem) - * @newoptlen: length of @newopt - * - * See ipv6_renew_options(). The difference is that @newopt is - * kernel memory, rather than user memory. - */ -struct ipv6_txoptions * -ipv6_renew_options_kern(struct sock *sk, struct ipv6_txoptions *opt, - int newtype, struct ipv6_opt_hdr *newopt, - int newoptlen) -{ - struct ipv6_txoptions *ret_val; - const mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret_val = ipv6_renew_options(sk, opt, newtype, - (struct ipv6_opt_hdr __user *)newopt, - newoptlen); - set_fs(old_fs); - return ret_val; } struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index be491bf6ab6e..00d159d431dc 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -402,9 +402,10 @@ static int icmp6_iif(const struct sk_buff *skb) /* for local traffic to local address, skb dev is the loopback * device. Check if there is a dst attached to the skb and if so - * get the real device index. + * get the real device index. 
Same is needed for replies to a link + * local address on a device enslaved to an L3 master device */ - if (unlikely(iif == LOOPBACK_IFINDEX)) { + if (unlikely(iif == LOOPBACK_IFINDEX || netif_is_l3_master(skb->dev))) { const struct rt6_info *rt6 = skb_rt6_info(skb); if (rt6) @@ -430,7 +431,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, struct icmp6hdr tmp_hdr; struct flowi6 fl6; struct icmpv6_msg msg; - struct sockcm_cookie sockc_unused = {0}; struct ipcm6_cookie ipc6; int iif = 0; int addr_type = 0; @@ -545,7 +545,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - ipc6.tclass = np->tclass; + ipcm6_init_sk(&ipc6, np); fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = icmpv6_route_lookup(net, skb, sk, &fl6); @@ -553,8 +553,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, goto out; ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - ipc6.dontfrag = np->dontfrag; - ipc6.opt = NULL; msg.skb = skb; msg.offset = skb_network_offset(skb); @@ -575,7 +573,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), &ipc6, &fl6, (struct rt6_info *)dst, - MSG_DONTWAIT, &sockc_unused)) { + MSG_DONTWAIT)) { ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { @@ -679,7 +677,6 @@ static void icmpv6_echo_reply(struct sk_buff *skb) struct dst_entry *dst; struct ipcm6_cookie ipc6; u32 mark = IP6_REPLY_MARK(net, skb->mark); - struct sockcm_cookie sockc_unused = {0}; saddr = &ipv6_hdr(skb)->daddr; @@ -726,16 +723,14 @@ static void icmpv6_echo_reply(struct sk_buff *skb) msg.offset = 0; msg.type = ICMPV6_ECHO_REPLY; + ipcm6_init_sk(&ipc6, np); ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb)); - ipc6.dontfrag = np->dontfrag; - ipc6.opt = NULL; if (ip6_append_data(sk, 
icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), &ipc6, &fl6, - (struct rt6_info *)dst, MSG_DONTWAIT, - &sockc_unused)) { + (struct rt6_info *)dst, MSG_DONTWAIT)) { __ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { diff --git a/net/ipv6/ila/Makefile b/net/ipv6/ila/Makefile index 4b32e5921e5c..b7739aba6e68 100644 --- a/net/ipv6/ila/Makefile +++ b/net/ipv6/ila/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_IPV6_ILA) += ila.o -ila-objs := ila_common.o ila_lwt.o ila_xlat.o +ila-objs := ila_main.o ila_common.o ila_lwt.o ila_xlat.o diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h index 3c7a11b62334..1f747bcbec29 100644 --- a/net/ipv6/ila/ila.h +++ b/net/ipv6/ila/ila.h @@ -19,6 +19,7 @@ #include <linux/skbuff.h> #include <linux/types.h> #include <net/checksum.h> +#include <net/genetlink.h> #include <net/ip.h> #include <net/protocol.h> #include <uapi/linux/ila.h> @@ -104,9 +105,31 @@ void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p, void ila_init_saved_csum(struct ila_params *p); +struct ila_net { + struct { + struct rhashtable rhash_table; + spinlock_t *locks; /* Bucket locks for entry manipulation */ + unsigned int locks_mask; + bool hooks_registered; + } xlat; +}; + int ila_lwt_init(void); void ila_lwt_fini(void); -int ila_xlat_init(void); -void ila_xlat_fini(void); + +int ila_xlat_init_net(struct net *net); +void ila_xlat_exit_net(struct net *net); + +int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info); +int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info); +int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info); +int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info); +int ila_xlat_nl_dump_start(struct netlink_callback *cb); +int ila_xlat_nl_dump_done(struct netlink_callback *cb); +int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb); + +extern unsigned int ila_net_id; + 
+extern struct genl_family ila_nl_family; #endif /* __ILA_H */ diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index 8c88ecf29b93..95e9146918cc 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -153,34 +153,3 @@ void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p, /* Now change destination address */ iaddr->loc = p->locator; } - -static int __init ila_init(void) -{ - int ret; - - ret = ila_lwt_init(); - - if (ret) - goto fail_lwt; - - ret = ila_xlat_init(); - if (ret) - goto fail_xlat; - - return 0; -fail_xlat: - ila_lwt_fini(); -fail_lwt: - return ret; -} - -static void __exit ila_fini(void) -{ - ila_xlat_fini(); - ila_lwt_fini(); -} - -module_init(ila_init); -module_exit(ila_fini); -MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>"); -MODULE_LICENSE("GPL"); diff --git a/net/ipv6/ila/ila_main.c b/net/ipv6/ila/ila_main.c new file mode 100644 index 000000000000..18fac76b9520 --- /dev/null +++ b/net/ipv6/ila/ila_main.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <net/genetlink.h> +#include <net/ila.h> +#include <net/netns/generic.h> +#include <uapi/linux/genetlink.h> +#include "ila.h" + +static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { + [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, + [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, }, + [ILA_ATTR_IFINDEX] = { .type = NLA_U32, }, + [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, + [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, }, +}; + +static const struct genl_ops ila_nl_ops[] = { + { + .cmd = ILA_CMD_ADD, + .doit = ila_xlat_nl_cmd_add_mapping, + .policy = ila_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = ILA_CMD_DEL, + .doit = ila_xlat_nl_cmd_del_mapping, + .policy = ila_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = ILA_CMD_FLUSH, + .doit = ila_xlat_nl_cmd_flush, + .policy = ila_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = ILA_CMD_GET, + .doit = ila_xlat_nl_cmd_get_mapping, + .start = 
ila_xlat_nl_dump_start, + .dumpit = ila_xlat_nl_dump, + .done = ila_xlat_nl_dump_done, + .policy = ila_nl_policy, + }, +}; + +unsigned int ila_net_id; + +struct genl_family ila_nl_family __ro_after_init = { + .hdrsize = 0, + .name = ILA_GENL_NAME, + .version = ILA_GENL_VERSION, + .maxattr = ILA_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .ops = ila_nl_ops, + .n_ops = ARRAY_SIZE(ila_nl_ops), +}; + +static __net_init int ila_init_net(struct net *net) +{ + int err; + + err = ila_xlat_init_net(net); + if (err) + goto ila_xlat_init_fail; + + return 0; + +ila_xlat_init_fail: + return err; +} + +static __net_exit void ila_exit_net(struct net *net) +{ + ila_xlat_exit_net(net); +} + +static struct pernet_operations ila_net_ops = { + .init = ila_init_net, + .exit = ila_exit_net, + .id = &ila_net_id, + .size = sizeof(struct ila_net), +}; + +static int __init ila_init(void) +{ + int ret; + + ret = register_pernet_device(&ila_net_ops); + if (ret) + goto register_device_fail; + + ret = genl_register_family(&ila_nl_family); + if (ret) + goto register_family_fail; + + ret = ila_lwt_init(); + if (ret) + goto fail_lwt; + + return 0; + +fail_lwt: + genl_unregister_family(&ila_nl_family); +register_family_fail: + unregister_pernet_device(&ila_net_ops); +register_device_fail: + return ret; +} + +static void __exit ila_fini(void) +{ + ila_lwt_fini(); + genl_unregister_family(&ila_nl_family); + unregister_pernet_device(&ila_net_ops); +} + +module_init(ila_init); +module_exit(ila_fini); +MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>"); +MODULE_LICENSE("GPL"); diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 10ae13560b40..17c455ff69ff 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -22,36 +22,14 @@ struct ila_map { struct rcu_head rcu; }; -static unsigned int ila_net_id; - -struct ila_net { - struct rhashtable rhash_table; - spinlock_t *locks; /* Bucket locks for entry manipulation */ - unsigned int 
locks_mask; - bool hooks_registered; -}; - +#define MAX_LOCKS 1024 #define LOCKS_PER_CPU 10 static int alloc_ila_locks(struct ila_net *ilan) { - unsigned int i, size; - unsigned int nr_pcpus = num_possible_cpus(); - - nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL); - size = roundup_pow_of_two(nr_pcpus * LOCKS_PER_CPU); - - if (sizeof(spinlock_t) != 0) { - ilan->locks = kvmalloc_array(size, sizeof(spinlock_t), - GFP_KERNEL); - if (!ilan->locks) - return -ENOMEM; - for (i = 0; i < size; i++) - spin_lock_init(&ilan->locks[i]); - } - ilan->locks_mask = size - 1; - - return 0; + return alloc_bucket_spinlocks(&ilan->xlat.locks, &ilan->xlat.locks_mask, + MAX_LOCKS, LOCKS_PER_CPU, + GFP_KERNEL); } static u32 hashrnd __read_mostly; @@ -71,7 +49,7 @@ static inline u32 ila_locator_hash(struct ila_locator loc) static inline spinlock_t *ila_get_lock(struct ila_net *ilan, struct ila_locator loc) { - return &ilan->locks[ila_locator_hash(loc) & ilan->locks_mask]; + return &ilan->xlat.locks[ila_locator_hash(loc) & ilan->xlat.locks_mask]; } static inline int ila_cmp_wildcards(struct ila_map *ila, @@ -115,16 +93,6 @@ static const struct rhashtable_params rht_params = { .obj_cmpfn = ila_cmpfn, }; -static struct genl_family ila_nl_family; - -static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { - [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, - [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, }, - [ILA_ATTR_IFINDEX] = { .type = NLA_U32, }, - [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, - [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, }, -}; - static int parse_nl_config(struct genl_info *info, struct ila_xlat_params *xp) { @@ -162,7 +130,7 @@ static inline struct ila_map *ila_lookup_wildcards(struct ila_addr *iaddr, { struct ila_map *ila; - ila = rhashtable_lookup_fast(&ilan->rhash_table, &iaddr->loc, + ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &iaddr->loc, rht_params); while (ila) { if (!ila_cmp_wildcards(ila, iaddr, ifindex)) @@ -179,7 +147,7 @@ static inline struct 
ila_map *ila_lookup_by_params(struct ila_xlat_params *xp, { struct ila_map *ila; - ila = rhashtable_lookup_fast(&ilan->rhash_table, + ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &xp->ip.locator_match, rht_params); while (ila) { @@ -196,9 +164,9 @@ static inline void ila_release(struct ila_map *ila) kfree_rcu(ila, rcu); } -static void ila_free_cb(void *ptr, void *arg) +static void ila_free_node(struct ila_map *ila) { - struct ila_map *ila = (struct ila_map *)ptr, *next; + struct ila_map *next; /* Assume rcu_readlock held */ while (ila) { @@ -208,6 +176,11 @@ static void ila_free_cb(void *ptr, void *arg) } } +static void ila_free_cb(void *ptr, void *arg) +{ + ila_free_node((struct ila_map *)ptr); +} + static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila); static unsigned int @@ -235,7 +208,7 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match); int err = 0, order; - if (!ilan->hooks_registered) { + if (!ilan->xlat.hooks_registered) { /* We defer registering net hooks in the namespace until the * first mapping is added. 
*/ @@ -244,7 +217,7 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) if (err) return err; - ilan->hooks_registered = true; + ilan->xlat.hooks_registered = true; } ila = kzalloc(sizeof(*ila), GFP_KERNEL); @@ -259,12 +232,12 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) spin_lock(lock); - head = rhashtable_lookup_fast(&ilan->rhash_table, + head = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &xp->ip.locator_match, rht_params); if (!head) { /* New entry for the rhash_table */ - err = rhashtable_lookup_insert_fast(&ilan->rhash_table, + err = rhashtable_lookup_insert_fast(&ilan->xlat.rhash_table, &ila->node, rht_params); } else { struct ila_map *tila = head, *prev = NULL; @@ -290,7 +263,7 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) } else { /* Make this ila new head */ RCU_INIT_POINTER(ila->next, head); - err = rhashtable_replace_fast(&ilan->rhash_table, + err = rhashtable_replace_fast(&ilan->xlat.rhash_table, &head->node, &ila->node, rht_params); if (err) @@ -316,7 +289,7 @@ static int ila_del_mapping(struct net *net, struct ila_xlat_params *xp) spin_lock(lock); - head = rhashtable_lookup_fast(&ilan->rhash_table, + head = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &xp->ip.locator_match, rht_params); ila = head; @@ -346,15 +319,15 @@ static int ila_del_mapping(struct net *net, struct ila_xlat_params *xp) * table */ err = rhashtable_replace_fast( - &ilan->rhash_table, &ila->node, + &ilan->xlat.rhash_table, &ila->node, &head->node, rht_params); if (err) goto out; } else { /* Entry no longer used */ - err = rhashtable_remove_fast(&ilan->rhash_table, - &ila->node, - rht_params); + err = rhashtable_remove_fast( + &ilan->xlat.rhash_table, + &ila->node, rht_params); } } @@ -369,7 +342,7 @@ out: return err; } -static int ila_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info) +int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = 
genl_info_net(info); struct ila_xlat_params p; @@ -382,7 +355,7 @@ static int ila_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info) return ila_add_mapping(net, &p); } -static int ila_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info) +int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_xlat_params xp; @@ -397,6 +370,59 @@ static int ila_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info) return 0; } +static inline spinlock_t *lock_from_ila_map(struct ila_net *ilan, + struct ila_map *ila) +{ + return ila_get_lock(ilan, ila->xp.ip.locator_match); +} + +int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct ila_net *ilan = net_generic(net, ila_net_id); + struct rhashtable_iter iter; + struct ila_map *ila; + spinlock_t *lock; + int ret; + + ret = rhashtable_walk_init(&ilan->xlat.rhash_table, &iter, GFP_KERNEL); + if (ret) + goto done; + + rhashtable_walk_start(&iter); + + for (;;) { + ila = rhashtable_walk_next(&iter); + + if (IS_ERR(ila)) { + if (PTR_ERR(ila) == -EAGAIN) + continue; + ret = PTR_ERR(ila); + goto done; + } else if (!ila) { + break; + } + + lock = lock_from_ila_map(ilan, ila); + + spin_lock(lock); + + ret = rhashtable_remove_fast(&ilan->xlat.rhash_table, + &ila->node, rht_params); + if (!ret) + ila_free_node(ila); + + spin_unlock(lock); + + if (ret) + break; + } + +done: + rhashtable_walk_stop(&iter); + return ret; +} + static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg) { if (nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR, @@ -434,7 +460,7 @@ nla_put_failure: return -EMSGSIZE; } -static int ila_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info) +int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_net *ilan = net_generic(net, ila_net_id); @@ -475,27 +501,34 @@ out_free: struct 
ila_dump_iter { struct rhashtable_iter rhiter; + int skip; }; -static int ila_nl_dump_start(struct netlink_callback *cb) +int ila_xlat_nl_dump_start(struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct ila_net *ilan = net_generic(net, ila_net_id); - struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0]; + struct ila_dump_iter *iter; + int ret; - if (!iter) { - iter = kmalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; - cb->args[0] = (long)iter; + ret = rhashtable_walk_init(&ilan->xlat.rhash_table, &iter->rhiter, + GFP_KERNEL); + if (ret) { + kfree(iter); + return ret; } - return rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter, - GFP_KERNEL); + iter->skip = 0; + cb->args[0] = (long)iter; + + return ret; } -static int ila_nl_dump_done(struct netlink_callback *cb) +int ila_xlat_nl_dump_done(struct netlink_callback *cb) { struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0]; @@ -506,24 +539,49 @@ static int ila_nl_dump_done(struct netlink_callback *cb) return 0; } -static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) +int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0]; struct rhashtable_iter *rhiter = &iter->rhiter; + int skip = iter->skip; struct ila_map *ila; int ret; rhashtable_walk_start(rhiter); - for (;;) { - ila = rhashtable_walk_next(rhiter); + /* Get first entry */ + ila = rhashtable_walk_peek(rhiter); + + if (ila && !IS_ERR(ila) && skip) { + /* Skip over visited entries */ + + while (ila && skip) { + /* Skip over any ila entries in this list that we + * have already dumped. 
+ */ + ila = rcu_access_pointer(ila->next); + skip--; + } + } + skip = 0; + + for (;;) { if (IS_ERR(ila)) { - if (PTR_ERR(ila) == -EAGAIN) - continue; ret = PTR_ERR(ila); - goto done; + if (ret == -EAGAIN) { + /* Table has changed and iter has reset. Return + * -EAGAIN to the application even if we have + * written data to the skb. The application + * needs to deal with this. + */ + + goto out_ret; + } else { + break; + } } else if (!ila) { + ret = 0; break; } @@ -532,90 +590,54 @@ static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->nlh->nlmsg_seq, NLM_F_MULTI, skb, ILA_CMD_GET); if (ret) - goto done; + goto out; + skip++; ila = rcu_access_pointer(ila->next); } + + skip = 0; + ila = rhashtable_walk_next(rhiter); } - ret = skb->len; +out: + iter->skip = skip; + ret = (skb->len ? : ret); -done: +out_ret: rhashtable_walk_stop(rhiter); return ret; } -static const struct genl_ops ila_nl_ops[] = { - { - .cmd = ILA_CMD_ADD, - .doit = ila_nl_cmd_add_mapping, - .policy = ila_nl_policy, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = ILA_CMD_DEL, - .doit = ila_nl_cmd_del_mapping, - .policy = ila_nl_policy, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = ILA_CMD_GET, - .doit = ila_nl_cmd_get_mapping, - .start = ila_nl_dump_start, - .dumpit = ila_nl_dump, - .done = ila_nl_dump_done, - .policy = ila_nl_policy, - }, -}; - -static struct genl_family ila_nl_family __ro_after_init = { - .hdrsize = 0, - .name = ILA_GENL_NAME, - .version = ILA_GENL_VERSION, - .maxattr = ILA_ATTR_MAX, - .netnsok = true, - .parallel_ops = true, - .module = THIS_MODULE, - .ops = ila_nl_ops, - .n_ops = ARRAY_SIZE(ila_nl_ops), -}; - #define ILA_HASH_TABLE_SIZE 1024 -static __net_init int ila_init_net(struct net *net) +int ila_xlat_init_net(struct net *net) { - int err; struct ila_net *ilan = net_generic(net, ila_net_id); + int err; err = alloc_ila_locks(ilan); if (err) return err; - rhashtable_init(&ilan->rhash_table, &rht_params); + rhashtable_init(&ilan->xlat.rhash_table, 
&rht_params); return 0; } -static __net_exit void ila_exit_net(struct net *net) +void ila_xlat_exit_net(struct net *net) { struct ila_net *ilan = net_generic(net, ila_net_id); - rhashtable_free_and_destroy(&ilan->rhash_table, ila_free_cb, NULL); + rhashtable_free_and_destroy(&ilan->xlat.rhash_table, ila_free_cb, NULL); - kvfree(ilan->locks); + free_bucket_spinlocks(ilan->xlat.locks); - if (ilan->hooks_registered) + if (ilan->xlat.hooks_registered) nf_unregister_net_hooks(net, ila_nf_hook_ops, ARRAY_SIZE(ila_nf_hook_ops)); } -static struct pernet_operations ila_net_ops = { - .init = ila_init_net, - .exit = ila_exit_net, - .id = &ila_net_id, - .size = sizeof(struct ila_net), -}; - static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) { struct ila_map *ila; @@ -641,29 +663,3 @@ static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) return 0; } - -int __init ila_xlat_init(void) -{ - int ret; - - ret = register_pernet_device(&ila_net_ops); - if (ret) - goto exit; - - ret = genl_register_family(&ila_nl_family); - if (ret < 0) - goto unregister; - - return 0; - -unregister: - unregister_pernet_device(&ila_net_ops); -exit: - return ret; -} - -void ila_xlat_fini(void) -{ - genl_unregister_family(&ila_nl_family); - unregister_pernet_device(&ila_net_ops); -} diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 1fb2f3118d60..d212738e9d10 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -935,20 +935,19 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); - enum fib_event_type event = FIB_EVENT_ENTRY_ADD; - struct fib6_info *iter = NULL, *match = NULL; + struct fib6_info *iter = NULL; struct fib6_info __rcu **ins; + struct fib6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); - int append = (info->nlh && - (info->nlh->nlmsg_flags & NLM_F_APPEND)); int add = (!info->nlh 
|| (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; + bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); u16 nlflags = NLM_F_EXCL; int err; - if (append) + if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND)) nlflags |= NLM_F_APPEND; ins = &fn->leaf; @@ -970,8 +969,13 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, nlflags &= ~NLM_F_EXCL; if (replace) { - found++; - break; + if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { + found++; + break; + } + if (rt_can_ecmp) + fallback_ins = fallback_ins ?: ins; + goto next_iter; } if (rt6_duplicate_nexthop(iter, rt)) { @@ -986,51 +990,71 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } - - /* first route that matches */ - if (!match) - match = iter; + /* If we have the same destination and the same metric, + * but not the same gateway, then the route we try to + * add is sibling to this route, increment our counter + * of siblings, and later we will add our route to the + * list. + * Only static routes (which don't have flag + * RTF_EXPIRES) are used for ECMPv6. + * + * To avoid long list, we only had siblings if the + * route have a gateway. + */ + if (rt_can_ecmp && + rt6_qualify_for_ecmp(iter)) + rt->fib6_nsiblings++; } if (iter->fib6_metric > rt->fib6_metric) break; +next_iter: ins = &iter->fib6_next; } + if (fallback_ins && !found) { + /* No ECMP-able route found, replace first non-ECMP one */ + ins = fallback_ins; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->fib6_table->tb6_lock)); + found++; + } + /* Reset round-robin state, if necessary */ if (ins == &fn->leaf) fn->rr_ptr = NULL; /* Link this route to others same route. 
*/ - if (append && match) { + if (rt->fib6_nsiblings) { + unsigned int fib6_nsiblings; struct fib6_info *sibling, *temp_sibling; - if (rt->fib6_flags & RTF_REJECT) { - NL_SET_ERR_MSG(extack, - "Can not append a REJECT route"); - return -EINVAL; - } else if (match->fib6_flags & RTF_REJECT) { - NL_SET_ERR_MSG(extack, - "Can not append to a REJECT route"); - return -EINVAL; + /* Find the first route that have the same metric */ + sibling = leaf; + while (sibling) { + if (sibling->fib6_metric == rt->fib6_metric && + rt6_qualify_for_ecmp(sibling)) { + list_add_tail(&rt->fib6_siblings, + &sibling->fib6_siblings); + break; + } + sibling = rcu_dereference_protected(sibling->fib6_next, + lockdep_is_held(&rt->fib6_table->tb6_lock)); } - event = FIB_EVENT_ENTRY_APPEND; - rt->fib6_nsiblings = match->fib6_nsiblings; - list_add_tail(&rt->fib6_siblings, &match->fib6_siblings); - match->fib6_nsiblings++; - /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings * is broken! 
*/ + fib6_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, - &match->fib6_siblings, fib6_siblings) { + &rt->fib6_siblings, fib6_siblings) { sibling->fib6_nsiblings++; - BUG_ON(sibling->fib6_nsiblings != match->fib6_nsiblings); + BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); + fib6_nsiblings++; } - - rt6_multipath_rebalance(match); + BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); + rt6_multipath_rebalance(temp_sibling); } /* @@ -1043,8 +1067,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, add: nlflags |= NLM_F_CREATE; - err = call_fib6_entry_notifiers(info->nl_net, event, rt, - extack); + err = call_fib6_entry_notifiers(info->nl_net, + FIB_EVENT_ENTRY_ADD, + rt, extack); if (err) return err; @@ -1062,7 +1087,7 @@ add: } } else { - struct fib6_info *tmp; + int nsiblings; if (!found) { if (add) @@ -1077,57 +1102,48 @@ add: if (err) return err; - /* if route being replaced has siblings, set tmp to - * last one, otherwise tmp is current route. 
this is - * used to set fib6_next for new route - */ - if (iter->fib6_nsiblings) - tmp = list_last_entry(&iter->fib6_siblings, - struct fib6_info, - fib6_siblings); - else - tmp = iter; - - /* insert new route */ atomic_inc(&rt->fib6_ref); rcu_assign_pointer(rt->fib6_node, fn); - rt->fib6_next = tmp->fib6_next; + rt->fib6_next = iter->fib6_next; rcu_assign_pointer(*ins, rt); - if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } + nsiblings = iter->fib6_nsiblings; + iter->fib6_node = NULL; + fib6_purge_rt(iter, fn, info->nl_net); + if (rcu_access_pointer(fn->rr_ptr) == iter) + fn->rr_ptr = NULL; + fib6_info_release(iter); - /* delete old route */ - rt = iter; - - if (rt->fib6_nsiblings) { - struct fib6_info *tmp; - + if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ - list_for_each_entry_safe(iter, tmp, &rt->fib6_siblings, - fib6_siblings) { - iter->fib6_node = NULL; - fib6_purge_rt(iter, fn, info->nl_net); - if (rcu_access_pointer(fn->rr_ptr) == iter) - fn->rr_ptr = NULL; - fib6_info_release(iter); - - rt->fib6_nsiblings--; - info->nl_net->ipv6.rt6_stats->fib_rt_entries--; + ins = &rt->fib6_next; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->fib6_table->tb6_lock)); + while (iter) { + if (iter->fib6_metric > rt->fib6_metric) + break; + if (rt6_qualify_for_ecmp(iter)) { + *ins = iter->fib6_next; + iter->fib6_node = NULL; + fib6_purge_rt(iter, fn, info->nl_net); + if (rcu_access_pointer(fn->rr_ptr) == iter) + fn->rr_ptr = NULL; + fib6_info_release(iter); + nsiblings--; + info->nl_net->ipv6.rt6_stats->fib_rt_entries--; + } else { + ins = &iter->fib6_next; + } + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->fib6_table->tb6_lock)); } + WARN_ON(nsiblings != 0); } - - WARN_ON(rt->fib6_nsiblings != 0); - - rt->fib6_node = NULL; - fib6_purge_rt(rt, fn, info->nl_net); - if 
(rcu_access_pointer(fn->rr_ptr) == rt) - fn->rr_ptr = NULL; - fib6_info_release(rt); } return 0; diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 3eee7637bdfe..cb54a8a3c273 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -373,7 +373,6 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, if (olen > 0) { struct msghdr msg; struct flowi6 flowi6; - struct sockcm_cookie sockc_junk; struct ipcm6_cookie ipc6; err = -ENOMEM; @@ -392,7 +391,7 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, memset(&flowi6, 0, sizeof(flowi6)); ipc6.opt = fl->opt; - err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6, &sockc_junk); + err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6); if (err) goto done; err = -EINVAL; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index c8cf2fdbb13b..fc7dd3a04360 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -927,7 +927,6 @@ tx_err: static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { - struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct ip6_tnl *t = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); struct net_device_stats *stats; @@ -990,6 +989,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); dsfield = key->tos; + if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) + goto tx_err; md = ip_tunnel_info_opts(tun_info); if (!md) goto tx_err; @@ -1010,6 +1011,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, goto tx_err; } } else { + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + switch (skb->protocol) { case htons(ETH_P_IP): memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index f08d34491ece..6242682be876 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -47,17 +47,11 @@ #include <net/inet_ecn.h> #include <net/dst_metadata.h> -int 
ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +static void ip6_rcv_finish_core(struct net *net, struct sock *sk, + struct sk_buff *skb) { void (*edemux)(struct sk_buff *skb); - /* if ingress device is enslaved to an L3 master device pass the - * skb to its handler for processing - */ - skb = l3mdev_ip6_rcv(skb); - if (!skb) - return NET_RX_SUCCESS; - if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { const struct inet6_protocol *ipprot; @@ -67,20 +61,73 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) } if (!skb_valid_dst(skb)) ip6_route_input(skb); +} + +int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + /* if ingress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip6_rcv(skb); + if (!skb) + return NET_RX_SUCCESS; + ip6_rcv_finish_core(net, sk, skb); return dst_input(skb); } -int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +static void ip6_sublist_rcv_finish(struct list_head *head) +{ + struct sk_buff *skb, *next; + + list_for_each_entry_safe(skb, next, head, list) + dst_input(skb); +} + +static void ip6_list_rcv_finish(struct net *net, struct sock *sk, + struct list_head *head) +{ + struct dst_entry *curr_dst = NULL; + struct sk_buff *skb, *next; + struct list_head sublist; + + INIT_LIST_HEAD(&sublist); + list_for_each_entry_safe(skb, next, head, list) { + struct dst_entry *dst; + + list_del(&skb->list); + /* if ingress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip6_rcv(skb); + if (!skb) + continue; + ip6_rcv_finish_core(net, sk, skb); + dst = skb_dst(skb); + if (curr_dst != dst) { + /* dispatch old sublist */ + if (!list_empty(&sublist)) + ip6_sublist_rcv_finish(&sublist); + /* start new sublist */ + INIT_LIST_HEAD(&sublist); + curr_dst = dst; + } + 
list_add_tail(&skb->list, &sublist); + } + /* dispatch final sublist */ + ip6_sublist_rcv_finish(&sublist); +} + +static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, + struct net *net) { const struct ipv6hdr *hdr; u32 pkt_len; struct inet6_dev *idev; - struct net *net = dev_net(skb->dev); if (skb->pkt_type == PACKET_OTHERHOST) { kfree_skb(skb); - return NET_RX_DROP; + return NULL; } rcu_read_lock(); @@ -196,7 +243,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (ipv6_parse_hopopts(skb) < 0) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); rcu_read_unlock(); - return NET_RX_DROP; + return NULL; } } @@ -205,15 +252,67 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt /* Must drop socket now because of tproxy. */ skb_orphan(skb); - return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, - net, NULL, skb, dev, NULL, - ip6_rcv_finish); + return skb; err: __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); drop: rcu_read_unlock(); kfree_skb(skb); - return NET_RX_DROP; + return NULL; +} + +int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +{ + struct net *net = dev_net(skb->dev); + + skb = ip6_rcv_core(skb, dev, net); + if (skb == NULL) + return NET_RX_DROP; + return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, + net, NULL, skb, dev, NULL, + ip6_rcv_finish); +} + +static void ip6_sublist_rcv(struct list_head *head, struct net_device *dev, + struct net *net) +{ + NF_HOOK_LIST(NFPROTO_IPV6, NF_INET_PRE_ROUTING, net, NULL, + head, dev, NULL, ip6_rcv_finish); + ip6_list_rcv_finish(net, NULL, head); +} + +/* Receive a list of IPv6 packets */ +void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, + struct net_device *orig_dev) +{ + struct net_device *curr_dev = NULL; + struct net *curr_net = NULL; + struct sk_buff *skb, *next; + struct list_head sublist; + + INIT_LIST_HEAD(&sublist); + 
list_for_each_entry_safe(skb, next, head, list) { + struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); + + list_del(&skb->list); + skb = ip6_rcv_core(skb, dev, net); + if (skb == NULL) + continue; + + if (curr_dev != dev || curr_net != net) { + /* dispatch old sublist */ + if (!list_empty(&sublist)) + ip6_sublist_rcv(&sublist, curr_dev, curr_net); + /* start new sublist */ + INIT_LIST_HEAD(&sublist); + curr_dev = dev; + curr_net = net; + } + list_add_tail(&skb->list, &sublist); + } + /* dispatch final sublist */ + ip6_sublist_rcv(&sublist, curr_dev, curr_net); } /* diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 5b3f2f89ef41..37ff4805b20c 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -163,11 +163,11 @@ static int ipv6_exthdrs_len(struct ipv6hdr *iph, return len; } -static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *ipv6_gro_receive(struct list_head *head, + struct sk_buff *skb) { const struct net_offload *ops; - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct sk_buff *p; struct ipv6hdr *iph; unsigned int nlen; @@ -214,7 +214,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, flush--; nlen = skb_network_header_len(skb); - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { const struct ipv6hdr *iph2; __be32 first_word; /* <Version:4><Traffic_Class:8><Flow_Label:20> */ @@ -263,8 +263,8 @@ out: return pp; } -static struct sk_buff **sit_ip6ip6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *sit_ip6ip6_gro_receive(struct list_head *head, + struct sk_buff *skb) { /* Common GRO receive for SIT and IP6IP6 */ @@ -278,8 +278,8 @@ static struct sk_buff **sit_ip6ip6_gro_receive(struct sk_buff **head, return ipv6_gro_receive(head, skb); } -static struct sk_buff **ip4ip6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff 
*ip4ip6_gro_receive(struct list_head *head, + struct sk_buff *skb) { /* Common GRO receive for SIT and IP6IP6 */ diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index a14fb4fcdf18..16f200f06500 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -570,6 +570,8 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->dev = from->dev; to->mark = from->mark; + skb_copy_hash(to, from); + #ifdef CONFIG_NET_SCHED to->tc_index = from->tc_index; #endif @@ -1219,13 +1221,16 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, if (mtu < IPV6_MIN_MTU) return -EINVAL; cork->base.fragsize = mtu; - cork->base.gso_size = sk->sk_type == SOCK_DGRAM && - sk->sk_protocol == IPPROTO_UDP ? ipc6->gso_size : 0; + cork->base.gso_size = ipc6->gso_size; + cork->base.tx_flags = 0; + sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags); if (dst_allfrag(xfrm_dst_path(&rt->dst))) cork->base.flags |= IPCORK_ALLFRAG; cork->base.length = 0; + cork->base.transmit_time = ipc6->sockc.transmit_time; + return 0; } @@ -1238,8 +1243,7 @@ static int __ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, - unsigned int flags, struct ipcm6_cookie *ipc6, - const struct sockcm_cookie *sockc) + unsigned int flags, struct ipcm6_cookie *ipc6) { struct sk_buff *skb, *skb_prev = NULL; unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; @@ -1249,7 +1253,6 @@ static int __ip6_append_data(struct sock *sk, int copy; int err; int offset = 0; - __u8 tx_flags = 0; u32 tskey = 0; struct rt6_info *rt = (struct rt6_info *)cork->dst; struct ipv6_txoptions *opt = v6_cork->opt; @@ -1268,6 +1271,10 @@ static int __ip6_append_data(struct sock *sk, mtu = cork->gso_size ? 
IP6_MAX_MTU : cork->fragsize; orig_mtu = mtu; + if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && + sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + tskey = sk->sk_tskey++; + hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + @@ -1317,13 +1324,6 @@ emsgsize: rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) csummode = CHECKSUM_PARTIAL; - if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) { - sock_tx_timestamp(sk, sockc->tsflags, &tx_flags); - if (tx_flags & SKBTX_ANY_SW_TSTAMP && - sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) - tskey = sk->sk_tskey++; - } - /* * Let's try using as much space as possible. * Use MTU if total length of the message fits into the MTU. @@ -1442,8 +1442,8 @@ alloc_new_skb: dst_exthdrlen); /* Only the initial fragment is time stamped */ - skb_shinfo(skb)->tx_flags = tx_flags; - tx_flags = 0; + skb_shinfo(skb)->tx_flags = cork->tx_flags; + cork->tx_flags = 0; skb_shinfo(skb)->tskey = tskey; tskey = 0; @@ -1560,8 +1560,7 @@ int ip6_append_data(struct sock *sk, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, - struct rt6_info *rt, unsigned int flags, - const struct sockcm_cookie *sockc) + struct rt6_info *rt, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); @@ -1589,7 +1588,7 @@ int ip6_append_data(struct sock *sk, return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base, &np->cork, sk_page_frag(sk), getfrag, - from, length, transhdrlen, flags, ipc6, sockc); + from, length, transhdrlen, flags, ipc6); } EXPORT_SYMBOL_GPL(ip6_append_data); @@ -1673,6 +1672,8 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = cork->base.transmit_time; + skb_dst_set(skb, dst_clone(&rt->dst)); IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); if (proto == IPPROTO_ICMPV6) { @@ -1747,8 
+1748,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, void *from, int length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags, - struct inet_cork_full *cork, - const struct sockcm_cookie *sockc) + struct inet_cork_full *cork) { struct inet6_cork v6_cork; struct sk_buff_head queue; @@ -1776,7 +1776,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork, &current->task_frag, getfrag, from, length + exthdrlen, transhdrlen + exthdrlen, - flags, ipc6, sockc); + flags, ipc6); if (err) { __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork); return ERR_PTR(err); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 4d780c7f0130..c0cac9cc3a28 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -398,6 +398,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, case IPV6_DSTOPTS: { struct ipv6_txoptions *opt; + struct ipv6_opt_hdr *new = NULL; + + /* hop-by-hop / destination options are privileged option */ + retv = -EPERM; + if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW)) + break; /* remove any sticky options header with a zero option * length, per RFC3542. 
@@ -409,17 +415,22 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, else if (optlen < sizeof(struct ipv6_opt_hdr) || optlen & 0x7 || optlen > 8 * 255) goto e_inval; - - /* hop-by-hop / destination options are privileged option */ - retv = -EPERM; - if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW)) - break; + else { + new = memdup_user(optval, optlen); + if (IS_ERR(new)) { + retv = PTR_ERR(new); + break; + } + if (unlikely(ipv6_optlen(new) > optlen)) { + kfree(new); + goto e_inval; + } + } opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); - opt = ipv6_renew_options(sk, opt, optname, - (struct ipv6_opt_hdr __user *)optval, - optlen); + opt = ipv6_renew_options(sk, opt, optname, new); + kfree(new); if (IS_ERR(opt)) { retv = PTR_ERR(opt); break; @@ -489,7 +500,6 @@ sticky_done: struct ipv6_txoptions *opt = NULL; struct msghdr msg; struct flowi6 fl6; - struct sockcm_cookie sockc_junk; struct ipcm6_cookie ipc6; memset(&fl6, 0, sizeof(fl6)); @@ -522,7 +532,7 @@ sticky_done: msg.msg_control = (void *)(opt+1); ipc6.opt = opt; - retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6, &sockc_junk); + retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6); if (retv) goto done; update: @@ -718,8 +728,9 @@ done: struct sockaddr_in6 *psin6; psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; - retv = ipv6_sock_mc_join(sk, greqs.gsr_interface, - &psin6->sin6_addr); + retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface, + &psin6->sin6_addr, + MCAST_INCLUDE); /* prior join w/ different source is ok */ if (retv && retv != -EADDRINUSE) break; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 975021df7c1c..4ae54aaca373 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -95,6 +95,8 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, int delta); static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, struct inet6_dev *idev); +static int __ipv6_dev_mc_inc(struct 
net_device *dev, + const struct in6_addr *addr, unsigned int mode); #define MLD_QRV_DEFAULT 2 /* RFC3810, 9.2. Query Interval */ @@ -132,7 +134,8 @@ static int unsolicited_report_interval(struct inet6_dev *idev) return iv > 0 ? iv : 1; } -int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) +static int __ipv6_sock_mc_join(struct sock *sk, int ifindex, + const struct in6_addr *addr, unsigned int mode) { struct net_device *dev = NULL; struct ipv6_mc_socklist *mc_lst; @@ -179,7 +182,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) } mc_lst->ifindex = dev->ifindex; - mc_lst->sfmode = MCAST_EXCLUDE; + mc_lst->sfmode = mode; rwlock_init(&mc_lst->sflock); mc_lst->sflist = NULL; @@ -187,7 +190,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) * now add/increase the group membership on the device */ - err = ipv6_dev_mc_inc(dev, addr); + err = __ipv6_dev_mc_inc(dev, addr, mode); if (err) { sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); @@ -199,8 +202,19 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) return 0; } + +int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) +{ + return __ipv6_sock_mc_join(sk, ifindex, addr, MCAST_EXCLUDE); +} EXPORT_SYMBOL(ipv6_sock_mc_join); +int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex, + const struct in6_addr *addr, unsigned int mode) +{ + return __ipv6_sock_mc_join(sk, ifindex, addr, mode); +} + /* * socket leave on multicast group */ @@ -672,7 +686,13 @@ static void igmp6_group_added(struct ifmcaddr6 *mc) } /* else v2 */ - mc->mca_crcount = mc->idev->mc_qrv; + /* Based on RFC3810 6.1, for newly added INCLUDE SSM, we + * should not send filter-mode change record as the mode + * should be from IN() to IN(A). 
+ */ + if (mc->mca_sfmode == MCAST_EXCLUDE) + mc->mca_crcount = mc->idev->mc_qrv; + mld_ifc_event(mc->idev); } @@ -770,13 +790,13 @@ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) spin_lock_bh(&im->mca_lock); if (pmc) { im->idev = pmc->idev; - im->mca_crcount = idev->mc_qrv; - im->mca_sfmode = pmc->mca_sfmode; - if (pmc->mca_sfmode == MCAST_INCLUDE) { + if (im->mca_sfmode == MCAST_INCLUDE) { im->mca_tomb = pmc->mca_tomb; im->mca_sources = pmc->mca_sources; for (psf = im->mca_sources; psf; psf = psf->sf_next) - psf->sf_crcount = im->mca_crcount; + psf->sf_crcount = idev->mc_qrv; + } else { + im->mca_crcount = idev->mc_qrv; } in6_dev_put(pmc->idev); kfree(pmc); @@ -831,7 +851,8 @@ static void ma_put(struct ifmcaddr6 *mc) } static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, - const struct in6_addr *addr) + const struct in6_addr *addr, + unsigned int mode) { struct ifmcaddr6 *mc; @@ -849,9 +870,8 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, refcount_set(&mc->mca_refcnt, 1); spin_lock_init(&mc->mca_lock); - /* initial mode is (EX, empty) */ - mc->mca_sfmode = MCAST_EXCLUDE; - mc->mca_sfcount[MCAST_EXCLUDE] = 1; + mc->mca_sfmode = mode; + mc->mca_sfcount[mode] = 1; if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) || IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) @@ -863,7 +883,8 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, /* * device multicast group inc (add if not found) */ -int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) +static int __ipv6_dev_mc_inc(struct net_device *dev, + const struct in6_addr *addr, unsigned int mode) { struct ifmcaddr6 *mc; struct inet6_dev *idev; @@ -887,14 +908,13 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) if (ipv6_addr_equal(&mc->mca_addr, addr)) { mc->mca_users++; write_unlock_bh(&idev->lock); - ip6_mc_add_src(idev, &mc->mca_addr, MCAST_EXCLUDE, 0, - NULL, 0); + ip6_mc_add_src(idev, &mc->mca_addr, mode, 0, 
NULL, 0); in6_dev_put(idev); return 0; } } - mc = mca_alloc(idev, addr); + mc = mca_alloc(idev, addr, mode); if (!mc) { write_unlock_bh(&idev->lock); in6_dev_put(idev); @@ -916,6 +936,11 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) return 0; } +int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) +{ + return __ipv6_dev_mc_inc(dev, addr, MCAST_EXCLUDE); +} + /* * device multicast group del */ @@ -1751,7 +1776,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, psf_next = psf->sf_next; - if (!is_in(pmc, psf, type, gdeleted, sdeleted)) { + if (!is_in(pmc, psf, type, gdeleted, sdeleted) && !crsend) { psf_prev = psf; continue; } @@ -2066,7 +2091,7 @@ static void mld_send_initial_cr(struct inet6_dev *idev) if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_CHANGE_TO_EXCLUDE; else - type = MLD2_CHANGE_TO_INCLUDE; + type = MLD2_ALLOW_NEW_SOURCES; skb = add_grec(skb, pmc, type, 0, 0, 1); spin_unlock_bh(&pmc->mca_lock); } @@ -2082,7 +2107,8 @@ void ipv6_mc_dad_complete(struct inet6_dev *idev) mld_send_initial_cr(idev); idev->mc_dad_count--; if (idev->mc_dad_count) - mld_dad_start_timer(idev, idev->mc_maxdelay); + mld_dad_start_timer(idev, + unsolicited_report_interval(idev)); } } @@ -2094,7 +2120,8 @@ static void mld_dad_timer_expire(struct timer_list *t) if (idev->mc_dad_count) { idev->mc_dad_count--; if (idev->mc_dad_count) - mld_dad_start_timer(idev, idev->mc_maxdelay); + mld_dad_start_timer(idev, + unsolicited_report_interval(idev)); } in6_dev_put(idev); } @@ -2452,7 +2479,8 @@ static void mld_ifc_timer_expire(struct timer_list *t) if (idev->mc_ifc_count) { idev->mc_ifc_count--; if (idev->mc_ifc_count) - mld_ifc_start_timer(idev, idev->mc_maxdelay); + mld_ifc_start_timer(idev, + unsolicited_report_interval(idev)); } in6_dev_put(idev); } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index e640d2f3c55c..0ec273997d1d 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -811,7 +811,7 @@ 
static void ndisc_recv_ns(struct sk_buff *skb) return; } } - if (ndopts.nd_opts_nonce) + if (ndopts.nd_opts_nonce && ndopts.nd_opts_nonce->nd_opt_len == 1) memcpy(&nonce, (u8 *)(ndopts.nd_opts_nonce + 1), 6); inc = ipv6_addr_is_multicast(daddr); diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 531d6957af36..5ae8e1c51079 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -15,7 +15,6 @@ #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/xfrm.h> -#include <net/ip6_checksum.h> #include <net/netfilter/nf_queue.h> int ip6_route_me_harder(struct net *net, struct sk_buff *skb) @@ -106,71 +105,10 @@ static int nf_ip6_route(struct net *net, struct dst_entry **dst, return err; } -__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, u_int8_t protocol) -{ - const struct ipv6hdr *ip6h = ipv6_hdr(skb); - __sum16 csum = 0; - - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) - break; - if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, - skb->len - dataoff, protocol, - csum_sub(skb->csum, - skb_checksum(skb, 0, - dataoff, 0)))) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - break; - } - /* fall through */ - case CHECKSUM_NONE: - skb->csum = ~csum_unfold( - csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, - skb->len - dataoff, - protocol, - csum_sub(0, - skb_checksum(skb, 0, - dataoff, 0)))); - csum = __skb_checksum_complete(skb); - } - return csum; -} -EXPORT_SYMBOL(nf_ip6_checksum); - -static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, unsigned int len, - u_int8_t protocol) -{ - const struct ipv6hdr *ip6h = ipv6_hdr(skb); - __wsum hsum; - __sum16 csum = 0; - - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (len == skb->len - dataoff) - return nf_ip6_checksum(skb, hook, dataoff, protocol); - /* fall through */ - case CHECKSUM_NONE: - hsum = skb_checksum(skb, 0, dataoff, 0); - skb->csum = 
~csum_unfold(csum_ipv6_magic(&ip6h->saddr, - &ip6h->daddr, - skb->len - dataoff, - protocol, - csum_sub(0, hsum))); - skb->ip_summed = CHECKSUM_NONE; - return __skb_checksum_complete_head(skb, dataoff + len); - } - return csum; -}; - static const struct nf_ipv6_ops ipv6ops = { .chk_addr = ipv6_chk_addr, .route_input = ip6_route_input, .fragment = ip6_fragment, - .checksum = nf_ip6_checksum, - .checksum_partial = nf_ip6_checksum_partial, .route = nf_ip6_route, .reroute = nf_ip6_reroute, }; diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 37b14dc9d863..339d0762b027 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -5,26 +5,6 @@ menu "IPv6: Netfilter Configuration" depends on INET && IPV6 && NETFILTER -config NF_DEFRAG_IPV6 - tristate - default n - -config NF_CONNTRACK_IPV6 - tristate "IPv6 connection tracking support" - depends on INET && IPV6 && NF_CONNTRACK - default m if NETFILTER_ADVANCED=n - select NF_DEFRAG_IPV6 - ---help--- - Connection tracking keeps a record of what packets have passed - through your machine, in order to figure out how they are related - into connections. - - This is IPv6 support on Layer 3 independent connection tracking. - Layer 3 independent connection tracking is experimental scheme - which generalize ip_conntrack to support other layer 3 protocols. - - To compile it as a module, choose M here. If unsure, say N. 
- config NF_SOCKET_IPV6 tristate "IPv6 socket lookup support" help @@ -128,7 +108,7 @@ config NF_LOG_IPV6 config NF_NAT_IPV6 tristate "IPv6 NAT" - depends on NF_CONNTRACK_IPV6 + depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select NF_NAT help @@ -328,7 +308,7 @@ config IP6_NF_SECURITY config IP6_NF_NAT tristate "ip6tables NAT support" - depends on NF_CONNTRACK_IPV6 + depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select NF_NAT select NF_NAT_IPV6 @@ -365,6 +345,7 @@ config IP6_NF_TARGET_NPT endif # IP6_NF_NAT endif # IP6_NF_IPTABLES - endmenu +config NF_DEFRAG_IPV6 + tristate diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 10a5a1c87320..200c0c235565 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -11,12 +11,6 @@ obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o obj-$(CONFIG_IP6_NF_NAT) += ip6table_nat.o -# objects for l3 independent conntrack -nf_conntrack_ipv6-y := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o - -# l3 independent conntrack -obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o - nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o nf_nat_ipv6-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 7eab959734bc..daf2e9e9193d 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1909,6 +1909,7 @@ static struct xt_match ip6t_builtin_mt[] __read_mostly = { .checkentry = icmp6_checkentry, .proto = IPPROTO_ICMPV6, .family = NFPROTO_IPV6, + .me = THIS_MODULE, }, }; diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c deleted file mode 100644 index 663827ee3cf8..000000000000 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ /dev/null @@ -1,460 +0,0 @@ -/* - * Copyright (C)2004 USAGI/WIDE Project 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Author: - * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> - */ - -#include <linux/types.h> -#include <linux/ipv6.h> -#include <linux/in6.h> -#include <linux/netfilter.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/icmp.h> -#include <net/ipv6.h> -#include <net/inet_frag.h> - -#include <linux/netfilter_bridge.h> -#include <linux/netfilter_ipv6.h> -#include <linux/netfilter_ipv6/ip6_tables.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_helper.h> -#include <net/netfilter/nf_conntrack_l4proto.h> -#include <net/netfilter/nf_conntrack_l3proto.h> -#include <net/netfilter/nf_conntrack_core.h> -#include <net/netfilter/nf_conntrack_zones.h> -#include <net/netfilter/nf_conntrack_seqadj.h> -#include <net/netfilter/ipv6/nf_conntrack_ipv6.h> -#include <net/netfilter/nf_nat_helper.h> -#include <net/netfilter/ipv6/nf_defrag_ipv6.h> -#include <net/netfilter/nf_log.h> - -static int conntrack6_net_id; -static DEFINE_MUTEX(register_ipv6_hooks); - -struct conntrack6_net { - unsigned int users; -}; - -static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, - struct nf_conntrack_tuple *tuple) -{ - const u_int32_t *ap; - u_int32_t _addrs[8]; - - ap = skb_header_pointer(skb, nhoff + offsetof(struct ipv6hdr, saddr), - sizeof(_addrs), _addrs); - if (ap == NULL) - return false; - - memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6)); - memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6)); - - return true; -} - -static bool ipv6_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - memcpy(tuple->src.u3.ip6, orig->dst.u3.ip6, sizeof(tuple->src.u3.ip6)); - memcpy(tuple->dst.u3.ip6, orig->src.u3.ip6, sizeof(tuple->dst.u3.ip6)); - - return true; -} - 
-static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, - unsigned int *dataoff, u_int8_t *protonum) -{ - unsigned int extoff = nhoff + sizeof(struct ipv6hdr); - __be16 frag_off; - int protoff; - u8 nexthdr; - - if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr), - &nexthdr, sizeof(nexthdr)) != 0) { - pr_debug("ip6_conntrack_core: can't get nexthdr\n"); - return -NF_ACCEPT; - } - protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off); - /* - * (protoff == skb->len) means the packet has not data, just - * IPv6 and possibly extensions headers, but it is tracked anyway - */ - if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { - pr_debug("ip6_conntrack_core: can't find proto in pkt\n"); - return -NF_ACCEPT; - } - - *dataoff = protoff; - *protonum = nexthdr; - return NF_ACCEPT; -} - -static unsigned int ipv6_helper(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_conn *ct; - const struct nf_conn_help *help; - const struct nf_conntrack_helper *helper; - enum ip_conntrack_info ctinfo; - __be16 frag_off; - int protoff; - u8 nexthdr; - - /* This is where we call the helper: as the packet goes out. 
*/ - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return NF_ACCEPT; - - help = nfct_help(ct); - if (!help) - return NF_ACCEPT; - /* rcu_read_lock()ed by nf_hook_thresh */ - helper = rcu_dereference(help->helper); - if (!helper) - return NF_ACCEPT; - - nexthdr = ipv6_hdr(skb)->nexthdr; - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, - &frag_off); - if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { - pr_debug("proto header not found\n"); - return NF_ACCEPT; - } - - return helper->help(skb, protoff, ct, ctinfo); -} - -static unsigned int ipv6_confirm(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - unsigned char pnum = ipv6_hdr(skb)->nexthdr; - int protoff; - __be16 frag_off; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - goto out; - - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, - &frag_off); - if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { - pr_debug("proto header not found\n"); - goto out; - } - - /* adjust seqs for loopback traffic only in outgoing direction */ - if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && - !nf_is_loopback_packet(skb)) { - if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { - NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); - return NF_DROP; - } - } -out: - /* We've seen it coming out the other side: confirm it */ - return nf_conntrack_confirm(skb); -} - -static unsigned int ipv6_conntrack_in(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); -} - -static unsigned int ipv6_conntrack_local(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); -} - -static const struct nf_hook_ops ipv6_conntrack_ops[] = { - { - .hook = ipv6_conntrack_in, - .pf = NFPROTO_IPV6, - .hooknum = 
NF_INET_PRE_ROUTING, - .priority = NF_IP6_PRI_CONNTRACK, - }, - { - .hook = ipv6_conntrack_local, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP6_PRI_CONNTRACK, - }, - { - .hook = ipv6_helper, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP6_PRI_CONNTRACK_HELPER, - }, - { - .hook = ipv6_confirm, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP6_PRI_LAST, - }, - { - .hook = ipv6_helper, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP6_PRI_CONNTRACK_HELPER, - }, - { - .hook = ipv6_confirm, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP6_PRI_LAST-1, - }, -}; - -static int -ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len) -{ - struct nf_conntrack_tuple tuple = { .src.l3num = NFPROTO_IPV6 }; - const struct ipv6_pinfo *inet6 = inet6_sk(sk); - const struct inet_sock *inet = inet_sk(sk); - const struct nf_conntrack_tuple_hash *h; - struct sockaddr_in6 sin6; - struct nf_conn *ct; - __be32 flow_label; - int bound_dev_if; - - lock_sock(sk); - tuple.src.u3.in6 = sk->sk_v6_rcv_saddr; - tuple.src.u.tcp.port = inet->inet_sport; - tuple.dst.u3.in6 = sk->sk_v6_daddr; - tuple.dst.u.tcp.port = inet->inet_dport; - tuple.dst.protonum = sk->sk_protocol; - bound_dev_if = sk->sk_bound_dev_if; - flow_label = inet6->flow_label; - release_sock(sk); - - if (tuple.dst.protonum != IPPROTO_TCP && - tuple.dst.protonum != IPPROTO_SCTP) - return -ENOPROTOOPT; - - if (*len < 0 || (unsigned int) *len < sizeof(sin6)) - return -EINVAL; - - h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); - if (!h) { - pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n", - &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port), - &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port)); - return -ENOENT; - } - - ct = nf_ct_tuplehash_to_ctrack(h); - - sin6.sin6_family = AF_INET6; - sin6.sin6_port = 
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port; - sin6.sin6_flowinfo = flow_label & IPV6_FLOWINFO_MASK; - memcpy(&sin6.sin6_addr, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6, - sizeof(sin6.sin6_addr)); - - nf_ct_put(ct); - sin6.sin6_scope_id = ipv6_iface_scope_id(&sin6.sin6_addr, bound_dev_if); - return copy_to_user(user, &sin6, sizeof(sin6)) ? -EFAULT : 0; -} - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - -#include <linux/netfilter/nfnetlink.h> -#include <linux/netfilter/nfnetlink_conntrack.h> - -static int ipv6_tuple_to_nlattr(struct sk_buff *skb, - const struct nf_conntrack_tuple *tuple) -{ - if (nla_put_in6_addr(skb, CTA_IP_V6_SRC, &tuple->src.u3.in6) || - nla_put_in6_addr(skb, CTA_IP_V6_DST, &tuple->dst.u3.in6)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -1; -} - -static const struct nla_policy ipv6_nla_policy[CTA_IP_MAX+1] = { - [CTA_IP_V6_SRC] = { .len = sizeof(u_int32_t)*4 }, - [CTA_IP_V6_DST] = { .len = sizeof(u_int32_t)*4 }, -}; - -static int ipv6_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t) -{ - if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST]) - return -EINVAL; - - t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]); - t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]); - - return 0; -} -#endif - -static int ipv6_hooks_register(struct net *net) -{ - struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id); - int err = 0; - - mutex_lock(&register_ipv6_hooks); - cnet->users++; - if (cnet->users > 1) - goto out_unlock; - - err = nf_defrag_ipv6_enable(net); - if (err < 0) { - cnet->users = 0; - goto out_unlock; - } - - err = nf_register_net_hooks(net, ipv6_conntrack_ops, - ARRAY_SIZE(ipv6_conntrack_ops)); - if (err) - cnet->users = 0; - out_unlock: - mutex_unlock(&register_ipv6_hooks); - return err; -} - -static void ipv6_hooks_unregister(struct net *net) -{ - struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id); - - mutex_lock(&register_ipv6_hooks); - if (cnet->users && (--cnet->users ==
0)) - nf_unregister_net_hooks(net, ipv6_conntrack_ops, - ARRAY_SIZE(ipv6_conntrack_ops)); - mutex_unlock(&register_ipv6_hooks); -} - -const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = { - .l3proto = PF_INET6, - .pkt_to_tuple = ipv6_pkt_to_tuple, - .invert_tuple = ipv6_invert_tuple, - .get_l4proto = ipv6_get_l4proto, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .tuple_to_nlattr = ipv6_tuple_to_nlattr, - .nlattr_to_tuple = ipv6_nlattr_to_tuple, - .nla_policy = ipv6_nla_policy, - .nla_size = NLA_ALIGN(NLA_HDRLEN + sizeof(u32[4])) + - NLA_ALIGN(NLA_HDRLEN + sizeof(u32[4])), -#endif - .net_ns_get = ipv6_hooks_register, - .net_ns_put = ipv6_hooks_unregister, - .me = THIS_MODULE, -}; - -MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6)); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI <yasuyuki.kozakai@toshiba.co.jp>"); - -static struct nf_sockopt_ops so_getorigdst6 = { - .pf = NFPROTO_IPV6, - .get_optmin = IP6T_SO_ORIGINAL_DST, - .get_optmax = IP6T_SO_ORIGINAL_DST + 1, - .get = ipv6_getorigdst, - .owner = THIS_MODULE, -}; - -static const struct nf_conntrack_l4proto * const builtin_l4proto6[] = { - &nf_conntrack_l4proto_tcp6, - &nf_conntrack_l4proto_udp6, - &nf_conntrack_l4proto_icmpv6, -#ifdef CONFIG_NF_CT_PROTO_DCCP - &nf_conntrack_l4proto_dccp6, -#endif -#ifdef CONFIG_NF_CT_PROTO_SCTP - &nf_conntrack_l4proto_sctp6, -#endif -#ifdef CONFIG_NF_CT_PROTO_UDPLITE - &nf_conntrack_l4proto_udplite6, -#endif -}; - -static int ipv6_net_init(struct net *net) -{ - return nf_ct_l4proto_pernet_register(net, builtin_l4proto6, - ARRAY_SIZE(builtin_l4proto6)); -} - -static void ipv6_net_exit(struct net *net) -{ - nf_ct_l4proto_pernet_unregister(net, builtin_l4proto6, - ARRAY_SIZE(builtin_l4proto6)); -} - -static struct pernet_operations ipv6_net_ops = { - .init = ipv6_net_init, - .exit = ipv6_net_exit, - .id = &conntrack6_net_id, - .size = sizeof(struct conntrack6_net), -}; - -static int __init nf_conntrack_l3proto_ipv6_init(void) -{ - int ret = 0; - -
need_conntrack(); - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - if (WARN_ON(nla_policy_len(ipv6_nla_policy, CTA_IP_MAX + 1) != - nf_conntrack_l3proto_ipv6.nla_size)) - return -EINVAL; -#endif - - ret = nf_register_sockopt(&so_getorigdst6); - if (ret < 0) { - pr_err("Unable to register netfilter socket option\n"); - return ret; - } - - ret = register_pernet_subsys(&ipv6_net_ops); - if (ret < 0) - goto cleanup_sockopt; - - ret = nf_ct_l4proto_register(builtin_l4proto6, - ARRAY_SIZE(builtin_l4proto6)); - if (ret < 0) - goto cleanup_pernet; - - ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv6); - if (ret < 0) { - pr_err("nf_conntrack_ipv6: can't register ipv6 proto.\n"); - goto cleanup_l4proto; - } - return ret; -cleanup_l4proto: - nf_ct_l4proto_unregister(builtin_l4proto6, - ARRAY_SIZE(builtin_l4proto6)); - cleanup_pernet: - unregister_pernet_subsys(&ipv6_net_ops); - cleanup_sockopt: - nf_unregister_sockopt(&so_getorigdst6); - return ret; -} - -static void __exit nf_conntrack_l3proto_ipv6_fini(void) -{ - synchronize_net(); - nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv6); - nf_ct_l4proto_unregister(builtin_l4proto6, - ARRAY_SIZE(builtin_l4proto6)); - unregister_pernet_subsys(&ipv6_net_ops); - nf_unregister_sockopt(&so_getorigdst6); -} - -module_init(nf_conntrack_l3proto_ipv6_init); -module_exit(nf_conntrack_l3proto_ipv6_fini); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 5e0332014c17..0610bdab721c 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -33,9 +33,8 @@ #include <net/sock.h> #include <net/snmp.h> -#include <net/inet_frag.h> +#include <net/ipv6_frag.h> -#include <net/ipv6.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/rawv6.h> @@ -107,7 +106,7 @@ static int nf_ct_frag6_sysctl_register(struct net *net) if (hdr == NULL) goto err_reg; - net->nf_frag.sysctl.frags_hdr = hdr; + net->nf_frag_frags_hdr = hdr; return 0; err_reg: @@ 
-121,8 +120,8 @@ static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) { struct ctl_table *table; - table = net->nf_frag.sysctl.frags_hdr->ctl_table_arg; - unregister_net_sysctl_table(net->nf_frag.sysctl.frags_hdr); + table = net->nf_frag_frags_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->nf_frag_frags_hdr); if (!net_eq(net, &init_net)) kfree(table); } @@ -151,7 +150,7 @@ static void nf_ct_frag6_expire(struct timer_list *t) fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, nf_frag.frags); - ip6_expire_frag_queue(net, fq); + ip6frag_expire_frag_queue(net, fq); } /* Creation primitives. */ @@ -585,6 +584,8 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) fq->q.meat == fq->q.len && nf_ct_frag6_reasm(fq, skb, dev)) ret = 0; + else + skb_dst_drop(skb); out_unlock: spin_unlock_bh(&fq->q.lock); @@ -622,16 +623,24 @@ static struct pernet_operations nf_ct_net_ops = { .exit = nf_ct_net_exit, }; +static const struct rhashtable_params nfct_rhash_params = { + .head_offset = offsetof(struct inet_frag_queue, node), + .hashfn = ip6frag_key_hashfn, + .obj_hashfn = ip6frag_obj_hashfn, + .obj_cmpfn = ip6frag_obj_cmpfn, + .automatic_shrinking = true, +}; + int nf_ct_frag6_init(void) { int ret = 0; - nf_frags.constructor = ip6_frag_init; + nf_frags.constructor = ip6frag_init; nf_frags.destructor = NULL; nf_frags.qsize = sizeof(struct frag_queue); nf_frags.frag_expire = nf_ct_frag6_expire; nf_frags.frags_cache_name = nf_frags_cache_name; - nf_frags.rhash_params = ip6_rhash_params; + nf_frags.rhash_params = nfct_rhash_params; ret = inet_frags_init(&nf_frags); if (ret) goto out; diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index c87b48359e8f..72dd3e202375 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -14,8 +14,7 @@ #include <linux/skbuff.h> #include <linux/icmp.h> #include 
<linux/sysctl.h> -#include <net/ipv6.h> -#include <net/inet_frag.h> +#include <net/ipv6_frag.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_bridge.h> @@ -23,7 +22,6 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_l4proto.h> -#include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> #endif diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index b397a8fe88b9..c6bf580d0f33 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -36,7 +36,7 @@ static const struct nf_loginfo default_loginfo = { }; /* One level of recursion won't kill us */ -static void dump_ipv6_packet(struct nf_log_buf *m, +static void dump_ipv6_packet(struct net *net, struct nf_log_buf *m, const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int ip6hoff, int recurse) @@ -258,7 +258,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m, /* Max length: 3+maxlen */ if (recurse) { nf_log_buf_add(m, "["); - dump_ipv6_packet(m, info, skb, + dump_ipv6_packet(net, m, info, skb, ptr + sizeof(_icmp6h), 0); nf_log_buf_add(m, "] "); } @@ -278,7 +278,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m, /* Max length: 15 "UID=4294967295 " */ if ((logflags & NF_LOG_UID) && recurse) - nf_log_dump_sk_uid_gid(m, skb->sk); + nf_log_dump_sk_uid_gid(net, m, skb->sk); /* Max length: 16 "MARK=0xFFFFFFFF " */ if (recurse && skb->mark) @@ -365,7 +365,7 @@ static void nf_log_ip6_packet(struct net *net, u_int8_t pf, if (in != NULL) dump_ipv6_mac_header(m, loginfo, skb); - dump_ipv6_packet(m, loginfo, skb, skb_network_offset(skb), 1); + dump_ipv6_packet(net, m, loginfo, skb, skb_network_offset(skb), 1); nf_log_buf_close(m); } diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c index bf1d6c421e3b..5dfd33af6451 100644 --- 
a/net/ipv6/netfilter/nf_tproxy_ipv6.c +++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c @@ -55,7 +55,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, * to a listener socket if there's one */ struct sock *sk2; - sk2 = nf_tproxy_get_sock_v6(net, skb, thoff, hp, tproto, + sk2 = nf_tproxy_get_sock_v6(net, skb, thoff, tproto, &iph->saddr, nf_tproxy_laddr6(skb, laddr, &iph->daddr), hp->source, @@ -72,7 +72,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait6); struct sock * -nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, +nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, const u8 protocol, const struct in6_addr *saddr, const struct in6_addr *daddr, const __be16 sport, const __be16 dport, @@ -80,15 +80,20 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, const enum nf_tproxy_lookup_t lookup_type) { struct sock *sk; - struct tcphdr *tcph; switch (protocol) { - case IPPROTO_TCP: + case IPPROTO_TCP: { + struct tcphdr _hdr, *hp; + + hp = skb_header_pointer(skb, thoff, + sizeof(struct tcphdr), &_hdr); + if (hp == NULL) + return NULL; + switch (lookup_type) { case NF_TPROXY_LOOKUP_LISTENER: - tcph = hp; sk = inet6_lookup_listener(net, &tcp_hashinfo, skb, - thoff + __tcp_hdrlen(tcph), + thoff + __tcp_hdrlen(hp), saddr, sport, daddr, ntohs(dport), in->ifindex, 0); @@ -110,6 +115,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, BUG(); } break; + } case IPPROTO_UDP: sk = udp6_lib_lookup(net, saddr, sport, daddr, dport, in->ifindex); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 96f56bf49a30..4c04bccc7417 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -62,7 +62,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct dst_entry *dst; struct rt6_info *rt; struct pingfakehdr pfh; - struct sockcm_cookie junk = {0}; struct 
ipcm6_cookie ipc6; pr_debug("ping_v6_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); @@ -119,7 +118,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.fl6_icmp_code = user_icmph.icmp6_code; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - ipc6.tclass = np->tclass; + ipcm6_init_sk(&ipc6, np); fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr, false); @@ -142,13 +141,11 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) pfh.family = AF_INET6; ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - ipc6.dontfrag = np->dontfrag; - ipc6.opt = NULL; lock_sock(sk); err = ip6_append_data(sk, ping_getfrag, &pfh, len, 0, &ipc6, &fl6, rt, - MSG_DONTWAIT, &junk); + MSG_DONTWAIT); if (err) { ICMP6_INC_STATS(sock_net(sk), rt->rt6i_idev, diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index ce6f0d15b5dd..413d98bf24f4 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -620,7 +620,7 @@ out: static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, struct flowi6 *fl6, struct dst_entry **dstp, - unsigned int flags) + unsigned int flags, const struct sockcm_cookie *sockc) { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); @@ -650,6 +650,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->protocol = htons(ETH_P_IPV6); skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = sockc->transmit_time; skb_dst_set(skb, &rt->dst); *dstp = NULL; @@ -766,7 +767,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct dst_entry *dst = NULL; struct raw6_frag_vec rfv; struct flowi6 fl6; - struct sockcm_cookie sockc; struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; u16 proto; @@ -790,10 +790,8 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_mark = sk->sk_mark; fl6.flowi6_uid = sk->sk_uid; - ipc6.hlimit = 
-1; - ipc6.tclass = -1; - ipc6.dontfrag = -1; - ipc6.opt = NULL; + ipcm6_init(&ipc6); + ipc6.sockc.tsflags = sk->sk_tsflags; if (sin6) { if (addr_len < SIN6_LEN_RFC2133) @@ -847,14 +845,13 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (fl6.flowi6_oif == 0) fl6.flowi6_oif = sk->sk_bound_dev_if; - sockc.tsflags = sk->sk_tsflags; if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(struct ipv6_txoptions); ipc6.opt = opt; - err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, &sockc); + err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6); if (err < 0) { fl6_sock_release(flowlabel); return err; @@ -921,13 +918,14 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) back_from_confirm: if (inet->hdrincl) - err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst, msg->msg_flags); + err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst, + msg->msg_flags, &ipc6.sockc); else { ipc6.opt = opt; lock_sock(sk); err = ip6_append_data(sk, raw6_getfrag, &rfv, len, 0, &ipc6, &fl6, (struct rt6_info *)dst, - msg->msg_flags, &sockc); + msg->msg_flags); if (err) ip6_flush_pending_frames(sk); @@ -1334,7 +1332,7 @@ void raw6_proc_exit(void) } #endif /* CONFIG_PROC_FS */ -/* Same as inet6_dgram_ops, sans udp_poll_mask. */ +/* Same as inet6_dgram_ops, sans udp_poll. 
*/ const struct proto_ops inet6_sockraw_ops = { .family = PF_INET6, .owner = THIS_MODULE, @@ -1344,7 +1342,7 @@ const struct proto_ops inet6_sockraw_ops = { .socketpair = sock_no_socketpair, /* a do nothing */ .accept = sock_no_accept, /* a do nothing */ .getname = inet6_getname, - .poll_mask = datagram_poll_mask, /* ok */ + .poll = datagram_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ .listen = sock_no_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index b939b94e7e91..6edd2ac8ae4b 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -57,7 +57,7 @@ #include <net/rawv6.h> #include <net/ndisc.h> #include <net/addrconf.h> -#include <net/inet_frag.h> +#include <net/ipv6_frag.h> #include <net/inet_ecn.h> static const char ip6_frag_cache_name[] = "ip6-frags"; @@ -72,61 +72,6 @@ static struct inet_frags ip6_frags; static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev); -void ip6_frag_init(struct inet_frag_queue *q, const void *a) -{ - struct frag_queue *fq = container_of(q, struct frag_queue, q); - const struct frag_v6_compare_key *key = a; - - q->key.v6 = *key; - fq->ecn = 0; -} -EXPORT_SYMBOL(ip6_frag_init); - -void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq) -{ - struct net_device *dev = NULL; - struct sk_buff *head; - - rcu_read_lock(); - spin_lock(&fq->q.lock); - - if (fq->q.flags & INET_FRAG_COMPLETE) - goto out; - - inet_frag_kill(&fq->q); - - dev = dev_get_by_index_rcu(net, fq->iif); - if (!dev) - goto out; - - __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); - __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); - - /* Don't send error if the first segment did not arrive. */ - head = fq->q.fragments; - if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head) - goto out; - - /* But use as source device on which LAST ARRIVED - * segment was received. 
And do not use fq->dev - * pointer directly, device might already disappeared. - */ - head->dev = dev; - skb_get(head); - spin_unlock(&fq->q.lock); - - icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); - kfree_skb(head); - goto out_rcu_unlock; - -out: - spin_unlock(&fq->q.lock); -out_rcu_unlock: - rcu_read_unlock(); - inet_frag_put(&fq->q); -} -EXPORT_SYMBOL(ip6_expire_frag_queue); - static void ip6_frag_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); @@ -136,7 +81,7 @@ static void ip6_frag_expire(struct timer_list *t) fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, ipv6.frags); - ip6_expire_frag_queue(net, fq); + ip6frag_expire_frag_queue(net, fq); } static struct frag_queue * @@ -696,42 +641,19 @@ static struct pernet_operations ip6_frags_ops = { .exit = ipv6_frags_exit_net, }; -static u32 ip6_key_hashfn(const void *data, u32 len, u32 seed) -{ - return jhash2(data, - sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); -} - -static u32 ip6_obj_hashfn(const void *data, u32 len, u32 seed) -{ - const struct inet_frag_queue *fq = data; - - return jhash2((const u32 *)&fq->key.v6, - sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); -} - -static int ip6_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) -{ - const struct frag_v6_compare_key *key = arg->key; - const struct inet_frag_queue *fq = ptr; - - return !!memcmp(&fq->key, key, sizeof(*key)); -} - -const struct rhashtable_params ip6_rhash_params = { +static const struct rhashtable_params ip6_rhash_params = { .head_offset = offsetof(struct inet_frag_queue, node), - .hashfn = ip6_key_hashfn, - .obj_hashfn = ip6_obj_hashfn, - .obj_cmpfn = ip6_obj_cmpfn, + .hashfn = ip6frag_key_hashfn, + .obj_hashfn = ip6frag_obj_hashfn, + .obj_cmpfn = ip6frag_obj_cmpfn, .automatic_shrinking = true, }; -EXPORT_SYMBOL(ip6_rhash_params); int __init ipv6_frag_init(void) { int ret; - ip6_frags.constructor = ip6_frag_init; 
+ ip6_frags.constructor = ip6frag_init; ip6_frags.destructor = NULL; ip6_frags.qsize = sizeof(struct frag_queue); ip6_frags.frag_expire = ip6_frag_expire; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 86a0e4333d42..ec18b3ce8b6d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -972,10 +972,10 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort) rt->dst.lastuse = jiffies; } +/* Caller must already hold reference to @from */ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) { rt->rt6i_flags &= ~RTF_EXPIRES; - fib6_info_hold(from); rcu_assign_pointer(rt->from, from); dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); if (from->fib6_metrics != &dst_default_metrics) { @@ -984,6 +984,7 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) } } +/* Caller must already hold reference to @ort */ static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) { struct net_device *dev = fib6_info_nh_dev(ort); @@ -1044,9 +1045,14 @@ static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt) struct net_device *dev = rt->fib6_nh.nh_dev; struct rt6_info *nrt; + if (!fib6_info_hold_safe(rt)) + return NULL; + nrt = ip6_dst_alloc(dev_net(dev), dev, flags); if (nrt) ip6_rt_copy_init(nrt, rt); + else + fib6_info_release(rt); return nrt; } @@ -1178,10 +1184,15 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, * Clone the route. 
*/ + if (!fib6_info_hold_safe(ort)) + return NULL; + dev = ip6_rt_get_dev_rcu(ort); rt = ip6_dst_alloc(dev_net(dev), dev, 0); - if (!rt) + if (!rt) { + fib6_info_release(ort); return NULL; + } ip6_rt_copy_init(rt, ort); rt->rt6i_flags |= RTF_CACHE; @@ -1210,12 +1221,17 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt) struct net_device *dev; struct rt6_info *pcpu_rt; + if (!fib6_info_hold_safe(rt)) + return NULL; + rcu_read_lock(); dev = ip6_rt_get_dev_rcu(rt); pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags); rcu_read_unlock(); - if (!pcpu_rt) + if (!pcpu_rt) { + fib6_info_release(rt); return NULL; + } ip6_rt_copy_init(pcpu_rt, rt); pcpu_rt->rt6i_flags |= RTF_PCPU; return pcpu_rt; @@ -2486,7 +2502,7 @@ restart: out: if (ret) - dst_hold(&ret->dst); + ip6_hold_safe(net, &ret, true); else ret = ip6_create_rt_rcu(rt); @@ -3303,7 +3319,8 @@ static int ip6_route_del(struct fib6_config *cfg, continue; if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) continue; - fib6_info_hold(rt); + if (!fib6_info_hold_safe(rt)) + continue; rcu_read_unlock(); /* if gateway was specified only delete the one hop */ @@ -3409,6 +3426,9 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu rcu_read_lock(); from = rcu_dereference(rt->from); + /* This fib6_info_hold() is safe here because we hold reference to rt + * and rt already holds reference to fib6_info. + */ fib6_info_hold(from); rcu_read_unlock(); @@ -3470,7 +3490,8 @@ static struct fib6_info *rt6_get_route_info(struct net *net, continue; if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) continue; - fib6_info_hold(rt); + if (!fib6_info_hold_safe(rt)) + continue; break; } out: @@ -3530,8 +3551,8 @@ struct fib6_info *rt6_get_dflt_router(struct net *net, ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) break; } - if (rt) - fib6_info_hold(rt); + if (rt && !fib6_info_hold_safe(rt)) + rt = NULL; rcu_read_unlock(); return rt; } @@ -3579,8 +3600,8 @@ restart: struct inet6_dev *idev = dev ? 
__in6_dev_get(dev) : NULL; if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && - (!idev || idev->cnf.accept_ra != 2)) { - fib6_info_hold(rt); + (!idev || idev->cnf.accept_ra != 2) && + fib6_info_hold_safe(rt)) { rcu_read_unlock(); ip6_del_rt(net, rt); goto restart; @@ -3842,7 +3863,7 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { if (iter->fib6_metric == rt->fib6_metric && - iter->fib6_nsiblings) + rt6_qualify_for_ecmp(iter)) return iter; iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); @@ -4388,6 +4409,13 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, rt = NULL; goto cleanup; } + if (!rt6_qualify_for_ecmp(rt)) { + err = -EINVAL; + NL_SET_ERR_MSG(extack, + "Device only routes can not be added for IPv6 using the multipath API."); + fib6_info_release(rt); + goto cleanup; + } rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1; @@ -4439,7 +4467,6 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, */ cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_REPLACE); - cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND; nhn++; } diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index b1791129a875..8546f94f30d4 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -374,7 +374,7 @@ static int seg6_hmac_init_algo(void) return -ENOMEM; for_each_possible_cpu(cpu) { - tfm = crypto_alloc_shash(algo->name, 0, GFP_KERNEL); + tfm = crypto_alloc_shash(algo->name, 0, 0); if (IS_ERR(tfm)) return PTR_ERR(tfm); p_tfm = per_cpu_ptr(algo->tfms, cpu); diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 19ccf0dc996c..a8854dd3e9c5 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -101,7 +101,7 @@ static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb, if (do_flowlabel > 0) { hash = skb_get_hash(skb); - rol32(hash, 16); + hash = rol32(hash, 16); flowlabel = 
(__force __be32)hash & IPV6_FLOWLABEL_MASK; } else if (!do_flowlabel && skb->protocol == htons(ETH_P_IPV6)) { flowlabel = ip6_flowlabel(inner_hdr); diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index cd6e4cab63f6..e1025b493a18 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -637,12 +637,10 @@ static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt) if (!seg6_validate_srh(srh, len)) return -EINVAL; - slwt->srh = kmalloc(len, GFP_KERNEL); + slwt->srh = kmemdup(srh, len, GFP_KERNEL); if (!slwt->srh) return -ENOMEM; - memcpy(slwt->srh, srh, len); - slwt->headroom += len; return 0; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7efa9fd7e109..03e6b7a2bc53 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -938,7 +938,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) &tcp_hashinfo, NULL, 0, &ipv6h->saddr, th->source, &ipv6h->daddr, - ntohs(th->source), tcp_v6_iif(skb), + ntohs(th->source), + tcp_v6_iif_l3_slave(skb), tcp_v6_sdif(skb)); if (!sk1) goto out; @@ -1609,7 +1610,8 @@ do_time_wait: skb, __tcp_hdrlen(th), &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, - ntohs(th->dest), tcp_v6_iif(skb), + ntohs(th->dest), + tcp_v6_iif_l3_slave(skb), sdif); if (sk2) { struct inet_timewait_sock *tw = inet_twsk(sk); diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index 278e49cd67d4..e72947c99454 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -15,8 +15,8 @@ #include <net/ip6_checksum.h> #include "ip6_offload.h" -static struct sk_buff **tcp6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *tcp6_gro_receive(struct list_head *head, + struct sk_buff *skb) { /* Don't bother verifying checksum if we're going to flush anyway. 
*/ if (!NAPI_GRO_CB(skb)->flush && diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e6645cae403e..f6b96956a8ed 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1141,13 +1141,10 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int err; int is_udplite = IS_UDPLITE(sk); int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); - struct sockcm_cookie sockc; - ipc6.hlimit = -1; - ipc6.tclass = -1; - ipc6.dontfrag = -1; + ipcm6_init(&ipc6); ipc6.gso_size = up->gso_size; - sockc.tsflags = sk->sk_tsflags; + ipc6.sockc.tsflags = sk->sk_tsflags; /* destination address check */ if (sin6) { @@ -1282,7 +1279,7 @@ do_udp_sendmsg: err = udp_cmsg_send(sk, msg, &ipc6.gso_size); if (err > 0) err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, - &ipc6, &sockc); + &ipc6); if (err < 0) { fl6_sock_release(flowlabel); return err; @@ -1376,7 +1373,7 @@ back_from_confirm: skb = ip6_make_skb(sk, getfrag, msg, ulen, sizeof(struct udphdr), &ipc6, &fl6, (struct rt6_info *)dst, - msg->msg_flags, &cork, &sockc); + msg->msg_flags, &cork); err = PTR_ERR(skb); if (!IS_ERR_OR_NULL(skb)) err = udp_v6_send_skb(skb, &fl6, &cork.base); @@ -1402,7 +1399,7 @@ do_append_data: up->len += ulen; err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr), &ipc6, &fl6, (struct rt6_info *)dst, - corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags, &sockc); + corkreq ? 
msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) udp_v6_flush_pending_frames(sk); else if (!corkreq) diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 03a2ff3fe1e6..95dee9ca8d22 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -114,8 +114,8 @@ out: return segs; } -static struct sk_buff **udp6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *udp6_gro_receive(struct list_head *head, + struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 68e86257a549..8d1c43f8fed4 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1488,11 +1488,14 @@ static inline __poll_t iucv_accept_poll(struct sock *parent) return 0; } -static __poll_t iucv_sock_poll_mask(struct socket *sock, __poll_t events) +__poll_t iucv_sock_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask = 0; + sock_poll_wait(file, sk_sleep(sk), wait); + if (sk->sk_state == IUCV_LISTEN) return iucv_accept_poll(sk); @@ -2385,7 +2388,7 @@ static const struct proto_ops iucv_sock_ops = { .getname = iucv_sock_getname, .sendmsg = iucv_sock_sendmsg, .recvmsg = iucv_sock_recvmsg, - .poll_mask = iucv_sock_poll_mask, + .poll = iucv_sock_poll, .ioctl = sock_no_ioctl, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, @@ -2512,4 +2515,3 @@ MODULE_DESCRIPTION("IUCV Sockets ver " VERSION); MODULE_VERSION(VERSION); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_IUCV); - diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig index 87fca36e6c47..9ca83f2ade6f 100644 --- a/net/kcm/Kconfig +++ b/net/kcm/Kconfig @@ -8,4 +8,3 @@ config AF_KCM KCM (Kernel Connection Multiplexor) sockets provide a method for multiplexing messages of a message based application protocol over kernel connectons (e.g. TCP connections). 
- diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 84b7d5c6fec8..571d824e4e24 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1336,9 +1336,9 @@ static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux) struct list_head *head; int index = 0; - /* For SOCK_SEQPACKET sock type, datagram_poll_mask checks the sk_state, - * so we set sk_state, otherwise epoll_wait always returns right away - * with EPOLLHUP + /* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so + * we set sk_state, otherwise epoll_wait always returns right away with + * EPOLLHUP */ kcm->sk.sk_state = TCP_ESTABLISHED; @@ -1903,7 +1903,7 @@ static const struct proto_ops kcm_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = kcm_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -1924,7 +1924,7 @@ static const struct proto_ops kcm_seqpacket_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = kcm_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -2104,4 +2104,3 @@ module_exit(kcm_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_KCM); - diff --git a/net/key/af_key.c b/net/key/af_key.c index 398ebcd614a0..9d61266526e7 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3751,7 +3751,7 @@ static const struct proto_ops pfkey_ops = { /* Now the operations that really occur. 
*/ .release = pfkey_release, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .sendmsg = pfkey_sendmsg, .recvmsg = pfkey_recvmsg, }; diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 40261cb68e83..d10f4ed52d92 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -322,8 +322,7 @@ int l2tp_session_register(struct l2tp_session *session, if (tunnel->version == L2TP_HDR_VER_3) { pn = l2tp_pernet(tunnel->l2tp_net); - g_head = l2tp_session_id_hash_2(l2tp_pernet(tunnel->l2tp_net), - session->session_id); + g_head = l2tp_session_id_hash_2(pn, session->session_id); spin_lock_bh(&pn->l2tp_session_hlist_lock); @@ -620,7 +619,7 @@ discard: */ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, - int length, int (*payload_hook)(struct sk_buff *skb)) + int length) { struct l2tp_tunnel *tunnel = session->tunnel; int offset; @@ -741,13 +740,6 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, __skb_pull(skb, offset); - /* If caller wants to process the payload before we queue the - * packet, do so now. - */ - if (payload_hook) - if ((*payload_hook)(skb)) - goto discard; - /* Prepare skb for adding to the session's reorder_q. Hold * packets for max reorder_timeout or 1 second if not * reordering. @@ -783,7 +775,7 @@ EXPORT_SYMBOL(l2tp_recv_common); /* Drop skbs from the session's reorder_q */ -int l2tp_session_queue_purge(struct l2tp_session *session) +static int l2tp_session_queue_purge(struct l2tp_session *session) { struct sk_buff *skb = NULL; BUG_ON(!session); @@ -794,7 +786,6 @@ int l2tp_session_queue_purge(struct l2tp_session *session) } return 0; } -EXPORT_SYMBOL_GPL(l2tp_session_queue_purge); /* Internal UDP receive frame. Do the real work of receiving an L2TP data frame * here. The skb is not on a list when we get here. 
@@ -802,8 +793,7 @@ EXPORT_SYMBOL_GPL(l2tp_session_queue_purge); * Returns 1 if the packet was not a good data packet and could not be * forwarded. All such packets are passed up to userspace to deal with. */ -static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, - int (*payload_hook)(struct sk_buff *skb)) +static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) { struct l2tp_session *session = NULL; unsigned char *ptr, *optr; @@ -894,7 +884,7 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, goto error; } - l2tp_recv_common(session, skb, ptr, optr, hdrflags, length, payload_hook); + l2tp_recv_common(session, skb, ptr, optr, hdrflags, length); l2tp_session_dec_refcount(session); return 0; @@ -923,7 +913,7 @@ int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) l2tp_dbg(tunnel, L2TP_MSG_DATA, "%s: received %d bytes\n", tunnel->name, skb->len); - if (l2tp_udp_recv_core(tunnel, skb, tunnel->recv_payload_hook)) + if (l2tp_udp_recv_core(tunnel, skb)) goto pass_up; return 0; @@ -1009,8 +999,8 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf) return bufp - optr; } -static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, - struct flowi *fl, size_t data_len) +static void l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, + struct flowi *fl, size_t data_len) { struct l2tp_tunnel *tunnel = session->tunnel; unsigned int len = skb->len; @@ -1052,8 +1042,6 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, atomic_long_inc(&tunnel->stats.tx_errors); atomic_long_inc(&session->stats.tx_errors); } - - return 0; } /* If caller requires the skb to have a ppp header, the header must be @@ -1193,7 +1181,7 @@ end: /* When the tunnel is closed, all the attached sessions need to go too. 
*/ -void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) +static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) { int hash; struct hlist_node *walk; @@ -1242,7 +1230,6 @@ again: } write_unlock_bh(&tunnel->hlist_lock); } -EXPORT_SYMBOL_GPL(l2tp_tunnel_closeall); /* Tunnel socket destroy hook for UDP encapsulation */ static void l2tp_udp_encap_destroy(struct sock *sk) @@ -1800,4 +1787,3 @@ MODULE_AUTHOR("James Chapman <jchapman@katalix.com>"); MODULE_DESCRIPTION("L2TP core"); MODULE_LICENSE("GPL"); MODULE_VERSION(L2TP_DRV_VERSION); - diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index c199020f8a8a..d85fde793a8c 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -180,18 +180,12 @@ struct l2tp_tunnel { struct net *l2tp_net; /* the net we belong to */ refcount_t ref_count; -#ifdef CONFIG_DEBUG_FS - void (*show)(struct seq_file *m, void *arg); -#endif - int (*recv_payload_hook)(struct sk_buff *skb); void (*old_sk_destruct)(struct sock *); struct sock *sock; /* Parent socket */ int fd; /* Parent fd, if tunnel socket * was created by userspace */ struct work_struct del_work; - - uint8_t priv[0]; /* private data */ }; struct l2tp_nl_cmd_ops { @@ -201,11 +195,6 @@ struct l2tp_nl_cmd_ops { int (*session_delete)(struct l2tp_session *session); }; -static inline void *l2tp_tunnel_priv(struct l2tp_tunnel *tunnel) -{ - return &tunnel->priv[0]; -} - static inline void *l2tp_session_priv(struct l2tp_session *session) { return &session->priv[0]; @@ -229,7 +218,6 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, struct l2tp_tunnel_cfg *cfg); -void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel); void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, @@ -243,8 +231,7 @@ int l2tp_session_delete(struct l2tp_session *session); void l2tp_session_free(struct l2tp_session 
*session); void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, - int length, int (*payload_hook)(struct sk_buff *skb)); -int l2tp_session_queue_purge(struct l2tp_session *session); + int length); int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb); void l2tp_session_set_header_len(struct l2tp_session *session, int version); diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index e87686f7d63c..b5d7dde003ef 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -177,9 +177,6 @@ static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v) atomic_long_read(&tunnel->stats.rx_packets), atomic_long_read(&tunnel->stats.rx_bytes), atomic_long_read(&tunnel->stats.rx_errors)); - - if (tunnel->show != NULL) - tunnel->show(m, tunnel); } static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v) diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 181073bf6925..0bc39cc20a3f 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -165,7 +165,7 @@ static int l2tp_ip_recv(struct sk_buff *skb) print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length); } - l2tp_recv_common(session, skb, ptr, optr, 0, skb->len, tunnel->recv_payload_hook); + l2tp_recv_common(session, skb, ptr, optr, 0, skb->len); l2tp_session_dec_refcount(session); return 0; @@ -613,7 +613,7 @@ static const struct proto_ops l2tp_ip_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = l2tp_ip_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = inet_ioctl, .listen = sock_no_listen, .shutdown = inet_shutdown, diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 336e4c00abbc..42f828cf62fb 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -178,8 +178,7 @@ static int l2tp_ip6_recv(struct sk_buff *skb) print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length); } - l2tp_recv_common(session, skb, ptr, optr, 0, skb->len, - 
tunnel->recv_payload_hook); + l2tp_recv_common(session, skb, ptr, optr, 0, skb->len); l2tp_session_dec_refcount(session); return 0; @@ -500,7 +499,6 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct ip6_flowlabel *flowlabel = NULL; struct dst_entry *dst = NULL; struct flowi6 fl6; - struct sockcm_cookie sockc_unused = {0}; struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; int transhdrlen = 4; /* zero session-id */ @@ -525,9 +523,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_mark = sk->sk_mark; fl6.flowi6_uid = sk->sk_uid; - ipc6.hlimit = -1; - ipc6.tclass = -1; - ipc6.dontfrag = -1; + ipcm6_init(&ipc6); if (lsa) { if (addr_len < SIN6_LEN_RFC2133) @@ -575,8 +571,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) opt->tot_len = sizeof(struct ipv6_txoptions); ipc6.opt = opt; - err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, - &sockc_unused); + err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6); if (err < 0) { fl6_sock_release(flowlabel); return err; @@ -641,7 +636,7 @@ back_from_confirm: err = ip6_append_data(sk, ip_generic_getfrag, msg, ulen, transhdrlen, &ipc6, &fl6, (struct rt6_info *)dst, - msg->msg_flags, &sockc_unused); + msg->msg_flags); if (err) ip6_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) @@ -754,7 +749,7 @@ static const struct proto_ops l2tp_ip6_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = l2tp_ip6_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = inet6_ioctl, .listen = sock_no_listen, .shutdown = inet_shutdown, diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 55188382845c..000c9829304c 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -183,25 +183,6 @@ out: * Receive data handling *****************************************************************************/ -static int 
pppol2tp_recv_payload_hook(struct sk_buff *skb) -{ - /* Skip PPP header, if present. In testing, Microsoft L2TP clients - * don't send the PPP header (PPP header compression enabled), but - * other clients can include the header. So we cope with both cases - * here. The PPP header is always FF03 when using L2TP. - * - * Note that skb->data[] isn't dereferenced from a u16 ptr here since - * the field may be unaligned. - */ - if (!pskb_may_pull(skb, 2)) - return 1; - - if ((skb->data[0] == PPP_ALLSTATIONS) && (skb->data[1] == PPP_UI)) - skb_pull(skb, 2); - - return 0; -} - /* Receive message. This is the recvmsg for the PPPoL2TP socket. */ static int pppol2tp_recvmsg(struct socket *sock, struct msghdr *msg, @@ -248,6 +229,17 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int if (sk == NULL) goto no_sock; + /* If the first two bytes are 0xFF03, consider that it is the PPP's + * Address and Control fields and skip them. The L2TP module has always + * worked this way, although, in theory, the use of these fields should + * be negociated and handled at the PPP layer. These fields are + * constant: 0xFF is the All-Stations Address and 0x03 the Unnumbered + * Information command with Poll/Final bit set to zero (RFC 1662). + */ + if (pskb_may_pull(skb, 2) && skb->data[0] == PPP_ALLSTATIONS && + skb->data[1] == PPP_UI) + skb_pull(skb, 2); + if (sk->sk_state & PPPOX_BOUND) { struct pppox_sock *po; @@ -424,12 +416,6 @@ static void pppol2tp_put_sk(struct rcu_head *head) sock_put(ps->__sk); } -/* Called by l2tp_core when a session socket is being closed. - */ -static void pppol2tp_session_close(struct l2tp_session *session) -{ -} - /* Really kill the session socket. (Called from sock_put() if * refcnt == 0.) 
*/ @@ -573,7 +559,6 @@ static void pppol2tp_session_init(struct l2tp_session *session) struct dst_entry *dst; session->recv_skb = pppol2tp_recv; - session->session_close = pppol2tp_session_close; #if IS_ENABLED(CONFIG_L2TP_DEBUGFS) session->show = pppol2tp_show; #endif @@ -595,40 +580,113 @@ static void pppol2tp_session_init(struct l2tp_session *session) } } +struct l2tp_connect_info { + u8 version; + int fd; + u32 tunnel_id; + u32 peer_tunnel_id; + u32 session_id; + u32 peer_session_id; +}; + +static int pppol2tp_sockaddr_get_info(const void *sa, int sa_len, + struct l2tp_connect_info *info) +{ + switch (sa_len) { + case sizeof(struct sockaddr_pppol2tp): + { + const struct sockaddr_pppol2tp *sa_v2in4 = sa; + + if (sa_v2in4->sa_protocol != PX_PROTO_OL2TP) + return -EINVAL; + + info->version = 2; + info->fd = sa_v2in4->pppol2tp.fd; + info->tunnel_id = sa_v2in4->pppol2tp.s_tunnel; + info->peer_tunnel_id = sa_v2in4->pppol2tp.d_tunnel; + info->session_id = sa_v2in4->pppol2tp.s_session; + info->peer_session_id = sa_v2in4->pppol2tp.d_session; + + break; + } + case sizeof(struct sockaddr_pppol2tpv3): + { + const struct sockaddr_pppol2tpv3 *sa_v3in4 = sa; + + if (sa_v3in4->sa_protocol != PX_PROTO_OL2TP) + return -EINVAL; + + info->version = 3; + info->fd = sa_v3in4->pppol2tp.fd; + info->tunnel_id = sa_v3in4->pppol2tp.s_tunnel; + info->peer_tunnel_id = sa_v3in4->pppol2tp.d_tunnel; + info->session_id = sa_v3in4->pppol2tp.s_session; + info->peer_session_id = sa_v3in4->pppol2tp.d_session; + + break; + } + case sizeof(struct sockaddr_pppol2tpin6): + { + const struct sockaddr_pppol2tpin6 *sa_v2in6 = sa; + + if (sa_v2in6->sa_protocol != PX_PROTO_OL2TP) + return -EINVAL; + + info->version = 2; + info->fd = sa_v2in6->pppol2tp.fd; + info->tunnel_id = sa_v2in6->pppol2tp.s_tunnel; + info->peer_tunnel_id = sa_v2in6->pppol2tp.d_tunnel; + info->session_id = sa_v2in6->pppol2tp.s_session; + info->peer_session_id = sa_v2in6->pppol2tp.d_session; + + break; + } + case sizeof(struct 
sockaddr_pppol2tpv3in6): + { + const struct sockaddr_pppol2tpv3in6 *sa_v3in6 = sa; + + if (sa_v3in6->sa_protocol != PX_PROTO_OL2TP) + return -EINVAL; + + info->version = 3; + info->fd = sa_v3in6->pppol2tp.fd; + info->tunnel_id = sa_v3in6->pppol2tp.s_tunnel; + info->peer_tunnel_id = sa_v3in6->pppol2tp.d_tunnel; + info->session_id = sa_v3in6->pppol2tp.s_session; + info->peer_session_id = sa_v3in6->pppol2tp.d_session; + + break; + } + default: + return -EINVAL; + } + + return 0; +} + /* connect() handler. Attach a PPPoX socket to a tunnel UDP socket */ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, int sockaddr_len, int flags) { struct sock *sk = sock->sk; - struct sockaddr_pppol2tp *sp = (struct sockaddr_pppol2tp *) uservaddr; struct pppox_sock *po = pppox_sk(sk); struct l2tp_session *session = NULL; + struct l2tp_connect_info info; struct l2tp_tunnel *tunnel; struct pppol2tp_session *ps; struct l2tp_session_cfg cfg = { 0, }; - int error = 0; - u32 tunnel_id, peer_tunnel_id; - u32 session_id, peer_session_id; bool drop_refcnt = false; bool drop_tunnel = false; bool new_session = false; bool new_tunnel = false; - int ver = 2; - int fd; - - lock_sock(sk); - - error = -EINVAL; + int error; - if (sockaddr_len != sizeof(struct sockaddr_pppol2tp) && - sockaddr_len != sizeof(struct sockaddr_pppol2tpv3) && - sockaddr_len != sizeof(struct sockaddr_pppol2tpin6) && - sockaddr_len != sizeof(struct sockaddr_pppol2tpv3in6)) - goto end; + error = pppol2tp_sockaddr_get_info(uservaddr, sockaddr_len, &info); + if (error < 0) + return error; - if (sp->sa_protocol != PX_PROTO_OL2TP) - goto end; + lock_sock(sk); /* Check for already bound sockets */ error = -EBUSY; @@ -640,56 +698,12 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, if (sk->sk_user_data) goto end; /* socket is already attached */ - /* Get params from socket address. Handle L2TPv2 and L2TPv3. 
- * This is nasty because there are different sockaddr_pppol2tp - * structs for L2TPv2, L2TPv3, over IPv4 and IPv6. We use - * the sockaddr size to determine which structure the caller - * is using. - */ - peer_tunnel_id = 0; - if (sockaddr_len == sizeof(struct sockaddr_pppol2tp)) { - fd = sp->pppol2tp.fd; - tunnel_id = sp->pppol2tp.s_tunnel; - peer_tunnel_id = sp->pppol2tp.d_tunnel; - session_id = sp->pppol2tp.s_session; - peer_session_id = sp->pppol2tp.d_session; - } else if (sockaddr_len == sizeof(struct sockaddr_pppol2tpv3)) { - struct sockaddr_pppol2tpv3 *sp3 = - (struct sockaddr_pppol2tpv3 *) sp; - ver = 3; - fd = sp3->pppol2tp.fd; - tunnel_id = sp3->pppol2tp.s_tunnel; - peer_tunnel_id = sp3->pppol2tp.d_tunnel; - session_id = sp3->pppol2tp.s_session; - peer_session_id = sp3->pppol2tp.d_session; - } else if (sockaddr_len == sizeof(struct sockaddr_pppol2tpin6)) { - struct sockaddr_pppol2tpin6 *sp6 = - (struct sockaddr_pppol2tpin6 *) sp; - fd = sp6->pppol2tp.fd; - tunnel_id = sp6->pppol2tp.s_tunnel; - peer_tunnel_id = sp6->pppol2tp.d_tunnel; - session_id = sp6->pppol2tp.s_session; - peer_session_id = sp6->pppol2tp.d_session; - } else if (sockaddr_len == sizeof(struct sockaddr_pppol2tpv3in6)) { - struct sockaddr_pppol2tpv3in6 *sp6 = - (struct sockaddr_pppol2tpv3in6 *) sp; - ver = 3; - fd = sp6->pppol2tp.fd; - tunnel_id = sp6->pppol2tp.s_tunnel; - peer_tunnel_id = sp6->pppol2tp.d_tunnel; - session_id = sp6->pppol2tp.s_session; - peer_session_id = sp6->pppol2tp.d_session; - } else { - error = -EINVAL; - goto end; /* bad socket address */ - } - /* Don't bind if tunnel_id is 0 */ error = -EINVAL; - if (tunnel_id == 0) + if (!info.tunnel_id) goto end; - tunnel = l2tp_tunnel_get(sock_net(sk), tunnel_id); + tunnel = l2tp_tunnel_get(sock_net(sk), info.tunnel_id); if (tunnel) drop_tunnel = true; @@ -697,7 +711,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, * peer_session_id is 0. Otherwise look up tunnel using supplied * tunnel id. 
*/ - if ((session_id == 0) && (peer_session_id == 0)) { + if (!info.session_id && !info.peer_session_id) { if (tunnel == NULL) { struct l2tp_tunnel_cfg tcfg = { .encap = L2TP_ENCAPTYPE_UDP, @@ -707,12 +721,16 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, /* Prevent l2tp_tunnel_register() from trying to set up * a kernel socket. */ - if (fd < 0) { + if (info.fd < 0) { error = -EBADF; goto end; } - error = l2tp_tunnel_create(sock_net(sk), fd, ver, tunnel_id, peer_tunnel_id, &tcfg, &tunnel); + error = l2tp_tunnel_create(sock_net(sk), info.fd, + info.version, + info.tunnel_id, + info.peer_tunnel_id, &tcfg, + &tunnel); if (error < 0) goto end; @@ -737,13 +755,10 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, goto end; } - if (tunnel->recv_payload_hook == NULL) - tunnel->recv_payload_hook = pppol2tp_recv_payload_hook; - if (tunnel->peer_tunnel_id == 0) - tunnel->peer_tunnel_id = peer_tunnel_id; + tunnel->peer_tunnel_id = info.peer_tunnel_id; - session = l2tp_session_get(sock_net(sk), tunnel, session_id); + session = l2tp_session_get(sock_net(sk), tunnel, info.session_id); if (session) { drop_refcnt = true; @@ -772,8 +787,8 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, cfg.pw_type = L2TP_PWTYPE_PPP; session = l2tp_session_create(sizeof(struct pppol2tp_session), - tunnel, session_id, - peer_session_id, &cfg); + tunnel, info.session_id, + info.peer_session_id, &cfg); if (IS_ERR(session)) { error = PTR_ERR(session); goto end; @@ -1818,7 +1833,7 @@ static const struct proto_ops pppol2tp_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pppol2tp_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = pppol2tp_setsockopt, diff --git a/net/llc/Kconfig b/net/llc/Kconfig index b91c65108162..176a6c1521a5 100644 --- a/net/llc/Kconfig +++ b/net/llc/Kconfig @@ -6,5 +6,5 @@ 
config LLC2 tristate "ANSI/IEEE 802.2 LLC type 2 Support" select LLC help - This is a Logical Link Layer type 2, connection oriented support. + This is a Logical Link Layer type 2, connection oriented support. Select this if you want to have support for PF_LLC sockets. diff --git a/net/llc/Makefile b/net/llc/Makefile index 4e260cff3c5d..5e0ef436daae 100644 --- a/net/llc/Makefile +++ b/net/llc/Makefile @@ -4,7 +4,7 @@ # Copyright (c) 1997 by Procom Technology,Inc. # 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> # -# This program can be redistributed or modified under the terms of the +# This program can be redistributed or modified under the terms of the # GNU General Public License as published by the Free Software Foundation. # This program is distributed without any warranty or implied warranty # of merchantability or fitness for a particular purpose. diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 804de8490186..1beeea9549fa 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -1192,7 +1192,7 @@ static const struct proto_ops llc_ui_ops = { .socketpair = sock_no_socketpair, .accept = llc_ui_accept, .getname = llc_ui_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = llc_ui_ioctl, .listen = llc_ui_listen, .shutdown = llc_ui_shutdown, diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c index 6daf391b3e84..8db03c2d5440 100644 --- a/net/llc/llc_if.c +++ b/net/llc/llc_if.c @@ -151,4 +151,3 @@ out: sock_put(sk); return rc; } - diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile index e3589ade62e0..bb707789ef2b 100644 --- a/net/mac80211/Makefile +++ b/net/mac80211/Makefile @@ -12,6 +12,7 @@ mac80211-y := \ scan.o offchannel.o \ ht.o agg-tx.o agg-rx.o \ vht.o \ + he.o \ ibss.o \ iface.o \ rate.o \ diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index e83c19d4c292..6a4f154c99f6 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -245,6 +245,7 @@ void ___ieee80211_start_rx_ba_session(struct 
sta_info *sta, }; int i, ret = -EOPNOTSUPP; u16 status = WLAN_STATUS_REQUEST_DECLINED; + u16 max_buf_size; if (tid >= IEEE80211_FIRST_TSPEC_TSID) { ht_dbg(sta->sdata, @@ -268,13 +269,18 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, goto end; } + if (sta->sta.he_cap.has_he) + max_buf_size = IEEE80211_MAX_AMPDU_BUF; + else + max_buf_size = IEEE80211_MAX_AMPDU_BUF_HT; + /* sanity check for incoming parameters: * check if configuration can support the BA policy * and if buffer size does not exceeds max value */ /* XXX: check own ht delayed BA capability?? */ if (((ba_policy != 1) && (!(sta->sta.ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA))) || - (buf_size > IEEE80211_MAX_AMPDU_BUF)) { + (buf_size > max_buf_size)) { status = WLAN_STATUS_INVALID_QOS_PARAM; ht_dbg_ratelimited(sta->sdata, "AddBA Req with bad params from %pM on tid %u. policy %d, buffer size %d\n", @@ -283,7 +289,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, } /* determine default buffer size */ if (buf_size == 0) - buf_size = IEEE80211_MAX_AMPDU_BUF; + buf_size = max_buf_size; /* make sure the size doesn't exceed the maximum supported by the hw */ if (buf_size > sta->sta.max_rx_aggregation_subframes) diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index ac4295296514..69e831bc317b 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -463,6 +463,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) .timeout = 0, }; int ret; + u16 buf_size; tid_tx = rcu_dereference_protected_tid_tx(sta, tid); @@ -511,11 +512,22 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) sta->ampdu_mlme.addba_req_num[tid]++; spin_unlock_bh(&sta->lock); + if (sta->sta.he_cap.has_he) { + buf_size = local->hw.max_tx_aggregation_subframes; + } else { + /* + * We really should use what the driver told us it will + * transmit as the maximum, but certain APs (e.g. 
the + * LinkSys WRT120N with FW v1.0.07 build 002 Jun 18 2012) + * will crash when we use a lower number. + */ + buf_size = IEEE80211_MAX_AMPDU_BUF_HT; + } + /* send AddBA request */ ieee80211_send_addba_request(sdata, sta->sta.addr, tid, tid_tx->dialog_token, params.ssn, - IEEE80211_MAX_AMPDU_BUF, - tid_tx->timeout); + buf_size, tid_tx->timeout); } /* @@ -905,8 +917,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, { struct tid_ampdu_tx *tid_tx; struct ieee80211_txq *txq; - u16 capab, tid; - u8 buf_size; + u16 capab, tid, buf_size; bool amsdu; capab = le16_to_cpu(mgmt->u.action.u.addba_resp.capab); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index bdf6fa78d0d2..d25da0e66da1 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -495,7 +495,7 @@ static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev, goto out_unlock; } - ieee80211_key_free(key, true); + ieee80211_key_free(key, sdata->vif.type == NL80211_IFTYPE_STATION); ret = 0; out_unlock: @@ -1412,6 +1412,11 @@ static int sta_apply_parameters(struct ieee80211_local *local, ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, params->vht_capa, sta); + if (params->he_capa) + ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, + (void *)params->he_capa, + params->he_capa_len, sta); + if (params->opmode_notif_used) { /* returned value is only needed for rc update, but the * rc isn't initialized here yet, so ignore it @@ -3486,7 +3491,7 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, } local_bh_disable(); - ieee80211_xmit(sdata, sta, skb); + ieee80211_xmit(sdata, sta, skb, 0); local_bh_enable(); ret = 0; diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c index 690c142a7a44..5ac743816b59 100644 --- a/net/mac80211/ethtool.c +++ b/net/mac80211/ethtool.c @@ -116,16 +116,16 @@ static void ieee80211_get_stats(struct net_device *dev, data[i++] = sta->sta_state; - if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)) + if (sinfo.filled & 
BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) data[i] = 100000ULL * cfg80211_calculate_bitrate(&sinfo.txrate); i++; - if (sinfo.filled & BIT(NL80211_STA_INFO_RX_BITRATE)) + if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) data[i] = 100000ULL * cfg80211_calculate_bitrate(&sinfo.rxrate); i++; - if (sinfo.filled & BIT(NL80211_STA_INFO_SIGNAL_AVG)) + if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG)) data[i] = (u8)sinfo.signal_avg; i++; } else { diff --git a/net/mac80211/he.c b/net/mac80211/he.c new file mode 100644 index 000000000000..769078ed5a12 --- /dev/null +++ b/net/mac80211/he.c @@ -0,0 +1,55 @@ +/* + * HE handling + * + * Copyright(c) 2017 Intel Deutschland GmbH + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "ieee80211_i.h" + +void +ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + const u8 *he_cap_ie, u8 he_cap_len, + struct sta_info *sta) +{ + struct ieee80211_sta_he_cap *he_cap = &sta->sta.he_cap; + struct ieee80211_he_cap_elem *he_cap_ie_elem = (void *)he_cap_ie; + u8 he_ppe_size; + u8 mcs_nss_size; + u8 he_total_size; + + memset(he_cap, 0, sizeof(*he_cap)); + + if (!he_cap_ie || !ieee80211_get_he_sta_cap(sband)) + return; + + /* Make sure size is OK */ + mcs_nss_size = ieee80211_he_mcs_nss_size(he_cap_ie_elem); + he_ppe_size = + ieee80211_he_ppe_size(he_cap_ie[sizeof(he_cap->he_cap_elem) + + mcs_nss_size], + he_cap_ie_elem->phy_cap_info); + he_total_size = sizeof(he_cap->he_cap_elem) + mcs_nss_size + + he_ppe_size; + if (he_cap_len < he_total_size) + return; + + memcpy(&he_cap->he_cap_elem, he_cap_ie, sizeof(he_cap->he_cap_elem)); + + /* HE Tx/Rx HE MCS NSS Support Field */ + memcpy(&he_cap->he_mcs_nss_supp, + &he_cap_ie[sizeof(he_cap->he_cap_elem)], mcs_nss_size); + + /* Check if there are (optional) PPE Thresholds */ + if 
(he_cap->he_cap_elem.phy_cap_info[6] & + IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) + memcpy(he_cap->ppe_thres, + &he_cap_ie[sizeof(he_cap->he_cap_elem) + mcs_nss_size], + he_ppe_size); + + he_cap->has_he = true; +} diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 26a7ba3b698f..f849ea814993 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -352,7 +352,7 @@ void ieee80211_ba_session_work(struct work_struct *work) test_and_clear_bit(tid, sta->ampdu_mlme.tid_rx_manage_offl)) ___ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid, - IEEE80211_MAX_AMPDU_BUF, + IEEE80211_MAX_AMPDU_BUF_HT, false, true); if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index d1978aa1c15d..172aeae21ae9 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -165,6 +165,7 @@ typedef unsigned __bitwise ieee80211_tx_result; #define TX_DROP ((__force ieee80211_tx_result) 1u) #define TX_QUEUED ((__force ieee80211_tx_result) 2u) +#define IEEE80211_TX_NO_SEQNO BIT(0) #define IEEE80211_TX_UNICAST BIT(1) #define IEEE80211_TX_PS_BUFFERED BIT(2) @@ -364,6 +365,7 @@ enum ieee80211_sta_flags { IEEE80211_STA_DISABLE_160MHZ = BIT(13), IEEE80211_STA_DISABLE_WMM = BIT(14), IEEE80211_STA_ENABLE_RRM = BIT(15), + IEEE80211_STA_DISABLE_HE = BIT(16), }; struct ieee80211_mgd_auth_data { @@ -1453,6 +1455,10 @@ struct ieee802_11_elems { const struct ieee80211_vht_cap *vht_cap_elem; const struct ieee80211_vht_operation *vht_operation; const struct ieee80211_meshconf_ie *mesh_config; + const u8 *he_cap; + const struct ieee80211_he_operation *he_operation; + const struct ieee80211_mu_edca_param_set *mu_edca_param_set; + const u8 *uora_element; const u8 *mesh_id; const u8 *peering; const __le16 *awake_window; @@ -1482,6 +1488,7 @@ struct ieee802_11_elems { u8 ext_supp_rates_len; u8 wmm_info_len; u8 wmm_param_len; + u8 he_cap_len; u8 mesh_id_len; u8 peering_len; u8 preq_len; @@ -1824,6 +1831,13 @@ void 
ieee80211_get_vht_mask_from_cap(__le16 vht_cap, enum nl80211_chan_width ieee80211_sta_rx_bw_to_chan_width(struct sta_info *sta); +/* HE */ +void +ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + const u8 *he_cap_ie, u8 he_cap_len, + struct sta_info *sta); + /* Spectrum management */ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, @@ -1880,19 +1894,20 @@ void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata, void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata, bool bss_notify, bool enable_qos); void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, struct sk_buff *skb); + struct sta_info *sta, struct sk_buff *skb, + u32 txdata_flags); void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, - enum nl80211_band band); + enum nl80211_band band, u32 txdata_flags); static inline void ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, - enum nl80211_band band) + enum nl80211_band band, u32 txdata_flags) { rcu_read_lock(); - __ieee80211_tx_skb_tid_band(sdata, skb, tid, band); + __ieee80211_tx_skb_tid_band(sdata, skb, tid, band, txdata_flags); rcu_read_unlock(); } @@ -1910,7 +1925,7 @@ static inline void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata, } __ieee80211_tx_skb_tid_band(sdata, skb, tid, - chanctx_conf->def.chan->band); + chanctx_conf->def.chan->band, 0); rcu_read_unlock(); } @@ -2031,26 +2046,27 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, const u8 *bssid, u16 stype, u16 reason, bool send_frame, u8 *frame_buf); + +enum { + IEEE80211_PROBE_FLAG_DIRECTED = BIT(0), + IEEE80211_PROBE_FLAG_MIN_CONTENT = BIT(1), + IEEE80211_PROBE_FLAG_RANDOM_SN = BIT(2), +}; + int ieee80211_build_preq_ies(struct 
ieee80211_local *local, u8 *buffer, size_t buffer_len, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, u8 bands_used, u32 *rate_masks, - struct cfg80211_chan_def *chandef); + struct cfg80211_chan_def *chandef, + u32 flags); struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, const u8 *src, const u8 *dst, u32 ratemask, struct ieee80211_channel *chan, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, - bool directed); -void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, - const u8 *src, const u8 *dst, - const u8 *ssid, size_t ssid_len, - const u8 *ie, size_t ie_len, - u32 ratemask, bool directed, u32 tx_flags, - struct ieee80211_channel *channel, bool scan); - + u32 flags); u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band band, u32 *basic_rates); @@ -2073,6 +2089,9 @@ u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, u32 cap); u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, const struct cfg80211_chan_def *chandef); +u8 *ieee80211_ie_build_he_cap(u8 *pos, + const struct ieee80211_sta_he_cap *he_cap, + u8 *end); int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates); diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 555e389b7dfa..5e6cf2cee965 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1130,7 +1130,7 @@ static void ieee80211_uninit(struct net_device *dev) static u16 ieee80211_netdev_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, + struct net_device *sb_dev, select_queue_fallback_t fallback) { return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb); @@ -1176,7 +1176,7 @@ static const struct net_device_ops ieee80211_dataif_ops = { static u16 ieee80211_monitor_select_queue(struct net_device *dev, struct 
sk_buff *skb, - void *accel_priv, + struct net_device *sb_dev, select_queue_fallback_t fallback) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); diff --git a/net/mac80211/key.c b/net/mac80211/key.c index ee0d0cc8dc3b..c054ac85793c 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -656,11 +656,15 @@ int ieee80211_key_link(struct ieee80211_key *key, { struct ieee80211_local *local = sdata->local; struct ieee80211_key *old_key; - int idx, ret; - bool pairwise; - - pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE; - idx = key->conf.keyidx; + int idx = key->conf.keyidx; + bool pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE; + /* + * We want to delay tailroom updates only for station - in that + * case it helps roaming speed, but in other cases it hurts and + * can cause warnings to appear. + */ + bool delay_tailroom = sdata->vif.type == NL80211_IFTYPE_STATION; + int ret; mutex_lock(&sdata->local->key_mtx); @@ -688,14 +692,14 @@ int ieee80211_key_link(struct ieee80211_key *key, increment_tailroom_need_count(sdata); ieee80211_key_replace(sdata, sta, pairwise, old_key, key); - ieee80211_key_destroy(old_key, true); + ieee80211_key_destroy(old_key, delay_tailroom); ieee80211_debugfs_key_add(key); if (!local->wowlan) { ret = ieee80211_key_enable_hw_accel(key); if (ret) - ieee80211_key_free(key, true); + ieee80211_key_free(key, delay_tailroom); } else { ret = 0; } @@ -930,7 +934,8 @@ void ieee80211_free_sta_keys(struct ieee80211_local *local, ieee80211_key_replace(key->sdata, key->sta, key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, key, NULL); - __ieee80211_key_destroy(key, true); + __ieee80211_key_destroy(key, key->sdata->vif.type == + NL80211_IFTYPE_STATION); } for (i = 0; i < NUM_DEFAULT_KEYS; i++) { @@ -940,7 +945,8 @@ void ieee80211_free_sta_keys(struct ieee80211_local *local, ieee80211_key_replace(key->sdata, key->sta, key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, key, NULL); - __ieee80211_key_destroy(key, true); + 
__ieee80211_key_destroy(key, key->sdata->vif.type == + NL80211_IFTYPE_STATION); } mutex_unlock(&local->key_mtx); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index fb73451ed85e..4fb2709cb527 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -3,6 +3,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright (C) 2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -557,10 +558,19 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211); - if (!ops->hw_scan) + if (!ops->hw_scan) { wiphy->features |= NL80211_FEATURE_LOW_PRIORITY_SCAN | NL80211_FEATURE_AP_SCAN; - + /* + * if the driver behaves correctly using the probe request + * (template) from mac80211, then both of these should be + * supported even with hw scan - but let drivers opt in. 
+ */ + wiphy_ext_feature_set(wiphy, + NL80211_EXT_FEATURE_SCAN_RANDOM_SN); + wiphy_ext_feature_set(wiphy, + NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT); + } if (!ops->set_key) wiphy->flags |= WIPHY_FLAG_IBSS_RSN; @@ -588,8 +598,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, local->hw.queues = 1; local->hw.max_rates = 1; local->hw.max_report_rates = 0; - local->hw.max_rx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF; - local->hw.max_tx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF; + local->hw.max_rx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF_HT; + local->hw.max_tx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF_HT; local->hw.offchannel_tx_hw_queue = IEEE80211_INVAL_HW_QUEUE; local->hw.conf.long_frame_max_tx_count = wiphy->retry_long; local->hw.conf.short_frame_max_tx_count = wiphy->retry_short; @@ -816,7 +826,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) int result, i; enum nl80211_band band; int channels, max_bitrates; - bool supp_ht, supp_vht; + bool supp_ht, supp_vht, supp_he; netdev_features_t feature_whitelist; struct cfg80211_chan_def dflt_chandef = {}; @@ -896,6 +906,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) max_bitrates = 0; supp_ht = false; supp_vht = false; + supp_he = false; for (band = 0; band < NUM_NL80211_BANDS; band++) { struct ieee80211_supported_band *sband; @@ -922,6 +933,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) supp_ht = supp_ht || sband->ht_cap.ht_supported; supp_vht = supp_vht || sband->vht_cap.vht_supported; + if (!supp_he) + supp_he = !!ieee80211_get_he_sta_cap(sband); + if (!sband->ht_cap.ht_supported) continue; @@ -1011,6 +1025,18 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) local->scan_ies_len += 2 + sizeof(struct ieee80211_vht_cap); + /* HE cap element is variable in size - set len to allow max size */ + /* + * TODO: 1 is added at the end of the calculation to accommodate for + * the temporary placing of the HE capabilities IE under EXT. 
+ * Remove it once it is placed in the final place. + */ + if (supp_he) + local->scan_ies_len += + 2 + sizeof(struct ieee80211_he_cap_elem) + + sizeof(struct ieee80211_he_mcs_nss_supp) + + IEEE80211_HE_PPE_THRES_MAX_LEN + 1; + if (!local->ops->hw_scan) { /* For hw_scan, driver needs to set these up. */ local->hw.wiphy->max_scan_ssids = 4; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index a59187c016e0..7fb9957359a3 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -149,6 +149,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, const struct ieee80211_ht_operation *ht_oper, const struct ieee80211_vht_operation *vht_oper, + const struct ieee80211_he_operation *he_oper, struct cfg80211_chan_def *chandef, bool tracking) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; @@ -207,7 +208,27 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, } vht_chandef = *chandef; - if (!ieee80211_chandef_vht_oper(vht_oper, &vht_chandef)) { + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && he_oper && + (le32_to_cpu(he_oper->he_oper_params) & + IEEE80211_HE_OPERATION_VHT_OPER_INFO)) { + struct ieee80211_vht_operation he_oper_vht_cap; + + /* + * Set only first 3 bytes (other 2 aren't used in + * ieee80211_chandef_vht_oper() anyway) + */ + memcpy(&he_oper_vht_cap, he_oper->optional, 3); + he_oper_vht_cap.basic_mcs_set = cpu_to_le16(0); + + if (!ieee80211_chandef_vht_oper(&he_oper_vht_cap, + &vht_chandef)) { + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) + sdata_info(sdata, + "HE AP VHT information is invalid, disable HE\n"); + ret = IEEE80211_STA_DISABLE_HE; + goto out; + } + } else if (!ieee80211_chandef_vht_oper(vht_oper, &vht_chandef)) { if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) sdata_info(sdata, "AP VHT information is invalid, disable VHT\n"); @@ -300,12 +321,14 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, const struct ieee80211_ht_cap *ht_cap, const 
struct ieee80211_ht_operation *ht_oper, const struct ieee80211_vht_operation *vht_oper, + const struct ieee80211_he_operation *he_oper, const u8 *bssid, u32 *changed) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - struct ieee80211_supported_band *sband; - struct ieee80211_channel *chan; + struct ieee80211_channel *chan = sdata->vif.bss_conf.chandef.chan; + struct ieee80211_supported_band *sband = + local->hw.wiphy->bands[chan->band]; struct cfg80211_chan_def chandef; u16 ht_opmode; u32 flags; @@ -320,6 +343,11 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, if (ifmgd->flags & IEEE80211_STA_DISABLE_VHT) vht_oper = NULL; + /* don't check HE if we associated as non-HE station */ + if (ifmgd->flags & IEEE80211_STA_DISABLE_HE || + !ieee80211_get_he_sta_cap(sband)) + he_oper = NULL; + if (WARN_ON_ONCE(!sta)) return -EINVAL; @@ -333,12 +361,9 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.ht_operation_mode = ht_opmode; } - chan = sdata->vif.bss_conf.chandef.chan; - sband = local->hw.wiphy->bands[chan->band]; - - /* calculate new channel (type) based on HT/VHT operation IEs */ + /* calculate new channel (type) based on HT/VHT/HE operation IEs */ flags = ieee80211_determine_chantype(sdata, sband, chan, - ht_oper, vht_oper, + ht_oper, vht_oper, he_oper, &chandef, true); /* @@ -582,6 +607,34 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, ieee80211_ie_build_vht_cap(pos, &vht_cap, cap); } +/* This function determines HE capability flags for the association + * and builds the IE. 
+ */ +static void ieee80211_add_he_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, + struct ieee80211_supported_band *sband) +{ + u8 *pos; + const struct ieee80211_sta_he_cap *he_cap = NULL; + u8 he_cap_size; + + he_cap = ieee80211_get_he_sta_cap(sband); + if (!he_cap) + return; + + /* + * TODO: the 1 added is because this temporarily is under the EXTENSION + * IE. Get rid of it when it moves. + */ + he_cap_size = + 2 + 1 + sizeof(he_cap->he_cap_elem) + + ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem) + + ieee80211_he_ppe_size(he_cap->ppe_thres[0], + he_cap->he_cap_elem.phy_cap_info); + pos = skb_put(skb, he_cap_size); + ieee80211_ie_build_he_cap(pos, he_cap, pos + he_cap_size); +} + static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; @@ -643,6 +696,9 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) 2 + 2 * sband->n_channels + /* supported channels */ 2 + sizeof(struct ieee80211_ht_cap) + /* HT */ 2 + sizeof(struct ieee80211_vht_cap) + /* VHT */ + 2 + 1 + sizeof(struct ieee80211_he_cap_elem) + /* HE */ + sizeof(struct ieee80211_he_mcs_nss_supp) + + IEEE80211_HE_PPE_THRES_MAX_LEN + assoc_data->ie_len + /* extra IEs */ (assoc_data->fils_kek_len ? 
16 /* AES-SIV */ : 0) + 9, /* WMM */ @@ -827,11 +883,41 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) offset = noffset; } + /* if present, add any custom IEs that go before HE */ + if (assoc_data->ie_len) { + static const u8 before_he[] = { + /* + * no need to list the ones split off before VHT + * or generated here + */ + WLAN_EID_OPMODE_NOTIF, + WLAN_EID_EXTENSION, WLAN_EID_EXT_FUTURE_CHAN_GUIDANCE, + /* 11ai elements */ + WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_SESSION, + WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_PUBLIC_KEY, + WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_KEY_CONFIRM, + WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_HLP_CONTAINER, + WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_IP_ADDR_ASSIGN, + /* TODO: add 11ah/11aj/11ak elements */ + }; + + /* RIC already taken above, so no need to handle here anymore */ + noffset = ieee80211_ie_split(assoc_data->ie, assoc_data->ie_len, + before_he, ARRAY_SIZE(before_he), + offset); + pos = skb_put(skb, noffset - offset); + memcpy(pos, assoc_data->ie + offset, noffset - offset); + offset = noffset; + } + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) ieee80211_add_vht_ie(sdata, skb, sband, &assoc_data->ap_vht_cap); - /* if present, add any custom non-vendor IEs that go after HT */ + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) + ieee80211_add_he_ie(sdata, skb, sband); + + /* if present, add any custom non-vendor IEs that go after HE */ if (assoc_data->ie_len) { noffset = ieee80211_ie_split_vendor(assoc_data->ie, assoc_data->ie_len, @@ -898,6 +984,11 @@ void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_hdr_3addr *nullfunc; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + /* Don't send NDPs when STA is connected HE */ + if (sdata->vif.type == NL80211_IFTYPE_STATION && + !(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) + return; + skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif, !ieee80211_hw_check(&local->hw, DOESNT_SUPPORT_QOS_NDP)); if (!skb) @@ -929,6 +1020,10 @@ static void 
ieee80211_send_4addr_nullfunc(struct ieee80211_local *local, if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) return; + /* Don't send NDPs when connected HE */ + if (!(sdata->u.mgd.flags & IEEE80211_STA_DISABLE_HE)) + return; + skb = dev_alloc_skb(local->hw.extra_tx_headroom + 30); if (!skb) return; @@ -1700,9 +1795,11 @@ static void ieee80211_sta_handle_tspec_ac_params_wk(struct work_struct *work) } /* MLME */ -static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - const u8 *wmm_param, size_t wmm_param_len) +static bool +ieee80211_sta_wmm_params(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + const u8 *wmm_param, size_t wmm_param_len, + const struct ieee80211_mu_edca_param_set *mu_edca) { struct ieee80211_tx_queue_params params[IEEE80211_NUM_ACS]; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; @@ -1749,6 +1846,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, sdata->wmm_acm |= BIT(1) | BIT(2); /* BK/- */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BK) uapsd = true; + params[ac].mu_edca = !!mu_edca; + if (mu_edca) + params[ac].mu_edca_param_rec = mu_edca->ac_bk; break; case 2: /* AC_VI */ ac = IEEE80211_AC_VI; @@ -1756,6 +1856,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, sdata->wmm_acm |= BIT(4) | BIT(5); /* CL/VI */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VI) uapsd = true; + params[ac].mu_edca = !!mu_edca; + if (mu_edca) + params[ac].mu_edca_param_rec = mu_edca->ac_vi; break; case 3: /* AC_VO */ ac = IEEE80211_AC_VO; @@ -1763,6 +1866,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, sdata->wmm_acm |= BIT(6) | BIT(7); /* VO/NC */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VO) uapsd = true; + params[ac].mu_edca = !!mu_edca; + if (mu_edca) + params[ac].mu_edca_param_rec = mu_edca->ac_vo; break; case 0: /* AC_BE */ default: @@ -1771,6 +1877,9 @@ static bool 
ieee80211_sta_wmm_params(struct ieee80211_local *local, sdata->wmm_acm |= BIT(0) | BIT(3); /* BE/EE */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BE) uapsd = true; + params[ac].mu_edca = !!mu_edca; + if (mu_edca) + params[ac].mu_edca_param_rec = mu_edca->ac_be; break; } @@ -2219,6 +2328,20 @@ void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata, ieee80211_sta_reset_conn_monitor(sdata); } +static void ieee80211_mlme_send_probe_req(struct ieee80211_sub_if_data *sdata, + const u8 *src, const u8 *dst, + const u8 *ssid, size_t ssid_len, + struct ieee80211_channel *channel) +{ + struct sk_buff *skb; + + skb = ieee80211_build_probe_req(sdata, src, dst, (u32)-1, channel, + ssid, ssid_len, NULL, 0, + IEEE80211_PROBE_FLAG_DIRECTED); + if (skb) + ieee80211_tx_skb(sdata, skb); +} + static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; @@ -2265,10 +2388,9 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) else ssid_len = ssid[1]; - ieee80211_send_probe_req(sdata, sdata->vif.addr, dst, - ssid + 2, ssid_len, NULL, - 0, (u32) -1, true, 0, - ifmgd->associated->channel, false); + ieee80211_mlme_send_probe_req(sdata, sdata->vif.addr, dst, + ssid + 2, ssid_len, + ifmgd->associated->channel); rcu_read_unlock(); } @@ -2370,7 +2492,7 @@ struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw, skb = ieee80211_build_probe_req(sdata, sdata->vif.addr, cbss->bssid, (u32) -1, cbss->channel, ssid + 2, ssid_len, - NULL, 0, true); + NULL, 0, IEEE80211_PROBE_FLAG_DIRECTED); rcu_read_unlock(); return skb; @@ -3008,6 +3130,25 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, goto out; } + /* + * If AP doesn't support HT, or it doesn't have HE mandatory IEs, mark + * HE as disabled. If on the 5GHz band, make sure it supports VHT. 
+ */ + if (ifmgd->flags & IEEE80211_STA_DISABLE_HT || + (sband->band == NL80211_BAND_5GHZ && + ifmgd->flags & IEEE80211_STA_DISABLE_VHT) || + (!elems.he_cap && !elems.he_operation)) + ifmgd->flags |= IEEE80211_STA_DISABLE_HE; + + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && + (!elems.he_cap || !elems.he_operation)) { + mutex_unlock(&sdata->local->sta_mtx); + sdata_info(sdata, + "HE AP is missing HE capability/operation\n"); + ret = false; + goto out; + } + /* Set up internal HT/VHT capabilities */ if (elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, @@ -3017,6 +3158,48 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, elems.vht_cap_elem, sta); + if (elems.he_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && + elems.he_cap) { + ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, + elems.he_cap, + elems.he_cap_len, + sta); + + bss_conf->he_support = sta->sta.he_cap.has_he; + } else { + bss_conf->he_support = false; + } + + if (bss_conf->he_support) { + u32 he_oper_params = + le32_to_cpu(elems.he_operation->he_oper_params); + + bss_conf->bss_color = he_oper_params & + IEEE80211_HE_OPERATION_BSS_COLOR_MASK; + bss_conf->htc_trig_based_pkt_ext = + (he_oper_params & + IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK) << + IEEE80211_HE_OPERATION_DFLT_PE_DURATION_OFFSET; + bss_conf->frame_time_rts_th = + (he_oper_params & + IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK) << + IEEE80211_HE_OPERATION_RTS_THRESHOLD_OFFSET; + + bss_conf->multi_sta_back_32bit = + sta->sta.he_cap.he_cap_elem.mac_cap_info[2] & + IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP; + + bss_conf->ack_enabled = + sta->sta.he_cap.he_cap_elem.mac_cap_info[2] & + IEEE80211_HE_MAC_CAP2_ACK_EN; + + bss_conf->uora_exists = !!elems.uora_element; + if (elems.uora_element) + bss_conf->uora_ocw_range = elems.uora_element[0]; + + /* TODO: OPEN: what happens if BSS color disable is set? 
*/ + } + /* * Some APs, e.g. Netgear WNDR3700, report invalid HT operation data * in their association response, so ignore that data for our own @@ -3076,7 +3259,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, if (ifmgd->flags & IEEE80211_STA_DISABLE_WMM) { ieee80211_set_wmm_default(sdata, false, false); } else if (!ieee80211_sta_wmm_params(local, sdata, elems.wmm_param, - elems.wmm_param_len)) { + elems.wmm_param_len, + elems.mu_edca_param_set)) { /* still enable QoS since we might have HT/VHT */ ieee80211_set_wmm_default(sdata, false, true); /* set the disable-WMM flag in this case to disable @@ -3590,7 +3774,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, if (!(ifmgd->flags & IEEE80211_STA_DISABLE_WMM) && ieee80211_sta_wmm_params(local, sdata, elems.wmm_param, - elems.wmm_param_len)) + elems.wmm_param_len, + elems.mu_edca_param_set)) changed |= BSS_CHANGED_QOS; /* @@ -3629,7 +3814,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, if (ieee80211_config_bw(sdata, sta, elems.ht_cap_elem, elems.ht_operation, - elems.vht_operation, bssid, &changed)) { + elems.vht_operation, elems.he_operation, + bssid, &changed)) { mutex_unlock(&local->sta_mtx); sdata_info(sdata, "failed to follow AP %pM bandwidth change, disconnect\n", @@ -4266,6 +4452,68 @@ static u8 ieee80211_ht_vht_rx_chains(struct ieee80211_sub_if_data *sdata, return chains; } +static bool +ieee80211_verify_sta_he_mcs_support(struct ieee80211_supported_band *sband, + const struct ieee80211_he_operation *he_op) +{ + const struct ieee80211_sta_he_cap *sta_he_cap = + ieee80211_get_he_sta_cap(sband); + u16 ap_min_req_set; + int i; + + if (!sta_he_cap || !he_op) + return false; + + ap_min_req_set = le16_to_cpu(he_op->he_mcs_nss_set); + + /* Need to go over for 80MHz, 160MHz and for 80+80 */ + for (i = 0; i < 3; i++) { + const struct ieee80211_he_mcs_nss_supp *sta_mcs_nss_supp = + &sta_he_cap->he_mcs_nss_supp; + u16 sta_mcs_map_rx 
= + le16_to_cpu(((__le16 *)sta_mcs_nss_supp)[2 * i]); + u16 sta_mcs_map_tx = + le16_to_cpu(((__le16 *)sta_mcs_nss_supp)[2 * i + 1]); + u8 nss; + bool verified = true; + + /* + * For each band there is a maximum of 8 spatial streams + * possible. Each of the sta_mcs_map_* is a 16-bit struct built + * of 2 bits per NSS (1-8), with the values defined in enum + * ieee80211_he_mcs_support. Need to make sure STA TX and RX + * capabilities aren't less than the AP's minimum requirements + * for this HE BSS per SS. + * It is enough to find one such band that meets the reqs. + */ + for (nss = 8; nss > 0; nss--) { + u8 sta_rx_val = (sta_mcs_map_rx >> (2 * (nss - 1))) & 3; + u8 sta_tx_val = (sta_mcs_map_tx >> (2 * (nss - 1))) & 3; + u8 ap_val = (ap_min_req_set >> (2 * (nss - 1))) & 3; + + if (ap_val == IEEE80211_HE_MCS_NOT_SUPPORTED) + continue; + + /* + * Make sure the HE AP doesn't require MCSs that aren't + * supported by the client + */ + if (sta_rx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || + sta_tx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || + (ap_val > sta_rx_val) || (ap_val > sta_tx_val)) { + verified = false; + break; + } + } + + if (verified) + return true; + } + + /* If here, STA doesn't meet AP's HE min requirements */ + return false; +} + static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss) { @@ -4274,6 +4522,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, const struct ieee80211_ht_cap *ht_cap = NULL; const struct ieee80211_ht_operation *ht_oper = NULL; const struct ieee80211_vht_operation *vht_oper = NULL; + const struct ieee80211_he_operation *he_oper = NULL; struct ieee80211_supported_band *sband; struct cfg80211_chan_def chandef; int ret; @@ -4329,6 +4578,24 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, } } + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && + ieee80211_get_he_sta_cap(sband)) { + const struct cfg80211_bss_ies *ies; + const u8 *he_oper_ie; + + ies = 
rcu_dereference(cbss->ies); + he_oper_ie = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_OPERATION, + ies->data, ies->len); + if (he_oper_ie && + he_oper_ie[1] == ieee80211_he_oper_size(&he_oper_ie[3])) + he_oper = (void *)(he_oper_ie + 3); + else + he_oper = NULL; + + if (!ieee80211_verify_sta_he_mcs_support(sband, he_oper)) + ifmgd->flags |= IEEE80211_STA_DISABLE_HE; + } + /* Allow VHT if at least one channel on the sband supports 80 MHz */ have_80mhz = false; for (i = 0; i < sband->n_channels; i++) { @@ -4345,7 +4612,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, ifmgd->flags |= ieee80211_determine_chantype(sdata, sband, cbss->channel, - ht_oper, vht_oper, + ht_oper, vht_oper, he_oper, &chandef, false); sdata->needed_rx_chains = min(ieee80211_ht_vht_rx_chains(sdata, cbss), @@ -4751,8 +5018,9 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP104) { ifmgd->flags |= IEEE80211_STA_DISABLE_HT; ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + ifmgd->flags |= IEEE80211_STA_DISABLE_HE; netdev_info(sdata->dev, - "disabling HT/VHT due to WEP/TKIP use\n"); + "disabling HE/HT/VHT due to WEP/TKIP use\n"); } } diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index f1d40b6645ff..8ef4153cd299 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -262,7 +262,7 @@ static void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc, if (roc->mgmt_tx_cookie) { if (!WARN_ON(!roc->frame)) { ieee80211_tx_skb_tid_band(roc->sdata, roc->frame, 7, - roc->chan->band); + roc->chan->band, 0); roc->frame = NULL; } } else { diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index 76048b53c5b2..07fb219327d6 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -751,4 +751,3 @@ rc80211_minstrel_exit(void) { ieee80211_rate_control_unregister(&mac80211_minstrel); } - diff --git a/net/mac80211/rx.c 
b/net/mac80211/rx.c index 0a38cc1cbebc..64742f2765c4 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -175,6 +175,20 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local, len += 12; } + if (status->encoding == RX_ENC_HE && + status->flag & RX_FLAG_RADIOTAP_HE) { + len = ALIGN(len, 2); + len += 12; + BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_he) != 12); + } + + if (status->encoding == RX_ENC_HE && + status->flag & RX_FLAG_RADIOTAP_HE_MU) { + len = ALIGN(len, 2); + len += 12; + BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_he_mu) != 12); + } + if (status->chains) { /* antenna and antenna signal fields */ len += 2 * hweight8(status->chains); @@ -263,6 +277,19 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, int mpdulen, chain; unsigned long chains = status->chains; struct ieee80211_vendor_radiotap rtap = {}; + struct ieee80211_radiotap_he he = {}; + struct ieee80211_radiotap_he_mu he_mu = {}; + + if (status->flag & RX_FLAG_RADIOTAP_HE) { + he = *(struct ieee80211_radiotap_he *)skb->data; + skb_pull(skb, sizeof(he)); + WARN_ON_ONCE(status->encoding != RX_ENC_HE); + } + + if (status->flag & RX_FLAG_RADIOTAP_HE_MU) { + he_mu = *(struct ieee80211_radiotap_he_mu *)skb->data; + skb_pull(skb, sizeof(he_mu)); + } if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) { rtap = *(struct ieee80211_vendor_radiotap *)skb->data; @@ -520,6 +547,89 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, *pos++ = flags; } + if (status->encoding == RX_ENC_HE && + status->flag & RX_FLAG_RADIOTAP_HE) { +#define HE_PREP(f, val) cpu_to_le16(FIELD_PREP(IEEE80211_RADIOTAP_HE_##f, val)) + + if (status->enc_flags & RX_ENC_FLAG_STBC_MASK) { + he.data6 |= HE_PREP(DATA6_NSTS, + FIELD_GET(RX_ENC_FLAG_STBC_MASK, + status->enc_flags)); + he.data3 |= HE_PREP(DATA3_STBC, 1); + } else { + he.data6 |= HE_PREP(DATA6_NSTS, status->nss); + } + +#define CHECK_GI(s) \ + BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_GI_##s != \ + (int)NL80211_RATE_INFO_HE_GI_##s) + + 
CHECK_GI(0_8); + CHECK_GI(1_6); + CHECK_GI(3_2); + + he.data3 |= HE_PREP(DATA3_DATA_MCS, status->rate_idx); + he.data3 |= HE_PREP(DATA3_DATA_DCM, status->he_dcm); + he.data3 |= HE_PREP(DATA3_CODING, + !!(status->enc_flags & RX_ENC_FLAG_LDPC)); + + he.data5 |= HE_PREP(DATA5_GI, status->he_gi); + + switch (status->bw) { + case RATE_INFO_BW_20: + he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_20MHZ); + break; + case RATE_INFO_BW_40: + he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_40MHZ); + break; + case RATE_INFO_BW_80: + he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_80MHZ); + break; + case RATE_INFO_BW_160: + he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_160MHZ); + break; + case RATE_INFO_BW_HE_RU: +#define CHECK_RU_ALLOC(s) \ + BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_##s##T != \ + NL80211_RATE_INFO_HE_RU_ALLOC_##s + 4) + + CHECK_RU_ALLOC(26); + CHECK_RU_ALLOC(52); + CHECK_RU_ALLOC(106); + CHECK_RU_ALLOC(242); + CHECK_RU_ALLOC(484); + CHECK_RU_ALLOC(996); + CHECK_RU_ALLOC(2x996); + + he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, + status->he_ru + 4); + break; + default: + WARN_ONCE(1, "Invalid SU BW %d\n", status->bw); + } + + /* ensure 2 byte alignment */ + while ((pos - (u8 *)rthdr) & 1) + pos++; + rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_HE); + memcpy(pos, &he, sizeof(he)); + pos += sizeof(he); + } + + if (status->encoding == RX_ENC_HE && + status->flag & RX_FLAG_RADIOTAP_HE_MU) { + /* ensure 2 byte alignment */ + while ((pos - (u8 *)rthdr) & 1) + pos++; + rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_HE_MU); + memcpy(pos, &he_mu, sizeof(he_mu)); + pos += sizeof(he_mu); + } + for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) { *pos++ = status->chain_signal[chain]; *pos++ = chain; @@ -613,6 +723,12 @@ ieee80211_rx_monitor(struct 
ieee80211_local *local, struct sk_buff *origskb, rcu_dereference(local->monitor_sdata); bool only_monitor = false; + if (status->flag & RX_FLAG_RADIOTAP_HE) + rtap_space += sizeof(struct ieee80211_radiotap_he); + + if (status->flag & RX_FLAG_RADIOTAP_HE_MU) + rtap_space += sizeof(struct ieee80211_radiotap_he_mu); + if (unlikely(status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)) { struct ieee80211_vendor_radiotap *rtap = (void *)origskb->data; @@ -2254,11 +2370,8 @@ static void ieee80211_deliver_skb_to_local_stack(struct sk_buff *skb, sdata->control_port_over_nl80211)) { struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); bool noencrypt = status->flag & RX_FLAG_DECRYPTED; - struct ethhdr *ehdr = eth_hdr(skb); - cfg80211_rx_control_port(dev, skb->data, skb->len, - ehdr->h_source, - be16_to_cpu(skb->protocol), noencrypt); + cfg80211_rx_control_port(dev, skb, noencrypt); dev_kfree_skb(skb); } else { /* deliver to local stack */ @@ -3241,7 +3354,7 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) } __ieee80211_tx_skb_tid_band(rx->sdata, nskb, 7, - status->band); + status->band, 0); } dev_kfree_skb(rx->skb); return RX_QUEUED; @@ -3386,8 +3499,7 @@ static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx, status = IEEE80211_SKB_RXCB((rx->skb)); sband = rx->local->hw.wiphy->bands[status->band]; - if (!(status->encoding == RX_ENC_HT) && - !(status->encoding == RX_ENC_VHT)) + if (status->encoding == RX_ENC_LEGACY) rate = &sband->bitrates[status->rate_idx]; ieee80211_rx_cooked_monitor(rx, rate); @@ -4386,6 +4498,14 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, status->rate_idx, status->nss)) goto drop; break; + case RX_ENC_HE: + if (WARN_ONCE(status->rate_idx > 11 || + !status->nss || + status->nss > 8, + "Rate marked as an HE rate but data is invalid: MCS: %d, NSS: %d\n", + status->rate_idx, status->nss)) + goto drop; + break; default: WARN_ON_ONCE(1); /* fall through */ diff --git a/net/mac80211/scan.c 
b/net/mac80211/scan.c index 2e917a6d239d..5d2a11777718 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -20,6 +20,7 @@ #include <net/sch_generic.h> #include <linux/slab.h> #include <linux/export.h> +#include <linux/random.h> #include <net/mac80211.h> #include "ieee80211_i.h" @@ -293,6 +294,7 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) struct cfg80211_chan_def chandef; u8 bands_used = 0; int i, ielen, n_chans; + u32 flags = 0; req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->mtx)); @@ -331,12 +333,16 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) local->hw_scan_req->req.n_channels = n_chans; ieee80211_prepare_scan_chandef(&chandef, req->scan_width); + if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) + flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; + ielen = ieee80211_build_preq_ies(local, (u8 *)local->hw_scan_req->req.ie, local->hw_scan_ies_bufsize, &local->hw_scan_req->ies, req->ie, req->ie_len, - bands_used, req->rates, &chandef); + bands_used, req->rates, &chandef, + flags); local->hw_scan_req->req.ie_len = ielen; local->hw_scan_req->req.no_cck = req->no_cck; ether_addr_copy(local->hw_scan_req->req.mac_addr, req->mac_addr); @@ -528,6 +534,35 @@ void ieee80211_run_deferred_scan(struct ieee80211_local *local) round_jiffies_relative(0)); } +static void ieee80211_send_scan_probe_req(struct ieee80211_sub_if_data *sdata, + const u8 *src, const u8 *dst, + const u8 *ssid, size_t ssid_len, + const u8 *ie, size_t ie_len, + u32 ratemask, u32 flags, u32 tx_flags, + struct ieee80211_channel *channel) +{ + struct sk_buff *skb; + u32 txdata_flags = 0; + + skb = ieee80211_build_probe_req(sdata, src, dst, ratemask, channel, + ssid, ssid_len, + ie, ie_len, flags); + + if (skb) { + if (flags & IEEE80211_PROBE_FLAG_RANDOM_SN) { + struct ieee80211_hdr *hdr = (void *)skb->data; + u16 sn = get_random_u32(); + + txdata_flags |= IEEE80211_TX_NO_SEQNO; + hdr->seq_ctrl = + 
cpu_to_le16(IEEE80211_SN_TO_SEQ(sn)); + } + IEEE80211_SKB_CB(skb)->flags |= tx_flags; + ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band, + txdata_flags); + } +} + static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, unsigned long *next_delay) { @@ -535,7 +570,7 @@ static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata; struct cfg80211_scan_request *scan_req; enum nl80211_band band = local->hw.conf.chandef.chan->band; - u32 tx_flags; + u32 flags = 0, tx_flags; scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->mtx)); @@ -543,17 +578,21 @@ static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, tx_flags = IEEE80211_TX_INTFL_OFFCHAN_TX_OK; if (scan_req->no_cck) tx_flags |= IEEE80211_TX_CTL_NO_CCK_RATE; + if (scan_req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) + flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; + if (scan_req->flags & NL80211_SCAN_FLAG_RANDOM_SN) + flags |= IEEE80211_PROBE_FLAG_RANDOM_SN; sdata = rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->mtx)); for (i = 0; i < scan_req->n_ssids; i++) - ieee80211_send_probe_req( + ieee80211_send_scan_probe_req( sdata, local->scan_addr, scan_req->bssid, scan_req->ssids[i].ssid, scan_req->ssids[i].ssid_len, scan_req->ie, scan_req->ie_len, - scan_req->rates[band], false, - tx_flags, local->hw.conf.chandef.chan, true); + scan_req->rates[band], flags, + tx_flags, local->hw.conf.chandef.chan); /* * After sending probe requests, wait for probe responses @@ -1141,6 +1180,7 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, u32 rate_masks[NUM_NL80211_BANDS] = {}; u8 bands_used = 0; u8 *ie; + u32 flags = 0; iebufsz = local->scan_ies_len + req->ie_len; @@ -1157,6 +1197,9 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, } } + if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) + flags |= 
IEEE80211_PROBE_FLAG_MIN_CONTENT; + ie = kcalloc(iebufsz, num_bands, GFP_KERNEL); if (!ie) { ret = -ENOMEM; @@ -1167,7 +1210,8 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, ieee80211_build_preq_ies(local, ie, num_bands * iebufsz, &sched_scan_ies, req->ie, - req->ie_len, bands_used, rate_masks, &chandef); + req->ie_len, bands_used, rate_masks, &chandef, + flags); ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies); if (ret == 0) { diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 6428f1ac37b6..f34202242d24 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1323,6 +1323,11 @@ static void ieee80211_send_null_response(struct sta_info *sta, int tid, struct ieee80211_tx_info *info; struct ieee80211_chanctx_conf *chanctx_conf; + /* Don't send NDPs when STA is connected HE */ + if (sdata->vif.type == NL80211_IFTYPE_STATION && + !(sdata->u.mgd.flags & IEEE80211_STA_DISABLE_HE)) + return; + if (qos) { fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_NULLFUNC | @@ -1391,7 +1396,7 @@ static void ieee80211_send_null_response(struct sta_info *sta, int tid, } info->band = chanctx_conf->def.chan->band; - ieee80211_xmit(sdata, sta, skb); + ieee80211_xmit(sdata, sta, skb, 0); rcu_read_unlock(); } @@ -1968,7 +1973,7 @@ sta_get_last_rx_stats(struct sta_info *sta) return stats; } -static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate, +static void sta_stats_decode_rate(struct ieee80211_local *local, u32 rate, struct rate_info *rinfo) { rinfo->bw = STA_STATS_GET(BW, rate); @@ -2005,6 +2010,14 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate, rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift); break; } + case STA_STATS_RATE_TYPE_HE: + rinfo->flags = RATE_INFO_FLAGS_HE_MCS; + rinfo->mcs = STA_STATS_GET(HE_MCS, rate); + rinfo->nss = STA_STATS_GET(HE_NSS, rate); + rinfo->he_gi = STA_STATS_GET(HE_GI, rate); + rinfo->he_ru_alloc = STA_STATS_GET(HE_RU, 
rate); + rinfo->he_dcm = STA_STATS_GET(HE_DCM, rate); + break; } } @@ -2101,38 +2114,38 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, drv_sta_statistics(local, sdata, &sta->sta, sinfo); - sinfo->filled |= BIT(NL80211_STA_INFO_INACTIVE_TIME) | - BIT(NL80211_STA_INFO_STA_FLAGS) | - BIT(NL80211_STA_INFO_BSS_PARAM) | - BIT(NL80211_STA_INFO_CONNECTED_TIME) | - BIT(NL80211_STA_INFO_RX_DROP_MISC); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_INACTIVE_TIME) | + BIT_ULL(NL80211_STA_INFO_STA_FLAGS) | + BIT_ULL(NL80211_STA_INFO_BSS_PARAM) | + BIT_ULL(NL80211_STA_INFO_CONNECTED_TIME) | + BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC); if (sdata->vif.type == NL80211_IFTYPE_STATION) { sinfo->beacon_loss_count = sdata->u.mgd.beacon_loss_count; - sinfo->filled |= BIT(NL80211_STA_INFO_BEACON_LOSS); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_LOSS); } sinfo->connected_time = ktime_get_seconds() - sta->last_connected; sinfo->inactive_time = jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta)); - if (!(sinfo->filled & (BIT(NL80211_STA_INFO_TX_BYTES64) | - BIT(NL80211_STA_INFO_TX_BYTES)))) { + if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES64) | + BIT_ULL(NL80211_STA_INFO_TX_BYTES)))) { sinfo->tx_bytes = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_bytes += sta->tx_stats.bytes[ac]; - sinfo->filled |= BIT(NL80211_STA_INFO_TX_BYTES64); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BYTES64); } - if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_PACKETS))) { + if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_PACKETS))) { sinfo->tx_packets = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_packets += sta->tx_stats.packets[ac]; - sinfo->filled |= BIT(NL80211_STA_INFO_TX_PACKETS); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_PACKETS); } - if (!(sinfo->filled & (BIT(NL80211_STA_INFO_RX_BYTES64) | - BIT(NL80211_STA_INFO_RX_BYTES)))) { + if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES64) | + BIT_ULL(NL80211_STA_INFO_RX_BYTES)))) { 
sinfo->rx_bytes += sta_get_stats_bytes(&sta->rx_stats); if (sta->pcpu_rx_stats) { @@ -2144,10 +2157,10 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, } } - sinfo->filled |= BIT(NL80211_STA_INFO_RX_BYTES64); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BYTES64); } - if (!(sinfo->filled & BIT(NL80211_STA_INFO_RX_PACKETS))) { + if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_PACKETS))) { sinfo->rx_packets = sta->rx_stats.packets; if (sta->pcpu_rx_stats) { for_each_possible_cpu(cpu) { @@ -2157,17 +2170,17 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, sinfo->rx_packets += cpurxs->packets; } } - sinfo->filled |= BIT(NL80211_STA_INFO_RX_PACKETS); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_PACKETS); } - if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_RETRIES))) { + if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_RETRIES))) { sinfo->tx_retries = sta->status_stats.retry_count; - sinfo->filled |= BIT(NL80211_STA_INFO_TX_RETRIES); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_RETRIES); } - if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_FAILED))) { + if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED))) { sinfo->tx_failed = sta->status_stats.retry_failed; - sinfo->filled |= BIT(NL80211_STA_INFO_TX_FAILED); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_FAILED); } sinfo->rx_dropped_misc = sta->rx_stats.dropped; @@ -2182,23 +2195,23 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, if (sdata->vif.type == NL80211_IFTYPE_STATION && !(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) { - sinfo->filled |= BIT(NL80211_STA_INFO_BEACON_RX) | - BIT(NL80211_STA_INFO_BEACON_SIGNAL_AVG); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_RX) | + BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG); sinfo->rx_beacon_signal_avg = ieee80211_ave_rssi(&sdata->vif); } if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) || ieee80211_hw_check(&sta->local->hw, SIGNAL_UNSPEC)) { - if (!(sinfo->filled & 
BIT(NL80211_STA_INFO_SIGNAL))) { + if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL))) { sinfo->signal = (s8)last_rxstats->last_signal; - sinfo->filled |= BIT(NL80211_STA_INFO_SIGNAL); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL); } if (!sta->pcpu_rx_stats && - !(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL_AVG))) { + !(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG))) { sinfo->signal_avg = -ewma_signal_read(&sta->rx_stats_avg.signal); - sinfo->filled |= BIT(NL80211_STA_INFO_SIGNAL_AVG); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG); } } @@ -2207,11 +2220,11 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, * pcpu statistics */ if (last_rxstats->chains && - !(sinfo->filled & (BIT(NL80211_STA_INFO_CHAIN_SIGNAL) | - BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) { - sinfo->filled |= BIT(NL80211_STA_INFO_CHAIN_SIGNAL); + !(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL) | + BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) { + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL); if (!sta->pcpu_rx_stats) - sinfo->filled |= BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG); sinfo->chains = last_rxstats->chains; @@ -2223,15 +2236,15 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, } } - if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_BITRATE))) { + if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))) { sta_set_rate_info_tx(sta, &sta->tx_stats.last_rate, &sinfo->txrate); - sinfo->filled |= BIT(NL80211_STA_INFO_TX_BITRATE); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BITRATE); } - if (!(sinfo->filled & BIT(NL80211_STA_INFO_RX_BITRATE))) { + if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE))) { if (sta_set_rate_info_rx(sta, &sinfo->rxrate) == 0) - sinfo->filled |= BIT(NL80211_STA_INFO_RX_BITRATE); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BITRATE); } if (tidstats && !cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) { @@ 
-2244,18 +2257,18 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, if (ieee80211_vif_is_mesh(&sdata->vif)) { #ifdef CONFIG_MAC80211_MESH - sinfo->filled |= BIT(NL80211_STA_INFO_LLID) | - BIT(NL80211_STA_INFO_PLID) | - BIT(NL80211_STA_INFO_PLINK_STATE) | - BIT(NL80211_STA_INFO_LOCAL_PM) | - BIT(NL80211_STA_INFO_PEER_PM) | - BIT(NL80211_STA_INFO_NONPEER_PM); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_LLID) | + BIT_ULL(NL80211_STA_INFO_PLID) | + BIT_ULL(NL80211_STA_INFO_PLINK_STATE) | + BIT_ULL(NL80211_STA_INFO_LOCAL_PM) | + BIT_ULL(NL80211_STA_INFO_PEER_PM) | + BIT_ULL(NL80211_STA_INFO_NONPEER_PM); sinfo->llid = sta->mesh->llid; sinfo->plid = sta->mesh->plid; sinfo->plink_state = sta->mesh->plink_state; if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) { - sinfo->filled |= BIT(NL80211_STA_INFO_T_OFFSET); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_T_OFFSET); sinfo->t_offset = sta->mesh->t_offset; } sinfo->local_pm = sta->mesh->local_pm; @@ -2300,7 +2313,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, thr = sta_get_expected_throughput(sta); if (thr != 0) { - sinfo->filled |= BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_EXPECTED_THROUGHPUT); sinfo->expected_throughput = thr; } diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 81b35f623792..9a04327d71d1 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -170,7 +170,7 @@ struct tid_ampdu_tx { u8 dialog_token; u8 stop_initiator; bool tx_stop; - u8 buf_size; + u16 buf_size; u16 failed_bar_ssn; bool bar_pending; @@ -405,7 +405,7 @@ struct ieee80211_sta_rx_stats { int last_signal; u8 chains; s8 chain_signal_last[IEEE80211_MAX_CHAINS]; - u16 last_rate; + u32 last_rate; struct u64_stats_sync syncp; u64 bytes; u64 msdu[IEEE80211_NUM_TIDS + 1]; @@ -764,6 +764,7 @@ enum sta_stats_type { STA_STATS_RATE_TYPE_LEGACY, STA_STATS_RATE_TYPE_HT, STA_STATS_RATE_TYPE_VHT, + STA_STATS_RATE_TYPE_HE, }; #define 
STA_STATS_FIELD_HT_MCS GENMASK( 7, 0) @@ -771,9 +772,14 @@ enum sta_stats_type { #define STA_STATS_FIELD_LEGACY_BAND GENMASK( 7, 4) #define STA_STATS_FIELD_VHT_MCS GENMASK( 3, 0) #define STA_STATS_FIELD_VHT_NSS GENMASK( 7, 4) +#define STA_STATS_FIELD_HE_MCS GENMASK( 3, 0) +#define STA_STATS_FIELD_HE_NSS GENMASK( 7, 4) #define STA_STATS_FIELD_BW GENMASK(11, 8) #define STA_STATS_FIELD_SGI GENMASK(12, 12) #define STA_STATS_FIELD_TYPE GENMASK(15, 13) +#define STA_STATS_FIELD_HE_RU GENMASK(18, 16) +#define STA_STATS_FIELD_HE_GI GENMASK(20, 19) +#define STA_STATS_FIELD_HE_DCM GENMASK(21, 21) #define STA_STATS_FIELD(_n, _v) FIELD_PREP(STA_STATS_FIELD_ ## _n, _v) #define STA_STATS_GET(_n, _v) FIELD_GET(STA_STATS_FIELD_ ## _n, _v) @@ -782,7 +788,7 @@ enum sta_stats_type { static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s) { - u16 r; + u32 r; r = STA_STATS_FIELD(BW, s->bw); @@ -804,6 +810,14 @@ static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s) r |= STA_STATS_FIELD(LEGACY_BAND, s->band); r |= STA_STATS_FIELD(LEGACY_IDX, s->rate_idx); break; + case RX_ENC_HE: + r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_HE); + r |= STA_STATS_FIELD(HE_NSS, s->nss); + r |= STA_STATS_FIELD(HE_MCS, s->rate_idx); + r |= STA_STATS_FIELD(HE_GI, s->he_gi); + r |= STA_STATS_FIELD(HE_RU, s->he_ru); + r |= STA_STATS_FIELD(HE_DCM, s->he_dcm); + break; default: WARN_ON(1); return STA_STATS_RATE_INVALID; diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 80a7edf8d314..0ab69a1964f8 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -92,7 +92,7 @@ STA_ENTRY \ __field(u16, tid) \ __field(u16, ssn) \ - __field(u8, buf_size) \ + __field(u16, buf_size) \ __field(bool, amsdu) \ __field(u16, timeout) \ __field(u16, action) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 44b5dfe8727d..cd332e3e1134 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -825,6 +825,8 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) */ if 
(!ieee80211_is_data_qos(hdr->frame_control) || is_multicast_ether_addr(hdr->addr1)) { + if (tx->flags & IEEE80211_TX_NO_SEQNO) + return TX_CONTINUE; /* driver should assign sequence number */ info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ; /* for pure STA mode without beacons, we can do it */ @@ -1247,7 +1249,7 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)) return NULL; - if (!ieee80211_is_data(hdr->frame_control)) + if (!ieee80211_is_data_present(hdr->frame_control)) return NULL; if (sta) { @@ -1854,7 +1856,7 @@ EXPORT_SYMBOL(ieee80211_tx_prepare_skb); */ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb, - bool txpending) + bool txpending, u32 txdata_flags) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_data tx; @@ -1872,6 +1874,8 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, led_len = skb->len; res_prepare = ieee80211_tx_prepare(sdata, &tx, sta, skb); + tx.flags |= txdata_flags; + if (unlikely(res_prepare == TX_DROP)) { ieee80211_free_txskb(&local->hw, skb); return true; @@ -1933,7 +1937,8 @@ static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata, } void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, struct sk_buff *skb) + struct sta_info *sta, struct sk_buff *skb, + u32 txdata_flags) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); @@ -1968,7 +1973,7 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, } ieee80211_set_qos_hdr(sdata, skb); - ieee80211_tx(sdata, sta, skb, false); + ieee80211_tx(sdata, sta, skb, false, txdata_flags); } static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local, @@ -2289,7 +2294,7 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, if (!ieee80211_parse_tx_radiotap(local, skb)) goto fail_rcu; - ieee80211_xmit(sdata, NULL, skb); + 
ieee80211_xmit(sdata, NULL, skb, 0); rcu_read_unlock(); return NETDEV_TX_OK; @@ -3648,7 +3653,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, ieee80211_tx_stats(dev, skb->len); - ieee80211_xmit(sdata, sta, skb); + ieee80211_xmit(sdata, sta, skb, 0); } goto out; out_free: @@ -3867,7 +3872,7 @@ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local, return true; } info->band = chanctx_conf->def.chan->band; - result = ieee80211_tx(sdata, NULL, skb, true); + result = ieee80211_tx(sdata, NULL, skb, true, 0); } else { struct sk_buff_head skbs; @@ -4783,7 +4788,7 @@ EXPORT_SYMBOL(ieee80211_unreserve_tid); void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, - enum nl80211_band band) + enum nl80211_band band, u32 txdata_flags) { int ac = ieee80211_ac_from_tid(tid); @@ -4800,7 +4805,7 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, */ local_bh_disable(); IEEE80211_SKB_CB(skb)->band = band; - ieee80211_xmit(sdata, NULL, skb); + ieee80211_xmit(sdata, NULL, skb, txdata_flags); local_bh_enable(); } @@ -4845,7 +4850,9 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, skb_reset_network_header(skb); skb_reset_mac_header(skb); + local_bh_disable(); __ieee80211_subif_start_xmit(skb, skb->dev, flags); + local_bh_enable(); return 0; } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 5e2e511c4a6f..88efda7c9f8a 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1095,6 +1095,21 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, if (elen >= sizeof(*elems->max_idle_period_ie)) elems->max_idle_period_ie = (void *)pos; break; + case WLAN_EID_EXTENSION: + if (pos[0] == WLAN_EID_EXT_HE_MU_EDCA && + elen >= (sizeof(*elems->mu_edca_param_set) + 1)) { + elems->mu_edca_param_set = (void *)&pos[1]; + } else if (pos[0] == WLAN_EID_EXT_HE_CAPABILITY) { + elems->he_cap = (void *)&pos[1]; + elems->he_cap_len = elen - 1; + } else if 
(pos[0] == WLAN_EID_EXT_HE_OPERATION && + elen >= sizeof(*elems->he_operation) && + elen >= ieee80211_he_oper_size(&pos[1])) { + elems->he_operation = (void *)&pos[1]; + } else if (pos[0] == WLAN_EID_EXT_UORA && elen >= 1) { + elems->uora_element = (void *)&pos[1]; + } + break; default: break; } @@ -1353,9 +1368,10 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, enum nl80211_band band, u32 rate_mask, struct cfg80211_chan_def *chandef, - size_t *offset) + size_t *offset, u32 flags) { struct ieee80211_supported_band *sband; + const struct ieee80211_sta_he_cap *he_cap; u8 *pos = buffer, *end = buffer + buffer_len; size_t noffset; int supp_rates_len, i; @@ -1433,6 +1449,9 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, chandef->chan->center_freq); } + if (flags & IEEE80211_PROBE_FLAG_MIN_CONTENT) + goto done; + /* insert custom IEs that go before HT */ if (ie && ie_len) { static const u8 before_ht[] = { @@ -1460,11 +1479,6 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, sband->ht_cap.cap); } - /* - * If adding more here, adjust code in main.c - * that calculates local->scan_ies_len. 
- */ - /* insert custom IEs that go before VHT */ if (ie && ie_len) { static const u8 before_vht[] = { @@ -1507,9 +1521,43 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, sband->vht_cap.cap); } + /* insert custom IEs that go before HE */ + if (ie && ie_len) { + static const u8 before_he[] = { + /* + * no need to list the ones split off before VHT + * or generated here + */ + WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_REQ_PARAMS, + WLAN_EID_AP_CSN, + /* TODO: add 11ah/11aj/11ak elements */ + }; + noffset = ieee80211_ie_split(ie, ie_len, + before_he, ARRAY_SIZE(before_he), + *offset); + if (end - pos < noffset - *offset) + goto out_err; + memcpy(pos, ie + *offset, noffset - *offset); + pos += noffset - *offset; + *offset = noffset; + } + + he_cap = ieee80211_get_he_sta_cap(sband); + if (he_cap) { + pos = ieee80211_ie_build_he_cap(pos, he_cap, end); + if (!pos) + goto out_err; + } + + /* + * If adding more here, adjust code in main.c + * that calculates local->scan_ies_len. 
+ */ + return pos - buffer; out_err: WARN_ONCE(1, "not enough space for preq IEs\n"); + done: return pos - buffer; } @@ -1518,7 +1566,8 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, u8 bands_used, u32 *rate_masks, - struct cfg80211_chan_def *chandef) + struct cfg80211_chan_def *chandef, + u32 flags) { size_t pos = 0, old_pos = 0, custom_ie_offset = 0; int i; @@ -1533,7 +1582,8 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, ie, ie_len, i, rate_masks[i], chandef, - &custom_ie_offset); + &custom_ie_offset, + flags); ie_desc->ies[i] = buffer + old_pos; ie_desc->len[i] = pos - old_pos; old_pos = pos; @@ -1561,7 +1611,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *chan, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, - bool directed) + u32 flags) { struct ieee80211_local *local = sdata->local; struct cfg80211_chan_def chandef; @@ -1577,7 +1627,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, * badly-behaved APs don't respond when this parameter is included. 
*/ chandef.width = sdata->vif.bss_conf.chandef.width; - if (directed) + if (flags & IEEE80211_PROBE_FLAG_DIRECTED) chandef.chan = NULL; else chandef.chan = chan; @@ -1591,7 +1641,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, ies_len = ieee80211_build_preq_ies(local, skb_tail_pointer(skb), skb_tailroom(skb), &dummy_ie_desc, ie, ie_len, BIT(chan->band), - rate_masks, &chandef); + rate_masks, &chandef, flags); skb_put(skb, ies_len); if (dst) { @@ -1605,27 +1655,6 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, return skb; } -void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, - const u8 *src, const u8 *dst, - const u8 *ssid, size_t ssid_len, - const u8 *ie, size_t ie_len, - u32 ratemask, bool directed, u32 tx_flags, - struct ieee80211_channel *channel, bool scan) -{ - struct sk_buff *skb; - - skb = ieee80211_build_probe_req(sdata, src, dst, ratemask, channel, - ssid, ssid_len, - ie, ie_len, directed); - if (skb) { - IEEE80211_SKB_CB(skb)->flags |= tx_flags; - if (scan) - ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band); - else - ieee80211_tx_skb(sdata, skb); - } -} - u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band band, u32 *basic_rates) @@ -2111,7 +2140,8 @@ int ieee80211_reconfig(struct ieee80211_local *local) if (!sta->uploaded) continue; - if (sta->sdata->vif.type != NL80211_IFTYPE_AP) + if (sta->sdata->vif.type != NL80211_IFTYPE_AP && + sta->sdata->vif.type != NL80211_IFTYPE_AP_VLAN) continue; for (state = IEEE80211_STA_NOTEXIST; @@ -2412,6 +2442,72 @@ u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, return pos; } +u8 *ieee80211_ie_build_he_cap(u8 *pos, + const struct ieee80211_sta_he_cap *he_cap, + u8 *end) +{ + u8 n; + u8 ie_len; + u8 *orig_pos = pos; + + /* Make sure we have place for the IE */ + /* + * TODO: the 1 added is because this temporarily is under the EXTENSION + 
* IE. Get rid of it when it moves. + */ + if (!he_cap) + return orig_pos; + + n = ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem); + ie_len = 2 + 1 + + sizeof(he_cap->he_cap_elem) + n + + ieee80211_he_ppe_size(he_cap->ppe_thres[0], + he_cap->he_cap_elem.phy_cap_info); + + if ((end - pos) < ie_len) + return orig_pos; + + *pos++ = WLAN_EID_EXTENSION; + pos++; /* We'll set the size later below */ + *pos++ = WLAN_EID_EXT_HE_CAPABILITY; + + /* Fixed data */ + memcpy(pos, &he_cap->he_cap_elem, sizeof(he_cap->he_cap_elem)); + pos += sizeof(he_cap->he_cap_elem); + + memcpy(pos, &he_cap->he_mcs_nss_supp, n); + pos += n; + + /* Check if PPE Threshold should be present */ + if ((he_cap->he_cap_elem.phy_cap_info[6] & + IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) == 0) + goto end; + + /* + * Calculate how many PPET16/PPET8 pairs are to come. Algorithm: + * (NSS_M1 + 1) x (num of 1 bits in RU_INDEX_BITMASK) + */ + n = hweight8(he_cap->ppe_thres[0] & + IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK); + n *= (1 + ((he_cap->ppe_thres[0] & IEEE80211_PPE_THRES_NSS_MASK) >> + IEEE80211_PPE_THRES_NSS_POS)); + + /* + * Each pair is 6 bits, and we need to add the 7 "header" bits to the + * total size. 
+ */ + n = (n * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2) + 7; + n = DIV_ROUND_UP(n, 8); + + /* Copy PPE Thresholds */ + memcpy(pos, &he_cap->ppe_thres, n); + pos += n; + +end: + orig_pos[1] = (pos - orig_pos) - 2; + return pos; +} + u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, const struct cfg80211_chan_def *chandef, u16 prot_mode, bool rifs_mode) diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 6e558a419f60..94f53a9b7d1a 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -224,7 +224,7 @@ static int mpls_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { struct mpls_iptunnel_encap *tun_encap_info; - + tun_encap_info = mpls_lwtunnel_encap(lwtstate); if (nla_put_labels(skb, MPLS_IPTUNNEL_DST, tun_encap_info->labels, diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index dbd7d1fad277..6f6c959aeb8f 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -49,6 +49,8 @@ config NETFILTER_NETLINK_LOG config NF_CONNTRACK tristate "Netfilter connection tracking support" default m if NETFILTER_ADVANCED=n + select NF_DEFRAG_IPV4 + select NF_DEFRAG_IPV6 if IPV6 != n help Connection tracking keeps a record of what packets have passed through your machine, in order to figure out how they are related @@ -460,6 +462,13 @@ config NF_TABLES if NF_TABLES +config NF_TABLES_SET + tristate "Netfilter nf_tables set infrastructure" + help + This option enables the nf_tables set infrastructure that allows to + look up for elements in a set and to build one-way mappings between + matchings and actions. + config NF_TABLES_INET depends on IPV6 select NF_TABLES_IPV4 @@ -493,24 +502,6 @@ config NFT_FLOW_OFFLOAD This option adds the "flow_offload" expression that you can use to choose what flows are placed into the hardware. 
-config NFT_SET_RBTREE - tristate "Netfilter nf_tables rbtree set module" - help - This option adds the "rbtree" set type (Red Black tree) that is used - to build interval-based sets. - -config NFT_SET_HASH - tristate "Netfilter nf_tables hash set module" - help - This option adds the "hash" set type that is used to build one-way - mappings between matchings and actions. - -config NFT_SET_BITMAP - tristate "Netfilter nf_tables bitmap set module" - help - This option adds the "bitmap" set type that is used to build sets - whose keys are smaller or equal to 16 bits. - config NFT_COUNTER tristate "Netfilter nf_tables counter module" help @@ -626,7 +617,7 @@ config NFT_SOCKET tristate "Netfilter nf_tables socket match support" depends on IPV6 || IPV6=n select NF_SOCKET_IPV4 - select NF_SOCKET_IPV6 if IPV6 + select NF_SOCKET_IPV6 if NF_TABLES_IPV6 help This option allows matching for the presence or absence of a corresponding socket and its attributes. @@ -892,7 +883,7 @@ config NETFILTER_XT_TARGET_LOG tristate "LOG target support" select NF_LOG_COMMON select NF_LOG_IPV4 - select NF_LOG_IPV6 if IPV6 + select NF_LOG_IPV6 if IP6_NF_IPTABLES default m if NETFILTER_ADVANCED=n help This option adds a `LOG' target, which allows you to create rules in @@ -984,7 +975,7 @@ config NETFILTER_XT_TARGET_TEE depends on IPV6 || IPV6=n depends on !NF_CONNTRACK || NF_CONNTRACK select NF_DUP_IPV4 - select NF_DUP_IPV6 if IPV6 + select NF_DUP_IPV6 if IP6_NF_IPTABLES ---help--- This option adds a "TEE" target with which a packet can be cloned and this clone be rerouted to another nexthop. 
@@ -1492,8 +1483,8 @@ config NETFILTER_XT_MATCH_SOCKET depends on NETFILTER_ADVANCED depends on IPV6 || IPV6=n depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n - depends on NF_SOCKET_IPV4 - depends on NF_SOCKET_IPV6 + select NF_SOCKET_IPV4 + select NF_SOCKET_IPV6 if IP6_NF_IPTABLES select NF_DEFRAG_IPV4 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n help diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 44449389e527..dd26e4961f43 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -1,7 +1,12 @@ # SPDX-License-Identifier: GPL-2.0 netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o utils.o -nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o +nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o \ + nf_conntrack_proto.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o \ + nf_conntrack_proto_icmp.o \ + nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o + +nf_conntrack-$(subst m,y,$(CONFIG_IPV6)) += nf_conntrack_proto_icmpv6.o nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o @@ -78,7 +83,11 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o +nf_tables_set-objs := nf_tables_set_core.o \ + nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o + obj-$(CONFIG_NF_TABLES) += nf_tables.o +obj-$(CONFIG_NF_TABLES_SET) += nf_tables_set.o obj-$(CONFIG_NFT_COMPAT) += nft_compat.o obj-$(CONFIG_NFT_CONNLIMIT) += 
nft_connlimit.o obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o @@ -91,9 +100,6 @@ obj-$(CONFIG_NFT_QUEUE) += nft_queue.o obj-$(CONFIG_NFT_QUOTA) += nft_quota.o obj-$(CONFIG_NFT_REJECT) += nft_reject.o obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o -obj-$(CONFIG_NFT_SET_RBTREE) += nft_set_rbtree.o -obj-$(CONFIG_NFT_SET_HASH) += nft_set_hash.o -obj-$(CONFIG_NFT_SET_BITMAP) += nft_set_bitmap.o obj-$(CONFIG_NFT_COUNTER) += nft_counter.o obj-$(CONFIG_NFT_LOG) += nft_log.o obj-$(CONFIG_NFT_MASQ) += nft_masq.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 168af54db975..dc240cb47ddf 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -603,6 +603,21 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct) } EXPORT_SYMBOL(nf_conntrack_destroy); +bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, + const struct sk_buff *skb) +{ + struct nf_ct_hook *ct_hook; + bool ret = false; + + rcu_read_lock(); + ct_hook = rcu_dereference(nf_ct_hook); + if (ct_hook) + ret = ct_hook->get_tuple_skb(dst_tuple, skb); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(nf_ct_get_tuple_skb); + /* Built-in default zone used e.g. by modules. */ const struct nf_conntrack_zone nf_ct_zone_dflt = { .id = NF_CT_DEFAULT_ZONE_ID, diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 99e0aa350dc5..0edc62910ebf 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -825,12 +825,23 @@ static void ip_vs_conn_expire(struct timer_list *t) /* Unlink conn if not referenced anymore */ if (likely(ip_vs_conn_unlink(cp))) { + struct ip_vs_conn *ct = cp->control; + /* delete the timer if it is activated by other users */ del_timer(&cp->timer); /* does anybody control me? 
*/ - if (cp->control) + if (ct) { ip_vs_control_del(cp); + /* Drop CTL or non-assured TPL if not used anymore */ + if (!cp->timeout && !atomic_read(&ct->n_control) && + (!(ct->flags & IP_VS_CONN_F_TEMPLATE) || + !(ct->state & IP_VS_CTPL_S_ASSURED))) { + IP_VS_DBG(4, "drop controlling connection\n"); + ct->timeout = 0; + ip_vs_conn_expire_now(ct); + } + } if ((cp->flags & IP_VS_CONN_F_NFCT) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) { @@ -872,6 +883,10 @@ static void ip_vs_conn_expire(struct timer_list *t) /* Modify timer, so that it expires as soon as possible. * Can be called without reference only if under RCU lock. + * We can have such chain of conns linked with ->control: DATA->CTL->TPL + * - DATA (eg. FTP) and TPL (persistence) can be present depending on setup + * - cp->timeout=0 indicates all conns from chain should be dropped but + * TPL is not dropped if in assured state */ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) { @@ -1107,7 +1122,7 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) &cp->caddr.in6, ntohs(cp->cport), &cp->vaddr.in6, ntohs(cp->vport), dbuf, ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), + ip_vs_state_name(cp), (cp->timer.expires-jiffies)/HZ, pe_data); else #endif @@ -1118,7 +1133,7 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) ntohl(cp->caddr.ip), ntohs(cp->cport), ntohl(cp->vaddr.ip), ntohs(cp->vport), dbuf, ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), + ip_vs_state_name(cp), (cp->timer.expires-jiffies)/HZ, pe_data); } return 0; @@ -1169,7 +1184,7 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) &cp->caddr.in6, ntohs(cp->cport), &cp->vaddr.in6, ntohs(cp->vport), dbuf, ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), + ip_vs_state_name(cp), ip_vs_origin_name(cp->flags), (cp->timer.expires-jiffies)/HZ); else @@ -1181,7 +1196,7 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) ntohl(cp->caddr.ip), 
ntohs(cp->cport), ntohl(cp->vaddr.ip), ntohs(cp->vport), dbuf, ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), + ip_vs_state_name(cp), ip_vs_origin_name(cp->flags), (cp->timer.expires-jiffies)/HZ); } @@ -1197,8 +1212,11 @@ static const struct seq_operations ip_vs_conn_sync_seq_ops = { #endif -/* - * Randomly drop connection entries before running out of memory +/* Randomly drop connection entries before running out of memory + * Can be used for DATA and CTL conns. For TPL conns there are exceptions: + * - traffic for services in OPS mode increases ct->in_pkts, so it is supported + * - traffic for services not in OPS mode does not increase ct->in_pkts in + * all cases, so it is not supported */ static inline int todrop_entry(struct ip_vs_conn *cp) { @@ -1242,7 +1260,7 @@ static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp) void ip_vs_random_dropentry(struct netns_ipvs *ipvs) { int idx; - struct ip_vs_conn *cp, *cp_c; + struct ip_vs_conn *cp; rcu_read_lock(); /* @@ -1254,13 +1272,15 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { if (cp->ipvs != ipvs) continue; + if (atomic_read(&cp->n_control)) + continue; if (cp->flags & IP_VS_CONN_F_TEMPLATE) { - if (atomic_read(&cp->n_control) || - !ip_vs_conn_ops_mode(cp)) - continue; - else - /* connection template of OPS */ + /* connection template of OPS */ + if (ip_vs_conn_ops_mode(cp)) goto try_drop; + if (!(cp->state & IP_VS_CTPL_S_ASSURED)) + goto drop; + continue; } if (cp->protocol == IPPROTO_TCP) { switch(cp->state) { @@ -1294,15 +1314,10 @@ try_drop: continue; } - IP_VS_DBG(4, "del connection\n"); +drop: + IP_VS_DBG(4, "drop connection\n"); + cp->timeout = 0; ip_vs_conn_expire_now(cp); - cp_c = cp->control; - /* cp->control is valid only with reference to cp */ - if (cp_c && __ip_vs_conn_get(cp)) { - IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(cp_c); - __ip_vs_conn_put(cp); - } } cond_resched_rcu(); } @@ 
-1325,15 +1340,19 @@ flush_again: hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { if (cp->ipvs != ipvs) continue; - IP_VS_DBG(4, "del connection\n"); - ip_vs_conn_expire_now(cp); + /* As timers are expired in LIFO order, restart + * the timer of controlling connection first, so + * that it is expired after us. + */ cp_c = cp->control; /* cp->control is valid only with reference to cp */ if (cp_c && __ip_vs_conn_get(cp)) { - IP_VS_DBG(4, "del conn template\n"); + IP_VS_DBG(4, "del controlling connection\n"); ip_vs_conn_expire_now(cp_c); __ip_vs_conn_put(cp); } + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_expire_now(cp); } cond_resched_rcu(); } diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index ca880a3ad033..54ee84adf0bd 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -42,6 +42,11 @@ static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; +/* States for conn templates: NONE or words separated with ",", max 15 chars */ +static const char *ip_vs_ctpl_state_name_table[IP_VS_CTPL_S_LAST] = { + [IP_VS_CTPL_S_NONE] = "NONE", + [IP_VS_CTPL_S_ASSURED] = "ASSURED", +}; /* * register an ipvs protocol @@ -193,12 +198,20 @@ ip_vs_create_timeout_table(int *table, int size) } -const char * ip_vs_state_name(__u16 proto, int state) +const char *ip_vs_state_name(const struct ip_vs_conn *cp) { - struct ip_vs_protocol *pp = ip_vs_proto_get(proto); + unsigned int state = cp->state; + struct ip_vs_protocol *pp; + + if (cp->flags & IP_VS_CONN_F_TEMPLATE) { + if (state >= IP_VS_CTPL_S_LAST) + return "ERR!"; + return ip_vs_ctpl_state_name_table[state] ? : "?"; + } + pp = ip_vs_proto_get(cp->protocol); if (pp == NULL || pp->state_name == NULL) - return (IPPROTO_IP == proto) ? "NONE" : "ERR!"; + return (cp->protocol == IPPROTO_IP) ? 
"NONE" : "ERR!"; return pp->state_name(state); } diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 3250c4a1111e..b0cd7d08f2a7 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -461,6 +461,8 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, cp->flags &= ~IP_VS_CONN_F_INACTIVE; } } + if (next_state == IP_VS_SCTP_S_ESTABLISHED) + ip_vs_control_assure_ct(cp); } if (likely(pd)) cp->timeout = pd->timeout_table[cp->state = next_state]; diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 80d10ad12a15..1770fc6ce960 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -569,6 +569,8 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, cp->flags &= ~IP_VS_CONN_F_INACTIVE; } } + if (new_state == IP_VS_TCP_S_ESTABLISHED) + ip_vs_control_assure_ct(cp); } if (likely(pd)) diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index e0ef11c3691e..0f53c49025f8 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -460,6 +460,8 @@ udp_state_transition(struct ip_vs_conn *cp, int direction, } cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL]; + if (direction == IP_VS_DIR_OUTPUT) + ip_vs_control_assure_ct(cp); } static int __udp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 001501e25625..d4020c5e831d 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1003,12 +1003,9 @@ static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer continue; } } else { - /* protocol in templates is not used for state/timeout */ - if (state > 0) { - IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n", - state); - state = 0; - } + if (state >= IP_VS_CTPL_S_LAST) + 
IP_VS_DBG(7, "BACKUP v0, Invalid tpl state %u\n", + state); } ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol, @@ -1166,12 +1163,9 @@ static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *m goto out; } } else { - /* protocol in templates is not used for state/timeout */ - if (state > 0) { - IP_VS_DBG(3, "BACKUP, Invalid template state %u\n", - state); - state = 0; - } + if (state >= IP_VS_CTPL_S_LAST) + IP_VS_DBG(7, "BACKUP, Invalid tpl state %u\n", + state); } if (ip_vs_conn_fill_param_sync(ipvs, af, s, &param, pe_data, pe_data_len, pe_name, pe_name_len)) { diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index d8383609fe28..02ca7df793f5 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -44,15 +44,19 @@ /* we will save the tuples of all connections we care about */ struct nf_conncount_tuple { - struct hlist_node node; + struct list_head node; struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; + int cpu; + u32 jiffies32; + struct rcu_head rcu_head; }; struct nf_conncount_rb { struct rb_node node; - struct hlist_head hhead; /* connections/hosts in same subnet */ + struct nf_conncount_list list; u32 key[MAX_KEYLEN]; + struct rcu_head rcu_head; }; static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_in_smp; @@ -60,6 +64,10 @@ static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_i struct nf_conncount_data { unsigned int keylen; struct rb_root root[CONNCOUNT_SLOTS]; + struct net *net; + struct work_struct gc_work; + unsigned long pending_trees[BITS_TO_LONGS(CONNCOUNT_SLOTS)]; + unsigned int gc_tree; }; static u_int32_t conncount_rnd __read_mostly; @@ -80,41 +88,129 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen) return memcmp(a, b, klen * sizeof(u32)); } -bool nf_conncount_add(struct hlist_head *head, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) +enum nf_conncount_list_add 
+nf_conncount_add(struct nf_conncount_list *list, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) { struct nf_conncount_tuple *conn; + if (WARN_ON_ONCE(list->count > INT_MAX)) + return NF_CONNCOUNT_ERR; + conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); if (conn == NULL) - return false; + return NF_CONNCOUNT_ERR; + conn->tuple = *tuple; conn->zone = *zone; - hlist_add_head(&conn->node, head); - return true; + conn->cpu = raw_smp_processor_id(); + conn->jiffies32 = (u32)jiffies; + spin_lock(&list->list_lock); + if (list->dead == true) { + kmem_cache_free(conncount_conn_cachep, conn); + spin_unlock(&list->list_lock); + return NF_CONNCOUNT_SKIP; + } + list_add_tail(&conn->node, &list->head); + list->count++; + spin_unlock(&list->list_lock); + return NF_CONNCOUNT_ADDED; } EXPORT_SYMBOL_GPL(nf_conncount_add); -unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone, - bool *addit) +static void __conn_free(struct rcu_head *h) { - const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn; - struct hlist_node *n; + + conn = container_of(h, struct nf_conncount_tuple, rcu_head); + kmem_cache_free(conncount_conn_cachep, conn); +} + +static bool conn_free(struct nf_conncount_list *list, + struct nf_conncount_tuple *conn) +{ + bool free_entry = false; + + spin_lock(&list->list_lock); + + if (list->count == 0) { + spin_unlock(&list->list_lock); + return free_entry; + } + + list->count--; + list_del_rcu(&conn->node); + if (list->count == 0) + free_entry = true; + + spin_unlock(&list->list_lock); + call_rcu(&conn->rcu_head, __conn_free); + return free_entry; +} + +static const struct nf_conntrack_tuple_hash * +find_or_evict(struct net *net, struct nf_conncount_list *list, + struct nf_conncount_tuple *conn, bool *free_entry) +{ + const struct nf_conntrack_tuple_hash *found; + unsigned long a, b; + int cpu = 
raw_smp_processor_id(); + __s32 age; + + found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple); + if (found) + return found; + b = conn->jiffies32; + a = (u32)jiffies; + + /* conn might have been added just before by another cpu and + * might still be unconfirmed. In this case, nf_conntrack_find() + * returns no result. Thus only evict if this cpu added the + * stale entry or if the entry is older than two jiffies. + */ + age = a - b; + if (conn->cpu == cpu || age >= 2) { + *free_entry = conn_free(list, conn); + return ERR_PTR(-ENOENT); + } + + return ERR_PTR(-EAGAIN); +} + +void nf_conncount_lookup(struct net *net, + struct nf_conncount_list *list, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone, + bool *addit) +{ + const struct nf_conntrack_tuple_hash *found; + struct nf_conncount_tuple *conn, *conn_n; struct nf_conn *found_ct; - unsigned int length = 0; + unsigned int collect = 0; + bool free_entry = false; + /* best effort only */ *addit = tuple ? 
true : false; /* check the saved connections */ - hlist_for_each_entry_safe(conn, n, head, node) { - found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple); - if (found == NULL) { - hlist_del(&conn->node); - kmem_cache_free(conncount_conn_cachep, conn); + list_for_each_entry_safe(conn, conn_n, &list->head, node) { + if (collect > CONNCOUNT_GC_MAX_NODES) + break; + + found = find_or_evict(net, list, conn, &free_entry); + if (IS_ERR(found)) { + /* Not found, but might be about to be confirmed */ + if (PTR_ERR(found) == -EAGAIN) { + if (!tuple) + continue; + + if (nf_ct_tuple_equal(&conn->tuple, tuple) && + nf_ct_zone_id(&conn->zone, conn->zone.dir) == + nf_ct_zone_id(zone, zone->dir)) + *addit = false; + } else if (PTR_ERR(found) == -ENOENT) + collect++; continue; } @@ -123,9 +219,10 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple) && nf_ct_zone_equal(found_ct, zone, zone->dir)) { /* - * Just to be sure we have it only once in the list. * We should not see tuples twice unless someone hooks * this into a table without "-p tcp --syn". + * + * Attempt to avoid a re-add in this case. 
*/ *addit = false; } else if (already_closed(found_ct)) { @@ -134,19 +231,75 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, * closed already -> ditch it */ nf_ct_put(found_ct); - hlist_del(&conn->node); - kmem_cache_free(conncount_conn_cachep, conn); + conn_free(list, conn); + collect++; continue; } nf_ct_put(found_ct); - length++; } - - return length; } EXPORT_SYMBOL_GPL(nf_conncount_lookup); +void nf_conncount_list_init(struct nf_conncount_list *list) +{ + spin_lock_init(&list->list_lock); + INIT_LIST_HEAD(&list->head); + list->count = 1; + list->dead = false; +} +EXPORT_SYMBOL_GPL(nf_conncount_list_init); + +/* Return true if the list is empty */ +bool nf_conncount_gc_list(struct net *net, + struct nf_conncount_list *list) +{ + const struct nf_conntrack_tuple_hash *found; + struct nf_conncount_tuple *conn, *conn_n; + struct nf_conn *found_ct; + unsigned int collected = 0; + bool free_entry = false; + + list_for_each_entry_safe(conn, conn_n, &list->head, node) { + found = find_or_evict(net, list, conn, &free_entry); + if (IS_ERR(found)) { + if (PTR_ERR(found) == -ENOENT) { + if (free_entry) + return true; + collected++; + } + continue; + } + + found_ct = nf_ct_tuplehash_to_ctrack(found); + if (already_closed(found_ct)) { + /* + * we do not care about connections which are + * closed already -> ditch it + */ + nf_ct_put(found_ct); + if (conn_free(list, conn)) + return true; + collected++; + continue; + } + + nf_ct_put(found_ct); + if (collected > CONNCOUNT_GC_MAX_NODES) + return false; + } + return false; +} +EXPORT_SYMBOL_GPL(nf_conncount_gc_list); + +static void __tree_nodes_free(struct rcu_head *h) +{ + struct nf_conncount_rb *rbconn; + + rbconn = container_of(h, struct nf_conncount_rb, rcu_head); + kmem_cache_free(conncount_rb_cachep, rbconn); +} + static void tree_nodes_free(struct rb_root *root, struct nf_conncount_rb *gc_nodes[], unsigned int gc_count) @@ -155,32 +308,46 @@ static void tree_nodes_free(struct rb_root *root, 
while (gc_count) { rbconn = gc_nodes[--gc_count]; - rb_erase(&rbconn->node, root); - kmem_cache_free(conncount_rb_cachep, rbconn); + spin_lock(&rbconn->list.list_lock); + if (rbconn->list.count == 0 && rbconn->list.dead == false) { + rbconn->list.dead = true; + rb_erase(&rbconn->node, root); + call_rcu(&rbconn->rcu_head, __tree_nodes_free); + } + spin_unlock(&rbconn->list.list_lock); } } +static void schedule_gc_worker(struct nf_conncount_data *data, int tree) +{ + set_bit(tree, data->pending_trees); + schedule_work(&data->gc_work); +} + static unsigned int -count_tree(struct net *net, struct rb_root *root, - const u32 *key, u8 keylen, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) +insert_tree(struct net *net, + struct nf_conncount_data *data, + struct rb_root *root, + unsigned int hash, + const u32 *key, + u8 keylen, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) { + enum nf_conncount_list_add ret; struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; struct rb_node **rbnode, *parent; struct nf_conncount_rb *rbconn; struct nf_conncount_tuple *conn; - unsigned int gc_count; - bool no_gc = false; + unsigned int count = 0, gc_count = 0; + bool node_found = false; + + spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); - restart: - gc_count = 0; parent = NULL; rbnode = &(root->rb_node); while (*rbnode) { int diff; - bool addit; - rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node); parent = *rbnode; @@ -190,33 +357,30 @@ count_tree(struct net *net, struct rb_root *root, } else if (diff > 0) { rbnode = &((*rbnode)->rb_right); } else { - /* same source network -> be counted! 
*/ - unsigned int count; - - count = nf_conncount_lookup(net, &rbconn->hhead, tuple, - zone, &addit); - - tree_nodes_free(root, gc_nodes, gc_count); - if (!addit) - return count; - - if (!nf_conncount_add(&rbconn->hhead, tuple, zone)) - return 0; /* hotdrop */ - - return count + 1; + /* unlikely: other cpu added node already */ + node_found = true; + ret = nf_conncount_add(&rbconn->list, tuple, zone); + if (ret == NF_CONNCOUNT_ERR) { + count = 0; /* hotdrop */ + } else if (ret == NF_CONNCOUNT_ADDED) { + count = rbconn->list.count; + } else { + /* NF_CONNCOUNT_SKIP, rbconn is already + * reclaimed by gc, insert a new tree node + */ + node_found = false; + } + break; } - if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes)) + if (gc_count >= ARRAY_SIZE(gc_nodes)) continue; - /* only used for GC on hhead, retval and 'addit' ignored */ - nf_conncount_lookup(net, &rbconn->hhead, tuple, zone, &addit); - if (hlist_empty(&rbconn->hhead)) + if (nf_conncount_gc_list(net, &rbconn->list)) gc_nodes[gc_count++] = rbconn; } if (gc_count) { - no_gc = true; tree_nodes_free(root, gc_nodes, gc_count); /* tree_node_free before new allocation permits * allocator to re-use newly free'd object. @@ -224,58 +388,146 @@ count_tree(struct net *net, struct rb_root *root, * This is a rare event; in most cases we will find * existing node to re-use. (or gc_count is 0). 
*/ - goto restart; + + if (gc_count >= ARRAY_SIZE(gc_nodes)) + schedule_gc_worker(data, hash); } - if (!tuple) - return 0; + if (node_found) + goto out_unlock; - /* no match, need to insert new node */ + /* expected case: match, insert new node */ rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); if (rbconn == NULL) - return 0; + goto out_unlock; conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); if (conn == NULL) { kmem_cache_free(conncount_rb_cachep, rbconn); - return 0; + goto out_unlock; } conn->tuple = *tuple; conn->zone = *zone; memcpy(rbconn->key, key, sizeof(u32) * keylen); - INIT_HLIST_HEAD(&rbconn->hhead); - hlist_add_head(&conn->node, &rbconn->hhead); + nf_conncount_list_init(&rbconn->list); + list_add(&conn->node, &rbconn->list.head); + count = 1; rb_link_node(&rbconn->node, parent, rbnode); rb_insert_color(&rbconn->node, root); - return 1; +out_unlock: + spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); + return count; } -/* Count and return number of conntrack entries in 'net' with particular 'key'. - * If 'tuple' is not null, insert it into the accounting data structure. 
- */ -unsigned int nf_conncount_count(struct net *net, - struct nf_conncount_data *data, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) +static unsigned int +count_tree(struct net *net, + struct nf_conncount_data *data, + const u32 *key, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) { + enum nf_conncount_list_add ret; struct rb_root *root; - int count; - u32 hash; + struct rb_node *parent; + struct nf_conncount_rb *rbconn; + unsigned int hash; + u8 keylen = data->keylen; hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS; root = &data->root[hash]; - spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); + parent = rcu_dereference_raw(root->rb_node); + while (parent) { + int diff; + bool addit; - count = count_tree(net, root, key, data->keylen, tuple, zone); + rbconn = rb_entry(parent, struct nf_conncount_rb, node); - spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); + diff = key_diff(key, rbconn->key, keylen); + if (diff < 0) { + parent = rcu_dereference_raw(parent->rb_left); + } else if (diff > 0) { + parent = rcu_dereference_raw(parent->rb_right); + } else { + /* same source network -> be counted! 
*/ + nf_conncount_lookup(net, &rbconn->list, tuple, zone, + &addit); - return count; + if (!addit) + return rbconn->list.count; + + ret = nf_conncount_add(&rbconn->list, tuple, zone); + if (ret == NF_CONNCOUNT_ERR) { + return 0; /* hotdrop */ + } else if (ret == NF_CONNCOUNT_ADDED) { + return rbconn->list.count; + } else { + /* NF_CONNCOUNT_SKIP, rbconn is already + * reclaimed by gc, insert a new tree node + */ + break; + } + } + } + + if (!tuple) + return 0; + + return insert_tree(net, data, root, hash, key, keylen, tuple, zone); +} + +static void tree_gc_worker(struct work_struct *work) +{ + struct nf_conncount_data *data = container_of(work, struct nf_conncount_data, gc_work); + struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES], *rbconn; + struct rb_root *root; + struct rb_node *node; + unsigned int tree, next_tree, gc_count = 0; + + tree = data->gc_tree % CONNCOUNT_LOCK_SLOTS; + root = &data->root[tree]; + + rcu_read_lock(); + for (node = rb_first(root); node != NULL; node = rb_next(node)) { + rbconn = rb_entry(node, struct nf_conncount_rb, node); + if (nf_conncount_gc_list(data->net, &rbconn->list)) + gc_nodes[gc_count++] = rbconn; + } + rcu_read_unlock(); + + spin_lock_bh(&nf_conncount_locks[tree]); + + if (gc_count) { + tree_nodes_free(root, gc_nodes, gc_count); + } + + clear_bit(tree, data->pending_trees); + + next_tree = (tree + 1) % CONNCOUNT_SLOTS; + next_tree = find_next_bit(data->pending_trees, next_tree, CONNCOUNT_SLOTS); + + if (next_tree < CONNCOUNT_SLOTS) { + data->gc_tree = next_tree; + schedule_work(work); + } + + spin_unlock_bh(&nf_conncount_locks[tree]); +} + +/* Count and return number of conntrack entries in 'net' with particular 'key'. + * If 'tuple' is not null, insert it into the accounting data structure. + * Call with RCU read lock. 
+ */ +unsigned int nf_conncount_count(struct net *net, + struct nf_conncount_data *data, + const u32 *key, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) +{ + return count_tree(net, data, key, tuple, zone); } EXPORT_SYMBOL_GPL(nf_conncount_count); @@ -306,17 +558,18 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family data->root[i] = RB_ROOT; data->keylen = keylen / sizeof(u32); + data->net = net; + INIT_WORK(&data->gc_work, tree_gc_worker); return data; } EXPORT_SYMBOL_GPL(nf_conncount_init); -void nf_conncount_cache_free(struct hlist_head *hhead) +void nf_conncount_cache_free(struct nf_conncount_list *list) { - struct nf_conncount_tuple *conn; - struct hlist_node *n; + struct nf_conncount_tuple *conn, *conn_n; - hlist_for_each_entry_safe(conn, n, hhead, node) + list_for_each_entry_safe(conn, conn_n, &list->head, node) kmem_cache_free(conncount_conn_cachep, conn); } EXPORT_SYMBOL_GPL(nf_conncount_cache_free); @@ -331,7 +584,7 @@ static void destroy_tree(struct rb_root *r) rb_erase(node, r); - nf_conncount_cache_free(&rbconn->hhead); + nf_conncount_cache_free(&rbconn->list); kmem_cache_free(conncount_rb_cachep, rbconn); } @@ -342,6 +595,7 @@ void nf_conncount_destroy(struct net *net, unsigned int family, { unsigned int i; + cancel_work_sync(&data->gc_work); nf_ct_netns_put(net, family); for (i = 0; i < ARRAY_SIZE(data->root); ++i) diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index a1086bdec242..5423b197d98a 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -32,7 +32,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, __be32 mask = 0; /* we're only interested in locally generated packets */ - if (skb->sk == NULL) + if (skb->sk == NULL || !net_eq(nf_ct_net(ct), sock_net(skb->sk))) goto out; if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST)) goto out; diff --git a/net/netfilter/nf_conntrack_core.c 
b/net/netfilter/nf_conntrack_core.c index 3465da2a98bd..8a113ca1eea2 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -37,7 +37,6 @@ #include <linux/rculist_nulls.h> #include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> @@ -55,6 +54,7 @@ #include <net/netfilter/nf_nat_core.h> #include <net/netfilter/nf_nat_helper.h> #include <net/netns/hash.h> +#include <net/ip.h> #include "nf_internals.h" @@ -222,7 +222,7 @@ static u32 hash_conntrack(const struct net *net, return scale_hash(hash_conntrack_raw(tuple, net)); } -bool +static bool nf_ct_get_tuple(const struct sk_buff *skb, unsigned int nhoff, unsigned int dataoff, @@ -230,37 +230,151 @@ nf_ct_get_tuple(const struct sk_buff *skb, u_int8_t protonum, struct net *net, struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto) { + unsigned int size; + const __be32 *ap; + __be32 _addrs[8]; + struct { + __be16 sport; + __be16 dport; + } _inet_hdr, *inet_hdr; + memset(tuple, 0, sizeof(*tuple)); tuple->src.l3num = l3num; - if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0) + switch (l3num) { + case NFPROTO_IPV4: + nhoff += offsetof(struct iphdr, saddr); + size = 2 * sizeof(__be32); + break; + case NFPROTO_IPV6: + nhoff += offsetof(struct ipv6hdr, saddr); + size = sizeof(_addrs); + break; + default: + return true; + } + + ap = skb_header_pointer(skb, nhoff, size, _addrs); + if (!ap) return false; + switch (l3num) { + case NFPROTO_IPV4: + tuple->src.u3.ip = ap[0]; + tuple->dst.u3.ip = ap[1]; + break; + case NFPROTO_IPV6: + memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6)); + memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6)); + break; + } + tuple->dst.protonum = protonum; tuple->dst.dir = IP_CT_DIR_ORIGINAL; - return 
l4proto->pkt_to_tuple(skb, dataoff, net, tuple); + if (unlikely(l4proto->pkt_to_tuple)) + return l4proto->pkt_to_tuple(skb, dataoff, net, tuple); + + /* Actually only need first 4 bytes to get ports. */ + inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr); + if (!inet_hdr) + return false; + + tuple->src.u.udp.port = inet_hdr->sport; + tuple->dst.u.udp.port = inet_hdr->dport; + return true; +} + +static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, + u_int8_t *protonum) +{ + int dataoff = -1; + const struct iphdr *iph; + struct iphdr _iph; + + iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); + if (!iph) + return -1; + + /* Conntrack defragments packets, we might still see fragments + * inside ICMP packets though. + */ + if (iph->frag_off & htons(IP_OFFSET)) + return -1; + + dataoff = nhoff + (iph->ihl << 2); + *protonum = iph->protocol; + + /* Check bogus IP headers */ + if (dataoff > skb->len) { + pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n", + nhoff, iph->ihl << 2, skb->len); + return -1; + } + return dataoff; +} + +#if IS_ENABLED(CONFIG_IPV6) +static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, + u8 *protonum) +{ + int protoff = -1; + unsigned int extoff = nhoff + sizeof(struct ipv6hdr); + __be16 frag_off; + u8 nexthdr; + + if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr), + &nexthdr, sizeof(nexthdr)) != 0) { + pr_debug("can't get nexthdr\n"); + return -1; + } + protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off); + /* + * (protoff == skb->len) means the packet has not data, just + * IPv6 and possibly extensions headers, but it is tracked anyway + */ + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { + pr_debug("can't find proto in pkt\n"); + return -1; + } + + *protonum = nexthdr; + return protoff; +} +#endif + +static int get_l4proto(const struct sk_buff *skb, + unsigned int nhoff, u8 pf, u8 *l4num) +{ + switch (pf) { + case NFPROTO_IPV4: + 
return ipv4_get_l4proto(skb, nhoff, l4num); +#if IS_ENABLED(CONFIG_IPV6) + case NFPROTO_IPV6: + return ipv6_get_l4proto(skb, nhoff, l4num); +#endif + default: + *l4num = 0; + break; + } + return -1; } -EXPORT_SYMBOL_GPL(nf_ct_get_tuple); bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, u_int16_t l3num, struct net *net, struct nf_conntrack_tuple *tuple) { - const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; - unsigned int protoff; - u_int8_t protonum; + u8 protonum; + int protoff; int ret; rcu_read_lock(); - l3proto = __nf_ct_l3proto_find(l3num); - ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum); - if (ret != NF_ACCEPT) { + protoff = get_l4proto(skb, nhoff, l3num, &protonum); + if (protoff <= 0) { rcu_read_unlock(); return false; } @@ -268,7 +382,7 @@ bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, l4proto = __nf_ct_l4proto_find(l3num, protonum); ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple, - l3proto, l4proto); + l4proto); rcu_read_unlock(); return ret; @@ -278,19 +392,35 @@ EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr); bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_tuple *orig, - const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto) { memset(inverse, 0, sizeof(*inverse)); inverse->src.l3num = orig->src.l3num; - if (l3proto->invert_tuple(inverse, orig) == 0) - return false; + + switch (orig->src.l3num) { + case NFPROTO_IPV4: + inverse->src.u3.ip = orig->dst.u3.ip; + inverse->dst.u3.ip = orig->src.u3.ip; + break; + case NFPROTO_IPV6: + inverse->src.u3.in6 = orig->dst.u3.in6; + inverse->dst.u3.in6 = orig->src.u3.in6; + break; + default: + break; + } inverse->dst.dir = !orig->dst.dir; inverse->dst.protonum = orig->dst.protonum; - return l4proto->invert_tuple(inverse, orig); + + if (unlikely(l4proto->invert_tuple)) + return l4proto->invert_tuple(inverse, orig); + + inverse->src.u.all = 
orig->dst.u.all; + inverse->dst.u.all = orig->src.u.all; + return true; } EXPORT_SYMBOL_GPL(nf_ct_invert_tuple); @@ -502,6 +632,18 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, net_eq(net, nf_ct_net(ct)); } +static inline bool +nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2) +{ + return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) && + nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple, + &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) && + nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) && + nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) && + net_eq(nf_ct_net(ct1), nf_ct_net(ct2)); +} + /* caller must hold rcu readlock and none of the nf_conntrack_locks */ static void nf_ct_gc_expired(struct nf_conn *ct) { @@ -695,19 +837,21 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb, /* This is the conntrack entry already in hashes that won race. */ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); const struct nf_conntrack_l4proto *l4proto; + enum ip_conntrack_info oldinfo; + struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo); l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); if (l4proto->allow_clash && - ((ct->status & IPS_NAT_DONE_MASK) == 0) && !nf_ct_is_dying(ct) && atomic_inc_not_zero(&ct->ct_general.use)) { - enum ip_conntrack_info oldinfo; - struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo); - - nf_ct_acct_merge(ct, ctinfo, loser_ct); - nf_conntrack_put(&loser_ct->ct_general); - nf_ct_set(skb, ct, oldinfo); - return NF_ACCEPT; + if (((ct->status & IPS_NAT_DONE_MASK) == 0) || + nf_ct_match(ct, loser_ct)) { + nf_ct_acct_merge(ct, ctinfo, loser_ct); + nf_conntrack_put(&loser_ct->ct_general); + nf_ct_set(skb, ct, oldinfo); + return NF_ACCEPT; + } + nf_ct_put(ct); } NF_CT_STAT_INC(net, drop); return NF_DROP; @@ -1195,7 +1339,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_free); static noinline struct nf_conntrack_tuple_hash * 
init_conntrack(struct net *net, struct nf_conn *tmpl, const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto, struct sk_buff *skb, unsigned int dataoff, u32 hash) @@ -1208,9 +1351,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, const struct nf_conntrack_zone *zone; struct nf_conn_timeout *timeout_ext; struct nf_conntrack_zone tmp; - unsigned int *timeouts; - if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) { + if (!nf_ct_invert_tuple(&repl_tuple, tuple, l4proto)) { pr_debug("Can't invert tuple.\n"); return NULL; } @@ -1227,15 +1369,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, } timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; - if (timeout_ext) { - timeouts = nf_ct_timeout_data(timeout_ext); - if (unlikely(!timeouts)) - timeouts = l4proto->get_timeouts(net); - } else { - timeouts = l4proto->get_timeouts(net); - } - if (!l4proto->new(ct, skb, dataoff, timeouts)) { + if (!l4proto->new(ct, skb, dataoff)) { nf_conntrack_free(ct); pr_debug("can't track with proto module\n"); return NULL; @@ -1266,8 +1401,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ ct->master = exp->master; if (exp->helper) { - help = nf_ct_helper_ext_add(ct, exp->helper, - GFP_ATOMIC); + help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (help) rcu_assign_pointer(help->helper, exp->helper); } @@ -1307,7 +1441,6 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, unsigned int dataoff, u_int16_t l3num, u_int8_t protonum, - const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto) { const struct nf_conntrack_zone *zone; @@ -1319,8 +1452,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, u32 hash; if (!nf_ct_get_tuple(skb, skb_network_offset(skb), - dataoff, l3num, protonum, net, &tuple, l3proto, - l4proto)) { + dataoff, l3num, protonum, net, &tuple, l4proto)) { 
pr_debug("Can't get tuple\n"); return 0; } @@ -1330,7 +1462,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, hash = hash_conntrack_raw(&tuple, net); h = __nf_conntrack_find_get(net, zone, &tuple, hash); if (!h) { - h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto, + h = init_conntrack(net, tmpl, &tuple, l4proto, skb, dataoff, hash); if (!h) return 0; @@ -1363,14 +1495,11 @@ unsigned int nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, struct sk_buff *skb) { - const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; struct nf_conn *ct, *tmpl; enum ip_conntrack_info ctinfo; - unsigned int *timeouts; - unsigned int dataoff; u_int8_t protonum; - int ret; + int dataoff, ret; tmpl = nf_ct_get(skb, &ctinfo); if (tmpl || ctinfo == IP_CT_UNTRACKED) { @@ -1384,14 +1513,12 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, } /* rcu_read_lock()ed by nf_hook_thresh */ - l3proto = __nf_ct_l3proto_find(pf); - ret = l3proto->get_l4proto(skb, skb_network_offset(skb), - &dataoff, &protonum); - if (ret <= 0) { + dataoff = get_l4proto(skb, skb_network_offset(skb), pf, &protonum); + if (dataoff <= 0) { pr_debug("not prepared to track yet or error occurred\n"); NF_CT_STAT_INC_ATOMIC(net, error); NF_CT_STAT_INC_ATOMIC(net, invalid); - ret = -ret; + ret = NF_ACCEPT; goto out; } @@ -1413,8 +1540,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, goto out; } repeat: - ret = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, - l3proto, l4proto); + ret = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, l4proto); if (ret < 0) { /* Too stressed to deal. */ NF_CT_STAT_INC_ATOMIC(net, drop); @@ -1430,10 +1556,7 @@ repeat: goto out; } - /* Decide what timeout policy we want to apply to this flow. 
*/ - timeouts = nf_ct_timeout_lookup(net, ct, l4proto); - - ret = l4proto->packet(ct, skb, dataoff, ctinfo, timeouts); + ret = l4proto->packet(ct, skb, dataoff, ctinfo); if (ret <= 0) { /* Invalid: inverse of the return code tells * the netfilter core what to do */ @@ -1471,7 +1594,6 @@ bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse, rcu_read_lock(); ret = nf_ct_invert_tuple(inverse, orig, - __nf_ct_l3proto_find(orig->src.l3num), __nf_ct_l4proto_find(orig->src.l3num, orig->dst.protonum)); rcu_read_unlock(); @@ -1609,14 +1731,14 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) static int nf_conntrack_update(struct net *net, struct sk_buff *skb) { - const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; enum ip_conntrack_info ctinfo; struct nf_nat_hook *nat_hook; - unsigned int dataoff, status; + unsigned int status; struct nf_conn *ct; + int dataoff; u16 l3num; u8 l4num; @@ -1625,16 +1747,15 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb) return 0; l3num = nf_ct_l3num(ct); - l3proto = nf_ct_l3proto_find_get(l3num); - if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff, - &l4num) <= 0) + dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num); + if (dataoff <= 0) return -1; l4proto = nf_ct_l4proto_find_get(l3num, l4num); if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, - l4num, net, &tuple, l3proto, l4proto)) + l4num, net, &tuple, l4proto)) return -1; if (ct->status & IPS_SRC_NAT) { @@ -1683,6 +1804,41 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb) return 0; } +static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, + const struct sk_buff *skb) +{ + const struct nf_conntrack_tuple *src_tuple; + const struct nf_conntrack_tuple_hash *hash; + struct nf_conntrack_tuple srctuple; + enum ip_conntrack_info ctinfo; + 
struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (ct) { + src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo)); + memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); + return true; + } + + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), + NFPROTO_IPV4, dev_net(skb->dev), + &srctuple)) + return false; + + hash = nf_conntrack_find_get(dev_net(skb->dev), + &nf_ct_zone_dflt, + &srctuple); + if (!hash) + return false; + + ct = nf_ct_tuplehash_to_ctrack(hash); + src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir); + memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); + nf_ct_put(ct); + + return true; +} + /* Bring out ya dead! */ static struct nf_conn * get_next_corpse(int (*iter)(struct nf_conn *i, void *data), @@ -2043,7 +2199,7 @@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) return -EOPNOTSUPP; /* On boot, we can set this without any fancy locking. */ - if (!nf_conntrack_htable_size) + if (!nf_conntrack_hash) return param_set_uint(val, kp); rc = kstrtouint(val, 0, &hashsize); @@ -2054,9 +2210,6 @@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) } EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); -module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, - &nf_conntrack_htable_size, 0600); - static __always_inline unsigned int total_extension_size(void) { /* remember to add new extensions below */ @@ -2204,6 +2357,7 @@ err_cachep: static struct nf_ct_hook nf_conntrack_hook = { .update = nf_conntrack_update, .destroy = destroy_conntrack, + .get_tuple_skb = nf_conntrack_get_tuple_skb, }; void nf_conntrack_init_end(void) diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 853b23206bb7..3f586ba23d92 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -610,7 +610,6 @@ static int exp_seq_show(struct seq_file *s, void *v) expect->tuple.src.l3num, expect->tuple.dst.protonum); print_tuple(s, &expect->tuple, - 
__nf_ct_l3proto_find(expect->tuple.src.l3num), __nf_ct_l4proto_find(expect->tuple.src.l3num, expect->tuple.dst.protonum)); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 551a1eddf0fa..d557a425289d 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -24,7 +24,6 @@ #include <linux/rtnetlink.h> #include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_core.h> @@ -193,8 +192,7 @@ void nf_conntrack_helper_put(struct nf_conntrack_helper *helper) EXPORT_SYMBOL_GPL(nf_conntrack_helper_put); struct nf_conn_help * -nf_ct_helper_ext_add(struct nf_conn *ct, - struct nf_conntrack_helper *helper, gfp_t gfp) +nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp) { struct nf_conn_help *help; @@ -263,7 +261,7 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, } if (help == NULL) { - help = nf_ct_helper_ext_add(ct, helper, flags); + help = nf_ct_helper_ext_add(ct, flags); if (help == NULL) return -ENOMEM; } else { @@ -465,6 +463,11 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) nf_ct_expect_iterate_destroy(expect_iter_me, NULL); nf_ct_iterate_destroy(unhelp, me); + + /* Maybe someone has gotten the helper already when unhelp above. + * So need to wait it. 
+ */ + synchronize_rcu(); } EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c deleted file mode 100644 index 397e6911214f..000000000000 --- a/net/netfilter/nf_conntrack_l3proto_generic.c +++ /dev/null @@ -1,66 +0,0 @@ -/* - * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> - * - * Based largely upon the original ip_conntrack code which - * had the following copyright information: - * - * (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Author: - * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> - */ - -#include <linux/types.h> -#include <linux/ip.h> -#include <linux/netfilter.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/icmp.h> -#include <linux/sysctl.h> -#include <net/ip.h> - -#include <linux/netfilter_ipv4.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_l4proto.h> -#include <net/netfilter/nf_conntrack_l3proto.h> -#include <net/netfilter/nf_conntrack_core.h> -#include <net/netfilter/ipv4/nf_conntrack_ipv4.h> - -static bool generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, - struct nf_conntrack_tuple *tuple) -{ - memset(&tuple->src.u3, 0, sizeof(tuple->src.u3)); - memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3)); - - return true; -} - -static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - memset(&tuple->src.u3, 0, sizeof(tuple->src.u3)); - memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3)); - - return true; -} - -static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, - unsigned int *dataoff, u_int8_t *protonum) -{ - /* Never track !!! 
*/ - return -NF_ACCEPT; -} - - -struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = { - .l3proto = PF_UNSPEC, - .pkt_to_tuple = generic_pkt_to_tuple, - .invert_tuple = generic_invert_tuple, - .get_l4proto = generic_get_l4proto, -}; -EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_generic); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 20a2e37c76d1..f981bfa8db72 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -38,7 +38,6 @@ #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_seqadj.h> -#include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_acct.h> @@ -81,9 +80,26 @@ nla_put_failure: return -1; } +static int ipv4_tuple_to_nlattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple) +{ + if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) || + nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip)) + return -EMSGSIZE; + return 0; +} + +static int ipv6_tuple_to_nlattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple) +{ + if (nla_put_in6_addr(skb, CTA_IP_V6_SRC, &tuple->src.u3.in6) || + nla_put_in6_addr(skb, CTA_IP_V6_DST, &tuple->dst.u3.in6)) + return -EMSGSIZE; + return 0; +} + static int ctnetlink_dump_tuples_ip(struct sk_buff *skb, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_l3proto *l3proto) + const struct nf_conntrack_tuple *tuple) { int ret = 0; struct nlattr *nest_parms; @@ -92,8 +108,14 @@ static int ctnetlink_dump_tuples_ip(struct sk_buff *skb, if (!nest_parms) goto nla_put_failure; - if (likely(l3proto->tuple_to_nlattr)) - ret = l3proto->tuple_to_nlattr(skb, tuple); + switch (tuple->src.l3num) { + case NFPROTO_IPV4: + ret = ipv4_tuple_to_nlattr(skb, tuple); + break; + case NFPROTO_IPV6: + ret = 
ipv6_tuple_to_nlattr(skb, tuple); + break; + } nla_nest_end(skb, nest_parms); @@ -106,13 +128,11 @@ nla_put_failure: static int ctnetlink_dump_tuples(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { - const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; int ret; rcu_read_lock(); - l3proto = __nf_ct_l3proto_find(tuple->src.l3num); - ret = ctnetlink_dump_tuples_ip(skb, tuple, l3proto); + ret = ctnetlink_dump_tuples_ip(skb, tuple); if (ret >= 0) { l4proto = __nf_ct_l4proto_find(tuple->src.l3num, @@ -556,15 +576,20 @@ nla_put_failure: return -1; } +static const struct nla_policy cta_ip_nla_policy[CTA_IP_MAX + 1] = { + [CTA_IP_V4_SRC] = { .type = NLA_U32 }, + [CTA_IP_V4_DST] = { .type = NLA_U32 }, + [CTA_IP_V6_SRC] = { .len = sizeof(__be32) * 4 }, + [CTA_IP_V6_DST] = { .len = sizeof(__be32) * 4 }, +}; + #if defined(CONFIG_NETFILTER_NETLINK_GLUE_CT) || defined(CONFIG_NF_CONNTRACK_EVENTS) static size_t ctnetlink_proto_size(const struct nf_conn *ct) { - const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; size_t len, len4 = 0; - l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); - len = l3proto->nla_size; + len = nla_policy_len(cta_ip_nla_policy, CTA_IP_MAX + 1); len *= 3u; /* ORIG, REPLY, MASTER */ l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); @@ -936,29 +961,54 @@ out: return skb->len; } +static int ipv4_nlattr_to_tuple(struct nlattr *tb[], + struct nf_conntrack_tuple *t) +{ + if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST]) + return -EINVAL; + + t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]); + t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]); + + return 0; +} + +static int ipv6_nlattr_to_tuple(struct nlattr *tb[], + struct nf_conntrack_tuple *t) +{ + if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST]) + return -EINVAL; + + t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]); + t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]); + + return 0; +} + static int 
ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nf_conntrack_tuple *tuple) { struct nlattr *tb[CTA_IP_MAX+1]; - struct nf_conntrack_l3proto *l3proto; int ret = 0; ret = nla_parse_nested(tb, CTA_IP_MAX, attr, NULL, NULL); if (ret < 0) return ret; - rcu_read_lock(); - l3proto = __nf_ct_l3proto_find(tuple->src.l3num); + ret = nla_validate_nested(attr, CTA_IP_MAX, + cta_ip_nla_policy, NULL); + if (ret) + return ret; - if (likely(l3proto->nlattr_to_tuple)) { - ret = nla_validate_nested(attr, CTA_IP_MAX, - l3proto->nla_policy, NULL); - if (ret == 0) - ret = l3proto->nlattr_to_tuple(tb, tuple); + switch (tuple->src.l3num) { + case NFPROTO_IPV4: + ret = ipv4_nlattr_to_tuple(tb, tuple); + break; + case NFPROTO_IPV6: + ret = ipv6_nlattr_to_tuple(tb, tuple); + break; } - rcu_read_unlock(); - return ret; } @@ -1897,7 +1947,7 @@ ctnetlink_create_conntrack(struct net *net, } else { struct nf_conn_help *help; - help = nf_ct_helper_ext_add(ct, helper, GFP_ATOMIC); + help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (help == NULL) { err = -ENOMEM; goto err2; @@ -2581,7 +2631,6 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple_mask *mask) { - const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; struct nf_conntrack_tuple m; struct nlattr *nest_parms; @@ -2597,8 +2646,7 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb, goto nla_put_failure; rcu_read_lock(); - l3proto = __nf_ct_l3proto_find(tuple->src.l3num); - ret = ctnetlink_dump_tuples_ip(skb, &m, l3proto); + ret = ctnetlink_dump_tuples_ip(skb, &m); if (ret >= 0) { l4proto = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index d88841fbc560..803607a90102 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -1,14 +1,4 @@ -/* L3/L4 protocol support for nf_conntrack. 
*/ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> - * (C) 2006-2012 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ +// SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include <linux/netfilter.h> @@ -24,14 +14,36 @@ #include <linux/netdevice.h> #include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_log.h> +#include <linux/ip.h> +#include <linux/icmp.h> +#include <linux/sysctl.h> +#include <net/route.h> +#include <net/ip.h> + +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/ipv4/nf_conntrack_ipv4.h> +#include <net/netfilter/ipv6/nf_conntrack_ipv6.h> +#include <net/netfilter/nf_nat_helper.h> +#include <net/netfilter/ipv4/nf_defrag_ipv4.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> + +#include <linux/ipv6.h> +#include <linux/in6.h> +#include <net/ipv6.h> +#include <net/inet_frag.h> + +extern unsigned int nf_conntrack_net_id; + static struct nf_conntrack_l4proto __rcu **nf_ct_protos[NFPROTO_NUMPROTO] __read_mostly; -struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[NFPROTO_NUMPROTO] __read_mostly; -EXPORT_SYMBOL_GPL(nf_ct_l3protos); static DEFINE_MUTEX(nf_ct_proto_mutex); @@ -122,137 +134,6 @@ __nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto) } EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find); -/* this is guaranteed to always return a valid protocol helper, since - * it 
falls back to generic_protocol */ -const struct nf_conntrack_l3proto * -nf_ct_l3proto_find_get(u_int16_t l3proto) -{ - struct nf_conntrack_l3proto *p; - - rcu_read_lock(); - p = __nf_ct_l3proto_find(l3proto); - if (!try_module_get(p->me)) - p = &nf_conntrack_l3proto_generic; - rcu_read_unlock(); - - return p; -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_find_get); - -int -nf_ct_l3proto_try_module_get(unsigned short l3proto) -{ - const struct nf_conntrack_l3proto *p; - int ret; - -retry: p = nf_ct_l3proto_find_get(l3proto); - if (p == &nf_conntrack_l3proto_generic) { - ret = request_module("nf_conntrack-%d", l3proto); - if (!ret) - goto retry; - - return -EPROTOTYPE; - } - - return 0; -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_try_module_get); - -void nf_ct_l3proto_module_put(unsigned short l3proto) -{ - struct nf_conntrack_l3proto *p; - - /* rcu_read_lock not necessary since the caller holds a reference, but - * taken anyways to avoid lockdep warnings in __nf_ct_l3proto_find() - */ - rcu_read_lock(); - p = __nf_ct_l3proto_find(l3proto); - module_put(p->me); - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put); - -static int nf_ct_netns_do_get(struct net *net, u8 nfproto) -{ - const struct nf_conntrack_l3proto *l3proto; - int ret; - - might_sleep(); - - ret = nf_ct_l3proto_try_module_get(nfproto); - if (ret < 0) - return ret; - - /* we already have a reference, can't fail */ - rcu_read_lock(); - l3proto = __nf_ct_l3proto_find(nfproto); - rcu_read_unlock(); - - if (!l3proto->net_ns_get) - return 0; - - ret = l3proto->net_ns_get(net); - if (ret < 0) - nf_ct_l3proto_module_put(nfproto); - - return ret; -} - -int nf_ct_netns_get(struct net *net, u8 nfproto) -{ - int err; - - if (nfproto == NFPROTO_INET) { - err = nf_ct_netns_do_get(net, NFPROTO_IPV4); - if (err < 0) - goto err1; - err = nf_ct_netns_do_get(net, NFPROTO_IPV6); - if (err < 0) - goto err2; - } else { - err = nf_ct_netns_do_get(net, nfproto); - if (err < 0) - goto err1; - } - return 0; - -err2: - 
nf_ct_netns_put(net, NFPROTO_IPV4); -err1: - return err; -} -EXPORT_SYMBOL_GPL(nf_ct_netns_get); - -static void nf_ct_netns_do_put(struct net *net, u8 nfproto) -{ - const struct nf_conntrack_l3proto *l3proto; - - might_sleep(); - - /* same as nf_conntrack_netns_get(), reference assumed */ - rcu_read_lock(); - l3proto = __nf_ct_l3proto_find(nfproto); - rcu_read_unlock(); - - if (WARN_ON(!l3proto)) - return; - - if (l3proto->net_ns_put) - l3proto->net_ns_put(net); - - nf_ct_l3proto_module_put(nfproto); -} - -void nf_ct_netns_put(struct net *net, uint8_t nfproto) -{ - if (nfproto == NFPROTO_INET) { - nf_ct_netns_do_put(net, NFPROTO_IPV4); - nf_ct_netns_do_put(net, NFPROTO_IPV6); - } else - nf_ct_netns_do_put(net, nfproto); -} -EXPORT_SYMBOL_GPL(nf_ct_netns_put); - const struct nf_conntrack_l4proto * nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num) { @@ -274,11 +155,6 @@ void nf_ct_l4proto_put(const struct nf_conntrack_l4proto *p) } EXPORT_SYMBOL_GPL(nf_ct_l4proto_put); -static int kill_l3proto(struct nf_conn *i, void *data) -{ - return nf_ct_l3num(i) == ((const struct nf_conntrack_l3proto *)data)->l3proto; -} - static int kill_l4proto(struct nf_conn *i, void *data) { const struct nf_conntrack_l4proto *l4proto; @@ -287,52 +163,6 @@ static int kill_l4proto(struct nf_conn *i, void *data) nf_ct_l3num(i) == l4proto->l3proto; } -int nf_ct_l3proto_register(const struct nf_conntrack_l3proto *proto) -{ - int ret = 0; - struct nf_conntrack_l3proto *old; - - if (proto->l3proto >= NFPROTO_NUMPROTO) - return -EBUSY; -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - if (proto->tuple_to_nlattr && proto->nla_size == 0) - return -EINVAL; -#endif - mutex_lock(&nf_ct_proto_mutex); - old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto], - lockdep_is_held(&nf_ct_proto_mutex)); - if (old != &nf_conntrack_l3proto_generic) { - ret = -EBUSY; - goto out_unlock; - } - - rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto); - -out_unlock: - mutex_unlock(&nf_ct_proto_mutex); - return 
ret; - -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_register); - -void nf_ct_l3proto_unregister(const struct nf_conntrack_l3proto *proto) -{ - BUG_ON(proto->l3proto >= NFPROTO_NUMPROTO); - - mutex_lock(&nf_ct_proto_mutex); - BUG_ON(rcu_dereference_protected(nf_ct_l3protos[proto->l3proto], - lockdep_is_held(&nf_ct_proto_mutex) - ) != proto); - rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], - &nf_conntrack_l3proto_generic); - mutex_unlock(&nf_ct_proto_mutex); - - synchronize_rcu(); - /* Remove all contrack entries for this protocol */ - nf_ct_iterate_destroy(kill_l3proto, (void*)proto); -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister); - static struct nf_proto_net *nf_ct_l4proto_net(struct net *net, const struct nf_conntrack_l4proto *l4proto) { @@ -499,8 +329,23 @@ void nf_ct_l4proto_pernet_unregister_one(struct net *net, } EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister_one); -int nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[], - unsigned int num_proto) +static void +nf_ct_l4proto_unregister(const struct nf_conntrack_l4proto * const l4proto[], + unsigned int num_proto) +{ + mutex_lock(&nf_ct_proto_mutex); + while (num_proto-- != 0) + __nf_ct_l4proto_unregister_one(l4proto[num_proto]); + mutex_unlock(&nf_ct_proto_mutex); + + synchronize_net(); + /* Remove all contrack entries for this protocol */ + nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto); +} + +static int +nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[], + unsigned int num_proto) { int ret = -EINVAL, ver; unsigned int i; @@ -518,7 +363,6 @@ int nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[], } return ret; } -EXPORT_SYMBOL_GPL(nf_ct_l4proto_register); int nf_ct_l4proto_pernet_register(struct net *net, const struct nf_conntrack_l4proto *const l4proto[], @@ -542,20 +386,6 @@ int nf_ct_l4proto_pernet_register(struct net *net, } EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register); -void nf_ct_l4proto_unregister(const struct 
nf_conntrack_l4proto * const l4proto[], - unsigned int num_proto) -{ - mutex_lock(&nf_ct_proto_mutex); - while (num_proto-- != 0) - __nf_ct_l4proto_unregister_one(l4proto[num_proto]); - mutex_unlock(&nf_ct_proto_mutex); - - synchronize_net(); - /* Remove all contrack entries for this protocol */ - nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto); -} -EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister); - void nf_ct_l4proto_pernet_unregister(struct net *net, const struct nf_conntrack_l4proto *const l4proto[], unsigned int num_proto) @@ -565,6 +395,563 @@ void nf_ct_l4proto_pernet_unregister(struct net *net, } EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister); +static unsigned int ipv4_helper(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + const struct nf_conn_help *help; + const struct nf_conntrack_helper *helper; + + /* This is where we call the helper: as the packet goes out. */ + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + return NF_ACCEPT; + + help = nfct_help(ct); + if (!help) + return NF_ACCEPT; + + /* rcu_read_lock()ed by nf_hook_thresh */ + helper = rcu_dereference(help->helper); + if (!helper) + return NF_ACCEPT; + + return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), + ct, ctinfo); +} + +static unsigned int ipv4_confirm(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + goto out; + + /* adjust seqs for loopback traffic only in outgoing direction */ + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && + !nf_is_loopback_packet(skb)) { + if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) { + NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); + return NF_DROP; + } + } +out: + /* We've seen it coming out the other side: confirm it */ + return nf_conntrack_confirm(skb); +} + +static 
unsigned int ipv4_conntrack_in(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + return nf_conntrack_in(state->net, PF_INET, state->hook, skb); +} + +static unsigned int ipv4_conntrack_local(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + if (ip_is_fragment(ip_hdr(skb))) { /* IP_NODEFRAG setsockopt set */ + enum ip_conntrack_info ctinfo; + struct nf_conn *tmpl; + + tmpl = nf_ct_get(skb, &ctinfo); + if (tmpl && nf_ct_is_template(tmpl)) { + /* when skipping ct, clear templates to avoid fooling + * later targets/matches + */ + skb->_nfct = 0; + nf_ct_put(tmpl); + } + return NF_ACCEPT; + } + + return nf_conntrack_in(state->net, PF_INET, state->hook, skb); +} + +/* Connection tracking may drop packets, but never alters them, so + * make it the first hook. + */ +static const struct nf_hook_ops ipv4_conntrack_ops[] = { + { + .hook = ipv4_conntrack_in, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP_PRI_CONNTRACK, + }, + { + .hook = ipv4_conntrack_local, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP_PRI_CONNTRACK, + }, + { + .hook = ipv4_helper, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_HELPER, + }, + { + .hook = ipv4_confirm, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM, + }, + { + .hook = ipv4_helper, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_HELPER, + }, + { + .hook = ipv4_confirm, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM, + }, +}; + +/* Fast function for those who don't want to parse /proc (and I don't + * blame them). + * Reversing the socket's dst/src point of view gives us the reply + * mapping. 
+ */ +static int +getorigdst(struct sock *sk, int optval, void __user *user, int *len) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + + memset(&tuple, 0, sizeof(tuple)); + + lock_sock(sk); + tuple.src.u3.ip = inet->inet_rcv_saddr; + tuple.src.u.tcp.port = inet->inet_sport; + tuple.dst.u3.ip = inet->inet_daddr; + tuple.dst.u.tcp.port = inet->inet_dport; + tuple.src.l3num = PF_INET; + tuple.dst.protonum = sk->sk_protocol; + release_sock(sk); + + /* We only do TCP and SCTP at the moment: is there a better way? */ + if (tuple.dst.protonum != IPPROTO_TCP && + tuple.dst.protonum != IPPROTO_SCTP) { + pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n"); + return -ENOPROTOOPT; + } + + if ((unsigned int)*len < sizeof(struct sockaddr_in)) { + pr_debug("SO_ORIGINAL_DST: len %d not %zu\n", + *len, sizeof(struct sockaddr_in)); + return -EINVAL; + } + + h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); + if (h) { + struct sockaddr_in sin; + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + sin.sin_family = AF_INET; + sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.u.tcp.port; + sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.u3.ip; + memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); + + pr_debug("SO_ORIGINAL_DST: %pI4 %u\n", + &sin.sin_addr.s_addr, ntohs(sin.sin_port)); + nf_ct_put(ct); + if (copy_to_user(user, &sin, sizeof(sin)) != 0) + return -EFAULT; + else + return 0; + } + pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n", + &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port), + &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port)); + return -ENOENT; +} + +static struct nf_sockopt_ops so_getorigdst = { + .pf = PF_INET, + .get_optmin = SO_ORIGINAL_DST, + .get_optmax = SO_ORIGINAL_DST + 1, + .get = getorigdst, + .owner = THIS_MODULE, +}; + +#if IS_ENABLED(CONFIG_IPV6) +static int +ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int 
*len) +{ + struct nf_conntrack_tuple tuple = { .src.l3num = NFPROTO_IPV6 }; + const struct ipv6_pinfo *inet6 = inet6_sk(sk); + const struct inet_sock *inet = inet_sk(sk); + const struct nf_conntrack_tuple_hash *h; + struct sockaddr_in6 sin6; + struct nf_conn *ct; + __be32 flow_label; + int bound_dev_if; + + lock_sock(sk); + tuple.src.u3.in6 = sk->sk_v6_rcv_saddr; + tuple.src.u.tcp.port = inet->inet_sport; + tuple.dst.u3.in6 = sk->sk_v6_daddr; + tuple.dst.u.tcp.port = inet->inet_dport; + tuple.dst.protonum = sk->sk_protocol; + bound_dev_if = sk->sk_bound_dev_if; + flow_label = inet6->flow_label; + release_sock(sk); + + if (tuple.dst.protonum != IPPROTO_TCP && + tuple.dst.protonum != IPPROTO_SCTP) + return -ENOPROTOOPT; + + if (*len < 0 || (unsigned int)*len < sizeof(sin6)) + return -EINVAL; + + h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); + if (!h) { + pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n", + &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port), + &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port)); + return -ENOENT; + } + + ct = nf_ct_tuplehash_to_ctrack(h); + + sin6.sin6_family = AF_INET6; + sin6.sin6_port = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port; + sin6.sin6_flowinfo = flow_label & IPV6_FLOWINFO_MASK; + memcpy(&sin6.sin6_addr, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6, + sizeof(sin6.sin6_addr)); + + nf_ct_put(ct); + sin6.sin6_scope_id = ipv6_iface_scope_id(&sin6.sin6_addr, bound_dev_if); + return copy_to_user(user, &sin6, sizeof(sin6)) ? 
-EFAULT : 0; +} + +static struct nf_sockopt_ops so_getorigdst6 = { + .pf = NFPROTO_IPV6, + .get_optmin = IP6T_SO_ORIGINAL_DST, + .get_optmax = IP6T_SO_ORIGINAL_DST + 1, + .get = ipv6_getorigdst, + .owner = THIS_MODULE, +}; + +static unsigned int ipv6_confirm(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + unsigned char pnum = ipv6_hdr(skb)->nexthdr; + int protoff; + __be16 frag_off; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + goto out; + + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, + &frag_off); + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { + pr_debug("proto header not found\n"); + goto out; + } + + /* adjust seqs for loopback traffic only in outgoing direction */ + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && + !nf_is_loopback_packet(skb)) { + if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { + NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); + return NF_DROP; + } + } +out: + /* We've seen it coming out the other side: confirm it */ + return nf_conntrack_confirm(skb); +} + +static unsigned int ipv6_conntrack_in(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); +} + +static unsigned int ipv6_conntrack_local(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); +} + +static unsigned int ipv6_helper(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_conn *ct; + const struct nf_conn_help *help; + const struct nf_conntrack_helper *helper; + enum ip_conntrack_info ctinfo; + __be16 frag_off; + int protoff; + u8 nexthdr; + + /* This is where we call the helper: as the packet goes out. 
*/ + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + return NF_ACCEPT; + + help = nfct_help(ct); + if (!help) + return NF_ACCEPT; + /* rcu_read_lock()ed by nf_hook_thresh */ + helper = rcu_dereference(help->helper); + if (!helper) + return NF_ACCEPT; + + nexthdr = ipv6_hdr(skb)->nexthdr; + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { + pr_debug("proto header not found\n"); + return NF_ACCEPT; + } + + return helper->help(skb, protoff, ct, ctinfo); +} + +static const struct nf_hook_ops ipv6_conntrack_ops[] = { + { + .hook = ipv6_conntrack_in, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP6_PRI_CONNTRACK, + }, + { + .hook = ipv6_conntrack_local, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP6_PRI_CONNTRACK, + }, + { + .hook = ipv6_helper, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_CONNTRACK_HELPER, + }, + { + .hook = ipv6_confirm, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_LAST, + }, + { + .hook = ipv6_helper, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP6_PRI_CONNTRACK_HELPER, + }, + { + .hook = ipv6_confirm, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP6_PRI_LAST - 1, + }, +}; +#endif + +static int nf_ct_netns_do_get(struct net *net, u8 nfproto) +{ + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + int err = 0; + + mutex_lock(&nf_ct_proto_mutex); + + switch (nfproto) { + case NFPROTO_IPV4: + cnet->users4++; + if (cnet->users4 > 1) + goto out_unlock; + err = nf_defrag_ipv4_enable(net); + if (err) { + cnet->users4 = 0; + goto out_unlock; + } + + err = nf_register_net_hooks(net, ipv4_conntrack_ops, + ARRAY_SIZE(ipv4_conntrack_ops)); + if (err) + cnet->users4 = 0; + break; +#if IS_ENABLED(CONFIG_IPV6) + case NFPROTO_IPV6: + cnet->users6++; + if 
(cnet->users6 > 1) + goto out_unlock; + err = nf_defrag_ipv6_enable(net); + if (err < 0) { + cnet->users6 = 0; + goto out_unlock; + } + + err = nf_register_net_hooks(net, ipv6_conntrack_ops, + ARRAY_SIZE(ipv6_conntrack_ops)); + if (err) + cnet->users6 = 0; + break; +#endif + default: + err = -EPROTO; + break; + } + out_unlock: + mutex_unlock(&nf_ct_proto_mutex); + return err; +} + +static void nf_ct_netns_do_put(struct net *net, u8 nfproto) +{ + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + + mutex_lock(&nf_ct_proto_mutex); + switch (nfproto) { + case NFPROTO_IPV4: + if (cnet->users4 && (--cnet->users4 == 0)) + nf_unregister_net_hooks(net, ipv4_conntrack_ops, + ARRAY_SIZE(ipv4_conntrack_ops)); + break; +#if IS_ENABLED(CONFIG_IPV6) + case NFPROTO_IPV6: + if (cnet->users6 && (--cnet->users6 == 0)) + nf_unregister_net_hooks(net, ipv6_conntrack_ops, + ARRAY_SIZE(ipv6_conntrack_ops)); + break; +#endif + } + + mutex_unlock(&nf_ct_proto_mutex); +} + +int nf_ct_netns_get(struct net *net, u8 nfproto) +{ + int err; + + if (nfproto == NFPROTO_INET) { + err = nf_ct_netns_do_get(net, NFPROTO_IPV4); + if (err < 0) + goto err1; + err = nf_ct_netns_do_get(net, NFPROTO_IPV6); + if (err < 0) + goto err2; + } else { + err = nf_ct_netns_do_get(net, nfproto); + if (err < 0) + goto err1; + } + return 0; + +err2: + nf_ct_netns_put(net, NFPROTO_IPV4); +err1: + return err; +} +EXPORT_SYMBOL_GPL(nf_ct_netns_get); + +void nf_ct_netns_put(struct net *net, uint8_t nfproto) +{ + if (nfproto == NFPROTO_INET) { + nf_ct_netns_do_put(net, NFPROTO_IPV4); + nf_ct_netns_do_put(net, NFPROTO_IPV6); + } else { + nf_ct_netns_do_put(net, nfproto); + } +} +EXPORT_SYMBOL_GPL(nf_ct_netns_put); + +static const struct nf_conntrack_l4proto * const builtin_l4proto[] = { + &nf_conntrack_l4proto_tcp4, + &nf_conntrack_l4proto_udp4, + &nf_conntrack_l4proto_icmp, +#ifdef CONFIG_NF_CT_PROTO_DCCP + &nf_conntrack_l4proto_dccp4, +#endif +#ifdef CONFIG_NF_CT_PROTO_SCTP + 
&nf_conntrack_l4proto_sctp4, +#endif +#ifdef CONFIG_NF_CT_PROTO_UDPLITE + &nf_conntrack_l4proto_udplite4, +#endif +#if IS_ENABLED(CONFIG_IPV6) + &nf_conntrack_l4proto_tcp6, + &nf_conntrack_l4proto_udp6, + &nf_conntrack_l4proto_icmpv6, +#ifdef CONFIG_NF_CT_PROTO_DCCP + &nf_conntrack_l4proto_dccp6, +#endif +#ifdef CONFIG_NF_CT_PROTO_SCTP + &nf_conntrack_l4proto_sctp6, +#endif +#ifdef CONFIG_NF_CT_PROTO_UDPLITE + &nf_conntrack_l4proto_udplite6, +#endif +#endif /* CONFIG_IPV6 */ +}; + +int nf_conntrack_proto_init(void) +{ + int ret = 0; + + ret = nf_register_sockopt(&so_getorigdst); + if (ret < 0) + return ret; + +#if IS_ENABLED(CONFIG_IPV6) + ret = nf_register_sockopt(&so_getorigdst6); + if (ret < 0) + goto cleanup_sockopt; +#endif + ret = nf_ct_l4proto_register(builtin_l4proto, + ARRAY_SIZE(builtin_l4proto)); + if (ret < 0) + goto cleanup_sockopt2; + + return ret; +cleanup_sockopt2: + nf_unregister_sockopt(&so_getorigdst); +#if IS_ENABLED(CONFIG_IPV6) +cleanup_sockopt: + nf_unregister_sockopt(&so_getorigdst6); +#endif + return ret; +} + +void nf_conntrack_proto_fini(void) +{ + unsigned int i; + + nf_ct_l4proto_unregister(builtin_l4proto, + ARRAY_SIZE(builtin_l4proto)); + nf_unregister_sockopt(&so_getorigdst); +#if IS_ENABLED(CONFIG_IPV6) + nf_unregister_sockopt(&so_getorigdst6); +#endif + + /* free l3proto protocol tables */ + for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++) + kfree(nf_ct_protos[i]); +} + int nf_conntrack_proto_pernet_init(struct net *net) { int err; @@ -581,6 +968,14 @@ int nf_conntrack_proto_pernet_init(struct net *net) if (err < 0) return err; + err = nf_ct_l4proto_pernet_register(net, builtin_l4proto, + ARRAY_SIZE(builtin_l4proto)); + if (err < 0) { + nf_ct_l4proto_unregister_sysctl(net, pn, + &nf_conntrack_l4proto_generic); + return err; + } + pn->users++; return 0; } @@ -590,25 +985,19 @@ void nf_conntrack_proto_pernet_fini(struct net *net) struct nf_proto_net *pn = nf_ct_l4proto_net(net, &nf_conntrack_l4proto_generic); + 
nf_ct_l4proto_pernet_unregister(net, builtin_l4proto, + ARRAY_SIZE(builtin_l4proto)); pn->users--; nf_ct_l4proto_unregister_sysctl(net, pn, &nf_conntrack_l4proto_generic); } -int nf_conntrack_proto_init(void) -{ - unsigned int i; - for (i = 0; i < NFPROTO_NUMPROTO; i++) - rcu_assign_pointer(nf_ct_l3protos[i], - &nf_conntrack_l3proto_generic); - return 0; -} -void nf_conntrack_proto_fini(void) -{ - unsigned int i; - /* free l3proto protocol tables */ - for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++) - kfree(nf_ct_protos[i]); -} +module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, + &nf_conntrack_htable_size, 0600); + +MODULE_ALIAS("ip_conntrack"); +MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); +MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6)); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index abe647d5b8c6..8c58f96b59e7 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -23,6 +23,7 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_log.h> /* Timeouts are based on values from RFC4340: @@ -243,14 +244,14 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = * We currently ignore Sync packets * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG, + sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, }, [DCCP_PKT_SYNCACK] = { /* * We currently ignore SyncAck packets * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG, + sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, }, }, [CT_DCCP_ROLE_SERVER] = { @@ -371,14 +372,14 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = * We currently ignore Sync packets * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIG, 
sIG, sIG, sIG, sIG, sIG, sIG, sIG, + sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, }, [DCCP_PKT_SYNCACK] = { /* * We currently ignore SyncAck packets * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG, + sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, }, }, }; @@ -388,31 +389,8 @@ static inline struct nf_dccp_net *dccp_pernet(struct net *net) return &net->ct.nf_ct_proto.dccp; } -static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct net *net, struct nf_conntrack_tuple *tuple) -{ - struct dccp_hdr _hdr, *dh; - - /* Actually only need first 4 bytes to get ports. */ - dh = skb_header_pointer(skb, dataoff, 4, &_hdr); - if (dh == NULL) - return false; - - tuple->src.u.dccp.port = dh->dccph_sport; - tuple->dst.u.dccp.port = dh->dccph_dport; - return true; -} - -static bool dccp_invert_tuple(struct nf_conntrack_tuple *inv, - const struct nf_conntrack_tuple *tuple) -{ - inv->src.u.dccp.port = tuple->dst.u.dccp.port; - inv->dst.u.dccp.port = tuple->src.u.dccp.port; - return true; -} - static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) + unsigned int dataoff) { struct net *net = nf_ct_net(ct); struct nf_dccp_net *dn; @@ -460,19 +438,14 @@ static u64 dccp_ack_seq(const struct dccp_hdr *dh) ntohl(dhack->dccph_ack_nr_low); } -static unsigned int *dccp_get_timeouts(struct net *net) -{ - return dccp_pernet(net)->dccp_timeout; -} - static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, enum ip_conntrack_info ctinfo, - unsigned int *timeouts) + unsigned int dataoff, enum ip_conntrack_info ctinfo) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct dccp_hdr _dh, *dh; u_int8_t type, old_state, new_state; enum ct_dccp_roles role; + unsigned int *timeouts; dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); BUG_ON(dh == NULL); @@ -546,6 +519,9 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, if 
(new_state != old_state) nf_conntrack_event_cache(IPCT_PROTOINFO, ct); + timeouts = nf_ct_timeout_lookup(ct); + if (!timeouts) + timeouts = dccp_pernet(nf_ct_net(ct))->dccp_timeout; nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); return NF_ACCEPT; @@ -864,11 +840,8 @@ static struct nf_proto_net *dccp_get_net_proto(struct net *net) const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 = { .l3proto = AF_INET, .l4proto = IPPROTO_DCCP, - .pkt_to_tuple = dccp_pkt_to_tuple, - .invert_tuple = dccp_invert_tuple, .new = dccp_new, .packet = dccp_packet, - .get_timeouts = dccp_get_timeouts, .error = dccp_error, .can_early_drop = dccp_can_early_drop, #ifdef CONFIG_NF_CONNTRACK_PROCFS @@ -900,11 +873,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp4); const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = { .l3proto = AF_INET6, .l4proto = IPPROTO_DCCP, - .pkt_to_tuple = dccp_pkt_to_tuple, - .invert_tuple = dccp_invert_tuple, .new = dccp_new, .packet = dccp_packet, - .get_timeouts = dccp_get_timeouts, .error = dccp_error, .can_early_drop = dccp_can_early_drop, #ifdef CONFIG_NF_CONNTRACK_PROCFS diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index 6c6896d21cd7..ac4a0b296dcd 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -11,6 +11,7 @@ #include <linux/timer.h> #include <linux/netfilter.h> #include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_timeout.h> static const unsigned int nf_ct_generic_timeout = 600*HZ; @@ -41,34 +42,24 @@ static bool generic_pkt_to_tuple(const struct sk_buff *skb, return true; } -static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - tuple->src.u.all = 0; - tuple->dst.u.all = 0; - - return true; -} - -static unsigned int *generic_get_timeouts(struct net *net) -{ - return &(generic_pernet(net)->timeout); -} - /* Returns verdict for 
packet, or -1 for invalid. */ static int generic_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeout) + enum ip_conntrack_info ctinfo) { + const unsigned int *timeout = nf_ct_timeout_lookup(ct); + + if (!timeout) + timeout = &generic_pernet(nf_ct_net(ct))->timeout; + nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; } /* Called when a new connection for this protocol found. */ static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) + unsigned int dataoff) { bool ret; @@ -87,8 +78,11 @@ static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb, static int generic_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { - unsigned int *timeout = data; struct nf_generic_net *gn = generic_pernet(net); + unsigned int *timeout = data; + + if (!timeout) + timeout = &gn->timeout; if (tb[CTA_TIMEOUT_GENERIC_TIMEOUT]) *timeout = @@ -168,9 +162,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic = .l3proto = PF_UNSPEC, .l4proto = 255, .pkt_to_tuple = generic_pkt_to_tuple, - .invert_tuple = generic_invert_tuple, .packet = generic_packet, - .get_timeouts = generic_get_timeouts, .new = generic_new, #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) .ctnl_timeout = { diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index d049ea5a3770..d1632252bf5b 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -39,6 +39,7 @@ #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_timeout.h> #include <linux/netfilter/nf_conntrack_proto_gre.h> #include <linux/netfilter/nf_conntrack_pptp.h> @@ -179,15 +180,6 @@ EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_destroy); /* PUBLIC CONNTRACK PROTO HELPER 
FUNCTIONS */ -/* invert gre part of tuple */ -static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - tuple->dst.u.gre.key = orig->src.u.gre.key; - tuple->src.u.gre.key = orig->dst.u.gre.key; - return true; -} - /* gre hdr info to tuple */ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple) @@ -243,8 +235,7 @@ static unsigned int *gre_get_timeouts(struct net *net) static int gre_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeouts) + enum ip_conntrack_info ctinfo) { /* If we've seen traffic both ways, this is a GRE connection. * Extend timeout. */ @@ -263,8 +254,13 @@ static int gre_packet(struct nf_conn *ct, /* Called when a new connection for this protocol found. */ static bool gre_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) + unsigned int dataoff) { + unsigned int *timeouts = nf_ct_timeout_lookup(ct); + + if (!timeouts) + timeouts = gre_get_timeouts(nf_ct_net(ct)); + pr_debug(": "); nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); @@ -300,6 +296,8 @@ static int gre_timeout_nlattr_to_obj(struct nlattr *tb[], unsigned int *timeouts = data; struct netns_proto_gre *net_gre = gre_pernet(net); + if (!timeouts) + timeouts = gre_get_timeouts(net); /* set default timeouts for GRE. 
*/ timeouts[GRE_CT_UNREPLIED] = net_gre->gre_timeouts[GRE_CT_UNREPLIED]; timeouts[GRE_CT_REPLIED] = net_gre->gre_timeouts[GRE_CT_REPLIED]; @@ -356,11 +354,9 @@ static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = { .l3proto = AF_INET, .l4proto = IPPROTO_GRE, .pkt_to_tuple = gre_pkt_to_tuple, - .invert_tuple = gre_invert_tuple, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = gre_print_conntrack, #endif - .get_timeouts = gre_get_timeouts, .packet = gre_packet, .new = gre_new, .destroy = gre_destroy, diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index 5c15beafa711..036670b38282 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -19,6 +19,7 @@ #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_log.h> @@ -80,12 +81,16 @@ static unsigned int *icmp_get_timeouts(struct net *net) static int icmp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeout) + enum ip_conntrack_info ctinfo) { /* Do not immediately delete the connection after the first successful reply to avoid excessive conntrackd traffic and also to handle correctly ICMP echo reply duplicates. */ + unsigned int *timeout = nf_ct_timeout_lookup(ct); + + if (!timeout) + timeout = icmp_get_timeouts(nf_ct_net(ct)); + nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; @@ -93,7 +98,7 @@ static int icmp_packet(struct nf_conn *ct, /* Called when a new connection for this protocol found. 
*/ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) + unsigned int dataoff) { static const u_int8_t valid_new[] = { [ICMP_ECHO] = 1, @@ -142,8 +147,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, /* Ordinarily, we'd expect the inverted tupleproto, but it's been preserved inside the ICMP. */ - if (!nf_ct_invert_tuple(&innertuple, &origtuple, - &nf_conntrack_l3proto_ipv4, innerproto)) { + if (!nf_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { pr_debug("icmp_error_message: no match\n"); return -NF_ACCEPT; } @@ -281,9 +285,11 @@ static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], struct nf_icmp_net *in = icmp_pernet(net); if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { + if (!timeout) + timeout = &in->timeout; *timeout = ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ; - } else { + } else if (timeout) { /* Set default ICMP timeout. */ *timeout = in->timeout; } @@ -358,7 +364,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = .pkt_to_tuple = icmp_pkt_to_tuple, .invert_tuple = icmp_invert_tuple, .packet = icmp_packet, - .get_timeouts = icmp_get_timeouts, .new = icmp_new, .error = icmp_error, .destroy = NULL, diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index 2548e2c8aedd..bed07b998a10 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -23,6 +23,7 @@ #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/ipv6/nf_conntrack_icmpv6.h> #include <net/netfilter/nf_log.h> @@ -93,9 +94,13 @@ static unsigned int *icmpv6_get_timeouts(struct net *net) static int icmpv6_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int 
dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeout) + enum ip_conntrack_info ctinfo) { + unsigned int *timeout = nf_ct_timeout_lookup(ct); + + if (!timeout) + timeout = icmpv6_get_timeouts(nf_ct_net(ct)); + /* Do not immediately delete the connection after the first successful reply to avoid excessive conntrackd traffic and also to handle correctly ICMP echo reply duplicates. */ @@ -106,7 +111,7 @@ static int icmpv6_packet(struct nf_conn *ct, /* Called when a new connection for this protocol found. */ static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) + unsigned int dataoff) { static const u_int8_t valid_new[] = { [ICMPV6_ECHO_REQUEST - 128] = 1, @@ -152,8 +157,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, /* Ordinarily, we'd expect the inverted tupleproto, but it's been preserved inside the ICMP. */ - if (!nf_ct_invert_tuple(&intuple, &origtuple, - &nf_conntrack_l3proto_ipv6, inproto)) { + if (!nf_ct_invert_tuple(&intuple, &origtuple, inproto)) { pr_debug("icmpv6_error: Can't invert tuple\n"); return -NF_ACCEPT; } @@ -281,6 +285,8 @@ static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[], unsigned int *timeout = data; struct nf_icmp_net *in = icmpv6_pernet(net); + if (!timeout) + timeout = icmpv6_get_timeouts(net); if (tb[CTA_TIMEOUT_ICMPV6_TIMEOUT]) { *timeout = ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMPV6_TIMEOUT])) * HZ; @@ -359,7 +365,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 = .pkt_to_tuple = icmpv6_pkt_to_tuple, .invert_tuple = icmpv6_invert_tuple, .packet = icmpv6_packet, - .get_timeouts = icmpv6_get_timeouts, .new = icmpv6_new, .error = icmpv6_error, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index fb9a35d16069..8d1e085fc14a 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -28,6 +28,7 @@ 
#include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_timeout.h> /* FIXME: Examine ipfilter's timeouts and conntrack transitions more closely. They're more complex. --RR @@ -150,30 +151,6 @@ static inline struct nf_sctp_net *sctp_pernet(struct net *net) return &net->ct.nf_ct_proto.sctp; } -static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct net *net, struct nf_conntrack_tuple *tuple) -{ - const struct sctphdr *hp; - struct sctphdr _hdr; - - /* Actually only need first 4 bytes to get ports. */ - hp = skb_header_pointer(skb, dataoff, 4, &_hdr); - if (hp == NULL) - return false; - - tuple->src.u.sctp.port = hp->source; - tuple->dst.u.sctp.port = hp->dest; - return true; -} - -static bool sctp_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - tuple->src.u.sctp.port = orig->dst.u.sctp.port; - tuple->dst.u.sctp.port = orig->src.u.sctp.port; - return true; -} - #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) @@ -296,17 +273,11 @@ static int sctp_new_state(enum ip_conntrack_dir dir, return sctp_conntracks[dir][i][cur_state]; } -static unsigned int *sctp_get_timeouts(struct net *net) -{ - return sctp_pernet(net)->timeouts; -} - /* Returns verdict for packet, or -NF_ACCEPT for invalid. 
*/ static int sctp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeouts) + enum ip_conntrack_info ctinfo) { enum sctp_conntrack new_state, old_state; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); @@ -315,6 +286,7 @@ static int sctp_packet(struct nf_conn *ct, const struct sctp_chunkhdr *sch; struct sctp_chunkhdr _sch; u_int32_t offset, count; + unsigned int *timeouts; unsigned long map[256 / sizeof(unsigned long)] = { 0 }; sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); @@ -403,6 +375,10 @@ static int sctp_packet(struct nf_conn *ct, } spin_unlock_bh(&ct->lock); + timeouts = nf_ct_timeout_lookup(ct); + if (!timeouts) + timeouts = sctp_pernet(nf_ct_net(ct))->timeouts; + nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED && @@ -423,7 +399,7 @@ out: /* Called when a new connection for this protocol found. */ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) + unsigned int dataoff) { enum sctp_conntrack new_state; const struct sctphdr *sh; @@ -780,13 +756,10 @@ static struct nf_proto_net *sctp_get_net_proto(struct net *net) const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = { .l3proto = PF_INET, .l4proto = IPPROTO_SCTP, - .pkt_to_tuple = sctp_pkt_to_tuple, - .invert_tuple = sctp_invert_tuple, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = sctp_print_conntrack, #endif .packet = sctp_packet, - .get_timeouts = sctp_get_timeouts, .new = sctp_new, .error = sctp_error, .can_early_drop = sctp_can_early_drop, @@ -817,13 +790,10 @@ EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp4); const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = { .l3proto = PF_INET6, .l4proto = IPPROTO_SCTP, - .pkt_to_tuple = sctp_pkt_to_tuple, - .invert_tuple = sctp_invert_tuple, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = sctp_print_conntrack, #endif .packet 
= sctp_packet, - .get_timeouts = sctp_get_timeouts, .new = sctp_new, .error = sctp_error, .can_early_drop = sctp_can_early_drop, diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 8e67910185a0..d80d322b9d8b 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -29,6 +29,7 @@ #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_synproxy.h> +#include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_log.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> @@ -276,31 +277,6 @@ static inline struct nf_tcp_net *tcp_pernet(struct net *net) return &net->ct.nf_ct_proto.tcp; } -static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct net *net, struct nf_conntrack_tuple *tuple) -{ - const struct tcphdr *hp; - struct tcphdr _hdr; - - /* Actually only need first 4 bytes to get ports. */ - hp = skb_header_pointer(skb, dataoff, 4, &_hdr); - if (hp == NULL) - return false; - - tuple->src.u.tcp.port = hp->source; - tuple->dst.u.tcp.port = hp->dest; - - return true; -} - -static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - tuple->src.u.tcp.port = orig->dst.u.tcp.port; - tuple->dst.u.tcp.port = orig->src.u.tcp.port; - return true; -} - #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) @@ -793,27 +769,21 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, return NF_ACCEPT; } -static unsigned int *tcp_get_timeouts(struct net *net) -{ - return tcp_pernet(net)->timeouts; -} - /* Returns verdict for packet, or -1 for invalid. 
*/ static int tcp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeouts) + enum ip_conntrack_info ctinfo) { struct net *net = nf_ct_net(ct); struct nf_tcp_net *tn = tcp_pernet(net); struct nf_conntrack_tuple *tuple; enum tcp_conntrack new_state, old_state; + unsigned int index, *timeouts; enum ip_conntrack_dir dir; const struct tcphdr *th; struct tcphdr _tcph; unsigned long timeout; - unsigned int index; th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); BUG_ON(th == NULL); @@ -1046,6 +1016,10 @@ static int tcp_packet(struct nf_conn *ct, && new_state == TCP_CONNTRACK_FIN_WAIT) ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; + timeouts = nf_ct_timeout_lookup(ct); + if (!timeouts) + timeouts = tn->timeouts; + if (ct->proto.tcp.retrans >= tn->tcp_max_retrans && timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS]) timeout = timeouts[TCP_CONNTRACK_RETRANS]; @@ -1095,7 +1069,7 @@ static int tcp_packet(struct nf_conn *ct, /* Called when a new connection for this protocol found. */ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) + unsigned int dataoff) { enum tcp_conntrack new_state; const struct tcphdr *th; @@ -1313,10 +1287,12 @@ static unsigned int tcp_nlattr_tuple_size(void) static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { - unsigned int *timeouts = data; struct nf_tcp_net *tn = tcp_pernet(net); + unsigned int *timeouts = data; int i; + if (!timeouts) + timeouts = tn->timeouts; /* set default TCP timeouts. 
*/ for (i=0; i<TCP_CONNTRACK_TIMEOUT_MAX; i++) timeouts[i] = tn->timeouts[i]; @@ -1559,13 +1535,10 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 = { .l3proto = PF_INET, .l4proto = IPPROTO_TCP, - .pkt_to_tuple = tcp_pkt_to_tuple, - .invert_tuple = tcp_invert_tuple, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = tcp_print_conntrack, #endif .packet = tcp_packet, - .get_timeouts = tcp_get_timeouts, .new = tcp_new, .error = tcp_error, .can_early_drop = tcp_can_early_drop, @@ -1597,13 +1570,10 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 = { .l3proto = PF_INET6, .l4proto = IPPROTO_TCP, - .pkt_to_tuple = tcp_pkt_to_tuple, - .invert_tuple = tcp_invert_tuple, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = tcp_print_conntrack, #endif .packet = tcp_packet, - .get_timeouts = tcp_get_timeouts, .new = tcp_new, .error = tcp_error, .can_early_drop = tcp_can_early_drop, diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index fe7243970aa4..7a1b8988a931 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -22,6 +22,7 @@ #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_log.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> @@ -36,33 +37,6 @@ static inline struct nf_udp_net *udp_pernet(struct net *net) return &net->ct.nf_ct_proto.udp; } -static bool udp_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct net *net, - struct nf_conntrack_tuple *tuple) -{ - const struct udphdr *hp; - struct udphdr _hdr; - - /* Actually only need first 4 bytes to get ports. 
*/ - hp = skb_header_pointer(skb, dataoff, 4, &_hdr); - if (hp == NULL) - return false; - - tuple->src.u.udp.port = hp->source; - tuple->dst.u.udp.port = hp->dest; - - return true; -} - -static bool udp_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - tuple->src.u.udp.port = orig->dst.u.udp.port; - tuple->dst.u.udp.port = orig->src.u.udp.port; - return true; -} - static unsigned int *udp_get_timeouts(struct net *net) { return udp_pernet(net)->timeouts; @@ -72,9 +46,14 @@ static unsigned int *udp_get_timeouts(struct net *net) static int udp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeouts) + enum ip_conntrack_info ctinfo) { + unsigned int *timeouts; + + timeouts = nf_ct_timeout_lookup(ct); + if (!timeouts) + timeouts = udp_get_timeouts(nf_ct_net(ct)); + /* If we've seen traffic both ways, this is some kind of UDP stream. Extend timeout. */ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { @@ -92,7 +71,7 @@ static int udp_packet(struct nf_conn *ct, /* Called when a new connection for this protocol found. */ static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) + unsigned int dataoff) { return true; } @@ -203,6 +182,9 @@ static int udp_timeout_nlattr_to_obj(struct nlattr *tb[], unsigned int *timeouts = data; struct nf_udp_net *un = udp_pernet(net); + if (!timeouts) + timeouts = un->timeouts; + /* set default timeouts for UDP. 
*/ timeouts[UDP_CT_UNREPLIED] = un->timeouts[UDP_CT_UNREPLIED]; timeouts[UDP_CT_REPLIED] = un->timeouts[UDP_CT_REPLIED]; @@ -301,10 +283,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 = .l3proto = PF_INET, .l4proto = IPPROTO_UDP, .allow_clash = true, - .pkt_to_tuple = udp_pkt_to_tuple, - .invert_tuple = udp_invert_tuple, .packet = udp_packet, - .get_timeouts = udp_get_timeouts, .new = udp_new, .error = udp_error, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -333,10 +312,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 = .l3proto = PF_INET, .l4proto = IPPROTO_UDPLITE, .allow_clash = true, - .pkt_to_tuple = udp_pkt_to_tuple, - .invert_tuple = udp_invert_tuple, .packet = udp_packet, - .get_timeouts = udp_get_timeouts, .new = udp_new, .error = udplite_error, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -365,10 +341,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 = .l3proto = PF_INET6, .l4proto = IPPROTO_UDP, .allow_clash = true, - .pkt_to_tuple = udp_pkt_to_tuple, - .invert_tuple = udp_invert_tuple, .packet = udp_packet, - .get_timeouts = udp_get_timeouts, .new = udp_new, .error = udp_error, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -397,10 +370,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 = .l3proto = PF_INET6, .l4proto = IPPROTO_UDPLITE, .allow_clash = true, - .pkt_to_tuple = udp_pkt_to_tuple, - .invert_tuple = udp_invert_tuple, .packet = udp_packet, - .get_timeouts = udp_get_timeouts, .new = udp_new, .error = udplite_error, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -423,3 +393,4 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 = }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6); #endif +#include <net/netfilter/nf_conntrack_timeout.h> diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index b642c0b2495c..13279f683da9 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -1,12 +1,4 @@ -/* (C) 
1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * (C) 2005-2012 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - +// SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include <linux/netfilter.h> #include <linux/slab.h> @@ -24,7 +16,6 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> -#include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> @@ -33,15 +24,14 @@ #include <net/netfilter/nf_conntrack_timestamp.h> #include <linux/rculist_nulls.h> -MODULE_LICENSE("GPL"); +unsigned int nf_conntrack_net_id __read_mostly; #ifdef CONFIG_NF_CONNTRACK_PROCFS void print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto) { - switch (l3proto->l3proto) { + switch (tuple->src.l3num) { case NFPROTO_IPV4: seq_printf(s, "src=%pI4 dst=%pI4 ", &tuple->src.u3.ip, &tuple->dst.u3.ip); @@ -282,7 +272,6 @@ static int ct_seq_show(struct seq_file *s, void *v) { struct nf_conntrack_tuple_hash *hash = v; struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash); - const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; struct net *net = seq_file_net(s); int ret = 0; @@ -303,14 +292,12 @@ static int ct_seq_show(struct seq_file *s, void *v) if (!net_eq(nf_ct_net(ct), net)) goto release; - l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); - WARN_ON(!l3proto); l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); WARN_ON(!l4proto); ret = -ENOSPC; seq_printf(s, "%-8s %u %-8s %u ", - l3proto_name(l3proto->l3proto), nf_ct_l3num(ct), + 
l3proto_name(nf_ct_l3num(ct)), nf_ct_l3num(ct), l4proto_name(l4proto->l4proto), nf_ct_protonum(ct)); if (!test_bit(IPS_OFFLOAD_BIT, &ct->status)) @@ -320,7 +307,7 @@ static int ct_seq_show(struct seq_file *s, void *v) l4proto->print_conntrack(s, ct); print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - l3proto, l4proto); + l4proto); ct_show_zone(s, ct, NF_CT_ZONE_DIR_ORIG); @@ -333,8 +320,7 @@ static int ct_seq_show(struct seq_file *s, void *v) if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status))) seq_puts(s, "[UNREPLIED] "); - print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, - l3proto, l4proto); + print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, l4proto); ct_show_zone(s, ct, NF_CT_ZONE_DIR_REPL); @@ -680,6 +666,8 @@ static void nf_conntrack_pernet_exit(struct list_head *net_exit_list) static struct pernet_operations nf_conntrack_net_ops = { .init = nf_conntrack_pernet_init, .exit_batch = nf_conntrack_pernet_exit, + .id = &nf_conntrack_net_id, + .size = sizeof(struct nf_conntrack_net), }; static int __init nf_conntrack_standalone_init(void) diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index eb0d1658ac05..d8125616edc7 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -107,11 +107,12 @@ static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) tcp->seen[1].td_maxwin = 0; } +#define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT (120 * HZ) +#define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT (30 * HZ) + static void flow_offload_fixup_ct_state(struct nf_conn *ct) { const struct nf_conntrack_l4proto *l4proto; - struct net *net = nf_ct_net(ct); - unsigned int *timeouts; unsigned int timeout; int l4num; @@ -123,14 +124,10 @@ static void flow_offload_fixup_ct_state(struct nf_conn *ct) if (!l4proto) return; - timeouts = l4proto->get_timeouts(net); - if (!timeouts) - return; - if (l4num == IPPROTO_TCP) - timeout = timeouts[TCP_CONNTRACK_ESTABLISHED]; + timeout = NF_FLOWTABLE_TCP_PICKUP_TIMEOUT; else if (l4num 
== IPPROTO_UDP) - timeout = timeouts[UDP_CT_REPLIED]; + timeout = NF_FLOWTABLE_UDP_PICKUP_TIMEOUT; else return; diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 426457047578..a61d6df6e5f6 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -424,6 +424,10 @@ static int nf_log_proc_dostring(struct ctl_table *table, int write, if (write) { struct ctl_table tmp = *table; + /* proc_dostring() can append to existing strings, so we need to + * initialize it as an empty string. + */ + buf[0] = '\0'; tmp.data = buf; r = proc_dostring(&tmp, write, buffer, lenp, ppos); if (r) @@ -442,14 +446,17 @@ static int nf_log_proc_dostring(struct ctl_table *table, int write, rcu_assign_pointer(net->nf.nf_loggers[tindex], logger); mutex_unlock(&nf_log_mutex); } else { + struct ctl_table tmp = *table; + + tmp.data = buf; mutex_lock(&nf_log_mutex); logger = nft_log_dereference(net->nf.nf_loggers[tindex]); if (!logger) - table->data = "NONE"; + strlcpy(buf, "NONE", sizeof(buf)); else - table->data = logger->name; - r = proc_dostring(table, write, buffer, lenp, ppos); + strlcpy(buf, logger->name, sizeof(buf)); mutex_unlock(&nf_log_mutex); + r = proc_dostring(&tmp, write, buffer, lenp, ppos); } return r; diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c index dc61399e30be..a8c5c846aec1 100644 --- a/net/netfilter/nf_log_common.c +++ b/net/netfilter/nf_log_common.c @@ -132,9 +132,10 @@ int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb, } EXPORT_SYMBOL_GPL(nf_log_dump_tcp_header); -void nf_log_dump_sk_uid_gid(struct nf_log_buf *m, struct sock *sk) +void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m, + struct sock *sk) { - if (!sk || !sk_fullsock(sk)) + if (!sk || !sk_fullsock(sk) || !net_eq(net, sock_net(sk))) return; read_lock_bh(&sk->sk_callback_lock); diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 46f9df99d276..6366f0c0b8c1 100644 --- a/net/netfilter/nf_nat_core.c 
+++ b/net/netfilter/nf_nat_core.c @@ -28,7 +28,6 @@ #include <net/netfilter/nf_nat_helper.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_seqadj.h> -#include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_zones.h> #include <linux/netfilter/nf_nat.h> @@ -108,6 +107,7 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) struct flowi fl; unsigned int hh_len; struct dst_entry *dst; + struct sock *sk = skb->sk; int err; err = xfrm_decode_session(skb, &fl, family); @@ -119,7 +119,10 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) dst = ((struct xfrm_dst *)dst)->route; dst_hold(dst); - dst = xfrm_lookup(net, dst, &fl, skb->sk, 0); + if (sk && !net_eq(net, sock_net(sk))) + sk = NULL; + + dst = xfrm_lookup(net, dst, &fl, sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); @@ -739,12 +742,6 @@ EXPORT_SYMBOL_GPL(nf_nat_l4proto_unregister); int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto) { - int err; - - err = nf_ct_l3proto_try_module_get(l3proto->l3proto); - if (err < 0) - return err; - mutex_lock(&nf_nat_proto_mutex); RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_TCP], &nf_nat_l4proto_tcp); @@ -777,7 +774,6 @@ void nf_nat_l3proto_unregister(const struct nf_nat_l3proto *l3proto) synchronize_rcu(); nf_nat_l3proto_clean(l3proto->l3proto); - nf_ct_l3proto_module_put(l3proto->l3proto); } EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister); diff --git a/net/netfilter/nf_osf.c b/net/netfilter/nf_osf.c index 5ba5c7bef2f9..b44d62d5d9a9 100644 --- a/net/netfilter/nf_osf.c +++ b/net/netfilter/nf_osf.c @@ -21,15 +21,14 @@ #include <linux/netfilter/nf_osf.h> static inline int nf_osf_ttl(const struct sk_buff *skb, - const struct nf_osf_info *info, - unsigned char f_ttl) + int ttl_check, unsigned char f_ttl) { const struct iphdr *ip = ip_hdr(skb); - if (info->flags & NF_OSF_TTL) { - if (info->ttl == NF_OSF_TTL_TRUE) + if (ttl_check != -1) 
{ + if (ttl_check == NF_OSF_TTL_TRUE) return ip->ttl == f_ttl; - if (info->ttl == NF_OSF_TTL_NOCHECK) + if (ttl_check == NF_OSF_TTL_NOCHECK) return 1; else if (ip->ttl <= f_ttl) return 1; @@ -52,140 +51,175 @@ static inline int nf_osf_ttl(const struct sk_buff *skb, return ip->ttl == f_ttl; } -bool -nf_osf_match(const struct sk_buff *skb, u_int8_t family, - int hooknum, struct net_device *in, struct net_device *out, - const struct nf_osf_info *info, struct net *net, - const struct list_head *nf_osf_fingers) +struct nf_osf_hdr_ctx { + bool df; + u16 window; + u16 totlen; + const unsigned char *optp; + unsigned int optsize; +}; + +static bool nf_osf_match_one(const struct sk_buff *skb, + const struct nf_osf_user_finger *f, + int ttl_check, + struct nf_osf_hdr_ctx *ctx) { - const unsigned char *optp = NULL, *_optp = NULL; - unsigned int optsize = 0, check_WSS = 0; - int fmatch = FMATCH_WRONG, fcount = 0; - const struct iphdr *ip = ip_hdr(skb); - const struct nf_osf_user_finger *f; - unsigned char opts[MAX_IPOPTLEN]; - const struct nf_osf_finger *kf; - u16 window, totlen, mss = 0; - const struct tcphdr *tcp; - struct tcphdr _tcph; - bool df; + unsigned int check_WSS = 0; + int fmatch = FMATCH_WRONG; + int foptsize, optnum; + u16 mss = 0; - tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); - if (!tcp) + if (ctx->totlen != f->ss || !nf_osf_ttl(skb, ttl_check, f->ttl)) return false; - if (!tcp->syn) + /* + * Should not happen if userspace parser was written correctly. 
+ */ + if (f->wss.wc >= OSF_WSS_MAX) return false; - totlen = ntohs(ip->tot_len); - df = ntohs(ip->frag_off) & IP_DF; - window = ntohs(tcp->window); + /* Check options */ - if (tcp->doff * 4 > sizeof(struct tcphdr)) { - optsize = tcp->doff * 4 - sizeof(struct tcphdr); + foptsize = 0; + for (optnum = 0; optnum < f->opt_num; ++optnum) + foptsize += f->opt[optnum].length; - _optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) + - sizeof(struct tcphdr), optsize, opts); - } + if (foptsize > MAX_IPOPTLEN || + ctx->optsize > MAX_IPOPTLEN || + ctx->optsize != foptsize) + return false; - list_for_each_entry_rcu(kf, &nf_osf_fingers[df], finger_entry) { - int foptsize, optnum; + check_WSS = f->wss.wc; - f = &kf->finger; + for (optnum = 0; optnum < f->opt_num; ++optnum) { + if (f->opt[optnum].kind == *ctx->optp) { + __u32 len = f->opt[optnum].length; + const __u8 *optend = ctx->optp + len; - if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre)) - continue; + fmatch = FMATCH_OK; + + switch (*ctx->optp) { + case OSFOPT_MSS: + mss = ctx->optp[3]; + mss <<= 8; + mss |= ctx->optp[2]; + + mss = ntohs((__force __be16)mss); + break; + case OSFOPT_TS: + break; + } + + ctx->optp = optend; + } else + fmatch = FMATCH_OPT_WRONG; + + if (fmatch != FMATCH_OK) + break; + } - optp = _optp; + if (fmatch != FMATCH_OPT_WRONG) { fmatch = FMATCH_WRONG; - if (totlen != f->ss || !nf_osf_ttl(skb, info, f->ttl)) - continue; + switch (check_WSS) { + case OSF_WSS_PLAIN: + if (f->wss.val == 0 || ctx->window == f->wss.val) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MSS: + /* + * Some smart modems decrease mangle MSS to + * SMART_MSS_2, so we check standard, decreased + * and the one provided in the fingerprint MSS + * values. 
+ */ +#define SMART_MSS_1 1460 +#define SMART_MSS_2 1448 + if (ctx->window == f->wss.val * mss || + ctx->window == f->wss.val * SMART_MSS_1 || + ctx->window == f->wss.val * SMART_MSS_2) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MTU: + if (ctx->window == f->wss.val * (mss + 40) || + ctx->window == f->wss.val * (SMART_MSS_1 + 40) || + ctx->window == f->wss.val * (SMART_MSS_2 + 40)) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MODULO: + if ((ctx->window % f->wss.val) == 0) + fmatch = FMATCH_OK; + break; + } + } - /* - * Should not happen if userspace parser was written correctly. - */ - if (f->wss.wc >= OSF_WSS_MAX) - continue; + return fmatch == FMATCH_OK; +} - /* Check options */ +static const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx, + const struct sk_buff *skb, + const struct iphdr *ip, + unsigned char *opts) +{ + const struct tcphdr *tcp; + struct tcphdr _tcph; - foptsize = 0; - for (optnum = 0; optnum < f->opt_num; ++optnum) - foptsize += f->opt[optnum].length; + tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); + if (!tcp) + return NULL; - if (foptsize > MAX_IPOPTLEN || - optsize > MAX_IPOPTLEN || - optsize != foptsize) - continue; + if (!tcp->syn) + return NULL; - check_WSS = f->wss.wc; + ctx->totlen = ntohs(ip->tot_len); + ctx->df = ntohs(ip->frag_off) & IP_DF; + ctx->window = ntohs(tcp->window); - for (optnum = 0; optnum < f->opt_num; ++optnum) { - if (f->opt[optnum].kind == (*optp)) { - __u32 len = f->opt[optnum].length; - const __u8 *optend = optp + len; + if (tcp->doff * 4 > sizeof(struct tcphdr)) { + ctx->optsize = tcp->doff * 4 - sizeof(struct tcphdr); - fmatch = FMATCH_OK; + ctx->optp = skb_header_pointer(skb, ip_hdrlen(skb) + + sizeof(struct tcphdr), ctx->optsize, opts); + } - switch (*optp) { - case OSFOPT_MSS: - mss = optp[3]; - mss <<= 8; - mss |= optp[2]; + return tcp; +} - mss = ntohs((__force __be16)mss); - break; - case OSFOPT_TS: - break; - } +bool +nf_osf_match(const struct sk_buff *skb, 
u_int8_t family, + int hooknum, struct net_device *in, struct net_device *out, + const struct nf_osf_info *info, struct net *net, + const struct list_head *nf_osf_fingers) +{ + const struct iphdr *ip = ip_hdr(skb); + const struct nf_osf_user_finger *f; + unsigned char opts[MAX_IPOPTLEN]; + const struct nf_osf_finger *kf; + int fcount = 0, ttl_check; + int fmatch = FMATCH_WRONG; + struct nf_osf_hdr_ctx ctx; + const struct tcphdr *tcp; - optp = optend; - } else - fmatch = FMATCH_OPT_WRONG; + memset(&ctx, 0, sizeof(ctx)); - if (fmatch != FMATCH_OK) - break; - } + tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts); + if (!tcp) + return false; - if (fmatch != FMATCH_OPT_WRONG) { - fmatch = FMATCH_WRONG; + ttl_check = (info->flags & NF_OSF_TTL) ? info->ttl : -1; - switch (check_WSS) { - case OSF_WSS_PLAIN: - if (f->wss.val == 0 || window == f->wss.val) - fmatch = FMATCH_OK; - break; - case OSF_WSS_MSS: - /* - * Some smart modems decrease mangle MSS to - * SMART_MSS_2, so we check standard, decreased - * and the one provided in the fingerprint MSS - * values. 
- */ -#define SMART_MSS_1 1460 -#define SMART_MSS_2 1448 - if (window == f->wss.val * mss || - window == f->wss.val * SMART_MSS_1 || - window == f->wss.val * SMART_MSS_2) - fmatch = FMATCH_OK; - break; - case OSF_WSS_MTU: - if (window == f->wss.val * (mss + 40) || - window == f->wss.val * (SMART_MSS_1 + 40) || - window == f->wss.val * (SMART_MSS_2 + 40)) - fmatch = FMATCH_OK; - break; - case OSF_WSS_MODULO: - if ((window % f->wss.val) == 0) - fmatch = FMATCH_OK; - break; - } - } + list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { - if (fmatch != FMATCH_OK) + f = &kf->finger; + + if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre)) + continue; + + if (!nf_osf_match_one(skb, f, ttl_check, &ctx)) continue; + fmatch = FMATCH_OK; + fcount++; if (info->flags & NF_OSF_LOG) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3f211e1025c1..f18085639807 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -76,6 +76,7 @@ static void nft_ctx_init(struct nft_ctx *ctx, { ctx->net = net; ctx->family = family; + ctx->level = 0; ctx->table = table; ctx->chain = chain; ctx->nla = nla; @@ -455,20 +456,59 @@ __nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family) return NULL; } +/* + * Loading a module requires dropping mutex that guards the + * transaction. + * We first need to abort any pending transactions as once + * mutex is unlocked a different client could start a new + * transaction. It must not see any 'future generation' + * changes * as these changes will never happen. + */ +#ifdef CONFIG_MODULES +static int __nf_tables_abort(struct net *net); + +static void nft_request_module(struct net *net, const char *fmt, ...) 
+{ + char module_name[MODULE_NAME_LEN]; + va_list args; + int ret; + + __nf_tables_abort(net); + + va_start(args, fmt); + ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); + va_end(args); + if (WARN(ret >= MODULE_NAME_LEN, "truncated: '%s' (len %d)", module_name, ret)) + return; + + mutex_unlock(&net->nft.commit_mutex); + request_module("%s", module_name); + mutex_lock(&net->nft.commit_mutex); +} +#endif + +static void lockdep_nfnl_nft_mutex_not_held(void) +{ +#ifdef CONFIG_PROVE_LOCKING + WARN_ON_ONCE(lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); +#endif +} + static const struct nft_chain_type * -nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family, bool autoload) +nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, + u8 family, bool autoload) { const struct nft_chain_type *type; type = __nf_tables_chain_type_lookup(nla, family); if (type != NULL) return type; + + lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (autoload) { - nfnl_unlock(NFNL_SUBSYS_NFTABLES); - request_module("nft-chain-%u-%.*s", family, - nla_len(nla), (const char *)nla_data(nla)); - nfnl_lock(NFNL_SUBSYS_NFTABLES); + nft_request_module(net, "nft-chain-%u-%.*s", family, + nla_len(nla), (const char *)nla_data(nla)); type = __nf_tables_chain_type_lookup(nla, family); if (type != NULL) return ERR_PTR(-EAGAIN); @@ -772,6 +812,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, struct nft_ctx ctx; int err; + lockdep_assert_held(&net->nft.commit_mutex); attr = nla[NFTA_TABLE_NAME]; table = nft_table_lookup(net, attr, family, genmask); if (IS_ERR(table)) { @@ -1012,7 +1053,17 @@ nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask) return ERR_PTR(-ENOENT); } -static struct nft_chain *nft_chain_lookup(struct nft_table *table, +static bool lockdep_commit_lock_is_held(struct net *net) +{ +#ifdef CONFIG_PROVE_LOCKING + return lockdep_is_held(&net->nft.commit_mutex); +#else + return true; +#endif +} + +static 
struct nft_chain *nft_chain_lookup(struct net *net, + struct nft_table *table, const struct nlattr *nla, u8 genmask) { char search[NFT_CHAIN_MAXNAMELEN + 1]; @@ -1025,7 +1076,7 @@ static struct nft_chain *nft_chain_lookup(struct nft_table *table, nla_strlcpy(search, nla, sizeof(search)); WARN_ON(!rcu_read_lock_held() && - !lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); + !lockdep_commit_lock_is_held(net)); chain = ERR_PTR(-ENOENT); rcu_read_lock(); @@ -1265,7 +1316,7 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, return PTR_ERR(table); } - chain = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); + chain = nft_chain_lookup(net, table, nla[NFTA_CHAIN_NAME], genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); return PTR_ERR(chain); @@ -1398,6 +1449,9 @@ static int nft_chain_parse_hook(struct net *net, struct net_device *dev; int err; + lockdep_assert_held(&net->nft.commit_mutex); + lockdep_nfnl_nft_mutex_not_held(); + err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK], nft_hook_policy, NULL); if (err < 0) @@ -1412,7 +1466,7 @@ static int nft_chain_parse_hook(struct net *net, type = chain_type[family][NFT_CHAIN_T_DEFAULT]; if (nla[NFTA_CHAIN_TYPE]) { - type = nf_tables_chain_type_lookup(nla[NFTA_CHAIN_TYPE], + type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE], family, create); if (IS_ERR(type)) return PTR_ERR(type); @@ -1598,7 +1652,6 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, struct nft_base_chain *basechain; struct nft_stats *stats = NULL; struct nft_chain_hook hook; - const struct nlattr *name; struct nf_hook_ops *ops; struct nft_trans *trans; int err; @@ -1632,7 +1685,8 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, nla[NFTA_CHAIN_NAME]) { struct nft_chain *chain2; - chain2 = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); + chain2 = nft_chain_lookup(ctx->net, table, + nla[NFTA_CHAIN_NAME], genmask); if 
(!IS_ERR(chain2)) return -EEXIST; } @@ -1646,12 +1700,11 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, return PTR_ERR(stats); } + err = -ENOMEM; trans = nft_trans_alloc(ctx, NFT_MSG_NEWCHAIN, sizeof(struct nft_trans_chain)); - if (trans == NULL) { - free_percpu(stats); - return -ENOMEM; - } + if (trans == NULL) + goto err; nft_trans_chain_stats(trans) = stats; nft_trans_chain_update(trans) = true; @@ -1661,19 +1714,37 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, else nft_trans_chain_policy(trans) = -1; - name = nla[NFTA_CHAIN_NAME]; - if (nla[NFTA_CHAIN_HANDLE] && name) { - nft_trans_chain_name(trans) = - nla_strdup(name, GFP_KERNEL); - if (!nft_trans_chain_name(trans)) { - kfree(trans); - free_percpu(stats); - return -ENOMEM; + if (nla[NFTA_CHAIN_HANDLE] && + nla[NFTA_CHAIN_NAME]) { + struct nft_trans *tmp; + char *name; + + err = -ENOMEM; + name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL); + if (!name) + goto err; + + err = -EEXIST; + list_for_each_entry(tmp, &ctx->net->nft.commit_list, list) { + if (tmp->msg_type == NFT_MSG_NEWCHAIN && + tmp->ctx.table == table && + nft_trans_chain_update(tmp) && + nft_trans_chain_name(tmp) && + strcmp(name, nft_trans_chain_name(tmp)) == 0) { + kfree(name); + goto err; + } } + + nft_trans_chain_name(trans) = name; } list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; +err: + free_percpu(stats); + kfree(trans); + return err; } static int nf_tables_newchain(struct net *net, struct sock *nlsk, @@ -1694,6 +1765,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, create = nlh->nlmsg_flags & NLM_F_CREATE ? 
true : false; + lockdep_assert_held(&net->nft.commit_mutex); + table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]); @@ -1712,7 +1785,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, } attr = nla[NFTA_CHAIN_HANDLE]; } else { - chain = nft_chain_lookup(table, attr, genmask); + chain = nft_chain_lookup(net, table, attr, genmask); if (IS_ERR(chain)) { if (PTR_ERR(chain) != -ENOENT) { NL_SET_BAD_ATTR(extack, attr); @@ -1790,7 +1863,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, chain = nft_chain_lookup_byhandle(table, handle, genmask); } else { attr = nla[NFTA_CHAIN_NAME]; - chain = nft_chain_lookup(table, attr, genmask); + chain = nft_chain_lookup(net, table, attr, genmask); } if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, attr); @@ -1875,7 +1948,8 @@ static const struct nft_expr_type *__nft_expr_type_get(u8 family, return NULL; } -static const struct nft_expr_type *nft_expr_type_get(u8 family, +static const struct nft_expr_type *nft_expr_type_get(struct net *net, + u8 family, struct nlattr *nla) { const struct nft_expr_type *type; @@ -1887,19 +1961,16 @@ static const struct nft_expr_type *nft_expr_type_get(u8 family, if (type != NULL && try_module_get(type->owner)) return type; + lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { - nfnl_unlock(NFNL_SUBSYS_NFTABLES); - request_module("nft-expr-%u-%.*s", family, - nla_len(nla), (char *)nla_data(nla)); - nfnl_lock(NFNL_SUBSYS_NFTABLES); + nft_request_module(net, "nft-expr-%u-%.*s", family, + nla_len(nla), (char *)nla_data(nla)); if (__nft_expr_type_get(family, nla)) return ERR_PTR(-EAGAIN); - nfnl_unlock(NFNL_SUBSYS_NFTABLES); - request_module("nft-expr-%.*s", - nla_len(nla), (char *)nla_data(nla)); - nfnl_lock(NFNL_SUBSYS_NFTABLES); + nft_request_module(net, "nft-expr-%.*s", + nla_len(nla), (char *)nla_data(nla)); if (__nft_expr_type_get(family, nla)) return 
ERR_PTR(-EAGAIN); } @@ -1968,7 +2039,7 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx, if (err < 0) return err; - type = nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]); + type = nft_expr_type_get(ctx->net, ctx->family, tb[NFTA_EXPR_NAME]); if (IS_ERR(type)) return PTR_ERR(type); @@ -2255,6 +2326,39 @@ done: return skb->len; } +static int nf_tables_dump_rules_start(struct netlink_callback *cb) +{ + const struct nlattr * const *nla = cb->data; + struct nft_rule_dump_ctx *ctx = NULL; + + if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) { + ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC); + if (!ctx) + return -ENOMEM; + + if (nla[NFTA_RULE_TABLE]) { + ctx->table = nla_strdup(nla[NFTA_RULE_TABLE], + GFP_ATOMIC); + if (!ctx->table) { + kfree(ctx); + return -ENOMEM; + } + } + if (nla[NFTA_RULE_CHAIN]) { + ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN], + GFP_ATOMIC); + if (!ctx->chain) { + kfree(ctx->table); + kfree(ctx); + return -ENOMEM; + } + } + } + + cb->data = ctx; + return 0; +} + static int nf_tables_dump_rules_done(struct netlink_callback *cb) { struct nft_rule_dump_ctx *ctx = cb->data; @@ -2284,38 +2388,13 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { + .start= nf_tables_dump_rules_start, .dump = nf_tables_dump_rules, .done = nf_tables_dump_rules_done, .module = THIS_MODULE, + .data = (void *)nla, }; - if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) { - struct nft_rule_dump_ctx *ctx; - - ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC); - if (!ctx) - return -ENOMEM; - - if (nla[NFTA_RULE_TABLE]) { - ctx->table = nla_strdup(nla[NFTA_RULE_TABLE], - GFP_ATOMIC); - if (!ctx->table) { - kfree(ctx); - return -ENOMEM; - } - } - if (nla[NFTA_RULE_CHAIN]) { - ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN], - GFP_ATOMIC); - if (!ctx->chain) { - kfree(ctx->table); - kfree(ctx); - return -ENOMEM; - } - } - c.data = ctx; - } - return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); } 
@@ -2325,7 +2404,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, return PTR_ERR(table); } - chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); + chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); @@ -2359,6 +2438,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, { struct nft_expr *expr; + lockdep_assert_held(&ctx->net->nft.commit_mutex); /* * Careful: some expressions might not be initialized in case this * is called on error from nf_tables_newrule(). @@ -2385,6 +2465,9 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) struct nft_rule *rule; int err; + if (ctx->level == NFT_JUMP_STACK_SIZE) + return -EMLINK; + list_for_each_entry(rule, &chain->rules, list) { if (!nft_is_active_next(ctx->net, rule)) continue; @@ -2427,8 +2510,6 @@ static int nft_table_validate(struct net *net, const struct nft_table *table) #define NFT_RULE_MAXEXPRS 128 -static struct nft_expr_info *info; - static int nf_tables_newrule(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -2436,6 +2517,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); + struct nft_expr_info *info = NULL; int family = nfmsg->nfgen_family; struct nft_table *table; struct nft_chain *chain; @@ -2450,6 +2532,8 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, bool create; u64 handle, pos_handle; + lockdep_assert_held(&net->nft.commit_mutex); + create = nlh->nlmsg_flags & NLM_F_CREATE ? 
true : false; table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); @@ -2458,7 +2542,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, return PTR_ERR(table); } - chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); + chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); @@ -2506,6 +2590,12 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, n = 0; size = 0; if (nla[NFTA_RULE_EXPRESSIONS]) { + info = kvmalloc_array(NFT_RULE_MAXEXPRS, + sizeof(struct nft_expr_info), + GFP_KERNEL); + if (!info) + return -ENOMEM; + nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) { err = -EINVAL; if (nla_type(tmp) != NFTA_LIST_ELEM) @@ -2598,6 +2688,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, list_add_rcu(&rule->list, &chain->rules); } } + kvfree(info); chain->use++; if (net->nft.validate_state == NFT_VALIDATE_DO) @@ -2611,6 +2702,7 @@ err1: if (info[i].ops != NULL) module_put(info[i].ops->type->owner); } + kvfree(info); return err; } @@ -2650,7 +2742,8 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, } if (nla[NFTA_RULE_CHAIN]) { - chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); + chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], + genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); @@ -2742,11 +2835,11 @@ nft_select_set_ops(const struct nft_ctx *ctx, const struct nft_set_type *type; u32 flags = 0; + lockdep_assert_held(&ctx->net->nft.commit_mutex); + lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (list_empty(&nf_tables_set_types)) { - nfnl_unlock(NFNL_SUBSYS_NFTABLES); - request_module("nft-set"); - nfnl_lock(NFNL_SUBSYS_NFTABLES); + nft_request_module(ctx->net, "nft-set"); if (!list_empty(&nf_tables_set_types)) return ERR_PTR(-EAGAIN); } @@ -3162,6 +3255,18 @@ done: return 
skb->len; } +static int nf_tables_dump_sets_start(struct netlink_callback *cb) +{ + struct nft_ctx *ctx_dump = NULL; + + ctx_dump = kmemdup(cb->data, sizeof(*ctx_dump), GFP_ATOMIC); + if (ctx_dump == NULL) + return -ENOMEM; + + cb->data = ctx_dump; + return 0; +} + static int nf_tables_dump_sets_done(struct netlink_callback *cb) { kfree(cb->data); @@ -3189,18 +3294,12 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { + .start = nf_tables_dump_sets_start, .dump = nf_tables_dump_sets, .done = nf_tables_dump_sets_done, + .data = &ctx, .module = THIS_MODULE, }; - struct nft_ctx *ctx_dump; - - ctx_dump = kmalloc(sizeof(*ctx_dump), GFP_ATOMIC); - if (ctx_dump == NULL) - return -ENOMEM; - - *ctx_dump = ctx; - c.data = ctx_dump; return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); } @@ -3850,6 +3949,15 @@ nla_put_failure: return -ENOSPC; } +static int nf_tables_dump_set_start(struct netlink_callback *cb) +{ + struct nft_set_dump_ctx *dump_ctx = cb->data; + + cb->data = kmemdup(dump_ctx, sizeof(*dump_ctx), GFP_ATOMIC); + + return cb->data ? 
0 : -ENOMEM; +} + static int nf_tables_dump_set_done(struct netlink_callback *cb) { kfree(cb->data); @@ -4003,20 +4111,17 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { + .start = nf_tables_dump_set_start, .dump = nf_tables_dump_set, .done = nf_tables_dump_set_done, .module = THIS_MODULE, }; - struct nft_set_dump_ctx *dump_ctx; - - dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_ATOMIC); - if (!dump_ctx) - return -ENOMEM; - - dump_ctx->set = set; - dump_ctx->ctx = ctx; + struct nft_set_dump_ctx dump_ctx = { + .set = set, + .ctx = ctx, + }; - c.data = dump_ctx; + c.data = &dump_ctx; return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); } @@ -4779,7 +4884,8 @@ static const struct nft_object_type *__nft_obj_type_get(u32 objtype) return NULL; } -static const struct nft_object_type *nft_obj_type_get(u32 objtype) +static const struct nft_object_type * +nft_obj_type_get(struct net *net, u32 objtype) { const struct nft_object_type *type; @@ -4787,11 +4893,10 @@ static const struct nft_object_type *nft_obj_type_get(u32 objtype) if (type != NULL && try_module_get(type->owner)) return type; + lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { - nfnl_unlock(NFNL_SUBSYS_NFTABLES); - request_module("nft-obj-%u", objtype); - nfnl_lock(NFNL_SUBSYS_NFTABLES); + nft_request_module(net, "nft-obj-%u", objtype); if (__nft_obj_type_get(objtype)) return ERR_PTR(-EAGAIN); } @@ -4843,7 +4948,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); - type = nft_obj_type_get(objtype); + type = nft_obj_type_get(net, objtype); if (IS_ERR(type)) return PTR_ERR(type); @@ -4976,38 +5081,42 @@ done: return skb->len; } -static int nf_tables_dump_obj_done(struct netlink_callback *cb) +static int nf_tables_dump_obj_start(struct netlink_callback *cb) { - struct nft_obj_filter *filter = cb->data; + const struct 
nlattr * const *nla = cb->data; + struct nft_obj_filter *filter = NULL; - if (filter) { - kfree(filter->table); - kfree(filter); + if (nla[NFTA_OBJ_TABLE] || nla[NFTA_OBJ_TYPE]) { + filter = kzalloc(sizeof(*filter), GFP_ATOMIC); + if (!filter) + return -ENOMEM; + + if (nla[NFTA_OBJ_TABLE]) { + filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC); + if (!filter->table) { + kfree(filter); + return -ENOMEM; + } + } + + if (nla[NFTA_OBJ_TYPE]) + filter->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); } + cb->data = filter; return 0; } -static struct nft_obj_filter * -nft_obj_filter_alloc(const struct nlattr * const nla[]) +static int nf_tables_dump_obj_done(struct netlink_callback *cb) { - struct nft_obj_filter *filter; - - filter = kzalloc(sizeof(*filter), GFP_ATOMIC); - if (!filter) - return ERR_PTR(-ENOMEM); + struct nft_obj_filter *filter = cb->data; - if (nla[NFTA_OBJ_TABLE]) { - filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC); - if (!filter->table) { - kfree(filter); - return ERR_PTR(-ENOMEM); - } + if (filter) { + kfree(filter->table); + kfree(filter); } - if (nla[NFTA_OBJ_TYPE]) - filter->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); - return filter; + return 0; } /* called with rcu_read_lock held */ @@ -5028,21 +5137,13 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { + .start = nf_tables_dump_obj_start, .dump = nf_tables_dump_obj, .done = nf_tables_dump_obj_done, .module = THIS_MODULE, + .data = (void *)nla, }; - if (nla[NFTA_OBJ_TABLE] || - nla[NFTA_OBJ_TYPE]) { - struct nft_obj_filter *filter; - - filter = nft_obj_filter_alloc(nla); - if (IS_ERR(filter)) - return -ENOMEM; - - c.data = filter; - } return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); } @@ -5321,8 +5422,6 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, flowtable->ops[i].priv = &flowtable->data; flowtable->ops[i].hook = flowtable->data.type->hook; 
flowtable->ops[i].dev = dev_array[i]; - flowtable->dev_name[i] = kstrdup(dev_array[i]->name, - GFP_KERNEL); } return err; @@ -5339,7 +5438,8 @@ static const struct nf_flowtable_type *__nft_flowtable_type_get(u8 family) return NULL; } -static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family) +static const struct nf_flowtable_type * +nft_flowtable_type_get(struct net *net, u8 family) { const struct nf_flowtable_type *type; @@ -5347,11 +5447,10 @@ static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family) if (type != NULL && try_module_get(type->owner)) return type; + lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { - nfnl_unlock(NFNL_SUBSYS_NFTABLES); - request_module("nf-flowtable-%u", family); - nfnl_lock(NFNL_SUBSYS_NFTABLES); + nft_request_module(net, "nf-flowtable-%u", family); if (__nft_flowtable_type_get(family)) return ERR_PTR(-EAGAIN); } @@ -5431,7 +5530,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, goto err1; } - type = nft_flowtable_type_get(family); + type = nft_flowtable_type_get(net, family); if (IS_ERR(type)) { err = PTR_ERR(type); goto err2; @@ -5480,10 +5579,8 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, err6: i = flowtable->ops_len; err5: - for (k = i - 1; k >= 0; k--) { - kfree(flowtable->dev_name[k]); + for (k = i - 1; k >= 0; k--) nf_unregister_net_hook(net, &flowtable->ops[k]); - } kfree(flowtable->ops); err4: @@ -5582,9 +5679,10 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, goto nla_put_failure; for (i = 0; i < flowtable->ops_len; i++) { - if (flowtable->dev_name[i][0] && - nla_put_string(skb, NFTA_DEVICE_NAME, - flowtable->dev_name[i])) + const struct net_device *dev = READ_ONCE(flowtable->ops[i].dev); + + if (dev && + nla_put_string(skb, NFTA_DEVICE_NAME, dev->name)) goto nla_put_failure; } nla_nest_end(skb, nest_devs); @@ -5651,37 +5749,39 @@ done: return skb->len; } -static int 
nf_tables_dump_flowtable_done(struct netlink_callback *cb) +static int nf_tables_dump_flowtable_start(struct netlink_callback *cb) { - struct nft_flowtable_filter *filter = cb->data; + const struct nlattr * const *nla = cb->data; + struct nft_flowtable_filter *filter = NULL; - if (!filter) - return 0; + if (nla[NFTA_FLOWTABLE_TABLE]) { + filter = kzalloc(sizeof(*filter), GFP_ATOMIC); + if (!filter) + return -ENOMEM; - kfree(filter->table); - kfree(filter); + filter->table = nla_strdup(nla[NFTA_FLOWTABLE_TABLE], + GFP_ATOMIC); + if (!filter->table) { + kfree(filter); + return -ENOMEM; + } + } + cb->data = filter; return 0; } -static struct nft_flowtable_filter * -nft_flowtable_filter_alloc(const struct nlattr * const nla[]) +static int nf_tables_dump_flowtable_done(struct netlink_callback *cb) { - struct nft_flowtable_filter *filter; + struct nft_flowtable_filter *filter = cb->data; - filter = kzalloc(sizeof(*filter), GFP_ATOMIC); if (!filter) - return ERR_PTR(-ENOMEM); + return 0; - if (nla[NFTA_FLOWTABLE_TABLE]) { - filter->table = nla_strdup(nla[NFTA_FLOWTABLE_TABLE], - GFP_ATOMIC); - if (!filter->table) { - kfree(filter); - return ERR_PTR(-ENOMEM); - } - } - return filter; + kfree(filter->table); + kfree(filter); + + return 0; } /* called with rcu_read_lock held */ @@ -5701,20 +5801,13 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { + .start = nf_tables_dump_flowtable_start, .dump = nf_tables_dump_flowtable, .done = nf_tables_dump_flowtable_done, .module = THIS_MODULE, + .data = (void *)nla, }; - if (nla[NFTA_FLOWTABLE_TABLE]) { - struct nft_flowtable_filter *filter; - - filter = nft_flowtable_filter_alloc(nla); - if (IS_ERR(filter)) - return -ENOMEM; - - c.data = filter; - } return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); } @@ -5784,6 +5877,7 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) kfree(flowtable->name); 
flowtable->data.type->free(&flowtable->data); module_put(flowtable->data.type->owner); + kfree(flowtable); } static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, @@ -5826,7 +5920,6 @@ static void nft_flowtable_event(unsigned long event, struct net_device *dev, continue; nf_unregister_net_hook(dev_net(dev), &flowtable->ops[i]); - flowtable->dev_name[i][0] = '\0'; flowtable->ops[i].dev = NULL; break; } @@ -6087,6 +6180,9 @@ static void nft_commit_release(struct nft_trans *trans) case NFT_MSG_DELTABLE: nf_tables_table_destroy(&trans->ctx); break; + case NFT_MSG_NEWCHAIN: + kfree(nft_trans_chain_name(trans)); + break; case NFT_MSG_DELCHAIN: nf_tables_chain_destroy(&trans->ctx); break; @@ -6202,9 +6298,9 @@ static void nf_tables_commit_chain_active(struct net *net, struct nft_chain *cha next_genbit = nft_gencursor_next(net); g0 = rcu_dereference_protected(chain->rules_gen_0, - lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); + lockdep_commit_lock_is_held(net)); g1 = rcu_dereference_protected(chain->rules_gen_1, - lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); + lockdep_commit_lock_is_held(net)); /* No changes to this chain? 
*/ if (chain->rules_next == NULL) { @@ -6316,13 +6412,15 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE); break; case NFT_MSG_NEWCHAIN: - if (nft_trans_chain_update(trans)) + if (nft_trans_chain_update(trans)) { nft_chain_commit_update(trans); - else + nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN); + /* trans destroyed after rcu grace period */ + } else { nft_clear(net, trans->ctx.chain); - - nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN); - nft_trans_destroy(trans); + nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN); + nft_trans_destroy(trans); + } break; case NFT_MSG_DELCHAIN: nft_chain_del(trans->ctx.chain); @@ -6412,6 +6510,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_commit_release(net); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); + mutex_unlock(&net->nft.commit_mutex); return 0; } @@ -6472,7 +6571,7 @@ static int __nf_tables_abort(struct net *net) case NFT_MSG_NEWCHAIN: if (nft_trans_chain_update(trans)) { free_percpu(nft_trans_chain_stats(trans)); - + kfree(nft_trans_chain_name(trans)); nft_trans_destroy(trans); } else { trans->ctx.table->use--; @@ -6563,12 +6662,25 @@ static void nf_tables_cleanup(struct net *net) static int nf_tables_abort(struct net *net, struct sk_buff *skb) { - return __nf_tables_abort(net); + int ret = __nf_tables_abort(net); + + mutex_unlock(&net->nft.commit_mutex); + + return ret; } static bool nf_tables_valid_genid(struct net *net, u32 genid) { - return net->nft.base_seq == genid; + bool genid_ok; + + mutex_lock(&net->nft.commit_mutex); + + genid_ok = genid == 0 || net->nft.base_seq == genid; + if (!genid_ok) + mutex_unlock(&net->nft.commit_mutex); + + /* else, commit mutex has to be released by commit or abort function */ + return genid_ok; } static const struct nfnetlink_subsystem nf_tables_subsys = { @@ -6580,6 +6692,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = { .abort = 
nf_tables_abort, .cleanup = nf_tables_cleanup, .valid_genid = nf_tables_valid_genid, + .owner = THIS_MODULE, }; int nft_chain_validate_dependency(const struct nft_chain *chain, @@ -6838,13 +6951,6 @@ int nft_validate_register_store(const struct nft_ctx *ctx, err = nf_tables_check_loops(ctx, data->verdict.chain); if (err < 0) return err; - - if (ctx->chain->level + 1 > - data->verdict.chain->level) { - if (ctx->chain->level + 1 == NFT_JUMP_STACK_SIZE) - return -EMLINK; - data->verdict.chain->level = ctx->chain->level + 1; - } } return 0; @@ -6906,8 +7012,8 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, case NFT_GOTO: if (!tb[NFTA_VERDICT_CHAIN]) return -EINVAL; - chain = nft_chain_lookup(ctx->table, tb[NFTA_VERDICT_CHAIN], - genmask); + chain = nft_chain_lookup(ctx->net, ctx->table, + tb[NFTA_VERDICT_CHAIN], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); if (nft_is_base_chain(chain)) @@ -7152,6 +7258,7 @@ static int __net_init nf_tables_init_net(struct net *net) { INIT_LIST_HEAD(&net->nft.tables); INIT_LIST_HEAD(&net->nft.commit_list); + mutex_init(&net->nft.commit_mutex); net->nft.base_seq = 1; net->nft.validate_state = NFT_VALIDATE_SKIP; @@ -7160,11 +7267,11 @@ static int __net_init nf_tables_init_net(struct net *net) static void __net_exit nf_tables_exit_net(struct net *net) { - nfnl_lock(NFNL_SUBSYS_NFTABLES); + mutex_lock(&net->nft.commit_mutex); if (!list_empty(&net->nft.commit_list)) __nf_tables_abort(net); __nft_release_tables(net); - nfnl_unlock(NFNL_SUBSYS_NFTABLES); + mutex_unlock(&net->nft.commit_mutex); WARN_ON_ONCE(!list_empty(&net->nft.tables)); } @@ -7179,29 +7286,19 @@ static int __init nf_tables_module_init(void) nft_chain_filter_init(); - info = kmalloc_array(NFT_RULE_MAXEXPRS, sizeof(struct nft_expr_info), - GFP_KERNEL); - if (info == NULL) { - err = -ENOMEM; - goto err1; - } - err = nf_tables_core_module_init(); if (err < 0) - goto err2; + return err; err = nfnetlink_subsys_register(&nf_tables_subsys); if 
(err < 0) - goto err3; + goto err; register_netdevice_notifier(&nf_tables_flowtable_notifier); return register_pernet_subsys(&nf_tables_net_ops); -err3: +err: nf_tables_core_module_exit(); -err2: - kfree(info); -err1: return err; } @@ -7213,7 +7310,6 @@ static void __exit nf_tables_module_exit(void) unregister_pernet_subsys(&nf_tables_net_ops); rcu_barrier(); nf_tables_core_module_exit(); - kfree(info); } module_init(nf_tables_module_init); diff --git a/net/netfilter/nf_tables_set_core.c b/net/netfilter/nf_tables_set_core.c new file mode 100644 index 000000000000..814789644bd3 --- /dev/null +++ b/net/netfilter/nf_tables_set_core.c @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <net/netfilter/nf_tables_core.h> + +static int __init nf_tables_set_module_init(void) +{ + nft_register_set(&nft_set_hash_fast_type); + nft_register_set(&nft_set_hash_type); + nft_register_set(&nft_set_rhash_type); + nft_register_set(&nft_set_bitmap_type); + nft_register_set(&nft_set_rbtree_type); + + return 0; +} + +static void __exit nf_tables_set_module_exit(void) +{ + nft_unregister_set(&nft_set_rbtree_type); + nft_unregister_set(&nft_set_bitmap_type); + nft_unregister_set(&nft_set_rhash_type); + nft_unregister_set(&nft_set_hash_type); + nft_unregister_set(&nft_set_hash_fast_type); +} + +module_init(nf_tables_set_module_init); +module_exit(nf_tables_set_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFT_SET(); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index e1b6be29848d..916913454624 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -331,18 +331,27 @@ replay: } } - if (!ss->commit || !ss->abort) { + if (!ss->valid_genid || !ss->commit || !ss->abort) { nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); return kfree_skb(skb); } - if (genid && ss->valid_genid && !ss->valid_genid(net, genid)) { + if (!try_module_get(ss->owner)) { + nfnl_unlock(subsys_id); + netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); + 
return kfree_skb(skb); + } + + if (!ss->valid_genid(net, genid)) { + module_put(ss->owner); nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -ERESTART, NULL); return kfree_skb(skb); } + nfnl_unlock(subsys_id); + while (skb->len >= nlmsg_total_size(0)) { int msglen, type; @@ -464,14 +473,10 @@ ack: } done: if (status & NFNL_BATCH_REPLAY) { - const struct nfnetlink_subsystem *ss2; - - ss2 = nfnl_dereference_protected(subsys_id); - if (ss2 == ss) - ss->abort(net, oskb); + ss->abort(net, oskb); nfnl_err_reset(&err_list); - nfnl_unlock(subsys_id); kfree_skb(skb); + module_put(ss->owner); goto replay; } else if (status == NFNL_BATCH_DONE) { err = ss->commit(net, oskb); @@ -489,8 +494,8 @@ done: ss->cleanup(net); nfnl_err_deliver(&err_list, oskb); - nfnl_unlock(subsys_id); kfree_skb(skb); + module_put(ss->owner); } static const struct nla_policy nfnl_batch_policy[NFNL_BATCH_MAX + 1] = { diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 9ee5fa551fa6..d9d952fad3e0 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -26,7 +26,6 @@ #include <net/sock.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> -#include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_timeout.h> @@ -47,7 +46,7 @@ static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = { }; static int -ctnl_timeout_parse_policy(void *timeouts, +ctnl_timeout_parse_policy(void *timeout, const struct nf_conntrack_l4proto *l4proto, struct net *net, const struct nlattr *attr) { @@ -68,7 +67,7 @@ ctnl_timeout_parse_policy(void *timeouts, if (ret < 0) goto err; - ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts); + ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeout); err: kfree(tb); @@ -373,7 +372,6 @@ static int cttimeout_default_set(struct net *net, 
struct sock *ctnl, struct netlink_ext_ack *extack) { const struct nf_conntrack_l4proto *l4proto; - unsigned int *timeouts; __u16 l3num; __u8 l4num; int ret; @@ -393,9 +391,7 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl, goto err; } - timeouts = l4proto->get_timeouts(net); - - ret = ctnl_timeout_parse_policy(timeouts, l4proto, net, + ret = ctnl_timeout_parse_policy(NULL, l4proto, net, cda[CTA_TIMEOUT_DATA]); if (ret < 0) goto err; @@ -432,7 +428,6 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) { struct nlattr *nest_parms; - unsigned int *timeouts = l4proto->get_timeouts(net); int ret; nest_parms = nla_nest_start(skb, @@ -440,7 +435,7 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, if (!nest_parms) goto nla_put_failure; - ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, timeouts); + ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, NULL); if (ret < 0) goto nla_put_failure; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 4ccd2988f9db..ea4ba551abb2 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1243,6 +1243,9 @@ static int nfqnl_recv_unsupp(struct net *net, struct sock *ctnl, static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = { [NFQA_CFG_CMD] = { .len = sizeof(struct nfqnl_msg_config_cmd) }, [NFQA_CFG_PARAMS] = { .len = sizeof(struct nfqnl_msg_config_params) }, + [NFQA_CFG_QUEUE_MAXLEN] = { .type = NLA_U32 }, + [NFQA_CFG_MASK] = { .type = NLA_U32 }, + [NFQA_CFG_FLAGS] = { .type = NLA_U32 }, }; static const struct nf_queue_handler nfqh = { diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index d21834bed805..ea5b7c4944f6 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -322,7 +322,7 @@ static int nf_tables_netdev_event(struct notifier_block *this, if (!ctx.net) return 
NOTIFY_DONE; - nfnl_lock(NFNL_SUBSYS_NFTABLES); + mutex_lock(&ctx.net->nft.commit_mutex); list_for_each_entry(table, &ctx.net->nft.tables, list) { if (table->family != NFPROTO_NETDEV) continue; @@ -337,7 +337,7 @@ static int nf_tables_netdev_event(struct notifier_block *this, nft_netdev_event(event, dev, &ctx); } } - nfnl_unlock(NFNL_SUBSYS_NFTABLES); + mutex_unlock(&ctx.net->nft.commit_mutex); put_net(ctx.net); return NOTIFY_DONE; diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 8d1ff654e5af..32535eea51b2 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -832,10 +832,18 @@ nft_target_select_ops(const struct nft_ctx *ctx, rev = ntohl(nla_get_be32(tb[NFTA_TARGET_REV])); family = ctx->family; + if (strcmp(tg_name, XT_ERROR_TARGET) == 0 || + strcmp(tg_name, XT_STANDARD_TARGET) == 0 || + strcmp(tg_name, "standard") == 0) + return ERR_PTR(-EINVAL); + /* Re-use the existing target if it's already loaded. */ list_for_each_entry(nft_target, &nft_target_list, head) { struct xt_target *target = nft_target->ops.data; + if (!target->target) + continue; + if (nft_target_cmp(target, tg_name, rev, family)) return &nft_target->ops; } @@ -844,6 +852,11 @@ nft_target_select_ops(const struct nft_ctx *ctx, if (IS_ERR(target)) return ERR_PTR(-ENOENT); + if (!target->target) { + err = -EINVAL; + goto err; + } + if (target->targetsize > nla_len(tb[NFTA_TARGET_INFO])) { err = -EINVAL; goto err; diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index a832c59f0a9c..b90d96ba4a12 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -14,10 +14,9 @@ #include <net/netfilter/nf_conntrack_zones.h> struct nft_connlimit { - spinlock_t lock; - struct hlist_head hhead; - u32 limit; - bool invert; + struct nf_conncount_list list; + u32 limit; + bool invert; }; static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, @@ -45,21 +44,19 @@ static inline void nft_connlimit_do_eval(struct 
nft_connlimit *priv, return; } - spin_lock_bh(&priv->lock); - count = nf_conncount_lookup(nft_net(pkt), &priv->hhead, tuple_ptr, zone, - &addit); + nf_conncount_lookup(nft_net(pkt), &priv->list, tuple_ptr, zone, + &addit); + count = priv->list.count; if (!addit) goto out; - if (!nf_conncount_add(&priv->hhead, tuple_ptr, zone)) { + if (nf_conncount_add(&priv->list, tuple_ptr, zone) == NF_CONNCOUNT_ERR) { regs->verdict.code = NF_DROP; - spin_unlock_bh(&priv->lock); return; } count++; out: - spin_unlock_bh(&priv->lock); if ((count > priv->limit) ^ priv->invert) { regs->verdict.code = NFT_BREAK; @@ -87,8 +84,7 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx, invert = true; } - spin_lock_init(&priv->lock); - INIT_HLIST_HEAD(&priv->hhead); + nf_conncount_list_init(&priv->list); priv->limit = limit; priv->invert = invert; @@ -99,7 +95,7 @@ static void nft_connlimit_do_destroy(const struct nft_ctx *ctx, struct nft_connlimit *priv) { nf_ct_netns_put(ctx->net, ctx->family); - nf_conncount_cache_free(&priv->hhead); + nf_conncount_cache_free(&priv->list); } static int nft_connlimit_do_dump(struct sk_buff *skb, @@ -212,8 +208,7 @@ static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src) struct nft_connlimit *priv_dst = nft_expr_priv(dst); struct nft_connlimit *priv_src = nft_expr_priv(src); - spin_lock_init(&priv_dst->lock); - INIT_HLIST_HEAD(&priv_dst->hhead); + nf_conncount_list_init(&priv_dst->list); priv_dst->limit = priv_src->limit; priv_dst->invert = priv_src->invert; @@ -225,21 +220,14 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx, { struct nft_connlimit *priv = nft_expr_priv(expr); - nf_conncount_cache_free(&priv->hhead); + nf_conncount_cache_free(&priv->list); } static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) { struct nft_connlimit *priv = nft_expr_priv(expr); - bool addit, ret; - spin_lock_bh(&priv->lock); - nf_conncount_lookup(net, &priv->hhead, NULL, &nf_ct_zone_dflt, &addit); - 
- ret = hlist_empty(&priv->hhead); - spin_unlock_bh(&priv->lock); - - return ret; + return nf_conncount_gc_list(net, &priv->list); } static struct nft_expr_type nft_connlimit_type; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 1435ffc5f57e..3bc82ee5464d 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -870,7 +870,7 @@ static void nft_ct_helper_obj_eval(struct nft_object *obj, if (test_bit(IPS_HELPER_BIT, &ct->status)) return; - help = nf_ct_helper_ext_add(ct, to_assign, GFP_ATOMIC); + help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (help) { rcu_assign_pointer(help->helper, to_assign); set_bit(IPS_HELPER_BIT, &ct->status); diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 27d7e4598ab6..81184c244d1a 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -118,6 +118,8 @@ static int nft_dynset_init(const struct nft_ctx *ctx, u64 timeout; int err; + lockdep_assert_held(&ctx->net->nft.commit_mutex); + if (tb[NFTA_DYNSET_SET_NAME] == NULL || tb[NFTA_DYNSET_OP] == NULL || tb[NFTA_DYNSET_SREG_KEY] == NULL) diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 15adf8ca82c3..0777a93211e2 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -98,6 +98,7 @@ static int nft_immediate_validate(const struct nft_ctx *ctx, const struct nft_data **d) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); + struct nft_ctx *pctx = (struct nft_ctx *)ctx; const struct nft_data *data; int err; @@ -109,9 +110,11 @@ static int nft_immediate_validate(const struct nft_ctx *ctx, switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: + pctx->level++; err = nft_chain_validate(ctx, data->verdict.chain); if (err < 0) return err; + pctx->level--; break; default: break; diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 42e6fadf1417..c2a1d84cdfc4 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ 
-155,7 +155,9 @@ static int nft_lookup_validate_setelem(const struct nft_ctx *ctx, struct nft_set_elem *elem) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + struct nft_ctx *pctx = (struct nft_ctx *)ctx; const struct nft_data *data; + int err; if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) @@ -165,10 +167,17 @@ static int nft_lookup_validate_setelem(const struct nft_ctx *ctx, switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: - return nft_chain_validate(ctx, data->verdict.chain); + pctx->level++; + err = nft_chain_validate(ctx, data->verdict.chain); + if (err < 0) + return err; + pctx->level--; + break; default: - return 0; + break; } + + return 0; } static int nft_lookup_validate(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 1105a23bda5e..2b94dcc43456 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -107,7 +107,8 @@ static void nft_meta_get_eval(const struct nft_expr *expr, break; case NFT_META_SKUID: sk = skb_to_full_sk(skb); - if (!sk || !sk_fullsock(sk)) + if (!sk || !sk_fullsock(sk) || + !net_eq(nft_net(pkt), sock_net(sk))) goto err; read_lock_bh(&sk->sk_callback_lock); @@ -123,7 +124,8 @@ static void nft_meta_get_eval(const struct nft_expr *expr, break; case NFT_META_SKGID: sk = skb_to_full_sk(skb); - if (!sk || !sk_fullsock(sk)) + if (!sk || !sk_fullsock(sk) || + !net_eq(nft_net(pkt), sock_net(sk))) goto err; read_lock_bh(&sk->sk_callback_lock); @@ -214,7 +216,8 @@ static void nft_meta_get_eval(const struct nft_expr *expr, #ifdef CONFIG_CGROUP_NET_CLASSID case NFT_META_CGROUP: sk = skb_to_full_sk(skb); - if (!sk || !sk_fullsock(sk)) + if (!sk || !sk_fullsock(sk) || + !net_eq(nft_net(pkt), sock_net(sk))) goto err; *dest = sock_cgroup_classid(&sk->sk_cgrp_data); break; diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index d6626e01c7ee..128bc16f52dd 100644 --- 
a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -296,7 +296,7 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, return true; } -static struct nft_set_type nft_bitmap_type __read_mostly = { +struct nft_set_type nft_set_bitmap_type __read_mostly = { .owner = THIS_MODULE, .ops = { .privsize = nft_bitmap_privsize, @@ -314,20 +314,3 @@ static struct nft_set_type nft_bitmap_type __read_mostly = { .get = nft_bitmap_get, }, }; - -static int __init nft_bitmap_module_init(void) -{ - return nft_register_set(&nft_bitmap_type); -} - -static void __exit nft_bitmap_module_exit(void) -{ - nft_unregister_set(&nft_bitmap_type); -} - -module_init(nft_bitmap_module_init); -module_exit(nft_bitmap_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); -MODULE_ALIAS_NFT_SET(); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 6f9a1365a09f..90c3e7e6cacb 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -387,6 +387,7 @@ static void nft_rhash_destroy(const struct nft_set *set) struct nft_rhash *priv = nft_set_priv(set); cancel_delayed_work_sync(&priv->gc_work); + rcu_barrier(); rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy, (void *)set); } @@ -654,7 +655,7 @@ static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features return true; } -static struct nft_set_type nft_rhash_type __read_mostly = { +struct nft_set_type nft_set_rhash_type __read_mostly = { .owner = THIS_MODULE, .features = NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT | NFT_SET_EVAL, @@ -677,7 +678,7 @@ static struct nft_set_type nft_rhash_type __read_mostly = { }, }; -static struct nft_set_type nft_hash_type __read_mostly = { +struct nft_set_type nft_set_hash_type __read_mostly = { .owner = THIS_MODULE, .features = NFT_SET_MAP | NFT_SET_OBJECT, .ops = { @@ -697,7 +698,7 @@ static struct nft_set_type nft_hash_type __read_mostly = { }, 
}; -static struct nft_set_type nft_hash_fast_type __read_mostly = { +struct nft_set_type nft_set_hash_fast_type __read_mostly = { .owner = THIS_MODULE, .features = NFT_SET_MAP | NFT_SET_OBJECT, .ops = { @@ -716,26 +717,3 @@ static struct nft_set_type nft_hash_fast_type __read_mostly = { .get = nft_hash_get, }, }; - -static int __init nft_hash_module_init(void) -{ - if (nft_register_set(&nft_hash_fast_type) || - nft_register_set(&nft_hash_type) || - nft_register_set(&nft_rhash_type)) - return 1; - return 0; -} - -static void __exit nft_hash_module_exit(void) -{ - nft_unregister_set(&nft_rhash_type); - nft_unregister_set(&nft_hash_type); - nft_unregister_set(&nft_hash_fast_type); -} - -module_init(nft_hash_module_init); -module_exit(nft_hash_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_SET(); diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 7f3a9a211034..9873d734b494 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -381,7 +381,7 @@ static void nft_rbtree_gc(struct work_struct *work) gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); if (!gcb) - goto out; + break; atomic_dec(&set->nelems); nft_set_gc_batch_add(gcb, rbe); @@ -390,10 +390,12 @@ static void nft_rbtree_gc(struct work_struct *work) rbe = rb_entry(prev, struct nft_rbtree_elem, node); atomic_dec(&set->nelems); nft_set_gc_batch_add(gcb, rbe); + prev = NULL; } node = rb_next(node); + if (!node) + break; } -out: if (gcb) { for (i = 0; i < gcb->head.cnt; i++) { rbe = gcb->elems[i]; @@ -440,6 +442,7 @@ static void nft_rbtree_destroy(const struct nft_set *set) struct rb_node *node; cancel_delayed_work_sync(&priv->gc_work); + rcu_barrier(); while ((node = priv->root.rb_node) != NULL) { rb_erase(node, &priv->root); rbe = rb_entry(node, struct nft_rbtree_elem, node); @@ -462,7 +465,7 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, return true; } 
-static struct nft_set_type nft_rbtree_type __read_mostly = { +struct nft_set_type nft_set_rbtree_type __read_mostly = { .owner = THIS_MODULE, .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, .ops = { @@ -481,20 +484,3 @@ static struct nft_set_type nft_rbtree_type __read_mostly = { .get = nft_rbtree_get, }, }; - -static int __init nft_rbtree_module_init(void) -{ - return nft_register_set(&nft_rbtree_type); -} - -static void __exit nft_rbtree_module_exit(void) -{ - nft_unregister_set(&nft_rbtree_type); -} - -module_init(nft_rbtree_module_init); -module_exit(nft_rbtree_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_SET(); diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index 74e1b3bd6954..d7f3776dfd71 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -23,12 +23,15 @@ static void nft_socket_eval(const struct nft_expr *expr, struct sock *sk = skb->sk; u32 *dest = &regs->data[priv->dreg]; + if (sk && !net_eq(nft_net(pkt), sock_net(sk))) + sk = NULL; + if (!sk) switch(nft_pf(pkt)) { case NFPROTO_IPV4: sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, nft_in(pkt)); break; -#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6) +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case NFPROTO_IPV6: sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, nft_in(pkt)); break; @@ -39,8 +42,8 @@ static void nft_socket_eval(const struct nft_expr *expr, return; } - if(!sk) { - nft_reg_store8(dest, 0); + if (!sk) { + regs->verdict.code = NFT_BREAK; return; } @@ -51,6 +54,14 @@ static void nft_socket_eval(const struct nft_expr *expr, case NFT_SOCKET_TRANSPARENT: nft_reg_store8(dest, inet_sk_transparent(sk)); break; + case NFT_SOCKET_MARK: + if (sk_fullsock(sk)) { + *dest = sk->sk_mark; + } else { + regs->verdict.code = NFT_BREAK; + return; + } + break; default: WARN_ON(1); regs->verdict.code = NFT_BREAK; @@ -74,7 +85,7 @@ static int nft_socket_init(const struct nft_ctx *ctx, 
switch(ctx->family) { case NFPROTO_IPV4: -#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6) +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case NFPROTO_IPV6: #endif case NFPROTO_INET: @@ -88,6 +99,9 @@ static int nft_socket_init(const struct nft_ctx *ctx, case NFT_SOCKET_TRANSPARENT: len = sizeof(u8); break; + case NFT_SOCKET_MARK: + len = sizeof(u32); + break; default: return -EOPNOTSUPP; } diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c index 0b660c568156..e8da9a9bba73 100644 --- a/net/netfilter/utils.c +++ b/net/netfilter/utils.c @@ -1,14 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_queue.h> +#include <net/ip6_checksum.h> + +#ifdef CONFIG_INET +__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, u8 protocol) +{ + const struct iphdr *iph = ip_hdr(skb); + __sum16 csum = 0; + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) + break; + if ((protocol == 0 && !csum_fold(skb->csum)) || + !csum_tcpudp_magic(iph->saddr, iph->daddr, + skb->len - dataoff, protocol, + skb->csum)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + break; + } + /* fall through */ + case CHECKSUM_NONE: + if (protocol == 0) + skb->csum = 0; + else + skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, + skb->len - dataoff, + protocol, 0); + csum = __skb_checksum_complete(skb); + } + return csum; +} +EXPORT_SYMBOL(nf_ip_checksum); +#endif + +static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, unsigned int len, + u8 protocol) +{ + const struct iphdr *iph = ip_hdr(skb); + __sum16 csum = 0; + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (len == skb->len - dataoff) + return nf_ip_checksum(skb, hook, dataoff, protocol); + /* fall through */ + case CHECKSUM_NONE: + skb->csum = 
csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol, + skb->len - dataoff, 0); + skb->ip_summed = CHECKSUM_NONE; + return __skb_checksum_complete_head(skb, dataoff + len); + } + return csum; +} + +__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, u8 protocol) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + __sum16 csum = 0; + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) + break; + if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + skb->len - dataoff, protocol, + csum_sub(skb->csum, + skb_checksum(skb, 0, + dataoff, 0)))) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + break; + } + /* fall through */ + case CHECKSUM_NONE: + skb->csum = ~csum_unfold( + csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + skb->len - dataoff, + protocol, + csum_sub(0, + skb_checksum(skb, 0, + dataoff, 0)))); + csum = __skb_checksum_complete(skb); + } + return csum; +} +EXPORT_SYMBOL(nf_ip6_checksum); + +static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, unsigned int len, + u8 protocol) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + __wsum hsum; + __sum16 csum = 0; + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (len == skb->len - dataoff) + return nf_ip6_checksum(skb, hook, dataoff, protocol); + /* fall through */ + case CHECKSUM_NONE: + hsum = skb_checksum(skb, 0, dataoff, 0); + skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr, + &ip6h->daddr, + skb->len - dataoff, + protocol, + csum_sub(0, hsum))); + skb->ip_summed = CHECKSUM_NONE; + return __skb_checksum_complete_head(skb, dataoff + len); + } + return csum; +}; __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, u_int8_t protocol, + unsigned int dataoff, u8 protocol, unsigned short family) { - const struct nf_ipv6_ops *v6ops; __sum16 csum = 0; switch (family) { @@ -16,9 +130,7 @@ __sum16 nf_checksum(struct sk_buff *skb, 
unsigned int hook, csum = nf_ip_checksum(skb, hook, dataoff, protocol); break; case AF_INET6: - v6ops = rcu_dereference(nf_ipv6_ops); - if (v6ops) - csum = v6ops->checksum(skb, hook, dataoff, protocol); + csum = nf_ip6_checksum(skb, hook, dataoff, protocol); break; } @@ -28,9 +140,8 @@ EXPORT_SYMBOL_GPL(nf_checksum); __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, unsigned int len, - u_int8_t protocol, unsigned short family) + u8 protocol, unsigned short family) { - const struct nf_ipv6_ops *v6ops; __sum16 csum = 0; switch (family) { @@ -39,10 +150,8 @@ __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook, protocol); break; case AF_INET6: - v6ops = rcu_dereference(nf_ipv6_ops); - if (v6ops) - csum = v6ops->checksum_partial(skb, hook, dataoff, len, - protocol); + csum = nf_ip6_checksum_partial(skb, hook, dataoff, len, + protocol); break; } diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 03b9a50ec93b..7ba454e9e3fa 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -93,7 +93,7 @@ xt_ct_set_helper(struct nf_conn *ct, const char *helper_name, return -ENOENT; } - help = nf_ct_helper_ext_add(ct, helper, GFP_KERNEL); + help = nf_ct_helper_ext_add(ct, GFP_KERNEL); if (help == NULL) { nf_conntrack_helper_put(helper); return -ENOMEM; diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 475957cfcf50..0d0d68c989df 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -38,7 +38,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -#if IS_ENABLED(CONFIG_IPV6) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static unsigned int tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) { @@ -141,7 +141,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = { .destroy = tee_tg_destroy, .me = THIS_MODULE, }, -#if IS_ENABLED(CONFIG_IPV6) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "TEE", .revision = 1, diff --git 
a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 58fce4e749a9..ad7420cdc439 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -36,15 +36,6 @@ #include <net/netfilter/nf_tproxy.h> #include <linux/netfilter/xt_TPROXY.h> -/* assign a socket to the skb -- consumes sk */ -static void -nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) -{ - skb_orphan(skb); - skb->sk = sk; - skb->destructor = sock_edemux; -} - static unsigned int tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, u_int32_t mark_mask, u_int32_t mark_value) @@ -61,7 +52,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, * addresses, this happens if the redirect already happened * and the current packet belongs to an already established * connection */ - sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, + sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol, iph->saddr, iph->daddr, hp->source, hp->dest, skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED); @@ -77,7 +68,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, else if (!sk) /* no, there's no established connection, check if * there's a listener on the redirected addr/port */ - sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, + sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol, iph->saddr, laddr, hp->source, lport, skb->dev, NF_TPROXY_LOOKUP_LISTENER); @@ -150,7 +141,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) * addresses, this happens if the redirect already happened * and the current packet belongs to an already established * connection */ - sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto, + sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, tproto, &iph->saddr, &iph->daddr, hp->source, hp->dest, xt_in(par), NF_TPROXY_LOOKUP_ESTABLISHED); @@ -171,7 +162,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) else if (!sk) /* no there's no established 
connection, check if * there's a listener on the redirected addr/port */ - sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, + sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, tproto, &iph->saddr, laddr, hp->source, lport, xt_in(par), NF_TPROXY_LOOKUP_LISTENER); diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index 7df2dece57d3..5d92e1781980 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -72,8 +72,9 @@ static bool cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_cgroup_info_v0 *info = par->matchinfo; + struct sock *sk = skb->sk; - if (skb->sk == NULL || !sk_fullsock(skb->sk)) + if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk))) return false; return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^ @@ -85,8 +86,9 @@ static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) const struct xt_cgroup_info_v1 *info = par->matchinfo; struct sock_cgroup_data *skcd = &skb->sk->sk_cgrp_data; struct cgroup *ancestor = info->priv; + struct sock *sk = skb->sk; - if (!skb->sk || !sk_fullsock(skb->sk)) + if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk))) return false; if (ancestor) diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index 3d705c688a27..46686fb73784 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -67,7 +67,7 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) struct sock *sk = skb_to_full_sk(skb); struct net *net = xt_net(par); - if (sk == NULL || sk->sk_socket == NULL) + if (!sk || !sk->sk_socket || !net_eq(net, sock_net(sk))) return (info->match ^ info->invert) == 0; else if (info->match & info->invert & XT_OWNER_SOCKET) /* diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 07085c22b19c..f44de4bc2100 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -265,7 +265,8 @@ recent_mt(const struct sk_buff *skb, struct 
xt_action_param *par) } /* use TTL as seen before forwarding */ - if (xt_out(par) != NULL && skb->sk == NULL) + if (xt_out(par) != NULL && + (!skb->sk || !net_eq(net, sock_net(skb->sk)))) ttl++; spin_lock_bh(&recent_lock); diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 5c0779c4fa3c..0472f3472842 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -56,8 +56,12 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; + if (!net_eq(xt_net(par), sock_net(sk))) + sk = NULL; + if (!sk) sk = nf_sk_lookup_slow_v4(xt_net(par), skb, xt_in(par)); + if (sk) { bool wildcard; bool transparent = true; @@ -113,8 +117,12 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; + if (!net_eq(xt_net(par), sock_net(sk))) + sk = NULL; + if (!sk) sk = nf_sk_lookup_slow_v6(xt_net(par), skb, xt_in(par)); + if (sk) { bool wildcard; bool transparent = true; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 1189b84413d5..f6ac7693d2cc 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2300,7 +2300,6 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, cb = &nlk->cb; memset(cb, 0, sizeof(*cb)); - cb->start = control->start; cb->dump = control->dump; cb->done = control->done; cb->nlh = nlh; @@ -2309,8 +2308,8 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, cb->min_dump_alloc = control->min_dump_alloc; cb->skb = skb; - if (cb->start) { - ret = cb->start(cb); + if (control->start) { + ret = control->start(cb); if (ret) goto error_put; } @@ -2658,7 +2657,7 @@ static const struct proto_ops netlink_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = netlink_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = netlink_ioctl, .listen = 
sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 93fbcafbf388..03f37c4e64fe 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1355,7 +1355,7 @@ static const struct proto_ops nr_proto_ops = { .socketpair = sock_no_socketpair, .accept = nr_accept, .getname = nr_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = nr_ioctl, .listen = nr_listen, .shutdown = sock_no_shutdown, diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c index 2ceefa183cee..6a196e438b6c 100644 --- a/net/nfc/llcp_commands.c +++ b/net/nfc/llcp_commands.c @@ -752,11 +752,14 @@ int nfc_llcp_send_ui_frame(struct nfc_llcp_sock *sock, u8 ssap, u8 dsap, pr_debug("Fragment %zd bytes remaining %zd", frag_len, remaining_len); - pdu = nfc_alloc_send_skb(sock->dev, &sock->sk, MSG_DONTWAIT, + pdu = nfc_alloc_send_skb(sock->dev, &sock->sk, 0, frag_len + LLCP_HEADER_SIZE, &err); if (pdu == NULL) { - pr_err("Could not allocate PDU\n"); - continue; + pr_err("Could not allocate PDU (error=%d)\n", err); + len -= remaining_len; + if (len == 0) + len = err; + break; } pdu = llcp_add_header(pdu, dsap, ssap, LLCP_PDU_UI); diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index ab5bb14b49af..ea0c0c6f1874 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -548,13 +548,16 @@ static inline __poll_t llcp_accept_poll(struct sock *parent) return 0; } -static __poll_t llcp_sock_poll_mask(struct socket *sock, __poll_t events) +static __poll_t llcp_sock_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask = 0; pr_debug("%p\n", sk); + sock_poll_wait(file, sk_sleep(sk), wait); + if (sk->sk_state == LLCP_LISTEN) return llcp_accept_poll(sk); @@ -896,7 +899,7 @@ static const struct proto_ops llcp_sock_ops = { .socketpair = sock_no_socketpair, .accept = llcp_sock_accept, .getname = llcp_sock_getname, - .poll_mask = llcp_sock_poll_mask, + .poll = 
llcp_sock_poll, .ioctl = sock_no_ioctl, .listen = llcp_sock_listen, .shutdown = sock_no_shutdown, @@ -916,7 +919,7 @@ static const struct proto_ops llcp_rawsock_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = llcp_sock_getname, - .poll_mask = llcp_sock_poll_mask, + .poll = llcp_sock_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index 60c322531c49..e2188deb08dc 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -284,7 +284,7 @@ static const struct proto_ops rawsock_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -304,7 +304,7 @@ static const struct proto_ops rawsock_raw_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/nsh/nsh.c b/net/nsh/nsh.c index 9696ef96b719..1a30e165eeb4 100644 --- a/net/nsh/nsh.c +++ b/net/nsh/nsh.c @@ -104,7 +104,7 @@ static struct sk_buff *nsh_gso_segment(struct sk_buff *skb, __skb_pull(skb, nsh_len); skb_reset_mac_header(skb); - skb_reset_mac_len(skb); + skb->mac_len = proto == htons(ETH_P_TEB) ? ETH_HLEN : 0; skb->protocol = proto; features &= NETIF_F_SG; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 30a5df27116e..85ae53d8fd09 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1057,6 +1057,28 @@ static int sample(struct datapath *dp, struct sk_buff *skb, clone_flow_key); } +/* When 'last' is true, clone() should always consume the 'skb'. + * Otherwise, clone() should keep 'skb' intact regardless what + * actions are executed within clone(). 
+ */ +static int clone(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, const struct nlattr *attr, + bool last) +{ + struct nlattr *actions; + struct nlattr *clone_arg; + int rem = nla_len(attr); + bool dont_clone_flow_key; + + /* The first action is always 'OVS_CLONE_ATTR_ARG'. */ + clone_arg = nla_data(attr); + dont_clone_flow_key = nla_get_u32(clone_arg); + actions = nla_next(clone_arg, &rem); + + return clone_execute(dp, skb, key, 0, actions, rem, last, + !dont_clone_flow_key); +} + static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr) { @@ -1336,6 +1358,17 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, consume_skb(skb); return 0; } + break; + + case OVS_ACTION_ATTR_CLONE: { + bool last = nla_is_last(a, rem); + + err = clone(dp, skb, key, a, last); + if (last) + return err; + + break; + } } if (unlikely(err)) { diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 284aca2a252d..86a75105af1a 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -26,6 +26,7 @@ #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> +#include <net/ipv6_frag.h> #ifdef CONFIG_NF_NAT_NEEDED #include <linux/netfilter/nf_nat.h> @@ -607,23 +608,12 @@ static struct nf_conn * ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, u8 l3num, struct sk_buff *skb, bool natted) { - const struct nf_conntrack_l3proto *l3proto; - const struct nf_conntrack_l4proto *l4proto; struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; - unsigned int dataoff; - u8 protonum; - l3proto = __nf_ct_l3proto_find(l3num); - if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff, - &protonum) <= 0) { - pr_debug("ovs_ct_find_existing: Can't get protonum\n"); - return NULL; - } - l4proto = __nf_ct_l4proto_find(l3num, protonum); - if 
(!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, - protonum, net, &tuple, l3proto, l4proto)) { + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), l3num, + net, &tuple)) { pr_debug("ovs_ct_find_existing: Can't get tuple\n"); return NULL; } @@ -632,7 +622,7 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, if (natted) { struct nf_conntrack_tuple inverse; - if (!nf_ct_invert_tuple(&inverse, &tuple, l3proto, l4proto)) { + if (!nf_ct_invert_tuplepr(&inverse, &tuple)) { pr_debug("ovs_ct_find_existing: Inversion failed!\n"); return NULL; } @@ -1314,7 +1304,7 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, return -EINVAL; } - help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL); + help = nf_ct_helper_ext_add(info->ct, GFP_KERNEL); if (!help) { nf_conntrack_helper_put(helper); return -ENOMEM; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 492ab0c36f7c..a70097ecf33c 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2460,6 +2460,40 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, return 0; } +static int validate_and_copy_clone(struct net *net, + const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci, + bool log, bool last) +{ + int start, err; + u32 exec; + + if (nla_len(attr) && nla_len(attr) < NLA_HDRLEN) + return -EINVAL; + + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_CLONE, log); + if (start < 0) + return start; + + exec = last || !actions_may_change_flow(attr); + + err = ovs_nla_add_action(sfa, OVS_CLONE_ATTR_EXEC, &exec, + sizeof(exec), log); + if (err) + return err; + + err = __ovs_nla_copy_actions(net, attr, key, sfa, + eth_type, vlan_tci, log); + if (err) + return err; + + add_nested_action_end(*sfa, start); + + return 0; +} + void ovs_match_init(struct sw_flow_match *match, struct sw_flow_key 
*key, bool reset_key, @@ -2516,7 +2550,9 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, struct ovs_tunnel_info *ovs_tun; struct nlattr *a; int err = 0, start, opts_type; + __be16 dst_opt_type; + dst_opt_type = 0; ovs_match_init(&match, &key, true, NULL); opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log); if (opts_type < 0) @@ -2528,10 +2564,13 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, err = validate_geneve_opts(&key); if (err < 0) return err; + dst_opt_type = TUNNEL_GENEVE_OPT; break; case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: + dst_opt_type = TUNNEL_VXLAN_OPT; break; case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: + dst_opt_type = TUNNEL_ERSPAN_OPT; break; } } @@ -2574,7 +2613,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, */ ip_tunnel_info_opts_set(tun_info, TUN_METADATA_OPTS(&key, key.tun_opts_len), - key.tun_opts_len); + key.tun_opts_len, dst_opt_type); add_nested_action_end(*sfa, start); return err; @@ -2844,6 +2883,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1, [OVS_ACTION_ATTR_POP_NSH] = 0, [OVS_ACTION_ATTR_METER] = sizeof(u32), + [OVS_ACTION_ATTR_CLONE] = (u32)-1, }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -3033,6 +3073,18 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, /* Non-existent meters are simply ignored. 
*/ break; + case OVS_ACTION_ATTR_CLONE: { + bool last = nla_is_last(a, rem); + + err = validate_and_copy_clone(net, a, key, sfa, + eth_type, vlan_tci, + log, last); + if (err) + return err; + skip_copy = true; + break; + } + default: OVS_NLERR(log, "Unknown Action type %d", type); return -EINVAL; @@ -3111,6 +3163,26 @@ out: return err; } +static int clone_action_to_attr(const struct nlattr *attr, + struct sk_buff *skb) +{ + struct nlattr *start; + int err = 0, rem = nla_len(attr); + + start = nla_nest_start(skb, OVS_ACTION_ATTR_CLONE); + if (!start) + return -EMSGSIZE; + + err = ovs_nla_put_actions(nla_data(attr), rem, skb); + + if (err) + nla_nest_cancel(skb, start); + else + nla_nest_end(skb, start); + + return err; +} + static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) { const struct nlattr *ovs_key = nla_data(a); @@ -3199,6 +3271,12 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb) return err; break; + case OVS_ACTION_ATTR_CLONE: + err = clone_action_to_attr(a, skb); + if (err) + return err; + break; + default: if (nla_put(skb, type, nla_len(a), nla_data(a))) return -EMSGSIZE; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index ff8e7e245c37..e3e00d3a972e 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -275,9 +275,10 @@ static bool packet_use_direct_xmit(const struct packet_sock *po) return po->xmit == packet_direct_xmit; } -static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) +static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) { - return (u16) raw_smp_processor_id() % dev->real_num_tx_queues; + return dev_pick_tx_cpu_id(dev, skb, sb_dev, NULL); } static u16 packet_pick_tx_queue(struct sk_buff *skb) @@ -291,7 +292,7 @@ static u16 packet_pick_tx_queue(struct sk_buff *skb) __packet_pick_tx_queue); queue_index = netdev_cap_txqueue(dev, queue_index); } else { - queue_index = 
__packet_pick_tx_queue(dev, skb); + queue_index = __packet_pick_tx_queue(dev, skb, NULL); } return queue_index; @@ -1951,7 +1952,7 @@ retry: goto out_unlock; } - sockc.tsflags = sk->sk_tsflags; + sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) @@ -1962,6 +1963,7 @@ retry: skb->dev = dev; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = sockc.transmit_time; sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags); @@ -2457,6 +2459,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb->dev = dev; skb->priority = po->sk.sk_priority; skb->mark = po->sk.sk_mark; + skb->tstamp = sockc->transmit_time; sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags); skb_shinfo(skb)->destructor_arg = ph.raw; @@ -2633,7 +2636,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) if (unlikely(!(dev->flags & IFF_UP))) goto out_put; - sockc.tsflags = po->sk.sk_tsflags; + sockcm_init(&sockc, &po->sk); if (msg->msg_controllen) { err = sock_cmsg_send(&po->sk, msg, &sockc); if (unlikely(err)) @@ -2829,7 +2832,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (unlikely(!(dev->flags & IFF_UP))) goto out_unlock; - sockc.tsflags = sk->sk_tsflags; + sockcm_init(&sockc, sk); sockc.mark = sk->sk_mark; if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); @@ -2878,6 +2881,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) goto out_free; } else if (reserve) { skb_reserve(skb, -reserve); + if (len < reserve) + skb_reset_network_header(skb); } /* Returns -EFAULT on error */ @@ -2903,6 +2908,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->dev = dev; skb->priority = sk->sk_priority; skb->mark = sockc.mark; + skb->tstamp = sockc.transmit_time; if (has_vnet_hdr) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); @@ -4076,11 +4082,12 @@ 
static int packet_ioctl(struct socket *sock, unsigned int cmd, return 0; } -static __poll_t packet_poll_mask(struct socket *sock, __poll_t events) +static __poll_t packet_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); - __poll_t mask = datagram_poll_mask(sock, events); + __poll_t mask = datagram_poll(file, sock, wait); spin_lock_bh(&sk->sk_receive_queue.lock); if (po->rx_ring.pg_vec) { @@ -4422,7 +4429,7 @@ static const struct proto_ops packet_ops_spkt = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname_spkt, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = packet_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -4443,7 +4450,7 @@ static const struct proto_ops packet_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname, - .poll_mask = packet_poll_mask, + .poll = packet_poll, .ioctl = packet_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/phonet/socket.c b/net/phonet/socket.c index c295c4e20f01..30187990257f 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -340,12 +340,15 @@ static int pn_socket_getname(struct socket *sock, struct sockaddr *addr, return sizeof(struct sockaddr_pn); } -static __poll_t pn_socket_poll_mask(struct socket *sock, __poll_t events) +static __poll_t pn_socket_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; struct pep_sock *pn = pep_sk(sk); __poll_t mask = 0; + poll_wait(file, sk_sleep(sk), wait); + if (sk->sk_state == TCP_CLOSE) return EPOLLERR; if (!skb_queue_empty(&sk->sk_receive_queue)) @@ -445,7 +448,7 @@ const struct proto_ops phonet_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pn_socket_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = pn_socket_ioctl, .listen = sock_no_listen, 
.shutdown = sock_no_shutdown, @@ -470,7 +473,7 @@ const struct proto_ops phonet_stream_ops = { .socketpair = sock_no_socketpair, .accept = pn_socket_accept, .getname = pn_socket_getname, - .poll_mask = pn_socket_poll_mask, + .poll = pn_socket_poll, .ioctl = pn_socket_ioctl, .listen = pn_socket_listen, .shutdown = sock_no_shutdown, diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 1b5025ea5b04..86e1e37eb4e8 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -191,8 +191,13 @@ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, hdr->type = cpu_to_le32(type); hdr->src_node_id = cpu_to_le32(from->sq_node); hdr->src_port_id = cpu_to_le32(from->sq_port); - hdr->dst_node_id = cpu_to_le32(to->sq_node); - hdr->dst_port_id = cpu_to_le32(to->sq_port); + if (to->sq_port == QRTR_PORT_CTRL) { + hdr->dst_node_id = cpu_to_le32(node->nid); + hdr->dst_port_id = cpu_to_le32(QRTR_NODE_BCAST); + } else { + hdr->dst_node_id = cpu_to_le32(to->sq_node); + hdr->dst_port_id = cpu_to_le32(to->sq_port); + } hdr->size = cpu_to_le32(len); hdr->confirm_rx = 0; @@ -764,6 +769,10 @@ static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) node = NULL; if (addr->sq_node == QRTR_NODE_BCAST) { enqueue_fn = qrtr_bcast_enqueue; + if (addr->sq_port != QRTR_PORT_CTRL) { + release_sock(sk); + return -ENOTCONN; + } } else if (addr->sq_node == ipc->us.sq_node) { enqueue_fn = qrtr_local_enqueue; } else { @@ -1023,7 +1032,7 @@ static const struct proto_ops qrtr_proto_ops = { .recvmsg = qrtr_recvmsg, .getname = qrtr_getname, .ioctl = qrtr_ioctl, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, .getsockopt = sock_no_getsockopt, diff --git a/net/rds/Kconfig b/net/rds/Kconfig index bffde4b46c5d..607128f10bcd 100644 --- a/net/rds/Kconfig +++ b/net/rds/Kconfig @@ -1,7 +1,7 @@ config RDS tristate "The RDS Protocol" - depends on INET + depends on INET && CONFIG_IPV6 ---help--- The RDS (Reliable 
Datagram Sockets) protocol provides reliable, sequenced delivery of datagrams over Infiniband or TCP. @@ -24,4 +24,3 @@ config RDS_DEBUG bool "RDS debugging messages" depends on RDS default n - diff --git a/net/rds/Makefile b/net/rds/Makefile index b5d568bd479c..e647f9de104a 100644 --- a/net/rds/Makefile +++ b/net/rds/Makefile @@ -15,4 +15,3 @@ rds_tcp-y := tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \ tcp_send.o tcp_stats.o ccflags-$(CONFIG_RDS_DEBUG) := -DRDS_DEBUG - diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index ab751a150f70..fc5c48b248fe 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -35,6 +35,7 @@ #include <linux/kernel.h> #include <linux/gfp.h> #include <linux/in.h> +#include <linux/ipv6.h> #include <linux/poll.h> #include <net/sock.h> @@ -113,26 +114,80 @@ void rds_wake_sk_sleep(struct rds_sock *rs) static int rds_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { - struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; struct rds_sock *rs = rds_sk_to_rs(sock->sk); - - memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + struct sockaddr_in6 *sin6; + struct sockaddr_in *sin; + int uaddr_len; /* racey, don't care */ if (peer) { - if (!rs->rs_conn_addr) + if (ipv6_addr_any(&rs->rs_conn_addr)) return -ENOTCONN; - sin->sin_port = rs->rs_conn_port; - sin->sin_addr.s_addr = rs->rs_conn_addr; + if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) { + sin = (struct sockaddr_in *)uaddr; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + sin->sin_family = AF_INET; + sin->sin_port = rs->rs_conn_port; + sin->sin_addr.s_addr = rs->rs_conn_addr_v4; + uaddr_len = sizeof(*sin); + } else { + sin6 = (struct sockaddr_in6 *)uaddr; + sin6->sin6_family = AF_INET6; + 
sin6->sin6_port = rs->rs_conn_port; + sin6->sin6_addr = rs->rs_conn_addr; + sin6->sin6_flowinfo = 0; + /* scope_id is the same as in the bound address. */ + sin6->sin6_scope_id = rs->rs_bound_scope_id; + uaddr_len = sizeof(*sin6); + } } else { - sin->sin_port = rs->rs_bound_port; - sin->sin_addr.s_addr = rs->rs_bound_addr; + /* If socket is not yet bound and the socket is connected, + * set the return address family to be the same as the + * connected address, but with 0 address value. If it is not + * connected, set the family to be AF_UNSPEC (value 0) and + * the address size to be that of an IPv4 address. + */ + if (ipv6_addr_any(&rs->rs_bound_addr)) { + if (ipv6_addr_any(&rs->rs_conn_addr)) { + sin = (struct sockaddr_in *)uaddr; + memset(sin, 0, sizeof(*sin)); + sin->sin_family = AF_UNSPEC; + return sizeof(*sin); + } + + if (ipv6_addr_type(&rs->rs_conn_addr) & + IPV6_ADDR_MAPPED) { + sin = (struct sockaddr_in *)uaddr; + memset(sin, 0, sizeof(*sin)); + sin->sin_family = AF_INET; + return sizeof(*sin); + } + + sin6 = (struct sockaddr_in6 *)uaddr; + memset(sin6, 0, sizeof(*sin6)); + sin6->sin6_family = AF_INET6; + return sizeof(*sin6); + } + if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) { + sin = (struct sockaddr_in *)uaddr; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + sin->sin_family = AF_INET; + sin->sin_port = rs->rs_bound_port; + sin->sin_addr.s_addr = rs->rs_bound_addr_v4; + uaddr_len = sizeof(*sin); + } else { + sin6 = (struct sockaddr_in6 *)uaddr; + sin6->sin6_family = AF_INET6; + sin6->sin6_port = rs->rs_bound_port; + sin6->sin6_addr = rs->rs_bound_addr; + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = rs->rs_bound_scope_id; + uaddr_len = sizeof(*sin6); + } } - sin->sin_family = AF_INET; - - return sizeof(*sin); + return uaddr_len; } /* @@ -203,11 +258,12 @@ static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, int len) { + struct sockaddr_in6 sin6; 
struct sockaddr_in sin; int ret = 0; /* racing with another thread binding seems ok here */ - if (rs->rs_bound_addr == 0) { + if (ipv6_addr_any(&rs->rs_bound_addr)) { ret = -ENOTCONN; /* XXX not a great errno */ goto out; } @@ -215,14 +271,23 @@ static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, if (len < sizeof(struct sockaddr_in)) { ret = -EINVAL; goto out; + } else if (len < sizeof(struct sockaddr_in6)) { + /* Assume IPv4 */ + if (copy_from_user(&sin, optval, sizeof(struct sockaddr_in))) { + ret = -EFAULT; + goto out; + } + ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr); + sin6.sin6_port = sin.sin_port; + } else { + if (copy_from_user(&sin6, optval, + sizeof(struct sockaddr_in6))) { + ret = -EFAULT; + goto out; + } } - if (copy_from_user(&sin, optval, sizeof(sin))) { - ret = -EFAULT; - goto out; - } - - rds_send_drop_to(rs, &sin); + rds_send_drop_to(rs, &sin6); out: return ret; } @@ -435,31 +500,87 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; - struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + struct sockaddr_in *sin; + struct sockaddr_in6 *sin6; struct rds_sock *rs = rds_sk_to_rs(sk); + int addr_type; int ret = 0; lock_sock(sk); - if (addr_len != sizeof(struct sockaddr_in)) { - ret = -EINVAL; - goto out; - } + switch (uaddr->sa_family) { + case AF_INET: + sin = (struct sockaddr_in *)uaddr; + if (addr_len < sizeof(struct sockaddr_in)) { + ret = -EINVAL; + break; + } + if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { + ret = -EDESTADDRREQ; + break; + } + if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) || + sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) { + ret = -EINVAL; + break; + } + ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr); + rs->rs_conn_port = sin->sin_port; + break; - if (sin->sin_family != AF_INET) { - ret = -EAFNOSUPPORT; - goto out; - } + case AF_INET6: + sin6 = (struct sockaddr_in6 *)uaddr; + if (addr_len < 
sizeof(struct sockaddr_in6)) { + ret = -EINVAL; + break; + } + addr_type = ipv6_addr_type(&sin6->sin6_addr); + if (!(addr_type & IPV6_ADDR_UNICAST)) { + __be32 addr4; + + if (!(addr_type & IPV6_ADDR_MAPPED)) { + ret = -EPROTOTYPE; + break; + } + + /* It is a mapped address. Need to do some sanity + * checks. + */ + addr4 = sin6->sin6_addr.s6_addr32[3]; + if (addr4 == htonl(INADDR_ANY) || + addr4 == htonl(INADDR_BROADCAST) || + IN_MULTICAST(ntohl(addr4))) { + ret = -EPROTOTYPE; + break; + } + } - if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { - ret = -EDESTADDRREQ; - goto out; - } + if (addr_type & IPV6_ADDR_LINKLOCAL) { + /* If socket is arleady bound to a link local address, + * the peer address must be on the same link. + */ + if (sin6->sin6_scope_id == 0 || + (!ipv6_addr_any(&rs->rs_bound_addr) && + rs->rs_bound_scope_id && + sin6->sin6_scope_id != rs->rs_bound_scope_id)) { + ret = -EINVAL; + break; + } + /* Remember the connected address scope ID. It will + * be checked against the binding local address when + * the socket is bound. 
+ */ + rs->rs_bound_scope_id = sin6->sin6_scope_id; + } + rs->rs_conn_addr = sin6->sin6_addr; + rs->rs_conn_port = sin6->sin6_port; + break; - rs->rs_conn_addr = sin->sin_addr.s_addr; - rs->rs_conn_port = sin->sin_port; + default: + ret = -EAFNOSUPPORT; + break; + } -out: release_sock(sk); return ret; } @@ -578,8 +699,10 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len, list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { total++; if (total <= len) - rds_inc_info_copy(inc, iter, inc->i_saddr, - rs->rs_bound_addr, 1); + rds_inc_info_copy(inc, iter, + inc->i_saddr.s6_addr32[3], + rs->rs_bound_addr_v4, + 1); } read_unlock(&rs->rs_recv_lock); @@ -608,8 +731,8 @@ static void rds_sock_info(struct socket *sock, unsigned int len, list_for_each_entry(rs, &rds_sock_list, rs_item) { sinfo.sndbuf = rds_sk_sndbuf(rs); sinfo.rcvbuf = rds_sk_rcvbuf(rs); - sinfo.bound_addr = rs->rs_bound_addr; - sinfo.connected_addr = rs->rs_conn_addr; + sinfo.bound_addr = rs->rs_bound_addr_v4; + sinfo.connected_addr = rs->rs_conn_addr_v4; sinfo.bound_port = rs->rs_bound_port; sinfo.connected_port = rs->rs_conn_port; sinfo.inum = sock_i_ino(rds_rs_to_sk(rs)); diff --git a/net/rds/bind.c b/net/rds/bind.c index 5aa3a64aa4f0..ba778760cbc2 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -33,6 +33,7 @@ #include <linux/kernel.h> #include <net/sock.h> #include <linux/in.h> +#include <linux/ipv6.h> #include <linux/if_arp.h> #include <linux/jhash.h> #include <linux/ratelimit.h> @@ -42,42 +43,58 @@ static struct rhashtable bind_hash_table; static const struct rhashtable_params ht_parms = { .nelem_hint = 768, - .key_len = sizeof(u64), + .key_len = RDS_BOUND_KEY_LEN, .key_offset = offsetof(struct rds_sock, rs_bound_key), .head_offset = offsetof(struct rds_sock, rs_bound_node), .max_size = 16384, .min_size = 1024, }; +/* Create a key for the bind hash table manipulation. Port is in network byte + * order. + */ +static inline void __rds_create_bind_key(u8 *key, const struct in6_addr *addr, + __be16 port, __u32 scope_id) +{ + memcpy(key, addr, sizeof(*addr)); + key += sizeof(*addr); + memcpy(key, &port, sizeof(port)); + key += sizeof(port); + memcpy(key, &scope_id, sizeof(scope_id)); +} + /* * Return the rds_sock bound at the given local address. * * The rx path can race with rds_release. We notice if rds_release() has * marked this socket and don't return a rs ref to the rx path. 
*/ -struct rds_sock *rds_find_bound(__be32 addr, __be16 port) +struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port, + __u32 scope_id) { - u64 key = ((u64)addr << 32) | port; + u8 key[RDS_BOUND_KEY_LEN]; struct rds_sock *rs; - rs = rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms); + __rds_create_bind_key(key, addr, port, scope_id); + rs = rhashtable_lookup_fast(&bind_hash_table, key, ht_parms); if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) rds_sock_addref(rs); else rs = NULL; - rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, - ntohs(port)); + rdsdebug("returning rs %p for %pI6c:%u\n", rs, addr, + ntohs(port)); return rs; } /* returns -ve errno or +ve port */ -static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) +static int rds_add_bound(struct rds_sock *rs, const struct in6_addr *addr, + __be16 *port, __u32 scope_id) { int ret = -EADDRINUSE; u16 rover, last; - u64 key; + u8 key[RDS_BOUND_KEY_LEN]; if (*port != 0) { rover = be16_to_cpu(*port); @@ -95,12 +112,13 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) if (rover == RDS_FLAG_PROBE_PORT) continue; - key = ((u64)addr << 32) | cpu_to_be16(rover); - if (rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms)) + __rds_create_bind_key(key, addr, cpu_to_be16(rover), + scope_id); + if (rhashtable_lookup_fast(&bind_hash_table, key, ht_parms)) continue; - rs->rs_bound_key = key; - rs->rs_bound_addr = addr; + memcpy(rs->rs_bound_key, key, sizeof(rs->rs_bound_key)); + rs->rs_bound_addr = *addr; net_get_random_once(&rs->rs_hash_initval, sizeof(rs->rs_hash_initval)); rs->rs_bound_port = cpu_to_be16(rover); @@ -109,12 +127,13 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) if (!rhashtable_insert_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms)) { *port = rs->rs_bound_port; + rs->rs_bound_scope_id = scope_id; ret = 0; - rdsdebug("rs %p binding to %pI4:%d\n", - rs, &addr, (int)ntohs(*port)); + rdsdebug("rs %p 
binding to %pI6c:%d\n", + rs, addr, (int)ntohs(*port)); break; } else { - rs->rs_bound_addr = 0; + rs->rs_bound_addr = in6addr_any; rds_sock_put(rs); ret = -ENOMEM; break; @@ -127,44 +146,101 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) void rds_remove_bound(struct rds_sock *rs) { - if (!rs->rs_bound_addr) + if (ipv6_addr_any(&rs->rs_bound_addr)) return; - rdsdebug("rs %p unbinding from %pI4:%d\n", + rdsdebug("rs %p unbinding from %pI6c:%d\n", rs, &rs->rs_bound_addr, ntohs(rs->rs_bound_port)); rhashtable_remove_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms); rds_sock_put(rs); - rs->rs_bound_addr = 0; + rs->rs_bound_addr = in6addr_any; } int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; - struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; struct rds_sock *rs = rds_sk_to_rs(sk); + struct in6_addr v6addr, *binding_addr; struct rds_transport *trans; + __u32 scope_id = 0; + int addr_type; int ret = 0; + __be16 port; + + /* We allow an RDS socket to be bound to either IPv4 or IPv6 + * address. + */ + if (uaddr->sa_family == AF_INET) { + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + + if (addr_len < sizeof(struct sockaddr_in) || + sin->sin_addr.s_addr == htonl(INADDR_ANY) || + sin->sin_addr.s_addr == htonl(INADDR_BROADCAST) || + IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) + return -EINVAL; + ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &v6addr); + binding_addr = &v6addr; + port = sin->sin_port; + } else if (uaddr->sa_family == AF_INET6) { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)uaddr; + + if (addr_len < sizeof(struct sockaddr_in6)) + return -EINVAL; + addr_type = ipv6_addr_type(&sin6->sin6_addr); + if (!(addr_type & IPV6_ADDR_UNICAST)) { + __be32 addr4; + if (!(addr_type & IPV6_ADDR_MAPPED)) + return -EINVAL; + + /* It is a mapped address. Need to do some sanity + * checks. 
+ */ + addr4 = sin6->sin6_addr.s6_addr32[3]; + if (addr4 == htonl(INADDR_ANY) || + addr4 == htonl(INADDR_BROADCAST) || + IN_MULTICAST(ntohl(addr4))) + return -EINVAL; + } + /* The scope ID must be specified for link local address. */ + if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (sin6->sin6_scope_id == 0) + return -EINVAL; + scope_id = sin6->sin6_scope_id; + } + binding_addr = &sin6->sin6_addr; + port = sin6->sin6_port; + } else { + return -EINVAL; + } lock_sock(sk); - if (addr_len != sizeof(struct sockaddr_in) || - sin->sin_family != AF_INET || - rs->rs_bound_addr || - sin->sin_addr.s_addr == htonl(INADDR_ANY)) { + /* RDS socket does not allow re-binding. */ + if (!ipv6_addr_any(&rs->rs_bound_addr)) { + ret = -EINVAL; + goto out; + } + /* Socket is connected. The binding address should have the same + * scope ID as the connected address, except the case when one is + * non-link local address (scope_id is 0). + */ + if (!ipv6_addr_any(&rs->rs_conn_addr) && scope_id && + rs->rs_bound_scope_id && + scope_id != rs->rs_bound_scope_id) { ret = -EINVAL; goto out; } - ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port); + ret = rds_add_bound(rs, binding_addr, &port, scope_id); if (ret) goto out; if (rs->rs_transport) { /* previously bound */ trans = rs->rs_transport; if (trans->laddr_check(sock_net(sock->sk), - sin->sin_addr.s_addr) != 0) { + binding_addr, scope_id) != 0) { ret = -ENOPROTOOPT; rds_remove_bound(rs); } else { @@ -172,13 +248,13 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) } goto out; } - trans = rds_trans_get_preferred(sock_net(sock->sk), - sin->sin_addr.s_addr); + trans = rds_trans_get_preferred(sock_net(sock->sk), binding_addr, + scope_id); if (!trans) { ret = -EADDRNOTAVAIL; rds_remove_bound(rs); - pr_info_ratelimited("RDS: %s could not find a transport for %pI4, load rds_tcp or rds_rdma?\n", - __func__, &sin->sin_addr.s_addr); + pr_info_ratelimited("RDS: %s could not find a transport for %pI6c, load rds_tcp or 
rds_rdma?\n", + __func__, binding_addr); goto out; } diff --git a/net/rds/cong.c b/net/rds/cong.c index 63da9d2f142d..ccdff09a79c8 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007 Oracle. All rights reserved. + * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -101,7 +101,7 @@ static DEFINE_RWLOCK(rds_cong_monitor_lock); static DEFINE_SPINLOCK(rds_cong_lock); static struct rb_root rds_cong_tree = RB_ROOT; -static struct rds_cong_map *rds_cong_tree_walk(__be32 addr, +static struct rds_cong_map *rds_cong_tree_walk(const struct in6_addr *addr, struct rds_cong_map *insert) { struct rb_node **p = &rds_cong_tree.rb_node; @@ -109,12 +109,15 @@ static struct rds_cong_map *rds_cong_tree_walk(__be32 addr, struct rds_cong_map *map; while (*p) { + int diff; + parent = *p; map = rb_entry(parent, struct rds_cong_map, m_rb_node); - if (addr < map->m_addr) + diff = rds_addr_cmp(addr, &map->m_addr); + if (diff < 0) p = &(*p)->rb_left; - else if (addr > map->m_addr) + else if (diff > 0) p = &(*p)->rb_right; else return map; @@ -132,7 +135,7 @@ static struct rds_cong_map *rds_cong_tree_walk(__be32 addr, * these bitmaps in the process getting pointers to them. The bitmaps are only * ever freed as the module is removed after all connections have been freed. 
*/ -static struct rds_cong_map *rds_cong_from_addr(__be32 addr) +static struct rds_cong_map *rds_cong_from_addr(const struct in6_addr *addr) { struct rds_cong_map *map; struct rds_cong_map *ret = NULL; @@ -144,7 +147,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr) if (!map) return NULL; - map->m_addr = addr; + map->m_addr = *addr; init_waitqueue_head(&map->m_waitq); INIT_LIST_HEAD(&map->m_conn_list); @@ -171,7 +174,7 @@ out: kfree(map); } - rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr)); + rdsdebug("map %p for addr %pI6c\n", ret, addr); return ret; } @@ -202,8 +205,8 @@ void rds_cong_remove_conn(struct rds_connection *conn) int rds_cong_get_maps(struct rds_connection *conn) { - conn->c_lcong = rds_cong_from_addr(conn->c_laddr); - conn->c_fcong = rds_cong_from_addr(conn->c_faddr); + conn->c_lcong = rds_cong_from_addr(&conn->c_laddr); + conn->c_fcong = rds_cong_from_addr(&conn->c_faddr); if (!(conn->c_lcong && conn->c_fcong)) return -ENOMEM; @@ -353,7 +356,7 @@ void rds_cong_remove_socket(struct rds_sock *rs) /* update congestion map for now-closed port */ spin_lock_irqsave(&rds_cong_lock, flags); - map = rds_cong_tree_walk(rs->rs_bound_addr, NULL); + map = rds_cong_tree_walk(&rs->rs_bound_addr, NULL); spin_unlock_irqrestore(&rds_cong_lock, flags); if (map && rds_cong_test_bit(map, rs->rs_bound_port)) { diff --git a/net/rds/connection.c b/net/rds/connection.c index abef75da89a7..051e35c1e7c6 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -34,7 +34,9 @@ #include <linux/list.h> #include <linux/slab.h> #include <linux/export.h> -#include <net/inet_hashtables.h> +#include <net/ipv6.h> +#include <net/inet6_hashtables.h> +#include <net/addrconf.h> #include "rds.h" #include "loop.h" @@ -49,18 +51,21 @@ static unsigned long rds_conn_count; static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES]; static struct kmem_cache *rds_conn_slab; -static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) +static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr, + const struct in6_addr *faddr) { + static u32 rds6_hash_secret __read_mostly; static u32 rds_hash_secret __read_mostly; - unsigned long hash; + u32 lhash, fhash, hash; net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret)); + net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret)); + + lhash = (__force u32)laddr->s6_addr32[3]; + fhash = __ipv6_addr_jhash(faddr, rds6_hash_secret); + hash = __inet6_ehashfn(lhash, 0, fhash, 0, rds_hash_secret); - /* Pass NULL, don't need struct net for hash */ - hash = __inet_ehashfn(be32_to_cpu(laddr), 0, - be32_to_cpu(faddr), 0, - rds_hash_secret); return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK]; } @@ -72,20 +77,25 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) /* rcu read lock must be held or the connection spinlock */ static struct rds_connection *rds_conn_lookup(struct net *net, struct hlist_head *head, - __be32 laddr, __be32 faddr, - struct rds_transport *trans) + const struct in6_addr *laddr, + const struct in6_addr *faddr, + struct rds_transport *trans, + int dev_if) { struct rds_connection *conn, *ret = NULL; hlist_for_each_entry_rcu(conn, head, c_hash_node) { - if (conn->c_faddr == faddr && conn->c_laddr == laddr && - conn->c_trans == trans && net == rds_conn_net(conn)) { + if (ipv6_addr_equal(&conn->c_faddr, faddr) && + ipv6_addr_equal(&conn->c_laddr, laddr) && + 
conn->c_trans == trans && + net == rds_conn_net(conn) && + conn->c_dev_if == dev_if) { ret = conn; break; } } - rdsdebug("returning conn %p for %pI4 -> %pI4\n", ret, - &laddr, &faddr); + rdsdebug("returning conn %p for %pI6c -> %pI6c\n", ret, + laddr, faddr); return ret; } @@ -99,8 +109,8 @@ static void rds_conn_path_reset(struct rds_conn_path *cp) { struct rds_connection *conn = cp->cp_conn; - rdsdebug("connection %pI4 to %pI4 reset\n", - &conn->c_laddr, &conn->c_faddr); + rdsdebug("connection %pI6c to %pI6c reset\n", + &conn->c_laddr, &conn->c_faddr); rds_stats_inc(s_conn_reset); rds_send_path_reset(cp); @@ -142,9 +152,12 @@ static void __rds_conn_path_init(struct rds_connection *conn, * are torn down as the module is removed, if ever. */ static struct rds_connection *__rds_conn_create(struct net *net, - __be32 laddr, __be32 faddr, - struct rds_transport *trans, gfp_t gfp, - int is_outgoing) + const struct in6_addr *laddr, + const struct in6_addr *faddr, + struct rds_transport *trans, + gfp_t gfp, + int is_outgoing, + int dev_if) { struct rds_connection *conn, *parent = NULL; struct hlist_head *head = rds_conn_bucket(laddr, faddr); @@ -154,9 +167,12 @@ static struct rds_connection *__rds_conn_create(struct net *net, int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); rcu_read_lock(); - conn = rds_conn_lookup(net, head, laddr, faddr, trans); - if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && - laddr == faddr && !is_outgoing) { + conn = rds_conn_lookup(net, head, laddr, faddr, trans, dev_if); + if (conn && + conn->c_loopback && + conn->c_trans != &rds_loop_transport && + ipv6_addr_equal(laddr, faddr) && + !is_outgoing) { /* This is a looped back IB connection, and we're * called by the code handling the incoming connect. 
* We need a second connection object into which we @@ -181,8 +197,19 @@ static struct rds_connection *__rds_conn_create(struct net *net, } INIT_HLIST_NODE(&conn->c_hash_node); - conn->c_laddr = laddr; - conn->c_faddr = faddr; + conn->c_laddr = *laddr; + conn->c_isv6 = !ipv6_addr_v4mapped(laddr); + conn->c_faddr = *faddr; + conn->c_dev_if = dev_if; + /* If the local address is link local, set c_bound_if to be the + * index used for this connection. Otherwise, set it to 0 as + * the socket is not bound to an interface. c_bound_if is used + * to look up a socket when a packet is received + */ + if (ipv6_addr_type(laddr) & IPV6_ADDR_LINKLOCAL) + conn->c_bound_if = dev_if; + else + conn->c_bound_if = 0; rds_conn_net_set(conn, net); @@ -199,7 +226,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, * can bind to the destination address then we'd rather the messages * flow through loopback rather than either transport. */ - loop_trans = rds_trans_get_preferred(net, faddr); + loop_trans = rds_trans_get_preferred(net, faddr, conn->c_dev_if); if (loop_trans) { rds_trans_put(loop_trans); conn->c_loopback = 1; @@ -233,10 +260,10 @@ static struct rds_connection *__rds_conn_create(struct net *net, goto out; } - rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n", - conn, &laddr, &faddr, - strnlen(trans->t_name, sizeof(trans->t_name)) ? trans->t_name : - "[unknown]", is_outgoing ? "(outgoing)" : ""); + rdsdebug("allocated conn %p for %pI6c -> %pI6c over %s %s\n", + conn, laddr, faddr, + strnlen(trans->t_name, sizeof(trans->t_name)) ? + trans->t_name : "[unknown]", is_outgoing ? 
"(outgoing)" : ""); /* * Since we ran without holding the conn lock, someone could @@ -262,7 +289,8 @@ static struct rds_connection *__rds_conn_create(struct net *net, /* Creating normal conn */ struct rds_connection *found; - found = rds_conn_lookup(net, head, laddr, faddr, trans); + found = rds_conn_lookup(net, head, laddr, faddr, trans, + dev_if); if (found) { struct rds_conn_path *cp; int i; @@ -295,18 +323,22 @@ out: } struct rds_connection *rds_conn_create(struct net *net, - __be32 laddr, __be32 faddr, - struct rds_transport *trans, gfp_t gfp) + const struct in6_addr *laddr, + const struct in6_addr *faddr, + struct rds_transport *trans, gfp_t gfp, + int dev_if) { - return __rds_conn_create(net, laddr, faddr, trans, gfp, 0); + return __rds_conn_create(net, laddr, faddr, trans, gfp, 0, dev_if); } EXPORT_SYMBOL_GPL(rds_conn_create); struct rds_connection *rds_conn_create_outgoing(struct net *net, - __be32 laddr, __be32 faddr, - struct rds_transport *trans, gfp_t gfp) + const struct in6_addr *laddr, + const struct in6_addr *faddr, + struct rds_transport *trans, + gfp_t gfp, int dev_if) { - return __rds_conn_create(net, laddr, faddr, trans, gfp, 1); + return __rds_conn_create(net, laddr, faddr, trans, gfp, 1, dev_if); } EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); @@ -464,10 +496,21 @@ void rds_conn_destroy(struct rds_connection *conn) } EXPORT_SYMBOL_GPL(rds_conn_destroy); -static void rds_conn_message_info(struct socket *sock, unsigned int len, - struct rds_info_iterator *iter, - struct rds_info_lengths *lens, - int want_send) +static void __rds_inc_msg_cp(struct rds_incoming *inc, + struct rds_info_iterator *iter, + void *saddr, void *daddr, int flip, bool isv6) +{ + if (isv6) + rds6_inc_info_copy(inc, iter, saddr, daddr, flip); + else + rds_inc_info_copy(inc, iter, *(__be32 *)saddr, + *(__be32 *)daddr, flip); +} + +static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + 
int want_send, bool isv6) { struct hlist_head *head; struct list_head *list; @@ -478,7 +521,10 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, size_t i; int j; - len /= sizeof(struct rds_info_message); + if (isv6) + len /= sizeof(struct rds6_info_message); + else + len /= sizeof(struct rds_info_message); rcu_read_lock(); @@ -488,6 +534,9 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, struct rds_conn_path *cp; int npaths; + if (!isv6 && conn->c_isv6) + continue; + npaths = (conn->c_trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); @@ -504,11 +553,11 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, list_for_each_entry(rm, list, m_conn_item) { total++; if (total <= len) - rds_inc_info_copy(&rm->m_inc, - iter, - conn->c_laddr, - conn->c_faddr, - 0); + __rds_inc_msg_cp(&rm->m_inc, + iter, + &conn->c_laddr, + &conn->c_faddr, + 0, isv6); } spin_unlock_irqrestore(&cp->cp_lock, flags); @@ -518,7 +567,26 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, rcu_read_unlock(); lens->nr = total; - lens->each = sizeof(struct rds_info_message); + if (isv6) + lens->each = sizeof(struct rds6_info_message); + else + lens->each = sizeof(struct rds_info_message); +} + +static void rds_conn_message_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int want_send) +{ + rds_conn_message_info_cmn(sock, len, iter, lens, want_send, false); +} + +static void rds6_conn_message_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int want_send) +{ + rds_conn_message_info_cmn(sock, len, iter, lens, want_send, true); } static void rds_conn_message_info_send(struct socket *sock, unsigned int len, @@ -528,6 +596,13 @@ static void rds_conn_message_info_send(struct socket *sock, unsigned int len, rds_conn_message_info(sock, len, iter, lens, 1); } +static void 
rds6_conn_message_info_send(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds6_conn_message_info(sock, len, iter, lens, 1); +} + static void rds_conn_message_info_retrans(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, @@ -536,6 +611,14 @@ static void rds_conn_message_info_retrans(struct socket *sock, rds_conn_message_info(sock, len, iter, lens, 0); } +static void rds6_conn_message_info_retrans(struct socket *sock, + unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds6_conn_message_info(sock, len, iter, lens, 0); +} + void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, @@ -584,7 +667,6 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len, struct hlist_head *head; struct rds_connection *conn; size_t i; - int j; rcu_read_lock(); @@ -595,17 +677,20 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len, i++, head++) { hlist_for_each_entry_rcu(conn, head, c_hash_node) { struct rds_conn_path *cp; - int npaths; - npaths = (conn->c_trans->t_mp_capable ? - RDS_MPATH_WORKERS : 1); - for (j = 0; j < npaths; j++) { - cp = &conn->c_path[j]; + /* XXX We only copy the information from the first + * path for now. The problem is that if there are + * more than one underlying paths, we cannot report + * information of all of them using the existing + * API. For example, there is only one next_tx_seq, + * which path's next_tx_seq should we report? It is + * a bug in the design of MPRDS. + */ + cp = conn->c_path; - /* XXX no cp_lock usage.. */ - if (!visitor(cp, buffer)) - continue; - } + /* XXX no cp_lock usage.. 
*/ + if (!visitor(cp, buffer)) + continue; /* We copy as much as we can fit in the buffer, * but we count all items so that the caller @@ -624,12 +709,16 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len, static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) { struct rds_info_connection *cinfo = buffer; + struct rds_connection *conn = cp->cp_conn; + + if (conn->c_isv6) + return 0; cinfo->next_tx_seq = cp->cp_next_tx_seq; cinfo->next_rx_seq = cp->cp_next_rx_seq; - cinfo->laddr = cp->cp_conn->c_laddr; - cinfo->faddr = cp->cp_conn->c_faddr; - strncpy(cinfo->transport, cp->cp_conn->c_trans->t_name, + cinfo->laddr = conn->c_laddr.s6_addr32[3]; + cinfo->faddr = conn->c_faddr.s6_addr32[3]; + strncpy(cinfo->transport, conn->c_trans->t_name, sizeof(cinfo->transport)); cinfo->flags = 0; @@ -645,6 +734,34 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) return 1; } +static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer) +{ + struct rds6_info_connection *cinfo6 = buffer; + struct rds_connection *conn = cp->cp_conn; + + cinfo6->next_tx_seq = cp->cp_next_tx_seq; + cinfo6->next_rx_seq = cp->cp_next_rx_seq; + cinfo6->laddr = conn->c_laddr; + cinfo6->faddr = conn->c_faddr; + strncpy(cinfo6->transport, conn->c_trans->t_name, + sizeof(cinfo6->transport)); + cinfo6->flags = 0; + + rds_conn_info_set(cinfo6->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags), + SENDING); + /* XXX Future: return the state rather than these funky bits */ + rds_conn_info_set(cinfo6->flags, + atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING, + CONNECTING); + rds_conn_info_set(cinfo6->flags, + atomic_read(&cp->cp_state) == RDS_CONN_UP, + CONNECTED); + /* Just return 1 as there is no error case. This is a helper function + * for rds_walk_conn_path_info() and it wants a return value. 
+ */ + return 1; +} + static void rds_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) @@ -657,25 +774,51 @@ static void rds_conn_info(struct socket *sock, unsigned int len, sizeof(struct rds_info_connection)); } +static void rds6_conn_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + u64 buffer[(sizeof(struct rds6_info_connection) + 7) / 8]; + + rds_walk_conn_path_info(sock, len, iter, lens, + rds6_conn_info_visitor, + buffer, + sizeof(struct rds6_info_connection)); +} + int rds_conn_init(void) { + int ret; + + ret = rds_loop_net_init(); /* register pernet callback */ + if (ret) + return ret; + rds_conn_slab = kmem_cache_create("rds_connection", sizeof(struct rds_connection), 0, 0, NULL); - if (!rds_conn_slab) + if (!rds_conn_slab) { + rds_loop_net_exit(); return -ENOMEM; + } rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info); rds_info_register_func(RDS_INFO_SEND_MESSAGES, rds_conn_message_info_send); rds_info_register_func(RDS_INFO_RETRANS_MESSAGES, rds_conn_message_info_retrans); + rds_info_register_func(RDS6_INFO_CONNECTIONS, rds6_conn_info); + rds_info_register_func(RDS6_INFO_SEND_MESSAGES, + rds6_conn_message_info_send); + rds_info_register_func(RDS6_INFO_RETRANS_MESSAGES, + rds6_conn_message_info_retrans); return 0; } void rds_conn_exit(void) { + rds_loop_net_exit(); /* unregister pernet callback */ rds_loop_exit(); WARN_ON(!hlist_empty(rds_conn_hash)); @@ -687,6 +830,11 @@ void rds_conn_exit(void) rds_conn_message_info_send); rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES, rds_conn_message_info_retrans); + rds_info_deregister_func(RDS6_INFO_CONNECTIONS, rds6_conn_info); + rds_info_deregister_func(RDS6_INFO_SEND_MESSAGES, + rds6_conn_message_info_send); + rds_info_deregister_func(RDS6_INFO_RETRANS_MESSAGES, + rds6_conn_message_info_retrans); } /* diff --git a/net/rds/ib.c b/net/rds/ib.c index b6ad38e48f62..a4245c42d43b 
100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -39,6 +39,7 @@ #include <linux/delay.h> #include <linux/slab.h> #include <linux/module.h> +#include <net/addrconf.h> #include "rds_single_path.h" #include "rds.h" @@ -295,9 +296,11 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, /* We will only ever look at IB transports */ if (conn->c_trans != &rds_ib_transport) return 0; + if (conn->c_isv6) + return 0; - iinfo->src_addr = conn->c_laddr; - iinfo->dst_addr = conn->c_faddr; + iinfo->src_addr = conn->c_laddr.s6_addr32[3]; + iinfo->dst_addr = conn->c_faddr.s6_addr32[3]; memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); @@ -318,6 +321,43 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, return 1; } +/* IPv6 version of rds_ib_conn_info_visitor(). 
*/ +static int rds6_ib_conn_info_visitor(struct rds_connection *conn, + void *buffer) +{ + struct rds6_info_rdma_connection *iinfo6 = buffer; + struct rds_ib_connection *ic; + + /* We will only ever look at IB transports */ + if (conn->c_trans != &rds_ib_transport) + return 0; + + iinfo6->src_addr = conn->c_laddr; + iinfo6->dst_addr = conn->c_faddr; + + memset(&iinfo6->src_gid, 0, sizeof(iinfo6->src_gid)); + memset(&iinfo6->dst_gid, 0, sizeof(iinfo6->dst_gid)); + + if (rds_conn_state(conn) == RDS_CONN_UP) { + struct rds_ib_device *rds_ibdev; + struct rdma_dev_addr *dev_addr; + + ic = conn->c_transport_data; + dev_addr = &ic->i_cm_id->route.addr.dev_addr; + rdma_addr_get_sgid(dev_addr, + (union ib_gid *)&iinfo6->src_gid); + rdma_addr_get_dgid(dev_addr, + (union ib_gid *)&iinfo6->dst_gid); + + rds_ibdev = ic->rds_ibdev; + iinfo6->max_send_wr = ic->i_send_ring.w_nr; + iinfo6->max_recv_wr = ic->i_recv_ring.w_nr; + iinfo6->max_send_sge = rds_ibdev->max_sge; + rds6_ib_get_mr_info(rds_ibdev, iinfo6); + } + return 1; +} + static void rds_ib_ic_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) @@ -330,6 +370,18 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len, sizeof(struct rds_info_rdma_connection)); } +/* IPv6 version of rds_ib_ic_info(). */ +static void rds6_ib_ic_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + u64 buffer[(sizeof(struct rds6_info_rdma_connection) + 7) / 8]; + + rds_for_each_conn_info(sock, len, iter, lens, + rds6_ib_conn_info_visitor, + buffer, + sizeof(struct rds6_info_rdma_connection)); +} /* * Early RDS/IB was built to only bind to an address if there is an IPoIB @@ -341,12 +393,17 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len, * allowed to influence which paths have priority. We could call userspace * asserting this policy "routing". 
*/ -static int rds_ib_laddr_check(struct net *net, __be32 addr) +static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr, + __u32 scope_id) { int ret; struct rdma_cm_id *cm_id; + struct sockaddr_in6 sin6; struct sockaddr_in sin; + struct sockaddr *sa; + bool isv4; + isv4 = ipv6_addr_v4mapped(addr); /* Create a CMA ID and try to bind it. This catches both * IB and iWARP capable NICs. */ @@ -355,21 +412,54 @@ static int rds_ib_laddr_check(struct net *net, __be32 addr) if (IS_ERR(cm_id)) return PTR_ERR(cm_id); - memset(&sin, 0, sizeof(sin)); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = addr; + if (isv4) { + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr->s6_addr32[3]; + sa = (struct sockaddr *)&sin; + } else { + memset(&sin6, 0, sizeof(sin6)); + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = *addr; + sin6.sin6_scope_id = scope_id; + sa = (struct sockaddr *)&sin6; + + /* XXX Do a special IPv6 link local address check here. The + * reason is that rdma_bind_addr() always succeeds with IPv6 + * link local address regardless it is indeed configured in a + * system. + */ + if (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL) { + struct net_device *dev; + + if (scope_id == 0) + return -EADDRNOTAVAIL; + + /* Use init_net for now as RDS is not network + * name space aware. + */ + dev = dev_get_by_index(&init_net, scope_id); + if (!dev) + return -EADDRNOTAVAIL; + if (!ipv6_chk_addr(&init_net, addr, dev, 1)) { + dev_put(dev); + return -EADDRNOTAVAIL; + } + dev_put(dev); + } + } /* rdma_bind_addr will only succeed for IB & iWARP devices */ - ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); + ret = rdma_bind_addr(cm_id, sa); /* due to this, we will claim to support iWARP devices unless we check node_type. */ if (ret || !cm_id->device || cm_id->device->node_type != RDMA_NODE_IB_CA) ret = -EADDRNOTAVAIL; - rdsdebug("addr %pI4 ret %d node type %d\n", - &addr, ret, - cm_id->device ? 
cm_id->device->node_type : -1); + rdsdebug("addr %pI6c%%%u ret %d node type %d\n", + addr, scope_id, ret, + cm_id->device ? cm_id->device->node_type : -1); rdma_destroy_id(cm_id); @@ -401,6 +491,7 @@ void rds_ib_exit(void) rds_ib_set_unloading(); synchronize_rcu(); rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); + rds_info_deregister_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info); rds_ib_unregister_client(); rds_ib_destroy_nodev_conns(); rds_ib_sysctl_exit(); @@ -462,6 +553,7 @@ int rds_ib_init(void) rds_trans_register(&rds_ib_transport); rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); + rds_info_register_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info); goto out; @@ -476,4 +568,3 @@ out: } MODULE_LICENSE("GPL"); - diff --git a/net/rds/ib.h b/net/rds/ib.h index a6f4d7d68e95..beb95b893f78 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -57,16 +57,44 @@ struct rds_ib_refill_cache { struct list_head *ready; }; +/* This is the common structure for the IB private data exchange in setting up + * an RDS connection. The exchange is different for IPv4 and IPv6 connections. + * The reason is that the address size is different and the addresses + * exchanged are in the beginning of the structure. Hence it is not possible + * for interoperability if same structure is used. + */ +struct rds_ib_conn_priv_cmn { + u8 ricpc_protocol_major; + u8 ricpc_protocol_minor; + __be16 ricpc_protocol_minor_mask; /* bitmask */ + __be32 ricpc_reserved1; + __be64 ricpc_ack_seq; + __be32 ricpc_credit; /* non-zero enables flow ctl */ +}; + struct rds_ib_connect_private { /* Add new fields at the end, and don't permute existing fields. 
*/ - __be32 dp_saddr; - __be32 dp_daddr; - u8 dp_protocol_major; - u8 dp_protocol_minor; - __be16 dp_protocol_minor_mask; /* bitmask */ - __be32 dp_reserved1; - __be64 dp_ack_seq; - __be32 dp_credit; /* non-zero enables flow ctl */ + __be32 dp_saddr; + __be32 dp_daddr; + struct rds_ib_conn_priv_cmn dp_cmn; +}; + +struct rds6_ib_connect_private { + /* Add new fields at the end, and don't permute existing fields. */ + struct in6_addr dp_saddr; + struct in6_addr dp_daddr; + struct rds_ib_conn_priv_cmn dp_cmn; +}; + +#define dp_protocol_major dp_cmn.ricpc_protocol_major +#define dp_protocol_minor dp_cmn.ricpc_protocol_minor +#define dp_protocol_minor_mask dp_cmn.ricpc_protocol_minor_mask +#define dp_ack_seq dp_cmn.ricpc_ack_seq +#define dp_credit dp_cmn.ricpc_credit + +union rds_ib_conn_priv { + struct rds_ib_connect_private ricp_v4; + struct rds6_ib_connect_private ricp_v6; }; struct rds_ib_send_work { @@ -351,8 +379,8 @@ void rds_ib_listen_stop(void); __printf(2, 3) void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, - struct rdma_cm_event *event); -int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id); + struct rdma_cm_event *event, bool isv6); +int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6); void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event); @@ -361,7 +389,8 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt) /* ib_rdma.c */ -int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); +int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, + struct in6_addr *ipaddr); void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); void rds_ib_destroy_nodev_conns(void); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 
f1684ae6abfd..a33b82dc0804 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -35,6 +35,7 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/ratelimit.h> +#include <net/addrconf.h> #include "rds_single_path.h" #include "rds.h" @@ -95,25 +96,45 @@ rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr) */ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event) { - const struct rds_ib_connect_private *dp = NULL; struct rds_ib_connection *ic = conn->c_transport_data; + const union rds_ib_conn_priv *dp = NULL; struct ib_qp_attr qp_attr; + __be64 ack_seq = 0; + __be32 credit = 0; + u8 major = 0; + u8 minor = 0; int err; - if (event->param.conn.private_data_len >= sizeof(*dp)) { - dp = event->param.conn.private_data; - - /* make sure it isn't empty data */ - if (dp->dp_protocol_major) { - rds_ib_set_protocol(conn, - RDS_PROTOCOL(dp->dp_protocol_major, - dp->dp_protocol_minor)); - rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + dp = event->param.conn.private_data; + if (conn->c_isv6) { + if (event->param.conn.private_data_len >= + sizeof(struct rds6_ib_connect_private)) { + major = dp->ricp_v6.dp_protocol_major; + minor = dp->ricp_v6.dp_protocol_minor; + credit = dp->ricp_v6.dp_credit; + /* dp structure start is not guaranteed to be 8 bytes + * aligned. Since dp_ack_seq is 64-bit extended load + * operations can be used so go through get_unaligned + * to avoid unaligned errors. 
+ */ + ack_seq = get_unaligned(&dp->ricp_v6.dp_ack_seq); } + } else if (event->param.conn.private_data_len >= + sizeof(struct rds_ib_connect_private)) { + major = dp->ricp_v4.dp_protocol_major; + minor = dp->ricp_v4.dp_protocol_minor; + credit = dp->ricp_v4.dp_credit; + ack_seq = get_unaligned(&dp->ricp_v4.dp_ack_seq); + } + + /* make sure it isn't empty data */ + if (major) { + rds_ib_set_protocol(conn, RDS_PROTOCOL(major, minor)); + rds_ib_set_flow_control(conn, be32_to_cpu(credit)); } if (conn->c_version < RDS_PROTOCOL(3, 1)) { - pr_notice("RDS/IB: Connection <%pI4,%pI4> version %u.%u no longer supported\n", + pr_notice("RDS/IB: Connection <%pI6c,%pI6c> version %u.%u no longer supported\n", &conn->c_laddr, &conn->c_faddr, RDS_PROTOCOL_MAJOR(conn->c_version), RDS_PROTOCOL_MINOR(conn->c_version)); @@ -121,7 +142,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even rds_conn_destroy(conn); return; } else { - pr_notice("RDS/IB: %s conn connected <%pI4,%pI4> version %u.%u%s\n", + pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c> version %u.%u%s\n", ic->i_active_side ? "Active" : "Passive", &conn->c_laddr, &conn->c_faddr, RDS_PROTOCOL_MAJOR(conn->c_version), @@ -150,7 +171,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err); /* update ib_device with this local ipaddr */ - err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr); + err = rds_ib_update_ipaddr(ic->rds_ibdev, &conn->c_laddr); if (err) printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err); @@ -158,14 +179,8 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even /* If the peer gave us the last packet it saw, process this as if * we had received a regular ACK. */ if (dp) { - /* dp structure start is not guaranteed to be 8 bytes aligned. 
- * Since dp_ack_seq is 64-bit extended load operations can be - * used so go through get_unaligned to avoid unaligned errors. - */ - __be64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq); - - if (dp_ack_seq) - rds_send_drop_acked(conn, be64_to_cpu(dp_ack_seq), + if (ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(ack_seq), NULL); } @@ -173,11 +188,12 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even } static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, - struct rdma_conn_param *conn_param, - struct rds_ib_connect_private *dp, - u32 protocol_version, - u32 max_responder_resources, - u32 max_initiator_depth) + struct rdma_conn_param *conn_param, + union rds_ib_conn_priv *dp, + u32 protocol_version, + u32 max_responder_resources, + u32 max_initiator_depth, + bool isv6) { struct rds_ib_connection *ic = conn->c_transport_data; struct rds_ib_device *rds_ibdev = ic->rds_ibdev; @@ -193,24 +209,49 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, if (dp) { memset(dp, 0, sizeof(*dp)); - dp->dp_saddr = conn->c_laddr; - dp->dp_daddr = conn->c_faddr; - dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); - dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); - dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); - dp->dp_ack_seq = cpu_to_be64(rds_ib_piggyb_ack(ic)); + if (isv6) { + dp->ricp_v6.dp_saddr = conn->c_laddr; + dp->ricp_v6.dp_daddr = conn->c_faddr; + dp->ricp_v6.dp_protocol_major = + RDS_PROTOCOL_MAJOR(protocol_version); + dp->ricp_v6.dp_protocol_minor = + RDS_PROTOCOL_MINOR(protocol_version); + dp->ricp_v6.dp_protocol_minor_mask = + cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); + dp->ricp_v6.dp_ack_seq = + cpu_to_be64(rds_ib_piggyb_ack(ic)); + + conn_param->private_data = &dp->ricp_v6; + conn_param->private_data_len = sizeof(dp->ricp_v6); + } else { + dp->ricp_v4.dp_saddr = conn->c_laddr.s6_addr32[3]; + dp->ricp_v4.dp_daddr = conn->c_faddr.s6_addr32[3]; + 
dp->ricp_v4.dp_protocol_major = + RDS_PROTOCOL_MAJOR(protocol_version); + dp->ricp_v4.dp_protocol_minor = + RDS_PROTOCOL_MINOR(protocol_version); + dp->ricp_v4.dp_protocol_minor_mask = + cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); + dp->ricp_v4.dp_ack_seq = + cpu_to_be64(rds_ib_piggyb_ack(ic)); + + conn_param->private_data = &dp->ricp_v4; + conn_param->private_data_len = sizeof(dp->ricp_v4); + } /* Advertise flow control */ if (ic->i_flowctl) { unsigned int credits; - credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)); - dp->dp_credit = cpu_to_be32(credits); - atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits); + credits = IB_GET_POST_CREDITS + (atomic_read(&ic->i_credits)); + if (isv6) + dp->ricp_v6.dp_credit = cpu_to_be32(credits); + else + dp->ricp_v4.dp_credit = cpu_to_be32(credits); + atomic_sub(IB_SET_POST_CREDITS(credits), + &ic->i_credits); } - - conn_param->private_data = dp; - conn_param->private_data_len = sizeof(*dp); } } @@ -349,7 +390,7 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data) break; default: rdsdebug("Fatal QP Event %u (%s) " - "- connection %pI4->%pI4, reconnecting\n", + "- connection %pI6c->%pI6c, reconnecting\n", event->event, ib_event_msg(event->event), &conn->c_laddr, &conn->c_faddr); rds_conn_drop(conn); @@ -580,11 +621,13 @@ out: return ret; } -static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event) +static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6) { - const struct rds_ib_connect_private *dp = event->param.conn.private_data; - u16 common; + const union rds_ib_conn_priv *dp = event->param.conn.private_data; + u8 data_len, major, minor; u32 version = 0; + __be16 mask; + u16 common; /* * rdma_cm private data is odd - when there is any private data in the @@ -603,51 +646,133 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event) return 0; } + if (isv6) { + data_len = sizeof(struct rds6_ib_connect_private); + major = dp->ricp_v6.dp_protocol_major; + 
minor = dp->ricp_v6.dp_protocol_minor; + mask = dp->ricp_v6.dp_protocol_minor_mask; + } else { + data_len = sizeof(struct rds_ib_connect_private); + major = dp->ricp_v4.dp_protocol_major; + minor = dp->ricp_v4.dp_protocol_minor; + mask = dp->ricp_v4.dp_protocol_minor_mask; + } + /* Even if len is crap *now* I still want to check it. -ASG */ - if (event->param.conn.private_data_len < sizeof (*dp) || - dp->dp_protocol_major == 0) + if (event->param.conn.private_data_len < data_len || major == 0) return RDS_PROTOCOL_3_0; - common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS; - if (dp->dp_protocol_major == 3 && common) { + common = be16_to_cpu(mask) & RDS_IB_SUPPORTED_PROTOCOLS; + if (major == 3 && common) { version = RDS_PROTOCOL_3_0; while ((common >>= 1) != 0) version++; - } else - printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n", - &dp->dp_saddr, - dp->dp_protocol_major, - dp->dp_protocol_minor); + } else { + if (isv6) + printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI6c using incompatible protocol version %u.%u\n", + &dp->ricp_v6.dp_saddr, major, minor); + else + printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n", + &dp->ricp_v4.dp_saddr, major, minor); + } return version; } +/* Given an IPv6 address, find the net_device which hosts that address and + * return its index. This is used by the rds_ib_cm_handle_connect() code to + * find the interface index of where an incoming request comes from when + * the request is using a link local address. + * + * Note one problem in this search. It is possible that two interfaces have + * the same link local address. Unfortunately, this cannot be solved unless + * the underlying layer gives us the interface which an incoming RDMA connect + * request comes from. 
+ */ +static u32 __rds_find_ifindex(struct net *net, const struct in6_addr *addr) +{ + struct net_device *dev; + int idx = 0; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + if (ipv6_chk_addr(net, addr, dev, 1)) { + idx = dev->ifindex; + break; + } + } + rcu_read_unlock(); + + return idx; +} + int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, - struct rdma_cm_event *event) + struct rdma_cm_event *event, bool isv6) { __be64 lguid = cm_id->route.path_rec->sgid.global.interface_id; __be64 fguid = cm_id->route.path_rec->dgid.global.interface_id; - const struct rds_ib_connect_private *dp = event->param.conn.private_data; - struct rds_ib_connect_private dp_rep; + const struct rds_ib_conn_priv_cmn *dp_cmn; struct rds_connection *conn = NULL; struct rds_ib_connection *ic = NULL; struct rdma_conn_param conn_param; + const union rds_ib_conn_priv *dp; + union rds_ib_conn_priv dp_rep; + struct in6_addr s_mapped_addr; + struct in6_addr d_mapped_addr; + const struct in6_addr *saddr6; + const struct in6_addr *daddr6; + int destroy = 1; + u32 ifindex = 0; u32 version; - int err = 1, destroy = 1; + int err = 1; /* Check whether the remote protocol version matches ours. */ - version = rds_ib_protocol_compatible(event); + version = rds_ib_protocol_compatible(event, isv6); if (!version) goto out; - rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid " - "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr, + dp = event->param.conn.private_data; + if (isv6) { + dp_cmn = &dp->ricp_v6.dp_cmn; + saddr6 = &dp->ricp_v6.dp_saddr; + daddr6 = &dp->ricp_v6.dp_daddr; + /* If either address is link local, need to find the + * interface index in order to create a proper RDS + * connection. + */ + if (ipv6_addr_type(daddr6) & IPV6_ADDR_LINKLOCAL) { + /* Using init_net for now .. */ + ifindex = __rds_find_ifindex(&init_net, daddr6); + /* No index found... Need to bail out. 
*/ + if (ifindex == 0) { + err = -EOPNOTSUPP; + goto out; + } + } else if (ipv6_addr_type(saddr6) & IPV6_ADDR_LINKLOCAL) { + /* Use our address to find the correct index. */ + ifindex = __rds_find_ifindex(&init_net, daddr6); + /* No index found... Need to bail out. */ + if (ifindex == 0) { + err = -EOPNOTSUPP; + goto out; + } + } + } else { + dp_cmn = &dp->ricp_v4.dp_cmn; + ipv6_addr_set_v4mapped(dp->ricp_v4.dp_saddr, &s_mapped_addr); + ipv6_addr_set_v4mapped(dp->ricp_v4.dp_daddr, &d_mapped_addr); + saddr6 = &s_mapped_addr; + daddr6 = &d_mapped_addr; + } + + rdsdebug("saddr %pI6c daddr %pI6c RDSv%u.%u lguid 0x%llx fguid " + "0x%llx\n", saddr6, daddr6, RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version), (unsigned long long)be64_to_cpu(lguid), (unsigned long long)be64_to_cpu(fguid)); /* RDS/IB is not currently netns aware, thus init_net */ - conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr, - &rds_ib_transport, GFP_KERNEL); + conn = rds_conn_create(&init_net, daddr6, saddr6, + &rds_ib_transport, GFP_KERNEL, ifindex); if (IS_ERR(conn)) { rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); conn = NULL; @@ -678,12 +803,13 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, ic = conn->c_transport_data; rds_ib_set_protocol(conn, version); - rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + rds_ib_set_flow_control(conn, be32_to_cpu(dp_cmn->ricpc_credit)); /* If the peer gave us the last packet it saw, process this as if * we had received a regular ACK. 
*/ - if (dp->dp_ack_seq) - rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); + if (dp_cmn->ricpc_ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(dp_cmn->ricpc_ack_seq), + NULL); BUG_ON(cm_id->context); BUG_ON(ic->i_cm_id); @@ -702,8 +828,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, } rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version, - event->param.conn.responder_resources, - event->param.conn.initiator_depth); + event->param.conn.responder_resources, + event->param.conn.initiator_depth, isv6); /* rdma_accept() calls rdma_reject() internally if it fails */ if (rdma_accept(cm_id, &conn_param)) @@ -718,12 +844,12 @@ out: } -int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) +int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6) { struct rds_connection *conn = cm_id->context; struct rds_ib_connection *ic = conn->c_transport_data; struct rdma_conn_param conn_param; - struct rds_ib_connect_private dp; + union rds_ib_conn_priv dp; int ret; /* If the peer doesn't do protocol negotiation, we must @@ -738,7 +864,7 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) } rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION, - UINT_MAX, UINT_MAX); + UINT_MAX, UINT_MAX, isv6); ret = rdma_connect(cm_id, &conn_param); if (ret) rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret); @@ -758,13 +884,20 @@ out: int rds_ib_conn_path_connect(struct rds_conn_path *cp) { struct rds_connection *conn = cp->cp_conn; - struct rds_ib_connection *ic = conn->c_transport_data; - struct sockaddr_in src, dest; + struct sockaddr_storage src, dest; + rdma_cm_event_handler handler; + struct rds_ib_connection *ic; int ret; + ic = conn->c_transport_data; + /* XXX I wonder what affect the port space has */ /* delegate cm event handler to rdma_transport */ - ic->i_cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, conn, + if (conn->c_isv6) + handler = rds6_rdma_cm_event_handler; + else + handler = 
rds_rdma_cm_event_handler; + ic->i_cm_id = rdma_create_id(&init_net, handler, conn, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(ic->i_cm_id)) { ret = PTR_ERR(ic->i_cm_id); @@ -775,13 +908,33 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp) rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn); - src.sin_family = AF_INET; - src.sin_addr.s_addr = (__force u32)conn->c_laddr; - src.sin_port = (__force u16)htons(0); + if (ipv6_addr_v4mapped(&conn->c_faddr)) { + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)&src; + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = conn->c_laddr.s6_addr32[3]; + sin->sin_port = 0; - dest.sin_family = AF_INET; - dest.sin_addr.s_addr = (__force u32)conn->c_faddr; - dest.sin_port = (__force u16)htons(RDS_PORT); + sin = (struct sockaddr_in *)&dest; + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = conn->c_faddr.s6_addr32[3]; + sin->sin_port = htons(RDS_PORT); + } else { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&src; + sin6->sin6_family = AF_INET6; + sin6->sin6_addr = conn->c_laddr; + sin6->sin6_port = 0; + sin6->sin6_scope_id = conn->c_dev_if; + + sin6 = (struct sockaddr_in6 *)&dest; + sin6->sin6_family = AF_INET6; + sin6->sin6_addr = conn->c_faddr; + sin6->sin6_port = htons(RDS_CM_PORT); + sin6->sin6_scope_id = conn->c_dev_if; + } ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, (struct sockaddr *)&dest, diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h index 0ea4ab017a8c..f440ace584c8 100644 --- a/net/rds/ib_mr.h +++ b/net/rds/ib_mr.h @@ -113,6 +113,8 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev, int npages); void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); +void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev, + struct rds6_info_rdma_connection *iinfo6); void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, struct rds_sock *rs, u32 
*key_ret); diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index e678699268a2..e3c8bbbdb43f 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -100,18 +100,19 @@ static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) kfree_rcu(to_free, rcu); } -int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) +int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, + struct in6_addr *ipaddr) { struct rds_ib_device *rds_ibdev_old; - rds_ibdev_old = rds_ib_get_device(ipaddr); + rds_ibdev_old = rds_ib_get_device(ipaddr->s6_addr32[3]); if (!rds_ibdev_old) - return rds_ib_add_ipaddr(rds_ibdev, ipaddr); + return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]); if (rds_ibdev_old != rds_ibdev) { - rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); + rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr->s6_addr32[3]); rds_ib_dev_put(rds_ibdev_old); - return rds_ib_add_ipaddr(rds_ibdev, ipaddr); + return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]); } rds_ib_dev_put(rds_ibdev_old); @@ -179,6 +180,15 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages; } +void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev, + struct rds6_info_rdma_connection *iinfo6) +{ + struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool; + + iinfo6->rdma_mr_max = pool_1m->max_items; + iinfo6->rdma_mr_size = pool_1m->fmr_attr.max_pages; +} + struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool) { struct rds_ib_mr *ibmr = NULL; @@ -544,7 +554,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, struct rds_ib_connection *ic = rs->rs_conn->c_transport_data; int ret; - rds_ibdev 
= rds_ib_get_device(rs->rs_bound_addr); + rds_ibdev = rds_ib_get_device(rs->rs_bound_addr.s6_addr32[3]); if (!rds_ibdev) { ret = -ENODEV; goto out; diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index b4e421aa9727..557ccbb1ce00 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -266,7 +266,7 @@ static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *i rds_ib_stats_inc(s_ib_rx_total_incs); } INIT_LIST_HEAD(&ibinc->ii_frags); - rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr); + rds_inc_init(&ibinc->ii_inc, ic->conn, &ic->conn->c_faddr); return ibinc; } @@ -376,8 +376,6 @@ static void release_refill(struct rds_connection *conn) * This tries to allocate and post unused work requests after making sure that * they have all the allocations they need to queue received fragments into * sockets. - * - * -1 is returned if posting fails due to temporary resource exhaustion. 
*/ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) { @@ -420,7 +418,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); if (ret) { rds_ib_conn_error(conn, "recv post on " - "%pI4 returned %d, disconnecting and " + "%pI6c returned %d, disconnecting and " "reconnecting\n", &conn->c_faddr, ret); break; @@ -850,7 +848,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, if (data_len < sizeof(struct rds_header)) { rds_ib_conn_error(conn, "incoming message " - "from %pI4 didn't include a " + "from %pI6c didn't include a " "header, disconnecting and " "reconnecting\n", &conn->c_faddr); @@ -863,7 +861,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, /* Validate the checksum. */ if (!rds_message_verify_checksum(ihdr)) { rds_ib_conn_error(conn, "incoming message " - "from %pI4 has corrupted header - " + "from %pI6c has corrupted header - " "forcing a reconnect\n", &conn->c_faddr); rds_stats_inc(s_recv_drop_bad_checksum); @@ -943,10 +941,10 @@ static void rds_ib_process_recv(struct rds_connection *conn, ic->i_recv_data_rem = 0; ic->i_ibinc = NULL; - if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) + if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) { rds_ib_cong_recv(conn, ibinc); - else { - rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, + } else { + rds_recv_incoming(conn, &conn->c_faddr, &conn->c_laddr, &ibinc->ii_inc, GFP_ATOMIC); state->ack_next = be64_to_cpu(hdr->h_sequence); state->ack_next_valid = 1; @@ -990,7 +988,7 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, } else { /* We expect errors as the qp is drained during shutdown */ if (rds_conn_up(conn) || rds_conn_connecting(conn)) - rds_ib_conn_error(conn, "recv completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n", + rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c> had status %u (%s), disconnecting 
and reconnecting\n", &conn->c_laddr, &conn->c_faddr, wc->status, ib_wc_status_msg(wc->status)); @@ -1025,7 +1023,6 @@ int rds_ib_recv_path(struct rds_conn_path *cp) { struct rds_connection *conn = cp->cp_conn; struct rds_ib_connection *ic = conn->c_transport_data; - int ret = 0; rdsdebug("conn %p\n", conn); if (rds_conn_up(conn)) { @@ -1034,7 +1031,7 @@ int rds_ib_recv_path(struct rds_conn_path *cp) rds_ib_stats_inc(s_ib_rx_refill_from_thread); } - return ret; + return 0; } int rds_ib_recv_init(void) diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 8557a1cae041..c4cdfe491d96 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -305,7 +305,7 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) /* We expect errors as the qp is drained during shutdown */ if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) { - rds_ib_conn_error(conn, "send completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n", + rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n", &conn->c_laddr, &conn->c_faddr, wc->status, ib_wc_status_msg(wc->status)); } @@ -730,7 +730,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, first, &first->s_wr, ret, failed_wr); BUG_ON(failed_wr != &first->s_wr); if (ret) { - printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " + printk(KERN_WARNING "RDS/IB: ib_post_send to %pI6c " "returned %d\n", &conn->c_faddr, ret); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); rds_ib_sub_signaled(ic, nr_sig); @@ -827,7 +827,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) send, &send->s_atomic_wr, ret, failed_wr); 
BUG_ON(failed_wr != &send->s_atomic_wr.wr); if (ret) { - printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 " + printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI6c " "returned %d\n", &conn->c_faddr, ret); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); rds_ib_sub_signaled(ic, nr_sig); @@ -967,7 +967,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) first, &first->s_rdma_wr.wr, ret, failed_wr); BUG_ON(failed_wr != &first->s_rdma_wr.wr); if (ret) { - printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " + printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI6c " "returned %d\n", &conn->c_faddr, ret); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); rds_ib_sub_signaled(ic, nr_sig); diff --git a/net/rds/loop.c b/net/rds/loop.c index dac6218a460e..1d73ad79c847 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -33,6 +33,9 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/in.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <linux/ipv6.h> #include "rds_single_path.h" #include "rds.h" @@ -40,6 +43,17 @@ static DEFINE_SPINLOCK(loop_conns_lock); static LIST_HEAD(loop_conns); +static atomic_t rds_loop_unloading = ATOMIC_INIT(0); + +static void rds_loop_set_unloading(void) +{ + atomic_set(&rds_loop_unloading, 1); +} + +static bool rds_loop_is_unloading(struct rds_connection *conn) +{ + return atomic_read(&rds_loop_unloading) != 0; +} /* * This 'loopback' transport is a special case for flows that originate @@ -75,11 +89,11 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, BUG_ON(hdr_off || sg || off); - rds_inc_init(&rm->m_inc, conn, conn->c_laddr); + rds_inc_init(&rm->m_inc, conn, &conn->c_laddr); /* For the embedded inc. Matching put is in loop_inc_free() */ rds_message_addref(rm); - rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc, + rds_recv_incoming(conn, &conn->c_laddr, &conn->c_faddr, &rm->m_inc, GFP_KERNEL); rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence), @@ -165,6 +179,8 @@ void rds_loop_exit(void) struct rds_loop_connection *lc, *_lc; LIST_HEAD(tmp_list); + rds_loop_set_unloading(); + synchronize_rcu(); /* avoid calling conn_destroy with irqs off */ spin_lock_irq(&loop_conns_lock); list_splice(&loop_conns, &tmp_list); @@ -177,6 +193,46 @@ void rds_loop_exit(void) } } +static void rds_loop_kill_conns(struct net *net) +{ + struct rds_loop_connection *lc, *_lc; + LIST_HEAD(tmp_list); + + spin_lock_irq(&loop_conns_lock); + list_for_each_entry_safe(lc, _lc, &loop_conns, loop_node) { + struct net *c_net = read_pnet(&lc->conn->c_net); + + if (net != c_net) + continue; + list_move_tail(&lc->loop_node, &tmp_list); + } + spin_unlock_irq(&loop_conns_lock); + + list_for_each_entry_safe(lc, _lc, &tmp_list, 
loop_node) { + WARN_ON(lc->conn->c_passive); + rds_conn_destroy(lc->conn); + } +} + +static void __net_exit rds_loop_exit_net(struct net *net) +{ + rds_loop_kill_conns(net); +} + +static struct pernet_operations rds_loop_net_ops = { + .exit = rds_loop_exit_net, +}; + +int rds_loop_net_init(void) +{ + return register_pernet_device(&rds_loop_net_ops); +} + +void rds_loop_net_exit(void) +{ + unregister_pernet_device(&rds_loop_net_ops); +} + /* * This is missing .xmit_* because loop doesn't go through generic * rds_send_xmit() and doesn't call rds_recv_incoming(). .listen_stop and @@ -194,4 +250,5 @@ struct rds_transport rds_loop_transport = { .inc_free = rds_loop_inc_free, .t_name = "loopback", .t_type = RDS_TRANS_LOOP, + .t_unloading = rds_loop_is_unloading, }; diff --git a/net/rds/loop.h b/net/rds/loop.h index 469fa4b2da4f..bbc8cdd030df 100644 --- a/net/rds/loop.h +++ b/net/rds/loop.h @@ -5,6 +5,8 @@ /* loop.c */ extern struct rds_transport rds_loop_transport; +int rds_loop_net_init(void); +void rds_loop_net_exit(void); void rds_loop_exit(void); #endif diff --git a/net/rds/message.c b/net/rds/message.c index a35f76971984..4b00b1152a5f 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -514,4 +514,3 @@ void rds_message_unmapped(struct rds_message *rm) wake_up_interruptible(&rm->m_flush_wait); } EXPORT_SYMBOL_GPL(rds_message_unmapped); - diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 634cfcb7bba6..7b3998026825 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007 Oracle. All rights reserved. + * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -183,7 +183,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, long i; int ret; - if (rs->rs_bound_addr == 0 || !rs->rs_transport) { + if (ipv6_addr_any(&rs->rs_bound_addr) || !rs->rs_transport) { ret = -ENOTCONN; /* XXX not a great errno */ goto out; } @@ -574,7 +574,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, args = CMSG_DATA(cmsg); - if (rs->rs_bound_addr == 0) { + if (ipv6_addr_any(&rs->rs_bound_addr)) { ret = -ENOTCONN; /* XXX not a great errno */ goto out_ret; } diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index fc59821f0a27..ad78929036ef 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 Oracle. All rights reserved. + * Copyright (c) 2009, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -37,10 +37,13 @@ #include "rdma_transport.h" #include "ib.h" +/* Global IPv4 and IPv6 RDS RDMA listener cm_id */ static struct rdma_cm_id *rds_rdma_listen_id; +static struct rdma_cm_id *rds6_rdma_listen_id; -int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, - struct rdma_cm_event *event) +static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event, + bool isv6) { /* this can be null in the listening path */ struct rds_connection *conn = cm_id->context; @@ -72,7 +75,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, switch (event->event) { case RDMA_CM_EVENT_CONNECT_REQUEST: - ret = trans->cm_handle_connect(cm_id, event); + ret = trans->cm_handle_connect(cm_id, event, isv6); break; case RDMA_CM_EVENT_ADDR_RESOLVED: @@ -90,7 +93,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, ibic = conn->c_transport_data; if (ibic && ibic->i_cm_id == cm_id) - ret = 
trans->cm_initiate_connect(cm_id); + ret = trans->cm_initiate_connect(cm_id, isv6); else rds_conn_drop(conn); } @@ -116,14 +119,14 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, case RDMA_CM_EVENT_DISCONNECTED: rdsdebug("DISCONNECT event - dropping connection " - "%pI4->%pI4\n", &conn->c_laddr, + "%pI6c->%pI6c\n", &conn->c_laddr, &conn->c_faddr); rds_conn_drop(conn); break; case RDMA_CM_EVENT_TIMEWAIT_EXIT: if (conn) { - pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI4->%pI4\n", + pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI6c->%pI6c\n", &conn->c_laddr, &conn->c_faddr); rds_conn_drop(conn); } @@ -146,13 +149,26 @@ out: return ret; } -static int rds_rdma_listen_init(void) +int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + return rds_rdma_cm_event_handler_cmn(cm_id, event, false); +} + +int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + return rds_rdma_cm_event_handler_cmn(cm_id, event, true); +} + +static int rds_rdma_listen_init_common(rdma_cm_event_handler handler, + struct sockaddr *sa, + struct rdma_cm_id **ret_cm_id) { - struct sockaddr_in sin; struct rdma_cm_id *cm_id; int ret; - cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, NULL, + cm_id = rdma_create_id(&init_net, handler, NULL, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(cm_id)) { ret = PTR_ERR(cm_id); @@ -161,15 +177,11 @@ static int rds_rdma_listen_init(void) return ret; } - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); - sin.sin_port = (__force u16)htons(RDS_PORT); - /* * XXX I bet this binds the cm_id to a device. If we want to support * fail-over we'll have to take this into consideration. 
*/ - ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); + ret = rdma_bind_addr(cm_id, sa); if (ret) { printk(KERN_ERR "RDS/RDMA: failed to setup listener, " "rdma_bind_addr() returned %d\n", ret); @@ -185,7 +197,7 @@ static int rds_rdma_listen_init(void) rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT); - rds_rdma_listen_id = cm_id; + *ret_cm_id = cm_id; cm_id = NULL; out: if (cm_id) @@ -193,6 +205,41 @@ out: return ret; } +/* Initialize the RDS RDMA listeners. We create two listeners for + * compatibility reason. The one on RDS_PORT is used for IPv4 + * requests only. The one on RDS_CM_PORT is used for IPv6 requests + * only. So only IPv6 enabled RDS module will communicate using this + * port. + */ +static int rds_rdma_listen_init(void) +{ + int ret; + struct sockaddr_in6 sin6; + struct sockaddr_in sin; + + sin.sin_family = PF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(RDS_PORT); + ret = rds_rdma_listen_init_common(rds_rdma_cm_event_handler, + (struct sockaddr *)&sin, + &rds_rdma_listen_id); + if (ret != 0) + return ret; + + sin6.sin6_family = PF_INET6; + sin6.sin6_addr = in6addr_any; + sin6.sin6_port = htons(RDS_CM_PORT); + sin6.sin6_scope_id = 0; + sin6.sin6_flowinfo = 0; + ret = rds_rdma_listen_init_common(rds6_rdma_cm_event_handler, + (struct sockaddr *)&sin6, + &rds6_rdma_listen_id); + /* Keep going even when IPv6 is not enabled in the system. 
*/ + if (ret != 0) + rdsdebug("Cannot set up IPv6 RDMA listener\n"); + return 0; +} + static void rds_rdma_listen_stop(void) { if (rds_rdma_listen_id) { @@ -200,6 +247,11 @@ static void rds_rdma_listen_stop(void) rdma_destroy_id(rds_rdma_listen_id); rds_rdma_listen_id = NULL; } + if (rds6_rdma_listen_id) { + rdsdebug("cm %p\n", rds6_rdma_listen_id); + rdma_destroy_id(rds6_rdma_listen_id); + rds6_rdma_listen_id = NULL; + } } static int rds_rdma_init(void) @@ -229,4 +281,3 @@ module_exit(rds_rdma_exit); MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>"); MODULE_DESCRIPTION("RDS: IB transport"); MODULE_LICENSE("Dual BSD/GPL"); - diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h index d309c4430124..200d3134aaae 100644 --- a/net/rds/rdma_transport.h +++ b/net/rds/rdma_transport.h @@ -6,11 +6,16 @@ #include <rdma/rdma_cm.h> #include "rds.h" +/* RDMA_CM also uses 16385 as the listener port. */ +#define RDS_CM_PORT 16385 + #define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000 int rds_rdma_conn_connect(struct rds_connection *conn); int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event); +int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); /* from ib.c */ extern struct rds_transport rds_ib_transport; diff --git a/net/rds/rds.h b/net/rds/rds.h index f2272fb8cd45..ff537bb11411 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -10,6 +10,7 @@ #include <linux/rds.h> #include <linux/rhashtable.h> #include <linux/refcount.h> +#include <linux/in6.h> #include "info.h" @@ -23,11 +24,13 @@ #define RDS_PROTOCOL_MINOR(v) ((v) & 255) #define RDS_PROTOCOL(maj, min) (((maj) << 8) | min) -/* - * XXX randomly chosen, but at least seems to be unused: - * # 18464-18768 Unassigned - * We should do better. We want a reserved port to discourage unpriv'ed - * userspace from listening. +/* The following ports, 16385, 18634, 18635, are registered with IANA as + * the ports to be used for RDS over TCP and UDP. 
Currently, only RDS over + * TCP and RDS over IB/RDMA are implemented. 18634 is the historical value + * used for the RDMA_CM listener port. RDS/TCP uses port 16385. After + * IPv6 work, RDMA_CM also uses 16385 as the listener port. 18634 is kept + * to ensure compatibility with older RDS modules. Those ports are defined + * in each transport's header file. */ #define RDS_PORT 18634 @@ -61,7 +64,7 @@ void rdsdebug(char *fmt, ...) struct rds_cong_map { struct rb_node m_rb_node; - __be32 m_addr; + struct in6_addr m_addr; wait_queue_head_t m_waitq; struct list_head m_conn_list; unsigned long m_page_addrs[RDS_CONG_MAP_PAGES]; @@ -136,11 +139,14 @@ struct rds_conn_path { /* One rds_connection per RDS address pair */ struct rds_connection { struct hlist_node c_hash_node; - __be32 c_laddr; - __be32 c_faddr; + struct in6_addr c_laddr; + struct in6_addr c_faddr; + int c_dev_if; /* ifindex used for this conn */ + int c_bound_if; /* ifindex of c_laddr */ unsigned int c_loopback:1, + c_isv6:1, c_ping_triggered:1, - c_pad_to_32:30; + c_pad_to_32:29; int c_npaths; struct rds_connection *c_passive; struct rds_transport *c_trans; @@ -269,7 +275,7 @@ struct rds_incoming { struct rds_conn_path *i_conn_path; struct rds_header i_hdr; unsigned long i_rx_jiffies; - __be32 i_saddr; + struct in6_addr i_saddr; rds_rdma_cookie_t i_rdma_cookie; struct timeval i_rx_tstamp; @@ -386,7 +392,7 @@ struct rds_message { struct list_head m_conn_item; struct rds_incoming m_inc; u64 m_ack_seq; - __be32 m_daddr; + struct in6_addr m_daddr; unsigned long m_flags; /* Never access m_rs without holding m_rs_lock. 
@@ -519,7 +525,8 @@ struct rds_transport { t_mp_capable:1; unsigned int t_type; - int (*laddr_check)(struct net *net, __be32 addr); + int (*laddr_check)(struct net *net, const struct in6_addr *addr, + __u32 scope_id); int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); void (*conn_free)(void *data); int (*conn_path_connect)(struct rds_conn_path *cp); @@ -535,8 +542,8 @@ struct rds_transport { void (*inc_free)(struct rds_incoming *inc); int (*cm_handle_connect)(struct rdma_cm_id *cm_id, - struct rdma_cm_event *event); - int (*cm_initiate_connect)(struct rdma_cm_id *cm_id); + struct rdma_cm_event *event, bool isv6); + int (*cm_initiate_connect)(struct rdma_cm_id *cm_id, bool isv6); void (*cm_connect_complete)(struct rds_connection *conn, struct rdma_cm_event *event); @@ -551,6 +558,12 @@ struct rds_transport { bool (*t_unloading)(struct rds_connection *conn); }; +/* Bind hash table key length. It is the sum of the size of a struct + * in6_addr, a scope_id and a port. + */ +#define RDS_BOUND_KEY_LEN \ + (sizeof(struct in6_addr) + sizeof(__u32) + sizeof(__be16)) + struct rds_sock { struct sock rs_sk; @@ -562,10 +575,14 @@ struct rds_sock { * support. 
*/ struct rhash_head rs_bound_node; - u64 rs_bound_key; - __be32 rs_bound_addr; - __be32 rs_conn_addr; - __be16 rs_bound_port; + u8 rs_bound_key[RDS_BOUND_KEY_LEN]; + struct sockaddr_in6 rs_bound_sin6; +#define rs_bound_addr rs_bound_sin6.sin6_addr +#define rs_bound_addr_v4 rs_bound_sin6.sin6_addr.s6_addr32[3] +#define rs_bound_port rs_bound_sin6.sin6_port +#define rs_bound_scope_id rs_bound_sin6.sin6_scope_id + struct in6_addr rs_conn_addr; +#define rs_conn_addr_v4 rs_conn_addr.s6_addr32[3] __be16 rs_conn_port; struct rds_transport *rs_transport; @@ -701,7 +718,8 @@ extern wait_queue_head_t rds_poll_waitq; /* bind.c */ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); void rds_remove_bound(struct rds_sock *rs); -struct rds_sock *rds_find_bound(__be32 addr, __be16 port); +struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port, + __u32 scope_id); int rds_bind_lock_init(void); void rds_bind_lock_destroy(void); @@ -720,16 +738,20 @@ void rds_cong_remove_socket(struct rds_sock *); void rds_cong_exit(void); struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); -/* conn.c */ +/* connection.c */ extern u32 rds_gen_num; int rds_conn_init(void); void rds_conn_exit(void); struct rds_connection *rds_conn_create(struct net *net, - __be32 laddr, __be32 faddr, - struct rds_transport *trans, gfp_t gfp); + const struct in6_addr *laddr, + const struct in6_addr *faddr, + struct rds_transport *trans, gfp_t gfp, + int dev_if); struct rds_connection *rds_conn_create_outgoing(struct net *net, - __be32 laddr, __be32 faddr, - struct rds_transport *trans, gfp_t gfp); + const struct in6_addr *laddr, + const struct in6_addr *faddr, + struct rds_transport *trans, + gfp_t gfp, int dev_if); void rds_conn_shutdown(struct rds_conn_path *cpath); void rds_conn_destroy(struct rds_connection *conn); void rds_conn_drop(struct rds_connection *conn); @@ -840,11 +862,12 @@ void rds_page_exit(void); /* recv.c */ void rds_inc_init(struct 
rds_incoming *inc, struct rds_connection *conn, - __be32 saddr); + struct in6_addr *saddr); void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *conn, - __be32 saddr); + struct in6_addr *saddr); void rds_inc_put(struct rds_incoming *inc); -void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, +void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr, + struct in6_addr *daddr, struct rds_incoming *inc, gfp_t gfp); int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int msg_flags); @@ -853,13 +876,17 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg); void rds_inc_info_copy(struct rds_incoming *inc, struct rds_info_iterator *iter, __be32 saddr, __be32 daddr, int flip); +void rds6_inc_info_copy(struct rds_incoming *inc, + struct rds_info_iterator *iter, + struct in6_addr *saddr, struct in6_addr *daddr, + int flip); /* send.c */ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len); void rds_send_path_reset(struct rds_conn_path *conn); int rds_send_xmit(struct rds_conn_path *cp); struct sockaddr_in; -void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); +void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest); typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); void rds_send_drop_acked(struct rds_connection *conn, u64 ack, is_acked_func is_acked); @@ -946,11 +973,14 @@ void rds_send_worker(struct work_struct *); void rds_recv_worker(struct work_struct *); void rds_connect_path_complete(struct rds_conn_path *conn, int curr); void rds_connect_complete(struct rds_connection *conn); +int rds_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2); /* transport.c */ void rds_trans_register(struct rds_transport *trans); void rds_trans_unregister(struct rds_transport *trans); -struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr); +struct rds_transport *rds_trans_get_preferred(struct 
net *net, + const struct in6_addr *addr, + __u32 scope_id); void rds_trans_put(struct rds_transport *trans); unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); diff --git a/net/rds/recv.c b/net/rds/recv.c index 192ac6f78ded..03cd8df54c26 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -41,14 +41,14 @@ #include "rds.h" void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, - __be32 saddr) + struct in6_addr *saddr) { int i; refcount_set(&inc->i_refcount, 1); INIT_LIST_HEAD(&inc->i_item); inc->i_conn = conn; - inc->i_saddr = saddr; + inc->i_saddr = *saddr; inc->i_rdma_cookie = 0; inc->i_rx_tstamp.tv_sec = 0; inc->i_rx_tstamp.tv_usec = 0; @@ -59,13 +59,13 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, EXPORT_SYMBOL_GPL(rds_inc_init); void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp, - __be32 saddr) + struct in6_addr *saddr) { refcount_set(&inc->i_refcount, 1); INIT_LIST_HEAD(&inc->i_item); inc->i_conn = cp->cp_conn; inc->i_conn_path = cp; - inc->i_saddr = saddr; + inc->i_saddr = *saddr; inc->i_rdma_cookie = 0; inc->i_rx_tstamp.tv_sec = 0; inc->i_rx_tstamp.tv_usec = 0; @@ -110,7 +110,7 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk, now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs); - rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d " + rdsdebug("rs %p (%pI6c:%u) recv bytes %d buf %d " "now_cong %d delta %d\n", rs, &rs->rs_bound_addr, ntohs(rs->rs_bound_port), rs->rs_rcv_bytes, @@ -260,7 +260,7 @@ static void rds_start_mprds(struct rds_connection *conn) struct rds_conn_path *cp; if (conn->c_npaths > 1 && - IS_CANONICAL(conn->c_laddr, 
conn->c_faddr)) { + rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) < 0) { for (i = 0; i < conn->c_npaths; i++) { cp = &conn->c_path[i]; rds_conn_path_connect_if_down(cp); @@ -284,7 +284,8 @@ static void rds_start_mprds(struct rds_connection *conn) * conn. This lets loopback, who only has one conn for both directions, * tell us which roles the addrs in the conn are playing for this message. */ -void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, +void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr, + struct in6_addr *daddr, struct rds_incoming *inc, gfp_t gfp) { struct rds_sock *rs = NULL; @@ -339,7 +340,8 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) { if (inc->i_hdr.h_sport == 0) { - rdsdebug("ignore ping with 0 sport from 0x%x\n", saddr); + rdsdebug("ignore ping with 0 sport from %pI6c\n", + saddr); goto out; } rds_stats_inc(s_recv_ping); @@ -362,7 +364,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, goto out; } - rs = rds_find_bound(daddr, inc->i_hdr.h_dport); + rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_bound_if); if (!rs) { rds_stats_inc(s_recv_drop_no_sock); goto out; @@ -625,6 +627,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, struct rds_sock *rs = rds_sk_to_rs(sk); long timeo; int ret = 0, nonblock = msg_flags & MSG_DONTWAIT; + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct rds_incoming *inc = NULL; @@ -673,7 +676,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, break; } - rdsdebug("copying inc %p from %pI4:%u to user\n", inc, + rdsdebug("copying inc %p from %pI6c:%u to user\n", inc, &inc->i_conn->c_faddr, ntohs(inc->i_hdr.h_sport)); ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter); @@ -707,12 +710,26 @@ int 
rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, rds_stats_inc(s_recv_delivered); - if (sin) { - sin->sin_family = AF_INET; - sin->sin_port = inc->i_hdr.h_sport; - sin->sin_addr.s_addr = inc->i_saddr; - memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); - msg->msg_namelen = sizeof(*sin); + if (msg->msg_name) { + if (ipv6_addr_v4mapped(&inc->i_saddr)) { + sin = (struct sockaddr_in *)msg->msg_name; + + sin->sin_family = AF_INET; + sin->sin_port = inc->i_hdr.h_sport; + sin->sin_addr.s_addr = + inc->i_saddr.s6_addr32[3]; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + msg->msg_namelen = sizeof(*sin); + } else { + sin6 = (struct sockaddr_in6 *)msg->msg_name; + + sin6->sin6_family = AF_INET6; + sin6->sin6_port = inc->i_hdr.h_sport; + sin6->sin6_addr = inc->i_saddr; + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = rs->rs_bound_scope_id; + msg->msg_namelen = sizeof(*sin6); + } } break; } @@ -775,3 +792,28 @@ void rds_inc_info_copy(struct rds_incoming *inc, rds_info_copy(iter, &minfo, sizeof(minfo)); } + +void rds6_inc_info_copy(struct rds_incoming *inc, + struct rds_info_iterator *iter, + struct in6_addr *saddr, struct in6_addr *daddr, + int flip) +{ + struct rds6_info_message minfo6; + + minfo6.seq = be64_to_cpu(inc->i_hdr.h_sequence); + minfo6.len = be32_to_cpu(inc->i_hdr.h_len); + + if (flip) { + minfo6.laddr = *daddr; + minfo6.faddr = *saddr; + minfo6.lport = inc->i_hdr.h_dport; + minfo6.fport = inc->i_hdr.h_sport; + } else { + minfo6.laddr = *saddr; + minfo6.faddr = *daddr; + minfo6.lport = inc->i_hdr.h_sport; + minfo6.fport = inc->i_hdr.h_dport; + } + + rds_info_copy(iter, &minfo6, sizeof(minfo6)); +} diff --git a/net/rds/send.c b/net/rds/send.c index 94c7f74909be..18e2b4d3931f 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -709,7 +709,7 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack, } EXPORT_SYMBOL_GPL(rds_send_drop_acked); -void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) +void 
rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest) { struct rds_message *rm, *tmp; struct rds_connection *conn; @@ -721,8 +721,9 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) spin_lock_irqsave(&rs->rs_lock, flags); list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) { - if (dest && (dest->sin_addr.s_addr != rm->m_daddr || - dest->sin_port != rm->m_inc.i_hdr.h_dport)) + if (dest && + (!ipv6_addr_equal(&dest->sin6_addr, &rm->m_daddr) || + dest->sin6_port != rm->m_inc.i_hdr.h_dport)) continue; list_move(&rm->m_sock_item, &list); @@ -1059,8 +1060,8 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) { struct sock *sk = sock->sk; struct rds_sock *rs = rds_sk_to_rs(sk); + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); - __be32 daddr; __be16 dport; struct rds_message *rm = NULL; struct rds_connection *conn; @@ -1069,10 +1070,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) int nonblock = msg->msg_flags & MSG_DONTWAIT; long timeo = sock_sndtimeo(sk, nonblock); struct rds_conn_path *cpath; + struct in6_addr daddr; + __u32 scope_id = 0; size_t total_payload_len = payload_len, rdma_payload_len = 0; bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) && sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY)); int num_sgs = ceil(payload_len, PAGE_SIZE); + int namelen; /* Mirror Linux UDP mirror of BSD error message compatibility */ /* XXX: Perhaps MSG_MORE someday */ @@ -1081,27 +1085,106 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) goto out; } - if (msg->msg_namelen) { - /* XXX fail non-unicast destination IPs? 
*/ - if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) { + namelen = msg->msg_namelen; + if (namelen != 0) { + if (namelen < sizeof(*usin)) { + ret = -EINVAL; + goto out; + } + switch (usin->sin_family) { + case AF_INET: + if (usin->sin_addr.s_addr == htonl(INADDR_ANY) || + usin->sin_addr.s_addr == htonl(INADDR_BROADCAST) || + IN_MULTICAST(ntohl(usin->sin_addr.s_addr))) { + ret = -EINVAL; + goto out; + } + ipv6_addr_set_v4mapped(usin->sin_addr.s_addr, &daddr); + dport = usin->sin_port; + break; + + case AF_INET6: { + int addr_type; + + if (namelen < sizeof(*sin6)) { + ret = -EINVAL; + goto out; + } + addr_type = ipv6_addr_type(&sin6->sin6_addr); + if (!(addr_type & IPV6_ADDR_UNICAST)) { + __be32 addr4; + + if (!(addr_type & IPV6_ADDR_MAPPED)) { + ret = -EINVAL; + goto out; + } + + /* It is a mapped address. Need to do some + * sanity checks. + */ + addr4 = sin6->sin6_addr.s6_addr32[3]; + if (addr4 == htonl(INADDR_ANY) || + addr4 == htonl(INADDR_BROADCAST) || + IN_MULTICAST(ntohl(addr4))) { + ret = -EINVAL; + goto out; + } + } + if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (sin6->sin6_scope_id == 0) { + ret = -EINVAL; + goto out; + } + scope_id = sin6->sin6_scope_id; + } + + daddr = sin6->sin6_addr; + dport = sin6->sin6_port; + break; + } + + default: ret = -EINVAL; goto out; } - daddr = usin->sin_addr.s_addr; - dport = usin->sin_port; } else { /* We only care about consistency with ->connect() */ lock_sock(sk); daddr = rs->rs_conn_addr; dport = rs->rs_conn_port; + scope_id = rs->rs_bound_scope_id; release_sock(sk); } lock_sock(sk); - if (daddr == 0 || rs->rs_bound_addr == 0) { + if (ipv6_addr_any(&rs->rs_bound_addr) || ipv6_addr_any(&daddr)) { release_sock(sk); - ret = -ENOTCONN; /* XXX not a great errno */ + ret = -ENOTCONN; goto out; + } else if (namelen != 0) { + /* Cannot send to an IPv4 address using an IPv6 source + * address and cannot send to an IPv6 address using an + * IPv4 source address. 
+ */ + if (ipv6_addr_v4mapped(&daddr) ^ + ipv6_addr_v4mapped(&rs->rs_bound_addr)) { + release_sock(sk); + ret = -EOPNOTSUPP; + goto out; + } + /* If the socket is already bound to a link local address, + * it can only send to peers on the same link. But allow + * communicating between link local and non-link local address. + */ + if (scope_id != rs->rs_bound_scope_id) { + if (!scope_id) { + scope_id = rs->rs_bound_scope_id; + } else if (rs->rs_bound_scope_id) { + release_sock(sk); + ret = -EINVAL; + goto out; + } + } } release_sock(sk); @@ -1155,13 +1238,14 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) /* rds_conn_create has a spinlock that runs with IRQ off. * Caching the conn in the socket helps a lot. */ - if (rs->rs_conn && rs->rs_conn->c_faddr == daddr) + if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr)) conn = rs->rs_conn; else { conn = rds_conn_create_outgoing(sock_net(sock->sk), - rs->rs_bound_addr, daddr, - rs->rs_transport, - sock->sk->sk_allocation); + &rs->rs_bound_addr, &daddr, + rs->rs_transport, + sock->sk->sk_allocation, + scope_id); if (IS_ERR(conn)) { ret = PTR_ERR(conn); goto out; diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 351a28474667..f23925af0b8d 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -37,6 +37,8 @@ #include <net/tcp.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/tcp.h> +#include <net/addrconf.h> #include "rds.h" #include "tcp.h" @@ -44,7 +46,12 @@ /* only for info exporting */ static DEFINE_SPINLOCK(rds_tcp_tc_list_lock); static LIST_HEAD(rds_tcp_tc_list); + +/* rds_tcp_tc_count counts only IPv4 connections. 
+ * rds6_tcp_tc_count counts both IPv4 and IPv6 connections. + */ static unsigned int rds_tcp_tc_count; +static unsigned int rds6_tcp_tc_count; /* Track rds_tcp_connection structs so they can be cleaned up */ static DEFINE_SPINLOCK(rds_tcp_conn_lock); @@ -111,7 +118,9 @@ void rds_tcp_restore_callbacks(struct socket *sock, /* done under the callback_lock to serialize with write_space */ spin_lock(&rds_tcp_tc_list_lock); list_del_init(&tc->t_list_item); - rds_tcp_tc_count--; + rds6_tcp_tc_count--; + if (!tc->t_cpath->cp_conn->c_isv6) + rds_tcp_tc_count--; spin_unlock(&rds_tcp_tc_list_lock); tc->t_sock = NULL; @@ -198,7 +207,9 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp) /* done under the callback_lock to serialize with write_space */ spin_lock(&rds_tcp_tc_list_lock); list_add_tail(&tc->t_list_item, &rds_tcp_tc_list); - rds_tcp_tc_count++; + rds6_tcp_tc_count++; + if (!tc->t_cpath->cp_conn->c_isv6) + rds_tcp_tc_count++; spin_unlock(&rds_tcp_tc_list_lock); /* accepted sockets need our listen data ready undone */ @@ -219,6 +230,9 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp) write_unlock_bh(&sock->sk->sk_callback_lock); } +/* Handle RDS_INFO_TCP_SOCKETS socket option. It only returns IPv4 + * connections for backward compatibility. 
+ */ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) @@ -226,8 +240,6 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len, struct rds_info_tcp_socket tsinfo; struct rds_tcp_connection *tc; unsigned long flags; - struct sockaddr_in sin; - struct socket *sock; spin_lock_irqsave(&rds_tcp_tc_list_lock, flags); @@ -235,16 +247,15 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len, goto out; list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) { + struct inet_sock *inet = inet_sk(tc->t_sock->sk); - sock = tc->t_sock; - if (sock) { - sock->ops->getname(sock, (struct sockaddr *)&sin, 0); - tsinfo.local_addr = sin.sin_addr.s_addr; - tsinfo.local_port = sin.sin_port; - sock->ops->getname(sock, (struct sockaddr *)&sin, 1); - tsinfo.peer_addr = sin.sin_addr.s_addr; - tsinfo.peer_port = sin.sin_port; - } + if (tc->t_cpath->cp_conn->c_isv6) + continue; + + tsinfo.local_addr = inet->inet_saddr; + tsinfo.local_port = inet->inet_sport; + tsinfo.peer_addr = inet->inet_daddr; + tsinfo.peer_port = inet->inet_dport; tsinfo.hdr_rem = tc->t_tinc_hdr_rem; tsinfo.data_rem = tc->t_tinc_data_rem; @@ -262,9 +273,75 @@ out: spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags); } -static int rds_tcp_laddr_check(struct net *net, __be32 addr) +/* Handle RDS6_INFO_TCP_SOCKETS socket option. It returns both IPv4 and + * IPv6 connections. IPv4 connection address is returned in an IPv4 mapped + * address. 
+ */ +static void rds6_tcp_tc_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds6_info_tcp_socket tsinfo6; + struct rds_tcp_connection *tc; + unsigned long flags; + + spin_lock_irqsave(&rds_tcp_tc_list_lock, flags); + + if (len / sizeof(tsinfo6) < rds6_tcp_tc_count) + goto out; + + list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) { + struct sock *sk = tc->t_sock->sk; + struct inet_sock *inet = inet_sk(sk); + + tsinfo6.local_addr = sk->sk_v6_rcv_saddr; + tsinfo6.local_port = inet->inet_sport; + tsinfo6.peer_addr = sk->sk_v6_daddr; + tsinfo6.peer_port = inet->inet_dport; + + tsinfo6.hdr_rem = tc->t_tinc_hdr_rem; + tsinfo6.data_rem = tc->t_tinc_data_rem; + tsinfo6.last_sent_nxt = tc->t_last_sent_nxt; + tsinfo6.last_expected_una = tc->t_last_expected_una; + tsinfo6.last_seen_una = tc->t_last_seen_una; + + rds_info_copy(iter, &tsinfo6, sizeof(tsinfo6)); + } + +out: + lens->nr = rds6_tcp_tc_count; + lens->each = sizeof(tsinfo6); + + spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags); +} + +static int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr, + __u32 scope_id) { - if (inet_addr_type(net, addr) == RTN_LOCAL) + struct net_device *dev = NULL; + int ret; + + if (ipv6_addr_v4mapped(addr)) { + if (inet_addr_type(net, addr->s6_addr32[3]) == RTN_LOCAL) + return 0; + return -EADDRNOTAVAIL; + } + + /* If the scope_id is specified, check only those addresses + * hosted on the specified interface. + */ + if (scope_id != 0) { + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, scope_id); + /* scope_id is not valid... 
*/ + if (!dev) { + rcu_read_unlock(); + return -EADDRNOTAVAIL; + } + rcu_read_unlock(); + } + ret = ipv6_chk_addr(net, addr, dev, 0); + if (ret) return 0; return -EADDRNOTAVAIL; } @@ -468,13 +545,18 @@ static __net_init int rds_tcp_init_net(struct net *net) err = -ENOMEM; goto fail; } - rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net); + rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, true); if (!rtn->rds_tcp_listen_sock) { - pr_warn("could not set up listen sock\n"); - unregister_net_sysctl_table(rtn->rds_tcp_sysctl); - rtn->rds_tcp_sysctl = NULL; - err = -EAFNOSUPPORT; - goto fail; + pr_warn("could not set up IPv6 listen sock\n"); + + /* Try IPv4 as some systems disable IPv6 */ + rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, false); + if (!rtn->rds_tcp_listen_sock) { + unregister_net_sysctl_table(rtn->rds_tcp_sysctl); + rtn->rds_tcp_sysctl = NULL; + err = -EAFNOSUPPORT; + goto fail; + } } INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker); return 0; @@ -588,6 +670,7 @@ static void rds_tcp_exit(void) rds_tcp_set_unloading(); synchronize_rcu(); rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); + rds_info_deregister_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info); unregister_pernet_device(&rds_tcp_net_ops); rds_tcp_destroy_conns(); rds_trans_unregister(&rds_tcp_transport); @@ -619,6 +702,7 @@ static int rds_tcp_init(void) rds_trans_register(&rds_tcp_transport); rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); + rds_info_register_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info); goto out; out_recv: @@ -633,4 +717,3 @@ module_init(rds_tcp_init); MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>"); MODULE_DESCRIPTION("RDS: TCP transport"); MODULE_LICENSE("Dual BSD/GPL"); - diff --git a/net/rds/tcp.h b/net/rds/tcp.h index c6fa080e9b6d..3c69361d21c7 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -67,7 +67,7 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *conn); void rds_tcp_state_change(struct sock *sk); 
/* tcp_listen.c */ -struct socket *rds_tcp_listen_init(struct net *); +struct socket *rds_tcp_listen_init(struct net *net, bool isv6); void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor); void rds_tcp_listen_data_ready(struct sock *sk); int rds_tcp_accept_one(struct socket *sock); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index d999e7075645..008f50fb25dd 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -66,7 +66,8 @@ void rds_tcp_state_change(struct sock *sk) * RDS connection as RDS_CONN_UP until the reconnect, * to avoid RDS datagram loss. */ - if (!IS_CANONICAL(cp->cp_conn->c_laddr, cp->cp_conn->c_faddr) && + if (rds_addr_cmp(&cp->cp_conn->c_laddr, + &cp->cp_conn->c_faddr) >= 0 && rds_conn_path_transition(cp, RDS_CONN_CONNECTING, RDS_CONN_ERROR)) { rds_conn_path_drop(cp, false); @@ -88,7 +89,11 @@ out: int rds_tcp_conn_path_connect(struct rds_conn_path *cp) { struct socket *sock = NULL; - struct sockaddr_in src, dest; + struct sockaddr_in6 sin6; + struct sockaddr_in sin; + struct sockaddr *addr; + int addrlen; + bool isv6; int ret; struct rds_connection *conn = cp->cp_conn; struct rds_tcp_connection *tc = cp->cp_transport_data; @@ -105,37 +110,68 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) mutex_unlock(&tc->t_conn_path_lock); return 0; } - ret = sock_create_kern(rds_conn_net(conn), PF_INET, - SOCK_STREAM, IPPROTO_TCP, &sock); + if (ipv6_addr_v4mapped(&conn->c_laddr)) { + ret = sock_create_kern(rds_conn_net(conn), PF_INET, + SOCK_STREAM, IPPROTO_TCP, &sock); + isv6 = false; + } else { + ret = sock_create_kern(rds_conn_net(conn), PF_INET6, + SOCK_STREAM, IPPROTO_TCP, &sock); + isv6 = true; + } + if (ret < 0) 
goto out; rds_tcp_tune(sock); - src.sin_family = AF_INET; - src.sin_addr.s_addr = (__force u32)conn->c_laddr; - src.sin_port = (__force u16)htons(0); + if (isv6) { + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = conn->c_laddr; + sin6.sin6_port = 0; + sin6.sin6_flowinfo = 0; + sin6.sin6_scope_id = conn->c_dev_if; + addr = (struct sockaddr *)&sin6; + addrlen = sizeof(sin6); + } else { + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = conn->c_laddr.s6_addr32[3]; + sin.sin_port = 0; + addr = (struct sockaddr *)&sin; + addrlen = sizeof(sin); + } - ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src)); + ret = sock->ops->bind(sock, addr, addrlen); if (ret) { - rdsdebug("bind failed with %d at address %pI4\n", + rdsdebug("bind failed with %d at address %pI6c\n", ret, &conn->c_laddr); goto out; } - dest.sin_family = AF_INET; - dest.sin_addr.s_addr = (__force u32)conn->c_faddr; - dest.sin_port = (__force u16)htons(RDS_TCP_PORT); + if (isv6) { + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = conn->c_faddr; + sin6.sin6_port = htons(RDS_TCP_PORT); + sin6.sin6_flowinfo = 0; + sin6.sin6_scope_id = conn->c_dev_if; + addr = (struct sockaddr *)&sin6; + addrlen = sizeof(sin6); + } else { + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = conn->c_faddr.s6_addr32[3]; + sin.sin_port = htons(RDS_TCP_PORT); + addr = (struct sockaddr *)&sin; + addrlen = sizeof(sin); + } /* * once we call connect() we can start getting callbacks and they * own the socket */ rds_tcp_set_callbacks(sock, cp); - ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest), - O_NONBLOCK); + ret = sock->ops->connect(sock, addr, addrlen, O_NONBLOCK); - rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret); + rdsdebug("connect to address %pI6c returned %d\n", &conn->c_faddr, ret); if (ret == -EINPROGRESS) ret = 0; if (ret == 0) { diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 22571189f21e..0cf0147117d8 100644 --- a/net/rds/tcp_listen.c +++ 
b/net/rds/tcp_listen.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2018 Oracle. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -83,13 +83,12 @@ static struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) { int i; - bool peer_is_smaller = IS_CANONICAL(conn->c_faddr, conn->c_laddr); int npaths = max_t(int, 1, conn->c_npaths); /* for mprds, all paths MUST be initiated by the peer * with the smaller address. */ - if (!peer_is_smaller) { + if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) >= 0) { /* Make sure we initiate at least one path if this * has not already been done; rds_start_mprds() will * take care of additional paths, if necessary. @@ -132,6 +131,8 @@ int rds_tcp_accept_one(struct socket *sock) struct rds_tcp_connection *rs_tcp = NULL; int conn_state; struct rds_conn_path *cp; + struct in6_addr *my_addr, *peer_addr; + int dev_if; if (!sock) /* module unload or netns delete in progress */ return -ENETUNREACH; @@ -164,13 +165,30 @@ int rds_tcp_accept_one(struct socket *sock) inet = inet_sk(new_sock->sk); - rdsdebug("accepted tcp %pI4:%u -> %pI4:%u\n", - &inet->inet_saddr, ntohs(inet->inet_sport), - &inet->inet_daddr, ntohs(inet->inet_dport)); + my_addr = &new_sock->sk->sk_v6_rcv_saddr; + peer_addr = &new_sock->sk->sk_v6_daddr; + rdsdebug("accepted tcp %pI6c:%u -> %pI6c:%u\n", + my_addr, ntohs(inet->inet_sport), + peer_addr, ntohs(inet->inet_dport)); + /* sk_bound_dev_if is not set if the peer address is not link local + * address. In this case, it happens that mcast_oif is set. So + * just use it. 
+ */ + if ((ipv6_addr_type(my_addr) & IPV6_ADDR_LINKLOCAL) && + !(ipv6_addr_type(peer_addr) & IPV6_ADDR_LINKLOCAL)) { + struct ipv6_pinfo *inet6; + + inet6 = inet6_sk(new_sock->sk); + dev_if = inet6->mcast_oif; + } else { + dev_if = new_sock->sk->sk_bound_dev_if; + } conn = rds_conn_create(sock_net(sock->sk), - inet->inet_saddr, inet->inet_daddr, - &rds_tcp_transport, GFP_KERNEL); + &new_sock->sk->sk_v6_rcv_saddr, + &new_sock->sk->sk_v6_daddr, + &rds_tcp_transport, GFP_KERNEL, dev_if); + if (IS_ERR(conn)) { ret = PTR_ERR(conn); goto out; @@ -254,15 +272,22 @@ out: ready(sk); } -struct socket *rds_tcp_listen_init(struct net *net) +struct socket *rds_tcp_listen_init(struct net *net, bool isv6) { - struct sockaddr_in sin; struct socket *sock = NULL; + struct sockaddr_storage ss; + struct sockaddr_in6 *sin6; + struct sockaddr_in *sin; + int addr_len; int ret; - ret = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); - if (ret < 0) + ret = sock_create_kern(net, isv6 ? PF_INET6 : PF_INET, SOCK_STREAM, + IPPROTO_TCP, &sock); + if (ret < 0) { + rdsdebug("could not create %s listener socket: %d\n", + isv6 ? 
"IPv6" : "IPv4", ret); goto out; + } sock->sk->sk_reuse = SK_CAN_REUSE; rds_tcp_nonagle(sock); @@ -272,13 +297,28 @@ struct socket *rds_tcp_listen_init(struct net *net) sock->sk->sk_data_ready = rds_tcp_listen_data_ready; write_unlock_bh(&sock->sk->sk_callback_lock); - sin.sin_family = PF_INET; - sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); - sin.sin_port = (__force u16)htons(RDS_TCP_PORT); + if (isv6) { + sin6 = (struct sockaddr_in6 *)&ss; + sin6->sin6_family = PF_INET6; + sin6->sin6_addr = in6addr_any; + sin6->sin6_port = (__force u16)htons(RDS_TCP_PORT); + sin6->sin6_scope_id = 0; + sin6->sin6_flowinfo = 0; + addr_len = sizeof(*sin6); + } else { + sin = (struct sockaddr_in *)&ss; + sin->sin_family = PF_INET; + sin->sin_addr.s_addr = INADDR_ANY; + sin->sin_port = (__force u16)htons(RDS_TCP_PORT); + addr_len = sizeof(*sin); + } - ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); - if (ret < 0) + ret = sock->ops->bind(sock, (struct sockaddr *)&ss, addr_len); + if (ret < 0) { + rdsdebug("could not bind %s listener socket: %d\n", + isv6 ? "IPv6" : "IPv4", ret); goto out; + } ret = sock->ops->listen(sock, 64); if (ret < 0) diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index b9fbd2ee74ef..42c5ff1eda95 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -179,7 +179,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, tc->t_tinc = tinc; rdsdebug("alloced tinc %p\n", tinc); rds_inc_path_init(&tinc->ti_inc, cp, - cp->cp_conn->c_faddr); + &cp->cp_conn->c_faddr); tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] = local_clock(); @@ -239,8 +239,9 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) rds_tcp_cong_recv(conn, tinc); else - rds_recv_incoming(conn, conn->c_faddr, - conn->c_laddr, &tinc->ti_inc, + rds_recv_incoming(conn, &conn->c_faddr, + &conn->c_laddr, + &tinc->ti_inc, arg->gfp); tc->t_tinc_hdr_rem = sizeof(struct rds_header); diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 7df869d37afd..78a2554a4497 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -153,7 +153,7 @@ out: * an incoming RST. */ if (rds_conn_path_up(cp)) { - pr_warn("RDS/tcp: send to %pI4 on cp [%d]" + pr_warn("RDS/tcp: send to %pI6c on cp [%d]" "returned %d, " "disconnecting and reconnecting\n", &conn->c_faddr, cp->cp_index, ret); diff --git a/net/rds/threads.c b/net/rds/threads.c index c52861d77a59..e64f9e4c3cda 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -82,8 +82,8 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr) return; } - rdsdebug("conn %p for %pI4 to %pI4 complete\n", - cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr); + rdsdebug("conn %p for %pI6c to %pI6c complete\n", + cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr); cp->cp_reconnect_jiffies = 0; set_bit(0, &cp->cp_conn->c_map_queued); @@ -125,13 +125,13 @@ void rds_queue_reconnect(struct rds_conn_path *cp) unsigned long rand; struct rds_connection *conn = cp->cp_conn; - rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n", - conn, &conn->c_laddr, &conn->c_faddr, - cp->cp_reconnect_jiffies); + rdsdebug("conn %p for %pI6c to %pI6c reconnect jiffies %lu\n", + conn, &conn->c_laddr, &conn->c_faddr, + cp->cp_reconnect_jiffies); /* let peer with smaller addr initiate reconnect, to avoid duels */ if (conn->c_trans->t_type == RDS_TRANS_TCP && - !IS_CANONICAL(conn->c_laddr, conn->c_faddr)) + rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) >= 0) return; set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); @@ -145,7 +145,7 @@ void rds_queue_reconnect(struct rds_conn_path *cp) } get_random_bytes(&rand, sizeof(rand)); - rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n", + rdsdebug("%lu delay %lu ceil conn %p for %pI6c -> %pI6c\n", rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies, conn, &conn->c_laddr, &conn->c_faddr); rcu_read_lock(); @@ -167,14 +167,14 @@ void rds_connect_worker(struct work_struct *work) int ret; if (cp->cp_index > 0 && - !IS_CANONICAL(cp->cp_conn->c_laddr, cp->cp_conn->c_faddr)) + rds_addr_cmp(&cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr) >= 0) return; clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING); if (ret) { ret = conn->c_trans->conn_path_connect(cp); - rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n", - conn, &conn->c_laddr, &conn->c_faddr, ret); + 
rdsdebug("conn %p for %pI6c to %pI6c dispatched, ret %d\n", + conn, &conn->c_laddr, &conn->c_faddr, ret); if (ret) { if (rds_conn_path_transition(cp, @@ -259,3 +259,50 @@ int rds_threads_init(void) return 0; } + +/* Compare two IPv6 addresses. Return 0 if the two addresses are equal. + * Return 1 if the first is greater. Return -1 if the second is greater. + */ +int rds_addr_cmp(const struct in6_addr *addr1, + const struct in6_addr *addr2) +{ +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 + const __be64 *a1, *a2; + u64 x, y; + + a1 = (__be64 *)addr1; + a2 = (__be64 *)addr2; + + if (*a1 != *a2) { + if (be64_to_cpu(*a1) < be64_to_cpu(*a2)) + return -1; + else + return 1; + } else { + x = be64_to_cpu(*++a1); + y = be64_to_cpu(*++a2); + if (x < y) + return -1; + else if (x > y) + return 1; + else + return 0; + } +#else + u32 a, b; + int i; + + for (i = 0; i < 4; i++) { + if (addr1->s6_addr32[i] != addr2->s6_addr32[i]) { + a = ntohl(addr1->s6_addr32[i]); + b = ntohl(addr2->s6_addr32[i]); + if (a < b) + return -1; + else if (a > b) + return 1; + } + } + return 0; +#endif +} +EXPORT_SYMBOL_GPL(rds_addr_cmp); diff --git a/net/rds/transport.c b/net/rds/transport.c index 0b188dd0a344..46f709a4b577 100644 --- a/net/rds/transport.c +++ b/net/rds/transport.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -33,6 +33,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/in.h> +#include <linux/ipv6.h> #include "rds.h" #include "loop.h" @@ -75,20 +76,26 @@ void rds_trans_put(struct rds_transport *trans) module_put(trans->t_owner); } -struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr) +struct rds_transport *rds_trans_get_preferred(struct net *net, + const struct in6_addr *addr, + __u32 scope_id) { struct rds_transport *ret = NULL; struct rds_transport *trans; unsigned int i; - if (IN_LOOPBACK(ntohl(addr))) + if (ipv6_addr_v4mapped(addr)) { + if (*(u_int8_t *)&addr->s6_addr32[3] == IN_LOOPBACKNET) + return &rds_loop_transport; + } else if (ipv6_addr_loopback(addr)) { return &rds_loop_transport; + } down_read(&rds_trans_sem); for (i = 0; i < RDS_TRANS_COUNT; i++) { trans = transports[i]; - if (trans && (trans->laddr_check(net, addr) == 0) && + if (trans && (trans->laddr_check(net, addr, scope_id) == 0) && (!trans->t_owner || try_module_get(trans->t_owner))) { ret = trans; break; @@ -152,4 +159,3 @@ unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, return total; } - diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index ebe42e7eb456..d00a0ef39a56 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1470,7 +1470,7 @@ static const struct proto_ops rose_proto_ops = { .socketpair = sock_no_socketpair, .accept = rose_accept, .getname = rose_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = rose_ioctl, .listen = rose_listen, .shutdown = sock_no_shutdown, diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 3b1ac93efee2..2b463047dd7b 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -734,11 +734,15 @@ static int rxrpc_getsockopt(struct socket *sock, int level, int optname, /* * permit an RxRPC socket to be polled */ -static __poll_t rxrpc_poll_mask(struct socket *sock, __poll_t events) +static 
__poll_t rxrpc_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; struct rxrpc_sock *rx = rxrpc_sk(sk); - __poll_t mask = 0; + __poll_t mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; /* the socket is readable if there are any messages waiting on the Rx * queue */ @@ -945,7 +949,7 @@ static const struct proto_ops rxrpc_rpc_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = rxrpc_poll_mask, + .poll = rxrpc_poll, .ioctl = sock_no_ioctl, .listen = rxrpc_listen, .shutdown = rxrpc_shutdown, diff --git a/net/sched/Kconfig b/net/sched/Kconfig index a01169fb5325..e95741388311 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -1,6 +1,6 @@ # # Traffic control configuration. -# +# menuconfig NET_SCHED bool "QoS and/or fair queueing" @@ -183,6 +183,17 @@ config NET_SCH_CBS To compile this code as a module, choose M here: the module will be called sch_cbs. +config NET_SCH_ETF + tristate "Earliest TxTime First (ETF)" + help + Say Y here if you want to use the Earliest TxTime First (ETF) packet + scheduling algorithm. + + See the top of <file:net/sched/sch_etf.c> for more details. + + To compile this code as a module, choose M here: the + module will be called sch_etf. + config NET_SCH_GRED tristate "Generic Random Early Detection (GRED)" ---help--- @@ -240,6 +251,19 @@ config NET_SCH_MQPRIO If unsure, say N. +config NET_SCH_SKBPRIO + tristate "SKB priority queue scheduler (SKBPRIO)" + help + Say Y here if you want to use the SKB priority queue + scheduler. This schedules packets according to skb->priority, + which is useful for request packets in DoS mitigation systems such + as Gatekeeper. + + To compile this driver as a module, choose M here: the module will + be called sch_skbprio. + + If unsure, say N. 
+ config NET_SCH_CHOKE tristate "CHOose and Keep responsive flow scheduler (CHOKE)" help @@ -284,6 +308,17 @@ config NET_SCH_FQ_CODEL If unsure, say N. +config NET_SCH_CAKE + tristate "Common Applications Kept Enhanced (CAKE)" + help + Say Y here if you want to use the Common Applications Kept Enhanced + (CAKE) queue management algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_cake. + + If unsure, say N. + config NET_SCH_FQ tristate "Fair Queue" help @@ -684,7 +719,7 @@ config NET_CLS_ACT config NET_ACT_POLICE tristate "Traffic Policing" - depends on NET_CLS_ACT + depends on NET_CLS_ACT ---help--- Say Y here if you want to do traffic policing, i.e. strict bandwidth limiting. This action replaces the existing policing diff --git a/net/sched/Makefile b/net/sched/Makefile index 8811d3804878..f0403f49edcb 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -33,7 +33,7 @@ obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o obj-$(CONFIG_NET_SCH_RED) += sch_red.o obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o -obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o +obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o @@ -46,14 +46,17 @@ obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o +obj-$(CONFIG_NET_SCH_SKBPRIO) += sch_skbprio.o obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o +obj-$(CONFIG_NET_SCH_CAKE) += sch_cake.o obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o +obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o 
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 3f4cf930f809..148a89ab789b 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -55,6 +55,24 @@ static void tcf_action_goto_chain_exec(const struct tc_action *a, res->goto_tp = rcu_dereference_bh(chain->filter_chain); } +static void tcf_free_cookie_rcu(struct rcu_head *p) +{ + struct tc_cookie *cookie = container_of(p, struct tc_cookie, rcu); + + kfree(cookie->data); + kfree(cookie); +} + +static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie, + struct tc_cookie *new_cookie) +{ + struct tc_cookie *old; + + old = xchg((__force struct tc_cookie **)old_cookie, new_cookie); + if (old) + call_rcu(&old->rcu, tcf_free_cookie_rcu); +} + /* XXX: For standalone actions, we don't need a RCU grace period either, because * actions are always connected to filters and filters are already destroyed in * RCU callbacks, so after a RCU grace period actions are already disconnected @@ -65,44 +83,64 @@ static void free_tcf(struct tc_action *p) free_percpu(p->cpu_bstats); free_percpu(p->cpu_qstats); - if (p->act_cookie) { - kfree(p->act_cookie->data); - kfree(p->act_cookie); - } + tcf_set_action_cookie(&p->act_cookie, NULL); if (p->goto_chain) tcf_action_goto_chain_fini(p); kfree(p); } -static void tcf_idr_remove(struct tcf_idrinfo *idrinfo, struct tc_action *p) +static void tcf_action_cleanup(struct tc_action *p) { - spin_lock(&idrinfo->lock); - idr_remove(&idrinfo->action_idr, p->tcfa_index); - spin_unlock(&idrinfo->lock); + if (p->ops->cleanup) + p->ops->cleanup(p); + gen_kill_estimator(&p->tcfa_rate_est); free_tcf(p); } +static int __tcf_action_put(struct tc_action *p, bool bind) +{ + struct tcf_idrinfo *idrinfo = p->idrinfo; + + if (refcount_dec_and_lock(&p->tcfa_refcnt, &idrinfo->lock)) { + if (bind) + atomic_dec(&p->tcfa_bindcnt); + idr_remove(&idrinfo->action_idr, p->tcfa_index); + spin_unlock(&idrinfo->lock); + + tcf_action_cleanup(p); + return 1; + 
} + + if (bind) + atomic_dec(&p->tcfa_bindcnt); + + return 0; +} + int __tcf_idr_release(struct tc_action *p, bool bind, bool strict) { int ret = 0; - ASSERT_RTNL(); - + /* Release with strict==1 and bind==0 is only called through act API + * interface (classifiers always bind). Only case when action with + * positive reference count and zero bind count can exist is when it was + * also created with act API (unbinding last classifier will destroy the + * action if it was created by classifier). So only case when bind count + * can be changed after initial check is when unbound action is + * destroyed by act API while classifier binds to action with same id + * concurrently. This result either creation of new action(same behavior + * as before), or reusing existing action if concurrent process + * increments reference count before action is deleted. Both scenarios + * are acceptable. + */ if (p) { - if (bind) - p->tcfa_bindcnt--; - else if (strict && p->tcfa_bindcnt > 0) + if (!bind && strict && atomic_read(&p->tcfa_bindcnt) > 0) return -EPERM; - p->tcfa_refcnt--; - if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) { - if (p->ops->cleanup) - p->ops->cleanup(p); - tcf_idr_remove(p->idrinfo, p); + if (__tcf_action_put(p, bind)) ret = ACT_P_DELETED; - } } return ret; @@ -111,10 +149,15 @@ EXPORT_SYMBOL(__tcf_idr_release); static size_t tcf_action_shared_attrs_size(const struct tc_action *act) { + struct tc_cookie *act_cookie; u32 cookie_len = 0; - if (act->act_cookie) - cookie_len = nla_total_size(act->act_cookie->len); + rcu_read_lock(); + act_cookie = rcu_dereference(act->act_cookie); + + if (act_cookie) + cookie_len = nla_total_size(act_cookie->len); + rcu_read_unlock(); return nla_total_size(0) /* action number nested */ + nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */ @@ -257,46 +300,77 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, } EXPORT_SYMBOL(tcf_generic_walker); -static struct tc_action *tcf_idr_lookup(u32 index, struct tcf_idrinfo 
*idrinfo) +static bool __tcf_idr_check(struct tc_action_net *tn, u32 index, + struct tc_action **a, int bind) { - struct tc_action *p = NULL; + struct tcf_idrinfo *idrinfo = tn->idrinfo; + struct tc_action *p; spin_lock(&idrinfo->lock); p = idr_find(&idrinfo->action_idr, index); + if (IS_ERR(p)) { + p = NULL; + } else if (p) { + refcount_inc(&p->tcfa_refcnt); + if (bind) + atomic_inc(&p->tcfa_bindcnt); + } spin_unlock(&idrinfo->lock); - return p; + if (p) { + *a = p; + return true; + } + return false; } int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index) { - struct tcf_idrinfo *idrinfo = tn->idrinfo; - struct tc_action *p = tcf_idr_lookup(index, idrinfo); - - if (p) { - *a = p; - return 1; - } - return 0; + return __tcf_idr_check(tn, index, a, 0); } EXPORT_SYMBOL(tcf_idr_search); bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a, int bind) { + return __tcf_idr_check(tn, index, a, bind); +} +EXPORT_SYMBOL(tcf_idr_check); + +int tcf_idr_delete_index(struct tc_action_net *tn, u32 index) +{ struct tcf_idrinfo *idrinfo = tn->idrinfo; - struct tc_action *p = tcf_idr_lookup(index, idrinfo); + struct tc_action *p; + int ret = 0; - if (index && p) { - if (bind) - p->tcfa_bindcnt++; - p->tcfa_refcnt++; - *a = p; - return true; + spin_lock(&idrinfo->lock); + p = idr_find(&idrinfo->action_idr, index); + if (!p) { + spin_unlock(&idrinfo->lock); + return -ENOENT; } - return false; + + if (!atomic_read(&p->tcfa_bindcnt)) { + if (refcount_dec_and_test(&p->tcfa_refcnt)) { + struct module *owner = p->ops->owner; + + WARN_ON(p != idr_remove(&idrinfo->action_idr, + p->tcfa_index)); + spin_unlock(&idrinfo->lock); + + tcf_action_cleanup(p); + module_put(owner); + return 0; + } + ret = 0; + } else { + ret = -EPERM; + } + + spin_unlock(&idrinfo->lock); + return ret; } -EXPORT_SYMBOL(tcf_idr_check); +EXPORT_SYMBOL(tcf_idr_delete_index); int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, 
const struct tc_action_ops *ops, @@ -304,14 +378,13 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, { struct tc_action *p = kzalloc(ops->size, GFP_KERNEL); struct tcf_idrinfo *idrinfo = tn->idrinfo; - struct idr *idr = &idrinfo->action_idr; int err = -ENOMEM; if (unlikely(!p)) return -ENOMEM; - p->tcfa_refcnt = 1; + refcount_set(&p->tcfa_refcnt, 1); if (bind) - p->tcfa_bindcnt = 1; + atomic_set(&p->tcfa_bindcnt, 1); if (cpustats) { p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); @@ -322,20 +395,6 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, goto err2; } spin_lock_init(&p->tcfa_lock); - idr_preload(GFP_KERNEL); - spin_lock(&idrinfo->lock); - /* user doesn't specify an index */ - if (!index) { - index = 1; - err = idr_alloc_u32(idr, NULL, &index, UINT_MAX, GFP_ATOMIC); - } else { - err = idr_alloc_u32(idr, NULL, &index, index, GFP_ATOMIC); - } - spin_unlock(&idrinfo->lock); - idr_preload_end(); - if (err) - goto err3; - p->tcfa_index = index; p->tcfa_tm.install = jiffies; p->tcfa_tm.lastuse = jiffies; @@ -345,7 +404,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, &p->tcfa_rate_est, &p->tcfa_lock, NULL, est); if (err) - goto err4; + goto err3; } p->idrinfo = idrinfo; @@ -353,8 +412,6 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, INIT_LIST_HEAD(&p->list); *a = p; return 0; -err4: - idr_remove(idr, index); err3: free_percpu(p->cpu_qstats); err2: @@ -370,11 +427,78 @@ void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a) struct tcf_idrinfo *idrinfo = tn->idrinfo; spin_lock(&idrinfo->lock); - idr_replace(&idrinfo->action_idr, a, a->tcfa_index); + /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ + WARN_ON(!IS_ERR(idr_replace(&idrinfo->action_idr, a, a->tcfa_index))); spin_unlock(&idrinfo->lock); } EXPORT_SYMBOL(tcf_idr_insert); +/* Cleanup idr index that was allocated but not initialized. 
*/ + +void tcf_idr_cleanup(struct tc_action_net *tn, u32 index) +{ + struct tcf_idrinfo *idrinfo = tn->idrinfo; + + spin_lock(&idrinfo->lock); + /* Remove ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ + WARN_ON(!IS_ERR(idr_remove(&idrinfo->action_idr, index))); + spin_unlock(&idrinfo->lock); +} +EXPORT_SYMBOL(tcf_idr_cleanup); + +/* Check if action with specified index exists. If actions is found, increments + * its reference and bind counters, and return 1. Otherwise insert temporary + * error pointer (to prevent concurrent users from inserting actions with same + * index) and return 0. + */ + +int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, + struct tc_action **a, int bind) +{ + struct tcf_idrinfo *idrinfo = tn->idrinfo; + struct tc_action *p; + int ret; + +again: + spin_lock(&idrinfo->lock); + if (*index) { + p = idr_find(&idrinfo->action_idr, *index); + if (IS_ERR(p)) { + /* This means that another process allocated + * index but did not assign the pointer yet. + */ + spin_unlock(&idrinfo->lock); + goto again; + } + + if (p) { + refcount_inc(&p->tcfa_refcnt); + if (bind) + atomic_inc(&p->tcfa_bindcnt); + *a = p; + ret = 1; + } else { + *a = NULL; + ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index, + *index, GFP_ATOMIC); + if (!ret) + idr_replace(&idrinfo->action_idr, + ERR_PTR(-EBUSY), *index); + } + } else { + *index = 1; + *a = NULL; + ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index, + UINT_MAX, GFP_ATOMIC); + if (!ret) + idr_replace(&idrinfo->action_idr, ERR_PTR(-EBUSY), + *index); + } + spin_unlock(&idrinfo->lock); + return ret; +} +EXPORT_SYMBOL(tcf_idr_check_alloc); + void tcf_idrinfo_destroy(const struct tc_action_ops *ops, struct tcf_idrinfo *idrinfo) { @@ -538,13 +662,15 @@ repeat: } EXPORT_SYMBOL(tcf_action_exec); -int tcf_action_destroy(struct list_head *actions, int bind) +int tcf_action_destroy(struct tc_action *actions[], int bind) { const struct tc_action_ops *ops; - struct tc_action *a, *tmp; - int ret = 0; + 
struct tc_action *a; + int ret = 0, i; - list_for_each_entry_safe(a, tmp, actions, list) { + for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { + a = actions[i]; + actions[i] = NULL; ops = a->ops; ret = __tcf_idr_release(a, bind, true); if (ret == ACT_P_DELETED) @@ -555,6 +681,24 @@ int tcf_action_destroy(struct list_head *actions, int bind) return ret; } +static int tcf_action_put(struct tc_action *p) +{ + return __tcf_action_put(p, false); +} + +static void tcf_action_put_many(struct tc_action *actions[]) +{ + int i; + + for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { + struct tc_action *a = actions[i]; + const struct tc_action_ops *ops = a->ops; + + if (tcf_action_put(a)) + module_put(ops->owner); + } +} + int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { @@ -567,16 +711,22 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) int err = -EINVAL; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; + struct tc_cookie *cookie; if (nla_put_string(skb, TCA_KIND, a->ops->kind)) goto nla_put_failure; if (tcf_action_copy_stats(skb, a, 0)) goto nla_put_failure; - if (a->act_cookie) { - if (nla_put(skb, TCA_ACT_COOKIE, a->act_cookie->len, - a->act_cookie->data)) + + rcu_read_lock(); + cookie = rcu_dereference(a->act_cookie); + if (cookie) { + if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) { + rcu_read_unlock(); goto nla_put_failure; + } } + rcu_read_unlock(); nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) @@ -593,14 +743,15 @@ nla_put_failure: } EXPORT_SYMBOL(tcf_action_dump_1); -int tcf_action_dump(struct sk_buff *skb, struct list_head *actions, +int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref) { struct tc_action *a; - int err = -EINVAL; + int err = -EINVAL, i; struct nlattr *nest; - list_for_each_entry(a, actions, list) { + for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { + a = actions[i]; nest = 
nla_nest_start(skb, a->order); if (nest == NULL) goto nla_put_failure; @@ -638,6 +789,7 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, + bool rtnl_held, struct netlink_ext_ack *extack) { struct tc_action *a; @@ -688,9 +840,11 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, a_o = tc_lookup_action_n(act_name); if (a_o == NULL) { #ifdef CONFIG_MODULES - rtnl_unlock(); + if (rtnl_held) + rtnl_unlock(); request_module("act_%s", act_name); - rtnl_lock(); + if (rtnl_held) + rtnl_lock(); a_o = tc_lookup_action_n(act_name); @@ -713,19 +867,15 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, /* backward compatibility for policer */ if (name == NULL) err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind, - extack); + rtnl_held, extack); else - err = a_o->init(net, nla, est, &a, ovr, bind, extack); + err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held, + extack); if (err < 0) goto err_mod; - if (name == NULL && tb[TCA_ACT_COOKIE]) { - if (a->act_cookie) { - kfree(a->act_cookie->data); - kfree(a->act_cookie); - } - a->act_cookie = cookie; - } + if (!name && tb[TCA_ACT_COOKIE]) + tcf_set_action_cookie(&a->act_cookie, cookie); /* module count goes up only when brand new policy is created * if it exists and is only bound to in a_o->init() then @@ -737,10 +887,9 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN)) { err = tcf_action_goto_chain_init(a, tp); if (err) { - LIST_HEAD(actions); + struct tc_action *actions[] = { a, NULL }; - list_add_tail(&a->list, &actions); - tcf_action_destroy(&actions, bind); + tcf_action_destroy(actions, bind); NL_SET_ERR_MSG(extack, "Failed to init TC action chain"); return ERR_PTR(err); } @@ -758,21 +907,12 @@ err_out: return ERR_PTR(err); 
} -static void cleanup_a(struct list_head *actions, int ovr) -{ - struct tc_action *a; - - if (!ovr) - return; - - list_for_each_entry(a, actions, list) - a->tcfa_refcnt--; -} +/* Returns numbers of initialized actions or negative error. */ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, - struct list_head *actions, size_t *attr_size, - struct netlink_ext_ack *extack) + struct tc_action *actions[], size_t *attr_size, + bool rtnl_held, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; @@ -786,25 +926,19 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind, - extack); + rtnl_held, extack); if (IS_ERR(act)) { err = PTR_ERR(act); goto err; } act->order = i; sz += tcf_action_fill_size(act); - if (ovr) - act->tcfa_refcnt++; - list_add_tail(&act->list, actions); + /* Start from index 0 */ + actions[i - 1] = act; } *attr_size = tcf_action_full_attrs_size(sz); - - /* Remove the temp refcnt which was necessary to protect against - * destroying an existing action which was being replaced - */ - cleanup_a(actions, ovr); - return 0; + return i - 1; err: tcf_action_destroy(actions, bind); @@ -855,7 +989,7 @@ errout: return -1; } -static int tca_get_fill(struct sk_buff *skb, struct list_head *actions, +static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], u32 portid, u32 seq, u16 flags, int event, int bind, int ref) { @@ -891,7 +1025,7 @@ out_nlmsg_trim: static int tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, - struct list_head *actions, int event, + struct tc_action *actions[], int event, struct netlink_ext_ack *extack) { struct sk_buff *skb; @@ -900,7 +1034,7 @@ tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, if (!skb) return -ENOBUFS; if (tca_get_fill(skb, 
actions, portid, n->nlmsg_seq, 0, event, - 0, 0) <= 0) { + 0, 1) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); kfree_skb(skb); return -EINVAL; @@ -1027,9 +1161,41 @@ err_out: return err; } +static int tcf_action_delete(struct net *net, struct tc_action *actions[], + int *acts_deleted, struct netlink_ext_ack *extack) +{ + u32 act_index; + int ret, i; + + for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { + struct tc_action *a = actions[i]; + const struct tc_action_ops *ops = a->ops; + + /* Actions can be deleted concurrently so we must save their + * type and id to search again after reference is released. + */ + act_index = a->tcfa_index; + + if (tcf_action_put(a)) { + /* last reference, action was deleted concurrently */ + module_put(ops->owner); + } else { + /* now do the delete */ + ret = ops->delete(net, act_index); + if (ret < 0) { + *acts_deleted = i + 1; + return ret; + } + } + } + *acts_deleted = i; + return 0; +} + static int -tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, - u32 portid, size_t attr_size, struct netlink_ext_ack *extack) +tcf_del_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], + int *acts_deleted, u32 portid, size_t attr_size, + struct netlink_ext_ack *extack) { int ret; struct sk_buff *skb; @@ -1040,14 +1206,14 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION, - 0, 1) <= 0) { + 0, 2) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes"); kfree_skb(skb); return -EINVAL; } /* now do the delete */ - ret = tcf_action_destroy(actions, 0); + ret = tcf_action_delete(net, actions, acts_deleted, extack); if (ret < 0) { NL_SET_ERR_MSG(extack, "Failed to delete TC action"); kfree_skb(skb); @@ -1069,7 +1235,8 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, struct nlattr 
*tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; size_t attr_size = 0; - LIST_HEAD(actions); + struct tc_action *actions[TCA_ACT_MAX_PRIO + 1] = {}; + int acts_deleted = 0; ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack); if (ret < 0) @@ -1091,27 +1258,27 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, } act->order = i; attr_size += tcf_action_fill_size(act); - list_add_tail(&act->list, &actions); + actions[i - 1] = act; } attr_size = tcf_action_full_attrs_size(attr_size); if (event == RTM_GETACTION) - ret = tcf_get_notify(net, portid, n, &actions, event, extack); + ret = tcf_get_notify(net, portid, n, actions, event, extack); else { /* delete */ - ret = tcf_del_notify(net, n, &actions, portid, attr_size, extack); + ret = tcf_del_notify(net, n, actions, &acts_deleted, portid, + attr_size, extack); if (ret) goto err; return ret; } err: - if (event != RTM_GETACTION) - tcf_action_destroy(&actions, 0); + tcf_action_put_many(&actions[acts_deleted]); return ret; } static int -tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, +tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { struct sk_buff *skb; @@ -1142,14 +1309,17 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, { size_t attr_size = 0; int ret = 0; - LIST_HEAD(actions); + struct tc_action *actions[TCA_ACT_MAX_PRIO] = {}; - ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions, - &attr_size, extack); - if (ret) + ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, actions, + &attr_size, true, extack); + if (ret < 0) return ret; + ret = tcf_add_notify(net, n, actions, portid, attr_size, extack); + if (ovr) + tcf_action_put_many(actions); - return tcf_add_notify(net, n, &actions, portid, attr_size, extack); + return ret; } static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON; diff --git a/net/sched/act_bpf.c 
b/net/sched/act_bpf.c index 18089c02e557..06f743d8ed41 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -141,8 +141,8 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, struct tcf_bpf *prog = to_bpf(act); struct tc_act_bpf opt = { .index = prog->tcf_index, - .refcnt = prog->tcf_refcnt - ref, - .bindcnt = prog->tcf_bindcnt - bind, + .refcnt = refcount_read(&prog->tcf_refcnt) - ref, + .bindcnt = atomic_read(&prog->tcf_bindcnt) - bind, .action = prog->tcf_action, }; struct tcf_t tm; @@ -276,7 +276,8 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, static int tcf_bpf_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, - int replace, int bind, struct netlink_ext_ack *extack) + int replace, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, bpf_net_id); struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; @@ -298,21 +299,27 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_ACT_BPF_PARMS]); - if (!tcf_idr_check(tn, parm->index, act, bind)) { + ret = tcf_idr_check_alloc(tn, &parm->index, act, bind); + if (!ret) { ret = tcf_idr_create(tn, parm->index, est, act, &act_bpf_ops, bind, true); - if (ret < 0) + if (ret < 0) { + tcf_idr_cleanup(tn, parm->index); return ret; + } res = ACT_P_CREATED; - } else { + } else if (ret > 0) { /* Don't override defaults. 
*/ if (bind) return 0; - tcf_idr_release(*act, bind); - if (!replace) + if (!replace) { + tcf_idr_release(*act, bind); return -EEXIST; + } + } else { + return ret; } is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS]; @@ -355,8 +362,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, return res; out: - if (res == ACT_P_CREATED) - tcf_idr_release(*act, bind); + tcf_idr_release(*act, bind); return ret; } @@ -387,6 +393,13 @@ static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_bpf_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, bpf_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_bpf_ops __read_mostly = { .kind = "bpf", .type = TCA_ACT_BPF, @@ -397,6 +410,7 @@ static struct tc_action_ops act_bpf_ops __read_mostly = { .init = tcf_bpf_init, .walk = tcf_bpf_walker, .lookup = tcf_bpf_search, + .delete = tcf_bpf_delete, .size = sizeof(struct tcf_bpf), }; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index e4b880fa51fe..2f9bc833d046 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -96,7 +96,7 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { static int tcf_connmark_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, + int ovr, int bind, bool rtnl_held, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, connmark_net_id); @@ -118,11 +118,14 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_CONNMARK_PARMS]); - if (!tcf_idr_check(tn, parm->index, a, bind)) { + ret = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (!ret) { ret = tcf_idr_create(tn, parm->index, est, a, &act_connmark_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ci = to_connmark(*a); ci->tcf_action = 
parm->action; @@ -131,16 +134,18 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, tcf_idr_insert(tn, *a); ret = ACT_P_CREATED; - } else { + } else if (ret > 0) { ci = to_connmark(*a); if (bind) return 0; - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } /* replacing action and zone */ ci->tcf_action = parm->action; ci->zone = parm->zone; + ret = 0; } return ret; @@ -154,8 +159,8 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, struct tc_connmark opt = { .index = ci->tcf_index, - .refcnt = ci->tcf_refcnt - ref, - .bindcnt = ci->tcf_bindcnt - bind, + .refcnt = refcount_read(&ci->tcf_refcnt) - ref, + .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, .action = ci->tcf_action, .zone = ci->zone, }; @@ -193,6 +198,13 @@ static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_connmark_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, connmark_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_connmark_ops = { .kind = "connmark", .type = TCA_ACT_CONNMARK, @@ -202,6 +214,7 @@ static struct tc_action_ops act_connmark_ops = { .init = tcf_connmark_init, .walk = tcf_connmark_walker, .lookup = tcf_connmark_search, + .delete = tcf_connmark_delete, .size = sizeof(struct tcf_connmark_info), }; @@ -239,4 +252,3 @@ module_exit(connmark_cleanup_module); MODULE_AUTHOR("Felix Fietkau <nbd@openwrt.org>"); MODULE_DESCRIPTION("Connection tracking mark restoring"); MODULE_LICENSE("GPL"); - diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 526a8e491626..4e8c383f379e 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -46,7 +46,8 @@ static struct tc_action_ops act_csum_ops; static int tcf_csum_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind, struct 
netlink_ext_ack *extack) + int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, csum_net_id); struct tcf_csum_params *params_old, *params_new; @@ -66,18 +67,24 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, return -EINVAL; parm = nla_data(tb[TCA_CSUM_PARMS]); - if (!tcf_idr_check(tn, parm->index, a, bind)) { + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (!err) { ret = tcf_idr_create(tn, parm->index, est, a, &act_csum_ops, bind, true); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; - } else { + } else if (err > 0) { if (bind)/* dont override defaults */ return 0; - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } + } else { + return err; } p = to_tcf_csum(*a); @@ -85,13 +92,12 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); return -ENOMEM; } params_old = rtnl_dereference(p->params); - params_new->action = parm->action; + p->tcf_action = parm->action; params_new->update_flags = parm->update_flags; rcu_assign_pointer(p->params, params_new); if (params_old) @@ -561,7 +567,7 @@ static int tcf_csum(struct sk_buff *skb, const struct tc_action *a, tcf_lastuse_update(&p->tcf_tm); bstats_cpu_update(this_cpu_ptr(p->common.cpu_bstats), skb); - action = params->action; + action = READ_ONCE(p->tcf_action); if (unlikely(action == TC_ACT_SHOT)) goto drop_stats; @@ -597,13 +603,13 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, struct tcf_csum_params *params; struct tc_csum opt = { .index = p->tcf_index, - .refcnt = p->tcf_refcnt - ref, - .bindcnt = p->tcf_bindcnt - bind, + .refcnt = refcount_read(&p->tcf_refcnt) - ref, + .bindcnt = atomic_read(&p->tcf_bindcnt) - bind, + .action = 
p->tcf_action, }; struct tcf_t t; params = rtnl_dereference(p->params); - opt.action = params->action; opt.update_flags = params->update_flags; if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt)) @@ -653,6 +659,13 @@ static size_t tcf_csum_get_fill_size(const struct tc_action *act) return nla_total_size(sizeof(struct tc_csum)); } +static int tcf_csum_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, csum_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_csum_ops = { .kind = "csum", .type = TCA_ACT_CSUM, @@ -664,6 +677,7 @@ static struct tc_action_ops act_csum_ops = { .walk = tcf_csum_walker, .lookup = tcf_csum_search, .get_fill_size = tcf_csum_get_fill_size, + .delete = tcf_csum_delete, .size = sizeof(struct tcf_csum), }; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 4dc4f153cad8..661b72b9147d 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -56,7 +56,8 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { static int tcf_gact_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, gact_net_id); struct nlattr *tb[TCA_GACT_MAX + 1]; @@ -90,18 +91,24 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, } #endif - if (!tcf_idr_check(tn, parm->index, a, bind)) { + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (!err) { ret = tcf_idr_create(tn, parm->index, est, a, &act_gact_ops, bind, true); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; - } else { + } else if (err > 0) { if (bind)/* dont override defaults */ return 0; - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } + } else { + return err; } gact = to_gact(*a); @@ -169,8 +176,8 
@@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_gact *gact = to_gact(a); struct tc_gact opt = { .index = gact->tcf_index, - .refcnt = gact->tcf_refcnt - ref, - .bindcnt = gact->tcf_bindcnt - bind, + .refcnt = refcount_read(&gact->tcf_refcnt) - ref, + .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind, .action = gact->tcf_action, }; struct tcf_t t; @@ -230,6 +237,13 @@ static size_t tcf_gact_get_fill_size(const struct tc_action *act) return sz; } +static int tcf_gact_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, gact_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_gact_ops = { .kind = "gact", .type = TCA_ACT_GACT, @@ -241,6 +255,7 @@ static struct tc_action_ops act_gact_ops = { .walk = tcf_gact_walker, .lookup = tcf_gact_search, .get_fill_size = tcf_gact_get_fill_size, + .delete = tcf_gact_delete, .size = sizeof(struct tcf_gact), }; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 20d7d36b2fc9..3d6e265758c0 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -448,7 +448,8 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, static int tcf_ife_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ife_net_id); struct nlattr *tb[TCA_IFE_MAX + 1]; @@ -483,7 +484,12 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, if (!p) return -ENOMEM; - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) { + kfree(p); + return err; + } + exists = err; if (exists && bind) { kfree(p); return 0; @@ -493,16 +499,15 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, ret = tcf_idr_create(tn, parm->index, est, a, &act_ife_ops, bind, true); if 
(ret) { + tcf_idr_cleanup(tn, parm->index); kfree(p); return ret; } ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) { - kfree(p); - return -EEXIST; - } + kfree(p); + return -EEXIST; } ife = to_ife(*a); @@ -547,6 +552,8 @@ metadata_parse_err: if (exists) spin_unlock_bh(&ife->tcf_lock); + tcf_idr_release(*a, bind); + kfree(p); return err; } @@ -596,8 +603,8 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, struct tcf_ife_params *p = rtnl_dereference(ife->params); struct tc_ife opt = { .index = ife->tcf_index, - .refcnt = ife->tcf_refcnt - ref, - .bindcnt = ife->tcf_bindcnt - bind, + .refcnt = refcount_read(&ife->tcf_refcnt) - ref, + .bindcnt = atomic_read(&ife->tcf_bindcnt) - bind, .action = ife->tcf_action, .flags = p->flags, }; @@ -843,6 +850,13 @@ static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_ife_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, ife_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_ife_ops = { .kind = "ife", .type = TCA_ACT_IFE, @@ -853,6 +867,7 @@ static struct tc_action_ops act_ife_ops = { .init = tcf_ife_init, .walk = tcf_ife_walker, .lookup = tcf_ife_search, + .delete = tcf_ife_delete, .size = sizeof(struct tcf_ife_info), }; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 14c312d7908f..0dc787a57798 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -119,13 +119,18 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, if (tb[TCA_IPT_INDEX] != NULL) index = nla_get_u32(tb[TCA_IPT_INDEX]); - exists = tcf_idr_check(tn, index, a, bind); + err = tcf_idr_check_alloc(tn, &index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (tb[TCA_IPT_HOOK] == NULL || tb[TCA_IPT_TARG] == NULL) { if (exists) tcf_idr_release(*a, bind); + else + 
tcf_idr_cleanup(tn, index); return -EINVAL; } @@ -133,22 +138,27 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, index, est, a, ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, index); return ret; + } ret = ACT_P_CREATED; } else { if (bind)/* dont override defaults */ return 0; - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } } hook = nla_get_u32(tb[TCA_IPT_HOOK]); @@ -196,7 +206,8 @@ err1: static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind, struct netlink_ext_ack *extack) + int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr, bind); @@ -204,7 +215,8 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, static int tcf_xt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind, struct netlink_ext_ack *extack) + int bind, bool unlocked, + struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr, bind); @@ -280,8 +292,8 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, if (unlikely(!t)) goto nla_put_failure; - c.bindcnt = ipt->tcf_bindcnt - bind; - c.refcnt = ipt->tcf_refcnt - ref; + c.bindcnt = atomic_read(&ipt->tcf_bindcnt) - bind; + c.refcnt = refcount_read(&ipt->tcf_refcnt) - ref; strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name); if (nla_put(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t) || @@ -322,6 +334,13 @@ static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_ipt_delete(struct net *net, u32 index) +{ 
+ struct tc_action_net *tn = net_generic(net, ipt_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_ipt_ops = { .kind = "ipt", .type = TCA_ACT_IPT, @@ -332,6 +351,7 @@ static struct tc_action_ops act_ipt_ops = { .init = tcf_ipt_init, .walk = tcf_ipt_walker, .lookup = tcf_ipt_search, + .delete = tcf_ipt_delete, .size = sizeof(struct tcf_ipt), }; @@ -372,6 +392,13 @@ static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_xt_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, xt_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_xt_ops = { .kind = "xt", .type = TCA_ACT_XT, @@ -382,6 +409,7 @@ static struct tc_action_ops act_xt_ops = { .init = tcf_xt_init, .walk = tcf_xt_walker, .lookup = tcf_xt_search, + .delete = tcf_xt_delete, .size = sizeof(struct tcf_ipt), }; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index fd34015331ab..6afd89a36c69 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -68,8 +68,9 @@ static unsigned int mirred_net_id; static struct tc_action_ops act_mirred_ops; static int tcf_mirred_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action **a, int ovr, - int bind, struct netlink_ext_ack *extack) + struct nlattr *est, struct tc_action **a, + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mirred_net_id); struct nlattr *tb[TCA_MIRRED_MAX + 1]; @@ -78,7 +79,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct tcf_mirred *m; struct net_device *dev; bool exists = false; - int ret; + int ret, err; if (!nla) { NL_SET_ERR_MSG_MOD(extack, "Mirred requires attributes to be passed"); @@ -93,7 +94,10 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, } parm = nla_data(tb[TCA_MIRRED_PARMS]); - exists = 
tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; @@ -106,6 +110,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, default: if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option"); return -EINVAL; } @@ -114,6 +120,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, if (dev == NULL) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -ENODEV; } mac_header_xmit = dev_is_mac_header_xmit(dev); @@ -123,18 +131,20 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, if (!exists) { if (!dev) { + tcf_idr_cleanup(tn, parm->index); NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist"); return -EINVAL; } ret = tcf_idr_create(tn, parm->index, est, a, &act_mirred_ops, bind, true); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + return -EEXIST; } m = to_mirred(*a); @@ -250,8 +260,8 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, struct tc_mirred opt = { .index = m->tcf_index, .action = m->tcf_action, - .refcnt = m->tcf_refcnt - ref, - .bindcnt = m->tcf_bindcnt - bind, + .refcnt = refcount_read(&m->tcf_refcnt) - ref, + .bindcnt = atomic_read(&m->tcf_bindcnt) - bind, .eaction = m->tcfm_eaction, .ifindex = dev ? 
dev->ifindex : 0, }; @@ -321,6 +331,13 @@ static struct net_device *tcf_mirred_get_dev(const struct tc_action *a) return rtnl_dereference(m->tcfm_dev); } +static int tcf_mirred_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, mirred_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_mirred_ops = { .kind = "mirred", .type = TCA_ACT_MIRRED, @@ -334,6 +351,7 @@ static struct tc_action_ops act_mirred_ops = { .lookup = tcf_mirred_search, .size = sizeof(struct tcf_mirred), .get_dev = tcf_mirred_get_dev, + .delete = tcf_mirred_delete, }; static __net_init int mirred_init_net(struct net *net) diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 4b5848b6c252..4dd9188a72fd 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -38,7 +38,7 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, - struct netlink_ext_ack *extack) + bool rtnl_held, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, nat_net_id); struct nlattr *tb[TCA_NAT_MAX + 1]; @@ -57,18 +57,24 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, return -EINVAL; parm = nla_data(tb[TCA_NAT_PARMS]); - if (!tcf_idr_check(tn, parm->index, a, bind)) { + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (!err) { ret = tcf_idr_create(tn, parm->index, est, a, &act_nat_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; - } else { + } else if (err > 0) { if (bind) return 0; - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } + } else { + return err; } p = to_tcf_nat(*a); @@ -257,8 +263,8 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, .index = p->tcf_index, .action = p->tcf_action, - .refcnt = 
p->tcf_refcnt - ref, - .bindcnt = p->tcf_bindcnt - bind, + .refcnt = refcount_read(&p->tcf_refcnt) - ref, + .bindcnt = atomic_read(&p->tcf_bindcnt) - bind, }; struct tcf_t t; @@ -294,6 +300,13 @@ static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_nat_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, nat_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_nat_ops = { .kind = "nat", .type = TCA_ACT_NAT, @@ -303,6 +316,7 @@ static struct tc_action_ops act_nat_ops = { .init = tcf_nat_init, .walk = tcf_nat_walker, .lookup = tcf_nat_search, + .delete = tcf_nat_delete, .size = sizeof(struct tcf_nat), }; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 8a925c72db5f..9ab5d81aff1a 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -132,20 +132,23 @@ static int tcf_pedit_key_ex_dump(struct sk_buff *skb, static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, pedit_net_id); struct nlattr *tb[TCA_PEDIT_MAX + 1]; - struct nlattr *pattr; - struct tc_pedit *parm; - int ret = 0, err; - struct tcf_pedit *p; struct tc_pedit_key *keys = NULL; struct tcf_pedit_key_ex *keys_ex; + struct tc_pedit *parm; + struct nlattr *pattr; + struct tcf_pedit *p; + int ret = 0, err; int ksize; - if (nla == NULL) + if (!nla) { + NL_SET_ERR_MSG_MOD(extack, "Pedit requires attributes to be passed"); return -EINVAL; + } err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy, NULL); if (err < 0) @@ -154,47 +157,62 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, pattr = tb[TCA_PEDIT_PARMS]; if (!pattr) pattr = tb[TCA_PEDIT_PARMS_EX]; - if (!pattr) + if (!pattr) { + 
NL_SET_ERR_MSG_MOD(extack, "Missing required TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute"); return -EINVAL; + } parm = nla_data(pattr); ksize = parm->nkeys * sizeof(struct tc_pedit_key); - if (nla_len(pattr) < sizeof(*parm) + ksize) + if (nla_len(pattr) < sizeof(*parm) + ksize) { + NL_SET_ERR_MSG_ATTR(extack, pattr, "Length of TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute is invalid"); return -EINVAL; + } keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys); if (IS_ERR(keys_ex)) return PTR_ERR(keys_ex); - if (!tcf_idr_check(tn, parm->index, a, bind)) { - if (!parm->nkeys) - return -EINVAL; + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (!err) { + if (!parm->nkeys) { + tcf_idr_cleanup(tn, parm->index); + NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed"); + ret = -EINVAL; + goto out_free; + } ret = tcf_idr_create(tn, parm->index, est, a, &act_pedit_ops, bind, false); - if (ret) - return ret; + if (ret) { + tcf_idr_cleanup(tn, parm->index); + goto out_free; + } p = to_pedit(*a); keys = kmalloc(ksize, GFP_KERNEL); - if (keys == NULL) { + if (!keys) { tcf_idr_release(*a, bind); - kfree(keys_ex); - return -ENOMEM; + ret = -ENOMEM; + goto out_free; } ret = ACT_P_CREATED; - } else { + } else if (err > 0) { if (bind) - return 0; - tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + goto out_free; + if (!ovr) { + tcf_idr_release(*a, bind); + ret = -EEXIST; + goto out_free; + } p = to_pedit(*a); if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) { keys = kmalloc(ksize, GFP_KERNEL); if (!keys) { - kfree(keys_ex); - return -ENOMEM; + ret = -ENOMEM; + goto out_free; } } + } else { + return err; } spin_lock_bh(&p->tcf_lock); @@ -214,12 +232,17 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; +out_free: + kfree(keys_ex); + return ret; + } static void tcf_pedit_cleanup(struct tc_action *a) { struct tcf_pedit *p = to_pedit(a); struct 
tc_pedit_key *keys = p->tcfp_keys; + kfree(keys); kfree(p->tcfp_keys_ex); } @@ -284,11 +307,12 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, if (p->tcfp_nkeys > 0) { struct tc_pedit_key *tkey = p->tcfp_keys; struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex; - enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; + enum pedit_header_type htype = + TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET; for (i = p->tcfp_nkeys; i > 0; i--, tkey++) { - u32 *ptr, _data; + u32 *ptr, hdata; int offset = tkey->off; int hoffset; u32 val; @@ -303,39 +327,39 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, rc = pedit_skb_hdr_offset(skb, htype, &hoffset); if (rc) { - pr_info("tc filter pedit bad header type specified (0x%x)\n", + pr_info("tc action pedit bad header type specified (0x%x)\n", htype); goto bad; } if (tkey->offmask) { - char *d, _d; + u8 *d, _d; if (!offset_valid(skb, hoffset + tkey->at)) { - pr_info("tc filter pedit 'at' offset %d out of bounds\n", + pr_info("tc action pedit 'at' offset %d out of bounds\n", hoffset + tkey->at); goto bad; } - d = skb_header_pointer(skb, hoffset + tkey->at, 1, - &_d); + d = skb_header_pointer(skb, hoffset + tkey->at, + sizeof(_d), &_d); if (!d) goto bad; offset += (*d & tkey->offmask) >> tkey->shift; } if (offset % 4) { - pr_info("tc filter pedit" - " offset must be on 32 bit boundaries\n"); + pr_info("tc action pedit offset must be on 32 bit boundaries\n"); goto bad; } if (!offset_valid(skb, hoffset + offset)) { - pr_info("tc filter pedit offset %d out of bounds\n", + pr_info("tc action pedit offset %d out of bounds\n", hoffset + offset); goto bad; } - ptr = skb_header_pointer(skb, hoffset + offset, 4, &_data); + ptr = skb_header_pointer(skb, hoffset + offset, + sizeof(hdata), &hdata); if (!ptr) goto bad; /* just do it, baby */ @@ -347,19 +371,20 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, val = (*ptr + tkey->val) & 
~tkey->mask; break; default: - pr_info("tc filter pedit bad command (%d)\n", + pr_info("tc action pedit bad command (%d)\n", cmd); goto bad; } *ptr = ((*ptr & tkey->mask) ^ val); - if (ptr == &_data) + if (ptr == &hdata) skb_store_bits(skb, hoffset + offset, ptr, 4); } goto done; - } else + } else { WARN(1, "pedit BUG: index %d\n", p->tcf_index); + } bad: p->tcf_qstats.overlimits++; @@ -391,8 +416,8 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, opt->nkeys = p->tcfp_nkeys; opt->flags = p->tcfp_flags; opt->action = p->tcf_action; - opt->refcnt = p->tcf_refcnt - ref; - opt->bindcnt = p->tcf_bindcnt - bind; + opt->refcnt = refcount_read(&p->tcf_refcnt) - ref; + opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind; if (p->tcfp_keys_ex) { tcf_pedit_key_ex_dump(skb, p->tcfp_keys_ex, p->tcfp_nkeys); @@ -435,6 +460,13 @@ static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_pedit_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, pedit_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_pedit_ops = { .kind = "pedit", .type = TCA_ACT_PEDIT, @@ -445,6 +477,7 @@ static struct tc_action_ops act_pedit_ops = { .init = tcf_pedit_init, .walk = tcf_pedit_walker, .lookup = tcf_pedit_search, + .delete = tcf_pedit_delete, .size = sizeof(struct tcf_pedit), }; @@ -483,4 +516,3 @@ static void __exit pedit_cleanup_module(void) module_init(pedit_init_module); module_exit(pedit_cleanup_module); - diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 4e72bc2a0dfb..1f3192ea8df7 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -75,7 +75,7 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { static int tcf_act_police_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, + int ovr, int bind, bool rtnl_held, struct 
netlink_ext_ack *extack) { int ret = 0, err; @@ -101,20 +101,24 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla, return -EINVAL; parm = nla_data(tb[TCA_POLICE_TBF]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (!exists) { ret = tcf_idr_create(tn, parm->index, NULL, a, &act_police_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + return -EEXIST; } police = to_police(*a); @@ -195,8 +199,7 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla, failure: qdisc_put_rtab(P_tab); qdisc_put_rtab(R_tab); - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); return err; } @@ -274,8 +277,8 @@ static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, .action = police->tcf_action, .mtu = police->tcfp_mtu, .burst = PSCHED_NS2TICKS(police->tcfp_burst), - .refcnt = police->tcf_refcnt - ref, - .bindcnt = police->tcf_bindcnt - bind, + .refcnt = refcount_read(&police->tcf_refcnt) - ref, + .bindcnt = atomic_read(&police->tcf_bindcnt) - bind, }; struct tcf_t t; @@ -314,6 +317,13 @@ static int tcf_police_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_police_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, police_net_id); + + return tcf_idr_delete_index(tn, index); +} + MODULE_AUTHOR("Alexey Kuznetsov"); MODULE_DESCRIPTION("Policing actions"); MODULE_LICENSE("GPL"); @@ -327,6 +337,7 @@ static struct tc_action_ops act_police_ops = { .init = tcf_act_police_init, .walk = tcf_act_police_walker, .lookup = tcf_police_search, + .delete = tcf_police_delete, .size = sizeof(struct tcf_police), }; diff --git 
a/net/sched/act_sample.c b/net/sched/act_sample.c index 5db358497c9e..3079e7be5bde 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -37,7 +37,8 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = { static int tcf_sample_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind, struct netlink_ext_ack *extack) + int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, sample_net_id); struct nlattr *tb[TCA_SAMPLE_MAX + 1]; @@ -45,7 +46,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, struct tc_sample *parm; struct tcf_sample *s; bool exists = false; - int ret; + int ret, err; if (!nla) return -EINVAL; @@ -58,20 +59,24 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_SAMPLE_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_sample_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + return -EEXIST; } s = to_sample(*a); @@ -80,8 +85,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, s->psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]); psample_group = psample_group_get(net, s->psample_group_num); if (!psample_group) { - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); return -ENOMEM; } RCU_INIT_POINTER(s->psample_group, psample_group); @@ -173,8 +177,8 @@ static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a, struct tc_sample opt = { .index = s->tcf_index, .action = s->tcf_action, - .refcnt = s->tcf_refcnt - ref, - .bindcnt = s->tcf_bindcnt - bind, 
+ .refcnt = refcount_read(&s->tcf_refcnt) - ref, + .bindcnt = atomic_read(&s->tcf_bindcnt) - bind, }; struct tcf_t t; @@ -219,6 +223,13 @@ static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_sample_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, sample_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_sample_ops = { .kind = "sample", .type = TCA_ACT_SAMPLE, @@ -229,6 +240,7 @@ static struct tc_action_ops act_sample_ops = { .cleanup = tcf_sample_cleanup, .walk = tcf_sample_walker, .lookup = tcf_sample_search, + .delete = tcf_sample_delete, .size = sizeof(struct tcf_sample), }; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 98c4afe7c15b..aa51152e0066 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -79,7 +79,8 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, simp_net_id); struct nlattr *tb[TCA_DEF_MAX + 1]; @@ -99,21 +100,28 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, return -EINVAL; parm = nla_data(tb[TCA_DEF_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (tb[TCA_DEF_DATA] == NULL) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_simp_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } d = to_defact(*a); ret = alloc_defdata(d, tb[TCA_DEF_DATA]); @@ -126,9 
+134,10 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, } else { d = to_defact(*a); - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } reset_policy(d, tb[TCA_DEF_DATA], parm); } @@ -145,8 +154,8 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_defact *d = to_defact(a); struct tc_defact opt = { .index = d->tcf_index, - .refcnt = d->tcf_refcnt - ref, - .bindcnt = d->tcf_bindcnt - bind, + .refcnt = refcount_read(&d->tcf_refcnt) - ref, + .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, .action = d->tcf_action, }; struct tcf_t t; @@ -183,6 +192,13 @@ static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_simp_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, simp_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_simp_ops = { .kind = "simple", .type = TCA_ACT_SIMP, @@ -193,6 +209,7 @@ static struct tc_action_ops act_simp_ops = { .init = tcf_simp_init, .walk = tcf_simp_walker, .lookup = tcf_simp_search, + .delete = tcf_simp_delete, .size = sizeof(struct tcf_defact), }; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 6138d1d71900..da56e6938c9e 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -23,6 +23,9 @@ #include <linux/rtnetlink.h> #include <net/netlink.h> #include <net/pkt_sched.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/dsfield.h> #include <linux/tc_act/tc_skbedit.h> #include <net/tc_act/tc_skbedit.h> @@ -34,25 +37,54 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_skbedit *d = to_skbedit(a); + struct tcf_skbedit_params *params; + int action; - spin_lock(&d->tcf_lock); tcf_lastuse_update(&d->tcf_tm); - bstats_update(&d->tcf_bstats, skb); - - if (d->flags & SKBEDIT_F_PRIORITY) - skb->priority = 
d->priority; - if (d->flags & SKBEDIT_F_QUEUE_MAPPING && - skb->dev->real_num_tx_queues > d->queue_mapping) - skb_set_queue_mapping(skb, d->queue_mapping); - if (d->flags & SKBEDIT_F_MARK) { - skb->mark &= ~d->mask; - skb->mark |= d->mark & d->mask; + bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); + + rcu_read_lock(); + params = rcu_dereference(d->params); + action = READ_ONCE(d->tcf_action); + + if (params->flags & SKBEDIT_F_PRIORITY) + skb->priority = params->priority; + if (params->flags & SKBEDIT_F_INHERITDSFIELD) { + int wlen = skb_network_offset(skb); + + switch (tc_skb_protocol(skb)) { + case htons(ETH_P_IP): + wlen += sizeof(struct iphdr); + if (!pskb_may_pull(skb, wlen)) + goto err; + skb->priority = ipv4_get_dsfield(ip_hdr(skb)) >> 2; + break; + + case htons(ETH_P_IPV6): + wlen += sizeof(struct ipv6hdr); + if (!pskb_may_pull(skb, wlen)) + goto err; + skb->priority = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2; + break; + } } - if (d->flags & SKBEDIT_F_PTYPE) - skb->pkt_type = d->ptype; - - spin_unlock(&d->tcf_lock); - return d->tcf_action; + if (params->flags & SKBEDIT_F_QUEUE_MAPPING && + skb->dev->real_num_tx_queues > params->queue_mapping) + skb_set_queue_mapping(skb, params->queue_mapping); + if (params->flags & SKBEDIT_F_MARK) { + skb->mark &= ~params->mask; + skb->mark |= params->mark & params->mask; + } + if (params->flags & SKBEDIT_F_PTYPE) + skb->pkt_type = params->ptype; + +unlock: + rcu_read_unlock(); + return action; +err: + qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats)); + action = TC_ACT_SHOT; + goto unlock; } static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { @@ -62,13 +94,16 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { [TCA_SKBEDIT_MARK] = { .len = sizeof(u32) }, [TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) }, [TCA_SKBEDIT_MASK] = { .len = sizeof(u32) }, + [TCA_SKBEDIT_FLAGS] = { .len = sizeof(u64) }, }; static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct nlattr 
*est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); + struct tcf_skbedit_params *params_old, *params_new; struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; struct tc_skbedit *parm; struct tcf_skbedit *d; @@ -114,52 +149,76 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, mask = nla_data(tb[TCA_SKBEDIT_MASK]); } + if (tb[TCA_SKBEDIT_FLAGS] != NULL) { + u64 *pure_flags = nla_data(tb[TCA_SKBEDIT_FLAGS]); + + if (*pure_flags & SKBEDIT_F_INHERITDSFIELD) + flags |= SKBEDIT_F_INHERITDSFIELD; + } + parm = nla_data(tb[TCA_SKBEDIT_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (!flags) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, - &act_skbedit_ops, bind, false); - if (ret) + &act_skbedit_ops, bind, true); + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } d = to_skbedit(*a); ret = ACT_P_CREATED; } else { d = to_skbedit(*a); - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } } - spin_lock_bh(&d->tcf_lock); + ASSERT_RTNL(); + + params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); + if (unlikely(!params_new)) { + if (ret == ACT_P_CREATED) + tcf_idr_release(*a, bind); + return -ENOMEM; + } - d->flags = flags; + params_new->flags = flags; if (flags & SKBEDIT_F_PRIORITY) - d->priority = *priority; + params_new->priority = *priority; if (flags & SKBEDIT_F_QUEUE_MAPPING) - d->queue_mapping = *queue_mapping; + params_new->queue_mapping = *queue_mapping; if (flags & SKBEDIT_F_MARK) - d->mark = *mark; + params_new->mark = *mark; if (flags & SKBEDIT_F_PTYPE) - d->ptype = *ptype; + 
params_new->ptype = *ptype; /* default behaviour is to use all the bits */ - d->mask = 0xffffffff; + params_new->mask = 0xffffffff; if (flags & SKBEDIT_F_MASK) - d->mask = *mask; + params_new->mask = *mask; d->tcf_action = parm->action; - - spin_unlock_bh(&d->tcf_lock); + params_old = rtnl_dereference(d->params); + rcu_assign_pointer(d->params, params_new); + if (params_old) + kfree_rcu(params_old, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -171,30 +230,39 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, { unsigned char *b = skb_tail_pointer(skb); struct tcf_skbedit *d = to_skbedit(a); + struct tcf_skbedit_params *params; struct tc_skbedit opt = { .index = d->tcf_index, - .refcnt = d->tcf_refcnt - ref, - .bindcnt = d->tcf_bindcnt - bind, + .refcnt = refcount_read(&d->tcf_refcnt) - ref, + .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, .action = d->tcf_action, }; + u64 pure_flags = 0; struct tcf_t t; + params = rtnl_dereference(d->params); + if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - if ((d->flags & SKBEDIT_F_PRIORITY) && - nla_put_u32(skb, TCA_SKBEDIT_PRIORITY, d->priority)) + if ((params->flags & SKBEDIT_F_PRIORITY) && + nla_put_u32(skb, TCA_SKBEDIT_PRIORITY, params->priority)) + goto nla_put_failure; + if ((params->flags & SKBEDIT_F_QUEUE_MAPPING) && + nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING, params->queue_mapping)) goto nla_put_failure; - if ((d->flags & SKBEDIT_F_QUEUE_MAPPING) && - nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING, d->queue_mapping)) + if ((params->flags & SKBEDIT_F_MARK) && + nla_put_u32(skb, TCA_SKBEDIT_MARK, params->mark)) goto nla_put_failure; - if ((d->flags & SKBEDIT_F_MARK) && - nla_put_u32(skb, TCA_SKBEDIT_MARK, d->mark)) + if ((params->flags & SKBEDIT_F_PTYPE) && + nla_put_u16(skb, TCA_SKBEDIT_PTYPE, params->ptype)) goto nla_put_failure; - if ((d->flags & SKBEDIT_F_PTYPE) && - nla_put_u16(skb, TCA_SKBEDIT_PTYPE, d->ptype)) + if ((params->flags & SKBEDIT_F_MASK) 
&& + nla_put_u32(skb, TCA_SKBEDIT_MASK, params->mask)) goto nla_put_failure; - if ((d->flags & SKBEDIT_F_MASK) && - nla_put_u32(skb, TCA_SKBEDIT_MASK, d->mask)) + if (params->flags & SKBEDIT_F_INHERITDSFIELD) + pure_flags |= SKBEDIT_F_INHERITDSFIELD; + if (pure_flags != 0 && + nla_put(skb, TCA_SKBEDIT_FLAGS, sizeof(pure_flags), &pure_flags)) goto nla_put_failure; tcf_tm_dump(&t, &d->tcf_tm); @@ -207,6 +275,16 @@ nla_put_failure: return -1; } +static void tcf_skbedit_cleanup(struct tc_action *a) +{ + struct tcf_skbedit *d = to_skbedit(a); + struct tcf_skbedit_params *params; + + params = rcu_dereference_protected(d->params, 1); + if (params) + kfree_rcu(params, rcu); +} + static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops, @@ -225,6 +303,13 @@ static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_skbedit_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, skbedit_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_skbedit_ops = { .kind = "skbedit", .type = TCA_ACT_SKBEDIT, @@ -232,8 +317,10 @@ static struct tc_action_ops act_skbedit_ops = { .act = tcf_skbedit, .dump = tcf_skbedit_dump, .init = tcf_skbedit_init, + .cleanup = tcf_skbedit_cleanup, .walk = tcf_skbedit_walker, .lookup = tcf_skbedit_search, + .delete = tcf_skbedit_delete, .size = sizeof(struct tcf_skbedit), }; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index ad050d7d4b46..cdc6bacfb190 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -84,7 +84,8 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = { static int tcf_skbmod_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack 
*extack) { struct tc_action_net *tn = net_generic(net, skbmod_net_id); struct nlattr *tb[TCA_SKBMOD_MAX + 1]; @@ -127,27 +128,33 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, if (parm->flags & SKBMOD_F_SWAPMAC) lflags = SKBMOD_F_SWAPMAC; - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (!lflags) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_skbmod_ops, bind, true); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + return -EEXIST; } d = to_skbmod(*a); @@ -155,8 +162,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, ASSERT_RTNL(); p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL); if (unlikely(!p)) { - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); return -ENOMEM; } @@ -205,8 +211,8 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_skbmod_params *p = rtnl_dereference(d->skbmod_p); struct tc_skbmod opt = { .index = d->tcf_index, - .refcnt = d->tcf_refcnt - ref, - .bindcnt = d->tcf_bindcnt - bind, + .refcnt = refcount_read(&d->tcf_refcnt) - ref, + .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, .action = d->tcf_action, }; struct tcf_t t; @@ -252,6 +258,13 @@ static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_skbmod_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, skbmod_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_skbmod_ops = { .kind = "skbmod", .type = TCA_ACT_SKBMOD, @@ -262,6 +275,7 @@ static struct 
tc_action_ops act_skbmod_ops = { .cleanup = tcf_skbmod_cleanup, .walk = tcf_skbmod_walker, .lookup = tcf_skbmod_search, + .delete = tcf_skbmod_delete, .size = sizeof(struct tcf_skbmod), }; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 626dac81a48a..f811850fd1d0 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -13,6 +13,7 @@ #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> +#include <net/geneve.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/dst.h> @@ -36,7 +37,7 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, tcf_lastuse_update(&t->tcf_tm); bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb); - action = params->action; + action = READ_ONCE(t->tcf_action); switch (params->tcft_action) { case TCA_TUNNEL_KEY_ACT_RELEASE: @@ -57,6 +58,135 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, return action; } +static const struct nla_policy +enc_opts_policy[TCA_TUNNEL_KEY_ENC_OPTS_MAX + 1] = { + [TCA_TUNNEL_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy +geneve_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1] = { + [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, + [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, + [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY, + .len = 128 }, +}; + +static int +tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1]; + int err, data_len, opt_len; + u8 *data; + + err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX, + nla, geneve_opt_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] || + !tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] || + !tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key geneve option 
class, type or data"); + return -EINVAL; + } + + data = nla_data(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]); + data_len = nla_len(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]); + if (data_len < 4) { + NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is less than 4 bytes long"); + return -ERANGE; + } + if (data_len % 4) { + NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is not a multiple of 4 bytes long"); + return -ERANGE; + } + + opt_len = sizeof(struct geneve_opt) + data_len; + if (dst) { + struct geneve_opt *opt = dst; + + WARN_ON(dst_len < opt_len); + + opt->opt_class = + nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS]); + opt->type = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE]); + opt->length = data_len / 4; /* length is in units of 4 bytes */ + opt->r1 = 0; + opt->r2 = 0; + opt->r3 = 0; + + memcpy(opt + 1, data, data_len); + } + + return opt_len; +} + +static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, + int dst_len, struct netlink_ext_ack *extack) +{ + int err, rem, opt_len, len = nla_len(nla), opts_len = 0; + const struct nlattr *attr, *head = nla_data(nla); + + err = nla_validate(head, len, TCA_TUNNEL_KEY_ENC_OPTS_MAX, + enc_opts_policy, extack); + if (err) + return err; + + nla_for_each_attr(attr, head, len, rem) { + switch (nla_type(attr)) { + case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE: + opt_len = tunnel_key_copy_geneve_opt(attr, dst, + dst_len, extack); + if (opt_len < 0) + return opt_len; + opts_len += opt_len; + if (dst) { + dst_len -= opt_len; + dst += opt_len; + } + break; + } + } + + if (!opts_len) { + NL_SET_ERR_MSG(extack, "Empty list of tunnel options"); + return -EINVAL; + } + + if (rem > 0) { + NL_SET_ERR_MSG(extack, "Trailing data after parsing tunnel key options attributes"); + return -EINVAL; + } + + return opts_len; +} + +static int tunnel_key_get_opts_len(struct nlattr *nla, + struct netlink_ext_ack *extack) +{ + return tunnel_key_copy_opts(nla, NULL, 0, extack); +} + +static int tunnel_key_opts_set(struct nlattr 
*nla, struct ip_tunnel_info *info, + int opts_len, struct netlink_ext_ack *extack) +{ + info->options_len = opts_len; + switch (nla_type(nla_data(nla))) { + case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE: +#if IS_ENABLED(CONFIG_INET) + info->key.tun_flags |= TUNNEL_GENEVE_OPT; + return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info), + opts_len, extack); +#else + return -EAFNOSUPPORT; +#endif + default: + NL_SET_ERR_MSG(extack, "Cannot set tunnel options for unknown tunnel type"); + return -EINVAL; + } +} + static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = { [TCA_TUNNEL_KEY_PARMS] = { .len = sizeof(struct tc_tunnel_key) }, [TCA_TUNNEL_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 }, @@ -66,11 +196,15 @@ static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = { [TCA_TUNNEL_KEY_ENC_KEY_ID] = { .type = NLA_U32 }, [TCA_TUNNEL_KEY_ENC_DST_PORT] = {.type = NLA_U16}, [TCA_TUNNEL_KEY_NO_CSUM] = { .type = NLA_U8 }, + [TCA_TUNNEL_KEY_ENC_OPTS] = { .type = NLA_NESTED }, + [TCA_TUNNEL_KEY_ENC_TOS] = { .type = NLA_U8 }, + [TCA_TUNNEL_KEY_ENC_TTL] = { .type = NLA_U8 }, }; static int tunnel_key_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; @@ -81,24 +215,35 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, struct tcf_tunnel_key *t; bool exists = false; __be16 dst_port = 0; + int opts_len = 0; __be64 key_id; __be16 flags; + u8 tos, ttl; int ret = 0; int err; - if (!nla) + if (!nla) { + NL_SET_ERR_MSG(extack, "Tunnel requires attributes to be passed"); return -EINVAL; + } err = nla_parse_nested(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy, - NULL); - if (err < 0) + extack); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Failed to parse nested tunnel key attributes"); 
return err; + } - if (!tb[TCA_TUNNEL_KEY_PARMS]) + if (!tb[TCA_TUNNEL_KEY_PARMS]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key parameters"); return -EINVAL; + } parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; @@ -107,6 +252,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, break; case TCA_TUNNEL_KEY_ACT_SET: if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key id"); ret = -EINVAL; goto err_out; } @@ -121,6 +267,22 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, if (tb[TCA_TUNNEL_KEY_ENC_DST_PORT]) dst_port = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_DST_PORT]); + if (tb[TCA_TUNNEL_KEY_ENC_OPTS]) { + opts_len = tunnel_key_get_opts_len(tb[TCA_TUNNEL_KEY_ENC_OPTS], + extack); + if (opts_len < 0) { + ret = opts_len; + goto err_out; + } + } + + tos = 0; + if (tb[TCA_TUNNEL_KEY_ENC_TOS]) + tos = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_TOS]); + ttl = 0; + if (tb[TCA_TUNNEL_KEY_ENC_TTL]) + ttl = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_TTL]); + if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] && tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) { __be32 saddr; @@ -129,9 +291,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, saddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]); daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]); - metadata = __ip_tun_set_dst(saddr, daddr, 0, 0, + metadata = __ip_tun_set_dst(saddr, daddr, tos, ttl, dst_port, flags, - key_id, 0); + key_id, opts_len); } else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] && tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) { struct in6_addr saddr; @@ -140,19 +302,33 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]); daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]); - metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, dst_port, + 
metadata = __ipv6_tun_set_dst(&saddr, &daddr, tos, ttl, dst_port, 0, flags, key_id, 0); + } else { + NL_SET_ERR_MSG(extack, "Missing either ipv4 or ipv6 src and dst"); + ret = -EINVAL; + goto err_out; } if (!metadata) { - ret = -EINVAL; + NL_SET_ERR_MSG(extack, "Cannot allocate tunnel metadata dst"); + ret = -ENOMEM; goto err_out; } + if (opts_len) { + ret = tunnel_key_opts_set(tb[TCA_TUNNEL_KEY_ENC_OPTS], + &metadata->u.tun_info, + opts_len, extack); + if (ret < 0) + goto err_out; + } + metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX; break; default: + NL_SET_ERR_MSG(extack, "Unknown tunnel key action"); ret = -EINVAL; goto err_out; } @@ -160,14 +336,16 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_tunnel_key_ops, bind, true); - if (ret) - return ret; + if (ret) { + NL_SET_ERR_MSG(extack, "Cannot create TC IDR"); + goto err_out; + } ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + NL_SET_ERR_MSG(extack, "TC IDR already exists"); + return -EEXIST; } t = to_tunnel_key(*a); @@ -175,14 +353,14 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, ASSERT_RTNL(); params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); + NL_SET_ERR_MSG(extack, "Cannot allocate tunnel key parameters"); return -ENOMEM; } params_old = rtnl_dereference(t->params); - params_new->action = parm->action; + t->tcf_action = parm->action; params_new->tcft_action = parm->t_action; params_new->tcft_enc_metadata = metadata; @@ -199,6 +377,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, err_out: if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return ret; } @@ -216,6 +396,61 @@ static void tunnel_key_release(struct tc_action *a) } } +static int tunnel_key_geneve_opts_dump(struct 
sk_buff *skb, + const struct ip_tunnel_info *info) +{ + int len = info->options_len; + u8 *src = (u8 *)(info + 1); + struct nlattr *start; + + start = nla_nest_start(skb, TCA_TUNNEL_KEY_ENC_OPTS_GENEVE); + if (!start) + return -EMSGSIZE; + + while (len > 0) { + struct geneve_opt *opt = (struct geneve_opt *)src; + + if (nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS, + opt->opt_class) || + nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE, + opt->type) || + nla_put(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA, + opt->length * 4, opt + 1)) + return -EMSGSIZE; + + len -= sizeof(struct geneve_opt) + opt->length * 4; + src += sizeof(struct geneve_opt) + opt->length * 4; + } + + nla_nest_end(skb, start); + return 0; +} + +static int tunnel_key_opts_dump(struct sk_buff *skb, + const struct ip_tunnel_info *info) +{ + struct nlattr *start; + int err; + + if (!info->options_len) + return 0; + + start = nla_nest_start(skb, TCA_TUNNEL_KEY_ENC_OPTS); + if (!start) + return -EMSGSIZE; + + if (info->key.tun_flags & TUNNEL_GENEVE_OPT) { + err = tunnel_key_geneve_opts_dump(skb, info); + if (err) + return err; + } else { + return -EINVAL; + } + + nla_nest_end(skb, start); + return 0; +} + static int tunnel_key_dump_addresses(struct sk_buff *skb, const struct ip_tunnel_info *info) { @@ -252,22 +487,23 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_tunnel_key_params *params; struct tc_tunnel_key opt = { .index = t->tcf_index, - .refcnt = t->tcf_refcnt - ref, - .bindcnt = t->tcf_bindcnt - bind, + .refcnt = refcount_read(&t->tcf_refcnt) - ref, + .bindcnt = atomic_read(&t->tcf_bindcnt) - bind, + .action = t->tcf_action, }; struct tcf_t tm; params = rtnl_dereference(t->params); opt.t_action = params->tcft_action; - opt.action = params->action; if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt)) goto nla_put_failure; if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) { - struct ip_tunnel_key *key = - ¶ms->tcft_enc_metadata->u.tun_info.key; + 
struct ip_tunnel_info *info = + ¶ms->tcft_enc_metadata->u.tun_info; + struct ip_tunnel_key *key = &info->key; __be32 key_id = tunnel_id_to_key32(key->tun_id); if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) || @@ -275,7 +511,14 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, ¶ms->tcft_enc_metadata->u.tun_info) || nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst) || nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM, - !(key->tun_flags & TUNNEL_CSUM))) + !(key->tun_flags & TUNNEL_CSUM)) || + tunnel_key_opts_dump(skb, info)) + goto nla_put_failure; + + if (key->tos && nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_TOS, key->tos)) + goto nla_put_failure; + + if (key->ttl && nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_TTL, key->ttl)) goto nla_put_failure; } @@ -309,6 +552,13 @@ static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tunnel_key_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_tunnel_key_ops = { .kind = "tunnel_key", .type = TCA_ACT_TUNNEL_KEY, @@ -319,6 +569,7 @@ static struct tc_action_ops act_tunnel_key_ops = { .cleanup = tunnel_key_release, .walk = tunnel_key_walker, .lookup = tunnel_key_search, + .delete = tunnel_key_delete, .size = sizeof(struct tcf_tunnel_key), }; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 1fb39e1f9d07..ad37f308175a 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -109,7 +109,8 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { static int tcf_vlan_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; @@ -133,7 +134,10 
@@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (!tb[TCA_VLAN_PARMS]) return -EINVAL; parm = nla_data(tb[TCA_VLAN_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; @@ -145,12 +149,16 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (!tb[TCA_VLAN_PUSH_VLAN_ID]) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EINVAL; } push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]); if (push_vid >= VLAN_VID_MASK) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -ERANGE; } @@ -163,6 +171,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, default: if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EPROTONOSUPPORT; } } else { @@ -175,6 +185,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, default: if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EINVAL; } action = parm->v_action; @@ -182,14 +194,15 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_vlan_ops, bind, true); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + return -EEXIST; } v = to_vlan(*a); @@ -197,8 +210,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, ASSERT_RTNL(); p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) { - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); return -ENOMEM; } @@ -239,8 +251,8 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_vlan_params *p = rtnl_dereference(v->vlan_p); struct tc_vlan opt = { .index = v->tcf_index, - .refcnt = 
v->tcf_refcnt - ref, - .bindcnt = v->tcf_bindcnt - bind, + .refcnt = refcount_read(&v->tcf_refcnt) - ref, + .bindcnt = atomic_read(&v->tcf_bindcnt) - bind, .action = v->tcf_action, .v_action = p->tcfv_action, }; @@ -286,6 +298,13 @@ static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_vlan_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, vlan_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_vlan_ops = { .kind = "vlan", .type = TCA_ACT_VLAN, @@ -296,6 +315,7 @@ static struct tc_action_ops act_vlan_ops = { .cleanup = tcf_vlan_cleanup, .walk = tcf_vlan_walker, .lookup = tcf_vlan_search, + .delete = tcf_vlan_delete, .size = sizeof(struct tcf_vlan), }; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index cdc3c87c53e6..75cce2819de9 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -39,7 +39,7 @@ static DEFINE_RWLOCK(cls_mod_lock); /* Find classifier type by string name */ -static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind) +static const struct tcf_proto_ops *__tcf_proto_lookup_ops(const char *kind) { const struct tcf_proto_ops *t, *res = NULL; @@ -57,6 +57,33 @@ static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind) return res; } +static const struct tcf_proto_ops * +tcf_proto_lookup_ops(const char *kind, struct netlink_ext_ack *extack) +{ + const struct tcf_proto_ops *ops; + + ops = __tcf_proto_lookup_ops(kind); + if (ops) + return ops; +#ifdef CONFIG_MODULES + rtnl_unlock(); + request_module("cls_%s", kind); + rtnl_lock(); + ops = __tcf_proto_lookup_ops(kind); + /* We dropped the RTNL semaphore in order to perform + * the module load. So, even if we succeeded in loading + * the module we have to replay the request. We indicate + * this using -EAGAIN. 
+ */ + if (ops) { + module_put(ops->owner); + return ERR_PTR(-EAGAIN); + } +#endif + NL_SET_ERR_MSG(extack, "TC classifier not found"); + return ERR_PTR(-ENOENT); +} + /* Register(unregister) new classifier type */ int register_tcf_proto_ops(struct tcf_proto_ops *ops) @@ -133,27 +160,9 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, if (!tp) return ERR_PTR(-ENOBUFS); - err = -ENOENT; - tp->ops = tcf_proto_lookup_ops(kind); - if (!tp->ops) { -#ifdef CONFIG_MODULES - rtnl_unlock(); - request_module("cls_%s", kind); - rtnl_lock(); - tp->ops = tcf_proto_lookup_ops(kind); - /* We dropped the RTNL semaphore in order to perform - * the module load. So, even if we succeeded in loading - * the module we have to replay the request. We indicate - * this using -EAGAIN. - */ - if (tp->ops) { - module_put(tp->ops->owner); - err = -EAGAIN; - } else { - NL_SET_ERR_MSG(extack, "TC classifier not found"); - err = -ENOENT; - } -#endif + tp->ops = tcf_proto_lookup_ops(kind, extack); + if (IS_ERR(tp->ops)) { + err = PTR_ERR(tp->ops); goto errout; } tp->classify = tp->ops->classify; @@ -195,11 +204,12 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block, chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (!chain) return NULL; - INIT_LIST_HEAD(&chain->filter_chain_list); list_add_tail(&chain->list, &block->chain_list); chain->block = block; chain->index = chain_index; chain->refcnt = 1; + if (!chain->index) + block->chain0.chain = chain; return chain; } @@ -209,12 +219,16 @@ static void tcf_chain_head_change_item(struct tcf_filter_chain_list_item *item, if (item->chain_head_change) item->chain_head_change(tp_head, item->chain_head_change_priv); } -static void tcf_chain_head_change(struct tcf_chain *chain, - struct tcf_proto *tp_head) + +static void tcf_chain0_head_change(struct tcf_chain *chain, + struct tcf_proto *tp_head) { struct tcf_filter_chain_list_item *item; + struct tcf_block *block = chain->block; - list_for_each_entry(item, 
&chain->filter_chain_list, list) + if (chain->index) + return; + list_for_each_entry(item, &block->chain0.filter_chain_list, list) tcf_chain_head_change_item(item, tp_head); } @@ -222,7 +236,7 @@ static void tcf_chain_flush(struct tcf_chain *chain) { struct tcf_proto *tp = rtnl_dereference(chain->filter_chain); - tcf_chain_head_change(chain, NULL); + tcf_chain0_head_change(chain, NULL); while (tp) { RCU_INIT_POINTER(chain->filter_chain, tp->next); tcf_proto_destroy(tp, NULL); @@ -236,8 +250,10 @@ static void tcf_chain_destroy(struct tcf_chain *chain) struct tcf_block *block = chain->block; list_del(&chain->list); + if (!chain->index) + block->chain0.chain = NULL; kfree(chain); - if (list_empty(&block->chain_list)) + if (list_empty(&block->chain_list) && block->refcnt == 0) kfree(block); } @@ -246,29 +262,60 @@ static void tcf_chain_hold(struct tcf_chain *chain) ++chain->refcnt; } -struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, - bool create) +static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block, + u32 chain_index) { struct tcf_chain *chain; list_for_each_entry(chain, &block->chain_list, list) { - if (chain->index == chain_index) { - tcf_chain_hold(chain); + if (chain->index == chain_index) return chain; - } + } + return NULL; +} + +static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, + u32 seq, u16 flags, int event, bool unicast); + +struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, + bool create) +{ + struct tcf_chain *chain = tcf_chain_lookup(block, chain_index); + + if (chain) { + tcf_chain_hold(chain); + return chain; } - return create ? 
tcf_chain_create(block, chain_index) : NULL; + if (!create) + return NULL; + chain = tcf_chain_create(block, chain_index); + if (!chain) + return NULL; + tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, + RTM_NEWCHAIN, false); + return chain; } EXPORT_SYMBOL(tcf_chain_get); +static void tc_chain_tmplt_del(struct tcf_chain *chain); + void tcf_chain_put(struct tcf_chain *chain) { - if (--chain->refcnt == 0) + if (--chain->refcnt == 0) { + tc_chain_notify(chain, NULL, 0, 0, RTM_DELCHAIN, false); + tc_chain_tmplt_del(chain); tcf_chain_destroy(chain); + } } EXPORT_SYMBOL(tcf_chain_put); +static void tcf_chain_put_explicitly_created(struct tcf_chain *chain) +{ + if (chain->explicitly_created) + tcf_chain_put(chain); +} + static bool tcf_block_offload_in_use(struct tcf_block *block) { return block->offloadcnt; @@ -277,18 +324,21 @@ static bool tcf_block_offload_in_use(struct tcf_block *block) static int tcf_block_offload_cmd(struct tcf_block *block, struct net_device *dev, struct tcf_block_ext_info *ei, - enum tc_block_command command) + enum tc_block_command command, + struct netlink_ext_ack *extack) { struct tc_block_offload bo = {}; bo.command = command; bo.binder_type = ei->binder_type; bo.block = block; + bo.extack = extack; return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); } static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, - struct tcf_block_ext_info *ei) + struct tcf_block_ext_info *ei, + struct netlink_ext_ack *extack) { struct net_device *dev = q->dev_queue->dev; int err; @@ -299,10 +349,12 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, /* If tc offload feature is disabled and the block we try to bind * to already has some offloaded filters, forbid to bind. 
*/ - if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) + if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) { + NL_SET_ERR_MSG(extack, "Bind to offloaded block failed as dev has offload disabled"); return -EOPNOTSUPP; + } - err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND); + err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND, extack); if (err == -EOPNOTSUPP) goto no_offload_dev_inc; return err; @@ -322,7 +374,7 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, if (!dev->netdev_ops->ndo_setup_tc) goto no_offload_dev_dec; - err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND); + err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND, NULL); if (err == -EOPNOTSUPP) goto no_offload_dev_dec; return; @@ -332,10 +384,11 @@ no_offload_dev_dec: } static int -tcf_chain_head_change_cb_add(struct tcf_chain *chain, - struct tcf_block_ext_info *ei, - struct netlink_ext_ack *extack) +tcf_chain0_head_change_cb_add(struct tcf_block *block, + struct tcf_block_ext_info *ei, + struct netlink_ext_ack *extack) { + struct tcf_chain *chain0 = block->chain0.chain; struct tcf_filter_chain_list_item *item; item = kmalloc(sizeof(*item), GFP_KERNEL); @@ -345,23 +398,25 @@ tcf_chain_head_change_cb_add(struct tcf_chain *chain, } item->chain_head_change = ei->chain_head_change; item->chain_head_change_priv = ei->chain_head_change_priv; - if (chain->filter_chain) - tcf_chain_head_change_item(item, chain->filter_chain); - list_add(&item->list, &chain->filter_chain_list); + if (chain0 && chain0->filter_chain) + tcf_chain_head_change_item(item, chain0->filter_chain); + list_add(&item->list, &block->chain0.filter_chain_list); return 0; } static void -tcf_chain_head_change_cb_del(struct tcf_chain *chain, - struct tcf_block_ext_info *ei) +tcf_chain0_head_change_cb_del(struct tcf_block *block, + struct tcf_block_ext_info *ei) { + struct tcf_chain *chain0 = block->chain0.chain; struct tcf_filter_chain_list_item *item; - 
list_for_each_entry(item, &chain->filter_chain_list, list) { + list_for_each_entry(item, &block->chain0.filter_chain_list, list) { if ((!ei->chain_head_change && !ei->chain_head_change_priv) || (item->chain_head_change == ei->chain_head_change && item->chain_head_change_priv == ei->chain_head_change_priv)) { - tcf_chain_head_change_item(item, NULL); + if (chain0) + tcf_chain_head_change_item(item, NULL); list_del(&item->list); kfree(item); return; @@ -397,8 +452,6 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, struct netlink_ext_ack *extack) { struct tcf_block *block; - struct tcf_chain *chain; - int err; block = kzalloc(sizeof(*block), GFP_KERNEL); if (!block) { @@ -408,14 +461,8 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, INIT_LIST_HEAD(&block->chain_list); INIT_LIST_HEAD(&block->cb_list); INIT_LIST_HEAD(&block->owner_list); + INIT_LIST_HEAD(&block->chain0.filter_chain_list); - /* Create chain 0 by default, it has to be always present. 
*/ - chain = tcf_chain_create(block, 0); - if (!chain) { - NL_SET_ERR_MSG(extack, "Failed to create new tcf chain"); - err = -ENOMEM; - goto err_chain_create; - } block->refcnt = 1; block->net = net; block->index = block_index; @@ -424,10 +471,6 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, if (!tcf_block_shared(block)) block->q = q; return block; - -err_chain_create: - kfree(block); - return ERR_PTR(err); } static struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index) @@ -509,11 +552,6 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q, return block; } -static struct tcf_chain *tcf_block_chain_zero(struct tcf_block *block) -{ - return list_first_entry(&block->chain_list, struct tcf_chain, list); -} - struct tcf_block_owner_item { struct list_head list; struct Qdisc *q; @@ -607,12 +645,11 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, tcf_block_owner_netif_keep_dst(block, q, ei->binder_type); - err = tcf_chain_head_change_cb_add(tcf_block_chain_zero(block), - ei, extack); + err = tcf_chain0_head_change_cb_add(block, ei, extack); if (err) - goto err_chain_head_change_cb_add; + goto err_chain0_head_change_cb_add; - err = tcf_block_offload_bind(block, q, ei); + err = tcf_block_offload_bind(block, q, ei, extack); if (err) goto err_block_offload_bind; @@ -620,15 +657,14 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, return 0; err_block_offload_bind: - tcf_chain_head_change_cb_del(tcf_block_chain_zero(block), ei); -err_chain_head_change_cb_add: + tcf_chain0_head_change_cb_del(block, ei); +err_chain0_head_change_cb_add: tcf_block_owner_del(block, q, ei->binder_type); err_block_owner_add: if (created) { if (tcf_block_shared(block)) tcf_block_remove(block, net); err_block_insert: - kfree(tcf_block_chain_zero(block)); kfree(block); } else { block->refcnt--; @@ -668,10 +704,10 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, if (!block) 
return; - tcf_chain_head_change_cb_del(tcf_block_chain_zero(block), ei); + tcf_chain0_head_change_cb_del(block, ei); tcf_block_owner_del(block, q, ei->binder_type); - if (--block->refcnt == 0) { + if (block->refcnt == 1) { if (tcf_block_shared(block)) tcf_block_remove(block, block->net); @@ -687,13 +723,16 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, tcf_block_offload_unbind(block, q, ei); - if (block->refcnt == 0) { + if (block->refcnt == 1) { /* At this point, all the chains should have refcnt >= 1. */ - list_for_each_entry_safe(chain, tmp, &block->chain_list, list) + list_for_each_entry_safe(chain, tmp, &block->chain_list, list) { + tcf_chain_put_explicitly_created(chain); tcf_chain_put(chain); + } - /* Finally, put chain 0 and allow block to be freed. */ - tcf_chain_put(tcf_block_chain_zero(block)); + block->refcnt--; + if (list_empty(&block->chain_list)) + kfree(block); } } EXPORT_SYMBOL(tcf_block_put_ext); @@ -746,18 +785,53 @@ unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb) } EXPORT_SYMBOL(tcf_block_cb_decref); +static int +tcf_block_playback_offloads(struct tcf_block *block, tc_setup_cb_t *cb, + void *cb_priv, bool add, bool offload_in_use, + struct netlink_ext_ack *extack) +{ + struct tcf_chain *chain; + struct tcf_proto *tp; + int err; + + list_for_each_entry(chain, &block->chain_list, list) { + for (tp = rtnl_dereference(chain->filter_chain); tp; + tp = rtnl_dereference(tp->next)) { + if (tp->ops->reoffload) { + err = tp->ops->reoffload(tp, add, cb, cb_priv, + extack); + if (err && add) + goto err_playback_remove; + } else if (add && offload_in_use) { + err = -EOPNOTSUPP; + NL_SET_ERR_MSG(extack, "Filter HW offload failed - classifier without re-offloading support"); + goto err_playback_remove; + } + } + } + + return 0; + +err_playback_remove: + tcf_block_playback_offloads(block, cb, cb_priv, false, offload_in_use, + extack); + return err; +} + struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, 
tc_setup_cb_t *cb, void *cb_ident, - void *cb_priv) + void *cb_priv, + struct netlink_ext_ack *extack) { struct tcf_block_cb *block_cb; + int err; - /* At this point, playback of previous block cb calls is not supported, - * so forbid to register to block which already has some offloaded - * filters present. - */ - if (tcf_block_offload_in_use(block)) - return ERR_PTR(-EOPNOTSUPP); + /* Replay any already present rules */ + err = tcf_block_playback_offloads(block, cb, cb_priv, true, + tcf_block_offload_in_use(block), + extack); + if (err) + return ERR_PTR(err); block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL); if (!block_cb) @@ -772,17 +846,22 @@ EXPORT_SYMBOL(__tcf_block_cb_register); int tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, - void *cb_priv) + void *cb_priv, struct netlink_ext_ack *extack) { struct tcf_block_cb *block_cb; - block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv); - return IS_ERR(block_cb) ? PTR_ERR(block_cb) : 0; + block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv, + extack); + return PTR_ERR_OR_ZERO(block_cb); } EXPORT_SYMBOL(tcf_block_cb_register); -void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb) +void __tcf_block_cb_unregister(struct tcf_block *block, + struct tcf_block_cb *block_cb) { + tcf_block_playback_offloads(block, block_cb->cb, block_cb->cb_priv, + false, tcf_block_offload_in_use(block), + NULL); list_del(&block_cb->list); kfree(block_cb); } @@ -796,7 +875,7 @@ void tcf_block_cb_unregister(struct tcf_block *block, block_cb = tcf_block_cb_lookup(block, cb, cb_ident); if (!block_cb) return; - __tcf_block_cb_unregister(block_cb); + __tcf_block_cb_unregister(block, block_cb); } EXPORT_SYMBOL(tcf_block_cb_unregister); @@ -893,7 +972,7 @@ static void tcf_chain_tp_insert(struct tcf_chain *chain, struct tcf_proto *tp) { if (*chain_info->pprev == chain->filter_chain) - tcf_chain_head_change(chain, tp); + tcf_chain0_head_change(chain, tp); 
RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain_info)); rcu_assign_pointer(*chain_info->pprev, tp); tcf_chain_hold(chain); @@ -906,7 +985,7 @@ static void tcf_chain_tp_remove(struct tcf_chain *chain, struct tcf_proto *next = rtnl_dereference(chain_info->next); if (tp == chain->filter_chain) - tcf_chain_head_change(chain, next); + tcf_chain0_head_change(chain, next); RCU_INIT_POINTER(*chain_info->pprev, next); tcf_chain_put(chain); } @@ -1053,7 +1132,7 @@ static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, for (tp = rtnl_dereference(chain->filter_chain); tp; tp = rtnl_dereference(tp->next)) tfilter_notify(net, oskb, n, tp, block, - q, parent, 0, event, false); + q, parent, NULL, event, false); } static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, @@ -1182,6 +1261,12 @@ replay: goto errout; } + if (chain->tmplt_ops && chain->tmplt_ops != tp->ops) { + NL_SET_ERR_MSG(extack, "Chain template is set to a different filter kind"); + err = -EINVAL; + goto errout; + } + err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, n->nlmsg_flags & NLM_F_CREATE ? 
TCA_ACT_NOREPLACE : TCA_ACT_REPLACE, extack); @@ -1444,7 +1529,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); if (cb->args[1] == 0) { - if (tcf_fill_node(net, skb, tp, block, q, parent, 0, + if (tcf_fill_node(net, skb, tp, block, q, parent, NULL, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) @@ -1463,7 +1548,9 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, arg.w.stop = 0; arg.w.skip = cb->args[1] - 1; arg.w.count = 0; + arg.w.cookie = cb->args[2]; tp->ops->walk(tp, &arg.w); + cb->args[2] = arg.w.cookie; cb->args[1] = arg.w.count + 1; if (arg.w.stop) return false; @@ -1561,14 +1648,324 @@ out: return skb->len; } +static int tc_chain_fill_node(struct tcf_chain *chain, struct net *net, + struct sk_buff *skb, struct tcf_block *block, + u32 portid, u32 seq, u16 flags, int event) +{ + unsigned char *b = skb_tail_pointer(skb); + const struct tcf_proto_ops *ops; + struct nlmsghdr *nlh; + struct tcmsg *tcm; + void *priv; + + ops = chain->tmplt_ops; + priv = chain->tmplt_priv; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); + if (!nlh) + goto out_nlmsg_trim; + tcm = nlmsg_data(nlh); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm__pad1 = 0; + tcm->tcm__pad2 = 0; + tcm->tcm_handle = 0; + if (block->q) { + tcm->tcm_ifindex = qdisc_dev(block->q)->ifindex; + tcm->tcm_parent = block->q->handle; + } else { + tcm->tcm_ifindex = TCM_IFINDEX_MAGIC_BLOCK; + tcm->tcm_block_index = block->index; + } + + if (nla_put_u32(skb, TCA_CHAIN, chain->index)) + goto nla_put_failure; + + if (ops) { + if (nla_put_string(skb, TCA_KIND, ops->kind)) + goto nla_put_failure; + if (ops->tmplt_dump(skb, net, priv) < 0) + goto nla_put_failure; + } + + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; + +out_nlmsg_trim: +nla_put_failure: + nlmsg_trim(skb, b); + return -EMSGSIZE; +} + +static int 
tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, + u32 seq, u16 flags, int event, bool unicast) +{ + u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; + struct tcf_block *block = chain->block; + struct net *net = block->net; + struct sk_buff *skb; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (tc_chain_fill_node(chain, net, skb, block, portid, + seq, flags, event) <= 0) { + kfree_skb(skb); + return -EINVAL; + } + + if (unicast) + return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT); + + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO); +} + +static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net, + struct nlattr **tca, + struct netlink_ext_ack *extack) +{ + const struct tcf_proto_ops *ops; + void *tmplt_priv; + + /* If kind is not set, user did not specify template. */ + if (!tca[TCA_KIND]) + return 0; + + ops = tcf_proto_lookup_ops(nla_data(tca[TCA_KIND]), extack); + if (IS_ERR(ops)) + return PTR_ERR(ops); + if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump) { + NL_SET_ERR_MSG(extack, "Chain templates are not supported with specified classifier"); + /* ops was returned with a module reference held (the + * tmplt_create() failure path below drops it); release it + * here as well so the classifier module is not pinned. + */ + module_put(ops->owner); + return -EOPNOTSUPP; + } + + tmplt_priv = ops->tmplt_create(net, chain, tca, extack); + if (IS_ERR(tmplt_priv)) { + module_put(ops->owner); + return PTR_ERR(tmplt_priv); + } + chain->tmplt_ops = ops; + chain->tmplt_priv = tmplt_priv; + return 0; +} + +static void tc_chain_tmplt_del(struct tcf_chain *chain) +{ + const struct tcf_proto_ops *ops = chain->tmplt_ops; + + /* If template ops are not set, no work to do for us. 
*/ + if (!ops) + return; + + ops->tmplt_destroy(chain->tmplt_priv); + module_put(ops->owner); +} + +/* Add/delete/get a chain */ + +static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tca[TCA_MAX + 1]; + struct tcmsg *t; + u32 parent; + u32 chain_index; + struct Qdisc *q = NULL; + struct tcf_chain *chain = NULL; + struct tcf_block *block; + unsigned long cl; + int err; + + if (n->nlmsg_type != RTM_GETCHAIN && + !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + +replay: + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack); + if (err < 0) + return err; + + t = nlmsg_data(n); + parent = t->tcm_parent; + cl = 0; + + block = tcf_block_find(net, &q, &parent, &cl, + t->tcm_ifindex, t->tcm_block_index, extack); + if (IS_ERR(block)) + return PTR_ERR(block); + + chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; + if (chain_index > TC_ACT_EXT_VAL_MASK) { + NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); + return -EINVAL; + } + chain = tcf_chain_lookup(block, chain_index); + if (n->nlmsg_type == RTM_NEWCHAIN) { + if (chain) { + NL_SET_ERR_MSG(extack, "Filter chain already exists"); + return -EEXIST; + } + if (!(n->nlmsg_flags & NLM_F_CREATE)) { + NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain"); + return -ENOENT; + } + chain = tcf_chain_create(block, chain_index); + if (!chain) { + NL_SET_ERR_MSG(extack, "Failed to create filter chain"); + return -ENOMEM; + } + } else { + if (!chain) { + NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); + return -EINVAL; + } + tcf_chain_hold(chain); + } + + switch (n->nlmsg_type) { + case RTM_NEWCHAIN: + err = tc_chain_tmplt_add(chain, net, tca, extack); + if (err) + goto errout; + /* In case the chain was successfully added, take a reference + * to the chain. 
This ensures that an empty chain + * does not disappear at the end of this function. + */ + tcf_chain_hold(chain); + chain->explicitly_created = true; + tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, + RTM_NEWCHAIN, false); + break; + case RTM_DELCHAIN: + /* Flush the chain first as the user requested chain removal. */ + tcf_chain_flush(chain); + /* In case the chain was successfully deleted, put a reference + * to the chain previously taken during addition. + */ + tcf_chain_put_explicitly_created(chain); + chain->explicitly_created = false; + break; + case RTM_GETCHAIN: + /* The fourth argument of tc_chain_notify() is the netlink + * flags word, so pass the request's flags rather than its + * sequence number a second time. + */ + err = tc_chain_notify(chain, skb, n->nlmsg_seq, + n->nlmsg_flags, n->nlmsg_type, true); + if (err < 0) + NL_SET_ERR_MSG(extack, "Failed to send chain notify message"); + break; + default: + err = -EOPNOTSUPP; + NL_SET_ERR_MSG(extack, "Unsupported message type"); + goto errout; + } + +errout: + tcf_chain_put(chain); + if (err == -EAGAIN) + /* Replay the request. */ + goto replay; + return err; +} + +/* called with RTNL */ +static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tca[TCA_MAX + 1]; + struct Qdisc *q = NULL; + struct tcf_block *block; + struct tcf_chain *chain; + struct tcmsg *tcm = nlmsg_data(cb->nlh); + long index_start; + long index; + u32 parent; + int err; + + if (nlmsg_len(cb->nlh) < sizeof(*tcm)) + return skb->len; + + err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL, NULL); + if (err) + return err; + + if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) { + block = tcf_block_lookup(net, tcm->tcm_block_index); + if (!block) + goto out; + /* If we work with block index, q is NULL and parent value + * will never be used in the following code. The check + * in tcf_fill_node prevents it. However, compiler does not + * see that far, so set parent to zero to silence the warning + * about parent being uninitialized. 
+ */ + parent = 0; + } else { + const struct Qdisc_class_ops *cops; + struct net_device *dev; + unsigned long cl = 0; + + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return skb->len; + + parent = tcm->tcm_parent; + if (!parent) { + q = dev->qdisc; + parent = q->handle; + } else { + q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); + } + if (!q) + goto out; + cops = q->ops->cl_ops; + if (!cops) + goto out; + if (!cops->tcf_block) + goto out; + if (TC_H_MIN(tcm->tcm_parent)) { + cl = cops->find(q, tcm->tcm_parent); + if (cl == 0) + goto out; + } + block = cops->tcf_block(q, cl, NULL); + if (!block) + goto out; + if (tcf_block_shared(block)) + q = NULL; + } + + index_start = cb->args[0]; + index = 0; + + list_for_each_entry(chain, &block->chain_list, list) { + if ((tca[TCA_CHAIN] && + nla_get_u32(tca[TCA_CHAIN]) != chain->index)) + continue; + if (index < index_start) { + index++; + continue; + } + err = tc_chain_fill_node(chain, net, skb, block, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + RTM_NEWCHAIN); + if (err <= 0) + break; + index++; + } + + cb->args[0] = index; + +out: + /* If we did no progress, the error (EMSGSIZE) is real */ + if (skb->len == 0 && err) + return err; + return skb->len; +} + void tcf_exts_destroy(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - LIST_HEAD(actions); - - ASSERT_RTNL(); - tcf_exts_to_list(exts, &actions); - tcf_action_destroy(&actions, TCA_ACT_UNBIND); + tcf_action_destroy(exts->actions, TCA_ACT_UNBIND); kfree(exts->actions); exts->nr_actions = 0; #endif @@ -1587,7 +1984,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, if (exts->police && tb[exts->police]) { act = tcf_action_init_1(net, tp, tb[exts->police], rate_tlv, "police", ovr, - TCA_ACT_BIND, extack); + TCA_ACT_BIND, true, extack); if (IS_ERR(act)) return PTR_ERR(act); @@ -1595,17 +1992,15 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, exts->actions[0] = 
act; exts->nr_actions = 1; } else if (exts->action && tb[exts->action]) { - LIST_HEAD(actions); - int err, i = 0; + int err; err = tcf_action_init(net, tp, tb[exts->action], rate_tlv, NULL, ovr, TCA_ACT_BIND, - &actions, &attr_size, extack); - if (err) + exts->actions, &attr_size, true, + extack); + if (err < 0) return err; - list_for_each_entry(act, &actions, list) - exts->actions[i++] = act; - exts->nr_actions = i; + exts->nr_actions = err; } exts->net = net; } @@ -1654,14 +2049,11 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts) * tc data even if iproute2 was newer - jhs */ if (exts->type != TCA_OLD_COMPAT) { - LIST_HEAD(actions); - nest = nla_nest_start(skb, exts->action); if (nest == NULL) goto nla_put_failure; - tcf_exts_to_list(exts, &actions); - if (tcf_action_dump(skb, &actions, 0, 0) < 0) + if (tcf_action_dump(skb, exts->actions, 0, 0) < 0) goto nla_put_failure; nla_nest_end(skb, nest); } else if (exts->police) { @@ -1786,6 +2178,10 @@ static int __init tc_filter_init(void) rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter, tc_dump_tfilter, 0); + rtnl_register(PF_UNSPEC, RTM_NEWCHAIN, tc_ctl_chain, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELCHAIN, tc_ctl_chain, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_GETCHAIN, tc_ctl_chain, + tc_dump_chain, 0); return 0; diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 95367f37098d..6a5dce8baf19 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -324,4 +324,3 @@ static void __exit exit_basic(void) module_init(init_basic) module_exit(exit_basic) MODULE_LICENSE("GPL"); - diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 1aa7f6511065..66e0ac9811f9 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -43,6 +43,7 @@ struct cls_bpf_prog { struct tcf_result res; bool exts_integrated; u32 gen_flags; + unsigned int in_hw_count; struct tcf_exts exts; u32 handle; u16 bpf_num_ops; @@ -174,6 
+175,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, cls_bpf_offload_cmd(tp, oldprog, prog, extack); return err; } else if (err > 0) { + prog->in_hw_count = err; tcf_block_offload_inc(block, &prog->gen_flags); } } @@ -652,6 +654,42 @@ skip: } } +static int cls_bpf_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, + void *cb_priv, struct netlink_ext_ack *extack) +{ + struct cls_bpf_head *head = rtnl_dereference(tp->root); + struct tcf_block *block = tp->chain->block; + struct tc_cls_bpf_offload cls_bpf = {}; + struct cls_bpf_prog *prog; + int err; + + list_for_each_entry(prog, &head->plist, link) { + if (tc_skip_hw(prog->gen_flags)) + continue; + + tc_cls_common_offload_init(&cls_bpf.common, tp, prog->gen_flags, + extack); + cls_bpf.command = TC_CLSBPF_OFFLOAD; + cls_bpf.exts = &prog->exts; + cls_bpf.prog = add ? prog->filter : NULL; + cls_bpf.oldprog = add ? NULL : prog->filter; + cls_bpf.name = prog->bpf_name; + cls_bpf.exts_integrated = prog->exts_integrated; + + err = cb(TC_SETUP_CLSBPF, &cls_bpf, cb_priv); + if (err) { + if (add && tc_skip_sw(prog->gen_flags)) + return err; + continue; + } + + tc_cls_offload_cnt_update(block, &prog->in_hw_count, + &prog->gen_flags, add); + } + + return 0; +} + static struct tcf_proto_ops cls_bpf_ops __read_mostly = { .kind = "bpf", .owner = THIS_MODULE, @@ -662,6 +700,7 @@ static struct tcf_proto_ops cls_bpf_ops __read_mostly = { .change = cls_bpf_change, .delete = cls_bpf_delete, .walk = cls_bpf_walk, + .reoffload = cls_bpf_reoffload, .dump = cls_bpf_dump, .bind_class = cls_bpf_bind_class, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 9e8b26a80fb3..e8bd08ba998a 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -35,6 +35,7 @@ struct fl_flow_key { struct flow_dissector_key_basic basic; struct flow_dissector_key_eth_addrs eth; struct flow_dissector_key_vlan vlan; + struct flow_dissector_key_vlan cvlan; union { struct 
flow_dissector_key_ipv4_addrs ipv4; struct flow_dissector_key_ipv6_addrs ipv6; @@ -51,6 +52,7 @@ struct fl_flow_key { struct flow_dissector_key_mpls mpls; struct flow_dissector_key_tcp tcp; struct flow_dissector_key_ip ip; + struct flow_dissector_key_ip enc_ip; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -70,6 +72,13 @@ struct fl_flow_mask { struct list_head list; }; +struct fl_flow_tmplt { + struct fl_flow_key dummy_key; + struct fl_flow_key mask; + struct flow_dissector dissector; + struct tcf_chain *chain; +}; + struct cls_fl_head { struct rhashtable ht; struct list_head masks; @@ -87,6 +96,7 @@ struct cls_fl_filter { struct list_head list; u32 handle; u32 flags; + unsigned int in_hw_count; struct rcu_work rwork; struct net_device *hw_dev; }; @@ -144,6 +154,23 @@ static void fl_set_masked_key(struct fl_flow_key *mkey, struct fl_flow_key *key, *lmkey++ = *lkey++ & *lmask++; } +static bool fl_mask_fits_tmplt(struct fl_flow_tmplt *tmplt, + struct fl_flow_mask *mask) +{ + const long *lmask = fl_key_get_start(&mask->key, mask); + const long *ltmplt; + int i; + + if (!tmplt) + return true; + ltmplt = fl_key_get_start(&tmplt->mask, mask); + for (i = 0; i < fl_mask_range(mask); i += sizeof(long)) { + if (~*ltmplt++ & *lmask++) + return false; + } + return true; +} + static void fl_clear_masked_range(struct fl_flow_key *key, struct fl_flow_mask *mask) { @@ -289,6 +316,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, fl_hw_destroy_filter(tp, f, NULL); return err; } else if (err > 0) { + f->in_hw_count = err; tcf_block_offload_inc(block, &f->flags); } @@ -447,6 +475,13 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_IP_TOS_MASK] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_IP_TTL] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_IP_TTL_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_CVLAN_ID] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_CVLAN_PRIO] = { .type = NLA_U8 }, + 
[TCA_FLOWER_KEY_CVLAN_ETH_TYPE] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_ENC_IP_TOS] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_IP_TOS_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_IP_TTL] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -498,22 +533,26 @@ static int fl_set_key_mpls(struct nlattr **tb, } static void fl_set_key_vlan(struct nlattr **tb, + __be16 ethertype, + int vlan_id_key, int vlan_prio_key, struct flow_dissector_key_vlan *key_val, struct flow_dissector_key_vlan *key_mask) { #define VLAN_PRIORITY_MASK 0x7 - if (tb[TCA_FLOWER_KEY_VLAN_ID]) { + if (tb[vlan_id_key]) { key_val->vlan_id = - nla_get_u16(tb[TCA_FLOWER_KEY_VLAN_ID]) & VLAN_VID_MASK; + nla_get_u16(tb[vlan_id_key]) & VLAN_VID_MASK; key_mask->vlan_id = VLAN_VID_MASK; } - if (tb[TCA_FLOWER_KEY_VLAN_PRIO]) { + if (tb[vlan_prio_key]) { key_val->vlan_priority = - nla_get_u8(tb[TCA_FLOWER_KEY_VLAN_PRIO]) & + nla_get_u8(tb[vlan_prio_key]) & VLAN_PRIORITY_MASK; key_mask->vlan_priority = VLAN_PRIORITY_MASK; } + key_val->vlan_tpid = ethertype; + key_mask->vlan_tpid = cpu_to_be16(~0); } static void fl_set_key_flag(u32 flower_key, u32 flower_mask, @@ -551,17 +590,17 @@ static int fl_set_key_flags(struct nlattr **tb, return 0; } -static void fl_set_key_ip(struct nlattr **tb, +static void fl_set_key_ip(struct nlattr **tb, bool encap, struct flow_dissector_key_ip *key, struct flow_dissector_key_ip *mask) { - fl_set_key_val(tb, &key->tos, TCA_FLOWER_KEY_IP_TOS, - &mask->tos, TCA_FLOWER_KEY_IP_TOS_MASK, - sizeof(key->tos)); + int tos_key = encap ? TCA_FLOWER_KEY_ENC_IP_TOS : TCA_FLOWER_KEY_IP_TOS; + int ttl_key = encap ? TCA_FLOWER_KEY_ENC_IP_TTL : TCA_FLOWER_KEY_IP_TTL; + int tos_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TOS_MASK : TCA_FLOWER_KEY_IP_TOS_MASK; + int ttl_mask = encap ? 
TCA_FLOWER_KEY_ENC_IP_TTL_MASK : TCA_FLOWER_KEY_IP_TTL_MASK; - fl_set_key_val(tb, &key->ttl, TCA_FLOWER_KEY_IP_TTL, - &mask->ttl, TCA_FLOWER_KEY_IP_TTL_MASK, - sizeof(key->ttl)); + fl_set_key_val(tb, &key->tos, tos_key, &mask->tos, tos_mask, sizeof(key->tos)); + fl_set_key_val(tb, &key->ttl, ttl_key, &mask->ttl, ttl_mask, sizeof(key->ttl)); } static int fl_set_key(struct net *net, struct nlattr **tb, @@ -590,12 +629,28 @@ static int fl_set_key(struct net *net, struct nlattr **tb, if (tb[TCA_FLOWER_KEY_ETH_TYPE]) { ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_ETH_TYPE]); - if (ethertype == htons(ETH_P_8021Q)) { - fl_set_key_vlan(tb, &key->vlan, &mask->vlan); - fl_set_key_val(tb, &key->basic.n_proto, - TCA_FLOWER_KEY_VLAN_ETH_TYPE, - &mask->basic.n_proto, TCA_FLOWER_UNSPEC, - sizeof(key->basic.n_proto)); + if (eth_type_vlan(ethertype)) { + fl_set_key_vlan(tb, ethertype, TCA_FLOWER_KEY_VLAN_ID, + TCA_FLOWER_KEY_VLAN_PRIO, &key->vlan, + &mask->vlan); + + if (tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]) { + ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]); + if (eth_type_vlan(ethertype)) { + fl_set_key_vlan(tb, ethertype, + TCA_FLOWER_KEY_CVLAN_ID, + TCA_FLOWER_KEY_CVLAN_PRIO, + &key->cvlan, &mask->cvlan); + fl_set_key_val(tb, &key->basic.n_proto, + TCA_FLOWER_KEY_CVLAN_ETH_TYPE, + &mask->basic.n_proto, + TCA_FLOWER_UNSPEC, + sizeof(key->basic.n_proto)); + } else { + key->basic.n_proto = ethertype; + mask->basic.n_proto = cpu_to_be16(~0); + } + } } else { key->basic.n_proto = ethertype; mask->basic.n_proto = cpu_to_be16(~0); @@ -607,7 +662,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.ip_proto)); - fl_set_key_ip(tb, &key->ip, &mask->ip); + fl_set_key_ip(tb, false, &key->ip, &mask->ip); } if (tb[TCA_FLOWER_KEY_IPV4_SRC] || tb[TCA_FLOWER_KEY_IPV4_DST]) { @@ -742,6 +797,8 @@ static int fl_set_key(struct net *net, struct nlattr **tb, 
&mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, sizeof(key->enc_tp.dst)); + fl_set_key_ip(tb, true, &key->enc_ip, &mask->enc_ip); + if (tb[TCA_FLOWER_KEY_FLAGS]) ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags); @@ -793,47 +850,52 @@ static int fl_init_mask_hashtable(struct fl_flow_mask *mask) FL_KEY_SET(keys, cnt, id, member); \ } while(0); -static void fl_init_dissector(struct fl_flow_mask *mask) +static void fl_init_dissector(struct flow_dissector *dissector, + struct fl_flow_key *mask) { struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX]; size_t cnt = 0; FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control); FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ETH_ADDRS, eth); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_IP, ip); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_TCP, tcp); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ICMP, icmp); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ARP, arp); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_MPLS, mpls); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_VLAN, vlan); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + 
FLOW_DISSECTOR_KEY_CVLAN, cvlan); + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, enc_ipv4); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, enc_ipv6); - if (FL_KEY_IS_MASKED(&mask->key, enc_ipv4) || - FL_KEY_IS_MASKED(&mask->key, enc_ipv6)) + if (FL_KEY_IS_MASKED(mask, enc_ipv4) || + FL_KEY_IS_MASKED(mask, enc_ipv6)) FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_ENC_CONTROL, enc_control); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp); + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_ENC_IP, enc_ip); - skb_flow_dissector_init(&mask->dissector, keys, cnt); + skb_flow_dissector_init(dissector, keys, cnt); } static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head, @@ -852,7 +914,7 @@ static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head, if (err) goto errout_free; - fl_init_dissector(newmask); + fl_init_dissector(&newmask->dissector, &newmask->key); INIT_LIST_HEAD_RCU(&newmask->filters); @@ -901,6 +963,7 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp, struct cls_fl_filter *f, struct fl_flow_mask *mask, unsigned long base, struct nlattr **tb, struct nlattr *est, bool ovr, + struct fl_flow_tmplt *tmplt, struct netlink_ext_ack *extack) { int err; @@ -921,6 +984,11 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp, fl_mask_update_range(mask); fl_set_masked_key(&f->mkey, &f->key, mask); + if (!fl_mask_fits_tmplt(tmplt, mask)) { + NL_SET_ERR_MSG_MOD(extack, "Mask does not fit the template"); + return -EINVAL; + } + return 0; } @@ -986,7 +1054,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, } err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr, - extack); + 
tp->chain->tmplt_priv, extack); if (err) goto errout_idr; @@ -1071,20 +1139,144 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg) { struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f; + + arg->count = arg->skip; + + while ((f = idr_get_next_ul(&head->handle_idr, + &arg->cookie)) != NULL) { + if (arg->fn(tp, f, arg) < 0) { + arg->stop = 1; + break; + } + arg->cookie = f->handle + 1; + arg->count++; + } +} + +static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, + void *cb_priv, struct netlink_ext_ack *extack) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct tc_cls_flower_offload cls_flower = {}; + struct tcf_block *block = tp->chain->block; struct fl_flow_mask *mask; + struct cls_fl_filter *f; + int err; - list_for_each_entry_rcu(mask, &head->masks, list) { - list_for_each_entry_rcu(f, &mask->filters, list) { - if (arg->count < arg->skip) - goto skip; - if (arg->fn(tp, f, arg) < 0) { - arg->stop = 1; - break; + list_for_each_entry(mask, &head->masks, list) { + list_for_each_entry(f, &mask->filters, list) { + if (tc_skip_hw(f->flags)) + continue; + + tc_cls_common_offload_init(&cls_flower.common, tp, + f->flags, extack); + cls_flower.command = add ? 
+ TC_CLSFLOWER_REPLACE : TC_CLSFLOWER_DESTROY; + cls_flower.cookie = (unsigned long)f; + cls_flower.dissector = &mask->dissector; + cls_flower.mask = &f->mkey; + cls_flower.key = &f->key; + cls_flower.exts = &f->exts; + cls_flower.classid = f->res.classid; + + err = cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv); + if (err) { + if (add && tc_skip_sw(f->flags)) + return err; + continue; } -skip: - arg->count++; + + tc_cls_offload_cnt_update(block, &f->in_hw_count, + &f->flags, add); } } + + return 0; +} + +static void fl_hw_create_tmplt(struct tcf_chain *chain, + struct fl_flow_tmplt *tmplt) +{ + struct tc_cls_flower_offload cls_flower = {}; + struct tcf_block *block = chain->block; + struct tcf_exts dummy_exts = { 0, }; + + cls_flower.common.chain_index = chain->index; + cls_flower.command = TC_CLSFLOWER_TMPLT_CREATE; + cls_flower.cookie = (unsigned long) tmplt; + cls_flower.dissector = &tmplt->dissector; + cls_flower.mask = &tmplt->mask; + cls_flower.key = &tmplt->dummy_key; + cls_flower.exts = &dummy_exts; + + /* We don't care if driver (any of them) fails to handle this + * call. It serves just as a hint for it. 
+ */ + tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER, + &cls_flower, false); +} + +static void fl_hw_destroy_tmplt(struct tcf_chain *chain, + struct fl_flow_tmplt *tmplt) +{ + struct tc_cls_flower_offload cls_flower = {}; + struct tcf_block *block = chain->block; + + cls_flower.common.chain_index = chain->index; + cls_flower.command = TC_CLSFLOWER_TMPLT_DESTROY; + cls_flower.cookie = (unsigned long) tmplt; + + tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER, + &cls_flower, false); +} + +static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain, + struct nlattr **tca, + struct netlink_ext_ack *extack) +{ + struct fl_flow_tmplt *tmplt; + struct nlattr **tb; + int err; + + if (!tca[TCA_OPTIONS]) + return ERR_PTR(-EINVAL); + + tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL); + if (!tb) + return ERR_PTR(-ENOBUFS); + err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS], + fl_policy, NULL); + if (err) + goto errout_tb; + + tmplt = kzalloc(sizeof(*tmplt), GFP_KERNEL); + if (!tmplt) { + /* err is still 0 after the successful parse above; set it so + * the error path returns a real error pointer instead of + * ERR_PTR(0), which is NULL and passes an IS_ERR() check. + */ + err = -ENOMEM; + goto errout_tb; + } + tmplt->chain = chain; + err = fl_set_key(net, tb, &tmplt->dummy_key, &tmplt->mask, extack); + if (err) + goto errout_tmplt; + kfree(tb); + + fl_init_dissector(&tmplt->dissector, &tmplt->mask); + + fl_hw_create_tmplt(chain, tmplt); + + return tmplt; + +errout_tmplt: + kfree(tmplt); +errout_tb: + kfree(tb); + return ERR_PTR(err); +} + +static void fl_tmplt_destroy(void *tmplt_priv) +{ + struct fl_flow_tmplt *tmplt = tmplt_priv; + + fl_hw_destroy_tmplt(tmplt->chain, tmplt); + kfree(tmplt); } static int fl_dump_key_val(struct sk_buff *skb, @@ -1141,20 +1333,24 @@ static int fl_dump_key_mpls(struct sk_buff *skb, return 0; } -static int fl_dump_key_ip(struct sk_buff *skb, +static int fl_dump_key_ip(struct sk_buff *skb, bool encap, struct flow_dissector_key_ip *key, struct flow_dissector_key_ip *mask) { - if (fl_dump_key_val(skb, &key->tos, TCA_FLOWER_KEY_IP_TOS, &mask->tos, - TCA_FLOWER_KEY_IP_TOS_MASK, sizeof(key->tos)) || - 
fl_dump_key_val(skb, &key->ttl, TCA_FLOWER_KEY_IP_TTL, &mask->ttl, - TCA_FLOWER_KEY_IP_TTL_MASK, sizeof(key->ttl))) + int tos_key = encap ? TCA_FLOWER_KEY_ENC_IP_TOS : TCA_FLOWER_KEY_IP_TOS; + int ttl_key = encap ? TCA_FLOWER_KEY_ENC_IP_TTL : TCA_FLOWER_KEY_IP_TTL; + int tos_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TOS_MASK : TCA_FLOWER_KEY_IP_TOS_MASK; + int ttl_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TTL_MASK : TCA_FLOWER_KEY_IP_TTL_MASK; + + if (fl_dump_key_val(skb, &key->tos, tos_key, &mask->tos, tos_mask, sizeof(key->tos)) || + fl_dump_key_val(skb, &key->ttl, ttl_key, &mask->ttl, ttl_mask, sizeof(key->ttl))) return -1; return 0; } static int fl_dump_key_vlan(struct sk_buff *skb, + int vlan_id_key, int vlan_prio_key, struct flow_dissector_key_vlan *vlan_key, struct flow_dissector_key_vlan *vlan_mask) { @@ -1163,13 +1359,13 @@ static int fl_dump_key_vlan(struct sk_buff *skb, if (!memchr_inv(vlan_mask, 0, sizeof(*vlan_mask))) return 0; if (vlan_mask->vlan_id) { - err = nla_put_u16(skb, TCA_FLOWER_KEY_VLAN_ID, + err = nla_put_u16(skb, vlan_id_key, vlan_key->vlan_id); if (err) return err; } if (vlan_mask->vlan_priority) { - err = nla_put_u8(skb, TCA_FLOWER_KEY_VLAN_PRIO, + err = nla_put_u8(skb, vlan_prio_key, vlan_key->vlan_priority); if (err) return err; @@ -1216,29 +1412,9 @@ static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask) return nla_put(skb, TCA_FLOWER_KEY_FLAGS_MASK, 4, &_mask); } -static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, - struct sk_buff *skb, struct tcmsg *t) +static int fl_dump_key(struct sk_buff *skb, struct net *net, + struct fl_flow_key *key, struct fl_flow_key *mask) { - struct cls_fl_filter *f = fh; - struct nlattr *nest; - struct fl_flow_key *key, *mask; - - if (!f) - return skb->len; - - t->tcm_handle = f->handle; - - nest = nla_nest_start(skb, TCA_OPTIONS); - if (!nest) - goto nla_put_failure; - - if (f->res.classid && - nla_put_u32(skb, TCA_FLOWER_CLASSID, f->res.classid)) - goto nla_put_failure; 
- - key = &f->key; - mask = &f->mask->key; - if (mask->indev_ifindex) { struct net_device *dev; @@ -1247,9 +1423,6 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, goto nla_put_failure; } - if (!tc_skip_hw(f->flags)) - fl_hw_update_stats(tp, f); - if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, sizeof(key->eth.dst)) || @@ -1264,15 +1437,36 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, if (fl_dump_key_mpls(skb, &key->mpls, &mask->mpls)) goto nla_put_failure; - if (fl_dump_key_vlan(skb, &key->vlan, &mask->vlan)) + if (fl_dump_key_vlan(skb, TCA_FLOWER_KEY_VLAN_ID, + TCA_FLOWER_KEY_VLAN_PRIO, &key->vlan, &mask->vlan)) goto nla_put_failure; + if (fl_dump_key_vlan(skb, TCA_FLOWER_KEY_CVLAN_ID, + TCA_FLOWER_KEY_CVLAN_PRIO, + &key->cvlan, &mask->cvlan) || + (mask->cvlan.vlan_tpid && + nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE, + key->cvlan.vlan_tpid))) + goto nla_put_failure; + + if (mask->basic.n_proto) { + if (mask->cvlan.vlan_tpid) { + if (nla_put_be16(skb, TCA_FLOWER_KEY_CVLAN_ETH_TYPE, + key->basic.n_proto)) + goto nla_put_failure; + } else if (mask->vlan.vlan_tpid) { + if (nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE, + key->basic.n_proto)) + goto nla_put_failure; + } + } + if ((key->basic.n_proto == htons(ETH_P_IP) || key->basic.n_proto == htons(ETH_P_IPV6)) && (fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.ip_proto)) || - fl_dump_key_ip(skb, &key->ip, &mask->ip))) + fl_dump_key_ip(skb, false, &key->ip, &mask->ip))) goto nla_put_failure; if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && @@ -1397,12 +1591,48 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, TCA_FLOWER_KEY_ENC_UDP_DST_PORT, &mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, - sizeof(key->enc_tp.dst))) + sizeof(key->enc_tp.dst)) || + fl_dump_key_ip(skb, true, 
&key->enc_ip, &mask->enc_ip)) goto nla_put_failure; if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags)) goto nla_put_failure; + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct cls_fl_filter *f = fh; + struct nlattr *nest; + struct fl_flow_key *key, *mask; + + if (!f) + return skb->len; + + t->tcm_handle = f->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + if (f->res.classid && + nla_put_u32(skb, TCA_FLOWER_CLASSID, f->res.classid)) + goto nla_put_failure; + + key = &f->key; + mask = &f->mask->key; + + if (fl_dump_key(skb, net, key, mask)) + goto nla_put_failure; + + if (!tc_skip_hw(f->flags)) + fl_hw_update_stats(tp, f); + if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags)) goto nla_put_failure; @@ -1421,6 +1651,31 @@ nla_put_failure: return -1; } +static int fl_tmplt_dump(struct sk_buff *skb, struct net *net, void *tmplt_priv) +{ + struct fl_flow_tmplt *tmplt = tmplt_priv; + struct fl_flow_key *key, *mask; + struct nlattr *nest; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + key = &tmplt->dummy_key; + mask = &tmplt->mask; + + if (fl_dump_key(skb, net, key, mask)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + return skb->len; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static void fl_bind_class(void *fh, u32 classid, unsigned long cl) { struct cls_fl_filter *f = fh; @@ -1438,8 +1693,12 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = { .change = fl_change, .delete = fl_delete, .walk = fl_walk, + .reoffload = fl_reoffload, .dump = fl_dump, .bind_class = fl_bind_class, + .tmplt_create = fl_tmplt_create, + .tmplt_destroy = fl_tmplt_destroy, + .tmplt_dump = fl_tmplt_dump, .owner = THIS_MODULE, }; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 
47b207ef7762..af16f36ed578 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -21,6 +21,7 @@ struct cls_mall_head { struct tcf_result res; u32 handle; u32 flags; + unsigned int in_hw_count; struct rcu_work rwork; }; @@ -95,6 +96,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp, mall_destroy_hw_filter(tp, head, cookie, NULL); return err; } else if (err > 0) { + head->in_hw_count = err; tcf_block_offload_inc(block, &head->flags); } @@ -235,6 +237,35 @@ skip: arg->count++; } +static int mall_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, + void *cb_priv, struct netlink_ext_ack *extack) +{ + struct cls_mall_head *head = rtnl_dereference(tp->root); + struct tc_cls_matchall_offload cls_mall = {}; + struct tcf_block *block = tp->chain->block; + int err; + + if (tc_skip_hw(head->flags)) + return 0; + + tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, extack); + cls_mall.command = add ? + TC_CLSMATCHALL_REPLACE : TC_CLSMATCHALL_DESTROY; + cls_mall.exts = &head->exts; + cls_mall.cookie = (unsigned long)head; + + err = cb(TC_SETUP_CLSMATCHALL, &cls_mall, cb_priv); + if (err) { + if (add && tc_skip_sw(head->flags)) + return err; + return 0; + } + + tc_cls_offload_cnt_update(block, &head->in_hw_count, &head->flags, add); + + return 0; +} + static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t) { @@ -289,6 +320,7 @@ static struct tcf_proto_ops cls_mall_ops __read_mostly = { .change = mall_change, .delete = mall_delete, .walk = mall_walk, + .reoffload = mall_reoffload, .dump = mall_dump, .bind_class = mall_bind_class, .owner = THIS_MODULE, diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index fb861f90fde6..d5d2a6dc3921 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -62,6 +62,7 @@ struct tc_u_knode { struct tc_u32_pcnt __percpu *pf; #endif u32 flags; + unsigned int in_hw_count; #ifdef CONFIG_CLS_U32_MARK u32 val; u32 mask; @@ -571,6 +572,7 @@ static 
int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, u32_remove_hw_knode(tp, n, NULL); return err; } else if (err > 0) { + n->in_hw_count = err; tcf_block_offload_inc(block, &n->flags); } @@ -1199,6 +1201,114 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) } } +static int u32_reoffload_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, + bool add, tc_setup_cb_t *cb, void *cb_priv, + struct netlink_ext_ack *extack) +{ + struct tc_cls_u32_offload cls_u32 = {}; + int err; + + tc_cls_common_offload_init(&cls_u32.common, tp, ht->flags, extack); + cls_u32.command = add ? TC_CLSU32_NEW_HNODE : TC_CLSU32_DELETE_HNODE; + cls_u32.hnode.divisor = ht->divisor; + cls_u32.hnode.handle = ht->handle; + cls_u32.hnode.prio = ht->prio; + + err = cb(TC_SETUP_CLSU32, &cls_u32, cb_priv); + if (err && add && tc_skip_sw(ht->flags)) + return err; + + return 0; +} + +static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n, + bool add, tc_setup_cb_t *cb, void *cb_priv, + struct netlink_ext_ack *extack) +{ + struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); + struct tcf_block *block = tp->chain->block; + struct tc_cls_u32_offload cls_u32 = {}; + int err; + + tc_cls_common_offload_init(&cls_u32.common, tp, n->flags, extack); + cls_u32.command = add ? 
+ TC_CLSU32_REPLACE_KNODE : TC_CLSU32_DELETE_KNODE; + cls_u32.knode.handle = n->handle; + + if (add) { + cls_u32.knode.fshift = n->fshift; +#ifdef CONFIG_CLS_U32_MARK + cls_u32.knode.val = n->val; + cls_u32.knode.mask = n->mask; +#else + cls_u32.knode.val = 0; + cls_u32.knode.mask = 0; +#endif + cls_u32.knode.sel = &n->sel; + cls_u32.knode.exts = &n->exts; + if (n->ht_down) + cls_u32.knode.link_handle = ht->handle; + } + + err = cb(TC_SETUP_CLSU32, &cls_u32, cb_priv); + if (err) { + if (add && tc_skip_sw(n->flags)) + return err; + return 0; + } + + tc_cls_offload_cnt_update(block, &n->in_hw_count, &n->flags, add); + + return 0; +} + +static int u32_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, + void *cb_priv, struct netlink_ext_ack *extack) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode *ht; + struct tc_u_knode *n; + unsigned int h; + int err; + + for (ht = rtnl_dereference(tp_c->hlist); + ht; + ht = rtnl_dereference(ht->next)) { + if (ht->prio != tp->prio) + continue; + + /* When adding filters to a new dev, try to offload the + * hashtable first. When removing, do the filters before the + * hashtable. 
+ */ + if (add && !tc_skip_hw(ht->flags)) { + err = u32_reoffload_hnode(tp, ht, add, cb, cb_priv, + extack); + if (err) + return err; + } + + for (h = 0; h <= ht->divisor; h++) { + for (n = rtnl_dereference(ht->ht[h]); + n; + n = rtnl_dereference(n->next)) { + if (tc_skip_hw(n->flags)) + continue; + + err = u32_reoffload_knode(tp, n, add, cb, + cb_priv, extack); + if (err) + return err; + } + } + + if (!add && !tc_skip_hw(ht->flags)) + u32_reoffload_hnode(tp, ht, add, cb, cb_priv, extack); + } + + return 0; +} + static void u32_bind_class(void *fh, u32 classid, unsigned long cl) { struct tc_u_knode *n = fh; @@ -1336,6 +1446,7 @@ static struct tcf_proto_ops cls_u32_ops __read_mostly = { .change = u32_change, .delete = u32_delete, .walk = u32_walk, + .reoffload = u32_reoffload, .dump = u32_dump, .bind_class = u32_bind_class, .owner = THIS_MODULE, diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 54eca685420f..98541c6399db 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -596,12 +596,19 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) return HRTIMER_NORESTART; } -void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) +void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, + clockid_t clockid) { - hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED); wd->timer.function = qdisc_watchdog; wd->qdisc = qdisc; } +EXPORT_SYMBOL(qdisc_watchdog_init_clockid); + +void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) +{ + qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC); +} EXPORT_SYMBOL(qdisc_watchdog_init); void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c new file mode 100644 index 000000000000..539c9490c308 --- /dev/null +++ b/net/sched/sch_cake.c @@ -0,0 +1,3019 @@ +// SPDX-License-Identifier: GPL-2.0 OR 
BSD-3-Clause + +/* COMMON Applications Kept Enhanced (CAKE) discipline + * + * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com> + * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk> + * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com> + * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de> + * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk> + * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au> + * + * The CAKE Principles: + * (or, how to have your cake and eat it too) + * + * This is a combination of several shaping, AQM and FQ techniques into one + * easy-to-use package: + * + * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE + * equipment and bloated MACs. This operates in deficit mode (as in sch_fq), + * eliminating the need for any sort of burst parameter (eg. token bucket + * depth). Burst support is limited to that necessary to overcome scheduling + * latency. + * + * - A Diffserv-aware priority queue, giving more priority to certain classes, + * up to a specified fraction of bandwidth. Above that bandwidth threshold, + * the priority is reduced to avoid starving other tins. + * + * - Each priority tin has a separate Flow Queue system, to isolate traffic + * flows from each other. This prevents a burst on one flow from increasing + * the delay to another. Flows are distributed to queues using a + * set-associative hash function. + * + * - Each queue is actively managed by Cobalt, which is a combination of the + * Codel and Blue AQM algorithms. This serves flows fairly, and signals + * congestion early via ECN (if available) and/or packet drops, to keep + * latency low. The codel parameters are auto-tuned based on the bandwidth + * setting, as is necessary at low bandwidths. + * + * The configuration parameters are kept deliberately simple for ease of use. + * Everything has sane defaults. Complete generality of configuration is *not* + * a goal. 
+ * + * The priority queue operates according to a weighted DRR scheme, combined with + * a bandwidth tracker which reuses the shaper logic to detect which side of the + * bandwidth sharing threshold the tin is operating. This determines whether a + * priority-based weight (high) or a bandwidth-based weight (low) is used for + * that tin in the current pass. + * + * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly + * granted us permission to leverage. + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/string.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/jhash.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/reciprocal_div.h> +#include <net/netlink.h> +#include <linux/version.h> +#include <linux/if_vlan.h> +#include <net/pkt_sched.h> +#include <net/pkt_cls.h> +#include <net/tcp.h> +#include <net/flow_dissector.h> + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include <net/netfilter/nf_conntrack_core.h> +#endif + +#define CAKE_SET_WAYS (8) +#define CAKE_MAX_TINS (8) +#define CAKE_QUEUES (1024) +#define CAKE_FLOW_MASK 63 +#define CAKE_FLOW_NAT_FLAG 64 +#define CAKE_SPLIT_GSO_THRESHOLD (125000000) /* 1Gbps */ + +/* struct cobalt_params - contains codel and blue parameters + * @interval: codel initial drop rate + * @target: maximum persistent sojourn time & blue update rate + * @mtu_time: serialisation delay of maximum-size packet + * @p_inc: increment of blue drop probability (0.32 fxp) + * @p_dec: decrement of blue drop probability (0.32 fxp) + */ +struct cobalt_params { + u64 interval; + u64 target; + u64 mtu_time; + u32 p_inc; + u32 p_dec; +}; + +/* struct cobalt_vars - contains codel and blue variables + * @count: codel dropping frequency + * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1 + * @drop_next: time to drop next packet, or when we dropped last + * 
@blue_timer: Blue time to next drop + * @p_drop: BLUE drop probability (0.32 fxp) + * @dropping: set if in dropping state + * @ecn_marked: set if marked + */ +struct cobalt_vars { + u32 count; + u32 rec_inv_sqrt; + ktime_t drop_next; + ktime_t blue_timer; + u32 p_drop; + bool dropping; + bool ecn_marked; +}; + +enum { + CAKE_SET_NONE = 0, + CAKE_SET_SPARSE, + CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */ + CAKE_SET_BULK, + CAKE_SET_DECAYING +}; + +struct cake_flow { + /* this stuff is all needed per-flow at dequeue time */ + struct sk_buff *head; + struct sk_buff *tail; + struct list_head flowchain; + s32 deficit; + u32 dropped; + struct cobalt_vars cvars; + u16 srchost; /* index into cake_host table */ + u16 dsthost; + u8 set; +}; /* please try to keep this structure <= 64 bytes */ + +struct cake_host { + u32 srchost_tag; + u32 dsthost_tag; + u16 srchost_refcnt; + u16 dsthost_refcnt; +}; + +struct cake_heap_entry { + u16 t:3, b:10; +}; + +struct cake_tin_data { + struct cake_flow flows[CAKE_QUEUES]; + u32 backlogs[CAKE_QUEUES]; + u32 tags[CAKE_QUEUES]; /* for set association */ + u16 overflow_idx[CAKE_QUEUES]; + struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */ + u16 flow_quantum; + + struct cobalt_params cparams; + u32 drop_overlimit; + u16 bulk_flow_count; + u16 sparse_flow_count; + u16 decaying_flow_count; + u16 unresponsive_flow_count; + + u32 max_skblen; + + struct list_head new_flows; + struct list_head old_flows; + struct list_head decaying_flows; + + /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ + ktime_t time_next_packet; + u64 tin_rate_ns; + u64 tin_rate_bps; + u16 tin_rate_shft; + + u16 tin_quantum_prio; + u16 tin_quantum_band; + s32 tin_deficit; + u32 tin_backlog; + u32 tin_dropped; + u32 tin_ecn_mark; + + u32 packets; + u64 bytes; + + u32 ack_drops; + + /* moving averages */ + u64 avge_delay; + u64 peak_delay; + u64 base_delay; + + /* hash function stats */ + u32 way_directs; + u32 way_hits; + u32 
way_misses; + u32 way_collisions; +}; /* number of tins is small, so size of this struct doesn't matter much */ + +struct cake_sched_data { + struct tcf_proto __rcu *filter_list; /* optional external classifier */ + struct tcf_block *block; + struct cake_tin_data *tins; + + struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS]; + u16 overflow_timeout; + + u16 tin_cnt; + u8 tin_mode; + u8 flow_mode; + u8 ack_filter; + u8 atm_mode; + + /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ + u16 rate_shft; + ktime_t time_next_packet; + ktime_t failsafe_next_packet; + u64 rate_ns; + u64 rate_bps; + u16 rate_flags; + s16 rate_overhead; + u16 rate_mpu; + u64 interval; + u64 target; + + /* resource tracking */ + u32 buffer_used; + u32 buffer_max_used; + u32 buffer_limit; + u32 buffer_config_limit; + + /* indices for dequeue */ + u16 cur_tin; + u16 cur_flow; + + struct qdisc_watchdog watchdog; + const u8 *tin_index; + const u8 *tin_order; + + /* bandwidth capacity estimate */ + ktime_t last_packet_time; + ktime_t avg_window_begin; + u64 avg_packet_interval; + u64 avg_window_bytes; + u64 avg_peak_bandwidth; + ktime_t last_reconfig_time; + + /* packet length stats */ + u32 avg_netoff; + u16 max_netlen; + u16 max_adjlen; + u16 min_netlen; + u16 min_adjlen; +}; + +enum { + CAKE_FLAG_OVERHEAD = BIT(0), + CAKE_FLAG_AUTORATE_INGRESS = BIT(1), + CAKE_FLAG_INGRESS = BIT(2), + CAKE_FLAG_WASH = BIT(3), + CAKE_FLAG_SPLIT_GSO = BIT(4) +}; + +/* COBALT operates the Codel and BLUE algorithms in parallel, in order to + * obtain the best features of each. Codel is excellent on flows which + * respond to congestion signals in a TCP-like way. BLUE is more effective on + * unresponsive flows. 
+ */ + +struct cobalt_skb_cb { + ktime_t enqueue_time; + u32 adjusted_len; +}; + +static u64 us_to_ns(u64 us) +{ + return us * NSEC_PER_USEC; +} + +static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb)); + return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data; +} + +static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb) +{ + return get_cobalt_cb(skb)->enqueue_time; +} + +static void cobalt_set_enqueue_time(struct sk_buff *skb, + ktime_t now) +{ + get_cobalt_cb(skb)->enqueue_time = now; +} + +static u16 quantum_div[CAKE_QUEUES + 1] = {0}; + +/* Diffserv lookup tables */ + +static const u8 precedence[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, 7, 7, 7, +}; + +static const u8 diffserv8[] = { + 2, 5, 1, 2, 4, 2, 2, 2, + 0, 2, 1, 2, 1, 2, 1, 2, + 5, 2, 4, 2, 4, 2, 4, 2, + 3, 2, 3, 2, 3, 2, 3, 2, + 6, 2, 3, 2, 3, 2, 3, 2, + 6, 2, 2, 2, 6, 2, 6, 2, + 7, 2, 2, 2, 2, 2, 2, 2, + 7, 2, 2, 2, 2, 2, 2, 2, +}; + +static const u8 diffserv4[] = { + 0, 2, 0, 0, 2, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, + 2, 0, 2, 0, 2, 0, 2, 0, + 2, 0, 2, 0, 2, 0, 2, 0, + 3, 0, 2, 0, 2, 0, 2, 0, + 3, 0, 0, 0, 3, 0, 3, 0, + 3, 0, 0, 0, 0, 0, 0, 0, + 3, 0, 0, 0, 0, 0, 0, 0, +}; + +static const u8 diffserv3[] = { + 0, 0, 0, 0, 2, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 2, 0, 2, 0, + 2, 0, 0, 0, 0, 0, 0, 0, + 2, 0, 0, 0, 0, 0, 0, 0, +}; + +static const u8 besteffort[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/* tin priority order for stats dumping */ + +static const u8 normal_order[] = {0, 1, 2, 
3, 4, 5, 6, 7}; +static const u8 bulk_order[] = {1, 0, 2, 3}; + +#define REC_INV_SQRT_CACHE (16) +static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0}; + +/* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots + * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) + * + * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32 + */ + +static void cobalt_newton_step(struct cobalt_vars *vars) +{ + u32 invsqrt, invsqrt2; + u64 val; + + invsqrt = vars->rec_inv_sqrt; + invsqrt2 = ((u64)invsqrt * invsqrt) >> 32; + val = (3LL << 32) - ((u64)vars->count * invsqrt2); + + val >>= 2; /* avoid overflow in following multiply */ + val = (val * invsqrt) >> (32 - 2 + 1); + + vars->rec_inv_sqrt = val; +} + +static void cobalt_invsqrt(struct cobalt_vars *vars) +{ + if (vars->count < REC_INV_SQRT_CACHE) + vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count]; + else + cobalt_newton_step(vars); +} + +/* There is a big difference in timing between the accurate values placed in + * the cache and the approximations given by a single Newton step for small + * count values, particularly when stepping from count 1 to 2 or vice versa. + * Above 16, a single Newton step gives sufficient accuracy in either + * direction, given the precision stored. + * + * The magnitude of the error when stepping up to count 2 is such as to give + * the value that *should* have been produced at count 4. 
+ */ + +static void cobalt_cache_init(void) +{ + struct cobalt_vars v; + + memset(&v, 0, sizeof(v)); + v.rec_inv_sqrt = ~0U; + cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt; + + for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) { + cobalt_newton_step(&v); + cobalt_newton_step(&v); + cobalt_newton_step(&v); + cobalt_newton_step(&v); + + cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt; + } +} + +static void cobalt_vars_init(struct cobalt_vars *vars) +{ + memset(vars, 0, sizeof(*vars)); + + if (!cobalt_rec_inv_sqrt_cache[0]) { + cobalt_cache_init(); + cobalt_rec_inv_sqrt_cache[0] = ~0; + } +} + +/* CoDel control_law is t + interval/sqrt(count) + * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid + * both sqrt() and divide operation. + */ +static ktime_t cobalt_control(ktime_t t, + u64 interval, + u32 rec_inv_sqrt) +{ + return ktime_add_ns(t, reciprocal_scale(interval, + rec_inv_sqrt)); +} + +/* Call this when a packet had to be dropped due to queue overflow. Returns + * true if the BLUE state was quiescent before but active after this call. + */ +static bool cobalt_queue_full(struct cobalt_vars *vars, + struct cobalt_params *p, + ktime_t now) +{ + bool up = false; + + if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { + up = !vars->p_drop; + vars->p_drop += p->p_inc; + if (vars->p_drop < p->p_inc) + vars->p_drop = ~0; + vars->blue_timer = now; + } + vars->dropping = true; + vars->drop_next = now; + if (!vars->count) + vars->count = 1; + + return up; +} + +/* Call this when the queue was serviced but turned out to be empty. Returns + * true if the BLUE state was active before but quiescent after this call. 
+ */ +static bool cobalt_queue_empty(struct cobalt_vars *vars, + struct cobalt_params *p, + ktime_t now) +{ + bool down = false; + + if (vars->p_drop && + ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { + if (vars->p_drop < p->p_dec) + vars->p_drop = 0; + else + vars->p_drop -= p->p_dec; + vars->blue_timer = now; + down = !vars->p_drop; + } + vars->dropping = false; + + if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) { + vars->count--; + cobalt_invsqrt(vars); + vars->drop_next = cobalt_control(vars->drop_next, + p->interval, + vars->rec_inv_sqrt); + } + + return down; +} + +/* Call this with a freshly dequeued packet for possible congestion marking. + * Returns true as an instruction to drop the packet, false for delivery. + */ +static bool cobalt_should_drop(struct cobalt_vars *vars, + struct cobalt_params *p, + ktime_t now, + struct sk_buff *skb, + u32 bulk_flows) +{ + bool next_due, over_target, drop = false; + ktime_t schedule; + u64 sojourn; + +/* The 'schedule' variable records, in its sign, whether 'now' is before or + * after 'drop_next'. This allows 'drop_next' to be updated before the next + * scheduling decision is actually branched, without destroying that + * information. Similarly, the first 'schedule' value calculated is preserved + * in the boolean 'next_due'. + * + * As for 'drop_next', we take advantage of the fact that 'interval' is both + * the delay between first exceeding 'target' and the first signalling event, + * *and* the scaling factor for the signalling frequency. It's therefore very + * natural to use a single mechanism for both purposes, and eliminates a + * significant amount of reference Codel's spaghetti code. To help with this, + * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close + * as possible to 1.0 in fixed-point. 
+ */ + + sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); + schedule = ktime_sub(now, vars->drop_next); + over_target = sojourn > p->target && + sojourn > p->mtu_time * bulk_flows * 2 && + sojourn > p->mtu_time * 4; + next_due = vars->count && ktime_to_ns(schedule) >= 0; + + vars->ecn_marked = false; + + if (over_target) { + if (!vars->dropping) { + vars->dropping = true; + vars->drop_next = cobalt_control(now, + p->interval, + vars->rec_inv_sqrt); + } + if (!vars->count) + vars->count = 1; + } else if (vars->dropping) { + vars->dropping = false; + } + + if (next_due && vars->dropping) { + /* Use ECN mark if possible, otherwise drop */ + drop = !(vars->ecn_marked = INET_ECN_set_ce(skb)); + + vars->count++; + if (!vars->count) + vars->count--; + cobalt_invsqrt(vars); + vars->drop_next = cobalt_control(vars->drop_next, + p->interval, + vars->rec_inv_sqrt); + schedule = ktime_sub(now, vars->drop_next); + } else { + while (next_due) { + vars->count--; + cobalt_invsqrt(vars); + vars->drop_next = cobalt_control(vars->drop_next, + p->interval, + vars->rec_inv_sqrt); + schedule = ktime_sub(now, vars->drop_next); + next_due = vars->count && ktime_to_ns(schedule) >= 0; + } + } + + /* Simple BLUE implementation. Lack of ECN is deliberate. */ + if (vars->p_drop) + drop |= (prandom_u32() < vars->p_drop); + + /* Overload the drop_next field as an activity timeout */ + if (!vars->count) + vars->drop_next = ktime_add_ns(now, p->interval); + else if (ktime_to_ns(schedule) > 0 && !drop) + vars->drop_next = now; + + return drop; +} + +static void cake_update_flowkeys(struct flow_keys *keys, + const struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + struct nf_conntrack_tuple tuple = {}; + bool rev = !skb->_nfct; + + if (tc_skb_protocol(skb) != htons(ETH_P_IP)) + return; + + if (!nf_ct_get_tuple_skb(&tuple, skb)) + return; + + keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; + keys->addrs.v4addrs.dst = rev ? 
tuple.src.u3.ip : tuple.dst.u3.ip; + + if (keys->ports.ports) { + keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all; + keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all; + } +#endif +} + +/* Cake has several subtle multiple bit settings. In these cases you + * would be matching triple isolate mode as well. + */ + +static bool cake_dsrc(int flow_mode) +{ + return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC; +} + +static bool cake_ddst(int flow_mode) +{ + return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST; +} + +static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, + int flow_mode) +{ + u32 flow_hash = 0, srchost_hash, dsthost_hash; + u16 reduced_hash, srchost_idx, dsthost_idx; + struct flow_keys keys, host_keys; + + if (unlikely(flow_mode == CAKE_FLOW_NONE)) + return 0; + + skb_flow_dissect_flow_keys(skb, &keys, + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + + if (flow_mode & CAKE_FLOW_NAT_FLAG) + cake_update_flowkeys(&keys, skb); + + /* flow_hash_from_keys() sorts the addresses by value, so we have + * to preserve their order in a separate data structure to treat + * src and dst host addresses as independently selectable. 
+ */ + host_keys = keys; + host_keys.ports.ports = 0; + host_keys.basic.ip_proto = 0; + host_keys.keyid.keyid = 0; + host_keys.tags.flow_label = 0; + + switch (host_keys.control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + host_keys.addrs.v4addrs.src = 0; + dsthost_hash = flow_hash_from_keys(&host_keys); + host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; + host_keys.addrs.v4addrs.dst = 0; + srchost_hash = flow_hash_from_keys(&host_keys); + break; + + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + memset(&host_keys.addrs.v6addrs.src, 0, + sizeof(host_keys.addrs.v6addrs.src)); + dsthost_hash = flow_hash_from_keys(&host_keys); + host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; + memset(&host_keys.addrs.v6addrs.dst, 0, + sizeof(host_keys.addrs.v6addrs.dst)); + srchost_hash = flow_hash_from_keys(&host_keys); + break; + + default: + dsthost_hash = 0; + srchost_hash = 0; + } + + /* This *must* be after the above switch, since as a + * side-effect it sorts the src and dst addresses. + */ + if (flow_mode & CAKE_FLOW_FLOWS) + flow_hash = flow_hash_from_keys(&keys); + + if (!(flow_mode & CAKE_FLOW_FLOWS)) { + if (flow_mode & CAKE_FLOW_SRC_IP) + flow_hash ^= srchost_hash; + + if (flow_mode & CAKE_FLOW_DST_IP) + flow_hash ^= dsthost_hash; + } + + reduced_hash = flow_hash % CAKE_QUEUES; + + /* set-associative hashing */ + /* fast path if no hash collision (direct lookup succeeds) */ + if (likely(q->tags[reduced_hash] == flow_hash && + q->flows[reduced_hash].set)) { + q->way_directs++; + } else { + u32 inner_hash = reduced_hash % CAKE_SET_WAYS; + u32 outer_hash = reduced_hash - inner_hash; + bool allocate_src = false; + bool allocate_dst = false; + u32 i, k; + + /* check if any active queue in the set is reserved for + * this flow. 
+ */ + for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; + i++, k = (k + 1) % CAKE_SET_WAYS) { + if (q->tags[outer_hash + k] == flow_hash) { + if (i) + q->way_hits++; + + if (!q->flows[outer_hash + k].set) { + /* need to increment host refcnts */ + allocate_src = cake_dsrc(flow_mode); + allocate_dst = cake_ddst(flow_mode); + } + + goto found; + } + } + + /* no queue is reserved for this flow, look for an + * empty one. + */ + for (i = 0; i < CAKE_SET_WAYS; + i++, k = (k + 1) % CAKE_SET_WAYS) { + if (!q->flows[outer_hash + k].set) { + q->way_misses++; + allocate_src = cake_dsrc(flow_mode); + allocate_dst = cake_ddst(flow_mode); + goto found; + } + } + + /* With no empty queues, default to the original + * queue, accept the collision, update the host tags. + */ + q->way_collisions++; + q->hosts[q->flows[reduced_hash].srchost].srchost_refcnt--; + q->hosts[q->flows[reduced_hash].dsthost].dsthost_refcnt--; + allocate_src = cake_dsrc(flow_mode); + allocate_dst = cake_ddst(flow_mode); +found: + /* reserve queue for future packets in same flow */ + reduced_hash = outer_hash + k; + q->tags[reduced_hash] = flow_hash; + + if (allocate_src) { + srchost_idx = srchost_hash % CAKE_QUEUES; + inner_hash = srchost_idx % CAKE_SET_WAYS; + outer_hash = srchost_idx - inner_hash; + for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; + i++, k = (k + 1) % CAKE_SET_WAYS) { + if (q->hosts[outer_hash + k].srchost_tag == + srchost_hash) + goto found_src; + } + for (i = 0; i < CAKE_SET_WAYS; + i++, k = (k + 1) % CAKE_SET_WAYS) { + if (!q->hosts[outer_hash + k].srchost_refcnt) + break; + } + q->hosts[outer_hash + k].srchost_tag = srchost_hash; +found_src: + srchost_idx = outer_hash + k; + q->hosts[srchost_idx].srchost_refcnt++; + q->flows[reduced_hash].srchost = srchost_idx; + } + + if (allocate_dst) { + dsthost_idx = dsthost_hash % CAKE_QUEUES; + inner_hash = dsthost_idx % CAKE_SET_WAYS; + outer_hash = dsthost_idx - inner_hash; + for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; + i++, k = (k + 1) % 
CAKE_SET_WAYS) { + if (q->hosts[outer_hash + k].dsthost_tag == + dsthost_hash) + goto found_dst; + } + for (i = 0; i < CAKE_SET_WAYS; + i++, k = (k + 1) % CAKE_SET_WAYS) { + if (!q->hosts[outer_hash + k].dsthost_refcnt) + break; + } + q->hosts[outer_hash + k].dsthost_tag = dsthost_hash; +found_dst: + dsthost_idx = outer_hash + k; + q->hosts[dsthost_idx].dsthost_refcnt++; + q->flows[reduced_hash].dsthost = dsthost_idx; + } + } + + return reduced_hash; +} + +/* helper functions : might be changed when/if skb use a standard list_head */ +/* remove one skb from head of slot queue */ + +static struct sk_buff *dequeue_head(struct cake_flow *flow) +{ + struct sk_buff *skb = flow->head; + + if (skb) { + flow->head = skb->next; + skb->next = NULL; + } + + return skb; +} + +/* add skb to flow queue (tail add) */ + +static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb) +{ + if (!flow->head) + flow->head = skb; + else + flow->tail->next = skb; + flow->tail = skb; + skb->next = NULL; +} + +static struct iphdr *cake_get_iphdr(const struct sk_buff *skb, + struct ipv6hdr *buf) +{ + unsigned int offset = skb_network_offset(skb); + struct iphdr *iph; + + iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf); + + if (!iph) + return NULL; + + if (iph->version == 4 && iph->protocol == IPPROTO_IPV6) + return skb_header_pointer(skb, offset + iph->ihl * 4, + sizeof(struct ipv6hdr), buf); + + else if (iph->version == 4) + return iph; + + else if (iph->version == 6) + return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr), + buf); + + return NULL; +} + +static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb, + void *buf, unsigned int bufsize) +{ + unsigned int offset = skb_network_offset(skb); + const struct ipv6hdr *ipv6h; + const struct tcphdr *tcph; + const struct iphdr *iph; + struct ipv6hdr _ipv6h; + struct tcphdr _tcph; + + ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); + + if (!ipv6h) + return NULL; + + if 
(ipv6h->version == 4) { + iph = (struct iphdr *)ipv6h; + offset += iph->ihl * 4; + + /* special-case 6in4 tunnelling, as that is a common way to get + * v6 connectivity in the home + */ + if (iph->protocol == IPPROTO_IPV6) { + ipv6h = skb_header_pointer(skb, offset, + sizeof(_ipv6h), &_ipv6h); + + if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) + return NULL; + + offset += sizeof(struct ipv6hdr); + + } else if (iph->protocol != IPPROTO_TCP) { + return NULL; + } + + } else if (ipv6h->version == 6) { + if (ipv6h->nexthdr != IPPROTO_TCP) + return NULL; + + offset += sizeof(struct ipv6hdr); + } else { + return NULL; + } + + tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); + if (!tcph) + return NULL; + + return skb_header_pointer(skb, offset, + min(__tcp_hdrlen(tcph), bufsize), buf); +} + +static const void *cake_get_tcpopt(const struct tcphdr *tcph, + int code, int *oplen) +{ + /* inspired by tcp_parse_options in tcp_input.c */ + int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); + const u8 *ptr = (const u8 *)(tcph + 1); + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + if (opcode == TCPOPT_EOL) + break; + if (opcode == TCPOPT_NOP) { + length--; + continue; + } + opsize = *ptr++; + if (opsize < 2 || opsize > length) + break; + + if (opcode == code) { + *oplen = opsize; + return ptr; + } + + ptr += opsize - 2; + length -= opsize; + } + + return NULL; +} + +/* Compare two SACK sequences. A sequence is considered greater if it SACKs more + * bytes than the other. In the case where both sequences ACKs bytes that the + * other doesn't, A is considered greater. DSACKs in A also makes A be + * considered greater. 
+ * + * @return -1, 0 or 1 as normal compare functions + */ +static int cake_tcph_sack_compare(const struct tcphdr *tcph_a, + const struct tcphdr *tcph_b) +{ + const struct tcp_sack_block_wire *sack_a, *sack_b; + u32 ack_seq_a = ntohl(tcph_a->ack_seq); + u32 bytes_a = 0, bytes_b = 0; + int oplen_a, oplen_b; + bool first = true; + + sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a); + sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b); + + /* pointers point to option contents */ + oplen_a -= TCPOLEN_SACK_BASE; + oplen_b -= TCPOLEN_SACK_BASE; + + if (sack_a && oplen_a >= sizeof(*sack_a) && + (!sack_b || oplen_b < sizeof(*sack_b))) + return -1; + else if (sack_b && oplen_b >= sizeof(*sack_b) && + (!sack_a || oplen_a < sizeof(*sack_a))) + return 1; + else if ((!sack_a || oplen_a < sizeof(*sack_a)) && + (!sack_b || oplen_b < sizeof(*sack_b))) + return 0; + + while (oplen_a >= sizeof(*sack_a)) { + const struct tcp_sack_block_wire *sack_tmp = sack_b; + u32 start_a = get_unaligned_be32(&sack_a->start_seq); + u32 end_a = get_unaligned_be32(&sack_a->end_seq); + int oplen_tmp = oplen_b; + bool found = false; + + /* DSACK; always considered greater to prevent dropping */ + if (before(start_a, ack_seq_a)) + return -1; + + bytes_a += end_a - start_a; + + while (oplen_tmp >= sizeof(*sack_tmp)) { + u32 start_b = get_unaligned_be32(&sack_tmp->start_seq); + u32 end_b = get_unaligned_be32(&sack_tmp->end_seq); + + /* first time through we count the total size */ + if (first) + bytes_b += end_b - start_b; + + if (!after(start_b, start_a) && !before(end_b, end_a)) { + found = true; + if (!first) + break; + } + oplen_tmp -= sizeof(*sack_tmp); + sack_tmp++; + } + + if (!found) + return -1; + + oplen_a -= sizeof(*sack_a); + sack_a++; + first = false; + } + + /* If we made it this far, all ranges SACKed by A are covered by B, so + * either the SACKs are equal, or B SACKs more bytes. + */ + return bytes_b > bytes_a ? 
1 : 0; +} + +static void cake_tcph_get_tstamp(const struct tcphdr *tcph, + u32 *tsval, u32 *tsecr) +{ + const u8 *ptr; + int opsize; + + ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize); + + if (ptr && opsize == TCPOLEN_TIMESTAMP) { + *tsval = get_unaligned_be32(ptr); + *tsecr = get_unaligned_be32(ptr + 4); + } +} + +static bool cake_tcph_may_drop(const struct tcphdr *tcph, + u32 tstamp_new, u32 tsecr_new) +{ + /* inspired by tcp_parse_options in tcp_input.c */ + int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); + const u8 *ptr = (const u8 *)(tcph + 1); + u32 tstamp, tsecr; + + /* 3 reserved flags must be unset to avoid future breakage + * ACK must be set + * ECE/CWR are handled separately + * All other flags URG/PSH/RST/SYN/FIN must be unset + * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero) + * 0x00C00000 = CWR/ECE (handled separately) + * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000 + */ + if (((tcp_flag_word(tcph) & + cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK)) + return false; + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + if (opcode == TCPOPT_EOL) + break; + if (opcode == TCPOPT_NOP) { + length--; + continue; + } + opsize = *ptr++; + if (opsize < 2 || opsize > length) + break; + + switch (opcode) { + case TCPOPT_MD5SIG: /* doesn't influence state */ + break; + + case TCPOPT_SACK: /* stricter checking performed later */ + if (opsize % 8 != 2) + return false; + break; + + case TCPOPT_TIMESTAMP: + /* only drop timestamps lower than new */ + if (opsize != TCPOLEN_TIMESTAMP) + return false; + tstamp = get_unaligned_be32(ptr); + tsecr = get_unaligned_be32(ptr + 4); + if (after(tstamp, tstamp_new) || + after(tsecr, tsecr_new)) + return false; + break; + + case TCPOPT_MSS: /* these should only be set on SYN */ + case TCPOPT_WINDOW: + case TCPOPT_SACK_PERM: + case TCPOPT_FASTOPEN: + case TCPOPT_EXP: + default: /* don't drop if any unknown options are present */ + return false; + } + + ptr += opsize - 2; + length -= opsize; + } + + 
return true; +} + +static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, + struct cake_flow *flow) +{ + bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE; + struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL; + struct sk_buff *skb_check, *skb_prev = NULL; + const struct ipv6hdr *ipv6h, *ipv6h_check; + unsigned char _tcph[64], _tcph_check[64]; + const struct tcphdr *tcph, *tcph_check; + const struct iphdr *iph, *iph_check; + struct ipv6hdr _iph, _iph_check; + const struct sk_buff *skb; + int seglen, num_found = 0; + u32 tstamp = 0, tsecr = 0; + __be32 elig_flags = 0; + int sack_comp; + + /* no other possible ACKs to filter */ + if (flow->head == flow->tail) + return NULL; + + skb = flow->tail; + tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph)); + iph = cake_get_iphdr(skb, &_iph); + if (!tcph) + return NULL; + + cake_tcph_get_tstamp(tcph, &tstamp, &tsecr); + + /* the 'triggering' packet need only have the ACK flag set. + * also check that SYN is not set, as there won't be any previous ACKs. + */ + if ((tcp_flag_word(tcph) & + (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK) + return NULL; + + /* the 'triggering' ACK is at the tail of the queue, we have already + * returned if it is the only packet in the flow. loop through the rest + * of the queue looking for pure ACKs with the same 5-tuple as the + * triggering one. 
+ */ + for (skb_check = flow->head; + skb_check && skb_check != skb; + skb_prev = skb_check, skb_check = skb_check->next) { + iph_check = cake_get_iphdr(skb_check, &_iph_check); + tcph_check = cake_get_tcphdr(skb_check, &_tcph_check, + sizeof(_tcph_check)); + + /* only TCP packets with matching 5-tuple are eligible, and only + * drop safe headers + */ + if (!tcph_check || iph->version != iph_check->version || + tcph_check->source != tcph->source || + tcph_check->dest != tcph->dest) + continue; + + if (iph_check->version == 4) { + if (iph_check->saddr != iph->saddr || + iph_check->daddr != iph->daddr) + continue; + + seglen = ntohs(iph_check->tot_len) - + (4 * iph_check->ihl); + } else if (iph_check->version == 6) { + ipv6h = (struct ipv6hdr *)iph; + ipv6h_check = (struct ipv6hdr *)iph_check; + + if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) || + ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr)) + continue; + + seglen = ntohs(ipv6h_check->payload_len); + } else { + WARN_ON(1); /* shouldn't happen */ + continue; + } + + /* If the ECE/CWR flags changed from the previous eligible + * packet in the same flow, we should no longer be dropping that + * previous packet as this would lose information. + */ + if (elig_ack && (tcp_flag_word(tcph_check) & + (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) { + elig_ack = NULL; + elig_ack_prev = NULL; + num_found--; + } + + /* Check TCP options and flags, don't drop ACKs with segment + * data, and don't drop ACKs with a higher cumulative ACK + * counter than the triggering packet. Check ACK seqno here to + * avoid parsing SACK options of packets we are going to exclude + * anyway. + */ + if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) || + (seglen - __tcp_hdrlen(tcph_check)) != 0 || + after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq))) + continue; + + /* Check SACK options. 
The triggering packet must SACK more data + * than the ACK under consideration, or SACK the same range but + * have a larger cumulative ACK counter. The latter is a + * pathological case, but is contained in the following check + * anyway, just to be safe. + */ + sack_comp = cake_tcph_sack_compare(tcph_check, tcph); + + if (sack_comp < 0 || + (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) && + sack_comp == 0)) + continue; + + /* At this point we have found an eligible pure ACK to drop; if + * we are in aggressive mode, we are done. Otherwise, keep + * searching unless this is the second eligible ACK we + * found. + * + * Since we want to drop ACK closest to the head of the queue, + * save the first eligible ACK we find, even if we need to loop + * again. + */ + if (!elig_ack) { + elig_ack = skb_check; + elig_ack_prev = skb_prev; + elig_flags = (tcp_flag_word(tcph_check) + & (TCP_FLAG_ECE | TCP_FLAG_CWR)); + } + + if (num_found++ > 0) + goto found; + } + + /* We made it through the queue without finding two eligible ACKs . If + * we found a single eligible ACK we can drop it in aggressive mode if + * we can guarantee that this does not interfere with ECN flag + * information. We ensure this by dropping it only if the enqueued + * packet is consecutive with the eligible ACK, and their flags match. 
+ */ + if (elig_ack && aggressive && elig_ack->next == skb && + (elig_flags == (tcp_flag_word(tcph) & + (TCP_FLAG_ECE | TCP_FLAG_CWR)))) + goto found; + + return NULL; + +found: + if (elig_ack_prev) + elig_ack_prev->next = elig_ack->next; + else + flow->head = elig_ack->next; + + elig_ack->next = NULL; + + return elig_ack; +} + +static u64 cake_ewma(u64 avg, u64 sample, u32 shift) +{ + avg -= avg >> shift; + avg += sample >> shift; + return avg; +} + +static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off) +{ + if (q->rate_flags & CAKE_FLAG_OVERHEAD) + len -= off; + + if (q->max_netlen < len) + q->max_netlen = len; + if (q->min_netlen > len) + q->min_netlen = len; + + len += q->rate_overhead; + + if (len < q->rate_mpu) + len = q->rate_mpu; + + if (q->atm_mode == CAKE_ATM_ATM) { + len += 47; + len /= 48; + len *= 53; + } else if (q->atm_mode == CAKE_ATM_PTM) { + /* Add one byte per 64 bytes or part thereof. + * This is conservative and easier to calculate than the + * precise value. 
+ */ + len += (len + 63) / 64; + } + + if (q->max_adjlen < len) + q->max_adjlen = len; + if (q->min_adjlen > len) + q->min_adjlen = len; + + return len; +} + +static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + unsigned int hdr_len, last_len = 0; + u32 off = skb_network_offset(skb); + u32 len = qdisc_pkt_len(skb); + u16 segs = 1; + + q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8); + + if (!shinfo->gso_size) + return cake_calc_overhead(q, len, off); + + /* borrowed from qdisc_pkt_len_init() */ + hdr_len = skb_transport_header(skb) - skb_mac_header(skb); + + /* + transport layer */ + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | + SKB_GSO_TCPV6))) { + const struct tcphdr *th; + struct tcphdr _tcphdr; + + th = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_tcphdr), &_tcphdr); + if (likely(th)) + hdr_len += __tcp_hdrlen(th); + } else { + struct udphdr _udphdr; + + if (skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_udphdr), &_udphdr)) + hdr_len += sizeof(struct udphdr); + } + + if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) + segs = DIV_ROUND_UP(skb->len - hdr_len, + shinfo->gso_size); + else + segs = shinfo->gso_segs; + + len = shinfo->gso_size + hdr_len; + last_len = skb->len - shinfo->gso_size * (segs - 1); + + return (cake_calc_overhead(q, len, off) * (segs - 1) + + cake_calc_overhead(q, last_len, off)); +} + +static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j) +{ + struct cake_heap_entry ii = q->overflow_heap[i]; + struct cake_heap_entry jj = q->overflow_heap[j]; + + q->overflow_heap[i] = jj; + q->overflow_heap[j] = ii; + + q->tins[ii.t].overflow_idx[ii.b] = j; + q->tins[jj.t].overflow_idx[jj.b] = i; +} + +static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i) +{ + struct cake_heap_entry ii = q->overflow_heap[i]; + + return q->tins[ii.t].backlogs[ii.b]; +} + +static void cake_heapify(struct cake_sched_data 
*q, u16 i) +{ + static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES; + u32 mb = cake_heap_get_backlog(q, i); + u32 m = i; + + while (m < a) { + u32 l = m + m + 1; + u32 r = l + 1; + + if (l < a) { + u32 lb = cake_heap_get_backlog(q, l); + + if (lb > mb) { + m = l; + mb = lb; + } + } + + if (r < a) { + u32 rb = cake_heap_get_backlog(q, r); + + if (rb > mb) { + m = r; + mb = rb; + } + } + + if (m != i) { + cake_heap_swap(q, i, m); + i = m; + } else { + break; + } + } +} + +static void cake_heapify_up(struct cake_sched_data *q, u16 i) +{ + while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) { + u16 p = (i - 1) >> 1; + u32 ib = cake_heap_get_backlog(q, i); + u32 pb = cake_heap_get_backlog(q, p); + + if (ib > pb) { + cake_heap_swap(q, i, p); + i = p; + } else { + break; + } + } +} + +static int cake_advance_shaper(struct cake_sched_data *q, + struct cake_tin_data *b, + struct sk_buff *skb, + ktime_t now, bool drop) +{ + u32 len = get_cobalt_cb(skb)->adjusted_len; + + /* charge packet bandwidth to this tin + * and to the global shaper. 
+ */ + if (q->rate_ns) { + u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft; + u64 global_dur = (len * q->rate_ns) >> q->rate_shft; + u64 failsafe_dur = global_dur + (global_dur >> 1); + + if (ktime_before(b->time_next_packet, now)) + b->time_next_packet = ktime_add_ns(b->time_next_packet, + tin_dur); + + else if (ktime_before(b->time_next_packet, + ktime_add_ns(now, tin_dur))) + b->time_next_packet = ktime_add_ns(now, tin_dur); + + q->time_next_packet = ktime_add_ns(q->time_next_packet, + global_dur); + if (!drop) + q->failsafe_next_packet = \ + ktime_add_ns(q->failsafe_next_packet, + failsafe_dur); + } + return len; +} + +static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) +{ + struct cake_sched_data *q = qdisc_priv(sch); + ktime_t now = ktime_get(); + u32 idx = 0, tin = 0, len; + struct cake_heap_entry qq; + struct cake_tin_data *b; + struct cake_flow *flow; + struct sk_buff *skb; + + if (!q->overflow_timeout) { + int i; + /* Build fresh max-heap */ + for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--) + cake_heapify(q, i); + } + q->overflow_timeout = 65535; + + /* select longest queue for pruning */ + qq = q->overflow_heap[0]; + tin = qq.t; + idx = qq.b; + + b = &q->tins[tin]; + flow = &b->flows[idx]; + skb = dequeue_head(flow); + if (unlikely(!skb)) { + /* heap has gone wrong, rebuild it next time */ + q->overflow_timeout = 0; + return idx + (tin << 16); + } + + if (cobalt_queue_full(&flow->cvars, &b->cparams, now)) + b->unresponsive_flow_count++; + + len = qdisc_pkt_len(skb); + q->buffer_used -= skb->truesize; + b->backlogs[idx] -= len; + b->tin_backlog -= len; + sch->qstats.backlog -= len; + qdisc_tree_reduce_backlog(sch, 1, len); + + flow->dropped++; + b->tin_dropped++; + sch->qstats.drops++; + + if (q->rate_flags & CAKE_FLAG_INGRESS) + cake_advance_shaper(q, b, skb, now, true); + + __qdisc_drop(skb, to_free); + sch->q.qlen--; + + cake_heapify(q, 0); + + return idx + (tin << 16); +} + +static void cake_wash_diffserv(struct 
sk_buff *skb) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): + ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0); + break; + case htons(ETH_P_IPV6): + ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0); + break; + default: + break; + } +} + +static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash) +{ + u8 dscp; + + switch (skb->protocol) { + case htons(ETH_P_IP): + dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2; + if (wash && dscp) + ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0); + return dscp; + + case htons(ETH_P_IPV6): + dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2; + if (wash && dscp) + ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0); + return dscp; + + case htons(ETH_P_ARP): + return 0x38; /* CS7 - Net Control */ + + default: + /* If there is no Diffserv field, treat as best-effort */ + return 0; + } +} + +static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, + struct sk_buff *skb) +{ + struct cake_sched_data *q = qdisc_priv(sch); + u32 tin; + + if (TC_H_MAJ(skb->priority) == sch->handle && + TC_H_MIN(skb->priority) > 0 && + TC_H_MIN(skb->priority) <= q->tin_cnt) { + tin = q->tin_order[TC_H_MIN(skb->priority) - 1]; + + if (q->rate_flags & CAKE_FLAG_WASH) + cake_wash_diffserv(skb); + } else if (q->tin_mode != CAKE_DIFFSERV_BESTEFFORT) { + /* extract the Diffserv Precedence field, if it exists */ + /* and clear DSCP bits if washing */ + tin = q->tin_index[cake_handle_diffserv(skb, + q->rate_flags & CAKE_FLAG_WASH)]; + if (unlikely(tin >= q->tin_cnt)) + tin = 0; + } else { + tin = 0; + if (q->rate_flags & CAKE_FLAG_WASH) + cake_wash_diffserv(skb); + } + + return &q->tins[tin]; +} + +static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t, + struct sk_buff *skb, int flow_mode, int *qerr) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct tcf_proto *filter; + struct tcf_result res; + u32 flow = 0; + int result; + + filter = rcu_dereference_bh(q->filter_list); + if (!filter) + goto hash; + + *qerr = NET_XMIT_SUCCESS 
| __NET_XMIT_BYPASS; + result = tcf_classify(skb, filter, &res, false); + + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ + case TC_ACT_SHOT: + return 0; + } +#endif + if (TC_H_MIN(res.classid) <= CAKE_QUEUES) + flow = TC_H_MIN(res.classid); + } +hash: + *t = cake_select_tin(sch, skb); + return flow ?: cake_hash(*t, skb, flow_mode) + 1; +} + +static void cake_reconfigure(struct Qdisc *sch); + +static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct cake_sched_data *q = qdisc_priv(sch); + int len = qdisc_pkt_len(skb); + int uninitialized_var(ret); + struct sk_buff *ack = NULL; + ktime_t now = ktime_get(); + struct cake_tin_data *b; + struct cake_flow *flow; + u32 idx; + + /* choose flow to insert into */ + idx = cake_classify(sch, &b, skb, q->flow_mode, &ret); + if (idx == 0) { + if (ret & __NET_XMIT_BYPASS) + qdisc_qstats_drop(sch); + __qdisc_drop(skb, to_free); + return ret; + } + idx--; + flow = &b->flows[idx]; + + /* ensure shaper state isn't stale */ + if (!b->tin_backlog) { + if (ktime_before(b->time_next_packet, now)) + b->time_next_packet = now; + + if (!sch->q.qlen) { + if (ktime_before(q->time_next_packet, now)) { + q->failsafe_next_packet = now; + q->time_next_packet = now; + } else if (ktime_after(q->time_next_packet, now) && + ktime_after(q->failsafe_next_packet, now)) { + u64 next = \ + min(ktime_to_ns(q->time_next_packet), + ktime_to_ns( + q->failsafe_next_packet)); + sch->qstats.overlimits++; + qdisc_watchdog_schedule_ns(&q->watchdog, next); + } + } + } + + if (unlikely(len > b->max_skblen)) + b->max_skblen = len; + + if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { + struct sk_buff *segs, *nskb; + netdev_features_t features = netif_skb_features(skb); + unsigned int slen = 0; + + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + 
if (IS_ERR_OR_NULL(segs)) + return qdisc_drop(skb, sch, to_free); + + while (segs) { + nskb = segs->next; + segs->next = NULL; + qdisc_skb_cb(segs)->pkt_len = segs->len; + cobalt_set_enqueue_time(segs, now); + get_cobalt_cb(segs)->adjusted_len = cake_overhead(q, + segs); + flow_queue_add(flow, segs); + + sch->q.qlen++; + slen += segs->len; + q->buffer_used += segs->truesize; + b->packets++; + segs = nskb; + } + + /* stats */ + b->bytes += slen; + b->backlogs[idx] += slen; + b->tin_backlog += slen; + sch->qstats.backlog += slen; + q->avg_window_bytes += slen; + + qdisc_tree_reduce_backlog(sch, 1, len); + consume_skb(skb); + } else { + /* not splitting */ + cobalt_set_enqueue_time(skb, now); + get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb); + flow_queue_add(flow, skb); + + if (q->ack_filter) + ack = cake_ack_filter(q, flow); + + if (ack) { + b->ack_drops++; + sch->qstats.drops++; + b->bytes += qdisc_pkt_len(ack); + len -= qdisc_pkt_len(ack); + q->buffer_used += skb->truesize - ack->truesize; + if (q->rate_flags & CAKE_FLAG_INGRESS) + cake_advance_shaper(q, b, ack, now, true); + + qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack)); + consume_skb(ack); + } else { + sch->q.qlen++; + q->buffer_used += skb->truesize; + } + + /* stats */ + b->packets++; + b->bytes += len; + b->backlogs[idx] += len; + b->tin_backlog += len; + sch->qstats.backlog += len; + q->avg_window_bytes += len; + } + + if (q->overflow_timeout) + cake_heapify_up(q, b->overflow_idx[idx]); + + /* incoming bandwidth capacity estimate */ + if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) { + u64 packet_interval = \ + ktime_to_ns(ktime_sub(now, q->last_packet_time)); + + if (packet_interval > NSEC_PER_SEC) + packet_interval = NSEC_PER_SEC; + + /* filter out short-term bursts, eg. wifi aggregation */ + q->avg_packet_interval = \ + cake_ewma(q->avg_packet_interval, + packet_interval, + (packet_interval > q->avg_packet_interval ? 
+ 2 : 8)); + + q->last_packet_time = now; + + if (packet_interval > q->avg_packet_interval) { + u64 window_interval = \ + ktime_to_ns(ktime_sub(now, + q->avg_window_begin)); + u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC; + + do_div(b, window_interval); + q->avg_peak_bandwidth = + cake_ewma(q->avg_peak_bandwidth, b, + b > q->avg_peak_bandwidth ? 2 : 8); + q->avg_window_bytes = 0; + q->avg_window_begin = now; + + if (ktime_after(now, + ktime_add_ms(q->last_reconfig_time, + 250))) { + q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4; + cake_reconfigure(sch); + } + } + } else { + q->avg_window_bytes = 0; + q->last_packet_time = now; + } + + /* flowchain */ + if (!flow->set || flow->set == CAKE_SET_DECAYING) { + struct cake_host *srchost = &b->hosts[flow->srchost]; + struct cake_host *dsthost = &b->hosts[flow->dsthost]; + u16 host_load = 1; + + if (!flow->set) { + list_add_tail(&flow->flowchain, &b->new_flows); + } else { + b->decaying_flow_count--; + list_move_tail(&flow->flowchain, &b->new_flows); + } + flow->set = CAKE_SET_SPARSE; + b->sparse_flow_count++; + + if (cake_dsrc(q->flow_mode)) + host_load = max(host_load, srchost->srchost_refcnt); + + if (cake_ddst(q->flow_mode)) + host_load = max(host_load, dsthost->dsthost_refcnt); + + flow->deficit = (b->flow_quantum * + quantum_div[host_load]) >> 16; + } else if (flow->set == CAKE_SET_SPARSE_WAIT) { + /* this flow was empty, accounted as a sparse flow, but actually + * in the bulk rotation. 
+ */ + flow->set = CAKE_SET_BULK; + b->sparse_flow_count--; + b->bulk_flow_count++; + } + + if (q->buffer_used > q->buffer_max_used) + q->buffer_max_used = q->buffer_used; + + if (q->buffer_used > q->buffer_limit) { + u32 dropped = 0; + + while (q->buffer_used > q->buffer_limit) { + dropped++; + cake_drop(sch, to_free); + } + b->drop_overlimit += dropped; + } + return NET_XMIT_SUCCESS; +} + +static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct cake_tin_data *b = &q->tins[q->cur_tin]; + struct cake_flow *flow = &b->flows[q->cur_flow]; + struct sk_buff *skb = NULL; + u32 len; + + if (flow->head) { + skb = dequeue_head(flow); + len = qdisc_pkt_len(skb); + b->backlogs[q->cur_flow] -= len; + b->tin_backlog -= len; + sch->qstats.backlog -= len; + q->buffer_used -= skb->truesize; + sch->q.qlen--; + + if (q->overflow_timeout) + cake_heapify(q, b->overflow_idx[q->cur_flow]); + } + return skb; +} + +/* Discard leftover packets from a tin no longer in use. 
*/ +static void cake_clear_tin(struct Qdisc *sch, u16 tin) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + + q->cur_tin = tin; + for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++) + while (!!(skb = cake_dequeue_one(sch))) + kfree_skb(skb); +} + +static struct sk_buff *cake_dequeue(struct Qdisc *sch) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct cake_tin_data *b = &q->tins[q->cur_tin]; + struct cake_host *srchost, *dsthost; + ktime_t now = ktime_get(); + struct cake_flow *flow; + struct list_head *head; + bool first_flow = true; + struct sk_buff *skb; + u16 host_load; + u64 delay; + u32 len; + +begin: + if (!sch->q.qlen) + return NULL; + + /* global hard shaper */ + if (ktime_after(q->time_next_packet, now) && + ktime_after(q->failsafe_next_packet, now)) { + u64 next = min(ktime_to_ns(q->time_next_packet), + ktime_to_ns(q->failsafe_next_packet)); + + sch->qstats.overlimits++; + qdisc_watchdog_schedule_ns(&q->watchdog, next); + return NULL; + } + + /* Choose a class to work on. */ + if (!q->rate_ns) { + /* In unlimited mode, can't rely on shaper timings, just balance + * with DRR + */ + bool wrapped = false, empty = true; + + while (b->tin_deficit < 0 || + !(b->sparse_flow_count + b->bulk_flow_count)) { + if (b->tin_deficit <= 0) + b->tin_deficit += b->tin_quantum_band; + if (b->sparse_flow_count + b->bulk_flow_count) + empty = false; + + q->cur_tin++; + b++; + if (q->cur_tin >= q->tin_cnt) { + q->cur_tin = 0; + b = q->tins; + + if (wrapped) { + /* It's possible for q->qlen to be + * nonzero when we actually have no + * packets anywhere. + */ + if (empty) + return NULL; + } else { + wrapped = true; + } + } + } + } else { + /* In shaped mode, choose: + * - Highest-priority tin with queue and meeting schedule, or + * - The earliest-scheduled tin with queue. 
+ */ + ktime_t best_time = KTIME_MAX; + int tin, best_tin = 0; + + for (tin = 0; tin < q->tin_cnt; tin++) { + b = q->tins + tin; + if ((b->sparse_flow_count + b->bulk_flow_count) > 0) { + ktime_t time_to_pkt = \ + ktime_sub(b->time_next_packet, now); + + if (ktime_to_ns(time_to_pkt) <= 0 || + ktime_compare(time_to_pkt, + best_time) <= 0) { + best_time = time_to_pkt; + best_tin = tin; + } + } + } + + q->cur_tin = best_tin; + b = q->tins + best_tin; + + /* No point in going further if no packets to deliver. */ + if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count))) + return NULL; + } + +retry: + /* service this class */ + head = &b->decaying_flows; + if (!first_flow || list_empty(head)) { + head = &b->new_flows; + if (list_empty(head)) { + head = &b->old_flows; + if (unlikely(list_empty(head))) { + head = &b->decaying_flows; + if (unlikely(list_empty(head))) + goto begin; + } + } + } + flow = list_first_entry(head, struct cake_flow, flowchain); + q->cur_flow = flow - b->flows; + first_flow = false; + + /* triple isolation (modified DRR++) */ + srchost = &b->hosts[flow->srchost]; + dsthost = &b->hosts[flow->dsthost]; + host_load = 1; + + if (cake_dsrc(q->flow_mode)) + host_load = max(host_load, srchost->srchost_refcnt); + + if (cake_ddst(q->flow_mode)) + host_load = max(host_load, dsthost->dsthost_refcnt); + + WARN_ON(host_load > CAKE_QUEUES); + + /* flow isolation (DRR++) */ + if (flow->deficit <= 0) { + /* The shifted prandom_u32() is a way to apply dithering to + * avoid accumulating roundoff errors + */ + flow->deficit += (b->flow_quantum * quantum_div[host_load] + + (prandom_u32() >> 16)) >> 16; + list_move_tail(&flow->flowchain, &b->old_flows); + + /* Keep all flows with deficits out of the sparse and decaying + * rotations. 
No non-empty flow can go into the decaying + * rotation, so they can't get deficits + */ + if (flow->set == CAKE_SET_SPARSE) { + if (flow->head) { + b->sparse_flow_count--; + b->bulk_flow_count++; + flow->set = CAKE_SET_BULK; + } else { + /* we've moved it to the bulk rotation for + * correct deficit accounting but we still want + * to count it as a sparse flow, not a bulk one. + */ + flow->set = CAKE_SET_SPARSE_WAIT; + } + } + goto retry; + } + + /* Retrieve a packet via the AQM */ + while (1) { + skb = cake_dequeue_one(sch); + if (!skb) { + /* this queue was actually empty */ + if (cobalt_queue_empty(&flow->cvars, &b->cparams, now)) + b->unresponsive_flow_count--; + + if (flow->cvars.p_drop || flow->cvars.count || + ktime_before(now, flow->cvars.drop_next)) { + /* keep in the flowchain until the state has + * decayed to rest + */ + list_move_tail(&flow->flowchain, + &b->decaying_flows); + if (flow->set == CAKE_SET_BULK) { + b->bulk_flow_count--; + b->decaying_flow_count++; + } else if (flow->set == CAKE_SET_SPARSE || + flow->set == CAKE_SET_SPARSE_WAIT) { + b->sparse_flow_count--; + b->decaying_flow_count++; + } + flow->set = CAKE_SET_DECAYING; + } else { + /* remove empty queue from the flowchain */ + list_del_init(&flow->flowchain); + if (flow->set == CAKE_SET_SPARSE || + flow->set == CAKE_SET_SPARSE_WAIT) + b->sparse_flow_count--; + else if (flow->set == CAKE_SET_BULK) + b->bulk_flow_count--; + else + b->decaying_flow_count--; + + flow->set = CAKE_SET_NONE; + srchost->srchost_refcnt--; + dsthost->dsthost_refcnt--; + } + goto begin; + } + + /* Last packet in queue may be marked, shouldn't be dropped */ + if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, + (b->bulk_flow_count * + !!(q->rate_flags & + CAKE_FLAG_INGRESS))) || + !flow->head) + break; + + /* drop this packet, get another one */ + if (q->rate_flags & CAKE_FLAG_INGRESS) { + len = cake_advance_shaper(q, b, skb, + now, true); + flow->deficit -= len; + b->tin_deficit -= len; + } + 
flow->dropped++; + b->tin_dropped++; + qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); + qdisc_qstats_drop(sch); + kfree_skb(skb); + if (q->rate_flags & CAKE_FLAG_INGRESS) + goto retry; + } + + b->tin_ecn_mark += !!flow->cvars.ecn_marked; + qdisc_bstats_update(sch, skb); + + /* collect delay stats */ + delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); + b->avge_delay = cake_ewma(b->avge_delay, delay, 8); + b->peak_delay = cake_ewma(b->peak_delay, delay, + delay > b->peak_delay ? 2 : 8); + b->base_delay = cake_ewma(b->base_delay, delay, + delay < b->base_delay ? 2 : 8); + + len = cake_advance_shaper(q, b, skb, now, false); + flow->deficit -= len; + b->tin_deficit -= len; + + if (ktime_after(q->time_next_packet, now) && sch->q.qlen) { + u64 next = min(ktime_to_ns(q->time_next_packet), + ktime_to_ns(q->failsafe_next_packet)); + + qdisc_watchdog_schedule_ns(&q->watchdog, next); + } else if (!sch->q.qlen) { + int i; + + for (i = 0; i < q->tin_cnt; i++) { + if (q->tins[i].decaying_flow_count) { + ktime_t next = \ + ktime_add_ns(now, + q->tins[i].cparams.target); + + qdisc_watchdog_schedule_ns(&q->watchdog, + ktime_to_ns(next)); + break; + } + } + } + + if (q->overflow_timeout) + q->overflow_timeout--; + + return skb; +} + +static void cake_reset(struct Qdisc *sch) +{ + u32 c; + + for (c = 0; c < CAKE_MAX_TINS; c++) + cake_clear_tin(sch, c); +} + +static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = { + [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 }, + [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 }, + [TCA_CAKE_ATM] = { .type = NLA_U32 }, + [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 }, + [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 }, + [TCA_CAKE_RTT] = { .type = NLA_U32 }, + [TCA_CAKE_TARGET] = { .type = NLA_U32 }, + [TCA_CAKE_AUTORATE] = { .type = NLA_U32 }, + [TCA_CAKE_MEMORY] = { .type = NLA_U32 }, + [TCA_CAKE_NAT] = { .type = NLA_U32 }, + [TCA_CAKE_RAW] = { .type = NLA_U32 }, + [TCA_CAKE_WASH] = { .type = NLA_U32 }, + [TCA_CAKE_MPU] = { .type 
= NLA_U32 }, + [TCA_CAKE_INGRESS] = { .type = NLA_U32 }, + [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 }, +}; + +static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, + u64 target_ns, u64 rtt_est_ns) +{ + /* convert byte-rate into time-per-byte + * so it will always unwedge in reasonable time. + */ + static const u64 MIN_RATE = 64; + u32 byte_target = mtu; + u64 byte_target_ns; + u8 rate_shft = 0; + u64 rate_ns = 0; + + b->flow_quantum = 1514; + if (rate) { + b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL); + rate_shft = 34; + rate_ns = ((u64)NSEC_PER_SEC) << rate_shft; + rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate)); + while (!!(rate_ns >> 34)) { + rate_ns >>= 1; + rate_shft--; + } + } /* else unlimited, ie. zero delay */ + + b->tin_rate_bps = rate; + b->tin_rate_ns = rate_ns; + b->tin_rate_shft = rate_shft; + + byte_target_ns = (byte_target * rate_ns) >> rate_shft; + + b->cparams.target = max((byte_target_ns * 3) / 2, target_ns); + b->cparams.interval = max(rtt_est_ns + + b->cparams.target - target_ns, + b->cparams.target * 2); + b->cparams.mtu_time = byte_target_ns; + b->cparams.p_inc = 1 << 24; /* 1/256 */ + b->cparams.p_dec = 1 << 20; /* 1/4096 */ +} + +static int cake_config_besteffort(struct Qdisc *sch) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct cake_tin_data *b = &q->tins[0]; + u32 mtu = psched_mtu(qdisc_dev(sch)); + u64 rate = q->rate_bps; + + q->tin_cnt = 1; + + q->tin_index = besteffort; + q->tin_order = normal_order; + + cake_set_rate(b, rate, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + b->tin_quantum_band = 65535; + b->tin_quantum_prio = 65535; + + return 0; +} + +static int cake_config_precedence(struct Qdisc *sch) +{ + /* convert high-level (user visible) parameters into internal format */ + struct cake_sched_data *q = qdisc_priv(sch); + u32 mtu = psched_mtu(qdisc_dev(sch)); + u64 rate = q->rate_bps; + u32 quantum1 = 256; + u32 quantum2 = 256; + u32 i; + + q->tin_cnt = 8; + q->tin_index = 
precedence; + q->tin_order = normal_order; + + for (i = 0; i < q->tin_cnt; i++) { + struct cake_tin_data *b = &q->tins[i]; + + cake_set_rate(b, rate, mtu, us_to_ns(q->target), + us_to_ns(q->interval)); + + b->tin_quantum_prio = max_t(u16, 1U, quantum1); + b->tin_quantum_band = max_t(u16, 1U, quantum2); + + /* calculate next class's parameters */ + rate *= 7; + rate >>= 3; + + quantum1 *= 3; + quantum1 >>= 1; + + quantum2 *= 7; + quantum2 >>= 3; + } + + return 0; +} + +/* List of known Diffserv codepoints: + * + * Least Effort (CS1) + * Best Effort (CS0) + * Max Reliability & LLT "Lo" (TOS1) + * Max Throughput (TOS2) + * Min Delay (TOS4) + * LLT "La" (TOS5) + * Assured Forwarding 1 (AF1x) - x3 + * Assured Forwarding 2 (AF2x) - x3 + * Assured Forwarding 3 (AF3x) - x3 + * Assured Forwarding 4 (AF4x) - x3 + * Precedence Class 2 (CS2) + * Precedence Class 3 (CS3) + * Precedence Class 4 (CS4) + * Precedence Class 5 (CS5) + * Precedence Class 6 (CS6) + * Precedence Class 7 (CS7) + * Voice Admit (VA) + * Expedited Forwarding (EF) + + * Total 25 codepoints. + */ + +/* List of traffic classes in RFC 4594: + * (roughly descending order of contended priority) + * (roughly ascending order of uncontended throughput) + * + * Network Control (CS6,CS7) - routing traffic + * Telephony (EF,VA) - aka. VoIP streams + * Signalling (CS5) - VoIP setup + * Multimedia Conferencing (AF4x) - aka. video calls + * Realtime Interactive (CS4) - eg. games + * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch + * Broadcast Video (CS3) + * Low Latency Data (AF2x,TOS4) - eg. database + * Ops, Admin, Management (CS2,TOS1) - eg. ssh + * Standard Service (CS0 & unrecognised codepoints) + * High Throughput Data (AF1x,TOS2) - eg. web traffic + * Low Priority Data (CS1) - eg. BitTorrent + + * Total 12 traffic classes. 
+ */ + +static int cake_config_diffserv8(struct Qdisc *sch) +{ +/* Pruned list of traffic classes for typical applications: + * + * Network Control (CS6, CS7) + * Minimum Latency (EF, VA, CS5, CS4) + * Interactive Shell (CS2, TOS1) + * Low Latency Transactions (AF2x, TOS4) + * Video Streaming (AF4x, AF3x, CS3) + * Bog Standard (CS0 etc.) + * High Throughput (AF1x, TOS2) + * Background Traffic (CS1) + * + * Total 8 traffic classes. + */ + + struct cake_sched_data *q = qdisc_priv(sch); + u32 mtu = psched_mtu(qdisc_dev(sch)); + u64 rate = q->rate_bps; + u32 quantum1 = 256; + u32 quantum2 = 256; + u32 i; + + q->tin_cnt = 8; + + /* codepoint to class mapping */ + q->tin_index = diffserv8; + q->tin_order = normal_order; + + /* class characteristics */ + for (i = 0; i < q->tin_cnt; i++) { + struct cake_tin_data *b = &q->tins[i]; + + cake_set_rate(b, rate, mtu, us_to_ns(q->target), + us_to_ns(q->interval)); + + b->tin_quantum_prio = max_t(u16, 1U, quantum1); + b->tin_quantum_band = max_t(u16, 1U, quantum2); + + /* calculate next class's parameters */ + rate *= 7; + rate >>= 3; + + quantum1 *= 3; + quantum1 >>= 1; + + quantum2 *= 7; + quantum2 >>= 3; + } + + return 0; +} + +static int cake_config_diffserv4(struct Qdisc *sch) +{ +/* Further pruned list of traffic classes for four-class system: + * + * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4) + * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1) + * Best Effort (CS0, AF1x, TOS2, and those not specified) + * Background Traffic (CS1) + * + * Total 4 traffic classes. 
+ */ + + struct cake_sched_data *q = qdisc_priv(sch); + u32 mtu = psched_mtu(qdisc_dev(sch)); + u64 rate = q->rate_bps; + u32 quantum = 1024; + + q->tin_cnt = 4; + + /* codepoint to class mapping */ + q->tin_index = diffserv4; + q->tin_order = bulk_order; + + /* class characteristics */ + cake_set_rate(&q->tins[0], rate, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + cake_set_rate(&q->tins[1], rate >> 4, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + cake_set_rate(&q->tins[2], rate >> 1, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + cake_set_rate(&q->tins[3], rate >> 2, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + + /* priority weights */ + q->tins[0].tin_quantum_prio = quantum; + q->tins[1].tin_quantum_prio = quantum >> 4; + q->tins[2].tin_quantum_prio = quantum << 2; + q->tins[3].tin_quantum_prio = quantum << 4; + + /* bandwidth-sharing weights */ + q->tins[0].tin_quantum_band = quantum; + q->tins[1].tin_quantum_band = quantum >> 4; + q->tins[2].tin_quantum_band = quantum >> 1; + q->tins[3].tin_quantum_band = quantum >> 2; + + return 0; +} + +static int cake_config_diffserv3(struct Qdisc *sch) +{ +/* Simplified Diffserv structure with 3 tins. 
+ * Low Priority (CS1) + * Best Effort + * Latency Sensitive (TOS4, VA, EF, CS6, CS7) + */ + struct cake_sched_data *q = qdisc_priv(sch); + u32 mtu = psched_mtu(qdisc_dev(sch)); + u64 rate = q->rate_bps; + u32 quantum = 1024; + + q->tin_cnt = 3; + + /* codepoint to class mapping */ + q->tin_index = diffserv3; + q->tin_order = bulk_order; + + /* class characteristics */ + cake_set_rate(&q->tins[0], rate, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + cake_set_rate(&q->tins[1], rate >> 4, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + cake_set_rate(&q->tins[2], rate >> 2, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + + /* priority weights */ + q->tins[0].tin_quantum_prio = quantum; + q->tins[1].tin_quantum_prio = quantum >> 4; + q->tins[2].tin_quantum_prio = quantum << 4; + + /* bandwidth-sharing weights */ + q->tins[0].tin_quantum_band = quantum; + q->tins[1].tin_quantum_band = quantum >> 4; + q->tins[2].tin_quantum_band = quantum >> 2; + + return 0; +} + +static void cake_reconfigure(struct Qdisc *sch) +{ + struct cake_sched_data *q = qdisc_priv(sch); + int c, ft; + + switch (q->tin_mode) { + case CAKE_DIFFSERV_BESTEFFORT: + ft = cake_config_besteffort(sch); + break; + + case CAKE_DIFFSERV_PRECEDENCE: + ft = cake_config_precedence(sch); + break; + + case CAKE_DIFFSERV_DIFFSERV8: + ft = cake_config_diffserv8(sch); + break; + + case CAKE_DIFFSERV_DIFFSERV4: + ft = cake_config_diffserv4(sch); + break; + + case CAKE_DIFFSERV_DIFFSERV3: + default: + ft = cake_config_diffserv3(sch); + break; + } + + for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) { + cake_clear_tin(sch, c); + q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time; + } + + q->rate_ns = q->tins[ft].tin_rate_ns; + q->rate_shft = q->tins[ft].tin_rate_shft; + + if (q->buffer_config_limit) { + q->buffer_limit = q->buffer_config_limit; + } else if (q->rate_bps) { + u64 t = q->rate_bps * q->interval; + + do_div(t, USEC_PER_SEC / 4); + q->buffer_limit = max_t(u32, t, 4U << 20); + } else { + 
q->buffer_limit = ~0; + } + + sch->flags &= ~TCQ_F_CAN_BYPASS; + + q->buffer_limit = min(q->buffer_limit, + max(sch->limit * psched_mtu(qdisc_dev(sch)), + q->buffer_config_limit)); +} + +static int cake_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_CAKE_MAX + 1]; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack); + if (err < 0) + return err; + + if (tb[TCA_CAKE_NAT]) { +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + q->flow_mode &= ~CAKE_FLOW_NAT_FLAG; + q->flow_mode |= CAKE_FLOW_NAT_FLAG * + !!nla_get_u32(tb[TCA_CAKE_NAT]); +#else + NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT], + "No conntrack support in kernel"); + return -EOPNOTSUPP; +#endif + } + + if (tb[TCA_CAKE_BASE_RATE64]) + q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]); + + if (tb[TCA_CAKE_DIFFSERV_MODE]) + q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]); + + if (tb[TCA_CAKE_WASH]) { + if (!!nla_get_u32(tb[TCA_CAKE_WASH])) + q->rate_flags |= CAKE_FLAG_WASH; + else + q->rate_flags &= ~CAKE_FLAG_WASH; + } + + if (tb[TCA_CAKE_FLOW_MODE]) + q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) | + (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & + CAKE_FLOW_MASK)); + + if (tb[TCA_CAKE_ATM]) + q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]); + + if (tb[TCA_CAKE_OVERHEAD]) { + q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]); + q->rate_flags |= CAKE_FLAG_OVERHEAD; + + q->max_netlen = 0; + q->max_adjlen = 0; + q->min_netlen = ~0; + q->min_adjlen = ~0; + } + + if (tb[TCA_CAKE_RAW]) { + q->rate_flags &= ~CAKE_FLAG_OVERHEAD; + + q->max_netlen = 0; + q->max_adjlen = 0; + q->min_netlen = ~0; + q->min_adjlen = ~0; + } + + if (tb[TCA_CAKE_MPU]) + q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]); + + if (tb[TCA_CAKE_RTT]) { + q->interval = nla_get_u32(tb[TCA_CAKE_RTT]); + + if (!q->interval) + q->interval = 1; + } + + if (tb[TCA_CAKE_TARGET]) { + q->target = 
nla_get_u32(tb[TCA_CAKE_TARGET]); + + if (!q->target) + q->target = 1; + } + + if (tb[TCA_CAKE_AUTORATE]) { + if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) + q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; + else + q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; + } + + if (tb[TCA_CAKE_INGRESS]) { + if (!!nla_get_u32(tb[TCA_CAKE_INGRESS])) + q->rate_flags |= CAKE_FLAG_INGRESS; + else + q->rate_flags &= ~CAKE_FLAG_INGRESS; + } + + if (tb[TCA_CAKE_ACK_FILTER]) + q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]); + + if (tb[TCA_CAKE_MEMORY]) + q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]); + + if (q->rate_bps && q->rate_bps <= CAKE_SPLIT_GSO_THRESHOLD) + q->rate_flags |= CAKE_FLAG_SPLIT_GSO; + else + q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO; + + if (q->tins) { + sch_tree_lock(sch); + cake_reconfigure(sch); + sch_tree_unlock(sch); + } + + return 0; +} + +static void cake_destroy(struct Qdisc *sch) +{ + struct cake_sched_data *q = qdisc_priv(sch); + + qdisc_watchdog_cancel(&q->watchdog); + tcf_block_put(q->block); + kvfree(q->tins); +} + +static int cake_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct cake_sched_data *q = qdisc_priv(sch); + int i, j, err; + + sch->limit = 10240; + q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; + q->flow_mode = CAKE_FLOW_TRIPLE; + + q->rate_bps = 0; /* unlimited by default */ + + q->interval = 100000; /* 100ms default */ + q->target = 5000; /* 5ms: codel RFC argues + * for 5 to 10% of interval + */ + + q->cur_tin = 0; + q->cur_flow = 0; + + qdisc_watchdog_init(&q->watchdog, sch); + + if (opt) { + int err = cake_change(sch, opt, extack); + + if (err) + return err; + } + + err = tcf_block_get(&q->block, &q->filter_list, sch, extack); + if (err) + return err; + + quantum_div[0] = ~0; + for (i = 1; i <= CAKE_QUEUES; i++) + quantum_div[i] = 65535 / i; + + q->tins = kvzalloc(CAKE_MAX_TINS * sizeof(struct cake_tin_data), + GFP_KERNEL); + if (!q->tins) + goto nomem; + + for (i = 0; i < CAKE_MAX_TINS; i++) { + 
struct cake_tin_data *b = q->tins + i; + + INIT_LIST_HEAD(&b->new_flows); + INIT_LIST_HEAD(&b->old_flows); + INIT_LIST_HEAD(&b->decaying_flows); + b->sparse_flow_count = 0; + b->bulk_flow_count = 0; + b->decaying_flow_count = 0; + + for (j = 0; j < CAKE_QUEUES; j++) { + struct cake_flow *flow = b->flows + j; + u32 k = j * CAKE_MAX_TINS + i; + + INIT_LIST_HEAD(&flow->flowchain); + cobalt_vars_init(&flow->cvars); + + q->overflow_heap[k].t = i; + q->overflow_heap[k].b = j; + b->overflow_idx[j] = k; + } + } + + cake_reconfigure(sch); + q->avg_peak_bandwidth = q->rate_bps; + q->min_netlen = ~0; + q->min_adjlen = ~0; + return 0; + +nomem: + cake_destroy(sch); + return -ENOMEM; +} + +static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (!opts) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps, + TCA_CAKE_PAD)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, + q->flow_mode & CAKE_FLOW_MASK)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_AUTORATE, + !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS))) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_INGRESS, + !!(q->rate_flags & CAKE_FLAG_INGRESS))) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_NAT, + !!(q->flow_mode & CAKE_FLOW_NAT_FLAG))) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_WASH, + !!(q->rate_flags & CAKE_FLAG_WASH))) + goto nla_put_failure; + + if (nla_put_u32(skb, 
TCA_CAKE_OVERHEAD, q->rate_overhead)) + goto nla_put_failure; + + if (!(q->rate_flags & CAKE_FLAG_OVERHEAD)) + if (nla_put_u32(skb, TCA_CAKE_RAW, 0)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO, + !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO))) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + return -1; +} + +static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct nlattr *stats = nla_nest_start(d->skb, TCA_STATS_APP); + struct cake_sched_data *q = qdisc_priv(sch); + struct nlattr *tstats, *ts; + int i; + + if (!stats) + return -1; + +#define PUT_STAT_U32(attr, data) do { \ + if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ + goto nla_put_failure; \ + } while (0) +#define PUT_STAT_U64(attr, data) do { \ + if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \ + data, TCA_CAKE_STATS_PAD)) \ + goto nla_put_failure; \ + } while (0) + + PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth); + PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit); + PUT_STAT_U32(MEMORY_USED, q->buffer_max_used); + PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16)); + PUT_STAT_U32(MAX_NETLEN, q->max_netlen); + PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen); + PUT_STAT_U32(MIN_NETLEN, q->min_netlen); + PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen); + +#undef PUT_STAT_U32 +#undef PUT_STAT_U64 + + tstats = nla_nest_start(d->skb, TCA_CAKE_STATS_TIN_STATS); + if (!tstats) + goto nla_put_failure; + +#define PUT_TSTAT_U32(attr, data) do { \ + if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \ + goto nla_put_failure; \ + } while (0) +#define PUT_TSTAT_U64(attr, data) do { \ + if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \ + data, TCA_CAKE_TIN_STATS_PAD)) \ + goto nla_put_failure; \ + } while (0) + + for (i = 0; i < q->tin_cnt; i++) { + struct 
cake_tin_data *b = &q->tins[q->tin_order[i]]; + + ts = nla_nest_start(d->skb, i + 1); + if (!ts) + goto nla_put_failure; + + PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps); + PUT_TSTAT_U64(SENT_BYTES64, b->bytes); + PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog); + + PUT_TSTAT_U32(TARGET_US, + ktime_to_us(ns_to_ktime(b->cparams.target))); + PUT_TSTAT_U32(INTERVAL_US, + ktime_to_us(ns_to_ktime(b->cparams.interval))); + + PUT_TSTAT_U32(SENT_PACKETS, b->packets); + PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped); + PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark); + PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops); + + PUT_TSTAT_U32(PEAK_DELAY_US, + ktime_to_us(ns_to_ktime(b->peak_delay))); + PUT_TSTAT_U32(AVG_DELAY_US, + ktime_to_us(ns_to_ktime(b->avge_delay))); + PUT_TSTAT_U32(BASE_DELAY_US, + ktime_to_us(ns_to_ktime(b->base_delay))); + + PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits); + PUT_TSTAT_U32(WAY_MISSES, b->way_misses); + PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions); + + PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count + + b->decaying_flow_count); + PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count); + PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count); + PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen); + + PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum); + nla_nest_end(d->skb, ts); + } + +#undef PUT_TSTAT_U32 +#undef PUT_TSTAT_U64 + + nla_nest_end(d->skb, tstats); + return nla_nest_end(d->skb, stats); + +nla_put_failure: + nla_nest_cancel(d->skb, stats); + return -1; +} + +static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + +static unsigned long cake_find(struct Qdisc *sch, u32 classid) +{ + return 0; +} + +static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return 0; +} + +static void cake_unbind(struct Qdisc *q, unsigned long cl) +{ +} + +static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl, + struct netlink_ext_ack *extack) +{ + struct cake_sched_data 
*q = qdisc_priv(sch); + + if (cl) + return NULL; + return q->block; +} + +static int cake_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + tcm->tcm_handle |= TC_H_MIN(cl); + return 0; +} + +static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct cake_sched_data *q = qdisc_priv(sch); + const struct cake_flow *flow = NULL; + struct gnet_stats_queue qs = { 0 }; + struct nlattr *stats; + u32 idx = cl - 1; + + if (idx < CAKE_QUEUES * q->tin_cnt) { + const struct cake_tin_data *b = \ + &q->tins[q->tin_order[idx / CAKE_QUEUES]]; + const struct sk_buff *skb; + + flow = &b->flows[idx % CAKE_QUEUES]; + + if (flow->head) { + sch_tree_lock(sch); + skb = flow->head; + while (skb) { + qs.qlen++; + skb = skb->next; + } + sch_tree_unlock(sch); + } + qs.backlog = b->backlogs[idx % CAKE_QUEUES]; + qs.drops = flow->dropped; + } + if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) + return -1; + if (flow) { + ktime_t now = ktime_get(); + + stats = nla_nest_start(d->skb, TCA_STATS_APP); + if (!stats) + return -1; + +#define PUT_STAT_U32(attr, data) do { \ + if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ + goto nla_put_failure; \ + } while (0) +#define PUT_STAT_S32(attr, data) do { \ + if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ + goto nla_put_failure; \ + } while (0) + + PUT_STAT_S32(DEFICIT, flow->deficit); + PUT_STAT_U32(DROPPING, flow->cvars.dropping); + PUT_STAT_U32(COBALT_COUNT, flow->cvars.count); + PUT_STAT_U32(P_DROP, flow->cvars.p_drop); + if (flow->cvars.p_drop) { + PUT_STAT_S32(BLUE_TIMER_US, + ktime_to_us( + ktime_sub(now, + flow->cvars.blue_timer))); + } + if (flow->cvars.dropping) { + PUT_STAT_S32(DROP_NEXT_US, + ktime_to_us( + ktime_sub(now, + flow->cvars.drop_next))); + } + + if (nla_nest_end(d->skb, stats) < 0) + return -1; + } + + return 0; + +nla_put_failure: + nla_nest_cancel(d->skb, stats); + return -1; +} + +static void cake_walk(struct Qdisc 
*sch, struct qdisc_walker *arg) +{ + struct cake_sched_data *q = qdisc_priv(sch); + unsigned int i, j; + + if (arg->stop) + return; + + for (i = 0; i < q->tin_cnt; i++) { + struct cake_tin_data *b = &q->tins[q->tin_order[i]]; + + for (j = 0; j < CAKE_QUEUES; j++) { + if (list_empty(&b->flows[j].flowchain) || + arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } + } +} + +static const struct Qdisc_class_ops cake_class_ops = { + .leaf = cake_leaf, + .find = cake_find, + .tcf_block = cake_tcf_block, + .bind_tcf = cake_bind, + .unbind_tcf = cake_unbind, + .dump = cake_dump_class, + .dump_stats = cake_dump_class_stats, + .walk = cake_walk, +}; + +static struct Qdisc_ops cake_qdisc_ops __read_mostly = { + .cl_ops = &cake_class_ops, + .id = "cake", + .priv_size = sizeof(struct cake_sched_data), + .enqueue = cake_enqueue, + .dequeue = cake_dequeue, + .peek = qdisc_peek_dequeued, + .init = cake_init, + .reset = cake_reset, + .destroy = cake_destroy, + .change = cake_change, + .dump = cake_dump, + .dump_stats = cake_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init cake_module_init(void) +{ + return register_qdisc(&cake_qdisc_ops); +} + +static void __exit cake_module_exit(void) +{ + unregister_qdisc(&cake_qdisc_ops); +} + +module_init(cake_module_init) +module_exit(cake_module_exit) +MODULE_AUTHOR("Jonathan Morton"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("The CAKE shaper."); diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index cdd96b9a27bc..e26a24017faa 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -78,18 +78,42 @@ struct cbs_sched_data { s64 sendslope; /* in bytes/s */ s64 idleslope; /* in bytes/s */ struct qdisc_watchdog watchdog; - int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch); + int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free); struct sk_buff *(*dequeue)(struct Qdisc 
*sch); + struct Qdisc *qdisc; }; -static int cbs_enqueue_offload(struct sk_buff *skb, struct Qdisc *sch) +static int cbs_child_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct Qdisc *child, + struct sk_buff **to_free) { - return qdisc_enqueue_tail(skb, sch); + int err; + + err = child->ops->enqueue(skb, child, to_free); + if (err != NET_XMIT_SUCCESS) + return err; + + qdisc_qstats_backlog_inc(sch, skb); + sch->q.qlen++; + + return NET_XMIT_SUCCESS; } -static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch) +static int cbs_enqueue_offload(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct cbs_sched_data *q = qdisc_priv(sch); + struct Qdisc *qdisc = q->qdisc; + + return cbs_child_enqueue(skb, sch, qdisc, to_free); +} + +static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + struct Qdisc *qdisc = q->qdisc; if (sch->q.qlen == 0 && q->credits > 0) { /* We need to stop accumulating credits when there's @@ -99,7 +123,7 @@ static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch) q->last = ktime_get_ns(); } - return qdisc_enqueue_tail(skb, sch); + return cbs_child_enqueue(skb, sch, qdisc, to_free); } static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch, @@ -107,7 +131,7 @@ static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch, { struct cbs_sched_data *q = qdisc_priv(sch); - return q->enqueue(skb, sch); + return q->enqueue(skb, sch, to_free); } /* timediff is in ns, slope is in bytes/s */ @@ -132,9 +156,25 @@ static s64 credits_from_len(unsigned int len, s64 slope, s64 port_rate) return div64_s64(len * slope, port_rate); } +static struct sk_buff *cbs_child_dequeue(struct Qdisc *sch, struct Qdisc *child) +{ + struct sk_buff *skb; + + skb = child->ops->dequeue(child); + if (!skb) + return NULL; + + qdisc_qstats_backlog_dec(sch, skb); + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + + return skb; +} + static 
struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) { struct cbs_sched_data *q = qdisc_priv(sch); + struct Qdisc *qdisc = q->qdisc; s64 now = ktime_get_ns(); struct sk_buff *skb; s64 credits; @@ -157,8 +197,7 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) return NULL; } } - - skb = qdisc_dequeue_head(sch); + skb = cbs_child_dequeue(sch, qdisc); if (!skb) return NULL; @@ -178,7 +217,10 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) static struct sk_buff *cbs_dequeue_offload(struct Qdisc *sch) { - return qdisc_dequeue_head(sch); + struct cbs_sched_data *q = qdisc_priv(sch); + struct Qdisc *qdisc = q->qdisc; + + return cbs_child_dequeue(sch, qdisc); } static struct sk_buff *cbs_dequeue(struct Qdisc *sch) @@ -310,6 +352,13 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt, return -EINVAL; } + q->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + sch->handle, extack); + if (!q->qdisc) + return -ENOMEM; + + qdisc_hash_add(q->qdisc, false); + q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); q->enqueue = cbs_enqueue_soft; @@ -328,6 +377,9 @@ static void cbs_destroy(struct Qdisc *sch) qdisc_watchdog_cancel(&q->watchdog); cbs_disable_offload(dev, q); + + if (q->qdisc) + qdisc_destroy(q->qdisc); } static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -356,8 +408,72 @@ nla_put_failure: return -1; } +static int cbs_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + if (cl != 1 || !q->qdisc) /* only one class */ + return -ENOENT; + + tcm->tcm_handle |= TC_H_MIN(1); + tcm->tcm_info = q->qdisc->handle; + + return 0; +} + +static int cbs_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old, struct netlink_ext_ack *extack) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + if (!new) { + new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + sch->handle, NULL); + if (!new) + new = 
&noop_qdisc; + } + + *old = qdisc_replace(sch, new, &q->qdisc); + return 0; +} + +static struct Qdisc *cbs_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + return q->qdisc; +} + +static unsigned long cbs_find(struct Qdisc *sch, u32 classid) +{ + return 1; +} + +static void cbs_walk(struct Qdisc *sch, struct qdisc_walker *walker) +{ + if (!walker->stop) { + if (walker->count >= walker->skip) { + if (walker->fn(sch, 1, walker) < 0) { + walker->stop = 1; + return; + } + } + walker->count++; + } +} + +static const struct Qdisc_class_ops cbs_class_ops = { + .graft = cbs_graft, + .leaf = cbs_leaf, + .find = cbs_find, + .walk = cbs_walk, + .dump = cbs_dump_class, +}; + static struct Qdisc_ops cbs_qdisc_ops __read_mostly = { .id = "cbs", + .cl_ops = &cbs_class_ops, .priv_size = sizeof(struct cbs_sched_data), .enqueue = cbs_enqueue, .dequeue = cbs_dequeue, diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c new file mode 100644 index 000000000000..1538d6fa8165 --- /dev/null +++ b/net/sched/sch_etf.c @@ -0,0 +1,484 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* net/sched/sch_etf.c Earliest TxTime First queueing discipline. + * + * Authors: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com> + * Vinicius Costa Gomes <vinicius.gomes@intel.com> + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/errqueue.h> +#include <linux/rbtree.h> +#include <linux/skbuff.h> +#include <linux/posix-timers.h> +#include <net/netlink.h> +#include <net/sch_generic.h> +#include <net/pkt_sched.h> +#include <net/sock.h> + +#define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON) +#define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON) + +struct etf_sched_data { + bool offload; + bool deadline_mode; + int clockid; + int queue; + s32 delta; /* in ns */ + ktime_t last; /* The txtime of the last skb sent to the netdevice. 
*/ + struct rb_root head; + struct qdisc_watchdog watchdog; + ktime_t (*get_time)(void); +}; + +static const struct nla_policy etf_policy[TCA_ETF_MAX + 1] = { + [TCA_ETF_PARMS] = { .len = sizeof(struct tc_etf_qopt) }, +}; + +static inline int validate_input_params(struct tc_etf_qopt *qopt, + struct netlink_ext_ack *extack) +{ + /* Check if params comply to the following rules: + * * Clockid and delta must be valid. + * + * * Dynamic clockids are not supported. + * + * * Delta must be a positive integer. + * + * Also note that for the HW offload case, we must + * expect that system clocks have been synchronized to PHC. + */ + if (qopt->clockid < 0) { + NL_SET_ERR_MSG(extack, "Dynamic clockids are not supported"); + return -ENOTSUPP; + } + + if (qopt->clockid != CLOCK_TAI) { + NL_SET_ERR_MSG(extack, "Invalid clockid. CLOCK_TAI must be used"); + return -EINVAL; + } + + if (qopt->delta < 0) { + NL_SET_ERR_MSG(extack, "Delta must be positive"); + return -EINVAL; + } + + return 0; +} + +static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb) +{ + struct etf_sched_data *q = qdisc_priv(sch); + ktime_t txtime = nskb->tstamp; + struct sock *sk = nskb->sk; + ktime_t now; + + if (!sk) + return false; + + if (!sock_flag(sk, SOCK_TXTIME)) + return false; + + /* We don't perform crosstimestamping. + * Drop if packet's clockid differs from qdisc's. 
+ */ + if (sk->sk_clockid != q->clockid) + return false; + + if (sk->sk_txtime_deadline_mode != q->deadline_mode) + return false; + + now = q->get_time(); + if (ktime_before(txtime, now) || ktime_before(txtime, q->last)) + return false; + + return true; +} + +static struct sk_buff *etf_peek_timesortedlist(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct rb_node *p; + + p = rb_first(&q->head); + if (!p) + return NULL; + + return rb_to_skb(p); +} + +static void reset_watchdog(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb = etf_peek_timesortedlist(sch); + ktime_t next; + + if (!skb) + return; + + next = ktime_sub_ns(skb->tstamp, q->delta); + qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next)); +} + +static void report_sock_error(struct sk_buff *skb, u32 err, u8 code) +{ + struct sock_exterr_skb *serr; + struct sk_buff *clone; + ktime_t txtime = skb->tstamp; + + if (!skb->sk || !(skb->sk->sk_txtime_report_errors)) + return; + + clone = skb_clone(skb, GFP_ATOMIC); + if (!clone) + return; + + serr = SKB_EXT_ERR(clone); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_TXTIME; + serr->ee.ee_type = 0; + serr->ee.ee_code = code; + serr->ee.ee_pad = 0; + serr->ee.ee_data = (txtime >> 32); /* high part of tstamp */ + serr->ee.ee_info = txtime; /* low part of tstamp */ + + if (sock_queue_err_skb(skb->sk, clone)) + kfree_skb(clone); +} + +static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct rb_node **p = &q->head.rb_node, *parent = NULL; + ktime_t txtime = nskb->tstamp; + + if (!is_packet_valid(sch, nskb)) { + report_sock_error(nskb, EINVAL, + SO_EE_CODE_TXTIME_INVALID_PARAM); + return qdisc_drop(nskb, sch, to_free); + } + + while (*p) { + struct sk_buff *skb; + + parent = *p; + skb = rb_to_skb(parent); + if (ktime_after(txtime, skb->tstamp)) + p = &parent->rb_right; + else + 
p = &parent->rb_left; + } + rb_link_node(&nskb->rbnode, parent, p); + rb_insert_color(&nskb->rbnode, &q->head); + + qdisc_qstats_backlog_inc(sch, nskb); + sch->q.qlen++; + + /* Now we may need to re-arm the qdisc watchdog for the next packet. */ + reset_watchdog(sch); + + return NET_XMIT_SUCCESS; +} + +static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb, + bool drop) +{ + struct etf_sched_data *q = qdisc_priv(sch); + + rb_erase(&skb->rbnode, &q->head); + + /* The rbnode field in the skb re-uses these fields, now that + * we are done with the rbnode, reset them. + */ + skb->next = NULL; + skb->prev = NULL; + skb->dev = qdisc_dev(sch); + + qdisc_qstats_backlog_dec(sch, skb); + + if (drop) { + struct sk_buff *to_free = NULL; + + report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED); + + qdisc_drop(skb, sch, &to_free); + kfree_skb_list(to_free); + qdisc_qstats_overlimit(sch); + } else { + qdisc_bstats_update(sch, skb); + + q->last = skb->tstamp; + } + + sch->q.qlen--; +} + +static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + ktime_t now, next; + + skb = etf_peek_timesortedlist(sch); + if (!skb) + return NULL; + + now = q->get_time(); + + /* Drop if packet has expired while in queue. */ + if (ktime_before(skb->tstamp, now)) { + timesortedlist_erase(sch, skb, true); + skb = NULL; + goto out; + } + + /* When in deadline mode, dequeue as soon as possible and change the + * txtime from deadline to (now + delta). + */ + if (q->deadline_mode) { + timesortedlist_erase(sch, skb, false); + skb->tstamp = now; + goto out; + } + + next = ktime_sub_ns(skb->tstamp, q->delta); + + /* Dequeue only if now is within the [txtime - delta, txtime] range. */ + if (ktime_after(now, next)) + timesortedlist_erase(sch, skb, false); + else + skb = NULL; + +out: + /* Now we may need to re-arm the qdisc watchdog for the next packet. 
*/ + reset_watchdog(sch); + + return skb; +} + +static void etf_disable_offload(struct net_device *dev, + struct etf_sched_data *q) +{ + struct tc_etf_qopt_offload etf = { }; + const struct net_device_ops *ops; + int err; + + if (!q->offload) + return; + + ops = dev->netdev_ops; + if (!ops->ndo_setup_tc) + return; + + etf.queue = q->queue; + etf.enable = 0; + + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf); + if (err < 0) + pr_warn("Couldn't disable ETF offload for queue %d\n", + etf.queue); +} + +static int etf_enable_offload(struct net_device *dev, struct etf_sched_data *q, + struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct tc_etf_qopt_offload etf = { }; + int err; + + if (q->offload) + return 0; + + if (!ops->ndo_setup_tc) { + NL_SET_ERR_MSG(extack, "Specified device does not support ETF offload"); + return -EOPNOTSUPP; + } + + etf.queue = q->queue; + etf.enable = 1; + + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Specified device failed to setup ETF hardware offload"); + return err; + } + + return 0; +} + +static int etf_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct nlattr *tb[TCA_ETF_MAX + 1]; + struct tc_etf_qopt *qopt; + int err; + + if (!opt) { + NL_SET_ERR_MSG(extack, + "Missing ETF qdisc options which are mandatory"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_ETF_MAX, opt, etf_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_ETF_PARMS]) { + NL_SET_ERR_MSG(extack, "Missing mandatory ETF parameters"); + return -EINVAL; + } + + qopt = nla_data(tb[TCA_ETF_PARMS]); + + pr_debug("delta %d clockid %d offload %s deadline %s\n", + qopt->delta, qopt->clockid, + OFFLOAD_IS_ON(qopt) ? "on" : "off", + DEADLINE_MODE_IS_ON(qopt) ? 
"on" : "off"); + + err = validate_input_params(qopt, extack); + if (err < 0) + return err; + + q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); + + if (OFFLOAD_IS_ON(qopt)) { + err = etf_enable_offload(dev, q, extack); + if (err < 0) + return err; + } + + /* Everything went OK, save the parameters used. */ + q->delta = qopt->delta; + q->clockid = qopt->clockid; + q->offload = OFFLOAD_IS_ON(qopt); + q->deadline_mode = DEADLINE_MODE_IS_ON(qopt); + + switch (q->clockid) { + case CLOCK_REALTIME: + q->get_time = ktime_get_real; + break; + case CLOCK_MONOTONIC: + q->get_time = ktime_get; + break; + case CLOCK_BOOTTIME: + q->get_time = ktime_get_boottime; + break; + case CLOCK_TAI: + q->get_time = ktime_get_clocktai; + break; + default: + NL_SET_ERR_MSG(extack, "Clockid is not supported"); + return -ENOTSUPP; + } + + qdisc_watchdog_init_clockid(&q->watchdog, sch, q->clockid); + + return 0; +} + +static void timesortedlist_clear(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct rb_node *p = rb_first(&q->head); + + while (p) { + struct sk_buff *skb = rb_to_skb(p); + + p = rb_next(p); + + rb_erase(&skb->rbnode, &q->head); + rtnl_kfree_skbs(skb, skb); + sch->q.qlen--; + } +} + +static void etf_reset(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + + /* Only cancel watchdog if it's been initialized. */ + if (q->watchdog.qdisc == sch) + qdisc_watchdog_cancel(&q->watchdog); + + /* No matter which mode we are on, it's safe to clear both lists. */ + timesortedlist_clear(sch); + __qdisc_reset_queue(&sch->q); + + sch->qstats.backlog = 0; + sch->q.qlen = 0; + + q->last = 0; +} + +static void etf_destroy(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + + /* Only cancel watchdog if it's been initialized. 
*/ + if (q->watchdog.qdisc == sch) + qdisc_watchdog_cancel(&q->watchdog); + + etf_disable_offload(dev, q); +} + +static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct tc_etf_qopt opt = { }; + struct nlattr *nest; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + opt.delta = q->delta; + opt.clockid = q->clockid; + if (q->offload) + opt.flags |= TC_ETF_OFFLOAD_ON; + + if (q->deadline_mode) + opt.flags |= TC_ETF_DEADLINE_MODE_ON; + + if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static struct Qdisc_ops etf_qdisc_ops __read_mostly = { + .id = "etf", + .priv_size = sizeof(struct etf_sched_data), + .enqueue = etf_enqueue_timesortedlist, + .dequeue = etf_dequeue_timesortedlist, + .peek = etf_peek_timesortedlist, + .init = etf_init, + .reset = etf_reset, + .destroy = etf_destroy, + .dump = etf_dump, + .owner = THIS_MODULE, +}; + +static int __init etf_module_init(void) +{ + return register_qdisc(&etf_qdisc_ops); +} + +static void __exit etf_module_exit(void) +{ + unregister_qdisc(&etf_qdisc_ops); +} +module_init(etf_module_init) +module_exit(etf_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index cd2e0e342fb6..6c0a9d5dbf94 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -479,24 +479,28 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt, q->cparams.mtu = psched_mtu(qdisc_dev(sch)); if (opt) { - int err = fq_codel_change(sch, opt, extack); + err = fq_codel_change(sch, opt, extack); if (err) - return err; + goto init_failure; } err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) - return err; + goto init_failure; if (!q->flows) { q->flows = kvcalloc(q->flows_cnt, sizeof(struct fq_codel_flow), GFP_KERNEL); - if (!q->flows) - return 
-ENOMEM; + if (!q->flows) { + err = -ENOMEM; + goto init_failure; + } q->backlogs = kvcalloc(q->flows_cnt, sizeof(u32), GFP_KERNEL); - if (!q->backlogs) - return -ENOMEM; + if (!q->backlogs) { + err = -ENOMEM; + goto alloc_failure; + } for (i = 0; i < q->flows_cnt; i++) { struct fq_codel_flow *flow = q->flows + i; @@ -509,6 +513,13 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt, else sch->flags &= ~TCQ_F_CAN_BYPASS; return 0; + +alloc_failure: + kvfree(q->flows); + q->flows = NULL; +init_failure: + q->flows_cnt = 0; + return err; } static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 3ae9877ea205..3278a76f6861 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1385,8 +1385,8 @@ hfsc_schedule_watchdog(struct Qdisc *sch) if (next_time == 0 || next_time > q->root.cl_cfmin) next_time = q->root.cl_cfmin; } - WARN_ON(next_time == 0); - qdisc_watchdog_schedule(&q->watchdog, next_time); + if (next_time) + qdisc_watchdog_schedule(&q->watchdog, next_time); } static int diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 2a4ab7caf553..43c4bfe625a9 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -126,7 +126,6 @@ struct htb_class { union { struct htb_class_leaf { - struct list_head drop_list; int deficit[TC_HTB_MAXDEPTH]; struct Qdisc *q; } leaf; @@ -171,7 +170,6 @@ struct htb_sched { struct qdisc_watchdog watchdog; s64 now; /* cached dequeue time */ - struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */ /* time of nearest event per level (row) */ s64 near_ev_cache[TC_HTB_MAXDEPTH]; @@ -562,8 +560,6 @@ static inline void htb_activate(struct htb_sched *q, struct htb_class *cl) if (!cl->prio_activity) { cl->prio_activity = 1 << cl->prio; htb_activate_prios(q, cl); - list_add_tail(&cl->un.leaf.drop_list, - q->drops + cl->prio); } } @@ -579,7 +575,6 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) 
htb_deactivate_prios(q, cl); cl->prio_activity = 0; - list_del_init(&cl->un.leaf.drop_list); } static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch, @@ -981,7 +976,6 @@ static void htb_reset(struct Qdisc *sch) else { if (cl->un.leaf.q) qdisc_reset(cl->un.leaf.q); - INIT_LIST_HEAD(&cl->un.leaf.drop_list); } cl->prio_activity = 0; cl->cmode = HTB_CAN_SEND; @@ -993,8 +987,6 @@ static void htb_reset(struct Qdisc *sch) sch->qstats.backlog = 0; memset(q->hlevel, 0, sizeof(q->hlevel)); memset(q->row_mask, 0, sizeof(q->row_mask)); - for (i = 0; i < TC_HTB_NUMPRIO; i++) - INIT_LIST_HEAD(q->drops + i); } static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = { @@ -1024,7 +1016,6 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt, struct nlattr *tb[TCA_HTB_MAX + 1]; struct tc_htb_glob *gopt; int err; - int i; qdisc_watchdog_init(&q->watchdog, sch); INIT_WORK(&q->work, htb_work_func); @@ -1050,8 +1041,6 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt, err = qdisc_class_hash_init(&q->clhash); if (err < 0) return err; - for (i = 0; i < TC_HTB_NUMPRIO; i++) - INIT_LIST_HEAD(q->drops + i); qdisc_skb_head_init(&q->direct_queue); @@ -1224,7 +1213,6 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl, parent->level = 0; memset(&parent->un.inner, 0, sizeof(parent->un.inner)); - INIT_LIST_HEAD(&parent->un.leaf.drop_list); parent->un.leaf.q = new_q ? 
new_q : &noop_qdisc; parent->tokens = parent->buffer; parent->ctokens = parent->cbuffer; @@ -1418,7 +1406,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, } cl->children = 0; - INIT_LIST_HEAD(&cl->un.leaf.drop_list); RB_CLEAR_NODE(&cl->pq_node); for (prio = 0; prio < TC_HTB_NUMPRIO; prio++) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 7d6801fc5340..ad18a2052416 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -68,6 +68,11 @@ Fabio Ludovici <fabio.ludovici at yahoo.it> */ +struct disttable { + u32 size; + s16 table[0]; +}; + struct netem_sched_data { /* internal t(ime)fifo qdisc uses t_root and sch->limit */ struct rb_root t_root; @@ -99,10 +104,7 @@ struct netem_sched_data { u32 rho; } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor; - struct disttable { - u32 size; - s16 table[0]; - } *delay_dist; + struct disttable *delay_dist; enum { CLG_RANDOM, @@ -142,6 +144,7 @@ struct netem_sched_data { s32 bytes_left; } slot; + struct disttable *slot_dist; }; /* Time stamp put into socket buffer control block @@ -180,7 +183,7 @@ static u32 get_crandom(struct crndstate *state) u64 value, rho; unsigned long answer; - if (state->rho == 0) /* no correlation */ + if (!state || state->rho == 0) /* no correlation */ return prandom_u32(); value = prandom_u32(); @@ -601,10 +604,19 @@ finish_segs: static void get_slot_next(struct netem_sched_data *q, u64 now) { - q->slot.slot_next = now + q->slot_config.min_delay + - (prandom_u32() * - (q->slot_config.max_delay - - q->slot_config.min_delay) >> 32); + s64 next_delay; + + if (!q->slot_dist) + next_delay = q->slot_config.min_delay + + (prandom_u32() * + (q->slot_config.max_delay - + q->slot_config.min_delay) >> 32); + else + next_delay = tabledist(q->slot_config.dist_delay, + (s32)(q->slot_config.dist_jitter), + NULL, q->slot_dist); + + q->slot.slot_next = now + next_delay; q->slot.packets_left = q->slot_config.max_packets; q->slot.bytes_left = q->slot_config.max_bytes; } @@ 
-721,9 +733,9 @@ static void dist_free(struct disttable *d) * signed 16 bit values. */ -static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) +static int get_dist_table(struct Qdisc *sch, struct disttable **tbl, + const struct nlattr *attr) { - struct netem_sched_data *q = qdisc_priv(sch); size_t n = nla_len(attr)/sizeof(__s16); const __s16 *data = nla_data(attr); spinlock_t *root_lock; @@ -744,7 +756,7 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) root_lock = qdisc_root_sleeping_lock(sch); spin_lock_bh(root_lock); - swap(q->delay_dist, d); + swap(*tbl, d); spin_unlock_bh(root_lock); dist_free(d); @@ -762,7 +774,8 @@ static void get_slot(struct netem_sched_data *q, const struct nlattr *attr) q->slot_config.max_bytes = INT_MAX; q->slot.packets_left = q->slot_config.max_packets; q->slot.bytes_left = q->slot_config.max_bytes; - if (q->slot_config.min_delay | q->slot_config.max_delay) + if (q->slot_config.min_delay | q->slot_config.max_delay | + q->slot_config.dist_jitter) q->slot.slot_next = ktime_get_ns(); else q->slot.slot_next = 0; @@ -926,16 +939,17 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, } if (tb[TCA_NETEM_DELAY_DIST]) { - ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]); - if (ret) { - /* recover clg and loss_model, in case of - * q->clg and q->loss_model were modified - * in get_loss_clg() - */ - q->clg = old_clg; - q->loss_model = old_loss_model; - return ret; - } + ret = get_dist_table(sch, &q->delay_dist, + tb[TCA_NETEM_DELAY_DIST]); + if (ret) + goto get_table_failure; + } + + if (tb[TCA_NETEM_SLOT_DIST]) { + ret = get_dist_table(sch, &q->slot_dist, + tb[TCA_NETEM_SLOT_DIST]); + if (ret) + goto get_table_failure; } sch->limit = qopt->limit; @@ -983,6 +997,15 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, get_slot(q, tb[TCA_NETEM_SLOT]); return ret; + +get_table_failure: + /* recover clg and loss_model, in case of + * q->clg and q->loss_model were modified + * in 
get_loss_clg() + */ + q->clg = old_clg; + q->loss_model = old_loss_model; + return ret; } static int netem_init(struct Qdisc *sch, struct nlattr *opt, @@ -1011,6 +1034,7 @@ static void netem_destroy(struct Qdisc *sch) if (q->qdisc) qdisc_destroy(q->qdisc); dist_free(q->delay_dist); + dist_free(q->slot_dist); } static int dump_loss_model(const struct netem_sched_data *q, @@ -1127,7 +1151,8 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) if (dump_loss_model(q, skb) != 0) goto nla_put_failure; - if (q->slot_config.min_delay | q->slot_config.max_delay) { + if (q->slot_config.min_delay | q->slot_config.max_delay | + q->slot_config.dist_jitter) { slot = q->slot_config; if (slot.max_packets == INT_MAX) slot.max_packets = 0; diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c new file mode 100644 index 000000000000..52c0b6d8f1d7 --- /dev/null +++ b/net/sched/sch_skbprio.c @@ -0,0 +1,320 @@ +/* + * net/sched/sch_skbprio.c SKB Priority Queue. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Nishanth Devarajan, <ndev2021@gmail.com> + * Cody Doucette, <doucette@bu.edu> + * original idea by Michel Machado, Cody Doucette, and Qiaobin Fu + */ + +#include <linux/string.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <net/pkt_sched.h> +#include <net/sch_generic.h> +#include <net/inet_ecn.h> + +/* SKB Priority Queue + * ================================= + * + * Skbprio (SKB Priority Queue) is a queueing discipline that prioritizes + * packets according to their skb->priority field. 
Under congestion, + * Skbprio drops already-enqueued lower priority packets to make space + * available for higher priority packets; it was conceived as a solution + * for denial-of-service defenses that need to route packets with different + * priorities as a mean to overcome DoS attacks. + */ + +struct skbprio_sched_data { + /* Queue state. */ + struct sk_buff_head qdiscs[SKBPRIO_MAX_PRIORITY]; + struct gnet_stats_queue qstats[SKBPRIO_MAX_PRIORITY]; + u16 highest_prio; + u16 lowest_prio; +}; + +static u16 calc_new_high_prio(const struct skbprio_sched_data *q) +{ + int prio; + + for (prio = q->highest_prio - 1; prio >= q->lowest_prio; prio--) { + if (!skb_queue_empty(&q->qdiscs[prio])) + return prio; + } + + /* SKB queue is empty, return 0 (default highest priority setting). */ + return 0; +} + +static u16 calc_new_low_prio(const struct skbprio_sched_data *q) +{ + int prio; + + for (prio = q->lowest_prio + 1; prio <= q->highest_prio; prio++) { + if (!skb_queue_empty(&q->qdiscs[prio])) + return prio; + } + + /* SKB queue is empty, return SKBPRIO_MAX_PRIORITY - 1 + * (default lowest priority setting). + */ + return SKBPRIO_MAX_PRIORITY - 1; +} + +static int skbprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + const unsigned int max_priority = SKBPRIO_MAX_PRIORITY - 1; + struct skbprio_sched_data *q = qdisc_priv(sch); + struct sk_buff_head *qdisc; + struct sk_buff_head *lp_qdisc; + struct sk_buff *to_drop; + u16 prio, lp; + + /* Obtain the priority of @skb. */ + prio = min(skb->priority, max_priority); + + qdisc = &q->qdiscs[prio]; + if (sch->q.qlen < sch->limit) { + __skb_queue_tail(qdisc, skb); + qdisc_qstats_backlog_inc(sch, skb); + q->qstats[prio].backlog += qdisc_pkt_len(skb); + + /* Check to update highest and lowest priorities. 
*/ + if (prio > q->highest_prio) + q->highest_prio = prio; + + if (prio < q->lowest_prio) + q->lowest_prio = prio; + + sch->q.qlen++; + return NET_XMIT_SUCCESS; + } + + /* If this packet has the lowest priority, drop it. */ + lp = q->lowest_prio; + if (prio <= lp) { + q->qstats[prio].drops++; + q->qstats[prio].overlimits++; + return qdisc_drop(skb, sch, to_free); + } + + __skb_queue_tail(qdisc, skb); + qdisc_qstats_backlog_inc(sch, skb); + q->qstats[prio].backlog += qdisc_pkt_len(skb); + + /* Drop the packet at the tail of the lowest priority qdisc. */ + lp_qdisc = &q->qdiscs[lp]; + to_drop = __skb_dequeue_tail(lp_qdisc); + BUG_ON(!to_drop); + qdisc_qstats_backlog_dec(sch, to_drop); + qdisc_drop(to_drop, sch, to_free); + + q->qstats[lp].backlog -= qdisc_pkt_len(to_drop); + q->qstats[lp].drops++; + q->qstats[lp].overlimits++; + + /* Check to update highest and lowest priorities. */ + if (skb_queue_empty(lp_qdisc)) { + if (q->lowest_prio == q->highest_prio) { + /* The incoming packet is the only packet in queue. */ + BUG_ON(sch->q.qlen != 1); + q->lowest_prio = prio; + q->highest_prio = prio; + } else { + q->lowest_prio = calc_new_low_prio(q); + } + } + + if (prio > q->highest_prio) + q->highest_prio = prio; + + return NET_XMIT_CN; +} + +static struct sk_buff *skbprio_dequeue(struct Qdisc *sch) +{ + struct skbprio_sched_data *q = qdisc_priv(sch); + struct sk_buff_head *hpq = &q->qdiscs[q->highest_prio]; + struct sk_buff *skb = __skb_dequeue(hpq); + + if (unlikely(!skb)) + return NULL; + + sch->q.qlen--; + qdisc_qstats_backlog_dec(sch, skb); + qdisc_bstats_update(sch, skb); + + q->qstats[q->highest_prio].backlog -= qdisc_pkt_len(skb); + + /* Update highest priority field. 
*/ + if (skb_queue_empty(hpq)) { + if (q->lowest_prio == q->highest_prio) { + BUG_ON(sch->q.qlen); + q->highest_prio = 0; + q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1; + } else { + q->highest_prio = calc_new_high_prio(q); + } + } + return skb; +} + +static int skbprio_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct tc_skbprio_qopt *ctl = nla_data(opt); + + sch->limit = ctl->limit; + return 0; +} + +static int skbprio_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct skbprio_sched_data *q = qdisc_priv(sch); + int prio; + + /* Initialise all queues, one for each possible priority. */ + for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++) + __skb_queue_head_init(&q->qdiscs[prio]); + + memset(&q->qstats, 0, sizeof(q->qstats)); + q->highest_prio = 0; + q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1; + sch->limit = 64; + if (!opt) + return 0; + + return skbprio_change(sch, opt, extack); +} + +static int skbprio_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct tc_skbprio_qopt opt; + + opt.limit = sch->limit; + + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + return -1; + + return skb->len; +} + +static void skbprio_reset(struct Qdisc *sch) +{ + struct skbprio_sched_data *q = qdisc_priv(sch); + int prio; + + sch->qstats.backlog = 0; + sch->q.qlen = 0; + + for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++) + __skb_queue_purge(&q->qdiscs[prio]); + + memset(&q->qstats, 0, sizeof(q->qstats)); + q->highest_prio = 0; + q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1; +} + +static void skbprio_destroy(struct Qdisc *sch) +{ + struct skbprio_sched_data *q = qdisc_priv(sch); + int prio; + + for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++) + __skb_queue_purge(&q->qdiscs[prio]); +} + +static struct Qdisc *skbprio_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + +static unsigned long skbprio_find(struct Qdisc *sch, u32 classid) +{ + return 0; +} + +static int 
skbprio_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + tcm->tcm_handle |= TC_H_MIN(cl); + return 0; +} + +static int skbprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct skbprio_sched_data *q = qdisc_priv(sch); + if (gnet_stats_copy_queue(d, NULL, &q->qstats[cl - 1], + q->qstats[cl - 1].qlen) < 0) + return -1; + return 0; +} + +static void skbprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + unsigned int i; + + if (arg->stop) + return; + + for (i = 0; i < SKBPRIO_MAX_PRIORITY; i++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, i + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static const struct Qdisc_class_ops skbprio_class_ops = { + .leaf = skbprio_leaf, + .find = skbprio_find, + .dump = skbprio_dump_class, + .dump_stats = skbprio_dump_class_stats, + .walk = skbprio_walk, +}; + +static struct Qdisc_ops skbprio_qdisc_ops __read_mostly = { + .cl_ops = &skbprio_class_ops, + .id = "skbprio", + .priv_size = sizeof(struct skbprio_sched_data), + .enqueue = skbprio_enqueue, + .dequeue = skbprio_dequeue, + .peek = qdisc_peek_dequeued, + .init = skbprio_init, + .reset = skbprio_reset, + .change = skbprio_change, + .dump = skbprio_dump, + .destroy = skbprio_destroy, + .owner = THIS_MODULE, +}; + +static int __init skbprio_module_init(void) +{ + return register_qdisc(&skbprio_qdisc_ops); +} + +static void __exit skbprio_module_exit(void) +{ + unregister_qdisc(&skbprio_qdisc_ops); +} + +module_init(skbprio_module_init) +module_exit(skbprio_module_exit) + +MODULE_LICENSE("GPL"); diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig index c740b189d4ba..950ecf6e7439 100644 --- a/net/sctp/Kconfig +++ b/net/sctp/Kconfig @@ -41,8 +41,8 @@ config SCTP_DBG_OBJCNT bool "SCTP: Debug object counts" depends on PROC_FS help - If you say Y, this will enable debugging support for counting the - type of objects that are currently 
allocated. This is useful for + If you say Y, this will enable debugging support for counting the + type of objects that are currently allocated. This is useful for identifying memory leaks. This debug information can be viewed by 'cat /proc/net/sctp/sctp_dbg_objcnt' diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 5d5a16204d50..297d9cf960b9 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -115,6 +115,9 @@ static struct sctp_association *sctp_association_init( /* Initialize path max retrans value. */ asoc->pathmaxrxt = sp->pathmaxrxt; + asoc->flowlabel = sp->flowlabel; + asoc->dscp = sp->dscp; + /* Initialize default path MTU. */ asoc->pathmtu = sp->pathmtu; @@ -647,6 +650,18 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, peer->sackdelay = asoc->sackdelay; peer->sackfreq = asoc->sackfreq; + if (addr->sa.sa_family == AF_INET6) { + __be32 info = addr->v6.sin6_flowinfo; + + if (info) { + peer->flowlabel = ntohl(info & IPV6_FLOWLABEL_MASK); + peer->flowlabel |= SCTP_FLOWLABEL_SET_MASK; + } else { + peer->flowlabel = asoc->flowlabel; + } + } + peer->dscp = asoc->dscp; + /* Enable/disable heartbeat, SACK delay, and path MTU discovery * based on association setting. 
*/ diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 7339918a805d..fc6c5e4bffa5 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -209,12 +209,17 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) struct sock *sk = skb->sk; struct ipv6_pinfo *np = inet6_sk(sk); struct flowi6 *fl6 = &transport->fl.u.ip6; + __u8 tclass = np->tclass; int res; pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb, skb->len, &fl6->saddr, &fl6->daddr); - IP6_ECN_flow_xmit(sk, fl6->flowlabel); + if (transport->dscp & SCTP_DSCP_SET_MASK) + tclass = transport->dscp & SCTP_DSCP_VAL_MASK; + + if (INET_ECN_is_capable(tclass)) + IP6_ECN_flow_xmit(sk, fl6->flowlabel); if (!(transport->param_flags & SPP_PMTUD_ENABLE)) skb->ignore_df = 1; @@ -223,7 +228,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) rcu_read_lock(); res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt), - np->tclass); + tclass); rcu_read_unlock(); return res; } @@ -254,6 +259,17 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, fl6->flowi6_oif = daddr->v6.sin6_scope_id; else if (asoc) fl6->flowi6_oif = asoc->base.sk->sk_bound_dev_if; + if (t->flowlabel & SCTP_FLOWLABEL_SET_MASK) + fl6->flowlabel = htonl(t->flowlabel & SCTP_FLOWLABEL_VAL_MASK); + + if (np->sndflow && (fl6->flowlabel & IPV6_FLOWLABEL_MASK)) { + struct ip6_flowlabel *flowlabel; + + flowlabel = fl6_sock_lookup(sk, fl6->flowlabel); + if (!flowlabel) + goto out; + fl6_sock_release(flowlabel); + } pr_debug("%s: dst=%pI6 ", __func__, &fl6->daddr); @@ -1010,7 +1026,7 @@ static const struct proto_ops inet6_seqpacket_ops = { .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = sctp_getname, - .poll_mask = sctp_poll_mask, + .poll = sctp_poll, .ioctl = inet6_ioctl, .listen = sctp_inet_listen, .shutdown = inet_shutdown, diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 5dffbc493008..e948db29ab53 100644 --- 
a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -426,13 +426,16 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, struct dst_entry *dst = NULL; union sctp_addr *daddr = &t->ipaddr; union sctp_addr dst_saddr; + __u8 tos = inet_sk(sk)->tos; + if (t->dscp & SCTP_DSCP_SET_MASK) + tos = t->dscp & SCTP_DSCP_VAL_MASK; memset(fl4, 0x0, sizeof(struct flowi4)); fl4->daddr = daddr->v4.sin_addr.s_addr; fl4->fl4_dport = daddr->v4.sin_port; fl4->flowi4_proto = IPPROTO_SCTP; if (asoc) { - fl4->flowi4_tos = RT_CONN_FLAGS(asoc->base.sk); + fl4->flowi4_tos = RT_CONN_FLAGS_TOS(asoc->base.sk, tos); fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if; fl4->fl4_sport = htons(asoc->base.bind_addr.port); } @@ -495,7 +498,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, fl4->fl4_sport = laddr->a.v4.sin_port; flowi4_update_output(fl4, asoc->base.sk->sk_bound_dev_if, - RT_CONN_FLAGS(asoc->base.sk), + RT_CONN_FLAGS_TOS(asoc->base.sk, tos), daddr->v4.sin_addr.s_addr, laddr->a.v4.sin_addr.s_addr); @@ -971,16 +974,21 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *transport) { struct inet_sock *inet = inet_sk(skb->sk); + __u8 dscp = inet->tos; pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb, - skb->len, &transport->fl.u.ip4.saddr, &transport->fl.u.ip4.daddr); + skb->len, &transport->fl.u.ip4.saddr, + &transport->fl.u.ip4.daddr); + + if (transport->dscp & SCTP_DSCP_SET_MASK) + dscp = transport->dscp & SCTP_DSCP_VAL_MASK; inet->pmtudisc = transport->param_flags & SPP_PMTUD_ENABLE ? 
IP_PMTUDISC_DO : IP_PMTUDISC_DONT; SCTP_INC_STATS(sock_net(&inet->sk), SCTP_MIB_OUTSCTPPACKS); - return ip_queue_xmit(&inet->sk, skb, &transport->fl); + return __ip_queue_xmit(&inet->sk, skb, &transport->fl, dscp); } static struct sctp_af sctp_af_inet; @@ -1016,7 +1024,7 @@ static const struct proto_ops inet_seqpacket_ops = { .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet_getname, /* Semantics are different. */ - .poll_mask = sctp_poll_mask, + .poll = sctp_poll, .ioctl = inet_ioctl, .listen = sctp_inet_listen, .shutdown = inet_shutdown, /* Looks harmless. */ diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 298112ca8c06..85d393090238 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1827,4 +1827,3 @@ nomem: error = -ENOMEM; goto out; } - diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 0e91e83eea5a..502c0d7cb105 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1697,6 +1697,7 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags, struct sctp_association *asoc; enum sctp_scope scope; struct cmsghdr *cmsg; + __be32 flowinfo = 0; struct sctp_af *af; int err; @@ -1781,6 +1782,9 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags, if (!cmsgs->addrs_msg) return 0; + if (daddr->sa.sa_family == AF_INET6) + flowinfo = daddr->v6.sin6_flowinfo; + /* sendv addr list parse */ for_each_cmsghdr(cmsg, cmsgs->addrs_msg) { struct sctp_transport *transport; @@ -1813,6 +1817,7 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags, } dlen = sizeof(struct in6_addr); + daddr->v6.sin6_flowinfo = flowinfo; daddr->v6.sin6_family = AF_INET6; daddr->v6.sin6_port = htons(asoc->peer.port); memcpy(&daddr->v6.sin6_addr, CMSG_DATA(cmsg), dlen); @@ -2393,6 +2398,8 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, * uint32_t spp_pathmtu; * uint32_t spp_sackdelay; * uint32_t spp_flags; + * uint32_t spp_ipv6_flowlabel; + * uint8_t spp_dscp; * }; * * 
spp_assoc_id - (one-to-many style socket) This is filled in the @@ -2472,6 +2479,45 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, * also that this field is mutually exclusive to * SPP_SACKDELAY_ENABLE, setting both will have undefined * results. + * + * SPP_IPV6_FLOWLABEL: Setting this flag enables the + * setting of the IPV6 flow label value. The value is + * contained in the spp_ipv6_flowlabel field. + * Upon retrieval, this flag will be set to indicate that + * the spp_ipv6_flowlabel field has a valid value returned. + * If a specific destination address is set (in the + * spp_address field), then the value returned is that of + * the address. If just an association is specified (and + * no address), then the association's default flow label + * is returned. If neither an association nor a destination + * is specified, then the socket's default flow label is + * returned. For non-IPv6 sockets, this flag will be left + * cleared. + * + * SPP_DSCP: Setting this flag enables the setting of the + * Differentiated Services Code Point (DSCP) value + * associated with either the association or a specific + * address. The value is obtained in the spp_dscp field. + * Upon retrieval, this flag will be set to indicate that + * the spp_dscp field has a valid value returned. If a + * specific destination address is set when called (in the + * spp_address field), then that specific destination + * address's DSCP value is returned. If just an association + * is specified, then the association's default DSCP is + * returned. If neither an association nor a destination is + * specified, then the socket's default DSCP is returned. + * + * spp_ipv6_flowlabel + * - This field is used in conjunction with the + * SPP_IPV6_FLOWLABEL flag and contains the IPv6 flow label. + * The 20 least significant bits are used for the flow + * label. This setting has precedence over any IPv6-layer + * setting. 
+ * + * spp_dscp - This field is used in conjunction with the SPP_DSCP flag + * and contains the DSCP. The 6 most significant bits are + * used for the DSCP. This setting has precedence over any + * IPv4- or IPv6- layer setting. */ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, struct sctp_transport *trans, @@ -2611,6 +2657,51 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, } } + if (params->spp_flags & SPP_IPV6_FLOWLABEL) { + if (trans && trans->ipaddr.sa.sa_family == AF_INET6) { + trans->flowlabel = params->spp_ipv6_flowlabel & + SCTP_FLOWLABEL_VAL_MASK; + trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK; + } else if (asoc) { + list_for_each_entry(trans, + &asoc->peer.transport_addr_list, + transports) { + if (trans->ipaddr.sa.sa_family != AF_INET6) + continue; + trans->flowlabel = params->spp_ipv6_flowlabel & + SCTP_FLOWLABEL_VAL_MASK; + trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK; + } + asoc->flowlabel = params->spp_ipv6_flowlabel & + SCTP_FLOWLABEL_VAL_MASK; + asoc->flowlabel |= SCTP_FLOWLABEL_SET_MASK; + } else if (sctp_opt2sk(sp)->sk_family == AF_INET6) { + sp->flowlabel = params->spp_ipv6_flowlabel & + SCTP_FLOWLABEL_VAL_MASK; + sp->flowlabel |= SCTP_FLOWLABEL_SET_MASK; + } + } + + if (params->spp_flags & SPP_DSCP) { + if (trans) { + trans->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK; + trans->dscp |= SCTP_DSCP_SET_MASK; + } else if (asoc) { + list_for_each_entry(trans, + &asoc->peer.transport_addr_list, + transports) { + trans->dscp = params->spp_dscp & + SCTP_DSCP_VAL_MASK; + trans->dscp |= SCTP_DSCP_SET_MASK; + } + asoc->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK; + asoc->dscp |= SCTP_DSCP_SET_MASK; + } else { + sp->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK; + sp->dscp |= SCTP_DSCP_SET_MASK; + } + } + return 0; } @@ -2625,11 +2716,18 @@ static int sctp_setsockopt_peer_addr_params(struct sock *sk, int error; int hb_change, pmtud_change, sackdelay_change; - if (optlen != sizeof(struct sctp_paddrparams)) 
+ if (optlen == sizeof(params)) { + if (copy_from_user(&params, optval, optlen)) + return -EFAULT; + } else if (optlen == ALIGN(offsetof(struct sctp_paddrparams, + spp_ipv6_flowlabel), 4)) { + if (copy_from_user(&params, optval, optlen)) + return -EFAULT; + if (params.spp_flags & (SPP_DSCP | SPP_IPV6_FLOWLABEL)) + return -EINVAL; + } else { return -EINVAL; - - if (copy_from_user(&params, optval, optlen)) - return -EFAULT; + } /* Validate flags and value parameters. */ hb_change = params.spp_flags & SPP_HB; @@ -4170,6 +4268,28 @@ out: return retval; } +static int sctp_setsockopt_reuse_port(struct sock *sk, char __user *optval, + unsigned int optlen) +{ + int val; + + if (!sctp_style(sk, TCP)) + return -EOPNOTSUPP; + + if (sctp_sk(sk)->ep->base.bind_addr.port) + return -EFAULT; + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + sctp_sk(sk)->reuse = !!val; + + return 0; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4364,6 +4484,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, retval = sctp_setsockopt_interleaving_supported(sk, optval, optlen); break; + case SCTP_REUSE_PORT: + retval = sctp_setsockopt_reuse_port(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -5428,6 +5551,45 @@ out: * also that this field is mutually exclusive to * SPP_SACKDELAY_ENABLE, setting both will have undefined * results. + * + * SPP_IPV6_FLOWLABEL: Setting this flag enables the + * setting of the IPV6 flow label value. The value is + * contained in the spp_ipv6_flowlabel field. + * Upon retrieval, this flag will be set to indicate that + * the spp_ipv6_flowlabel field has a valid value returned. + * If a specific destination address is set (in the + * spp_address field), then the value returned is that of + * the address.
If just an association is specified (and + * no address), then the association's default flow label + * is returned. If neither an association nor a destination + * is specified, then the socket's default flow label is + * returned. For non-IPv6 sockets, this flag will be left + * cleared. + * + * SPP_DSCP: Setting this flag enables the setting of the + * Differentiated Services Code Point (DSCP) value + * associated with either the association or a specific + * address. The value is obtained in the spp_dscp field. + * Upon retrieval, this flag will be set to indicate that + * the spp_dscp field has a valid value returned. If a + * specific destination address is set when called (in the + * spp_address field), then that specific destination + * address's DSCP value is returned. If just an association + * is specified, then the association's default DSCP is + * returned. If neither an association nor a destination is + * specified, then the socket's default DSCP is returned. + * + * spp_ipv6_flowlabel + * - This field is used in conjunction with the + * SPP_IPV6_FLOWLABEL flag and contains the IPv6 flow label. + * The 20 least significant bits are used for the flow + * label. This setting has precedence over any IPv6-layer + * setting. + * + * spp_dscp - This field is used in conjunction with the SPP_DSCP flag + * and contains the DSCP. The 6 most significant bits are + * used for the DSCP. This setting has precedence over any + * IPv4- or IPv6- layer setting. 
*/ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, char __user *optval, int __user *optlen) @@ -5437,9 +5599,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, struct sctp_association *asoc = NULL; struct sctp_sock *sp = sctp_sk(sk); - if (len < sizeof(struct sctp_paddrparams)) + if (len >= sizeof(params)) + len = sizeof(params); + else if (len >= ALIGN(offsetof(struct sctp_paddrparams, + spp_ipv6_flowlabel), 4)) + len = ALIGN(offsetof(struct sctp_paddrparams, + spp_ipv6_flowlabel), 4); + else return -EINVAL; - len = sizeof(struct sctp_paddrparams); + if (copy_from_user(&params, optval, len)) return -EFAULT; @@ -5474,6 +5642,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, /*draft-11 doesn't say what to return in spp_flags*/ params.spp_flags = trans->param_flags; + if (trans->flowlabel & SCTP_FLOWLABEL_SET_MASK) { + params.spp_ipv6_flowlabel = trans->flowlabel & + SCTP_FLOWLABEL_VAL_MASK; + params.spp_flags |= SPP_IPV6_FLOWLABEL; + } + if (trans->dscp & SCTP_DSCP_SET_MASK) { + params.spp_dscp = trans->dscp & SCTP_DSCP_VAL_MASK; + params.spp_flags |= SPP_DSCP; + } } else if (asoc) { /* Fetch association values. */ params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval); @@ -5483,6 +5660,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, /*draft-11 doesn't say what to return in spp_flags*/ params.spp_flags = asoc->param_flags; + if (asoc->flowlabel & SCTP_FLOWLABEL_SET_MASK) { + params.spp_ipv6_flowlabel = asoc->flowlabel & + SCTP_FLOWLABEL_VAL_MASK; + params.spp_flags |= SPP_IPV6_FLOWLABEL; + } + if (asoc->dscp & SCTP_DSCP_SET_MASK) { + params.spp_dscp = asoc->dscp & SCTP_DSCP_VAL_MASK; + params.spp_flags |= SPP_DSCP; + } } else { /* Fetch socket values.
*/ params.spp_hbinterval = sp->hbinterval; @@ -5492,6 +5678,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, /*draft-11 doesn't say what to return in spp_flags*/ params.spp_flags = sp->param_flags; + if (sp->flowlabel & SCTP_FLOWLABEL_SET_MASK) { + params.spp_ipv6_flowlabel = sp->flowlabel & + SCTP_FLOWLABEL_VAL_MASK; + params.spp_flags |= SPP_IPV6_FLOWLABEL; + } + if (sp->dscp & SCTP_DSCP_SET_MASK) { + params.spp_dscp = sp->dscp & SCTP_DSCP_VAL_MASK; + params.spp_flags |= SPP_DSCP; + } } if (copy_to_user(optval, &params, len)) @@ -7197,6 +7392,26 @@ out: return retval; } +static int sctp_getsockopt_reuse_port(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + int val; + + if (len < sizeof(int)) + return -EINVAL; + + len = sizeof(int); + val = sctp_sk(sk)->reuse; + if (put_user(len, optlen)) + return -EFAULT; + + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -7392,6 +7607,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_interleaving_supported(sk, len, optval, optlen); break; + case SCTP_REUSE_PORT: + retval = sctp_getsockopt_reuse_port(sk, len, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -7429,6 +7647,7 @@ static struct sctp_bind_bucket *sctp_bucket_create( static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) { + bool reuse = (sk->sk_reuse || sctp_sk(sk)->reuse); struct sctp_bind_hashbucket *head; /* hash list */ struct sctp_bind_bucket *pp; unsigned short snum; @@ -7501,13 +7720,11 @@ pp_found: * used by other socket (pp->owner not empty); that other * socket is going to be sk2.
*/ - int reuse = sk->sk_reuse; struct sock *sk2; pr_debug("%s: found a possible match\n", __func__); - if (pp->fastreuse && sk->sk_reuse && - sk->sk_state != SCTP_SS_LISTENING) + if (pp->fastreuse && reuse && sk->sk_state != SCTP_SS_LISTENING) goto success; /* Run through the list of sockets bound to the port @@ -7525,7 +7742,7 @@ pp_found: ep2 = sctp_sk(sk2)->ep; if (sk == sk2 || - (reuse && sk2->sk_reuse && + (reuse && (sk2->sk_reuse || sctp_sk(sk2)->reuse) && sk2->sk_state != SCTP_SS_LISTENING)) continue; @@ -7549,12 +7766,12 @@ pp_not_found: * SO_REUSEADDR on this socket -sk-). */ if (hlist_empty(&pp->owner)) { - if (sk->sk_reuse && sk->sk_state != SCTP_SS_LISTENING) + if (reuse && sk->sk_state != SCTP_SS_LISTENING) pp->fastreuse = 1; else pp->fastreuse = 0; } else if (pp->fastreuse && - (!sk->sk_reuse || sk->sk_state == SCTP_SS_LISTENING)) + (!reuse || sk->sk_state == SCTP_SS_LISTENING)) pp->fastreuse = 0; /* We are set, so fill up all the data in the hash table @@ -7685,7 +7902,7 @@ int sctp_inet_listen(struct socket *sock, int backlog) err = 0; sctp_unhash_endpoint(ep); sk->sk_state = SCTP_SS_CLOSED; - if (sk->sk_reuse) + if (sk->sk_reuse || sctp_sk(sk)->reuse) sctp_sk(sk)->bind_hash->fastreuse = 1; goto out; } @@ -7718,12 +7935,14 @@ out: * here, again, by modeling the current TCP/UDP code. We don't have * a good way to test with it yet. 
*/ -__poll_t sctp_poll_mask(struct socket *sock, __poll_t events) +__poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct sctp_sock *sp = sctp_sk(sk); __poll_t mask; + poll_wait(file, sk_sleep(sk), wait); + sock_rps_record_flow(sk); /* A TCP-style listening socket becomes readable when the accept queue @@ -8550,6 +8769,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newsk->sk_no_check_tx = sk->sk_no_check_tx; newsk->sk_no_check_rx = sk->sk_no_check_rx; newsk->sk_reuse = sk->sk_reuse; + sctp_sk(newsk)->reuse = sp->reuse; newsk->sk_shutdown = sk->sk_shutdown; newsk->sk_destruct = sctp_destruct_sock; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 445b7ef61677..12cac85da994 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -282,7 +282,7 @@ bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) if (dst) { /* Re-fetch, as under layers may have a higher minimum size */ - pmtu = SCTP_TRUNC4(dst_mtu(dst)); + pmtu = sctp_dst_mtu(dst); change = t->pathmtu != pmtu; } t->pathmtu = pmtu; diff --git a/net/smc/Makefile b/net/smc/Makefile index 188104654b54..4df96b4b8130 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o -smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o +smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index da7f02edcd37..fce7e4751151 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -23,6 +23,7 @@ #include <linux/workqueue.h> #include <linux/in.h> #include <linux/sched/signal.h> +#include <linux/if_vlan.h> #include <net/sock.h> #include <net/tcp.h> @@ -35,6 +36,7 @@ #include "smc_cdc.h" #include "smc_core.h" #include "smc_ib.h" +#include "smc_ism.h" #include "smc_pnet.h" #include "smc_tx.h" #include "smc_rx.h" @@ -45,6 +47,7 @@ 
static DEFINE_MUTEX(smc_create_lgr_pending); /* serialize link group */ static void smc_tcp_listen_work(struct work_struct *); +static void smc_connect_work(struct work_struct *); static void smc_set_keepalive(struct sock *sk, int val) { @@ -122,6 +125,12 @@ static int smc_release(struct socket *sock) goto out; smc = smc_sk(sk); + + /* cleanup for a dangling non-blocking connect */ + flush_work(&smc->connect_work); + kfree(smc->connect_info); + smc->connect_info = NULL; + if (sk->sk_state == SMC_LISTEN) /* smc_close_non_accepted() is called and acquires * sock lock for child sockets again @@ -140,7 +149,8 @@ static int smc_release(struct socket *sock) smc->clcsock = NULL; } if (smc->use_fallback) { - sock_put(sk); /* passive closing */ + if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT) + sock_put(sk); /* passive closing */ sk->sk_state = SMC_CLOSED; sk->sk_state_change(sk); } @@ -186,6 +196,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_protocol = protocol; smc = smc_sk(sk); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); + INIT_WORK(&smc->connect_work, smc_connect_work); INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); INIT_LIST_HEAD(&smc->accept_q); spin_lock_init(&smc->accept_q_lock); @@ -333,20 +344,17 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) rc = smc_ib_modify_qp_rts(link); if (rc) - return SMC_CLC_DECL_INTERR; + return SMC_CLC_DECL_ERR_RDYLNK; smc_wr_remember_qp_attr(link); if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) - return SMC_CLC_DECL_INTERR; + return SMC_CLC_DECL_ERR_REGRMB; /* send CONFIRM LINK response over RoCE fabric */ - rc = smc_llc_send_confirm_link(link, - link->smcibdev->mac[link->ibport - 1], - &link->smcibdev->gid[link->ibport - 1], - SMC_LLC_RESP); + rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP); if (rc < 0) - return SMC_CLC_DECL_TCL; + return SMC_CLC_DECL_TIMEOUT_CL; /* receive ADD LINK request from server over RoCE fabric */ rest = 
wait_for_completion_interruptible_timeout(&link->llc_add, @@ -362,18 +370,17 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) /* send add link reject message, only one link supported for now */ rc = smc_llc_send_add_link(link, link->smcibdev->mac[link->ibport - 1], - &link->smcibdev->gid[link->ibport - 1], - SMC_LLC_RESP); + link->gid, SMC_LLC_RESP); if (rc < 0) - return SMC_CLC_DECL_TCL; + return SMC_CLC_DECL_TIMEOUT_AL; smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time); return 0; } -static void smc_conn_save_peer_info(struct smc_sock *smc, - struct smc_clc_msg_accept_confirm *clc) +static void smcr_conn_save_peer_info(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *clc) { int bufsize = smc_uncompress_bufsize(clc->rmbe_size); @@ -384,6 +391,28 @@ static void smc_conn_save_peer_info(struct smc_sock *smc, smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1); } +static void smcd_conn_save_peer_info(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *clc) +{ + int bufsize = smc_uncompress_bufsize(clc->dmbe_size); + + smc->conn.peer_rmbe_idx = clc->dmbe_idx; + smc->conn.peer_token = clc->token; + /* msg header takes up space in the buffer */ + smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg); + atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); + smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx; +} + +static void smc_conn_save_peer_info(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *clc) +{ + if (smc->conn.lgr->is_smcd) + smcd_conn_save_peer_info(smc, clc); + else + smcr_conn_save_peer_info(smc, clc); +} + static void smc_link_save_peer_info(struct smc_link *link, struct smc_clc_msg_accept_confirm *clc) { @@ -395,9 +424,10 @@ static void smc_link_save_peer_info(struct smc_link *link, } /* fall back during connect */ -static int smc_connect_fallback(struct smc_sock *smc) +static int smc_connect_fallback(struct smc_sock *smc, int reason_code) { smc->use_fallback = true; + 
smc->fallback_rsn = reason_code; smc_copy_sock_settings_to_clc(smc); if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; @@ -409,14 +439,20 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code) { int rc; - if (reason_code < 0) /* error, fallback is not possible */ + if (reason_code < 0) { /* error, fallback is not possible */ + if (smc->sk.sk_state == SMC_INIT) + sock_put(&smc->sk); /* passive closing */ return reason_code; - if (reason_code != SMC_CLC_DECL_REPLY) { + } + if (reason_code != SMC_CLC_DECL_PEERDECL) { rc = smc_clc_send_decline(smc, reason_code); - if (rc < 0) + if (rc < 0) { + if (smc->sk.sk_state == SMC_INIT) + sock_put(&smc->sk); /* passive closing */ return rc; + } } - return smc_connect_fallback(smc); + return smc_connect_fallback(smc, reason_code); } /* abort connecting */ @@ -427,15 +463,13 @@ static int smc_connect_abort(struct smc_sock *smc, int reason_code, smc_lgr_forget(smc->conn.lgr); mutex_unlock(&smc_create_lgr_pending); smc_conn_free(&smc->conn); - if (reason_code < 0 && smc->sk.sk_state == SMC_INIT) - sock_put(&smc->sk); /* passive closing */ return reason_code; } /* check if there is a rdma device available for this connection. */ /* called for connect and listen */ static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev, - u8 *ibport) + u8 *ibport, unsigned short vlan_id, u8 gid[]) { int reason_code = 0; @@ -443,22 +477,59 @@ static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev, * within same PNETID that also contains the ethernet device * used for the internal TCP socket */ - smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport); + smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport, vlan_id, + gid); if (!(*ibdev)) reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ return reason_code; } +/* check if there is an ISM device available for this connection. 
*/ +/* called for connect and listen */ +static int smc_check_ism(struct smc_sock *smc, struct smcd_dev **ismdev) +{ + /* Find ISM device with same PNETID as connecting interface */ + smc_pnet_find_ism_resource(smc->clcsock->sk, ismdev); + if (!(*ismdev)) + return SMC_CLC_DECL_CNFERR; /* configuration error */ + return 0; +} + +/* Check for VLAN ID and register it on ISM device just for CLC handshake */ +static int smc_connect_ism_vlan_setup(struct smc_sock *smc, + struct smcd_dev *ismdev, + unsigned short vlan_id) +{ + if (vlan_id && smc_ism_get_vlan(ismdev, vlan_id)) + return SMC_CLC_DECL_CNFERR; + return 0; +} + +/* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is + * used, the VLAN ID will be registered again during the connection setup. + */ +static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd, + struct smcd_dev *ismdev, + unsigned short vlan_id) +{ + if (!is_smcd) + return 0; + if (vlan_id && smc_ism_put_vlan(ismdev, vlan_id)) + return SMC_CLC_DECL_CNFERR; + return 0; +} + /* CLC handshake during connect */ -static int smc_connect_clc(struct smc_sock *smc, +static int smc_connect_clc(struct smc_sock *smc, int smc_type, struct smc_clc_msg_accept_confirm *aclc, - struct smc_ib_device *ibdev, u8 ibport) + struct smc_ib_device *ibdev, u8 ibport, + u8 gid[], struct smcd_dev *ismdev) { int rc = 0; /* do inband token exchange */ - rc = smc_clc_send_proposal(smc, ibdev, ibport); + rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, gid, ismdev); if (rc) return rc; /* receive SMC Accept CLC message */ @@ -475,8 +546,8 @@ static int smc_connect_rdma(struct smc_sock *smc, int reason_code = 0; mutex_lock(&smc_create_lgr_pending); - local_contact = smc_conn_create(smc, ibdev, ibport, &aclc->lcl, - aclc->hdr.flag); + local_contact = smc_conn_create(smc, false, aclc->hdr.flag, ibdev, + ibport, &aclc->lcl, NULL, 0); if (local_contact < 0) { if (local_contact == -ENOMEM) reason_code = SMC_CLC_DECL_MEM;/* insufficient 
memory*/ @@ -491,14 +562,14 @@ static int smc_connect_rdma(struct smc_sock *smc, smc_conn_save_peer_info(smc, aclc); /* create send buffer and rmb */ - if (smc_buf_create(smc)) + if (smc_buf_create(smc, false)) return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact); if (local_contact == SMC_FIRST_CONTACT) smc_link_save_peer_info(link, aclc); if (smc_rmb_rtoken_handling(&smc->conn, aclc)) - return smc_connect_abort(smc, SMC_CLC_DECL_INTERR, + return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK, local_contact); smc_close_init(smc); @@ -506,12 +577,12 @@ static int smc_connect_rdma(struct smc_sock *smc, if (local_contact == SMC_FIRST_CONTACT) { if (smc_ib_ready_link(link)) - return smc_connect_abort(smc, SMC_CLC_DECL_INTERR, + return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK, local_contact); } else { if (!smc->conn.rmb_desc->reused && smc_reg_rmb(link, smc->conn.rmb_desc, true)) - return smc_connect_abort(smc, SMC_CLC_DECL_INTERR, + return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB, local_contact); } smc_rmb_sync_sg_for_device(&smc->conn); @@ -538,44 +609,145 @@ static int smc_connect_rdma(struct smc_sock *smc, return 0; } +/* setup for ISM connection of client */ +static int smc_connect_ism(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *aclc, + struct smcd_dev *ismdev) +{ + int local_contact = SMC_FIRST_CONTACT; + int rc = 0; + + mutex_lock(&smc_create_lgr_pending); + local_contact = smc_conn_create(smc, true, aclc->hdr.flag, NULL, 0, + NULL, ismdev, aclc->gid); + if (local_contact < 0) + return smc_connect_abort(smc, SMC_CLC_DECL_MEM, 0); + + /* Create send and receive buffers */ + if (smc_buf_create(smc, true)) + return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact); + + smc_conn_save_peer_info(smc, aclc); + smc_close_init(smc); + smc_rx_init(smc); + smc_tx_init(smc); + + rc = smc_clc_send_confirm(smc); + if (rc) + return smc_connect_abort(smc, rc, local_contact); + mutex_unlock(&smc_create_lgr_pending); + + 
smc_copy_sock_settings_to_clc(smc); + if (smc->sk.sk_state == SMC_INIT) + smc->sk.sk_state = SMC_ACTIVE; + + return 0; +} + /* perform steps before actually connecting */ static int __smc_connect(struct smc_sock *smc) { + bool ism_supported = false, rdma_supported = false; struct smc_clc_msg_accept_confirm aclc; struct smc_ib_device *ibdev; + struct smcd_dev *ismdev; + u8 gid[SMC_GID_SIZE]; + unsigned short vlan; + int smc_type; int rc = 0; u8 ibport; sock_hold(&smc->sk); /* sock put in passive closing */ if (smc->use_fallback) - return smc_connect_fallback(smc); + return smc_connect_fallback(smc, smc->fallback_rsn); /* if peer has not signalled SMC-capability, fall back */ if (!tcp_sk(smc->clcsock->sk)->syn_smc) - return smc_connect_fallback(smc); + return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC); /* IPSec connections opt out of SMC-R optimizations */ if (using_ipsec(smc)) return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC); - /* check if a RDMA device is available; if not, fall back */ - if (smc_check_rdma(smc, &ibdev, &ibport)) + /* check for VLAN ID */ + if (smc_vlan_by_tcpsk(smc->clcsock, &vlan)) return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR); + /* check if there is an ism device available */ + if (!smc_check_ism(smc, &ismdev) && + !smc_connect_ism_vlan_setup(smc, ismdev, vlan)) { + /* ISM is supported for this connection */ + ism_supported = true; + smc_type = SMC_TYPE_D; + } + + /* check if there is a rdma device available */ + if (!smc_check_rdma(smc, &ibdev, &ibport, vlan, gid)) { + /* RDMA is supported for this connection */ + rdma_supported = true; + if (ism_supported) + smc_type = SMC_TYPE_B; /* both */ + else + smc_type = SMC_TYPE_R; /* only RDMA */ + } + + /* if neither ISM nor RDMA are supported, fallback */ + if (!rdma_supported && !ism_supported) + return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV); + /* perform CLC handshake */ - rc = smc_connect_clc(smc, &aclc, ibdev, ibport); - if (rc) + rc = 
smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, gid, ismdev); + if (rc) { + smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); return smc_connect_decline_fallback(smc, rc); + } - /* connect using rdma */ - rc = smc_connect_rdma(smc, &aclc, ibdev, ibport); - if (rc) + /* depending on previous steps, connect using rdma or ism */ + if (rdma_supported && aclc.hdr.path == SMC_TYPE_R) + rc = smc_connect_rdma(smc, &aclc, ibdev, ibport); + else if (ism_supported && aclc.hdr.path == SMC_TYPE_D) + rc = smc_connect_ism(smc, &aclc, ismdev); + else + rc = SMC_CLC_DECL_MODEUNSUPP; + if (rc) { + smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); return smc_connect_decline_fallback(smc, rc); + } + smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); return 0; } +static void smc_connect_work(struct work_struct *work) +{ + struct smc_sock *smc = container_of(work, struct smc_sock, + connect_work); + int rc; + + lock_sock(&smc->sk); + rc = kernel_connect(smc->clcsock, &smc->connect_info->addr, + smc->connect_info->alen, smc->connect_info->flags); + if (smc->clcsock->sk->sk_err) { + smc->sk.sk_err = smc->clcsock->sk->sk_err; + goto out; + } + if (rc < 0) { + smc->sk.sk_err = -rc; + goto out; + } + + rc = __smc_connect(smc); + if (rc < 0) + smc->sk.sk_err = -rc; + +out: + smc->sk.sk_state_change(&smc->sk); + kfree(smc->connect_info); + smc->connect_info = NULL; + release_sock(&smc->sk); +} + static int smc_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { @@ -605,15 +777,32 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, smc_copy_sock_settings_to_clc(smc); tcp_sk(smc->clcsock->sk)->syn_smc = 1; - rc = kernel_connect(smc->clcsock, addr, alen, flags); - if (rc) - goto out; + if (flags & O_NONBLOCK) { + if (smc->connect_info) { + rc = -EALREADY; + goto out; + } + smc->connect_info = kzalloc(alen + 2 * sizeof(int), GFP_KERNEL); + if (!smc->connect_info) { + rc = -ENOMEM; + goto out; + } + 
smc->connect_info->alen = alen; + smc->connect_info->flags = flags ^ O_NONBLOCK; + memcpy(&smc->connect_info->addr, addr, alen); + schedule_work(&smc->connect_work); + rc = -EINPROGRESS; + } else { + rc = kernel_connect(smc->clcsock, addr, alen, flags); + if (rc) + goto out; - rc = __smc_connect(smc); - if (rc < 0) - goto out; - else - rc = 0; /* success cases including fallback */ + rc = __smc_connect(smc); + if (rc < 0) + goto out; + else + rc = 0; /* success cases including fallback */ + } out: release_sock(sk); @@ -758,15 +947,12 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) link = &lgr->lnk[SMC_SINGLE_LINK]; if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) - return SMC_CLC_DECL_INTERR; + return SMC_CLC_DECL_ERR_REGRMB; /* send CONFIRM LINK request to client over the RoCE fabric */ - rc = smc_llc_send_confirm_link(link, - link->smcibdev->mac[link->ibport - 1], - &link->smcibdev->gid[link->ibport - 1], - SMC_LLC_REQ); + rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ); if (rc < 0) - return SMC_CLC_DECL_TCL; + return SMC_CLC_DECL_TIMEOUT_CL; /* receive CONFIRM LINK response from client over the RoCE fabric */ rest = wait_for_completion_interruptible_timeout( @@ -786,10 +972,9 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) /* send ADD LINK request to client over the RoCE fabric */ rc = smc_llc_send_add_link(link, link->smcibdev->mac[link->ibport - 1], - &link->smcibdev->gid[link->ibport - 1], - SMC_LLC_REQ); + link->gid, SMC_LLC_REQ); if (rc < 0) - return SMC_CLC_DECL_TCL; + return SMC_CLC_DECL_TIMEOUT_AL; /* receive ADD LINK response from client over the RoCE fabric */ rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp, @@ -864,7 +1049,8 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, } smc_conn_free(&new_smc->conn); new_smc->use_fallback = true; - if (reason_code && reason_code != SMC_CLC_DECL_REPLY) { + new_smc->fallback_rsn = reason_code; + if (reason_code && reason_code != 
SMC_CLC_DECL_PEERDECL) { if (smc_clc_send_decline(new_smc, reason_code) < 0) { smc_listen_out_err(new_smc); return; @@ -894,7 +1080,8 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc, int *local_contact) { /* allocate connection / link group */ - *local_contact = smc_conn_create(new_smc, ibdev, ibport, &pclc->lcl, 0); + *local_contact = smc_conn_create(new_smc, false, 0, ibdev, ibport, + &pclc->lcl, NULL, 0); if (*local_contact < 0) { if (*local_contact == -ENOMEM) return SMC_CLC_DECL_MEM;/* insufficient memory*/ @@ -902,12 +1089,50 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc, } /* create send buffer and rmb */ - if (smc_buf_create(new_smc)) + if (smc_buf_create(new_smc, false)) return SMC_CLC_DECL_MEM; return 0; } +/* listen worker: initialize connection and buffers for SMC-D */ +static int smc_listen_ism_init(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smcd_dev *ismdev, + int *local_contact) +{ + struct smc_clc_msg_smcd *pclc_smcd; + + pclc_smcd = smc_get_clc_msg_smcd(pclc); + *local_contact = smc_conn_create(new_smc, true, 0, NULL, 0, NULL, + ismdev, pclc_smcd->gid); + if (*local_contact < 0) { + if (*local_contact == -ENOMEM) + return SMC_CLC_DECL_MEM;/* insufficient memory*/ + return SMC_CLC_DECL_INTERR; /* other error */ + } + + /* Check if peer can be reached via ISM device */ + if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid, + new_smc->conn.lgr->vlan_id, + new_smc->conn.lgr->smcd)) { + if (*local_contact == SMC_FIRST_CONTACT) + smc_lgr_forget(new_smc->conn.lgr); + smc_conn_free(&new_smc->conn); + return SMC_CLC_DECL_CNFERR; + } + + /* Create send and receive buffers */ + if (smc_buf_create(new_smc, true)) { + if (*local_contact == SMC_FIRST_CONTACT) + smc_lgr_forget(new_smc->conn.lgr); + smc_conn_free(&new_smc->conn); + return SMC_CLC_DECL_MEM; + } + + return 0; +} + /* listen worker: register buffers */ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) { @@ -916,7 +1141,7 @@ 
static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) if (local_contact != SMC_FIRST_CONTACT) { if (!new_smc->conn.rmb_desc->reused) { if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) - return SMC_CLC_DECL_INTERR; + return SMC_CLC_DECL_ERR_REGRMB; } } smc_rmb_sync_sg_for_device(&new_smc->conn); @@ -936,13 +1161,13 @@ static void smc_listen_rdma_finish(struct smc_sock *new_smc, smc_link_save_peer_info(link, cclc); if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) { - reason_code = SMC_CLC_DECL_INTERR; + reason_code = SMC_CLC_DECL_ERR_RTOK; goto decline; } if (local_contact == SMC_FIRST_CONTACT) { if (smc_ib_ready_link(link)) { - reason_code = SMC_CLC_DECL_INTERR; + reason_code = SMC_CLC_DECL_ERR_RDYLNK; goto decline; } /* QP confirmation over RoCE fabric */ @@ -966,8 +1191,11 @@ static void smc_listen_work(struct work_struct *work) struct smc_clc_msg_accept_confirm cclc; struct smc_clc_msg_proposal *pclc; struct smc_ib_device *ibdev; + bool ism_supported = false; + struct smcd_dev *ismdev; u8 buf[SMC_CLC_MAX_LEN]; int local_contact = 0; + unsigned short vlan; int reason_code = 0; int rc = 0; u8 ibport; @@ -980,6 +1208,7 @@ static void smc_listen_work(struct work_struct *work) /* check if peer is smc capable */ if (!tcp_sk(newclcsock->sk)->syn_smc) { new_smc->use_fallback = true; + new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC; smc_listen_out_connected(new_smc); return; } @@ -1006,15 +1235,26 @@ static void smc_listen_work(struct work_struct *work) smc_rx_init(new_smc); smc_tx_init(new_smc); + /* check if ISM is available */ + if ((pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) && + !smc_check_ism(new_smc, &ismdev) && + !smc_listen_ism_init(new_smc, pclc, ismdev, &local_contact)) { + ism_supported = true; + } + /* check if RDMA is available */ - if (smc_check_rdma(new_smc, &ibdev, &ibport) || - smc_listen_rdma_check(new_smc, pclc) || - smc_listen_rdma_init(new_smc, pclc, ibdev, ibport, - &local_contact) || - 
smc_listen_rdma_reg(new_smc, local_contact)) { + if (!ism_supported && + ((pclc->hdr.path != SMC_TYPE_R && pclc->hdr.path != SMC_TYPE_B) || + smc_vlan_by_tcpsk(new_smc->clcsock, &vlan) || + smc_check_rdma(new_smc, &ibdev, &ibport, vlan, NULL) || + smc_listen_rdma_check(new_smc, pclc) || + smc_listen_rdma_init(new_smc, pclc, ibdev, ibport, + &local_contact) || + smc_listen_rdma_reg(new_smc, local_contact))) { /* SMC not supported, decline */ mutex_unlock(&smc_create_lgr_pending); - smc_listen_decline(new_smc, SMC_CLC_DECL_CNFERR, local_contact); + smc_listen_decline(new_smc, SMC_CLC_DECL_MODEUNSUPP, + local_contact); return; } @@ -1036,7 +1276,8 @@ static void smc_listen_work(struct work_struct *work) } /* finish worker */ - smc_listen_rdma_finish(new_smc, &cclc, local_contact); + if (!ism_supported) + smc_listen_rdma_finish(new_smc, &cclc, local_contact); smc_conn_save_peer_info(new_smc, &cclc); mutex_unlock(&smc_create_lgr_pending); smc_listen_out_connected(new_smc); @@ -1060,6 +1301,7 @@ static void smc_tcp_listen_work(struct work_struct *work) new_smc->listen_smc = lsmc; new_smc->use_fallback = lsmc->use_fallback; + new_smc->fallback_rsn = lsmc->fallback_rsn; sock_hold(lsk); /* sock_put in smc_listen_work */ INIT_WORK(&new_smc->smc_listen_work, smc_listen_work); smc_copy_sock_settings_to_smc(new_smc); @@ -1214,6 +1456,7 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_FASTOPEN) { if (sk->sk_state == SMC_INIT) { smc->use_fallback = true; + smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; } else { rc = -EINVAL; goto out; @@ -1273,40 +1516,26 @@ static __poll_t smc_accept_poll(struct sock *parent) return mask; } -static __poll_t smc_poll_mask(struct socket *sock, __poll_t events) +static __poll_t smc_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask = 0; struct smc_sock *smc; - int rc; if (!sk) return EPOLLNVAL; smc = smc_sk(sock->sk); - sock_hold(sk); - 
lock_sock(sk); if ((sk->sk_state == SMC_INIT) || smc->use_fallback) { /* delegate to CLC child sock */ - release_sock(sk); - mask = smc->clcsock->ops->poll_mask(smc->clcsock, events); - lock_sock(sk); + mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); sk->sk_err = smc->clcsock->sk->sk_err; - if (sk->sk_err) { + if (sk->sk_err) mask |= EPOLLERR; - } else { - /* if non-blocking connect finished ... */ - if (sk->sk_state == SMC_INIT && - mask & EPOLLOUT && - smc->clcsock->sk->sk_state != TCP_CLOSE) { - rc = __smc_connect(smc); - if (rc < 0) - mask |= EPOLLERR; - /* success cases including fallback */ - mask |= EPOLLOUT | EPOLLWRNORM; - } - } } else { + if (sk->sk_state != SMC_CLOSED) + sock_poll_wait(file, sk_sleep(sk), wait); if (sk->sk_err) mask |= EPOLLERR; if ((sk->sk_shutdown == SHUTDOWN_MASK) || @@ -1332,10 +1561,7 @@ static __poll_t smc_poll_mask(struct socket *sock, __poll_t events) } if (smc->conn.urg_state == SMC_URG_VALID) mask |= EPOLLPRI; - } - release_sock(sk); - sock_put(sk); return mask; } @@ -1415,7 +1641,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - get_user(val, (int __user *)optval); + if (get_user(val, (int __user *)optval)) + return -EFAULT; lock_sock(sk); switch (optname) { @@ -1427,6 +1654,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, /* option not supported by SMC */ if (sk->sk_state == SMC_INIT) { smc->use_fallback = true; + smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; } else { if (!smc->use_fallback) rc = -EINVAL; @@ -1483,10 +1711,13 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd, return -EBADF; return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg); } + lock_sock(&smc->sk); switch (cmd) { case SIOCINQ: /* same as FIONREAD */ - if (smc->sk.sk_state == SMC_LISTEN) + if (smc->sk.sk_state == SMC_LISTEN) { + release_sock(&smc->sk); return -EINVAL; + } if (smc->sk.sk_state == SMC_INIT || smc->sk.sk_state == SMC_CLOSED) 
answ = 0; @@ -1495,8 +1726,10 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd, break; case SIOCOUTQ: /* output queue size (not send + not acked) */ - if (smc->sk.sk_state == SMC_LISTEN) + if (smc->sk.sk_state == SMC_LISTEN) { + release_sock(&smc->sk); return -EINVAL; + } if (smc->sk.sk_state == SMC_INIT || smc->sk.sk_state == SMC_CLOSED) answ = 0; @@ -1506,8 +1739,10 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd, break; case SIOCOUTQNSD: /* output queue size (not send only) */ - if (smc->sk.sk_state == SMC_LISTEN) + if (smc->sk.sk_state == SMC_LISTEN) { + release_sock(&smc->sk); return -EINVAL; + } if (smc->sk.sk_state == SMC_INIT || smc->sk.sk_state == SMC_CLOSED) answ = 0; @@ -1515,25 +1750,25 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd, answ = smc_tx_prepared_sends(&smc->conn); break; case SIOCATMARK: - if (smc->sk.sk_state == SMC_LISTEN) + if (smc->sk.sk_state == SMC_LISTEN) { + release_sock(&smc->sk); return -EINVAL; + } if (smc->sk.sk_state == SMC_INIT || smc->sk.sk_state == SMC_CLOSED) { answ = 0; } else { - smc_curs_write(&cons, - smc_curs_read(&conn->local_tx_ctrl.cons, conn), - conn); - smc_curs_write(&urg, - smc_curs_read(&conn->urg_curs, conn), - conn); + smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); + smc_curs_copy(&urg, &conn->urg_curs, conn); answ = smc_curs_diff(conn->rmb_desc->len, &cons, &urg) == 1; } break; default: + release_sock(&smc->sk); return -ENOIOCTLCMD; } + release_sock(&smc->sk); return put_user(answ, (int __user *)arg); } @@ -1619,7 +1854,7 @@ static const struct proto_ops smc_sock_ops = { .socketpair = sock_no_socketpair, .accept = smc_accept, .getname = smc_getname, - .poll_mask = smc_poll_mask, + .poll = smc_poll, .ioctl = smc_ioctl, .listen = smc_listen, .shutdown = smc_shutdown, @@ -1657,6 +1892,7 @@ static int smc_create(struct net *net, struct socket *sock, int protocol, /* create internal TCP socket for CLC handshake and fallback */ smc = smc_sk(sk); smc->use_fallback = 
false; /* assume rdma capability first */ + smc->fallback_rsn = 0; rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, &smc->clcsock); if (rc) { diff --git a/net/smc/smc.h b/net/smc/smc.h index 51ae1f10d81a..08786ace6010 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -21,8 +21,6 @@ #define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ #define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ -#define SMC_MAX_PORTS 2 /* Max # of ports */ - extern struct proto smc_proto; extern struct proto smc_proto6; @@ -185,6 +183,17 @@ struct smc_connection { spinlock_t acurs_lock; /* protect cursors */ #endif struct work_struct close_work; /* peer sent some closing */ + struct tasklet_struct rx_tsklet; /* Receiver tasklet for SMC-D */ + u8 rx_off; /* receive offset: + * 0 for SMC-R, 32 for SMC-D + */ + u64 peer_token; /* SMC-D token of peer */ +}; + +struct smc_connect_info { + int flags; + int alen; + struct sockaddr addr; }; struct smc_sock { /* smc sock container */ @@ -192,11 +201,15 @@ struct smc_sock { /* smc sock container */ struct socket *clcsock; /* internal tcp socket */ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ + struct smc_connect_info *connect_info; /* connect address & flags */ + struct work_struct connect_work; /* handle non-blocking connect*/ struct work_struct tcp_listen_work;/* handle tcp socket accepts */ struct work_struct smc_listen_work;/* prepare new accept socket */ struct list_head accept_q; /* sockets to be accepted */ spinlock_t accept_q_lock; /* protects accept_q */ bool use_fallback; /* fallback to tcp */ + int fallback_rsn; /* reason for fallback */ + u32 peer_diagnosis; /* decline reason from peer */ int sockopt_defer_accept; /* sockopt TCP_DEFER_ACCEPT * value diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index a7e8d63fc8ae..a7af2289cdff 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -34,14 +34,15 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, enum 
ib_wc_status wc_status) { struct smc_cdc_tx_pend *cdcpend = (struct smc_cdc_tx_pend *)pnd_snd; + struct smc_connection *conn = cdcpend->conn; struct smc_sock *smc; int diff; - if (!cdcpend->conn) + if (!conn) /* already dismissed */ return; - smc = container_of(cdcpend->conn, struct smc_sock, conn); + smc = container_of(conn, struct smc_sock, conn); bh_lock_sock(&smc->sk); if (!wc_status) { diff = smc_curs_diff(cdcpend->conn->sndbuf_desc->len, @@ -52,9 +53,7 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, atomic_add(diff, &cdcpend->conn->sndbuf_space); /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ smp_mb__after_atomic(); - smc_curs_write(&cdcpend->conn->tx_curs_fin, - smc_curs_read(&cdcpend->cursor, cdcpend->conn), - cdcpend->conn); + smc_curs_copy(&conn->tx_curs_fin, &cdcpend->cursor, conn); } smc_tx_sndbuf_nonfull(smc); bh_unlock_sock(&smc->sk); @@ -110,14 +109,13 @@ int smc_cdc_msg_send(struct smc_connection *conn, &conn->local_tx_ctrl, conn); rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); if (!rc) - smc_curs_write(&conn->rx_curs_confirmed, - smc_curs_read(&conn->local_tx_ctrl.cons, conn), - conn); + smc_curs_copy(&conn->rx_curs_confirmed, + &conn->local_tx_ctrl.cons, conn); return rc; } -int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) +static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn) { struct smc_cdc_tx_pend *pend; struct smc_wr_buf *wr_buf; @@ -130,6 +128,21 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) return smc_cdc_msg_send(conn, wr_buf, pend); } +int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) +{ + int rc; + + if (conn->lgr->is_smcd) { + spin_lock_bh(&conn->send_lock); + rc = smcd_cdc_msg_send(conn); + spin_unlock_bh(&conn->send_lock); + } else { + rc = smcr_cdc_get_slot_and_msg_send(conn); + } + + return rc; +} + static bool smc_cdc_tx_filter(struct smc_wr_tx_pend_priv *tx_pend, unsigned long data) { @@ -157,6 +170,44 @@ void 
smc_cdc_tx_dismiss_slots(struct smc_connection *conn) (unsigned long)conn); } +/* Send a SMC-D CDC header. + * This increments the free space available in our send buffer. + * Also update the confirmed receive buffer with what was sent to the peer. + */ +int smcd_cdc_msg_send(struct smc_connection *conn) +{ + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + struct smcd_cdc_msg cdc; + int rc, diff; + + memset(&cdc, 0, sizeof(cdc)); + cdc.common.type = SMC_CDC_MSG_TYPE; + cdc.prod_wrap = conn->local_tx_ctrl.prod.wrap; + cdc.prod_count = conn->local_tx_ctrl.prod.count; + + cdc.cons_wrap = conn->local_tx_ctrl.cons.wrap; + cdc.cons_count = conn->local_tx_ctrl.cons.count; + cdc.prod_flags = conn->local_tx_ctrl.prod_flags; + cdc.conn_state_flags = conn->local_tx_ctrl.conn_state_flags; + rc = smcd_tx_ism_write(conn, &cdc, sizeof(cdc), 0, 1); + if (rc) + return rc; + smc_curs_copy(&conn->rx_curs_confirmed, &conn->local_tx_ctrl.cons, + conn); + /* Calculate transmitted data and increment free send buffer space */ + diff = smc_curs_diff(conn->sndbuf_desc->len, &conn->tx_curs_fin, + &conn->tx_curs_sent); + /* increased by confirmed number of bytes */ + smp_mb__before_atomic(); + atomic_add(diff, &conn->sndbuf_space); + /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ + smp_mb__after_atomic(); + smc_curs_copy(&conn->tx_curs_fin, &conn->tx_curs_sent, conn); + + smc_tx_sndbuf_nonfull(smc); + return rc; +} + /********************************* receive ***********************************/ static inline bool smc_cdc_before(u16 seq1, u16 seq2) @@ -171,14 +222,12 @@ static void smc_cdc_handle_urg_data_arrival(struct smc_sock *smc, char *base; /* new data included urgent business */ - smc_curs_write(&conn->urg_curs, - smc_curs_read(&conn->local_rx_ctrl.prod, conn), - conn); + smc_curs_copy(&conn->urg_curs, &conn->local_rx_ctrl.prod, conn); conn->urg_state = SMC_URG_VALID; if (!sock_flag(&smc->sk, SOCK_URGINLINE)) /* we'll skip the urgent byte, so don't account 
for it */ (*diff_prod)--; - base = (char *)conn->rmb_desc->cpu_addr; + base = (char *)conn->rmb_desc->cpu_addr + conn->rx_off; if (conn->urg_curs.count) conn->urg_rx_byte = *(base + conn->urg_curs.count - 1); else @@ -193,12 +242,8 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, struct smc_connection *conn = &smc->conn; int diff_cons, diff_prod; - smc_curs_write(&prod_old, - smc_curs_read(&conn->local_rx_ctrl.prod, conn), - conn); - smc_curs_write(&cons_old, - smc_curs_read(&conn->local_rx_ctrl.cons, conn), - conn); + smc_curs_copy(&prod_old, &conn->local_rx_ctrl.prod, conn); + smc_curs_copy(&cons_old, &conn->local_rx_ctrl.cons, conn); smc_cdc_msg_to_host(&conn->local_rx_ctrl, cdc, conn); diff_cons = smc_curs_diff(conn->peer_rmbe_size, &cons_old, @@ -276,6 +321,34 @@ static void smc_cdc_msg_recv(struct smc_sock *smc, struct smc_cdc_msg *cdc) sock_put(&smc->sk); /* no free sk in softirq-context */ } +/* Schedule a tasklet for this connection. Triggered from the ISM device IRQ + * handler to indicate update in the DMBE. + * + * Context: + * - tasklet context + */ +static void smcd_cdc_rx_tsklet(unsigned long data) +{ + struct smc_connection *conn = (struct smc_connection *)data; + struct smcd_cdc_msg cdc; + struct smc_sock *smc; + + if (!conn) + return; + + memcpy(&cdc, conn->rmb_desc->cpu_addr, sizeof(cdc)); + smc = container_of(conn, struct smc_sock, conn); + smc_cdc_msg_recv(smc, (struct smc_cdc_msg *)&cdc); +} + +/* Initialize receive tasklet. Called from ISM device IRQ handler to start + * receiver side. 
+ */ +void smcd_cdc_rx_init(struct smc_connection *conn) +{ + tasklet_init(&conn->rx_tsklet, smcd_cdc_rx_tsklet, (unsigned long)conn); +} + /***************************** init, exit, misc ******************************/ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) @@ -292,7 +365,7 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) return; /* invalid message */ /* lookup connection */ - lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); + lgr = smc_get_lgr(link); read_lock_bh(&lgr->conns_lock); conn = smc_lgr_find_conn(ntohl(cdc->token), lgr); read_unlock_bh(&lgr->conns_lock); diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index f60082fee5b8..934df4473a7c 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -50,6 +50,20 @@ struct smc_cdc_msg { u8 reserved[18]; } __packed; /* format defined in RFC7609 */ +/* CDC message for SMC-D */ +struct smcd_cdc_msg { + struct smc_wr_rx_hdr common; /* Type = 0xFE */ + u8 res1[7]; + u16 prod_wrap; + u32 prod_count; + u8 res2[2]; + u16 cons_wrap; + u32 cons_count; + struct smc_cdc_producer_flags prod_flags; + struct smc_cdc_conn_state_flags conn_state_flags; + u8 res3[8]; +} __packed; + static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn) { return conn->local_rx_ctrl.conn_state_flags.peer_conn_abort || @@ -90,47 +104,34 @@ static inline u64 smc_curs_read(union smc_host_cursor *curs, #endif } -static inline u64 smc_curs_read_net(union smc_cdc_cursor *curs, - struct smc_connection *conn) -{ -#ifndef KERNEL_HAS_ATOMIC64 - unsigned long flags; - u64 ret; - - spin_lock_irqsave(&conn->acurs_lock, flags); - ret = curs->acurs; - spin_unlock_irqrestore(&conn->acurs_lock, flags); - return ret; -#else - return atomic64_read(&curs->acurs); -#endif -} - -static inline void smc_curs_write(union smc_host_cursor *curs, u64 val, - struct smc_connection *conn) +/* Copy cursor src into tgt */ +static inline void smc_curs_copy(union smc_host_cursor *tgt, + union smc_host_cursor 
*src, + struct smc_connection *conn) { #ifndef KERNEL_HAS_ATOMIC64 unsigned long flags; spin_lock_irqsave(&conn->acurs_lock, flags); - curs->acurs = val; + tgt->acurs = src->acurs; spin_unlock_irqrestore(&conn->acurs_lock, flags); #else - atomic64_set(&curs->acurs, val); + atomic64_set(&tgt->acurs, atomic64_read(&src->acurs)); #endif } -static inline void smc_curs_write_net(union smc_cdc_cursor *curs, u64 val, - struct smc_connection *conn) +static inline void smc_curs_copy_net(union smc_cdc_cursor *tgt, + union smc_cdc_cursor *src, + struct smc_connection *conn) { #ifndef KERNEL_HAS_ATOMIC64 unsigned long flags; spin_lock_irqsave(&conn->acurs_lock, flags); - curs->acurs = val; + tgt->acurs = src->acurs; spin_unlock_irqrestore(&conn->acurs_lock, flags); #else - atomic64_set(&curs->acurs, val); + atomic64_set(&tgt->acurs, atomic64_read(&src->acurs)); #endif } @@ -165,7 +166,7 @@ static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer, { union smc_host_cursor temp; - smc_curs_write(&temp, smc_curs_read(local, conn), conn); + smc_curs_copy(&temp, local, conn); peer->count = htonl(temp.count); peer->wrap = htons(temp.wrap); /* peer->reserved = htons(0); must be ensured by caller */ @@ -192,8 +193,8 @@ static inline void smc_cdc_cursor_to_host(union smc_host_cursor *local, union smc_host_cursor temp, old; union smc_cdc_cursor net; - smc_curs_write(&old, smc_curs_read(local, conn), conn); - smc_curs_write_net(&net, smc_curs_read_net(peer, conn), conn); + smc_curs_copy(&old, local, conn); + smc_curs_copy_net(&net, peer, conn); temp.count = ntohl(net.count); temp.wrap = ntohs(net.wrap); if ((old.wrap > temp.wrap) && temp.wrap) @@ -201,12 +202,12 @@ static inline void smc_cdc_cursor_to_host(union smc_host_cursor *local, if ((old.wrap == temp.wrap) && (old.count > temp.count)) return; - smc_curs_write(local, smc_curs_read(&temp, conn), conn); + smc_curs_copy(local, &temp, conn); } -static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local, - struct 
smc_cdc_msg *peer, - struct smc_connection *conn) +static inline void smcr_cdc_msg_to_host(struct smc_host_cdc_msg *local, + struct smc_cdc_msg *peer, + struct smc_connection *conn) { local->common.type = peer->common.type; local->len = peer->len; @@ -218,6 +219,27 @@ static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local, local->conn_state_flags = peer->conn_state_flags; } +static inline void smcd_cdc_msg_to_host(struct smc_host_cdc_msg *local, + struct smcd_cdc_msg *peer) +{ + local->prod.wrap = peer->prod_wrap; + local->prod.count = peer->prod_count; + local->cons.wrap = peer->cons_wrap; + local->cons.count = peer->cons_count; + local->prod_flags = peer->prod_flags; + local->conn_state_flags = peer->conn_state_flags; +} + +static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local, + struct smc_cdc_msg *peer, + struct smc_connection *conn) +{ + if (conn->lgr->is_smcd) + smcd_cdc_msg_to_host(local, (struct smcd_cdc_msg *)peer); + else + smcr_cdc_msg_to_host(local, peer, conn); +} + struct smc_cdc_tx_pend; int smc_cdc_get_free_slot(struct smc_connection *conn, @@ -227,6 +249,8 @@ void smc_cdc_tx_dismiss_slots(struct smc_connection *conn); int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, struct smc_cdc_tx_pend *pend); int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn); +int smcd_cdc_msg_send(struct smc_connection *conn); int smc_cdc_init(void) __init; +void smcd_cdc_rx_init(struct smc_connection *conn); #endif /* SMC_CDC_H */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 717449b1da0b..83aba9ade060 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -23,9 +23,15 @@ #include "smc_core.h" #include "smc_clc.h" #include "smc_ib.h" +#include "smc_ism.h" + +#define SMCR_CLC_ACCEPT_CONFIRM_LEN 68 +#define SMCD_CLC_ACCEPT_CONFIRM_LEN 48 /* eye catcher "SMCR" EBCDIC for CLC messages */ static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'}; +/* eye catcher "SMCD" EBCDIC for CLC 
messages */ +static const char SMCD_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xc4'}; /* check if received message has a correct header length and contains valid * heading and trailing eyecatchers @@ -38,10 +44,14 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm) struct smc_clc_msg_decline *dclc; struct smc_clc_msg_trail *trl; - if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER))) + if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) && + memcmp(clcm->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER))) return false; switch (clcm->type) { case SMC_CLC_PROPOSAL: + if (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D && + clcm->path != SMC_TYPE_B) + return false; pclc = (struct smc_clc_msg_proposal *)clcm; pclc_prfx = smc_clc_proposal_get_prefix(pclc); if (ntohs(pclc->hdr.length) != @@ -56,10 +66,16 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm) break; case SMC_CLC_ACCEPT: case SMC_CLC_CONFIRM: + if (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D) + return false; clc = (struct smc_clc_msg_accept_confirm *)clcm; - if (ntohs(clc->hdr.length) != sizeof(*clc)) + if ((clcm->path == SMC_TYPE_R && + ntohs(clc->hdr.length) != SMCR_CLC_ACCEPT_CONFIRM_LEN) || + (clcm->path == SMC_TYPE_D && + ntohs(clc->hdr.length) != SMCD_CLC_ACCEPT_CONFIRM_LEN)) return false; - trl = &clc->trl; + trl = (struct smc_clc_msg_trail *) + ((u8 *)clc + ntohs(clc->hdr.length) - sizeof(*trl)); break; case SMC_CLC_DECLINE: dclc = (struct smc_clc_msg_decline *)clcm; @@ -70,7 +86,8 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm) default: return false; } - if (memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER))) + if (memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) && + memcmp(trl->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER))) return false; return true; } @@ -250,6 +267,7 @@ out: int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type) { + 
long rcvtimeo = smc->clcsock->sk->sk_rcvtimeo; struct sock *clc_sk = smc->clcsock->sk; struct smc_clc_msg_hdr *clcm = buf; struct msghdr msg = {NULL, 0}; @@ -295,6 +313,9 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, datlen = ntohs(clcm->length); if ((len < sizeof(struct smc_clc_msg_hdr)) || (datlen > buflen) || + (clcm->version != SMC_CLC_V1) || + (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D && + clcm->path != SMC_TYPE_B) || ((clcm->type != SMC_CLC_DECLINE) && (clcm->type != expected_type))) { smc->sk.sk_err = EPROTO; @@ -306,7 +327,6 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, memset(&msg, 0, sizeof(struct msghdr)); iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, datlen); krflags = MSG_WAITALL; - smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME; len = sock_recvmsg(smc->clcsock, &msg, krflags); if (len < datlen || !smc_clc_msg_hdr_valid(clcm)) { smc->sk.sk_err = EPROTO; @@ -314,7 +334,11 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, goto out; } if (clcm->type == SMC_CLC_DECLINE) { - reason_code = SMC_CLC_DECL_REPLY; + struct smc_clc_msg_decline *dclc; + + dclc = (struct smc_clc_msg_decline *)clcm; + reason_code = SMC_CLC_DECL_PEERDECL; + smc->peer_diagnosis = ntohl(dclc->peer_diagnosis); if (((struct smc_clc_msg_decline *)buf)->hdr.flag) { smc->conn.lgr->sync_err = 1; smc_lgr_terminate(smc->conn.lgr); @@ -322,6 +346,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, } out: + smc->clcsock->sk->sk_rcvtimeo = rcvtimeo; return reason_code; } @@ -356,17 +381,18 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) } /* send CLC PROPOSAL message across internal TCP socket */ -int smc_clc_send_proposal(struct smc_sock *smc, - struct smc_ib_device *smcibdev, - u8 ibport) +int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, + struct smc_ib_device *ibdev, u8 ibport, u8 gid[], + struct smcd_dev *ismdev) { struct smc_clc_ipv6_prefix 
ipv6_prfx[SMC_CLC_MAX_V6_PREFIX]; struct smc_clc_msg_proposal_prefix pclc_prfx; + struct smc_clc_msg_smcd pclc_smcd; struct smc_clc_msg_proposal pclc; struct smc_clc_msg_trail trl; int len, i, plen, rc; int reason_code = 0; - struct kvec vec[4]; + struct kvec vec[5]; struct msghdr msg; /* retrieve ip prefixes for CLC proposal msg */ @@ -381,18 +407,34 @@ int smc_clc_send_proposal(struct smc_sock *smc, memset(&pclc, 0, sizeof(pclc)); memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); pclc.hdr.type = SMC_CLC_PROPOSAL; - pclc.hdr.length = htons(plen); pclc.hdr.version = SMC_CLC_V1; /* SMC version */ - memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); - memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE); - memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN); - pclc.iparea_offset = htons(0); + pclc.hdr.path = smc_type; + if (smc_type == SMC_TYPE_R || smc_type == SMC_TYPE_B) { + /* add SMC-R specifics */ + memcpy(pclc.lcl.id_for_peer, local_systemid, + sizeof(local_systemid)); + memcpy(&pclc.lcl.gid, gid, SMC_GID_SIZE); + memcpy(&pclc.lcl.mac, &ibdev->mac[ibport - 1], ETH_ALEN); + pclc.iparea_offset = htons(0); + } + if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) { + /* add SMC-D specifics */ + memset(&pclc_smcd, 0, sizeof(pclc_smcd)); + plen += sizeof(pclc_smcd); + pclc.iparea_offset = htons(SMC_CLC_PROPOSAL_MAX_OFFSET); + pclc_smcd.gid = ismdev->local_gid; + } + pclc.hdr.length = htons(plen); memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); memset(&msg, 0, sizeof(msg)); i = 0; vec[i].iov_base = &pclc; vec[i++].iov_len = sizeof(pclc); + if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) { + vec[i].iov_base = &pclc_smcd; + vec[i++].iov_len = sizeof(pclc_smcd); + } vec[i].iov_base = &pclc_prfx; vec[i++].iov_len = sizeof(pclc_prfx); if (pclc_prfx.ipv6_prefixes_cnt > 0) { @@ -428,35 +470,55 @@ int smc_clc_send_confirm(struct smc_sock *smc) struct kvec vec; int len; - link = 
&conn->lgr->lnk[SMC_SINGLE_LINK]; /* send SMC Confirm CLC msg */ memset(&cclc, 0, sizeof(cclc)); - memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); cclc.hdr.type = SMC_CLC_CONFIRM; - cclc.hdr.length = htons(sizeof(cclc)); cclc.hdr.version = SMC_CLC_V1; /* SMC version */ - memcpy(cclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); - memcpy(&cclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1], - SMC_GID_SIZE); - memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN); - hton24(cclc.qpn, link->roce_qp->qp_num); - cclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); - cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ - cclc.rmbe_alert_token = htonl(conn->alert_token_local); - cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); - cclc.rmbe_size = conn->rmbe_size_short; - cclc.rmb_dma_addr = cpu_to_be64( - (u64)sg_dma_address(conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); - hton24(cclc.psn, link->psn_initial); - - memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + if (smc->conn.lgr->is_smcd) { + /* SMC-D specific settings */ + memcpy(cclc.hdr.eyecatcher, SMCD_EYECATCHER, + sizeof(SMCD_EYECATCHER)); + cclc.hdr.path = SMC_TYPE_D; + cclc.hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN); + cclc.gid = conn->lgr->smcd->local_gid; + cclc.token = conn->rmb_desc->token; + cclc.dmbe_size = conn->rmbe_size_short; + cclc.dmbe_idx = 0; + memcpy(&cclc.linkid, conn->lgr->id, SMC_LGR_ID_SIZE); + memcpy(cclc.smcd_trl.eyecatcher, SMCD_EYECATCHER, + sizeof(SMCD_EYECATCHER)); + } else { + /* SMC-R specific settings */ + link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + cclc.hdr.path = SMC_TYPE_R; + cclc.hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); + memcpy(cclc.lcl.id_for_peer, local_systemid, + sizeof(local_systemid)); + memcpy(&cclc.lcl.gid, link->gid, SMC_GID_SIZE); + memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], + 
ETH_ALEN); + hton24(cclc.qpn, link->roce_qp->qp_num); + cclc.rmb_rkey = + htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ + cclc.rmbe_alert_token = htonl(conn->alert_token_local); + cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); + cclc.rmbe_size = conn->rmbe_size_short; + cclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address + (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + hton24(cclc.psn, link->psn_initial); + memcpy(cclc.smcr_trl.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + } memset(&msg, 0, sizeof(msg)); vec.iov_base = &cclc; - vec.iov_len = sizeof(cclc); - len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(cclc)); - if (len < sizeof(cclc)) { + vec.iov_len = ntohs(cclc.hdr.length); + len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, + ntohs(cclc.hdr.length)); + if (len < ntohs(cclc.hdr.length)) { if (len >= 0) { reason_code = -ENETUNREACH; smc->sk.sk_err = -reason_code; @@ -479,35 +541,57 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) int rc = 0; int len; - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; memset(&aclc, 0, sizeof(aclc)); - memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); aclc.hdr.type = SMC_CLC_ACCEPT; - aclc.hdr.length = htons(sizeof(aclc)); aclc.hdr.version = SMC_CLC_V1; /* SMC version */ if (srv_first_contact) aclc.hdr.flag = 1; - memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); - memcpy(&aclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1], - SMC_GID_SIZE); - memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN); - hton24(aclc.qpn, link->roce_qp->qp_num); - aclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); - aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */ - aclc.rmbe_alert_token = htonl(conn->alert_token_local); - aclc.qp_mtu = link->path_mtu; - aclc.rmbe_size = conn->rmbe_size_short, - aclc.rmb_dma_addr = cpu_to_be64( - 
(u64)sg_dma_address(conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); - hton24(aclc.psn, link->psn_initial); - memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + + if (new_smc->conn.lgr->is_smcd) { + /* SMC-D specific settings */ + aclc.hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN); + memcpy(aclc.hdr.eyecatcher, SMCD_EYECATCHER, + sizeof(SMCD_EYECATCHER)); + aclc.hdr.path = SMC_TYPE_D; + aclc.gid = conn->lgr->smcd->local_gid; + aclc.token = conn->rmb_desc->token; + aclc.dmbe_size = conn->rmbe_size_short; + aclc.dmbe_idx = 0; + memcpy(&aclc.linkid, conn->lgr->id, SMC_LGR_ID_SIZE); + memcpy(aclc.smcd_trl.eyecatcher, SMCD_EYECATCHER, + sizeof(SMCD_EYECATCHER)); + } else { + /* SMC-R specific settings */ + aclc.hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); + memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + aclc.hdr.path = SMC_TYPE_R; + link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + memcpy(aclc.lcl.id_for_peer, local_systemid, + sizeof(local_systemid)); + memcpy(&aclc.lcl.gid, link->gid, SMC_GID_SIZE); + memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], + ETH_ALEN); + hton24(aclc.qpn, link->roce_qp->qp_num); + aclc.rmb_rkey = + htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */ + aclc.rmbe_alert_token = htonl(conn->alert_token_local); + aclc.qp_mtu = link->path_mtu; + aclc.rmbe_size = conn->rmbe_size_short, + aclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address + (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + hton24(aclc.psn, link->psn_initial); + memcpy(aclc.smcr_trl.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + } memset(&msg, 0, sizeof(msg)); vec.iov_base = &aclc; - vec.iov_len = sizeof(aclc); - len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, sizeof(aclc)); - if (len < sizeof(aclc)) { + vec.iov_len = ntohs(aclc.hdr.length); + len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, + ntohs(aclc.hdr.length)); + if (len < ntohs(aclc.hdr.length)) { if 
(len >= 0) new_smc->sk.sk_err = EPROTO; else diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 41ff9ea96139..18da89b681c2 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -23,17 +23,26 @@ #define SMC_CLC_DECLINE 0x04 #define SMC_CLC_V1 0x1 /* SMC version */ +#define SMC_TYPE_R 0 /* SMC-R only */ +#define SMC_TYPE_D 1 /* SMC-D only */ +#define SMC_TYPE_B 3 /* SMC-R and SMC-D */ #define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */ #define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */ -#define SMC_CLC_DECL_TIMEOUT 0x02000000 /* timeout */ +#define SMC_CLC_DECL_TIMEOUT_CL 0x02010000 /* timeout w4 QP confirm link */ +#define SMC_CLC_DECL_TIMEOUT_AL 0x02020000 /* timeout w4 QP add link */ #define SMC_CLC_DECL_CNFERR 0x03000000 /* configuration error */ -#define SMC_CLC_DECL_IPSEC 0x03030000 /* IPsec usage */ +#define SMC_CLC_DECL_PEERNOSMC 0x03010000 /* peer did not indicate SMC */ +#define SMC_CLC_DECL_IPSEC 0x03020000 /* IPsec usage */ +#define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found */ +#define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/ +#define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */ +#define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */ #define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ -#define SMC_CLC_DECL_REPLY 0x06000000 /* reply to a received decline */ +#define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */ #define SMC_CLC_DECL_INTERR 0x99990000 /* internal error */ -#define SMC_CLC_DECL_TCL 0x02040000 /* timeout w4 QP confirm */ -#define SMC_CLC_DECL_SEND 0x07000000 /* sending problem */ -#define SMC_CLC_DECL_RMBE_EC 0x08000000 /* peer has eyecatcher in RMBE */ +#define SMC_CLC_DECL_ERR_RTOK 0x99990001 /* rtoken handling failed */ +#define SMC_CLC_DECL_ERR_RDYLNK 0x99990002 /* ib ready link failed */ +#define SMC_CLC_DECL_ERR_REGRMB 0x99990003 /* reg rmb failed */ struct 
smc_clc_msg_hdr { /* header1 of clc messages */ u8 eyecatcher[4]; /* eye catcher */ @@ -42,9 +51,11 @@ struct smc_clc_msg_hdr { /* header1 of clc messages */ #if defined(__BIG_ENDIAN_BITFIELD) u8 version : 4, flag : 1, - rsvd : 3; + rsvd : 1, + path : 2; #elif defined(__LITTLE_ENDIAN_BITFIELD) - u8 rsvd : 3, + u8 path : 2, + rsvd : 1, flag : 1, version : 4; #endif @@ -77,6 +88,11 @@ struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/ u8 ipv6_prefixes_cnt; /* number of IPv6 prefixes in prefix array */ } __aligned(4); +struct smc_clc_msg_smcd { /* SMC-D GID information */ + u64 gid; /* ISM GID of requestor */ + u8 res[32]; +}; + struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */ struct smc_clc_msg_hdr hdr; struct smc_clc_msg_local lcl; @@ -94,23 +110,45 @@ struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */ struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */ struct smc_clc_msg_hdr hdr; - struct smc_clc_msg_local lcl; - u8 qpn[3]; /* QP number */ - __be32 rmb_rkey; /* RMB rkey */ - u8 rmbe_idx; /* Index of RMBE in RMB */ - __be32 rmbe_alert_token;/* unique connection id */ + union { + struct { /* SMC-R */ + struct smc_clc_msg_local lcl; + u8 qpn[3]; /* QP number */ + __be32 rmb_rkey; /* RMB rkey */ + u8 rmbe_idx; /* Index of RMBE in RMB */ + __be32 rmbe_alert_token;/* unique connection id */ #if defined(__BIG_ENDIAN_BITFIELD) - u8 rmbe_size : 4, /* RMBE buf size (compressed notation) */ - qp_mtu : 4; /* QP mtu */ + u8 rmbe_size : 4, /* buf size (compressed) */ + qp_mtu : 4; /* QP mtu */ #elif defined(__LITTLE_ENDIAN_BITFIELD) - u8 qp_mtu : 4, - rmbe_size : 4; + u8 qp_mtu : 4, + rmbe_size : 4; #endif - u8 reserved; - __be64 rmb_dma_addr; /* RMB virtual address */ - u8 reserved2; - u8 psn[3]; /* initial packet sequence number */ - struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */ + u8 reserved; + __be64 rmb_dma_addr; /* RMB virtual address */ + u8 reserved2; + u8 psn[3]; /* 
packet sequence number */ + struct smc_clc_msg_trail smcr_trl; + /* eye catcher "SMCR" EBCDIC */ + } __packed; + struct { /* SMC-D */ + u64 gid; /* Sender GID */ + u64 token; /* DMB token */ + u8 dmbe_idx; /* DMBE index */ +#if defined(__BIG_ENDIAN_BITFIELD) + u8 dmbe_size : 4, /* buf size (compressed) */ + reserved3 : 4; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved3 : 4, + dmbe_size : 4; +#endif + u16 reserved4; + u32 linkid; /* Link identifier */ + u32 reserved5[3]; + struct smc_clc_msg_trail smcd_trl; + /* eye catcher "SMCD" EBCDIC */ + } __packed; + }; } __packed; /* format defined in RFC7609 */ struct smc_clc_msg_decline { /* clc decline message */ @@ -129,13 +167,26 @@ smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc) ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset)); } +/* get SMC-D info from proposal message */ +static inline struct smc_clc_msg_smcd * +smc_get_clc_msg_smcd(struct smc_clc_msg_proposal *prop) +{ + if (ntohs(prop->iparea_offset) != sizeof(struct smc_clc_msg_smcd)) + return NULL; + + return (struct smc_clc_msg_smcd *)(prop + 1); +} + +struct smcd_dev; + int smc_clc_prfx_match(struct socket *clcsock, struct smc_clc_msg_proposal_prefix *prop); int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type); int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info); -int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev, - u8 ibport); +int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, + struct smc_ib_device *smcibdev, u8 ibport, u8 gid[], + struct smcd_dev *ismdev); int smc_clc_send_confirm(struct smc_sock *smc); int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact); diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index fa41d9881741..ac961dfb1ea1 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -107,6 +107,8 @@ static void smc_close_active_abort(struct smc_sock *smc) } switch (sk->sk_state) { case SMC_INIT: + 
sk->sk_state = SMC_PEERABORTWAIT; + break; case SMC_ACTIVE: sk->sk_state = SMC_PEERABORTWAIT; release_sock(sk); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index add82b0266f3..a46418f45ecd 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -25,10 +25,12 @@ #include "smc_llc.h" #include "smc_cdc.h" #include "smc_close.h" +#include "smc_ism.h" #define SMC_LGR_NUM_INCR 256 #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) #define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ) +#define SMC_LGR_FREE_DELAY_FAST (8 * HZ) static struct smc_lgr_list smc_lgr_list = { /* established link groups */ .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock), @@ -46,8 +48,13 @@ static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) * otherwise there is a risk of out-of-sync link groups. */ mod_delayed_work(system_wq, &lgr->free_work, - lgr->role == SMC_CLNT ? SMC_LGR_FREE_DELAY_CLNT : - SMC_LGR_FREE_DELAY_SERV); + (!lgr->is_smcd && lgr->role == SMC_CLNT) ? + SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV); +} + +void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr) +{ + mod_delayed_work(system_wq, &lgr->free_work, SMC_LGR_FREE_DELAY_FAST); } /* Register connection's alert token in our lookup structure. @@ -132,6 +139,20 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) smc_lgr_schedule_free_work(lgr); } +/* Send delete link, either as client to request the initiation + * of the DELETE LINK sequence from server; or as server to + * initiate the delete processing. See smc_llc_rx_delete_link(). 
+ */ +static int smc_link_send_delete(struct smc_link *lnk) +{ + if (lnk->state == SMC_LNK_ACTIVE && + !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, true)) { + smc_llc_link_deleting(lnk); + return 0; + } + return -ENOTCONN; +} + static void smc_lgr_free_work(struct work_struct *work) { struct smc_link_group *lgr = container_of(to_delayed_work(work), @@ -152,17 +173,30 @@ static void smc_lgr_free_work(struct work_struct *work) list_del_init(&lgr->list); /* remove from smc_lgr_list */ free: spin_unlock_bh(&smc_lgr_list.lock); + + if (!lgr->is_smcd && !lgr->terminating) { + /* try to send del link msg, on error free lgr immediately */ + if (!smc_link_send_delete(&lgr->lnk[SMC_SINGLE_LINK])) { + /* reschedule in case we never receive a response */ + smc_lgr_schedule_free_work(lgr); + return; + } + } + if (!delayed_work_pending(&lgr->free_work)) { - if (lgr->lnk[SMC_SINGLE_LINK].state != SMC_LNK_INACTIVE) - smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); + struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + + if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE) + smc_llc_link_inactive(lnk); smc_lgr_free(lgr); } } /* create a new SMC link group */ -static int smc_lgr_create(struct smc_sock *smc, +static int smc_lgr_create(struct smc_sock *smc, bool is_smcd, struct smc_ib_device *smcibdev, u8 ibport, - char *peer_systemid, unsigned short vlan_id) + char *peer_systemid, unsigned short vlan_id, + struct smcd_dev *smcismdev, u64 peer_gid) { struct smc_link_group *lgr; struct smc_link *lnk; @@ -170,17 +204,23 @@ static int smc_lgr_create(struct smc_sock *smc, int rc = 0; int i; + if (is_smcd && vlan_id) { + rc = smc_ism_get_vlan(smcismdev, vlan_id); + if (rc) + goto out; + } + lgr = kzalloc(sizeof(*lgr), GFP_KERNEL); if (!lgr) { rc = -ENOMEM; goto out; } - lgr->role = smc->listen_smc ? 
SMC_SERV : SMC_CLNT; + lgr->is_smcd = is_smcd; lgr->sync_err = 0; - memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN); lgr->vlan_id = vlan_id; rwlock_init(&lgr->sndbufs_lock); rwlock_init(&lgr->rmbs_lock); + rwlock_init(&lgr->conns_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { INIT_LIST_HEAD(&lgr->sndbufs[i]); INIT_LIST_HEAD(&lgr->rmbs[i]); @@ -189,36 +229,48 @@ static int smc_lgr_create(struct smc_sock *smc, memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); lgr->conns_all = RB_ROOT; - - lnk = &lgr->lnk[SMC_SINGLE_LINK]; - /* initialize link */ - lnk->state = SMC_LNK_ACTIVATING; - lnk->link_id = SMC_SINGLE_LINK; - lnk->smcibdev = smcibdev; - lnk->ibport = ibport; - lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu; - if (!smcibdev->initialized) - smc_ib_setup_per_ibdev(smcibdev); - get_random_bytes(rndvec, sizeof(rndvec)); - lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16); - rc = smc_llc_link_init(lnk); - if (rc) - goto free_lgr; - rc = smc_wr_alloc_link_mem(lnk); - if (rc) - goto clear_llc_lnk; - rc = smc_ib_create_protection_domain(lnk); - if (rc) - goto free_link_mem; - rc = smc_ib_create_queue_pair(lnk); - if (rc) - goto dealloc_pd; - rc = smc_wr_create_link(lnk); - if (rc) - goto destroy_qp; - + if (is_smcd) { + /* SMC-D specific settings */ + lgr->peer_gid = peer_gid; + lgr->smcd = smcismdev; + } else { + /* SMC-R specific settings */ + lgr->role = smc->listen_smc ? 
SMC_SERV : SMC_CLNT; + memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN); + + lnk = &lgr->lnk[SMC_SINGLE_LINK]; + /* initialize link */ + lnk->state = SMC_LNK_ACTIVATING; + lnk->link_id = SMC_SINGLE_LINK; + lnk->smcibdev = smcibdev; + lnk->ibport = ibport; + lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu; + if (!smcibdev->initialized) + smc_ib_setup_per_ibdev(smcibdev); + get_random_bytes(rndvec, sizeof(rndvec)); + lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + + (rndvec[2] << 16); + rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, + vlan_id, lnk->gid, &lnk->sgid_index); + if (rc) + goto free_lgr; + rc = smc_llc_link_init(lnk); + if (rc) + goto free_lgr; + rc = smc_wr_alloc_link_mem(lnk); + if (rc) + goto clear_llc_lnk; + rc = smc_ib_create_protection_domain(lnk); + if (rc) + goto free_link_mem; + rc = smc_ib_create_queue_pair(lnk); + if (rc) + goto dealloc_pd; + rc = smc_wr_create_link(lnk); + if (rc) + goto destroy_qp; + } smc->conn.lgr = lgr; - rwlock_init(&lgr->conns_lock); spin_lock_bh(&smc_lgr_list.lock); list_add(&lgr->list, &smc_lgr_list.list); spin_unlock_bh(&smc_lgr_list.lock); @@ -264,7 +316,12 @@ void smc_conn_free(struct smc_connection *conn) { if (!conn->lgr) return; - smc_cdc_tx_dismiss_slots(conn); + if (conn->lgr->is_smcd) { + smc_ism_unset_conn(conn); + tasklet_kill(&conn->rx_tsklet); + } else { + smc_cdc_tx_dismiss_slots(conn); + } smc_lgr_unregister_conn(conn); smc_buf_unuse(conn); } @@ -280,8 +337,8 @@ static void smc_link_clear(struct smc_link *lnk) smc_wr_free_link_mem(lnk); } -static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, - struct smc_buf_desc *buf_desc) +static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, + struct smc_buf_desc *buf_desc) { struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; @@ -301,6 +358,28 @@ static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, kfree(buf_desc); } +static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb, + struct 
smc_buf_desc *buf_desc) +{ + if (is_dmb) { + /* restore original buf len */ + buf_desc->len += sizeof(struct smcd_cdc_msg); + smc_ism_unregister_dmb(lgr->smcd, buf_desc); + } else { + kfree(buf_desc->cpu_addr); + } + kfree(buf_desc); +} + +static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, + struct smc_buf_desc *buf_desc) +{ + if (lgr->is_smcd) + smcd_buf_free(lgr, is_rmb, buf_desc); + else + smcr_buf_free(lgr, is_rmb, buf_desc); +} + static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb) { struct smc_buf_desc *buf_desc, *bf_desc; @@ -332,7 +411,10 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr) void smc_lgr_free(struct smc_link_group *lgr) { smc_lgr_free_bufs(lgr); - smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); + if (lgr->is_smcd) + smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); + else + smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); kfree(lgr); } @@ -357,7 +439,8 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr) lgr->terminating = 1; if (!list_empty(&lgr->list)) /* forget lgr */ list_del_init(&lgr->list); - smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); + if (!lgr->is_smcd) + smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); write_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); @@ -374,7 +457,8 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr) node = rb_first(&lgr->conns_all); } write_unlock_bh(&lgr->conns_lock); - wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait); + if (!lgr->is_smcd) + wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait); smc_lgr_schedule_free_work(lgr); } @@ -392,17 +476,44 @@ void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { - if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && + if (!lgr->is_smcd && + lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) __smc_lgr_terminate(lgr); } spin_unlock_bh(&smc_lgr_list.lock); 
} +/* Called when SMC-D device is terminated or peer is lost */ +void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid) +{ + struct smc_link_group *lgr, *l; + LIST_HEAD(lgr_free_list); + + /* run common cleanup function and build free list */ + spin_lock_bh(&smc_lgr_list.lock); + list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { + if (lgr->is_smcd && lgr->smcd == dev && + (!peer_gid || lgr->peer_gid == peer_gid) && + !list_empty(&lgr->list)) { + __smc_lgr_terminate(lgr); + list_move(&lgr->list, &lgr_free_list); + } + } + spin_unlock_bh(&smc_lgr_list.lock); + + /* cancel the regular free workers and actually free lgrs */ + list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { + list_del_init(&lgr->list); + cancel_delayed_work_sync(&lgr->free_work); + smc_lgr_free(lgr); + } +} + /* Determine vlan of internal TCP socket. * @vlan_id: address to store the determined vlan id into */ -static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) +int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) { struct dst_entry *dst = sk_dst_get(clcsock->sk); struct net_device *ndev; @@ -446,41 +557,30 @@ out: return rc; } -/* determine the link gid matching the vlan id of the link group */ -static int smc_link_determine_gid(struct smc_link_group *lgr) +static bool smcr_lgr_match(struct smc_link_group *lgr, + struct smc_clc_msg_local *lcl, + enum smc_lgr_role role) { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; - struct ib_gid_attr gattr; - union ib_gid gid; - int i; - - if (!lgr->vlan_id) { - lnk->gid = lnk->smcibdev->gid[lnk->ibport - 1]; - return 0; - } + return !memcmp(lgr->peer_systemid, lcl->id_for_peer, + SMC_SYSTEMID_LEN) && + !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid, + SMC_GID_SIZE) && + !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac, + sizeof(lcl->mac)) && + lgr->role == role; +} - for (i = 0; i < lnk->smcibdev->pattr[lnk->ibport - 1].gid_tbl_len; - i++) { - if 
(ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid, - &gattr)) - continue; - if (gattr.ndev) { - if (is_vlan_dev(gattr.ndev) && - vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id) { - lnk->gid = gid; - dev_put(gattr.ndev); - return 0; - } - dev_put(gattr.ndev); - } - } - return -ENODEV; +static bool smcd_lgr_match(struct smc_link_group *lgr, + struct smcd_dev *smcismdev, u64 peer_gid) +{ + return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev; } /* create a new SMC connection (and a new link group if necessary) */ -int smc_conn_create(struct smc_sock *smc, +int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, struct smc_ib_device *smcibdev, u8 ibport, - struct smc_clc_msg_local *lcl, int srv_first_contact) + struct smc_clc_msg_local *lcl, struct smcd_dev *smcd, + u64 peer_gid) { struct smc_connection *conn = &smc->conn; int local_contact = SMC_FIRST_CONTACT; @@ -502,17 +602,12 @@ int smc_conn_create(struct smc_sock *smc, spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry(lgr, &smc_lgr_list.list, list) { write_lock_bh(&lgr->conns_lock); - if (!memcmp(lgr->peer_systemid, lcl->id_for_peer, - SMC_SYSTEMID_LEN) && - !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid, - SMC_GID_SIZE) && - !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac, - sizeof(lcl->mac)) && + if ((is_smcd ? 
smcd_lgr_match(lgr, smcd, peer_gid) : + smcr_lgr_match(lgr, lcl, role)) && !lgr->sync_err && - (lgr->role == role) && - (lgr->vlan_id == vlan_id) && - ((role == SMC_CLNT) || - (lgr->conns_num < SMC_RMBS_PER_LGR_MAX))) { + lgr->vlan_id == vlan_id && + (role == SMC_CLNT || + lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) { /* link group found */ local_contact = SMC_REUSE_CONTACT; conn->lgr = lgr; @@ -535,16 +630,19 @@ int smc_conn_create(struct smc_sock *smc, create: if (local_contact == SMC_FIRST_CONTACT) { - rc = smc_lgr_create(smc, smcibdev, ibport, - lcl->id_for_peer, vlan_id); + rc = smc_lgr_create(smc, is_smcd, smcibdev, ibport, + lcl->id_for_peer, vlan_id, smcd, peer_gid); if (rc) goto out; smc_lgr_register_conn(conn); /* add smc conn to lgr */ - rc = smc_link_determine_gid(conn->lgr); } conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; + if (is_smcd) { + conn->rx_off = sizeof(struct smcd_cdc_msg); + smcd_cdc_rx_init(conn); /* init tasklet for this conn */ + } #ifndef KERNEL_HAS_ATOMIC64 spin_lock_init(&conn->acurs_lock); #endif @@ -609,8 +707,8 @@ static inline int smc_rmb_wnd_update_limit(int rmbe_size) return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); } -static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr, - bool is_rmb, int bufsize) +static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, + bool is_rmb, int bufsize) { struct smc_buf_desc *buf_desc; struct smc_link *lnk; @@ -668,7 +766,44 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr, return buf_desc; } -static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) +#define SMCD_DMBE_SIZES 7 /* 0 -> 16KB, 1 -> 32KB, .. 
6 -> 1MB */ + +static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, + bool is_dmb, int bufsize) +{ + struct smc_buf_desc *buf_desc; + int rc; + + if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES) + return ERR_PTR(-EAGAIN); + + /* try to alloc a new DMB */ + buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); + if (!buf_desc) + return ERR_PTR(-ENOMEM); + if (is_dmb) { + rc = smc_ism_register_dmb(lgr, bufsize, buf_desc); + if (rc) { + kfree(buf_desc); + return ERR_PTR(-EAGAIN); + } + buf_desc->pages = virt_to_page(buf_desc->cpu_addr); + /* CDC header stored in buf. So, pretend it was smaller */ + buf_desc->len = bufsize - sizeof(struct smcd_cdc_msg); + } else { + buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL | + __GFP_NOWARN | __GFP_NORETRY | + __GFP_NOMEMALLOC); + if (!buf_desc->cpu_addr) { + kfree(buf_desc); + return ERR_PTR(-EAGAIN); + } + buf_desc->len = bufsize; + } + return buf_desc; +} + +static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) { struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM); struct smc_connection *conn = &smc->conn; @@ -706,7 +841,11 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) break; /* found reusable slot */ } - buf_desc = smc_new_buf_create(lgr, is_rmb, bufsize); + if (is_smcd) + buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize); + else + buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize); + if (PTR_ERR(buf_desc) == -ENOMEM) break; if (IS_ERR(buf_desc)) @@ -727,7 +866,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) conn->rmbe_size_short = bufsize_short; smc->sk.sk_rcvbuf = bufsize * 2; atomic_set(&conn->bytes_to_rcv, 0); - conn->rmbe_update_limit = smc_rmb_wnd_update_limit(bufsize); + conn->rmbe_update_limit = + smc_rmb_wnd_update_limit(buf_desc->len); + if (is_smcd) + smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */ } else { conn->sndbuf_desc = buf_desc; smc->sk.sk_sndbuf = bufsize * 2; @@ -740,6 +882,8 @@ void 
smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; + if (!conn->lgr || conn->lgr->is_smcd) + return; smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, conn->sndbuf_desc, DMA_TO_DEVICE); } @@ -748,6 +892,8 @@ void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; + if (!conn->lgr || conn->lgr->is_smcd) + return; smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, conn->sndbuf_desc, DMA_TO_DEVICE); } @@ -756,6 +902,8 @@ void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; + if (!conn->lgr || conn->lgr->is_smcd) + return; smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, conn->rmb_desc, DMA_FROM_DEVICE); } @@ -764,6 +912,8 @@ void smc_rmb_sync_sg_for_device(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; + if (!conn->lgr || conn->lgr->is_smcd) + return; smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, conn->rmb_desc, DMA_FROM_DEVICE); } @@ -774,16 +924,16 @@ void smc_rmb_sync_sg_for_device(struct smc_connection *conn) * the Linux implementation uses just one RMB-element per RMB, i.e. 
uses an * extra RMB for every connection in a link group */ -int smc_buf_create(struct smc_sock *smc) +int smc_buf_create(struct smc_sock *smc, bool is_smcd) { int rc; /* create send buffer */ - rc = __smc_buf_create(smc, false); + rc = __smc_buf_create(smc, is_smcd, false); if (rc) return rc; /* create rmb */ - rc = __smc_buf_create(smc, true); + rc = __smc_buf_create(smc, is_smcd, true); if (rc) smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc); return rc; @@ -865,7 +1015,14 @@ void smc_core_exit(void) spin_unlock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) { list_del_init(&lgr->list); - smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); + if (!lgr->is_smcd) { + struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + + if (lnk->state == SMC_LNK_ACTIVE) + smc_llc_send_delete_link(lnk, SMC_LLC_REQ, + false); + smc_llc_link_inactive(lnk); + } cancel_delayed_work_sync(&lgr->free_work); smc_lgr_free(lgr); /* free link group */ } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 93cb3523bf50..c156674733c9 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -34,7 +34,8 @@ enum smc_lgr_role { /* possible roles of a link group */ enum smc_link_state { /* possible states of a link */ SMC_LNK_INACTIVE, /* link is inactive */ SMC_LNK_ACTIVATING, /* link is being activated */ - SMC_LNK_ACTIVE /* link is active */ + SMC_LNK_ACTIVE, /* link is active */ + SMC_LNK_DELETING, /* link is being deleted */ }; #define SMC_WR_BUF_SIZE 48 /* size of work request buffer */ @@ -84,14 +85,15 @@ struct smc_link { wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */ enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */ - union ib_gid gid; /* gid matching used vlan id */ + u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/ + u8 sgid_index; /* gid index for vlan id */ u32 peer_qpn; /* QP number of peer */ enum ib_mtu path_mtu; /* used mtu */ enum ib_mtu peer_mtu; /* mtu size of peer */ u32 psn_initial; /* QP tx 
initial packet seqno */ u32 peer_psn; /* QP rx initial packet seqno */ u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */ - u8 peer_gid[sizeof(union ib_gid)]; /* gid of peer*/ + u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/ u8 link_id; /* unique # within link group */ enum smc_link_state state; /* state of link */ @@ -124,15 +126,28 @@ struct smc_buf_desc { void *cpu_addr; /* virtual address of buffer */ struct page *pages; int len; /* length of buffer */ - struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];/* virtual buffer */ - struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; - /* for rmb only: memory region - * incl. rkey provided to peer - */ - u32 order; /* allocation order */ u32 used; /* currently used / unused */ u8 reused : 1; /* new created / reused */ u8 regerr : 1; /* err during registration */ + union { + struct { /* SMC-R */ + struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; + /* virtual buffer */ + struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; + /* for rmb only: memory region + * incl. rkey provided to peer + */ + u32 order; /* allocation order */ + }; + struct { /* SMC-D */ + unsigned short sba_idx; + /* SBA index number */ + u64 token; + /* DMB token number */ + dma_addr_t dma_addr; + /* DMA address */ + }; + }; }; struct smc_rtoken { /* address/key of remote RMB */ @@ -148,12 +163,10 @@ struct smc_rtoken { /* address/key of remote RMB */ * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15) */ +struct smcd_dev; + struct smc_link_group { struct list_head list; - enum smc_lgr_role role; /* client or server */ - struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; /* smc link */ - char peer_systemid[SMC_SYSTEMID_LEN]; - /* unique system_id of peer */ struct rb_root conns_all; /* connection tree */ rwlock_t conns_lock; /* protects conns_all */ unsigned int conns_num; /* current # of connections */ @@ -163,17 +176,34 @@ struct smc_link_group { rwlock_t sndbufs_lock; /* protects tx buffers */ struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */ rwlock_t rmbs_lock; 
/* protects rx buffers */ - struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX] - [SMC_LINKS_PER_LGR_MAX]; - /* remote addr/key pairs */ - unsigned long rtokens_used_mask[BITS_TO_LONGS( - SMC_RMBS_PER_LGR_MAX)]; - /* used rtoken elements */ u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ u8 sync_err : 1; /* lgr no longer fits to peer */ u8 terminating : 1;/* lgr is terminating */ + + bool is_smcd; /* SMC-R or SMC-D */ + union { + struct { /* SMC-R */ + enum smc_lgr_role role; + /* client or server */ + struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; + /* smc link */ + char peer_systemid[SMC_SYSTEMID_LEN]; + /* unique system_id of peer */ + struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX] + [SMC_LINKS_PER_LGR_MAX]; + /* remote addr/key pairs */ + DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX); + /* used rtoken elements */ + }; + struct { /* SMC-D */ + u64 peer_gid; + /* Peer GID (remote) */ + struct smcd_dev *smcd; + /* ISM device for VLAN reg. */ + }; + }; }; /* Find the connection associated with the given alert token in the link group. 
@@ -217,7 +247,8 @@ void smc_lgr_free(struct smc_link_group *lgr); void smc_lgr_forget(struct smc_link_group *lgr); void smc_lgr_terminate(struct smc_link_group *lgr); void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport); -int smc_buf_create(struct smc_sock *smc); +void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid); +int smc_buf_create(struct smc_sock *smc, bool is_smcd); int smc_uncompress_bufsize(u8 compressed); int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_clc_msg_accept_confirm *clc); @@ -227,9 +258,19 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); void smc_rmb_sync_sg_for_device(struct smc_connection *conn); +int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id); + void smc_conn_free(struct smc_connection *conn); -int smc_conn_create(struct smc_sock *smc, +int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, struct smc_ib_device *smcibdev, u8 ibport, - struct smc_clc_msg_local *lcl, int srv_first_contact); + struct smc_clc_msg_local *lcl, struct smcd_dev *smcd, + u64 peer_gid); +void smcd_conn_free(struct smc_connection *conn); +void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr); void smc_core_exit(void); + +static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) +{ + return container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); +} #endif diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 839354402215..dbf64a93d68a 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -79,6 +79,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, struct nlattr *bc) { struct smc_sock *smc = smc_sk(sk); + struct smc_diag_fallback fallback; struct user_namespace *user_ns; struct smc_diag_msg *r; struct nlmsghdr *nlh; @@ -91,11 +92,21 @@ static int __smc_diag_dump(struct sock *sk, 
struct sk_buff *skb, r = nlmsg_data(nlh); smc_diag_msg_common_fill(r, sk); r->diag_state = sk->sk_state; - r->diag_fallback = smc->use_fallback; + if (smc->use_fallback) + r->diag_mode = SMC_DIAG_MODE_FALLBACK_TCP; + else if (smc->conn.lgr && smc->conn.lgr->is_smcd) + r->diag_mode = SMC_DIAG_MODE_SMCD; + else + r->diag_mode = SMC_DIAG_MODE_SMCR; user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk); if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns)) goto errout; + fallback.reason = smc->fallback_rsn; + fallback.peer_diagnosis = smc->peer_diagnosis; + if (nla_put(skb, SMC_DIAG_FALLBACK, sizeof(fallback), &fallback) < 0) + goto errout; + if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) && smc->conn.alert_token_local) { struct smc_connection *conn = &smc->conn; @@ -136,7 +147,8 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, goto errout; } - if ((req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && smc->conn.lgr && + if (smc->conn.lgr && !smc->conn.lgr->is_smcd && + (req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && !list_empty(&smc->conn.lgr->list)) { struct smc_diag_lgrinfo linfo = { .role = smc->conn.lgr->role, @@ -148,13 +160,28 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, smc->conn.lgr->lnk[0].smcibdev->ibdev->name, sizeof(smc->conn.lgr->lnk[0].smcibdev->ibdev->name)); smc_gid_be16_convert(linfo.lnk[0].gid, - smc->conn.lgr->lnk[0].gid.raw); + smc->conn.lgr->lnk[0].gid); smc_gid_be16_convert(linfo.lnk[0].peer_gid, smc->conn.lgr->lnk[0].peer_gid); if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0) goto errout; } + if (smc->conn.lgr && smc->conn.lgr->is_smcd && + (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) && + !list_empty(&smc->conn.lgr->list)) { + struct smc_connection *conn = &smc->conn; + struct smcd_diag_dmbinfo dinfo = { + .linkid = *((u32 *)conn->lgr->id), + .peer_gid = conn->lgr->peer_gid, + .my_gid = conn->lgr->smcd->local_gid, + .token = conn->rmb_desc->token, + .peer_token = conn->peer_token + }; + + if 
(nla_put(skb, SMC_DIAG_DMBINFO, sizeof(dinfo), &dinfo) < 0) + goto errout; + } nlmsg_end(skb, nlh); return 0; diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 0eed7ab9f28b..2cc64bc8ae20 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -68,7 +68,7 @@ static int smc_ib_modify_qp_rtr(struct smc_link *lnk) qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu); qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport); - rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, 0, 1, 0); + rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, 1, 0); rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid); memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac, sizeof(lnk->peer_mac)); @@ -112,8 +112,7 @@ int smc_ib_modify_qp_reset(struct smc_link *lnk) int smc_ib_ready_link(struct smc_link *lnk) { - struct smc_link_group *lgr = - container_of(lnk, struct smc_link_group, lnk[0]); + struct smc_link_group *lgr = smc_get_lgr(lnk); int rc = 0; rc = smc_ib_modify_qp_init(lnk); @@ -143,6 +142,93 @@ out: return rc; } +static int smc_ib_fill_mac(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct ib_gid_attr gattr; + union ib_gid gid; + int rc; + + rc = ib_query_gid(smcibdev->ibdev, ibport, 0, &gid, &gattr); + if (rc || !gattr.ndev) + return -ENODEV; + + memcpy(smcibdev->mac[ibport - 1], gattr.ndev->dev_addr, ETH_ALEN); + dev_put(gattr.ndev); + return 0; +} + +/* Create an identifier unique for this instance of SMC-R. + * The MAC-address of the first active registered IB device + * plus a random 2-byte number is used to create this identifier. + * This name is delivered to the peer during connection initialization. 
+ */ +static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev, + u8 ibport) +{ + memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1], + sizeof(smcibdev->mac[ibport - 1])); + get_random_bytes(&local_systemid[0], 2); +} + +bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport) +{ + return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE; +} + +/* determine the gid for an ib-device port and vlan id */ +int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, + unsigned short vlan_id, u8 gid[], u8 *sgid_index) +{ + struct ib_gid_attr gattr; + union ib_gid _gid; + int i; + + for (i = 0; i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) { + memset(&_gid, 0, SMC_GID_SIZE); + memset(&gattr, 0, sizeof(gattr)); + if (ib_query_gid(smcibdev->ibdev, ibport, i, &_gid, &gattr)) + continue; + if (!gattr.ndev) + continue; + if (((!vlan_id && !is_vlan_dev(gattr.ndev)) || + (vlan_id && is_vlan_dev(gattr.ndev) && + vlan_dev_vlan_id(gattr.ndev) == vlan_id)) && + gattr.gid_type == IB_GID_TYPE_IB) { + if (gid) + memcpy(gid, &_gid, SMC_GID_SIZE); + if (sgid_index) + *sgid_index = i; + dev_put(gattr.ndev); + return 0; + } + dev_put(gattr.ndev); + } + return -ENODEV; +} + +static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport) +{ + int rc; + + memset(&smcibdev->pattr[ibport - 1], 0, + sizeof(smcibdev->pattr[ibport - 1])); + rc = ib_query_port(smcibdev->ibdev, ibport, + &smcibdev->pattr[ibport - 1]); + if (rc) + goto out; + /* the SMC protocol requires specification of the RoCE MAC address */ + rc = smc_ib_fill_mac(smcibdev, ibport); + if (rc) + goto out; + if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET, + sizeof(local_systemid)) && + smc_ib_port_active(smcibdev, ibport)) + /* create unique system identifier */ + smc_ib_define_local_systemid(smcibdev, ibport); +out: + return rc; +} + /* process context wrapper for might_sleep smc_ib_remember_port_attr */ static void smc_ib_port_event_work(struct work_struct 
*work) { @@ -370,62 +456,6 @@ void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev, buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address = 0; } -static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport) -{ - struct ib_gid_attr gattr; - int rc; - - rc = ib_query_gid(smcibdev->ibdev, ibport, 0, - &smcibdev->gid[ibport - 1], &gattr); - if (rc || !gattr.ndev) - return -ENODEV; - - memcpy(smcibdev->mac[ibport - 1], gattr.ndev->dev_addr, ETH_ALEN); - dev_put(gattr.ndev); - return 0; -} - -/* Create an identifier unique for this instance of SMC-R. - * The MAC-address of the first active registered IB device - * plus a random 2-byte number is used to create this identifier. - * This name is delivered to the peer during connection initialization. - */ -static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev, - u8 ibport) -{ - memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1], - sizeof(smcibdev->mac[ibport - 1])); - get_random_bytes(&local_systemid[0], 2); -} - -bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport) -{ - return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE; -} - -int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport) -{ - int rc; - - memset(&smcibdev->pattr[ibport - 1], 0, - sizeof(smcibdev->pattr[ibport - 1])); - rc = ib_query_port(smcibdev->ibdev, ibport, - &smcibdev->pattr[ibport - 1]); - if (rc) - goto out; - /* the SMC protocol requires specification of the RoCE MAC address */ - rc = smc_ib_fill_gid_and_mac(smcibdev, ibport); - if (rc) - goto out; - if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET, - sizeof(local_systemid)) && - smc_ib_port_active(smcibdev, ibport)) - /* create unique system identifier */ - smc_ib_define_local_systemid(smcibdev, ibport); -out: - return rc; -} - long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) { struct ib_cq_init_attr cqattr = { @@ -454,9 +484,6 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) 
smcibdev->roce_cq_recv = NULL; goto err; } - INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev, - smc_ib_global_event_handler); - ib_register_event_handler(&smcibdev->event_handler); smc_wr_add_dev(smcibdev); smcibdev->initialized = 1; return rc; @@ -472,7 +499,6 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) return; smcibdev->initialized = 0; smc_wr_remove_dev(smcibdev); - ib_unregister_event_handler(&smcibdev->event_handler); ib_destroy_cq(smcibdev->roce_cq_recv); ib_destroy_cq(smcibdev->roce_cq_send); } @@ -483,6 +509,8 @@ static struct ib_client smc_ib_client; static void smc_ib_add_dev(struct ib_device *ibdev) { struct smc_ib_device *smcibdev; + u8 port_cnt; + int i; if (ibdev->node_type != RDMA_NODE_IB_CA) return; @@ -498,6 +526,21 @@ static void smc_ib_add_dev(struct ib_device *ibdev) list_add_tail(&smcibdev->list, &smc_ib_devices.list); spin_unlock(&smc_ib_devices.lock); ib_set_client_data(ibdev, &smc_ib_client, smcibdev); + INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev, + smc_ib_global_event_handler); + ib_register_event_handler(&smcibdev->event_handler); + + /* trigger reading of the port attributes */ + port_cnt = smcibdev->ibdev->phys_port_cnt; + for (i = 0; + i < min_t(size_t, port_cnt, SMC_MAX_PORTS); + i++) { + set_bit(i, &smcibdev->port_event_mask); + /* determine pnetids of the port */ + smc_pnetid_by_dev_port(ibdev->dev.parent, i, + smcibdev->pnetid[i]); + } + schedule_work(&smcibdev->port_event_work); } /* callback function for ib_register_client() */ @@ -512,6 +555,7 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) spin_unlock(&smc_ib_devices.lock); smc_pnet_remove_by_ibdev(smcibdev); smc_ib_cleanup_per_ibdev(smcibdev); + ib_unregister_event_handler(&smcibdev->event_handler); kfree(smcibdev); } diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index e90630dadf8e..bac7fd65a4c0 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -15,6 +15,7 @@ #include 
<linux/interrupt.h> #include <linux/if_ether.h> #include <rdma/ib_verbs.h> +#include <net/smc.h> #define SMC_MAX_PORTS 2 /* Max # of ports */ #define SMC_GID_SIZE sizeof(union ib_gid) @@ -39,7 +40,8 @@ struct smc_ib_device { /* ib-device infos for smc */ struct tasklet_struct recv_tasklet; /* called by recv cq handler */ char mac[SMC_MAX_PORTS][ETH_ALEN]; /* mac address per port*/ - union ib_gid gid[SMC_MAX_PORTS]; /* gid per port */ + u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN]; + /* pnetid per port */ u8 initialized : 1; /* ib dev CQ, evthdl done */ struct work_struct port_event_work; unsigned long port_event_mask; @@ -51,7 +53,6 @@ struct smc_link; int smc_ib_register_client(void) __init; void smc_ib_unregister_client(void); bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport); -int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport); int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); @@ -75,4 +76,6 @@ void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); +int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, + unsigned short vlan_id, u8 gid[], u8 *sgid_index); #endif diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c new file mode 100644 index 000000000000..cfade7fdcc6d --- /dev/null +++ b/net/smc/smc_ism.c @@ -0,0 +1,314 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Shared Memory Communications Direct over ISM devices (SMC-D) + * + * Functions for ISM device. + * + * Copyright IBM Corp. 
2018 + */ + +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <asm/page.h> + +#include "smc.h" +#include "smc_core.h" +#include "smc_ism.h" +#include "smc_pnet.h" + +struct smcd_dev_list smcd_dev_list = { + .list = LIST_HEAD_INIT(smcd_dev_list.list), + .lock = __SPIN_LOCK_UNLOCKED(smcd_dev_list.lock) +}; + +/* Test if an ISM communication is possible. */ +int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) +{ + return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_id ? 1 : 0, + vlan_id); +} + +int smc_ism_write(struct smcd_dev *smcd, const struct smc_ism_position *pos, + void *data, size_t len) +{ + int rc; + + rc = smcd->ops->move_data(smcd, pos->token, pos->index, pos->signal, + pos->offset, data, len); + + return rc < 0 ? rc : 0; +} + +/* Set a connection using this DMBE. */ +void smc_ism_set_conn(struct smc_connection *conn) +{ + unsigned long flags; + + spin_lock_irqsave(&conn->lgr->smcd->lock, flags); + conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = conn; + spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags); +} + +/* Unset a connection using this DMBE. */ +void smc_ism_unset_conn(struct smc_connection *conn) +{ + unsigned long flags; + + if (!conn->rmb_desc) + return; + + spin_lock_irqsave(&conn->lgr->smcd->lock, flags); + conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = NULL; + spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags); +} + +/* Register a VLAN identifier with the ISM device. Use a reference count + * and add a VLAN identifier only when the first DMB using this VLAN is + * registered. 
+ */ +int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) +{ + struct smc_ism_vlanid *new_vlan, *vlan; + unsigned long flags; + int rc = 0; + + if (!vlanid) /* No valid vlan id */ + return -EINVAL; + + /* create new vlan entry, in case we need it */ + new_vlan = kzalloc(sizeof(*new_vlan), GFP_KERNEL); + if (!new_vlan) + return -ENOMEM; + new_vlan->vlanid = vlanid; + refcount_set(&new_vlan->refcnt, 1); + + /* if there is an existing entry, increase count and return */ + spin_lock_irqsave(&smcd->lock, flags); + list_for_each_entry(vlan, &smcd->vlan, list) { + if (vlan->vlanid == vlanid) { + refcount_inc(&vlan->refcnt); + kfree(new_vlan); + goto out; + } + } + + /* no existing entry found. + * add new entry to device; might fail, e.g., if HW limit reached + */ + if (smcd->ops->add_vlan_id(smcd, vlanid)) { + kfree(new_vlan); + rc = -EIO; + goto out; + } + list_add_tail(&new_vlan->list, &smcd->vlan); +out: + spin_unlock_irqrestore(&smcd->lock, flags); + return rc; +} + +/* Unregister a VLAN identifier with the ISM device. Use a reference count + * and remove a VLAN identifier only when the last DMB using this VLAN is + * unregistered. 
+ */ +int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) +{ + struct smc_ism_vlanid *vlan; + unsigned long flags; + bool found = false; + int rc = 0; + + if (!vlanid) /* No valid vlan id */ + return -EINVAL; + + spin_lock_irqsave(&smcd->lock, flags); + list_for_each_entry(vlan, &smcd->vlan, list) { + if (vlan->vlanid == vlanid) { + if (!refcount_dec_and_test(&vlan->refcnt)) + goto out; + found = true; + break; + } + } + if (!found) { + rc = -ENOENT; + goto out; /* VLAN id not in table */ + } + + /* Found and the last reference just gone */ + if (smcd->ops->del_vlan_id(smcd, vlanid)) + rc = -EIO; + list_del(&vlan->list); + kfree(vlan); +out: + spin_unlock_irqrestore(&smcd->lock, flags); + return rc; +} + +int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) +{ + struct smcd_dmb dmb; + + memset(&dmb, 0, sizeof(dmb)); + dmb.dmb_tok = dmb_desc->token; + dmb.sba_idx = dmb_desc->sba_idx; + dmb.cpu_addr = dmb_desc->cpu_addr; + dmb.dma_addr = dmb_desc->dma_addr; + dmb.dmb_len = dmb_desc->len; + return smcd->ops->unregister_dmb(smcd, &dmb); +} + +int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, + struct smc_buf_desc *dmb_desc) +{ + struct smcd_dmb dmb; + int rc; + + memset(&dmb, 0, sizeof(dmb)); + dmb.dmb_len = dmb_len; + dmb.sba_idx = dmb_desc->sba_idx; + dmb.vlan_id = lgr->vlan_id; + dmb.rgid = lgr->peer_gid; + rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb); + if (!rc) { + dmb_desc->sba_idx = dmb.sba_idx; + dmb_desc->token = dmb.dmb_tok; + dmb_desc->cpu_addr = dmb.cpu_addr; + dmb_desc->dma_addr = dmb.dma_addr; + dmb_desc->len = dmb.dmb_len; + } + return rc; +} + +struct smc_ism_event_work { + struct work_struct work; + struct smcd_dev *smcd; + struct smcd_event event; +}; + +/* worker for SMC-D events */ +static void smc_ism_event_work(struct work_struct *work) +{ + struct smc_ism_event_work *wrk = + container_of(work, struct smc_ism_event_work, work); + + switch (wrk->event.type) { + case ISM_EVENT_GID: /* 
GID event, token is peer GID */ + smc_smcd_terminate(wrk->smcd, wrk->event.tok); + break; + case ISM_EVENT_DMB: + break; + } + kfree(wrk); +} + +static void smcd_release(struct device *dev) +{ + struct smcd_dev *smcd = container_of(dev, struct smcd_dev, dev); + + kfree(smcd->conn); + kfree(smcd); +} + +struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, + const struct smcd_ops *ops, int max_dmbs) +{ + struct smcd_dev *smcd; + + smcd = kzalloc(sizeof(*smcd), GFP_KERNEL); + if (!smcd) + return NULL; + smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *), + GFP_KERNEL); + if (!smcd->conn) { + kfree(smcd); + return NULL; + } + + smcd->dev.parent = parent; + smcd->dev.release = smcd_release; + device_initialize(&smcd->dev); + dev_set_name(&smcd->dev, "%s", name); /* name is data, not a format */ + smcd->ops = ops; + smc_pnetid_by_dev_port(parent, 0, smcd->pnetid); + + spin_lock_init(&smcd->lock); + INIT_LIST_HEAD(&smcd->vlan); + smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s", + WQ_MEM_RECLAIM, name); + return smcd; +} +EXPORT_SYMBOL_GPL(smcd_alloc_dev); + +int smcd_register_dev(struct smcd_dev *smcd) +{ + spin_lock(&smcd_dev_list.lock); + list_add_tail(&smcd->list, &smcd_dev_list.list); + spin_unlock(&smcd_dev_list.lock); + + return device_add(&smcd->dev); +} +EXPORT_SYMBOL_GPL(smcd_register_dev); + +void smcd_unregister_dev(struct smcd_dev *smcd) +{ + spin_lock(&smcd_dev_list.lock); + list_del(&smcd->list); + spin_unlock(&smcd_dev_list.lock); + flush_workqueue(smcd->event_wq); + destroy_workqueue(smcd->event_wq); + smc_smcd_terminate(smcd, 0); + + device_del(&smcd->dev); +} +EXPORT_SYMBOL_GPL(smcd_unregister_dev); + +void smcd_free_dev(struct smcd_dev *smcd) +{ + put_device(&smcd->dev); +} +EXPORT_SYMBOL_GPL(smcd_free_dev); + +/* SMCD Device event handler. Called from ISM device interrupt handler. 
+ * Parameters are smcd device pointer, + * - event->type (0 --> DMB, 1 --> GID), + * - event->code (event code), + * - event->tok (either DMB token when event type 0, or GID when event type 1) + * - event->time (time of day) + * - event->info (debug info). + * + * Context: + * - Function called in IRQ context from ISM device driver event handler. + */ +void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event) +{ + struct smc_ism_event_work *wrk; + + /* copy event to event work queue, and let it be handled there */ + wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC); + if (!wrk) + return; + INIT_WORK(&wrk->work, smc_ism_event_work); + wrk->smcd = smcd; + wrk->event = *event; + queue_work(smcd->event_wq, &wrk->work); +} +EXPORT_SYMBOL_GPL(smcd_handle_event); + +/* SMCD Device interrupt handler. Called from ISM device interrupt handler. + * Parameters are smcd device pointer and DMB number. Find the connection and + * schedule the tasklet for this connection. + * + * Context: + * - Function called in IRQ context from ISM device driver IRQ handler. + */ +void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno) +{ + struct smc_connection *conn = NULL; + unsigned long flags; + + spin_lock_irqsave(&smcd->lock, flags); + conn = smcd->conn[dmbno]; + if (conn) + tasklet_schedule(&conn->rx_tsklet); + spin_unlock_irqrestore(&smcd->lock, flags); +} +EXPORT_SYMBOL_GPL(smcd_handle_irq); diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h new file mode 100644 index 000000000000..aee45b860b79 --- /dev/null +++ b/net/smc/smc_ism.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Shared Memory Communications Direct over ISM devices (SMC-D) + * + * SMC-D ISM device structure definitions. + * + * Copyright IBM Corp. 
2018 + */ + +#ifndef SMCD_ISM_H +#define SMCD_ISM_H + +#include <linux/uio.h> + +#include "smc.h" + +struct smcd_dev_list { /* List of SMCD devices */ + struct list_head list; + spinlock_t lock; /* Protects list of devices */ +}; + +extern struct smcd_dev_list smcd_dev_list; /* list of smcd devices */ + +struct smc_ism_vlanid { /* VLAN id set on ISM device */ + struct list_head list; + unsigned short vlanid; /* Vlan id */ + refcount_t refcnt; /* Reference count */ +}; + +struct smc_ism_position { /* ISM device position to write to */ + u64 token; /* Token of DMB */ + u32 offset; /* Offset into DMBE */ + u8 index; /* Index of DMBE */ + u8 signal; /* Generate interrupt on owner side */ +}; + +struct smcd_dev; + +int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *dev); +void smc_ism_set_conn(struct smc_connection *conn); +void smc_ism_unset_conn(struct smc_connection *conn); +int smc_ism_get_vlan(struct smcd_dev *dev, unsigned short vlan_id); +int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id); +int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size, + struct smc_buf_desc *dmb_desc); +int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc); +int smc_ism_write(struct smcd_dev *dev, const struct smc_ism_position *pos, + void *data, size_t len); +#endif diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 5800a6b43d83..9c916c709ca7 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -182,12 +182,10 @@ static int smc_llc_add_pending_send(struct smc_link *link, } /* high-level API to send LLC confirm link */ -int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[], - union ib_gid *gid, +int smc_llc_send_confirm_link(struct smc_link *link, enum smc_llc_reqresp reqresp) { - struct smc_link_group *lgr = container_of(link, struct smc_link_group, - lnk[SMC_SINGLE_LINK]); + struct smc_link_group *lgr = smc_get_lgr(link); struct smc_llc_msg_confirm_link *confllc; struct smc_wr_tx_pend_priv 
*pend; struct smc_wr_buf *wr_buf; @@ -203,8 +201,9 @@ int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[], confllc->hd.flags |= SMC_LLC_FLAG_NO_RMBE_EYEC; if (reqresp == SMC_LLC_RESP) confllc->hd.flags |= SMC_LLC_FLAG_RESP; - memcpy(confllc->sender_mac, mac, ETH_ALEN); - memcpy(confllc->sender_gid, gid, SMC_GID_SIZE); + memcpy(confllc->sender_mac, link->smcibdev->mac[link->ibport - 1], + ETH_ALEN); + memcpy(confllc->sender_gid, link->gid, SMC_GID_SIZE); hton24(confllc->sender_qp_num, link->roce_qp->qp_num); confllc->link_num = link->link_id; memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE); @@ -241,8 +240,7 @@ static int smc_llc_send_confirm_rkey(struct smc_link *link, /* prepare an add link message */ static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc, - struct smc_link *link, u8 mac[], - union ib_gid *gid, + struct smc_link *link, u8 mac[], u8 gid[], enum smc_llc_reqresp reqresp) { memset(addllc, 0, sizeof(*addllc)); @@ -259,8 +257,7 @@ static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc, } /* send ADD LINK request or response */ -int smc_llc_send_add_link(struct smc_link *link, u8 mac[], - union ib_gid *gid, +int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], enum smc_llc_reqresp reqresp) { struct smc_llc_msg_add_link *addllc; @@ -281,7 +278,7 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], /* prepare a delete link message */ static void smc_llc_prep_delete_link(struct smc_llc_msg_del_link *delllc, struct smc_link *link, - enum smc_llc_reqresp reqresp) + enum smc_llc_reqresp reqresp, bool orderly) { memset(delllc, 0, sizeof(*delllc)); delllc->hd.common.type = SMC_LLC_DELETE_LINK; @@ -290,13 +287,14 @@ static void smc_llc_prep_delete_link(struct smc_llc_msg_del_link *delllc, delllc->hd.flags |= SMC_LLC_FLAG_RESP; /* DEL_LINK_ALL because only 1 link supported */ delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL; - delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; + if (orderly) + 
delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; delllc->link_num = link->link_id; } /* send DELETE LINK request or response */ int smc_llc_send_delete_link(struct smc_link *link, - enum smc_llc_reqresp reqresp) + enum smc_llc_reqresp reqresp, bool orderly) { struct smc_llc_msg_del_link *delllc; struct smc_wr_tx_pend_priv *pend; @@ -307,7 +305,7 @@ int smc_llc_send_delete_link(struct smc_link *link, if (rc) return rc; delllc = (struct smc_llc_msg_del_link *)wr_buf; - smc_llc_prep_delete_link(delllc, link, reqresp); + smc_llc_prep_delete_link(delllc, link, reqresp, orderly); /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; @@ -381,11 +379,9 @@ static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen) static void smc_llc_rx_confirm_link(struct smc_link *link, struct smc_llc_msg_confirm_link *llc) { - struct smc_link_group *lgr; + struct smc_link_group *lgr = smc_get_lgr(link); int conf_rc; - lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); - /* RMBE eyecatchers are not supported */ if (llc->hd.flags & SMC_LLC_FLAG_NO_RMBE_EYEC) conf_rc = 0; @@ -411,8 +407,7 @@ static void smc_llc_rx_confirm_link(struct smc_link *link, static void smc_llc_rx_add_link(struct smc_link *link, struct smc_llc_msg_add_link *llc) { - struct smc_link_group *lgr = container_of(link, struct smc_link_group, - lnk[SMC_SINGLE_LINK]); + struct smc_link_group *lgr = smc_get_lgr(link); if (llc->hd.flags & SMC_LLC_FLAG_RESP) { if (link->state == SMC_LNK_ACTIVATING) @@ -426,14 +421,12 @@ static void smc_llc_rx_add_link(struct smc_link *link, if (lgr->role == SMC_SERV) { smc_llc_prep_add_link(llc, link, link->smcibdev->mac[link->ibport - 1], - &link->smcibdev->gid[link->ibport - 1], - SMC_LLC_REQ); + link->gid, SMC_LLC_REQ); } else { smc_llc_prep_add_link(llc, link, link->smcibdev->mac[link->ibport - 1], - &link->smcibdev->gid[link->ibport - 1], - SMC_LLC_RESP); + link->gid, SMC_LLC_RESP); } smc_llc_send_message(link, llc, sizeof(*llc)); } 
@@ -442,22 +435,23 @@ static void smc_llc_rx_add_link(struct smc_link *link, static void smc_llc_rx_delete_link(struct smc_link *link, struct smc_llc_msg_del_link *llc) { - struct smc_link_group *lgr = container_of(link, struct smc_link_group, - lnk[SMC_SINGLE_LINK]); + struct smc_link_group *lgr = smc_get_lgr(link); if (llc->hd.flags & SMC_LLC_FLAG_RESP) { if (lgr->role == SMC_SERV) - smc_lgr_terminate(lgr); + smc_lgr_schedule_free_work_fast(lgr); } else { + smc_lgr_forget(lgr); + smc_llc_link_deleting(link); if (lgr->role == SMC_SERV) { - smc_lgr_forget(lgr); - smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ); - smc_llc_send_message(link, llc, sizeof(*llc)); + /* client asks to delete this link, send request */ + smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ, true); } else { - smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP); - smc_llc_send_message(link, llc, sizeof(*llc)); - smc_lgr_terminate(lgr); + /* server requests to delete this link, send response */ + smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true); } + smc_llc_send_message(link, llc, sizeof(*llc)); + smc_lgr_schedule_free_work_fast(lgr); } } @@ -476,17 +470,14 @@ static void smc_llc_rx_test_link(struct smc_link *link, static void smc_llc_rx_confirm_rkey(struct smc_link *link, struct smc_llc_msg_confirm_rkey *llc) { - struct smc_link_group *lgr; int rc; - lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { link->llc_confirm_rkey_rc = llc->hd.flags & SMC_LLC_FLAG_RKEY_NEG; complete(&link->llc_confirm_rkey); } else { - rc = smc_rtoken_add(lgr, + rc = smc_rtoken_add(smc_get_lgr(link), llc->rtoken[0].rmb_vaddr, llc->rtoken[0].rmb_key); @@ -514,18 +505,15 @@ static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, static void smc_llc_rx_delete_rkey(struct smc_link *link, struct smc_llc_msg_delete_rkey *llc) { - struct smc_link_group *lgr; u8 err_mask = 0; int i, max; - lgr = container_of(link, struct smc_link_group, 
lnk[SMC_SINGLE_LINK]); - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { /* unused as long as we don't send this type of msg */ } else { max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); for (i = 0; i < max; i++) { - if (smc_rtoken_delete(lgr, llc->rkey[i])) + if (smc_rtoken_delete(smc_get_lgr(link), llc->rkey[i])) err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); } @@ -583,12 +571,10 @@ static void smc_llc_testlink_work(struct work_struct *work) struct smc_link *link = container_of(to_delayed_work(work), struct smc_link, llc_testlink_wrk); unsigned long next_interval; - struct smc_link_group *lgr; unsigned long expire_time; u8 user_data[16] = { 0 }; int rc; - lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); if (link->state != SMC_LNK_ACTIVE) return; /* don't reschedule worker */ expire_time = link->wr_rx_tstamp + link->llc_testlink_time; @@ -602,7 +588,7 @@ static void smc_llc_testlink_work(struct work_struct *work) rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp, SMC_LLC_WAIT_TIME); if (rc <= 0) { - smc_lgr_terminate(lgr); + smc_lgr_terminate(smc_get_lgr(link)); return; } next_interval = link->llc_testlink_time; @@ -613,8 +599,7 @@ out: int smc_llc_link_init(struct smc_link *link) { - struct smc_link_group *lgr = container_of(link, struct smc_link_group, - lnk[SMC_SINGLE_LINK]); + struct smc_link_group *lgr = smc_get_lgr(link); link->llc_wq = alloc_ordered_workqueue("llc_wq-%x:%x)", WQ_MEM_RECLAIM, *((u32 *)lgr->id), link->link_id); @@ -640,6 +625,11 @@ void smc_llc_link_active(struct smc_link *link, int testlink_time) } } +void smc_llc_link_deleting(struct smc_link *link) +{ + link->state = SMC_LNK_DELETING; +} + /* called in tasklet context */ void smc_llc_link_inactive(struct smc_link *link) { diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 65c8645e96a1..9e2ff088e301 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -36,14 +36,15 @@ enum smc_llc_msg_type { }; /* transmit */ -int 
smc_llc_send_confirm_link(struct smc_link *lnk, u8 mac[], union ib_gid *gid, +int smc_llc_send_confirm_link(struct smc_link *lnk, enum smc_llc_reqresp reqresp); -int smc_llc_send_add_link(struct smc_link *link, u8 mac[], union ib_gid *gid, +int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], enum smc_llc_reqresp reqresp); int smc_llc_send_delete_link(struct smc_link *link, - enum smc_llc_reqresp reqresp); + enum smc_llc_reqresp reqresp, bool orderly); int smc_llc_link_init(struct smc_link *link); void smc_llc_link_active(struct smc_link *link, int testlink_time); +void smc_llc_link_deleting(struct smc_link *link); void smc_llc_link_inactive(struct smc_link *link); void smc_llc_link_clear(struct smc_link *link); int smc_llc_do_confirm_rkey(struct smc_link *link, diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index d7b88b2d1b22..01c6ce042a1c 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -22,13 +22,12 @@ #include "smc_pnet.h" #include "smc_ib.h" - -#define SMC_MAX_PNET_ID_LEN 16 /* Max. 
length of PNET id */ +#include "smc_ism.h" static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { [SMC_PNETID_NAME] = { .type = NLA_NUL_STRING, - .len = SMC_MAX_PNET_ID_LEN - 1 + .len = SMC_MAX_PNETID_LEN - 1 }, [SMC_PNETID_ETHNAME] = { .type = NLA_NUL_STRING, @@ -65,7 +64,7 @@ static struct smc_pnettable { */ struct smc_pnetentry { struct list_head list; - char pnet_name[SMC_MAX_PNET_ID_LEN + 1]; + char pnet_name[SMC_MAX_PNETID_LEN + 1]; struct net_device *ndev; struct smc_ib_device *smcibdev; u8 ib_port; @@ -209,7 +208,7 @@ static bool smc_pnetid_valid(const char *pnet_name, char *pnetid) return false; while (--end >= bf && isspace(*end)) ; - if (end - bf >= SMC_MAX_PNET_ID_LEN) + if (end - bf >= SMC_MAX_PNETID_LEN) return false; while (bf <= end) { if (!isalnum(*bf)) @@ -358,9 +357,6 @@ static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info) kfree(pnetelem); return rc; } - rc = smc_ib_remember_port_attr(pnetelem->smcibdev, pnetelem->ib_port); - if (rc) - smc_pnet_remove_by_pnetid(pnetelem->pnet_name); return rc; } @@ -485,10 +481,10 @@ static int smc_pnet_netdev_event(struct notifier_block *this, case NETDEV_REBOOT: case NETDEV_UNREGISTER: smc_pnet_remove_by_ndev(event_dev); + return NOTIFY_OK; default: - break; + return NOTIFY_DONE; } - return NOTIFY_DONE; } static struct notifier_block smc_netdev_notifier = { @@ -515,28 +511,104 @@ void smc_pnet_exit(void) genl_unregister_family(&smc_pnet_nl_family); } -/* PNET table analysis for a given sock: - * determine ib_device and port belonging to used internal TCP socket - * ethernet interface. +/* Determine one base device for stacked net devices. + * If the lower device level contains more than one devices + * (for instance with bonding slaves), just the first device + * is used to reach a base device. 
*/ -void smc_pnet_find_roce_resource(struct sock *sk, - struct smc_ib_device **smcibdev, u8 *ibport) +static struct net_device *pnet_find_base_ndev(struct net_device *ndev) { - struct dst_entry *dst = sk_dst_get(sk); - struct smc_pnetentry *pnetelem; + int i, nest_lvl; - *smcibdev = NULL; - *ibport = 0; + rtnl_lock(); + nest_lvl = dev_get_nest_level(ndev); + for (i = 0; i < nest_lvl; i++) { + struct list_head *lower = &ndev->adj_list.lower; + + if (list_empty(lower)) + break; + lower = lower->next; + ndev = netdev_lower_get_next(ndev, &lower); + } + rtnl_unlock(); + return ndev; +} + +/* Determine the corresponding IB device port based on the hardware PNETID. + * Searching stops at the first matching active IB device port with vlan_id + * configured. + */ +static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, + struct smc_ib_device **smcibdev, + u8 *ibport, unsigned short vlan_id, + u8 gid[]) +{ + u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + struct smc_ib_device *ibdev; + int i; + + ndev = pnet_find_base_ndev(ndev); + if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, + ndev_pnetid)) + return; /* pnetid could not be determined */ + + spin_lock(&smc_ib_devices.lock); + list_for_each_entry(ibdev, &smc_ib_devices.list, list) { + for (i = 1; i <= SMC_MAX_PORTS; i++) { + if (!rdma_is_port_valid(ibdev->ibdev, i)) + continue; + if (!memcmp(ibdev->pnetid[i - 1], ndev_pnetid, + SMC_MAX_PNETID_LEN) && + smc_ib_port_active(ibdev, i) && + !smc_ib_determine_gid(ibdev, i, vlan_id, gid, + NULL)) { + *smcibdev = ibdev; + *ibport = i; + goto out; + } + } + } +out: + spin_unlock(&smc_ib_devices.lock); +} + +static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, + struct smcd_dev **smcismdev) +{ + u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + struct smcd_dev *ismdev; + + ndev = pnet_find_base_ndev(ndev); + if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, + ndev_pnetid)) + return; /* pnetid could not be determined */ + + spin_lock(&smcd_dev_list.lock); + 
list_for_each_entry(ismdev, &smcd_dev_list.list, list) { + if (!memcmp(ismdev->pnetid, ndev_pnetid, SMC_MAX_PNETID_LEN)) { + *smcismdev = ismdev; + break; + } + } + spin_unlock(&smcd_dev_list.lock); +} + +/* Lookup of coupled ib_device via SMC pnet table */ +static void smc_pnet_find_roce_by_table(struct net_device *netdev, + struct smc_ib_device **smcibdev, + u8 *ibport, unsigned short vlan_id, + u8 gid[]) +{ + struct smc_pnetentry *pnetelem; - if (!dst) - return; - if (!dst->dev) - goto out_rel; read_lock(&smc_pnettable.lock); list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) { - if (dst->dev == pnetelem->ndev) { + if (netdev == pnetelem->ndev) { if (smc_ib_port_active(pnetelem->smcibdev, - pnetelem->ib_port)) { + pnetelem->ib_port) && + !smc_ib_determine_gid(pnetelem->smcibdev, + pnetelem->ib_port, vlan_id, + gid, NULL)) { *smcibdev = pnetelem->smcibdev; *ibport = pnetelem->ib_port; } @@ -544,6 +616,55 @@ void smc_pnet_find_roce_resource(struct sock *sk, } } read_unlock(&smc_pnettable.lock); +} + +/* PNET table analysis for a given sock: + * determine ib_device and port belonging to used internal TCP socket + * ethernet interface. 
+ */ +void smc_pnet_find_roce_resource(struct sock *sk, + struct smc_ib_device **smcibdev, u8 *ibport, + unsigned short vlan_id, u8 gid[]) +{ + struct dst_entry *dst = sk_dst_get(sk); + + *smcibdev = NULL; + *ibport = 0; + + if (!dst) + goto out; + if (!dst->dev) + goto out_rel; + + /* if possible, lookup via hardware-defined pnetid */ + smc_pnet_find_roce_by_pnetid(dst->dev, smcibdev, ibport, vlan_id, gid); + if (*smcibdev) + goto out_rel; + + /* lookup via SMC PNET table */ + smc_pnet_find_roce_by_table(dst->dev, smcibdev, ibport, vlan_id, gid); + +out_rel: + dst_release(dst); +out: + return; +} + +void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev) +{ + struct dst_entry *dst = sk_dst_get(sk); + + *smcismdev = NULL; + if (!dst) + goto out; + if (!dst->dev) + goto out_rel; + + /* if possible, lookup via hardware-defined pnetid */ + smc_pnet_find_ism_by_pnetid(dst->dev, smcismdev); + out_rel: dst_release(dst); +out: + return; } diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h index 5a29519db976..8ff777636e32 100644 --- a/net/smc/smc_pnet.h +++ b/net/smc/smc_pnet.h @@ -12,12 +12,29 @@ #ifndef _SMC_PNET_H #define _SMC_PNET_H +#if IS_ENABLED(CONFIG_HAVE_PNETID) +#include <asm/pnet.h> +#endif + struct smc_ib_device; +struct smcd_dev; + +static inline int smc_pnetid_by_dev_port(struct device *dev, + unsigned short port, u8 *pnetid) +{ +#if IS_ENABLED(CONFIG_HAVE_PNETID) + return pnet_id_by_dev_port(dev, port, pnetid); +#else + return -ENOENT; +#endif +} int smc_pnet_init(void) __init; void smc_pnet_exit(void); int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev); void smc_pnet_find_roce_resource(struct sock *sk, - struct smc_ib_device **smcibdev, u8 *ibport); + struct smc_ib_device **smcibdev, u8 *ibport, + unsigned short vlan_id, u8 gid[]); +void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev); #endif diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 3d77b383cccd..bbcf0fe4ae10 100644 --- 
a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -82,8 +82,7 @@ static int smc_rx_update_consumer(struct smc_sock *smc, } } - smc_curs_write(&conn->local_tx_ctrl.cons, smc_curs_read(&cons, conn), - conn); + smc_curs_copy(&conn->local_tx_ctrl.cons, &cons, conn); /* send consumer cursor update if required */ /* similar to advertising new TCP rcv_wnd if required */ @@ -97,8 +96,7 @@ static void smc_rx_update_cons(struct smc_sock *smc, size_t len) struct smc_connection *conn = &smc->conn; union smc_host_cursor cons; - smc_curs_write(&cons, smc_curs_read(&conn->local_tx_ctrl.cons, conn), - conn); + smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); smc_rx_update_consumer(smc, cons, len); } @@ -157,10 +155,8 @@ static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len, struct splice_pipe_desc spd; struct partial_page partial; struct smc_spd_priv *priv; - struct page *page; int bytes; - page = virt_to_page(smc->conn.rmb_desc->cpu_addr); priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; @@ -172,7 +168,7 @@ static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len, spd.nr_pages_max = 1; spd.nr_pages = 1; - spd.pages = &page; + spd.pages = &smc->conn.rmb_desc->pages; spd.partial = &partial; spd.ops = &smc_pipe_ops; spd.spd_release = smc_rx_spd_release; @@ -245,10 +241,7 @@ static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len, if (!(flags & MSG_TRUNC)) rc = memcpy_to_msg(msg, &conn->urg_rx_byte, 1); len = 1; - smc_curs_write(&cons, - smc_curs_read(&conn->local_tx_ctrl.cons, - conn), - conn); + smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); if (smc_curs_diff(conn->rmb_desc->len, &cons, &conn->urg_curs) > 1) conn->urg_rx_skip_pend = true; @@ -305,7 +298,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */ - rcvbuf_base = conn->rmb_desc->cpu_addr; + rcvbuf_base 
= conn->rx_off + conn->rmb_desc->cpu_addr; do { /* while (read_remaining) */ if (read_done >= target || (pipe && read_done)) @@ -370,9 +363,7 @@ copy: continue; } - smc_curs_write(&cons, - smc_curs_read(&conn->local_tx_ctrl.cons, conn), - conn); + smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); /* subsequent splice() calls pick up where previous left */ if (splbytes) smc_curs_add(conn->rmb_desc->len, &cons, splbytes); diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index cee666400752..2f5e324e54b9 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -24,6 +24,7 @@ #include "smc.h" #include "smc_wr.h" #include "smc_cdc.h" +#include "smc_ism.h" #include "smc_tx.h" #define SMC_TX_WORK_DELAY HZ @@ -180,9 +181,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) copylen = min_t(size_t, send_remaining, writespace); /* determine start of sndbuf */ sndbuf_base = conn->sndbuf_desc->cpu_addr; - smc_curs_write(&prep, - smc_curs_read(&conn->tx_curs_prep, conn), - conn); + smc_curs_copy(&prep, &conn->tx_curs_prep, conn); tx_cnt_prep = prep.count; /* determine chunks where to write into sndbuf */ /* either unwrapped case, or 1st chunk of wrapped case */ @@ -213,9 +212,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) smc_sndbuf_sync_sg_for_device(conn); /* update cursors */ smc_curs_add(conn->sndbuf_desc->len, &prep, copylen); - smc_curs_write(&conn->tx_curs_prep, - smc_curs_read(&prep, conn), - conn); + smc_curs_copy(&conn->tx_curs_prep, &prep, conn); /* increased in send tasklet smc_cdc_tx_handler() */ smp_mb__before_atomic(); atomic_sub(copylen, &conn->sndbuf_space); @@ -250,6 +247,24 @@ out_err: /***************************** sndbuf consumer *******************************/ +/* sndbuf consumer: actual data transfer of one target chunk with ISM write */ +int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len, + u32 offset, int signal) +{ + struct smc_ism_position pos; + int rc; + + memset(&pos, 
0, sizeof(pos)); + pos.token = conn->peer_token; + pos.index = conn->peer_rmbe_idx; + pos.offset = conn->tx_off + offset; + pos.signal = signal; + rc = smc_ism_write(conn->lgr->smcd, &pos, data, len); + if (rc) + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + return rc; +} + /* sndbuf consumer: actual data transfer of one target chunk with RDMA write */ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, int num_sges, struct ib_sge sges[]) @@ -297,26 +312,109 @@ static inline void smc_tx_advance_cursors(struct smc_connection *conn, smc_curs_add(conn->sndbuf_desc->len, sent, len); } +/* SMC-R helper for smc_tx_rdma_writes() */ +static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, + size_t src_off, size_t src_len, + size_t dst_off, size_t dst_len) +{ + dma_addr_t dma_addr = + sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl); + struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + int src_len_sum = src_len, dst_len_sum = dst_len; + struct ib_sge sges[SMC_IB_MAX_SEND_SGE]; + int sent_count = src_off; + int srcchunk, dstchunk; + int num_sges; + int rc; + + for (dstchunk = 0; dstchunk < 2; dstchunk++) { + num_sges = 0; + for (srcchunk = 0; srcchunk < 2; srcchunk++) { + sges[srcchunk].addr = dma_addr + src_off; + sges[srcchunk].length = src_len; + sges[srcchunk].lkey = link->roce_pd->local_dma_lkey; + num_sges++; + + src_off += src_len; + if (src_off >= conn->sndbuf_desc->len) + src_off -= conn->sndbuf_desc->len; + /* modulo in send ring */ + if (src_len_sum == dst_len) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + src_len = dst_len - src_len; /* remainder */ + src_len_sum += src_len; + } + rc = smc_tx_rdma_write(conn, dst_off, num_sges, sges); + if (rc) + return rc; + if (dst_len_sum == len) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + dst_off = 0; /* modulo offset in RMBE ring buffer */ + dst_len = len - 
dst_len; /* remainder */ + dst_len_sum += dst_len; + src_len = min_t(int, dst_len, conn->sndbuf_desc->len - + sent_count); + src_len_sum = src_len; + } + return 0; +} + +/* SMC-D helper for smc_tx_rdma_writes() */ +static int smcd_tx_rdma_writes(struct smc_connection *conn, size_t len, + size_t src_off, size_t src_len, + size_t dst_off, size_t dst_len) +{ + int src_len_sum = src_len, dst_len_sum = dst_len; + int srcchunk, dstchunk; + int rc; + + for (dstchunk = 0; dstchunk < 2; dstchunk++) { + for (srcchunk = 0; srcchunk < 2; srcchunk++) { + void *data = conn->sndbuf_desc->cpu_addr + src_off; + + rc = smcd_tx_ism_write(conn, data, src_len, dst_off + + sizeof(struct smcd_cdc_msg), 0); + if (rc) + return rc; + dst_off += src_len; + src_off += src_len; + if (src_off >= conn->sndbuf_desc->len) + src_off -= conn->sndbuf_desc->len; + /* modulo in send ring */ + if (src_len_sum == dst_len) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + src_len = dst_len - src_len; /* remainder */ + src_len_sum += src_len; + } + if (dst_len_sum == len) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + dst_off = 0; /* modulo offset in RMBE ring buffer */ + dst_len = len - dst_len; /* remainder */ + dst_len_sum += dst_len; + src_len = min_t(int, dst_len, conn->sndbuf_desc->len - src_off); + src_len_sum = src_len; + } + return 0; +} + /* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit; * usable snd_wnd as max transmit */ static int smc_tx_rdma_writes(struct smc_connection *conn) { - size_t src_off, src_len, dst_off, dst_len; /* current chunk values */ - size_t len, dst_len_sum, src_len_sum, dstchunk, srcchunk; + size_t len, src_len, dst_off, dst_len; /* current chunk values */ union smc_host_cursor sent, prep, prod, cons; - struct ib_sge sges[SMC_IB_MAX_SEND_SGE]; - struct smc_link_group *lgr = conn->lgr; struct smc_cdc_producer_flags *pflags; int to_send, rmbespace; - struct smc_link 
*link; - dma_addr_t dma_addr; - int num_sges; int rc; /* source: sndbuf */ - smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn); - smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn); + smc_curs_copy(&sent, &conn->tx_curs_sent, conn); + smc_curs_copy(&prep, &conn->tx_curs_prep, conn); /* cf. wmem_alloc - (snd_max - snd_una) */ to_send = smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); if (to_send <= 0) @@ -327,12 +425,8 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) rmbespace = atomic_read(&conn->peer_rmbe_space); if (rmbespace <= 0) return 0; - smc_curs_write(&prod, - smc_curs_read(&conn->local_tx_ctrl.prod, conn), - conn); - smc_curs_write(&cons, - smc_curs_read(&conn->local_rx_ctrl.cons, conn), - conn); + smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn); + smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); /* if usable snd_wnd closes ask peer to advertise once it opens again */ pflags = &conn->local_tx_ctrl.prod_flags; @@ -341,7 +435,6 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) len = min(to_send, rmbespace); /* initialize variables for first iteration of subsequent nested loop */ - link = &lgr->lnk[SMC_SINGLE_LINK]; dst_off = prod.count; if (prod.wrap == cons.wrap) { /* the filled destination area is unwrapped, @@ -358,8 +451,6 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) */ dst_len = len; } - dst_len_sum = dst_len; - src_off = sent.count; /* dst_len determines the maximum src_len */ if (sent.count + dst_len <= conn->sndbuf_desc->len) { /* unwrapped src case: single chunk of entire dst_len */ @@ -368,51 +459,23 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) /* wrapped src case: 2 chunks of sum dst_len; start with 1st: */ src_len = conn->sndbuf_desc->len - sent.count; } - src_len_sum = src_len; - dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl); - for (dstchunk = 0; dstchunk < 2; dstchunk++) { - num_sges = 0; - for (srcchunk = 
0; srcchunk < 2; srcchunk++) { - sges[srcchunk].addr = dma_addr + src_off; - sges[srcchunk].length = src_len; - sges[srcchunk].lkey = link->roce_pd->local_dma_lkey; - num_sges++; - src_off += src_len; - if (src_off >= conn->sndbuf_desc->len) - src_off -= conn->sndbuf_desc->len; - /* modulo in send ring */ - if (src_len_sum == dst_len) - break; /* either on 1st or 2nd iteration */ - /* prepare next (== 2nd) iteration */ - src_len = dst_len - src_len; /* remainder */ - src_len_sum += src_len; - } - rc = smc_tx_rdma_write(conn, dst_off, num_sges, sges); - if (rc) - return rc; - if (dst_len_sum == len) - break; /* either on 1st or 2nd iteration */ - /* prepare next (== 2nd) iteration */ - dst_off = 0; /* modulo offset in RMBE ring buffer */ - dst_len = len - dst_len; /* remainder */ - dst_len_sum += dst_len; - src_len = min_t(int, - dst_len, conn->sndbuf_desc->len - sent.count); - src_len_sum = src_len; - } + + if (conn->lgr->is_smcd) + rc = smcd_tx_rdma_writes(conn, len, sent.count, src_len, + dst_off, dst_len); + else + rc = smcr_tx_rdma_writes(conn, len, sent.count, src_len, + dst_off, dst_len); + if (rc) + return rc; if (conn->urg_tx_pend && len == to_send) pflags->urg_data_present = 1; smc_tx_advance_cursors(conn, &prod, &sent, len); /* update connection's cursors with advanced local cursors */ - smc_curs_write(&conn->local_tx_ctrl.prod, - smc_curs_read(&prod, conn), - conn); + smc_curs_copy(&conn->local_tx_ctrl.prod, &prod, conn); /* dst: peer RMBE */ - smc_curs_write(&conn->tx_curs_sent, - smc_curs_read(&sent, conn), - conn); - /* src: local sndbuf */ + smc_curs_copy(&conn->tx_curs_sent, &sent, conn);/* src: local sndbuf */ return 0; } @@ -420,7 +483,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) /* Wakeup sndbuf consumers from any context (IRQ or process) * since there is more data to transmit; usable snd_wnd as max transmit */ -int smc_tx_sndbuf_nonempty(struct smc_connection *conn) +static int smcr_tx_sndbuf_nonempty(struct smc_connection 
*conn) { struct smc_cdc_producer_flags *pflags; struct smc_cdc_tx_pend *pend; @@ -467,6 +530,37 @@ out_unlock: return rc; } +static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn) +{ + struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags; + int rc = 0; + + spin_lock_bh(&conn->send_lock); + if (!pflags->urg_data_present) + rc = smc_tx_rdma_writes(conn); + if (!rc) + rc = smcd_cdc_msg_send(conn); + + if (!rc && pflags->urg_data_present) { + pflags->urg_data_pending = 0; + pflags->urg_data_present = 0; + } + spin_unlock_bh(&conn->send_lock); + return rc; +} + +int smc_tx_sndbuf_nonempty(struct smc_connection *conn) +{ + int rc; + + if (conn->lgr->is_smcd) + rc = smcd_tx_sndbuf_nonempty(conn); + else + rc = smcr_tx_sndbuf_nonempty(conn); + + return rc; +} + /* Wakeup sndbuf consumers from process context * since there is more data to transmit */ @@ -495,21 +589,23 @@ out: void smc_tx_consumer_update(struct smc_connection *conn, bool force) { - union smc_host_cursor cfed, cons; + union smc_host_cursor cfed, cons, prod; + int sender_free = conn->rmb_desc->len; int to_confirm; - smc_curs_write(&cons, - smc_curs_read(&conn->local_tx_ctrl.cons, conn), - conn); - smc_curs_write(&cfed, - smc_curs_read(&conn->rx_curs_confirmed, conn), - conn); + smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); + smc_curs_copy(&cfed, &conn->rx_curs_confirmed, conn); to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons); + if (to_confirm > conn->rmbe_update_limit) { + smc_curs_copy(&prod, &conn->local_rx_ctrl.prod, conn); + sender_free = conn->rmb_desc->len - + smc_curs_diff(conn->rmb_desc->len, &prod, &cfed); + } if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || force || ((to_confirm > conn->rmbe_update_limit) && - ((to_confirm > (conn->rmb_desc->len / 2)) || + ((sender_free <= (conn->rmb_desc->len / 2)) || conn->local_rx_ctrl.prod_flags.write_blocked))) { if ((smc_cdc_get_slot_and_msg_send(conn) < 0) && conn->alert_token_local) { /* connection 
healthy */ @@ -517,9 +613,8 @@ void smc_tx_consumer_update(struct smc_connection *conn, bool force) SMC_TX_WORK_DELAY); return; } - smc_curs_write(&conn->rx_curs_confirmed, - smc_curs_read(&conn->local_tx_ctrl.cons, conn), - conn); + smc_curs_copy(&conn->rx_curs_confirmed, + &conn->local_tx_ctrl.cons, conn); conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; } if (conn->local_rx_ctrl.prod_flags.write_blocked && diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h index 9d2238909fa0..07e6ad76224a 100644 --- a/net/smc/smc_tx.h +++ b/net/smc/smc_tx.h @@ -22,8 +22,8 @@ static inline int smc_tx_prepared_sends(struct smc_connection *conn) { union smc_host_cursor sent, prep; - smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn); - smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn); + smc_curs_copy(&sent, &conn->tx_curs_sent, conn); + smc_curs_copy(&prep, &conn->tx_curs_prep, conn); return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); } @@ -33,5 +33,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len); int smc_tx_sndbuf_nonempty(struct smc_connection *conn); void smc_tx_sndbuf_nonfull(struct smc_sock *smc); void smc_tx_consumer_update(struct smc_connection *conn, bool force); +int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len, + u32 offset, int signal); #endif /* SMC_TX_H */ diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index dbd2605d1962..f856b8402b3f 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -92,8 +92,6 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask)) return; if (wc->status) { - struct smc_link_group *lgr; - for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) { /* clear full struct smc_wr_tx_pend including .priv */ memset(&link->wr_tx_pends[i], 0, @@ -103,9 +101,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) clear_bit(i, link->wr_tx_mask); } /* terminate connections of this 
link group abnormally */ - lgr = container_of(link, struct smc_link_group, - lnk[SMC_SINGLE_LINK]); - smc_lgr_terminate(lgr); + smc_lgr_terminate(smc_get_lgr(link)); } if (pnd_snd.handler) pnd_snd.handler(&pnd_snd.priv, link, wc->status); @@ -186,18 +182,14 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, if (rc) return rc; } else { - struct smc_link_group *lgr; - - lgr = container_of(link, struct smc_link_group, - lnk[SMC_SINGLE_LINK]); rc = wait_event_timeout( link->wr_tx_wait, - list_empty(&lgr->list) || /* lgr terminated */ + link->state == SMC_LNK_INACTIVE || (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), SMC_WR_TX_WAIT_FREE_SLOT_TIME); if (!rc) { /* timeout - terminate connections */ - smc_lgr_terminate(lgr); + smc_lgr_terminate(smc_get_lgr(link)); return -EPIPE; } if (idx == link->wr_tx_cnt) @@ -250,12 +242,8 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], &failed_wr); if (rc) { - struct smc_link_group *lgr = - container_of(link, struct smc_link_group, - lnk[SMC_SINGLE_LINK]); - smc_wr_tx_put_slot(link, priv); - smc_lgr_terminate(lgr); + smc_lgr_terminate(smc_get_lgr(link)); } return rc; } @@ -283,11 +271,7 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) SMC_WR_REG_MR_WAIT_TIME); if (!rc) { /* timeout - terminate connections */ - struct smc_link_group *lgr; - - lgr = container_of(link, struct smc_link_group, - lnk[SMC_SINGLE_LINK]); - smc_lgr_terminate(lgr); + smc_lgr_terminate(smc_get_lgr(link)); return -EPIPE; } if (rc == -ERESTARTSYS) @@ -380,8 +364,6 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) smc_wr_rx_demultiplex(&wc[i]); smc_wr_rx_post(link); /* refill WR RX */ } else { - struct smc_link_group *lgr; - /* handle status errors */ switch (wc[i].status) { case IB_WC_RETRY_EXC_ERR: @@ -390,9 +372,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) /* terminate connections of this link 
group * abnormally */ - lgr = container_of(link, struct smc_link_group, - lnk[SMC_SINGLE_LINK]); - smc_lgr_terminate(lgr); + smc_lgr_terminate(smc_get_lgr(link)); break; default: smc_wr_rx_post(link); /* refill WR RX */ diff --git a/net/socket.c b/net/socket.c index 8a109012608a..85633622c94d 100644 --- a/net/socket.c +++ b/net/socket.c @@ -117,10 +117,8 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from); static int sock_mmap(struct file *file, struct vm_area_struct *vma); static int sock_close(struct inode *inode, struct file *file); -static struct wait_queue_head *sock_get_poll_head(struct file *file, - __poll_t events); -static __poll_t sock_poll_mask(struct file *file, __poll_t); -static __poll_t sock_poll(struct file *file, struct poll_table_struct *wait); +static __poll_t sock_poll(struct file *file, + struct poll_table_struct *wait); static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #ifdef CONFIG_COMPAT static long compat_sock_ioctl(struct file *file, @@ -143,8 +141,6 @@ static const struct file_operations socket_file_ops = { .llseek = no_llseek, .read_iter = sock_read_iter, .write_iter = sock_write_iter, - .get_poll_head = sock_get_poll_head, - .poll_mask = sock_poll_mask, .poll = sock_poll, .unlocked_ioctl = sock_ioctl, #ifdef CONFIG_COMPAT @@ -1130,48 +1126,16 @@ out_release: } EXPORT_SYMBOL(sock_create_lite); -static struct wait_queue_head *sock_get_poll_head(struct file *file, - __poll_t events) -{ - struct socket *sock = file->private_data; - - if (!sock->ops->poll_mask) - return NULL; - sock_poll_busy_loop(sock, events); - return sk_sleep(sock->sk); -} - -static __poll_t sock_poll_mask(struct file *file, __poll_t events) -{ - struct socket *sock = file->private_data; - - /* - * We need to be sure we are in sync with the socket flags modification. - * - * This memory barrier is paired in the wq_has_sleeper. 
- */ - smp_mb(); - - /* this socket can poll_ll so tell the system call */ - return sock->ops->poll_mask(sock, events) | - (sk_can_busy_loop(sock->sk) ? POLL_BUSY_LOOP : 0); -} - /* No kernel lock held - perfect */ static __poll_t sock_poll(struct file *file, poll_table *wait) { struct socket *sock = file->private_data; - __poll_t events = poll_requested_events(wait), mask = 0; - - if (sock->ops->poll) { - sock_poll_busy_loop(sock, events); - mask = sock->ops->poll(file, sock, wait); - } else if (sock->ops->poll_mask) { - sock_poll_wait(file, sock_get_poll_head(file, events), wait); - mask = sock->ops->poll_mask(sock, events); - } + __poll_t events = poll_requested_events(wait); - return mask | sock_poll_busy_flag(sock); + sock_poll_busy_loop(sock, events); + if (!sock->ops->poll) + return 0; + return sock->ops->poll(file, sock, wait) | sock_poll_busy_flag(sock); } static int sock_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 373836615c57..3a512936eea9 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -35,7 +35,6 @@ struct _strp_msg { */ struct strp_msg strp; int accum_len; - int early_eaten; }; static inline struct _strp_msg *_strp_msg(struct sk_buff *skb) @@ -115,20 +114,6 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, head = strp->skb_head; if (head) { /* Message already in progress */ - - stm = _strp_msg(head); - if (unlikely(stm->early_eaten)) { - /* Already some number of bytes on the receive sock - * data saved in skb_head, just indicate they - * are consumed. - */ - eaten = orig_len <= stm->early_eaten ? - orig_len : stm->early_eaten; - stm->early_eaten -= eaten; - - return eaten; - } - if (unlikely(orig_offset)) { /* Getting data with a non-zero offset when a message is * in progress is not expected. 
If it does happen, we @@ -155,11 +140,13 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, /* We are going to append to the frags_list of head. * Need to unshare the frag_list. */ - err = skb_unclone(head, GFP_ATOMIC); - if (err) { - STRP_STATS_INCR(strp->stats.mem_fail); - desc->error = err; - return 0; + if (skb_has_frag_list(head)) { + err = skb_unclone(head, GFP_ATOMIC); + if (err) { + STRP_STATS_INCR(strp->stats.mem_fail); + desc->error = err; + return 0; + } } if (unlikely(skb_shinfo(head)->frag_list)) { @@ -216,14 +203,16 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, memset(stm, 0, sizeof(*stm)); stm->strp.offset = orig_offset + eaten; } else { - /* Unclone since we may be appending to an skb that we + /* Unclone if we are appending to an skb that we * already share a frag_list with. */ - err = skb_unclone(skb, GFP_ATOMIC); - if (err) { - STRP_STATS_INCR(strp->stats.mem_fail); - desc->error = err; - break; + if (skb_has_frag_list(skb)) { + err = skb_unclone(skb, GFP_ATOMIC); + if (err) { + STRP_STATS_INCR(strp->stats.mem_fail); + desc->error = err; + break; + } } stm = _strp_msg(head); @@ -297,9 +286,9 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, } stm->accum_len += cand_len; + eaten += cand_len; strp->need_bytes = stm->strp.full_len - stm->accum_len; - stm->early_eaten = cand_len; STRP_STATS_ADD(strp->stats.bytes, cand_len); desc->count = 0; /* Stop reading socket */ break; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 3c85af058227..3fabf9f6a0f9 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -987,8 +987,6 @@ bool xprt_prepare_transmit(struct rpc_task *task) task->tk_status = -EAGAIN; goto out_unlock; } - if (!bc_prealloc(req) && !req->rq_xmit_bytes_sent) - req->rq_xid = xprt_alloc_xid(xprt); ret = true; out_unlock: spin_unlock_bh(&xprt->transport_lock); @@ -1298,7 +1296,12 @@ void xprt_retry_reserve(struct rpc_task *task) static inline __be32 
xprt_alloc_xid(struct rpc_xprt *xprt) { - return (__force __be32)xprt->xid++; + __be32 xid; + + spin_lock(&xprt->reserve_lock); + xid = (__force __be32)xprt->xid++; + spin_unlock(&xprt->reserve_lock); + return xid; } static inline void xprt_init_xid(struct rpc_xprt *xprt) @@ -1316,6 +1319,7 @@ void xprt_request_init(struct rpc_task *task) req->rq_task = task; req->rq_xprt = xprt; req->rq_buffer = NULL; + req->rq_xid = xprt_alloc_xid(xprt); req->rq_connect_cookie = xprt->connect_cookie - 1; req->rq_bytes_sent = 0; req->rq_snd_buf.len = 0; diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 2dfb492a7c94..418f03d0be90 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -395,6 +395,7 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, tipc_net_init(net, node_id, 0); } if (!tipc_own_id(net)) { + dev_put(dev); pr_warn("Failed to obtain node identity\n"); return -EINVAL; } @@ -610,6 +611,7 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, case NETDEV_CHANGE: if (netif_carrier_ok(dev)) break; + /* else: fall through */ case NETDEV_UP: test_and_set_bit_lock(0, &b->up); break; diff --git a/net/tipc/discover.c b/net/tipc/discover.c index 9f666e0650e2..2830709957bd 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -133,6 +133,8 @@ static void disc_dupl_alert(struct tipc_bearer *b, u32 node_addr, } /* tipc_disc_addr_trial(): - handle an address uniqueness trial from peer + * Returns true if message should be dropped by caller, i.e., if it is a + * trial message or we are inside trial period. Otherwise false. 
*/ static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d, struct tipc_media_addr *maddr, @@ -168,8 +170,9 @@ static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d, msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); } + /* Accept regular link requests/responses only after trial period */ if (mtyp != DSC_TRIAL_MSG) - return false; + return trial; sugg_addr = tipc_node_try_addr(net, peer_id, src); if (sugg_addr) @@ -284,7 +287,6 @@ static void tipc_disc_timeout(struct timer_list *t) { struct tipc_discoverer *d = from_timer(d, t, timer); struct tipc_net *tn = tipc_net(d->net); - u32 self = tipc_own_addr(d->net); struct tipc_media_addr maddr; struct sk_buff *skb = NULL; struct net *net = d->net; @@ -298,12 +300,14 @@ static void tipc_disc_timeout(struct timer_list *t) goto exit; } - /* Did we just leave the address trial period ? */ - if (!self && !time_before(jiffies, tn->addr_trial_end)) { - self = tn->trial_addr; - tipc_net_finalize(net, self); - msg_set_prevnode(buf_msg(d->skb), self); + /* Trial period over ? */ + if (!time_before(jiffies, tn->addr_trial_end)) { + /* Did we just leave it ? 
*/ + if (!tipc_own_addr(net)) + tipc_net_finalize(net, tn->trial_addr); + msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); + msg_set_prevnode(buf_msg(d->skb), tipc_own_addr(net)); } /* Adjust timeout interval according to discovery phase */ diff --git a/net/tipc/group.c b/net/tipc/group.c index d7a7befeddd4..e82f13cb2dc5 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -159,11 +159,6 @@ u32 tipc_group_exclude(struct tipc_group *grp) return 0; } -int tipc_group_size(struct tipc_group *grp) -{ - return grp->member_cnt; -} - struct tipc_group *tipc_group_create(struct net *net, u32 portid, struct tipc_group_req *mreq, bool *group_is_open) @@ -232,8 +227,8 @@ void tipc_group_delete(struct net *net, struct tipc_group *grp) kfree(grp); } -struct tipc_member *tipc_group_find_member(struct tipc_group *grp, - u32 node, u32 port) +static struct tipc_member *tipc_group_find_member(struct tipc_group *grp, + u32 node, u32 port) { struct rb_node *n = grp->members.rb_node; u64 nkey, key = (u64)node << 32 | port; @@ -918,3 +913,35 @@ void tipc_group_member_evt(struct tipc_group *grp, } *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); } + +int tipc_group_fill_sock_diag(struct tipc_group *grp, struct sk_buff *skb) +{ + struct nlattr *group = nla_nest_start(skb, TIPC_NLA_SOCK_GROUP); + + if (nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_ID, + grp->type) || + nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_INSTANCE, + grp->instance) || + nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_BC_SEND_NEXT, + grp->bc_snd_nxt)) + goto group_msg_cancel; + + if (grp->scope == TIPC_NODE_SCOPE) + if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_NODE_SCOPE)) + goto group_msg_cancel; + + if (grp->scope == TIPC_CLUSTER_SCOPE) + if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_CLUSTER_SCOPE)) + goto group_msg_cancel; + + if (*grp->open) + if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_OPEN)) + goto group_msg_cancel; + + nla_nest_end(skb, group); + return 0; + +group_msg_cancel: + nla_nest_cancel(skb, group); + return -1; +} diff --git a/net/tipc/group.h 
b/net/tipc/group.h index 5996af6e9f1d..76b4e5a7b39d 100644 --- a/net/tipc/group.h +++ b/net/tipc/group.h @@ -72,4 +72,5 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, u32 port, struct sk_buff_head *xmitq); u16 tipc_group_bc_snd_nxt(struct tipc_group *grp); void tipc_group_update_member(struct tipc_member *m, int len); +int tipc_group_fill_sock_diag(struct tipc_group *grp, struct sk_buff *skb); #endif diff --git a/net/tipc/link.c b/net/tipc/link.c index 695acb783969..b1f0bee54eac 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -106,7 +106,8 @@ struct tipc_stats { * @backlogq: queue for messages waiting to be sent * @snt_nxt: next sequence number to use for outbound messages * @last_retransmitted: sequence number of most recently retransmitted message - * @stale_count: # of identical retransmit requests made by peer + * @stale_cnt: counter for number of identical retransmit attempts + * @stale_limit: time when repeated identical retransmits must force link reset * @ackers: # of peers that needs to ack each packet before it can be released * @acked: # last packet acked by a certain peer. Used for broadcast. * @rcv_nxt: next sequence number to expect for inbound messages @@ -127,14 +128,17 @@ struct tipc_link { struct net *net; /* Management and link supervision data */ - u32 peer_session; - u32 session; + u16 peer_session; + u16 session; + u16 snd_nxt_state; + u16 rcv_nxt_state; u32 peer_bearer_id; u32 bearer_id; u32 tolerance; u32 abort_limit; u32 state; u16 peer_caps; + bool in_session; bool active; u32 silent_intv_cnt; char if_name[TIPC_MAX_IF_NAME]; @@ -161,7 +165,8 @@ struct tipc_link { u16 snd_nxt; u16 last_retransm; u16 window; - u32 stale_count; + u16 stale_cnt; + unsigned long stale_limit; /* Reception */ u16 rcv_nxt; @@ -212,11 +217,6 @@ enum { */ #define TIPC_NACK_INTV (TIPC_MIN_LINK_WIN * 2) -/* Wildcard value for link session numbers. When it is known that - * peer endpoint is down, any session number must be accepted. 
- */ -#define ANY_SESSION 0x10000 - /* Link FSM states: */ enum { @@ -297,11 +297,6 @@ static bool link_is_bc_rcvlink(struct tipc_link *l) return ((l->bc_rcvlink == l) && !link_is_bc_sndlink(l)); } -int tipc_link_is_active(struct tipc_link *l) -{ - return l->active; -} - void tipc_link_set_active(struct tipc_link *l, bool active) { l->active = active; @@ -337,6 +332,11 @@ char tipc_link_plane(struct tipc_link *l) return l->net_plane; } +void tipc_link_update_caps(struct tipc_link *l, u16 capabilities) +{ + l->peer_caps = capabilities; +} + void tipc_link_add_bc_peer(struct tipc_link *snd_l, struct tipc_link *uc_l, struct sk_buff_head *xmitq) @@ -373,7 +373,7 @@ int tipc_link_bc_peers(struct tipc_link *l) return l->ackers; } -u16 link_bc_rcv_gap(struct tipc_link *l) +static u16 link_bc_rcv_gap(struct tipc_link *l) { struct sk_buff *skb = skb_peek(&l->deferdq); u16 gap = 0; @@ -469,7 +469,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, l->addr = peer; l->peer_caps = peer_caps; l->net = net; - l->peer_session = ANY_SESSION; + l->in_session = false; l->bearer_id = bearer_id; l->tolerance = tolerance; l->net_plane = net_plane; @@ -820,7 +820,7 @@ static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr) * Wake up a number of waiting users, as permitted by available space * in the send queue */ -void link_prepare_wakeup(struct tipc_link *l) +static void link_prepare_wakeup(struct tipc_link *l) { struct sk_buff *skb, *tmp; int imp, i = 0; @@ -838,7 +838,7 @@ void link_prepare_wakeup(struct tipc_link *l) void tipc_link_reset(struct tipc_link *l) { - l->peer_session = ANY_SESSION; + l->in_session = false; l->session++; l->mtu = l->advertised_mtu; __skb_queue_purge(&l->transmq); @@ -857,10 +857,12 @@ void tipc_link_reset(struct tipc_link *l) l->rcv_unacked = 0; l->snd_nxt = 1; l->rcv_nxt = 1; + l->snd_nxt_state = 1; + l->rcv_nxt_state = 1; l->acked = 0; l->silent_intv_cnt = 0; l->rst_cnt = 0; - l->stale_count = 0; + l->stale_cnt = 0; 
l->bc_peer_is_up = false; memset(&l->mon_state, 0, sizeof(l->mon_state)); tipc_link_reset_stats(l); @@ -954,7 +956,8 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, return rc; } -void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq) +static void tipc_link_advance_backlog(struct tipc_link *l, + struct sk_buff_head *xmitq) { struct sk_buff *skb, *_skb; struct tipc_msg *hdr; @@ -997,39 +1000,41 @@ static void link_retransmit_failure(struct tipc_link *l, struct sk_buff *skb) msg_seqno(hdr), msg_prevnode(hdr), msg_orignode(hdr)); } -int tipc_link_retrans(struct tipc_link *l, struct tipc_link *nacker, - u16 from, u16 to, struct sk_buff_head *xmitq) +/* tipc_link_retrans() - retransmit one or more packets + * @l: the link to transmit on + * @r: the receiving link ordering the retransmit. Same as l if unicast + * @from: retransmit from (inclusive) this sequence number + * @to: retransmit to (inclusive) this sequence number + * xmitq: queue for accumulating the retransmitted packets + */ +static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r, + u16 from, u16 to, struct sk_buff_head *xmitq) { struct sk_buff *_skb, *skb = skb_peek(&l->transmq); - struct tipc_msg *hdr; - u16 ack = l->rcv_nxt - 1; u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; + u16 ack = l->rcv_nxt - 1; + struct tipc_msg *hdr; if (!skb) return 0; /* Detect repeated retransmit failures on same packet */ - if (nacker->last_retransm != buf_seqno(skb)) { - nacker->last_retransm = buf_seqno(skb); - nacker->stale_count = 1; - } else if (++nacker->stale_count > 100) { + if (r->last_retransm != buf_seqno(skb)) { + r->last_retransm = buf_seqno(skb); + r->stale_limit = jiffies + msecs_to_jiffies(l->tolerance); + } else if (++r->stale_cnt > 99 && time_after(jiffies, r->stale_limit)) { link_retransmit_failure(l, skb); - nacker->stale_count = 0; if (link_is_bc_sndlink(l)) return TIPC_LINK_DOWN_EVT; return tipc_link_fsm_evt(l, LINK_FAILURE_EVT); } - /* Move forward 
to where retransmission should start */ skb_queue_walk(&l->transmq, skb) { - if (!less(buf_seqno(skb), from)) - break; - } - - skb_queue_walk_from(&l->transmq, skb) { - if (more(buf_seqno(skb), to)) - break; hdr = buf_msg(skb); + if (less(msg_seqno(hdr), from)) + continue; + if (more(msg_seqno(hdr), to)) + break; _skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC); if (!_skb) return 0; @@ -1063,6 +1068,7 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, skb_queue_tail(mc_inputq, skb); return true; } + /* else: fall through */ case CONN_MANAGER: skb_queue_tail(inputq, skb); return true; @@ -1271,6 +1277,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, /* Forward queues and wake up waiting users */ if (likely(tipc_link_release_pkts(l, msg_ack(hdr)))) { + l->stale_cnt = 0; tipc_link_advance_backlog(l, xmitq); if (unlikely(!skb_queue_empty(&l->wakeupq))) link_prepare_wakeup(l); @@ -1347,6 +1354,8 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, msg_set_seqno(hdr, l->snd_nxt + U16_MAX / 2); if (mtyp == STATE_MSG) { + if (l->peer_caps & TIPC_LINK_PROTO_SEQNO) + msg_set_seqno(hdr, l->snd_nxt_state++); msg_set_seq_gap(hdr, rcvgap); msg_set_bc_gap(hdr, link_bc_rcv_gap(bcl)); msg_set_probe(hdr, probe); @@ -1438,6 +1447,44 @@ tnl: } } +/* tipc_link_validate_msg(): validate message against current link state + * Returns true if message should be accepted, otherwise false + */ +bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr) +{ + u16 curr_session = l->peer_session; + u16 session = msg_session(hdr); + int mtyp = msg_type(hdr); + + if (msg_user(hdr) != LINK_PROTOCOL) + return true; + + switch (mtyp) { + case RESET_MSG: + if (!l->in_session) + return true; + /* Accept only RESET with new session number */ + return more(session, curr_session); + case ACTIVATE_MSG: + if (!l->in_session) + return true; + /* Accept only ACTIVATE with new or current session number */ + return !less(session, 
curr_session); + case STATE_MSG: + /* Accept only STATE with current session number */ + if (!l->in_session) + return false; + if (session != curr_session) + return false; + if (!(l->peer_caps & TIPC_LINK_PROTO_SEQNO)) + return true; + /* Accept only STATE with new sequence number */ + return !less(msg_seqno(hdr), l->rcv_nxt_state); + default: + return false; + } +} + /* tipc_link_proto_rcv(): receive link level protocol message : * Note that network plane id propagates through the network, and may * change at any time. The node with lowest numerical id determines @@ -1471,17 +1518,12 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, hdr = buf_msg(skb); data = msg_data(hdr); + if (!tipc_link_validate_msg(l, hdr)) + goto exit; + switch (mtyp) { case RESET_MSG: - - /* Ignore duplicate RESET with old session number */ - if ((less_eq(msg_session(hdr), l->peer_session)) && - (l->peer_session != ANY_SESSION)) - break; - /* fall thru' */ - case ACTIVATE_MSG: - /* Complete own link name with peer's interface name */ if_name = strrchr(l->name, ':') + 1; if (sizeof(l->name) - (if_name - l->name) <= TIPC_MAX_IF_NAME) @@ -1509,12 +1551,14 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, rc = TIPC_LINK_UP_EVT; l->peer_session = msg_session(hdr); + l->in_session = true; l->peer_bearer_id = msg_bearer_id(hdr); if (l->mtu > msg_max_pkt(hdr)) l->mtu = msg_max_pkt(hdr); break; case STATE_MSG: + l->rcv_nxt_state = msg_seqno(hdr) + 1; /* Update own tolerance if peer indicates a non-zero value */ if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) diff --git a/net/tipc/link.h b/net/tipc/link.h index ec59348a81e8..7bc494a33fdf 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -110,6 +110,8 @@ char *tipc_link_name(struct tipc_link *l); char tipc_link_plane(struct tipc_link *l); int tipc_link_prio(struct tipc_link *l); int tipc_link_window(struct tipc_link *l); +void tipc_link_update_caps(struct tipc_link *l, u16 
capabilities); +bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr); unsigned long tipc_link_tolerance(struct tipc_link *l); void tipc_link_set_tolerance(struct tipc_link *l, u32 tol, struct sk_buff_head *xmitq); diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index 5453e564da82..67f69389ec17 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -684,7 +684,8 @@ int tipc_nl_monitor_get_threshold(struct net *net) return tn->mon_threshold; } -int __tipc_nl_add_monitor_peer(struct tipc_peer *peer, struct tipc_nl_msg *msg) +static int __tipc_nl_add_monitor_peer(struct tipc_peer *peer, + struct tipc_nl_msg *msg) { struct tipc_mon_domain *dom = peer->domain; struct nlattr *attrs; diff --git a/net/tipc/msg.c b/net/tipc/msg.c index b6c45dccba3d..b61891054709 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -416,26 +416,31 @@ bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu) */ bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos) { - struct tipc_msg *msg; - int imsz, offset; + struct tipc_msg *hdr, *ihdr; + int imsz; *iskb = NULL; if (unlikely(skb_linearize(skb))) goto none; - msg = buf_msg(skb); - offset = msg_hdr_sz(msg) + *pos; - if (unlikely(offset > (msg_size(msg) - MIN_H_SIZE))) + hdr = buf_msg(skb); + if (unlikely(*pos > (msg_data_sz(hdr) - MIN_H_SIZE))) goto none; - *iskb = skb_clone(skb, GFP_ATOMIC); - if (unlikely(!*iskb)) + ihdr = (struct tipc_msg *)(msg_data(hdr) + *pos); + imsz = msg_size(ihdr); + + if ((*pos + imsz) > msg_data_sz(hdr)) goto none; - skb_pull(*iskb, offset); - imsz = msg_size(buf_msg(*iskb)); - skb_trim(*iskb, imsz); + + *iskb = tipc_buf_acquire(imsz, GFP_ATOMIC); + if (!*iskb) + goto none; + + skb_copy_to_linear_data(*iskb, ihdr, imsz); if (unlikely(!tipc_msg_validate(iskb))) goto none; + *pos += align(imsz); return true; none: @@ -531,12 +536,6 @@ bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err) msg_set_hdr_sz(hdr, BASIC_H_SIZE); } - if 
(skb_cloned(_skb) && - pskb_expand_head(_skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC)) - goto exit; - - /* reassign after skb header modifications */ - hdr = buf_msg(_skb); /* Now reverse the concerned fields */ msg_set_errcode(hdr, err); msg_set_non_seq(hdr, 0); @@ -595,10 +594,6 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) if (!skb_cloned(skb)) return true; - /* Unclone buffer in case it was bundled */ - if (pskb_expand_head(skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC)) - return false; - return true; } diff --git a/net/tipc/net.c b/net/tipc/net.c index 4fbaa0464405..a7f6964c3a4b 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -121,12 +121,17 @@ int tipc_net_init(struct net *net, u8 *node_id, u32 addr) void tipc_net_finalize(struct net *net, u32 addr) { - tipc_set_node_addr(net, addr); - smp_mb(); - tipc_named_reinit(net); - tipc_sk_reinit(net); - tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr, - TIPC_CLUSTER_SCOPE, 0, addr); + struct tipc_net *tn = tipc_net(net); + + spin_lock_bh(&tn->node_list_lock); + if (!tipc_own_addr(net)) { + tipc_set_node_addr(net, addr); + tipc_named_reinit(net); + tipc_sk_reinit(net); + tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr, + TIPC_CLUSTER_SCOPE, 0, addr); + } + spin_unlock_bh(&tn->node_list_lock); } void tipc_net_stop(struct net *net) diff --git a/net/tipc/node.c b/net/tipc/node.c index 6a44eb812baf..68014f1b6976 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -45,6 +45,7 @@ #include "netlink.h" #define INVALID_NODE_SIG 0x10000 +#define NODE_CLEANUP_AFTER 300000 /* Flags used to take different actions according to flag type * TIPC_NOTIFY_NODE_DOWN: notify node is down @@ -96,6 +97,7 @@ struct tipc_bclink_entry { * @link_id: local and remote bearer ids of changing link, if any * @publ_list: list of publications * @rcu: rcu struct for tipc_node + * @delete_at: indicates the time for deleting a down node */ struct tipc_node { u32 addr; @@ -121,6 +123,7 @@ struct tipc_node { 
unsigned long keepalive_intv; struct timer_list timer; struct rcu_head rcu; + unsigned long delete_at; }; /* Node FSM states and events: @@ -160,6 +163,7 @@ static struct tipc_node *tipc_node_find(struct net *net, u32 addr); static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id); static void tipc_node_put(struct tipc_node *node); static bool node_is_up(struct tipc_node *n); +static void tipc_node_delete_from_list(struct tipc_node *node); struct tipc_sock_conn { u32 port; @@ -359,13 +363,24 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *n, *temp_node; + struct tipc_link *l; + int bearer_id; int i; spin_lock_bh(&tn->node_list_lock); n = tipc_node_find(net, addr); if (n) { + if (n->capabilities == capabilities) + goto exit; /* Same node may come back with new capabilities */ + write_lock_bh(&n->lock); n->capabilities = capabilities; + for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { + l = n->links[bearer_id].link; + if (l) + tipc_link_update_caps(l, capabilities); + } + write_unlock_bh(&n->lock); goto exit; } n = kzalloc(sizeof(*n), GFP_ATOMIC); @@ -390,6 +405,7 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, for (i = 0; i < MAX_BEARERS; i++) spin_lock_init(&n->links[i].lock); n->state = SELF_DOWN_PEER_LEAVING; + n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER); n->signature = INVALID_NODE_SIG; n->active_links[0] = INVALID_BEARER_ID; n->active_links[1] = INVALID_BEARER_ID; @@ -433,11 +449,16 @@ static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l) tipc_link_set_abort_limit(l, tol / n->keepalive_intv); } -static void tipc_node_delete(struct tipc_node *node) +static void tipc_node_delete_from_list(struct tipc_node *node) { list_del_rcu(&node->list); hlist_del_rcu(&node->hash); tipc_node_put(node); +} + +static void tipc_node_delete(struct tipc_node *node) +{ + 
tipc_node_delete_from_list(node); del_timer_sync(&node->timer); tipc_node_put(node); @@ -544,6 +565,42 @@ void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port) tipc_node_put(node); } +static void tipc_node_clear_links(struct tipc_node *node) +{ + int i; + + for (i = 0; i < MAX_BEARERS; i++) { + struct tipc_link_entry *le = &node->links[i]; + + if (le->link) { + kfree(le->link); + le->link = NULL; + node->link_cnt--; + } + } +} + +/* tipc_node_cleanup - delete nodes that does not + * have active links for NODE_CLEANUP_AFTER time + */ +static int tipc_node_cleanup(struct tipc_node *peer) +{ + struct tipc_net *tn = tipc_net(peer->net); + bool deleted = false; + + spin_lock_bh(&tn->node_list_lock); + tipc_node_write_lock(peer); + + if (!node_is_up(peer) && time_after(jiffies, peer->delete_at)) { + tipc_node_clear_links(peer); + tipc_node_delete_from_list(peer); + deleted = true; + } + tipc_node_write_unlock(peer); + spin_unlock_bh(&tn->node_list_lock); + return deleted; +} + /* tipc_node_timeout - handle expiration of node timer */ static void tipc_node_timeout(struct timer_list *t) @@ -551,21 +608,29 @@ static void tipc_node_timeout(struct timer_list *t) struct tipc_node *n = from_timer(n, t, timer); struct tipc_link_entry *le; struct sk_buff_head xmitq; + int remains = n->link_cnt; int bearer_id; int rc = 0; + if (!node_is_up(n) && tipc_node_cleanup(n)) { + /*Removing the reference of Timer*/ + tipc_node_put(n); + return; + } + __skb_queue_head_init(&xmitq); - for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { + for (bearer_id = 0; remains && (bearer_id < MAX_BEARERS); bearer_id++) { tipc_node_read_lock(n); le = &n->links[bearer_id]; - spin_lock_bh(&le->lock); if (le->link) { + spin_lock_bh(&le->lock); /* Link tolerance may change asynchronously: */ tipc_node_calculate_timer(n, le->link); rc = tipc_link_timeout(le->link, &xmitq); + spin_unlock_bh(&le->lock); + remains--; } - spin_unlock_bh(&le->lock); tipc_node_read_unlock(n); 
tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr); if (rc & TIPC_LINK_DOWN_EVT) @@ -797,6 +862,7 @@ static u32 tipc_node_suggest_addr(struct net *net, u32 addr) } /* tipc_node_try_addr(): Check if addr can be used by peer, suggest other if not + * Returns suggested address if any, otherwise 0 */ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) { @@ -819,12 +885,14 @@ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) if (n) { addr = n->addr; tipc_node_put(n); + return addr; } - /* Even this node may be in trial phase */ + + /* Even this node may be in conflict */ if (tn->trial_addr == addr) return tipc_node_suggest_addr(net, addr); - return addr; + return 0; } void tipc_node_check_dest(struct net *net, u32 addr, @@ -1171,6 +1239,7 @@ static void node_lost_contact(struct tipc_node *n, uint i; pr_debug("Lost contact with %x\n", n->addr); + n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER); /* Clean up broadcast state */ tipc_bcast_remove_peer(n->net, n->bc_entry.link); @@ -1478,7 +1547,7 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id * tipc_node_check_state - check and if necessary update node state * @skb: TIPC packet * @bearer_id: identity of bearer delivering the packet - * Returns true if state is ok, otherwise consumes buffer and returns false + * Returns true if state and msg are ok, otherwise false */ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, int bearer_id, struct sk_buff_head *xmitq) @@ -1512,6 +1581,9 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, } } + if (!tipc_link_validate_msg(l, hdr)) + return false; + /* Check and update node accesibility if applicable */ if (state == SELF_UP_PEER_COMING) { if (!tipc_link_is_up(l)) @@ -1740,7 +1812,6 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) struct tipc_node *peer; u32 addr; int err; - int i; /* We identify the peer by its net */ if 
(!info->attrs[TIPC_NLA_NET]) @@ -1775,15 +1846,7 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) goto err_out; } - for (i = 0; i < MAX_BEARERS; i++) { - struct tipc_link_entry *le = &peer->links[i]; - - if (le->link) { - kfree(le->link); - le->link = NULL; - peer->link_cnt--; - } - } + tipc_node_clear_links(peer); tipc_node_write_unlock(peer); tipc_node_delete(peer); diff --git a/net/tipc/node.h b/net/tipc/node.h index 846c8f240872..48b3298a248d 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -49,14 +49,16 @@ enum { TIPC_BCAST_STATE_NACK = (1 << 2), TIPC_BLOCK_FLOWCTL = (1 << 3), TIPC_BCAST_RCAST = (1 << 4), - TIPC_NODE_ID128 = (1 << 5) + TIPC_NODE_ID128 = (1 << 5), + TIPC_LINK_PROTO_SEQNO = (1 << 6) }; -#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \ - TIPC_BCAST_STATE_NACK | \ - TIPC_BCAST_RCAST | \ - TIPC_BLOCK_FLOWCTL | \ - TIPC_NODE_ID128) +#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \ + TIPC_BCAST_STATE_NACK | \ + TIPC_BCAST_RCAST | \ + TIPC_BLOCK_FLOWCTL | \ + TIPC_NODE_ID128 | \ + TIPC_LINK_PROTO_SEQNO) #define INVALID_BEARER_ID -1 void tipc_node_stop(struct net *net); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 14a5d055717d..3d21414ba357 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -692,9 +692,10 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, } /** - * tipc_poll - read pollmask + * tipc_poll - read and possibly block on pollmask * @file: file structure associated with the socket * @sock: socket for which to calculate the poll bits + * @wait: ??? * * Returns pollmask value * @@ -708,12 +709,15 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, * imply that the operation will succeed, merely that it should be performed * and will not block. 
*/ -static __poll_t tipc_poll_mask(struct socket *sock, __poll_t events) +static __poll_t tipc_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); __poll_t revents = 0; + sock_poll_wait(file, sk_sleep(sk), wait); + if (sk->sk_shutdown & RCV_SHUTDOWN) revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) @@ -3033,7 +3037,7 @@ static const struct proto_ops msg_ops = { .socketpair = tipc_socketpair, .accept = sock_no_accept, .getname = tipc_getname, - .poll_mask = tipc_poll_mask, + .poll = tipc_poll, .ioctl = tipc_ioctl, .listen = sock_no_listen, .shutdown = tipc_shutdown, @@ -3054,7 +3058,7 @@ static const struct proto_ops packet_ops = { .socketpair = tipc_socketpair, .accept = tipc_accept, .getname = tipc_getname, - .poll_mask = tipc_poll_mask, + .poll = tipc_poll, .ioctl = tipc_ioctl, .listen = tipc_listen, .shutdown = tipc_shutdown, @@ -3075,7 +3079,7 @@ static const struct proto_ops stream_ops = { .socketpair = tipc_socketpair, .accept = tipc_accept, .getname = tipc_getname, - .poll_mask = tipc_poll_mask, + .poll = tipc_poll, .ioctl = tipc_ioctl, .listen = tipc_listen, .shutdown = tipc_shutdown, @@ -3316,6 +3320,11 @@ int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb, goto stat_msg_cancel; nla_nest_end(skb, stat); + + if (tsk->group) + if (tipc_group_fill_sock_diag(tsk->group, skb)) + goto stat_msg_cancel; + nla_nest_end(skb, attrs); return 0; diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index a7a8f8e20ff3..1e968d238adf 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -52,9 +52,12 @@ static DEFINE_SPINLOCK(tls_device_lock); static void tls_device_free_ctx(struct tls_context *ctx) { - struct tls_offload_context *offload_ctx = tls_offload_ctx(ctx); + if (ctx->tx_conf == TLS_HW) + kfree(tls_offload_ctx_tx(ctx)); + + if (ctx->rx_conf == TLS_HW) + kfree(tls_offload_ctx_rx(ctx)); - kfree(offload_ctx); 
kfree(ctx); } @@ -71,10 +74,11 @@ static void tls_device_gc_task(struct work_struct *work) list_for_each_entry_safe(ctx, tmp, &gc_list, list) { struct net_device *netdev = ctx->netdev; - if (netdev) { + if (netdev && ctx->tx_conf == TLS_HW) { netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_TX); dev_put(netdev); + ctx->netdev = NULL; } list_del(&ctx->list); @@ -82,6 +86,22 @@ static void tls_device_gc_task(struct work_struct *work) } } +static void tls_device_attach(struct tls_context *ctx, struct sock *sk, + struct net_device *netdev) +{ + if (sk->sk_destruct != tls_device_sk_destruct) { + refcount_set(&ctx->refcount, 1); + dev_hold(netdev); + ctx->netdev = netdev; + spin_lock_irq(&tls_device_lock); + list_add_tail(&ctx->list, &tls_device_list); + spin_unlock_irq(&tls_device_lock); + + ctx->sk_destruct = sk->sk_destruct; + sk->sk_destruct = tls_device_sk_destruct; + } +} + static void tls_device_queue_ctx_destruction(struct tls_context *ctx) { unsigned long flags; @@ -125,7 +145,7 @@ static void destroy_record(struct tls_record_info *record) kfree(record); } -static void delete_all_records(struct tls_offload_context *offload_ctx) +static void delete_all_records(struct tls_offload_context_tx *offload_ctx) { struct tls_record_info *info, *temp; @@ -141,14 +161,14 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_record_info *info, *temp; - struct tls_offload_context *ctx; + struct tls_offload_context_tx *ctx; u64 deleted_records = 0; unsigned long flags; if (!tls_ctx) return; - ctx = tls_offload_ctx(tls_ctx); + ctx = tls_offload_ctx_tx(tls_ctx); spin_lock_irqsave(&ctx->lock, flags); info = ctx->retransmit_hint; @@ -179,15 +199,17 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq) void tls_device_sk_destruct(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + struct 
tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); - if (ctx->open_record) - destroy_record(ctx->open_record); + tls_ctx->sk_destruct(sk); - delete_all_records(ctx); - crypto_free_aead(ctx->aead_send); - ctx->sk_destruct(sk); - clean_acked_data_disable(inet_csk(sk)); + if (tls_ctx->tx_conf == TLS_HW) { + if (ctx->open_record) + destroy_record(ctx->open_record); + delete_all_records(ctx); + crypto_free_aead(ctx->aead_send); + clean_acked_data_disable(inet_csk(sk)); + } if (refcount_dec_and_test(&tls_ctx->refcount)) tls_device_queue_ctx_destruction(tls_ctx); @@ -219,7 +241,7 @@ static void tls_append_frag(struct tls_record_info *record, static int tls_push_record(struct sock *sk, struct tls_context *ctx, - struct tls_offload_context *offload_ctx, + struct tls_offload_context_tx *offload_ctx, struct tls_record_info *record, struct page_frag *pfrag, int flags, @@ -264,7 +286,7 @@ static int tls_push_record(struct sock *sk, return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags); } -static int tls_create_new_record(struct tls_offload_context *offload_ctx, +static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx, struct page_frag *pfrag, size_t prepend_size) { @@ -290,7 +312,7 @@ static int tls_create_new_record(struct tls_offload_context *offload_ctx, } static int tls_do_allocation(struct sock *sk, - struct tls_offload_context *offload_ctx, + struct tls_offload_context_tx *offload_ctx, struct page_frag *pfrag, size_t prepend_size) { @@ -324,7 +346,7 @@ static int tls_push_data(struct sock *sk, unsigned char record_type) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); int tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST; int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE); struct tls_record_info *record = ctx->open_record; @@ -477,7 +499,7 @@ out: return rc; } -struct tls_record_info 
*tls_get_record(struct tls_offload_context *context, +struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, u32 seq, u64 *p_record_sn) { u64 record_sn = context->hint_record_sn; @@ -520,11 +542,123 @@ static int tls_device_push_pending_record(struct sock *sk, int flags) return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA); } +void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct net_device *netdev = tls_ctx->netdev; + struct tls_offload_context_rx *rx_ctx; + u32 is_req_pending; + s64 resync_req; + u32 req_seq; + + if (tls_ctx->rx_conf != TLS_HW) + return; + + rx_ctx = tls_offload_ctx_rx(tls_ctx); + resync_req = atomic64_read(&rx_ctx->resync_req); + req_seq = ntohl(resync_req >> 32) - ((u32)TLS_HEADER_SIZE - 1); + is_req_pending = resync_req; + + if (unlikely(is_req_pending) && req_seq == seq && + atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) + netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, + seq + TLS_HEADER_SIZE - 1, + rcd_sn); +} + +static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) +{ + struct strp_msg *rxm = strp_msg(skb); + int err = 0, offset = rxm->offset, copy, nsg; + struct sk_buff *skb_iter, *unused; + struct scatterlist sg[1]; + char *orig_buf, *buf; + + orig_buf = kmalloc(rxm->full_len + TLS_HEADER_SIZE + + TLS_CIPHER_AES_GCM_128_IV_SIZE, sk->sk_allocation); + if (!orig_buf) + return -ENOMEM; + buf = orig_buf; + + nsg = skb_cow_data(skb, 0, &unused); + if (unlikely(nsg < 0)) { + err = nsg; + goto free_buf; + } + + sg_init_table(sg, 1); + sg_set_buf(&sg[0], buf, + rxm->full_len + TLS_HEADER_SIZE + + TLS_CIPHER_AES_GCM_128_IV_SIZE); + skb_copy_bits(skb, offset, buf, + TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE); + + /* We are interested only in the decrypted data not the auth */ + err = decrypt_skb(sk, skb, sg); + if (err != -EBADMSG) + goto free_buf; + else + err = 0; + + copy = min_t(int, 
skb_pagelen(skb) - offset, + rxm->full_len - TLS_CIPHER_AES_GCM_128_TAG_SIZE); + + if (skb->decrypted) + skb_store_bits(skb, offset, buf, copy); + + offset += copy; + buf += copy; + + skb_walk_frags(skb, skb_iter) { + copy = min_t(int, skb_iter->len, + rxm->full_len - offset + rxm->offset - + TLS_CIPHER_AES_GCM_128_TAG_SIZE); + + if (skb_iter->decrypted) + skb_store_bits(skb_iter, offset, buf, copy); + + offset += copy; + buf += copy; + } + +free_buf: + kfree(orig_buf); + return err; +} + +int tls_device_decrypted(struct sock *sk, struct sk_buff *skb) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_offload_context_rx *ctx = tls_offload_ctx_rx(tls_ctx); + int is_decrypted = skb->decrypted; + int is_encrypted = !is_decrypted; + struct sk_buff *skb_iter; + + /* Skip if it is already decrypted */ + if (ctx->sw.decrypted) + return 0; + + /* Check if all the data is decrypted already */ + skb_walk_frags(skb, skb_iter) { + is_decrypted &= skb_iter->decrypted; + is_encrypted &= !skb_iter->decrypted; + } + + ctx->sw.decrypted |= is_decrypted; + + /* Return immedeatly if the record is either entirely plaintext or + * entirely ciphertext. Otherwise handle reencrypt partially decrypted + * record. + */ + return (is_encrypted || is_decrypted) ? 
0 : + tls_device_reencrypt(sk, skb); +} + int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) { u16 nonce_size, tag_size, iv_size, rec_seq_size; struct tls_record_info *start_marker_record; - struct tls_offload_context *offload_ctx; + struct tls_offload_context_tx *offload_ctx; struct tls_crypto_info *crypto_info; struct net_device *netdev; char *iv, *rec_seq; @@ -546,7 +680,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) goto out; } - offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE, GFP_KERNEL); + offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_TX, GFP_KERNEL); if (!offload_ctx) { rc = -ENOMEM; goto free_marker_record; @@ -609,7 +743,6 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) clean_acked_data_enable(inet_csk(sk), &tls_icsk_clean_acked); ctx->push_pending_record = tls_device_push_pending_record; - offload_ctx->sk_destruct = sk->sk_destruct; /* TLS offload is greatly simplified if we don't send * SKBs where only part of the payload needs to be encrypted. @@ -619,8 +752,6 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) if (skb) TCP_SKB_CB(skb)->eor = 1; - refcount_set(&ctx->refcount, 1); - /* We support starting offload on multiple sockets * concurrently, so we only need a read lock here. * This lock must precede get_netdev_for_sock to prevent races between @@ -655,19 +786,14 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) if (rc) goto release_netdev; - ctx->netdev = netdev; - - spin_lock_irq(&tls_device_lock); - list_add_tail(&ctx->list, &tls_device_list); - spin_unlock_irq(&tls_device_lock); + tls_device_attach(ctx, sk, netdev); - sk->sk_validate_xmit_skb = tls_validate_xmit_skb; /* following this assignment tls_is_sk_tx_device_offloaded * will return true and the context might be accessed * by the netdev's xmit function. 
*/ - smp_store_release(&sk->sk_destruct, - &tls_device_sk_destruct); + smp_store_release(&sk->sk_validate_xmit_skb, tls_validate_xmit_skb); + dev_put(netdev); up_read(&device_offload_lock); goto out; @@ -690,6 +816,105 @@ out: return rc; } +int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) +{ + struct tls_offload_context_rx *context; + struct net_device *netdev; + int rc = 0; + + /* We support starting offload on multiple sockets + * concurrently, so we only need a read lock here. + * This lock must precede get_netdev_for_sock to prevent races between + * NETDEV_DOWN and setsockopt. + */ + down_read(&device_offload_lock); + netdev = get_netdev_for_sock(sk); + if (!netdev) { + pr_err_ratelimited("%s: netdev not found\n", __func__); + rc = -EINVAL; + goto release_lock; + } + + if (!(netdev->features & NETIF_F_HW_TLS_RX)) { + pr_err_ratelimited("%s: netdev %s with no TLS offload\n", + __func__, netdev->name); + rc = -ENOTSUPP; + goto release_netdev; + } + + /* Avoid offloading if the device is down + * We don't want to offload new flows after + * the NETDEV_DOWN event + */ + if (!(netdev->flags & IFF_UP)) { + rc = -EINVAL; + goto release_netdev; + } + + context = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_RX, GFP_KERNEL); + if (!context) { + rc = -ENOMEM; + goto release_netdev; + } + + ctx->priv_ctx_rx = context; + rc = tls_set_sw_offload(sk, ctx, 0); + if (rc) + goto release_ctx; + + rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX, + &ctx->crypto_recv, + tcp_sk(sk)->copied_seq); + if (rc) { + pr_err_ratelimited("%s: The netdev has refused to offload this socket\n", + __func__); + goto free_sw_resources; + } + + tls_device_attach(ctx, sk, netdev); + goto release_netdev; + +free_sw_resources: + tls_sw_free_resources_rx(sk); +release_ctx: + ctx->priv_ctx_rx = NULL; +release_netdev: + dev_put(netdev); +release_lock: + up_read(&device_offload_lock); + return rc; +} + +void tls_device_offload_cleanup_rx(struct sock *sk) +{ + struct 
tls_context *tls_ctx = tls_get_ctx(sk); + struct net_device *netdev; + + down_read(&device_offload_lock); + netdev = tls_ctx->netdev; + if (!netdev) + goto out; + + if (!(netdev->features & NETIF_F_HW_TLS_RX)) { + pr_err_ratelimited("%s: device is missing NETIF_F_HW_TLS_RX cap\n", + __func__); + goto out; + } + + netdev->tlsdev_ops->tls_dev_del(netdev, tls_ctx, + TLS_OFFLOAD_CTX_DIR_RX); + + if (tls_ctx->tx_conf != TLS_HW) { + dev_put(netdev); + tls_ctx->netdev = NULL; + } +out: + up_read(&device_offload_lock); + kfree(tls_ctx->rx.rec_seq); + kfree(tls_ctx->rx.iv); + tls_sw_release_resources_rx(sk); +} + static int tls_device_down(struct net_device *netdev) { struct tls_context *ctx, *tmp; @@ -710,8 +935,12 @@ static int tls_device_down(struct net_device *netdev) spin_unlock_irqrestore(&tls_device_lock, flags); list_for_each_entry_safe(ctx, tmp, &list, list) { - netdev->tlsdev_ops->tls_dev_del(netdev, ctx, - TLS_OFFLOAD_CTX_DIR_TX); + if (ctx->tx_conf == TLS_HW) + netdev->tlsdev_ops->tls_dev_del(netdev, ctx, + TLS_OFFLOAD_CTX_DIR_TX); + if (ctx->rx_conf == TLS_HW) + netdev->tlsdev_ops->tls_dev_del(netdev, ctx, + TLS_OFFLOAD_CTX_DIR_RX); ctx->netdev = NULL; dev_put(netdev); list_del_init(&ctx->list); @@ -732,12 +961,16 @@ static int tls_dev_event(struct notifier_block *this, unsigned long event, { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - if (!(dev->features & NETIF_F_HW_TLS_TX)) + if (!(dev->features & (NETIF_F_HW_TLS_RX | NETIF_F_HW_TLS_TX))) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: case NETDEV_FEAT_CHANGE: + if ((dev->features & NETIF_F_HW_TLS_RX) && + !dev->tlsdev_ops->tls_dev_resync_rx) + return NOTIFY_BAD; + if (dev->tlsdev_ops && dev->tlsdev_ops->tls_dev_add && dev->tlsdev_ops->tls_dev_del) diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 748914abdb60..e3313c45663f 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -214,7 +214,7 @@ static void 
complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln) static int fill_sg_in(struct scatterlist *sg_in, struct sk_buff *skb, - struct tls_offload_context *ctx, + struct tls_offload_context_tx *ctx, u64 *rcd_sn, s32 *sync_size, int *resync_sgs) @@ -299,7 +299,7 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx, s32 sync_size, u64 rcd_sn) { int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb); - struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); int payload_len = skb->len - tcp_payload_offset; void *buf, *iv, *aad, *dummy_buf; struct aead_request *aead_req; @@ -361,7 +361,7 @@ static struct sk_buff *tls_sw_fallback(struct sock *sk, struct sk_buff *skb) { int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb); struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); int payload_len = skb->len - tcp_payload_offset; struct scatterlist *sg_in, sg_out[3]; struct sk_buff *nskb = NULL; @@ -413,9 +413,10 @@ struct sk_buff *tls_validate_xmit_skb(struct sock *sk, return tls_sw_fallback(sk, skb); } +EXPORT_SYMBOL_GPL(tls_validate_xmit_skb); int tls_sw_fallback_init(struct sock *sk, - struct tls_offload_context *offload_ctx, + struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info) { const u8 *key; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index a127d61e8af9..b09867c8b817 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -51,15 +51,6 @@ enum { TLSV6, TLS_NUM_PROTS, }; -enum { - TLS_BASE, - TLS_SW, -#ifdef CONFIG_TLS_DEVICE - TLS_HW, -#endif - TLS_HW_RECORD, - TLS_NUM_CONFIG, -}; static struct proto *saved_tcpv6_prot; static DEFINE_MUTEX(tcpv6_prot_mutex); @@ -290,7 +281,10 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) } #ifdef CONFIG_TLS_DEVICE - if (ctx->tx_conf 
!= TLS_HW) { + if (ctx->rx_conf == TLS_HW) + tls_device_offload_cleanup_rx(sk); + + if (ctx->tx_conf != TLS_HW && ctx->rx_conf != TLS_HW) { #else { #endif @@ -470,8 +464,16 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, conf = TLS_SW; } } else { - rc = tls_set_sw_offload(sk, ctx, 0); - conf = TLS_SW; +#ifdef CONFIG_TLS_DEVICE + rc = tls_set_device_offload_rx(sk, ctx); + conf = TLS_HW; + if (rc) { +#else + { +#endif + rc = tls_set_sw_offload(sk, ctx, 0); + conf = TLS_SW; + } } if (rc) @@ -629,6 +631,12 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_HW][TLS_SW] = prot[TLS_BASE][TLS_SW]; prot[TLS_HW][TLS_SW].sendmsg = tls_device_sendmsg; prot[TLS_HW][TLS_SW].sendpage = tls_device_sendpage; + + prot[TLS_BASE][TLS_HW] = prot[TLS_BASE][TLS_SW]; + + prot[TLS_SW][TLS_HW] = prot[TLS_SW][TLS_SW]; + + prot[TLS_HW][TLS_HW] = prot[TLS_HW][TLS_SW]; #endif prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base; @@ -712,7 +720,7 @@ static int __init tls_register(void) build_protos(tls_prots[TLSV4], &tcp_prot); tls_sw_proto_ops = inet_stream_ops; - tls_sw_proto_ops.poll_mask = tls_sw_poll_mask; + tls_sw_proto_ops.poll = tls_sw_poll; tls_sw_proto_ops.splice_read = tls_sw_splice_read; #ifdef CONFIG_TLS_DEVICE diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index f127fac88acf..f9971717f7e0 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -53,18 +53,14 @@ static int tls_do_decryption(struct sock *sk, { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - struct strp_msg *rxm = strp_msg(skb); struct aead_request *aead_req; int ret; - unsigned int req_size = sizeof(struct aead_request) + - crypto_aead_reqsize(ctx->aead_recv); - aead_req = kzalloc(req_size, flags); + aead_req = aead_request_alloc(ctx->aead_recv, flags); if (!aead_req) return -ENOMEM; - aead_request_set_tfm(aead_req, ctx->aead_recv); aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); 
aead_request_set_crypt(aead_req, sgin, sgout, data_len + tls_ctx->rx.tag_size, @@ -74,19 +70,7 @@ static int tls_do_decryption(struct sock *sk, ret = crypto_wait_req(crypto_aead_decrypt(aead_req), &ctx->async_wait); - if (ret < 0) - goto out; - - rxm->offset += tls_ctx->rx.prepend_size; - rxm->full_len -= tls_ctx->rx.overhead_size; - tls_advance_record_sn(sk, &tls_ctx->rx); - - ctx->decrypted = true; - - ctx->saved_data_ready(sk); - -out: - kfree(aead_req); + aead_request_free(aead_req); return ret; } @@ -224,8 +208,7 @@ static int tls_push_record(struct sock *sk, int flags, struct aead_request *req; int rc; - req = kzalloc(sizeof(struct aead_request) + - crypto_aead_reqsize(ctx->aead_send), sk->sk_allocation); + req = aead_request_alloc(ctx->aead_send, sk->sk_allocation); if (!req) return -ENOMEM; @@ -267,7 +250,7 @@ static int tls_push_record(struct sock *sk, int flags, tls_advance_record_sn(sk, &tls_ctx->tx); out_req: - kfree(req); + aead_request_free(req); return rc; } @@ -280,7 +263,7 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, int length, int *pages_used, unsigned int *size_used, struct scatterlist *to, int to_max_pages, - bool charge) + bool charge, bool revert) { struct page *pages[MAX_SKB_FRAGS]; @@ -331,6 +314,8 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, out: *size_used = size; *pages_used = num_elem; + if (revert) + iov_iter_revert(from, size); return rc; } @@ -377,6 +362,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) int record_room; bool full_record; int orig_size; + bool is_kvec = msg->msg_iter.type & ITER_KVEC; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) return -ENOTSUPP; @@ -425,14 +411,13 @@ alloc_encrypted: try_to_copy -= required_size - ctx->sg_encrypted_size; full_record = true; } - - if (full_record || eor) { + if (!is_kvec && (full_record || eor)) { ret = zerocopy_from_iter(sk, &msg->msg_iter, try_to_copy, &ctx->sg_plaintext_num_elem, 
&ctx->sg_plaintext_size, ctx->sg_plaintext_data, ARRAY_SIZE(ctx->sg_plaintext_data), - true); + true, false); if (ret) goto fallback_to_reg_send; @@ -440,7 +425,7 @@ alloc_encrypted: ret = tls_push_record(sk, msg->msg_flags, record_type); if (!ret) continue; - if (ret == -EAGAIN) + if (ret < 0) goto send_end; copied -= try_to_copy; @@ -646,6 +631,9 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags, return NULL; } + if (sk->sk_shutdown & RCV_SHUTDOWN) + return NULL; + if (sock_flag(sk, SOCK_DONE)) return NULL; @@ -670,8 +658,38 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags, return skb; } -static int decrypt_skb(struct sock *sk, struct sk_buff *skb, - struct scatterlist *sgout) +static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, + struct scatterlist *sgout, bool *zc) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + struct strp_msg *rxm = strp_msg(skb); + int err = 0; + +#ifdef CONFIG_TLS_DEVICE + err = tls_device_decrypted(sk, skb); + if (err < 0) + return err; +#endif + if (!ctx->decrypted) { + err = decrypt_skb(sk, skb, sgout); + if (err < 0) + return err; + } else { + *zc = false; + } + + rxm->offset += tls_ctx->rx.prepend_size; + rxm->full_len -= tls_ctx->rx.overhead_size; + tls_advance_record_sn(sk, &tls_ctx->rx); + ctx->decrypted = true; + ctx->saved_data_ready(sk); + + return err; +} + +int decrypt_skb(struct sock *sk, struct sk_buff *skb, + struct scatterlist *sgout) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); @@ -701,6 +719,10 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb, nsg = skb_to_sgvec(skb, &sgin[1], rxm->offset + tls_ctx->rx.prepend_size, rxm->full_len - tls_ctx->rx.prepend_size); + if (nsg < 0) { + ret = nsg; + goto out; + } tls_make_aad(ctx->rx_aad_ciphertext, rxm->full_len - tls_ctx->rx.overhead_size, @@ -712,6 +734,7 @@ static int decrypt_skb(struct sock 
*sk, struct sk_buff *skb, rxm->full_len - tls_ctx->rx.overhead_size, skb, sk->sk_allocation); +out: if (sgin != &sgin_arr[0]) kfree(sgin); @@ -756,6 +779,7 @@ int tls_sw_recvmsg(struct sock *sk, bool cmsg = false; int target, err = 0; long timeo; + bool is_kvec = msg->msg_iter.type & ITER_KVEC; flags |= nonblock; @@ -799,7 +823,7 @@ int tls_sw_recvmsg(struct sock *sk, page_count = iov_iter_npages(&msg->msg_iter, MAX_SKB_FRAGS); to_copy = rxm->full_len - tls_ctx->rx.overhead_size; - if (to_copy <= len && page_count < MAX_SKB_FRAGS && + if (!is_kvec && to_copy <= len && page_count < MAX_SKB_FRAGS && likely(!(flags & MSG_PEEK))) { struct scatterlist sgin[MAX_SKB_FRAGS + 1]; int pages = 0; @@ -812,11 +836,11 @@ int tls_sw_recvmsg(struct sock *sk, err = zerocopy_from_iter(sk, &msg->msg_iter, to_copy, &pages, &chunk, &sgin[1], - MAX_SKB_FRAGS, false); + MAX_SKB_FRAGS, false, true); if (err < 0) goto fallback_to_reg_recv; - err = decrypt_skb(sk, skb, sgin); + err = decrypt_skb_update(sk, skb, sgin, &zc); for (; pages > 0; pages--) put_page(sg_page(&sgin[pages])); if (err < 0) { @@ -825,7 +849,7 @@ int tls_sw_recvmsg(struct sock *sk, } } else { fallback_to_reg_recv: - err = decrypt_skb(sk, skb, NULL); + err = decrypt_skb_update(sk, skb, NULL, &zc); if (err < 0) { tls_err_abort(sk, EBADMSG); goto recv_end; @@ -880,6 +904,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, int err = 0; long timeo; int chunk; + bool zc; lock_sock(sk); @@ -896,7 +921,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, } if (!ctx->decrypted) { - err = decrypt_skb(sk, skb, NULL); + err = decrypt_skb_update(sk, skb, NULL, &zc); if (err < 0) { tls_err_abort(sk, EBADMSG); @@ -919,29 +944,30 @@ splice_read_end: return copied ? 
: err; } -__poll_t tls_sw_poll_mask(struct socket *sock, __poll_t events) +unsigned int tls_sw_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) { + unsigned int ret; struct sock *sk = sock->sk; struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - __poll_t mask; - /* Grab EPOLLOUT and EPOLLHUP from the underlying socket */ - mask = ctx->sk_poll_mask(sock, events); + /* Grab POLLOUT and POLLHUP from the underlying socket */ + ret = ctx->sk_poll(file, sock, wait); - /* Clear EPOLLIN bits, and set based on recv_pkt */ - mask &= ~(EPOLLIN | EPOLLRDNORM); + /* Clear POLLIN bits, and set based on recv_pkt */ + ret &= ~(POLLIN | POLLRDNORM); if (ctx->recv_pkt) - mask |= EPOLLIN | EPOLLRDNORM; + ret |= POLLIN | POLLRDNORM; - return mask; + return ret; } static int tls_read_size(struct strparser *strp, struct sk_buff *skb) { struct tls_context *tls_ctx = tls_get_ctx(strp->sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - char header[tls_ctx->rx.prepend_size]; + char header[TLS_HEADER_SIZE + MAX_IV_SIZE]; struct strp_msg *rxm = strp_msg(skb); size_t cipher_overhead; size_t data_len = 0; @@ -951,6 +977,12 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb) if (rxm->offset + tls_ctx->rx.prepend_size > skb->len) return 0; + /* Sanity-check size of on-stack buffer. 
*/ + if (WARN_ON(tls_ctx->rx.prepend_size > sizeof(header))) { + ret = -EINVAL; + goto read_failure; + } + /* Linearize header to local buffer */ ret = skb_copy_bits(skb, rxm->offset, header, tls_ctx->rx.prepend_size); @@ -978,6 +1010,10 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb) goto read_failure; } +#ifdef CONFIG_TLS_DEVICE + handle_device_resync(strp->sk, TCP_SKB_CB(skb)->seq + rxm->offset, + *(u64*)tls_ctx->rx.rec_seq); +#endif return data_len + TLS_HEADER_SIZE; read_failure: @@ -990,9 +1026,6 @@ static void tls_queue(struct strparser *strp, struct sk_buff *skb) { struct tls_context *tls_ctx = tls_get_ctx(strp->sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - struct strp_msg *rxm; - - rxm = strp_msg(skb); ctx->decrypted = false; @@ -1015,23 +1048,20 @@ void tls_sw_free_resources_tx(struct sock *sk) struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - if (ctx->aead_send) - crypto_free_aead(ctx->aead_send); + crypto_free_aead(ctx->aead_send); tls_free_both_sg(sk); kfree(ctx); } -void tls_sw_free_resources_rx(struct sock *sk) +void tls_sw_release_resources_rx(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); if (ctx->aead_recv) { - if (ctx->recv_pkt) { - kfree_skb(ctx->recv_pkt); - ctx->recv_pkt = NULL; - } + kfree_skb(ctx->recv_pkt); + ctx->recv_pkt = NULL; crypto_free_aead(ctx->aead_recv); strp_stop(&ctx->strp); write_lock_bh(&sk->sk_callback_lock); @@ -1041,6 +1071,14 @@ void tls_sw_free_resources_rx(struct sock *sk) strp_done(&ctx->strp); lock_sock(sk); } +} + +void tls_sw_free_resources_rx(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + + tls_sw_release_resources_rx(sk); kfree(ctx); } @@ -1065,28 +1103,38 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) } if (tx) { - sw_ctx_tx = 
kzalloc(sizeof(*sw_ctx_tx), GFP_KERNEL); - if (!sw_ctx_tx) { - rc = -ENOMEM; - goto out; + if (!ctx->priv_ctx_tx) { + sw_ctx_tx = kzalloc(sizeof(*sw_ctx_tx), GFP_KERNEL); + if (!sw_ctx_tx) { + rc = -ENOMEM; + goto out; + } + ctx->priv_ctx_tx = sw_ctx_tx; + } else { + sw_ctx_tx = + (struct tls_sw_context_tx *)ctx->priv_ctx_tx; } - crypto_init_wait(&sw_ctx_tx->async_wait); - ctx->priv_ctx_tx = sw_ctx_tx; } else { - sw_ctx_rx = kzalloc(sizeof(*sw_ctx_rx), GFP_KERNEL); - if (!sw_ctx_rx) { - rc = -ENOMEM; - goto out; + if (!ctx->priv_ctx_rx) { + sw_ctx_rx = kzalloc(sizeof(*sw_ctx_rx), GFP_KERNEL); + if (!sw_ctx_rx) { + rc = -ENOMEM; + goto out; + } + ctx->priv_ctx_rx = sw_ctx_rx; + } else { + sw_ctx_rx = + (struct tls_sw_context_rx *)ctx->priv_ctx_rx; } - crypto_init_wait(&sw_ctx_rx->async_wait); - ctx->priv_ctx_rx = sw_ctx_rx; } if (tx) { + crypto_init_wait(&sw_ctx_tx->async_wait); crypto_info = &ctx->crypto_send; cctx = &ctx->tx; aead = &sw_ctx_tx->aead_send; } else { + crypto_init_wait(&sw_ctx_rx->async_wait); crypto_info = &ctx->crypto_recv; cctx = &ctx->rx; aead = &sw_ctx_rx->aead_recv; @@ -1111,7 +1159,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) } /* Sanity-check the IV size for stack allocations. 
*/ - if (iv_size > MAX_IV_SIZE) { + if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE) { rc = -EINVAL; goto free_priv; } @@ -1191,7 +1239,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) sk->sk_data_ready = tls_data_ready; write_unlock_bh(&sk->sk_callback_lock); - sw_ctx_rx->sk_poll_mask = sk->sk_socket->ops->poll_mask; + sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll; strp_check_rcv(&sw_ctx_rx->strp); } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 95b02a71fd47..e5473c03d667 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -638,8 +638,9 @@ static int unix_stream_connect(struct socket *, struct sockaddr *, static int unix_socketpair(struct socket *, struct socket *); static int unix_accept(struct socket *, struct socket *, int, bool); static int unix_getname(struct socket *, struct sockaddr *, int); -static __poll_t unix_poll_mask(struct socket *, __poll_t); -static __poll_t unix_dgram_poll_mask(struct socket *, __poll_t); +static __poll_t unix_poll(struct file *, struct socket *, poll_table *); +static __poll_t unix_dgram_poll(struct file *, struct socket *, + poll_table *); static int unix_ioctl(struct socket *, unsigned int, unsigned long); static int unix_shutdown(struct socket *, int); static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); @@ -680,7 +681,7 @@ static const struct proto_ops unix_stream_ops = { .socketpair = unix_socketpair, .accept = unix_accept, .getname = unix_getname, - .poll_mask = unix_poll_mask, + .poll = unix_poll, .ioctl = unix_ioctl, .listen = unix_listen, .shutdown = unix_shutdown, @@ -703,7 +704,7 @@ static const struct proto_ops unix_dgram_ops = { .socketpair = unix_socketpair, .accept = sock_no_accept, .getname = unix_getname, - .poll_mask = unix_dgram_poll_mask, + .poll = unix_dgram_poll, .ioctl = unix_ioctl, .listen = sock_no_listen, .shutdown = unix_shutdown, @@ -725,7 +726,7 @@ static const struct proto_ops unix_seqpacket_ops = { .socketpair = 
unix_socketpair, .accept = unix_accept, .getname = unix_getname, - .poll_mask = unix_dgram_poll_mask, + .poll = unix_dgram_poll, .ioctl = unix_ioctl, .listen = unix_listen, .shutdown = unix_shutdown, @@ -2629,10 +2630,13 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return err; } -static __poll_t unix_poll_mask(struct socket *sock, __poll_t events) +static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - __poll_t mask = 0; + __poll_t mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; /* exceptional events? */ if (sk->sk_err) @@ -2661,11 +2665,15 @@ static __poll_t unix_poll_mask(struct socket *sock, __poll_t events) return mask; } -static __poll_t unix_dgram_poll_mask(struct socket *sock, __poll_t events) +static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk, *other; - int writable; - __poll_t mask = 0; + unsigned int writable; + __poll_t mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; /* exceptional events? */ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) @@ -2691,7 +2699,7 @@ static __poll_t unix_dgram_poll_mask(struct socket *sock, __poll_t events) } /* No write status requested, avoid expensive OUT tests. 
*/ - if (!(events & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) + if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) return mask; writable = unix_writable(sk); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index bb5d5fa68c35..c1076c19b858 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -850,11 +850,18 @@ static int vsock_shutdown(struct socket *sock, int mode) return err; } -static __poll_t vsock_poll_mask(struct socket *sock, __poll_t events) +static __poll_t vsock_poll(struct file *file, struct socket *sock, + poll_table *wait) { - struct sock *sk = sock->sk; - struct vsock_sock *vsk = vsock_sk(sk); - __poll_t mask = 0; + struct sock *sk; + __poll_t mask; + struct vsock_sock *vsk; + + sk = sock->sk; + vsk = vsock_sk(sk); + + poll_wait(file, sk_sleep(sk), wait); + mask = 0; if (sk->sk_err) /* Signify that there has been an error on this socket. */ @@ -1084,7 +1091,7 @@ static const struct proto_ops vsock_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = vsock_getname, - .poll_mask = vsock_poll_mask, + .poll = vsock_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = vsock_shutdown, @@ -1842,7 +1849,7 @@ static const struct proto_ops vsock_stream_ops = { .socketpair = sock_no_socketpair, .accept = vsock_accept, .getname = vsock_getname, - .poll_mask = vsock_poll_mask, + .poll = vsock_poll, .ioctl = sock_no_ioctl, .listen = vsock_listen, .shutdown = vsock_shutdown, diff --git a/net/wimax/Makefile b/net/wimax/Makefile index eb2db0d3b880..c2a71ae487ac 100644 --- a/net/wimax/Makefile +++ b/net/wimax/Makefile @@ -11,5 +11,3 @@ wimax-y := \ stack.o wimax-$(CONFIG_DEBUG_FS) += debugfs.o - - diff --git a/net/wimax/debugfs.c b/net/wimax/debugfs.c index 6c9bedb7431e..24514840746e 100644 --- a/net/wimax/debugfs.c +++ b/net/wimax/debugfs.c @@ -76,5 +76,3 @@ void wimax_debugfs_rm(struct wimax_dev *wimax_dev) { debugfs_remove_recursive(wimax_dev->debugfs_dentry); } - - 
diff --git a/net/wimax/op-msg.c b/net/wimax/op-msg.c index 54aa146930bd..101b2fa3f32e 100644 --- a/net/wimax/op-msg.c +++ b/net/wimax/op-msg.c @@ -404,4 +404,3 @@ error_no_wimax_dev: d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result); return result; } - diff --git a/net/wimax/stack.c b/net/wimax/stack.c index 5db731512014..a6307813b6d5 100644 --- a/net/wimax/stack.c +++ b/net/wimax/stack.c @@ -486,7 +486,8 @@ int wimax_dev_add(struct wimax_dev *wimax_dev, struct net_device *net_dev) d_fnstart(3, dev, "(wimax_dev %p net_dev %p)\n", wimax_dev, net_dev); /* Do the RFKILL setup before locking, as RFKILL will call - * into our functions. */ + * into our functions. + */ wimax_dev->net_dev = net_dev; result = wimax_rfkill_add(wimax_dev); if (result < 0) @@ -629,4 +630,3 @@ module_exit(wimax_subsys_exit); MODULE_AUTHOR("Intel Corporation <linux-wimax@intel.com>"); MODULE_DESCRIPTION("Linux WiMAX stack"); MODULE_LICENSE("GPL"); - diff --git a/net/wireless/core.c b/net/wireless/core.c index 48e8097339ab..a88551f3bc43 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -3,7 +3,7 @@ * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright 2015 Intel Deutschland GmbH + * Copyright 2015-2017 Intel Deutschland GmbH */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -744,6 +744,8 @@ int wiphy_register(struct wiphy *wiphy) /* sanity check supported bands/channels */ for (band = 0; band < NUM_NL80211_BANDS; band++) { + u16 types = 0; + sband = wiphy->bands[band]; if (!sband) continue; @@ -788,6 +790,23 @@ int wiphy_register(struct wiphy *wiphy) sband->channels[i].band = band; } + for (i = 0; i < sband->n_iftype_data; i++) { + const struct ieee80211_sband_iftype_data *iftd; + + iftd = &sband->iftype_data[i]; + + if (WARN_ON(!iftd->types_mask)) + return -EINVAL; + if (WARN_ON(types & iftd->types_mask)) + return -EINVAL; + + /* at least one piece of information must be present */ + if 
(WARN_ON(!iftd->he_cap.has_he)) + return -EINVAL; + + types |= iftd->types_mask; + } + have_band = true; } diff --git a/net/wireless/core.h b/net/wireless/core.h index 63eb1b5fdd04..7f52ef569320 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -76,7 +76,7 @@ struct cfg80211_registered_device { struct cfg80211_scan_request *scan_req; /* protected by RTNL */ struct sk_buff *scan_msg; struct list_head sched_scan_req_list; - unsigned long suspend_at; + time64_t suspend_at; struct work_struct scan_done_wk; struct genl_info *cur_cmd_info; diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c index ba0a1f398ce5..e6bce1f130c9 100644 --- a/net/wireless/lib80211_crypt_tkip.c +++ b/net/wireless/lib80211_crypt_tkip.c @@ -65,9 +65,9 @@ struct lib80211_tkip_data { int key_idx; struct crypto_skcipher *rx_tfm_arc4; - struct crypto_ahash *rx_tfm_michael; + struct crypto_shash *rx_tfm_michael; struct crypto_skcipher *tx_tfm_arc4; - struct crypto_ahash *tx_tfm_michael; + struct crypto_shash *tx_tfm_michael; /* scratch buffers for virt_to_page() (crypto API) */ u8 rx_hdr[16], tx_hdr[16]; @@ -106,8 +106,7 @@ static void *lib80211_tkip_init(int key_idx) goto fail; } - priv->tx_tfm_michael = crypto_alloc_ahash("michael_mic", 0, - CRYPTO_ALG_ASYNC); + priv->tx_tfm_michael = crypto_alloc_shash("michael_mic", 0, 0); if (IS_ERR(priv->tx_tfm_michael)) { priv->tx_tfm_michael = NULL; goto fail; @@ -120,8 +119,7 @@ static void *lib80211_tkip_init(int key_idx) goto fail; } - priv->rx_tfm_michael = crypto_alloc_ahash("michael_mic", 0, - CRYPTO_ALG_ASYNC); + priv->rx_tfm_michael = crypto_alloc_shash("michael_mic", 0, 0); if (IS_ERR(priv->rx_tfm_michael)) { priv->rx_tfm_michael = NULL; goto fail; @@ -131,9 +129,9 @@ static void *lib80211_tkip_init(int key_idx) fail: if (priv) { - crypto_free_ahash(priv->tx_tfm_michael); + crypto_free_shash(priv->tx_tfm_michael); crypto_free_skcipher(priv->tx_tfm_arc4); - crypto_free_ahash(priv->rx_tfm_michael); + 
crypto_free_shash(priv->rx_tfm_michael); crypto_free_skcipher(priv->rx_tfm_arc4); kfree(priv); } @@ -145,9 +143,9 @@ static void lib80211_tkip_deinit(void *priv) { struct lib80211_tkip_data *_priv = priv; if (_priv) { - crypto_free_ahash(_priv->tx_tfm_michael); + crypto_free_shash(_priv->tx_tfm_michael); crypto_free_skcipher(_priv->tx_tfm_arc4); - crypto_free_ahash(_priv->rx_tfm_michael); + crypto_free_shash(_priv->rx_tfm_michael); crypto_free_skcipher(_priv->rx_tfm_arc4); } kfree(priv); @@ -510,29 +508,36 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv) return keyidx; } -static int michael_mic(struct crypto_ahash *tfm_michael, u8 * key, u8 * hdr, - u8 * data, size_t data_len, u8 * mic) +static int michael_mic(struct crypto_shash *tfm_michael, u8 *key, u8 *hdr, + u8 *data, size_t data_len, u8 *mic) { - AHASH_REQUEST_ON_STACK(req, tfm_michael); - struct scatterlist sg[2]; + SHASH_DESC_ON_STACK(desc, tfm_michael); int err; if (tfm_michael == NULL) { pr_warn("%s(): tfm_michael == NULL\n", __func__); return -1; } - sg_init_table(sg, 2); - sg_set_buf(&sg[0], hdr, 16); - sg_set_buf(&sg[1], data, data_len); - if (crypto_ahash_setkey(tfm_michael, key, 8)) + desc->tfm = tfm_michael; + desc->flags = 0; + + if (crypto_shash_setkey(tfm_michael, key, 8)) return -1; - ahash_request_set_tfm(req, tfm_michael); - ahash_request_set_callback(req, 0, NULL, NULL); - ahash_request_set_crypt(req, sg, mic, data_len + 16); - err = crypto_ahash_digest(req); - ahash_request_zero(req); + err = crypto_shash_init(desc); + if (err) + goto out; + err = crypto_shash_update(desc, hdr, 16); + if (err) + goto out; + err = crypto_shash_update(desc, data, data_len); + if (err) + goto out; + err = crypto_shash_final(desc, mic); + +out: + shash_desc_zero(desc); return err; } @@ -654,9 +659,9 @@ static int lib80211_tkip_set_key(void *key, int len, u8 * seq, void *priv) { struct lib80211_tkip_data *tkey = priv; int keyidx; - struct crypto_ahash *tfm = tkey->tx_tfm_michael; 
+ struct crypto_shash *tfm = tkey->tx_tfm_michael; struct crypto_skcipher *tfm2 = tkey->tx_tfm_arc4; - struct crypto_ahash *tfm3 = tkey->rx_tfm_michael; + struct crypto_shash *tfm3 = tkey->rx_tfm_michael; struct crypto_skcipher *tfm4 = tkey->rx_tfm_arc4; keyidx = tkey->key_idx; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index c7bbe5f0aae8..5fb9b7dd9831 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -428,6 +428,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_TXQ_LIMIT] = { .type = NLA_U32 }, [NL80211_ATTR_TXQ_MEMORY_LIMIT] = { .type = NLA_U32 }, [NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 }, + [NL80211_ATTR_HE_CAPABILITY] = { .type = NLA_BINARY, + .len = NL80211_HE_MAX_CAPABILITY_LEN }, }; /* policy for the key attributes */ @@ -1324,6 +1326,34 @@ static int nl80211_send_coalesce(struct sk_buff *msg, return 0; } +static int +nl80211_send_iftype_data(struct sk_buff *msg, + const struct ieee80211_sband_iftype_data *iftdata) +{ + const struct ieee80211_sta_he_cap *he_cap = &iftdata->he_cap; + + if (nl80211_put_iftypes(msg, NL80211_BAND_IFTYPE_ATTR_IFTYPES, + iftdata->types_mask)) + return -ENOBUFS; + + if (he_cap->has_he) { + if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_MAC, + sizeof(he_cap->he_cap_elem.mac_cap_info), + he_cap->he_cap_elem.mac_cap_info) || + nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_PHY, + sizeof(he_cap->he_cap_elem.phy_cap_info), + he_cap->he_cap_elem.phy_cap_info) || + nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_MCS_SET, + sizeof(he_cap->he_mcs_nss_supp), + &he_cap->he_mcs_nss_supp) || + nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE, + sizeof(he_cap->ppe_thres), he_cap->ppe_thres)) + return -ENOBUFS; + } + + return 0; +} + static int nl80211_send_band_rateinfo(struct sk_buff *msg, struct ieee80211_supported_band *sband) { @@ -1353,6 +1383,32 @@ static int nl80211_send_band_rateinfo(struct sk_buff *msg, sband->vht_cap.cap))) return -ENOBUFS; + if 
(sband->n_iftype_data) { + struct nlattr *nl_iftype_data = + nla_nest_start(msg, NL80211_BAND_ATTR_IFTYPE_DATA); + int err; + + if (!nl_iftype_data) + return -ENOBUFS; + + for (i = 0; i < sband->n_iftype_data; i++) { + struct nlattr *iftdata; + + iftdata = nla_nest_start(msg, i + 1); + if (!iftdata) + return -ENOBUFS; + + err = nl80211_send_iftype_data(msg, + &sband->iftype_data[i]); + if (err) + return err; + + nla_nest_end(msg, iftdata); + } + + nla_nest_end(msg, nl_iftype_data); + } + /* add bitrates */ nl_rates = nla_nest_start(msg, NL80211_BAND_ATTR_RATES); if (!nl_rates) @@ -2757,7 +2813,8 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, wdev_address(wdev)) || nla_put_u32(msg, NL80211_ATTR_GENERATION, rdev->devlist_generation ^ - (cfg80211_rdev_list_generation << 2))) + (cfg80211_rdev_list_generation << 2)) || + nla_put_u8(msg, NL80211_ATTR_4ADDR, wdev->use_4addr)) goto nla_put_failure; if (rdev->ops->get_channel) { @@ -4409,6 +4466,7 @@ static int parse_station_flags(struct genl_info *info, params->sta_flags_mask = BIT(NL80211_STA_FLAG_AUTHENTICATED) | BIT(NL80211_STA_FLAG_MFP) | BIT(NL80211_STA_FLAG_AUTHORIZED); + break; default: return -EINVAL; } @@ -4471,6 +4529,9 @@ static bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, case RATE_INFO_BW_160: rate_flg = NL80211_RATE_INFO_160_MHZ_WIDTH; break; + case RATE_INFO_BW_HE_RU: + rate_flg = 0; + WARN_ON(!(info->flags & RATE_INFO_FLAGS_HE_MCS)); } if (rate_flg && nla_put_flag(msg, rate_flg)) @@ -4490,6 +4551,19 @@ static bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, if (info->flags & RATE_INFO_FLAGS_SHORT_GI && nla_put_flag(msg, NL80211_RATE_INFO_SHORT_GI)) return false; + } else if (info->flags & RATE_INFO_FLAGS_HE_MCS) { + if (nla_put_u8(msg, NL80211_RATE_INFO_HE_MCS, info->mcs)) + return false; + if (nla_put_u8(msg, NL80211_RATE_INFO_HE_NSS, info->nss)) + return false; + if 
(nla_put_u8(msg, NL80211_RATE_INFO_HE_GI, info->he_gi)) + return false; + if (nla_put_u8(msg, NL80211_RATE_INFO_HE_DCM, info->he_dcm)) + return false; + if (info->bw == RATE_INFO_BW_HE_RU && + nla_put_u8(msg, NL80211_RATE_INFO_HE_RU_ALLOC, + info->he_ru_alloc)) + return false; } nla_nest_end(msg, rate); @@ -4546,13 +4620,13 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, #define PUT_SINFO(attr, memb, type) do { \ BUILD_BUG_ON(sizeof(type) == sizeof(u64)); \ - if (sinfo->filled & (1ULL << NL80211_STA_INFO_ ## attr) && \ + if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_ ## attr) && \ nla_put_ ## type(msg, NL80211_STA_INFO_ ## attr, \ sinfo->memb)) \ goto nla_put_failure; \ } while (0) #define PUT_SINFO_U64(attr, memb) do { \ - if (sinfo->filled & (1ULL << NL80211_STA_INFO_ ## attr) && \ + if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_ ## attr) && \ nla_put_u64_64bit(msg, NL80211_STA_INFO_ ## attr, \ sinfo->memb, NL80211_STA_INFO_PAD)) \ goto nla_put_failure; \ @@ -4561,14 +4635,14 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO(CONNECTED_TIME, connected_time, u32); PUT_SINFO(INACTIVE_TIME, inactive_time, u32); - if (sinfo->filled & (BIT(NL80211_STA_INFO_RX_BYTES) | - BIT(NL80211_STA_INFO_RX_BYTES64)) && + if (sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES) | + BIT_ULL(NL80211_STA_INFO_RX_BYTES64)) && nla_put_u32(msg, NL80211_STA_INFO_RX_BYTES, (u32)sinfo->rx_bytes)) goto nla_put_failure; - if (sinfo->filled & (BIT(NL80211_STA_INFO_TX_BYTES) | - BIT(NL80211_STA_INFO_TX_BYTES64)) && + if (sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES) | + BIT_ULL(NL80211_STA_INFO_TX_BYTES64)) && nla_put_u32(msg, NL80211_STA_INFO_TX_BYTES, (u32)sinfo->tx_bytes)) goto nla_put_failure; @@ -4588,24 +4662,24 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, default: break; } - if (sinfo->filled & BIT(NL80211_STA_INFO_CHAIN_SIGNAL)) { + if (sinfo->filled & 
BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL)) { if (!nl80211_put_signal(msg, sinfo->chains, sinfo->chain_signal, NL80211_STA_INFO_CHAIN_SIGNAL)) goto nla_put_failure; } - if (sinfo->filled & BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)) { + if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)) { if (!nl80211_put_signal(msg, sinfo->chains, sinfo->chain_signal_avg, NL80211_STA_INFO_CHAIN_SIGNAL_AVG)) goto nla_put_failure; } - if (sinfo->filled & BIT(NL80211_STA_INFO_TX_BITRATE)) { + if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) { if (!nl80211_put_sta_rate(msg, &sinfo->txrate, NL80211_STA_INFO_TX_BITRATE)) goto nla_put_failure; } - if (sinfo->filled & BIT(NL80211_STA_INFO_RX_BITRATE)) { + if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) { if (!nl80211_put_sta_rate(msg, &sinfo->rxrate, NL80211_STA_INFO_RX_BITRATE)) goto nla_put_failure; @@ -4621,7 +4695,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO(PEER_PM, peer_pm, u32); PUT_SINFO(NONPEER_PM, nonpeer_pm, u32); - if (sinfo->filled & BIT(NL80211_STA_INFO_BSS_PARAM)) { + if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_BSS_PARAM)) { bss_param = nla_nest_start(msg, NL80211_STA_INFO_BSS_PARAM); if (!bss_param) goto nla_put_failure; @@ -4640,7 +4714,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, nla_nest_end(msg, bss_param); } - if ((sinfo->filled & BIT(NL80211_STA_INFO_STA_FLAGS)) && + if ((sinfo->filled & BIT_ULL(NL80211_STA_INFO_STA_FLAGS)) && nla_put(msg, NL80211_STA_INFO_STA_FLAGS, sizeof(struct nl80211_sta_flag_update), &sinfo->sta_flags)) @@ -4886,7 +4960,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy, return -EINVAL; if (params->supported_rates) return -EINVAL; - if (params->ext_capab || params->ht_capa || params->vht_capa) + if (params->ext_capab || params->ht_capa || params->vht_capa || + params->he_capa) return -EINVAL; } @@ -5092,6 +5167,15 @@ static int nl80211_set_station_tdls(struct genl_info *info, if 
(info->attrs[NL80211_ATTR_VHT_CAPABILITY]) params->vht_capa = nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]); + if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) { + params->he_capa = + nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + params->he_capa_len = + nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + + if (params->he_capa_len < NL80211_HE_MIN_CAPABILITY_LEN) + return -EINVAL; + } err = nl80211_parse_sta_channel_info(info, params); if (err) @@ -5319,6 +5403,17 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) params.vht_capa = nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]); + if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) { + params.he_capa = + nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + params.he_capa_len = + nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + + /* max len is validated in nla policy */ + if (params.he_capa_len < NL80211_HE_MIN_CAPABILITY_LEN) + return -EINVAL; + } + if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) { params.opmode_notif_used = true; params.opmode_notif = @@ -5351,6 +5446,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) if (!(params.sta_flags_set & BIT(NL80211_STA_FLAG_WME))) { params.ht_capa = NULL; params.vht_capa = NULL; + + /* HE requires WME */ + if (params.he_capa_len) + return -EINVAL; } /* When you run into this, adjust the code below for the new flag */ @@ -6231,7 +6330,7 @@ do { \ nl80211_check_s32); /* * Check HT operation mode based on - * IEEE 802.11 2012 8.4.2.59 HT Operation element. + * IEEE 802.11-2016 9.4.2.57 HT Operation element. 
*/ if (tb[NL80211_MESHCONF_HT_OPMODE]) { ht_opmode = nla_get_u16(tb[NL80211_MESHCONF_HT_OPMODE]); @@ -6241,22 +6340,9 @@ do { \ IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT)) return -EINVAL; - if ((ht_opmode & IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT) && - (ht_opmode & IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT)) - return -EINVAL; + /* NON_HT_STA bit is reserved, but some programs set it */ + ht_opmode &= ~IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT; - switch (ht_opmode & IEEE80211_HT_OP_MODE_PROTECTION) { - case IEEE80211_HT_OP_MODE_PROTECTION_NONE: - case IEEE80211_HT_OP_MODE_PROTECTION_20MHZ: - if (ht_opmode & IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT) - return -EINVAL; - break; - case IEEE80211_HT_OP_MODE_PROTECTION_NONMEMBER: - case IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED: - if (!(ht_opmode & IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT)) - return -EINVAL; - break; - } cfg->ht_opmode = ht_opmode; mask |= (1 << (NL80211_MESHCONF_HT_OPMODE - 1)); } @@ -6861,6 +6947,16 @@ static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev) return regulatory_pre_cac_allowed(wdev->wiphy); } +static bool nl80211_check_scan_feat(struct wiphy *wiphy, u32 flags, u32 flag, + enum nl80211_ext_feature_index feat) +{ + if (!(flags & flag)) + return true; + if (wiphy_ext_feature_isset(wiphy, feat)) + return true; + return false; +} + static int nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev, void *request, struct nlattr **attrs, @@ -6895,15 +6991,33 @@ nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev, if (((*flags & NL80211_SCAN_FLAG_LOW_PRIORITY) && !(wiphy->features & NL80211_FEATURE_LOW_PRIORITY_SCAN)) || - ((*flags & NL80211_SCAN_FLAG_LOW_SPAN) && - !wiphy_ext_feature_isset(wiphy, - NL80211_EXT_FEATURE_LOW_SPAN_SCAN)) || - ((*flags & NL80211_SCAN_FLAG_LOW_POWER) && - !wiphy_ext_feature_isset(wiphy, - NL80211_EXT_FEATURE_LOW_POWER_SCAN)) || - ((*flags & NL80211_SCAN_FLAG_HIGH_ACCURACY) && - !wiphy_ext_feature_isset(wiphy, - 
NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN))) + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_LOW_SPAN, + NL80211_EXT_FEATURE_LOW_SPAN_SCAN) || + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_LOW_POWER, + NL80211_EXT_FEATURE_LOW_POWER_SCAN) || + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_HIGH_ACCURACY, + NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN) || + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME, + NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME) || + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP, + NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP) || + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION, + NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION) || + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE, + NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE) || + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_RANDOM_SN, + NL80211_EXT_FEATURE_SCAN_RANDOM_SN) || + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_MIN_PREQ_CONTENT, + NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT)) return -EOPNOTSUPP; if (*flags & NL80211_SCAN_FLAG_RANDOM_ADDR) { @@ -6918,26 +7032,6 @@ nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev, return err; } - if ((*flags & NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME) && - !wiphy_ext_feature_isset(wiphy, - NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME)) - return -EOPNOTSUPP; - - if ((*flags & NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP) && - !wiphy_ext_feature_isset(wiphy, - NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP)) - return -EOPNOTSUPP; - - if ((*flags & NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION) && - !wiphy_ext_feature_isset(wiphy, - NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION)) - return -EOPNOTSUPP; - - if ((*flags & NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE) && - !wiphy_ext_feature_isset(wiphy, 
- NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE)) - return -EOPNOTSUPP; - return 0; } @@ -10160,7 +10254,7 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, if (err) return err; - if (sinfo.filled & BIT(NL80211_STA_INFO_BEACON_SIGNAL_AVG)) + if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG)) wdev->cqm_config->last_rssi_event_value = (s8) sinfo.rx_beacon_signal_avg; } @@ -10962,9 +11056,12 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) rem) { u8 *mask_pat; - nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, - nl80211_packet_pattern_policy, - info->extack); + err = nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, + nl80211_packet_pattern_policy, + info->extack); + if (err) + goto error; + err = -EINVAL; if (!pat_tb[NL80211_PKTPAT_MASK] || !pat_tb[NL80211_PKTPAT_PATTERN]) @@ -11213,8 +11310,11 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev, rem) { u8 *mask_pat; - nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, - nl80211_packet_pattern_policy, NULL); + err = nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, + nl80211_packet_pattern_policy, NULL); + if (err) + return err; + if (!pat_tb[NL80211_PKTPAT_MASK] || !pat_tb[NL80211_PKTPAT_PATTERN]) return -EINVAL; @@ -14930,20 +15030,24 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, EXPORT_SYMBOL(cfg80211_mgmt_tx_status); static int __nl80211_rx_control_port(struct net_device *dev, - const u8 *buf, size_t len, - const u8 *addr, u16 proto, + struct sk_buff *skb, bool unencrypted, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct ethhdr *ehdr = eth_hdr(skb); + const u8 *addr = ehdr->h_source; + u16 proto = be16_to_cpu(skb->protocol); struct sk_buff *msg; void *hdr; + struct nlattr *frame; + u32 nlportid = READ_ONCE(wdev->conn_owner_nlportid); if (!nlportid) return -ENOENT; - msg = nlmsg_new(100 + len, 
gfp); + msg = nlmsg_new(100 + skb->len, gfp); if (!msg) return -ENOMEM; @@ -14957,13 +15061,17 @@ static int __nl80211_rx_control_port(struct net_device *dev, nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) || nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), NL80211_ATTR_PAD) || - nla_put(msg, NL80211_ATTR_FRAME, len, buf) || nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) || nla_put_u16(msg, NL80211_ATTR_CONTROL_PORT_ETHERTYPE, proto) || (unencrypted && nla_put_flag(msg, NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT))) goto nla_put_failure; + frame = nla_reserve(msg, NL80211_ATTR_FRAME, skb->len); + if (!frame) + goto nla_put_failure; + + skb_copy_bits(skb, 0, nla_data(frame), skb->len); genlmsg_end(msg, hdr); return genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, nlportid); @@ -14974,14 +15082,12 @@ static int __nl80211_rx_control_port(struct net_device *dev, } bool cfg80211_rx_control_port(struct net_device *dev, - const u8 *buf, size_t len, - const u8 *addr, u16 proto, bool unencrypted) + struct sk_buff *skb, bool unencrypted) { int ret; - trace_cfg80211_rx_control_port(dev, buf, len, addr, proto, unencrypted); - ret = __nl80211_rx_control_port(dev, buf, len, addr, proto, - unencrypted, GFP_ATOMIC); + trace_cfg80211_rx_control_port(dev, skb, unencrypted); + ret = __nl80211_rx_control_port(dev, skb, unencrypted, GFP_ATOMIC); trace_cfg80211_return_bool(ret == 0); return ret == 0; } diff --git a/net/wireless/reg.c b/net/wireless/reg.c index bbe6298e4bb9..4fc66a117b7d 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2240,7 +2240,9 @@ static void wiphy_update_regulatory(struct wiphy *wiphy, * as some drivers used this to restore its orig_* reg domain. 
*/ if (initiator == NL80211_REGDOM_SET_BY_CORE && - wiphy->regulatory_flags & REGULATORY_CUSTOM_REG) + wiphy->regulatory_flags & REGULATORY_CUSTOM_REG && + !(wiphy->regulatory_flags & + REGULATORY_WIPHY_SELF_MANAGED)) reg_call_notifier(wiphy, lr); return; } @@ -2787,26 +2789,6 @@ static void notify_self_managed_wiphys(struct regulatory_request *request) } } -static bool reg_only_self_managed_wiphys(void) -{ - struct cfg80211_registered_device *rdev; - struct wiphy *wiphy; - bool self_managed_found = false; - - ASSERT_RTNL(); - - list_for_each_entry(rdev, &cfg80211_rdev_list, list) { - wiphy = &rdev->wiphy; - if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) - self_managed_found = true; - else - return false; - } - - /* make sure at least one self-managed wiphy exists */ - return self_managed_found; -} - /* * Processes regulatory hints, this is all the NL80211_REGDOM_SET_BY_* * Regulatory hints come on a first come first serve basis and we @@ -2839,10 +2821,6 @@ static void reg_process_pending_hints(void) spin_unlock(&reg_requests_lock); notify_self_managed_wiphys(reg_request); - if (reg_only_self_managed_wiphys()) { - reg_free_request(reg_request); - return; - } reg_process_hint(reg_request); diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c index 570a2b67ca10..6ab32f6a1961 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -102,7 +102,7 @@ static int wiphy_suspend(struct device *dev) struct cfg80211_registered_device *rdev = dev_to_rdev(dev); int ret = 0; - rdev->suspend_at = get_seconds(); + rdev->suspend_at = ktime_get_boottime_seconds(); rtnl_lock(); if (rdev->wiphy.registered) { @@ -130,7 +130,7 @@ static int wiphy_resume(struct device *dev) int ret = 0; /* Age scan results with time spent in suspend */ - cfg80211_bss_age(rdev, get_seconds() - rdev->suspend_at); + cfg80211_bss_age(rdev, ktime_get_boottime_seconds() - rdev->suspend_at); rtnl_lock(); if (rdev->wiphy.registered && rdev->ops->resume) diff --git a/net/wireless/trace.h
b/net/wireless/trace.h index 2b417a2fe63f..7c73510b161f 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2627,23 +2627,25 @@ TRACE_EVENT(cfg80211_mgmt_tx_status, ); TRACE_EVENT(cfg80211_rx_control_port, - TP_PROTO(struct net_device *netdev, const u8 *buf, size_t len, - const u8 *addr, u16 proto, bool unencrypted), - TP_ARGS(netdev, buf, len, addr, proto, unencrypted), + TP_PROTO(struct net_device *netdev, struct sk_buff *skb, + bool unencrypted), + TP_ARGS(netdev, skb, unencrypted), TP_STRUCT__entry( NETDEV_ENTRY - MAC_ENTRY(addr) + __field(int, len) + MAC_ENTRY(from) __field(u16, proto) __field(bool, unencrypted) ), TP_fast_assign( NETDEV_ASSIGN; - MAC_ASSIGN(addr, addr); - __entry->proto = proto; + __entry->len = skb->len; + MAC_ASSIGN(from, eth_hdr(skb)->h_source); + __entry->proto = be16_to_cpu(skb->protocol); __entry->unencrypted = unencrypted; ), - TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT " proto: 0x%x, unencrypted: %s", - NETDEV_PR_ARG, MAC_PR_ARG(addr), + TP_printk(NETDEV_PR_FMT ", len=%d, " MAC_PR_FMT ", proto: 0x%x, unencrypted: %s", + NETDEV_PR_ARG, __entry->len, MAC_PR_ARG(from), __entry->proto, BOOL_TO_STR(__entry->unencrypted)) ); diff --git a/net/wireless/util.c b/net/wireless/util.c index 3c654cd7ba56..e0825a019e9f 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -4,6 +4,7 @@ * * Copyright 2007-2009 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright 2017 Intel Deutschland GmbH */ #include <linux/export.h> #include <linux/bitops.h> @@ -1142,6 +1143,85 @@ static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate) return 0; } +static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate) +{ +#define SCALE 2048 + u16 mcs_divisors[12] = { + 34133, /* 16.666666... */ + 17067, /* 8.333333... */ + 11378, /* 5.555555... */ + 8533, /* 4.166666... */ + 5689, /* 2.777777... */ + 4267, /* 2.083333... */ + 3923, /* 1.851851... */ + 3413, /* 1.666666... 
*/ + 2844, /* 1.388888... */ + 2560, /* 1.250000... */ + 2276, /* 1.111111... */ + 2048, /* 1.000000... */ + }; + u32 rates_160M[3] = { 960777777, 907400000, 816666666 }; + u32 rates_969[3] = { 480388888, 453700000, 408333333 }; + u32 rates_484[3] = { 229411111, 216666666, 195000000 }; + u32 rates_242[3] = { 114711111, 108333333, 97500000 }; + u32 rates_106[3] = { 40000000, 37777777, 34000000 }; + u32 rates_52[3] = { 18820000, 17777777, 16000000 }; + u32 rates_26[3] = { 9411111, 8888888, 8000000 }; + u64 tmp; + u32 result; + + if (WARN_ON_ONCE(rate->mcs > 11)) + return 0; + + if (WARN_ON_ONCE(rate->he_gi > NL80211_RATE_INFO_HE_GI_3_2)) + return 0; + if (WARN_ON_ONCE(rate->he_ru_alloc > + NL80211_RATE_INFO_HE_RU_ALLOC_2x996)) + return 0; + if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8)) + return 0; + + if (rate->bw == RATE_INFO_BW_160) + result = rates_160M[rate->he_gi]; + else if (rate->bw == RATE_INFO_BW_80 || + (rate->bw == RATE_INFO_BW_HE_RU && + rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_996)) + result = rates_969[rate->he_gi]; + else if (rate->bw == RATE_INFO_BW_40 || + (rate->bw == RATE_INFO_BW_HE_RU && + rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_484)) + result = rates_484[rate->he_gi]; + else if (rate->bw == RATE_INFO_BW_20 || + (rate->bw == RATE_INFO_BW_HE_RU && + rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_242)) + result = rates_242[rate->he_gi]; + else if (rate->bw == RATE_INFO_BW_HE_RU && + rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_106) + result = rates_106[rate->he_gi]; + else if (rate->bw == RATE_INFO_BW_HE_RU && + rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_52) + result = rates_52[rate->he_gi]; + else if (rate->bw == RATE_INFO_BW_HE_RU && + rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_26) + result = rates_26[rate->he_gi]; + else if (WARN(1, "invalid HE MCS: bw:%d, ru:%d\n", + rate->bw, rate->he_ru_alloc)) + return 0; + + /* now scale to the appropriate MCS */ + tmp = result; + tmp *= SCALE; + do_div(tmp, 
mcs_divisors[rate->mcs]); + result = tmp; + + /* and take NSS, DCM into account */ + result = (result * rate->nss) / 8; + if (rate->he_dcm) + result /= 2; + + return result; +} + u32 cfg80211_calculate_bitrate(struct rate_info *rate) { if (rate->flags & RATE_INFO_FLAGS_MCS) @@ -1150,6 +1230,8 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate) return cfg80211_calculate_bitrate_60g(rate); if (rate->flags & RATE_INFO_FLAGS_VHT_MCS) return cfg80211_calculate_bitrate_vht(rate); + if (rate->flags & RATE_INFO_FLAGS_HE_MCS) + return cfg80211_calculate_bitrate_he(rate); return rate->legacy; } @@ -1791,8 +1873,9 @@ bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range, int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp) { - sinfo->pertid = kcalloc(sizeof(*(sinfo->pertid)), - IEEE80211_NUM_TIDS + 1, gfp); + sinfo->pertid = kcalloc(IEEE80211_NUM_TIDS + 1, + sizeof(*(sinfo->pertid)), + gfp); if (!sinfo->pertid) return -ENOMEM; diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 05186a47878f..167f7025ac98 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -1278,7 +1278,7 @@ static int cfg80211_wext_giwrate(struct net_device *dev, if (err) return err; - if (!(sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE))) + if (!(sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))) return -EOPNOTSUPP; rate->value = 100000 * cfg80211_calculate_bitrate(&sinfo.txrate); @@ -1320,7 +1320,7 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) switch (rdev->wiphy.signal_type) { case CFG80211_SIGNAL_TYPE_MBM: - if (sinfo.filled & BIT(NL80211_STA_INFO_SIGNAL)) { + if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL)) { int sig = sinfo.signal; wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED; wstats.qual.updated |= IW_QUAL_QUAL_UPDATED; @@ -1334,7 +1334,7 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) break; } case CFG80211_SIGNAL_TYPE_UNSPEC: - if 
(sinfo.filled & BIT(NL80211_STA_INFO_SIGNAL)) { + if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL)) { wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED; wstats.qual.updated |= IW_QUAL_QUAL_UPDATED; wstats.qual.level = sinfo.signal; @@ -1347,9 +1347,9 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) } wstats.qual.updated |= IW_QUAL_NOISE_INVALID; - if (sinfo.filled & BIT(NL80211_STA_INFO_RX_DROP_MISC)) + if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC)) wstats.discard.misc = sinfo.rx_dropped_misc; - if (sinfo.filled & BIT(NL80211_STA_INFO_TX_FAILED)) + if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED)) wstats.discard.retries = sinfo.tx_failed; return &wstats; diff --git a/net/x25/Kconfig b/net/x25/Kconfig index e2fa133f9fba..59fcb41fc5e6 100644 --- a/net/x25/Kconfig +++ b/net/x25/Kconfig @@ -31,5 +31,3 @@ config X25 To compile this driver as a module, choose M here: the module will be called x25. If unsure, say N. - - diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index f93365ae0fdd..d49aa79b7997 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -1750,7 +1750,7 @@ static const struct proto_ops x25_proto_ops = { .socketpair = sock_no_socketpair, .accept = x25_accept, .getname = x25_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = x25_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_x25_ioctl, diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c index 9c214ec681ac..743103786652 100644 --- a/net/x25/x25_subr.c +++ b/net/x25/x25_subr.c @@ -381,4 +381,3 @@ void x25_check_rbuf(struct sock *sk) x25_stop_timer(sk); } } - diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 3b3410ada097..72335c2e8108 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -199,8 +199,11 @@ static void xsk_destruct_skb(struct sk_buff *skb) { u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg; struct xdp_sock *xs = xdp_sk(skb->sk); + unsigned long flags; + spin_lock_irqsave(&xs->tx_completion_lock, flags); 
WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr)); + spin_unlock_irqrestore(&xs->tx_completion_lock, flags); sock_wfree(skb); } @@ -215,9 +218,6 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, struct sk_buff *skb; int err = 0; - if (unlikely(!xs->tx)) - return -ENOBUFS; - mutex_lock(&xs->mutex); while (xskq_peek_desc(xs->tx, &desc)) { @@ -230,22 +230,13 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, goto out; } - if (xskq_reserve_addr(xs->umem->cq)) { - err = -EAGAIN; - goto out; - } - - len = desc.len; - if (unlikely(len > xs->dev->mtu)) { - err = -EMSGSIZE; + if (xskq_reserve_addr(xs->umem->cq)) goto out; - } - if (xs->queue_id >= xs->dev->real_num_tx_queues) { - err = -ENXIO; + if (xs->queue_id >= xs->dev->real_num_tx_queues) goto out; - } + len = desc.len; skb = sock_alloc_send_skb(sk, len, 1, &err); if (unlikely(!skb)) { err = -EAGAIN; @@ -268,15 +259,15 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, skb->destructor = xsk_destruct_skb; err = dev_direct_xmit(skb, xs->queue_id); + xskq_discard_desc(xs->tx); /* Ignore NET_XMIT_CN as packet might have been sent */ if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) { - err = -EAGAIN; - /* SKB consumed by dev_direct_xmit() */ + /* SKB completed but not sent */ + err = -EBUSY; goto out; } sent_frame = true; - xskq_discard_desc(xs->tx); } out: @@ -297,15 +288,18 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) return -ENXIO; if (unlikely(!(xs->dev->flags & IFF_UP))) return -ENETDOWN; + if (unlikely(!xs->tx)) + return -ENOBUFS; if (need_wait) return -EOPNOTSUPP; return (xs->zc) ? 
xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len); } -static __poll_t xsk_poll_mask(struct socket *sock, __poll_t events) +static unsigned int xsk_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) { - __poll_t mask = datagram_poll_mask(sock, events); + unsigned int mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); @@ -696,7 +690,7 @@ static const struct proto_ops xsk_proto_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = xsk_poll_mask, + .poll = xsk_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -754,6 +748,7 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, xs = xdp_sk(sk); mutex_init(&xs->mutex); + spin_lock_init(&xs->tx_completion_lock); local_bh_disable(); sock_prot_inuse_add(net, &xsk_proto, 1); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index ef6a6f0ec949..52ecaf770642 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -62,14 +62,9 @@ static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt) return (entries > dcnt) ? 
dcnt : entries; } -static inline u32 xskq_nb_free_lazy(struct xsk_queue *q, u32 producer) -{ - return q->nentries - (producer - q->cons_tail); -} - static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt) { - u32 free_entries = xskq_nb_free_lazy(q, producer); + u32 free_entries = q->nentries - (producer - q->cons_tail); if (free_entries >= dcnt) return free_entries; @@ -129,7 +124,7 @@ static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - if (xskq_nb_free(q, q->prod_tail, LAZY_UPDATE_THRESHOLD) == 0) + if (xskq_nb_free(q, q->prod_tail, 1) == 0) return -ENOSPC; ring->desc[q->prod_tail++ & q->ring_mask] = addr; diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index 53381888a7b3..4a9ee2d83158 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -95,4 +95,3 @@ config NET_KEY_MIGRATE <draft-sugimoto-mip6-pfkey-migrate>. If unsure, say N. - diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 79245e1c3487..5553724b5fcc 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -3350,4 +3350,3 @@ module_init(xfrm_user_init); module_exit(xfrm_user_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_XFRM); - |