From d594e987c6f5417cc63dd7e107a2a03a7eeee03f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Jun 2012 03:50:35 +0000 Subject: sock_diag: add SK_MEMINFO_BACKLOG Adding socket backlog len in INET_DIAG_SKMEMINFO is really useful to diagnose various TCP problems. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock_diag.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 5fd146720f39..0d934ce1075f 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -46,6 +46,7 @@ int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype) mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); + mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; return 0; -- cgit v1.2.3 From 35b2a113cb0298d4f9a1263338b456094a414057 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 16 May 2012 23:40:18 +0200 Subject: wireless: remove wext sysfs The only user of this was hal prior to its 0.5.12 release which happened over two years ago, so I'm sure this can be removed without issues. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- Documentation/feature-removal-schedule.txt | 9 ---- net/core/net-sysfs.c | 74 ------------------------------ net/wireless/Kconfig | 13 ------ 3 files changed, 96 deletions(-) (limited to 'net/core') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 56000b33340b..0258e3d06280 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -249,15 +249,6 @@ Who: Ravikiran Thirumalai --------------------------- -What: Code that is now under CONFIG_WIRELESS_EXT_SYSFS - (in net/core/net-sysfs.c) -When: 3.5 -Why: Over 1K .text/.data size reduction, data is available in other - ways (ioctls) -Who: Johannes Berg - ---------------------------- - What: sysfs ui for changing p4-clockmod parameters When: September 2009 Why: See commits 129f8ae9b1b5be94517da76009ea956e89104ce8 and diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index fdf9e61d0651..72607174ea5a 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -417,72 +417,6 @@ static struct attribute_group netstat_group = { .name = "statistics", .attrs = netstat_attrs, }; - -#ifdef CONFIG_WIRELESS_EXT_SYSFS -/* helper function that does all the locking etc for wireless stats */ -static ssize_t wireless_show(struct device *d, char *buf, - ssize_t (*format)(const struct iw_statistics *, - char *)) -{ - struct net_device *dev = to_net_dev(d); - const struct iw_statistics *iw; - ssize_t ret = -EINVAL; - - if (!rtnl_trylock()) - return restart_syscall(); - if (dev_isalive(dev)) { - iw = get_wireless_stats(dev); - if (iw) - ret = (*format)(iw, buf); - } - rtnl_unlock(); - - return ret; -} - -/* show function template for wireless fields */ -#define WIRELESS_SHOW(name, field, format_string) \ -static ssize_t format_iw_##name(const struct iw_statistics *iw, char *buf) \ -{ \ - return sprintf(buf, format_string, iw->field); \ -} \ -static ssize_t show_iw_##name(struct device *d, \ - struct device_attribute *attr, char *buf) \ -{ \ - return wireless_show(d, buf, format_iw_##name); \ -} \ -static DEVICE_ATTR(name, S_IRUGO, show_iw_##name, NULL) - -WIRELESS_SHOW(status, status, fmt_hex); -WIRELESS_SHOW(link, qual.qual, fmt_dec); -WIRELESS_SHOW(level, qual.level, fmt_dec); -WIRELESS_SHOW(noise, qual.noise, fmt_dec); -WIRELESS_SHOW(nwid, discard.nwid, fmt_dec); -WIRELESS_SHOW(crypt, discard.code, fmt_dec); -WIRELESS_SHOW(fragment, discard.fragment, fmt_dec); -WIRELESS_SHOW(misc, discard.misc, fmt_dec); -WIRELESS_SHOW(retries, discard.retries, fmt_dec); -WIRELESS_SHOW(beacon, miss.beacon, fmt_dec); - -static struct attribute *wireless_attrs[] = { - &dev_attr_status.attr, - &dev_attr_link.attr, - &dev_attr_level.attr, - &dev_attr_noise.attr, - &dev_attr_nwid.attr, - &dev_attr_crypt.attr, - &dev_attr_fragment.attr, - &dev_attr_retries.attr, - &dev_attr_misc.attr, - &dev_attr_beacon.attr, - NULL -}; - -static struct attribute_group wireless_group = { - .name = "wireless", - .attrs = wireless_attrs, -}; -#endif #endif /* CONFIG_SYSFS */ #ifdef CONFIG_RPS @@ -1463,14 +1397,6 @@ int netdev_register_kobject(struct net_device *net) groups++; *groups++ = &netstat_group; -#ifdef CONFIG_WIRELESS_EXT_SYSFS - if (net->ieee80211_ptr) - *groups++ = &wireless_group; -#ifdef CONFIG_WIRELESS_EXT - else if (net->wireless_handlers) - *groups++ = &wireless_group; -#endif -#endif #endif /* CONFIG_SYSFS */ error = device_add(dev); diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 2e4444fedbe0..8dba3c60794a 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -119,19 +119,6 @@ config CFG80211_WEXT Enable this option if you need old userspace for wireless extensions with cfg80211-based drivers. -config WIRELESS_EXT_SYSFS - bool "Wireless extensions sysfs files" - depends on WEXT_CORE && SYSFS - help - This option enables the deprecated wireless statistics - files in /sys/class/net/*/wireless/. The same information - is available via the ioctls as well. - - Say N. If you know you have ancient tools requiring it, - like very old versions of hal (prior to 0.5.12 release), - say Y and update the tools as soon as possible as this - option will be removed soon. - config LIB80211 tristate "Common routines for IEEE802.11 drivers" default n -- cgit v1.2.3 From 94b6042cfed02229b05e04002ab00085b60f8213 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 6 Jun 2012 15:23:37 +0000 Subject: net: Update kernel-doc for __alloc_skb() __alloc_skb() now extends tailroom to allow the use of padding added by the heap allocator. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/skbuff.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 016694d62484..1d74cea22aaa 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -160,8 +160,8 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here) * @node: numa node to allocate memory on * * Allocate a new &sk_buff. The returned buffer has no headroom and a - * tail room of size bytes. The object has a reference count of one. - * The return is the buffer. On a failure the return is %NULL. + * tail room of at least size bytes. The object has a reference count + * of one. The return is the buffer. On a failure the return is %NULL. * * Buffers may only be allocated from interrupts using a @gfp_mask of * %GFP_ATOMIC. -- cgit v1.2.3 From 80f12eccce775dc6bb93dba9b52529740f929237 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 6 Jun 2012 17:13:06 +0000 Subject: Added kernel support in EEE Ethtool commands This patch extends the kernel's ethtool interface by adding support for 2 new EEE commands - get_eee and set_eee. Thanks goes to Giuseppe Cavallaro for his original patch adding this support. Signed-off-by: Yuval Mintz Signed-off-by: Eilon Greenstein Reviewed-by: Ben Hutchings Signed-off-by: David S. Miller --- include/linux/ethtool.h | 35 +++++++++++++++++++++++++++++++++++ net/core/ethtool.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) (limited to 'net/core') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index e17fa7140588..297370a6cb18 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -136,6 +136,35 @@ struct ethtool_eeprom { __u8 data[0]; }; +/** + * struct ethtool_eee - Energy Efficient Ethernet information + * @cmd: ETHTOOL_{G,S}EEE + * @supported: Mask of %SUPPORTED_* flags for the speed/duplex combinations + * for which there is EEE support. + * @advertised: Mask of %ADVERTISED_* flags for the speed/duplex combinations + * advertised as eee capable. + * @lp_advertised: Mask of %ADVERTISED_* flags for the speed/duplex + * combinations advertised by the link partner as eee capable. + * @eee_active: Result of the eee auto negotiation. + * @eee_enabled: EEE configured mode (enabled/disabled). + * @tx_lpi_enabled: Whether the interface should assert its tx lpi, given + * that eee was negotiated. + * @tx_lpi_timer: Time in microseconds the interface delays prior to asserting + * its tx lpi (after reaching 'idle' state). Effective only when eee + * was negotiated and tx_lpi_enabled was set. + */ +struct ethtool_eee { + __u32 cmd; + __u32 supported; + __u32 advertised; + __u32 lp_advertised; + __u32 eee_active; + __u32 eee_enabled; + __u32 tx_lpi_enabled; + __u32 tx_lpi_timer; + __u32 reserved[2]; +}; + /** * struct ethtool_modinfo - plugin module eeprom information * @cmd: %ETHTOOL_GMODULEINFO @@ -945,6 +974,8 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings) * @get_module_info: Get the size and type of the eeprom contained within * a plug-in module. * @get_module_eeprom: Get the eeprom information from the plug-in module + * @get_eee: Get Energy-Efficient (EEE) supported and status. + * @set_eee: Set EEE status (enable/disable) as well as LPI timers. * * All operations are optional (i.e. the function pointer may be set * to %NULL) and callers must take this into account. Callers must @@ -1011,6 +1042,8 @@ struct ethtool_ops { struct ethtool_modinfo *); int (*get_module_eeprom)(struct net_device *, struct ethtool_eeprom *, u8 *); + int (*get_eee)(struct net_device *, struct ethtool_eee *); + int (*set_eee)(struct net_device *, struct ethtool_eee *); }; @@ -1089,6 +1122,8 @@ struct ethtool_ops { #define ETHTOOL_GET_TS_INFO 0x00000041 /* Get time stamping and PHC info */ #define ETHTOOL_GMODULEINFO 0x00000042 /* Get plug-in module information */ #define ETHTOOL_GMODULEEEPROM 0x00000043 /* Get plug-in module eeprom */ +#define ETHTOOL_GEEE 0x00000044 /* Get EEE settings */ +#define ETHTOOL_SEEE 0x00000045 /* Set EEE settings */ /* compatibility with older code */ #define SPARC_ETH_GSET ETHTOOL_GSET diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 9c2afb480270..c73d0a59212c 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -729,6 +729,40 @@ static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) return dev->ethtool_ops->set_wol(dev, &wol); } +static int ethtool_get_eee(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_eee edata; + int rc; + + if (!dev->ethtool_ops->get_eee) + return -EOPNOTSUPP; + + memset(&edata, 0, sizeof(struct ethtool_eee)); + edata.cmd = ETHTOOL_GEEE; + rc = dev->ethtool_ops->get_eee(dev, &edata); + + if (rc) + return rc; + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + + return 0; +} + +static int ethtool_set_eee(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_eee edata; + + if (!dev->ethtool_ops->set_eee) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + return dev->ethtool_ops->set_eee(dev, &edata); +} + static int ethtool_nway_reset(struct net_device *dev) { if (!dev->ethtool_ops->nway_reset) @@ -1471,6 +1505,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) rc = ethtool_set_value_void(dev, useraddr, dev->ethtool_ops->set_msglevel); break; + case ETHTOOL_GEEE: + rc = ethtool_get_eee(dev, useraddr); + break; + case ETHTOOL_SEEE: + rc = ethtool_set_eee(dev, useraddr); + break; case ETHTOOL_NWAY_RST: rc = ethtool_nway_reset(dev); break; -- cgit v1.2.3 From 95603e2293de556de7e82221649bfd7fd98b64a3 Mon Sep 17 00:00:00 2001 From: Michel Machado Date: Tue, 12 Jun 2012 10:16:35 +0000 Subject: net-next: add dev_loopback_xmit() to avoid duplicate code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add dev_loopback_xmit() in order to deduplicate functions ip_dev_loopback_xmit() (in net/ipv4/ip_output.c) and ip6_dev_loopback_xmit() (in net/ipv6/ip6_output.c). I was about to reinvent the wheel when I noticed that ip_dev_loopback_xmit() and ip6_dev_loopback_xmit() do exactly what I need and are not IP-only functions, but they were not available to reuse elsewhere. ip6_dev_loopback_xmit() does not have line "skb_dst_force(skb);", but I understand that this is harmless, and should be in dev_loopback_xmit(). Signed-off-by: Michel Machado CC: "David S. Miller" CC: Alexey Kuznetsov CC: James Morris CC: Hideaki YOSHIFUJI CC: Patrick McHardy CC: Eric Dumazet CC: Jiri Pirko CC: "Michał Mirosław" CC: Ben Hutchings Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 17 +++++++++++++++++ net/ipv4/ip_output.c | 17 ++--------------- net/ipv6/ip6_output.c | 15 +-------------- 4 files changed, 21 insertions(+), 29 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a0b84e3b087c..2c2ecea28a1b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1627,6 +1627,7 @@ extern int dev_alloc_name(struct net_device *dev, const char *name); extern int dev_open(struct net_device *dev); extern int dev_close(struct net_device *dev); extern void dev_disable_lro(struct net_device *dev); +extern int dev_loopback_xmit(struct sk_buff *newskb); extern int dev_queue_xmit(struct sk_buff *skb); extern int register_netdevice(struct net_device *dev); extern void unregister_netdevice_queue(struct net_device *dev, diff --git a/net/core/dev.c b/net/core/dev.c index cd0981977f5c..c6e29ea65bd9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2475,6 +2475,23 @@ static void skb_update_prio(struct sk_buff *skb) static DEFINE_PER_CPU(int, xmit_recursion); #define RECURSION_LIMIT 10 +/** + * dev_loopback_xmit - loop back @skb + * @skb: buffer to transmit + */ +int dev_loopback_xmit(struct sk_buff *skb) +{ + skb_reset_mac_header(skb); + __skb_pull(skb, skb_network_offset(skb)); + skb->pkt_type = PACKET_LOOPBACK; + skb->ip_summed = CHECKSUM_UNNECESSARY; + WARN_ON(!skb_dst(skb)); + skb_dst_force(skb); + netif_rx_ni(skb); + return 0; +} +EXPORT_SYMBOL(dev_loopback_xmit); + /** * dev_queue_xmit - transmit a buffer * @skb: buffer to transmit diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b99ca4e154b9..0f3185a662c3 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -113,19 +113,6 @@ int ip_local_out(struct sk_buff *skb) } EXPORT_SYMBOL_GPL(ip_local_out); -/* dev_loopback_xmit for use with netfilter. */ -static int ip_dev_loopback_xmit(struct sk_buff *newskb) -{ - skb_reset_mac_header(newskb); - __skb_pull(newskb, skb_network_offset(newskb)); - newskb->pkt_type = PACKET_LOOPBACK; - newskb->ip_summed = CHECKSUM_UNNECESSARY; - WARN_ON(!skb_dst(newskb)); - skb_dst_force(newskb); - netif_rx_ni(newskb); - return 0; -} - static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) { int ttl = inet->uc_ttl; @@ -281,7 +268,7 @@ int ip_mc_output(struct sk_buff *skb) if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, NULL, newskb->dev, - ip_dev_loopback_xmit); + dev_loopback_xmit); } /* Multicasts with ttl 0 must not go beyond the host */ @@ -296,7 +283,7 @@ int ip_mc_output(struct sk_buff *skb) struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, - NULL, newskb->dev, ip_dev_loopback_xmit); + NULL, newskb->dev, dev_loopback_xmit); } return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 62fcf3e48aca..ee1bb450bfe4 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -83,19 +83,6 @@ int ip6_local_out(struct sk_buff *skb) } EXPORT_SYMBOL_GPL(ip6_local_out); -/* dev_loopback_xmit for use with netfilter. */ -static int ip6_dev_loopback_xmit(struct sk_buff *newskb) -{ - skb_reset_mac_header(newskb); - __skb_pull(newskb, skb_network_offset(newskb)); - newskb->pkt_type = PACKET_LOOPBACK; - newskb->ip_summed = CHECKSUM_UNNECESSARY; - WARN_ON(!skb_dst(newskb)); - - netif_rx_ni(newskb); - return 0; -} - static int ip6_finish_output2(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); @@ -121,7 +108,7 @@ static int ip6_finish_output2(struct sk_buff *skb) if (newskb) NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, newskb, NULL, newskb->dev, - ip6_dev_loopback_xmit); + dev_loopback_xmit); if (ipv6_hdr(skb)->hop_limit == 0) { IP6_INC_STATS(dev_net(dev), idev, -- cgit v1.2.3 From 2da45db2bdd432a9dca825099c791f5c851f92b9 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 12 Jun 2012 13:05:41 +0000 Subject: ethtool: Make more commands available to unprivileged processes 'Get' commands should generally not require CAP_NET_ADMIN, with the exception of those that expose internal state. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/ethtool.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index c73d0a59212c..cbf033dcaf1f 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1443,6 +1443,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GSET: case ETHTOOL_GDRVINFO: case ETHTOOL_GMSGLVL: + case ETHTOOL_GLINK: case ETHTOOL_GCOALESCE: case ETHTOOL_GRINGPARAM: case ETHTOOL_GPAUSEPARAM: @@ -1451,6 +1452,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GSG: case ETHTOOL_GSSET_INFO: case ETHTOOL_GSTRINGS: + case ETHTOOL_GSTATS: case ETHTOOL_GTSO: case ETHTOOL_GPERMADDR: case ETHTOOL_GUFO: @@ -1463,8 +1465,11 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GRXCLSRLCNT: case ETHTOOL_GRXCLSRULE: case ETHTOOL_GRXCLSRLALL: + case ETHTOOL_GRXFHINDIR: case ETHTOOL_GFEATURES: + case ETHTOOL_GCHANNELS: case ETHTOOL_GET_TS_INFO: + case ETHTOOL_GEEE: break; default: if (!capable(CAP_NET_ADMIN)) -- cgit v1.2.3 From 41063e9dd11956f2d285e12e4342e1d232ba0ea2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 19 Jun 2012 21:22:05 -0700 Subject: ipv4: Early TCP socket demux. Input packet processing for local sockets involves two major demuxes. One for the route and one for the socket. But we can optimize this down to one demux for certain kinds of local sockets. Currently we only do this for established TCP sockets, but it could at least in theory be expanded to other kinds of connections. If a TCP socket is established then it's identity is fully specified. This means that whatever input route was used during the three-way handshake must work equally well for the rest of the connection since the keys will not change. Once we move to established state, we cache the receive packet's input route to use later. Like the existing cached route in sk->sk_dst_cache used for output packets, we have to check for route invalidations using dst->obsolete and dst->ops->check(). Early demux occurs outside of a socket locked section, so when a route invalidation occurs we defer the fixup of sk->sk_rx_dst until we are actually inside of established state packet processing and thus have the socket locked. Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 4 ++-- include/net/protocol.h | 1 + include/net/sock.h | 2 ++ include/net/tcp.h | 1 + net/core/sock.c | 5 +++++ net/ipv4/af_inet.c | 18 +++++++++-------- net/ipv4/ip_input.c | 39 ++++++++++++++++++++++++------------ net/ipv4/tcp_input.c | 16 ++++++++++++++- net/ipv4/tcp_ipv4.c | 46 +++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_minisocks.c | 2 ++ 10 files changed, 110 insertions(+), 24 deletions(-) (limited to 'net/core') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 808fc5f76b03..54be0287eb98 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -379,10 +379,10 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, const __be16 sport, const __be16 dport) { - struct sock *sk; + struct sock *sk = skb_steal_sock(skb); const struct iphdr *iph = ip_hdr(skb); - if (unlikely(sk = skb_steal_sock(skb))) + if (sk) return sk; else return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo, diff --git a/include/net/protocol.h b/include/net/protocol.h index a1b1b530c338..967b926cbfb1 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -37,6 +37,7 @@ /* This is used to register protocols. */ struct net_protocol { + int (*early_demux)(struct sk_buff *skb); int (*handler)(struct sk_buff *skb); void (*err_handler)(struct sk_buff *skb, u32 info); int (*gso_send_check)(struct sk_buff *skb); diff --git a/include/net/sock.h b/include/net/sock.h index 4a4521699563..87b424ae750a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -319,6 +319,7 @@ struct sock { unsigned long sk_flags; struct dst_entry *sk_dst_cache; spinlock_t sk_dst_lock; + struct dst_entry *sk_rx_dst; atomic_t sk_wmem_alloc; atomic_t sk_omem_alloc; int sk_sndbuf; @@ -1426,6 +1427,7 @@ extern struct sk_buff *sock_rmalloc(struct sock *sk, gfp_t priority); extern void sock_wfree(struct sk_buff *skb); extern void sock_rfree(struct sk_buff *skb); +extern void sock_edemux(struct sk_buff *skb); extern int sock_setsockopt(struct socket *sock, int level, int op, char __user *optval, diff --git a/include/net/tcp.h b/include/net/tcp.h index 9332f342259a..6660ffc4963d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -325,6 +325,7 @@ extern void tcp_v4_err(struct sk_buff *skb, u32); extern void tcp_shutdown (struct sock *sk, int how); +extern int tcp_v4_early_demux(struct sk_buff *skb); extern int tcp_v4_rcv(struct sk_buff *skb); extern struct inet_peer *tcp_v4_get_peer(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index 9e5b71fda6ec..929bdcc2383b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1465,6 +1465,11 @@ void sock_rfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_rfree); +void sock_edemux(struct sk_buff *skb) +{ + sock_put(skb->sk); +} +EXPORT_SYMBOL(sock_edemux); int sock_i_uid(struct sock *sk) { diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 85a3b1763136..07a02f6e9696 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -157,6 +157,7 @@ void inet_sock_destruct(struct sock *sk) kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); + dst_release(sk->sk_rx_dst); sk_refcnt_debug_dec(sk); } EXPORT_SYMBOL(inet_sock_destruct); @@ -1518,14 +1519,15 @@ static const struct net_protocol igmp_protocol = { #endif static const struct net_protocol tcp_protocol = { - .handler = tcp_v4_rcv, - .err_handler = tcp_v4_err, - .gso_send_check = tcp_v4_gso_send_check, - .gso_segment = tcp_tso_segment, - .gro_receive = tcp4_gro_receive, - .gro_complete = tcp4_gro_complete, - .no_policy = 1, - .netns_ok = 1, + .early_demux = tcp_v4_early_demux, + .handler = tcp_v4_rcv, + .err_handler = tcp_v4_err, + .gso_send_check = tcp_v4_gso_send_check, + .gso_segment = tcp_tso_segment, + .gro_receive = tcp4_gro_receive, + .gro_complete = tcp4_gro_complete, + .no_policy = 1, + .netns_ok = 1, }; static const struct net_protocol udp_protocol = { diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index c4fe1d271131..93b092c9a394 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -323,19 +323,32 @@ static int ip_rcv_finish(struct sk_buff *skb) * how the packet travels inside Linux networking. */ if (skb_dst(skb) == NULL) { - int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev); - if (unlikely(err)) { - if (err == -EHOSTUNREACH) - IP_INC_STATS_BH(dev_net(skb->dev), - IPSTATS_MIB_INADDRERRORS); - else if (err == -ENETUNREACH) - IP_INC_STATS_BH(dev_net(skb->dev), - IPSTATS_MIB_INNOROUTES); - else if (err == -EXDEV) - NET_INC_STATS_BH(dev_net(skb->dev), - LINUX_MIB_IPRPFILTER); - goto drop; + const struct net_protocol *ipprot; + int protocol = iph->protocol; + int err; + + rcu_read_lock(); + ipprot = rcu_dereference(inet_protos[protocol]); + err = -ENOENT; + if (ipprot && ipprot->early_demux) + err = ipprot->early_demux(skb); + rcu_read_unlock(); + + if (err) { + err = ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev); + if (unlikely(err)) { + if (err == -EHOSTUNREACH) + IP_INC_STATS_BH(dev_net(skb->dev), + IPSTATS_MIB_INADDRERRORS); + else if (err == -ENETUNREACH) + IP_INC_STATS_BH(dev_net(skb->dev), + IPSTATS_MIB_INNOROUTES); + else if (err == -EXDEV) + NET_INC_STATS_BH(dev_net(skb->dev), + LINUX_MIB_IPRPFILTER); + goto drop; + } } } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b224eb8bce8b..8416f8a68e65 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5518,6 +5518,18 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); int res; + if (sk->sk_rx_dst) { + struct dst_entry *dst = sk->sk_rx_dst; + if (unlikely(dst->obsolete)) { + if (dst->ops->check(dst, 0) == NULL) { + dst_release(dst); + sk->sk_rx_dst = NULL; + } + } + } + if (unlikely(sk->sk_rx_dst == NULL)) + sk->sk_rx_dst = dst_clone(skb_dst(skb)); + /* * Header prediction. * The code loosely follows the one in the famous @@ -5729,8 +5741,10 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) tcp_set_state(sk, TCP_ESTABLISHED); - if (skb != NULL) + if (skb != NULL) { + sk->sk_rx_dst = dst_clone(skb_dst(skb)); security_inet_conn_established(sk, skb); + } /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fda2ca17135e..13857df1dae1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1671,6 +1671,52 @@ csum_err: } EXPORT_SYMBOL(tcp_v4_do_rcv); +int tcp_v4_early_demux(struct sk_buff *skb) +{ + struct net *net = dev_net(skb->dev); + const struct iphdr *iph; + const struct tcphdr *th; + struct sock *sk; + int err; + + err = -ENOENT; + if (skb->pkt_type != PACKET_HOST) + goto out_err; + + if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr))) + goto out_err; + + iph = ip_hdr(skb); + th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb)); + + if (th->doff < sizeof(struct tcphdr) / 4) + goto out_err; + + if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4)) + goto out_err; + + sk = __inet_lookup_established(net, &tcp_hashinfo, + iph->saddr, th->source, + iph->daddr, th->dest, + skb->dev->ifindex); + if (sk) { + skb->sk = sk; + skb->destructor = sock_edemux; + if (sk->sk_state != TCP_TIME_WAIT) { + struct dst_entry *dst = sk->sk_rx_dst; + if (dst) + dst = dst_check(dst, 0); + if (dst) { + skb_dst_set_noref(skb, dst); + err = 0; + } + } + } + +out_err: + return err; +} + /* * From tcp_input.c */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index cb015317c9f7..72b7c63b1a39 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -445,6 +445,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct tcp_sock *oldtp = tcp_sk(sk); struct tcp_cookie_values *oldcvp = oldtp->cookie_values; + newsk->sk_rx_dst = dst_clone(skb_dst(skb)); + /* TCP Cookie Transactions require space for the cookie pair, * as it differs for each connection. There is no need to * copy any s_data_payload stored at the original socket. -- cgit v1.2.3 From 7b46866dd0a6fe38ecee523eb27eda9c8f484dc5 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 26 Jun 2012 23:36:11 +0000 Subject: sock_diag: Do not use RTA_PUT() macros Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/core/sock_diag.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 0d934ce1075f..ff2967acbfae 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include @@ -35,9 +34,7 @@ EXPORT_SYMBOL_GPL(sock_diag_save_cookie); int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype) { - __u32 *mem; - - mem = RTA_DATA(__RTA_PUT(skb, attrtype, SK_MEMINFO_VARS * sizeof(__u32))); + u32 mem[SK_MEMINFO_VARS]; mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; @@ -48,10 +45,7 @@ int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype) mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; - return 0; - -rtattr_failure: - return -EMSGSIZE; + return nla_put(skb, attrtype, sizeof(mem), &mem); } EXPORT_SYMBOL_GPL(sock_diag_put_meminfo); @@ -121,7 +115,7 @@ static inline void sock_diag_unlock_handler(const struct sock_diag_handler *h) static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { int err; - struct sock_diag_req *req = NLMSG_DATA(nlh); + struct sock_diag_req *req = nlmsg_data(nlh); const struct sock_diag_handler *hndl; if (nlmsg_len(nlh) < sizeof(*req)) -- cgit v1.2.3 From 4c3af034fafeb7269176bf1310c9bcff0b9fd9bb Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 26 Jun 2012 23:36:16 +0000 Subject: netlink: Get rid of obsolete rtnetlink macros Removes all RTA_GET*() and RTA_PUT*() variations, as well as the the unused rtattr_strcmp(). Get rid of rtm_get_table() by moving it to its only user decnet. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 129 ---------------------------------------------- net/core/rtnetlink.c | 13 ----- net/decnet/dn_fib.c | 8 +++ 3 files changed, 8 insertions(+), 142 deletions(-) (limited to 'net/core') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 2c1de8982c85..ea60b0854109 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -612,12 +612,6 @@ struct tcamsg { #include #include -static __inline__ int rtattr_strcmp(const struct rtattr *rta, const char *str) -{ - int len = strlen(str) + 1; - return len > rta->rta_len || memcmp(RTA_DATA(rta), str, len); -} - extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid); extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, @@ -628,122 +622,6 @@ extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, u32 ts, u32 tsage, long expires, u32 error); -extern void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data); - -#define RTA_PUT(skb, attrtype, attrlen, data) \ -({ if (unlikely(skb_tailroom(skb) < (int)RTA_SPACE(attrlen))) \ - goto rtattr_failure; \ - __rta_fill(skb, attrtype, attrlen, data); }) - -#define RTA_APPEND(skb, attrlen, data) \ -({ if (unlikely(skb_tailroom(skb) < (int)(attrlen))) \ - goto rtattr_failure; \ - memcpy(skb_put(skb, attrlen), data, attrlen); }) - -#define RTA_PUT_NOHDR(skb, attrlen, data) \ -({ RTA_APPEND(skb, RTA_ALIGN(attrlen), data); \ - memset(skb_tail_pointer(skb) - (RTA_ALIGN(attrlen) - attrlen), 0, \ - RTA_ALIGN(attrlen) - attrlen); }) - -#define RTA_PUT_U8(skb, attrtype, value) \ -({ u8 _tmp = (value); \ - RTA_PUT(skb, attrtype, sizeof(u8), &_tmp); }) - -#define RTA_PUT_U16(skb, attrtype, value) \ -({ u16 _tmp = (value); \ - RTA_PUT(skb, attrtype, sizeof(u16), &_tmp); }) - -#define RTA_PUT_U32(skb, attrtype, value) \ -({ u32 _tmp = (value); \ - RTA_PUT(skb, attrtype, sizeof(u32), &_tmp); }) - -#define RTA_PUT_U64(skb, attrtype, value) \ -({ u64 _tmp = (value); \ - RTA_PUT(skb, attrtype, sizeof(u64), &_tmp); }) - -#define RTA_PUT_SECS(skb, attrtype, value) \ - RTA_PUT_U64(skb, attrtype, (value) / HZ) - -#define RTA_PUT_MSECS(skb, attrtype, value) \ - RTA_PUT_U64(skb, attrtype, jiffies_to_msecs(value)) - -#define RTA_PUT_STRING(skb, attrtype, value) \ - RTA_PUT(skb, attrtype, strlen(value) + 1, value) - -#define RTA_PUT_FLAG(skb, attrtype) \ - RTA_PUT(skb, attrtype, 0, NULL); - -#define RTA_NEST(skb, type) \ -({ struct rtattr *__start = (struct rtattr *)skb_tail_pointer(skb); \ - RTA_PUT(skb, type, 0, NULL); \ - __start; }) - -#define RTA_NEST_END(skb, start) \ -({ (start)->rta_len = skb_tail_pointer(skb) - (unsigned char *)(start); \ - (skb)->len; }) - -#define RTA_NEST_COMPAT(skb, type, attrlen, data) \ -({ struct rtattr *__start = (struct rtattr *)skb_tail_pointer(skb); \ - RTA_PUT(skb, type, attrlen, data); \ - RTA_NEST(skb, type); \ - __start; }) - -#define RTA_NEST_COMPAT_END(skb, start) \ -({ struct rtattr *__nest = (void *)(start) + NLMSG_ALIGN((start)->rta_len); \ - (start)->rta_len = skb_tail_pointer(skb) - (unsigned char *)(start); \ - RTA_NEST_END(skb, __nest); \ - (skb)->len; }) - -#define RTA_NEST_CANCEL(skb, start) \ -({ if (start) \ - skb_trim(skb, (unsigned char *) (start) - (skb)->data); \ - -1; }) - -#define RTA_GET_U8(rta) \ -({ if (!rta || RTA_PAYLOAD(rta) < sizeof(u8)) \ - goto rtattr_failure; \ - *(u8 *) RTA_DATA(rta); }) - -#define RTA_GET_U16(rta) \ -({ if (!rta || RTA_PAYLOAD(rta) < sizeof(u16)) \ - goto rtattr_failure; \ - *(u16 *) RTA_DATA(rta); }) - -#define RTA_GET_U32(rta) \ -({ if (!rta || RTA_PAYLOAD(rta) < sizeof(u32)) \ - goto rtattr_failure; \ - *(u32 *) RTA_DATA(rta); }) - -#define RTA_GET_U64(rta) \ -({ u64 _tmp; \ - if (!rta || RTA_PAYLOAD(rta) < sizeof(u64)) \ - goto rtattr_failure; \ - memcpy(&_tmp, RTA_DATA(rta), sizeof(_tmp)); \ - _tmp; }) - -#define RTA_GET_FLAG(rta) (!!(rta)) - -#define RTA_GET_SECS(rta) ((unsigned long) RTA_GET_U64(rta) * HZ) -#define RTA_GET_MSECS(rta) (msecs_to_jiffies((unsigned long) RTA_GET_U64(rta))) - -static inline struct rtattr * -__rta_reserve(struct sk_buff *skb, int attrtype, int attrlen) -{ - struct rtattr *rta; - int size = RTA_LENGTH(attrlen); - - rta = (struct rtattr*)skb_put(skb, RTA_ALIGN(size)); - rta->rta_type = attrtype; - rta->rta_len = size; - memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size); - return rta; -} - -#define __RTA_PUT(skb, attrtype, attrlen) \ -({ if (unlikely(skb_tailroom(skb) < (int)RTA_SPACE(attrlen))) \ - goto rtattr_failure; \ - __rta_reserve(skb, attrtype, attrlen); }) - extern void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change); /* RTNL is used as a global lock for all changes to network configuration */ @@ -794,13 +672,6 @@ extern void __rtnl_unlock(void); } \ } while(0) -static inline u32 rtm_get_table(struct rtattr **rta, u8 table) -{ - return RTA_GET_U32(rta[RTA_TABLE-1]); -rtattr_failure: - return table; -} - extern int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 21318d15bbc3..bc8a1cdaac98 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -541,19 +541,6 @@ static const int rta_max[RTM_NR_FAMILIES] = [RTM_FAM(RTM_NEWACTION)] = TCAA_MAX, }; -void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data) -{ - struct rtattr *rta; - int size = RTA_LENGTH(attrlen); - - rta = (struct rtattr *)skb_put(skb, RTA_ALIGN(size)); - rta->rta_type = attrtype; - rta->rta_len = size; - memcpy(RTA_DATA(rta), data, attrlen); - memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size); -} -EXPORT_SYMBOL(__rta_fill); - int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo) { struct sock *rtnl = net->rtnl; diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c index 7eaf98799729..102d6106a942 100644 --- a/net/decnet/dn_fib.c +++ b/net/decnet/dn_fib.c @@ -505,6 +505,14 @@ static int dn_fib_check_attr(struct rtmsg *r, struct rtattr **rta) return 0; } +static inline u32 rtm_get_table(struct rtattr **rta, u8 table) +{ + if (rta[RTA_TABLE - 1]) + table = nla_get_u32((struct nlattr *) rta[RTA_TABLE - 1]); + + return table; +} + static int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { struct net *net = sock_net(skb->sk); -- cgit v1.2.3 From 22911fc581f6a241e2897a7a8603e97344a6ec82 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 Jun 2012 00:23:44 +0000 Subject: net: skb_free_datagram_locked() doesnt drop all packets dropwatch wrongly diagnose all received UDP packets as drops. This patch removes trace_kfree_skb() done in skb_free_datagram_locked(). Locations calling skb_free_datagram_locked() should do it on their own. As a result, drops are accounted on the right function. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/datagram.c | 1 - net/ipv4/udp.c | 5 ++++- net/ipv6/udp.c | 8 +++++--- net/sunrpc/svcsock.c | 12 ++++++------ 4 files changed, 15 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index ae6acf6a3dea..0337e2b76862 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -248,7 +248,6 @@ void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb) unlock_sock_fast(sk, slow); /* skb is now orphaned, can be freed outside of locked section */ - trace_kfree_skb(skb, skb_free_datagram_locked); __kfree_skb(skb); } EXPORT_SYMBOL(skb_free_datagram_locked); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index db017efb76ea..ee37d47d472e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -108,6 +108,7 @@ #include #include #include +#include #include "udp_impl.h" struct udp_table udp_table __read_mostly; @@ -1220,8 +1221,10 @@ try_again: goto csum_copy_err; } - if (err) + if (unlikely(err)) { + trace_kfree_skb(skb, udp_recvmsg); goto out_free; + } if (!peeked) UDP_INC_STATS_USER(sock_net(sk), diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 051ad481973f..1ecd10249488 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -48,6 +48,7 @@ #include #include +#include #include "udp_impl.h" int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) @@ -385,15 +386,16 @@ try_again: if (skb_csum_unnecessary(skb)) err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), - msg->msg_iov, copied ); + msg->msg_iov, copied); else { err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov); if (err == -EINVAL) goto csum_copy_err; } - if (err) + if (unlikely(err)) { + trace_kfree_skb(skb, udpv6_recvmsg); goto out_free; - + } if (!peeked) { if (is_udp4) UDP_INC_STATS_USER(sock_net(sk), diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index a6de09de5d21..18bc130255a7 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -619,6 +620,8 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) if (!svc_udp_get_dest_address(rqstp, cmh)) { net_warn_ratelimited("svc: received unknown control message %d/%d; dropping RPC reply datagram\n", cmh->cmsg_level, cmh->cmsg_type); +out_free: + trace_kfree_skb(skb, svc_udp_recvfrom); skb_free_datagram_locked(svsk->sk_sk, skb); return 0; } @@ -630,8 +633,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) { local_bh_enable(); /* checksum error */ - skb_free_datagram_locked(svsk->sk_sk, skb); - return 0; + goto out_free; } local_bh_enable(); skb_free_datagram_locked(svsk->sk_sk, skb); @@ -640,10 +642,8 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr); rqstp->rq_arg.head[0].iov_len = len; - if (skb_checksum_complete(skb)) { - skb_free_datagram_locked(svsk->sk_sk, skb); - return 0; - } + if (skb_checksum_complete(skb)) + goto out_free; rqstp->rq_xprt_ctxt = skb; } -- cgit v1.2.3 From 7a9bc9b81a5bc6e44ebc80ef781332e4385083f2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 29 Jun 2012 01:32:45 -0700 Subject: ipv4: Elide fib_validate_source() completely when possible. If rpfilter is off (or the SKB has an IPSEC path) and there are not tclassid users, we don't have to do anything at all when fib_validate_source() is invoked besides setting the itag to zero. We monitor tclassid uses with a counter (modified only under RTNL and marked __read_mostly) and we protect the fib_validate_source() real work with a test against this counter and whether rpfilter is to be done. Having a way to know whether we need no tclassid processing or not also opens the door for future optimized rpfilter algorithms that do not perform full FIB lookups. Signed-off-by: David S. Miller --- include/net/fib_rules.h | 1 + include/net/ip_fib.h | 5 +++++ net/core/fib_rules.c | 4 ++++ net/ipv4/fib_frontend.c | 32 ++++++++++++++++++++++++-------- net/ipv4/fib_rules.c | 16 +++++++++++++++- net/ipv4/fib_semantics.c | 10 ++++++++++ 6 files changed, 59 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 075f1e3a0fed..e361f4882426 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -52,6 +52,7 @@ struct fib_rules_ops { struct sk_buff *, struct fib_rule_hdr *, struct nlattr **); + void (*delete)(struct fib_rule *); int (*compare)(struct fib_rule *, struct fib_rule_hdr *, struct nlattr **); diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 619f68a7185c..3dc7c96bbeab 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -235,6 +235,11 @@ extern int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, int oif, struct net_device *dev, struct in_device *idev, u32 *itag); extern void fib_select_default(struct fib_result *res); +#ifdef CONFIG_IP_ROUTE_CLASSID +extern int fib_num_tclassid_users; +#else +#define fib_num_tclassid_users 0 +#endif /* Exported by fib_semantics.c */ extern int ip_fib_check_default(__be32 gw, struct net_device *dev); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 72cceb79d0d4..ab7db83236c9 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -151,6 +151,8 @@ static void fib_rules_cleanup_ops(struct fib_rules_ops *ops) list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) { list_del_rcu(&rule->list); + if (ops->delete) + ops->delete(rule); fib_rule_put(rule); } } @@ -499,6 +501,8 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).pid); + if (ops->delete) + ops->delete(rule); fib_rule_put(rule); flush_route_cache(ops); rules_ops_put(ops); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index c84cff52021e..ae528d1b293a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -217,6 +218,10 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) return inet_select_addr(dev, ip_hdr(skb)->saddr, scope); } +#ifdef CONFIG_IP_ROUTE_CLASSID +int fib_num_tclassid_users __read_mostly; +#endif + /* Given (packet source, input interface) and optional (dst, oif, tos): * - (main) check, that source is valid i.e. not broadcast or our local * address. @@ -225,11 +230,11 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) * - check, that packet arrived from expected physical interface. * called with rcu_read_lock() */ -int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, - int oif, struct net_device *dev, struct in_device *idev, - u32 *itag) +static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, + u8 tos, int oif, struct net_device *dev, + int rpf, struct in_device *idev, u32 *itag) { - int ret, no_addr, rpf, accept_local; + int ret, no_addr, accept_local; struct fib_result res; struct flowi4 fl4; struct net *net; @@ -242,12 +247,9 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; - no_addr = rpf = accept_local = 0; + no_addr = accept_local = 0; no_addr = idev->ifa_list == NULL; - /* Ignore rp_filter for packets protected by IPsec. */ - rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); - accept_local = IN_DEV_ACCEPT_LOCAL(idev); fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; @@ -303,6 +305,20 @@ e_rpf: return -EXDEV; } +/* Ignore rp_filter for packets protected by IPsec. */ +int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, + u8 tos, int oif, struct net_device *dev, + struct in_device *idev, u32 *itag) +{ + int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); + + if (!r && !fib_num_tclassid_users) { + *itag = 0; + return 0; + } + return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag); +} + static inline __be32 sk_extract_addr(struct sockaddr *addr) { return ((struct sockaddr_in *) addr)->sin_addr.s_addr; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 2d043f71ef70..b23fd952c84f 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -169,8 +169,11 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, rule4->dst = nla_get_be32(tb[FRA_DST]); #ifdef CONFIG_IP_ROUTE_CLASSID - if (tb[FRA_FLOW]) + if (tb[FRA_FLOW]) { rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); + if (rule4->tclassid) + fib_num_tclassid_users++; + } #endif rule4->src_len = frh->src_len; @@ -184,6 +187,16 @@ errout: return err; } +static void fib4_rule_delete(struct fib_rule *rule) +{ +#ifdef CONFIG_IP_ROUTE_CLASSID + struct fib4_rule *rule4 = (struct fib4_rule *) rule; + + if (rule4->tclassid) + fib_num_tclassid_users--; +#endif +} + static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, struct nlattr **tb) { @@ -256,6 +269,7 @@ static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = { .action = fib4_rule_action, .match = fib4_rule_match, .configure = fib4_rule_configure, + .delete = fib4_rule_delete, .compare = fib4_rule_compare, .fill = fib4_rule_fill, .default_pref = fib_default_rule_pref, diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 415f8230fc88..c46c20b6b0b6 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -163,6 +163,12 @@ void free_fib_info(struct fib_info *fi) return; } fib_info_cnt--; +#ifdef CONFIG_IP_ROUTE_CLASSID + change_nexthops(fi) { + if (nexthop_nh->nh_tclassid) + fib_num_tclassid_users--; + } endfor_nexthops(fi); +#endif call_rcu(&fi->rcu, free_fib_info_rcu); } @@ -421,6 +427,8 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, #ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; + if (nexthop_nh->nh_tclassid) + fib_num_tclassid_users++; #endif } @@ -815,6 +823,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) nh->nh_flags = cfg->fc_flags; #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid = cfg->fc_flow; + if (nh->nh_tclassid) + fib_num_tclassid_users++; #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->nh_weight = 1; -- cgit v1.2.3 From a31f2d17b331db970259e875b7223d3aba7e3821 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 Jun 2012 06:15:21 +0000 Subject: netlink: add netlink_kernel_cfg parameter to netlink_kernel_create This patch adds the following structure: struct netlink_kernel_cfg { unsigned int groups; void (*input)(struct sk_buff *skb); struct mutex *cb_mutex; }; That can be passed to netlink_kernel_create to set optional configurations for netlink kernel sockets. I've populated this structure by looking for NULL and zero parameters at the existing code. The remaining parameters that always need to be set are still left in the original interface. That includes optional parameters for the netlink socket creation. This allows easy extensibility of this interface in the future. This patch also adapts all callers to use this new interface. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- crypto/crypto_user.c | 7 +++++-- drivers/connector/connector.c | 13 +++++++++---- drivers/infiniband/core/netlink.c | 7 +++++-- drivers/scsi/scsi_netlink.c | 7 +++++-- drivers/scsi/scsi_transport_iscsi.c | 9 ++++++--- drivers/staging/gdm72xx/netlink_k.c | 6 ++++-- include/linux/netlink.h | 15 ++++++++++----- kernel/audit.c | 7 +++++-- lib/kobject_uevent.c | 5 ++++- net/bridge/netfilter/ebt_ulog.c | 6 ++++-- net/core/rtnetlink.c | 9 +++++++-- net/core/sock_diag.c | 8 ++++++-- net/decnet/netfilter/dn_rtmsg.c | 8 +++++--- net/ipv4/fib_frontend.c | 7 +++++-- net/ipv4/netfilter/ipt_ULOG.c | 8 +++++--- net/netfilter/nfnetlink.c | 7 +++++-- net/netlink/af_netlink.c | 16 ++++++++++------ net/netlink/genetlink.c | 10 +++++++--- net/xfrm/xfrm_user.c | 7 +++++-- security/selinux/netlink.c | 6 +++++- 20 files changed, 117 insertions(+), 51 deletions(-) (limited to 'net/core') diff --git a/crypto/crypto_user.c b/crypto/crypto_user.c index 5a37eadb4e56..ba2c611154af 100644 --- a/crypto/crypto_user.c +++ b/crypto/crypto_user.c @@ -496,9 +496,12 @@ static void crypto_netlink_rcv(struct sk_buff *skb) static int __init crypto_user_init(void) { + struct netlink_kernel_cfg cfg = { + .input = crypto_netlink_rcv, + }; + crypto_nlsk = netlink_kernel_create(&init_net, NETLINK_CRYPTO, - 0, crypto_netlink_rcv, - NULL, THIS_MODULE); + THIS_MODULE, &cfg); if (!crypto_nlsk) return -ENOMEM; diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c index 34e0e9e4d913..116cf8d02834 100644 --- a/drivers/connector/connector.c +++ b/drivers/connector/connector.c @@ -251,15 +251,20 @@ static const struct file_operations cn_file_ops = { .release = single_release }; +static struct cn_dev cdev = { + .input = cn_rx_skb, +}; + static int __devinit cn_init(void) { struct cn_dev *dev = &cdev; - - dev->input = cn_rx_skb; + struct netlink_kernel_cfg cfg = { + .groups = CN_NETLINK_USERS + 0xf, + .input = dev->input, + }; dev->nls = netlink_kernel_create(&init_net, NETLINK_CONNECTOR, - CN_NETLINK_USERS + 0xf, - dev->input, NULL, THIS_MODULE); + THIS_MODULE, &cfg); if (!dev->nls) return -EIO; diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 1e691dca1820..3ae2bfd31015 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -173,8 +173,11 @@ static void ibnl_rcv(struct sk_buff *skb) int __init ibnl_init(void) { - nls = netlink_kernel_create(&init_net, NETLINK_RDMA, 0, ibnl_rcv, - NULL, THIS_MODULE); + struct netlink_kernel_cfg cfg = { + .input = ibnl_rcv, + }; + + nls = netlink_kernel_create(&init_net, NETLINK_RDMA, THIS_MODULE, &cfg); if (!nls) { pr_warn("Failed to create netlink socket\n"); return -ENOMEM; diff --git a/drivers/scsi/scsi_netlink.c b/drivers/scsi/scsi_netlink.c index c77628afbf9f..8818dd681c19 100644 --- a/drivers/scsi/scsi_netlink.c +++ b/drivers/scsi/scsi_netlink.c @@ -486,6 +486,10 @@ void scsi_netlink_init(void) { int error; + struct netlink_kernel_cfg cfg = { + .input = scsi_nl_rcv_msg, + .groups = SCSI_NL_GRP_CNT, + }; INIT_LIST_HEAD(&scsi_nl_drivers); @@ -497,8 +501,7 @@ scsi_netlink_init(void) } scsi_nl_sock = netlink_kernel_create(&init_net, NETLINK_SCSITRANSPORT, - SCSI_NL_GRP_CNT, scsi_nl_rcv_msg, NULL, - THIS_MODULE); + THIS_MODULE, &cfg); if (!scsi_nl_sock) { printk(KERN_ERR "%s: register of receive handler failed\n", __func__); diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 1cf640e575da..6042954d8f3b 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2936,7 +2936,10 @@ EXPORT_SYMBOL_GPL(iscsi_unregister_transport); static __init int iscsi_transport_init(void) { int err; - + struct netlink_kernel_cfg cfg = { + .groups = 1, + .input = iscsi_if_rx, + }; printk(KERN_INFO "Loading iSCSI transport class v%s.\n", ISCSI_TRANSPORT_VERSION); @@ -2966,8 +2969,8 @@ static __init int iscsi_transport_init(void) if (err) goto unregister_conn_class; - nls = netlink_kernel_create(&init_net, NETLINK_ISCSI, 1, iscsi_if_rx, - NULL, THIS_MODULE); + nls = netlink_kernel_create(&init_net, NETLINK_ISCSI, + THIS_MODULE, &cfg); if (!nls) { err = -ENOBUFS; goto unregister_session_class; diff --git a/drivers/staging/gdm72xx/netlink_k.c b/drivers/staging/gdm72xx/netlink_k.c index 2489bb5597ca..87c3a07ed80e 100644 --- a/drivers/staging/gdm72xx/netlink_k.c +++ b/drivers/staging/gdm72xx/netlink_k.c @@ -88,13 +88,15 @@ struct sock *netlink_init(int unit, void (*cb)(struct net_device *dev, u16 type, void *msg, int len)) { struct sock *sock; + struct netlink_kernel_cfg cfg = { + .input = netlink_rcv, + }; #if !defined(DEFINE_MUTEX) init_MUTEX(&netlink_mutex); #endif - sock = netlink_kernel_create(&init_net, unit, 0, netlink_rcv, NULL, - THIS_MODULE); + sock = netlink_kernel_create(&init_net, unit, THIS_MODULE, &cfg); if (sock) rcv_cb = cb; diff --git a/include/linux/netlink.h b/include/linux/netlink.h index ed33f0901bc2..6085e4919cb3 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -174,11 +174,16 @@ struct netlink_skb_parms { extern void netlink_table_grab(void); extern void netlink_table_ungrab(void); -extern struct sock *netlink_kernel_create(struct net *net, - int unit,unsigned int groups, - void (*input)(struct sk_buff *skb), - struct mutex *cb_mutex, - struct module *module); +/* optional Netlink kernel configuration parameters */ +struct netlink_kernel_cfg { + unsigned int groups; + void (*input)(struct sk_buff *skb); + struct mutex *cb_mutex; +}; + +extern struct sock *netlink_kernel_create(struct net *net, int unit, + struct module *module, + struct netlink_kernel_cfg *cfg); extern void netlink_kernel_release(struct sock *sk); extern int __netlink_change_ngroups(struct sock *sk, unsigned int groups); extern int netlink_change_ngroups(struct sock *sk, unsigned int groups); diff --git a/kernel/audit.c b/kernel/audit.c index 30b252a1fb61..4a3f28d2ca65 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -962,14 +962,17 @@ static void audit_receive(struct sk_buff *skb) static int __init audit_init(void) { int i; + struct netlink_kernel_cfg cfg = { + .input = audit_receive, + }; if (audit_initialized == AUDIT_DISABLED) return 0; printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); - audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0, - audit_receive, NULL, THIS_MODULE); + audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, + THIS_MODULE, &cfg); if (!audit_sock) audit_panic("cannot initialize netlink socket"); else diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 1a91efa6d121..0401d2916d9f 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -373,13 +373,16 @@ EXPORT_SYMBOL_GPL(add_uevent_var); static int uevent_net_init(struct net *net) { struct uevent_sock *ue_sk; + struct netlink_kernel_cfg cfg = { + .groups = 1, + }; ue_sk = kzalloc(sizeof(*ue_sk), GFP_KERNEL); if (!ue_sk) return -ENOMEM; ue_sk->sk = netlink_kernel_create(net, NETLINK_KOBJECT_UEVENT, - 1, NULL, NULL, THIS_MODULE); + THIS_MODULE, &cfg); if (!ue_sk->sk) { printk(KERN_ERR "kobject_uevent: unable to create netlink socket!\n"); diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index 1bd173218f7b..374bdcd77039 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -282,6 +282,9 @@ static int __init ebt_ulog_init(void) { int ret; int i; + struct netlink_kernel_cfg cfg = { + .groups = EBT_ULOG_MAXNLGROUPS, + }; if (nlbufsiz >= 128*1024) { pr_warning("Netlink buffer has to be <= 128kB," @@ -296,8 +299,7 @@ static int __init ebt_ulog_init(void) } ebtulognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, - EBT_ULOG_MAXNLGROUPS, NULL, NULL, - THIS_MODULE); + THIS_MODULE, &cfg); if (!ebtulognl) ret = -ENOMEM; else if ((ret = xt_register_target(&ebt_ulog_tg_reg)) != 0) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index bc8a1cdaac98..2b325c340b44 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2353,8 +2353,13 @@ static struct notifier_block rtnetlink_dev_notifier = { static int __net_init rtnetlink_net_init(struct net *net) { struct sock *sk; - sk = netlink_kernel_create(net, NETLINK_ROUTE, RTNLGRP_MAX, - rtnetlink_rcv, &rtnl_mutex, THIS_MODULE); + struct netlink_kernel_cfg cfg = { + .groups = RTNLGRP_MAX, + .input = rtnetlink_rcv, + .cb_mutex = &rtnl_mutex, + }; + + sk = netlink_kernel_create(net, NETLINK_ROUTE, THIS_MODULE, &cfg); if (!sk) return -ENOMEM; net->rtnl = sk; diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index ff2967acbfae..07a29eb34a41 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -171,8 +171,12 @@ EXPORT_SYMBOL_GPL(sock_diag_nlsk); static int __init sock_diag_init(void) { - sock_diag_nlsk = netlink_kernel_create(&init_net, NETLINK_SOCK_DIAG, 0, - sock_diag_rcv, NULL, THIS_MODULE); + struct netlink_kernel_cfg cfg = { + .input = sock_diag_rcv, + }; + + sock_diag_nlsk = netlink_kernel_create(&init_net, NETLINK_SOCK_DIAG, + THIS_MODULE, &cfg); return sock_diag_nlsk == NULL ? -ENOMEM : 0; } diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index b8f7f5b8c350..11db0ecf342f 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -125,11 +125,13 @@ static struct nf_hook_ops dnrmg_ops __read_mostly = { static int __init dn_rtmsg_init(void) { int rv = 0; + struct netlink_kernel_cfg cfg = { + .groups = DNRNG_NLGRP_MAX, + .input = dnrmg_receive_user_skb, + }; dnrmg = netlink_kernel_create(&init_net, - NETLINK_DNRTMSG, DNRNG_NLGRP_MAX, - dnrmg_receive_user_skb, - NULL, THIS_MODULE); + NETLINK_DNRTMSG, THIS_MODULE, &cfg); if (dnrmg == NULL) { printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket"); return -ENOMEM; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index ae528d1b293a..3e11ea225dad 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -976,8 +976,11 @@ static void nl_fib_input(struct sk_buff *skb) static int __net_init nl_fib_lookup_init(struct net *net) { struct sock *sk; - sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0, - nl_fib_input, NULL, THIS_MODULE); + struct netlink_kernel_cfg cfg = { + .input = nl_fib_input, + }; + + sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, THIS_MODULE, &cfg); if (sk == NULL) return -EAFNOSUPPORT; net->ipv4.fibnl = sk; diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 99b3f53f16a7..1109f7f6c254 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -381,6 +381,9 @@ static struct nf_logger ipt_ulog_logger __read_mostly = { static int __init ulog_tg_init(void) { int ret, i; + struct netlink_kernel_cfg cfg = { + .groups = ULOG_MAXNLGROUPS, + }; pr_debug("init module\n"); @@ -393,9 +396,8 @@ static int __init ulog_tg_init(void) for (i = 0; i < ULOG_MAXNLGROUPS; i++) setup_timer(&ulog_buffers[i].timer, ulog_timer, i); - nflognl = netlink_kernel_create(&init_net, - NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, - NULL, THIS_MODULE); + nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, + THIS_MODULE, &cfg); if (!nflognl) return -ENOMEM; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 3e797d1fcb94..700e4616a098 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -203,9 +203,12 @@ static void nfnetlink_rcv(struct sk_buff *skb) static int __net_init nfnetlink_net_init(struct net *net) { struct sock *nfnl; + struct netlink_kernel_cfg cfg = { + .groups = NFNLGRP_MAX, + .input = nfnetlink_rcv, + }; - nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, NFNLGRP_MAX, - nfnetlink_rcv, NULL, THIS_MODULE); + nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, THIS_MODULE, &cfg); if (!nfnl) return -ENOMEM; net->nfnl_stash = nfnl; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index b3025a603d56..43a124feaad8 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1503,14 +1503,16 @@ static void netlink_data_ready(struct sock *sk, int len) */ struct sock * -netlink_kernel_create(struct net *net, int unit, unsigned int groups, - void (*input)(struct sk_buff *skb), - struct mutex *cb_mutex, struct module *module) +netlink_kernel_create(struct net *net, int unit, + struct module *module, + struct netlink_kernel_cfg *cfg) { struct socket *sock; struct sock *sk; struct netlink_sock *nlk; struct listeners *listeners = NULL; + struct mutex *cb_mutex = cfg ? cfg->cb_mutex : NULL; + unsigned int groups; BUG_ON(!nl_table); @@ -1532,16 +1534,18 @@ netlink_kernel_create(struct net *net, int unit, unsigned int groups, sk = sock->sk; sk_change_net(sk, net); - if (groups < 32) + if (!cfg || cfg->groups < 32) groups = 32; + else + groups = cfg->groups; listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL); if (!listeners) goto out_sock_release; sk->sk_data_ready = netlink_data_ready; - if (input) - nlk_sk(sk)->netlink_rcv = input; + if (cfg && cfg->input) + nlk_sk(sk)->netlink_rcv = cfg->input; if (netlink_insert(sk, net, 0)) goto out_sock_release; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 2cc7c1ee7690..32761b53015e 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -915,10 +915,14 @@ static struct genl_multicast_group notify_grp = { static int __net_init genl_pernet_init(struct net *net) { + struct netlink_kernel_cfg cfg = { + .input = genl_rcv, + .cb_mutex = &genl_mutex, + }; + /* we'll bump the group number right afterwards */ - net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, 0, - genl_rcv, &genl_mutex, - THIS_MODULE); + net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, + THIS_MODULE, &cfg); if (!net->genl_sock && net_eq(net, &init_net)) panic("GENL: Cannot initialize generic netlink\n"); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 540762726aaf..e75d8e47f35c 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2959,9 +2959,12 @@ static struct xfrm_mgr netlink_mgr = { static int __net_init xfrm_user_net_init(struct net *net) { struct sock *nlsk; + struct netlink_kernel_cfg cfg = { + .groups = XFRMNLGRP_MAX, + .input = xfrm_netlink_rcv, + }; - nlsk = netlink_kernel_create(net, NETLINK_XFRM, XFRMNLGRP_MAX, - xfrm_netlink_rcv, NULL, THIS_MODULE); + nlsk = netlink_kernel_create(net, NETLINK_XFRM, THIS_MODULE, &cfg); if (nlsk == NULL) return -ENOMEM; net->xfrm.nlsk_stash = nlsk; /* Don't set to NULL */ diff --git a/security/selinux/netlink.c b/security/selinux/netlink.c index 8a23a35b9c5b..8a77725423e0 100644 --- a/security/selinux/netlink.c +++ b/security/selinux/netlink.c @@ -111,8 +111,12 @@ void selnl_notify_policyload(u32 seqno) static int __init selnl_init(void) { + struct netlink_kernel_cfg cfg = { + .groups = SELNLGRP_MAX, + }; + selnl = netlink_kernel_create(&init_net, NETLINK_SELINUX, - SELNLGRP_MAX, NULL, NULL, THIS_MODULE); + THIS_MODULE, &cfg); if (selnl == NULL) panic("SELinux: Cannot create netlink socket."); netlink_set_nonroot(NETLINK_SELINUX, NL_NONROOT_RECV); -- cgit v1.2.3 From a263b3093641fb1ec377582c90986a7fd0625184 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 2 Jul 2012 02:02:15 -0700 Subject: ipv4: Make neigh lookups directly in output packet path. Do not use the dst cached neigh, we'll be getting rid of that. Signed-off-by: David S. Miller --- include/net/arp.h | 28 +++++++++++++++++++--------- include/net/neighbour.h | 11 +++++++++-- net/core/neighbour.c | 12 +++++++----- net/ipv4/ip_output.c | 12 ++++++++---- net/ipv4/route.c | 6 +----- 5 files changed, 44 insertions(+), 25 deletions(-) (limited to 'net/core') diff --git a/include/net/arp.h b/include/net/arp.h index 4a1f3fb562eb..4617d9841132 100644 --- a/include/net/arp.h +++ b/include/net/arp.h @@ -15,24 +15,34 @@ static inline u32 arp_hashfn(u32 key, const struct net_device *dev, u32 hash_rnd return val * hash_rnd; } -static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32 key) +static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { - struct neigh_hash_table *nht; + struct neigh_hash_table *nht = rcu_dereference_bh(arp_tbl.nht); struct neighbour *n; u32 hash_val; - rcu_read_lock_bh(); - nht = rcu_dereference_bh(arp_tbl.nht); + if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) + key = 0; + hash_val = arp_hashfn(key, dev, nht->hash_rnd[0]) >> (32 - nht->hash_shift); for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); n != NULL; n = rcu_dereference_bh(n->next)) { - if (n->dev == dev && *(u32 *)n->primary_key == key) { - if (!atomic_inc_not_zero(&n->refcnt)) - n = NULL; - break; - } + if (n->dev == dev && *(u32 *)n->primary_key == key) + return n; } + + return NULL; +} + +static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32 key) +{ + struct neighbour *n; + + rcu_read_lock_bh(); + n = __ipv4_neigh_lookup_noref(dev, key); + if (n && !atomic_inc_not_zero(&n->refcnt)) + n = NULL; rcu_read_unlock_bh(); return n; diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 6cdfeedb650b..e1d18bdeebb8 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -202,9 +202,16 @@ extern struct neighbour * neigh_lookup(struct neigh_table *tbl, extern struct neighbour * neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, const void *pkey); -extern struct neighbour * neigh_create(struct neigh_table *tbl, +extern struct neighbour * __neigh_create(struct neigh_table *tbl, + const void *pkey, + struct net_device *dev, + bool want_ref); +static inline struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, - struct net_device *dev); + struct net_device *dev) +{ + return __neigh_create(tbl, pkey, dev, true); +} extern void neigh_destroy(struct neighbour *neigh); extern int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb); extern int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, diff --git a/net/core/neighbour.c b/net/core/neighbour.c index d81d026138f0..a793af9af150 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -474,8 +474,8 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, } EXPORT_SYMBOL(neigh_lookup_nodev); -struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, - struct net_device *dev) +struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, + struct net_device *dev, bool want_ref) { u32 hash_val; int key_len = tbl->key_len; @@ -535,14 +535,16 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, n1 = rcu_dereference_protected(n1->next, lockdep_is_held(&tbl->lock))) { if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { - neigh_hold(n1); + if (want_ref) + neigh_hold(n1); rc = n1; goto out_tbl_unlock; } } n->dead = 0; - neigh_hold(n); + if (want_ref) + neigh_hold(n); rcu_assign_pointer(n->next, rcu_dereference_protected(nht->hash_buckets[hash_val], lockdep_is_held(&tbl->lock))); @@ -558,7 +560,7 @@ out_neigh_release: neigh_release(n); goto out; } -EXPORT_SYMBOL(neigh_create); +EXPORT_SYMBOL(__neigh_create); static u32 pneigh_hash(const void *pkey, int key_len) { diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 2630900e480a..6e9a266a0535 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -170,6 +170,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; + u32 nexthop; if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); @@ -191,15 +192,18 @@ static inline int ip_finish_output2(struct sk_buff *skb) skb = skb2; } - rcu_read_lock(); - neigh = dst_get_neighbour_noref(dst); + rcu_read_lock_bh(); + nexthop = rt->rt_gateway ? rt->rt_gateway : ip_hdr(skb)->daddr; + neigh = __ipv4_neigh_lookup_noref(dev, nexthop); + if (unlikely(!neigh)) + neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); if (neigh) { int res = neigh_output(neigh, skb); - rcu_read_unlock(); + rcu_read_unlock_bh(); return res; } - rcu_read_unlock(); + rcu_read_unlock_bh(); net_dbg_ratelimited("%s: No header cache and no neighbour!\n", __func__); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6a5afc715558..2f40363e2851 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1098,17 +1098,13 @@ static int slow_chain_length(const struct rtable *head) static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr) { - static const __be32 inaddr_any = 0; struct net_device *dev = dst->dev; const __be32 *pkey = daddr; const struct rtable *rt; struct neighbour *n; rt = (const struct rtable *) dst; - - if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) - pkey = &inaddr_any; - else if (rt->rt_gateway) + if (rt->rt_gateway) pkey = (const __be32 *) &rt->rt_gateway; n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey); -- cgit v1.2.3 From 5110effee8fde2edfacac9cd12a9960ab2dc39ea Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 2 Jul 2012 02:21:03 -0700 Subject: net: Do delayed neigh confirmation. When a dst_confirm() happens, mark the confirmation as pending in the dst. Then on the next packet out, when we have the neigh in-hand, do the update. This removes the dependency in dst_confirm() of dst's having an attached neigh. While we're here, remove the explicit 'dst' NULL check, all except 2 or 3 call sites ensure it's not NULL. So just fix those cases up. Signed-off-by: David S. Miller --- include/net/dst.h | 29 +++++++++++++++++++++-------- include/net/neighbour.h | 15 --------------- net/core/dst.c | 3 ++- net/ipv4/ip_output.c | 2 +- net/ipv4/tcp_input.c | 19 +++++++++++++------ net/ipv6/ip6_output.c | 2 +- 6 files changed, 38 insertions(+), 32 deletions(-) (limited to 'net/core') diff --git a/include/net/dst.h b/include/net/dst.h index f0bf3b8d5911..84e7a3ff968d 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -51,7 +51,7 @@ struct dst_entry { int (*input)(struct sk_buff *); int (*output)(struct sk_buff *); - int flags; + unsigned short flags; #define DST_HOST 0x0001 #define DST_NOXFRM 0x0002 #define DST_NOPOLICY 0x0004 @@ -62,6 +62,8 @@ struct dst_entry { #define DST_FAKE_RTABLE 0x0080 #define DST_XFRM_TUNNEL 0x0100 + unsigned short pending_confirm; + short error; short obsolete; unsigned short header_len; /* more space at head required */ @@ -371,7 +373,8 @@ static inline struct dst_entry *skb_dst_pop(struct sk_buff *skb) extern int dst_discard(struct sk_buff *skb); extern void *dst_alloc(struct dst_ops *ops, struct net_device *dev, - int initial_ref, int initial_obsolete, int flags); + int initial_ref, int initial_obsolete, + unsigned short flags); extern void __dst_free(struct dst_entry *dst); extern struct dst_entry *dst_destroy(struct dst_entry *dst); @@ -395,14 +398,24 @@ static inline void dst_rcu_free(struct rcu_head *head) static inline void dst_confirm(struct dst_entry *dst) { - if (dst) { - struct neighbour *n; + dst->pending_confirm = 1; +} - rcu_read_lock(); - n = dst_get_neighbour_noref(dst); - neigh_confirm(n); - rcu_read_unlock(); +static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n, + struct sk_buff *skb) +{ + struct hh_cache *hh; + + if (unlikely(dst->pending_confirm)) { + n->confirmed = jiffies; + dst->pending_confirm = 0; } + + hh = &n->hh; + if ((n->nud_state & NUD_CONNECTED) && hh->hh_len) + return neigh_hh_output(hh, skb); + else + return n->output(n, skb); } static inline struct neighbour *dst_neigh_lookup(const struct dst_entry *dst, const void *daddr) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index e1d18bdeebb8..344d8988842a 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -309,12 +309,6 @@ static inline struct neighbour * neigh_clone(struct neighbour *neigh) #define neigh_hold(n) atomic_inc(&(n)->refcnt) -static inline void neigh_confirm(struct neighbour *neigh) -{ - if (neigh) - neigh->confirmed = jiffies; -} - static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) { unsigned long now = jiffies; @@ -358,15 +352,6 @@ static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb) return dev_queue_xmit(skb); } -static inline int neigh_output(struct neighbour *n, struct sk_buff *skb) -{ - struct hh_cache *hh = &n->hh; - if ((n->nud_state & NUD_CONNECTED) && hh->hh_len) - return neigh_hh_output(hh, skb); - else - return n->output(n, skb); -} - static inline struct neighbour * __neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev, int creat) { diff --git a/net/core/dst.c b/net/core/dst.c index 43d94cedbf7c..a6e19a23a745 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -152,7 +152,7 @@ EXPORT_SYMBOL(dst_discard); const u32 dst_default_metrics[RTAX_MAX]; void *dst_alloc(struct dst_ops *ops, struct net_device *dev, - int initial_ref, int initial_obsolete, int flags) + int initial_ref, int initial_obsolete, unsigned short flags) { struct dst_entry *dst; @@ -188,6 +188,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, dst->__use = 0; dst->lastuse = jiffies; dst->flags = flags; + dst->pending_confirm = 0; dst->next = NULL; if (!(flags & DST_NOCOUNT)) dst_entries_add(ops, 1); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6e9a266a0535..cc52679790b2 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -198,7 +198,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); if (neigh) { - int res = neigh_output(neigh, skb); + int res = dst_neigh_output(dst, neigh, skb); rcu_read_unlock_bh(); return res; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8416f8a68e65..ca0d0e7c9778 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -740,13 +740,13 @@ void tcp_update_metrics(struct sock *sk) if (sysctl_tcp_nometrics_save) return; - dst_confirm(dst); - if (dst && (dst->flags & DST_HOST)) { const struct inet_connection_sock *icsk = inet_csk(sk); int m; unsigned long rtt; + dst_confirm(dst); + if (icsk->icsk_backoff || !tp->srtt) { /* This session failed to estimate rtt. Why? * Probably, no packets returned in time. @@ -3869,9 +3869,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_cong_avoid(sk, ack, prior_in_flight); } - if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) - dst_confirm(__sk_dst_get(sk)); - + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { + struct dst_entry *dst = __sk_dst_get(sk); + if (dst) + dst_confirm(dst); + } return 1; no_queue: @@ -6140,9 +6142,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, case TCP_FIN_WAIT1: if (tp->snd_una == tp->write_seq) { + struct dst_entry *dst; + tcp_set_state(sk, TCP_FIN_WAIT2); sk->sk_shutdown |= SEND_SHUTDOWN; - dst_confirm(__sk_dst_get(sk)); + + dst = __sk_dst_get(sk); + if (dst) + dst_confirm(dst); if (!sock_flag(sk, SOCK_DEAD)) /* Wake up lingering close() */ diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index a233a7ccbc3a..c94e4aabe11b 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -125,7 +125,7 @@ static int ip6_finish_output2(struct sk_buff *skb) rcu_read_lock(); neigh = dst_get_neighbour_noref(dst); if (neigh) { - int res = neigh_output(neigh, skb); + int res = dst_neigh_output(dst, neigh, skb); rcu_read_unlock(); return res; -- cgit v1.2.3 From 13a43d94ab026c423dc8902170ef27c2bd36aa87 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 2 Jul 2012 22:15:37 -0700 Subject: neigh: Convert over to dst_neigh_lookup_skb(). Signed-off-by: David S. Miller --- net/core/neighbour.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index a793af9af150..117afaf51268 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1201,10 +1201,23 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, write_unlock_bh(&neigh->lock); rcu_read_lock(); - /* On shaper/eql skb->dst->neighbour != neigh :( */ - if (dst && (n2 = dst_get_neighbour_noref(dst)) != NULL) - n1 = n2; + + /* Why not just use 'neigh' as-is? The problem is that + * things such as shaper, eql, and sch_teql can end up + * using alternative, different, neigh objects to output + * the packet in the output path. So what we need to do + * here is re-lookup the top-level neigh in the path so + * we can reinject the packet there. + */ + n2 = NULL; + if (dst) { + n2 = dst_neigh_lookup_skb(dst, skb); + if (n2) + n1 = n2; + } n1->output(n1, skb); + if (n2) + neigh_release(n2); rcu_read_unlock(); write_lock_bh(&neigh->lock); -- cgit v1.2.3 From 36bdbcae2fa2a6dfa99344d4190fcea0aa7b7c25 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 2 Jul 2012 22:58:02 -0700 Subject: net: Kill dst->_neighbour, accessors, and final uses. No longer used. Signed-off-by: David S. Miller --- include/net/dst.h | 17 +---------------- net/core/dst.c | 18 ------------------ 2 files changed, 1 insertion(+), 34 deletions(-) (limited to 'net/core') diff --git a/include/net/dst.h b/include/net/dst.h index 295a70547e7d..b2634e446613 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -42,7 +42,7 @@ struct dst_entry { struct dst_entry *from; }; struct dst_entry *path; - struct neighbour __rcu *_neighbour; + void *__pad0; #ifdef CONFIG_XFRM struct xfrm_state *xfrm; #else @@ -96,21 +96,6 @@ struct dst_entry { }; }; -static inline struct neighbour *dst_get_neighbour_noref(struct dst_entry *dst) -{ - return rcu_dereference(dst->_neighbour); -} - -static inline struct neighbour *dst_get_neighbour_noref_raw(struct dst_entry *dst) -{ - return rcu_dereference_raw(dst->_neighbour); -} - -static inline void dst_set_neighbour(struct dst_entry *dst, struct neighbour *neigh) -{ - rcu_assign_pointer(dst->_neighbour, neigh); -} - extern u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old); extern const u32 dst_default_metrics[RTAX_MAX]; diff --git a/net/core/dst.c b/net/core/dst.c index a6e19a23a745..07bacff84aa4 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -171,7 +171,6 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, dst_init_metrics(dst, dst_default_metrics, true); dst->expires = 0UL; dst->path = dst; - RCU_INIT_POINTER(dst->_neighbour, NULL); #ifdef CONFIG_XFRM dst->xfrm = NULL; #endif @@ -225,19 +224,12 @@ EXPORT_SYMBOL(__dst_free); struct dst_entry *dst_destroy(struct dst_entry * dst) { struct dst_entry *child; - struct neighbour *neigh; smp_rmb(); again: - neigh = rcu_dereference_protected(dst->_neighbour, 1); child = dst->child; - if (neigh) { - RCU_INIT_POINTER(dst->_neighbour, NULL); - neigh_release(neigh); - } - if (!(dst->flags & DST_NOCOUNT)) dst_entries_add(dst->ops, -1); @@ -361,19 +353,9 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, if (!unregister) { dst->input = dst->output = dst_discard; } else { - struct neighbour *neigh; - dst->dev = dev_net(dst->dev)->loopback_dev; dev_hold(dst->dev); dev_put(dev); - rcu_read_lock(); - neigh = dst_get_neighbour_noref(dst); - if (neigh && neigh->dev == dev) { - neigh->dev = dst->dev; - dev_hold(dst->dev); - dev_put(dev); - } - rcu_read_unlock(); } } -- cgit v1.2.3 From 16917b87a23b429226527f393270047069d665e9 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Sun, 1 Jul 2012 03:18:50 +0000 Subject: net-next: Add netif_get_num_default_rss_queues Most multi-queue networking driver consider the number of online cpus when configuring RSS queues. This patch adds a wrapper to the number of cpus, setting an upper limit on the number of cpus a driver should consider (by default) when allocating resources for his queues. Signed-off-by: Yuval Mintz Signed-off-by: Eilon Greenstein Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ net/core/dev.c | 11 +++++++++++ 2 files changed, 14 insertions(+) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2c2ecea28a1b..ab0251d541ab 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2119,6 +2119,9 @@ static inline int netif_copy_real_num_queues(struct net_device *to_dev, #endif } +#define DEFAULT_MAX_NUM_RSS_QUEUES (8) +extern int netif_get_num_default_rss_queues(void); + /* Use this variant when it is known for sure that it * is executing from hardware interrupt context or with hardware interrupts * disabled. diff --git a/net/core/dev.c b/net/core/dev.c index ed674e212b7a..69f7a1a393d8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1793,6 +1793,17 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) EXPORT_SYMBOL(netif_set_real_num_rx_queues); #endif +/* netif_get_num_default_rss_queues - default number of RSS queues + * + * This routine should set an upper limit on the number of RSS queues + * used by default by multiqueue devices. + */ +int netif_get_num_default_rss_queues() +{ + return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus()); +} +EXPORT_SYMBOL(netif_get_num_default_rss_queues); + static inline void __netif_reschedule(struct Qdisc *q) { struct softnet_data *sd; -- cgit v1.2.3 From b761c9b1f4f69eb53fb6147547a1ab25237a93b3 Mon Sep 17 00:00:00 2001 From: Gao feng Date: Wed, 4 Jul 2012 23:28:40 +0000 Subject: cgroup: fix panic in netprio_cgroup we set max_prioidx to the first zero bit index of prioidx_map in function get_prioidx. So when we delete the low index netprio cgroup and adding a new netprio cgroup again,the max_prioidx will be set to the low index. when we set the high index cgroup's net_prio.ifpriomap,the function write_priomap will call update_netdev_tables to alloc memory which size is sizeof(struct netprio_map) + sizeof(u32) * (max_prioidx + 1), so the size of array that map->priomap point to is max_prioidx +1, which is low than what we actually need. fix this by adding check in get_prioidx,only set max_prioidx when max_prioidx low than the new prioidx. Signed-off-by: Gao feng Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/core/netprio_cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 5b8aa2fae48b..aa907ed466ea 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -49,8 +49,9 @@ static int get_prioidx(u32 *prio) return -ENOSPC; } set_bit(prioidx, prioidx_map); + if (atomic_read(&max_prioidx) < prioidx) + atomic_set(&max_prioidx, prioidx); spin_unlock_irqrestore(&prioidx_map_lock, flags); - atomic_set(&max_prioidx, prioidx); *prio = prioidx; return 0; } -- cgit v1.2.3 From 91c68ce2b26319248a32d7baa1226f819d283758 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 8 Jul 2012 21:45:10 +0000 Subject: net: cgroup: fix out of bounds accesses dev->priomap is allocated by extend_netdev_table() called from update_netdev_tables(). And this is only called if write_priomap() is called. But if write_priomap() is not called, it seems we can have out of bounds accesses in cgrp_destroy(), read_priomap() & skb_update_prio() With help from Gao Feng Signed-off-by: Eric Dumazet Cc: Neil Horman Cc: Gao feng Acked-by: Gao feng Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++++-- net/core/netprio_cgroup.c | 4 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 84f01ba81a34..0f28a9e0b8ad 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2444,8 +2444,12 @@ static void skb_update_prio(struct sk_buff *skb) { struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); - if ((!skb->priority) && (skb->sk) && map) - skb->priority = map->priomap[skb->sk->sk_cgrp_prioidx]; + if (!skb->priority && skb->sk && map) { + unsigned int prioidx = skb->sk->sk_cgrp_prioidx; + + if (prioidx < map->priomap_len) + skb->priority = map->priomap[prioidx]; + } } #else #define skb_update_prio(skb) diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index aa907ed466ea..3e953eaddbfc 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -142,7 +142,7 @@ static void cgrp_destroy(struct cgroup *cgrp) rtnl_lock(); for_each_netdev(&init_net, dev) { map = rtnl_dereference(dev->priomap); - if (map) + if (map && cs->prioidx < map->priomap_len) map->priomap[cs->prioidx] = 0; } rtnl_unlock(); @@ -166,7 +166,7 @@ static int read_priomap(struct cgroup *cont, struct cftype *cft, rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) { map = rcu_dereference(dev->priomap); - priority = map ? map->priomap[prioidx] : 0; + priority = (map && prioidx < map->priomap_len) ? map->priomap[prioidx] : 0; cb->fill(cb, dev->name, priority); } rcu_read_unlock(); -- cgit v1.2.3 From 0f307323a48e47f064aa38e87f6fa03c88b551fc Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Sat, 7 Jul 2012 10:51:56 +0800 Subject: netprio_cgroup.c: fix comment typo poitner -> pointer. Signed-off-by: Liu Bo Signed-off-by: Jiri Kosina --- net/core/netprio_cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 5b8aa2fae48b..e13ad4946565 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -198,7 +198,7 @@ static int write_priomap(struct cgroup *cgrp, struct cftype *cft, /* *Separate the devname from the associated priority - *and advance the priostr poitner to the priority value + *and advance the priostr pointer to the priority value */ *priostr = '\0'; priostr++; -- cgit v1.2.3 From 87a50699cb6d169591cc776fb82683a2c77cecac Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 10 Jul 2012 05:06:14 -0700 Subject: rtnetlink: Remove ts/tsage args to rtnl_put_cacheinfo(). Nobody provides non-zero values any longer. Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 3 +-- net/core/rtnetlink.c | 4 +--- net/decnet/dn_route.c | 2 +- net/ipv4/route.c | 3 +-- net/ipv6/route.c | 3 +-- 5 files changed, 5 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index ea60b0854109..db71c4ad8624 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -619,8 +619,7 @@ extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, extern void rtnl_set_sk_err(struct net *net, u32 group, int error); extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics); extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, - u32 id, u32 ts, u32 tsage, long expires, - u32 error); + u32 id, long expires, u32 error); extern void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2b325c340b44..64127eee786d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -615,7 +615,7 @@ nla_put_failure: EXPORT_SYMBOL(rtnetlink_put_metrics); int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, - u32 ts, u32 tsage, long expires, u32 error) + long expires, u32 error) { struct rta_cacheinfo ci = { .rta_lastuse = jiffies_to_clock_t(jiffies - dst->lastuse), @@ -623,8 +623,6 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, .rta_clntref = atomic_read(&(dst->__refcnt)), .rta_error = error, .rta_id = id, - .rta_ts = ts, - .rta_tsage = tsage, }; if (expires) diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 707027fae8ab..b5594cc73ee1 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1590,7 +1590,7 @@ static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, goto errout; expires = rt->dst.expires ? rt->dst.expires - jiffies : 0; - if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0, expires, + if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) goto errout; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d4834e2914a0..67b08745daf9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2931,8 +2931,7 @@ static int rt_fill_info(struct net *net, goto nla_put_failure; } - if (rtnl_put_cacheinfo(skb, &rt->dst, id, 0, 0, - expires, error) < 0) + if (rtnl_put_cacheinfo(skb, &rt->dst, id, expires, error) < 0) goto nla_put_failure; return nlmsg_end(skb, nlh); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b7eb51e1a0e1..563f12c1c99c 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2471,8 +2471,7 @@ static int rt6_fill_node(struct net *net, else expires = INT_MAX; - if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0, - expires, rt->dst.error) < 0) + if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) goto nla_put_failure; return nlmsg_end(skb, nlh); -- cgit v1.2.3 From a55b138b1da3d25c04f66f8df03d659dfd46c950 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 10 Jul 2012 10:54:38 +0000 Subject: net: Properly define functions with no parameters Defining a function with no parameters as 'T foo()' is the deprecated K&R style, and is not strictly equivalent to defining it as 'T foo(void)'. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- drivers/net/cris/eth_v10.c | 2 +- net/core/dev.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/drivers/net/cris/eth_v10.c b/drivers/net/cris/eth_v10.c index 9c755db6b16d..f0c8bd54ce29 100644 --- a/drivers/net/cris/eth_v10.c +++ b/drivers/net/cris/eth_v10.c @@ -1008,7 +1008,7 @@ e100_send_mdio_bit(unsigned char bit) } static unsigned char -e100_receive_mdio_bit() +e100_receive_mdio_bit(void) { unsigned char bit; *R_NETWORK_MGM_CTRL = 0; diff --git a/net/core/dev.c b/net/core/dev.c index 69f7a1a393d8..9c21548e5b31 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1798,7 +1798,7 @@ EXPORT_SYMBOL(netif_set_real_num_rx_queues); * This routine should set an upper limit on the number of RSS queues * used by default by multiqueue devices. */ -int netif_get_num_default_rss_queues() +int netif_get_num_default_rss_queues(void) { return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus()); } -- cgit v1.2.3 From 2c53040f018b6c36a46eec75b9b937aaa5f78e6d Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 10 Jul 2012 10:55:09 +0000 Subject: net: Fix (nearly-)kernel-doc comments for various functions Fix incorrect start markers, wrapped summary lines, missing section breaks, incorrect separators, and some name mismatches. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/9p/trans_virtio.c | 2 +- net/appletalk/ddp.c | 8 +++--- net/batman-adv/bridge_loop_avoidance.c | 51 ++++++++++++++++++++++------------ net/batman-adv/hash.h | 3 +- net/batman-adv/main.h | 3 +- net/batman-adv/types.h | 3 +- net/core/dev.c | 8 ++++-- net/core/rtnetlink.c | 2 +- net/core/skbuff.c | 5 ++-- net/dccp/ackvec.h | 7 +++-- net/dccp/ccid.c | 1 + net/dccp/ccids/ccid3.c | 8 ++++-- net/dccp/ccids/lib/loss_interval.c | 1 + net/dccp/ccids/lib/packet_history.c | 3 +- net/dccp/ccids/lib/tfrc_equation.c | 2 ++ net/dccp/dccp.h | 1 + net/dccp/feat.c | 10 +++++++ net/dccp/input.c | 1 + net/dccp/options.c | 1 + net/dccp/output.c | 1 + net/ethernet/eth.c | 3 ++ net/ipv4/ipmr.c | 4 +-- net/ipv6/ip6_tunnel.c | 2 +- net/llc/af_llc.c | 2 +- net/llc/llc_station.c | 16 +++++------ net/mac80211/mesh.c | 2 +- net/mac80211/mesh_hwmp.c | 7 +++-- net/mac80211/mesh_pathtbl.c | 4 +-- net/mac80211/mesh_plink.c | 5 ++-- net/mac80211/rx.c | 2 +- net/netfilter/xt_TPROXY.c | 4 +-- net/netlink/genetlink.c | 2 +- net/rds/page.c | 9 +++--- net/rxrpc/ar-output.c | 2 +- net/sunrpc/backchannel_rqst.c | 9 +++--- net/sunrpc/clnt.c | 2 +- net/sunrpc/xdr.c | 12 ++++---- net/sunrpc/xprt.c | 2 +- net/tipc/bcast.c | 10 +++---- net/tipc/bearer.c | 7 ++--- net/tipc/bearer.h | 2 +- net/tipc/link.c | 22 +++++++-------- net/tipc/name_table.c | 10 +++---- net/tipc/port.c | 2 +- net/tipc/port.h | 1 + net/x25/x25_route.c | 2 +- 46 files changed, 163 insertions(+), 103 deletions(-) (limited to 'net/core') diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 2a167658bb95..35b8911b1c8e 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -212,7 +212,7 @@ static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req) * this takes a list of pages. * @sg: scatter/gather list to pack into * @start: which segment of the sg_list to start at - * @**pdata: a list of pages to add into sg. + * @pdata: a list of pages to add into sg. * @nr_pages: number of pages to pack into the scatter/gather list * @data: data to pack into scatter/gather list * @count: amount of data to pack into the scatter/gather list diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 86852963b7f7..33475291c9c1 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -129,8 +129,8 @@ found: /** * atalk_find_or_insert_socket - Try to find a socket matching ADDR - * @sk - socket to insert in the list if it is not there already - * @sat - address to search for + * @sk: socket to insert in the list if it is not there already + * @sat: address to search for * * Try to find a socket matching ADDR in the socket list, if found then return * it. If not, insert SK into the socket list. @@ -1066,8 +1066,8 @@ static int atalk_release(struct socket *sock) /** * atalk_pick_and_bind_port - Pick a source port when one is not given - * @sk - socket to insert into the tables - * @sat - address to search for + * @sk: socket to insert into the tables + * @sat: address to search for * * Pick a source port when one is not given. If we can find a suitable free * one, we insert the socket into the tables using it. diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 49e10d91c00b..3483e4035cbe 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -162,12 +162,13 @@ static struct batadv_claim *batadv_claim_hash_find(struct batadv_priv *bat_priv, return claim_tmp; } -/* @bat_priv: the bat priv with all the soft interface information +/** + * batadv_backbone_hash_find - looks for a claim in the hash + * @bat_priv: the bat priv with all the soft interface information * @addr: the address of the originator * @vid: the VLAN ID * - * looks for a claim in the hash, and returns it if found - * or NULL otherwise. + * Returns claim if found or NULL otherwise. */ static struct batadv_backbone_gw * batadv_backbone_hash_find(struct batadv_priv *bat_priv, @@ -242,12 +243,12 @@ batadv_bla_del_backbone_claims(struct batadv_backbone_gw *backbone_gw) backbone_gw->crc = BATADV_BLA_CRC_INIT; } -/* @bat_priv: the bat priv with all the soft interface information +/** + * batadv_bla_send_claim - sends a claim frame according to the provided info + * @bat_priv: the bat priv with all the soft interface information * @orig: the mac address to be announced within the claim * @vid: the VLAN ID * @claimtype: the type of the claim (CLAIM, UNCLAIM, ANNOUNCE, ...) - * - * sends a claim frame according to the provided info. */ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, uint8_t *mac, short vid, int claimtype) @@ -348,7 +349,9 @@ out: batadv_hardif_free_ref(primary_if); } -/* @bat_priv: the bat priv with all the soft interface information +/** + * batadv_bla_get_backbone_gw + * @bat_priv: the bat priv with all the soft interface information * @orig: the mac address of the originator * @vid: the VLAN ID * @@ -520,12 +523,12 @@ static void batadv_bla_send_announce(struct batadv_priv *bat_priv, } -/* @bat_priv: the bat priv with all the soft interface information +/** + * batadv_bla_add_claim - Adds a claim in the claim hash + * @bat_priv: the bat priv with all the soft interface information * @mac: the mac address of the claim * @vid: the VLAN ID of the frame * @backbone_gw: the backbone gateway which claims it - * - * Adds a claim in the claim hash. */ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, const uint8_t *mac, const short vid, @@ -743,7 +746,9 @@ static int batadv_handle_claim(struct batadv_priv *bat_priv, return 1; } -/* @bat_priv: the bat priv with all the soft interface information +/** + * batadv_check_claim_group + * @bat_priv: the bat priv with all the soft interface information * @hw_src: the Hardware source in the ARP Header * @hw_dst: the Hardware destination in the ARP Header * @ethhdr: pointer to the Ethernet header of the claim frame @@ -975,7 +980,9 @@ purge_now: } } -/* @bat_priv: the bat priv with all the soft interface information +/** + * batadv_bla_purge_claims + * @bat_priv: the bat priv with all the soft interface information * @primary_if: the selected primary interface, may be NULL if now is set * @now: whether the whole hash shall be wiped now * @@ -1023,7 +1030,9 @@ purge_now: } } -/* @bat_priv: the bat priv with all the soft interface information +/** + * batadv_bla_update_orig_address + * @bat_priv: the bat priv with all the soft interface information * @primary_if: the new selected primary_if * @oldif: the old primary interface, may be NULL * @@ -1193,7 +1202,9 @@ int batadv_bla_init(struct batadv_priv *bat_priv) return 0; } -/* @bat_priv: the bat priv with all the soft interface information +/** + * batadv_bla_check_bcast_duplist + * @bat_priv: the bat priv with all the soft interface information * @bcast_packet: originator mac address * @hdr_size: maximum length of the frame * @@ -1297,7 +1308,9 @@ int batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, uint8_t *orig) } -/* @skb: the frame to be checked +/** + * batadv_bla_is_backbone_gw + * @skb: the frame to be checked * @orig_node: the orig_node of the frame * @hdr_size: maximum length of the frame * @@ -1363,7 +1376,9 @@ void batadv_bla_free(struct batadv_priv *bat_priv) batadv_hardif_free_ref(primary_if); } -/* @bat_priv: the bat priv with all the soft interface information +/** + * batadv_bla_rx + * @bat_priv: the bat priv with all the soft interface information * @skb: the frame to be checked * @vid: the VLAN ID of the frame * @@ -1450,7 +1465,9 @@ out: return ret; } -/* @bat_priv: the bat priv with all the soft interface information +/** + * batadv_bla_tx + * @bat_priv: the bat priv with all the soft interface information * @skb: the frame to be checked * @vid: the VLAN ID of the frame * diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 83990e318e43..977de9c75fc2 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -81,7 +81,8 @@ static inline void batadv_hash_delete(struct batadv_hashtable *hash, batadv_hash_destroy(hash); } -/* hash_add - adds data to the hashtable +/** + * batadv_hash_add - adds data to the hashtable * @hash: storage hash table * @compare: callback to determine if 2 hash elements are identical * @choose: callback calculating the hash index diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index b8d4ac17f001..5d8fa0757947 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -216,7 +216,8 @@ static inline int batadv_compare_eth(const void *data1, const void *data2) return (memcmp(data1, data2, ETH_ALEN) == 0 ? 1 : 0); } -/* has_timed_out - compares current time (jiffies) and timestamp + timeout +/** + * has_timed_out - compares current time (jiffies) and timestamp + timeout * @timestamp: base value to compare with (in jiffies) * @timeout: added to base value before comparing (in milliseconds) * diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 2141c1304898..12635fd2c3d3 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -44,7 +44,8 @@ struct batadv_hard_iface { struct rcu_head rcu; }; -/* batadv_orig_node - structure for orig_list maintaining nodes of mesh +/** + * struct batadv_orig_node - structure for orig_list maintaining nodes of mesh * @primary_addr: hosts primary interface address * @last_seen: when last packet from this node was received * @bcast_seqno_reset: time when the broadcast seqno window was reset diff --git a/net/core/dev.c b/net/core/dev.c index 9c21548e5b31..5ab6f4b37c0c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1691,7 +1691,8 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) rcu_read_unlock(); } -/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change +/** + * netif_setup_tc - Handle tc mappings on real_num_tx_queues change * @dev: Network device * @txq: number of queues available * @@ -1793,7 +1794,8 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) EXPORT_SYMBOL(netif_set_real_num_rx_queues); #endif -/* netif_get_num_default_rss_queues - default number of RSS queues +/** + * netif_get_num_default_rss_queues - default number of RSS queues * * This routine should set an upper limit on the number of RSS queues * used by default by multiqueue devices. @@ -5670,7 +5672,7 @@ int netdev_refcnt_read(const struct net_device *dev) } EXPORT_SYMBOL(netdev_refcnt_read); -/* +/** * netdev_wait_allrefs - wait until all references are gone. * * This is called when unregistering network devices. diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 64127eee786d..045db8ad87c8 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2174,7 +2174,7 @@ skip: } /** - * ndo_dflt_fdb_dump: default netdevice operation to dump an FDB table. + * ndo_dflt_fdb_dump - default netdevice operation to dump an FDB table. * @nlh: netlink message header * @dev: netdevice * diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5a789a807ec3..506f678e9d95 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -713,7 +713,8 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) } EXPORT_SYMBOL_GPL(skb_morph); -/* skb_copy_ubufs - copy userspace skb frags buffers to kernel +/** + * skb_copy_ubufs - copy userspace skb frags buffers to kernel * @skb: the skb to modify * @gfp_mask: allocation priority * @@ -2614,7 +2615,7 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, EXPORT_SYMBOL(skb_find_text); /** - * skb_append_datato_frags: - append the user data to a skb + * skb_append_datato_frags - append the user data to a skb * @sk: sock structure * @skb: skb structure to be appened with user data. * @getfrag: call back function to be used for getting the user data diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index e2ab0627a5ff..a269aa7f7923 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -50,7 +50,8 @@ static inline u8 dccp_ackvec_state(const u8 *cell) return *cell & ~DCCPAV_MAX_RUNLEN; } -/** struct dccp_ackvec - Ack Vector main data structure +/** + * struct dccp_ackvec - Ack Vector main data structure * * This implements a fixed-size circular buffer within an array and is largely * based on Appendix A of RFC 4340. @@ -76,7 +77,8 @@ struct dccp_ackvec { struct list_head av_records; }; -/** struct dccp_ackvec_record - Records information about sent Ack Vectors +/** + * struct dccp_ackvec_record - Records information about sent Ack Vectors * * These list entries define the additional information which the HC-Receiver * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A. @@ -121,6 +123,7 @@ static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av) * @len: length of @vec * @nonce: whether @vec had an ECN nonce of 0 or 1 * @node: FIFO - arranged in descending order of ack_ackno + * * This structure is used by CCIDs to access Ack Vectors in a received skb. */ struct dccp_ackvec_parsed { diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index 48b585a5cba7..597557254ddb 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -46,6 +46,7 @@ bool ccid_support_check(u8 const *ccid_array, u8 array_len) * ccid_get_builtin_ccids - Populate a list of built-in CCIDs * @ccid_array: pointer to copy into * @array_len: value to return length into + * * This function allocates memory - caller must see that it is freed after use. */ int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 8c67bedf85b0..d65e98798eca 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -113,6 +113,7 @@ static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now) /** * ccid3_hc_tx_update_x - Update allowed sending rate X * @stamp: most recent time if available - can be left NULL. + * * This function tracks draft rfc3448bis, check there for latest details. * * Note: X and X_recv are both stored in units of 64 * bytes/second, to support @@ -161,9 +162,11 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) } } -/* - * Track the mean packet size `s' (cf. RFC 4342, 5.3 and RFC 3448, 4.1) +/** + * ccid3_hc_tx_update_s - Track the mean packet size `s' * @len: DCCP packet payload size in bytes + * + * cf. RFC 4342, 5.3 and RFC 3448, 4.1 */ static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hc, int len) { @@ -270,6 +273,7 @@ out: /** * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets * @skb: next packet candidate to send on @sk + * * This function uses the convention of ccid_packet_dequeue_eval() and * returns a millisecond-delay value between 0 and t_mbi = 64000 msec. */ diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c index 497723c4d4bb..57f9fd78c4df 100644 --- a/net/dccp/ccids/lib/loss_interval.c +++ b/net/dccp/ccids/lib/loss_interval.c @@ -133,6 +133,7 @@ static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, * @rh: Receive history containing a fresh loss event * @calc_first_li: Caller-dependent routine to compute length of first interval * @sk: Used by @calc_first_li in caller-specific way (subtyping) + * * Updates I_mean and returns 1 if a new interval has in fact been added to @lh. */ int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index de8fe294bf0b..08df7a3acb3d 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -315,6 +315,7 @@ static void __three_after_loss(struct tfrc_rx_hist *h) * @ndp: The NDP count belonging to @skb * @calc_first_li: Caller-dependent computation of first loss interval in @lh * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) + * * Chooses action according to pending loss, updates LI database when a new * loss was detected, and does required post-processing. Returns 1 when caller * should send feedback, 0 otherwise. @@ -387,7 +388,7 @@ static inline struct tfrc_rx_hist_entry * } /** - * tfrc_rx_hist_rtt_prev_s: previously suitable (wrt rtt_last_s) RTT-sampling entry + * tfrc_rx_hist_rtt_prev_s - previously suitable (wrt rtt_last_s) RTT-sampling entry */ static inline struct tfrc_rx_hist_entry * tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h) diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c index a052a4377e26..88ef98285bec 100644 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ b/net/dccp/ccids/lib/tfrc_equation.c @@ -611,6 +611,7 @@ static inline u32 tfrc_binsearch(u32 fval, u8 small) * @s: packet size in bytes * @R: RTT scaled by 1000000 (i.e., microseconds) * @p: loss ratio estimate scaled by 1000000 + * * Returns X_calc in bytes per second (not scaled). */ u32 tfrc_calc_x(u16 s, u32 R, u32 p) @@ -659,6 +660,7 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p) /** * tfrc_calc_x_reverse_lookup - try to find p given f(p) * @fvalue: function value to match, scaled by 1000000 + * * Returns closest match for p, also scaled by 1000000 */ u32 tfrc_calc_x_reverse_lookup(u32 fvalue) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 9040be049d8c..708e75bf623d 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -352,6 +352,7 @@ static inline int dccp_bad_service_code(const struct sock *sk, * @dccpd_opt_len: total length of all options (5.8) in the packet * @dccpd_seq: sequence number * @dccpd_ack_seq: acknowledgment number subheader field value + * * This is used for transmission as well as for reception. */ struct dccp_skb_cb { diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 78a2ad70e1b0..9733ddbc96cb 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -350,6 +350,7 @@ static int __dccp_feat_activate(struct sock *sk, const int idx, * @feat_num: feature to activate, one of %dccp_feature_numbers * @local: whether local (1) or remote (0) @feat_num is meant * @fval: the value (SP or NN) to activate, or NULL to use the default value + * * For general use this function is preferable over __dccp_feat_activate(). */ static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local, @@ -446,6 +447,7 @@ static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list, * @head: list to add to * @feat: feature number * @local: whether the local (1) or remote feature with number @feat is meant + * * This is the only constructor and serves to ensure the above invariants. */ static struct dccp_feat_entry * @@ -504,6 +506,7 @@ static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local, * @feat: one of %dccp_feature_numbers * @local: whether local (1) or remote (0) @feat_num is being confirmed * @fval: pointer to NN/SP value to be inserted or NULL + * * Returns 0 on success, a Reset code for further processing otherwise. */ static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local, @@ -691,6 +694,7 @@ int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq, * @feat: an NN feature from %dccp_feature_numbers * @mandatory: use Mandatory option if 1 * @nn_val: value to register (restricted to 4 bytes) + * * Note that NN features are local by definition (RFC 4340, 6.3.2). */ static int __feat_register_nn(struct list_head *fn, u8 feat, @@ -760,6 +764,7 @@ int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, * dccp_feat_nn_get - Query current/pending value of NN feature * @sk: DCCP socket of an established connection * @feat: NN feature number from %dccp_feature_numbers + * * For a known NN feature, returns value currently being negotiated, or * current (confirmed) value if no negotiation is going on. */ @@ -790,6 +795,7 @@ EXPORT_SYMBOL_GPL(dccp_feat_nn_get); * @sk: DCCP socket of an established connection * @feat: NN feature number from %dccp_feature_numbers * @nn_val: the new value to use + * * This function is used to communicate NN updates out-of-band. */ int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val) @@ -930,6 +936,7 @@ static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local) * @fn: feature-negotiation list to update * @id: CCID number to track * @is_local: whether TX CCID (1) or RX CCID (0) is meant + * * This function needs to be called after registering all other features. */ static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local) @@ -953,6 +960,7 @@ static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local) /** * dccp_feat_finalise_settings - Finalise settings before starting negotiation * @dp: client or listening socket (settings will be inherited) + * * This is called after all registrations (socket initialisation, sysctls, and * sockopt calls), and before sending the first packet containing Change options * (ie. client-Request or server-Response), to ensure internal consistency. @@ -1284,6 +1292,7 @@ confirmation_failed: * @feat: NN number, one of %dccp_feature_numbers * @val: NN value * @len: length of @val in bytes + * * This function combines the functionality of change_recv/confirm_recv, with * the following differences (reset codes are the same): * - cleanup after receiving the Confirm; @@ -1379,6 +1388,7 @@ fast_path_failed: * @feat: one of %dccp_feature_numbers * @val: value contents of @opt * @len: length of @val in bytes + * * Returns 0 on success, a Reset code for ending the connection otherwise. */ int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq, diff --git a/net/dccp/input.c b/net/dccp/input.c index bc93a333931e..14cdafad7a90 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -710,6 +710,7 @@ EXPORT_SYMBOL_GPL(dccp_rcv_state_process); /** * dccp_sample_rtt - Validate and finalise computation of RTT sample * @delta: number of microseconds between packet and acknowledgment + * * The routine is kept generic to work in different contexts. It should be * called immediately when the ACK used for the RTT sample arrives. */ diff --git a/net/dccp/options.c b/net/dccp/options.c index 68fa6b7a3e01..a58e0b634050 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -527,6 +527,7 @@ int dccp_insert_option_mandatory(struct sk_buff *skb) * @val: NN value or SP array (preferred element first) to copy * @len: true length of @val in bytes (excluding first element repetition) * @repeat_first: whether to copy the first element of @val twice + * * The last argument is used to construct Confirm options, where the preferred * value and the preference list appear separately (RFC 4340, 6.3.1). Preference * lists are kept such that the preferred entry is always first, so we only need diff --git a/net/dccp/output.c b/net/dccp/output.c index 787367308797..d17fc90a74b6 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -214,6 +214,7 @@ void dccp_write_space(struct sock *sk) * dccp_wait_for_ccid - Await CCID send permission * @sk: socket to wait for * @delay: timeout in jiffies + * * This is used by CCIDs which need to delay the send time in process context. */ static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay) diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index db6a6c17d790..4efad533e5f6 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -232,6 +232,7 @@ EXPORT_SYMBOL(eth_header_parse); * @neigh: source neighbour * @hh: destination cache entry * @type: Ethernet type field + * * Create an Ethernet header template from the neighbour. */ int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 type) @@ -274,6 +275,7 @@ EXPORT_SYMBOL(eth_header_cache_update); * eth_mac_addr - set new Ethernet hardware address * @dev: network device * @p: socket address + * * Change hardware address of device. * * This doesn't change hardware matching, so needs to be overridden @@ -331,6 +333,7 @@ const struct header_ops eth_header_ops ____cacheline_aligned = { /** * ether_setup - setup Ethernet network device * @dev: network device + * * Fill in the fields of the device structure with Ethernet-generic values. */ void ether_setup(struct net_device *dev) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index b4ac39f11d19..5716c6b808d6 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -524,8 +524,8 @@ failure: } #endif -/* - * Delete a VIF entry +/** + * vif_delete - Delete a VIF entry * @notify: Set to 1, if the caller is a notifier_call */ diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 04a3cba2c123..6af3fcfdcbbd 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -252,7 +252,7 @@ static void ip6_dev_free(struct net_device *dev) } /** - * ip6_tnl_create() - create a new tunnel + * ip6_tnl_create - create a new tunnel * @p: tunnel parameters * @pt: pointer to new tunnel * diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index fe5453c3e719..f6fe4d400502 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -1024,7 +1024,7 @@ static int llc_ui_ioctl(struct socket *sock, unsigned int cmd, * @sock: Socket to set options on. * @level: Socket level user is requesting operations on. * @optname: Operation name. - * @optval User provided operation data. + * @optval: User provided operation data. * @optlen: Length of optval. * * Set various connection specific parameters. diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c index cf4aea3ba30f..39a8d8924b9c 100644 --- a/net/llc/llc_station.c +++ b/net/llc/llc_station.c @@ -30,12 +30,12 @@ * * SAP and connection resource manager, one per adapter. * - * @state - state of station - * @xid_r_count - XID response PDU counter - * @mac_sa - MAC source address - * @sap_list - list of related SAPs - * @ev_q - events entering state mach. - * @mac_pdu_q - PDUs ready to send to MAC + * @state: state of station + * @xid_r_count: XID response PDU counter + * @mac_sa: MAC source address + * @sap_list: list of related SAPs + * @ev_q: events entering state mach. + * @mac_pdu_q: PDUs ready to send to MAC */ struct llc_station { u8 state; @@ -646,7 +646,7 @@ static void llc_station_service_events(void) } /** - * llc_station_state_process: queue event and try to process queue. + * llc_station_state_process - queue event and try to process queue. * @skb: Address of the event * * Queues an event (on the station event queue) for handling by the @@ -672,7 +672,7 @@ static void llc_station_ack_tmr_cb(unsigned long timeout_data) } } -/* +/** * llc_station_rcv - send received pdu to the station state machine * @skb: received frame. * diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 764593d65fc3..6fac18c0423f 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -133,7 +133,7 @@ bool mesh_peer_accepts_plinks(struct ieee802_11_elems *ie) } /** - * mesh_accept_plinks_update: update accepting_plink in local mesh beacons + * mesh_accept_plinks_update - update accepting_plink in local mesh beacons * * @sdata: mesh interface in which mesh beacons are going to be updated */ diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index fb7b6a11d0ba..494bc39f61a4 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -1054,12 +1054,15 @@ enddiscovery: kfree(preq_node); } -/* mesh_nexthop_resolve - lookup next hop for given skb and start path - * discovery if no forwarding information is found. +/** + * mesh_nexthop_resolve - lookup next hop; conditionally start path discovery * * @skb: 802.11 frame to be sent * @sdata: network subif the frame will be sent through * + * Lookup next hop for given skb and start path discovery if no + * forwarding information is found. + * * Returns: 0 if the next hop was found and -ENOENT if the frame was queued. * skb is freeed here if no mpath could be allocated. */ diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index c9ae931dd693..075bc535c601 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -778,7 +778,7 @@ static void __mesh_path_del(struct mesh_table *tbl, struct mpath_node *node) /** * mesh_path_flush_by_nexthop - Deletes mesh paths if their next hop matches * - * @sta - mesh peer to match + * @sta: mesh peer to match * * RCU notes: this function is called when a mesh plink transitions from * PLINK_ESTAB to any other state, since PLINK_ESTAB state is the only one that @@ -833,7 +833,7 @@ static void table_flush_by_iface(struct mesh_table *tbl, * * This function deletes both mesh paths as well as mesh portal paths. * - * @sdata - interface data to match + * @sdata: interface data to match * */ void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata) diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index a1dbd1540276..9ad74dd87a7b 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -99,7 +99,7 @@ static struct sta_info *mesh_plink_alloc(struct ieee80211_sub_if_data *sdata, return sta; } -/* +/** * mesh_set_ht_prot_mode - set correct HT protection mode * * Section 9.23.3.5 of IEEE 80211-2012 describes the protection rules for HT @@ -320,7 +320,8 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, return 0; } -/* mesh_peer_init - initialize new mesh peer and return resulting sta_info +/** + * mesh_peer_init - initialize new mesh peer and return resulting sta_info * * @sdata: local meshif * @addr: peer's address diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 839cac8fab57..67edd69e8421 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -94,7 +94,7 @@ ieee80211_rx_radiotap_len(struct ieee80211_local *local, return len; } -/* +/** * ieee80211_add_rx_radiotap_header - add radiotap header * * add a radiotap header containing all the fields which the hardware provided. diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 146033a86de8..d7f195388f66 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -69,7 +69,7 @@ tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) } /** - * tproxy_handle_time_wait4() - handle IPv4 TCP TIME_WAIT reopen redirections + * tproxy_handle_time_wait4 - handle IPv4 TCP TIME_WAIT reopen redirections * @skb: The skb being processed. * @laddr: IPv4 address to redirect to or zero. * @lport: TCP port to redirect to or zero. @@ -220,7 +220,7 @@ tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr, } /** - * tproxy_handle_time_wait6() - handle IPv6 TCP TIME_WAIT reopen redirections + * tproxy_handle_time_wait6 - handle IPv6 TCP TIME_WAIT reopen redirections * @skb: The skb being processed. * @tproto: Transport protocol. * @thoff: Transport protocol header offset. diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 32761b53015e..62ebe3c6291c 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -504,7 +504,7 @@ EXPORT_SYMBOL(genl_unregister_family); * @pid: netlink pid the message is addressed to * @seq: sequence number (usually the one of the sender) * @family: generic netlink family - * @flags netlink message flags + * @flags: netlink message flags * @cmd: generic netlink command * * Returns pointer to user specific header diff --git a/net/rds/page.c b/net/rds/page.c index 2499cd108421..9005a2c920ee 100644 --- a/net/rds/page.c +++ b/net/rds/page.c @@ -74,11 +74,12 @@ int rds_page_copy_user(struct page *page, unsigned long offset, } EXPORT_SYMBOL_GPL(rds_page_copy_user); -/* - * Message allocation uses this to build up regions of a message. +/** + * rds_page_remainder_alloc - build up regions of a message. * - * @bytes - the number of bytes needed. - * @gfp - the waiting behaviour of the allocation + * @scat: Scatter list for message + * @bytes: the number of bytes needed. + * @gfp: the waiting behaviour of the allocation * * @gfp is always ored with __GFP_HIGHMEM. Callers must be prepared to * kmap the pages, etc. diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c index 16ae88762d00..e1ac183d50bb 100644 --- a/net/rxrpc/ar-output.c +++ b/net/rxrpc/ar-output.c @@ -242,7 +242,7 @@ int rxrpc_kernel_send_data(struct rxrpc_call *call, struct msghdr *msg, EXPORT_SYMBOL(rxrpc_kernel_send_data); -/* +/** * rxrpc_kernel_abort_call - Allow a kernel service to abort a call * @call: The call to be aborted * @abort_code: The abort code to stick into the ABORT packet diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index 31def68a0f6e..5a3d675d2f2f 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -176,13 +176,14 @@ out_free: } EXPORT_SYMBOL_GPL(xprt_setup_backchannel); -/* - * Destroys the backchannel preallocated structures. +/** + * xprt_destroy_backchannel - Destroys the backchannel preallocated structures. + * @xprt: the transport holding the preallocated strucures + * @max_reqs the maximum number of preallocated structures to destroy + * * Since these structures may have been allocated by multiple calls * to xprt_setup_backchannel, we only destroy up to the maximum number * of reqs specified by the caller. - * @xprt: the transport holding the preallocated strucures - * @max_reqs the maximum number of preallocated structures to destroy */ void xprt_destroy_backchannel(struct rpc_xprt *xprt, unsigned int max_reqs) { diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index f56f045778ae..00eb859b7de5 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -385,7 +385,7 @@ out_no_rpciod: return ERR_PTR(err); } -/* +/** * rpc_create - create an RPC client and transport with one call * @args: rpc_clnt create argument structure * diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index fddcccfcdf76..0cf165580d8d 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -180,7 +180,9 @@ EXPORT_SYMBOL_GPL(xdr_inline_pages); /* * Helper routines for doing 'memmove' like operations on a struct xdr_buf - * + */ + +/** * _shift_data_right_pages * @pages: vector of pages containing both the source and dest memory area. * @pgto_base: page vector address of destination @@ -242,7 +244,7 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base, } while ((len -= copy) != 0); } -/* +/** * _copy_to_pages * @pages: array of pages * @pgbase: page vector address of destination @@ -286,7 +288,7 @@ _copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len) flush_dcache_page(*pgto); } -/* +/** * _copy_from_pages * @p: pointer to destination * @pages: array of pages @@ -326,7 +328,7 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len) } EXPORT_SYMBOL_GPL(_copy_from_pages); -/* +/** * xdr_shrink_bufhead * @buf: xdr_buf * @len: bytes to remove from buf->head[0] @@ -399,7 +401,7 @@ xdr_shrink_bufhead(struct xdr_buf *buf, size_t len) buf->len = buf->buflen; } -/* +/** * xdr_shrink_pagelen * @buf: xdr_buf * @len: bytes to remove from buf->pages diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 3c83035cdaa9..a5a402a7d21f 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -531,7 +531,7 @@ void xprt_set_retrans_timeout_def(struct rpc_task *task) } EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_def); -/* +/** * xprt_set_retrans_timeout_rtt - set a request's retransmit timeout * @task: task whose timeout is to be set * diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 2625f5ebe3e8..d9df34fbd7ca 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -162,7 +162,7 @@ static void bclink_update_last_sent(struct tipc_node *node, u32 seqno) } -/* +/** * tipc_bclink_retransmit_to - get most recent node to request retransmission * * Called with bc_lock locked @@ -270,7 +270,7 @@ exit: spin_unlock_bh(&bc_lock); } -/* +/** * tipc_bclink_update_link_state - update broadcast link state * * tipc_net_lock and node lock set @@ -330,7 +330,7 @@ void tipc_bclink_update_link_state(struct tipc_node *n_ptr, u32 last_sent) } } -/* +/** * bclink_peek_nack - monitor retransmission requests sent by other nodes * * Delay any upcoming NACK by this node if another node has already @@ -381,7 +381,7 @@ exit: return res; } -/* +/** * bclink_accept_pkt - accept an incoming, in-sequence broadcast packet * * Called with both sending node's lock and bc_lock taken. @@ -406,7 +406,7 @@ static void bclink_accept_pkt(struct tipc_node *node, u32 seqno) } } -/* +/** * tipc_bclink_recv_pkt - receive a broadcast packet, and deliver upwards * * tipc_net_lock is read_locked, no other locks set diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index a297e3a2e3e7..86b703f55092 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -172,8 +172,8 @@ struct sk_buff *tipc_media_get_names(void) /** * bearer_name_validate - validate & (optionally) deconstruct bearer name - * @name - ptr to bearer name string - * @name_parts - ptr to area for bearer name components (or NULL if not needed) + * @name: ptr to bearer name string + * @name_parts: ptr to area for bearer name components (or NULL if not needed) * * Returns 1 if bearer name is valid, otherwise 0. */ @@ -520,8 +520,7 @@ exit: } /** - * tipc_block_bearer(): Block the bearer with the given name, - * and reset all its links + * tipc_block_bearer - Block the bearer with the given name, and reset all its links */ int tipc_block_bearer(const char *name) { diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index e3b2be37fb31..4680de118aff 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -57,7 +57,7 @@ */ #define TIPC_MEDIA_TYPE_ETH 1 -/* +/** * struct tipc_media_addr - destination address used by TIPC bearers * @value: address info (format defined by media) * @media_id: TIPC media type identifier diff --git a/net/tipc/link.c b/net/tipc/link.c index 7a614f43549d..f6bf4830ddfe 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -153,8 +153,8 @@ int tipc_link_is_active(struct tipc_link *l_ptr) /** * link_name_validate - validate & (optionally) deconstruct tipc_link name - * @name - ptr to link name string - * @name_parts - ptr to area for link name components (or NULL if not needed) + * @name: ptr to link name string + * @name_parts: ptr to area for link name components (or NULL if not needed) * * Returns 1 if link name is valid, otherwise 0. */ @@ -944,7 +944,7 @@ int tipc_link_send(struct sk_buff *buf, u32 dest, u32 selector) return res; } -/* +/** * tipc_link_send_names - send name table entries to new neighbor * * Send routine for bulk delivery of name table messages when contact @@ -1787,7 +1787,7 @@ cont: read_unlock_bh(&tipc_net_lock); } -/* +/** * tipc_link_defer_pkt - Add out-of-sequence message to deferred reception queue * * Returns increase in queue length (i.e. 0 or 1) @@ -2635,8 +2635,8 @@ void tipc_link_set_queue_limits(struct tipc_link *l_ptr, u32 window) /** * link_find_link - locate link by name - * @name - ptr to link name string - * @node - ptr to area to be filled with ptr to associated node + * @name: ptr to link name string + * @node: ptr to area to be filled with ptr to associated node * * Caller must hold 'tipc_net_lock' to ensure node and bearer are not deleted; * this also prevents link deletion. @@ -2671,8 +2671,8 @@ static struct tipc_link *link_find_link(const char *name, /** * link_value_is_valid -- validate proposed link tolerance/priority/window * - * @cmd - value type (TIPC_CMD_SET_LINK_*) - * @new_value - the new value + * @cmd: value type (TIPC_CMD_SET_LINK_*) + * @new_value: the new value * * Returns 1 if value is within range, 0 if not. */ @@ -2693,9 +2693,9 @@ static int link_value_is_valid(u16 cmd, u32 new_value) /** * link_cmd_set_value - change priority/tolerance/window for link/bearer/media - * @name - ptr to link, bearer, or media name - * @new_value - new value of link, bearer, or media setting - * @cmd - which link, bearer, or media attribute to set (TIPC_CMD_SET_LINK_*) + * @name: ptr to link, bearer, or media name + * @new_value: new value of link, bearer, or media setting + * @cmd: which link, bearer, or media attribute to set (TIPC_CMD_SET_LINK_*) * * Caller must hold 'tipc_net_lock' to ensure link/bearer/media is not deleted. * diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 010f24a59da2..13fb9d559ea5 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -191,7 +191,7 @@ static void nameseq_delete_empty(struct name_seq *seq) } } -/* +/** * nameseq_find_subseq - find sub-sequence (if any) matching a name instance * * Very time-critical, so binary searches through sub-sequence array. @@ -435,7 +435,7 @@ found: } /** - * tipc_nameseq_subscribe: attach a subscription, and issue + * tipc_nameseq_subscribe - attach a subscription, and issue * the prescribed number of events if there is any sub- * sequence overlapping with the requested sequence */ @@ -520,7 +520,7 @@ struct publication *tipc_nametbl_remove_publ(u32 type, u32 lower, return publ; } -/* +/** * tipc_nametbl_translate - perform name translation * * On entry, 'destnode' is the search domain used during translation. @@ -751,7 +751,7 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *s) /** - * subseq_list: print specified sub-sequence contents into the given buffer + * subseq_list - print specified sub-sequence contents into the given buffer */ static void subseq_list(struct sub_seq *sseq, struct print_buf *buf, u32 depth, u32 index) @@ -787,7 +787,7 @@ static void subseq_list(struct sub_seq *sseq, struct print_buf *buf, u32 depth, } /** - * nameseq_list: print specified name sequence contents into the given buffer + * nameseq_list - print specified name sequence contents into the given buffer */ static void nameseq_list(struct name_seq *seq, struct print_buf *buf, u32 depth, u32 type, u32 lowbound, u32 upbound, u32 index) diff --git a/net/tipc/port.c b/net/tipc/port.c index a1e828989d7a..70bf78bd5b75 100644 --- a/net/tipc/port.c +++ b/net/tipc/port.c @@ -69,7 +69,7 @@ static u32 port_peerport(struct tipc_port *p_ptr) return msg_destport(&p_ptr->phdr); } -/* +/** * tipc_port_peer_msg - verify message was sent by connected port's peer * * Handles cases where the node's network address has changed from diff --git a/net/tipc/port.h b/net/tipc/port.h index 98cbec9c4532..4660e3065790 100644 --- a/net/tipc/port.h +++ b/net/tipc/port.h @@ -79,6 +79,7 @@ typedef void (*tipc_continue_event) (void *usr_handle, u32 portref); * struct user_port - TIPC user port (used with native API) * @usr_handle: user-specified field * @ref: object reference to associated TIPC port + * * */ struct user_port { diff --git a/net/x25/x25_route.c b/net/x25/x25_route.c index cf6366270054..277c8d2448d6 100644 --- a/net/x25/x25_route.c +++ b/net/x25/x25_route.c @@ -66,7 +66,7 @@ out: /** * __x25_remove_route - remove route from x25_route_list - * @rt - route to remove + * @rt: route to remove * * Remove route from x25_route_list. If it was there. * Caller must hold x25_route_list_lock. -- cgit v1.2.3 From 46d3ceabd8d98ed0ad10f20c595ca784e34786c5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 11 Jul 2012 05:50:31 +0000 Subject: tcp: TCP Small Queues This introduce TSQ (TCP Small Queues) TSQ goal is to reduce number of TCP packets in xmit queues (qdisc & device queues), to reduce RTT and cwnd bias, part of the bufferbloat problem. sk->sk_wmem_alloc not allowed to grow above a given limit, allowing no more than ~128KB [1] per tcp socket in qdisc/dev layers at a given time. TSO packets are sized/capped to half the limit, so that we have two TSO packets in flight, allowing better bandwidth use. As a side effect, setting the limit to 40000 automatically reduces the standard gso max limit (65536) to 40000/2 : It can help to reduce latencies of high prio packets, having smaller TSO packets. This means we divert sock_wfree() to a tcp_wfree() handler, to queue/send following frames when skb_orphan() [2] is called for the already queued skbs. Results on my dev machines (tg3/ixgbe nics) are really impressive, using standard pfifo_fast, and with or without TSO/GSO. Without reduction of nominal bandwidth, we have reduction of buffering per bulk sender : < 1ms on Gbit (instead of 50ms with TSO) < 8ms on 100Mbit (instead of 132 ms) I no longer have 4 MBytes backlogged in qdisc by a single netperf session, and both side socket autotuning no longer use 4 Mbytes. As skb destructor cannot restart xmit itself ( as qdisc lock might be taken at this point ), we delegate the work to a tasklet. We use one tasklest per cpu for performance reasons. If tasklet finds a socket owned by the user, it sets TSQ_OWNED flag. This flag is tested in a new protocol method called from release_sock(), to eventually send new segments. [1] New /proc/sys/net/ipv4/tcp_limit_output_bytes tunable [2] skb_orphan() is usually called at TX completion time, but some drivers call it in their start_xmit() handler. These drivers should at least use BQL, or else a single TCP session can still fill the whole NIC TX ring, since TSQ will have no effect. Signed-off-by: Eric Dumazet Cc: Dave Taht Cc: Tom Herbert Cc: Matt Mathis Cc: Yuchung Cheng Cc: Nandita Dukkipati Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 14 +++ include/linux/tcp.h | 9 ++ include/net/sock.h | 2 + include/net/tcp.h | 4 + net/core/sock.c | 4 + net/ipv4/sysctl_net_ipv4.c | 7 ++ net/ipv4/tcp.c | 6 ++ net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_minisocks.c | 1 + net/ipv4/tcp_output.c | 154 ++++++++++++++++++++++++++++++++- net/ipv6/tcp_ipv6.c | 1 + 11 files changed, 202 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 47b6c79e9b05..e20c17a7d34e 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -551,6 +551,20 @@ tcp_thin_dupack - BOOLEAN Documentation/networking/tcp-thin.txt Default: 0 +tcp_limit_output_bytes - INTEGER + Controls TCP Small Queue limit per tcp socket. + TCP bulk sender tends to increase packets in flight until it + gets losses notifications. With SNDBUF autotuning, this can + result in a large amount of packets queued in qdisc/device + on the local machine, hurting latency of other flows, for + typical pfifo_fast qdiscs. + tcp_limit_output_bytes limits the number of bytes on qdisc + or device to reduce artificial RTT/cwnd and reduce bufferbloat. + Note: For GSO/TSO enabled flows, we try to have at least two + packets in flight. Reducing tcp_limit_output_bytes might also + reduce the size of individual GSO packet (64KB being the max) + Default: 131072 + UDP variables: udp_mem - vector of 3 INTEGERs: min, pressure, max diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 2de9cf46f9fc..1888169e07c7 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -339,6 +339,9 @@ struct tcp_sock { u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ + struct list_head tsq_node; /* anchor in tsq_tasklet.head list */ + unsigned long tsq_flags; + /* Data for direct copy to user */ struct { struct sk_buff_head prequeue; @@ -494,6 +497,12 @@ struct tcp_sock { struct tcp_cookie_values *cookie_values; }; +enum tsq_flags { + TSQ_THROTTLED, + TSQ_QUEUED, + TSQ_OWNED, /* tcp_tasklet_func() found socket was locked */ +}; + static inline struct tcp_sock *tcp_sk(const struct sock *sk) { return (struct tcp_sock *)sk; diff --git a/include/net/sock.h b/include/net/sock.h index dcb54a0793ec..88de092df50f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -858,6 +858,8 @@ struct proto { int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb); + void (*release_cb)(struct sock *sk); + /* Keeping track of sk's, looking them up, and port selection methods. */ void (*hash)(struct sock *sk); void (*unhash)(struct sock *sk); diff --git a/include/net/tcp.h b/include/net/tcp.h index 3618fefae049..439984b9af49 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -253,6 +253,7 @@ extern int sysctl_tcp_cookie_size; extern int sysctl_tcp_thin_linear_timeouts; extern int sysctl_tcp_thin_dupack; extern int sysctl_tcp_early_retrans; +extern int sysctl_tcp_limit_output_bytes; extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; @@ -321,6 +322,8 @@ extern struct proto tcp_prot; extern void tcp_init_mem(struct net *net); +extern void tcp_tasklet_init(void); + extern void tcp_v4_err(struct sk_buff *skb, u32); extern void tcp_shutdown (struct sock *sk, int how); @@ -334,6 +337,7 @@ extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t size); extern int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); +extern void tcp_release_cb(struct sock *sk); extern int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg); extern int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, unsigned int len); diff --git a/net/core/sock.c b/net/core/sock.c index 929bdcc2383b..24039ac12426 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2159,6 +2159,10 @@ void release_sock(struct sock *sk) spin_lock_bh(&sk->sk_lock.slock); if (sk->sk_backlog.tail) __release_sock(sk); + + if (sk->sk_prot->release_cb) + sk->sk_prot->release_cb(sk); + sk->sk_lock.owned = 0; if (waitqueue_active(&sk->sk_lock.wq)) wake_up(&sk->sk_lock.wq); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 12aa0c5867c4..70730f7aeafe 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -598,6 +598,13 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_limit_output_bytes", + .data = &sysctl_tcp_limit_output_bytes, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, #ifdef CONFIG_NET_DMA { .procname = "tcp_dma_copybreak", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d902da96d154..4252cd8f39fd 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -376,6 +376,7 @@ void tcp_init_sock(struct sock *sk) skb_queue_head_init(&tp->out_of_order_queue); tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); + INIT_LIST_HEAD(&tp->tsq_node); icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev = TCP_TIMEOUT_INIT; @@ -796,6 +797,10 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, inet_csk(sk)->icsk_ext_hdr_len - tp->tcp_header_len); + /* TSQ : try to have two TSO segments in flight */ + xmit_size_goal = min_t(u32, xmit_size_goal, + sysctl_tcp_limit_output_bytes >> 1); + xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); /* We try hard to avoid divides here */ @@ -3574,4 +3579,5 @@ void __init tcp_init(void) tcp_secret_primary = &tcp_secret_one; tcp_secret_retiring = &tcp_secret_two; tcp_secret_secondary = &tcp_secret_two; + tcp_tasklet_init(); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ddefd39ac0cf..01545a3fc0f2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2588,6 +2588,7 @@ struct proto tcp_prot = { .sendmsg = tcp_sendmsg, .sendpage = tcp_sendpage, .backlog_rcv = tcp_v4_do_rcv, + .release_cb = tcp_release_cb, .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 65608863fdee..c66f2ede160e 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -424,6 +424,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, treq->snt_isn + 1 + tcp_s_data_size(oldtp); tcp_prequeue_init(newtp); + INIT_LIST_HEAD(&newtp->tsq_node); tcp_init_wl(newtp, treq->rcv_isn); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c465d3e51e28..03854abfd9d8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -50,6 +50,9 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1; */ int sysctl_tcp_workaround_signed_windows __read_mostly = 0; +/* Default TSQ limit of two TSO segments */ +int sysctl_tcp_limit_output_bytes __read_mostly = 131072; + /* This limits the percentage of the congestion window which we * will allow a single TSO frame to consume. Building TSO frames * which are too large can cause TCP streams to be bursty. @@ -65,6 +68,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); +static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + int push_one, gfp_t gfp); /* Account for new data that has been sent to the network. */ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) @@ -783,6 +788,140 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb return size; } + +/* TCP SMALL QUEUES (TSQ) + * + * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev) + * to reduce RTT and bufferbloat. + * We do this using a special skb destructor (tcp_wfree). + * + * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb + * needs to be reallocated in a driver. + * The invariant being skb->truesize substracted from sk->sk_wmem_alloc + * + * Since transmit from skb destructor is forbidden, we use a tasklet + * to process all sockets that eventually need to send more skbs. + * We use one tasklet per cpu, with its own queue of sockets. + */ +struct tsq_tasklet { + struct tasklet_struct tasklet; + struct list_head head; /* queue of tcp sockets */ +}; +static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet); + +/* + * One tasklest per cpu tries to send more skbs. + * We run in tasklet context but need to disable irqs when + * transfering tsq->head because tcp_wfree() might + * interrupt us (non NAPI drivers) + */ +static void tcp_tasklet_func(unsigned long data) +{ + struct tsq_tasklet *tsq = (struct tsq_tasklet *)data; + LIST_HEAD(list); + unsigned long flags; + struct list_head *q, *n; + struct tcp_sock *tp; + struct sock *sk; + + local_irq_save(flags); + list_splice_init(&tsq->head, &list); + local_irq_restore(flags); + + list_for_each_safe(q, n, &list) { + tp = list_entry(q, struct tcp_sock, tsq_node); + list_del(&tp->tsq_node); + + sk = (struct sock *)tp; + bh_lock_sock(sk); + + if (!sock_owned_by_user(sk)) { + if ((1 << sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | + TCPF_CLOSING | TCPF_CLOSE_WAIT)) + tcp_write_xmit(sk, + tcp_current_mss(sk), + 0, 0, + GFP_ATOMIC); + } else { + /* defer the work to tcp_release_cb() */ + set_bit(TSQ_OWNED, &tp->tsq_flags); + } + bh_unlock_sock(sk); + + clear_bit(TSQ_QUEUED, &tp->tsq_flags); + sk_free(sk); + } +} + +/** + * tcp_release_cb - tcp release_sock() callback + * @sk: socket + * + * called from release_sock() to perform protocol dependent + * actions before socket release. + */ +void tcp_release_cb(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (test_and_clear_bit(TSQ_OWNED, &tp->tsq_flags)) { + if ((1 << sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | + TCPF_CLOSING | TCPF_CLOSE_WAIT)) + tcp_write_xmit(sk, + tcp_current_mss(sk), + 0, 0, + GFP_ATOMIC); + } +} +EXPORT_SYMBOL(tcp_release_cb); + +void __init tcp_tasklet_init(void) +{ + int i; + + for_each_possible_cpu(i) { + struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i); + + INIT_LIST_HEAD(&tsq->head); + tasklet_init(&tsq->tasklet, + tcp_tasklet_func, + (unsigned long)tsq); + } +} + +/* + * Write buffer destructor automatically called from kfree_skb. + * We cant xmit new skbs from this context, as we might already + * hold qdisc lock. + */ +void tcp_wfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct tcp_sock *tp = tcp_sk(sk); + + if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) && + !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) { + unsigned long flags; + struct tsq_tasklet *tsq; + + /* Keep a ref on socket. + * This last ref will be released in tcp_tasklet_func() + */ + atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc); + + /* queue this socket to tasklet queue */ + local_irq_save(flags); + tsq = &__get_cpu_var(tsq_tasklet); + list_add(&tp->tsq_node, &tsq->head); + tasklet_schedule(&tsq->tasklet); + local_irq_restore(flags); + } else { + sock_wfree(skb); + } +} + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. @@ -844,7 +983,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); - skb_set_owner_w(skb, sk); + + skb_orphan(skb); + skb->sk = sk; + skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ? + tcp_wfree : sock_wfree; + atomic_add(skb->truesize, &sk->sk_wmem_alloc); /* Build TCP header and checksum it. */ th = tcp_hdr(skb); @@ -1780,6 +1924,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, while ((skb = tcp_send_head(sk))) { unsigned int limit; + tso_segs = tcp_init_tso_segs(sk, skb, mss_now); BUG_ON(!tso_segs); @@ -1800,6 +1945,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, break; } + /* TSQ : sk_wmem_alloc accounts skb truesize, + * including skb overhead. But thats OK. + */ + if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) { + set_bit(TSQ_THROTTLED, &tp->tsq_flags); + break; + } limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 61175cb2478f..70458a9cd837 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1970,6 +1970,7 @@ struct proto tcpv6_prot = { .sendmsg = tcp_sendmsg, .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, + .release_cb = tcp_release_cb, .hash = tcp_v6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, -- cgit v1.2.3 From 540eb7bf0bbedb65277d68ab89ae43cdec3fd6ba Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 12 Jul 2012 14:23:50 +0000 Subject: net: Update alloc frag to reduce get/put page usage and recycle pages This patch is meant to help improve performance by reducing the number of locked operations required to allocate a frag on x86 and other platforms. This is accomplished by using atomic_set operations on the page count instead of calling get_page and put_page. It is based on work originally provided by Eric Dumazet. In addition it also helps to reduce memory overhead when using TCP. This is done by recycling the page if the only holder of the frame is the netdev_alloc_frag call itself. This can occur when skb heads are stolen by either GRO or TCP and the driver providing the packets is using paged frags to store all of the data for the packets. Cc: Eric Dumazet Signed-off-by: Alexander Duyck Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 506f678e9d95..8b6d38fdb443 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -296,9 +296,12 @@ EXPORT_SYMBOL(build_skb); struct netdev_alloc_cache { struct page *page; unsigned int offset; + unsigned int pagecnt_bias; }; static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); +#define NETDEV_PAGECNT_BIAS (PAGE_SIZE / SMP_CACHE_BYTES) + /** * netdev_alloc_frag - allocate a page fragment * @fragsz: fragment size @@ -317,17 +320,26 @@ void *netdev_alloc_frag(unsigned int fragsz) if (unlikely(!nc->page)) { refill: nc->page = alloc_page(GFP_ATOMIC | __GFP_COLD); + if (unlikely(!nc->page)) + goto end; +recycle: + atomic_set(&nc->page->_count, NETDEV_PAGECNT_BIAS); + nc->pagecnt_bias = NETDEV_PAGECNT_BIAS; nc->offset = 0; } - if (likely(nc->page)) { - if (nc->offset + fragsz > PAGE_SIZE) { - put_page(nc->page); - goto refill; - } - data = page_address(nc->page) + nc->offset; - nc->offset += fragsz; - get_page(nc->page); + + if (nc->offset + fragsz > PAGE_SIZE) { + /* avoid unnecessary locked operations if possible */ + if ((atomic_read(&nc->page->_count) == nc->pagecnt_bias) || + atomic_sub_and_test(nc->pagecnt_bias, &nc->page->_count)) + goto recycle; + goto refill; } + + data = page_address(nc->page) + nc->offset; + nc->offset += fragsz; + nc->pagecnt_bias--; +end: local_irq_restore(flags); return data; } -- cgit v1.2.3 From 310e158cc3b7a6adf41e778d52be746c4dc88561 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Jul 2012 13:15:52 +0200 Subject: net: respect GFP_DMA in __netdev_alloc_skb() Few drivers use GFP_DMA allocations, and netdev_alloc_frag() doesn't allocate pages in DMA zone. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 46a3d23d259e..d124306b81fd 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -353,7 +353,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int fragsz = SKB_DATA_ALIGN(length + NET_SKB_PAD) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - if (fragsz <= PAGE_SIZE && !(gfp_mask & __GFP_WAIT)) { + if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) { void *data = netdev_alloc_frag(fragsz); if (likely(data)) { -- cgit v1.2.3 From 51d7cccf07238f5236c5b9269231a30dd5f8e714 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Mon, 16 Jul 2012 04:28:49 +0000 Subject: net: make sock diag per-namespace Before this patch sock_diag works for init_net only and dumps information about sockets from all namespaces. This patch expands sock_diag for all name-spaces. It creates a netlink kernel socket for each netns and filters data during dumping. v2: filter accoding with netns in all places remove an unused variable. Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Cc: Pavel Emelyanov CC: Eric Dumazet Cc: linux-kernel@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Andrew Vagin Acked-by: Pavel Emelyanov Signed-off-by: David S. Miller --- include/linux/sock_diag.h | 1 - include/net/net_namespace.h | 1 + net/core/sock_diag.c | 27 ++++++++++++++++++++------- net/ipv4/inet_diag.c | 21 ++++++++++++++++----- net/ipv4/udp_diag.c | 10 +++++++--- net/unix/diag.c | 9 +++++++-- 6 files changed, 51 insertions(+), 18 deletions(-) (limited to 'net/core') diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h index 6793fac5eab5..e3e395acc2fd 100644 --- a/include/linux/sock_diag.h +++ b/include/linux/sock_diag.h @@ -44,6 +44,5 @@ void sock_diag_save_cookie(void *sk, __u32 *cookie); int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attr); -extern struct sock *sock_diag_nlsk; #endif /* KERNEL */ #endif diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index ac9195e6a062..ae1cd6c9ba52 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -101,6 +101,7 @@ struct net { struct netns_xfrm xfrm; #endif struct netns_ipvs *ipvs; + struct sock *diag_nlsk; }; diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 07a29eb34a41..9d8755e4a7a5 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -166,23 +166,36 @@ static void sock_diag_rcv(struct sk_buff *skb) mutex_unlock(&sock_diag_mutex); } -struct sock *sock_diag_nlsk; -EXPORT_SYMBOL_GPL(sock_diag_nlsk); - -static int __init sock_diag_init(void) +static int __net_init diag_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = sock_diag_rcv, }; - sock_diag_nlsk = netlink_kernel_create(&init_net, NETLINK_SOCK_DIAG, + net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, THIS_MODULE, &cfg); - return sock_diag_nlsk == NULL ? -ENOMEM : 0; + return net->diag_nlsk == NULL ? -ENOMEM : 0; +} + +static void __net_exit diag_net_exit(struct net *net) +{ + netlink_kernel_release(net->diag_nlsk); + net->diag_nlsk = NULL; +} + +static struct pernet_operations diag_net_ops = { + .init = diag_net_init, + .exit = diag_net_exit, +}; + +static int __init sock_diag_init(void) +{ + return register_pernet_subsys(&diag_net_ops); } static void __exit sock_diag_exit(void) { - netlink_kernel_release(sock_diag_nlsk); + unregister_pernet_subsys(&diag_net_ops); } module_init(sock_diag_init); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 38064a285cca..570e61f9611f 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -272,16 +272,17 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s int err; struct sock *sk; struct sk_buff *rep; + struct net *net = sock_net(in_skb->sk); err = -EINVAL; if (req->sdiag_family == AF_INET) { - sk = inet_lookup(&init_net, hashinfo, req->id.idiag_dst[0], + sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_if); } #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) { - sk = inet6_lookup(&init_net, hashinfo, + sk = inet6_lookup(net, hashinfo, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, @@ -317,7 +318,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s nlmsg_free(rep); goto out; } - err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid, + err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); if (err > 0) err = 0; @@ -724,6 +725,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, { int i, num; int s_i, s_num; + struct net *net = sock_net(skb->sk); s_i = cb->args[1]; s_num = num = cb->args[2]; @@ -743,6 +745,9 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, sk_nulls_for_each(sk, node, &ilb->head) { struct inet_sock *inet = inet_sk(sk); + if (!net_eq(sock_net(sk), net)) + continue; + if (num < s_num) { num++; continue; @@ -813,6 +818,8 @@ skip_listen_ht: sk_nulls_for_each(sk, node, &head->chain) { struct inet_sock *inet = inet_sk(sk); + if (!net_eq(sock_net(sk), net)) + continue; if (num < s_num) goto next_normal; if (!(r->idiag_states & (1 << sk->sk_state))) @@ -839,6 +846,8 @@ next_normal: inet_twsk_for_each(tw, node, &head->twchain) { + if (!net_eq(twsk_net(tw), net)) + continue; if (num < s_num) goto next_dying; @@ -943,6 +952,7 @@ static int inet_diag_get_exact_compat(struct sk_buff *in_skb, static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) { int hdrlen = sizeof(struct inet_diag_req); + struct net *net = sock_net(skb->sk); if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX || nlmsg_len(nlh) < hdrlen) @@ -963,7 +973,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) struct netlink_dump_control c = { .dump = inet_diag_dump_compat, }; - return netlink_dump_start(sock_diag_nlsk, skb, nlh, &c); + return netlink_dump_start(net->diag_nlsk, skb, nlh, &c); } } @@ -973,6 +983,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) { int hdrlen = sizeof(struct inet_diag_req_v2); + struct net *net = sock_net(skb->sk); if (nlmsg_len(h) < hdrlen) return -EINVAL; @@ -991,7 +1002,7 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) struct netlink_dump_control c = { .dump = inet_diag_dump, }; - return netlink_dump_start(sock_diag_nlsk, skb, h, &c); + return netlink_dump_start(net->diag_nlsk, skb, h, &c); } } diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index a7f86a3cd502..16d0960062be 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -34,15 +34,16 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, int err = -EINVAL; struct sock *sk; struct sk_buff *rep; + struct net *net = sock_net(in_skb->sk); if (req->sdiag_family == AF_INET) - sk = __udp4_lib_lookup(&init_net, + sk = __udp4_lib_lookup(net, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_if, tbl); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) - sk = __udp6_lib_lookup(&init_net, + sk = __udp6_lib_lookup(net, (struct in6_addr *)req->id.idiag_src, req->id.idiag_sport, (struct in6_addr *)req->id.idiag_dst, @@ -75,7 +76,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, kfree_skb(rep); goto out; } - err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid, + err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); if (err > 0) err = 0; @@ -90,6 +91,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlin struct inet_diag_req_v2 *r, struct nlattr *bc) { int num, s_num, slot, s_slot; + struct net *net = sock_net(skb->sk); s_slot = cb->args[0]; num = s_num = cb->args[1]; @@ -106,6 +108,8 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlin sk_nulls_for_each(sk, node, &hslot->head) { struct inet_sock *inet = inet_sk(sk); + if (!net_eq(sock_net(sk), net)) + continue; if (num < s_num) goto next; if (!(r->idiag_states & (1 << sk->sk_state))) diff --git a/net/unix/diag.c b/net/unix/diag.c index a74864eedfcd..750b13408449 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -177,6 +177,7 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct unix_diag_req *req; int num, s_num, slot, s_slot; + struct net *net = sock_net(skb->sk); req = nlmsg_data(cb->nlh); @@ -192,6 +193,8 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) num = 0; sk_for_each(sk, node, &unix_socket_table[slot]) { + if (!net_eq(sock_net(sk), net)) + continue; if (num < s_num) goto next; if (!(req->udiag_states & (1 << sk->sk_state))) @@ -243,6 +246,7 @@ static int unix_diag_get_exact(struct sk_buff *in_skb, struct sock *sk; struct sk_buff *rep; unsigned int extra_len; + struct net *net = sock_net(in_skb->sk); if (req->udiag_ino == 0) goto out_nosk; @@ -273,7 +277,7 @@ again: goto again; } - err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid, + err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); if (err > 0) err = 0; @@ -287,6 +291,7 @@ out_nosk: static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) { int hdrlen = sizeof(struct unix_diag_req); + struct net *net = sock_net(skb->sk); if (nlmsg_len(h) < hdrlen) return -EINVAL; @@ -295,7 +300,7 @@ static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) struct netlink_dump_control c = { .dump = unix_diag_dump, }; - return netlink_dump_start(sock_diag_nlsk, skb, h, &c); + return netlink_dump_start(net->diag_nlsk, skb, h, &c); } else return unix_diag_get_exact(skb, h, nlmsg_data(h)); } -- cgit v1.2.3 From ef209f15980360f6945873df3cd710c5f62f2a3e Mon Sep 17 00:00:00 2001 From: Gao feng Date: Wed, 11 Jul 2012 21:50:15 +0000 Subject: net: cgroup: fix access the unallocated memory in netprio cgroup there are some out of bound accesses in netprio cgroup. now before accessing the dev->priomap.priomap array,we only check if the dev->priomap exist.and because we don't want to see additional bound checkings in fast path, so we should make sure that dev->priomap is null or array size of dev->priomap.priomap is equal to max_prioidx + 1; so in write_priomap logic,we should call extend_netdev_table when dev->priomap is null and dev->priomap.priomap_len < max_len. and in cgrp_create->update_netdev_tables logic,we should call extend_netdev_table only when dev->priomap exist and dev->priomap.priomap_len < max_len. and it's not needed to call update_netdev_tables in write_priomap, we can only allocate the net device's priomap which we change through net_prio.ifpriomap. this patch also add a return value for update_netdev_tables & extend_netdev_table, so when new_priomap is allocated failed, write_priomap will stop to access the priomap,and return -ENOMEM back to the userspace to tell the user what happend. Change From v3: 1. add rtnl protect when reading max_prioidx in write_priomap. 2. only call extend_netdev_table when map->priomap_len < max_len, this will make sure array size of dev->map->priomap always bigger than any prioidx. 3. add a function write_update_netdev_table to make codes clear. Change From v2: 1. protect extend_netdev_table by RTNL. 2. when extend_netdev_table failed,call dev_put to reduce device's refcount. Signed-off-by: Gao feng Cc: Neil Horman Cc: Eric Dumazet Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/core/netprio_cgroup.c | 71 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 54 insertions(+), 17 deletions(-) (limited to 'net/core') diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 3e953eaddbfc..b2e9caa1ad1a 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -65,7 +65,7 @@ static void put_prioidx(u32 idx) spin_unlock_irqrestore(&prioidx_map_lock, flags); } -static void extend_netdev_table(struct net_device *dev, u32 new_len) +static int extend_netdev_table(struct net_device *dev, u32 new_len) { size_t new_size = sizeof(struct netprio_map) + ((sizeof(u32) * new_len)); @@ -77,7 +77,7 @@ static void extend_netdev_table(struct net_device *dev, u32 new_len) if (!new_priomap) { pr_warn("Unable to alloc new priomap!\n"); - return; + return -ENOMEM; } for (i = 0; @@ -90,46 +90,79 @@ static void extend_netdev_table(struct net_device *dev, u32 new_len) rcu_assign_pointer(dev->priomap, new_priomap); if (old_priomap) kfree_rcu(old_priomap, rcu); + return 0; } -static void update_netdev_tables(void) +static int write_update_netdev_table(struct net_device *dev) { + int ret = 0; + u32 max_len; + struct netprio_map *map; + + rtnl_lock(); + max_len = atomic_read(&max_prioidx) + 1; + map = rtnl_dereference(dev->priomap); + if (!map || map->priomap_len < max_len) + ret = extend_netdev_table(dev, max_len); + rtnl_unlock(); + + return ret; +} + +static int update_netdev_tables(void) +{ + int ret = 0; struct net_device *dev; - u32 max_len = atomic_read(&max_prioidx) + 1; + u32 max_len; struct netprio_map *map; rtnl_lock(); + max_len = atomic_read(&max_prioidx) + 1; for_each_netdev(&init_net, dev) { map = rtnl_dereference(dev->priomap); - if ((!map) || - (map->priomap_len < max_len)) - extend_netdev_table(dev, max_len); + /* + * don't allocate priomap if we didn't + * change net_prio.ifpriomap (map == NULL), + * this will speed up skb_update_prio. + */ + if (map && map->priomap_len < max_len) { + ret = extend_netdev_table(dev, max_len); + if (ret < 0) + break; + } } rtnl_unlock(); + return ret; } static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp) { struct cgroup_netprio_state *cs; - int ret; + int ret = -EINVAL; cs = kzalloc(sizeof(*cs), GFP_KERNEL); if (!cs) return ERR_PTR(-ENOMEM); - if (cgrp->parent && cgrp_netprio_state(cgrp->parent)->prioidx) { - kfree(cs); - return ERR_PTR(-EINVAL); - } + if (cgrp->parent && cgrp_netprio_state(cgrp->parent)->prioidx) + goto out; ret = get_prioidx(&cs->prioidx); - if (ret != 0) { + if (ret < 0) { pr_warn("No space in priority index array\n"); - kfree(cs); - return ERR_PTR(ret); + goto out; + } + + ret = update_netdev_tables(); + if (ret < 0) { + put_prioidx(cs->prioidx); + goto out; } return &cs->css; +out: + kfree(cs); + return ERR_PTR(ret); } static void cgrp_destroy(struct cgroup *cgrp) @@ -221,13 +254,17 @@ static int write_priomap(struct cgroup *cgrp, struct cftype *cft, if (!dev) goto out_free_devname; - update_netdev_tables(); - ret = 0; + ret = write_update_netdev_table(dev); + if (ret < 0) + goto out_put_dev; + rcu_read_lock(); map = rcu_dereference(dev->priomap); if (map) map->priomap[prioidx] = priority; rcu_read_unlock(); + +out_put_dev: dev_put(dev); out_free_devname: -- cgit v1.2.3 From 30fdd8a082a00126a6feec994e43e8dc12f5bccb Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 17 Jul 2012 05:22:35 +0000 Subject: netpoll: move np->dev and np->dev_name init into __netpoll_setup() Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 4 +--- include/linux/netpoll.h | 2 +- net/8021q/vlan_dev.c | 5 +---- net/bridge/br_device.c | 5 +---- net/core/netpoll.c | 10 +++++----- 5 files changed, 9 insertions(+), 17 deletions(-) (limited to 'net/core') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 4ddcc3e41dab..1eb3979d0af5 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1240,9 +1240,7 @@ static inline int slave_enable_netpoll(struct slave *slave) if (!np) goto out; - np->dev = slave->dev; - strlcpy(np->dev_name, slave->dev->name, IFNAMSIZ); - err = __netpoll_setup(np); + err = __netpoll_setup(np, slave->dev); if (err) { kfree(np); goto out; diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 5dfa091c3347..28f5389c924b 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -43,7 +43,7 @@ struct netpoll_info { void netpoll_send_udp(struct netpoll *np, const char *msg, int len); void netpoll_print_options(struct netpoll *np); int netpoll_parse_options(struct netpoll *np, char *opt); -int __netpoll_setup(struct netpoll *np); +int __netpoll_setup(struct netpoll *np, struct net_device *ndev); int netpoll_setup(struct netpoll *np); int netpoll_trap(void); void netpoll_set_trap(int trap); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index da1bc9c3cf38..73a2a83ee2da 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -681,10 +681,7 @@ static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *n if (!netpoll) goto out; - netpoll->dev = real_dev; - strlcpy(netpoll->dev_name, real_dev->name, IFNAMSIZ); - - err = __netpoll_setup(netpoll); + err = __netpoll_setup(netpoll, real_dev); if (err) { kfree(netpoll); goto out; diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 929e48aed444..f4be1bbfef26 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -246,10 +246,7 @@ int br_netpoll_enable(struct net_bridge_port *p) if (!np) goto out; - np->dev = p->dev; - strlcpy(np->dev_name, p->dev->name, IFNAMSIZ); - - err = __netpoll_setup(np); + err = __netpoll_setup(np, p->dev); if (err) { kfree(np); goto out; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index f9f40b932e4b..b4c90e42b443 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -715,14 +715,16 @@ int netpoll_parse_options(struct netpoll *np, char *opt) } EXPORT_SYMBOL(netpoll_parse_options); -int __netpoll_setup(struct netpoll *np) +int __netpoll_setup(struct netpoll *np, struct net_device *ndev) { - struct net_device *ndev = np->dev; struct netpoll_info *npinfo; const struct net_device_ops *ops; unsigned long flags; int err; + np->dev = ndev; + strlcpy(np->dev_name, ndev->name, IFNAMSIZ); + if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) || !ndev->netdev_ops->ndo_poll_controller) { np_err(np, "%s doesn't support polling, aborting\n", @@ -851,13 +853,11 @@ int netpoll_setup(struct netpoll *np) np_info(np, "local IP %pI4\n", &np->local_ip); } - np->dev = ndev; - /* fill up the skb queue */ refill_skbs(); rtnl_lock(); - err = __netpoll_setup(np); + err = __netpoll_setup(np, ndev); rtnl_unlock(); if (err) -- cgit v1.2.3 From 02756ed4a79f15e4f265c1f6fbc634ce9966f153 Mon Sep 17 00:00:00 2001 From: Krishna Kumar Date: Tue, 17 Jul 2012 02:05:29 +0000 Subject: skbuff: Use correct allocation in skb_copy_ubufs Use correct allocation flags during copy of user space fragments to the kernel. Also "improve" couple of for loops. Signed-off-by: Krishna Kumar Signed-off-by: David S. Miller --- net/core/skbuff.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8b6d38fdb443..c011d7fab62d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -751,7 +751,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) u8 *vaddr; skb_frag_t *f = &skb_shinfo(skb)->frags[i]; - page = alloc_page(GFP_ATOMIC); + page = alloc_page(gfp_mask); if (!page) { while (head) { struct page *next = (struct page *)head->private; @@ -769,15 +769,15 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) } /* skb frags release userspace buffers */ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + for (i = 0; i < num_frags; i++) skb_frag_unref(skb, i); uarg->callback(uarg); /* skb frags point to kernel buffers */ - for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) { - __skb_fill_page_desc(skb, i-1, head, 0, - skb_shinfo(skb)->frags[i - 1].size); + for (i = num_frags - 1; i >= 0; i--) { + __skb_fill_page_desc(skb, i, head, 0, + skb_shinfo(skb)->frags[i].size); head = (struct page *)head->private; } -- cgit v1.2.3 From ddbe503203855939946430e39bae58de11b70b69 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Jul 2012 08:11:12 +0000 Subject: ipv6: add ipv6_addr_hash() helper Introduce ipv6_addr_hash() helper doing a XOR on all bits of an IPv6 address, with an optimized x86_64 version. Use it in flow dissector, as suggested by Andrew McGregor, to reduce hash collision probabilities in fq_codel (and other users of flow dissector) Use it in ip6_tunnel.c and use more bit shuffling, as suggested by David Laight, as existing hash was ignoring most of them. Use it in sunrpc and use more bit shuffling, using hash_32(). Use it in net/ipv6/addrconf.c, using hash_32() as well. As a cleanup, use it in net/ipv4/tcp_metrics.c Signed-off-by: Eric Dumazet Reported-by: Andrew McGregor Cc: Dave Taht Cc: Tom Herbert Cc: David Laight Cc: Joe Perches Signed-off-by: David S. Miller --- include/net/addrconf.h | 3 ++- include/net/ipv6.h | 13 +++++++++++++ net/core/flow_dissector.c | 5 +++-- net/ipv4/tcp_metrics.c | 15 +++------------ net/ipv6/addrconf.c | 21 ++++++++------------- net/ipv6/ip6_tunnel.c | 20 ++++++++++++-------- net/sunrpc/svcauth_unix.c | 22 ++++------------------ 7 files changed, 45 insertions(+), 54 deletions(-) (limited to 'net/core') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index f2b801c4b555..089a09d001d1 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -46,7 +46,8 @@ struct prefix_info { #include #include -#define IN6_ADDR_HSIZE 16 +#define IN6_ADDR_HSIZE_SHIFT 4 +#define IN6_ADDR_HSIZE (1 << IN6_ADDR_HSIZE_SHIFT) extern int addrconf_init(void); extern void addrconf_cleanup(void); diff --git a/include/net/ipv6.h b/include/net/ipv6.h index f695f39e8926..01c34b363a34 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -419,6 +419,19 @@ static inline bool ipv6_addr_any(const struct in6_addr *a) #endif } +static inline u32 ipv6_addr_hash(const struct in6_addr *a) +{ +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 + const unsigned long *ul = (const unsigned long *)a; + unsigned long x = ul[0] ^ ul[1]; + + return (u32)(x ^ (x >> 32)); +#else + return (__force u32)(a->s6_addr32[0] ^ a->s6_addr32[1] ^ + a->s6_addr32[2] ^ a->s6_addr32[3]); +#endif +} + static inline bool ipv6_addr_loopback(const struct in6_addr *a) { return (a->s6_addr32[0] | a->s6_addr32[1] | diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index a225089df5b6..466820b6e344 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -55,8 +56,8 @@ ipv6: return false; ip_proto = iph->nexthdr; - flow->src = iph->saddr.s6_addr32[3]; - flow->dst = iph->daddr.s6_addr32[3]; + flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr); + flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); nhoff += sizeof(struct ipv6hdr); break; } diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 5a38a2d5a95b..1a115b665792 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -211,10 +211,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, break; case AF_INET6: *(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr; - hash = ((__force unsigned int) addr.addr.a6[0] ^ - (__force unsigned int) addr.addr.a6[1] ^ - (__force unsigned int) addr.addr.a6[2] ^ - (__force unsigned int) addr.addr.a6[3]); + hash = ipv6_addr_hash(&inet6_rsk(req)->rmt_addr); break; default: return NULL; @@ -251,10 +248,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock case AF_INET6: tw6 = inet6_twsk((struct sock *)tw); *(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr; - hash = ((__force unsigned int) addr.addr.a6[0] ^ - (__force unsigned int) addr.addr.a6[1] ^ - (__force unsigned int) addr.addr.a6[2] ^ - (__force unsigned int) addr.addr.a6[3]); + hash = ipv6_addr_hash(&tw6->tw_v6_daddr); break; default: return NULL; @@ -291,10 +285,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, break; case AF_INET6: *(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr; - hash = ((__force unsigned int) addr.addr.a6[0] ^ - (__force unsigned int) addr.addr.a6[1] ^ - (__force unsigned int) addr.addr.a6[2] ^ - (__force unsigned int) addr.addr.a6[3]); + hash = ipv6_addr_hash(&inet6_sk(sk)->daddr); break; default: return NULL; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 8f6411c97189..79181819a24f 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -579,15 +580,9 @@ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) list_add_tail(&ifp->if_list, p); } -static u32 ipv6_addr_hash(const struct in6_addr *addr) +static u32 inet6_addr_hash(const struct in6_addr *addr) { - /* - * We perform the hash function over the last 64 bits of the address - * This will include the IEEE address token on links that support it. - */ - return jhash_2words((__force u32)addr->s6_addr32[2], - (__force u32)addr->s6_addr32[3], 0) - & (IN6_ADDR_HSIZE - 1); + return hash_32(ipv6_addr_hash(addr), IN6_ADDR_HSIZE_SHIFT); } /* On success it returns ifp with increased reference count */ @@ -662,7 +657,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, in6_ifa_hold(ifa); /* Add to big hash table */ - hash = ipv6_addr_hash(addr); + hash = inet6_addr_hash(addr); hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]); spin_unlock(&addrconf_hash_lock); @@ -1270,7 +1265,7 @@ int ipv6_chk_addr(struct net *net, const struct in6_addr *addr, { struct inet6_ifaddr *ifp; struct hlist_node *node; - unsigned int hash = ipv6_addr_hash(addr); + unsigned int hash = inet6_addr_hash(addr); rcu_read_lock_bh(); hlist_for_each_entry_rcu(ifp, node, &inet6_addr_lst[hash], addr_lst) { @@ -1293,7 +1288,7 @@ EXPORT_SYMBOL(ipv6_chk_addr); static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, struct net_device *dev) { - unsigned int hash = ipv6_addr_hash(addr); + unsigned int hash = inet6_addr_hash(addr); struct inet6_ifaddr *ifp; struct hlist_node *node; @@ -1336,7 +1331,7 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *add struct net_device *dev, int strict) { struct inet6_ifaddr *ifp, *result = NULL; - unsigned int hash = ipv6_addr_hash(addr); + unsigned int hash = inet6_addr_hash(addr); struct hlist_node *node; rcu_read_lock_bh(); @@ -3223,7 +3218,7 @@ int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr) int ret = 0; struct inet6_ifaddr *ifp = NULL; struct hlist_node *n; - unsigned int hash = ipv6_addr_hash(addr); + unsigned int hash = inet6_addr_hash(addr); rcu_read_lock_bh(); hlist_for_each_entry_rcu_bh(ifp, n, &inet6_addr_lst[hash], addr_lst) { diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index db3284667968..9a1d5fe6aef8 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -70,11 +71,15 @@ MODULE_ALIAS_NETDEV("ip6tnl0"); #define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK) #define IPV6_TCLASS_SHIFT 20 -#define HASH_SIZE 32 +#define HASH_SIZE_SHIFT 5 +#define HASH_SIZE (1 << HASH_SIZE_SHIFT) -#define HASH(addr) ((__force u32)((addr)->s6_addr32[0] ^ (addr)->s6_addr32[1] ^ \ - (addr)->s6_addr32[2] ^ (addr)->s6_addr32[3]) & \ - (HASH_SIZE - 1)) +static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2) +{ + u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2); + + return hash_32(hash, HASH_SIZE_SHIFT); +} static int ip6_tnl_dev_init(struct net_device *dev); static void ip6_tnl_dev_setup(struct net_device *dev); @@ -166,12 +171,11 @@ static inline void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst) static struct ip6_tnl * ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_addr *local) { - unsigned int h0 = HASH(remote); - unsigned int h1 = HASH(local); + unsigned int hash = HASH(remote, local); struct ip6_tnl *t; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); - for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[h0 ^ h1]) { + for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr) && (t->dev->flags & IFF_UP)) @@ -205,7 +209,7 @@ ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct ip6_tnl_parm *p) if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) { prio = 1; - h = HASH(remote) ^ HASH(local); + h = HASH(remote, local); } return &ip6n->tnls[prio][h]; } diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 2777fa896645..4d0129203733 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -104,23 +104,9 @@ static void ip_map_put(struct kref *kref) kfree(im); } -#if IP_HASHBITS == 8 -/* hash_long on a 64 bit machine is currently REALLY BAD for - * IP addresses in reverse-endian (i.e. on a little-endian machine). - * So use a trivial but reliable hash instead - */ -static inline int hash_ip(__be32 ip) -{ - int hash = (__force u32)ip ^ ((__force u32)ip>>16); - return (hash ^ (hash>>8)) & 0xff; -} -#endif -static inline int hash_ip6(struct in6_addr ip) +static inline int hash_ip6(const struct in6_addr *ip) { - return (hash_ip(ip.s6_addr32[0]) ^ - hash_ip(ip.s6_addr32[1]) ^ - hash_ip(ip.s6_addr32[2]) ^ - hash_ip(ip.s6_addr32[3])); + return hash_32(ipv6_addr_hash(ip), IP_HASHBITS); } static int ip_map_match(struct cache_head *corig, struct cache_head *cnew) { @@ -301,7 +287,7 @@ static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class, ip.m_addr = *addr; ch = sunrpc_cache_lookup(cd, &ip.h, hash_str(class, IP_HASHBITS) ^ - hash_ip6(*addr)); + hash_ip6(addr)); if (ch) return container_of(ch, struct ip_map, h); @@ -331,7 +317,7 @@ static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm, ip.h.expiry_time = expiry; ch = sunrpc_cache_update(cd, &ip.h, &ipm->h, hash_str(ipm->m_class, IP_HASHBITS) ^ - hash_ip6(ipm->m_addr)); + hash_ip6(&ipm->m_addr)); if (!ch) return -ENOMEM; cache_put(ch, cd); -- cgit v1.2.3 From 734b65417b24d6eea3e3d7457e1f11493890ee1d Mon Sep 17 00:00:00 2001 From: "Rustad, Mark D" Date: Wed, 18 Jul 2012 09:06:07 +0000 Subject: net: Statically initialize init_net.dev_base_head This change eliminates an initialization-order hazard most recently seen when netprio_cgroup is built into the kernel. With thanks to Eric Dumazet for catching a bug. Signed-off-by: Mark Rustad Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- net/core/net_namespace.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 0f28a9e0b8ad..1cb0d8a6aa6c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6283,7 +6283,8 @@ static struct hlist_head *netdev_create_hash(void) /* Initialize per network namespace state */ static int __net_init netdev_init(struct net *net) { - INIT_LIST_HEAD(&net->dev_base_head); + if (net != &init_net) + INIT_LIST_HEAD(&net->dev_base_head); net->dev_name_head = netdev_create_hash(); if (net->dev_name_head == NULL) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index dddbacb8f28c..42f1e1c7514f 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -27,7 +27,9 @@ static DEFINE_MUTEX(net_mutex); LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); -struct net init_net; +struct net init_net = { + .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head), +}; EXPORT_SYMBOL(init_net); #define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ -- cgit v1.2.3 From d40156aa5ecbd51fed932ed4813df82b56e5ff4d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 20 Jul 2012 02:28:47 +0000 Subject: rtnl: allow to specify different num for rx and tx queue count Also cut out unused function parameters and possible err in return value. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 14 ++++++++------ include/net/rtnetlink.h | 10 ++++++---- net/core/rtnetlink.c | 16 ++++++++-------- 3 files changed, 22 insertions(+), 18 deletions(-) (limited to 'net/core') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 3960b1b26178..f41ddc2d48be 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4845,17 +4845,19 @@ static int bond_validate(struct nlattr *tb[], struct nlattr *data[]) return 0; } -static int bond_get_tx_queues(struct net *net, struct nlattr *tb[]) +static unsigned int bond_get_num_tx_queues(void) { return tx_queues; } static struct rtnl_link_ops bond_link_ops __read_mostly = { - .kind = "bond", - .priv_size = sizeof(struct bonding), - .setup = bond_setup, - .validate = bond_validate, - .get_tx_queues = bond_get_tx_queues, + .kind = "bond", + .priv_size = sizeof(struct bonding), + .setup = bond_setup, + .validate = bond_validate, + .get_num_tx_queues = bond_get_num_tx_queues, + .get_num_rx_queues = bond_get_num_tx_queues, /* Use the same number + as for TX queues */ }; /* Create a new bond based on the specified name and bonding parameters. diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index bbcfd0993432..6b00c4fc4291 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -44,8 +44,10 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) * @get_xstats_size: Function to calculate required room for dumping device * specific statistics * @fill_xstats: Function to dump device specific statistics - * @get_tx_queues: Function to determine number of transmit queues to create when - * creating a new device. + * @get_num_tx_queues: Function to determine number of transmit queues + * to create when creating a new device. + * @get_num_rx_queues: Function to determine number of receive queues + * to create when creating a new device. */ struct rtnl_link_ops { struct list_head list; @@ -77,8 +79,8 @@ struct rtnl_link_ops { size_t (*get_xstats_size)(const struct net_device *dev); int (*fill_xstats)(struct sk_buff *skb, const struct net_device *dev); - int (*get_tx_queues)(struct net *net, - struct nlattr *tb[]); + unsigned int (*get_num_tx_queues)(void); + unsigned int (*get_num_rx_queues)(void); }; extern int __rtnl_link_register(struct rtnl_link_ops *ops); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 045db8ad87c8..db5a8ad8a79b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1624,17 +1624,17 @@ struct net_device *rtnl_create_link(struct net *src_net, struct net *net, { int err; struct net_device *dev; - unsigned int num_queues = 1; + unsigned int num_tx_queues = 1; + unsigned int num_rx_queues = 1; - if (ops->get_tx_queues) { - err = ops->get_tx_queues(src_net, tb); - if (err < 0) - goto err; - num_queues = err; - } + if (ops->get_num_tx_queues) + num_tx_queues = ops->get_num_tx_queues(); + if (ops->get_num_rx_queues) + num_rx_queues = ops->get_num_rx_queues(); err = -ENOMEM; - dev = alloc_netdev_mq(ops->priv_size, ifname, ops->setup, num_queues); + dev = alloc_netdev_mqs(ops->priv_size, ifname, ops->setup, + num_tx_queues, num_rx_queues); if (!dev) goto err; -- cgit v1.2.3 From 76ff5cc91935c51fcf1a6a99ffa28b97a6e7a884 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 20 Jul 2012 02:28:48 +0000 Subject: rtnl: allow to specify number of rx and tx queues on device creation This patch introduces IFLA_NUM_TX_QUEUES and IFLA_NUM_RX_QUEUES by which userspace can set number of rx and/or tx queues to be allocated for newly created netdevice. This overrides ops->get_num_[tr]x_queues() Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/if_link.h | 2 ++ net/core/rtnetlink.c | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/include/linux/if_link.h b/include/linux/if_link.h index f715750d0b87..ac173bd2ab65 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -140,6 +140,8 @@ enum { IFLA_EXT_MASK, /* Extended info mask, VFs, etc */ IFLA_PROMISCUITY, /* Promiscuity count: > 0 means acts PROMISC */ #define IFLA_PROMISCUITY IFLA_PROMISCUITY + IFLA_NUM_TX_QUEUES, + IFLA_NUM_RX_QUEUES, __IFLA_MAX }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index db5a8ad8a79b..5bb1ebca2eb0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -771,6 +771,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_LINK */ + nla_total_size(4) /* IFLA_MASTER */ + nla_total_size(4) /* IFLA_PROMISCUITY */ + + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */ + + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + nla_total_size(ext_filter_mask @@ -889,6 +891,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put_u32(skb, IFLA_MTU, dev->mtu) || nla_put_u32(skb, IFLA_GROUP, dev->group) || nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || + nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || + nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || (dev->ifindex != dev->iflink && nla_put_u32(skb, IFLA_LINK, dev->iflink)) || (dev->master && @@ -1106,6 +1110,8 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_AF_SPEC] = { .type = NLA_NESTED }, [IFLA_EXT_MASK] = { .type = NLA_U32 }, [IFLA_PROMISCUITY] = { .type = NLA_U32 }, + [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, + [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, }; EXPORT_SYMBOL(ifla_policy); @@ -1627,9 +1633,14 @@ struct net_device *rtnl_create_link(struct net *src_net, struct net *net, unsigned int num_tx_queues = 1; unsigned int num_rx_queues = 1; - if (ops->get_num_tx_queues) + if (tb[IFLA_NUM_TX_QUEUES]) + num_tx_queues = nla_get_u32(tb[IFLA_NUM_TX_QUEUES]); + else if (ops->get_num_tx_queues) num_tx_queues = ops->get_num_tx_queues(); - if (ops->get_num_rx_queues) + + if (tb[IFLA_NUM_RX_QUEUES]) + num_rx_queues = nla_get_u32(tb[IFLA_NUM_RX_QUEUES]); + else if (ops->get_num_rx_queues) num_rx_queues = ops->get_num_rx_queues(); err = -ENOMEM; -- cgit v1.2.3 From f5b0a8743601a4477419171f5046bd07d1c080a0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 19 Jul 2012 12:31:33 -0700 Subject: net: Document dst->obsolete better. Add a big comment explaining how the field works, and use defines instead of magic constants for the values assigned to it. Suggested by Joe Perches. Signed-off-by: David S. Miller --- include/net/dst.h | 14 +++++++++++++- net/core/dst.c | 4 ++-- net/decnet/dn_route.c | 4 ++-- net/ipv4/route.c | 5 +++-- net/ipv6/route.c | 4 ++-- net/sctp/transport.c | 2 +- net/xfrm/xfrm_policy.c | 23 ++++++++++++----------- 7 files changed, 35 insertions(+), 21 deletions(-) (limited to 'net/core') diff --git a/include/net/dst.h b/include/net/dst.h index 51610468c63d..0df661a0c295 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -65,7 +65,19 @@ struct dst_entry { unsigned short pending_confirm; short error; + + /* A non-zero value of dst->obsolete forces by-hand validation + * of the route entry. Positive values are set by the generic + * dst layer to indicate that the entry has been forcefully + * destroyed. + * + * Negative values are used by the implementation layer code to + * force invocation of the dst_ops->check() method. + */ short obsolete; +#define DST_OBSOLETE_NONE 0 +#define DST_OBSOLETE_DEAD 2 +#define DST_OBSOLETE_FORCE_CHK -1 unsigned short header_len; /* more space at head required */ unsigned short trailer_len; /* space to reserve at tail */ #ifdef CONFIG_IP_ROUTE_CLASSID @@ -359,7 +371,7 @@ extern struct dst_entry *dst_destroy(struct dst_entry *dst); static inline void dst_free(struct dst_entry *dst) { - if (dst->obsolete > 1) + if (dst->obsolete > 0) return; if (!atomic_read(&dst->__refcnt)) { dst = dst_destroy(dst); diff --git a/net/core/dst.c b/net/core/dst.c index 07bacff84aa4..069d51d29414 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -94,7 +94,7 @@ loop: * But we do not have state "obsoleted, but * referenced by parent", so it is right. */ - if (dst->obsolete > 1) + if (dst->obsolete > 0) continue; ___dst_free(dst); @@ -202,7 +202,7 @@ static void ___dst_free(struct dst_entry *dst) */ if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) dst->input = dst->output = dst_discard; - dst->obsolete = 2; + dst->obsolete = DST_OBSOLETE_DEAD; } void __dst_free(struct dst_entry *dst) diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 47de90d8fe94..23cc11dd4e40 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1176,7 +1176,7 @@ make_route: if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; - rt = dst_alloc(&dn_dst_ops, dev_out, 1, 0, DST_HOST); + rt = dst_alloc(&dn_dst_ops, dev_out, 1, DST_OBSOLETE_NONE, DST_HOST); if (rt == NULL) goto e_nobufs; @@ -1444,7 +1444,7 @@ static int dn_route_input_slow(struct sk_buff *skb) } make_route: - rt = dst_alloc(&dn_dst_ops, out_dev, 0, 0, DST_HOST); + rt = dst_alloc(&dn_dst_ops, out_dev, 0, DST_OBSOLETE_NONE, DST_HOST); if (rt == NULL) goto e_nobufs; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d1d579638092..50d2498c9284 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1221,7 +1221,7 @@ static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, static struct rtable *rt_dst_alloc(struct net_device *dev, bool nopolicy, bool noxfrm) { - return dst_alloc(&ipv4_dst_ops, dev, 1, -1, + return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, DST_HOST | DST_NOCACHE | (nopolicy ? DST_NOPOLICY : 0) | (noxfrm ? DST_NOXFRM : 0)); @@ -1969,9 +1969,10 @@ static struct dst_ops ipv4_dst_blackhole_ops = { struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) { - struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0); struct rtable *ort = (struct rtable *) dst_orig; + struct rtable *rt; + rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0); if (rt) { struct dst_entry *new = &rt->dst; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 84f6564dd372..cf02cb97bbdd 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -281,7 +281,7 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net, struct fib6_table *table) { struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, - 0, 0, flags); + 0, DST_OBSOLETE_NONE, flags); if (rt) { struct dst_entry *dst = &rt->dst; @@ -985,7 +985,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; struct dst_entry *new = NULL; - rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0); + rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0); if (rt) { new = &rt->dst; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index a6b7ee9ce28a..ec3a12b9b802 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -216,7 +216,7 @@ void sctp_transport_set_owner(struct sctp_transport *transport, void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) { /* If we don't have a fresh route, look one up */ - if (!transport->dst || transport->dst->obsolete > 1) { + if (!transport->dst || transport->dst->obsolete) { dst_release(transport->dst); transport->af_specific->get_dst(transport, &transport->saddr, &transport->fl, sk); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 65bd1ca51517..c5a5165a5927 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1350,7 +1350,7 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) default: BUG(); } - xdst = dst_alloc(dst_ops, NULL, 0, 0, 0); + xdst = dst_alloc(dst_ops, NULL, 0, DST_OBSOLETE_NONE, 0); if (likely(xdst)) { struct dst_entry *dst = &xdst->u.dst; @@ -1477,7 +1477,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, dst1->xfrm = xfrm[i]; xdst->xfrm_genid = xfrm[i]->genid; - dst1->obsolete = -1; + dst1->obsolete = DST_OBSOLETE_FORCE_CHK; dst1->flags |= DST_HOST; dst1->lastuse = now; @@ -2219,12 +2219,13 @@ EXPORT_SYMBOL(__xfrm_route_forward); static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie) { /* Code (such as __xfrm4_bundle_create()) sets dst->obsolete - * to "-1" to force all XFRM destinations to get validated by - * dst_ops->check on every use. We do this because when a - * normal route referenced by an XFRM dst is obsoleted we do - * not go looking around for all parent referencing XFRM dsts - * so that we can invalidate them. It is just too much work. - * Instead we make the checks here on every use. For example: + * to DST_OBSOLETE_FORCE_CHK to force all XFRM destinations to + * get validated by dst_ops->check on every use. We do this + * because when a normal route referenced by an XFRM dst is + * obsoleted we do not go looking around for all parent + * referencing XFRM dsts so that we can invalidate them. It + * is just too much work. Instead we make the checks here on + * every use. For example: * * XFRM dst A --> IPv4 dst X * @@ -2234,9 +2235,9 @@ static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie) * stale_bundle() check. * * When a policy's bundle is pruned, we dst_free() the XFRM - * dst which causes it's ->obsolete field to be set to a - * positive non-zero integer. If an XFRM dst has been pruned - * like this, we want to force a new route lookup. + * dst which causes it's ->obsolete field to be set to + * DST_OBSOLETE_DEAD. If an XFRM dst has been pruned like + * this, we want to force a new route lookup. */ if (dst->obsolete < 0 && !stale_bundle(dst)) return dst; -- cgit v1.2.3 From 1d69c2b343c7e1dc9584b7aa446f40dbab4c4f80 Mon Sep 17 00:00:00 2001 From: "Mark A. Greer" Date: Fri, 20 Jul 2012 13:35:13 +0000 Subject: rtnl: Add #ifdef CONFIG_RPS around num_rx_queues reference Commit 76ff5cc91935c51fcf1a6a99ffa28b97a6e7a884 (rtnl: allow to specify number of rx and tx queues on device creation) added a reference to the net_device structure's 'num_rx_queues' member in net/core/rtnetlink.c:rtnl_fill_ifinfo() However, the definition for 'num_rx_queues' is surrounded by an '#ifdef CONFIG_RPS' while the new reference to it is not. This causes a compile error when CONFIG_RPS is not defined. Fix the compile error by surrounding the new reference to 'num_rx_queues' by an '#ifdef CONFIG_RPS'. CC: Jiri Pirko Signed-off-by: Mark A. Greer Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 5bb1ebca2eb0..334b930e0de3 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -892,7 +892,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put_u32(skb, IFLA_GROUP, dev->group) || nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || +#ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || +#endif (dev->ifindex != dev->iflink && nla_put_u32(skb, IFLA_LINK, dev->iflink)) || (dev->master && -- cgit v1.2.3 From 70008aa50e927670ceee7f0c87e159ca7b1517a2 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 20 Jul 2012 09:23:10 +0000 Subject: skbuff: convert to skb_orphan_frags Reduce code duplication a bit using the new helper. Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- net/core/skbuff.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ccfcb7d8711e..438bbc5fd898 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -804,10 +804,8 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) { struct sk_buff *n; - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - if (skb_copy_ubufs(skb, gfp_mask)) - return NULL; - } + if (skb_orphan_frags(skb, gfp_mask)) + return NULL; n = skb + 1; if (skb->fclone == SKB_FCLONE_ORIG && @@ -927,12 +925,10 @@ struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) if (skb_shinfo(skb)->nr_frags) { int i; - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - if (skb_copy_ubufs(skb, gfp_mask)) { - kfree_skb(n); - n = NULL; - goto out; - } + if (skb_orphan_frags(skb, gfp_mask)) { + kfree_skb(n); + n = NULL; + goto out; } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; @@ -1005,10 +1001,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, */ if (skb_cloned(skb)) { /* copy this zero copy skb frags */ - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - if (skb_copy_ubufs(skb, gfp_mask)) - goto nofrags; - } + if (skb_orphan_frags(skb, gfp_mask)) + goto nofrags; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_frag_ref(skb, i); -- cgit v1.2.3 From 1080e512d44d4f67b8beb8edf25a1bbcb1066dc7 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 20 Jul 2012 09:23:17 +0000 Subject: net: orphan frags on receive zero copy packets are normally sent to the outside network, but bridging, tun etc might loop them back to host networking stack. If this happens destructors will never be called, so orphan the frags immediately on receive. Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- net/core/dev.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index d70e4a3a49f2..cca02ae7a844 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1632,6 +1632,8 @@ static inline int deliver_skb(struct sk_buff *skb, struct packet_type *pt_prev, struct net_device *orig_dev) { + if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + return -ENOMEM; atomic_inc(&skb->users); return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } @@ -3262,7 +3264,10 @@ ncls: } if (pt_prev) { - ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + ret = -ENOMEM; + else + ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } else { atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); -- cgit v1.2.3 From dcc0fb782b3a6e2abfeaaeb45dd88ed09596be0f Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 20 Jul 2012 09:23:20 +0000 Subject: skbuff: export skb_copy_ubufs Export skb_copy_ubufs so that modules can orphan frags. Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 438bbc5fd898..368f65c15e4f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -784,7 +784,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; return 0; } - +EXPORT_SYMBOL_GPL(skb_copy_ubufs); /** * skb_clone - duplicate an sk_buff -- cgit v1.2.3 From 406a3c638ce8b17d9704052c07955490f732c2b8 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 20 Jul 2012 10:39:25 +0000 Subject: net: netprio_cgroup: rework update socket logic Instead of updating the sk_cgrp_prioidx struct field on every send this only updates the field when a task is moved via cgroup infrastructure. This allows sockets that may be used by a kernel worker thread to be managed. For example in the iscsi case today a user can put iscsid in a netprio cgroup and control traffic will be sent with the correct sk_cgrp_prioidx value set but as soon as data is sent the kernel worker thread isssues a send and sk_cgrp_prioidx is updated with the kernel worker threads value which is the default case. It seems more correct to only update the field when the user explicitly sets it via control group infrastructure. This allows the users to manage sockets that may be used with other threads. Signed-off-by: John Fastabend Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/linux/net.h | 1 + include/net/netprio_cgroup.h | 4 ++-- net/core/netprio_cgroup.c | 53 ++++++++++++++++++++++++++++++++++++++++++++ net/core/sock.c | 6 ++--- net/socket.c | 5 ++--- 5 files changed, 61 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/include/linux/net.h b/include/linux/net.h index dc95700de5df..99276c3dc89a 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -248,6 +248,7 @@ extern int sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); extern int sock_map_fd(struct socket *sock, int flags); extern struct socket *sockfd_lookup(int fd, int *err); +extern struct socket *sock_from_file(struct file *file, int *err); #define sockfd_put(sock) fput(sock->file) extern int net_ratelimit(void); diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h index d58fdec47597..2719dec6b5a8 100644 --- a/include/net/netprio_cgroup.h +++ b/include/net/netprio_cgroup.h @@ -35,7 +35,7 @@ struct cgroup_netprio_state { extern int net_prio_subsys_id; #endif -extern void sock_update_netprioidx(struct sock *sk); +extern void sock_update_netprioidx(struct sock *sk, struct task_struct *task); #if IS_BUILTIN(CONFIG_NETPRIO_CGROUP) @@ -82,7 +82,7 @@ static inline u32 task_netprioidx(struct task_struct *p) #endif /* CONFIG_NETPRIO_CGROUP */ #else -#define sock_update_netprioidx(sk) +#define sock_update_netprioidx(sk, task) #endif #endif /* _NET_CLS_CGROUP_H */ diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index b2e9caa1ad1a..63d15e8f80e9 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -25,6 +25,8 @@ #include #include +#include + #define PRIOIDX_SZ 128 static unsigned long prioidx_map[PRIOIDX_SZ]; @@ -272,6 +274,56 @@ out_free_devname: return ret; } +void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +{ + struct task_struct *p; + char *tmp = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); + + if (!tmp) { + pr_warn("Unable to attach cgrp due to alloc failure!\n"); + return; + } + + cgroup_taskset_for_each(p, cgrp, tset) { + unsigned int fd; + struct fdtable *fdt; + struct files_struct *files; + + task_lock(p); + files = p->files; + if (!files) { + task_unlock(p); + continue; + } + + rcu_read_lock(); + fdt = files_fdtable(files); + for (fd = 0; fd < fdt->max_fds; fd++) { + char *path; + struct file *file; + struct socket *sock; + unsigned long s; + int rv, err = 0; + + file = fcheck_files(files, fd); + if (!file) + continue; + + path = d_path(&file->f_path, tmp, PAGE_SIZE); + rv = sscanf(path, "socket:[%lu]", &s); + if (rv <= 0) + continue; + + sock = sock_from_file(file, &err); + if (!err) + sock_update_netprioidx(sock->sk, p); + } + rcu_read_unlock(); + task_unlock(p); + } + kfree(tmp); +} + static struct cftype ss_files[] = { { .name = "prioidx", @@ -289,6 +341,7 @@ struct cgroup_subsys net_prio_subsys = { .name = "net_prio", .create = cgrp_create, .destroy = cgrp_destroy, + .attach = net_prio_attach, #ifdef CONFIG_NETPRIO_CGROUP .subsys_id = net_prio_subsys_id, #endif diff --git a/net/core/sock.c b/net/core/sock.c index 24039ac12426..2676a88f533e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1180,12 +1180,12 @@ void sock_update_classid(struct sock *sk) } EXPORT_SYMBOL(sock_update_classid); -void sock_update_netprioidx(struct sock *sk) +void sock_update_netprioidx(struct sock *sk, struct task_struct *task) { if (in_interrupt()) return; - sk->sk_cgrp_prioidx = task_netprioidx(current); + sk->sk_cgrp_prioidx = task_netprioidx(task); } EXPORT_SYMBOL_GPL(sock_update_netprioidx); #endif @@ -1215,7 +1215,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, atomic_set(&sk->sk_wmem_alloc, 1); sock_update_classid(sk); - sock_update_netprioidx(sk); + sock_update_netprioidx(sk, current); } return sk; diff --git a/net/socket.c b/net/socket.c index 0452dca4cd24..dfe5b66c97e0 100644 --- a/net/socket.c +++ b/net/socket.c @@ -398,7 +398,7 @@ int sock_map_fd(struct socket *sock, int flags) } EXPORT_SYMBOL(sock_map_fd); -static struct socket *sock_from_file(struct file *file, int *err) +struct socket *sock_from_file(struct file *file, int *err) { if (file->f_op == &socket_file_ops) return file->private_data; /* set in sock_map_fd */ @@ -406,6 +406,7 @@ static struct socket *sock_from_file(struct file *file, int *err) *err = -ENOTSOCK; return NULL; } +EXPORT_SYMBOL(sock_from_file); /** * sockfd_lookup - Go from a file number to its socket slot @@ -554,8 +555,6 @@ static inline int __sock_sendmsg_nosec(struct kiocb *iocb, struct socket *sock, sock_update_classid(sock->sk); - sock_update_netprioidx(sock->sk); - si->sock = sock; si->scm = NULL; si->msg = msg; -- cgit v1.2.3 From 6120d3dbb1220792ebea88cd475e1ec8f8620a93 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Jun 2012 10:03:05 +0400 Subject: get rid of ->scm_work_list recursion in __scm_destroy() will be cut by delaying final fput() Signed-off-by: Al Viro --- include/linux/sched.h | 1 - include/net/scm.h | 1 - net/core/scm.c | 22 +++------------------- 3 files changed, 3 insertions(+), 21 deletions(-) (limited to 'net/core') diff --git a/include/linux/sched.h b/include/linux/sched.h index af3555cc760f..598ba2da7865 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1546,7 +1546,6 @@ struct task_struct { unsigned long timer_slack_ns; unsigned long default_timer_slack_ns; - struct list_head *scm_work_list; #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack */ int curr_ret_stack; diff --git a/include/net/scm.h b/include/net/scm.h index d456f4c71a32..079d7887dac1 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -13,7 +13,6 @@ #define SCM_MAX_FD 253 struct scm_fp_list { - struct list_head list; short count; short max; struct file *fp[SCM_MAX_FD]; diff --git a/net/core/scm.c b/net/core/scm.c index 611c5efd4cb0..8f6ccfd68ef4 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -109,25 +109,9 @@ void __scm_destroy(struct scm_cookie *scm) if (fpl) { scm->fp = NULL; - if (current->scm_work_list) { - list_add_tail(&fpl->list, current->scm_work_list); - } else { - LIST_HEAD(work_list); - - current->scm_work_list = &work_list; - - list_add(&fpl->list, &work_list); - while (!list_empty(&work_list)) { - fpl = list_first_entry(&work_list, struct scm_fp_list, list); - - list_del(&fpl->list); - for (i=fpl->count-1; i>=0; i--) - fput(fpl->fp[i]); - kfree(fpl); - } - - current->scm_work_list = NULL; - } + for (i=fpl->count-1; i>=0; i--) + fput(fpl->fp[i]); + kfree(fpl); } } EXPORT_SYMBOL(__scm_destroy); -- cgit v1.2.3 From b68581778cd0051a3fb9a2b614dee7eccb5127ff Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 23 Jul 2012 16:27:54 -0700 Subject: net: Make skb->skb_iif always track skb->dev Make it follow device decapsulation, from things such as VLAN and bonding. The stuff that actually cares about pre-demuxed device pointers, is handled by the "orig_dev" variable in __netif_receive_skb(). And the only consumer of that is the po->origdev feature of AF_PACKET sockets. Signed-off-by: David S. Miller --- net/core/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index cca02ae7a844..0ebaea16632f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3173,8 +3173,6 @@ static int __netif_receive_skb(struct sk_buff *skb) if (netpoll_receive_skb(skb)) return NET_RX_DROP; - if (!skb->skb_iif) - skb->skb_iif = skb->dev->ifindex; orig_dev = skb->dev; skb_reset_network_header(skb); @@ -3186,6 +3184,7 @@ static int __netif_receive_skb(struct sk_buff *skb) rcu_read_lock(); another_round: + skb->skb_iif = skb->dev->ifindex; __this_cpu_inc(softnet_data.processed); -- cgit v1.2.3