From ee06b1728b95643668e40fc58ae118aeb7c1753e Mon Sep 17 00:00:00 2001 From: "Alvaro G. M" Date: Mon, 17 Jul 2017 09:12:28 +0200 Subject: net: axienet: add support for standard phy-mode binding Keep supporting proprietary "xlnx,phy-type" attribute and add support for MII connectivity to the PHY. Reviewed-by: Andrew Lunn Signed-off-by: Alvaro Gamez Machado Signed-off-by: David S. Miller --- .../devicetree/bindings/net/xilinx_axienet.txt | 55 ++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 Documentation/devicetree/bindings/net/xilinx_axienet.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/xilinx_axienet.txt b/Documentation/devicetree/bindings/net/xilinx_axienet.txt new file mode 100644 index 000000000000..38f9ec076743 --- /dev/null +++ b/Documentation/devicetree/bindings/net/xilinx_axienet.txt @@ -0,0 +1,55 @@ +XILINX AXI ETHERNET Device Tree Bindings +-------------------------------------------------------- + +Also called AXI 1G/2.5G Ethernet Subsystem, the xilinx axi ethernet IP core +provides connectivity to an external ethernet PHY supporting different +interfaces: MII, GMII, RGMII, SGMII, 1000BaseX. It also includes two +segments of memory for buffering TX and RX, as well as the capability of +offloading TX/RX checksum calculation off the processor. + +Management configuration is done through the AXI interface, while payload is +sent and received through means of an AXI DMA controller. This driver +includes the DMA driver code, so this driver is incompatible with AXI DMA +driver. + +For more details about mdio please refer phy.txt file in the same directory. + +Required properties: +- compatible : Must be one of "xlnx,axi-ethernet-1.00.a", + "xlnx,axi-ethernet-1.01.a", "xlnx,axi-ethernet-2.01.a" +- reg : Address and length of the IO space. +- interrupts : Should be a list of two interrupt, TX and RX. +- phy-handle : Should point to the external phy device. + See ethernet.txt file in the same directory. +- xlnx,rxmem : Set to allocated memory buffer for Rx/Tx in the hardware + +Optional properties: +- phy-mode : See ethernet.txt +- xlnx,phy-type : Deprecated, do not use, but still accepted in preference + to phy-mode. +- xlnx,txcsum : 0 or empty for disabling TX checksum offload, + 1 to enable partial TX checksum offload, + 2 to enable full TX checksum offload +- xlnx,rxcsum : Same values as xlnx,txcsum but for RX checksum offload + +Example: + axi_ethernet_eth: ethernet@40c00000 { + compatible = "xlnx,axi-ethernet-1.00.a"; + device_type = "network"; + interrupt-parent = <µblaze_0_axi_intc>; + interrupts = <2 0>; + phy-mode = "mii"; + reg = <0x40c00000 0x40000>; + xlnx,rxcsum = <0x2>; + xlnx,rxmem = <0x800>; + xlnx,txcsum = <0x2>; + phy-handle = <&phy0>; + axi_ethernetlite_0_mdio: mdio { + #address-cells = <1>; + #size-cells = <0>; + phy0: phy@0 { + device_type = "ethernet-phy"; + reg = <1>; + }; + }; + }; -- cgit v1.2.3 From b3a703c7a698bcc0938fe22ba74d32593ad6e662 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Mon, 17 Jul 2017 09:33:52 +0100 Subject: dt-bindings: net: ravb : Add support for r8a7743 SoC Add a new compatible string for the RZ/G1M (R8A7743) SoC. Signed-off-by: Biju Das Reviewed-by: Geert Uytterhoeven Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- .../devicetree/bindings/net/renesas,ravb.txt | 29 +++++++++++++--------- 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/renesas,ravb.txt b/Documentation/devicetree/bindings/net/renesas,ravb.txt index b519503be51a..4717bc24eada 100644 --- a/Documentation/devicetree/bindings/net/renesas,ravb.txt +++ b/Documentation/devicetree/bindings/net/renesas,ravb.txt @@ -4,19 +4,24 @@ This file provides information on what the device node for the Ethernet AVB interface contains. Required properties: -- compatible: "renesas,etheravb-r8a7790" if the device is a part of R8A7790 SoC. - "renesas,etheravb-r8a7791" if the device is a part of R8A7791 SoC. - "renesas,etheravb-r8a7792" if the device is a part of R8A7792 SoC. - "renesas,etheravb-r8a7793" if the device is a part of R8A7793 SoC. - "renesas,etheravb-r8a7794" if the device is a part of R8A7794 SoC. - "renesas,etheravb-r8a7795" if the device is a part of R8A7795 SoC. - "renesas,etheravb-r8a7796" if the device is a part of R8A7796 SoC. - "renesas,etheravb-rcar-gen2" for generic R-Car Gen 2 compatible interface. - "renesas,etheravb-rcar-gen3" for generic R-Car Gen 3 compatible interface. +- compatible: Must contain one or more of the following: + - "renesas,etheravb-r8a7743" for the R8A7743 SoC. + - "renesas,etheravb-r8a7790" for the R8A7790 SoC. + - "renesas,etheravb-r8a7791" for the R8A7791 SoC. + - "renesas,etheravb-r8a7792" for the R8A7792 SoC. + - "renesas,etheravb-r8a7793" for the R8A7793 SoC. + - "renesas,etheravb-r8a7794" for the R8A7794 SoC. + - "renesas,etheravb-rcar-gen2" as a fallback for the above + R-Car Gen2 and RZ/G1 devices. - When compatible with the generic version, nodes must list the - SoC-specific version corresponding to the platform first - followed by the generic version. + - "renesas,etheravb-r8a7795" for the R8A7795 SoC. + - "renesas,etheravb-r8a7796" for the R8A7796 SoC. + - "renesas,etheravb-rcar-gen3" as a fallback for the above + R-Car Gen3 devices. + + When compatible with the generic version, nodes must list the + SoC-specific version corresponding to the platform first followed by + the generic version. - reg: offset and length of (1) the register block and (2) the stream buffer. - interrupts: A list of interrupt-specifiers, one for each entry in -- cgit v1.2.3 From 3c2a89ddc11896cf5498115c0380ab54b1c424b7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 17 Jul 2017 13:57:20 +0200 Subject: net: xfrm: revert to lower xfrm dst gc limit revert c386578f1cdb4dac230395 ("xfrm: Let the flowcache handle its size by default."). Once we remove flow cache, we don't have a flow cache limit anymore. We must not allow (virtually) unlimited allocations of xfrm dst entries. Revert back to the old xfrm dst gc limits. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 6 ++---- net/ipv4/xfrm4_policy.c | 2 +- net/ipv6/xfrm6_policy.c | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 974ab47ae53a..f485d553e65c 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1291,8 +1291,7 @@ tag - INTEGER xfrm4_gc_thresh - INTEGER The threshold at which we will start garbage collecting for IPv4 destination cache entries. At twice this value the system will - refuse new allocations. The value must be set below the flowcache - limit (4096 * number of online cpus) to take effect. + refuse new allocations. igmp_link_local_mcast_reports - BOOLEAN Enable IGMP reports for link local multicast groups in the @@ -1778,8 +1777,7 @@ ratelimit - INTEGER xfrm6_gc_thresh - INTEGER The threshold at which we will start garbage collecting for IPv6 destination cache entries. At twice this value the system will - refuse new allocations. The value must be set below the flowcache - limit (4096 * number of online cpus) to take effect. + refuse new allocations. IPv6 Update by: diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 71b4ecc195c7..19455a5fc328 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -266,7 +266,7 @@ static struct dst_ops xfrm4_dst_ops_template = { .destroy = xfrm4_dst_destroy, .ifdown = xfrm4_dst_ifdown, .local_out = __ip_local_out, - .gc_thresh = INT_MAX, + .gc_thresh = 32768, }; static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = { diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 79651bc71bf0..ae30dc4973e8 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -286,7 +286,7 @@ static struct dst_ops xfrm6_dst_ops_template = { .destroy = xfrm6_dst_destroy, .ifdown = xfrm6_dst_ifdown, .local_out = __ip6_local_out, - .gc_thresh = INT_MAX, + .gc_thresh = 32768, }; static const struct xfrm_policy_afinfo xfrm6_policy_afinfo = { -- cgit v1.2.3 From e45eba2467bd64fd196dc6f8b50ff5e59c0058da Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 12 Jul 2017 13:14:48 +0200 Subject: batman-adv: Convert batman-adv.txt to reStructuredText Converting the freeform text to parsable reStructuredText, allows the integration in the sphinx based documentation system of the kernel. It will therefore be accessible as hypertext under https://www.kernel.org/doc/html/latest/ Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- Documentation/networking/00-INDEX | 2 - Documentation/networking/batman-adv.rst | 220 ++++++++++++++++++++++++++++++++ Documentation/networking/batman-adv.txt | 215 ------------------------------- Documentation/networking/index.rst | 1 + MAINTAINERS | 2 +- 5 files changed, 222 insertions(+), 218 deletions(-) create mode 100644 Documentation/networking/batman-adv.rst delete mode 100644 Documentation/networking/batman-adv.txt (limited to 'Documentation') diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX index c6beb5f1637f..7a79b3587dd3 100644 --- a/Documentation/networking/00-INDEX +++ b/Documentation/networking/00-INDEX @@ -30,8 +30,6 @@ atm.txt - info on where to get ATM programs and support for Linux. ax25.txt - info on using AX.25 and NET/ROM code for Linux -batman-adv.txt - - B.A.T.M.A.N routing protocol on top of layer 2 Ethernet Frames. baycom.txt - info on the driver for Baycom style amateur radio modems bonding.txt diff --git a/Documentation/networking/batman-adv.rst b/Documentation/networking/batman-adv.rst new file mode 100644 index 000000000000..a342b2cc3dc6 --- /dev/null +++ b/Documentation/networking/batman-adv.rst @@ -0,0 +1,220 @@ +========== +batman-adv +========== + +Batman advanced is a new approach to wireless networking which does no longer +operate on the IP basis. Unlike the batman daemon, which exchanges information +using UDP packets and sets routing tables, batman-advanced operates on ISO/OSI +Layer 2 only and uses and routes (or better: bridges) Ethernet Frames. It +emulates a virtual network switch of all nodes participating. Therefore all +nodes appear to be link local, thus all higher operating protocols won't be +affected by any changes within the network. You can run almost any protocol +above batman advanced, prominent examples are: IPv4, IPv6, DHCP, IPX. + +Batman advanced was implemented as a Linux kernel driver to reduce the overhead +to a minimum. It does not depend on any (other) network driver, and can be used +on wifi as well as ethernet lan, vpn, etc ... (anything with ethernet-style +layer 2). + + +Configuration +============= + +Load the batman-adv module into your kernel:: + + $ insmod batman-adv.ko + +The module is now waiting for activation. You must add some interfaces on which +batman can operate. After loading the module batman advanced will scan your +systems interfaces to search for compatible interfaces. Once found, it will +create subfolders in the ``/sys`` directories of each supported interface, +e.g.:: + + $ ls /sys/class/net/eth0/batman_adv/ + elp_interval iface_status mesh_iface throughput_override + +If an interface does not have the ``batman_adv`` subfolder, it probably is not +supported. Not supported interfaces are: loopback, non-ethernet and batman's +own interfaces. + +Note: After the module was loaded it will continuously watch for new +interfaces to verify the compatibility. There is no need to reload the module +if you plug your USB wifi adapter into your machine after batman advanced was +initially loaded. + +The batman-adv soft-interface can be created using the iproute2 tool ``ip``:: + + $ ip link add name bat0 type batadv + +To activate a given interface simply attach it to the ``bat0`` interface:: + + $ ip link set dev eth0 master bat0 + +Repeat this step for all interfaces you wish to add. Now batman starts +using/broadcasting on this/these interface(s). + +By reading the "iface_status" file you can check its status:: + + $ cat /sys/class/net/eth0/batman_adv/iface_status + active + +To deactivate an interface you have to detach it from the "bat0" interface:: + + $ ip link set dev eth0 nomaster + + +All mesh wide settings can be found in batman's own interface folder:: + + $ ls /sys/class/net/bat0/mesh/ + aggregated_ogms fragmentation isolation_mark routing_algo + ap_isolation gw_bandwidth log_level vlan0 + bonding gw_mode multicast_mode + bridge_loop_avoidance gw_sel_class network_coding + distributed_arp_table hop_penalty orig_interval + +There is a special folder for debugging information:: + + $ ls /sys/kernel/debug/batman_adv/bat0/ + bla_backbone_table log neighbors transtable_local + bla_claim_table mcast_flags originators + dat_cache nc socket + gateways nc_nodes transtable_global + +Some of the files contain all sort of status information regarding the mesh +network. For example, you can view the table of originators (mesh +participants) with:: + + $ cat /sys/kernel/debug/batman_adv/bat0/originators + +Other files allow to change batman's behaviour to better fit your requirements. +For instance, you can check the current originator interval (value in +milliseconds which determines how often batman sends its broadcast packets):: + + $ cat /sys/class/net/bat0/mesh/orig_interval + 1000 + +and also change its value:: + + $ echo 3000 > /sys/class/net/bat0/mesh/orig_interval + +In very mobile scenarios, you might want to adjust the originator interval to a +lower value. This will make the mesh more responsive to topology changes, but +will also increase the overhead. + + +Usage +===== + +To make use of your newly created mesh, batman advanced provides a new +interface "bat0" which you should use from this point on. All interfaces added +to batman advanced are not relevant any longer because batman handles them for +you. Basically, one "hands over" the data by using the batman interface and +batman will make sure it reaches its destination. + +The "bat0" interface can be used like any other regular interface. It needs an +IP address which can be either statically configured or dynamically (by using +DHCP or similar services):: + + NodeA: ip link set up dev bat0 + NodeA: ip addr add 192.168.0.1/24 dev bat0 + + NodeB: ip link set up dev bat0 + NodeB: ip addr add 192.168.0.2/24 dev bat0 + NodeB: ping 192.168.0.1 + +Note: In order to avoid problems remove all IP addresses previously assigned to +interfaces now used by batman advanced, e.g.:: + + $ ip addr flush dev eth0 + + +Logging/Debugging +================= + +All error messages, warnings and information messages are sent to the kernel +log. Depending on your operating system distribution this can be read in one of +a number of ways. Try using the commands: ``dmesg``, ``logread``, or looking in +the files ``/var/log/kern.log`` or ``/var/log/syslog``. All batman-adv messages +are prefixed with "batman-adv:" So to see just these messages try:: + + $ dmesg | grep batman-adv + +When investigating problems with your mesh network, it is sometimes necessary to +see more detail debug messages. This must be enabled when compiling the +batman-adv module. When building batman-adv as part of kernel, use "make +menuconfig" and enable the option ``B.A.T.M.A.N. debugging`` +(``CONFIG_BATMAN_ADV_DEBUG=y``). + +Those additional debug messages can be accessed using a special file in +debugfs:: + + $ cat /sys/kernel/debug/batman_adv/bat0/log + +The additional debug output is by default disabled. It can be enabled during +run time. Following log_levels are defined: + +.. flat-table:: + + * - 0 + - All debug output disabled + * - 1 + - Enable messages related to routing / flooding / broadcasting + * - 2 + - Enable messages related to route added / changed / deleted + * - 4 + - Enable messages related to translation table operations + * - 8 + - Enable messages related to bridge loop avoidance + * - 16 + - Enable messages related to DAT, ARP snooping and parsing + * - 32 + - Enable messages related to network coding + * - 64 + - Enable messages related to multicast + * - 128 + - Enable messages related to throughput meter + * - 255 + - Enable all messages + +The debug output can be changed at runtime using the file +``/sys/class/net/bat0/mesh/log_level``. e.g.:: + + $ echo 6 > /sys/class/net/bat0/mesh/log_level + +will enable debug messages for when routes change. + +Counters for different types of packets entering and leaving the batman-adv +module are available through ethtool:: + + $ ethtool --statistics bat0 + + +batctl +====== + +As batman advanced operates on layer 2, all hosts participating in the virtual +switch are completely transparent for all protocols above layer 2. Therefore +the common diagnosis tools do not work as expected. To overcome these problems, +batctl was created. At the moment the batctl contains ping, traceroute, tcpdump +and interfaces to the kernel module settings. + +For more information, please see the manpage (``man batctl``). + +batctl is available on https://www.open-mesh.org/ + + +Contact +======= + +Please send us comments, experiences, questions, anything :) + +IRC: + #batman on irc.freenode.org +Mailing-list: + b.a.t.m.a.n@open-mesh.org (optional subscription at + https://lists.open-mesh.org/mm/listinfo/b.a.t.m.a.n) + +You can also contact the Authors: + +* Marek Lindner +* Simon Wunderlich diff --git a/Documentation/networking/batman-adv.txt b/Documentation/networking/batman-adv.txt deleted file mode 100644 index ccf94677b240..000000000000 --- a/Documentation/networking/batman-adv.txt +++ /dev/null @@ -1,215 +0,0 @@ -BATMAN-ADV ----------- - -Batman advanced is a new approach to wireless networking which -does no longer operate on the IP basis. Unlike the batman daemon, -which exchanges information using UDP packets and sets routing -tables, batman-advanced operates on ISO/OSI Layer 2 only and uses -and routes (or better: bridges) Ethernet Frames. It emulates a -virtual network switch of all nodes participating. Therefore all -nodes appear to be link local, thus all higher operating proto- -cols won't be affected by any changes within the network. You can -run almost any protocol above batman advanced, prominent examples -are: IPv4, IPv6, DHCP, IPX. - -Batman advanced was implemented as a Linux kernel driver to re- -duce the overhead to a minimum. It does not depend on any (other) -network driver, and can be used on wifi as well as ethernet lan, -vpn, etc ... (anything with ethernet-style layer 2). - - -CONFIGURATION -------------- - -Load the batman-adv module into your kernel: - -# insmod batman-adv.ko - -The module is now waiting for activation. You must add some in- -terfaces on which batman can operate. After loading the module -batman advanced will scan your systems interfaces to search for -compatible interfaces. Once found, it will create subfolders in -the /sys directories of each supported interface, e.g. - -# ls /sys/class/net/eth0/batman_adv/ -# elp_interval iface_status mesh_iface throughput_override - -If an interface does not have the "batman_adv" subfolder it prob- -ably is not supported. Not supported interfaces are: loopback, -non-ethernet and batman's own interfaces. - -Note: After the module was loaded it will continuously watch for -new interfaces to verify the compatibility. There is no need to -reload the module if you plug your USB wifi adapter into your ma- -chine after batman advanced was initially loaded. - -The batman-adv soft-interface can be created using the iproute2 -tool "ip" - -# ip link add name bat0 type batadv - -To activate a given interface simply attach it to the "bat0" -interface - -# ip link set dev eth0 master bat0 - -Repeat this step for all interfaces you wish to add. Now batman -starts using/broadcasting on this/these interface(s). - -By reading the "iface_status" file you can check its status: - -# cat /sys/class/net/eth0/batman_adv/iface_status -# active - -To deactivate an interface you have to detach it from the -"bat0" interface: - -# ip link set dev eth0 nomaster - - -All mesh wide settings can be found in batman's own interface -folder: - -# ls /sys/class/net/bat0/mesh/ -# aggregated_ogms fragmentation isolation_mark routing_algo -# ap_isolation gw_bandwidth log_level vlan0 -# bonding gw_mode multicast_mode -# bridge_loop_avoidance gw_sel_class network_coding -# distributed_arp_table hop_penalty orig_interval - -There is a special folder for debugging information: - -# ls /sys/kernel/debug/batman_adv/bat0/ -# bla_backbone_table log neighbors transtable_local -# bla_claim_table mcast_flags originators -# dat_cache nc socket -# gateways nc_nodes transtable_global - -Some of the files contain all sort of status information regard- -ing the mesh network. For example, you can view the table of -originators (mesh participants) with: - -# cat /sys/kernel/debug/batman_adv/bat0/originators - -Other files allow to change batman's behaviour to better fit your -requirements. For instance, you can check the current originator -interval (value in milliseconds which determines how often batman -sends its broadcast packets): - -# cat /sys/class/net/bat0/mesh/orig_interval -# 1000 - -and also change its value: - -# echo 3000 > /sys/class/net/bat0/mesh/orig_interval - -In very mobile scenarios, you might want to adjust the originator -interval to a lower value. This will make the mesh more respon- -sive to topology changes, but will also increase the overhead. - - -USAGE ------ - -To make use of your newly created mesh, batman advanced provides -a new interface "bat0" which you should use from this point on. -All interfaces added to batman advanced are not relevant any -longer because batman handles them for you. Basically, one "hands -over" the data by using the batman interface and batman will make -sure it reaches its destination. - -The "bat0" interface can be used like any other regular inter- -face. It needs an IP address which can be either statically con- -figured or dynamically (by using DHCP or similar services): - -# NodeA: ip link set up dev bat0 -# NodeA: ip addr add 192.168.0.1/24 dev bat0 - -# NodeB: ip link set up dev bat0 -# NodeB: ip addr add 192.168.0.2/24 dev bat0 -# NodeB: ping 192.168.0.1 - -Note: In order to avoid problems remove all IP addresses previ- -ously assigned to interfaces now used by batman advanced, e.g. - -# ip addr flush dev eth0 - - -LOGGING/DEBUGGING ------------------ - -All error messages, warnings and information messages are sent to -the kernel log. Depending on your operating system distribution -this can be read in one of a number of ways. Try using the com- -mands: dmesg, logread, or looking in the files /var/log/kern.log -or /var/log/syslog. All batman-adv messages are prefixed with -"batman-adv:" So to see just these messages try - -# dmesg | grep batman-adv - -When investigating problems with your mesh network it is some- -times necessary to see more detail debug messages. This must be -enabled when compiling the batman-adv module. When building bat- -man-adv as part of kernel, use "make menuconfig" and enable the -option "B.A.T.M.A.N. debugging". - -Those additional debug messages can be accessed using a special -file in debugfs - -# cat /sys/kernel/debug/batman_adv/bat0/log - -The additional debug output is by default disabled. It can be en- -abled during run time. Following log_levels are defined: - - 0 - All debug output disabled - 1 - Enable messages related to routing / flooding / broadcasting - 2 - Enable messages related to route added / changed / deleted - 4 - Enable messages related to translation table operations - 8 - Enable messages related to bridge loop avoidance - 16 - Enable messages related to DAT, ARP snooping and parsing - 32 - Enable messages related to network coding - 64 - Enable messages related to multicast -128 - Enable messages related to throughput meter -255 - Enable all messages - -The debug output can be changed at runtime using the file -/sys/class/net/bat0/mesh/log_level. e.g. - -# echo 6 > /sys/class/net/bat0/mesh/log_level - -will enable debug messages for when routes change. - -Counters for different types of packets entering and leaving the -batman-adv module are available through ethtool: - -# ethtool --statistics bat0 - - -BATCTL ------- - -As batman advanced operates on layer 2 all hosts participating in -the virtual switch are completely transparent for all protocols -above layer 2. Therefore the common diagnosis tools do not work -as expected. To overcome these problems batctl was created. At -the moment the batctl contains ping, traceroute, tcpdump and -interfaces to the kernel module settings. - -For more information, please see the manpage (man batctl). - -batctl is available on https://www.open-mesh.org/ - - -CONTACT -------- - -Please send us comments, experiences, questions, anything :) - -IRC: #batman on irc.freenode.org -Mailing-list: b.a.t.m.a.n@open-mesh.org (optional subscription - at https://lists.open-mesh.org/mm/listinfo/b.a.t.m.a.n) - -You can also contact the Authors: - -Marek Lindner -Simon Wunderlich diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index b5bd87e01f52..66e620866245 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -6,6 +6,7 @@ Contents: .. toctree:: :maxdepth: 2 + batman-adv kapi z8530book diff --git a/MAINTAINERS b/MAINTAINERS index 205d3977ac46..c28a1325724c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2481,7 +2481,7 @@ Q: https://patchwork.open-mesh.org/project/batman/list/ S: Maintained F: Documentation/ABI/testing/sysfs-class-net-batman-adv F: Documentation/ABI/testing/sysfs-class-net-mesh -F: Documentation/networking/batman-adv.txt +F: Documentation/networking/batman-adv.rst F: include/uapi/linux/batman_adv.h F: net/batman-adv/ -- cgit v1.2.3 From 4d3a57f23dec59f0a2362e63540b2d01b37afe0a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 28 Jul 2017 11:22:04 +0200 Subject: netfilter: conntrack: do not enable connection tracking unless needed Discussion during NFWS 2017 in Faro has shown that the current conntrack behaviour is unreasonable. Even if conntrack module is loaded on behalf of a single net namespace, its turned on for all namespaces, which is expensive. Commit 481fa373476 ("netfilter: conntrack: add nf_conntrack_default_on sysctl") attempted to provide an alternative to the 'default on' behaviour by adding a sysctl to change it. However, as Eric points out, the sysctl only becomes available once the module is loaded, and then its too late. So we either have to move the sysctl to the core, or, alternatively, change conntrack to become active only once the rule set requires this. This does the latter, conntrack is only enabled when a rule needs it. Reported-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- Documentation/networking/nf_conntrack-sysctl.txt | 11 --------- include/net/netfilter/nf_conntrack_l3proto.h | 15 ------------ net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 16 ++----------- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 17 ++------------ net/netfilter/nf_conntrack_proto.c | 29 ------------------------ net/netfilter/nf_conntrack_standalone.c | 10 -------- 6 files changed, 4 insertions(+), 94 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/nf_conntrack-sysctl.txt b/Documentation/networking/nf_conntrack-sysctl.txt index 497d668288f9..433b6724797a 100644 --- a/Documentation/networking/nf_conntrack-sysctl.txt +++ b/Documentation/networking/nf_conntrack-sysctl.txt @@ -96,17 +96,6 @@ nf_conntrack_max - INTEGER Size of connection tracking table. Default value is nf_conntrack_buckets value * 4. -nf_conntrack_default_on - BOOLEAN - 0 - don't register conntrack in new net namespaces - 1 - register conntrack in new net namespaces (default) - - This controls wheter newly created network namespaces have connection - tracking enabled by default. It will be enabled automatically - regardless of this setting if the new net namespace requires - connection tracking, e.g. when NAT rules are created. - This setting is only visible in initial user namespace, it has no - effect on existing namespaces. - nf_conntrack_tcp_be_liberal - BOOLEAN 0 - disabled (default) not 0 - enabled diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index 6d14b36e3a49..1b8de164d744 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -73,21 +73,6 @@ struct nf_conntrack_l3proto { extern struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[NFPROTO_NUMPROTO]; -#ifdef CONFIG_SYSCTL -/* Protocol pernet registration. */ -int nf_ct_l3proto_pernet_register(struct net *net, - struct nf_conntrack_l3proto *proto); -#else -static inline int nf_ct_l3proto_pernet_register(struct net *n, - struct nf_conntrack_l3proto *p) -{ - return 0; -} -#endif - -void nf_ct_l3proto_pernet_unregister(struct net *net, - struct nf_conntrack_l3proto *proto); - /* Protocol global registration. */ int nf_ct_l3proto_register(struct nf_conntrack_l3proto *proto); void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 63e4ea0e01f8..de5f0e6ddd1b 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -398,24 +398,12 @@ static struct nf_conntrack_l4proto *builtin_l4proto4[] = { static int ipv4_net_init(struct net *net) { - int ret = 0; - - ret = nf_ct_l4proto_pernet_register(net, builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); - if (ret < 0) - return ret; - ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv4); - if (ret < 0) { - pr_err("nf_conntrack_ipv4: pernet registration failed\n"); - nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); - } - return ret; + return nf_ct_l4proto_pernet_register(net, builtin_l4proto4, + ARRAY_SIZE(builtin_l4proto4)); } static void ipv4_net_exit(struct net *net) { - nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv4); nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4, ARRAY_SIZE(builtin_l4proto4)); } diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index f2d2f4a9294b..ddef5ee9e0a8 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -398,25 +398,12 @@ static struct nf_conntrack_l4proto *builtin_l4proto6[] = { static int ipv6_net_init(struct net *net) { - int ret = 0; - - ret = nf_ct_l4proto_pernet_register(net, builtin_l4proto6, - ARRAY_SIZE(builtin_l4proto6)); - if (ret < 0) - return ret; - - ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv6); - if (ret < 0) { - pr_err("nf_conntrack_ipv6: pernet registration failed.\n"); - nf_ct_l4proto_pernet_unregister(net, builtin_l4proto6, - ARRAY_SIZE(builtin_l4proto6)); - } - return ret; + return nf_ct_l4proto_pernet_register(net, builtin_l4proto6, + ARRAY_SIZE(builtin_l4proto6)); } static void ipv6_net_exit(struct net *net) { - nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv6); nf_ct_l4proto_pernet_unregister(net, builtin_l4proto6, ARRAY_SIZE(builtin_l4proto6)); } diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 1dcad229c3cc..7c89dade6fd3 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -238,20 +238,6 @@ out_unlock: } EXPORT_SYMBOL_GPL(nf_ct_l3proto_register); -#ifdef CONFIG_SYSCTL -extern unsigned int nf_conntrack_default_on; - -int nf_ct_l3proto_pernet_register(struct net *net, - struct nf_conntrack_l3proto *proto) -{ - if (nf_conntrack_default_on == 0) - return 0; - - return proto->net_ns_get ? proto->net_ns_get(net) : 0; -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_register); -#endif - void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto) { BUG_ON(proto->l3proto >= NFPROTO_NUMPROTO); @@ -270,21 +256,6 @@ void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto) } EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister); -void nf_ct_l3proto_pernet_unregister(struct net *net, - struct nf_conntrack_l3proto *proto) -{ - /* - * nf_conntrack_default_on *might* have registered hooks. - * ->net_ns_put must cope with more puts() than get(), i.e. - * if nf_conntrack_default_on was 0 at time of - * nf_ct_l3proto_pernet_register invocation this net_ns_put() - * should be a noop. - */ - if (proto->net_ns_put) - proto->net_ns_put(net); -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_unregister); - static struct nf_proto_net *nf_ct_l4proto_net(struct net *net, struct nf_conntrack_l4proto *l4proto) { diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index ccb5cb9043e0..5b6c675d55b1 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -452,9 +452,6 @@ static int log_invalid_proto_max __read_mostly = 255; /* size the user *wants to set */ static unsigned int nf_conntrack_htable_size_user __read_mostly; -extern unsigned int nf_conntrack_default_on; -unsigned int nf_conntrack_default_on __read_mostly = 1; - static int nf_conntrack_hash_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -520,13 +517,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "nf_conntrack_default_on", - .data = &nf_conntrack_default_on, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { } }; -- cgit v1.2.3 From b6690b14386698ce2c19309abad3f17656bdfaea Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 30 Jul 2017 03:57:20 +0200 Subject: tcp: remove low_latency sysctl Was only checked by the removed prequeue code. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 7 +------ include/net/tcp.h | 1 - net/ipv4/sysctl_net_ipv4.c | 3 +++ net/ipv4/tcp_ipv4.c | 2 -- 4 files changed, 4 insertions(+), 9 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index f485d553e65c..84c9b8cee780 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -353,12 +353,7 @@ tcp_l3mdev_accept - BOOLEAN compiled with CONFIG_NET_L3_MASTER_DEV. tcp_low_latency - BOOLEAN - If set, the TCP stack makes decisions that prefer lower - latency as opposed to higher throughput. By default, this - option is not set meaning that higher throughput is preferred. - An example of an application where this default should be - changed would be a Beowulf compute cluster. - Default: 0 + This is a legacy option, it has no effect anymore. tcp_max_orphans - INTEGER Maximal number of TCP sockets not attached to any user file handle, diff --git a/include/net/tcp.h b/include/net/tcp.h index 93f115cfc8f8..8507c81fb0e9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -256,7 +256,6 @@ extern int sysctl_tcp_rmem[3]; extern int sysctl_tcp_app_win; extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_frto; -extern int sysctl_tcp_low_latency; extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 9bf809726066..0d3c038d7b04 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -45,6 +45,9 @@ static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; +/* obsolete */ +static int sysctl_tcp_low_latency __read_mostly; + /* Update system visible IP port range */ static void set_local_port_range(struct net *net, int range[2]) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a68eb4577d36..9b51663cd5a4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -85,8 +85,6 @@ #include #include -int sysctl_tcp_low_latency __read_mostly; - #ifdef CONFIG_TCP_MD5SIG static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, __be32 daddr, __be32 saddr, const struct tcphdr *th); -- cgit v1.2.3 From bbb03029a899679d73e62d7e6ae80348cc5d0054 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 28 Jul 2017 16:22:43 -0700 Subject: strparser: Generalize strparser Generalize strparser from more than just being used in conjunction with read_sock. strparser will also be used in the send path with zero proxy. The primary change is to create strp_process function that performs the critical processing on skbs. The documentation is also updated to reflect the new uses. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- Documentation/networking/strparser.txt | 207 +++++++++++++++------- include/net/strparser.h | 119 +++++++------ net/kcm/kcmproc.c | 34 ++-- net/kcm/kcmsock.c | 38 ++-- net/strparser/strparser.c | 313 ++++++++++++++++++++------------- 5 files changed, 424 insertions(+), 287 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/strparser.txt b/Documentation/networking/strparser.txt index a0bf573dfa61..fe01302471ae 100644 --- a/Documentation/networking/strparser.txt +++ b/Documentation/networking/strparser.txt @@ -1,45 +1,107 @@ -Stream Parser -------------- +Stream Parser (strparser) + +Introduction +============ The stream parser (strparser) is a utility that parses messages of an -application layer protocol running over a TCP connection. The stream +application layer protocol running over a data stream. The stream parser works in conjunction with an upper layer in the kernel to provide kernel support for application layer messages. For instance, Kernel Connection Multiplexor (KCM) uses the Stream Parser to parse messages using a BPF program. +The strparser works in one of two modes: receive callback or general +mode. + +In receive callback mode, the strparser is called from the data_ready +callback of a TCP socket. Messages are parsed and delivered as they are +received on the socket. + +In general mode, a sequence of skbs are fed to strparser from an +outside source. Message are parsed and delivered as the sequence is +processed. This modes allows strparser to be applied to arbitrary +streams of data. + Interface ---------- +========= The API includes a context structure, a set of callbacks, utility -functions, and a data_ready function. The callbacks include -a parse_msg function that is called to perform parsing (e.g. -BPF parsing in case of KCM), and a rcv_msg function that is called -when a full message has been completed. +functions, and a data_ready function for receive callback mode. The +callbacks include a parse_msg function that is called to perform +parsing (e.g. BPF parsing in case of KCM), and a rcv_msg function +that is called when a full message has been completed. -A stream parser can be instantiated for a TCP connection. This is done -by: +Functions +========= -strp_init(struct strparser *strp, struct sock *csk, +strp_init(struct strparser *strp, struct sock *sk, struct strp_callbacks *cb) -strp is a struct of type strparser that is allocated by the upper layer. -csk is the TCP socket associated with the stream parser. Callbacks are -called by the stream parser. + Called to initialize a stream parser. strp is a struct of type + strparser that is allocated by the upper layer. sk is the TCP + socket associated with the stream parser for use with receive + callback mode; in general mode this is set to NULL. Callbacks + are called by the stream parser (the callbacks are listed below). + +void strp_pause(struct strparser *strp) + + Temporarily pause a stream parser. Message parsing is suspended + and no new messages are delivered to the upper layer. + +void strp_pause(struct strparser *strp) + + Unpause a paused stream parser. + +void strp_stop(struct strparser *strp); + + strp_stop is called to completely stop stream parser operations. + This is called internally when the stream parser encounters an + error, and it is called from the upper layer to stop parsing + operations. + +void strp_done(struct strparser *strp); + + strp_done is called to release any resources held by the stream + parser instance. This must be called after the stream processor + has been stopped. + +int strp_process(struct strparser *strp, struct sk_buff *orig_skb, + unsigned int orig_offset, size_t orig_len, + size_t max_msg_size, long timeo) + + strp_process is called in general mode for a stream parser to + parse an sk_buff. The number of bytes processed or a negative + error number is returned. Note that strp_process does not + consume the sk_buff. max_msg_size is maximum size the stream + parser will parse. timeo is timeout for completing a message. + +void strp_data_ready(struct strparser *strp); + + The upper layer calls strp_tcp_data_ready when data is ready on + the lower socket for strparser to process. This should be called + from a data_ready callback that is set on the socket. Note that + maximum messages size is the limit of the receive socket + buffer and message timeout is the receive timeout for the socket. + +void strp_check_rcv(struct strparser *strp); + + strp_check_rcv is called to check for new messages on the socket. + This is normally called at initialization of a stream parser + instance or after strp_unpause. Callbacks ---------- +========= -There are four callbacks: +There are six callbacks: int (*parse_msg)(struct strparser *strp, struct sk_buff *skb); parse_msg is called to determine the length of the next message in the stream. The upper layer must implement this function. It should parse the sk_buff as containing the headers for the - next application layer messages in the stream. + next application layer message in the stream. - The skb->cb in the input skb is a struct strp_rx_msg. Only + The skb->cb in the input skb is a struct strp_msg. Only the offset field is relevant in parse_msg and gives the offset where the message starts in the skb. @@ -50,26 +112,41 @@ int (*parse_msg)(struct strparser *strp, struct sk_buff *skb); -ESTRPIPE : current message should not be processed by the kernel, return control of the socket to userspace which can proceed to read the messages itself - other < 0 : Error is parsing, give control back to userspace + other < 0 : Error in parsing, give control back to userspace assuming that synchronization is lost and the stream is unrecoverable (application expected to close TCP socket) In the case that an error is returned (return value is less than - zero) the stream parser will set the error on TCP socket and wake - it up. If parse_msg returned -ESTRPIPE and the stream parser had - previously read some bytes for the current message, then the error - set on the attached socket is ENODATA since the stream is - unrecoverable in that case. + zero) and the parser is in receive callback mode, then it will set + the error on TCP socket and wake it up. If parse_msg returned + -ESTRPIPE and the stream parser had previously read some bytes for + the current message, then the error set on the attached socket is + ENODATA since the stream is unrecoverable in that case. + +void (*lock)(struct strparser *strp) + + The lock callback is called to lock the strp structure when + the strparser is performing an asynchronous operation (such as + processing a timeout). In receive callback mode the default + function is to lock_sock for the associated socket. In general + mode the callback must be set appropriately. + +void (*unlock)(struct strparser *strp) + + The unlock callback is called to release the lock obtained + by the lock callback. In receive callback mode the default + function is release_sock for the associated socket. In general + mode the callback must be set appropriately. void (*rcv_msg)(struct strparser *strp, struct sk_buff *skb); rcv_msg is called when a full message has been received and is queued. The callee must consume the sk_buff; it can call strp_pause to prevent any further messages from being - received in rcv_msg (see strp_pause below). This callback + received in rcv_msg (see strp_pause above). This callback must be set. - The skb->cb in the input skb is a struct strp_rx_msg. This + The skb->cb in the input skb is a struct strp_msg. This struct contains two fields: offset and full_len. Offset is where the message starts in the skb, and full_len is the the length of the message. skb->len - offset may be greater @@ -78,59 +155,53 @@ void (*rcv_msg)(struct strparser *strp, struct sk_buff *skb); int (*read_sock_done)(struct strparser *strp, int err); read_sock_done is called when the stream parser is done reading - the TCP socket. The stream parser may read multiple messages - in a loop and this function allows cleanup to occur when existing - the loop. If the callback is not set (NULL in strp_init) a - default function is used. + the TCP socket in receive callback mode. The stream parser may + read multiple messages in a loop and this function allows cleanup + to occur when exiting the loop. If the callback is not set (NULL + in strp_init) a default function is used. void (*abort_parser)(struct strparser *strp, int err); This function is called when stream parser encounters an error - in parsing. The default function stops the stream parser for the - TCP socket and sets the error in the socket. The default function - can be changed by setting the callback to non-NULL in strp_init. + in parsing. The default function stops the stream parser and + sets the error in the socket if the parser is in receive callback + mode. The default function can be changed by setting the callback + to non-NULL in strp_init. -Functions ---------- +Statistics +========== -The upper layer calls strp_tcp_data_ready when data is ready on the lower -socket for strparser to process. This should be called from a data_ready -callback that is set on the socket. +Various counters are kept for each stream parser instance. These are in +the strp_stats structure. strp_aggr_stats is a convenience structure for +accumulating statistics for multiple stream parser instances. +save_strp_stats and aggregate_strp_stats are helper functions to save +and aggregate statistics. -strp_stop is called to completely stop stream parser operations. This -is called internally when the stream parser encounters an error, and -it is called from the upper layer when unattaching a TCP socket. +Message assembly limits +======================= -strp_done is called to unattach the stream parser from the TCP socket. -This must be called after the stream processor has be stopped. +The stream parser provide mechanisms to limit the resources consumed by +message assembly. -strp_check_rcv is called to check for new messages on the socket. This -is normally called at initialization of the a stream parser instance -of after strp_unpause. +A timer is set when assembly starts for a new message. In receive +callback mode the message timeout is taken from rcvtime for the +associated TCP socket. In general mode, the timeout is passed as an +argument in strp_process. If the timer fires before assembly completes +the stream parser is aborted and the ETIMEDOUT error is set on the TCP +socket if in receive callback mode. -Statistics ----------- +In receive callback mode, message length is limited to the receive +buffer size of the associated TCP socket. If the length returned by +parse_msg is greater than the socket buffer size then the stream parser +is aborted with EMSGSIZE error set on the TCP socket. Note that this +makes the maximum size of receive skbuffs for a socket with a stream +parser to be 2*sk_rcvbuf of the TCP socket. -Various counters are kept for each stream parser for a TCP socket. -These are in the strp_stats structure. strp_aggr_stats is a convenience -structure for accumulating statistics for multiple stream parser -instances. save_strp_stats and aggregate_strp_stats are helper functions -to save and aggregate statistics. +In general mode the message length limit is passed in as an argument +to strp_process. -Message assembly limits ------------------------ +Author +====== -The stream parser provide mechanisms to limit the resources consumed by -message assembly. +Tom Herbert (tom@quantonium.net) -A timer is set when assembly starts for a new message. The message -timeout is taken from rcvtime for the associated TCP socket. If the -timer fires before assembly completes the stream parser is aborted -and the ETIMEDOUT error is set on the TCP socket. - -Message length is limited to the receive buffer size of the associated -TCP socket. If the length returned by parse_msg is greater than -the socket buffer size then the stream parser is aborted with -EMSGSIZE error set on the TCP socket. Note that this makes the -maximum size of receive skbuffs for a socket with a stream parser -to be 2*sk_rcvbuf of the TCP socket. diff --git a/include/net/strparser.h b/include/net/strparser.h index 0c28ad97c52f..4fe966a0ad92 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -18,26 +18,26 @@ #define STRP_STATS_INCR(stat) ((stat)++) struct strp_stats { - unsigned long long rx_msgs; - unsigned long long rx_bytes; - unsigned int rx_mem_fail; - unsigned int rx_need_more_hdr; - unsigned int rx_msg_too_big; - unsigned int rx_msg_timeouts; - unsigned int rx_bad_hdr_len; + unsigned long long msgs; + unsigned long long bytes; + unsigned int mem_fail; + unsigned int need_more_hdr; + unsigned int msg_too_big; + unsigned int msg_timeouts; + unsigned int bad_hdr_len; }; struct strp_aggr_stats { - unsigned long long rx_msgs; - unsigned long long rx_bytes; - unsigned int rx_mem_fail; - unsigned int rx_need_more_hdr; - unsigned int rx_msg_too_big; - unsigned int rx_msg_timeouts; - unsigned int rx_bad_hdr_len; - unsigned int rx_aborts; - unsigned int rx_interrupted; - unsigned int rx_unrecov_intr; + unsigned long long msgs; + unsigned long long bytes; + unsigned int mem_fail; + unsigned int need_more_hdr; + unsigned int msg_too_big; + unsigned int msg_timeouts; + unsigned int bad_hdr_len; + unsigned int aborts; + unsigned int interrupted; + unsigned int unrecov_intr; }; struct strparser; @@ -48,16 +48,18 @@ struct strp_callbacks { void (*rcv_msg)(struct strparser *strp, struct sk_buff *skb); int (*read_sock_done)(struct strparser *strp, int err); void (*abort_parser)(struct strparser *strp, int err); + void (*lock)(struct strparser *strp); + void (*unlock)(struct strparser *strp); }; -struct strp_rx_msg { +struct strp_msg { int full_len; int offset; }; -static inline struct strp_rx_msg *strp_rx_msg(struct sk_buff *skb) +static inline struct strp_msg *strp_msg(struct sk_buff *skb) { - return (struct strp_rx_msg *)((void *)skb->cb + + return (struct strp_msg *)((void *)skb->cb + offsetof(struct qdisc_skb_cb, data)); } @@ -65,18 +67,18 @@ static inline struct strp_rx_msg *strp_rx_msg(struct sk_buff *skb) struct strparser { struct sock *sk; - u32 rx_stopped : 1; - u32 rx_paused : 1; - u32 rx_aborted : 1; - u32 rx_interrupted : 1; - u32 rx_unrecov_intr : 1; - - struct sk_buff **rx_skb_nextp; - struct timer_list rx_msg_timer; - struct sk_buff *rx_skb_head; - unsigned int rx_need_bytes; - struct delayed_work rx_delayed_work; - struct work_struct rx_work; + u32 stopped : 1; + u32 paused : 1; + u32 aborted : 1; + u32 interrupted : 1; + u32 unrecov_intr : 1; + + struct sk_buff **skb_nextp; + struct timer_list msg_timer; + struct sk_buff *skb_head; + unsigned int need_bytes; + struct delayed_work delayed_work; + struct work_struct work; struct strp_stats stats; struct strp_callbacks cb; }; @@ -84,7 +86,7 @@ struct strparser { /* Must be called with lock held for attached socket */ static inline void strp_pause(struct strparser *strp) { - strp->rx_paused = 1; + strp->paused = 1; } /* May be called without holding lock for attached socket */ @@ -97,37 +99,37 @@ static inline void save_strp_stats(struct strparser *strp, #define SAVE_PSOCK_STATS(_stat) (agg_stats->_stat += \ strp->stats._stat) - SAVE_PSOCK_STATS(rx_msgs); - SAVE_PSOCK_STATS(rx_bytes); - SAVE_PSOCK_STATS(rx_mem_fail); - SAVE_PSOCK_STATS(rx_need_more_hdr); - SAVE_PSOCK_STATS(rx_msg_too_big); - SAVE_PSOCK_STATS(rx_msg_timeouts); - SAVE_PSOCK_STATS(rx_bad_hdr_len); + SAVE_PSOCK_STATS(msgs); + SAVE_PSOCK_STATS(bytes); + SAVE_PSOCK_STATS(mem_fail); + SAVE_PSOCK_STATS(need_more_hdr); + SAVE_PSOCK_STATS(msg_too_big); + SAVE_PSOCK_STATS(msg_timeouts); + SAVE_PSOCK_STATS(bad_hdr_len); #undef SAVE_PSOCK_STATS - if (strp->rx_aborted) - agg_stats->rx_aborts++; - if (strp->rx_interrupted) - agg_stats->rx_interrupted++; - if (strp->rx_unrecov_intr) - agg_stats->rx_unrecov_intr++; + if (strp->aborted) + agg_stats->aborts++; + if (strp->interrupted) + agg_stats->interrupted++; + if (strp->unrecov_intr) + agg_stats->unrecov_intr++; } static inline void aggregate_strp_stats(struct strp_aggr_stats *stats, struct strp_aggr_stats *agg_stats) { #define SAVE_PSOCK_STATS(_stat) (agg_stats->_stat += stats->_stat) - SAVE_PSOCK_STATS(rx_msgs); - SAVE_PSOCK_STATS(rx_bytes); - SAVE_PSOCK_STATS(rx_mem_fail); - SAVE_PSOCK_STATS(rx_need_more_hdr); - SAVE_PSOCK_STATS(rx_msg_too_big); - SAVE_PSOCK_STATS(rx_msg_timeouts); - SAVE_PSOCK_STATS(rx_bad_hdr_len); - SAVE_PSOCK_STATS(rx_aborts); - SAVE_PSOCK_STATS(rx_interrupted); - SAVE_PSOCK_STATS(rx_unrecov_intr); + SAVE_PSOCK_STATS(msgs); + SAVE_PSOCK_STATS(bytes); + SAVE_PSOCK_STATS(mem_fail); + SAVE_PSOCK_STATS(need_more_hdr); + SAVE_PSOCK_STATS(msg_too_big); + SAVE_PSOCK_STATS(msg_timeouts); + SAVE_PSOCK_STATS(bad_hdr_len); + SAVE_PSOCK_STATS(aborts); + SAVE_PSOCK_STATS(interrupted); + SAVE_PSOCK_STATS(unrecov_intr); #undef SAVE_PSOCK_STATS } @@ -135,8 +137,11 @@ static inline void aggregate_strp_stats(struct strp_aggr_stats *stats, void strp_done(struct strparser *strp); void strp_stop(struct strparser *strp); void strp_check_rcv(struct strparser *strp); -int strp_init(struct strparser *strp, struct sock *csk, +int strp_init(struct strparser *strp, struct sock *sk, struct strp_callbacks *cb); void strp_data_ready(struct strparser *strp); +int strp_process(struct strparser *strp, struct sk_buff *orig_skb, + unsigned int orig_offset, size_t orig_len, + size_t max_msg_size, long timeo); #endif /* __NET_STRPARSER_H_ */ diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c index c343ac60bf50..c748e8a6a72c 100644 --- a/net/kcm/kcmproc.c +++ b/net/kcm/kcmproc.c @@ -155,8 +155,8 @@ static void kcm_format_psock(struct kcm_psock *psock, struct seq_file *seq, seq_printf(seq, " psock-%-5u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8d ", psock->index, - psock->strp.stats.rx_msgs, - psock->strp.stats.rx_bytes, + psock->strp.stats.msgs, + psock->strp.stats.bytes, psock->stats.tx_msgs, psock->stats.tx_bytes, psock->sk->sk_receive_queue.qlen, @@ -170,22 +170,22 @@ static void kcm_format_psock(struct kcm_psock *psock, struct seq_file *seq, if (psock->tx_stopped) seq_puts(seq, "TxStop "); - if (psock->strp.rx_stopped) + if (psock->strp.stopped) seq_puts(seq, "RxStop "); if (psock->tx_kcm) seq_printf(seq, "Rsvd-%d ", psock->tx_kcm->index); - if (!psock->strp.rx_paused && !psock->ready_rx_msg) { + if (!psock->strp.paused && !psock->ready_rx_msg) { if (psock->sk->sk_receive_queue.qlen) { - if (psock->strp.rx_need_bytes) + if (psock->strp.need_bytes) seq_printf(seq, "RxWait=%u ", - psock->strp.rx_need_bytes); + psock->strp.need_bytes); else seq_printf(seq, "RxWait "); } } else { - if (psock->strp.rx_paused) + if (psock->strp.paused) seq_puts(seq, "RxPause "); if (psock->ready_rx_msg) @@ -371,20 +371,20 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u %-10u %-10u %-10u %-10u\n", "", - strp_stats.rx_msgs, - strp_stats.rx_bytes, + strp_stats.msgs, + strp_stats.bytes, psock_stats.tx_msgs, psock_stats.tx_bytes, psock_stats.reserved, psock_stats.unreserved, - strp_stats.rx_aborts, - strp_stats.rx_interrupted, - strp_stats.rx_unrecov_intr, - strp_stats.rx_mem_fail, - strp_stats.rx_need_more_hdr, - strp_stats.rx_bad_hdr_len, - strp_stats.rx_msg_too_big, - strp_stats.rx_msg_timeouts, + strp_stats.aborts, + strp_stats.interrupted, + strp_stats.unrecov_intr, + strp_stats.mem_fail, + strp_stats.need_more_hdr, + strp_stats.bad_hdr_len, + strp_stats.msg_too_big, + strp_stats.msg_timeouts, psock_stats.tx_aborts); return 0; diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index da49191f7ad0..88ce73288247 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -96,12 +96,12 @@ static void kcm_update_rx_mux_stats(struct kcm_mux *mux, struct kcm_psock *psock) { STRP_STATS_ADD(mux->stats.rx_bytes, - psock->strp.stats.rx_bytes - + psock->strp.stats.bytes - psock->saved_rx_bytes); mux->stats.rx_msgs += - psock->strp.stats.rx_msgs - psock->saved_rx_msgs; - psock->saved_rx_msgs = psock->strp.stats.rx_msgs; - psock->saved_rx_bytes = psock->strp.stats.rx_bytes; + psock->strp.stats.msgs - psock->saved_rx_msgs; + psock->saved_rx_msgs = psock->strp.stats.msgs; + psock->saved_rx_bytes = psock->strp.stats.bytes; } static void kcm_update_tx_mux_stats(struct kcm_mux *mux, @@ -1118,7 +1118,7 @@ static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, struct kcm_sock *kcm = kcm_sk(sk); int err = 0; long timeo; - struct strp_rx_msg *rxm; + struct strp_msg *stm; int copied = 0; struct sk_buff *skb; @@ -1132,26 +1132,26 @@ static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, /* Okay, have a message on the receive queue */ - rxm = strp_rx_msg(skb); + stm = strp_msg(skb); - if (len > rxm->full_len) - len = rxm->full_len; + if (len > stm->full_len) + len = stm->full_len; - err = skb_copy_datagram_msg(skb, rxm->offset, msg, len); + err = skb_copy_datagram_msg(skb, stm->offset, msg, len); if (err < 0) goto out; copied = len; if (likely(!(flags & MSG_PEEK))) { KCM_STATS_ADD(kcm->stats.rx_bytes, copied); - if (copied < rxm->full_len) { + if (copied < stm->full_len) { if (sock->type == SOCK_DGRAM) { /* Truncated message */ msg->msg_flags |= MSG_TRUNC; goto msg_finished; } - rxm->offset += copied; - rxm->full_len -= copied; + stm->offset += copied; + stm->full_len -= copied; } else { msg_finished: /* Finished with message */ @@ -1175,7 +1175,7 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); long timeo; - struct strp_rx_msg *rxm; + struct strp_msg *stm; int err = 0; ssize_t copied; struct sk_buff *skb; @@ -1192,12 +1192,12 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, /* Okay, have a message on the receive queue */ - rxm = strp_rx_msg(skb); + stm = strp_msg(skb); - if (len > rxm->full_len) - len = rxm->full_len; + if (len > stm->full_len) + len = stm->full_len; - copied = skb_splice_bits(skb, sk, rxm->offset, pipe, len, flags); + copied = skb_splice_bits(skb, sk, stm->offset, pipe, len, flags); if (copied < 0) { err = copied; goto err_out; @@ -1205,8 +1205,8 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, KCM_STATS_ADD(kcm->stats.rx_bytes, copied); - rxm->offset += copied; - rxm->full_len -= copied; + stm->offset += copied; + stm->full_len -= copied; /* We have no way to return MSG_EOR. If all the bytes have been * read we still leave the message in the receive socket buffer. diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index b5c279b22680..0d18fbc6f870 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -29,44 +29,46 @@ static struct workqueue_struct *strp_wq; -struct _strp_rx_msg { - /* Internal cb structure. struct strp_rx_msg must be first for passing +struct _strp_msg { + /* Internal cb structure. struct strp_msg must be first for passing * to upper layer. */ - struct strp_rx_msg strp; + struct strp_msg strp; int accum_len; int early_eaten; }; -static inline struct _strp_rx_msg *_strp_rx_msg(struct sk_buff *skb) +static inline struct _strp_msg *_strp_msg(struct sk_buff *skb) { - return (struct _strp_rx_msg *)((void *)skb->cb + + return (struct _strp_msg *)((void *)skb->cb + offsetof(struct qdisc_skb_cb, data)); } /* Lower lock held */ -static void strp_abort_rx_strp(struct strparser *strp, int err) +static void strp_abort_strp(struct strparser *strp, int err) { - struct sock *csk = strp->sk; - /* Unrecoverable error in receive */ - del_timer(&strp->rx_msg_timer); + del_timer(&strp->msg_timer); - if (strp->rx_stopped) + if (strp->stopped) return; - strp->rx_stopped = 1; + strp->stopped = 1; + + if (strp->sk) { + struct sock *sk = strp->sk; - /* Report an error on the lower socket */ - csk->sk_err = err; - csk->sk_error_report(csk); + /* Report an error on the lower socket */ + sk->sk_err = err; + sk->sk_error_report(sk); + } } -static void strp_start_rx_timer(struct strparser *strp) +static void strp_start_timer(struct strparser *strp, long timeo) { - if (strp->sk->sk_rcvtimeo) - mod_timer(&strp->rx_msg_timer, strp->sk->sk_rcvtimeo); + if (timeo) + mod_timer(&strp->msg_timer, timeo); } /* Lower lock held */ @@ -74,46 +76,55 @@ static void strp_parser_err(struct strparser *strp, int err, read_descriptor_t *desc) { desc->error = err; - kfree_skb(strp->rx_skb_head); - strp->rx_skb_head = NULL; + kfree_skb(strp->skb_head); + strp->skb_head = NULL; strp->cb.abort_parser(strp, err); } static inline int strp_peek_len(struct strparser *strp) { - struct socket *sock = strp->sk->sk_socket; + if (strp->sk) { + struct socket *sock = strp->sk->sk_socket; + + return sock->ops->peek_len(sock); + } + + /* If we don't have an associated socket there's nothing to peek. + * Return int max to avoid stopping the strparser. + */ - return sock->ops->peek_len(sock); + return INT_MAX; } /* Lower socket lock held */ -static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, - unsigned int orig_offset, size_t orig_len) +static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, + unsigned int orig_offset, size_t orig_len, + size_t max_msg_size, long timeo) { struct strparser *strp = (struct strparser *)desc->arg.data; - struct _strp_rx_msg *rxm; + struct _strp_msg *stm; struct sk_buff *head, *skb; size_t eaten = 0, cand_len; ssize_t extra; int err; bool cloned_orig = false; - if (strp->rx_paused) + if (strp->paused) return 0; - head = strp->rx_skb_head; + head = strp->skb_head; if (head) { /* Message already in progress */ - rxm = _strp_rx_msg(head); - if (unlikely(rxm->early_eaten)) { + stm = _strp_msg(head); + if (unlikely(stm->early_eaten)) { /* Already some number of bytes on the receive sock - * data saved in rx_skb_head, just indicate they + * data saved in skb_head, just indicate they * are consumed. */ - eaten = orig_len <= rxm->early_eaten ? - orig_len : rxm->early_eaten; - rxm->early_eaten -= eaten; + eaten = orig_len <= stm->early_eaten ? + orig_len : stm->early_eaten; + stm->early_eaten -= eaten; return eaten; } @@ -126,12 +137,12 @@ static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, */ orig_skb = skb_clone(orig_skb, GFP_ATOMIC); if (!orig_skb) { - STRP_STATS_INCR(strp->stats.rx_mem_fail); + STRP_STATS_INCR(strp->stats.mem_fail); desc->error = -ENOMEM; return 0; } if (!pskb_pull(orig_skb, orig_offset)) { - STRP_STATS_INCR(strp->stats.rx_mem_fail); + STRP_STATS_INCR(strp->stats.mem_fail); kfree_skb(orig_skb); desc->error = -ENOMEM; return 0; @@ -140,13 +151,13 @@ static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, orig_offset = 0; } - if (!strp->rx_skb_nextp) { + if (!strp->skb_nextp) { /* We are going to append to the frags_list of head. * Need to unshare the frag_list. */ err = skb_unclone(head, GFP_ATOMIC); if (err) { - STRP_STATS_INCR(strp->stats.rx_mem_fail); + STRP_STATS_INCR(strp->stats.mem_fail); desc->error = err; return 0; } @@ -165,20 +176,20 @@ static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, skb = alloc_skb(0, GFP_ATOMIC); if (!skb) { - STRP_STATS_INCR(strp->stats.rx_mem_fail); + STRP_STATS_INCR(strp->stats.mem_fail); desc->error = -ENOMEM; return 0; } skb->len = head->len; skb->data_len = head->len; skb->truesize = head->truesize; - *_strp_rx_msg(skb) = *_strp_rx_msg(head); - strp->rx_skb_nextp = &head->next; + *_strp_msg(skb) = *_strp_msg(head); + strp->skb_nextp = &head->next; skb_shinfo(skb)->frag_list = head; - strp->rx_skb_head = skb; + strp->skb_head = skb; head = skb; } else { - strp->rx_skb_nextp = + strp->skb_nextp = &skb_shinfo(head)->frag_list; } } @@ -188,112 +199,112 @@ static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, /* Always clone since we will consume something */ skb = skb_clone(orig_skb, GFP_ATOMIC); if (!skb) { - STRP_STATS_INCR(strp->stats.rx_mem_fail); + STRP_STATS_INCR(strp->stats.mem_fail); desc->error = -ENOMEM; break; } cand_len = orig_len - eaten; - head = strp->rx_skb_head; + head = strp->skb_head; if (!head) { head = skb; - strp->rx_skb_head = head; - /* Will set rx_skb_nextp on next packet if needed */ - strp->rx_skb_nextp = NULL; - rxm = _strp_rx_msg(head); - memset(rxm, 0, sizeof(*rxm)); - rxm->strp.offset = orig_offset + eaten; + strp->skb_head = head; + /* Will set skb_nextp on next packet if needed */ + strp->skb_nextp = NULL; + stm = _strp_msg(head); + memset(stm, 0, sizeof(*stm)); + stm->strp.offset = orig_offset + eaten; } else { /* Unclone since we may be appending to an skb that we * already share a frag_list with. */ err = skb_unclone(skb, GFP_ATOMIC); if (err) { - STRP_STATS_INCR(strp->stats.rx_mem_fail); + STRP_STATS_INCR(strp->stats.mem_fail); desc->error = err; break; } - rxm = _strp_rx_msg(head); - *strp->rx_skb_nextp = skb; - strp->rx_skb_nextp = &skb->next; + stm = _strp_msg(head); + *strp->skb_nextp = skb; + strp->skb_nextp = &skb->next; head->data_len += skb->len; head->len += skb->len; head->truesize += skb->truesize; } - if (!rxm->strp.full_len) { + if (!stm->strp.full_len) { ssize_t len; len = (*strp->cb.parse_msg)(strp, head); if (!len) { /* Need more header to determine length */ - if (!rxm->accum_len) { + if (!stm->accum_len) { /* Start RX timer for new message */ - strp_start_rx_timer(strp); + strp_start_timer(strp, timeo); } - rxm->accum_len += cand_len; + stm->accum_len += cand_len; eaten += cand_len; - STRP_STATS_INCR(strp->stats.rx_need_more_hdr); + STRP_STATS_INCR(strp->stats.need_more_hdr); WARN_ON(eaten != orig_len); break; } else if (len < 0) { - if (len == -ESTRPIPE && rxm->accum_len) { + if (len == -ESTRPIPE && stm->accum_len) { len = -ENODATA; - strp->rx_unrecov_intr = 1; + strp->unrecov_intr = 1; } else { - strp->rx_interrupted = 1; + strp->interrupted = 1; } strp_parser_err(strp, len, desc); break; - } else if (len > strp->sk->sk_rcvbuf) { + } else if (len > max_msg_size) { /* Message length exceeds maximum allowed */ - STRP_STATS_INCR(strp->stats.rx_msg_too_big); + STRP_STATS_INCR(strp->stats.msg_too_big); strp_parser_err(strp, -EMSGSIZE, desc); break; } else if (len <= (ssize_t)head->len - - skb->len - rxm->strp.offset) { + skb->len - stm->strp.offset) { /* Length must be into new skb (and also * greater than zero) */ - STRP_STATS_INCR(strp->stats.rx_bad_hdr_len); + STRP_STATS_INCR(strp->stats.bad_hdr_len); strp_parser_err(strp, -EPROTO, desc); break; } - rxm->strp.full_len = len; + stm->strp.full_len = len; } - extra = (ssize_t)(rxm->accum_len + cand_len) - - rxm->strp.full_len; + extra = (ssize_t)(stm->accum_len + cand_len) - + stm->strp.full_len; if (extra < 0) { /* Message not complete yet. */ - if (rxm->strp.full_len - rxm->accum_len > + if (stm->strp.full_len - stm->accum_len > strp_peek_len(strp)) { - /* Don't have the whole messages in the socket - * buffer. Set strp->rx_need_bytes to wait for + /* Don't have the whole message in the socket + * buffer. Set strp->need_bytes to wait for * the rest of the message. Also, set "early * eaten" since we've already buffered the skb * but don't consume yet per strp_read_sock. */ - if (!rxm->accum_len) { + if (!stm->accum_len) { /* Start RX timer for new message */ - strp_start_rx_timer(strp); + strp_start_timer(strp, timeo); } - strp->rx_need_bytes = rxm->strp.full_len - - rxm->accum_len; - rxm->accum_len += cand_len; - rxm->early_eaten = cand_len; - STRP_STATS_ADD(strp->stats.rx_bytes, cand_len); + strp->need_bytes = stm->strp.full_len - + stm->accum_len; + stm->accum_len += cand_len; + stm->early_eaten = cand_len; + STRP_STATS_ADD(strp->stats.bytes, cand_len); desc->count = 0; /* Stop reading socket */ break; } - rxm->accum_len += cand_len; + stm->accum_len += cand_len; eaten += cand_len; WARN_ON(eaten != orig_len); break; @@ -308,14 +319,14 @@ static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, eaten += (cand_len - extra); /* Hurray, we have a new message! */ - del_timer(&strp->rx_msg_timer); - strp->rx_skb_head = NULL; - STRP_STATS_INCR(strp->stats.rx_msgs); + del_timer(&strp->msg_timer); + strp->skb_head = NULL; + STRP_STATS_INCR(strp->stats.msgs); /* Give skb to upper layer */ strp->cb.rcv_msg(strp, head); - if (unlikely(strp->rx_paused)) { + if (unlikely(strp->paused)) { /* Upper layer paused strp */ break; } @@ -324,11 +335,33 @@ static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, if (cloned_orig) kfree_skb(orig_skb); - STRP_STATS_ADD(strp->stats.rx_bytes, eaten); + STRP_STATS_ADD(strp->stats.bytes, eaten); return eaten; } +int strp_process(struct strparser *strp, struct sk_buff *orig_skb, + unsigned int orig_offset, size_t orig_len, + size_t max_msg_size, long timeo) +{ + read_descriptor_t desc; /* Dummy arg to strp_recv */ + + desc.arg.data = strp; + + return __strp_recv(&desc, orig_skb, orig_offset, orig_len, + max_msg_size, timeo); +} +EXPORT_SYMBOL_GPL(strp_process); + +static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, + unsigned int orig_offset, size_t orig_len) +{ + struct strparser *strp = (struct strparser *)desc->arg.data; + + return __strp_recv(desc, orig_skb, orig_offset, orig_len, + strp->sk->sk_rcvbuf, strp->sk->sk_rcvtimeo); +} + static int default_read_sock_done(struct strparser *strp, int err) { return err; @@ -355,101 +388,129 @@ static int strp_read_sock(struct strparser *strp) /* Lower sock lock held */ void strp_data_ready(struct strparser *strp) { - if (unlikely(strp->rx_stopped)) + if (unlikely(strp->stopped)) return; - /* This check is needed to synchronize with do_strp_rx_work. - * do_strp_rx_work acquires a process lock (lock_sock) whereas + /* This check is needed to synchronize with do_strp_work. + * do_strp_work acquires a process lock (lock_sock) whereas * the lock held here is bh_lock_sock. The two locks can be * held by different threads at the same time, but bh_lock_sock * allows a thread in BH context to safely check if the process * lock is held. In this case, if the lock is held, queue work. */ if (sock_owned_by_user(strp->sk)) { - queue_work(strp_wq, &strp->rx_work); + queue_work(strp_wq, &strp->work); return; } - if (strp->rx_paused) + if (strp->paused) return; - if (strp->rx_need_bytes) { - if (strp_peek_len(strp) >= strp->rx_need_bytes) - strp->rx_need_bytes = 0; + if (strp->need_bytes) { + if (strp_peek_len(strp) >= strp->need_bytes) + strp->need_bytes = 0; else return; } if (strp_read_sock(strp) == -ENOMEM) - queue_work(strp_wq, &strp->rx_work); + queue_work(strp_wq, &strp->work); } EXPORT_SYMBOL_GPL(strp_data_ready); -static void do_strp_rx_work(struct strparser *strp) +static void do_strp_work(struct strparser *strp) { read_descriptor_t rd_desc; - struct sock *csk = strp->sk; /* We need the read lock to synchronize with strp_data_ready. We * need the socket lock for calling strp_read_sock. */ - lock_sock(csk); + strp->cb.lock(strp); - if (unlikely(strp->rx_stopped)) + if (unlikely(strp->stopped)) goto out; - if (strp->rx_paused) + if (strp->paused) goto out; rd_desc.arg.data = strp; if (strp_read_sock(strp) == -ENOMEM) - queue_work(strp_wq, &strp->rx_work); + queue_work(strp_wq, &strp->work); out: - release_sock(csk); + strp->cb.unlock(strp); } -static void strp_rx_work(struct work_struct *w) +static void strp_work(struct work_struct *w) { - do_strp_rx_work(container_of(w, struct strparser, rx_work)); + do_strp_work(container_of(w, struct strparser, work)); } -static void strp_rx_msg_timeout(unsigned long arg) +static void strp_msg_timeout(unsigned long arg) { struct strparser *strp = (struct strparser *)arg; /* Message assembly timed out */ - STRP_STATS_INCR(strp->stats.rx_msg_timeouts); - lock_sock(strp->sk); + STRP_STATS_INCR(strp->stats.msg_timeouts); + strp->cb.lock(strp); strp->cb.abort_parser(strp, ETIMEDOUT); + strp->cb.unlock(strp); +} + +static void strp_sock_lock(struct strparser *strp) +{ + lock_sock(strp->sk); +} + +static void strp_sock_unlock(struct strparser *strp) +{ release_sock(strp->sk); } -int strp_init(struct strparser *strp, struct sock *csk, +int strp_init(struct strparser *strp, struct sock *sk, struct strp_callbacks *cb) { - struct socket *sock = csk->sk_socket; if (!cb || !cb->rcv_msg || !cb->parse_msg) return -EINVAL; - if (!sock->ops->read_sock || !sock->ops->peek_len) - return -EAFNOSUPPORT; + /* The sk (sock) arg determines the mode of the stream parser. + * + * If the sock is set then the strparser is in receive callback mode. + * The upper layer calls strp_data_ready to kick receive processing + * and strparser calls the read_sock function on the socket to + * get packets. + * + * If the sock is not set then the strparser is in general mode. + * The upper layer calls strp_process for each skb to be parsed. + */ - memset(strp, 0, sizeof(*strp)); + if (sk) { + struct socket *sock = sk->sk_socket; - strp->sk = csk; + if (!sock->ops->read_sock || !sock->ops->peek_len) + return -EAFNOSUPPORT; + } else { + if (!cb->lock || !cb->unlock) + return -EINVAL; + } - setup_timer(&strp->rx_msg_timer, strp_rx_msg_timeout, - (unsigned long)strp); + memset(strp, 0, sizeof(*strp)); - INIT_WORK(&strp->rx_work, strp_rx_work); + strp->sk = sk; + strp->cb.lock = cb->lock ? : strp_sock_lock; + strp->cb.unlock = cb->unlock ? : strp_sock_unlock; strp->cb.rcv_msg = cb->rcv_msg; strp->cb.parse_msg = cb->parse_msg; strp->cb.read_sock_done = cb->read_sock_done ? : default_read_sock_done; - strp->cb.abort_parser = cb->abort_parser ? : strp_abort_rx_strp; + strp->cb.abort_parser = cb->abort_parser ? : strp_abort_strp; + + setup_timer(&strp->msg_timer, strp_msg_timeout, + (unsigned long)strp); + + INIT_WORK(&strp->work, strp_work); return 0; } @@ -457,12 +518,12 @@ EXPORT_SYMBOL_GPL(strp_init); void strp_unpause(struct strparser *strp) { - strp->rx_paused = 0; + strp->paused = 0; - /* Sync setting rx_paused with RX work */ + /* Sync setting paused with RX work */ smp_mb(); - queue_work(strp_wq, &strp->rx_work); + queue_work(strp_wq, &strp->work); } EXPORT_SYMBOL_GPL(strp_unpause); @@ -471,27 +532,27 @@ EXPORT_SYMBOL_GPL(strp_unpause); */ void strp_done(struct strparser *strp) { - WARN_ON(!strp->rx_stopped); + WARN_ON(!strp->stopped); - del_timer_sync(&strp->rx_msg_timer); - cancel_work_sync(&strp->rx_work); + del_timer_sync(&strp->msg_timer); + cancel_work_sync(&strp->work); - if (strp->rx_skb_head) { - kfree_skb(strp->rx_skb_head); - strp->rx_skb_head = NULL; + if (strp->skb_head) { + kfree_skb(strp->skb_head); + strp->skb_head = NULL; } } EXPORT_SYMBOL_GPL(strp_done); void strp_stop(struct strparser *strp) { - strp->rx_stopped = 1; + strp->stopped = 1; } EXPORT_SYMBOL_GPL(strp_stop); void strp_check_rcv(struct strparser *strp) { - queue_work(strp_wq, &strp->rx_work); + queue_work(strp_wq, &strp->work); } EXPORT_SYMBOL_GPL(strp_check_rcv); -- cgit v1.2.3 From da779b4011c872a28c4ab369e923e786b19055a0 Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Mon, 31 Jul 2017 18:05:08 +0800 Subject: dt-bindings: net: mediatek: add support for MediaTek MT7623 and MT7622 SoC The patch adds the supplements in the dt-binding document for MediaTek MT7622 SoC with extra SGMII system controller and relevant clock consumers listed as the requirements for those SoCs equipped with the SGMII circuit. Also, add the missing binding information for MT7623 SoC here which relies on the fallback binding of MT2701. Signed-off-by: Sean Wang Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/mediatek-net.txt | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/mediatek-net.txt b/Documentation/devicetree/bindings/net/mediatek-net.txt index c7194e87d5f4..1d1168b805cc 100644 --- a/Documentation/devicetree/bindings/net/mediatek-net.txt +++ b/Documentation/devicetree/bindings/net/mediatek-net.txt @@ -7,24 +7,30 @@ have dual GMAC each represented by a child node.. * Ethernet controller node Required properties: -- compatible: Should be "mediatek,mt2701-eth" +- compatible: Should be + "mediatek,mt2701-eth": for MT2701 SoC + "mediatek,mt7623-eth", "mediatek,mt2701-eth": for MT7623 SoC + "mediatek,mt7622-eth": for MT7622 SoC - reg: Address and length of the register set for the device - interrupts: Should contain the three frame engines interrupts in numeric order. These are fe_int0, fe_int1 and fe_int2. - clocks: the clock used by the core - clock-names: the names of the clock listed in the clocks property. These are - "ethif", "esw", "gp2", "gp1" + "ethif", "esw", "gp2", "gp1" : For MT2701 and MT7623 SoC + "ethif", "esw", "gp0", "gp1", "gp2", "sgmii_tx250m", "sgmii_rx250m", + "sgmii_cdr_ref", "sgmii_cdr_fb", "sgmii_ck", "eth2pll" : For MT7622 SoC - power-domains: phandle to the power domain that the ethernet is part of - resets: Should contain a phandle to the ethsys reset signal - reset-names: Should contain the reset signal name "eth" - mediatek,ethsys: phandle to the syscon node that handles the port setup +- mediatek,sgmiisys: phandle to the syscon node that handles the SGMII setup + which is required for those SoCs equipped with SGMII such as MT7622 SoC. - mediatek,pctl: phandle to the syscon node that handles the ports slew rate and driver current Optional properties: - interrupt-parent: Should be the phandle for the interrupt controller that services interrupts for this device - * Ethernet MAC node Required properties: -- cgit v1.2.3 From a5050c61036859e6fd7924f25cc6a97e7462039d Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Tue, 1 Aug 2017 19:58:54 -0700 Subject: netvsc: add documentation Add some background documentation on netvsc device options and limitations. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- Documentation/networking/netvsc.txt | 63 +++++++++++++++++++++++++++++++++++++ MAINTAINERS | 1 + 2 files changed, 64 insertions(+) create mode 100644 Documentation/networking/netvsc.txt (limited to 'Documentation') diff --git a/Documentation/networking/netvsc.txt b/Documentation/networking/netvsc.txt new file mode 100644 index 000000000000..4ddb4e4b0426 --- /dev/null +++ b/Documentation/networking/netvsc.txt @@ -0,0 +1,63 @@ +Hyper-V network driver +====================== + +Compatibility +============= + +This driver is compatible with Windows Server 2012 R2, 2016 and +Windows 10. + +Features +======== + + Checksum offload + ---------------- + The netvsc driver supports checksum offload as long as the + Hyper-V host version does. Windows Server 2016 and Azure + support checksum offload for TCP and UDP for both IPv4 and + IPv6. Windows Server 2012 only supports checksum offload for TCP. + + Receive Side Scaling + -------------------- + Hyper-V supports receive side scaling. For TCP, packets are + distributed among available queues based on IP address and port + number. Current versions of Hyper-V host, only distribute UDP + packets based on the IP source and destination address. + The port number is not used as part of the hash value for UDP. + Fragmented IP packets are not distributed between queues; + all fragmented packets arrive on the first channel. + + Generic Receive Offload, aka GRO + -------------------------------- + The driver supports GRO and it is enabled by default. GRO coalesces + like packets and significantly reduces CPU usage under heavy Rx + load. + + SR-IOV support + -------------- + Hyper-V supports SR-IOV as a hardware acceleration option. If SR-IOV + is enabled in both the vSwitch and the guest configuration, then the + Virtual Function (VF) device is passed to the guest as a PCI + device. In this case, both a synthetic (netvsc) and VF device are + visible in the guest OS and both NIC's have the same MAC address. + + The VF is enslaved by netvsc device. The netvsc driver will transparently + switch the data path to the VF when it is available and up. + Network state (addresses, firewall, etc) should be applied only to the + netvsc device; the slave device should not be accessed directly in + most cases. The exceptions are if some special queue discipline or + flow direction is desired, these should be applied directly to the + VF slave device. + + Receive Buffer + -------------- + Packets are received into a receive area which is created when device + is probed. The receive area is broken into MTU sized chunks and each may + contain one or more packets. The number of receive sections may be changed + via ethtool Rx ring parameters. + + There is a similar send buffer which is used to aggregate packets for sending. + The send area is broken into chunks of 6144 bytes, each of section may + contain one or more packets. The send buffer is an optimization, the driver + will use slower method to handle very large packets or if the send buffer + area is exhausted. diff --git a/MAINTAINERS b/MAINTAINERS index 207e45310620..448f2f67802f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6258,6 +6258,7 @@ M: Haiyang Zhang M: Stephen Hemminger L: devel@linuxdriverproject.org S: Maintained +F: Documentation/networking/netvsc.txt F: arch/x86/include/asm/mshyperv.h F: arch/x86/include/uapi/asm/hyperv.h F: arch/x86/kernel/cpu/mshyperv.c -- cgit v1.2.3 From 5d3ecb24b5363f5249b78792409e879ec5ec7922 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Thu, 3 Aug 2017 10:42:01 +0200 Subject: dt-bindings: net: marvell-pp2: update interrupt-names with TX interrupts The PPv2.2 unit has several interrupts used for TX completion notification. This commit updates the Device Tree binding describing this HW block to mention such interrupts. While at it, we update the example to use a recent Device Tree example, that uses interrupts going through the ICU, and not to the GIC directly. Signed-off-by: Thomas Petazzoni Signed-off-by: David S. Miller --- .../devicetree/bindings/net/marvell-pp2.txt | 28 +++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/marvell-pp2.txt b/Documentation/devicetree/bindings/net/marvell-pp2.txt index 6b4956beff8c..8918ad3ccf14 100644 --- a/Documentation/devicetree/bindings/net/marvell-pp2.txt +++ b/Documentation/devicetree/bindings/net/marvell-pp2.txt @@ -41,6 +41,10 @@ Optional properties (port): - marvell,loopback: port is loopback mode - phy: a phandle to a phy node defining the PHY address (as the reg property, a single integer). +- interrupt-names: if more than a single interrupt for rx is given, must + be the name associated to the interrupts listed. Valid + names are: "tx-cpu0", "tx-cpu1", "tx-cpu2", "tx-cpu3", + "rx-shared". Example for marvell,armada-375-pp2: @@ -80,19 +84,37 @@ cpm_ethernet: ethernet@0 { clock-names = "pp_clk", "gop_clk", "gp_clk"; eth0: eth0 { - interrupts = ; + interrupts = , + , + , + , + ; + interrupt-names = "tx-cpu0", "tx-cpu1", "tx-cpu2", + "tx-cpu3", "rx-shared"; port-id = <0>; gop-port-id = <0>; }; eth1: eth1 { - interrupts = ; + interrupts = , + , + , + , + ; + interrupt-names = "tx-cpu0", "tx-cpu1", "tx-cpu2", + "tx-cpu3", "rx-shared"; port-id = <1>; gop-port-id = <2>; }; eth2: eth2 { - interrupts = ; + interrupts = , + , + , + , + ; + interrupt-names = "tx-cpu0", "tx-cpu1", "tx-cpu2", + "tx-cpu3", "rx-shared"; port-id = <2>; gop-port-id = <3>; }; -- cgit v1.2.3 From 1126f470843a044250ef8863b358607df3cd10c8 Mon Sep 17 00:00:00 2001 From: Alexandru Gagniuc Date: Fri, 4 Aug 2017 13:08:52 -0700 Subject: dt-bindings: net: Document bindings for anarion-gmac Signed-off-by: Alexandru Gagniuc Signed-off-by: David S. Miller --- .../devicetree/bindings/net/anarion-gmac.txt | 25 ++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 Documentation/devicetree/bindings/net/anarion-gmac.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/anarion-gmac.txt b/Documentation/devicetree/bindings/net/anarion-gmac.txt new file mode 100644 index 000000000000..fe678965ae69 --- /dev/null +++ b/Documentation/devicetree/bindings/net/anarion-gmac.txt @@ -0,0 +1,25 @@ +* Adaptrum Anarion ethernet controller + +This device is a platform glue layer for stmmac. +Please see stmmac.txt for the other unchanged properties. + +Required properties: + - compatible: Should be "adaptrum,anarion-gmac", "snps,dwmac" + - phy-mode: Should be "rgmii". Other modes are not currently supported. + + +Examples: + + gmac1: ethernet@f2014000 { + compatible = "adaptrum,anarion-gmac", "snps,dwmac"; + reg = <0xf2014000 0x4000>, <0xf2018100 8>; + + interrupt-parent = <&core_intc>; + interrupts = <21>; + interrupt-names = "macirq"; + + clocks = <&core_clk>; + clock-names = "stmmaceth"; + + phy-mode = "rgmii"; + }; -- cgit v1.2.3 From 0cbf4741652b29cf98f6a77a156d32b2aca59bc3 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 7 Aug 2017 15:30:09 +0100 Subject: Documentation: describe the new eBPF verifier value tracking behaviour Also bring the eBPF documentation up to date in other ways. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- Documentation/networking/filter.txt | 122 ++++++++++++++++++++++++++++++------ 1 file changed, 104 insertions(+), 18 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index b69b205501de..d0fdba7d66e2 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -793,7 +793,7 @@ Some core changes of the new internal format: bpf_exit After the call the registers R1-R5 contain junk values and cannot be read. - In the future an eBPF verifier can be used to validate internal BPF programs. + An in-kernel eBPF verifier is used to validate internal BPF programs. Also in the new design, eBPF is limited to 4096 insns, which means that any program will terminate quickly and will only call a fixed number of kernel @@ -1017,7 +1017,7 @@ At the start of the program the register R1 contains a pointer to context and has type PTR_TO_CTX. If verifier sees an insn that does R2=R1, then R2 has now type PTR_TO_CTX as well and can be used on the right hand side of expression. -If R1=PTR_TO_CTX and insn is R2=R1+R1, then R2=UNKNOWN_VALUE, +If R1=PTR_TO_CTX and insn is R2=R1+R1, then R2=SCALAR_VALUE, since addition of two valid pointers makes invalid pointer. (In 'secure' mode verifier will reject any type of pointer arithmetic to make sure that kernel addresses don't leak to unprivileged users) @@ -1039,7 +1039,7 @@ is a correct program. If there was R1 instead of R6, it would have been rejected. load/store instructions are allowed only with registers of valid types, which -are PTR_TO_CTX, PTR_TO_MAP, FRAME_PTR. They are bounds and alignment checked. +are PTR_TO_CTX, PTR_TO_MAP, PTR_TO_STACK. They are bounds and alignment checked. For example: bpf_mov R1 = 1 bpf_mov R2 = 2 @@ -1058,7 +1058,7 @@ intends to load a word from address R6 + 8 and store it into R0 If R6=PTR_TO_CTX, via is_valid_access() callback the verifier will know that offset 8 of size 4 bytes can be accessed for reading, otherwise the verifier will reject the program. -If R6=FRAME_PTR, then access should be aligned and be within +If R6=PTR_TO_STACK, then access should be aligned and be within stack bounds, which are [-MAX_BPF_STACK, 0). In this example offset is 8, so it will fail verification, since it's out of bounds. @@ -1069,7 +1069,7 @@ For example: bpf_ld R0 = *(u32 *)(R10 - 4) bpf_exit is invalid program. -Though R10 is correct read-only register and has type FRAME_PTR +Though R10 is correct read-only register and has type PTR_TO_STACK and R10 - 4 is within stack bounds, there were no stores into that location. Pointer register spill/fill is tracked as well, since four (R6-R9) @@ -1094,6 +1094,71 @@ all use cases. See details of eBPF verifier in kernel/bpf/verifier.c +Register value tracking +----------------------- +In order to determine the safety of an eBPF program, the verifier must track +the range of possible values in each register and also in each stack slot. +This is done with 'struct bpf_reg_state', defined in include/linux/ +bpf_verifier.h, which unifies tracking of scalar and pointer values. Each +register state has a type, which is either NOT_INIT (the register has not been +written to), SCALAR_VALUE (some value which is not usable as a pointer), or a +pointer type. The types of pointers describe their base, as follows: + PTR_TO_CTX Pointer to bpf_context. + CONST_PTR_TO_MAP Pointer to struct bpf_map. "Const" because arithmetic + on these pointers is forbidden. + PTR_TO_MAP_VALUE Pointer to the value stored in a map element. + PTR_TO_MAP_VALUE_OR_NULL + Either a pointer to a map value, or NULL; map accesses + (see section 'eBPF maps', below) return this type, + which becomes a PTR_TO_MAP_VALUE when checked != NULL. + Arithmetic on these pointers is forbidden. + PTR_TO_STACK Frame pointer. + PTR_TO_PACKET skb->data. + PTR_TO_PACKET_END skb->data + headlen; arithmetic forbidden. +However, a pointer may be offset from this base (as a result of pointer +arithmetic), and this is tracked in two parts: the 'fixed offset' and 'variable +offset'. The former is used when an exactly-known value (e.g. an immediate +operand) is added to a pointer, while the latter is used for values which are +not exactly known. The variable offset is also used in SCALAR_VALUEs, to track +the range of possible values in the register. +The verifier's knowledge about the variable offset consists of: +* minimum and maximum values as unsigned +* minimum and maximum values as signed +* knowledge of the values of individual bits, in the form of a 'tnum': a u64 +'mask' and a u64 'value'. 1s in the mask represent bits whose value is unknown; +1s in the value represent bits known to be 1. Bits known to be 0 have 0 in both +mask and value; no bit should ever be 1 in both. For example, if a byte is read +into a register from memory, the register's top 56 bits are known zero, while +the low 8 are unknown - which is represented as the tnum (0x0; 0xff). If we +then OR this with 0x40, we get (0x40; 0xcf), then if we add 1 we get (0x0; +0x1ff), because of potential carries. +Besides arithmetic, the register state can also be updated by conditional +branches. For instance, if a SCALAR_VALUE is compared > 8, in the 'true' branch +it will have a umin_value (unsigned minimum value) of 9, whereas in the 'false' +branch it will have a umax_value of 8. A signed compare (with BPF_JSGT or +BPF_JSGE) would instead update the signed minimum/maximum values. Information +from the signed and unsigned bounds can be combined; for instance if a value is +first tested < 8 and then tested s> 4, the verifier will conclude that the value +is also > 4 and s< 8, since the bounds prevent crossing the sign boundary. +PTR_TO_PACKETs with a variable offset part have an 'id', which is common to all +pointers sharing that same variable offset. This is important for packet range +checks: after adding some variable to a packet pointer, if you then copy it to +another register and (say) add a constant 4, both registers will share the same +'id' but one will have a fixed offset of +4. Then if it is bounds-checked and +found to be less than a PTR_TO_PACKET_END, the other register is now known to +have a safe range of at least 4 bytes. See 'Direct packet access', below, for +more on PTR_TO_PACKET ranges. +The 'id' field is also used on PTR_TO_MAP_VALUE_OR_NULL, common to all copies of +the pointer returned from a map lookup. This means that when one copy is +checked and found to be non-NULL, all copies can become PTR_TO_MAP_VALUEs. +As well as range-checking, the tracked information is also used for enforcing +alignment of pointer accesses. For instance, on most systems the packet pointer +is 2 bytes after a 4-byte alignment. If a program adds 14 bytes to that to jump +over the Ethernet header, then reads IHL and addes (IHL * 4), the resulting +pointer will have a variable offset known to be 4n+2 for some n, so adding the 2 +bytes (NET_IP_ALIGN) gives a 4-byte alignment and so word-sized accesses through +that pointer are safe. + Direct packet access -------------------- In cls_bpf and act_bpf programs the verifier allows direct access to the packet @@ -1121,7 +1186,7 @@ it now points to 'skb->data + 14' and accessible range is [R5, R5 + 14 - 14) which is zero bytes. More complex packet access may look like: - R0=imm1 R1=ctx R3=pkt(id=0,off=0,r=14) R4=pkt_end R5=pkt(id=0,off=14,r=14) R10=fp + R0=inv1 R1=ctx R3=pkt(id=0,off=0,r=14) R4=pkt_end R5=pkt(id=0,off=14,r=14) R10=fp 6: r0 = *(u8 *)(r3 +7) /* load 7th byte from the packet */ 7: r4 = *(u8 *)(r3 +12) 8: r4 *= 14 @@ -1135,26 +1200,31 @@ More complex packet access may look like: 16: r2 += 8 17: r1 = *(u32 *)(r1 +80) /* load skb->data_end */ 18: if r2 > r1 goto pc+2 - R0=inv56 R1=pkt_end R2=pkt(id=2,off=8,r=8) R3=pkt(id=2,off=0,r=8) R4=inv52 R5=pkt(id=0,off=14,r=14) R10=fp + R0=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R1=pkt_end R2=pkt(id=2,off=8,r=8) R3=pkt(id=2,off=0,r=8) R4=inv(id=0,umax_value=3570,var_off=(0x0; 0xfffe)) R5=pkt(id=0,off=14,r=14) R10=fp 19: r1 = *(u8 *)(r3 +4) The state of the register R3 is R3=pkt(id=2,off=0,r=8) id=2 means that two 'r3 += rX' instructions were seen, so r3 points to some offset within a packet and since the program author did 'if (r3 + 8 > r1) goto err' at insn #18, the safe range is [R3, R3 + 8). -The verifier only allows 'add' operation on packet registers. Any other -operation will set the register state to 'unknown_value' and it won't be +The verifier only allows 'add'/'sub' operations on packet registers. Any other +operation will set the register state to 'SCALAR_VALUE' and it won't be available for direct packet access. Operation 'r3 += rX' may overflow and become less than original skb->data, -therefore the verifier has to prevent that. So it tracks the number of -upper zero bits in all 'uknown_value' registers, so when it sees -'r3 += rX' instruction and rX is more than 16-bit value, it will error as: -"cannot add integer value with N upper zero bits to ptr_to_packet" +therefore the verifier has to prevent that. So when it sees 'r3 += rX' +instruction and rX is more than 16-bit value, any subsequent bounds-check of r3 +against skb->data_end will not give us 'range' information, so attempts to read +through the pointer will give "invalid access to packet" error. Ex. after insn 'r4 = *(u8 *)(r3 +12)' (insn #7 above) the state of r4 is -R4=inv56 which means that upper 56 bits on the register are guaranteed -to be zero. After insn 'r4 *= 14' the state becomes R4=inv52, since -multiplying 8-bit value by constant 14 will keep upper 52 bits as zero. -Similarly 'r2 >>= 48' will make R2=inv48, since the shift is not sign -extending. This logic is implemented in evaluate_reg_alu() function. +R4=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) which means that upper 56 bits +of the register are guaranteed to be zero, and nothing is known about the lower +8 bits. After insn 'r4 *= 14' the state becomes +R4=inv(id=0,umax_value=3570,var_off=(0x0; 0xfffe)), since multiplying an 8-bit +value by constant 14 will keep upper 52 bits as zero, also the least significant +bit will be zero as 14 is even. Similarly 'r2 >>= 48' will make +R2=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff)), since the shift is not sign +extending. This logic is implemented in adjust_reg_min_max_vals() function, +which calls adjust_ptr_min_max_vals() for adding pointer to scalar (or vice +versa) and adjust_scalar_min_max_vals() for operations on two scalars. The end result is that bpf program author can access packet directly using normal C code as: @@ -1214,6 +1284,22 @@ The map is defined by: . key size in bytes . value size in bytes +Pruning +------- +The verifier does not actually walk all possible paths through the program. For +each new branch to analyse, the verifier looks at all the states it's previously +been in when at this instruction. If any of them contain the current state as a +subset, the branch is 'pruned' - that is, the fact that the previous state was +accepted implies the current state would be as well. For instance, if in the +previous state, r1 held a packet-pointer, and in the current state, r1 holds a +packet-pointer with a range as long or longer and at least as strict an +alignment, then r1 is safe. Similarly, if r2 was NOT_INIT before then it can't +have been used by any path from that point, so any value in r2 (including +another NOT_INIT) is safe. The implementation is in the function regsafe(). +Pruning considers not only the registers but also the stack (and any spilled +registers it may hold). They must all be safe for the branch to be pruned. +This is implemented in states_equal(). + Understanding eBPF verifier messages ------------------------------------ -- cgit v1.2.3 From 92b31a9af73b3a3fc801899335d6c47966351830 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 10 Aug 2017 01:39:55 +0200 Subject: bpf: add BPF_J{LT,LE,SLT,SLE} instructions Currently, eBPF only understands BPF_JGT (>), BPF_JGE (>=), BPF_JSGT (s>), BPF_JSGE (s>=) instructions, this means that particularly *JLT/*JLE counterparts involving immediates need to be rewritten from e.g. X < [IMM] by swapping arguments into [IMM] > X, meaning the immediate first is required to be loaded into a register Y := [IMM], such that then we can compare with Y > X. Note that the destination operand is always required to be a register. This has the downside of having unnecessarily increased register pressure, meaning complex program would need to spill other registers temporarily to stack in order to obtain an unused register for the [IMM]. Loading to registers will thus also affect state pruning since we need to account for that register use and potentially those registers that had to be spilled/filled again. As a consequence slightly more stack space might have been used due to spilling, and BPF programs are a bit longer due to extra code involving the register load and potentially required spill/fills. Thus, add BPF_JLT (<), BPF_JLE (<=), BPF_JSLT (s<), BPF_JSLE (s<=) counterparts to the eBPF instruction set. Modifying LLVM to remove the NegateCC() workaround in a PoC patch at [1] and allowing it to also emit the new instructions resulted in cilium's BPF programs that are injected into the fast-path to have a reduced program length in the range of 2-3% (e.g. accumulated main and tail call sections from one of the object file reduced from 4864 to 4729 insns), reduced complexity in the range of 10-30% (e.g. accumulated sections reduced in one of the cases from 116432 to 88428 insns), and reduced stack usage in the range of 1-5% (e.g. accumulated sections from one of the object files reduced from 824 to 784b). The modification for LLVM will be incorporated in a backwards compatible way. Plan is for LLVM to have i) a target specific option to offer a possibility to explicitly enable the extension by the user (as we have with -m target specific extensions today for various CPU insns), and ii) have the kernel checked for presence of the extensions and enable them transparently when the user is selecting more aggressive options such as -march=native in a bpf target context. (Other frontends generating BPF byte code, e.g. ply can probe the kernel directly for its code generation.) [1] https://github.com/borkmann/llvm/tree/bpf-insns Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- Documentation/networking/filter.txt | 4 + include/uapi/linux/bpf.h | 5 + kernel/bpf/core.c | 60 ++++++ lib/test_bpf.c | 364 ++++++++++++++++++++++++++++++++++++ net/core/filter.c | 21 ++- tools/include/uapi/linux/bpf.h | 5 + 6 files changed, 455 insertions(+), 4 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index d0fdba7d66e2..6a0df8df6c43 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -906,6 +906,10 @@ If BPF_CLASS(code) == BPF_JMP, BPF_OP(code) is one of: BPF_JSGE 0x70 /* eBPF only: signed '>=' */ BPF_CALL 0x80 /* eBPF only: function call */ BPF_EXIT 0x90 /* eBPF only: function return */ + BPF_JLT 0xa0 /* eBPF only: unsigned '<' */ + BPF_JLE 0xb0 /* eBPF only: unsigned '<=' */ + BPF_JSLT 0xc0 /* eBPF only: signed '<' */ + BPF_JSLE 0xd0 /* eBPF only: signed '<=' */ So BPF_ADD | BPF_X | BPF_ALU means 32-bit addition in both classic BPF and eBPF. There are only two registers in classic BPF, so it means A += X. diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1d06be1569b1..91da8371a2d0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -30,9 +30,14 @@ #define BPF_FROM_LE BPF_TO_LE #define BPF_FROM_BE BPF_TO_BE +/* jmp encodings */ #define BPF_JNE 0x50 /* jump != */ +#define BPF_JLT 0xa0 /* LT is unsigned, '<' */ +#define BPF_JLE 0xb0 /* LE is unsigned, '<=' */ #define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ #define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ +#define BPF_JSLT 0xc0 /* SLT is signed, '<' */ +#define BPF_JSLE 0xd0 /* SLE is signed, '<=' */ #define BPF_CALL 0x80 /* function call */ #define BPF_EXIT 0x90 /* function return */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ad5f55922a13..c69e7f5bfde7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -595,9 +595,13 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JNE | BPF_K: case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JLT | BPF_K: case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JLE | BPF_K: case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_K: case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_K: case BPF_JMP | BPF_JSET | BPF_K: /* Accommodate for extra offset in case of a backjump. */ off = from->off; @@ -833,12 +837,20 @@ static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, + [BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X, + [BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K, [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, + [BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X, + [BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K, [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, + [BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X, + [BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K, [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, + [BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X, + [BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K, [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, /* Program return */ @@ -1073,6 +1085,18 @@ out: CONT_JMP; } CONT; + JMP_JLT_X: + if (DST < SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JLT_K: + if (DST < IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; JMP_JGE_X: if (DST >= SRC) { insn += insn->off; @@ -1085,6 +1109,18 @@ out: CONT_JMP; } CONT; + JMP_JLE_X: + if (DST <= SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JLE_K: + if (DST <= IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; JMP_JSGT_X: if (((s64) DST) > ((s64) SRC)) { insn += insn->off; @@ -1097,6 +1133,18 @@ out: CONT_JMP; } CONT; + JMP_JSLT_X: + if (((s64) DST) < ((s64) SRC)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSLT_K: + if (((s64) DST) < ((s64) IMM)) { + insn += insn->off; + CONT_JMP; + } + CONT; JMP_JSGE_X: if (((s64) DST) >= ((s64) SRC)) { insn += insn->off; @@ -1109,6 +1157,18 @@ out: CONT_JMP; } CONT; + JMP_JSLE_X: + if (((s64) DST) <= ((s64) SRC)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSLE_K: + if (((s64) DST) <= ((s64) IMM)) { + insn += insn->off; + CONT_JMP; + } + CONT; JMP_JSET_X: if (DST & SRC) { insn += insn->off; diff --git a/lib/test_bpf.c b/lib/test_bpf.c index d9d5a410955c..aa8812ae6776 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -951,6 +951,32 @@ static struct bpf_test tests[] = { { 4, 4, 4, 3, 3 }, { { 2, 0 }, { 3, 1 }, { 4, MAX_K } }, }, + { + "JGE (jt 0), test 1", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_LEN, 0), + BPF_STMT(BPF_LD | BPF_B | BPF_ABS, 2), + BPF_JUMP(BPF_JMP | BPF_JGE | BPF_X, 0, 0, 1), + BPF_STMT(BPF_RET | BPF_K, 1), + BPF_STMT(BPF_RET | BPF_K, MAX_K) + }, + CLASSIC, + { 4, 4, 4, 3, 3 }, + { { 2, 0 }, { 3, 1 }, { 4, 1 } }, + }, + { + "JGE (jt 0), test 2", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_LEN, 0), + BPF_STMT(BPF_LD | BPF_B | BPF_ABS, 2), + BPF_JUMP(BPF_JMP | BPF_JGE | BPF_X, 0, 0, 1), + BPF_STMT(BPF_RET | BPF_K, 1), + BPF_STMT(BPF_RET | BPF_K, MAX_K) + }, + CLASSIC, + { 4, 4, 5, 3, 3 }, + { { 4, 1 }, { 5, 1 }, { 6, MAX_K } }, + }, { "JGE", .u.insns = { @@ -4492,6 +4518,35 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JSLT | BPF_K */ + { + "JMP_JSLT_K: Signed jump: if (-2 < -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 0xfffffffffffffffeLL), + BPF_JMP_IMM(BPF_JSLT, R1, -1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSLT_K: Signed jump: if (-1 < -1) return 0", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_LD_IMM64(R1, 0xffffffffffffffffLL), + BPF_JMP_IMM(BPF_JSLT, R1, -1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JSGT | BPF_K */ { "JMP_JSGT_K: Signed jump: if (-1 > -2) return 1", @@ -4521,6 +4576,73 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JSLE | BPF_K */ + { + "JMP_JSLE_K: Signed jump: if (-2 <= -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 0xfffffffffffffffeLL), + BPF_JMP_IMM(BPF_JSLE, R1, -1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSLE_K: Signed jump: if (-1 <= -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 0xffffffffffffffffLL), + BPF_JMP_IMM(BPF_JSLE, R1, -1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSLE_K: Signed jump: value walk 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 6), + BPF_ALU64_IMM(BPF_SUB, R1, 1), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 4), + BPF_ALU64_IMM(BPF_SUB, R1, 1), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 2), + BPF_ALU64_IMM(BPF_SUB, R1, 1), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 1), + BPF_EXIT_INSN(), /* bad exit */ + BPF_ALU32_IMM(BPF_MOV, R0, 1), /* good exit */ + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSLE_K: Signed jump: value walk 2", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 4), + BPF_ALU64_IMM(BPF_SUB, R1, 2), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 2), + BPF_ALU64_IMM(BPF_SUB, R1, 2), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 1), + BPF_EXIT_INSN(), /* bad exit */ + BPF_ALU32_IMM(BPF_MOV, R0, 1), /* good exit */ + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JSGE | BPF_K */ { "JMP_JSGE_K: Signed jump: if (-1 >= -2) return 1", @@ -4617,6 +4739,35 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JLT | BPF_K */ + { + "JMP_JLT_K: if (2 < 3) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 2), + BPF_JMP_IMM(BPF_JLT, R1, 3, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JGT_K: Unsigned jump: if (1 < -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 1), + BPF_JMP_IMM(BPF_JLT, R1, -1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JGE | BPF_K */ { "JMP_JGE_K: if (3 >= 2) return 1", @@ -4632,6 +4783,21 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JLE | BPF_K */ + { + "JMP_JLE_K: if (2 <= 3) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 2), + BPF_JMP_IMM(BPF_JLE, R1, 3, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JGT | BPF_K jump backwards */ { "JMP_JGT_K: if (3 > 2) return 1 (jump backwards)", @@ -4662,6 +4828,36 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JLT | BPF_K jump backwards */ + { + "JMP_JGT_K: if (2 < 3) return 1 (jump backwards)", + .u.insns_int = { + BPF_JMP_IMM(BPF_JA, 0, 0, 2), /* goto start */ + BPF_ALU32_IMM(BPF_MOV, R0, 1), /* out: */ + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 0), /* start: */ + BPF_LD_IMM64(R1, 2), /* note: this takes 2 insns */ + BPF_JMP_IMM(BPF_JLT, R1, 3, -6), /* goto out */ + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JLE_K: if (3 <= 3) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_JMP_IMM(BPF_JLE, R1, 3, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JNE | BPF_K */ { "JMP_JNE_K: if (3 != 2) return 1", @@ -4752,6 +4948,37 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JSLT | BPF_X */ + { + "JMP_JSLT_X: Signed jump: if (-2 < -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, -1), + BPF_LD_IMM64(R2, -2), + BPF_JMP_REG(BPF_JSLT, R2, R1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSLT_X: Signed jump: if (-1 < -1) return 0", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_LD_IMM64(R1, -1), + BPF_LD_IMM64(R2, -1), + BPF_JMP_REG(BPF_JSLT, R1, R2, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JSGE | BPF_X */ { "JMP_JSGE_X: Signed jump: if (-1 >= -2) return 1", @@ -4783,6 +5010,37 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JSLE | BPF_X */ + { + "JMP_JSLE_X: Signed jump: if (-2 <= -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, -1), + BPF_LD_IMM64(R2, -2), + BPF_JMP_REG(BPF_JSLE, R2, R1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSLE_X: Signed jump: if (-1 <= -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, -1), + BPF_LD_IMM64(R2, -1), + BPF_JMP_REG(BPF_JSLE, R1, R2, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JGT | BPF_X */ { "JMP_JGT_X: if (3 > 2) return 1", @@ -4814,6 +5072,37 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JLT | BPF_X */ + { + "JMP_JLT_X: if (2 < 3) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 2), + BPF_JMP_REG(BPF_JLT, R2, R1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JLT_X: Unsigned jump: if (1 < -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, -1), + BPF_LD_IMM64(R2, 1), + BPF_JMP_REG(BPF_JLT, R2, R1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JGE | BPF_X */ { "JMP_JGE_X: if (3 >= 2) return 1", @@ -4845,6 +5134,37 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JLE | BPF_X */ + { + "JMP_JLE_X: if (2 <= 3) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 2), + BPF_JMP_REG(BPF_JLE, R2, R1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JLE_X: if (3 <= 3) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 3), + BPF_JMP_REG(BPF_JLE, R1, R2, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, { /* Mainly testing JIT + imm64 here. */ "JMP_JGE_X: ldimm64 test 1", @@ -4890,6 +5210,50 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + { + "JMP_JLE_X: ldimm64 test 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 2), + BPF_JMP_REG(BPF_JLE, R2, R1, 2), + BPF_LD_IMM64(R0, 0xffffffffffffffffULL), + BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeULL), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0xeeeeeeeeU } }, + }, + { + "JMP_JLE_X: ldimm64 test 2", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 2), + BPF_JMP_REG(BPF_JLE, R2, R1, 0), + BPF_LD_IMM64(R0, 0xffffffffffffffffULL), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0xffffffffU } }, + }, + { + "JMP_JLE_X: ldimm64 test 3", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 2), + BPF_JMP_REG(BPF_JLE, R2, R1, 4), + BPF_LD_IMM64(R0, 0xffffffffffffffffULL), + BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeULL), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JNE | BPF_X */ { "JMP_JNE_X: if (3 != 2) return 1", diff --git a/net/core/filter.c b/net/core/filter.c index 78d00933dbe7..5afe3ac191ec 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -514,14 +514,27 @@ do_pass: break; } - /* Convert JEQ into JNE when 'jump_true' is next insn. */ - if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) { - insn->code = BPF_JMP | BPF_JNE | bpf_src; + /* Convert some jumps when 'jump_true' is next insn. */ + if (fp->jt == 0) { + switch (BPF_OP(fp->code)) { + case BPF_JEQ: + insn->code = BPF_JMP | BPF_JNE | bpf_src; + break; + case BPF_JGT: + insn->code = BPF_JMP | BPF_JLE | bpf_src; + break; + case BPF_JGE: + insn->code = BPF_JMP | BPF_JLT | bpf_src; + break; + default: + goto jmp_rest; + } + target = i + fp->jf + 1; BPF_EMIT_JMP; break; } - +jmp_rest: /* Other jumps are mapped into two insns: Jxx and JA. */ target = i + fp->jt + 1; insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 8d9bfcca3fe4..bf3b2e230455 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -30,9 +30,14 @@ #define BPF_FROM_LE BPF_TO_LE #define BPF_FROM_BE BPF_TO_BE +/* jmp encodings */ #define BPF_JNE 0x50 /* jump != */ +#define BPF_JLT 0xa0 /* LT is unsigned, '<' */ +#define BPF_JLE 0xb0 /* LE is unsigned, '<=' */ #define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ #define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ +#define BPF_JSLT 0xc0 /* SLT is signed, '<' */ +#define BPF_JSLE 0xd0 /* SLE is signed, '<=' */ #define BPF_CALL 0x80 /* function call */ #define BPF_EXIT 0x90 /* function return */ -- cgit v1.2.3 From 2398506b4ec121bc86d97e13f9f9c76085baf2ad Mon Sep 17 00:00:00 2001 From: David Wu Date: Thu, 10 Aug 2017 21:59:18 +0800 Subject: Documentation: net: phy: Add phy-is-integrated binding Add the documentation for integrated PHY. A boolean property indicates the PHY is integrated into the same physical package as the Ethernet MAC. If needed, muxers should be configured to ensure the integrated PHY is used. The absence of this property indicates the muxers should be configured so that the external PHY is used. Signed-off-by: David Wu Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/phy.txt | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/phy.txt b/Documentation/devicetree/bindings/net/phy.txt index b55857696fc3..d3c24d5ffa9a 100644 --- a/Documentation/devicetree/bindings/net/phy.txt +++ b/Documentation/devicetree/bindings/net/phy.txt @@ -52,6 +52,11 @@ Optional Properties: Mark the corresponding energy efficient ethernet mode as broken and request the ethernet to stop advertising it. +- phy-is-integrated: If set, indicates that the PHY is integrated into the same + physical package as the Ethernet MAC. If needed, muxers should be configured + to ensure the integrated PHY is used. The absence of this property indicates + the muxers should be configured so that the external PHY is used. + Example: ethernet-phy@0 { -- cgit v1.2.3 From 22cb7a3ac380ecaab6837670963813599b123a53 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Tue, 15 Aug 2017 15:40:20 +0100 Subject: dt-bindings: net: ravb : Add support for r8a7745 SoC Add a new compatible string for the RZ/G1E (R8A7745) SoC. Signed-off-by: Biju Das Acked-by: Sergei Shtylyov Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/renesas,ravb.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/renesas,ravb.txt b/Documentation/devicetree/bindings/net/renesas,ravb.txt index 4717bc24eada..16723535e1aa 100644 --- a/Documentation/devicetree/bindings/net/renesas,ravb.txt +++ b/Documentation/devicetree/bindings/net/renesas,ravb.txt @@ -6,6 +6,7 @@ interface contains. Required properties: - compatible: Must contain one or more of the following: - "renesas,etheravb-r8a7743" for the R8A7743 SoC. + - "renesas,etheravb-r8a7745" for the R8A7745 SoC. - "renesas,etheravb-r8a7790" for the R8A7790 SoC. - "renesas,etheravb-r8a7791" for the R8A7791 SoC. - "renesas,etheravb-r8a7792" for the R8A7792 SoC. -- cgit v1.2.3 From 778ead344acce7ad38df390f2f28b91118c9b2b0 Mon Sep 17 00:00:00 2001 From: Loic Poulain Date: Thu, 17 Aug 2017 19:59:48 +0200 Subject: dt-bindings: net: bluetooth: Add broadcom-bluetooth Add binding document for serial bluetooth chips using Broadcom protocol. Signed-off-by: Loic Poulain Signed-off-by: Marcel Holtmann --- .../devicetree/bindings/net/broadcom-bluetooth.txt | 35 ++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 Documentation/devicetree/bindings/net/broadcom-bluetooth.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/broadcom-bluetooth.txt b/Documentation/devicetree/bindings/net/broadcom-bluetooth.txt new file mode 100644 index 000000000000..4194ff7e6ee6 --- /dev/null +++ b/Documentation/devicetree/bindings/net/broadcom-bluetooth.txt @@ -0,0 +1,35 @@ +Broadcom Bluetooth Chips +--------------------- + +This documents the binding structure and common properties for serial +attached Broadcom devices. + +Serial attached Broadcom devices shall be a child node of the host UART +device the slave device is attached to. + +Required properties: + + - compatible: should contain one of the following: + * "brcm,bcm43438-bt" + +Optional properties: + + - max-speed: see Documentation/devicetree/bindings/serial/slave-device.txt + - shutdown-gpios: GPIO specifier, used to enable the BT module + - device-wakeup-gpios: GPIO specifier, used to wakeup the controller + - host-wakeup-gpios: GPIO specifier, used to wakeup the host processor + - clocks: clock specifier if external clock provided to the controller + - clock-names: should be "extclk" + + +Example: + +&uart2 { + pinctrl-names = "default"; + pinctrl-0 = <&uart2_pins>; + + bluetooth { + compatible = "brcm,bcm43438-bt"; + max-speed = <921600>; + }; +}; -- cgit v1.2.3 From 89c9c1636f5aeef7b74cdcc141e0abc9bd764afe Mon Sep 17 00:00:00 2001 From: David Wu Date: Mon, 21 Aug 2017 18:12:55 +0800 Subject: net: ethernet: stmmac: dwmac-rk: Add rv1108 gmac support It only supports rmii interface. Add constants and callback functions for the dwmac on rv1108 socs. As can be seen, the base structure is the same, only registers and the bits in them moved slightly. Signed-off-by: David Wu Signed-off-by: David S. Miller --- .../devicetree/bindings/net/rockchip-dwmac.txt | 1 + drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c | 53 ++++++++++++++++++++++ 2 files changed, 54 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/rockchip-dwmac.txt b/Documentation/devicetree/bindings/net/rockchip-dwmac.txt index 8f427550720a..c1325387632c 100644 --- a/Documentation/devicetree/bindings/net/rockchip-dwmac.txt +++ b/Documentation/devicetree/bindings/net/rockchip-dwmac.txt @@ -10,6 +10,7 @@ Required properties: "rockchip,rk3366-gmac": found on RK3366 SoCs "rockchip,rk3368-gmac": found on RK3368 SoCs "rockchip,rk3399-gmac": found on RK3399 SoCs + "rockchip,rv1108-gmac": found on RV1108 SoCs - reg: addresses and length of the register sets for the device. - interrupts: Should contain the GMAC interrupts. - interrupt-names: Should contain the interrupt names "macirq". diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c index 2176403c72d8..99823f54696a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c @@ -787,6 +787,58 @@ static const struct rk_gmac_ops rk3399_ops = { .set_rmii_speed = rk3399_set_rmii_speed, }; +#define RV1108_GRF_GMAC_CON0 0X0900 + +/* RV1108_GRF_GMAC_CON0 */ +#define RV1108_GMAC_PHY_INTF_SEL_RMII (GRF_CLR_BIT(4) | GRF_CLR_BIT(5) | \ + GRF_BIT(6)) +#define RV1108_GMAC_FLOW_CTRL GRF_BIT(3) +#define RV1108_GMAC_FLOW_CTRL_CLR GRF_CLR_BIT(3) +#define RV1108_GMAC_SPEED_10M GRF_CLR_BIT(2) +#define RV1108_GMAC_SPEED_100M GRF_BIT(2) +#define RV1108_GMAC_RMII_CLK_25M GRF_BIT(7) +#define RV1108_GMAC_RMII_CLK_2_5M GRF_CLR_BIT(7) + +static void rv1108_set_to_rmii(struct rk_priv_data *bsp_priv) +{ + struct device *dev = &bsp_priv->pdev->dev; + + if (IS_ERR(bsp_priv->grf)) { + dev_err(dev, "%s: Missing rockchip,grf property\n", __func__); + return; + } + + regmap_write(bsp_priv->grf, RV1108_GRF_GMAC_CON0, + RV1108_GMAC_PHY_INTF_SEL_RMII); +} + +static void rv1108_set_rmii_speed(struct rk_priv_data *bsp_priv, int speed) +{ + struct device *dev = &bsp_priv->pdev->dev; + + if (IS_ERR(bsp_priv->grf)) { + dev_err(dev, "%s: Missing rockchip,grf property\n", __func__); + return; + } + + if (speed == 10) { + regmap_write(bsp_priv->grf, RV1108_GRF_GMAC_CON0, + RV1108_GMAC_RMII_CLK_2_5M | + RV1108_GMAC_SPEED_10M); + } else if (speed == 100) { + regmap_write(bsp_priv->grf, RV1108_GRF_GMAC_CON0, + RV1108_GMAC_RMII_CLK_25M | + RV1108_GMAC_SPEED_100M); + } else { + dev_err(dev, "unknown speed value for RMII! speed=%d", speed); + } +} + +static const struct rk_gmac_ops rv1108_ops = { + .set_to_rmii = rv1108_set_to_rmii, + .set_rmii_speed = rv1108_set_rmii_speed, +}; + #define RK_GRF_MACPHY_CON0 0xb00 #define RK_GRF_MACPHY_CON1 0xb04 #define RK_GRF_MACPHY_CON2 0xb08 @@ -1267,6 +1319,7 @@ static const struct of_device_id rk_gmac_dwmac_match[] = { { .compatible = "rockchip,rk3366-gmac", .data = &rk3366_ops }, { .compatible = "rockchip,rk3368-gmac", .data = &rk3368_ops }, { .compatible = "rockchip,rk3399-gmac", .data = &rk3399_ops }, + { .compatible = "rockchip,rv1108-gmac", .data = &rv1108_ops }, { } }; MODULE_DEVICE_TABLE(of, rk_gmac_dwmac_match); -- cgit v1.2.3 From 51ba902a16e68b786028db8b0482f3a5f22e7d4f Mon Sep 17 00:00:00 2001 From: Aviad Krawczyk Date: Mon, 21 Aug 2017 23:55:47 +0800 Subject: net-next/hinic: Initialize hw interface Initialize hw interface as part of the nic initialization for accessing hw. Signed-off-by: Aviad Krawczyk Signed-off-by: Zhao Chen Signed-off-by: David S. Miller --- Documentation/networking/hinic.txt | 125 ++++++++++++++ drivers/net/ethernet/Kconfig | 1 + drivers/net/ethernet/Makefile | 1 + drivers/net/ethernet/huawei/Kconfig | 19 +++ drivers/net/ethernet/huawei/Makefile | 5 + drivers/net/ethernet/huawei/hinic/Kconfig | 13 ++ drivers/net/ethernet/huawei/hinic/Makefile | 3 + drivers/net/ethernet/huawei/hinic/hinic_dev.h | 33 ++++ drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h | 36 ++++ drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c | 201 ++++++++++++++++++++++ drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h | 42 +++++ drivers/net/ethernet/huawei/hinic/hinic_hw_if.c | 208 +++++++++++++++++++++++ drivers/net/ethernet/huawei/hinic/hinic_hw_if.h | 160 +++++++++++++++++ drivers/net/ethernet/huawei/hinic/hinic_main.c | 195 +++++++++++++++++++++ 14 files changed, 1042 insertions(+) create mode 100644 Documentation/networking/hinic.txt create mode 100644 drivers/net/ethernet/huawei/Kconfig create mode 100644 drivers/net/ethernet/huawei/Makefile create mode 100644 drivers/net/ethernet/huawei/hinic/Kconfig create mode 100644 drivers/net/ethernet/huawei/hinic/Makefile create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_dev.h create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_if.c create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_if.h create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_main.c (limited to 'Documentation') diff --git a/Documentation/networking/hinic.txt b/Documentation/networking/hinic.txt new file mode 100644 index 000000000000..989366a4039c --- /dev/null +++ b/Documentation/networking/hinic.txt @@ -0,0 +1,125 @@ +Linux Kernel Driver for Huawei Intelligent NIC(HiNIC) family +============================================================ + +Overview: +========= +HiNIC is a network interface card for the Data Center Area. + +The driver supports a range of link-speed devices (10GbE, 25GbE, 40GbE, etc.). +The driver supports also a negotiated and extendable feature set. + +Some HiNIC devices support SR-IOV. This driver is used for Physical Function +(PF). + +HiNIC devices support MSI-X interrupt vector for each Tx/Rx queue and +adaptive interrupt moderation. + +HiNIC devices support also various offload features such as checksum offload, +TCP Transmit Segmentation Offload(TSO), Receive-Side Scaling(RSS) and +LRO(Large Receive Offload). + + +Supported PCI vendor ID/device IDs: +=================================== + +19e5:1822 - HiNIC PF + + +Driver Architecture and Source Code: +==================================== + +hinic_dev - Implement a Logical Network device that is independent from +specific HW details about HW data structure formats. + +hinic_hwdev - Implement the HW details of the device and include the components +for accessing the PCI NIC. + +hinic_hwdev contains the following components: +=============================================== + +HW Interface: +============= + +The interface for accessing the pci device (DMA memory and PCI BARs). +(hinic_hw_if.c, hinic_hw_if.h) + +Configuration Status Registers Area that describes the HW Registers on the +configuration and status BAR0. (hinic_hw_csr.h) + +MGMT components: +================ + +Asynchronous Event Queues(AEQs) - The event queues for receiving messages from +the MGMT modules on the cards. (hinic_hw_eqs.c, hinic_hw_eqs.h) + +Application Programmable Interface commands(API CMD) - Interface for sending +MGMT commands to the card. (hinic_hw_api_cmd.c, hinic_hw_api_cmd.h) + +Management (MGMT) - the PF to MGMT channel that uses API CMD for sending MGMT +commands to the card and receives notifications from the MGMT modules on the +card by AEQs. Also set the addresses of the IO CMDQs in HW. +(hinic_hw_mgmt.c, hinic_hw_mgmt.h) + +IO components: +============== + +Completion Event Queues(CEQs) - The completion Event Queues that describe IO +tasks that are finished. (hinic_hw_eqs.c, hinic_hw_eqs.h) + +Work Queues(WQ) - Contain the memory and operations for use by CMD queues and +the Queue Pairs. The WQ is a Memory Block in a Page. The Block contains +pointers to Memory Areas that are the Memory for the Work Queue Elements(WQEs). +(hinic_hw_wq.c, hinic_hw_wq.h) + +Command Queues(CMDQ) - The queues for sending commands for IO management and is +used to set the QPs addresses in HW. The commands completion events are +accumulated on the CEQ that is configured to receive the CMDQ completion events. +(hinic_hw_cmdq.c, hinic_hw_cmdq.h) + +Queue Pairs(QPs) - The HW Receive and Send queues for Receiving and Transmitting +Data. (hinic_hw_qp.c, hinic_hw_qp.h, hinic_hw_qp_ctxt.h) + +IO - de/constructs all the IO components. (hinic_hw_io.c, hinic_hw_io.h) + +HW device: +========== + +HW device - de/constructs the HW Interface, the MGMT components on the +initialization of the driver and the IO components on the case of Interface +UP/DOWN Events. (hinic_hw_dev.c, hinic_hw_dev.h) + + +hinic_dev contains the following components: +=============================================== + +PCI ID table - Contains the supported PCI Vendor/Device IDs. +(hinic_pci_tbl.h) + +Port Commands - Send commands to the HW device for port management +(MAC, Vlan, MTU, ...). (hinic_port.c, hinic_port.h) + +Tx Queues - Logical Tx Queues that use the HW Send Queues for transmit. +The Logical Tx queue is not dependent on the format of the HW Send Queue. +(hinic_tx.c, hinic_tx.h) + +Rx Queues - Logical Rx Queues that use the HW Receive Queues for receive. +The Logical Rx queue is not dependent on the format of the HW Receive Queue. +(hinic_rx.c, hinic_rx.h) + +hinic_dev - de/constructs the Logical Tx and Rx Queues. +(hinic_main.c, hinic_dev.h) + + +Miscellaneous: +============= + +Common functions that are used by HW and Logical Device. +(hinic_common.c, hinic_common.h) + + +Support +======= + +If an issue is identified with the released source code on the supported kernel +with a supported adapter, email the specific information related to the issue to +aviad.krawczyk@huawei.com. diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig index edae15ac0e98..c60421339a98 100644 --- a/drivers/net/ethernet/Kconfig +++ b/drivers/net/ethernet/Kconfig @@ -78,6 +78,7 @@ source "drivers/net/ethernet/freescale/Kconfig" source "drivers/net/ethernet/fujitsu/Kconfig" source "drivers/net/ethernet/hisilicon/Kconfig" source "drivers/net/ethernet/hp/Kconfig" +source "drivers/net/ethernet/huawei/Kconfig" source "drivers/net/ethernet/ibm/Kconfig" source "drivers/net/ethernet/intel/Kconfig" source "drivers/net/ethernet/i825xx/Kconfig" diff --git a/drivers/net/ethernet/Makefile b/drivers/net/ethernet/Makefile index bf7f4502cabc..a0a03d4d939a 100644 --- a/drivers/net/ethernet/Makefile +++ b/drivers/net/ethernet/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_NET_VENDOR_FREESCALE) += freescale/ obj-$(CONFIG_NET_VENDOR_FUJITSU) += fujitsu/ obj-$(CONFIG_NET_VENDOR_HISILICON) += hisilicon/ obj-$(CONFIG_NET_VENDOR_HP) += hp/ +obj-$(CONFIG_NET_VENDOR_HUAWEI) += huawei/ obj-$(CONFIG_NET_VENDOR_IBM) += ibm/ obj-$(CONFIG_NET_VENDOR_INTEL) += intel/ obj-$(CONFIG_NET_VENDOR_I825XX) += i825xx/ diff --git a/drivers/net/ethernet/huawei/Kconfig b/drivers/net/ethernet/huawei/Kconfig new file mode 100644 index 000000000000..c1a95ae4058b --- /dev/null +++ b/drivers/net/ethernet/huawei/Kconfig @@ -0,0 +1,19 @@ +# +# Huawei driver configuration +# + +config NET_VENDOR_HUAWEI + bool "Huawei devices" + default y + ---help--- + If you have a network (Ethernet) card belonging to this class, say Y. + Note that the answer to this question doesn't directly affect the + kernel: saying N will just cause the configurator to skip all + the questions about Huawei cards. If you say Y, you will be asked + for your specific card in the following questions. + +if NET_VENDOR_HUAWEI + +source "drivers/net/ethernet/huawei/hinic/Kconfig" + +endif # NET_VENDOR_HUAWEI diff --git a/drivers/net/ethernet/huawei/Makefile b/drivers/net/ethernet/huawei/Makefile new file mode 100644 index 000000000000..5c37cc8fc1bc --- /dev/null +++ b/drivers/net/ethernet/huawei/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the Huawei device drivers. +# + +obj-$(CONFIG_HINIC) += hinic/ diff --git a/drivers/net/ethernet/huawei/hinic/Kconfig b/drivers/net/ethernet/huawei/hinic/Kconfig new file mode 100644 index 000000000000..69f2b1fba48d --- /dev/null +++ b/drivers/net/ethernet/huawei/hinic/Kconfig @@ -0,0 +1,13 @@ +# +# Huawei driver configuration +# + +config HINIC + tristate "Huawei Intelligent PCIE Network Interface Card" + depends on (PCI_MSI && X86) + default m + ---help--- + This driver supports HiNIC PCIE Ethernet cards. + To compile this driver as part of the kernel, choose Y here. + If unsure, choose N. + The default is compiled as module. diff --git a/drivers/net/ethernet/huawei/hinic/Makefile b/drivers/net/ethernet/huawei/hinic/Makefile new file mode 100644 index 000000000000..353cee03cf2d --- /dev/null +++ b/drivers/net/ethernet/huawei/hinic/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_HINIC) += hinic.o + +hinic-y := hinic_main.o hinic_hw_dev.o hinic_hw_if.o diff --git a/drivers/net/ethernet/huawei/hinic/hinic_dev.h b/drivers/net/ethernet/huawei/hinic/hinic_dev.h new file mode 100644 index 000000000000..6c2c896015a5 --- /dev/null +++ b/drivers/net/ethernet/huawei/hinic/hinic_dev.h @@ -0,0 +1,33 @@ +/* + * Huawei HiNIC PCI Express Linux driver + * Copyright(c) 2017 Huawei Technologies Co., Ltd + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + */ + +#ifndef HINIC_DEV_H +#define HINIC_DEV_H + +#include +#include + +#include "hinic_hw_dev.h" + +#define HINIC_DRV_NAME "hinic" + +struct hinic_dev { + struct net_device *netdev; + struct hinic_hwdev *hwdev; + + u32 msg_enable; +}; + +#endif diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h b/drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h new file mode 100644 index 000000000000..c3440a9f5a1e --- /dev/null +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h @@ -0,0 +1,36 @@ +/* + * Huawei HiNIC PCI Express Linux driver + * Copyright(c) 2017 Huawei Technologies Co., Ltd + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + */ + +#ifndef HINIC_HW_CSR_H +#define HINIC_HW_CSR_H + +/* HW interface registers */ +#define HINIC_CSR_FUNC_ATTR0_ADDR 0x0 +#define HINIC_CSR_FUNC_ATTR1_ADDR 0x4 + +#define HINIC_DMA_ATTR_BASE 0xC80 +#define HINIC_ELECTION_BASE 0x4200 + +#define HINIC_DMA_ATTR_STRIDE 0x4 +#define HINIC_CSR_DMA_ATTR_ADDR(idx) \ + (HINIC_DMA_ATTR_BASE + (idx) * HINIC_DMA_ATTR_STRIDE) + +#define HINIC_PPF_ELECTION_STRIDE 0x4 +#define HINIC_CSR_MAX_PORTS 4 + +#define HINIC_CSR_PPF_ELECTION_ADDR(idx) \ + (HINIC_ELECTION_BASE + (idx) * HINIC_PPF_ELECTION_STRIDE) + +#endif diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c new file mode 100644 index 000000000000..f681846e51d5 --- /dev/null +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c @@ -0,0 +1,201 @@ +/* + * Huawei HiNIC PCI Express Linux driver + * Copyright(c) 2017 Huawei Technologies Co., Ltd + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hinic_hw_if.h" +#include "hinic_hw_dev.h" + +#define MAX_IRQS(max_qps, num_aeqs, num_ceqs) \ + (2 * (max_qps) + (num_aeqs) + (num_ceqs)) + +/** + * init_msix - enable the msix and save the entries + * @hwdev: the NIC HW device + * + * Return 0 - Success, negative - Failure + **/ +static int init_msix(struct hinic_hwdev *hwdev) +{ + struct hinic_hwif *hwif = hwdev->hwif; + struct pci_dev *pdev = hwif->pdev; + int nr_irqs, num_aeqs, num_ceqs; + size_t msix_entries_size; + int i, err; + + num_aeqs = HINIC_HWIF_NUM_AEQS(hwif); + num_ceqs = HINIC_HWIF_NUM_CEQS(hwif); + nr_irqs = MAX_IRQS(HINIC_MAX_QPS, num_aeqs, num_ceqs); + if (nr_irqs > HINIC_HWIF_NUM_IRQS(hwif)) + nr_irqs = HINIC_HWIF_NUM_IRQS(hwif); + + msix_entries_size = nr_irqs * sizeof(*hwdev->msix_entries); + hwdev->msix_entries = devm_kzalloc(&pdev->dev, msix_entries_size, + GFP_KERNEL); + if (!hwdev->msix_entries) + return -ENOMEM; + + for (i = 0; i < nr_irqs; i++) + hwdev->msix_entries[i].entry = i; + + err = pci_enable_msix_exact(pdev, hwdev->msix_entries, nr_irqs); + if (err) { + dev_err(&pdev->dev, "Failed to enable pci msix\n"); + return err; + } + + return 0; +} + +/** + * disable_msix - disable the msix + * @hwdev: the NIC HW device + **/ +static void disable_msix(struct hinic_hwdev *hwdev) +{ + struct hinic_hwif *hwif = hwdev->hwif; + struct pci_dev *pdev = hwif->pdev; + + pci_disable_msix(pdev); +} + +/** + * init_pfhwdev - Initialize the extended components of PF + * @pfhwdev: the HW device for PF + * + * Return 0 - success, negative - failure + **/ +static int init_pfhwdev(struct hinic_pfhwdev *pfhwdev) +{ + /* Initialize PF HW device extended components */ + return 0; +} + +/** + * free_pfhwdev - Free the extended components of PF + * @pfhwdev: the HW device for PF + **/ +static void free_pfhwdev(struct hinic_pfhwdev *pfhwdev) +{ +} + +/** + * hinic_init_hwdev - Initialize the NIC HW + * @pdev: the NIC pci device + * + * Return initialized NIC HW device + * + * Initialize the NIC HW device and return a pointer to it + **/ +struct hinic_hwdev *hinic_init_hwdev(struct pci_dev *pdev) +{ + struct hinic_pfhwdev *pfhwdev; + struct hinic_hwdev *hwdev; + struct hinic_hwif *hwif; + int err; + + hwif = devm_kzalloc(&pdev->dev, sizeof(*hwif), GFP_KERNEL); + if (!hwif) + return ERR_PTR(-ENOMEM); + + err = hinic_init_hwif(hwif, pdev); + if (err) { + dev_err(&pdev->dev, "Failed to init HW interface\n"); + return ERR_PTR(err); + } + + if (!HINIC_IS_PF(hwif) && !HINIC_IS_PPF(hwif)) { + dev_err(&pdev->dev, "Unsupported PCI Function type\n"); + err = -EFAULT; + goto err_func_type; + } + + pfhwdev = devm_kzalloc(&pdev->dev, sizeof(*pfhwdev), GFP_KERNEL); + if (!pfhwdev) { + err = -ENOMEM; + goto err_pfhwdev_alloc; + } + + hwdev = &pfhwdev->hwdev; + hwdev->hwif = hwif; + + err = init_msix(hwdev); + if (err) { + dev_err(&pdev->dev, "Failed to init msix\n"); + goto err_init_msix; + } + + err = init_pfhwdev(pfhwdev); + if (err) { + dev_err(&pdev->dev, "Failed to init PF HW device\n"); + goto err_init_pfhwdev; + } + + return hwdev; + +err_init_pfhwdev: + disable_msix(hwdev); + +err_init_msix: +err_pfhwdev_alloc: +err_func_type: + hinic_free_hwif(hwif); + return ERR_PTR(err); +} + +/** + * hinic_free_hwdev - Free the NIC HW device + * @hwdev: the NIC HW device + **/ +void hinic_free_hwdev(struct hinic_hwdev *hwdev) +{ + struct hinic_pfhwdev *pfhwdev = container_of(hwdev, + struct hinic_pfhwdev, + hwdev); + + free_pfhwdev(pfhwdev); + + disable_msix(hwdev); + + hinic_free_hwif(hwdev->hwif); +} + +/** + * hinic_hwdev_num_qps - return the number QPs available for use + * @hwdev: the NIC HW device + * + * Return number QPs available for use + **/ +int hinic_hwdev_num_qps(struct hinic_hwdev *hwdev) +{ + int num_aeqs, num_ceqs, nr_irqs, num_qps; + + num_aeqs = HINIC_HWIF_NUM_AEQS(hwdev->hwif); + num_ceqs = HINIC_HWIF_NUM_CEQS(hwdev->hwif); + nr_irqs = HINIC_HWIF_NUM_IRQS(hwdev->hwif); + + /* Each QP has its own (SQ + RQ) interrupt */ + num_qps = (nr_irqs - (num_aeqs + num_ceqs)) / 2; + + /* num_qps must be power of 2 */ + return BIT(fls(num_qps) - 1); +} diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h new file mode 100644 index 000000000000..b42e0ebdd97b --- /dev/null +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h @@ -0,0 +1,42 @@ +/* + * Huawei HiNIC PCI Express Linux driver + * Copyright(c) 2017 Huawei Technologies Co., Ltd + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + */ + +#ifndef HINIC_HW_DEV_H +#define HINIC_HW_DEV_H + +#include + +#include "hinic_hw_if.h" + +#define HINIC_MAX_QPS 32 + +struct hinic_hwdev { + struct hinic_hwif *hwif; + struct msix_entry *msix_entries; +}; + +struct hinic_pfhwdev { + struct hinic_hwdev hwdev; + + /* PF Extended components should be here */ +}; + +struct hinic_hwdev *hinic_init_hwdev(struct pci_dev *pdev); + +void hinic_free_hwdev(struct hinic_hwdev *hwdev); + +int hinic_hwdev_num_qps(struct hinic_hwdev *hwdev); + +#endif diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_if.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_if.c new file mode 100644 index 000000000000..edf184242172 --- /dev/null +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_if.c @@ -0,0 +1,208 @@ +/* + * Huawei HiNIC PCI Express Linux driver + * Copyright(c) 2017 Huawei Technologies Co., Ltd + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + */ + +#include +#include +#include +#include +#include +#include + +#include "hinic_hw_csr.h" +#include "hinic_hw_if.h" + +#define PCIE_ATTR_ENTRY 0 + +/** + * hwif_ready - test if the HW is ready for use + * @hwif: the HW interface of a pci function device + * + * Return 0 - Success, negative - Failure + **/ +static int hwif_ready(struct hinic_hwif *hwif) +{ + struct pci_dev *pdev = hwif->pdev; + u32 addr, attr1; + + addr = HINIC_CSR_FUNC_ATTR1_ADDR; + attr1 = hinic_hwif_read_reg(hwif, addr); + + if (!HINIC_FA1_GET(attr1, INIT_STATUS)) { + dev_err(&pdev->dev, "hwif status is not ready\n"); + return -EFAULT; + } + + return 0; +} + +/** + * set_hwif_attr - set the attributes in the relevant members in hwif + * @hwif: the HW interface of a pci function device + * @attr0: the first attribute that was read from the hw + * @attr1: the second attribute that was read from the hw + **/ +static void set_hwif_attr(struct hinic_hwif *hwif, u32 attr0, u32 attr1) +{ + hwif->attr.func_idx = HINIC_FA0_GET(attr0, FUNC_IDX); + hwif->attr.pf_idx = HINIC_FA0_GET(attr0, PF_IDX); + hwif->attr.pci_intf_idx = HINIC_FA0_GET(attr0, PCI_INTF_IDX); + hwif->attr.func_type = HINIC_FA0_GET(attr0, FUNC_TYPE); + + hwif->attr.num_aeqs = BIT(HINIC_FA1_GET(attr1, AEQS_PER_FUNC)); + hwif->attr.num_ceqs = BIT(HINIC_FA1_GET(attr1, CEQS_PER_FUNC)); + hwif->attr.num_irqs = BIT(HINIC_FA1_GET(attr1, IRQS_PER_FUNC)); + hwif->attr.num_dma_attr = BIT(HINIC_FA1_GET(attr1, DMA_ATTR_PER_FUNC)); +} + +/** + * read_hwif_attr - read the attributes and set members in hwif + * @hwif: the HW interface of a pci function device + **/ +static void read_hwif_attr(struct hinic_hwif *hwif) +{ + u32 addr, attr0, attr1; + + addr = HINIC_CSR_FUNC_ATTR0_ADDR; + attr0 = hinic_hwif_read_reg(hwif, addr); + + addr = HINIC_CSR_FUNC_ATTR1_ADDR; + attr1 = hinic_hwif_read_reg(hwif, addr); + + set_hwif_attr(hwif, attr0, attr1); +} + +/** + * set_ppf - try to set hwif as ppf and set the type of hwif in this case + * @hwif: the HW interface of a pci function device + **/ +static void set_ppf(struct hinic_hwif *hwif) +{ + struct hinic_func_attr *attr = &hwif->attr; + u32 addr, val, ppf_election; + + /* Read Modify Write */ + addr = HINIC_CSR_PPF_ELECTION_ADDR(HINIC_HWIF_PCI_INTF(hwif)); + + val = hinic_hwif_read_reg(hwif, addr); + val = HINIC_PPF_ELECTION_CLEAR(val, IDX); + + ppf_election = HINIC_PPF_ELECTION_SET(HINIC_HWIF_FUNC_IDX(hwif), IDX); + + val |= ppf_election; + hinic_hwif_write_reg(hwif, addr, val); + + /* check PPF */ + val = hinic_hwif_read_reg(hwif, addr); + + attr->ppf_idx = HINIC_PPF_ELECTION_GET(val, IDX); + if (attr->ppf_idx == HINIC_HWIF_FUNC_IDX(hwif)) + attr->func_type = HINIC_PPF; +} + +/** + * set_dma_attr - set the dma attributes in the HW + * @hwif: the HW interface of a pci function device + * @entry_idx: the entry index in the dma table + * @st: PCIE TLP steering tag + * @at: PCIE TLP AT field + * @ph: PCIE TLP Processing Hint field + * @no_snooping: PCIE TLP No snooping + * @tph_en: PCIE TLP Processing Hint Enable + **/ +static void set_dma_attr(struct hinic_hwif *hwif, u32 entry_idx, + u8 st, u8 at, u8 ph, + enum hinic_pcie_nosnoop no_snooping, + enum hinic_pcie_tph tph_en) +{ + u32 addr, val, dma_attr_entry; + + /* Read Modify Write */ + addr = HINIC_CSR_DMA_ATTR_ADDR(entry_idx); + + val = hinic_hwif_read_reg(hwif, addr); + val = HINIC_DMA_ATTR_CLEAR(val, ST) & + HINIC_DMA_ATTR_CLEAR(val, AT) & + HINIC_DMA_ATTR_CLEAR(val, PH) & + HINIC_DMA_ATTR_CLEAR(val, NO_SNOOPING) & + HINIC_DMA_ATTR_CLEAR(val, TPH_EN); + + dma_attr_entry = HINIC_DMA_ATTR_SET(st, ST) | + HINIC_DMA_ATTR_SET(at, AT) | + HINIC_DMA_ATTR_SET(ph, PH) | + HINIC_DMA_ATTR_SET(no_snooping, NO_SNOOPING) | + HINIC_DMA_ATTR_SET(tph_en, TPH_EN); + + val |= dma_attr_entry; + hinic_hwif_write_reg(hwif, addr, val); +} + +/** + * dma_attr_table_init - initialize the the default dma attributes + * @hwif: the HW interface of a pci function device + **/ +static void dma_attr_init(struct hinic_hwif *hwif) +{ + set_dma_attr(hwif, PCIE_ATTR_ENTRY, HINIC_PCIE_ST_DISABLE, + HINIC_PCIE_AT_DISABLE, HINIC_PCIE_PH_DISABLE, + HINIC_PCIE_SNOOP, HINIC_PCIE_TPH_DISABLE); +} + +/** + * hinic_init_hwif - initialize the hw interface + * @hwif: the HW interface of a pci function device + * @pdev: the pci device for acessing PCI resources + * + * Return 0 - Success, negative - Failure + **/ +int hinic_init_hwif(struct hinic_hwif *hwif, struct pci_dev *pdev) +{ + int err; + + hwif->pdev = pdev; + + hwif->cfg_regs_bar = pci_ioremap_bar(pdev, HINIC_PCI_CFG_REGS_BAR); + if (!hwif->cfg_regs_bar) { + dev_err(&pdev->dev, "Failed to map configuration regs\n"); + return -ENOMEM; + } + + err = hwif_ready(hwif); + if (err) { + dev_err(&pdev->dev, "HW interface is not ready\n"); + goto err_hwif_ready; + } + + read_hwif_attr(hwif); + + if (HINIC_IS_PF(hwif)) + set_ppf(hwif); + + /* No transactionss before DMA is initialized */ + dma_attr_init(hwif); + return 0; + +err_hwif_ready: + iounmap(hwif->cfg_regs_bar); + return err; +} + +/** + * hinic_free_hwif - free the HW interface + * @hwif: the HW interface of a pci function device + **/ +void hinic_free_hwif(struct hinic_hwif *hwif) +{ + iounmap(hwif->cfg_regs_bar); +} diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_if.h b/drivers/net/ethernet/huawei/hinic/hinic_hw_if.h new file mode 100644 index 000000000000..d1a8fa2bc3ee --- /dev/null +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_if.h @@ -0,0 +1,160 @@ +/* + * Huawei HiNIC PCI Express Linux driver + * Copyright(c) 2017 Huawei Technologies Co., Ltd + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + */ + +#ifndef HINIC_HW_IF_H +#define HINIC_HW_IF_H + +#include +#include +#include +#include + +#define HINIC_DMA_ATTR_ST_SHIFT 0 +#define HINIC_DMA_ATTR_AT_SHIFT 8 +#define HINIC_DMA_ATTR_PH_SHIFT 10 +#define HINIC_DMA_ATTR_NO_SNOOPING_SHIFT 12 +#define HINIC_DMA_ATTR_TPH_EN_SHIFT 13 + +#define HINIC_DMA_ATTR_ST_MASK 0xFF +#define HINIC_DMA_ATTR_AT_MASK 0x3 +#define HINIC_DMA_ATTR_PH_MASK 0x3 +#define HINIC_DMA_ATTR_NO_SNOOPING_MASK 0x1 +#define HINIC_DMA_ATTR_TPH_EN_MASK 0x1 + +#define HINIC_DMA_ATTR_SET(val, member) \ + (((u32)(val) & HINIC_DMA_ATTR_##member##_MASK) << \ + HINIC_DMA_ATTR_##member##_SHIFT) + +#define HINIC_DMA_ATTR_CLEAR(val, member) \ + ((val) & (~(HINIC_DMA_ATTR_##member##_MASK \ + << HINIC_DMA_ATTR_##member##_SHIFT))) + +#define HINIC_FA0_FUNC_IDX_SHIFT 0 +#define HINIC_FA0_PF_IDX_SHIFT 10 +#define HINIC_FA0_PCI_INTF_IDX_SHIFT 14 +/* reserved members - off 16 */ +#define HINIC_FA0_FUNC_TYPE_SHIFT 24 + +#define HINIC_FA0_FUNC_IDX_MASK 0x3FF +#define HINIC_FA0_PF_IDX_MASK 0xF +#define HINIC_FA0_PCI_INTF_IDX_MASK 0x3 +#define HINIC_FA0_FUNC_TYPE_MASK 0x1 + +#define HINIC_FA0_GET(val, member) \ + (((val) >> HINIC_FA0_##member##_SHIFT) & HINIC_FA0_##member##_MASK) + +#define HINIC_FA1_AEQS_PER_FUNC_SHIFT 8 +/* reserved members - off 10 */ +#define HINIC_FA1_CEQS_PER_FUNC_SHIFT 12 +/* reserved members - off 15 */ +#define HINIC_FA1_IRQS_PER_FUNC_SHIFT 20 +#define HINIC_FA1_DMA_ATTR_PER_FUNC_SHIFT 24 +/* reserved members - off 27 */ +#define HINIC_FA1_INIT_STATUS_SHIFT 30 + +#define HINIC_FA1_AEQS_PER_FUNC_MASK 0x3 +#define HINIC_FA1_CEQS_PER_FUNC_MASK 0x7 +#define HINIC_FA1_IRQS_PER_FUNC_MASK 0xF +#define HINIC_FA1_DMA_ATTR_PER_FUNC_MASK 0x7 +#define HINIC_FA1_INIT_STATUS_MASK 0x1 + +#define HINIC_FA1_GET(val, member) \ + (((val) >> HINIC_FA1_##member##_SHIFT) & HINIC_FA1_##member##_MASK) + +#define HINIC_PPF_ELECTION_IDX_SHIFT 0 +#define HINIC_PPF_ELECTION_IDX_MASK 0x1F + +#define HINIC_PPF_ELECTION_SET(val, member) \ + (((u32)(val) & HINIC_PPF_ELECTION_##member##_MASK) << \ + HINIC_PPF_ELECTION_##member##_SHIFT) + +#define HINIC_PPF_ELECTION_GET(val, member) \ + (((val) >> HINIC_PPF_ELECTION_##member##_SHIFT) & \ + HINIC_PPF_ELECTION_##member##_MASK) + +#define HINIC_PPF_ELECTION_CLEAR(val, member) \ + ((val) & (~(HINIC_PPF_ELECTION_##member##_MASK \ + << HINIC_PPF_ELECTION_##member##_SHIFT))) + +#define HINIC_HWIF_NUM_AEQS(hwif) ((hwif)->attr.num_aeqs) +#define HINIC_HWIF_NUM_CEQS(hwif) ((hwif)->attr.num_ceqs) +#define HINIC_HWIF_NUM_IRQS(hwif) ((hwif)->attr.num_irqs) +#define HINIC_HWIF_FUNC_IDX(hwif) ((hwif)->attr.func_idx) +#define HINIC_HWIF_PCI_INTF(hwif) ((hwif)->attr.pci_intf_idx) + +#define HINIC_FUNC_TYPE(hwif) ((hwif)->attr.func_type) +#define HINIC_IS_PF(hwif) (HINIC_FUNC_TYPE(hwif) == HINIC_PF) +#define HINIC_IS_PPF(hwif) (HINIC_FUNC_TYPE(hwif) == HINIC_PPF) + +#define HINIC_PCI_CFG_REGS_BAR 0 + +#define HINIC_PCIE_ST_DISABLE 0 +#define HINIC_PCIE_AT_DISABLE 0 +#define HINIC_PCIE_PH_DISABLE 0 + +enum hinic_pcie_nosnoop { + HINIC_PCIE_SNOOP = 0, + HINIC_PCIE_NO_SNOOP = 1, +}; + +enum hinic_pcie_tph { + HINIC_PCIE_TPH_DISABLE = 0, + HINIC_PCIE_TPH_ENABLE = 1, +}; + +enum hinic_func_type { + HINIC_PF = 0, + HINIC_PPF = 2, +}; + +struct hinic_func_attr { + u16 func_idx; + u8 pf_idx; + u8 pci_intf_idx; + + enum hinic_func_type func_type; + + u8 ppf_idx; + + u16 num_irqs; + u8 num_aeqs; + u8 num_ceqs; + + u8 num_dma_attr; +}; + +struct hinic_hwif { + struct pci_dev *pdev; + void __iomem *cfg_regs_bar; + + struct hinic_func_attr attr; +}; + +static inline u32 hinic_hwif_read_reg(struct hinic_hwif *hwif, u32 reg) +{ + return be32_to_cpu(readl(hwif->cfg_regs_bar + reg)); +} + +static inline void hinic_hwif_write_reg(struct hinic_hwif *hwif, u32 reg, + u32 val) +{ + writel(cpu_to_be32(val), hwif->cfg_regs_bar + reg); +} + +int hinic_init_hwif(struct hinic_hwif *hwif, struct pci_dev *pdev); + +void hinic_free_hwif(struct hinic_hwif *hwif); + +#endif diff --git a/drivers/net/ethernet/huawei/hinic/hinic_main.c b/drivers/net/ethernet/huawei/hinic/hinic_main.c new file mode 100644 index 000000000000..1d7aed07b25f --- /dev/null +++ b/drivers/net/ethernet/huawei/hinic/hinic_main.c @@ -0,0 +1,195 @@ +/* + * Huawei HiNIC PCI Express Linux driver + * Copyright(c) 2017 Huawei Technologies Co., Ltd + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hinic_hw_dev.h" +#include "hinic_dev.h" + +MODULE_AUTHOR("Huawei Technologies CO., Ltd"); +MODULE_DESCRIPTION("Huawei Intelligent NIC driver"); +MODULE_LICENSE("GPL"); + +#define PCI_DEVICE_ID_HI1822_PF 0x1822 + +#define MSG_ENABLE_DEFAULT (NETIF_MSG_DRV | NETIF_MSG_PROBE | \ + NETIF_MSG_IFUP | \ + NETIF_MSG_TX_ERR | NETIF_MSG_RX_ERR) + +static const struct net_device_ops hinic_netdev_ops = { + /* Operations are empty, should be filled */ +}; + +/** + * nic_dev_init - Initialize the NIC device + * @pdev: the NIC pci device + * + * Return 0 - Success, negative - Failure + **/ +static int nic_dev_init(struct pci_dev *pdev) +{ + struct hinic_dev *nic_dev; + struct net_device *netdev; + struct hinic_hwdev *hwdev; + int err, num_qps; + + hwdev = hinic_init_hwdev(pdev); + if (IS_ERR(hwdev)) { + dev_err(&pdev->dev, "Failed to initialize HW device\n"); + return PTR_ERR(hwdev); + } + + num_qps = hinic_hwdev_num_qps(hwdev); + if (num_qps <= 0) { + dev_err(&pdev->dev, "Invalid number of QPS\n"); + err = -EINVAL; + goto err_num_qps; + } + + netdev = alloc_etherdev_mq(sizeof(*nic_dev), num_qps); + if (!netdev) { + dev_err(&pdev->dev, "Failed to allocate Ethernet device\n"); + err = -ENOMEM; + goto err_alloc_etherdev; + } + + netdev->netdev_ops = &hinic_netdev_ops; + + nic_dev = netdev_priv(netdev); + nic_dev->netdev = netdev; + nic_dev->hwdev = hwdev; + nic_dev->msg_enable = MSG_ENABLE_DEFAULT; + + pci_set_drvdata(pdev, netdev); + + netif_carrier_off(netdev); + + err = register_netdev(netdev); + if (err) { + dev_err(&pdev->dev, "Failed to register netdev\n"); + goto err_reg_netdev; + } + + return 0; + +err_reg_netdev: + pci_set_drvdata(pdev, NULL); + free_netdev(netdev); + +err_alloc_etherdev: +err_num_qps: + hinic_free_hwdev(hwdev); + return err; +} + +static int hinic_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + int err = pci_enable_device(pdev); + + if (err) { + dev_err(&pdev->dev, "Failed to enable PCI device\n"); + return err; + } + + err = pci_request_regions(pdev, HINIC_DRV_NAME); + if (err) { + dev_err(&pdev->dev, "Failed to request PCI regions\n"); + goto err_pci_regions; + } + + pci_set_master(pdev); + + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, "Couldn't set 64-bit DMA mask\n"); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Failed to set DMA mask\n"); + goto err_dma_mask; + } + } + + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, + "Couldn't set 64-bit consistent DMA mask\n"); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, + "Failed to set consistent DMA mask\n"); + goto err_dma_consistent_mask; + } + } + + err = nic_dev_init(pdev); + if (err) { + dev_err(&pdev->dev, "Failed to initialize NIC device\n"); + goto err_nic_dev_init; + } + + dev_info(&pdev->dev, "HiNIC driver - probed\n"); + return 0; + +err_nic_dev_init: +err_dma_consistent_mask: +err_dma_mask: + pci_release_regions(pdev); + +err_pci_regions: + pci_disable_device(pdev); + return err; +} + +static void hinic_remove(struct pci_dev *pdev) +{ + struct net_device *netdev = pci_get_drvdata(pdev); + struct hinic_dev *nic_dev = netdev_priv(netdev); + + unregister_netdev(netdev); + + pci_set_drvdata(pdev, NULL); + + hinic_free_hwdev(nic_dev->hwdev); + + free_netdev(netdev); + + pci_release_regions(pdev); + pci_disable_device(pdev); + + dev_info(&pdev->dev, "HiNIC driver - removed\n"); +} + +static const struct pci_device_id hinic_pci_table[] = { + { PCI_VDEVICE(HUAWEI, PCI_DEVICE_ID_HI1822_PF), 0}, + { 0, 0} +}; +MODULE_DEVICE_TABLE(pci, hinic_pci_table); + +static struct pci_driver hinic_driver = { + .name = HINIC_DRV_NAME, + .id_table = hinic_pci_table, + .probe = hinic_probe, + .remove = hinic_remove, +}; + +module_pci_driver(hinic_driver); -- cgit v1.2.3 From 3b0c34580b7ab9bb1d4a375e427af6a75ca45821 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Mon, 21 Aug 2017 19:22:40 -0700 Subject: hv_netvsc: Update netvsc Document for UDP hash level setting Update Documentation/networking/netvsc.txt for UDP hash level setting and related info. Signed-off-by: Haiyang Zhang Signed-off-by: David S. Miller --- Documentation/networking/netvsc.txt | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/netvsc.txt b/Documentation/networking/netvsc.txt index 4ddb4e4b0426..fa8d86356791 100644 --- a/Documentation/networking/netvsc.txt +++ b/Documentation/networking/netvsc.txt @@ -21,11 +21,23 @@ Features -------------------- Hyper-V supports receive side scaling. For TCP, packets are distributed among available queues based on IP address and port - number. Current versions of Hyper-V host, only distribute UDP - packets based on the IP source and destination address. - The port number is not used as part of the hash value for UDP. - Fragmented IP packets are not distributed between queues; - all fragmented packets arrive on the first channel. + number. + + For UDP, we can switch UDP hash level between L3 and L4 by ethtool + command. UDP over IPv4 and v6 can be set differently. The default + hash level is L4. We currently only allow switching TX hash level + from within the guests. + + On Azure, fragmented UDP packets have high loss rate with L4 + hashing. Using L3 hashing is recommended in this case. + + For example, for UDP over IPv4 on eth0: + To include UDP port numbers in hasing: + ethtool -N eth0 rx-flow-hash udp4 sdfn + To exclude UDP port numbers in hasing: + ethtool -N eth0 rx-flow-hash udp4 sd + To show UDP hash level: + ethtool -n eth0 rx-flow-hash udp4 Generic Receive Offload, aka GRO -------------------------------- -- cgit v1.2.3 From 7afe461ee64c03a857adfc146fcda1c98bb6a8ca Mon Sep 17 00:00:00 2001 From: Antoine Ténart Date: Tue, 22 Aug 2017 19:08:28 +0200 Subject: Documentation/bindings: net: marvell-pp2: add the system controller This patch documents the new marvell,system-controller property used by the Marvell ppv2 network driver. Signed-off-by: Antoine Tenart Tested-by: Marcin Wojtas Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/marvell-pp2.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/marvell-pp2.txt b/Documentation/devicetree/bindings/net/marvell-pp2.txt index 8918ad3ccf14..49484db81583 100644 --- a/Documentation/devicetree/bindings/net/marvell-pp2.txt +++ b/Documentation/devicetree/bindings/net/marvell-pp2.txt @@ -45,6 +45,7 @@ Optional properties (port): be the name associated to the interrupts listed. Valid names are: "tx-cpu0", "tx-cpu1", "tx-cpu2", "tx-cpu3", "rx-shared". +- marvell,system-controller: a phandle to the system controller. Example for marvell,armada-375-pp2: -- cgit v1.2.3 From d2aaa3dc419994eefa21de971bb1f544c42541c7 Mon Sep 17 00:00:00 2001 From: Shubham Bansal Date: Wed, 23 Aug 2017 21:29:10 +0530 Subject: bpf, doc: Add arm32 as arch supporting eBPF JIT As eBPF JIT support for arm32 was added recently with commit 39c13c204bb1150d401e27d41a9d8b332be47c49, it seems appropriate to add arm32 as arch with support for eBPF JIT in bpf and sysctl docs as well. Signed-off-by: Shubham Bansal Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- Documentation/networking/filter.txt | 4 ++-- Documentation/sysctl/net.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index 6a0df8df6c43..e5e33bac2068 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -596,8 +596,8 @@ skb pointer). All constraints and restrictions from bpf_check_classic() apply before a conversion to the new layout is being done behind the scenes! Currently, the classic BPF format is being used for JITing on most 32-bit -architectures, whereas x86-64, aarch64, s390x, powerpc64, sparc64 perform JIT -compilation from eBPF instruction set. +architectures, whereas x86-64, aarch64, s390x, powerpc64, sparc64, arm32 perform +JIT compilation from eBPF instruction set. Some core changes of the new internal format: diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index 28596e03220b..b67044a2575f 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -46,13 +46,13 @@ translate these BPF proglets into native CPU instructions. There are two flavors of JITs, the newer eBPF JIT currently supported on: - x86_64 - arm64 + - arm32 - ppc64 - sparc64 - mips64 - s390x And the older cBPF JIT supported on the following archs: - - arm - mips - ppc - sparc -- cgit v1.2.3 From 22b6722bfa591ba03d6a0c5521b600d4ab2d9a27 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 23 Aug 2017 09:55:41 +0200 Subject: ipv6: Add sysctl for per namespace flow label reflection Reflecting IPv6 Flow Label at server nodes is useful in environments that employ multipath routing to load balance the requests. As "IPv6 Flow Label Reflection" standard draft [1] points out - ICMPv6 PTB error messages generated in response to a downstream packets from the server can be routed by a load balancer back to the original server without looking at transport headers, if the server applies the flow label reflection. This enables the Path MTU Discovery past the ECMP router in load-balance or anycast environments where each server node is reachable by only one path. Introduce a sysctl to enable flow label reflection per net namespace for all newly created sockets. Same could be earlier achieved only per socket by setting the IPV6_FL_F_REFLECT flag for the IPV6_FLOWLABEL_MGR socket option. [1] https://tools.ietf.org/html/draft-wang-6man-flow-label-reflection-01 Signed-off-by: Jakub Sitnicki Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 9 +++++++++ include/net/netns/ipv6.h | 1 + net/ipv6/af_inet6.c | 1 + net/ipv6/sysctl_net_ipv6.c | 8 ++++++++ 4 files changed, 19 insertions(+) (limited to 'Documentation') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 84c9b8cee780..6b0bc0f71534 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1350,6 +1350,15 @@ flowlabel_state_ranges - BOOLEAN FALSE: disabled Default: true +flowlabel_reflect - BOOLEAN + Automatically reflect the flow label. Needed for Path MTU + Discovery to work with Equal Cost Multipath Routing in anycast + environments. See RFC 7690 and: + https://tools.ietf.org/html/draft-wang-6man-flow-label-reflection-01 + TRUE: enabled + FALSE: disabled + Default: FALSE + anycast_src_echo_reply - BOOLEAN Controls the use of anycast addresses as source addresses for ICMPv6 echo reply diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 0e50bf3ed097..2544f9760a42 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -36,6 +36,7 @@ struct netns_sysctl_ipv6 { int idgen_retries; int idgen_delay; int flowlabel_state_ranges; + int flowlabel_reflect; }; struct netns_ipv6 { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 3b58ee709f33..fe5262fd6aa5 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -211,6 +211,7 @@ lookup_protocol: np->mc_loop = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; np->autoflowlabel = ip6_default_np_autolabel(net); + np->repflow = net->ipv6.sysctl.flowlabel_reflect; sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; /* Init the ipv4 part of the socket since we can have sockets diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 69c50e737c54..6fbf8ae5e52c 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -90,6 +90,13 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "flowlabel_reflect", + .data = &init_net.ipv6.sysctl.flowlabel_reflect, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; @@ -149,6 +156,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) ipv6_table[6].data = &net->ipv6.sysctl.idgen_delay; ipv6_table[7].data = &net->ipv6.sysctl.flowlabel_state_ranges; ipv6_table[8].data = &net->ipv6.sysctl.ip_nonlocal_bind; + ipv6_table[9].data = &net->ipv6.sysctl.flowlabel_reflect; ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) -- cgit v1.2.3 From 3fd87127073292538047adf1c9c757e9cab0dd56 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 24 Aug 2017 14:38:51 -0700 Subject: strparser: initialize all callbacks commit bbb03029a899 ("strparser: Generalize strparser") added more function pointers to 'struct strp_callbacks'; however, kcm_attach() was not updated to initialize them. This could cause the ->lock() and/or ->unlock() function pointers to be set to garbage values, causing a crash in strp_work(). Fix the bug by moving the callback structs into static memory, so unspecified members are zeroed. Also constify them while we're at it. This bug was found by syzkaller, which encountered the following splat: IP: 0x55 PGD 3b1ca067 P4D 3b1ca067 PUD 3b12f067 PMD 0 Oops: 0010 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 2 PID: 1194 Comm: kworker/u8:1 Not tainted 4.13.0-rc4-next-20170811 #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Workqueue: kstrp strp_work task: ffff88006bb0e480 task.stack: ffff88006bb10000 RIP: 0010:0x55 RSP: 0018:ffff88006bb17540 EFLAGS: 00010246 RAX: dffffc0000000000 RBX: ffff88006ce4bd60 RCX: 0000000000000000 RDX: 1ffff1000d9c97bd RSI: 0000000000000000 RDI: ffff88006ce4bc48 RBP: ffff88006bb17558 R08: ffffffff81467ab2 R09: 0000000000000000 R10: ffff88006bb17438 R11: ffff88006bb17940 R12: ffff88006ce4bc48 R13: ffff88003c683018 R14: ffff88006bb17980 R15: ffff88003c683000 FS: 0000000000000000(0000) GS:ffff88006de00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000055 CR3: 000000003c145000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: process_one_work+0xbf3/0x1bc0 kernel/workqueue.c:2098 worker_thread+0x223/0x1860 kernel/workqueue.c:2233 kthread+0x35e/0x430 kernel/kthread.c:231 ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:431 Code: Bad RIP value. RIP: 0x55 RSP: ffff88006bb17540 CR2: 0000000000000055 ---[ end trace f0e4920047069cee ]--- Here is a C reproducer (requires CONFIG_BPF_SYSCALL=y and CONFIG_AF_KCM=y): #include #include #include #include #include #include #include #include static const struct bpf_insn bpf_insns[3] = { { .code = 0xb7 }, /* BPF_MOV64_IMM(0, 0) */ { .code = 0x95 }, /* BPF_EXIT_INSN() */ }; static const union bpf_attr bpf_attr = { .prog_type = 1, .insn_cnt = 2, .insns = (uintptr_t)&bpf_insns, .license = (uintptr_t)"", }; int main(void) { int bpf_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &bpf_attr, sizeof(bpf_attr)); int inet_fd = socket(AF_INET, SOCK_STREAM, 0); int kcm_fd = socket(AF_KCM, SOCK_DGRAM, 0); ioctl(kcm_fd, SIOCKCMATTACH, &(struct kcm_attach) { .fd = inet_fd, .bpf_fd = bpf_fd }); } Fixes: bbb03029a899 ("strparser: Generalize strparser") Cc: Dmitry Vyukov Cc: Tom Herbert Signed-off-by: Eric Biggers Signed-off-by: David S. Miller --- Documentation/networking/strparser.txt | 2 +- include/net/strparser.h | 2 +- kernel/bpf/sockmap.c | 10 +++++----- net/kcm/kcmsock.c | 11 +++++------ net/strparser/strparser.c | 2 +- 5 files changed, 13 insertions(+), 14 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/strparser.txt b/Documentation/networking/strparser.txt index fe01302471ae..13081b3decef 100644 --- a/Documentation/networking/strparser.txt +++ b/Documentation/networking/strparser.txt @@ -35,7 +35,7 @@ Functions ========= strp_init(struct strparser *strp, struct sock *sk, - struct strp_callbacks *cb) + const struct strp_callbacks *cb) Called to initialize a stream parser. strp is a struct of type strparser that is allocated by the upper layer. sk is the TCP diff --git a/include/net/strparser.h b/include/net/strparser.h index 4fe966a0ad92..7dc131d62ad5 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -138,7 +138,7 @@ void strp_done(struct strparser *strp); void strp_stop(struct strparser *strp); void strp_check_rcv(struct strparser *strp); int strp_init(struct strparser *strp, struct sock *sk, - struct strp_callbacks *cb); + const struct strp_callbacks *cb); void strp_data_ready(struct strparser *strp); int strp_process(struct strparser *strp, struct sk_buff *orig_skb, unsigned int orig_offset, size_t orig_len, diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 78b2bb9370ac..617c239590c2 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -368,12 +368,12 @@ static int smap_read_sock_done(struct strparser *strp, int err) static int smap_init_sock(struct smap_psock *psock, struct sock *sk) { - struct strp_callbacks cb; + static const struct strp_callbacks cb = { + .rcv_msg = smap_read_sock_strparser, + .parse_msg = smap_parse_func_strparser, + .read_sock_done = smap_read_sock_done, + }; - memset(&cb, 0, sizeof(cb)); - cb.rcv_msg = smap_read_sock_strparser; - cb.parse_msg = smap_parse_func_strparser; - cb.read_sock_done = smap_read_sock_done; return strp_init(&psock->strp, sk, &cb); } diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 88ce73288247..48e993b2dbcf 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1376,7 +1376,11 @@ static int kcm_attach(struct socket *sock, struct socket *csock, struct kcm_psock *psock = NULL, *tpsock; struct list_head *head; int index = 0; - struct strp_callbacks cb; + static const struct strp_callbacks cb = { + .rcv_msg = kcm_rcv_strparser, + .parse_msg = kcm_parse_func_strparser, + .read_sock_done = kcm_read_sock_done, + }; int err; csk = csock->sk; @@ -1391,11 +1395,6 @@ static int kcm_attach(struct socket *sock, struct socket *csock, psock->sk = csk; psock->bpf_prog = prog; - cb.rcv_msg = kcm_rcv_strparser; - cb.abort_parser = NULL; - cb.parse_msg = kcm_parse_func_strparser; - cb.read_sock_done = kcm_read_sock_done; - err = strp_init(&psock->strp, csk, &cb); if (err) { kmem_cache_free(kcm_psockp, psock); diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 434aa6637a52..d4ea46a5f233 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -472,7 +472,7 @@ static void strp_sock_unlock(struct strparser *strp) } int strp_init(struct strparser *strp, struct sock *sk, - struct strp_callbacks *cb) + const struct strp_callbacks *cb) { if (!cb || !cb->rcv_msg || !cb->parse_msg) -- cgit v1.2.3 From 0659191630bd386d4ca76fdaa9b556cbdda48278 Mon Sep 17 00:00:00 2001 From: Madalin Bucur Date: Sun, 27 Aug 2017 16:13:42 +0300 Subject: Documentation: networking: add RSS information Signed-off-by: Madalin Bucur Signed-off-by: David S. Miller --- Documentation/networking/dpaa.txt | 68 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/networking/dpaa.txt b/Documentation/networking/dpaa.txt index 76e016d4d344..f88194f71c54 100644 --- a/Documentation/networking/dpaa.txt +++ b/Documentation/networking/dpaa.txt @@ -13,6 +13,7 @@ Contents - Configuring DPAA Ethernet in your kernel - DPAA Ethernet Frame Processing - DPAA Ethernet Features + - DPAA IRQ Affinity and Receive Side Scaling - Debugging DPAA Ethernet Overview @@ -147,7 +148,10 @@ gradually. The driver has Rx and Tx checksum offloading for UDP and TCP. Currently the Rx checksum offload feature is enabled by default and cannot be controlled through -ethtool. +ethtool. Also, rx-flow-hash and rx-hashing was added. The addition of RSS +provides a big performance boost for the forwarding scenarios, allowing +different traffic flows received by one interface to be processed by different +CPUs in parallel. The driver has support for multiple prioritized Tx traffic classes. Priorities range from 0 (lowest) to 3 (highest). These are mapped to HW workqueues with @@ -166,6 +170,68 @@ classes as follows: tc qdisc add dev root handle 1: \ mqprio num_tc 4 map 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 hw 1 +DPAA IRQ Affinity and Receive Side Scaling +========================================== + +Traffic coming on the DPAA Rx queues or on the DPAA Tx confirmation +queues is seen by the CPU as ingress traffic on a certain portal. +The DPAA QMan portal interrupts are affined each to a certain CPU. +The same portal interrupt services all the QMan portal consumers. + +By default the DPAA Ethernet driver enables RSS, making use of the +DPAA FMan Parser and Keygen blocks to distribute traffic on 128 +hardware frame queues using a hash on IP v4/v6 source and destination +and L4 source and destination ports, in present in the received frame. +When RSS is disabled, all traffic received by a certain interface is +received on the default Rx frame queue. The default DPAA Rx frame +queues are configured to put the received traffic into a pool channel +that allows any available CPU portal to dequeue the ingress traffic. +The default frame queues have the HOLDACTIVE option set, ensuring that +traffic bursts from a certain queue are serviced by the same CPU. +This ensures a very low rate of frame reordering. A drawback of this +is that only one CPU at a time can service the traffic received by a +certain interface when RSS is not enabled. + +To implement RSS, the DPAA Ethernet driver allocates an extra set of +128 Rx frame queues that are configured to dedicated channels, in a +round-robin manner. The mapping of the frame queues to CPUs is now +hardcoded, there is no indirection table to move traffic for a certain +FQ (hash result) to another CPU. The ingress traffic arriving on one +of these frame queues will arrive at the same portal and will always +be processed by the same CPU. This ensures intra-flow order preservation +and workload distribution for multiple traffic flows. + +RSS can be turned off for a certain interface using ethtool, i.e. + + # ethtool -N fm1-mac9 rx-flow-hash tcp4 "" + +To turn it back on, one needs to set rx-flow-hash for tcp4/6 or udp4/6: + + # ethtool -N fm1-mac9 rx-flow-hash udp4 sfdn + +There is no independent control for individual protocols, any command +run for one of tcp4|udp4|ah4|esp4|sctp4|tcp6|udp6|ah6|esp6|sctp6 is +going to control the rx-flow-hashing for all protocols on that interface. + +Besides using the FMan Keygen computed hash for spreading traffic on the +128 Rx FQs, the DPAA Ethernet driver also sets the skb hash value when +the NETIF_F_RXHASH feature is on (active by default). This can be turned +on or off through ethtool, i.e.: + + # ethtool -K fm1-mac9 rx-hashing off + # ethtool -k fm1-mac9 | grep hash + receive-hashing: off + # ethtool -K fm1-mac9 rx-hashing on + Actual changes: + receive-hashing: on + # ethtool -k fm1-mac9 | grep hash + receive-hashing: on + +Please note that Rx hashing depends upon the rx-flow-hashing being on +for that interface - turning off rx-flow-hashing will also disable the +rx-hashing (without ethtool reporting it as off as that depends on the +NETIF_F_RXHASH feature flag). + Debugging ========= -- cgit v1.2.3 From e833251ad813168253fef9915aaf6a8c883337b0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 29 Aug 2017 10:18:56 +0100 Subject: rxrpc: Add notification of end-of-Tx phase Add a callback to rxrpc_kernel_send_data() so that a kernel service can get a notification that the AF_RXRPC call has transitioned out the Tx phase and is now waiting for a reply or a final ACK. This is called from AF_RXRPC with the call state lock held so the notification is guaranteed to come before any reply is passed back. Further, modify the AFS filesystem to make use of this so that we don't have to change the afs_call state before sending the last bit of data. Signed-off-by: David Howells --- Documentation/networking/rxrpc.txt | 12 +++++++++- fs/afs/rxrpc.c | 46 +++++++++++++++++++++++++++++--------- include/net/af_rxrpc.h | 5 ++++- net/rxrpc/sendmsg.c | 34 ++++++++++++++++++++++------ 4 files changed, 77 insertions(+), 20 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt index 8c70ba5dee4d..92a3c3bd5ac3 100644 --- a/Documentation/networking/rxrpc.txt +++ b/Documentation/networking/rxrpc.txt @@ -818,10 +818,15 @@ The kernel interface functions are as follows: (*) Send data through a call. + typedef void (*rxrpc_notify_end_tx_t)(struct sock *sk, + unsigned long user_call_ID, + struct sk_buff *skb); + int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, struct msghdr *msg, - size_t len); + size_t len, + rxrpc_notify_end_tx_t notify_end_rx); This is used to supply either the request part of a client call or the reply part of a server call. msg.msg_iovlen and msg.msg_iov specify the @@ -832,6 +837,11 @@ The kernel interface functions are as follows: The msg must not specify a destination address, control data or any flags other than MSG_MORE. len is the total amount of data to transmit. + notify_end_rx can be NULL or it can be used to specify a function to be + called when the call changes state to end the Tx phase. This function is + called with the call-state spinlock held to prevent any reply or final ACK + from being delivered first. + (*) Receive data from a call. int rxrpc_kernel_recv_data(struct socket *sock, diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 10743043d431..0bf191f0dbaf 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -291,6 +291,19 @@ static void afs_load_bvec(struct afs_call *call, struct msghdr *msg, iov_iter_bvec(&msg->msg_iter, WRITE | ITER_BVEC, bv, nr, bytes); } +/* + * Advance the AFS call state when the RxRPC call ends the transmit phase. + */ +static void afs_notify_end_request_tx(struct sock *sock, + struct rxrpc_call *rxcall, + unsigned long call_user_ID) +{ + struct afs_call *call = (struct afs_call *)call_user_ID; + + if (call->state == AFS_CALL_REQUESTING) + call->state = AFS_CALL_AWAIT_REPLY; +} + /* * attach the data from a bunch of pages on an inode to a call */ @@ -310,14 +323,8 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg) bytes = msg->msg_iter.count; nr = msg->msg_iter.nr_segs; - /* Have to change the state *before* sending the last - * packet as RxRPC might give us the reply before it - * returns from sending the request. - */ - if (first + nr - 1 >= last) - call->state = AFS_CALL_AWAIT_REPLY; - ret = rxrpc_kernel_send_data(afs_socket, call->rxcall, - msg, bytes); + ret = rxrpc_kernel_send_data(afs_socket, call->rxcall, msg, + bytes, afs_notify_end_request_tx); for (loop = 0; loop < nr; loop++) put_page(bv[loop].bv_page); if (ret < 0) @@ -409,7 +416,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, if (!call->send_pages) call->state = AFS_CALL_AWAIT_REPLY; ret = rxrpc_kernel_send_data(afs_socket, rxcall, - &msg, call->request_size); + &msg, call->request_size, + afs_notify_end_request_tx); if (ret < 0) goto error_do_abort; @@ -740,6 +748,20 @@ static int afs_deliver_cm_op_id(struct afs_call *call) return call->type->deliver(call); } +/* + * Advance the AFS call state when an RxRPC service call ends the transmit + * phase. + */ +static void afs_notify_end_reply_tx(struct sock *sock, + struct rxrpc_call *rxcall, + unsigned long call_user_ID) +{ + struct afs_call *call = (struct afs_call *)call_user_ID; + + if (call->state == AFS_CALL_REPLYING) + call->state = AFS_CALL_AWAIT_ACK; +} + /* * send an empty reply */ @@ -759,7 +781,8 @@ void afs_send_empty_reply(struct afs_call *call) msg.msg_flags = 0; call->state = AFS_CALL_AWAIT_ACK; - switch (rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, 0)) { + switch (rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, 0, + afs_notify_end_reply_tx)) { case 0: _leave(" [replied]"); return; @@ -797,7 +820,8 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) msg.msg_flags = 0; call->state = AFS_CALL_AWAIT_ACK; - n = rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, len); + n = rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, len, + afs_notify_end_reply_tx); if (n >= 0) { /* Success */ _leave(" [replied]"); diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index c172709787af..07a47ee6f783 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -21,6 +21,8 @@ struct rxrpc_call; typedef void (*rxrpc_notify_rx_t)(struct sock *, struct rxrpc_call *, unsigned long); +typedef void (*rxrpc_notify_end_tx_t)(struct sock *, struct rxrpc_call *, + unsigned long); typedef void (*rxrpc_notify_new_call_t)(struct sock *, struct rxrpc_call *, unsigned long); typedef void (*rxrpc_discard_new_call_t)(struct rxrpc_call *, unsigned long); @@ -37,7 +39,8 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *, gfp_t, rxrpc_notify_rx_t); int rxrpc_kernel_send_data(struct socket *, struct rxrpc_call *, - struct msghdr *, size_t); + struct msghdr *, size_t, + rxrpc_notify_end_tx_t); int rxrpc_kernel_recv_data(struct socket *, struct rxrpc_call *, void *, size_t, size_t *, bool, u32 *); bool rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index bc7f0241d92b..344fdce89823 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -100,12 +100,24 @@ static inline void rxrpc_instant_resend(struct rxrpc_call *call, int ix) spin_unlock_bh(&call->lock); } +/* + * Notify the owner of the call that the transmit phase is ended and the last + * packet has been queued. + */ +static void rxrpc_notify_end_tx(struct rxrpc_sock *rx, struct rxrpc_call *call, + rxrpc_notify_end_tx_t notify_end_tx) +{ + if (notify_end_tx) + notify_end_tx(&rx->sk, call, call->user_call_ID); +} + /* * Queue a DATA packet for transmission, set the resend timeout and send the * packet immediately */ -static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, - bool last) +static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, + struct sk_buff *skb, bool last, + rxrpc_notify_end_tx_t notify_end_tx) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); rxrpc_seq_t seq = sp->hdr.seq; @@ -141,6 +153,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, switch (call->state) { case RXRPC_CALL_CLIENT_SEND_REQUEST: call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY; + rxrpc_notify_end_tx(rx, call, notify_end_tx); break; case RXRPC_CALL_SERVER_ACK_REQUEST: call->state = RXRPC_CALL_SERVER_SEND_REPLY; @@ -153,6 +166,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, break; case RXRPC_CALL_SERVER_SEND_REPLY: call->state = RXRPC_CALL_SERVER_AWAIT_ACK; + rxrpc_notify_end_tx(rx, call, notify_end_tx); break; default: break; @@ -189,7 +203,8 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, */ static int rxrpc_send_data(struct rxrpc_sock *rx, struct rxrpc_call *call, - struct msghdr *msg, size_t len) + struct msghdr *msg, size_t len, + rxrpc_notify_end_tx_t notify_end_tx) { struct rxrpc_skb_priv *sp; struct sk_buff *skb; @@ -350,7 +365,9 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, if (ret < 0) goto out; - rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more); + rxrpc_queue_packet(rx, call, skb, + !msg_data_left(msg) && !more, + notify_end_tx); skb = NULL; } } while (msg_data_left(msg) > 0); @@ -611,7 +628,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) /* Reply phase not begun or not complete for service call. */ ret = -EPROTO; } else { - ret = rxrpc_send_data(rx, call, msg, len); + ret = rxrpc_send_data(rx, call, msg, len, NULL); } mutex_unlock(&call->user_mutex); @@ -631,6 +648,7 @@ error_release_sock: * @call: The call to send data through * @msg: The data to send * @len: The amount of data to send + * @notify_end_tx: Notification that the last packet is queued. * * Allow a kernel service to send data on a call. The call must be in an state * appropriate to sending data. No control data should be supplied in @msg, @@ -638,7 +656,8 @@ error_release_sock: * more data to come, otherwise this data will end the transmission phase. */ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, - struct msghdr *msg, size_t len) + struct msghdr *msg, size_t len, + rxrpc_notify_end_tx_t notify_end_tx) { int ret; @@ -656,7 +675,8 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, case RXRPC_CALL_CLIENT_SEND_REQUEST: case RXRPC_CALL_SERVER_ACK_REQUEST: case RXRPC_CALL_SERVER_SEND_REPLY: - ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len); + ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len, + notify_end_tx); break; case RXRPC_CALL_COMPLETE: read_lock_bh(&call->state_lock); -- cgit v1.2.3 From c038a58ccfd6704d4d7d60ed3d6a0fca13cf13a4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 29 Aug 2017 10:19:01 +0100 Subject: rxrpc: Allow failed client calls to be retried Allow a client call that failed on network error to be retried, provided that the Tx queue still holds DATA packet 1. This allows an operation to be submitted to another server or another address for the same server without having to repackage and re-encrypt the data so far processed. Two new functions are provided: (1) rxrpc_kernel_check_call() - This is used to find out the completion state of a call to guess whether it can be retried and whether it should be retried. (2) rxrpc_kernel_retry_call() - Disconnect the call from its current connection, reset the state and submit it as a new client call to a new address. The new address need not match the previous address. A call may be retried even if all the data hasn't been loaded into it yet; a partially constructed will be retained at the same point it was at when an error condition was detected. msg_data_left() can be used to find out how much data was packaged before the error occurred. Signed-off-by: David Howells --- Documentation/networking/rxrpc.txt | 45 ++++++++++++++++ include/net/af_rxrpc.h | 16 ++++++ net/rxrpc/af_rxrpc.c | 69 +++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 19 +++---- net/rxrpc/call_object.c | 102 +++++++++++++++++++++++++++++++++++-- net/rxrpc/conn_client.c | 17 +++++-- net/rxrpc/sendmsg.c | 24 +++++---- 7 files changed, 261 insertions(+), 31 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt index 92a3c3bd5ac3..810620153a44 100644 --- a/Documentation/networking/rxrpc.txt +++ b/Documentation/networking/rxrpc.txt @@ -975,6 +975,51 @@ The kernel interface functions are as follows: size should be set when the call is begun. tx_total_len may not be less than zero. + (*) Check to see the completion state of a call so that the caller can assess + whether it needs to be retried. + + enum rxrpc_call_completion { + RXRPC_CALL_SUCCEEDED, + RXRPC_CALL_REMOTELY_ABORTED, + RXRPC_CALL_LOCALLY_ABORTED, + RXRPC_CALL_LOCAL_ERROR, + RXRPC_CALL_NETWORK_ERROR, + }; + + int rxrpc_kernel_check_call(struct socket *sock, struct rxrpc_call *call, + enum rxrpc_call_completion *_compl, + u32 *_abort_code); + + On return, -EINPROGRESS will be returned if the call is still ongoing; if + it is finished, *_compl will be set to indicate the manner of completion, + *_abort_code will be set to any abort code that occurred. 0 will be + returned on a successful completion, -ECONNABORTED will be returned if the + client failed due to a remote abort and anything else will return an + appropriate error code. + + The caller should look at this information to decide if it's worth + retrying the call. + + (*) Retry a client call. + + int rxrpc_kernel_retry_call(struct socket *sock, + struct rxrpc_call *call, + struct sockaddr_rxrpc *srx, + struct key *key); + + This attempts to partially reinitialise a call and submit it again whilst + reusing the original call's Tx queue to avoid the need to repackage and + re-encrypt the data to be sent. call indicates the call to retry, srx the + new address to send it to and key the encryption key to use for signing or + encrypting the packets. + + For this to work, the first Tx data packet must still be in the transmit + queue, and currently this is only permitted for local and network errors + and the call must not have been aborted. Any partially constructed Tx + packet is left as is and can continue being filled afterwards. + + It returns 0 if the call was requeued and an error otherwise. + ======================= CONFIGURABLE PARAMETERS diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 07a47ee6f783..3ac79150291f 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -19,6 +19,18 @@ struct sock; struct socket; struct rxrpc_call; +/* + * Call completion condition (state == RXRPC_CALL_COMPLETE). + */ +enum rxrpc_call_completion { + RXRPC_CALL_SUCCEEDED, /* - Normal termination */ + RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */ + RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */ + RXRPC_CALL_LOCAL_ERROR, /* - call failed due to local error */ + RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */ + NR__RXRPC_CALL_COMPLETIONS +}; + typedef void (*rxrpc_notify_rx_t)(struct sock *, struct rxrpc_call *, unsigned long); typedef void (*rxrpc_notify_end_tx_t)(struct sock *, struct rxrpc_call *, @@ -51,5 +63,9 @@ void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *, int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, rxrpc_user_attach_call_t, unsigned long, gfp_t); void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64); +int rxrpc_kernel_retry_call(struct socket *, struct rxrpc_call *, + struct sockaddr_rxrpc *, struct key *); +int rxrpc_kernel_check_call(struct socket *, struct rxrpc_call *, + enum rxrpc_call_completion *, u32 *); #endif /* _NET_RXRPC_H */ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 31e97f714ca9..fb17552fd292 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -336,6 +336,75 @@ void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call) } EXPORT_SYMBOL(rxrpc_kernel_end_call); +/** + * rxrpc_kernel_check_call - Check a call's state + * @sock: The socket the call is on + * @call: The call to check + * @_compl: Where to store the completion state + * @_abort_code: Where to store any abort code + * + * Allow a kernel service to query the state of a call and find out the manner + * of its termination if it has completed. Returns -EINPROGRESS if the call is + * still going, 0 if the call finished successfully, -ECONNABORTED if the call + * was aborted and an appropriate error if the call failed in some other way. + */ +int rxrpc_kernel_check_call(struct socket *sock, struct rxrpc_call *call, + enum rxrpc_call_completion *_compl, u32 *_abort_code) +{ + if (call->state != RXRPC_CALL_COMPLETE) + return -EINPROGRESS; + smp_rmb(); + *_compl = call->completion; + *_abort_code = call->abort_code; + return call->error; +} +EXPORT_SYMBOL(rxrpc_kernel_check_call); + +/** + * rxrpc_kernel_retry_call - Allow a kernel service to retry a call + * @sock: The socket the call is on + * @call: The call to retry + * @srx: The address of the peer to contact + * @key: The security context to use (defaults to socket setting) + * + * Allow a kernel service to try resending a client call that failed due to a + * network error to a new address. The Tx queue is maintained intact, thereby + * relieving the need to re-encrypt any request data that has already been + * buffered. + */ +int rxrpc_kernel_retry_call(struct socket *sock, struct rxrpc_call *call, + struct sockaddr_rxrpc *srx, struct key *key) +{ + struct rxrpc_conn_parameters cp; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + int ret; + + _enter("%d{%d}", call->debug_id, atomic_read(&call->usage)); + + if (!key) + key = rx->key; + if (key && !key->payload.data[0]) + key = NULL; /* a no-security key */ + + memset(&cp, 0, sizeof(cp)); + cp.local = rx->local; + cp.key = key; + cp.security_level = 0; + cp.exclusive = false; + cp.service_id = srx->srx_service; + + mutex_lock(&call->user_mutex); + + ret = rxrpc_prepare_call_for_retry(rx, call); + if (ret == 0) + ret = rxrpc_retry_client_call(rx, call, &cp, srx, GFP_KERNEL); + + mutex_unlock(&call->user_mutex); + _leave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(rxrpc_kernel_retry_call); + /** * rxrpc_kernel_new_call_notification - Get notifications of new calls * @sock: The socket to intercept received messages on diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 227e24794055..ea5600b747cc 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -445,6 +445,7 @@ enum rxrpc_call_flag { RXRPC_CALL_EXPOSED, /* The call was exposed to the world */ RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */ RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */ + RXRPC_CALL_TX_LASTQ, /* Last packet has been queued */ RXRPC_CALL_SEND_PING, /* A ping will need to be sent */ RXRPC_CALL_PINGING, /* Ping in process */ RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */ @@ -481,18 +482,6 @@ enum rxrpc_call_state { NR__RXRPC_CALL_STATES }; -/* - * Call completion condition (state == RXRPC_CALL_COMPLETE). - */ -enum rxrpc_call_completion { - RXRPC_CALL_SUCCEEDED, /* - Normal termination */ - RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */ - RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */ - RXRPC_CALL_LOCAL_ERROR, /* - call failed due to local error */ - RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */ - NR__RXRPC_CALL_COMPLETIONS -}; - /* * Call Tx congestion management modes. */ @@ -687,9 +676,15 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *, unsigned long, s64, gfp_t); +int rxrpc_retry_client_call(struct rxrpc_sock *, + struct rxrpc_call *, + struct rxrpc_conn_parameters *, + struct sockaddr_rxrpc *, + gfp_t); void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *, struct sk_buff *); void rxrpc_release_call(struct rxrpc_sock *, struct rxrpc_call *); +int rxrpc_prepare_call_for_retry(struct rxrpc_sock *, struct rxrpc_call *); void rxrpc_release_calls_on_socket(struct rxrpc_sock *); bool __rxrpc_queue_call(struct rxrpc_call *); bool rxrpc_queue_call(struct rxrpc_call *); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index d7809a0620b4..fcdd6555a820 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -269,11 +269,6 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, trace_rxrpc_call(call, rxrpc_call_connected, atomic_read(&call->usage), here, NULL); - spin_lock_bh(&call->conn->params.peer->lock); - hlist_add_head(&call->error_link, - &call->conn->params.peer->error_targets); - spin_unlock_bh(&call->conn->params.peer->lock); - rxrpc_start_call_timer(call); _net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id); @@ -303,6 +298,48 @@ error: return ERR_PTR(ret); } +/* + * Retry a call to a new address. It is expected that the Tx queue of the call + * will contain data previously packaged for an old call. + */ +int rxrpc_retry_client_call(struct rxrpc_sock *rx, + struct rxrpc_call *call, + struct rxrpc_conn_parameters *cp, + struct sockaddr_rxrpc *srx, + gfp_t gfp) +{ + const void *here = __builtin_return_address(0); + int ret; + + /* Set up or get a connection record and set the protocol parameters, + * including channel number and call ID. + */ + ret = rxrpc_connect_call(call, cp, srx, gfp); + if (ret < 0) + goto error; + + trace_rxrpc_call(call, rxrpc_call_connected, atomic_read(&call->usage), + here, NULL); + + rxrpc_start_call_timer(call); + + _net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id); + + if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) + rxrpc_queue_call(call); + + _leave(" = 0"); + return 0; + +error: + rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, + RX_CALL_DEAD, ret); + trace_rxrpc_call(call, rxrpc_call_error, atomic_read(&call->usage), + here, ERR_PTR(ret)); + _leave(" = %d", ret); + return ret; +} + /* * Set up an incoming call. call->conn points to the connection. * This is called in BH context and isn't allowed to fail. @@ -470,6 +507,61 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) _leave(""); } +/* + * Prepare a kernel service call for retry. + */ +int rxrpc_prepare_call_for_retry(struct rxrpc_sock *rx, struct rxrpc_call *call) +{ + const void *here = __builtin_return_address(0); + int i; + u8 last = 0; + + _enter("{%d,%d}", call->debug_id, atomic_read(&call->usage)); + + trace_rxrpc_call(call, rxrpc_call_release, atomic_read(&call->usage), + here, (const void *)call->flags); + + ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); + ASSERTCMP(call->completion, !=, RXRPC_CALL_REMOTELY_ABORTED); + ASSERTCMP(call->completion, !=, RXRPC_CALL_LOCALLY_ABORTED); + ASSERT(list_empty(&call->recvmsg_link)); + + del_timer_sync(&call->timer); + + _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, call->conn); + + if (call->conn) + rxrpc_disconnect_call(call); + + if (rxrpc_is_service_call(call) || + !call->tx_phase || + call->tx_hard_ack != 0 || + call->rx_hard_ack != 0 || + call->rx_top != 0) + return -EINVAL; + + call->state = RXRPC_CALL_UNINITIALISED; + call->completion = RXRPC_CALL_SUCCEEDED; + call->call_id = 0; + call->cid = 0; + call->cong_cwnd = 0; + call->cong_extra = 0; + call->cong_ssthresh = 0; + call->cong_mode = 0; + call->cong_dup_acks = 0; + call->cong_cumul_acks = 0; + call->acks_lowest_nak = 0; + + for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) { + last |= call->rxtx_annotations[i]; + call->rxtx_annotations[i] &= RXRPC_TX_ANNO_LAST; + call->rxtx_annotations[i] |= RXRPC_TX_ANNO_RETRANS; + } + + _leave(" = 0"); + return 0; +} + /* * release all the calls associated with a socket */ diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index eb2157680399..5f9624bd311c 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -555,7 +555,10 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, trace_rxrpc_client(conn, channel, rxrpc_client_chan_activate); write_lock_bh(&call->state_lock); - call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; + if (!test_bit(RXRPC_CALL_TX_LASTQ, &call->flags)) + call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; + else + call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY; write_unlock_bh(&call->state_lock); rxrpc_see_call(call); @@ -688,15 +691,23 @@ int rxrpc_connect_call(struct rxrpc_call *call, ret = rxrpc_get_client_conn(call, cp, srx, gfp); if (ret < 0) - return ret; + goto out; rxrpc_animate_client_conn(rxnet, call->conn); rxrpc_activate_channels(call->conn); ret = rxrpc_wait_for_channel(call, gfp); - if (ret < 0) + if (ret < 0) { rxrpc_disconnect_client_call(call); + goto out; + } + + spin_lock_bh(&call->conn->params.peer->lock); + hlist_add_head(&call->error_link, + &call->conn->params.peer->error_targets); + spin_unlock_bh(&call->conn->params.peer->lock); +out: _leave(" = %d", ret); return ret; } diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 344fdce89823..9ea6f972767e 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -128,8 +128,10 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, ASSERTCMP(seq, ==, call->tx_top + 1); - if (last) + if (last) { annotation |= RXRPC_TX_ANNO_LAST; + set_bit(RXRPC_CALL_TX_LASTQ, &call->flags); + } /* We have to set the timestamp before queueing as the retransmit * algorithm can see the packet as soon as we queue it. @@ -326,11 +328,6 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, call->tx_total_len -= copy; } - /* check for the far side aborting the call or a network error - * occurring */ - if (call->state == RXRPC_CALL_COMPLETE) - goto call_terminated; - /* add the packet to the send queue if it's now full */ if (sp->remain <= 0 || (msg_data_left(msg) == 0 && !more)) { @@ -370,6 +367,16 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, notify_end_tx); skb = NULL; } + + /* Check for the far side aborting the call or a network error + * occurring. If this happens, save any packet that was under + * construction so that in the case of a network error, the + * call can be retried or redirected. + */ + if (call->state == RXRPC_CALL_COMPLETE) { + ret = call->error; + goto out; + } } while (msg_data_left(msg) > 0); success: @@ -379,11 +386,6 @@ out: _leave(" = %d", ret); return ret; -call_terminated: - rxrpc_free_skb(skb, rxrpc_skb_tx_freed); - _leave(" = %d", call->error); - return call->error; - maybe_error: if (copied) goto success; -- cgit v1.2.3 From c965584039725283f89cb46d806c8f49be65ef2a Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 29 Aug 2017 15:07:51 -0700 Subject: Documentation: networking: Add blurb about patches in patchwork Explain that the patch queue in patchwork should not be touched by patch submitters. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/netdev-FAQ.txt | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'Documentation') diff --git a/Documentation/networking/netdev-FAQ.txt b/Documentation/networking/netdev-FAQ.txt index 247a30ba8e17..cfc66ea72329 100644 --- a/Documentation/networking/netdev-FAQ.txt +++ b/Documentation/networking/netdev-FAQ.txt @@ -111,6 +111,14 @@ A: Generally speaking, the patches get triaged quickly (in less than 48h). patch is a good way to ensure your patch is ignored or pushed to the bottom of the priority list. +Q: I submitted multiple versions of the patch series, should I directly update + patchwork for the previous versions of these patch series? + +A: No, please don't interfere with the patch status on patchwork, leave it to + the maintainer to figure out what is the most recent and current version that + should be applied. If there is any doubt, the maintainer will reply and ask + what should be done. + Q: How can I tell what patches are queued up for backporting to the various stable releases? -- cgit v1.2.3 From eaa72dc47488d599439cd0fd0f8c4f1bcb3906bb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 29 Aug 2017 15:16:01 -0700 Subject: neigh: increase queue_len_bytes to match wmem_default Florian reported UDP xmit drops that could be root caused to the too small neigh limit. Current limit is 64 KB, meaning that even a single UDP socket would hit it, since its default sk_sndbuf comes from net.core.wmem_default (~212992 bytes on 64bit arches). Once ARP/ND resolution is in progress, we should allow a little more packets to be queued, at least for one producer. Once neigh arp_queue is filled, a rogue socket should hit its sk_sndbuf limit and either block in sendmsg() or return -EAGAIN. Signed-off-by: Eric Dumazet Reported-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 7 +++++-- include/net/sock.h | 10 ++++++++++ net/core/sock.c | 10 ---------- net/decnet/dn_neigh.c | 2 +- net/ipv4/arp.c | 2 +- net/ipv4/tcp_input.c | 2 +- net/ipv6/ndisc.c | 2 +- 7 files changed, 19 insertions(+), 16 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 6b0bc0f71534..b3345d0fe0a6 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -109,7 +109,10 @@ neigh/default/unres_qlen_bytes - INTEGER queued for each unresolved address by other network layers. (added in linux 3.3) Setting negative value is meaningless and will return error. - Default: 65536 Bytes(64KB) + Default: SK_WMEM_MAX, (same as net.core.wmem_default). + Exact value depends on architecture and kernel options, + but should be enough to allow queuing 256 packets + of medium size. neigh/default/unres_qlen - INTEGER The maximum number of packets which may be queued for each @@ -119,7 +122,7 @@ neigh/default/unres_qlen - INTEGER unexpected packet loss. The current default value is calculated according to default value of unres_qlen_bytes and true size of packet. - Default: 31 + Default: 101 mtu_expires - INTEGER Time, in seconds, that cached PMTU information is kept. diff --git a/include/net/sock.h b/include/net/sock.h index 1c2912d433e8..03a362568357 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2368,6 +2368,16 @@ bool sk_net_capable(const struct sock *sk, int cap); void sk_get_meminfo(const struct sock *sk, u32 *meminfo); +/* Take into consideration the size of the struct sk_buff overhead in the + * determination of these values, since that is non-constant across + * platforms. This makes socket queueing behavior and performance + * not depend upon such differences. + */ +#define _SK_MEM_PACKETS 256 +#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) +#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) +#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) + extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; diff --git a/net/core/sock.c b/net/core/sock.c index dfdd14cac775..9b7b6bbb2a23 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -307,16 +307,6 @@ static struct lock_class_key af_wlock_keys[AF_MAX]; static struct lock_class_key af_elock_keys[AF_MAX]; static struct lock_class_key af_kern_callback_keys[AF_MAX]; -/* Take into consideration the size of the struct sk_buff overhead in the - * determination of these values, since that is non-constant across - * platforms. This makes socket queueing behavior and performance - * not depend upon such differences. - */ -#define _SK_MEM_PACKETS 256 -#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) -#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) -#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) - /* Run time adjustable parameters. */ __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; EXPORT_SYMBOL(sysctl_wmem_max); diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index 21dedf6fd0f7..22bf0b95d6ed 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -94,7 +94,7 @@ struct neigh_table dn_neigh_table = { [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, - [NEIGH_VAR_QUEUE_LEN_BYTES] = 64*1024, + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 0, [NEIGH_VAR_ANYCAST_DELAY] = 0, [NEIGH_VAR_PROXY_DELAY] = 0, diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 8b52179ddc6e..7c45b8896709 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -171,7 +171,7 @@ struct neigh_table arp_tbl = { [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, - [NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024, + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 568ccfd6dd37..7616cd76f6f6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6086,9 +6086,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct sock *fastopen_sk = NULL; - struct dst_entry *dst = NULL; struct request_sock *req; bool want_cookie = false; + struct dst_entry *dst; struct flowi fl; /* TW buckets are converted to open requests without diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 5e338eb89509..266a530414d7 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -127,7 +127,7 @@ struct neigh_table nd_tbl = { [NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, - [NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024, + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, -- cgit v1.2.3 From ceed73a2cf4aff2921802aa3d21d45280677547d Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Tue, 29 Aug 2017 22:44:18 -0600 Subject: drivers: net: ethernet: qualcomm: rmnet: Initial implementation RmNet driver provides a transport agnostic MAP (multiplexing and aggregation protocol) support in embedded module. Module provides virtual network devices which can be attached to any IP-mode physical device. This will be used to provide all MAP functionality on future hardware in a single consistent location. Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller --- Documentation/networking/rmnet.txt | 82 ++++ drivers/net/ethernet/qualcomm/Kconfig | 2 + drivers/net/ethernet/qualcomm/Makefile | 2 + drivers/net/ethernet/qualcomm/rmnet/Kconfig | 12 + drivers/net/ethernet/qualcomm/rmnet/Makefile | 10 + drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 419 +++++++++++++++++++++ drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h | 56 +++ .../net/ethernet/qualcomm/rmnet/rmnet_handlers.c | 271 +++++++++++++ .../net/ethernet/qualcomm/rmnet/rmnet_handlers.h | 26 ++ drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h | 88 +++++ .../ethernet/qualcomm/rmnet/rmnet_map_command.c | 107 ++++++ .../net/ethernet/qualcomm/rmnet/rmnet_map_data.c | 105 ++++++ .../net/ethernet/qualcomm/rmnet/rmnet_private.h | 45 +++ drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c | 170 +++++++++ drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h | 29 ++ 15 files changed, 1424 insertions(+) create mode 100644 Documentation/networking/rmnet.txt create mode 100644 drivers/net/ethernet/qualcomm/rmnet/Kconfig create mode 100644 drivers/net/ethernet/qualcomm/rmnet/Makefile create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h (limited to 'Documentation') diff --git a/Documentation/networking/rmnet.txt b/Documentation/networking/rmnet.txt new file mode 100644 index 000000000000..6b341eaf2062 --- /dev/null +++ b/Documentation/networking/rmnet.txt @@ -0,0 +1,82 @@ +1. Introduction + +rmnet driver is used for supporting the Multiplexing and aggregation +Protocol (MAP). This protocol is used by all recent chipsets using Qualcomm +Technologies, Inc. modems. + +This driver can be used to register onto any physical network device in +IP mode. Physical transports include USB, HSIC, PCIe and IP accelerator. + +Multiplexing allows for creation of logical netdevices (rmnet devices) to +handle multiple private data networks (PDN) like a default internet, tethering, +multimedia messaging service (MMS) or IP media subsystem (IMS). Hardware sends +packets with MAP headers to rmnet. Based on the multiplexer id, rmnet +routes to the appropriate PDN after removing the MAP header. + +Aggregation is required to achieve high data rates. This involves hardware +sending aggregated bunch of MAP frames. rmnet driver will de-aggregate +these MAP frames and send them to appropriate PDN's. + +2. Packet format + +a. MAP packet (data / control) + +MAP header has the same endianness of the IP packet. + +Packet format - + +Bit 0 1 2-7 8 - 15 16 - 31 +Function Command / Data Reserved Pad Multiplexer ID Payload length +Bit 32 - x +Function Raw Bytes + +Command (1)/ Data (0) bit value is to indicate if the packet is a MAP command +or data packet. Control packet is used for transport level flow control. Data +packets are standard IP packets. + +Reserved bits are usually zeroed out and to be ignored by receiver. + +Padding is number of bytes to be added for 4 byte alignment if required by +hardware. + +Multiplexer ID is to indicate the PDN on which data has to be sent. + +Payload length includes the padding length but does not include MAP header +length. + +b. MAP packet (command specific) + +Bit 0 1 2-7 8 - 15 16 - 31 +Function Command Reserved Pad Multiplexer ID Payload length +Bit 32 - 39 40 - 45 46 - 47 48 - 63 +Function Command name Reserved Command Type Reserved +Bit 64 - 95 +Function Transaction ID +Bit 96 - 127 +Function Command data + +Command 1 indicates disabling flow while 2 is enabling flow + +Command types - +0 for MAP command request +1 is to acknowledge the receipt of a command +2 is for unsupported commands +3 is for error during processing of commands + +c. Aggregation + +Aggregation is multiple MAP packets (can be data or command) delivered to +rmnet in a single linear skb. rmnet will process the individual +packets and either ACK the MAP command or deliver the IP packet to the +network stack as needed + +MAP header|IP Packet|Optional padding|MAP header|IP Packet|Optional padding.... +MAP header|IP Packet|Optional padding|MAP header|Command Packet|Optional pad... + +3. Userspace configuration + +rmnet userspace configuration is done through netlink library librmnetctl +and command line utility rmnetcli. Utility is hosted in codeaurora forum git. +The driver uses rtnl_link_ops for communication. + +https://source.codeaurora.org/quic/la/platform/vendor/qcom-opensource/dataservices/tree/rmnetctl diff --git a/drivers/net/ethernet/qualcomm/Kconfig b/drivers/net/ethernet/qualcomm/Kconfig index 877675a27b9f..f5200712718d 100644 --- a/drivers/net/ethernet/qualcomm/Kconfig +++ b/drivers/net/ethernet/qualcomm/Kconfig @@ -59,4 +59,6 @@ config QCOM_EMAC low power, Receive-Side Scaling (RSS), and IEEE 1588-2008 Precision Clock Synchronization Protocol. +source "drivers/net/ethernet/qualcomm/rmnet/Kconfig" + endif # NET_VENDOR_QUALCOMM diff --git a/drivers/net/ethernet/qualcomm/Makefile b/drivers/net/ethernet/qualcomm/Makefile index 92fa7c4da90a..1847350f48a7 100644 --- a/drivers/net/ethernet/qualcomm/Makefile +++ b/drivers/net/ethernet/qualcomm/Makefile @@ -9,3 +9,5 @@ obj-$(CONFIG_QCA7000_UART) += qcauart.o qcauart-objs := qca_uart.o obj-y += emac/ + +obj-$(CONFIG_RMNET) += rmnet/ diff --git a/drivers/net/ethernet/qualcomm/rmnet/Kconfig b/drivers/net/ethernet/qualcomm/rmnet/Kconfig new file mode 100644 index 000000000000..6e2587af47a4 --- /dev/null +++ b/drivers/net/ethernet/qualcomm/rmnet/Kconfig @@ -0,0 +1,12 @@ +# +# RMNET MAP driver +# + +menuconfig RMNET + tristate "RmNet MAP driver" + default n + ---help--- + If you select this, you will enable the RMNET module which is used + for handling data in the multiplexing and aggregation protocol (MAP) + format in the embedded data path. RMNET devices can be attached to + any IP mode physical device. diff --git a/drivers/net/ethernet/qualcomm/rmnet/Makefile b/drivers/net/ethernet/qualcomm/rmnet/Makefile new file mode 100644 index 000000000000..01bddf207cac --- /dev/null +++ b/drivers/net/ethernet/qualcomm/rmnet/Makefile @@ -0,0 +1,10 @@ +# +# Makefile for the RMNET module +# + +rmnet-y := rmnet_config.o +rmnet-y += rmnet_vnd.o +rmnet-y += rmnet_handlers.o +rmnet-y += rmnet_map_data.o +rmnet-y += rmnet_map_command.o +obj-$(CONFIG_RMNET) += rmnet.o diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c new file mode 100644 index 000000000000..e836d267d5cc --- /dev/null +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c @@ -0,0 +1,419 @@ +/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * RMNET configuration engine + * + */ + +#include +#include +#include +#include +#include "rmnet_config.h" +#include "rmnet_handlers.h" +#include "rmnet_vnd.h" +#include "rmnet_private.h" + +/* Locking scheme - + * The shared resource which needs to be protected is realdev->rx_handler_data. + * For the writer path, this is using rtnl_lock(). The writer paths are + * rmnet_newlink(), rmnet_dellink() and rmnet_force_unassociate_device(). These + * paths are already called with rtnl_lock() acquired in. There is also an + * ASSERT_RTNL() to ensure that we are calling with rtnl acquired. For + * dereference here, we will need to use rtnl_dereference(). Dev list writing + * needs to happen with rtnl_lock() acquired for netdev_master_upper_dev_link(). + * For the reader path, the real_dev->rx_handler_data is called in the TX / RX + * path. We only need rcu_read_lock() for these scenarios. In these cases, + * the rcu_read_lock() is held in __dev_queue_xmit() and + * netif_receive_skb_internal(), so readers need to use rcu_dereference_rtnl() + * to get the relevant information. For dev list reading, we again acquire + * rcu_read_lock() in rmnet_dellink() for netdev_master_upper_dev_get_rcu(). + * We also use unregister_netdevice_many() to free all rmnet devices in + * rmnet_force_unassociate_device() so we dont lose the rtnl_lock() and free in + * same context. + */ + +/* Local Definitions and Declarations */ +#define RMNET_LOCAL_LOGICAL_ENDPOINT -1 + +struct rmnet_walk_data { + struct net_device *real_dev; + struct list_head *head; + struct rmnet_real_dev_info *real_dev_info; +}; + +static int rmnet_is_real_dev_registered(const struct net_device *real_dev) +{ + rx_handler_func_t *rx_handler; + + rx_handler = rcu_dereference(real_dev->rx_handler); + return (rx_handler == rmnet_rx_handler); +} + +/* Needs either rcu_read_lock() or rtnl lock */ +static struct rmnet_real_dev_info* +__rmnet_get_real_dev_info(const struct net_device *real_dev) +{ + if (rmnet_is_real_dev_registered(real_dev)) + return rcu_dereference_rtnl(real_dev->rx_handler_data); + else + return NULL; +} + +/* Needs rtnl lock */ +static struct rmnet_real_dev_info* +rmnet_get_real_dev_info_rtnl(const struct net_device *real_dev) +{ + return rtnl_dereference(real_dev->rx_handler_data); +} + +static struct rmnet_endpoint* +rmnet_get_endpoint(struct net_device *dev, int config_id) +{ + struct rmnet_real_dev_info *r; + struct rmnet_endpoint *ep; + + if (!rmnet_is_real_dev_registered(dev)) { + ep = rmnet_vnd_get_endpoint(dev); + } else { + r = __rmnet_get_real_dev_info(dev); + + if (!r) + return NULL; + + if (config_id == RMNET_LOCAL_LOGICAL_ENDPOINT) + ep = &r->local_ep; + else + ep = &r->muxed_ep[config_id]; + } + + return ep; +} + +static int rmnet_unregister_real_device(struct net_device *real_dev, + struct rmnet_real_dev_info *r) +{ + if (r->nr_rmnet_devs) + return -EINVAL; + + kfree(r); + + netdev_rx_handler_unregister(real_dev); + + /* release reference on real_dev */ + dev_put(real_dev); + + netdev_dbg(real_dev, "Removed from rmnet\n"); + return 0; +} + +static int rmnet_register_real_device(struct net_device *real_dev) +{ + struct rmnet_real_dev_info *r; + int rc; + + ASSERT_RTNL(); + + if (rmnet_is_real_dev_registered(real_dev)) + return 0; + + r = kzalloc(sizeof(*r), GFP_ATOMIC); + if (!r) + return -ENOMEM; + + r->dev = real_dev; + rc = netdev_rx_handler_register(real_dev, rmnet_rx_handler, r); + if (rc) { + kfree(r); + return -EBUSY; + } + + /* hold on to real dev for MAP data */ + dev_hold(real_dev); + + netdev_dbg(real_dev, "registered with rmnet\n"); + return 0; +} + +static int rmnet_set_ingress_data_format(struct net_device *dev, u32 idf) +{ + struct rmnet_real_dev_info *r; + + netdev_dbg(dev, "Ingress format 0x%08X\n", idf); + + r = __rmnet_get_real_dev_info(dev); + + r->ingress_data_format = idf; + + return 0; +} + +static int rmnet_set_egress_data_format(struct net_device *dev, u32 edf, + u16 agg_size, u16 agg_count) +{ + struct rmnet_real_dev_info *r; + + netdev_dbg(dev, "Egress format 0x%08X agg size %d cnt %d\n", + edf, agg_size, agg_count); + + r = __rmnet_get_real_dev_info(dev); + + r->egress_data_format = edf; + + return 0; +} + +static int __rmnet_set_endpoint_config(struct net_device *dev, int config_id, + struct rmnet_endpoint *ep) +{ + struct rmnet_endpoint *dev_ep; + + dev_ep = rmnet_get_endpoint(dev, config_id); + + if (!dev_ep) + return -EINVAL; + + memcpy(dev_ep, ep, sizeof(struct rmnet_endpoint)); + if (config_id == RMNET_LOCAL_LOGICAL_ENDPOINT) + dev_ep->mux_id = 0; + else + dev_ep->mux_id = config_id; + + return 0; +} + +static int rmnet_set_endpoint_config(struct net_device *dev, + int config_id, u8 rmnet_mode, + struct net_device *egress_dev) +{ + struct rmnet_endpoint ep; + + netdev_dbg(dev, "id %d mode %d dev %s\n", + config_id, rmnet_mode, egress_dev->name); + + if (config_id < RMNET_LOCAL_LOGICAL_ENDPOINT || + config_id >= RMNET_MAX_LOGICAL_EP) + return -EINVAL; + + /* This config is cleared on every set, so its ok to not + * clear it on a device delete. + */ + memset(&ep, 0, sizeof(struct rmnet_endpoint)); + ep.rmnet_mode = rmnet_mode; + ep.egress_dev = egress_dev; + + return __rmnet_set_endpoint_config(dev, config_id, &ep); +} + +static int rmnet_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + int ingress_format = RMNET_INGRESS_FORMAT_DEMUXING | + RMNET_INGRESS_FORMAT_DEAGGREGATION | + RMNET_INGRESS_FORMAT_MAP; + int egress_format = RMNET_EGRESS_FORMAT_MUXING | + RMNET_EGRESS_FORMAT_MAP; + struct rmnet_real_dev_info *r; + struct net_device *real_dev; + int mode = RMNET_EPMODE_VND; + int err = 0; + u16 mux_id; + + real_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); + if (!real_dev || !dev) + return -ENODEV; + + if (!data[IFLA_VLAN_ID]) + return -EINVAL; + + mux_id = nla_get_u16(data[IFLA_VLAN_ID]); + + err = rmnet_register_real_device(real_dev); + if (err) + goto err0; + + r = rmnet_get_real_dev_info_rtnl(real_dev); + err = rmnet_vnd_newlink(mux_id, dev, r); + if (err) + goto err1; + + err = netdev_master_upper_dev_link(dev, real_dev, NULL, NULL); + if (err) + goto err2; + + rmnet_vnd_set_mux(dev, mux_id); + rmnet_set_egress_data_format(real_dev, egress_format, 0, 0); + rmnet_set_ingress_data_format(real_dev, ingress_format); + rmnet_set_endpoint_config(real_dev, mux_id, mode, dev); + rmnet_set_endpoint_config(dev, mux_id, mode, real_dev); + return 0; + +err2: + rmnet_vnd_dellink(mux_id, r); +err1: + rmnet_unregister_real_device(real_dev, r); +err0: + return err; +} + +static void rmnet_dellink(struct net_device *dev, struct list_head *head) +{ + struct rmnet_real_dev_info *r; + struct net_device *real_dev; + u8 mux_id; + + rcu_read_lock(); + real_dev = netdev_master_upper_dev_get_rcu(dev); + rcu_read_unlock(); + + if (!real_dev || !rmnet_is_real_dev_registered(real_dev)) + return; + + r = rmnet_get_real_dev_info_rtnl(real_dev); + + mux_id = rmnet_vnd_get_mux(dev); + rmnet_vnd_dellink(mux_id, r); + netdev_upper_dev_unlink(dev, real_dev); + rmnet_unregister_real_device(real_dev, r); + + unregister_netdevice_queue(dev, head); +} + +static int rmnet_dev_walk_unreg(struct net_device *rmnet_dev, void *data) +{ + struct rmnet_walk_data *d = data; + u8 mux_id; + + mux_id = rmnet_vnd_get_mux(rmnet_dev); + + rmnet_vnd_dellink(mux_id, d->real_dev_info); + netdev_upper_dev_unlink(rmnet_dev, d->real_dev); + unregister_netdevice_queue(rmnet_dev, d->head); + + return 0; +} + +static void rmnet_force_unassociate_device(struct net_device *dev) +{ + struct net_device *real_dev = dev; + struct rmnet_real_dev_info *r; + struct rmnet_walk_data d; + LIST_HEAD(list); + + if (!rmnet_is_real_dev_registered(real_dev)) + return; + + ASSERT_RTNL(); + + d.real_dev = real_dev; + d.head = &list; + + r = rmnet_get_real_dev_info_rtnl(dev); + d.real_dev_info = r; + + rcu_read_lock(); + netdev_walk_all_lower_dev_rcu(real_dev, rmnet_dev_walk_unreg, &d); + rcu_read_unlock(); + unregister_netdevice_many(&list); + + rmnet_unregister_real_device(real_dev, r); +} + +static int rmnet_config_notify_cb(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct net_device *dev = netdev_notifier_info_to_dev(data); + + if (!dev) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_UNREGISTER: + netdev_dbg(dev, "Kernel unregister\n"); + rmnet_force_unassociate_device(dev); + break; + + default: + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block rmnet_dev_notifier __read_mostly = { + .notifier_call = rmnet_config_notify_cb, +}; + +static int rmnet_rtnl_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + u16 mux_id; + + if (!data || !data[IFLA_VLAN_ID]) + return -EINVAL; + + mux_id = nla_get_u16(data[IFLA_VLAN_ID]); + if (mux_id > (RMNET_MAX_LOGICAL_EP - 1)) + return -ERANGE; + + return 0; +} + +static size_t rmnet_get_size(const struct net_device *dev) +{ + return nla_total_size(2); /* IFLA_VLAN_ID */ +} + +struct rtnl_link_ops rmnet_link_ops __read_mostly = { + .kind = "rmnet", + .maxtype = __IFLA_VLAN_MAX, + .priv_size = sizeof(struct rmnet_priv), + .setup = rmnet_vnd_setup, + .validate = rmnet_rtnl_validate, + .newlink = rmnet_newlink, + .dellink = rmnet_dellink, + .get_size = rmnet_get_size, +}; + +struct rmnet_real_dev_info* +rmnet_get_real_dev_info(struct net_device *real_dev) +{ + return __rmnet_get_real_dev_info(real_dev); +} + +/* Startup/Shutdown */ + +static int __init rmnet_init(void) +{ + int rc; + + rc = register_netdevice_notifier(&rmnet_dev_notifier); + if (rc != 0) + return rc; + + rc = rtnl_link_register(&rmnet_link_ops); + if (rc != 0) { + unregister_netdevice_notifier(&rmnet_dev_notifier); + return rc; + } + return rc; +} + +static void __exit rmnet_exit(void) +{ + unregister_netdevice_notifier(&rmnet_dev_notifier); + rtnl_link_unregister(&rmnet_link_ops); +} + +module_init(rmnet_init) +module_exit(rmnet_exit) +MODULE_LICENSE("GPL v2"); diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h new file mode 100644 index 000000000000..985d372e0d8d --- /dev/null +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2013-2014, 2016-2017 The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * RMNET Data configuration engine + * + */ + +#include + +#ifndef _RMNET_CONFIG_H_ +#define _RMNET_CONFIG_H_ + +#define RMNET_MAX_LOGICAL_EP 255 +#define RMNET_MAX_VND 32 + +/* Information about the next device to deliver the packet to. + * Exact usage of this parameter depends on the rmnet_mode. + */ +struct rmnet_endpoint { + u8 rmnet_mode; + u8 mux_id; + struct net_device *egress_dev; +}; + +/* One instance of this structure is instantiated for each real_dev associated + * with rmnet. + */ +struct rmnet_real_dev_info { + struct net_device *dev; + struct rmnet_endpoint local_ep; + struct rmnet_endpoint muxed_ep[RMNET_MAX_LOGICAL_EP]; + u32 ingress_data_format; + u32 egress_data_format; + struct net_device *rmnet_devices[RMNET_MAX_VND]; + u8 nr_rmnet_devs; +}; + +extern struct rtnl_link_ops rmnet_link_ops; + +struct rmnet_priv { + struct rmnet_endpoint local_ep; + u8 mux_id; +}; + +struct rmnet_real_dev_info* +rmnet_get_real_dev_info(struct net_device *real_dev); + +#endif /* _RMNET_CONFIG_H_ */ diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c new file mode 100644 index 000000000000..7dab3bbfeda5 --- /dev/null +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c @@ -0,0 +1,271 @@ +/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * RMNET Data ingress/egress handler + * + */ + +#include +#include +#include "rmnet_private.h" +#include "rmnet_config.h" +#include "rmnet_vnd.h" +#include "rmnet_map.h" +#include "rmnet_handlers.h" + +#define RMNET_IP_VERSION_4 0x40 +#define RMNET_IP_VERSION_6 0x60 + +/* Helper Functions */ + +static void rmnet_set_skb_proto(struct sk_buff *skb) +{ + switch (skb->data[0] & 0xF0) { + case RMNET_IP_VERSION_4: + skb->protocol = htons(ETH_P_IP); + break; + case RMNET_IP_VERSION_6: + skb->protocol = htons(ETH_P_IPV6); + break; + default: + skb->protocol = htons(ETH_P_MAP); + break; + } +} + +/* Generic handler */ + +static rx_handler_result_t +rmnet_bridge_handler(struct sk_buff *skb, struct rmnet_endpoint *ep) +{ + if (!ep->egress_dev) + kfree_skb(skb); + else + rmnet_egress_handler(skb, ep); + + return RX_HANDLER_CONSUMED; +} + +static rx_handler_result_t +rmnet_deliver_skb(struct sk_buff *skb, struct rmnet_endpoint *ep) +{ + switch (ep->rmnet_mode) { + case RMNET_EPMODE_NONE: + return RX_HANDLER_PASS; + + case RMNET_EPMODE_BRIDGE: + return rmnet_bridge_handler(skb, ep); + + case RMNET_EPMODE_VND: + skb_reset_transport_header(skb); + skb_reset_network_header(skb); + rmnet_vnd_rx_fixup(skb, skb->dev); + + skb->pkt_type = PACKET_HOST; + skb_set_mac_header(skb, 0); + netif_receive_skb(skb); + return RX_HANDLER_CONSUMED; + + default: + kfree_skb(skb); + return RX_HANDLER_CONSUMED; + } +} + +static rx_handler_result_t +rmnet_ingress_deliver_packet(struct sk_buff *skb, + struct rmnet_real_dev_info *r) +{ + if (!r) { + kfree_skb(skb); + return RX_HANDLER_CONSUMED; + } + + skb->dev = r->local_ep.egress_dev; + + return rmnet_deliver_skb(skb, &r->local_ep); +} + +/* MAP handler */ + +static rx_handler_result_t +__rmnet_map_ingress_handler(struct sk_buff *skb, + struct rmnet_real_dev_info *r) +{ + struct rmnet_endpoint *ep; + u8 mux_id; + u16 len; + + if (RMNET_MAP_GET_CD_BIT(skb)) { + if (r->ingress_data_format + & RMNET_INGRESS_FORMAT_MAP_COMMANDS) + return rmnet_map_command(skb, r); + + kfree_skb(skb); + return RX_HANDLER_CONSUMED; + } + + mux_id = RMNET_MAP_GET_MUX_ID(skb); + len = RMNET_MAP_GET_LENGTH(skb) - RMNET_MAP_GET_PAD(skb); + + if (mux_id >= RMNET_MAX_LOGICAL_EP) { + kfree_skb(skb); + return RX_HANDLER_CONSUMED; + } + + ep = &r->muxed_ep[mux_id]; + + if (r->ingress_data_format & RMNET_INGRESS_FORMAT_DEMUXING) + skb->dev = ep->egress_dev; + + /* Subtract MAP header */ + skb_pull(skb, sizeof(struct rmnet_map_header)); + skb_trim(skb, len); + rmnet_set_skb_proto(skb); + return rmnet_deliver_skb(skb, ep); +} + +static rx_handler_result_t +rmnet_map_ingress_handler(struct sk_buff *skb, + struct rmnet_real_dev_info *r) +{ + struct sk_buff *skbn; + int rc; + + if (r->ingress_data_format & RMNET_INGRESS_FORMAT_DEAGGREGATION) { + while ((skbn = rmnet_map_deaggregate(skb, r)) != NULL) + __rmnet_map_ingress_handler(skbn, r); + + consume_skb(skb); + rc = RX_HANDLER_CONSUMED; + } else { + rc = __rmnet_map_ingress_handler(skb, r); + } + + return rc; +} + +static int rmnet_map_egress_handler(struct sk_buff *skb, + struct rmnet_real_dev_info *r, + struct rmnet_endpoint *ep, + struct net_device *orig_dev) +{ + int required_headroom, additional_header_len; + struct rmnet_map_header *map_header; + + additional_header_len = 0; + required_headroom = sizeof(struct rmnet_map_header); + + if (skb_headroom(skb) < required_headroom) { + if (pskb_expand_head(skb, required_headroom, 0, GFP_KERNEL)) + return RMNET_MAP_CONSUMED; + } + + map_header = rmnet_map_add_map_header(skb, additional_header_len, 0); + if (!map_header) + return RMNET_MAP_CONSUMED; + + if (r->egress_data_format & RMNET_EGRESS_FORMAT_MUXING) { + if (ep->mux_id == 0xff) + map_header->mux_id = 0; + else + map_header->mux_id = ep->mux_id; + } + + skb->protocol = htons(ETH_P_MAP); + + return RMNET_MAP_SUCCESS; +} + +/* Ingress / Egress Entry Points */ + +/* Processes packet as per ingress data format for receiving device. Logical + * endpoint is determined from packet inspection. Packet is then sent to the + * egress device listed in the logical endpoint configuration. + */ +rx_handler_result_t rmnet_rx_handler(struct sk_buff **pskb) +{ + struct rmnet_real_dev_info *r; + struct sk_buff *skb = *pskb; + struct net_device *dev; + int rc; + + if (!skb) + return RX_HANDLER_CONSUMED; + + dev = skb->dev; + r = rmnet_get_real_dev_info(dev); + + if (r->ingress_data_format & RMNET_INGRESS_FORMAT_MAP) { + rc = rmnet_map_ingress_handler(skb, r); + } else { + switch (ntohs(skb->protocol)) { + case ETH_P_MAP: + if (r->local_ep.rmnet_mode == + RMNET_EPMODE_BRIDGE) { + rc = rmnet_ingress_deliver_packet(skb, r); + } else { + kfree_skb(skb); + rc = RX_HANDLER_CONSUMED; + } + break; + + case ETH_P_IP: + case ETH_P_IPV6: + rc = rmnet_ingress_deliver_packet(skb, r); + break; + + default: + rc = RX_HANDLER_PASS; + } + } + + return rc; +} + +/* Modifies packet as per logical endpoint configuration and egress data format + * for egress device configured in logical endpoint. Packet is then transmitted + * on the egress device. + */ +void rmnet_egress_handler(struct sk_buff *skb, + struct rmnet_endpoint *ep) +{ + struct rmnet_real_dev_info *r; + struct net_device *orig_dev; + + orig_dev = skb->dev; + skb->dev = ep->egress_dev; + + r = rmnet_get_real_dev_info(skb->dev); + if (!r) { + kfree_skb(skb); + return; + } + + if (r->egress_data_format & RMNET_EGRESS_FORMAT_MAP) { + switch (rmnet_map_egress_handler(skb, r, ep, orig_dev)) { + case RMNET_MAP_CONSUMED: + return; + + case RMNET_MAP_SUCCESS: + break; + + default: + kfree_skb(skb); + return; + } + } + + if (ep->rmnet_mode == RMNET_EPMODE_VND) + rmnet_vnd_tx_fixup(skb, orig_dev); + + dev_queue_xmit(skb); +} diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h new file mode 100644 index 000000000000..f2638cf5693c --- /dev/null +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h @@ -0,0 +1,26 @@ +/* Copyright (c) 2013, 2016-2017 The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * RMNET Data ingress/egress handler + * + */ + +#ifndef _RMNET_HANDLERS_H_ +#define _RMNET_HANDLERS_H_ + +#include "rmnet_config.h" + +void rmnet_egress_handler(struct sk_buff *skb, + struct rmnet_endpoint *ep); + +rx_handler_result_t rmnet_rx_handler(struct sk_buff **pskb); + +#endif /* _RMNET_HANDLERS_H_ */ diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h new file mode 100644 index 000000000000..2aabad289b10 --- /dev/null +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _RMNET_MAP_H_ +#define _RMNET_MAP_H_ + +struct rmnet_map_control_command { + u8 command_name; + u8 cmd_type:2; + u8 reserved:6; + u16 reserved2; + u32 transaction_id; + union { + struct { + u16 ip_family:2; + u16 reserved:14; + u16 flow_control_seq_num; + u32 qos_id; + } flow_control; + u8 data[0]; + }; +} __aligned(1); + +enum rmnet_map_results { + RMNET_MAP_SUCCESS, + RMNET_MAP_CONSUMED, + RMNET_MAP_GENERAL_FAILURE, + RMNET_MAP_NOT_ENABLED, + RMNET_MAP_FAILED_AGGREGATION, + RMNET_MAP_FAILED_MUX +}; + +enum rmnet_map_commands { + RMNET_MAP_COMMAND_NONE, + RMNET_MAP_COMMAND_FLOW_DISABLE, + RMNET_MAP_COMMAND_FLOW_ENABLE, + /* These should always be the last 2 elements */ + RMNET_MAP_COMMAND_UNKNOWN, + RMNET_MAP_COMMAND_ENUM_LENGTH +}; + +struct rmnet_map_header { + u8 pad_len:6; + u8 reserved_bit:1; + u8 cd_bit:1; + u8 mux_id; + u16 pkt_len; +} __aligned(1); + +#define RMNET_MAP_GET_MUX_ID(Y) (((struct rmnet_map_header *) \ + (Y)->data)->mux_id) +#define RMNET_MAP_GET_CD_BIT(Y) (((struct rmnet_map_header *) \ + (Y)->data)->cd_bit) +#define RMNET_MAP_GET_PAD(Y) (((struct rmnet_map_header *) \ + (Y)->data)->pad_len) +#define RMNET_MAP_GET_CMD_START(Y) ((struct rmnet_map_control_command *) \ + ((Y)->data + \ + sizeof(struct rmnet_map_header))) +#define RMNET_MAP_GET_LENGTH(Y) (ntohs(((struct rmnet_map_header *) \ + (Y)->data)->pkt_len)) + +#define RMNET_MAP_COMMAND_REQUEST 0 +#define RMNET_MAP_COMMAND_ACK 1 +#define RMNET_MAP_COMMAND_UNSUPPORTED 2 +#define RMNET_MAP_COMMAND_INVALID 3 + +#define RMNET_MAP_NO_PAD_BYTES 0 +#define RMNET_MAP_ADD_PAD_BYTES 1 + +u8 rmnet_map_demultiplex(struct sk_buff *skb); +struct sk_buff *rmnet_map_deaggregate(struct sk_buff *skb, + struct rmnet_real_dev_info *rdinfo); + +struct rmnet_map_header *rmnet_map_add_map_header(struct sk_buff *skb, + int hdrlen, int pad); +rx_handler_result_t rmnet_map_command(struct sk_buff *skb, + struct rmnet_real_dev_info *rdinfo); + +#endif /* _RMNET_MAP_H_ */ diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c new file mode 100644 index 000000000000..ccded40ee551 --- /dev/null +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c @@ -0,0 +1,107 @@ +/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include "rmnet_config.h" +#include "rmnet_map.h" +#include "rmnet_private.h" +#include "rmnet_vnd.h" + +static u8 rmnet_map_do_flow_control(struct sk_buff *skb, + struct rmnet_real_dev_info *rdinfo, + int enable) +{ + struct rmnet_map_control_command *cmd; + struct rmnet_endpoint *ep; + struct net_device *vnd; + u16 ip_family; + u16 fc_seq; + u32 qos_id; + u8 mux_id; + int r; + + mux_id = RMNET_MAP_GET_MUX_ID(skb); + cmd = RMNET_MAP_GET_CMD_START(skb); + + if (mux_id >= RMNET_MAX_LOGICAL_EP) { + kfree_skb(skb); + return RX_HANDLER_CONSUMED; + } + + ep = &rdinfo->muxed_ep[mux_id]; + vnd = ep->egress_dev; + + ip_family = cmd->flow_control.ip_family; + fc_seq = ntohs(cmd->flow_control.flow_control_seq_num); + qos_id = ntohl(cmd->flow_control.qos_id); + + /* Ignore the ip family and pass the sequence number for both v4 and v6 + * sequence. User space does not support creating dedicated flows for + * the 2 protocols + */ + r = rmnet_vnd_do_flow_control(vnd, enable); + if (r) { + kfree_skb(skb); + return RMNET_MAP_COMMAND_UNSUPPORTED; + } else { + return RMNET_MAP_COMMAND_ACK; + } +} + +static void rmnet_map_send_ack(struct sk_buff *skb, + unsigned char type, + struct rmnet_real_dev_info *rdinfo) +{ + struct rmnet_map_control_command *cmd; + int xmit_status; + + skb->protocol = htons(ETH_P_MAP); + + cmd = RMNET_MAP_GET_CMD_START(skb); + cmd->cmd_type = type & 0x03; + + netif_tx_lock(skb->dev); + xmit_status = skb->dev->netdev_ops->ndo_start_xmit(skb, skb->dev); + netif_tx_unlock(skb->dev); +} + +/* Process MAP command frame and send N/ACK message as appropriate. Message cmd + * name is decoded here and appropriate handler is called. + */ +rx_handler_result_t rmnet_map_command(struct sk_buff *skb, + struct rmnet_real_dev_info *rdinfo) +{ + struct rmnet_map_control_command *cmd; + unsigned char command_name; + unsigned char rc = 0; + + cmd = RMNET_MAP_GET_CMD_START(skb); + command_name = cmd->command_name; + + switch (command_name) { + case RMNET_MAP_COMMAND_FLOW_ENABLE: + rc = rmnet_map_do_flow_control(skb, rdinfo, 1); + break; + + case RMNET_MAP_COMMAND_FLOW_DISABLE: + rc = rmnet_map_do_flow_control(skb, rdinfo, 0); + break; + + default: + rc = RMNET_MAP_COMMAND_UNSUPPORTED; + kfree_skb(skb); + break; + } + if (rc == RMNET_MAP_COMMAND_ACK) + rmnet_map_send_ack(skb, rc, rdinfo); + return RX_HANDLER_CONSUMED; +} diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c new file mode 100644 index 000000000000..a29c476a4755 --- /dev/null +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c @@ -0,0 +1,105 @@ +/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * RMNET Data MAP protocol + * + */ + +#include +#include "rmnet_config.h" +#include "rmnet_map.h" +#include "rmnet_private.h" + +#define RMNET_MAP_DEAGGR_SPACING 64 +#define RMNET_MAP_DEAGGR_HEADROOM (RMNET_MAP_DEAGGR_SPACING / 2) + +/* Adds MAP header to front of skb->data + * Padding is calculated and set appropriately in MAP header. Mux ID is + * initialized to 0. + */ +struct rmnet_map_header *rmnet_map_add_map_header(struct sk_buff *skb, + int hdrlen, int pad) +{ + struct rmnet_map_header *map_header; + u32 padding, map_datalen; + u8 *padbytes; + + if (skb_headroom(skb) < sizeof(struct rmnet_map_header)) + return NULL; + + map_datalen = skb->len - hdrlen; + map_header = (struct rmnet_map_header *) + skb_push(skb, sizeof(struct rmnet_map_header)); + memset(map_header, 0, sizeof(struct rmnet_map_header)); + + if (pad == RMNET_MAP_NO_PAD_BYTES) { + map_header->pkt_len = htons(map_datalen); + return map_header; + } + + padding = ALIGN(map_datalen, 4) - map_datalen; + + if (padding == 0) + goto done; + + if (skb_tailroom(skb) < padding) + return NULL; + + padbytes = (u8 *)skb_put(skb, padding); + memset(padbytes, 0, padding); + +done: + map_header->pkt_len = htons(map_datalen + padding); + map_header->pad_len = padding & 0x3F; + + return map_header; +} + +/* Deaggregates a single packet + * A whole new buffer is allocated for each portion of an aggregated frame. + * Caller should keep calling deaggregate() on the source skb until 0 is + * returned, indicating that there are no more packets to deaggregate. Caller + * is responsible for freeing the original skb. + */ +struct sk_buff *rmnet_map_deaggregate(struct sk_buff *skb, + struct rmnet_real_dev_info *rdinfo) +{ + struct rmnet_map_header *maph; + struct sk_buff *skbn; + u32 packet_len; + + if (skb->len == 0) + return NULL; + + maph = (struct rmnet_map_header *)skb->data; + packet_len = ntohs(maph->pkt_len) + sizeof(struct rmnet_map_header); + + if (((int)skb->len - (int)packet_len) < 0) + return NULL; + + skbn = alloc_skb(packet_len + RMNET_MAP_DEAGGR_SPACING, GFP_ATOMIC); + if (!skbn) + return NULL; + + skbn->dev = skb->dev; + skb_reserve(skbn, RMNET_MAP_DEAGGR_HEADROOM); + skb_put(skbn, packet_len); + memcpy(skbn->data, skb->data, packet_len); + skb_pull(skb, packet_len); + + /* Some hardware can send us empty frames. Catch them */ + if (ntohs(maph->pkt_len) == 0) { + kfree_skb(skb); + return NULL; + } + + return skbn; +} diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h new file mode 100644 index 000000000000..ed820b5522f5 --- /dev/null +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2013-2014, 2016-2017 The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _RMNET_PRIVATE_H_ +#define _RMNET_PRIVATE_H_ + +#define RMNET_MAX_VND 32 +#define RMNET_MAX_PACKET_SIZE 16384 +#define RMNET_DFLT_PACKET_SIZE 1500 +#define RMNET_NEEDED_HEADROOM 16 +#define RMNET_TX_QUEUE_LEN 1000 + +/* Constants */ +#define RMNET_EGRESS_FORMAT__RESERVED__ BIT(0) +#define RMNET_EGRESS_FORMAT_MAP BIT(1) +#define RMNET_EGRESS_FORMAT_AGGREGATION BIT(2) +#define RMNET_EGRESS_FORMAT_MUXING BIT(3) +#define RMNET_EGRESS_FORMAT_MAP_CKSUMV3 BIT(4) +#define RMNET_EGRESS_FORMAT_MAP_CKSUMV4 BIT(5) + +#define RMNET_INGRESS_FIX_ETHERNET BIT(0) +#define RMNET_INGRESS_FORMAT_MAP BIT(1) +#define RMNET_INGRESS_FORMAT_DEAGGREGATION BIT(2) +#define RMNET_INGRESS_FORMAT_DEMUXING BIT(3) +#define RMNET_INGRESS_FORMAT_MAP_COMMANDS BIT(4) +#define RMNET_INGRESS_FORMAT_MAP_CKSUMV3 BIT(5) +#define RMNET_INGRESS_FORMAT_MAP_CKSUMV4 BIT(6) + +/* Pass the frame up the stack with no modifications to skb->dev */ +#define RMNET_EPMODE_NONE (0) +/* Replace skb->dev to a virtual rmnet device and pass up the stack */ +#define RMNET_EPMODE_VND (1) +/* Pass the frame directly to another device with dev_queue_xmit() */ +#define RMNET_EPMODE_BRIDGE (2) + +#endif /* _RMNET_PRIVATE_H_ */ diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c new file mode 100644 index 000000000000..c8b573d28dcf --- /dev/null +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c @@ -0,0 +1,170 @@ +/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * RMNET Data virtual network driver + * + */ + +#include +#include +#include +#include "rmnet_config.h" +#include "rmnet_handlers.h" +#include "rmnet_private.h" +#include "rmnet_map.h" +#include "rmnet_vnd.h" + +/* RX/TX Fixup */ + +void rmnet_vnd_rx_fixup(struct sk_buff *skb, struct net_device *dev) +{ + dev->stats.rx_packets++; + dev->stats.rx_bytes += skb->len; +} + +void rmnet_vnd_tx_fixup(struct sk_buff *skb, struct net_device *dev) +{ + dev->stats.tx_packets++; + dev->stats.tx_bytes += skb->len; +} + +/* Network Device Operations */ + +static netdev_tx_t rmnet_vnd_start_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct rmnet_priv *priv; + + priv = netdev_priv(dev); + if (priv->local_ep.egress_dev) { + rmnet_egress_handler(skb, &priv->local_ep); + } else { + dev->stats.tx_dropped++; + kfree_skb(skb); + } + return NETDEV_TX_OK; +} + +static int rmnet_vnd_change_mtu(struct net_device *rmnet_dev, int new_mtu) +{ + if (new_mtu < 0 || new_mtu > RMNET_MAX_PACKET_SIZE) + return -EINVAL; + + rmnet_dev->mtu = new_mtu; + return 0; +} + +static const struct net_device_ops rmnet_vnd_ops = { + .ndo_start_xmit = rmnet_vnd_start_xmit, + .ndo_change_mtu = rmnet_vnd_change_mtu, +}; + +/* Called by kernel whenever a new rmnet device is created. Sets MTU, + * flags, ARP type, needed headroom, etc... + */ +void rmnet_vnd_setup(struct net_device *rmnet_dev) +{ + struct rmnet_priv *priv; + + priv = netdev_priv(rmnet_dev); + netdev_dbg(rmnet_dev, "Setting up device %s\n", rmnet_dev->name); + + rmnet_dev->netdev_ops = &rmnet_vnd_ops; + rmnet_dev->mtu = RMNET_DFLT_PACKET_SIZE; + rmnet_dev->needed_headroom = RMNET_NEEDED_HEADROOM; + random_ether_addr(rmnet_dev->dev_addr); + rmnet_dev->tx_queue_len = RMNET_TX_QUEUE_LEN; + + /* Raw IP mode */ + rmnet_dev->header_ops = NULL; /* No header */ + rmnet_dev->type = ARPHRD_RAWIP; + rmnet_dev->hard_header_len = 0; + rmnet_dev->flags &= ~(IFF_BROADCAST | IFF_MULTICAST); + + rmnet_dev->needs_free_netdev = true; +} + +/* Exposed API */ + +int rmnet_vnd_newlink(u8 id, struct net_device *rmnet_dev, + struct rmnet_real_dev_info *r) +{ + int rc; + + if (r->rmnet_devices[id]) + return -EINVAL; + + rc = register_netdevice(rmnet_dev); + if (!rc) { + r->rmnet_devices[id] = rmnet_dev; + r->nr_rmnet_devs++; + rmnet_dev->rtnl_link_ops = &rmnet_link_ops; + } + + return rc; +} + +int rmnet_vnd_dellink(u8 id, struct rmnet_real_dev_info *r) +{ + if (id >= RMNET_MAX_VND || !r->rmnet_devices[id]) + return -EINVAL; + + r->rmnet_devices[id] = NULL; + r->nr_rmnet_devs--; + return 0; +} + +u8 rmnet_vnd_get_mux(struct net_device *rmnet_dev) +{ + struct rmnet_priv *priv; + + priv = netdev_priv(rmnet_dev); + return priv->mux_id; +} + +void rmnet_vnd_set_mux(struct net_device *rmnet_dev, u8 mux_id) +{ + struct rmnet_priv *priv; + + priv = netdev_priv(rmnet_dev); + priv->mux_id = mux_id; +} + +/* Gets the logical endpoint configuration for a RmNet virtual network device + * node. Caller should confirm that devices is a RmNet VND before calling. + */ +struct rmnet_endpoint *rmnet_vnd_get_endpoint(struct net_device *rmnet_dev) +{ + struct rmnet_priv *priv; + + if (!rmnet_dev) + return NULL; + + priv = netdev_priv(rmnet_dev); + + return &priv->local_ep; +} + +int rmnet_vnd_do_flow_control(struct net_device *rmnet_dev, int enable) +{ + netdev_dbg(rmnet_dev, "Setting VND TX queue state to %d\n", enable); + /* Although we expect similar number of enable/disable + * commands, optimize for the disable. That is more + * latency sensitive than enable + */ + if (unlikely(enable)) + netif_wake_queue(rmnet_dev); + else + netif_stop_queue(rmnet_dev); + + return 0; +} diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h new file mode 100644 index 000000000000..b102b4269be1 --- /dev/null +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * RMNET Data Virtual Network Device APIs + * + */ + +#ifndef _RMNET_VND_H_ +#define _RMNET_VND_H_ + +int rmnet_vnd_do_flow_control(struct net_device *dev, int enable); +struct rmnet_endpoint *rmnet_vnd_get_endpoint(struct net_device *dev); +int rmnet_vnd_newlink(u8 id, struct net_device *rmnet_dev, + struct rmnet_real_dev_info *r); +int rmnet_vnd_dellink(u8 id, struct rmnet_real_dev_info *r); +void rmnet_vnd_rx_fixup(struct sk_buff *skb, struct net_device *dev); +void rmnet_vnd_tx_fixup(struct sk_buff *skb, struct net_device *dev); +u8 rmnet_vnd_get_mux(struct net_device *rmnet_dev); +void rmnet_vnd_set_mux(struct net_device *rmnet_dev, u8 mux_id); +void rmnet_vnd_setup(struct net_device *dev); +#endif /* _RMNET_VND_H_ */ -- cgit v1.2.3 From 9c2cbd478ee1830670e562a68d74b2336721b202 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Wed, 30 Aug 2017 10:29:14 +0200 Subject: Documentation/bindings: phy: document the Marvell comphy driver The Marvell Armada 7K/8K SoCs contains an hardware block called COMPHY that provides a number of shared PHYs used by various interfaces in the SoC: network, SATA, PCIe, etc. This Device Tree binding allows to describe this COMPHY hardware block. Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- .../devicetree/bindings/phy/phy-mvebu-comphy.txt | 43 ++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 Documentation/devicetree/bindings/phy/phy-mvebu-comphy.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/phy/phy-mvebu-comphy.txt b/Documentation/devicetree/bindings/phy/phy-mvebu-comphy.txt new file mode 100644 index 000000000000..bfcf80341657 --- /dev/null +++ b/Documentation/devicetree/bindings/phy/phy-mvebu-comphy.txt @@ -0,0 +1,43 @@ +mvebu comphy driver +------------------- + +A comphy controller can be found on Marvell Armada 7k/8k on the CP110. It +provides a number of shared PHYs used by various interfaces (network, sata, +usb, PCIe...). + +Required properties: + +- compatible: should be "marvell,comphy-cp110" +- reg: should contain the comphy register location and length. +- marvell,system-controller: should contain a phandle to the + system controller node. +- #address-cells: should be 1. +- #size-cells: should be 0. + +A sub-node is required for each comphy lane provided by the comphy. + +Required properties (child nodes): + +- reg: comphy lane number. +- #phy-cells : from the generic phy bindings, must be 1. Defines the + input port to use for a given comphy lane. + +Example: + + cpm_comphy: phy@120000 { + compatible = "marvell,comphy-cp110"; + reg = <0x120000 0x6000>; + marvell,system-controller = <&cpm_syscon0>; + #address-cells = <1>; + #size-cells = <0>; + + cpm_comphy0: phy@0 { + reg = <0>; + #phy-cells = <1>; + }; + + cpm_comphy1: phy@1 { + reg = <1>; + #phy-cells = <1>; + }; + }; -- cgit v1.2.3 From d35d6e92caa00bd44596108f2d7aa560d7057f4a Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Wed, 30 Aug 2017 13:37:22 -0700 Subject: hv_netvsc: Fix typos in the document of UDP hashing There are two typos in the document, netvsc.txt, regarding UDP hashing level. This patch fixes them. Signed-off-by: Haiyang Zhang Signed-off-by: David S. Miller --- Documentation/networking/netvsc.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/netvsc.txt b/Documentation/networking/netvsc.txt index fa8d86356791..93560fb1170a 100644 --- a/Documentation/networking/netvsc.txt +++ b/Documentation/networking/netvsc.txt @@ -32,9 +32,9 @@ Features hashing. Using L3 hashing is recommended in this case. For example, for UDP over IPv4 on eth0: - To include UDP port numbers in hasing: + To include UDP port numbers in hashing: ethtool -N eth0 rx-flow-hash udp4 sdfn - To exclude UDP port numbers in hasing: + To exclude UDP port numbers in hashing: ethtool -N eth0 rx-flow-hash udp4 sd To show UDP hash level: ethtool -n eth0 rx-flow-hash udp4 -- cgit v1.2.3 From db40b4d1478c86bf8b1b41a4a5b5d10b0da39f0c Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Fri, 1 Sep 2017 11:04:55 +0200 Subject: Documentation/bindings: net: marvell-pp2: add the link interrupt A link interrupt can be described. Document this valid interrupt name. Signed-off-by: Antoine Tenart Tested-by: Marcin Wojtas Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/marvell-pp2.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/marvell-pp2.txt b/Documentation/devicetree/bindings/net/marvell-pp2.txt index 49484db81583..c78f3187dfea 100644 --- a/Documentation/devicetree/bindings/net/marvell-pp2.txt +++ b/Documentation/devicetree/bindings/net/marvell-pp2.txt @@ -44,7 +44,7 @@ Optional properties (port): - interrupt-names: if more than a single interrupt for rx is given, must be the name associated to the interrupts listed. Valid names are: "tx-cpu0", "tx-cpu1", "tx-cpu2", "tx-cpu3", - "rx-shared". + "rx-shared", "link". - marvell,system-controller: a phandle to the system controller. Example for marvell,armada-375-pp2: -- cgit v1.2.3 From cc8889ae8298ebfc6bbf52ad98fe3b5afdf4ae70 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 1 Sep 2017 12:01:41 -0400 Subject: doc: document MSG_ZEROCOPY Documentation for this feature was missing from the patchset. Copied a lot from the netdev 2.1 paper, addressing some small interface changes since then. Changes v1 -> v2 - change email discussion URL format - clarify that u32 counter is per-syscall, unsigned and wraps after UINT_MAX calls - describe errno on send failure specific to MSG_ZEROCOPY - a few very minor rewordings Signed-off-by: Willem de Bruijn Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- Documentation/networking/msg_zerocopy.rst | 257 ++++++++++++++++++++++++++++++ 1 file changed, 257 insertions(+) create mode 100644 Documentation/networking/msg_zerocopy.rst (limited to 'Documentation') diff --git a/Documentation/networking/msg_zerocopy.rst b/Documentation/networking/msg_zerocopy.rst new file mode 100644 index 000000000000..77f6d7e25cfd --- /dev/null +++ b/Documentation/networking/msg_zerocopy.rst @@ -0,0 +1,257 @@ + +============ +MSG_ZEROCOPY +============ + +Intro +===== + +The MSG_ZEROCOPY flag enables copy avoidance for socket send calls. +The feature is currently implemented for TCP sockets. + + +Opportunity and Caveats +----------------------- + +Copying large buffers between user process and kernel can be +expensive. Linux supports various interfaces that eschew copying, +such as sendpage and splice. The MSG_ZEROCOPY flag extends the +underlying copy avoidance mechanism to common socket send calls. + +Copy avoidance is not a free lunch. As implemented, with page pinning, +it replaces per byte copy cost with page accounting and completion +notification overhead. As a result, MSG_ZEROCOPY is generally only +effective at writes over around 10 KB. + +Page pinning also changes system call semantics. It temporarily shares +the buffer between process and network stack. Unlike with copying, the +process cannot immediately overwrite the buffer after system call +return without possibly modifying the data in flight. Kernel integrity +is not affected, but a buggy program can possibly corrupt its own data +stream. + +The kernel returns a notification when it is safe to modify data. +Converting an existing application to MSG_ZEROCOPY is not always as +trivial as just passing the flag, then. + + +More Info +--------- + +Much of this document was derived from a longer paper presented at +netdev 2.1. For more in-depth information see that paper and talk, +the excellent reporting over at LWN.net or read the original code. + + paper, slides, video + https://netdevconf.org/2.1/session.html?debruijn + + LWN article + https://lwn.net/Articles/726917/ + + patchset + [PATCH net-next v4 0/9] socket sendmsg MSG_ZEROCOPY + http://lkml.kernel.org/r/20170803202945.70750-1-willemdebruijn.kernel@gmail.com + + +Interface +========= + +Passing the MSG_ZEROCOPY flag is the most obvious step to enable copy +avoidance, but not the only one. + +Socket Setup +------------ + +The kernel is permissive when applications pass undefined flags to the +send system call. By default it simply ignores these. To avoid enabling +copy avoidance mode for legacy processes that accidentally already pass +this flag, a process must first signal intent by setting a socket option: + +:: + + if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one))) + error(1, errno, "setsockopt zerocopy"); + + +Transmission +------------ + +The change to send (or sendto, sendmsg, sendmmsg) itself is trivial. +Pass the new flag. + +:: + + ret = send(fd, buf, sizeof(buf), MSG_ZEROCOPY); + +A zerocopy failure will return -1 with errno ENOBUFS. This happens if +the socket option was not set, the socket exceeds its optmem limit or +the user exceeds its ulimit on locked pages. + + +Mixing copy avoidance and copying +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Many workloads have a mixture of large and small buffers. Because copy +avoidance is more expensive than copying for small packets, the +feature is implemented as a flag. It is safe to mix calls with the flag +with those without. + + +Notifications +------------- + +The kernel has to notify the process when it is safe to reuse a +previously passed buffer. It queues completion notifications on the +socket error queue, akin to the transmit timestamping interface. + +The notification itself is a simple scalar value. Each socket +maintains an internal unsigned 32-bit counter. Each send call with +MSG_ZEROCOPY that successfully sends data increments the counter. The +counter is not incremented on failure or if called with length zero. +The counter counts system call invocations, not bytes. It wraps after +UINT_MAX calls. + + +Notification Reception +~~~~~~~~~~~~~~~~~~~~~~ + +The below snippet demonstrates the API. In the simplest case, each +send syscall is followed by a poll and recvmsg on the error queue. + +Reading from the error queue is always a non-blocking operation. The +poll call is there to block until an error is outstanding. It will set +POLLERR in its output flags. That flag does not have to be set in the +events field. Errors are signaled unconditionally. + +:: + + pfd.fd = fd; + pfd.events = 0; + if (poll(&pfd, 1, -1) != 1 || pfd.revents & POLLERR == 0) + error(1, errno, "poll"); + + ret = recvmsg(fd, &msg, MSG_ERRQUEUE); + if (ret == -1) + error(1, errno, "recvmsg"); + + read_notification(msg); + +The example is for demonstration purpose only. In practice, it is more +efficient to not wait for notifications, but read without blocking +every couple of send calls. + +Notifications can be processed out of order with other operations on +the socket. A socket that has an error queued would normally block +other operations until the error is read. Zerocopy notifications have +a zero error code, however, to not block send and recv calls. + + +Notification Batching +~~~~~~~~~~~~~~~~~~~~~ + +Multiple outstanding packets can be read at once using the recvmmsg +call. This is often not needed. In each message the kernel returns not +a single value, but a range. It coalesces consecutive notifications +while one is outstanding for reception on the error queue. + +When a new notification is about to be queued, it checks whether the +new value extends the range of the notification at the tail of the +queue. If so, it drops the new notification packet and instead increases +the range upper value of the outstanding notification. + +For protocols that acknowledge data in-order, like TCP, each +notification can be squashed into the previous one, so that no more +than one notification is outstanding at any one point. + +Ordered delivery is the common case, but not guaranteed. Notifications +may arrive out of order on retransmission and socket teardown. + + +Notification Parsing +~~~~~~~~~~~~~~~~~~~~ + +The below snippet demonstrates how to parse the control message: the +read_notification() call in the previous snippet. A notification +is encoded in the standard error format, sock_extended_err. + +The level and type fields in the control data are protocol family +specific, IP_RECVERR or IPV6_RECVERR. + +Error origin is the new type SO_EE_ORIGIN_ZEROCOPY. ee_errno is zero, +as explained before, to avoid blocking read and write system calls on +the socket. + +The 32-bit notification range is encoded as [ee_info, ee_data]. This +range is inclusive. Other fields in the struct must be treated as +undefined, bar for ee_code, as discussed below. + +:: + + struct sock_extended_err *serr; + struct cmsghdr *cm; + + cm = CMSG_FIRSTHDR(msg); + if (cm->cmsg_level != SOL_IP && + cm->cmsg_type != IP_RECVERR) + error(1, 0, "cmsg"); + + serr = (void *) CMSG_DATA(cm); + if (serr->ee_errno != 0 || + serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) + error(1, 0, "serr"); + + printf("completed: %u..%u\n", serr->ee_info, serr->ee_data); + + +Deferred copies +~~~~~~~~~~~~~~~ + +Passing flag MSG_ZEROCOPY is a hint to the kernel to apply copy +avoidance, and a contract that the kernel will queue a completion +notification. It is not a guarantee that the copy is elided. + +Copy avoidance is not always feasible. Devices that do not support +scatter-gather I/O cannot send packets made up of kernel generated +protocol headers plus zerocopy user data. A packet may need to be +converted to a private copy of data deep in the stack, say to compute +a checksum. + +In all these cases, the kernel returns a completion notification when +it releases its hold on the shared pages. That notification may arrive +before the (copied) data is fully transmitted. A zerocopy completion +notification is not a transmit completion notification, therefore. + +Deferred copies can be more expensive than a copy immediately in the +system call, if the data is no longer warm in the cache. The process +also incurs notification processing cost for no benefit. For this +reason, the kernel signals if data was completed with a copy, by +setting flag SO_EE_CODE_ZEROCOPY_COPIED in field ee_code on return. +A process may use this signal to stop passing flag MSG_ZEROCOPY on +subsequent requests on the same socket. + + +Implementation +============== + +Loopback +-------- + +Data sent to local sockets can be queued indefinitely if the receive +process does not read its socket. Unbound notification latency is not +acceptable. For this reason all packets generated with MSG_ZEROCOPY +that are looped to a local socket will incur a deferred copy. This +includes looping onto packet sockets (e.g., tcpdump) and tun devices. + + +Testing +======= + +More realistic example code can be found in the kernel source under +tools/testing/selftests/net/msg_zerocopy.c. + +Be cognizant of the loopback constraint. The test can be run between +a pair of hosts. But if run between a local pair of processes, for +instance when run with msg_zerocopy.sh between a veth pair across +namespaces, the test will not show any improvement. For testing, the +loopback restriction can be temporarily relaxed by making +skb_orphan_frags_rx identical to skb_orphan_frags. -- cgit v1.2.3