diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-01-31 14:31:10 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-01-31 14:31:10 -0800 |
commit | b2fe5fa68642860e7de76167c3111623aa0d5de1 (patch) | |
tree | b7f9b89b7039ecefbc35fe3c8e73a6ff972641dd /samples | |
parent | a103950e0dd2058df5e8a8d4a915707bdcf205f0 (diff) | |
parent | a54667f6728c2714a400f3c884727da74b6d1717 (diff) | |
download | lwn-b2fe5fa68642860e7de76167c3111623aa0d5de1.tar.gz lwn-b2fe5fa68642860e7de76167c3111623aa0d5de1.zip |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
1) Significantly shrink the core networking routing structures. Result
of http://vger.kernel.org/~davem/seoul2017_netdev_keynote.pdf
2) Add netdevsim driver for testing various offloads, from Jakub
Kicinski.
3) Support cross-chip FDB operations in DSA, from Vivien Didelot.
4) Add a 2nd listener hash table for TCP, similar to what was done for
UDP. From Martin KaFai Lau.
5) Add eBPF based queue selection to tun, from Jason Wang.
6) Lockless qdisc support, from John Fastabend.
7) SCTP stream interleave support, from Xin Long.
8) Smoother TCP receive autotuning, from Eric Dumazet.
9) Lots of erspan tunneling enhancements, from William Tu.
10) Add true function call support to BPF, from Alexei Starovoitov.
11) Add explicit support for GRO HW offloading, from Michael Chan.
12) Support extack generation in more netlink subsystems. From Alexander
Aring, Quentin Monnet, and Jakub Kicinski.
13) Add 1000BaseX, flow control, and EEE support to mvneta driver. From
Russell King.
14) Add flow table abstraction to netfilter, from Pablo Neira Ayuso.
15) Many improvements and simplifications to the NFP driver bpf JIT,
from Jakub Kicinski.
16) Support for ipv6 non-equal cost multipath routing, from Ido
Schimmel.
17) Add resource abstration to devlink, from Arkadi Sharshevsky.
18) Packet scheduler classifier shared filter block support, from Jiri
Pirko.
19) Avoid locking in act_csum, from Davide Caratti.
20) devinet_ioctl() simplifications from Al viro.
21) More TCP bpf improvements from Lawrence Brakmo.
22) Add support for onlink ipv6 route flag, similar to ipv4, from David
Ahern.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1925 commits)
tls: Add support for encryption using async offload accelerator
ip6mr: fix stale iterator
net/sched: kconfig: Remove blank help texts
openvswitch: meter: Use 64-bit arithmetic instead of 32-bit
tcp_nv: fix potential integer overflow in tcpnv_acked
r8169: fix RTL8168EP take too long to complete driver initialization.
qmi_wwan: Add support for Quectel EP06
rtnetlink: enable IFLA_IF_NETNSID for RTM_NEWLINK
ipmr: Fix ptrdiff_t print formatting
ibmvnic: Wait for device response when changing MAC
qlcnic: fix deadlock bug
tcp: release sk_frag.page in tcp_disconnect
ipv4: Get the address of interface correctly.
net_sched: gen_estimator: fix lockdep splat
net: macb: Handle HRESP error
net/mlx5e: IPoIB, Fix copy-paste bug in flow steering refactoring
ipv6: addrconf: break critical section in addrconf_verify_rtnl()
ipv6: change route cache aging logic
i40e/i40evf: Update DESC_NEEDED value to reflect larger value
bnxt_en: cleanup DIM work on device shutdown
...
Diffstat (limited to 'samples')
-rw-r--r-- | samples/bpf/Makefile | 14 | ||||
-rw-r--r-- | samples/bpf/tcbpf2_kern.c | 170 | ||||
-rw-r--r-- | samples/bpf/test_cgrp2_attach2.c | 36 | ||||
-rwxr-xr-x | samples/bpf/test_override_return.sh | 15 | ||||
-rwxr-xr-x | samples/bpf/test_tunnel_bpf.sh | 128 | ||||
-rw-r--r-- | samples/bpf/tracex7_kern.c | 16 | ||||
-rw-r--r-- | samples/bpf/tracex7_user.c | 28 | ||||
-rwxr-xr-x | samples/bpf/xdp2skb_meta.sh | 220 | ||||
-rw-r--r-- | samples/bpf/xdp2skb_meta_kern.c | 105 | ||||
-rw-r--r-- | samples/bpf/xdp_monitor_kern.c | 96 | ||||
-rw-r--r-- | samples/bpf/xdp_monitor_user.c | 416 | ||||
-rw-r--r-- | samples/bpf/xdp_rxq_info_kern.c | 96 | ||||
-rw-r--r-- | samples/bpf/xdp_rxq_info_user.c | 531 | ||||
-rw-r--r-- | samples/sockmap/sockmap_user.c | 392 |
14 files changed, 2121 insertions, 142 deletions
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index adeaa1302f34..64335bb94f9f 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -12,6 +12,7 @@ hostprogs-y += tracex3 hostprogs-y += tracex4 hostprogs-y += tracex5 hostprogs-y += tracex6 +hostprogs-y += tracex7 hostprogs-y += test_probe_write_user hostprogs-y += trace_output hostprogs-y += lathist @@ -40,6 +41,7 @@ hostprogs-y += xdp_redirect hostprogs-y += xdp_redirect_map hostprogs-y += xdp_redirect_cpu hostprogs-y += xdp_monitor +hostprogs-y += xdp_rxq_info hostprogs-y += syscall_tp # Libbpf dependencies @@ -58,6 +60,7 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o +tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o @@ -88,6 +91,7 @@ xdp_redirect-objs := bpf_load.o $(LIBBPF) xdp_redirect_user.o xdp_redirect_map-objs := bpf_load.o $(LIBBPF) xdp_redirect_map_user.o xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o +xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o # Tell kbuild to always build the programs @@ -101,6 +105,7 @@ always += tracex3_kern.o always += tracex4_kern.o always += tracex5_kern.o always += tracex6_kern.o +always += tracex7_kern.o always += sock_flags_kern.o always += test_probe_write_user_kern.o always += trace_output_kern.o @@ -136,6 +141,8 @@ always += xdp_redirect_kern.o always += xdp_redirect_map_kern.o always += xdp_redirect_cpu_kern.o always += xdp_monitor_kern.o +always += xdp_rxq_info_kern.o +always += xdp2skb_meta_kern.o always += syscall_tp_kern.o HOSTCFLAGS += -I$(objtree)/usr/include @@ -155,6 +162,7 @@ HOSTLOADLIBES_tracex3 += -lelf HOSTLOADLIBES_tracex4 += -lelf -lrt HOSTLOADLIBES_tracex5 += -lelf HOSTLOADLIBES_tracex6 += -lelf +HOSTLOADLIBES_tracex7 += -lelf HOSTLOADLIBES_test_cgrp2_sock2 += -lelf HOSTLOADLIBES_load_sock_ops += -lelf HOSTLOADLIBES_test_probe_write_user += -lelf @@ -178,6 +186,7 @@ HOSTLOADLIBES_xdp_redirect += -lelf HOSTLOADLIBES_xdp_redirect_map += -lelf HOSTLOADLIBES_xdp_redirect_cpu += -lelf HOSTLOADLIBES_xdp_monitor += -lelf +HOSTLOADLIBES_xdp_rxq_info += -lelf HOSTLOADLIBES_syscall_tp += -lelf # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: @@ -192,13 +201,16 @@ CLANG_ARCH_ARGS = -target $(ARCH) endif # Trick to allow make to be run from this directory -all: +all: $(LIBBPF) $(MAKE) -C ../../ $(CURDIR)/ clean: $(MAKE) -C ../../ M=$(CURDIR) clean @rm -f *~ +$(LIBBPF): FORCE + $(MAKE) -C $(dir $@) $(notdir $@) + $(obj)/syscall_nrs.s: $(src)/syscall_nrs.c $(call if_changed_dep,cc_s_c) diff --git a/samples/bpf/tcbpf2_kern.c b/samples/bpf/tcbpf2_kern.c index 370b749f5ee6..f6bbf8f50da3 100644 --- a/samples/bpf/tcbpf2_kern.c +++ b/samples/bpf/tcbpf2_kern.c @@ -35,12 +35,22 @@ struct geneve_opt { u8 opt_data[8]; /* hard-coded to 8 byte */ }; +struct erspan_md2 { + __be32 timestamp; + __be16 sgt; + __be16 flags; +}; + struct vxlan_metadata { u32 gbp; }; struct erspan_metadata { - __be32 index; + union { + __be32 index; + struct erspan_md2 md2; + } u; + int version; }; SEC("gre_set_tunnel") @@ -81,6 +91,49 @@ int _gre_get_tunnel(struct __sk_buff *skb) return TC_ACT_OK; } +SEC("ip6gretap_set_tunnel") +int _ip6gretap_set_tunnel(struct __sk_buff *skb) +{ + struct bpf_tunnel_key key; + int ret; + + __builtin_memset(&key, 0x0, sizeof(key)); + key.remote_ipv6[3] = _htonl(0x11); /* ::11 */ + key.tunnel_id = 2; + key.tunnel_tos = 0; + key.tunnel_ttl = 64; + key.tunnel_label = 0xabcde; + + ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), + BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + return TC_ACT_OK; +} + +SEC("ip6gretap_get_tunnel") +int _ip6gretap_get_tunnel(struct __sk_buff *skb) +{ + char fmt[] = "key %d remote ip6 ::%x label %x\n"; + struct bpf_tunnel_key key; + int ret; + + ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), + BPF_F_TUNINFO_IPV6); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + bpf_trace_printk(fmt, sizeof(fmt), + key.tunnel_id, key.remote_ipv6[3], key.tunnel_label); + + return TC_ACT_OK; +} + SEC("erspan_set_tunnel") int _erspan_set_tunnel(struct __sk_buff *skb) { @@ -100,7 +153,18 @@ int _erspan_set_tunnel(struct __sk_buff *skb) return TC_ACT_SHOT; } - md.index = htonl(123); + __builtin_memset(&md, 0, sizeof(md)); +#ifdef ERSPAN_V1 + md.version = 1; + md.u.index = htonl(123); +#else + u8 direction = 1; + u16 hwid = 7; + + md.version = 2; + md.u.md2.flags = htons((direction << 3) | (hwid << 4)); +#endif + ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md)); if (ret < 0) { ERROR(ret); @@ -113,7 +177,7 @@ int _erspan_set_tunnel(struct __sk_buff *skb) SEC("erspan_get_tunnel") int _erspan_get_tunnel(struct __sk_buff *skb) { - char fmt[] = "key %d remote ip 0x%x erspan index 0x%x\n"; + char fmt[] = "key %d remote ip 0x%x erspan version %d\n"; struct bpf_tunnel_key key; struct erspan_metadata md; u32 index; @@ -131,9 +195,105 @@ int _erspan_get_tunnel(struct __sk_buff *skb) return TC_ACT_SHOT; } - index = bpf_ntohl(md.index); bpf_trace_printk(fmt, sizeof(fmt), - key.tunnel_id, key.remote_ipv4, index); + key.tunnel_id, key.remote_ipv4, md.version); + +#ifdef ERSPAN_V1 + char fmt2[] = "\tindex %x\n"; + + index = bpf_ntohl(md.u.index); + bpf_trace_printk(fmt2, sizeof(fmt2), index); +#else + char fmt2[] = "\tdirection %d hwid %x timestamp %u\n"; + + bpf_trace_printk(fmt2, sizeof(fmt2), + (ntohs(md.u.md2.flags) >> 3) & 0x1, + (ntohs(md.u.md2.flags) >> 4) & 0x3f, + bpf_ntohl(md.u.md2.timestamp)); +#endif + + return TC_ACT_OK; +} + +SEC("ip4ip6erspan_set_tunnel") +int _ip4ip6erspan_set_tunnel(struct __sk_buff *skb) +{ + struct bpf_tunnel_key key; + struct erspan_metadata md; + int ret; + + __builtin_memset(&key, 0x0, sizeof(key)); + key.remote_ipv6[3] = _htonl(0x11); + key.tunnel_id = 2; + key.tunnel_tos = 0; + key.tunnel_ttl = 64; + + ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), + BPF_F_TUNINFO_IPV6); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + __builtin_memset(&md, 0, sizeof(md)); + +#ifdef ERSPAN_V1 + md.u.index = htonl(123); + md.version = 1; +#else + u8 direction = 0; + u16 hwid = 17; + + md.version = 2; + md.u.md2.flags = htons((direction << 3) | (hwid << 4)); +#endif + + ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md)); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + return TC_ACT_OK; +} + +SEC("ip4ip6erspan_get_tunnel") +int _ip4ip6erspan_get_tunnel(struct __sk_buff *skb) +{ + char fmt[] = "ip6erspan get key %d remote ip6 ::%x erspan version %d\n"; + struct bpf_tunnel_key key; + struct erspan_metadata md; + u32 index; + int ret; + + ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + ret = bpf_skb_get_tunnel_opt(skb, &md, sizeof(md)); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + bpf_trace_printk(fmt, sizeof(fmt), + key.tunnel_id, key.remote_ipv4, md.version); + +#ifdef ERSPAN_V1 + char fmt2[] = "\tindex %x\n"; + + index = bpf_ntohl(md.u.index); + bpf_trace_printk(fmt2, sizeof(fmt2), index); +#else + char fmt2[] = "\tdirection %d hwid %x timestamp %u\n"; + + bpf_trace_printk(fmt2, sizeof(fmt2), + (ntohs(md.u.md2.flags) >> 3) & 0x1, + (ntohs(md.u.md2.flags) >> 4) & 0x3f, + bpf_ntohl(md.u.md2.timestamp)); +#endif return TC_ACT_OK; } diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c index 3e8232cc04a8..1af412ec6007 100644 --- a/samples/bpf/test_cgrp2_attach2.c +++ b/samples/bpf/test_cgrp2_attach2.c @@ -78,7 +78,8 @@ static int test_foo_bar(void) if (join_cgroup(FOO)) goto err; - if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, 1)) { + if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE)) { log_err("Attaching prog to /foo"); goto err; } @@ -97,7 +98,8 @@ static int test_foo_bar(void) printf("Attached DROP prog. This ping in cgroup /foo/bar should fail...\n"); assert(system(PING_CMD) != 0); - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { + if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE)) { log_err("Attaching prog to /foo/bar"); goto err; } @@ -114,7 +116,8 @@ static int test_foo_bar(void) "This ping in cgroup /foo/bar should fail...\n"); assert(system(PING_CMD) != 0); - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { + if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE)) { log_err("Attaching prog to /foo/bar"); goto err; } @@ -128,7 +131,8 @@ static int test_foo_bar(void) "This ping in cgroup /foo/bar should pass...\n"); assert(system(PING_CMD) == 0); - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { + if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE)) { log_err("Attaching prog to /foo/bar"); goto err; } @@ -161,13 +165,15 @@ static int test_foo_bar(void) goto err; } - if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { + if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE)) { errno = 0; log_err("Unexpected success attaching overridable prog to /foo/bar"); goto err; } - if (!bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, 1)) { + if (!bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE)) { errno = 0; log_err("Unexpected success attaching overridable prog to /foo"); goto err; @@ -273,27 +279,33 @@ static int test_multiprog(void) if (join_cgroup("/cg1/cg2/cg3/cg4/cg5")) goto err; - if (bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, 2)) { + if (bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_MULTI)) { log_err("Attaching prog to cg1"); goto err; } - if (!bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, 2)) { + if (!bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_MULTI)) { log_err("Unexpected success attaching the same prog to cg1"); goto err; } - if (bpf_prog_attach(allow_prog[1], cg1, BPF_CGROUP_INET_EGRESS, 2)) { + if (bpf_prog_attach(allow_prog[1], cg1, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_MULTI)) { log_err("Attaching prog2 to cg1"); goto err; } - if (bpf_prog_attach(allow_prog[2], cg2, BPF_CGROUP_INET_EGRESS, 1)) { + if (bpf_prog_attach(allow_prog[2], cg2, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE)) { log_err("Attaching prog to cg2"); goto err; } - if (bpf_prog_attach(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS, 2)) { + if (bpf_prog_attach(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_MULTI)) { log_err("Attaching prog to cg3"); goto err; } - if (bpf_prog_attach(allow_prog[4], cg4, BPF_CGROUP_INET_EGRESS, 1)) { + if (bpf_prog_attach(allow_prog[4], cg4, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE)) { log_err("Attaching prog to cg4"); goto err; } diff --git a/samples/bpf/test_override_return.sh b/samples/bpf/test_override_return.sh new file mode 100755 index 000000000000..e68b9ee6814b --- /dev/null +++ b/samples/bpf/test_override_return.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +rm -f testfile.img +dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1 +DEVICE=$(losetup --show -f testfile.img) +mkfs.btrfs -f $DEVICE +mkdir tmpmnt +./tracex7 $DEVICE +if [ $? -eq 0 ] +then + echo "SUCCESS!" +else + echo "FAILED!" +fi +losetup -d $DEVICE diff --git a/samples/bpf/test_tunnel_bpf.sh b/samples/bpf/test_tunnel_bpf.sh index 312e1722a39f..ae7f7c38309b 100755 --- a/samples/bpf/test_tunnel_bpf.sh +++ b/samples/bpf/test_tunnel_bpf.sh @@ -33,10 +33,43 @@ function add_gre_tunnel { ip addr add dev $DEV 10.1.1.200/24 } -function add_erspan_tunnel { +function add_ip6gretap_tunnel { + + # assign ipv6 address + ip netns exec at_ns0 ip addr add ::11/96 dev veth0 + ip netns exec at_ns0 ip link set dev veth0 up + ip addr add dev veth1 ::22/96 + ip link set dev veth1 up + # in namespace ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE seq key 2 local 172.16.1.100 remote 172.16.1.200 erspan 123 + ip link add dev $DEV_NS type $TYPE flowlabel 0xbcdef key 2 \ + local ::11 remote ::22 + + ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 + ip netns exec at_ns0 ip addr add dev $DEV_NS fc80::100/96 + ip netns exec at_ns0 ip link set dev $DEV_NS up + + # out of namespace + ip link add dev $DEV type $TYPE external + ip addr add dev $DEV 10.1.1.200/24 + ip addr add dev $DEV fc80::200/24 + ip link set dev $DEV up +} + +function add_erspan_tunnel { + # in namespace + if [ "$1" == "v1" ]; then + ip netns exec at_ns0 \ + ip link add dev $DEV_NS type $TYPE seq key 2 \ + local 172.16.1.100 remote 172.16.1.200 \ + erspan_ver 1 erspan 123 + else + ip netns exec at_ns0 \ + ip link add dev $DEV_NS type $TYPE seq key 2 \ + local 172.16.1.100 remote 172.16.1.200 \ + erspan_ver 2 erspan_dir 1 erspan_hwid 3 + fi ip netns exec at_ns0 ip link set dev $DEV_NS up ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 @@ -46,6 +79,35 @@ function add_erspan_tunnel { ip addr add dev $DEV 10.1.1.200/24 } +function add_ip6erspan_tunnel { + + # assign ipv6 address + ip netns exec at_ns0 ip addr add ::11/96 dev veth0 + ip netns exec at_ns0 ip link set dev veth0 up + ip addr add dev veth1 ::22/96 + ip link set dev veth1 up + + # in namespace + if [ "$1" == "v1" ]; then + ip netns exec at_ns0 \ + ip link add dev $DEV_NS type $TYPE seq key 2 \ + local ::11 remote ::22 \ + erspan_ver 1 erspan 123 + else + ip netns exec at_ns0 \ + ip link add dev $DEV_NS type $TYPE seq key 2 \ + local ::11 remote ::22 \ + erspan_ver 2 erspan_dir 1 erspan_hwid 7 + fi + ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 + ip netns exec at_ns0 ip link set dev $DEV_NS up + + # out of namespace + ip link add dev $DEV type $TYPE external + ip addr add dev $DEV 10.1.1.200/24 + ip link set dev $DEV up +} + function add_vxlan_tunnel { # Set static ARP entry here because iptables set-mark works # on L3 packet, as a result not applying to ARP packets, @@ -113,18 +175,65 @@ function test_gre { cleanup } +function test_ip6gre { + TYPE=ip6gre + DEV_NS=ip6gre00 + DEV=ip6gre11 + config_device + # reuse the ip6gretap function + add_ip6gretap_tunnel + attach_bpf $DEV ip6gretap_set_tunnel ip6gretap_get_tunnel + # underlay + ping6 -c 4 ::11 + # overlay: ipv4 over ipv6 + ip netns exec at_ns0 ping -c 1 10.1.1.200 + ping -c 1 10.1.1.100 + # overlay: ipv6 over ipv6 + ip netns exec at_ns0 ping6 -c 1 fc80::200 + cleanup +} + +function test_ip6gretap { + TYPE=ip6gretap + DEV_NS=ip6gretap00 + DEV=ip6gretap11 + config_device + add_ip6gretap_tunnel + attach_bpf $DEV ip6gretap_set_tunnel ip6gretap_get_tunnel + # underlay + ping6 -c 4 ::11 + # overlay: ipv4 over ipv6 + ip netns exec at_ns0 ping -i .2 -c 1 10.1.1.200 + ping -c 1 10.1.1.100 + # overlay: ipv6 over ipv6 + ip netns exec at_ns0 ping6 -c 1 fc80::200 + cleanup +} + function test_erspan { TYPE=erspan DEV_NS=erspan00 DEV=erspan11 config_device - add_erspan_tunnel + add_erspan_tunnel $1 attach_bpf $DEV erspan_set_tunnel erspan_get_tunnel ping -c 1 10.1.1.100 ip netns exec at_ns0 ping -c 1 10.1.1.200 cleanup } +function test_ip6erspan { + TYPE=ip6erspan + DEV_NS=ip6erspan00 + DEV=ip6erspan11 + config_device + add_ip6erspan_tunnel $1 + attach_bpf $DEV ip4ip6erspan_set_tunnel ip4ip6erspan_get_tunnel + ping6 -c 3 ::11 + ip netns exec at_ns0 ping -c 1 10.1.1.200 + cleanup +} + function test_vxlan { TYPE=vxlan DEV_NS=vxlan00 @@ -175,9 +284,12 @@ function cleanup { ip link del veth1 ip link del ipip11 ip link del gretap11 + ip link del ip6gre11 + ip link del ip6gretap11 ip link del vxlan11 ip link del geneve11 ip link del erspan11 + ip link del ip6erspan11 pkill tcpdump pkill cat set -ex @@ -187,8 +299,16 @@ trap cleanup 0 2 3 6 9 cleanup echo "Testing GRE tunnel..." test_gre +echo "Testing IP6GRE tunnel..." +test_ip6gre +echo "Testing IP6GRETAP tunnel..." +test_ip6gretap echo "Testing ERSPAN tunnel..." -test_erspan +test_erspan v1 +test_erspan v2 +echo "Testing IP6ERSPAN tunnel..." +test_ip6erspan v1 +test_ip6erspan v2 echo "Testing VXLAN tunnel..." test_vxlan echo "Testing GENEVE tunnel..." diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c new file mode 100644 index 000000000000..1ab308a43e0f --- /dev/null +++ b/samples/bpf/tracex7_kern.c @@ -0,0 +1,16 @@ +#include <uapi/linux/ptrace.h> +#include <uapi/linux/bpf.h> +#include <linux/version.h> +#include "bpf_helpers.h" + +SEC("kprobe/open_ctree") +int bpf_prog1(struct pt_regs *ctx) +{ + unsigned long rc = -12; + + bpf_override_return(ctx, rc); + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c new file mode 100644 index 000000000000..8a52ac492e8b --- /dev/null +++ b/samples/bpf/tracex7_user.c @@ -0,0 +1,28 @@ +#define _GNU_SOURCE + +#include <stdio.h> +#include <linux/bpf.h> +#include <unistd.h> +#include "libbpf.h" +#include "bpf_load.h" + +int main(int argc, char **argv) +{ + FILE *f; + char filename[256]; + char command[256]; + int ret; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + snprintf(command, 256, "mount %s tmpmnt/", argv[1]); + f = popen(command, "r"); + ret = pclose(f); + + return ret ? 0 : 1; +} diff --git a/samples/bpf/xdp2skb_meta.sh b/samples/bpf/xdp2skb_meta.sh new file mode 100755 index 000000000000..b9c9549c4c27 --- /dev/null +++ b/samples/bpf/xdp2skb_meta.sh @@ -0,0 +1,220 @@ +#!/bin/bash +# +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2018 Jesper Dangaard Brouer, Red Hat Inc. +# +# Bash-shell example on using iproute2 tools 'tc' and 'ip' to load +# eBPF programs, both for XDP and clsbpf. Shell script function +# wrappers and even long options parsing is illustrated, for ease of +# use. +# +# Related to sample/bpf/xdp2skb_meta_kern.c, which contains BPF-progs +# that need to collaborate between XDP and TC hooks. Thus, it is +# convenient that the same tool load both programs that need to work +# together. +# +BPF_FILE=xdp2skb_meta_kern.o +DIR=$(dirname $0) + +export TC=/usr/sbin/tc +export IP=/usr/sbin/ip + +function usage() { + echo "" + echo "Usage: $0 [-vfh] --dev ethX" + echo " -d | --dev : Network device (required)" + echo " --flush : Cleanup flush TC and XDP progs" + echo " --list : (\$LIST) List TC and XDP progs" + echo " -v | --verbose : (\$VERBOSE) Verbose" + echo " --dry-run : (\$DRYRUN) Dry-run only (echo commands)" + echo "" +} + +## -- General shell logging cmds -- +function err() { + local exitcode=$1 + shift + echo "ERROR: $@" >&2 + exit $exitcode +} + +function info() { + if [[ -n "$VERBOSE" ]]; then + echo "# $@" + fi +} + +## -- Helper function calls -- + +# Wrapper call for TC and IP +# - Will display the offending command on failure +function _call_cmd() { + local cmd="$1" + local allow_fail="$2" + shift 2 + if [[ -n "$VERBOSE" ]]; then + echo "$(basename $cmd) $@" + fi + if [[ -n "$DRYRUN" ]]; then + return + fi + $cmd "$@" + local status=$? + if (( $status != 0 )); then + if [[ "$allow_fail" == "" ]]; then + err 2 "Exec error($status) occurred cmd: \"$cmd $@\"" + fi + fi +} +function call_tc() { + _call_cmd "$TC" "" "$@" +} +function call_tc_allow_fail() { + _call_cmd "$TC" "allow_fail" "$@" +} +function call_ip() { + _call_cmd "$IP" "" "$@" +} + +## --- Parse command line arguments / parameters --- +# Using external program "getopt" to get --long-options +OPTIONS=$(getopt -o vfhd: \ + --long verbose,flush,help,list,dev:,dry-run -- "$@") +if (( $? != 0 )); then + err 4 "Error calling getopt" +fi +eval set -- "$OPTIONS" + +unset DEV +unset FLUSH +while true; do + case "$1" in + -d | --dev ) # device + DEV=$2 + info "Device set to: DEV=$DEV" >&2 + shift 2 + ;; + -v | --verbose) + VERBOSE=yes + # info "Verbose mode: VERBOSE=$VERBOSE" >&2 + shift + ;; + --dry-run ) + DRYRUN=yes + VERBOSE=yes + info "Dry-run mode: enable VERBOSE and don't call TC+IP" >&2 + shift + ;; + -f | --flush ) + FLUSH=yes + shift + ;; + --list ) + LIST=yes + shift + ;; + -- ) + shift + break + ;; + -h | --help ) + usage; + exit 0 + ;; + * ) + shift + break + ;; + esac +done + +FILE="$DIR/$BPF_FILE" +if [[ ! -e $FILE ]]; then + err 3 "Missing BPF object file ($FILE)" +fi + +if [[ -z $DEV ]]; then + usage + err 2 "Please specify network device -- required option --dev" +fi + +## -- Function calls -- + +function list_tc() +{ + local device="$1" + shift + info "Listing current TC ingress rules" + call_tc filter show dev $device ingress +} + +function list_xdp() +{ + local device="$1" + shift + info "Listing current XDP device($device) setting" + call_ip link show dev $device | grep --color=auto xdp +} + +function flush_tc() +{ + local device="$1" + shift + info "Flush TC on device: $device" + call_tc_allow_fail filter del dev $device ingress + call_tc_allow_fail qdisc del dev $device clsact +} + +function flush_xdp() +{ + local device="$1" + shift + info "Flush XDP on device: $device" + call_ip link set dev $device xdp off +} + +function attach_tc_mark() +{ + local device="$1" + local file="$2" + local prog="tc_mark" + shift 2 + + # Re-attach clsact to clear/flush existing role + call_tc_allow_fail qdisc del dev $device clsact 2> /dev/null + call_tc qdisc add dev $device clsact + + # Attach BPF prog + call_tc filter add dev $device ingress \ + prio 1 handle 1 bpf da obj $file sec $prog +} + +function attach_xdp_mark() +{ + local device="$1" + local file="$2" + local prog="xdp_mark" + shift 2 + + # Remove XDP prog in-case it's already loaded + # TODO: Need ip-link option to override/replace existing XDP prog + flush_xdp $device + + # Attach XDP/BPF prog + call_ip link set dev $device xdp obj $file sec $prog +} + +if [[ -n $FLUSH ]]; then + flush_tc $DEV + flush_xdp $DEV + exit 0 +fi + +if [[ -n $LIST ]]; then + list_tc $DEV + list_xdp $DEV + exit 0 +fi + +attach_tc_mark $DEV $FILE +attach_xdp_mark $DEV $FILE diff --git a/samples/bpf/xdp2skb_meta_kern.c b/samples/bpf/xdp2skb_meta_kern.c new file mode 100644 index 000000000000..0c12048ac79f --- /dev/null +++ b/samples/bpf/xdp2skb_meta_kern.c @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright (c) 2018 Jesper Dangaard Brouer, Red Hat Inc. + * + * Example howto transfer info from XDP to SKB, e.g. skb->mark + * ----------------------------------------------------------- + * This uses the XDP data_meta infrastructure, and is a cooperation + * between two bpf-programs (1) XDP and (2) clsact at TC-ingress hook. + * + * Notice: This example does not use the BPF C-loader (bpf_load.c), + * but instead rely on the iproute2 TC tool for loading BPF-objects. + */ +#include <uapi/linux/bpf.h> +#include <uapi/linux/pkt_cls.h> + +#include "bpf_helpers.h" + +/* + * This struct is stored in the XDP 'data_meta' area, which is located + * just in-front-of the raw packet payload data. The meaning is + * specific to these two BPF programs that use it as a communication + * channel. XDP adjust/increase the area via a bpf-helper, and TC use + * boundary checks to see if data have been provided. + * + * The struct must be 4 byte aligned, which here is enforced by the + * struct __attribute__((aligned(4))). + */ +struct meta_info { + __u32 mark; +} __attribute__((aligned(4))); + +SEC("xdp_mark") +int _xdp_mark(struct xdp_md *ctx) +{ + struct meta_info *meta; + void *data, *data_end; + int ret; + + /* Reserve space in-front of data pointer for our meta info. + * (Notice drivers not supporting data_meta will fail here!) + */ + ret = bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)); + if (ret < 0) + return XDP_ABORTED; + + /* Notice: Kernel-side verifier requires that loading of + * ctx->data MUST happen _after_ helper bpf_xdp_adjust_meta(), + * as pkt-data pointers are invalidated. Helpers that require + * this are determined/marked by bpf_helper_changes_pkt_data() + */ + data = (void *)(unsigned long)ctx->data; + + /* Check data_meta have room for meta_info struct */ + meta = (void *)(unsigned long)ctx->data_meta; + if (meta + 1 > data) + return XDP_ABORTED; + + meta->mark = 42; + + return XDP_PASS; +} + +SEC("tc_mark") +int _tc_mark(struct __sk_buff *ctx) +{ + void *data = (void *)(unsigned long)ctx->data; + void *data_end = (void *)(unsigned long)ctx->data_end; + void *data_meta = (void *)(unsigned long)ctx->data_meta; + struct meta_info *meta = data_meta; + + /* Check XDP gave us some data_meta */ + if (meta + 1 > data) { + ctx->mark = 41; + /* Skip "accept" if no data_meta is avail */ + return TC_ACT_OK; + } + + /* Hint: See func tc_cls_act_is_valid_access() for BPF_WRITE access */ + ctx->mark = meta->mark; /* Transfer XDP-mark to SKB-mark */ + + return TC_ACT_OK; +} + +/* Manually attaching these programs: +export DEV=ixgbe2 +export FILE=xdp2skb_meta_kern.o + +# via TC command +tc qdisc del dev $DEV clsact 2> /dev/null +tc qdisc add dev $DEV clsact +tc filter add dev $DEV ingress prio 1 handle 1 bpf da obj $FILE sec tc_mark +tc filter show dev $DEV ingress + +# XDP via IP command: +ip link set dev $DEV xdp off +ip link set dev $DEV xdp obj $FILE sec xdp_mark + +# Use iptable to "see" if SKBs are marked +iptables -I INPUT -p icmp -m mark --mark 41 # == 0x29 +iptables -I INPUT -p icmp -m mark --mark 42 # == 0x2a + +# Hint: catch XDP_ABORTED errors via +perf record -e xdp:* +perf script + +*/ diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c index 2fe2f761a0d0..211db8ded0de 100644 --- a/samples/bpf/xdp_monitor_kern.c +++ b/samples/bpf/xdp_monitor_kern.c @@ -1,6 +1,7 @@ -/* XDP monitor tool, based on tracepoints +/* SPDX-License-Identifier: GPL-2.0 + * Copyright(c) 2017-2018 Jesper Dangaard Brouer, Red Hat Inc. * - * Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat Inc. + * XDP monitor tool, based on tracepoints */ #include <uapi/linux/bpf.h> #include "bpf_helpers.h" @@ -104,7 +105,7 @@ struct xdp_exception_ctx { SEC("tracepoint/xdp/xdp_exception") int trace_xdp_exception(struct xdp_exception_ctx *ctx) { - u64 *cnt;; + u64 *cnt; u32 key; key = ctx->act; @@ -118,3 +119,92 @@ int trace_xdp_exception(struct xdp_exception_ctx *ctx) return 0; } + +/* Common stats data record shared with _user.c */ +struct datarec { + u64 processed; + u64 dropped; + u64 info; +}; +#define MAX_CPUS 64 + +struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = MAX_CPUS, +}; + +struct bpf_map_def SEC("maps") cpumap_kthread_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = 1, +}; + +/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct cpumap_enqueue_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int map_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int cpu; // offset:16; size:4; signed:1; + unsigned int drops; // offset:20; size:4; signed:0; + unsigned int processed; // offset:24; size:4; signed:0; + int to_cpu; // offset:28; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_cpumap_enqueue") +int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx) +{ + u32 to_cpu = ctx->to_cpu; + struct datarec *rec; + + if (to_cpu >= MAX_CPUS) + return 1; + + rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu); + if (!rec) + return 0; + rec->processed += ctx->processed; + rec->dropped += ctx->drops; + + /* Record bulk events, then userspace can calc average bulk size */ + if (ctx->processed > 0) + rec->info += 1; + + return 0; +} + +/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct cpumap_kthread_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int map_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int cpu; // offset:16; size:4; signed:1; + unsigned int drops; // offset:20; size:4; signed:0; + unsigned int processed; // offset:24; size:4; signed:0; + int sched; // offset:28; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_cpumap_kthread") +int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx) +{ + struct datarec *rec; + u32 key = 0; + + rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key); + if (!rec) + return 0; + rec->processed += ctx->processed; + rec->dropped += ctx->drops; + + /* Count times kthread yielded CPU via schedule call */ + if (ctx->sched) + rec->info++; + + return 0; +} diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c index eaba165b3549..eec14520d513 100644 --- a/samples/bpf/xdp_monitor_user.c +++ b/samples/bpf/xdp_monitor_user.c @@ -1,4 +1,5 @@ -/* Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. +/* SPDX-License-Identifier: GPL-2.0 + * Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. */ static const char *__doc__= "XDP monitor tool, based on tracepoints\n" @@ -40,6 +41,9 @@ static const struct option long_options[] = { {0, 0, NULL, 0 } }; +/* C standard specifies two constants, EXIT_SUCCESS(0) and EXIT_FAILURE(1) */ +#define EXIT_FAIL_MEM 5 + static void usage(char *argv[]) { int i; @@ -108,23 +112,93 @@ static const char *action2str(int action) return NULL; } +/* Common stats data record shared with _kern.c */ +struct datarec { + __u64 processed; + __u64 dropped; + __u64 info; +}; +#define MAX_CPUS 64 + +/* Userspace structs for collection of stats from maps */ struct record { - __u64 counter; __u64 timestamp; + struct datarec total; + struct datarec *cpu; +}; +struct u64rec { + __u64 processed; +}; +struct record_u64 { + /* record for _kern side __u64 values */ + __u64 timestamp; + struct u64rec total; + struct u64rec *cpu; }; struct stats_record { - struct record xdp_redir[REDIR_RES_MAX]; - struct record xdp_exception[XDP_ACTION_MAX]; + struct record_u64 xdp_redirect[REDIR_RES_MAX]; + struct record_u64 xdp_exception[XDP_ACTION_MAX]; + struct record xdp_cpumap_kthread; + struct record xdp_cpumap_enqueue[MAX_CPUS]; }; -static void stats_print_headers(bool err_only) +static bool map_collect_record(int fd, __u32 key, struct record *rec) { - if (err_only) - printf("\n%s\n", __doc_err_only__); + /* For percpu maps, userspace gets a value per possible CPU */ + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct datarec values[nr_cpus]; + __u64 sum_processed = 0; + __u64 sum_dropped = 0; + __u64 sum_info = 0; + int i; + + if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { + fprintf(stderr, + "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); + return false; + } + /* Get time as close as possible to reading map contents */ + rec->timestamp = gettime(); - printf("%-14s %-11s %-10s %-18s %-9s\n", - "ACTION", "result", "pps ", "pps-human-readable", "measure-period"); + /* Record and sum values from each CPU */ + for (i = 0; i < nr_cpus; i++) { + rec->cpu[i].processed = values[i].processed; + sum_processed += values[i].processed; + rec->cpu[i].dropped = values[i].dropped; + sum_dropped += values[i].dropped; + rec->cpu[i].info = values[i].info; + sum_info += values[i].info; + } + rec->total.processed = sum_processed; + rec->total.dropped = sum_dropped; + rec->total.info = sum_info; + return true; +} + +static bool map_collect_record_u64(int fd, __u32 key, struct record_u64 *rec) +{ + /* For percpu maps, userspace gets a value per possible CPU */ + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct u64rec values[nr_cpus]; + __u64 sum_total = 0; + int i; + + if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { + fprintf(stderr, + "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); + return false; + } + /* Get time as close as possible to reading map contents */ + rec->timestamp = gettime(); + + /* Record and sum values from each CPU */ + for (i = 0; i < nr_cpus; i++) { + rec->cpu[i].processed = values[i].processed; + sum_total += values[i].processed; + } + rec->total.processed = sum_total; + return true; } static double calc_period(struct record *r, struct record *p) @@ -139,77 +213,203 @@ static double calc_period(struct record *r, struct record *p) return period_; } -static double calc_pps(struct record *r, struct record *p, double period) +static double calc_period_u64(struct record_u64 *r, struct record_u64 *p) +{ + double period_ = 0; + __u64 period = 0; + + period = r->timestamp - p->timestamp; + if (period > 0) + period_ = ((double) period / NANOSEC_PER_SEC); + + return period_; +} + +static double calc_pps(struct datarec *r, struct datarec *p, double period) +{ + __u64 packets = 0; + double pps = 0; + + if (period > 0) { + packets = r->processed - p->processed; + pps = packets / period; + } + return pps; +} + +static double calc_pps_u64(struct u64rec *r, struct u64rec *p, double period) +{ + __u64 packets = 0; + double pps = 0; + + if (period > 0) { + packets = r->processed - p->processed; + pps = packets / period; + } + return pps; +} + +static double calc_drop(struct datarec *r, struct datarec *p, double period) +{ + __u64 packets = 0; + double pps = 0; + + if (period > 0) { + packets = r->dropped - p->dropped; + pps = packets / period; + } + return pps; +} + +static double calc_info(struct datarec *r, struct datarec *p, double period) { __u64 packets = 0; double pps = 0; if (period > 0) { - packets = r->counter - p->counter; + packets = r->info - p->info; pps = packets / period; } return pps; } -static void stats_print(struct stats_record *rec, - struct stats_record *prev, +static void stats_print(struct stats_record *stats_rec, + struct stats_record *stats_prev, bool err_only) { - double period = 0, pps = 0; - struct record *r, *p; - int i = 0; + unsigned int nr_cpus = bpf_num_possible_cpus(); + int rec_i = 0, i, to_cpu; + double t = 0, pps = 0; - char *fmt = "%-14s %-11s %-10.0f %'-18.0f %f\n"; + /* Header */ + printf("%-15s %-7s %-12s %-12s %-9s\n", + "XDP-event", "CPU:to", "pps", "drop-pps", "extra-info"); /* tracepoint: xdp:xdp_redirect_* */ if (err_only) - i = REDIR_ERROR; - - for (; i < REDIR_RES_MAX; i++) { - r = &rec->xdp_redir[i]; - p = &prev->xdp_redir[i]; - - if (p->timestamp) { - period = calc_period(r, p); - pps = calc_pps(r, p, period); + rec_i = REDIR_ERROR; + + for (; rec_i < REDIR_RES_MAX; rec_i++) { + struct record_u64 *rec, *prev; + char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n"; + char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n"; + + rec = &stats_rec->xdp_redirect[rec_i]; + prev = &stats_prev->xdp_redirect[rec_i]; + t = calc_period_u64(rec, prev); + + for (i = 0; i < nr_cpus; i++) { + struct u64rec *r = &rec->cpu[i]; + struct u64rec *p = &prev->cpu[i]; + + pps = calc_pps_u64(r, p, t); + if (pps > 0) + printf(fmt1, "XDP_REDIRECT", i, + rec_i ? 0.0: pps, rec_i ? pps : 0.0, + err2str(rec_i)); } - printf(fmt, "XDP_REDIRECT", err2str(i), pps, pps, period); + pps = calc_pps_u64(&rec->total, &prev->total, t); + printf(fmt2, "XDP_REDIRECT", "total", + rec_i ? 0.0: pps, rec_i ? pps : 0.0, err2str(rec_i)); } /* tracepoint: xdp:xdp_exception */ - for (i = 0; i < XDP_ACTION_MAX; i++) { - r = &rec->xdp_exception[i]; - p = &prev->xdp_exception[i]; - if (p->timestamp) { - period = calc_period(r, p); - pps = calc_pps(r, p, period); + for (rec_i = 0; rec_i < XDP_ACTION_MAX; rec_i++) { + struct record_u64 *rec, *prev; + char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n"; + char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n"; + + rec = &stats_rec->xdp_exception[rec_i]; + prev = &stats_prev->xdp_exception[rec_i]; + t = calc_period_u64(rec, prev); + + for (i = 0; i < nr_cpus; i++) { + struct u64rec *r = &rec->cpu[i]; + struct u64rec *p = &prev->cpu[i]; + + pps = calc_pps_u64(r, p, t); + if (pps > 0) + printf(fmt1, "Exception", i, + 0.0, pps, err2str(rec_i)); } + pps = calc_pps_u64(&rec->total, &prev->total, t); if (pps > 0) - printf(fmt, action2str(i), "Exception", - pps, pps, period); + printf(fmt2, "Exception", "total", + 0.0, pps, action2str(rec_i)); } - printf("\n"); -} -static __u64 get_key32_value64_percpu(int fd, __u32 key) -{ - /* For percpu maps, userspace gets a value per possible CPU */ - unsigned int nr_cpus = bpf_num_possible_cpus(); - __u64 values[nr_cpus]; - __u64 sum = 0; - int i; - - if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { - fprintf(stderr, - "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); - return 0; + /* cpumap enqueue stats */ + for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) { + char *fmt1 = "%-15s %3d:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n"; + char *fmt2 = "%-15s %3s:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n"; + struct record *rec, *prev; + char *info_str = ""; + double drop, info; + + rec = &stats_rec->xdp_cpumap_enqueue[to_cpu]; + prev = &stats_prev->xdp_cpumap_enqueue[to_cpu]; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop(r, p, t); + info = calc_info(r, p, t); + if (info > 0) { + info_str = "bulk-average"; + info = pps / info; /* calc average bulk size */ + } + if (pps > 0) + printf(fmt1, "cpumap-enqueue", + i, to_cpu, pps, drop, info, info_str); + } + pps = calc_pps(&rec->total, &prev->total, t); + if (pps > 0) { + drop = calc_drop(&rec->total, &prev->total, t); + info = calc_info(&rec->total, &prev->total, t); + if (info > 0) { + info_str = "bulk-average"; + info = pps / info; /* calc average bulk size */ + } + printf(fmt2, "cpumap-enqueue", + "sum", to_cpu, pps, drop, info, info_str); + } } - /* Sum values from each CPU */ - for (i = 0; i < nr_cpus; i++) { - sum += values[i]; + /* cpumap kthread stats */ + { + char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.0f %s\n"; + char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.0f %s\n"; + struct record *rec, *prev; + double drop, info; + char *i_str = ""; + + rec = &stats_rec->xdp_cpumap_kthread; + prev = &stats_prev->xdp_cpumap_kthread; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop(r, p, t); + info = calc_info(r, p, t); + if (info > 0) + i_str = "sched"; + if (pps > 0) + printf(fmt1, "cpumap-kthread", + i, pps, drop, info, i_str); + } + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop(&rec->total, &prev->total, t); + info = calc_info(&rec->total, &prev->total, t); + if (info > 0) + i_str = "sched-sum"; + printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str); } - return sum; + + printf("\n"); } static bool stats_collect(struct stats_record *rec) @@ -222,25 +422,109 @@ static bool stats_collect(struct stats_record *rec) */ fd = map_data[0].fd; /* map0: redirect_err_cnt */ - for (i = 0; i < REDIR_RES_MAX; i++) { - rec->xdp_redir[i].timestamp = gettime(); - rec->xdp_redir[i].counter = get_key32_value64_percpu(fd, i); - } + for (i = 0; i < REDIR_RES_MAX; i++) + map_collect_record_u64(fd, i, &rec->xdp_redirect[i]); fd = map_data[1].fd; /* map1: exception_cnt */ for (i = 0; i < XDP_ACTION_MAX; i++) { - rec->xdp_exception[i].timestamp = gettime(); - rec->xdp_exception[i].counter = get_key32_value64_percpu(fd, i); + map_collect_record_u64(fd, i, &rec->xdp_exception[i]); } + fd = map_data[2].fd; /* map2: cpumap_enqueue_cnt */ + for (i = 0; i < MAX_CPUS; i++) + map_collect_record(fd, i, &rec->xdp_cpumap_enqueue[i]); + + fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */ + map_collect_record(fd, 0, &rec->xdp_cpumap_kthread); + return true; } +static void *alloc_rec_per_cpu(int record_size) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + void *array; + size_t size; + + size = record_size * nr_cpus; + array = malloc(size); + memset(array, 0, size); + if (!array) { + fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus); + exit(EXIT_FAIL_MEM); + } + return array; +} + +static struct stats_record *alloc_stats_record(void) +{ + struct stats_record *rec; + int rec_sz; + int i; + + /* Alloc main stats_record structure */ + rec = malloc(sizeof(*rec)); + memset(rec, 0, sizeof(*rec)); + if (!rec) { + fprintf(stderr, "Mem alloc error\n"); + exit(EXIT_FAIL_MEM); + } + + /* Alloc stats stored per CPU for each record */ + rec_sz = sizeof(struct u64rec); + for (i = 0; i < REDIR_RES_MAX; i++) + rec->xdp_redirect[i].cpu = alloc_rec_per_cpu(rec_sz); + + for (i = 0; i < XDP_ACTION_MAX; i++) + rec->xdp_exception[i].cpu = alloc_rec_per_cpu(rec_sz); + + rec_sz = sizeof(struct datarec); + rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz); + + for (i = 0; i < MAX_CPUS; i++) + rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz); + + return rec; +} + +static void free_stats_record(struct stats_record *r) +{ + int i; + + for (i = 0; i < REDIR_RES_MAX; i++) + free(r->xdp_redirect[i].cpu); + + for (i = 0; i < XDP_ACTION_MAX; i++) + free(r->xdp_exception[i].cpu); + + free(r->xdp_cpumap_kthread.cpu); + + for (i = 0; i < MAX_CPUS; i++) + free(r->xdp_cpumap_enqueue[i].cpu); + + free(r); +} + +/* Pointer swap trick */ +static inline void swap(struct stats_record **a, struct stats_record **b) +{ + struct stats_record *tmp; + + tmp = *a; + *a = *b; + *b = tmp; +} + static void stats_poll(int interval, bool err_only) { - struct stats_record rec, prev; + struct stats_record *rec, *prev; - memset(&rec, 0, sizeof(rec)); + rec = alloc_stats_record(); + prev = alloc_stats_record(); + stats_collect(rec); + + if (err_only) + printf("\n%s\n", __doc_err_only__); /* Trick to pretty printf with thousands separators use %' */ setlocale(LC_NUMERIC, "en_US"); @@ -258,13 +542,15 @@ static void stats_poll(int interval, bool err_only) fflush(stdout); while (1) { - memcpy(&prev, &rec, sizeof(rec)); - stats_collect(&rec); - stats_print_headers(err_only); - stats_print(&rec, &prev, err_only); + swap(&prev, &rec); + stats_collect(rec); + stats_print(rec, prev, err_only); fflush(stdout); sleep(interval); } + + free_stats_record(rec); + free_stats_record(prev); } static void print_bpf_prog_info(void) diff --git a/samples/bpf/xdp_rxq_info_kern.c b/samples/bpf/xdp_rxq_info_kern.c new file mode 100644 index 000000000000..3fd209291653 --- /dev/null +++ b/samples/bpf/xdp_rxq_info_kern.c @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. + * + * Example howto extract XDP RX-queue info + */ +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" + +/* Config setup from with userspace + * + * User-side setup ifindex in config_map, to verify that + * ctx->ingress_ifindex is correct (against configured ifindex) + */ +struct config { + __u32 action; + int ifindex; +}; +struct bpf_map_def SEC("maps") config_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(struct config), + .max_entries = 1, +}; + +/* Common stats data record (shared with userspace) */ +struct datarec { + __u64 processed; + __u64 issue; +}; + +struct bpf_map_def SEC("maps") stats_global_map = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = 1, +}; + +#define MAX_RXQs 64 + +/* Stats per rx_queue_index (per CPU) */ +struct bpf_map_def SEC("maps") rx_queue_index_map = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = MAX_RXQs + 1, +}; + +SEC("xdp_prog0") +int xdp_prognum0(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct datarec *rec, *rxq_rec; + int ingress_ifindex; + struct config *config; + u32 key = 0; + + /* Global stats record */ + rec = bpf_map_lookup_elem(&stats_global_map, &key); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + /* Accessing ctx->ingress_ifindex, cause BPF to rewrite BPF + * instructions inside kernel to access xdp_rxq->dev->ifindex + */ + ingress_ifindex = ctx->ingress_ifindex; + + config = bpf_map_lookup_elem(&config_map, &key); + if (!config) + return XDP_ABORTED; + + /* Simple test: check ctx provided ifindex is as expected */ + if (ingress_ifindex != config->ifindex) { + /* count this error case */ + rec->issue++; + return XDP_ABORTED; + } + + /* Update stats per rx_queue_index. Handle if rx_queue_index + * is larger than stats map can contain info for. + */ + key = ctx->rx_queue_index; + if (key >= MAX_RXQs) + key = MAX_RXQs; + rxq_rec = bpf_map_lookup_elem(&rx_queue_index_map, &key); + if (!rxq_rec) + return XDP_ABORTED; + rxq_rec->processed++; + if (key == MAX_RXQs) + rxq_rec->issue++; + + return config->action; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_rxq_info_user.c b/samples/bpf/xdp_rxq_info_user.c new file mode 100644 index 000000000000..32430e8b3a6a --- /dev/null +++ b/samples/bpf/xdp_rxq_info_user.c @@ -0,0 +1,531 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. + */ +static const char *__doc__ = " XDP RX-queue info extract example\n\n" + "Monitor how many packets per sec (pps) are received\n" + "per NIC RX queue index and which CPU processed the packet\n" + ; + +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> +#include <locale.h> +#include <sys/resource.h> +#include <getopt.h> +#include <net/if.h> +#include <time.h> + +#include <arpa/inet.h> +#include <linux/if_link.h> + +#include "libbpf.h" +#include "bpf_load.h" +#include "bpf_util.h" + +static int ifindex = -1; +static char ifname_buf[IF_NAMESIZE]; +static char *ifname; + +static __u32 xdp_flags; + +/* Exit return codes */ +#define EXIT_OK 0 +#define EXIT_FAIL 1 +#define EXIT_FAIL_OPTION 2 +#define EXIT_FAIL_XDP 3 +#define EXIT_FAIL_BPF 4 +#define EXIT_FAIL_MEM 5 + +static const struct option long_options[] = { + {"help", no_argument, NULL, 'h' }, + {"dev", required_argument, NULL, 'd' }, + {"skb-mode", no_argument, NULL, 'S' }, + {"sec", required_argument, NULL, 's' }, + {"no-separators", no_argument, NULL, 'z' }, + {"action", required_argument, NULL, 'a' }, + {0, 0, NULL, 0 } +}; + +static void int_exit(int sig) +{ + fprintf(stderr, + "Interrupted: Removing XDP program on ifindex:%d device:%s\n", + ifindex, ifname); + if (ifindex > -1) + set_link_xdp_fd(ifindex, -1, xdp_flags); + exit(EXIT_OK); +} + +struct config { + __u32 action; + int ifindex; +}; +#define XDP_ACTION_MAX (XDP_TX + 1) +#define XDP_ACTION_MAX_STRLEN 11 +static const char *xdp_action_names[XDP_ACTION_MAX] = { + [XDP_ABORTED] = "XDP_ABORTED", + [XDP_DROP] = "XDP_DROP", + [XDP_PASS] = "XDP_PASS", + [XDP_TX] = "XDP_TX", +}; + +static const char *action2str(int action) +{ + if (action < XDP_ACTION_MAX) + return xdp_action_names[action]; + return NULL; +} + +static int parse_xdp_action(char *action_str) +{ + size_t maxlen; + __u64 action = -1; + int i; + + for (i = 0; i < XDP_ACTION_MAX; i++) { + maxlen = XDP_ACTION_MAX_STRLEN; + if (strncmp(xdp_action_names[i], action_str, maxlen) == 0) { + action = i; + break; + } + } + return action; +} + +static void list_xdp_actions(void) +{ + int i; + + printf("Available XDP --action <options>\n"); + for (i = 0; i < XDP_ACTION_MAX; i++) + printf("\t%s\n", xdp_action_names[i]); + printf("\n"); +} + +static void usage(char *argv[]) +{ + int i; + + printf("\nDOCUMENTATION:\n%s\n", __doc__); + printf(" Usage: %s (options-see-below)\n", argv[0]); + printf(" Listing options:\n"); + for (i = 0; long_options[i].name != 0; i++) { + printf(" --%-12s", long_options[i].name); + if (long_options[i].flag != NULL) + printf(" flag (internal value:%d)", + *long_options[i].flag); + else + printf(" short-option: -%c", + long_options[i].val); + printf("\n"); + } + printf("\n"); + list_xdp_actions(); +} + +#define NANOSEC_PER_SEC 1000000000 /* 10^9 */ +static __u64 gettime(void) +{ + struct timespec t; + int res; + + res = clock_gettime(CLOCK_MONOTONIC, &t); + if (res < 0) { + fprintf(stderr, "Error with gettimeofday! (%i)\n", res); + exit(EXIT_FAIL); + } + return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec; +} + +/* Common stats data record shared with _kern.c */ +struct datarec { + __u64 processed; + __u64 issue; +}; +struct record { + __u64 timestamp; + struct datarec total; + struct datarec *cpu; +}; +struct stats_record { + struct record stats; + struct record *rxq; +}; + +static struct datarec *alloc_record_per_cpu(void) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct datarec *array; + size_t size; + + size = sizeof(struct datarec) * nr_cpus; + array = malloc(size); + memset(array, 0, size); + if (!array) { + fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus); + exit(EXIT_FAIL_MEM); + } + return array; +} + +static struct record *alloc_record_per_rxq(void) +{ + unsigned int nr_rxqs = map_data[2].def.max_entries; + struct record *array; + size_t size; + + size = sizeof(struct record) * nr_rxqs; + array = malloc(size); + memset(array, 0, size); + if (!array) { + fprintf(stderr, "Mem alloc error (nr_rxqs:%u)\n", nr_rxqs); + exit(EXIT_FAIL_MEM); + } + return array; +} + +static struct stats_record *alloc_stats_record(void) +{ + unsigned int nr_rxqs = map_data[2].def.max_entries; + struct stats_record *rec; + int i; + + rec = malloc(sizeof(*rec)); + memset(rec, 0, sizeof(*rec)); + if (!rec) { + fprintf(stderr, "Mem alloc error\n"); + exit(EXIT_FAIL_MEM); + } + rec->rxq = alloc_record_per_rxq(); + for (i = 0; i < nr_rxqs; i++) + rec->rxq[i].cpu = alloc_record_per_cpu(); + + rec->stats.cpu = alloc_record_per_cpu(); + return rec; +} + +static void free_stats_record(struct stats_record *r) +{ + unsigned int nr_rxqs = map_data[2].def.max_entries; + int i; + + for (i = 0; i < nr_rxqs; i++) + free(r->rxq[i].cpu); + + free(r->rxq); + free(r->stats.cpu); + free(r); +} + +static bool map_collect_percpu(int fd, __u32 key, struct record *rec) +{ + /* For percpu maps, userspace gets a value per possible CPU */ + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct datarec values[nr_cpus]; + __u64 sum_processed = 0; + __u64 sum_issue = 0; + int i; + + if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { + fprintf(stderr, + "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); + return false; + } + /* Get time as close as possible to reading map contents */ + rec->timestamp = gettime(); + + /* Record and sum values from each CPU */ + for (i = 0; i < nr_cpus; i++) { + rec->cpu[i].processed = values[i].processed; + sum_processed += values[i].processed; + rec->cpu[i].issue = values[i].issue; + sum_issue += values[i].issue; + } + rec->total.processed = sum_processed; + rec->total.issue = sum_issue; + return true; +} + +static void stats_collect(struct stats_record *rec) +{ + int fd, i, max_rxqs; + + fd = map_data[1].fd; /* map: stats_global_map */ + map_collect_percpu(fd, 0, &rec->stats); + + fd = map_data[2].fd; /* map: rx_queue_index_map */ + max_rxqs = map_data[2].def.max_entries; + for (i = 0; i < max_rxqs; i++) + map_collect_percpu(fd, i, &rec->rxq[i]); +} + +static double calc_period(struct record *r, struct record *p) +{ + double period_ = 0; + __u64 period = 0; + + period = r->timestamp - p->timestamp; + if (period > 0) + period_ = ((double) period / NANOSEC_PER_SEC); + + return period_; +} + +static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->processed - p->processed; + pps = packets / period_; + } + return pps; +} + +static __u64 calc_errs_pps(struct datarec *r, + struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->issue - p->issue; + pps = packets / period_; + } + return pps; +} + +static void stats_print(struct stats_record *stats_rec, + struct stats_record *stats_prev, + int action) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + unsigned int nr_rxqs = map_data[2].def.max_entries; + double pps = 0, err = 0; + struct record *rec, *prev; + double t; + int rxq; + int i; + + /* Header */ + printf("\nRunning XDP on dev:%s (ifindex:%d) action:%s\n", + ifname, ifindex, action2str(action)); + + /* stats_global_map */ + { + char *fmt_rx = "%-15s %-7d %'-11.0f %'-10.0f %s\n"; + char *fm2_rx = "%-15s %-7s %'-11.0f\n"; + char *errstr = ""; + + printf("%-15s %-7s %-11s %-11s\n", + "XDP stats", "CPU", "pps", "issue-pps"); + + rec = &stats_rec->stats; + prev = &stats_prev->stats; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps (r, p, t); + err = calc_errs_pps(r, p, t); + if (err > 0) + errstr = "invalid-ifindex"; + if (pps > 0) + printf(fmt_rx, "XDP-RX CPU", + i, pps, err, errstr); + } + pps = calc_pps (&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + printf(fm2_rx, "XDP-RX CPU", "total", pps, err); + } + + /* rx_queue_index_map */ + printf("\n%-15s %-7s %-11s %-11s\n", + "RXQ stats", "RXQ:CPU", "pps", "issue-pps"); + + for (rxq = 0; rxq < nr_rxqs; rxq++) { + char *fmt_rx = "%-15s %3d:%-3d %'-11.0f %'-10.0f %s\n"; + char *fm2_rx = "%-15s %3d:%-3s %'-11.0f\n"; + char *errstr = ""; + int rxq_ = rxq; + + /* Last RXQ in map catch overflows */ + if (rxq_ == nr_rxqs - 1) + rxq_ = -1; + + rec = &stats_rec->rxq[rxq]; + prev = &stats_prev->rxq[rxq]; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps (r, p, t); + err = calc_errs_pps(r, p, t); + if (err > 0) { + if (rxq_ == -1) + errstr = "map-overflow-RXQ"; + else + errstr = "err"; + } + if (pps > 0) + printf(fmt_rx, "rx_queue_index", + rxq_, i, pps, err, errstr); + } + pps = calc_pps (&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + if (pps || err) + printf(fm2_rx, "rx_queue_index", rxq_, "sum", pps, err); + } +} + + +/* Pointer swap trick */ +static inline void swap(struct stats_record **a, struct stats_record **b) +{ + struct stats_record *tmp; + + tmp = *a; + *a = *b; + *b = tmp; +} + +static void stats_poll(int interval, int action) +{ + struct stats_record *record, *prev; + + record = alloc_stats_record(); + prev = alloc_stats_record(); + stats_collect(record); + + while (1) { + swap(&prev, &record); + stats_collect(record); + stats_print(record, prev, action); + sleep(interval); + } + + free_stats_record(record); + free_stats_record(prev); +} + + +int main(int argc, char **argv) +{ + struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY}; + bool use_separators = true; + struct config cfg = { 0 }; + char filename[256]; + int longindex = 0; + int interval = 2; + __u32 key = 0; + int opt, err; + + char action_str_buf[XDP_ACTION_MAX_STRLEN + 1 /* for \0 */] = { 0 }; + int action = XDP_PASS; /* Default action */ + char *action_str = NULL; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + + if (load_bpf_file(filename)) { + fprintf(stderr, "ERR in load_bpf_file(): %s", bpf_log_buf); + return EXIT_FAIL; + } + + if (!prog_fd[0]) { + fprintf(stderr, "ERR: load_bpf_file: %s\n", strerror(errno)); + return EXIT_FAIL; + } + + /* Parse commands line args */ + while ((opt = getopt_long(argc, argv, "hSd:", + long_options, &longindex)) != -1) { + switch (opt) { + case 'd': + if (strlen(optarg) >= IF_NAMESIZE) { + fprintf(stderr, "ERR: --dev name too long\n"); + goto error; + } + ifname = (char *)&ifname_buf; + strncpy(ifname, optarg, IF_NAMESIZE); + ifindex = if_nametoindex(ifname); + if (ifindex == 0) { + fprintf(stderr, + "ERR: --dev name unknown err(%d):%s\n", + errno, strerror(errno)); + goto error; + } + break; + case 's': + interval = atoi(optarg); + break; + case 'S': + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case 'z': + use_separators = false; + break; + case 'a': + action_str = (char *)&action_str_buf; + strncpy(action_str, optarg, XDP_ACTION_MAX_STRLEN); + break; + case 'h': + error: + default: + usage(argv); + return EXIT_FAIL_OPTION; + } + } + /* Required option */ + if (ifindex == -1) { + fprintf(stderr, "ERR: required option --dev missing\n"); + usage(argv); + return EXIT_FAIL_OPTION; + } + cfg.ifindex = ifindex; + + /* Parse action string */ + if (action_str) { + action = parse_xdp_action(action_str); + if (action < 0) { + fprintf(stderr, "ERR: Invalid XDP --action: %s\n", + action_str); + list_xdp_actions(); + return EXIT_FAIL_OPTION; + } + } + cfg.action = action; + + /* Trick to pretty printf with thousands separators use %' */ + if (use_separators) + setlocale(LC_NUMERIC, "en_US"); + + /* User-side setup ifindex in config_map */ + err = bpf_map_update_elem(map_fd[0], &key, &cfg, 0); + if (err) { + fprintf(stderr, "Store config failed (err:%d)\n", err); + exit(EXIT_FAIL_BPF); + } + + /* Remove XDP program when program is interrupted */ + signal(SIGINT, int_exit); + + if (set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) { + fprintf(stderr, "link set xdp fd failed\n"); + return EXIT_FAIL_XDP; + } + + stats_poll(interval, action); + return EXIT_OK; +} diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c index 7cc9d228216f..7c25c0c112bc 100644 --- a/samples/sockmap/sockmap_user.c +++ b/samples/sockmap/sockmap_user.c @@ -23,8 +23,11 @@ #include <stdbool.h> #include <signal.h> #include <fcntl.h> +#include <sys/wait.h> +#include <time.h> #include <sys/time.h> +#include <sys/resource.h> #include <sys/types.h> #include <linux/netlink.h> @@ -35,6 +38,8 @@ #include <assert.h> #include <libgen.h> +#include <getopt.h> + #include "../bpf/bpf_load.h" #include "../bpf/bpf_util.h" #include "../bpf/libbpf.h" @@ -46,15 +51,42 @@ void running_handler(int a); #define S1_PORT 10000 #define S2_PORT 10001 -static int sockmap_test_sockets(int rate, int dot) +/* global sockets */ +int s1, s2, c1, c2, p1, p2; + +static const struct option long_options[] = { + {"help", no_argument, NULL, 'h' }, + {"cgroup", required_argument, NULL, 'c' }, + {"rate", required_argument, NULL, 'r' }, + {"verbose", no_argument, NULL, 'v' }, + {"iov_count", required_argument, NULL, 'i' }, + {"length", required_argument, NULL, 'l' }, + {"test", required_argument, NULL, 't' }, + {0, 0, NULL, 0 } +}; + +static void usage(char *argv[]) +{ + int i; + + printf(" Usage: %s --cgroup <cgroup_path>\n", argv[0]); + printf(" options:\n"); + for (i = 0; long_options[i].name != 0; i++) { + printf(" --%-12s", long_options[i].name); + if (long_options[i].flag != NULL) + printf(" flag (internal value:%d)\n", + *long_options[i].flag); + else + printf(" -%c\n", long_options[i].val); + } + printf("\n"); +} + +static int sockmap_init_sockets(void) { - int i, sc, err, max_fd, one = 1; - int s1, s2, c1, c2, p1, p2; + int i, err, one = 1; struct sockaddr_in addr; - struct timeval timeout; - char buf[1024] = {0}; int *fds[4] = {&s1, &s2, &c1, &c2}; - fd_set w; s1 = s2 = p1 = p2 = c1 = c2 = 0; @@ -63,8 +95,7 @@ static int sockmap_test_sockets(int rate, int dot) *fds[i] = socket(AF_INET, SOCK_STREAM, 0); if (*fds[i] < 0) { perror("socket s1 failed()"); - err = *fds[i]; - goto out; + return errno; } } @@ -74,16 +105,16 @@ static int sockmap_test_sockets(int rate, int dot) (char *)&one, sizeof(one)); if (err) { perror("setsockopt failed()"); - goto out; + return errno; } } /* Non-blocking sockets */ - for (i = 0; i < 4; i++) { + for (i = 0; i < 2; i++) { err = ioctl(*fds[i], FIONBIO, (char *)&one); if (err < 0) { perror("ioctl s1 failed()"); - goto out; + return errno; } } @@ -96,14 +127,14 @@ static int sockmap_test_sockets(int rate, int dot) err = bind(s1, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0) { perror("bind s1 failed()\n"); - goto out; + return errno; } addr.sin_port = htons(S2_PORT); err = bind(s2, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0) { perror("bind s2 failed()\n"); - goto out; + return errno; } /* Listen server sockets */ @@ -111,14 +142,14 @@ static int sockmap_test_sockets(int rate, int dot) err = listen(s1, 32); if (err < 0) { perror("listen s1 failed()\n"); - goto out; + return errno; } addr.sin_port = htons(S2_PORT); err = listen(s2, 32); if (err < 0) { perror("listen s1 failed()\n"); - goto out; + return errno; } /* Initiate Connect */ @@ -126,46 +157,232 @@ static int sockmap_test_sockets(int rate, int dot) err = connect(c1, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0 && errno != EINPROGRESS) { perror("connect c1 failed()\n"); - goto out; + return errno; } addr.sin_port = htons(S2_PORT); err = connect(c2, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0 && errno != EINPROGRESS) { perror("connect c2 failed()\n"); - goto out; + return errno; + } else if (err < 0) { + err = 0; } /* Accept Connecrtions */ p1 = accept(s1, NULL, NULL); if (p1 < 0) { perror("accept s1 failed()\n"); - goto out; + return errno; } p2 = accept(s2, NULL, NULL); if (p2 < 0) { perror("accept s1 failed()\n"); - goto out; + return errno; } - max_fd = p2; - timeout.tv_sec = 10; - timeout.tv_usec = 0; - printf("connected sockets: c1 <-> p1, c2 <-> p2\n"); printf("cgroups binding: c1(%i) <-> s1(%i) - - - c2(%i) <-> s2(%i)\n", c1, s1, c2, s2); + return 0; +} + +struct msg_stats { + size_t bytes_sent; + size_t bytes_recvd; + struct timespec start; + struct timespec end; +}; + +static int msg_loop(int fd, int iov_count, int iov_length, int cnt, + struct msg_stats *s, bool tx) +{ + struct msghdr msg = {0}; + int err, i, flags = MSG_NOSIGNAL; + struct iovec *iov; + + iov = calloc(iov_count, sizeof(struct iovec)); + if (!iov) + return errno; + + for (i = 0; i < iov_count; i++) { + char *d = calloc(iov_length, sizeof(char)); + + if (!d) { + fprintf(stderr, "iov_count %i/%i OOM\n", i, iov_count); + goto out_errno; + } + iov[i].iov_base = d; + iov[i].iov_len = iov_length; + } + + msg.msg_iov = iov; + msg.msg_iovlen = iov_count; + + if (tx) { + clock_gettime(CLOCK_MONOTONIC, &s->start); + for (i = 0; i < cnt; i++) { + int sent = sendmsg(fd, &msg, flags); + + if (sent < 0) { + perror("send loop error:"); + goto out_errno; + } + s->bytes_sent += sent; + } + clock_gettime(CLOCK_MONOTONIC, &s->end); + } else { + int slct, recv, max_fd = fd; + struct timeval timeout; + float total_bytes; + fd_set w; + + total_bytes = (float)iov_count * (float)iov_length * (float)cnt; + err = clock_gettime(CLOCK_MONOTONIC, &s->start); + if (err < 0) + perror("recv start time: "); + while (s->bytes_recvd < total_bytes) { + timeout.tv_sec = 1; + timeout.tv_usec = 0; + + /* FD sets */ + FD_ZERO(&w); + FD_SET(fd, &w); + + slct = select(max_fd + 1, &w, NULL, NULL, &timeout); + if (slct == -1) { + perror("select()"); + clock_gettime(CLOCK_MONOTONIC, &s->end); + goto out_errno; + } else if (!slct) { + fprintf(stderr, "unexpected timeout\n"); + errno = -EIO; + clock_gettime(CLOCK_MONOTONIC, &s->end); + goto out_errno; + } + + recv = recvmsg(fd, &msg, flags); + if (recv < 0) { + if (errno != EWOULDBLOCK) { + clock_gettime(CLOCK_MONOTONIC, &s->end); + perror("recv failed()\n"); + goto out_errno; + } + } + + s->bytes_recvd += recv; + } + clock_gettime(CLOCK_MONOTONIC, &s->end); + } + + for (i = 0; i < iov_count; i++) + free(iov[i].iov_base); + free(iov); + return 0; +out_errno: + for (i = 0; i < iov_count; i++) + free(iov[i].iov_base); + free(iov); + return errno; +} + +static float giga = 1000000000; + +static inline float sentBps(struct msg_stats s) +{ + return s.bytes_sent / (s.end.tv_sec - s.start.tv_sec); +} + +static inline float recvdBps(struct msg_stats s) +{ + return s.bytes_recvd / (s.end.tv_sec - s.start.tv_sec); +} + +static int sendmsg_test(int iov_count, int iov_buf, int cnt, + int verbose, bool base) +{ + float sent_Bps = 0, recvd_Bps = 0; + int rx_fd, txpid, rxpid, err = 0; + struct msg_stats s = {0}; + int status; + + errno = 0; + + if (base) + rx_fd = p1; + else + rx_fd = p2; + + rxpid = fork(); + if (rxpid == 0) { + err = msg_loop(rx_fd, iov_count, iov_buf, cnt, &s, false); + if (err) + fprintf(stderr, + "msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n", + iov_count, iov_buf, cnt, err); + shutdown(p2, SHUT_RDWR); + shutdown(p1, SHUT_RDWR); + if (s.end.tv_sec - s.start.tv_sec) { + sent_Bps = sentBps(s); + recvd_Bps = recvdBps(s); + } + fprintf(stdout, + "rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s\n", + s.bytes_sent, sent_Bps, sent_Bps/giga, + s.bytes_recvd, recvd_Bps, recvd_Bps/giga); + exit(1); + } else if (rxpid == -1) { + perror("msg_loop_rx: "); + return errno; + } + + txpid = fork(); + if (txpid == 0) { + err = msg_loop(c1, iov_count, iov_buf, cnt, &s, true); + if (err) + fprintf(stderr, + "msg_loop_tx: iov_count %i iov_buf %i cnt %i err %i\n", + iov_count, iov_buf, cnt, err); + shutdown(c1, SHUT_RDWR); + if (s.end.tv_sec - s.start.tv_sec) { + sent_Bps = sentBps(s); + recvd_Bps = recvdBps(s); + } + fprintf(stdout, + "tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n", + s.bytes_sent, sent_Bps, sent_Bps/giga, + s.bytes_recvd, recvd_Bps, recvd_Bps/giga); + exit(1); + } else if (txpid == -1) { + perror("msg_loop_tx: "); + return errno; + } + + assert(waitpid(rxpid, &status, 0) == rxpid); + assert(waitpid(txpid, &status, 0) == txpid); + return err; +} + +static int forever_ping_pong(int rate, int verbose) +{ + struct timeval timeout; + char buf[1024] = {0}; + int sc; + + timeout.tv_sec = 10; + timeout.tv_usec = 0; /* Ping/Pong data from client to server */ sc = send(c1, buf, sizeof(buf), 0); if (sc < 0) { perror("send failed()\n"); - goto out; + return sc; } do { - int s, rc, i; + int s, rc, i, max_fd = p2; + fd_set w; /* FD sets */ FD_ZERO(&w); @@ -193,7 +410,7 @@ static int sockmap_test_sockets(int rate, int dot) if (rc < 0) { if (errno != EWOULDBLOCK) { perror("recv failed()\n"); - break; + return rc; } } @@ -205,35 +422,92 @@ static int sockmap_test_sockets(int rate, int dot) sc = send(i, buf, rc, 0); if (sc < 0) { perror("send failed()\n"); - break; + return sc; } } - sleep(rate); - if (dot) { + + if (rate) + sleep(rate); + + if (verbose) { printf("."); fflush(stdout); } } while (running); -out: - close(s1); - close(s2); - close(p1); - close(p2); - close(c1); - close(c2); - return err; + return 0; } +enum { + PING_PONG, + SENDMSG, + BASE, +}; + int main(int argc, char **argv) { - int rate = 1, dot = 1; + int iov_count = 1, length = 1024, rate = 1, verbose = 0; + struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY}; + int opt, longindex, err, cg_fd = 0; + int test = PING_PONG; char filename[256]; - int err, cg_fd; - char *cg_path; - cg_path = argv[argc - 1]; + while ((opt = getopt_long(argc, argv, "hvc:r:i:l:t:", + long_options, &longindex)) != -1) { + switch (opt) { + /* Cgroup configuration */ + case 'c': + cg_fd = open(optarg, O_DIRECTORY, O_RDONLY); + if (cg_fd < 0) { + fprintf(stderr, + "ERROR: (%i) open cg path failed: %s\n", + cg_fd, optarg); + return cg_fd; + } + break; + case 'r': + rate = atoi(optarg); + break; + case 'v': + verbose = 1; + break; + case 'i': + iov_count = atoi(optarg); + break; + case 'l': + length = atoi(optarg); + break; + case 't': + if (strcmp(optarg, "ping") == 0) { + test = PING_PONG; + } else if (strcmp(optarg, "sendmsg") == 0) { + test = SENDMSG; + } else if (strcmp(optarg, "base") == 0) { + test = BASE; + } else { + usage(argv); + return -1; + } + break; + case 'h': + default: + usage(argv); + return -1; + } + } + + if (!cg_fd) { + fprintf(stderr, "%s requires cgroup option: --cgroup <path>\n", + argv[0]); + return -1; + } + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); running = 1; @@ -241,20 +515,16 @@ int main(int argc, char **argv) /* catch SIGINT */ signal(SIGINT, running_handler); + /* If base test skip BPF setup */ + if (test == BASE) + goto run; + if (load_bpf_file(filename)) { fprintf(stderr, "load_bpf_file: (%s) %s\n", filename, strerror(errno)); return 1; } - /* Cgroup configuration */ - cg_fd = open(cg_path, O_DIRECTORY, O_RDONLY); - if (cg_fd < 0) { - fprintf(stderr, "ERROR: (%i) open cg path failed: %s\n", - cg_fd, cg_path); - return cg_fd; - } - /* Attach programs to sockmap */ err = bpf_prog_attach(prog_fd[0], map_fd[0], BPF_SK_SKB_STREAM_PARSER, 0); @@ -280,12 +550,30 @@ int main(int argc, char **argv) return err; } - err = sockmap_test_sockets(rate, dot); +run: + err = sockmap_init_sockets(); if (err) { fprintf(stderr, "ERROR: test socket failed: %d\n", err); - return err; + goto out; } - return 0; + + if (test == PING_PONG) + err = forever_ping_pong(rate, verbose); + else if (test == SENDMSG) + err = sendmsg_test(iov_count, length, rate, verbose, false); + else if (test == BASE) + err = sendmsg_test(iov_count, length, rate, verbose, true); + else + fprintf(stderr, "unknown test\n"); +out: + close(s1); + close(s2); + close(p1); + close(p2); + close(c1); + close(c2); + close(cg_fd); + return err; } void running_handler(int a) |