1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
|
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# This test injects a 10-MB burst of traffic with VLAN tag and 802.1p priority
# of 1. This stream is consistently prioritized as priority 1, is put to PG
# buffer 1, and scheduled at TC 1.
#
# - the stream first ingresses through $swp1, where it is forwarded to $swp3
#
# - then it ingresses through $swp4. Here it is put to a lossless buffer and put
# to a small pool ("PFC pool"). The traffic is forwarded to $swp2, which is
# shaped, and thus the PFC pool eventually fills, therefore the headroom
# fills, and $swp3 is paused.
#
# - since $swp3 now can't send traffic, the traffic ingressing $swp1 is kept at
# a pool ("overflow pool"). The overflow pool needs to be large enough to
# contain the whole burst.
#
# - eventually the PFC pool gets some traffic out, headroom therefore gets some
# traffic to the pool, and $swp3 is unpaused again. This way the traffic is
# gradually forwarded from the overflow pool, through the PFC pool, out of
# $swp2, and eventually to $h2.
#
# - if PFC works, all lossless flow packets that ingress through $swp1 should
# also be seen ingressing $h2. If it doesn't, there will be drops due to
# discrepancy between the speeds of $swp1 and $h2.
#
# - it should all play out relatively quickly, so that SLL and HLL will not
# cause drops.
#
# +-----------------------+
# | H1 |
# | + $h1.111 |
# | | 192.0.2.33/28 |
# | | |
# | + $h1 |
# +---|-------------------+ +--------------------+
# | | |
# +---|----------------------|--------------------|---------------------------+
# | + $swp1 $swp3 + + $swp4 |
# | | iPOOL1 iPOOL0 | | iPOOL2 |
# | | ePOOL4 ePOOL5 | | ePOOL4 |
# | | 1Gbps | | 1Gbps |
# | | PFC:enabled=1 | | PFC:enabled=1 |
# | +-|----------------------|-+ +-|------------------------+ |
# | | + $swp1.111 $swp3.111 + | | + $swp4.111 | |
# | | | | | |
# | | BR1 | | BR2 | |
# | | | | | |
# | | | | + $swp2.111 | |
# | +--------------------------+ +---------|----------------+ |
# | | |
# | iPOOL0: 500KB dynamic | |
# | iPOOL1: 10MB static | |
# | iPOOL2: 1MB static + $swp2 |
# | ePOOL4: 500KB dynamic | iPOOL0 |
# | ePOOL5: 10MB static | ePOOL6 |
# | ePOOL6: "infinite" static | 200Mbps shaper |
# +-------------------------------------------------------|-------------------+
# |
# +---|-------------------+
# | + $h2 H2 |
# | | |
# | + $h2.111 |
# | 192.0.2.34/28 |
# +-----------------------+
#
# iPOOL0+ePOOL4 is a helper pool for control traffic etc.
# iPOOL1+ePOOL5 are overflow pools.
# iPOOL2+ePOOL6 are PFC pools.
ALL_TESTS="
ping_ipv4
test_qos_pfc
"
lib_dir=$(dirname $0)/../../../net/forwarding
NUM_NETIFS=6
source $lib_dir/lib.sh
source $lib_dir/devlink_lib.sh
source qos_lib.sh
_1KB=1000
_100KB=$((100 * _1KB))
_500KB=$((500 * _1KB))
_1MB=$((1000 * _1KB))
_10MB=$((10 * _1MB))
h1_create()
{
simple_if_init $h1
mtu_set $h1 10000
vlan_create $h1 111 v$h1 192.0.2.33/28
}
h1_destroy()
{
vlan_destroy $h1 111
mtu_restore $h1
simple_if_fini $h1
}
h2_create()
{
simple_if_init $h2
mtu_set $h2 10000
vlan_create $h2 111 v$h2 192.0.2.34/28
}
h2_destroy()
{
vlan_destroy $h2 111
mtu_restore $h2
simple_if_fini $h2
}
switch_create()
{
# pools
# -----
devlink_pool_size_thtype_save 0
devlink_pool_size_thtype_save 4
devlink_pool_size_thtype_save 1
devlink_pool_size_thtype_save 5
devlink_pool_size_thtype_save 2
devlink_pool_size_thtype_save 6
devlink_port_pool_th_save $swp1 1
devlink_port_pool_th_save $swp2 6
devlink_port_pool_th_save $swp3 5
devlink_port_pool_th_save $swp4 2
devlink_tc_bind_pool_th_save $swp1 1 ingress
devlink_tc_bind_pool_th_save $swp2 1 egress
devlink_tc_bind_pool_th_save $swp3 1 egress
devlink_tc_bind_pool_th_save $swp4 1 ingress
# Control traffic pools. Just reduce the size. Keep them dynamic so that
# we don't need to change all the uninteresting quotas.
devlink_pool_size_thtype_set 0 dynamic $_500KB
devlink_pool_size_thtype_set 4 dynamic $_500KB
# Overflow pools.
devlink_pool_size_thtype_set 1 static $_10MB
devlink_pool_size_thtype_set 5 static $_10MB
# PFC pools. As per the writ, the size of egress PFC pool should be
# infinice, but actually it just needs to be large enough to not matter
# in practice, so reuse the 10MB limit.
devlink_pool_size_thtype_set 2 static $_1MB
devlink_pool_size_thtype_set 6 static $_10MB
# $swp1
# -----
ip link set dev $swp1 up
mtu_set $swp1 10000
vlan_create $swp1 111
ip link set dev $swp1.111 type vlan ingress-qos-map 0:0 1:1
devlink_port_pool_th_set $swp1 1 $_10MB
devlink_tc_bind_pool_th_set $swp1 1 ingress 1 $_10MB
# Configure qdisc so that we can configure PG and therefore pool
# assignment.
tc qdisc replace dev $swp1 root handle 1: \
ets bands 8 strict 8 priomap 7 6
__mlnx_qos -i $swp1 --prio2buffer=0,1,0,0,0,0,0,0 >/dev/null
# $swp2
# -----
ip link set dev $swp2 up
mtu_set $swp2 10000
vlan_create $swp2 111
ip link set dev $swp2.111 type vlan egress-qos-map 0:0 1:1
devlink_port_pool_th_set $swp2 6 $_10MB
devlink_tc_bind_pool_th_set $swp2 1 egress 6 $_10MB
# prio 0->TC0 (band 7), 1->TC1 (band 6). TC1 is shaped.
tc qdisc replace dev $swp2 root handle 1: \
ets bands 8 strict 8 priomap 7 6
tc qdisc replace dev $swp2 parent 1:7 handle 17: \
tbf rate 200Mbit burst 131072 limit 1M
# $swp3
# -----
ip link set dev $swp3 up
mtu_set $swp3 10000
vlan_create $swp3 111
ip link set dev $swp3.111 type vlan egress-qos-map 0:0 1:1
devlink_port_pool_th_set $swp3 5 $_10MB
devlink_tc_bind_pool_th_set $swp3 1 egress 5 $_10MB
# prio 0->TC0 (band 7), 1->TC1 (band 6)
tc qdisc replace dev $swp3 root handle 1: \
ets bands 8 strict 8 priomap 7 6
# Need to enable PFC so that PAUSE takes effect. Therefore need to put
# the lossless prio into a buffer of its own. Don't bother with buffer
# sizes though, there is not going to be any pressure in the "backward"
# direction.
__mlnx_qos -i $swp3 --prio2buffer=0,1,0,0,0,0,0,0 >/dev/null
__mlnx_qos -i $swp3 --pfc=0,1,0,0,0,0,0,0 >/dev/null
# $swp4
# -----
ip link set dev $swp4 up
mtu_set $swp4 10000
vlan_create $swp4 111
ip link set dev $swp4.111 type vlan ingress-qos-map 0:0 1:1
devlink_port_pool_th_set $swp4 2 $_1MB
devlink_tc_bind_pool_th_set $swp4 1 ingress 2 $_1MB
# Configure qdisc so that we can hand-tune headroom.
tc qdisc replace dev $swp4 root handle 1: \
ets bands 8 strict 8 priomap 7 6
__mlnx_qos -i $swp4 --prio2buffer=0,1,0,0,0,0,0,0 >/dev/null
__mlnx_qos -i $swp4 --pfc=0,1,0,0,0,0,0,0 >/dev/null
# PG0 will get autoconfigured to Xoff, give PG1 arbitrarily 100K, which
# is (-2*MTU) about 80K of delay provision.
__mlnx_qos -i $swp4 --buffer_size=0,$_100KB,0,0,0,0,0,0 >/dev/null
# bridges
# -------
ip link add name br1 type bridge vlan_filtering 0
ip link set dev $swp1.111 master br1
ip link set dev $swp3.111 master br1
ip link set dev br1 up
ip link add name br2 type bridge vlan_filtering 0
ip link set dev $swp2.111 master br2
ip link set dev $swp4.111 master br2
ip link set dev br2 up
}
switch_destroy()
{
# Do this first so that we can reset the limits to values that are only
# valid for the original static / dynamic setting.
devlink_pool_size_thtype_restore 6
devlink_pool_size_thtype_restore 5
devlink_pool_size_thtype_restore 4
devlink_pool_size_thtype_restore 2
devlink_pool_size_thtype_restore 1
devlink_pool_size_thtype_restore 0
# bridges
# -------
ip link set dev br2 down
ip link set dev $swp4.111 nomaster
ip link set dev $swp2.111 nomaster
ip link del dev br2
ip link set dev br1 down
ip link set dev $swp3.111 nomaster
ip link set dev $swp1.111 nomaster
ip link del dev br1
# $swp4
# -----
__mlnx_qos -i $swp4 --buffer_size=0,0,0,0,0,0,0,0 >/dev/null
__mlnx_qos -i $swp4 --pfc=0,0,0,0,0,0,0,0 >/dev/null
__mlnx_qos -i $swp4 --prio2buffer=0,0,0,0,0,0,0,0 >/dev/null
tc qdisc del dev $swp4 root
devlink_tc_bind_pool_th_restore $swp4 1 ingress
devlink_port_pool_th_restore $swp4 2
vlan_destroy $swp4 111
mtu_restore $swp4
ip link set dev $swp4 down
# $swp3
# -----
__mlnx_qos -i $swp3 --pfc=0,0,0,0,0,0,0,0 >/dev/null
__mlnx_qos -i $swp3 --prio2buffer=0,0,0,0,0,0,0,0 >/dev/null
tc qdisc del dev $swp3 root
devlink_tc_bind_pool_th_restore $swp3 1 egress
devlink_port_pool_th_restore $swp3 5
vlan_destroy $swp3 111
mtu_restore $swp3
ip link set dev $swp3 down
# $swp2
# -----
tc qdisc del dev $swp2 parent 1:7
tc qdisc del dev $swp2 root
devlink_tc_bind_pool_th_restore $swp2 1 egress
devlink_port_pool_th_restore $swp2 6
vlan_destroy $swp2 111
mtu_restore $swp2
ip link set dev $swp2 down
# $swp1
# -----
__mlnx_qos -i $swp1 --prio2buffer=0,0,0,0,0,0,0,0 >/dev/null
tc qdisc del dev $swp1 root
devlink_tc_bind_pool_th_restore $swp1 1 ingress
devlink_port_pool_th_restore $swp1 1
vlan_destroy $swp1 111
mtu_restore $swp1
ip link set dev $swp1 down
}
setup_prepare()
{
h1=${NETIFS[p1]}
swp1=${NETIFS[p2]}
swp2=${NETIFS[p3]}
h2=${NETIFS[p4]}
swp3=${NETIFS[p5]}
swp4=${NETIFS[p6]}
h2mac=$(mac_get $h2)
vrf_prepare
h1_create
h2_create
switch_create
}
cleanup()
{
pre_cleanup
switch_destroy
h2_destroy
h1_destroy
vrf_cleanup
}
ping_ipv4()
{
ping_test $h1 192.0.2.34
}
test_qos_pfc()
{
RET=0
# 10M pool, each packet is 8K of payload + headers
local pkts=$((_10MB / 8050))
local size=$((pkts * 8050))
local in0=$(ethtool_stats_get $swp1 rx_octets_prio_1)
local out0=$(ethtool_stats_get $swp2 tx_octets_prio_1)
$MZ $h1 -p 8000 -Q 1:111 -A 192.0.2.33 -B 192.0.2.34 \
-a own -b $h2mac -c $pkts -t udp -q
sleep 2
local in1=$(ethtool_stats_get $swp1 rx_octets_prio_1)
local out1=$(ethtool_stats_get $swp2 tx_octets_prio_1)
local din=$((in1 - in0))
local dout=$((out1 - out0))
local pct_in=$((din * 100 / size))
((pct_in > 95 && pct_in < 105))
check_err $? "Relative ingress out of expected bounds, $pct_in% should be 100%"
((dout == din))
check_err $? "$((din - dout)) bytes out of $din ingressed got lost"
log_test "PFC"
}
trap cleanup EXIT
bail_on_lldpad
setup_prepare
setup_wait
tests_run
exit $EXIT_STATUS
|