summary | refs | log | tree | commit | diff
path: root/net/sched/sch_fq.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/sched/sch_fq.c')
-rw-r--r-- net/sched/sch_fq.c 89
1 files changed, 62 insertions, 27 deletions
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index e5458b99e09c..a4f738ac7728 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -86,6 +86,7 @@ struct fq_sched_data {
struct rb_root delayed; /* for rate limited flows */
u64 time_next_delayed_flow;
+ unsigned long unthrottle_latency_ns;
struct fq_flow internal; /* for non classified or high prio packets */
u32 quantum;
@@ -94,6 +95,7 @@ struct fq_sched_data {
u32 flow_max_rate; /* optional max rate per flow */
u32 flow_plimit; /* max packets per flow */
u32 orphan_mask; /* mask for orphaned skb */
+ u32 low_rate_threshold;
struct rb_root *fq_root;
u8 rate_enable;
u8 fq_trees_log;
@@ -134,7 +136,7 @@ static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
struct fq_flow *aux;
parent = *p;
- aux = container_of(parent, struct fq_flow, rate_node);
+ aux = rb_entry(parent, struct fq_flow, rate_node);
if (f->time_next_packet >= aux->time_next_packet)
p = &parent->rb_right;
else
@@ -186,7 +188,7 @@ static void fq_gc(struct fq_sched_data *q,
while (*p) {
parent = *p;
- f = container_of(parent, struct fq_flow, fq_node);
+ f = rb_entry(parent, struct fq_flow, fq_node);
if (f->sk == sk)
break;
@@ -243,7 +245,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
skb_orphan(skb);
}
- root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)];
+ root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
if (q->flows >= (2U << q->fq_trees_log) &&
q->inactive_flows > q->flows/2)
@@ -254,7 +256,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
while (*p) {
parent = *p;
- f = container_of(parent, struct fq_flow, fq_node);
+ f = rb_entry(parent, struct fq_flow, fq_node);
if (f->sk == sk) {
/* socket might have been reallocated, so check
* if its sk_hash is the same.
@@ -407,14 +409,22 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
static void fq_check_throttled(struct fq_sched_data *q, u64 now)
{
+ unsigned long sample;
struct rb_node *p;
if (q->time_next_delayed_flow > now)
return;
+ /* Update unthrottle latency EWMA.
+ * This is cheap and can help diagnosing timer/latency problems.
+ */
+ sample = (unsigned long)(now - q->time_next_delayed_flow);
+ q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
+ q->unthrottle_latency_ns += sample >> 3;
+
q->time_next_delayed_flow = ~0ULL;
while ((p = rb_first(&q->delayed)) != NULL) {
- struct fq_flow *f = container_of(p, struct fq_flow, rate_node);
+ struct fq_flow *f = rb_entry(p, struct fq_flow, rate_node);
if (f->time_next_packet > now) {
q->time_next_delayed_flow = f->time_next_packet;
@@ -433,7 +443,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
struct fq_flow_head *head;
struct sk_buff *skb;
struct fq_flow *f;
- u32 rate;
+ u32 rate, plen;
skb = fq_dequeue_head(sch, &q->internal);
if (skb)
@@ -482,7 +492,7 @@ begin:
prefetch(&skb->end);
f->credit -= qdisc_pkt_len(skb);
- if (f->credit > 0 || !q->rate_enable)
+ if (!q->rate_enable)
goto out;
/* Do not pace locally generated ack packets */
@@ -493,8 +503,15 @@ begin:
if (skb->sk)
rate = min(skb->sk->sk_pacing_rate, rate);
+ if (rate <= q->low_rate_threshold) {
+ f->credit = 0;
+ plen = qdisc_pkt_len(skb);
+ } else {
+ plen = max(qdisc_pkt_len(skb), q->quantum);
+ if (f->credit > 0)
+ goto out;
+ }
if (rate != ~0U) {
- u32 plen = max(qdisc_pkt_len(skb), q->quantum);
u64 len = (u64)plen * NSEC_PER_SEC;
if (likely(rate))
@@ -507,7 +524,12 @@ begin:
len = NSEC_PER_SEC;
q->stat_pkts_too_long++;
}
-
+ /* Account for schedule/timers drifts.
+ * f->time_next_packet was set when prior packet was sent,
+ * and current time (@now) can be too late by tens of us.
+ */
+ if (f->time_next_packet)
+ len -= min(len/2, now - f->time_next_packet);
f->time_next_packet = now + len;
}
out:
@@ -541,7 +563,7 @@ static void fq_reset(struct Qdisc *sch)
for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
root = &q->fq_root[idx];
while ((p = rb_first(root)) != NULL) {
- f = container_of(p, struct fq_flow, fq_node);
+ f = rb_entry(p, struct fq_flow, fq_node);
rb_erase(p, root);
fq_flow_purge(f);
@@ -571,20 +593,20 @@ static void fq_rehash(struct fq_sched_data *q,
oroot = &old_array[idx];
while ((op = rb_first(oroot)) != NULL) {
rb_erase(op, oroot);
- of = container_of(op, struct fq_flow, fq_node);
+ of = rb_entry(op, struct fq_flow, fq_node);
if (fq_gc_candidate(of)) {
fcnt++;
kmem_cache_free(fq_flow_cachep, of);
continue;
}
- nroot = &new_array[hash_32((u32)(long)of->sk, new_log)];
+ nroot = &new_array[hash_ptr(of->sk, new_log)];
np = &nroot->rb_node;
parent = NULL;
while (*np) {
parent = *np;
- nf = container_of(parent, struct fq_flow, fq_node);
+ nf = rb_entry(parent, struct fq_flow, fq_node);
BUG_ON(nf->sk == of->sk);
if (nf->sk > of->sk)
@@ -662,6 +684,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
[TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 },
[TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
[TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 },
+ [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 },
};
static int fq_change(struct Qdisc *sch, struct nlattr *opt)
@@ -716,6 +739,10 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
if (tb[TCA_FQ_FLOW_MAX_RATE])
q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
+ if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
+ q->low_rate_threshold =
+ nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);
+
if (tb[TCA_FQ_RATE_ENABLE]) {
u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
@@ -774,6 +801,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt)
q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
q->flow_refill_delay = msecs_to_jiffies(40);
q->flow_max_rate = ~0U;
+ q->time_next_delayed_flow = ~0ULL;
q->rate_enable = 1;
q->new_flows.first = NULL;
q->old_flows.first = NULL;
@@ -781,6 +809,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt)
q->fq_root = NULL;
q->fq_trees_log = ilog2(1024);
q->orphan_mask = 1024 - 1;
+ q->low_rate_threshold = 550000 / 8;
qdisc_watchdog_init(&q->watchdog, sch);
if (opt)
@@ -811,6 +840,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
jiffies_to_usecs(q->flow_refill_delay)) ||
nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
+ nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
+ q->low_rate_threshold) ||
nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
goto nla_put_failure;
@@ -823,20 +854,24 @@ nla_put_failure:
static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
struct fq_sched_data *q = qdisc_priv(sch);
- u64 now = ktime_get_ns();
- struct tc_fq_qd_stats st = {
- .gc_flows = q->stat_gc_flows,
- .highprio_packets = q->stat_internal_packets,
- .tcp_retrans = q->stat_tcp_retrans,
- .throttled = q->stat_throttled,
- .flows_plimit = q->stat_flows_plimit,
- .pkts_too_long = q->stat_pkts_too_long,
- .allocation_errors = q->stat_allocation_errors,
- .flows = q->flows,
- .inactive_flows = q->inactive_flows,
- .throttled_flows = q->throttled_flows,
- .time_next_delayed_flow = q->time_next_delayed_flow - now,
- };
+ struct tc_fq_qd_stats st;
+
+ sch_tree_lock(sch);
+
+ st.gc_flows = q->stat_gc_flows;
+ st.highprio_packets = q->stat_internal_packets;
+ st.tcp_retrans = q->stat_tcp_retrans;
+ st.throttled = q->stat_throttled;
+ st.flows_plimit = q->stat_flows_plimit;
+ st.pkts_too_long = q->stat_pkts_too_long;
+ st.allocation_errors = q->stat_allocation_errors;
+ st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_ns();
+ st.flows = q->flows;
+ st.inactive_flows = q->inactive_flows;
+ st.throttled_flows = q->throttled_flows;
+ st.unthrottle_latency_ns = min_t(unsigned long,
+ q->unthrottle_latency_ns, ~0U);
+ sch_tree_unlock(sch);
return gnet_stats_copy_app(d, &st, sizeof(st));
}