diff options
Diffstat (limited to 'net/sched/sch_fq.c')
-rw-r--r-- | net/sched/sch_fq.c | 89 |
1 files changed, 62 insertions, 27 deletions
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index e5458b99e09c..a4f738ac7728 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -86,6 +86,7 @@ struct fq_sched_data { struct rb_root delayed; /* for rate limited flows */ u64 time_next_delayed_flow; + unsigned long unthrottle_latency_ns; struct fq_flow internal; /* for non classified or high prio packets */ u32 quantum; @@ -94,6 +95,7 @@ struct fq_sched_data { u32 flow_max_rate; /* optional max rate per flow */ u32 flow_plimit; /* max packets per flow */ u32 orphan_mask; /* mask for orphaned skb */ + u32 low_rate_threshold; struct rb_root *fq_root; u8 rate_enable; u8 fq_trees_log; @@ -134,7 +136,7 @@ static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f) struct fq_flow *aux; parent = *p; - aux = container_of(parent, struct fq_flow, rate_node); + aux = rb_entry(parent, struct fq_flow, rate_node); if (f->time_next_packet >= aux->time_next_packet) p = &parent->rb_right; else @@ -186,7 +188,7 @@ static void fq_gc(struct fq_sched_data *q, while (*p) { parent = *p; - f = container_of(parent, struct fq_flow, fq_node); + f = rb_entry(parent, struct fq_flow, fq_node); if (f->sk == sk) break; @@ -243,7 +245,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) skb_orphan(skb); } - root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)]; + root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)]; if (q->flows >= (2U << q->fq_trees_log) && q->inactive_flows > q->flows/2) @@ -254,7 +256,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) while (*p) { parent = *p; - f = container_of(parent, struct fq_flow, fq_node); + f = rb_entry(parent, struct fq_flow, fq_node); if (f->sk == sk) { /* socket might have been reallocated, so check * if its sk_hash is the same. @@ -407,14 +409,22 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, static void fq_check_throttled(struct fq_sched_data *q, u64 now) { + unsigned long sample; struct rb_node *p; if (q->time_next_delayed_flow > now) return; + /* Update unthrottle latency EWMA. + * This is cheap and can help diagnosing timer/latency problems. + */ + sample = (unsigned long)(now - q->time_next_delayed_flow); + q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3; + q->unthrottle_latency_ns += sample >> 3; + q->time_next_delayed_flow = ~0ULL; while ((p = rb_first(&q->delayed)) != NULL) { - struct fq_flow *f = container_of(p, struct fq_flow, rate_node); + struct fq_flow *f = rb_entry(p, struct fq_flow, rate_node); if (f->time_next_packet > now) { q->time_next_delayed_flow = f->time_next_packet; @@ -433,7 +443,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) struct fq_flow_head *head; struct sk_buff *skb; struct fq_flow *f; - u32 rate; + u32 rate, plen; skb = fq_dequeue_head(sch, &q->internal); if (skb) @@ -482,7 +492,7 @@ begin: prefetch(&skb->end); f->credit -= qdisc_pkt_len(skb); - if (f->credit > 0 || !q->rate_enable) + if (!q->rate_enable) goto out; /* Do not pace locally generated ack packets */ @@ -493,8 +503,15 @@ begin: if (skb->sk) rate = min(skb->sk->sk_pacing_rate, rate); + if (rate <= q->low_rate_threshold) { + f->credit = 0; + plen = qdisc_pkt_len(skb); + } else { + plen = max(qdisc_pkt_len(skb), q->quantum); + if (f->credit > 0) + goto out; + } if (rate != ~0U) { - u32 plen = max(qdisc_pkt_len(skb), q->quantum); u64 len = (u64)plen * NSEC_PER_SEC; if (likely(rate)) @@ -507,7 +524,12 @@ begin: len = NSEC_PER_SEC; q->stat_pkts_too_long++; } - + /* Account for schedule/timers drifts. + * f->time_next_packet was set when prior packet was sent, + * and current time (@now) can be too late by tens of us. + */ + if (f->time_next_packet) + len -= min(len/2, now - f->time_next_packet); f->time_next_packet = now + len; } out: @@ -541,7 +563,7 @@ static void fq_reset(struct Qdisc *sch) for (idx = 0; idx < (1U << q->fq_trees_log); idx++) { root = &q->fq_root[idx]; while ((p = rb_first(root)) != NULL) { - f = container_of(p, struct fq_flow, fq_node); + f = rb_entry(p, struct fq_flow, fq_node); rb_erase(p, root); fq_flow_purge(f); @@ -571,20 +593,20 @@ static void fq_rehash(struct fq_sched_data *q, oroot = &old_array[idx]; while ((op = rb_first(oroot)) != NULL) { rb_erase(op, oroot); - of = container_of(op, struct fq_flow, fq_node); + of = rb_entry(op, struct fq_flow, fq_node); if (fq_gc_candidate(of)) { fcnt++; kmem_cache_free(fq_flow_cachep, of); continue; } - nroot = &new_array[hash_32((u32)(long)of->sk, new_log)]; + nroot = &new_array[hash_ptr(of->sk, new_log)]; np = &nroot->rb_node; parent = NULL; while (*np) { parent = *np; - nf = container_of(parent, struct fq_flow, fq_node); + nf = rb_entry(parent, struct fq_flow, fq_node); BUG_ON(nf->sk == of->sk); if (nf->sk > of->sk) @@ -662,6 +684,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 }, [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 }, + [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, }; static int fq_change(struct Qdisc *sch, struct nlattr *opt) @@ -716,6 +739,10 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_FQ_FLOW_MAX_RATE]) q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); + if (tb[TCA_FQ_LOW_RATE_THRESHOLD]) + q->low_rate_threshold = + nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]); + if (tb[TCA_FQ_RATE_ENABLE]) { u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]); @@ -774,6 +801,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt) q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch)); q->flow_refill_delay = msecs_to_jiffies(40); q->flow_max_rate = ~0U; + q->time_next_delayed_flow = ~0ULL; q->rate_enable = 1; q->new_flows.first = NULL; q->old_flows.first = NULL; @@ -781,6 +809,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt) q->fq_root = NULL; q->fq_trees_log = ilog2(1024); q->orphan_mask = 1024 - 1; + q->low_rate_threshold = 550000 / 8; qdisc_watchdog_init(&q->watchdog, sch); if (opt) @@ -811,6 +840,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, jiffies_to_usecs(q->flow_refill_delay)) || nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) || + nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD, + q->low_rate_threshold) || nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) goto nla_put_failure; @@ -823,20 +854,24 @@ nla_put_failure: static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct fq_sched_data *q = qdisc_priv(sch); - u64 now = ktime_get_ns(); - struct tc_fq_qd_stats st = { - .gc_flows = q->stat_gc_flows, - .highprio_packets = q->stat_internal_packets, - .tcp_retrans = q->stat_tcp_retrans, - .throttled = q->stat_throttled, - .flows_plimit = q->stat_flows_plimit, - .pkts_too_long = q->stat_pkts_too_long, - .allocation_errors = q->stat_allocation_errors, - .flows = q->flows, - .inactive_flows = q->inactive_flows, - .throttled_flows = q->throttled_flows, - .time_next_delayed_flow = q->time_next_delayed_flow - now, - }; + struct tc_fq_qd_stats st; + + sch_tree_lock(sch); + + st.gc_flows = q->stat_gc_flows; + st.highprio_packets = q->stat_internal_packets; + st.tcp_retrans = q->stat_tcp_retrans; + st.throttled = q->stat_throttled; + st.flows_plimit = q->stat_flows_plimit; + st.pkts_too_long = q->stat_pkts_too_long; + st.allocation_errors = q->stat_allocation_errors; + st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_ns(); + st.flows = q->flows; + st.inactive_flows = q->inactive_flows; + st.throttled_flows = q->throttled_flows; + st.unthrottle_latency_ns = min_t(unsigned long, + q->unthrottle_latency_ns, ~0U); + sch_tree_unlock(sch); return gnet_stats_copy_app(d, &st, sizeof(st)); } |