diff options
author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /net/sched/sch_netem.c | |
download | lwn-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.tar.gz lwn-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.zip |
Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'net/sched/sch_netem.c')
-rw-r--r-- | net/sched/sch_netem.c | 598 |
1 files changed, 598 insertions, 0 deletions
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c new file mode 100644 index 000000000000..31c29deb139d --- /dev/null +++ b/net/sched/sch_netem.c @@ -0,0 +1,598 @@ +/* + * net/sched/sch_netem.c Network emulator + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Many of the algorithms and ideas for this came from + * NIST Net which is not copyrighted. + * + * Authors: Stephen Hemminger <shemminger@osdl.org> + * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro> + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> + +#include <net/pkt_sched.h> + +/* Network Emulation Queuing algorithm. + ==================================== + + Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based + Network Emulation Tool + [2] Luigi Rizzo, DummyNet for FreeBSD + + ---------------------------------------------------------------- + + This started out as a simple way to delay outgoing packets to + test TCP but has grown to include most of the functionality + of a full blown network emulator like NISTnet. It can delay + packets and add random jitter (and correlation). The random + distribution can be loaded from a table as well to provide + normal, Pareto, or experimental curves. Packet loss, + duplication, and reordering can also be emulated. + + This qdisc does not do classification that can be handled in + layering other disciplines. It does not need to do bandwidth + control either since that can be handled by using token + bucket or other rate control. 
+ + The simulator is limited by the Linux timer resolution + and will create packet bursts on the HZ boundary (1ms). +*/ + +struct netem_sched_data { + struct Qdisc *qdisc; + struct sk_buff_head delayed; + struct timer_list timer; + + u32 latency; + u32 loss; + u32 limit; + u32 counter; + u32 gap; + u32 jitter; + u32 duplicate; + + struct crndstate { + unsigned long last; + unsigned long rho; + } delay_cor, loss_cor, dup_cor; + + struct disttable { + u32 size; + s16 table[0]; + } *delay_dist; +}; + +/* Time stamp put into socket buffer control block */ +struct netem_skb_cb { + psched_time_t time_to_send; +}; + +/* init_crandom - initialize correlated random number generator + * Use entropy source for initial seed. + */ +static void init_crandom(struct crndstate *state, unsigned long rho) +{ + state->rho = rho; + state->last = net_random(); +} + +/* get_crandom - correlated random number generator + * Next number depends on last value. + * rho is scaled to avoid floating point. + */ +static unsigned long get_crandom(struct crndstate *state) +{ + u64 value, rho; + unsigned long answer; + + if (state->rho == 0) /* no correllation */ + return net_random(); + + value = net_random(); + rho = (u64)state->rho + 1; + answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32; + state->last = answer; + return answer; +} + +/* tabledist - return a pseudo-randomly distributed value with mean mu and + * std deviation sigma. Uses table lookup to approximate the desired + * distribution, and a uniformly-distributed pseudo-random source. 
+ */ +static long tabledist(unsigned long mu, long sigma, + struct crndstate *state, const struct disttable *dist) +{ + long t, x; + unsigned long rnd; + + if (sigma == 0) + return mu; + + rnd = get_crandom(state); + + /* default uniform distribution */ + if (dist == NULL) + return (rnd % (2*sigma)) - sigma + mu; + + t = dist->table[rnd % dist->size]; + x = (sigma % NETEM_DIST_SCALE) * t; + if (x >= 0) + x += NETEM_DIST_SCALE/2; + else + x -= NETEM_DIST_SCALE/2; + + return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu; +} + +/* Put skb in the private delayed queue. */ +static int delay_skb(struct Qdisc *sch, struct sk_buff *skb) +{ + struct netem_sched_data *q = qdisc_priv(sch); + struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb; + psched_tdiff_t td; + psched_time_t now; + + PSCHED_GET_TIME(now); + td = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist); + PSCHED_TADD2(now, td, cb->time_to_send); + + /* Always queue at tail to keep packets in order */ + if (likely(q->delayed.qlen < q->limit)) { + __skb_queue_tail(&q->delayed, skb); + if (!timer_pending(&q->timer)) { + q->timer.expires = jiffies + PSCHED_US2JIFFIE(td); + add_timer(&q->timer); + } + return NET_XMIT_SUCCESS; + } + + kfree_skb(skb); + return NET_XMIT_DROP; +} + +static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb2; + int ret; + + pr_debug("netem_enqueue skb=%p @%lu\n", skb, jiffies); + + /* Random packet drop 0 => none, ~0 => all */ + if (q->loss && q->loss >= get_crandom(&q->loss_cor)) { + pr_debug("netem_enqueue: random loss\n"); + sch->qstats.drops++; + kfree_skb(skb); + return 0; /* lie about loss so TCP doesn't know */ + } + + /* Random duplication */ + if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor) + && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) { + pr_debug("netem_enqueue: dup %p\n", skb2); + + if (delay_skb(sch, skb2)) { + sch->q.qlen++; + sch->bstats.bytes += 
skb2->len; + sch->bstats.packets++; + } else + sch->qstats.drops++; + } + + /* If doing simple delay then gap == 0 so all packets + * go into the delayed holding queue + * otherwise if doing out of order only "1 out of gap" + * packets will be delayed. + */ + if (q->counter < q->gap) { + ++q->counter; + ret = q->qdisc->enqueue(skb, q->qdisc); + } else { + q->counter = 0; + ret = delay_skb(sch, skb); + } + + if (likely(ret == NET_XMIT_SUCCESS)) { + sch->q.qlen++; + sch->bstats.bytes += skb->len; + sch->bstats.packets++; + } else + sch->qstats.drops++; + + return ret; +} + +/* Requeue packets but don't change time stamp */ +static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + int ret; + + if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) { + sch->q.qlen++; + sch->qstats.requeues++; + } + + return ret; +} + +static unsigned int netem_drop(struct Qdisc* sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + unsigned int len; + + if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) { + sch->q.qlen--; + sch->qstats.drops++; + } + return len; +} + +/* Dequeue packet. 
+ * Move all packets that are ready to send from the delay holding + * list to the underlying qdisc, then just call dequeue + */ +static struct sk_buff *netem_dequeue(struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + + skb = q->qdisc->dequeue(q->qdisc); + if (skb) + sch->q.qlen--; + return skb; +} + +static void netem_watchdog(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc *)arg; + struct netem_sched_data *q = qdisc_priv(sch); + struct net_device *dev = sch->dev; + struct sk_buff *skb; + psched_time_t now; + + pr_debug("netem_watchdog: fired @%lu\n", jiffies); + + spin_lock_bh(&dev->queue_lock); + PSCHED_GET_TIME(now); + + while ((skb = skb_peek(&q->delayed)) != NULL) { + const struct netem_skb_cb *cb + = (const struct netem_skb_cb *)skb->cb; + long delay + = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now)); + pr_debug("netem_watchdog: skb %p@%lu %ld\n", + skb, jiffies, delay); + + /* if more time remaining? */ + if (delay > 0) { + mod_timer(&q->timer, jiffies + delay); + break; + } + __skb_unlink(skb, &q->delayed); + + if (q->qdisc->enqueue(skb, q->qdisc)) { + sch->q.qlen--; + sch->qstats.drops++; + } + } + qdisc_run(dev); + spin_unlock_bh(&dev->queue_lock); +} + +static void netem_reset(struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + + qdisc_reset(q->qdisc); + skb_queue_purge(&q->delayed); + + sch->q.qlen = 0; + del_timer_sync(&q->timer); +} + +static int set_fifo_limit(struct Qdisc *q, int limit) +{ + struct rtattr *rta; + int ret = -ENOMEM; + + rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); + if (rta) { + rta->rta_type = RTM_NEWQDISC; + rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt)); + ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit; + + ret = q->ops->change(q, rta); + kfree(rta); + } + return ret; +} + +/* + * Distribution data is a variable size payload containing + * signed 16 bit values. 
+ */ +static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr) +{ + struct netem_sched_data *q = qdisc_priv(sch); + unsigned long n = RTA_PAYLOAD(attr)/sizeof(__s16); + const __s16 *data = RTA_DATA(attr); + struct disttable *d; + int i; + + if (n > 65536) + return -EINVAL; + + d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL); + if (!d) + return -ENOMEM; + + d->size = n; + for (i = 0; i < n; i++) + d->table[i] = data[i]; + + spin_lock_bh(&sch->dev->queue_lock); + d = xchg(&q->delay_dist, d); + spin_unlock_bh(&sch->dev->queue_lock); + + kfree(d); + return 0; +} + +static int get_correlation(struct Qdisc *sch, const struct rtattr *attr) +{ + struct netem_sched_data *q = qdisc_priv(sch); + const struct tc_netem_corr *c = RTA_DATA(attr); + + if (RTA_PAYLOAD(attr) != sizeof(*c)) + return -EINVAL; + + init_crandom(&q->delay_cor, c->delay_corr); + init_crandom(&q->loss_cor, c->loss_corr); + init_crandom(&q->dup_cor, c->dup_corr); + return 0; +} + +static int netem_change(struct Qdisc *sch, struct rtattr *opt) +{ + struct netem_sched_data *q = qdisc_priv(sch); + struct tc_netem_qopt *qopt; + int ret; + + if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt)) + return -EINVAL; + + qopt = RTA_DATA(opt); + ret = set_fifo_limit(q->qdisc, qopt->limit); + if (ret) { + pr_debug("netem: can't set fifo limit\n"); + return ret; + } + + q->latency = qopt->latency; + q->jitter = qopt->jitter; + q->limit = qopt->limit; + q->gap = qopt->gap; + q->loss = qopt->loss; + q->duplicate = qopt->duplicate; + + /* Handle nested options after initial queue options. + * Should have put all options in nested format but too late now. 
+ */ + if (RTA_PAYLOAD(opt) > sizeof(*qopt)) { + struct rtattr *tb[TCA_NETEM_MAX]; + if (rtattr_parse(tb, TCA_NETEM_MAX, + RTA_DATA(opt) + sizeof(*qopt), + RTA_PAYLOAD(opt) - sizeof(*qopt))) + return -EINVAL; + + if (tb[TCA_NETEM_CORR-1]) { + ret = get_correlation(sch, tb[TCA_NETEM_CORR-1]); + if (ret) + return ret; + } + + if (tb[TCA_NETEM_DELAY_DIST-1]) { + ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST-1]); + if (ret) + return ret; + } + } + + + return 0; +} + +static int netem_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct netem_sched_data *q = qdisc_priv(sch); + int ret; + + if (!opt) + return -EINVAL; + + skb_queue_head_init(&q->delayed); + init_timer(&q->timer); + q->timer.function = netem_watchdog; + q->timer.data = (unsigned long) sch; + q->counter = 0; + + q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + if (!q->qdisc) { + pr_debug("netem: qdisc create failed\n"); + return -ENOMEM; + } + + ret = netem_change(sch, opt); + if (ret) { + pr_debug("netem: change failed\n"); + qdisc_destroy(q->qdisc); + } + return ret; +} + +static void netem_destroy(struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + + del_timer_sync(&q->timer); + qdisc_destroy(q->qdisc); + kfree(q->delay_dist); +} + +static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + const struct netem_sched_data *q = qdisc_priv(sch); + unsigned char *b = skb->tail; + struct rtattr *rta = (struct rtattr *) b; + struct tc_netem_qopt qopt; + struct tc_netem_corr cor; + + qopt.latency = q->latency; + qopt.jitter = q->jitter; + qopt.limit = q->limit; + qopt.loss = q->loss; + qopt.gap = q->gap; + qopt.duplicate = q->duplicate; + RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); + + cor.delay_corr = q->delay_cor.rho; + cor.loss_corr = q->loss_cor.rho; + cor.dup_corr = q->dup_cor.rho; + RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor); + rta->rta_len = skb->tail - b; + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + 
+static int netem_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct netem_sched_data *q = qdisc_priv(sch); + + if (cl != 1) /* only one class */ + return -ENOENT; + + tcm->tcm_handle |= TC_H_MIN(1); + tcm->tcm_info = q->qdisc->handle; + + return 0; +} + +static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct netem_sched_data *q = qdisc_priv(sch); + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = xchg(&q->qdisc, new); + qdisc_reset(*old); + sch->q.qlen = 0; + sch_tree_unlock(sch); + + return 0; +} + +static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct netem_sched_data *q = qdisc_priv(sch); + return q->qdisc; +} + +static unsigned long netem_get(struct Qdisc *sch, u32 classid) +{ + return 1; +} + +static void netem_put(struct Qdisc *sch, unsigned long arg) +{ +} + +static int netem_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + struct rtattr **tca, unsigned long *arg) +{ + return -ENOSYS; +} + +static int netem_delete(struct Qdisc *sch, unsigned long arg) +{ + return -ENOSYS; +} + +static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker) +{ + if (!walker->stop) { + if (walker->count >= walker->skip) + if (walker->fn(sch, 1, walker) < 0) { + walker->stop = 1; + return; + } + walker->count++; + } +} + +static struct tcf_proto **netem_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + return NULL; +} + +static struct Qdisc_class_ops netem_class_ops = { + .graft = netem_graft, + .leaf = netem_leaf, + .get = netem_get, + .put = netem_put, + .change = netem_change_class, + .delete = netem_delete, + .walk = netem_walk, + .tcf_chain = netem_find_tcf, + .dump = netem_dump_class, +}; + +static struct Qdisc_ops netem_qdisc_ops = { + .id = "netem", + .cl_ops = &netem_class_ops, + .priv_size = sizeof(struct netem_sched_data), + .enqueue = netem_enqueue, + .dequeue = netem_dequeue, + 
.requeue = netem_requeue, + .drop = netem_drop, + .init = netem_init, + .reset = netem_reset, + .destroy = netem_destroy, + .change = netem_change, + .dump = netem_dump, + .owner = THIS_MODULE, +}; + + +static int __init netem_module_init(void) +{ + return register_qdisc(&netem_qdisc_ops); +} +static void __exit netem_module_exit(void) +{ + unregister_qdisc(&netem_qdisc_ops); +} +module_init(netem_module_init) +module_exit(netem_module_exit) +MODULE_LICENSE("GPL"); |