sch_tbf.c revision b757c9336d63f94c6b57532bb4e8651d8b28786f
/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *						 original idea by Martin Devera
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>


/*	Simple Token Bucket Filter.
	=======================================

	SOURCE.
	-------

	None.

	Description.
	------------

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)

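	As an illustrative sketch only (not part of this module), the
	packetized definition could be checked against a recorded trace of
	arrival times t[] (seconds) and packet sizes s[] (bits) in ordinary
	userspace C (assuming <stdbool.h>):

	static bool obeys_tbf(const double *t, const double *s, int n,
			      double R, double B)
	{
		int i, k;

		for (i = 0; i < n; i++) {
			double sum = 0.0;	// bits in packets i..k

			for (k = i; k < n; k++) {
				sum += s[k];
				if (sum > B + R * (t[k] - t[i]))
					return false;
			}
		}
		return true;
	}
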
	Algorithm.
	----------

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.

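	A minimal sketch of that update rule (illustrative variable names
	only; the real arithmetic in tbf_dequeue() below keeps the counter
	in nanoseconds and maintains two buckets):

		credit = min(B / R, N + (now - t_c));	// refill since t_c, capped
		if (S / R <= credit) {			// enough credit for S bytes?
			N = credit - S / R;		// commit refill, charge packet
			t_c = now;
			// ... transmit the packet ...
		}
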

	Actually, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady-state burst size; the other, with
	rate P (peak rate) and depth M (equal to the link MTU), limits
	bursts at a smaller time scale.

	It is easy to see that P>R, and B>M. If P is infinity, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)

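	For example (illustrative numbers only): with backlog limit
	L = 100 Kbytes, B = 10 Kbytes, R = 1 Mbyte/sec, M = 1500 bytes and
	P = 10 Mbytes/sec,

	lat = max ((100K-10K)/1M, (100K-1.5K)/10M) = max (90ms, ~10ms) = 90ms
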

	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not woken up by an EOI for some previous packet,
	TBF can stop its activity for 1/HZ.


	This means that with depth B, the maximal rate is

	R_crit = B*HZ

	E.g., for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak rate TBF is much tougher: with MTU 1500 and HZ=100,
	P_crit = MTU*HZ = 150Kbytes/sec. So, if you need greater peak
	rates, use Alpha with HZ=1000 :-)

	With classful TBF, limit is just kept for backwards compatibility.
	It is passed to the default bfifo qdisc - if the inner qdisc is
	changed, the limit is not effective anymore.
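
	For reference, such a qdisc is usually configured from userspace
	with the tc tool; a typical invocation looks something like:

		tc qdisc add dev eth0 root tbf rate 10mbit burst 10kb latency 50ms

	(the exact option names and defaults are those of the iproute2
	version in use).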
*/

struct tbf_sched_data {
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	s64		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	s64		mtu;		/* Peak bucket depth: ns to send mtu bytes at peak rate */
	u32		max_size;	/* Largest packet size we will accept */
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;
	bool peak_present;

/* Variables */
	s64	tokens;			/* Current number of B tokens */
	s64	ptokens;		/* Current number of P tokens */
	s64	t_c;			/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};

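/* Packets larger than max_size can never accumulate enough tokens to be
 * sent, so reject them via qdisc_reshape_fail() instead of queueing them;
 * everything else goes to the inner qdisc.
 */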
static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	if (qdisc_pkt_len(skb) > q->max_size)
		return qdisc_reshape_fail(skb, sch);

	ret = qdisc_enqueue(skb, q->qdisc);
	if (ret != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(ret))
			sch->qstats.drops++;
		return ret;
	}

	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}

static unsigned int tbf_drop(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned int len = 0;

	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}

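/* Peek at the head of the inner qdisc and hand it out only if both the
 * rate bucket and (if configured) the peak rate bucket hold enough
 * tokens; otherwise arm the watchdog for the moment the deficit will
 * have been replenished.
 */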
static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->ops->peek(q->qdisc);

	if (skb) {
		s64 now;
		s64 toks;
		s64 ptoks = 0;
		unsigned int len = qdisc_pkt_len(skb);

		now = ktime_to_ns(ktime_get());
		toks = min_t(s64, now - q->t_c, q->buffer);

		if (q->peak_present) {
			ptoks = toks + q->ptokens;
			if (ptoks > q->mtu)
				ptoks = q->mtu;
			ptoks -= (s64) psched_l2t_ns(&q->peak, len);
		}
		toks += q->tokens;
		if (toks > q->buffer)
			toks = q->buffer;
		toks -= (s64) psched_l2t_ns(&q->rate, len);

		/* Negative iff either bucket lacks the credit for this packet */
		if ((toks|ptoks) >= 0) {
			skb = qdisc_dequeue_peeked(q->qdisc);
			if (unlikely(!skb))
				return NULL;

			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			sch->q.qlen--;
			qdisc_unthrottled(sch);
			qdisc_bstats_update(sch, skb);
			return skb;
		}

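		/* Not enough tokens: sleep until the more depleted of the
		 * two buckets (whichever of -toks and -ptoks is larger)
		 * has been refilled.
		 */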
		qdisc_watchdog_schedule_ns(&q->watchdog,
					   now + max_t(long, -toks, -ptoks));

		/* Maybe we have a shorter packet in the queue,
		   which can be sent now. It sounds cool,
		   but this is wrong in principle.
		   We MUST NOT reorder packets under these circumstances.

		   Really, if we split the flow into independent
		   subflows, it would be a very good solution.
		   This is the main idea of all FQ algorithms
		   (cf. CSZ, HPFQ, HFSC)
		 */

		sch->qstats.overlimits++;
	}
	return NULL;
}

static void tbf_reset(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	q->t_c = ktime_to_ns(ktime_get());
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}

static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
	[TCA_TBF_PARMS]	= { .len = sizeof(struct tc_tbf_qopt) },
	[TCA_TBF_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_PTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
};

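/* Parse the netlink parameters, derive the largest packet size that can
 * ever earn enough tokens (max_size) from the rate table(s), and resize
 * or create the inner bfifo before committing the new configuration
 * under the tree lock.
 */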
static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
{
	int err;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_TBF_PTAB + 1];
	struct tc_tbf_qopt *qopt;
	struct qdisc_rate_table *rtab = NULL;
	struct qdisc_rate_table *ptab = NULL;
	struct Qdisc *child = NULL;
	int max_size, n;

	err = nla_parse_nested(tb, TCA_TBF_PTAB, opt, tbf_policy);
	if (err < 0)
		return err;

	err = -EINVAL;
	if (tb[TCA_TBF_PARMS] == NULL)
		goto done;

	qopt = nla_data(tb[TCA_TBF_PARMS]);
	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]);
	if (rtab == NULL)
		goto done;

	if (qopt->peakrate.rate) {
		if (qopt->peakrate.rate > qopt->rate.rate)
			ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]);
		if (ptab == NULL)
			goto done;
	}

	for (n = 0; n < 256; n++)
		if (rtab->data[n] > qopt->buffer)
			break;
	max_size = (n << qopt->rate.cell_log) - 1;
	if (ptab) {
		int size;

		for (n = 0; n < 256; n++)
			if (ptab->data[n] > qopt->mtu)
				break;
		size = (n << qopt->peakrate.cell_log) - 1;
		if (size < max_size)
			max_size = size;
	}
	if (max_size < 0)
		goto done;

	if (q->qdisc != &noop_qdisc) {
		err = fifo_set_limit(q->qdisc, qopt->limit);
		if (err)
			goto done;
	} else if (qopt->limit > 0) {
		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit);
		if (IS_ERR(child)) {
			err = PTR_ERR(child);
			goto done;
		}
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
		qdisc_destroy(q->qdisc);
		q->qdisc = child;
	}
	q->limit = qopt->limit;
	q->mtu = PSCHED_TICKS2NS(qopt->mtu);
	q->max_size = max_size;
	q->buffer = PSCHED_TICKS2NS(qopt->buffer);
	q->tokens = q->buffer;
	q->ptokens = q->mtu;

	psched_ratecfg_precompute(&q->rate, rtab->rate.rate);
	if (ptab) {
		psched_ratecfg_precompute(&q->peak, ptab->rate.rate);
		q->peak_present = true;
	} else {
		q->peak_present = false;
	}

	sch_tree_unlock(sch);
	err = 0;
done:
	if (rtab)
		qdisc_put_rtab(rtab);
	if (ptab)
		qdisc_put_rtab(ptab);
	return err;
}

static int tbf_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (opt == NULL)
		return -EINVAL;

	q->t_c = ktime_to_ns(ktime_get());
	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	return tbf_change(sch, opt);
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	qdisc_destroy(q->qdisc);
}

static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;
	struct tc_tbf_qopt opt;

	sch->qstats.backlog = q->qdisc->qstats.backlog;
	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	opt.limit = q->limit;
	opt.rate.rate = psched_ratecfg_getrate(&q->rate);
	if (q->peak_present)
		opt.peakrate.rate = psched_ratecfg_getrate(&q->peak);
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = PSCHED_NS2TICKS(q->mtu);
	opt.buffer = PSCHED_NS2TICKS(q->buffer);
	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;

	nla_nest_end(skb, nest);
	return skb->len;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

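/* TBF exposes exactly one class to the classful qdisc API below: the
 * inner qdisc.  Grafting replaces that inner qdisc.
 */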
static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = q->qdisc;
	q->qdisc = new;
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void tbf_put(struct Qdisc *sch, unsigned long arg)
{
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static const struct Qdisc_class_ops tbf_class_ops = {
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.get		=	tbf_get,
	.put		=	tbf_put,
	.walk		=	tbf_walk,
	.dump		=	tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	tbf_drop,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};

static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");