sch_tbf.c revision 2e04ad424b03661ec8239acd52146497eb33be1c
/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *						 original idea by Martin Devera
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/tcp.h>


/*	Simple Token Bucket Filter.
	=======================================

	SOURCE.
	-------

	None.

	Description.
	------------

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)

	Algorithm.
	----------

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.
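
	A worked illustration (example figures only, not from the code): with
	R = 125 Kbytes/sec and B = 10 Kbytes, N(t) saturates at B/R = 80 msec
	of credit.  A 1500 byte packet costs S/R = 12 msec, so after a long
	idle period roughly B/S ~ 6 such packets can leave back to back
	before the flow is paced down to R.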


	Actually, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady state burst size, while the other,
	with rate P (peak rate) and depth M (equal to the link MTU),
	limits bursts on a smaller time scale.

	It is easy to see that P>R and B>M. If P is infinity, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)

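	For example (illustrative figures): R = 125 Kbytes/sec, B = 10 Kbytes,
	P = 1.25 Mbytes/sec, M = 1500 bytes and a backlog L = 50 Kbytes give
	lat = max((50K-10K)/125K, (50K-1.5K)/1.25M) = max(320 msec, ~39 msec)
	    = 320 msec.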

	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not awakened by EOI for some previous packet,
	TBF can stop its activity for 1/HZ.


	This means that, with depth B, the maximal rate is

	R_crit = B*HZ

	F.e. for 10 Mbit Ethernet and HZ=100 the minimal allowed B is ~10 Kbytes.

	Note that the peak rate TBF is much tougher: with MTU 1500
	P_crit = 150 Kbytes/sec. So, if you need greater peak
	rates, use Alpha with HZ=1000 :-)

	With classful TBF, the limit is kept only for backwards compatibility.
	It is passed to the default bfifo qdisc - if the inner qdisc is
	changed, the limit is no longer effective.
*/
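
/* Illustrative configuration (not part of this file): with iproute2, the
 * parameters above map onto tc(8) roughly as follows; the device name and
 * figures are made up for the example:
 *
 *	tc qdisc add dev eth0 root tbf rate 1mbit burst 10kb limit 30kb
 *
 * and, to additionally cap the peak rate P with an MTU-sized bucket M:
 *
 *	tc qdisc add dev eth0 root tbf rate 1mbit burst 10kb limit 30kb \
 *		peakrate 2mbit mtu 1500
 */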

struct tbf_sched_data {
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	s64		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	s64		mtu;
	u32		max_size;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;
	bool peak_present;

/* Variables */
	s64	tokens;			/* Current number of B tokens */
	s64	ptokens;		/* Current number of P tokens */
	s64	t_c;			/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};


/* Time to Length, convert time in ns to length in bytes
 * to determine how many bytes can be sent in the given time.
 */
static u64 psched_ns_t2l(const struct psched_ratecfg *r,
			 u64 time_in_ns)
{
	/* The formula is:
	 * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
	 */
	u64 len = time_in_ns * r->rate_bytes_ps;

	do_div(len, NSEC_PER_SEC);

	if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
		do_div(len, 53);
		len = len * 48;
	}

	if (len > r->overhead)
		len -= r->overhead;
	else
		len = 0;

	return len;
}
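
/* Worked example for psched_ns_t2l() (illustrative values only): with
 * r->rate_bytes_ps = 125000 (1 Mbit/sec) and time_in_ns = 8000000 (8 msec),
 * len = 8000000 * 125000 / NSEC_PER_SEC = 1000 bytes.  On a TC_LINKLAYER_ATM
 * rate the result is then scaled by 48/53 for cell framing, 1000 / 53 * 48
 * = 864 bytes (integer division), before r->overhead, if any, is subtracted.
 */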

/*
 * Return length of individual segments of a gso packet,
 * including all headers (MAC, IP, TCP/UDP)
 */
static unsigned int skb_gso_seglen(const struct sk_buff *skb)
{
	unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
	const struct skb_shared_info *shinfo = skb_shinfo(skb);

	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
		hdr_len += tcp_hdrlen(skb);
	else
		hdr_len += sizeof(struct udphdr);
	return hdr_len + shinfo->gso_size;
}
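
/* Example (illustrative): for a TCPv4 GSO skb over Ethernet with no TCP
 * options, hdr_len = 14 (MAC) + 20 (IP) = 34 plus tcp_hdrlen() = 20, and
 * gso_size = 1460, giving a per-segment length of 1514 bytes, i.e. one
 * full standard Ethernet frame.
 */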

/* GSO packet is too big, segment it so that tbf can transmit
 * each segment in time
 */
static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *segs, *nskb;
	netdev_features_t features = netif_skb_features(skb);
	int ret, nb;

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

	if (IS_ERR_OR_NULL(segs))
		return qdisc_reshape_fail(skb, sch);

	nb = 0;
	while (segs) {
		nskb = segs->next;
		segs->next = NULL;
		qdisc_skb_cb(segs)->pkt_len = segs->len;
		ret = qdisc_enqueue(segs, q->qdisc);
		if (ret != NET_XMIT_SUCCESS) {
			if (net_xmit_drop_count(ret))
				sch->qstats.drops++;
		} else {
			nb++;
		}
		segs = nskb;
	}
	sch->q.qlen += nb;
	if (nb > 1)
		qdisc_tree_decrease_qlen(sch, 1 - nb);
	consume_skb(skb);
	return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}

static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	if (qdisc_pkt_len(skb) > q->max_size) {
		if (skb_is_gso(skb) && skb_gso_seglen(skb) <= q->max_size)
			return tbf_segment(skb, sch);
		return qdisc_reshape_fail(skb, sch);
	}
	ret = qdisc_enqueue(skb, q->qdisc);
	if (ret != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(ret))
			sch->qstats.drops++;
		return ret;
	}

	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}

static unsigned int tbf_drop(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned int len = 0;

	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}

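/* Dequeue-side token accounting, summarized as a reading aid (the figures
 * in the example are made up): q->tokens and q->ptokens hold credit as
 * nanoseconds of transmission time.  On each dequeue attempt the time
 * elapsed since q->t_c is added to the stored credit (the rate credit is
 * capped at q->buffer, the peak credit at q->mtu), the head packet's
 * transmission time at rate resp. peak is subtracted, and the packet is
 * released only if both results stay non-negative; otherwise the watchdog
 * is armed for the remaining shortfall.  Example: at 1 Mbit/sec a 1500 byte
 * packet costs psched_l2t_ns() = 12 msec of credit, so with empty buckets
 * the watchdog fires roughly 12 msec later.
 */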
static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->ops->peek(q->qdisc);

	if (skb) {
		s64 now;
		s64 toks;
		s64 ptoks = 0;
		unsigned int len = qdisc_pkt_len(skb);

		now = ktime_to_ns(ktime_get());
		toks = min_t(s64, now - q->t_c, q->buffer);

		if (q->peak_present) {
			ptoks = toks + q->ptokens;
			if (ptoks > q->mtu)
				ptoks = q->mtu;
			ptoks -= (s64) psched_l2t_ns(&q->peak, len);
		}
		toks += q->tokens;
		if (toks > q->buffer)
			toks = q->buffer;
		toks -= (s64) psched_l2t_ns(&q->rate, len);

		if ((toks|ptoks) >= 0) {
			skb = qdisc_dequeue_peeked(q->qdisc);
			if (unlikely(!skb))
				return NULL;

			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			sch->q.qlen--;
			qdisc_unthrottled(sch);
			qdisc_bstats_update(sch, skb);
			return skb;
		}

		qdisc_watchdog_schedule_ns(&q->watchdog,
					   now + max_t(long, -toks, -ptoks));

		/* Maybe we have a shorter packet in the queue,
		   which can be sent now. It sounds cool,
		   but this is wrong in principle.
		   We MUST NOT reorder packets under these circumstances.

		   Really, if we split the flow into independent
		   subflows, it would be a very good solution.
		   This is the main idea of all FQ algorithms
		   (cf. CSZ, HPFQ, HFSC)
		 */

		sch->qstats.overlimits++;
	}
	return NULL;
}

static void tbf_reset(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	q->t_c = ktime_to_ns(ktime_get());
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}

static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
	[TCA_TBF_PARMS]	= { .len = sizeof(struct tc_tbf_qopt) },
	[TCA_TBF_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_PTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_RATE64]	= { .type = NLA_U64 },
	[TCA_TBF_PRATE64]	= { .type = NLA_U64 },
	[TCA_TBF_BURST] = { .type = NLA_U32 },
	[TCA_TBF_PBURST] = { .type = NLA_U32 },
};

static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
{
	int err;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_TBF_MAX + 1];
	struct tc_tbf_qopt *qopt;
	struct Qdisc *child = NULL;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;
	u64 max_size;
	s64 buffer, mtu;
	u64 rate64 = 0, prate64 = 0;

	err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy);
	if (err < 0)
		return err;

	err = -EINVAL;
	if (tb[TCA_TBF_PARMS] == NULL)
		goto done;

	qopt = nla_data(tb[TCA_TBF_PARMS]);
	if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
					      tb[TCA_TBF_RTAB]));

	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
					      tb[TCA_TBF_PTAB]));

	if (q->qdisc != &noop_qdisc) {
		err = fifo_set_limit(q->qdisc, qopt->limit);
		if (err)
			goto done;
	} else if (qopt->limit > 0) {
		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit);
		if (IS_ERR(child)) {
			err = PTR_ERR(child);
			goto done;
		}
	}

	buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
	mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);

	if (tb[TCA_TBF_RATE64])
		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
	psched_ratecfg_precompute(&rate, &qopt->rate, rate64);

	if (tb[TCA_TBF_BURST]) {
		max_size = nla_get_u32(tb[TCA_TBF_BURST]);
		buffer = psched_l2t_ns(&rate, max_size);
	} else {
		max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
	}

	if (qopt->peakrate.rate) {
		if (tb[TCA_TBF_PRATE64])
			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
		psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
		if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
			pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equal to rate %llu!\n",
					    peak.rate_bytes_ps, rate.rate_bytes_ps);
			err = -EINVAL;
			goto done;
		}

		if (tb[TCA_TBF_PBURST]) {
			u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
			max_size = min_t(u32, max_size, pburst);
			mtu = psched_l2t_ns(&peak, pburst);
		} else {
			max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
		}
	}

	if (max_size < psched_mtu(qdisc_dev(sch)))
		pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u)!\n",
				    max_size, qdisc_dev(sch)->name,
				    psched_mtu(qdisc_dev(sch)));

	if (!max_size) {
		err = -EINVAL;
		goto done;
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
		qdisc_destroy(q->qdisc);
		q->qdisc = child;
	}
	q->limit = qopt->limit;
	if (tb[TCA_TBF_PBURST])
		q->mtu = mtu;
	else
		q->mtu = PSCHED_TICKS2NS(qopt->mtu);
	q->max_size = max_size;
	if (tb[TCA_TBF_BURST])
		q->buffer = buffer;
	else
		q->buffer = PSCHED_TICKS2NS(qopt->buffer);
	q->tokens = q->buffer;
	q->ptokens = q->mtu;

	memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
	if (qopt->peakrate.rate) {
		memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));
		q->peak_present = true;
	} else {
		q->peak_present = false;
	}

	sch_tree_unlock(sch);
	err = 0;
done:
	return err;
}
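
/* Sizing note for tbf_change() above, with example figures (illustrative
 * only): for rate R = 125000 bytes/sec (1 Mbit/sec) and a configured buffer
 * of 10 msec, buffer = 10000000 ns and, absent TCA_TBF_BURST, max_size =
 * psched_ns_t2l(&rate, buffer) = 1250 bytes; packets larger than this
 * (unless GSO) are rejected by tbf_enqueue() above.
 */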

static int tbf_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (opt == NULL)
		return -EINVAL;

	q->t_c = ktime_to_ns(ktime_get());
	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	return tbf_change(sch, opt);
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	qdisc_destroy(q->qdisc);
}

static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;
	struct tc_tbf_qopt opt;

	sch->qstats.backlog = q->qdisc->qstats.backlog;
	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	opt.limit = q->limit;
	psched_ratecfg_getrate(&opt.rate, &q->rate);
	if (q->peak_present)
		psched_ratecfg_getrate(&opt.peakrate, &q->peak);
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = PSCHED_NS2TICKS(q->mtu);
	opt.buffer = PSCHED_NS2TICKS(q->buffer);
	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;
	if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps))
		goto nla_put_failure;
	if (q->peak_present &&
	    q->peak.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps))
		goto nla_put_failure;

	nla_nest_end(skb, nest);
	return skb->len;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = q->qdisc;
	q->qdisc = new;
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void tbf_put(struct Qdisc *sch, unsigned long arg)
{
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static const struct Qdisc_class_ops tbf_class_ops = {
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.get		=	tbf_get,
	.put		=	tbf_put,
	.walk		=	tbf_walk,
	.dump		=	tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	tbf_drop,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};

static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");