/* blk-cgroup.c, revision a11cdaa7af56423a921a8bdad8f5a5f4ddca918a */
/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *		      Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/seq_file.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include "blk-cgroup.h"
#include <linux/genhd.h>

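/*
 * Upper bound on the length of the "major:minor <sub type>" key strings
 * built by blkio_get_key_name() for the per-device stat output.
 */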
#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
						  struct cgroup *);
static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
			      struct task_struct *, bool);
static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
			   struct cgroup *, struct task_struct *, bool);
static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);

struct cgroup_subsys blkio_subsys = {
	.name = "blkio",
	.create = blkiocg_create,
	.can_attach = blkiocg_can_attach,
	.attach = blkiocg_attach,
	.destroy = blkiocg_destroy,
	.populate = blkiocg_populate,
#ifdef CONFIG_BLK_CGROUP
	/* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
	.subsys_id = blkio_subsys_id,
#endif
	.use_id = 1,
	.module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

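/* Must be called with blkcg->lock held */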
static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
					    struct blkio_policy_node *pn)
{
	list_add(&pn->node, &blkcg->policy_list);
}

/* Must be called with blkcg->lock held */
static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
{
	list_del(&pn->node);
}

/* Must be called with blkcg->lock held */
static struct blkio_policy_node *
blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev)
{
	struct blkio_policy_node *pn;

	list_for_each_entry(pn, &blkcg->policy_list, node) {
		if (pn->dev == dev)
			return pn;
	}

	return NULL;
}

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

void blkio_group_init(struct blkio_group *blkg)
{
	spin_lock_init(&blkg->stats_lock);
}
EXPORT_SYMBOL_GPL(blkio_group_init);

/*
 * Add to the appropriate stat variable depending on the request type.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
				bool sync)
{
	if (direction)
		stat[BLKIO_STAT_WRITE] += add;
	else
		stat[BLKIO_STAT_READ] += add;
	if (sync)
		stat[BLKIO_STAT_SYNC] += add;
	else
		stat[BLKIO_STAT_ASYNC] += add;
}

/*
 * Decrements the appropriate stat variables depending on the request type.
 * BUGs if any of the counters to be decremented is already zero.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
{
	if (direction) {
		BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
		stat[BLKIO_STAT_WRITE]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_READ] == 0);
		stat[BLKIO_STAT_READ]--;
	}
	if (sync) {
		BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
		stat[BLKIO_STAT_SYNC]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
		stat[BLKIO_STAT_ASYNC]--;
	}
}

#ifdef CONFIG_DEBUG_BLK_CGROUP
/* This should be called with the blkg->stats_lock held. */
static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
						struct blkio_group *curr_blkg)
{
	if (blkio_blkg_waiting(&blkg->stats))
		return;
	if (blkg == curr_blkg)
		return;
	blkg->stats.start_group_wait_time = sched_clock();
	blkio_mark_blkg_waiting(&blkg->stats);
}

/* This should be called with the blkg->stats_lock held. */
static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		stats->group_wait_time += now - stats->start_group_wait_time;
	blkio_clear_blkg_waiting(stats);
}

/* This should be called with the blkg->stats_lock held. */
static void blkio_end_empty_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		stats->empty_time += now - stats->start_empty_time;
	blkio_clear_blkg_empty(stats);
}

void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	BUG_ON(blkio_blkg_idling(&blkg->stats));
	blkg->stats.start_idle_time = sched_clock();
	blkio_mark_blkg_idling(&blkg->stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);

void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
{
	unsigned long flags;
	unsigned long long now;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	if (blkio_blkg_idling(stats)) {
		now = sched_clock();
		if (time_after64(now, stats->start_idle_time))
			stats->idle_time += now - stats->start_idle_time;
		blkio_clear_blkg_idling(stats);
	}
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);

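/*
 * Sample the current depth of the queued-request counters so that an
 * average queue size can be reported via the avg_queue_size file.
 */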
void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
{
	unsigned long flags;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	stats->avg_queue_size_sum +=
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
	stats->avg_queue_size_samples++;
	blkio_update_group_wait_time(stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);

void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
			unsigned long dequeue)
{
	blkg->stats.dequeue += dequeue;
}
EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
#else
static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					struct blkio_group *curr_blkg) {}
static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
#endif

void blkiocg_update_io_add_stats(struct blkio_group *blkg,
			struct blkio_group *curr_blkg, bool direction,
			bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
			sync);
	blkio_end_empty_time(&blkg->stats);
	blkio_set_start_group_wait_time(blkg, curr_blkg);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);

void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
						bool direction, bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
					direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);

void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkg->stats.time += time;
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);

void blkiocg_set_start_empty_time(struct blkio_group *blkg, bool ignore)
{
	unsigned long flags;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;

	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
		spin_unlock_irqrestore(&blkg->stats_lock, flags);
		return;
	}

	/*
	 * If ignore is set, we do not panic on the empty flag already being
	 * set. This avoids false positives from superfluous timeslice
	 * complete events (e.g. forced_dispatch in CFQ) that arrive when no
	 * IOs were served, which could otherwise trigger the empty check
	 * incorrectly.
	 */
	BUG_ON(!ignore && blkio_blkg_empty(stats));
	stats->start_empty_time = sched_clock();
	blkio_mark_blkg_empty(stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);

void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
				uint64_t bytes, bool direction, bool sync)
{
	struct blkio_group_stats *stats;
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	stats->sectors += bytes >> 9;
	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
			sync);
	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
			direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);

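/*
 * io_service_time accounts the span from dispatch (io_start_time) to
 * completion, while io_wait_time accounts the span from request arrival
 * (start_time) to dispatch.
 */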
void blkiocg_update_completion_stats(struct blkio_group *blkg,
	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
{
	struct blkio_group_stats *stats;
	unsigned long flags;
	unsigned long long now = sched_clock();

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	if (time_after64(now, io_start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
				now - io_start_time, direction, sync);
	if (time_after64(io_start_time, start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
				io_start_time - start_time, direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);

void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
					bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
			sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);

void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
			struct blkio_group *blkg, void *key, dev_t dev)
{
	unsigned long flags;

	spin_lock_irqsave(&blkcg->lock, flags);
	rcu_assign_pointer(blkg->key, key);
	blkg->blkcg_id = css_id(&blkcg->css);
	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
	spin_unlock_irqrestore(&blkcg->lock, flags);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	/* Need to take css reference? */
	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
#endif
	blkg->dev = dev;
}
EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);

static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
{
	hlist_del_init_rcu(&blkg->blkcg_node);
	blkg->blkcg_id = 0;
}

/*
 * Returns 0 if the blkio_group was still on the cgroup list, or 1 to
 * indicate that the blkio_group was already unhashed by the time we got
 * to it.
 */
int blkiocg_del_blkio_group(struct blkio_group *blkg)
{
	struct blkio_cgroup *blkcg;
	unsigned long flags;
	struct cgroup_subsys_state *css;
	int ret = 1;

	rcu_read_lock();
	css = css_lookup(&blkio_subsys, blkg->blkcg_id);
	if (!css)
		goto out;

	blkcg = container_of(css, struct blkio_cgroup, css);
	spin_lock_irqsave(&blkcg->lock, flags);
	if (!hlist_unhashed(&blkg->blkcg_node)) {
		__blkiocg_del_blkio_group(blkg);
		ret = 0;
	}
	spin_unlock_irqrestore(&blkcg->lock, flags);
out:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);

/* called under rcu_read_lock(). */
struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	void *__key;

	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
		__key = blkg->key;
		if (__key == key)
			return blkg;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(blkiocg_lookup_group);

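/*
 * SHOW_FUNCTION(v) generates blkiocg_v_read(), which returns the blkcg
 * member 'v' as a u64 for the corresponding cgroup file.
 */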
#define SHOW_FUNCTION(__VAR)						\
static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
				       struct cftype *cftype)		\
{									\
	struct blkio_cgroup *blkcg;					\
									\
	blkcg = cgroup_to_blkio_cgroup(cgroup);				\
	return (u64)blkcg->__VAR;					\
}

SHOW_FUNCTION(weight);
#undef SHOW_FUNCTION

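/*
 * Update the cgroup-wide default weight and propagate it to every
 * blkio_group that does not have a per-device weight override.
 */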
static int
blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg;
	struct blkio_group *blkg;
	struct hlist_node *n;
	struct blkio_policy_type *blkiop;
	struct blkio_policy_node *pn;

	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
		return -EINVAL;

	blkcg = cgroup_to_blkio_cgroup(cgroup);
	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);
	blkcg->weight = (unsigned int)val;

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		pn = blkio_policy_search_node(blkcg, blkg->dev);

		if (pn)
			continue;

		list_for_each_entry(blkiop, &blkio_list, list)
			blkiop->ops.blkio_update_group_weight_fn(blkg,
					blkcg->weight);
	}
	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}

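/*
 * Reset all statistics for the cgroup's groups.  The currently queued
 * request counts are preserved, and any idling/waiting/empty state that
 * is in progress is kept with its start time reset to "now".
 */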
static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg;
	struct blkio_group *blkg;
	struct blkio_group_stats *stats;
	struct hlist_node *n;
	uint64_t queued[BLKIO_STAT_TOTAL];
	int i;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	bool idling, waiting, empty;
	unsigned long long now = sched_clock();
#endif

	blkcg = cgroup_to_blkio_cgroup(cgroup);
	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		spin_lock(&blkg->stats_lock);
		stats = &blkg->stats;
#ifdef CONFIG_DEBUG_BLK_CGROUP
		idling = blkio_blkg_idling(stats);
		waiting = blkio_blkg_waiting(stats);
		empty = blkio_blkg_empty(stats);
#endif
		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
			queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
		memset(stats, 0, sizeof(struct blkio_group_stats));
		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
			stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
#ifdef CONFIG_DEBUG_BLK_CGROUP
		if (idling) {
			blkio_mark_blkg_idling(stats);
			stats->start_idle_time = now;
		}
		if (waiting) {
			blkio_mark_blkg_waiting(stats);
			stats->start_group_wait_time = now;
		}
		if (empty) {
			blkio_mark_blkg_empty(stats);
			stats->start_empty_time = now;
		}
#endif
		spin_unlock(&blkg->stats_lock);
	}
	spin_unlock_irq(&blkcg->lock);
	return 0;
}

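/*
 * Build a stat key of the form "major:minor", optionally followed by the
 * sub-type name (" Read", " Write", " Sync", " Async" or " Total").
 */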
static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
				int chars_left, bool diskname_only)
{
	snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
	chars_left -= strlen(str);
	if (chars_left <= 0) {
		printk(KERN_WARNING
			"Possibly incorrect cgroup stat display format\n");
		return;
	}
	if (diskname_only)
		return;
	switch (type) {
	case BLKIO_STAT_READ:
		strlcat(str, " Read", chars_left);
		break;
	case BLKIO_STAT_WRITE:
		strlcat(str, " Write", chars_left);
		break;
	case BLKIO_STAT_SYNC:
		strlcat(str, " Sync", chars_left);
		break;
	case BLKIO_STAT_ASYNC:
		strlcat(str, " Async", chars_left);
		break;
	case BLKIO_STAT_TOTAL:
		strlcat(str, " Total", chars_left);
		break;
	default:
		strlcat(str, " Invalid", chars_left);
	}
}

static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
				struct cgroup_map_cb *cb, dev_t dev)
{
	blkio_get_key_name(0, dev, str, chars_left, true);
	cb->fill(cb, str, val);
	return val;
}

/* This should be called with blkg->stats_lock held */
static uint64_t blkio_get_stat(struct blkio_group *blkg,
		struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
{
	uint64_t disk_total;
	char key_str[MAX_KEY_LEN];
	enum stat_sub_type sub_type;

	if (type == BLKIO_STAT_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.time, cb, dev);
	if (type == BLKIO_STAT_SECTORS)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.sectors, cb, dev);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
		uint64_t sum = blkg->stats.avg_queue_size_sum;
		uint64_t samples = blkg->stats.avg_queue_size_samples;
		if (samples)
			do_div(sum, samples);
		else
			sum = 0;
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
	}
	if (type == BLKIO_STAT_GROUP_WAIT_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.group_wait_time, cb, dev);
	if (type == BLKIO_STAT_IDLE_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.idle_time, cb, dev);
	if (type == BLKIO_STAT_EMPTY_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.empty_time, cb, dev);
	if (type == BLKIO_STAT_DEQUEUE)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.dequeue, cb, dev);
#endif

	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
			sub_type++) {
		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
		cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
	}
	disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
			blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
	cb->fill(cb, key_str, disk_total);
	return disk_total;
}

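/*
 * SHOW_FUNCTION_PER_GROUP(v, type, show_total) generates blkiocg_v_read(),
 * which walks the cgroup's blkio_groups under RCU, emits the requested
 * stat for each device, and optionally appends a "Total" line.
 */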
#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total)		\
static int blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
		struct cftype *cftype, struct cgroup_map_cb *cb)	\
{									\
	struct blkio_cgroup *blkcg;					\
	struct blkio_group *blkg;					\
	struct hlist_node *n;						\
	uint64_t cgroup_total = 0;					\
									\
	if (!cgroup_lock_live_group(cgroup))				\
		return -ENODEV;						\
									\
	blkcg = cgroup_to_blkio_cgroup(cgroup);				\
	rcu_read_lock();						\
	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
		if (blkg->dev) {					\
			spin_lock_irq(&blkg->stats_lock);		\
			cgroup_total += blkio_get_stat(blkg, cb,	\
						blkg->dev, type);	\
			spin_unlock_irq(&blkg->stats_lock);		\
		}							\
	}								\
	if (show_total)							\
		cb->fill(cb, "Total", cgroup_total);			\
	rcu_read_unlock();						\
	cgroup_unlock();						\
	return 0;							\
}

SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0);
SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0);
SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1);
SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1);
SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1);
SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1);
#ifdef CONFIG_DEBUG_BLK_CGROUP
SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0);
SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0);
SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0);
SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0);
#endif
#undef SHOW_FUNCTION_PER_GROUP

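/*
 * Check that dev refers to an existing whole disk (not a partition);
 * returns 0 on success, -ENODEV otherwise.
 */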
static int blkio_check_dev_num(dev_t dev)
{
	int part = 0;
	struct gendisk *disk;

	disk = get_gendisk(dev, &part);
	if (!disk || part)
		return -ENODEV;

	return 0;
}

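/*
 * Parse a "major:minor weight" rule from buf into *newpn.  The device
 * must name an existing whole disk, and the weight must be 0 (treated by
 * the caller as "delete the rule") or fall within
 * [BLKIO_WEIGHT_MIN, BLKIO_WEIGHT_MAX].
 */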
static int blkio_policy_parse_and_set(char *buf,
				      struct blkio_policy_node *newpn)
{
	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
	int ret;
	unsigned long major, minor, temp;
	int i = 0;
	dev_t dev;

	memset(s, 0, sizeof(s));

	while ((p = strsep(&buf, " ")) != NULL) {
		if (!*p)
			continue;

		s[i++] = p;

		/* Guard against too many input fields */
		if (i == 3)
			break;
	}

	if (i != 2)
		return -EINVAL;

	p = strsep(&s[0], ":");
	if (p != NULL)
		major_s = p;
	else
		return -EINVAL;

	minor_s = s[0];
	if (!minor_s)
		return -EINVAL;

	ret = strict_strtoul(major_s, 10, &major);
	if (ret)
		return -EINVAL;

	ret = strict_strtoul(minor_s, 10, &minor);
	if (ret)
		return -EINVAL;

	dev = MKDEV(major, minor);

	ret = blkio_check_dev_num(dev);
	if (ret)
		return ret;

	newpn->dev = dev;

	if (s[1] == NULL)
		return -EINVAL;

	ret = strict_strtoul(s[1], 10, &temp);
	if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
	    temp > BLKIO_WEIGHT_MAX)
		return -EINVAL;

	newpn->weight = temp;

	return 0;
}

unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
			      dev_t dev)
{
	struct blkio_policy_node *pn;

	pn = blkio_policy_search_node(blkcg, dev);
	if (pn)
		return pn->weight;
	else
		return blkcg->weight;
}
EXPORT_SYMBOL_GPL(blkcg_get_weight);

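/*
 * Handler for writes to the blkio.weight_device file.  A hypothetical
 * example, assuming the blkio hierarchy is mounted at /cgroup/blkio:
 *
 *   echo "8:16 300" > /cgroup/blkio/grp/blkio.weight_device  (set rule)
 *   echo "8:16 0"   > /cgroup/blkio/grp/blkio.weight_device  (delete rule)
 *
 * A weight of 0 removes the per-device rule, so the group falls back to
 * the cgroup-wide blkio.weight.
 */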
static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
				       const char *buffer)
{
	int ret = 0;
	char *buf;
	struct blkio_policy_node *newpn, *pn;
	struct blkio_cgroup *blkcg;
	struct blkio_group *blkg;
	int keep_newpn = 0;
	struct hlist_node *n;
	struct blkio_policy_type *blkiop;

	buf = kstrdup(buffer, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
	if (!newpn) {
		ret = -ENOMEM;
		goto free_buf;
	}

	ret = blkio_policy_parse_and_set(buf, newpn);
	if (ret)
		goto free_newpn;

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	spin_lock_irq(&blkcg->lock);

	pn = blkio_policy_search_node(blkcg, newpn->dev);
	if (!pn) {
		if (newpn->weight != 0) {
			blkio_policy_insert_node(blkcg, newpn);
			keep_newpn = 1;
		}
		spin_unlock_irq(&blkcg->lock);
		goto update_io_group;
	}

	if (newpn->weight == 0) {
		/* weight == 0 means deleting this specific weight rule */
		blkio_policy_delete_node(pn);
		spin_unlock_irq(&blkcg->lock);
		goto update_io_group;
	}
	spin_unlock_irq(&blkcg->lock);

	pn->weight = newpn->weight;

update_io_group:
	/* update the weight for each blkio_group on this device */
	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		if (newpn->dev == blkg->dev) {
			list_for_each_entry(blkiop, &blkio_list, list)
				blkiop->ops.blkio_update_group_weight_fn(blkg,
							 newpn->weight ?
							 newpn->weight :
							 blkcg->weight);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);

free_newpn:
	if (!keep_newpn)
		kfree(newpn);
free_buf:
	kfree(buf);
	return ret;
}

static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
				      struct seq_file *m)
{
	struct blkio_cgroup *blkcg;
	struct blkio_policy_node *pn;

	seq_printf(m, "dev\tweight\n");

	blkcg = cgroup_to_blkio_cgroup(cgrp);
	if (list_empty(&blkcg->policy_list))
		goto out;

	spin_lock_irq(&blkcg->lock);
	list_for_each_entry(pn, &blkcg->policy_list, node) {
		seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
			   MINOR(pn->dev), pn->weight);
	}
	spin_unlock_irq(&blkcg->lock);

out:
	return 0;
}

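/* Control files created in each blkio cgroup directory as blkio.<name>. */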
struct cftype blkio_files[] = {
	{
		.name = "weight_device",
		.read_seq_string = blkiocg_weight_device_read,
		.write_string = blkiocg_weight_device_write,
		.max_write_len = 256,
	},
	{
		.name = "weight",
		.read_u64 = blkiocg_weight_read,
		.write_u64 = blkiocg_weight_write,
	},
	{
		.name = "time",
		.read_map = blkiocg_time_read,
	},
	{
		.name = "sectors",
		.read_map = blkiocg_sectors_read,
	},
	{
		.name = "io_service_bytes",
		.read_map = blkiocg_io_service_bytes_read,
	},
	{
		.name = "io_serviced",
		.read_map = blkiocg_io_serviced_read,
	},
	{
		.name = "io_service_time",
		.read_map = blkiocg_io_service_time_read,
	},
	{
		.name = "io_wait_time",
		.read_map = blkiocg_io_wait_time_read,
	},
	{
		.name = "io_merged",
		.read_map = blkiocg_io_merged_read,
	},
	{
		.name = "io_queued",
		.read_map = blkiocg_io_queued_read,
	},
	{
		.name = "reset_stats",
		.write_u64 = blkiocg_reset_stats,
	},
#ifdef CONFIG_DEBUG_BLK_CGROUP
	{
		.name = "avg_queue_size",
		.read_map = blkiocg_avg_queue_size_read,
	},
	{
		.name = "group_wait_time",
		.read_map = blkiocg_group_wait_time_read,
	},
	{
		.name = "idle_time",
		.read_map = blkiocg_idle_time_read,
	},
	{
		.name = "empty_time",
		.read_map = blkiocg_empty_time_read,
	},
	{
		.name = "dequeue",
		.read_map = blkiocg_dequeue_read,
	},
#endif
};

static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	return cgroup_add_files(cgroup, subsys, blkio_files,
				ARRAY_SIZE(blkio_files));
}

static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	unsigned long flags;
	struct blkio_group *blkg;
	void *key;
	struct blkio_policy_type *blkiop;
	struct blkio_policy_node *pn, *pntmp;

	rcu_read_lock();
remove_entry:
	spin_lock_irqsave(&blkcg->lock, flags);

	if (hlist_empty(&blkcg->blkg_list)) {
		spin_unlock_irqrestore(&blkcg->lock, flags);
		goto done;
	}

	blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
				blkcg_node);
	key = rcu_dereference(blkg->key);
	__blkiocg_del_blkio_group(blkg);

	spin_unlock_irqrestore(&blkcg->lock, flags);

	/*
	 * This blkio_group is being unlinked because the associated cgroup
	 * is going away.  Let all the IO controlling policies know about
	 * this event.
	 *
	 * Currently this is a static call to one IO controlling policy.
	 * Once we have more policies in place, we will need some dynamic
	 * registration of callback functions.
	 */
	spin_lock(&blkio_list_lock);
	list_for_each_entry(blkiop, &blkio_list, list)
		blkiop->ops.blkio_unlink_group_fn(key, blkg);
	spin_unlock(&blkio_list_lock);
	goto remove_entry;

done:
	list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
		blkio_policy_delete_node(pn);
		kfree(pn);
	}
	free_css_id(&blkio_subsys, &blkcg->css);
	rcu_read_unlock();
	if (blkcg != &blkio_root_cgroup)
		kfree(blkcg);
}

static struct cgroup_subsys_state *
blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg, *parent_blkcg;

	if (!cgroup->parent) {
		blkcg = &blkio_root_cgroup;
		goto done;
	}

	/* Currently we do not support a hierarchy deeper than two levels (0,1) */
	parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
	if (css_depth(&parent_blkcg->css) > 0)
		return ERR_PTR(-EINVAL);

	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
	if (!blkcg)
		return ERR_PTR(-ENOMEM);

	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
done:
	spin_lock_init(&blkcg->lock);
	INIT_HLIST_HEAD(&blkcg->blkg_list);

	INIT_LIST_HEAD(&blkcg->policy_list);
	return &blkcg->css;
}

/*
 * We cannot support shared io contexts, as we have no means of supporting
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup_subsys *subsys,
				struct cgroup *cgroup, struct task_struct *tsk,
				bool threadgroup)
{
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	task_lock(tsk);
	ioc = tsk->io_context;
	if (ioc && atomic_read(&ioc->nr_tasks) > 1)
		ret = -EINVAL;
	task_unlock(tsk);

	return ret;
}

static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
				struct cgroup *prev, struct task_struct *tsk,
				bool threadgroup)
{
	struct io_context *ioc;

	task_lock(tsk);
	ioc = tsk->io_context;
	if (ioc)
		ioc->cgroup_changed = 1;
	task_unlock(tsk);
}

void blkio_policy_register(struct blkio_policy_type *blkiop)
{
	spin_lock(&blkio_list_lock);
	list_add_tail(&blkiop->list, &blkio_list);
	spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_register);

void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
	spin_lock(&blkio_list_lock);
	list_del_init(&blkiop->list);
	spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);

static int __init init_cgroup_blkio(void)
{
	return cgroup_load_subsys(&blkio_subsys);
}

static void __exit exit_cgroup_blkio(void)
{
	cgroup_unload_subsys(&blkio_subsys);
}

module_init(init_cgroup_blkio);
module_exit(exit_cgroup_blkio);
MODULE_LICENSE("GPL");