blk-cgroup.c revision 70087dc38cc77ca8f46059564c00338777734762
1/*
2 * Common Block IO controller cgroup interface
3 *
4 * Based on ideas and code from CFQ, CFS and BFQ:
5 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
6 *
7 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
8 *		      Paolo Valente <paolo.valente@unimore.it>
9 *
10 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
11 * 	              Nauman Rafique <nauman@google.com>
12 */
13#include <linux/ioprio.h>
14#include <linux/seq_file.h>
15#include <linux/kdev_t.h>
16#include <linux/module.h>
17#include <linux/err.h>
18#include <linux/blkdev.h>
19#include <linux/slab.h>
20#include "blk-cgroup.h"
21#include <linux/genhd.h>
22
23#define MAX_KEY_LEN 100
24
25static DEFINE_SPINLOCK(blkio_list_lock);
26static LIST_HEAD(blkio_list);
27
28struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
29EXPORT_SYMBOL_GPL(blkio_root_cgroup);
30
31static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
32						  struct cgroup *);
33static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
34			      struct task_struct *, bool);
35static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
36			   struct cgroup *, struct task_struct *, bool);
37static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
38static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
39
40/* for encoding cft->private value on file */
41#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
42/* Which policy owns the file: proportional or throttle */
43#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
44#define BLKIOFILE_ATTR(val)		((val) & 0xffff)
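/*
 * Example (illustrative): BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
 * BLKIO_THROTL_read_bps_device) packs the policy id into bits 16-31 and the
 * per-file attribute into bits 0-15, so BLKIOFILE_POLICY() and
 * BLKIOFILE_ATTR() recover the two halves from cft->private.
 */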
45
46struct cgroup_subsys blkio_subsys = {
47	.name = "blkio",
48	.create = blkiocg_create,
49	.can_attach = blkiocg_can_attach,
50	.attach = blkiocg_attach,
51	.destroy = blkiocg_destroy,
52	.populate = blkiocg_populate,
53#ifdef CONFIG_BLK_CGROUP
54	/* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
55	.subsys_id = blkio_subsys_id,
56#endif
57	.use_id = 1,
58	.module = THIS_MODULE,
59};
60EXPORT_SYMBOL_GPL(blkio_subsys);
61
62static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
63					    struct blkio_policy_node *pn)
64{
65	list_add(&pn->node, &blkcg->policy_list);
66}
67
68static inline bool cftype_blkg_same_policy(struct cftype *cft,
69			struct blkio_group *blkg)
70{
71	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
72
73	if (blkg->plid == plid)
74		return 1;
75
76	return 0;
77}
78
79/* Determines if policy node matches cgroup file being accessed */
80static inline bool pn_matches_cftype(struct cftype *cft,
81			struct blkio_policy_node *pn)
82{
83	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
84	int fileid = BLKIOFILE_ATTR(cft->private);
85
86	return (plid == pn->plid && fileid == pn->fileid);
87}
88
89/* Must be called with blkcg->lock held */
90static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
91{
92	list_del(&pn->node);
93}
94
95/* Must be called with blkcg->lock held */
96static struct blkio_policy_node *
97blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev,
98		enum blkio_policy_id plid, int fileid)
99{
100	struct blkio_policy_node *pn;
101
102	list_for_each_entry(pn, &blkcg->policy_list, node) {
103		if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid)
104			return pn;
105	}
106
107	return NULL;
108}
109
110struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
111{
112	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
113			    struct blkio_cgroup, css);
114}
115EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
116
117struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
118{
119	return container_of(task_subsys_state(tsk, blkio_subsys_id),
120			    struct blkio_cgroup, css);
121}
122EXPORT_SYMBOL_GPL(task_blkio_cgroup);
123
124static inline void
125blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
126{
127	struct blkio_policy_type *blkiop;
128
129	list_for_each_entry(blkiop, &blkio_list, list) {
130		/* If this policy does not own the blkg, do not send updates */
131		if (blkiop->plid != blkg->plid)
132			continue;
133		if (blkiop->ops.blkio_update_group_weight_fn)
134			blkiop->ops.blkio_update_group_weight_fn(blkg->key,
135							blkg, weight);
136	}
137}
138
139static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
140				int fileid)
141{
142	struct blkio_policy_type *blkiop;
143
144	list_for_each_entry(blkiop, &blkio_list, list) {
145
146		/* If this policy does not own the blkg, do not send updates */
147		if (blkiop->plid != blkg->plid)
148			continue;
149
150		if (fileid == BLKIO_THROTL_read_bps_device
151		    && blkiop->ops.blkio_update_group_read_bps_fn)
152			blkiop->ops.blkio_update_group_read_bps_fn(blkg->key,
153								blkg, bps);
154
155		if (fileid == BLKIO_THROTL_write_bps_device
156		    && blkiop->ops.blkio_update_group_write_bps_fn)
157			blkiop->ops.blkio_update_group_write_bps_fn(blkg->key,
158								blkg, bps);
159	}
160}
161
162static inline void blkio_update_group_iops(struct blkio_group *blkg,
163			unsigned int iops, int fileid)
164{
165	struct blkio_policy_type *blkiop;
166
167	list_for_each_entry(blkiop, &blkio_list, list) {
168
169		/* If this policy does not own the blkg, do not send updates */
170		if (blkiop->plid != blkg->plid)
171			continue;
172
173		if (fileid == BLKIO_THROTL_read_iops_device
174		    && blkiop->ops.blkio_update_group_read_iops_fn)
175			blkiop->ops.blkio_update_group_read_iops_fn(blkg->key,
176								blkg, iops);
177
178		if (fileid == BLKIO_THROTL_write_iops_device
179		    && blkiop->ops.blkio_update_group_write_iops_fn)
180			blkiop->ops.blkio_update_group_write_iops_fn(blkg->key,
181								blkg, iops);
182	}
183}
184
185/*
186 * Add to the appropriate stat variable depending on the request type.
187 * This should be called with the blkg->stats_lock held.
188 */
189static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
190				bool sync)
191{
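	/*
	 * Each request is counted twice: once by direction (READ/WRITE) and
	 * once by synchronicity (SYNC/ASYNC), e.g. a sync write bumps both
	 * BLKIO_STAT_WRITE and BLKIO_STAT_SYNC.
	 */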
192	if (direction)
193		stat[BLKIO_STAT_WRITE] += add;
194	else
195		stat[BLKIO_STAT_READ] += add;
196	if (sync)
197		stat[BLKIO_STAT_SYNC] += add;
198	else
199		stat[BLKIO_STAT_ASYNC] += add;
200}
201
202/*
203 * Decrements the appropriate stat variable depending on the request type.
204 * Panics (BUG) if the value is already zero.
205 * This should be called with the blkg->stats_lock held.
206 */
207static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
208{
209	if (direction) {
210		BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
211		stat[BLKIO_STAT_WRITE]--;
212	} else {
213		BUG_ON(stat[BLKIO_STAT_READ] == 0);
214		stat[BLKIO_STAT_READ]--;
215	}
216	if (sync) {
217		BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
218		stat[BLKIO_STAT_SYNC]--;
219	} else {
220		BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
221		stat[BLKIO_STAT_ASYNC]--;
222	}
223}
224
225#ifdef CONFIG_DEBUG_BLK_CGROUP
226/* This should be called with the blkg->stats_lock held. */
227static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
228						struct blkio_group *curr_blkg)
229{
230	if (blkio_blkg_waiting(&blkg->stats))
231		return;
232	if (blkg == curr_blkg)
233		return;
234	blkg->stats.start_group_wait_time = sched_clock();
235	blkio_mark_blkg_waiting(&blkg->stats);
236}
237
238/* This should be called with the blkg->stats_lock held. */
239static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
240{
241	unsigned long long now;
242
243	if (!blkio_blkg_waiting(stats))
244		return;
245
246	now = sched_clock();
247	if (time_after64(now, stats->start_group_wait_time))
248		stats->group_wait_time += now - stats->start_group_wait_time;
249	blkio_clear_blkg_waiting(stats);
250}
251
252/* This should be called with the blkg->stats_lock held. */
253static void blkio_end_empty_time(struct blkio_group_stats *stats)
254{
255	unsigned long long now;
256
257	if (!blkio_blkg_empty(stats))
258		return;
259
260	now = sched_clock();
261	if (time_after64(now, stats->start_empty_time))
262		stats->empty_time += now - stats->start_empty_time;
263	blkio_clear_blkg_empty(stats);
264}
265
266void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
267{
268	unsigned long flags;
269
270	spin_lock_irqsave(&blkg->stats_lock, flags);
271	BUG_ON(blkio_blkg_idling(&blkg->stats));
272	blkg->stats.start_idle_time = sched_clock();
273	blkio_mark_blkg_idling(&blkg->stats);
274	spin_unlock_irqrestore(&blkg->stats_lock, flags);
275}
276EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
277
278void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
279{
280	unsigned long flags;
281	unsigned long long now;
282	struct blkio_group_stats *stats;
283
284	spin_lock_irqsave(&blkg->stats_lock, flags);
285	stats = &blkg->stats;
286	if (blkio_blkg_idling(stats)) {
287		now = sched_clock();
288		if (time_after64(now, stats->start_idle_time))
289			stats->idle_time += now - stats->start_idle_time;
290		blkio_clear_blkg_idling(stats);
291	}
292	spin_unlock_irqrestore(&blkg->stats_lock, flags);
293}
294EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
295
296void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
297{
298	unsigned long flags;
299	struct blkio_group_stats *stats;
300
301	spin_lock_irqsave(&blkg->stats_lock, flags);
302	stats = &blkg->stats;
303	stats->avg_queue_size_sum +=
304			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
305			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
306	stats->avg_queue_size_samples++;
307	blkio_update_group_wait_time(stats);
308	spin_unlock_irqrestore(&blkg->stats_lock, flags);
309}
310EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
311
312void blkiocg_set_start_empty_time(struct blkio_group *blkg)
313{
314	unsigned long flags;
315	struct blkio_group_stats *stats;
316
317	spin_lock_irqsave(&blkg->stats_lock, flags);
318	stats = &blkg->stats;
319
320	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
321			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
322		spin_unlock_irqrestore(&blkg->stats_lock, flags);
323		return;
324	}
325
326	/*
327	 * The group is already marked empty. This can happen if a cfqq got a
328	 * new request in the parent group and moved to this group while being
329	 * added to the service tree. Just ignore the event and move on.
330	 */
331	if (blkio_blkg_empty(stats)) {
332		spin_unlock_irqrestore(&blkg->stats_lock, flags);
333		return;
334	}
335
336	stats->start_empty_time = sched_clock();
337	blkio_mark_blkg_empty(stats);
338	spin_unlock_irqrestore(&blkg->stats_lock, flags);
339}
340EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
341
342void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
343			unsigned long dequeue)
344{
345	blkg->stats.dequeue += dequeue;
346}
347EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
348#else
349static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
350					struct blkio_group *curr_blkg) {}
351static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
352#endif
353
354void blkiocg_update_io_add_stats(struct blkio_group *blkg,
355			struct blkio_group *curr_blkg, bool direction,
356			bool sync)
357{
358	unsigned long flags;
359
360	spin_lock_irqsave(&blkg->stats_lock, flags);
361	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
362			sync);
363	blkio_end_empty_time(&blkg->stats);
364	blkio_set_start_group_wait_time(blkg, curr_blkg);
365	spin_unlock_irqrestore(&blkg->stats_lock, flags);
366}
367EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
368
369void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
370						bool direction, bool sync)
371{
372	unsigned long flags;
373
374	spin_lock_irqsave(&blkg->stats_lock, flags);
375	blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
376					direction, sync);
377	spin_unlock_irqrestore(&blkg->stats_lock, flags);
378}
379EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
380
381void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
382				unsigned long unaccounted_time)
383{
384	unsigned long flags;
385
386	spin_lock_irqsave(&blkg->stats_lock, flags);
387	blkg->stats.time += time;
388	blkg->stats.unaccounted_time += unaccounted_time;
389	spin_unlock_irqrestore(&blkg->stats_lock, flags);
390}
391EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
392
393void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
394				uint64_t bytes, bool direction, bool sync)
395{
396	struct blkio_group_stats *stats;
397	unsigned long flags;
398
399	spin_lock_irqsave(&blkg->stats_lock, flags);
400	stats = &blkg->stats;
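	/* bytes >> 9 converts the byte count to 512-byte sectors */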
401	stats->sectors += bytes >> 9;
402	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
403			sync);
404	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
405			direction, sync);
406	spin_unlock_irqrestore(&blkg->stats_lock, flags);
407}
408EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
409
410void blkiocg_update_completion_stats(struct blkio_group *blkg,
411	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
412{
413	struct blkio_group_stats *stats;
414	unsigned long flags;
415	unsigned long long now = sched_clock();
416
417	spin_lock_irqsave(&blkg->stats_lock, flags);
418	stats = &blkg->stats;
419	if (time_after64(now, io_start_time))
420		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
421				now - io_start_time, direction, sync);
422	if (time_after64(io_start_time, start_time))
423		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
424				io_start_time - start_time, direction, sync);
425	spin_unlock_irqrestore(&blkg->stats_lock, flags);
426}
427EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
428
429void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
430					bool sync)
431{
432	unsigned long flags;
433
434	spin_lock_irqsave(&blkg->stats_lock, flags);
435	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
436			sync);
437	spin_unlock_irqrestore(&blkg->stats_lock, flags);
438}
439EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
440
441void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
442		struct blkio_group *blkg, void *key, dev_t dev,
443		enum blkio_policy_id plid)
444{
445	unsigned long flags;
446
447	spin_lock_irqsave(&blkcg->lock, flags);
448	spin_lock_init(&blkg->stats_lock);
449	rcu_assign_pointer(blkg->key, key);
450	blkg->blkcg_id = css_id(&blkcg->css);
451	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
452	blkg->plid = plid;
453	spin_unlock_irqrestore(&blkcg->lock, flags);
454	/* Need to take css reference? */
455	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
456	blkg->dev = dev;
457}
458EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
459
460static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
461{
462	hlist_del_init_rcu(&blkg->blkcg_node);
463	blkg->blkcg_id = 0;
464}
465
466/*
467 * Returns 0 if the blkio_group was still on the cgroup list. Otherwise returns
468 * 1, indicating that the blkio_group was unhashed by the time we got to it.
469 */
470int blkiocg_del_blkio_group(struct blkio_group *blkg)
471{
472	struct blkio_cgroup *blkcg;
473	unsigned long flags;
474	struct cgroup_subsys_state *css;
475	int ret = 1;
476
477	rcu_read_lock();
478	css = css_lookup(&blkio_subsys, blkg->blkcg_id);
479	if (css) {
480		blkcg = container_of(css, struct blkio_cgroup, css);
481		spin_lock_irqsave(&blkcg->lock, flags);
482		if (!hlist_unhashed(&blkg->blkcg_node)) {
483			__blkiocg_del_blkio_group(blkg);
484			ret = 0;
485		}
486		spin_unlock_irqrestore(&blkcg->lock, flags);
487	}
488
489	rcu_read_unlock();
490	return ret;
491}
492EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
493
494/* called under rcu_read_lock(). */
495struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
496{
497	struct blkio_group *blkg;
498	struct hlist_node *n;
499	void *__key;
500
501	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
502		__key = blkg->key;
503		if (__key == key)
504			return blkg;
505	}
506
507	return NULL;
508}
509EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
510
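/*
 * Handler for blkio.reset_stats: writing any value clears the per-group
 * statistics. The currently queued request counts (and, under
 * CONFIG_DEBUG_BLK_CGROUP, the idling/waiting/empty state with a fresh
 * timestamp) are preserved across the reset, since they describe in-flight
 * state rather than history.
 */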
511static int
512blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
513{
514	struct blkio_cgroup *blkcg;
515	struct blkio_group *blkg;
516	struct blkio_group_stats *stats;
517	struct hlist_node *n;
518	uint64_t queued[BLKIO_STAT_TOTAL];
519	int i;
520#ifdef CONFIG_DEBUG_BLK_CGROUP
521	bool idling, waiting, empty;
522	unsigned long long now = sched_clock();
523#endif
524
525	blkcg = cgroup_to_blkio_cgroup(cgroup);
526	spin_lock_irq(&blkcg->lock);
527	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
528		spin_lock(&blkg->stats_lock);
529		stats = &blkg->stats;
530#ifdef CONFIG_DEBUG_BLK_CGROUP
531		idling = blkio_blkg_idling(stats);
532		waiting = blkio_blkg_waiting(stats);
533		empty = blkio_blkg_empty(stats);
534#endif
535		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
536			queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
537		memset(stats, 0, sizeof(struct blkio_group_stats));
538		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
539			stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
540#ifdef CONFIG_DEBUG_BLK_CGROUP
541		if (idling) {
542			blkio_mark_blkg_idling(stats);
543			stats->start_idle_time = now;
544		}
545		if (waiting) {
546			blkio_mark_blkg_waiting(stats);
547			stats->start_group_wait_time = now;
548		}
549		if (empty) {
550			blkio_mark_blkg_empty(stats);
551			stats->start_empty_time = now;
552		}
553#endif
554		spin_unlock(&blkg->stats_lock);
555	}
556	spin_unlock_irq(&blkcg->lock);
557	return 0;
558}
559
560static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
561				int chars_left, bool diskname_only)
562{
563	snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
564	chars_left -= strlen(str);
565	if (chars_left <= 0) {
566		printk(KERN_WARNING
567			"Possibly incorrect cgroup stat display format\n");
568		return;
569	}
570	if (diskname_only)
571		return;
572	switch (type) {
573	case BLKIO_STAT_READ:
574		strlcat(str, " Read", chars_left);
575		break;
576	case BLKIO_STAT_WRITE:
577		strlcat(str, " Write", chars_left);
578		break;
579	case BLKIO_STAT_SYNC:
580		strlcat(str, " Sync", chars_left);
581		break;
582	case BLKIO_STAT_ASYNC:
583		strlcat(str, " Async", chars_left);
584		break;
585	case BLKIO_STAT_TOTAL:
586		strlcat(str, " Total", chars_left);
587		break;
588	default:
589		strlcat(str, " Invalid", chars_left);
590	}
591}
592
593static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
594				struct cgroup_map_cb *cb, dev_t dev)
595{
596	blkio_get_key_name(0, dev, str, chars_left, true);
597	cb->fill(cb, str, val);
598	return val;
599}
600
601/* This should be called with blkg->stats_lock held */
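/*
 * Illustrative output for a SERVICED-type stat on device 8:16 (numbers are
 * made up): "8:16 Read 120", "8:16 Write 30", "8:16 Sync 100",
 * "8:16 Async 50", "8:16 Total 150", where Total = Read + Write.
 */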
602static uint64_t blkio_get_stat(struct blkio_group *blkg,
603		struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
604{
605	uint64_t disk_total;
606	char key_str[MAX_KEY_LEN];
607	enum stat_sub_type sub_type;
608
609	if (type == BLKIO_STAT_TIME)
610		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
611					blkg->stats.time, cb, dev);
612	if (type == BLKIO_STAT_SECTORS)
613		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
614					blkg->stats.sectors, cb, dev);
615#ifdef CONFIG_DEBUG_BLK_CGROUP
616	if (type == BLKIO_STAT_UNACCOUNTED_TIME)
617		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
618					blkg->stats.unaccounted_time, cb, dev);
619	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
620		uint64_t sum = blkg->stats.avg_queue_size_sum;
621		uint64_t samples = blkg->stats.avg_queue_size_samples;
622		if (samples)
623			do_div(sum, samples);
624		else
625			sum = 0;
626		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
627	}
628	if (type == BLKIO_STAT_GROUP_WAIT_TIME)
629		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
630					blkg->stats.group_wait_time, cb, dev);
631	if (type == BLKIO_STAT_IDLE_TIME)
632		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
633					blkg->stats.idle_time, cb, dev);
634	if (type == BLKIO_STAT_EMPTY_TIME)
635		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
636					blkg->stats.empty_time, cb, dev);
637	if (type == BLKIO_STAT_DEQUEUE)
638		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
639					blkg->stats.dequeue, cb, dev);
640#endif
641
642	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
643			sub_type++) {
644		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
645		cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
646	}
647	disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
648			blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
649	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
650	cb->fill(cb, key_str, disk_total);
651	return disk_total;
652}
653
654static int blkio_check_dev_num(dev_t dev)
655{
656	int part = 0;
657	struct gendisk *disk;
658
659	disk = get_gendisk(dev, &part);
660	if (!disk || part)
661		return -ENODEV;
662
663	return 0;
664}
665
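/*
 * The accepted input format is "<major>:<minor> <value>", e.g. (illustrative)
 *
 *	echo "8:16 1048576" > blkio.throttle.read_bps_device
 *
 * would limit reads on device 8:16 to 1MB/s, while a value of 0 removes an
 * existing rule (see blkio_delete_rule_command()).
 */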
666static int blkio_policy_parse_and_set(char *buf,
667	struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
668{
669	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
670	int ret;
671	unsigned long major, minor, temp;
672	int i = 0;
673	dev_t dev;
674	u64 bps, iops;
675
676	memset(s, 0, sizeof(s));
677
678	while ((p = strsep(&buf, " ")) != NULL) {
679		if (!*p)
680			continue;
681
682		s[i++] = p;
683
684		/* Prevent inputting too many things */
685		if (i == 3)
686			break;
687	}
688
689	if (i != 2)
690		return -EINVAL;
691
692	p = strsep(&s[0], ":");
693	if (p != NULL)
694		major_s = p;
695	else
696		return -EINVAL;
697
698	minor_s = s[0];
699	if (!minor_s)
700		return -EINVAL;
701
702	ret = strict_strtoul(major_s, 10, &major);
703	if (ret)
704		return -EINVAL;
705
706	ret = strict_strtoul(minor_s, 10, &minor);
707	if (ret)
708		return -EINVAL;
709
710	dev = MKDEV(major, minor);
711
712	ret = blkio_check_dev_num(dev);
713	if (ret)
714		return ret;
715
716	newpn->dev = dev;
717
718	if (s[1] == NULL)
719		return -EINVAL;
720
721	switch (plid) {
722	case BLKIO_POLICY_PROP:
723		ret = strict_strtoul(s[1], 10, &temp);
724		if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
725			temp > BLKIO_WEIGHT_MAX)
726			return -EINVAL;
727
728		newpn->plid = plid;
729		newpn->fileid = fileid;
730		newpn->val.weight = temp;
731		break;
732	case BLKIO_POLICY_THROTL:
733		switch (fileid) {
734		case BLKIO_THROTL_read_bps_device:
735		case BLKIO_THROTL_write_bps_device:
736			ret = strict_strtoull(s[1], 10, &bps);
737			if (ret)
738				return -EINVAL;
739
740			newpn->plid = plid;
741			newpn->fileid = fileid;
742			newpn->val.bps = bps;
743			break;
744		case BLKIO_THROTL_read_iops_device:
745		case BLKIO_THROTL_write_iops_device:
746			ret = strict_strtoull(s[1], 10, &iops);
747			if (ret)
748				return -EINVAL;
749
750			if (iops > THROTL_IOPS_MAX)
751				return -EINVAL;
752
753			newpn->plid = plid;
754			newpn->fileid = fileid;
755			newpn->val.iops = (unsigned int)iops;
756			break;
757		}
758		break;
759	default:
760		BUG();
761	}
762
763	return 0;
764}
765
766unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
767			      dev_t dev)
768{
769	struct blkio_policy_node *pn;
770
771	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
772				BLKIO_PROP_weight_device);
773	if (pn)
774		return pn->val.weight;
775	else
776		return blkcg->weight;
777}
778EXPORT_SYMBOL_GPL(blkcg_get_weight);
779
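/*
 * The throttle lookup helpers below return -1 (all ones in an unsigned type)
 * when no rule exists for the device; the owning policy presumably treats
 * this as "no limit", matching blkio_update_blkg_policy(), which also pushes
 * -1 when a rule's value is cleared.
 */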
780uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
781{
782	struct blkio_policy_node *pn;
783
784	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
785				BLKIO_THROTL_read_bps_device);
786	if (pn)
787		return pn->val.bps;
788	else
789		return -1;
790}
791
792uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
793{
794	struct blkio_policy_node *pn;
795	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
796				BLKIO_THROTL_write_bps_device);
797	if (pn)
798		return pn->val.bps;
799	else
800		return -1;
801}
802
803unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
804{
805	struct blkio_policy_node *pn;
806
807	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
808				BLKIO_THROTL_read_iops_device);
809	if (pn)
810		return pn->val.iops;
811	else
812		return -1;
813}
814
815unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
816{
817	struct blkio_policy_node *pn;
818	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
819				BLKIO_THROTL_write_iops_device);
820	if (pn)
821		return pn->val.iops;
822	else
823		return -1;
824}
825
826/* Checks whether the user asked to delete a policy rule */
827static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
828{
829	switch (pn->plid) {
830	case BLKIO_POLICY_PROP:
831		if (pn->val.weight == 0)
832			return 1;
833		break;
834	case BLKIO_POLICY_THROTL:
835		switch (pn->fileid) {
836		case BLKIO_THROTL_read_bps_device:
837		case BLKIO_THROTL_write_bps_device:
838			if (pn->val.bps == 0)
839				return 1;
840			break;
841		case BLKIO_THROTL_read_iops_device:
842		case BLKIO_THROTL_write_iops_device:
843			if (pn->val.iops == 0)
844				return 1;
845		}
846		break;
847	default:
848		BUG();
849	}
850
851	return 0;
852}
853
854static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
855					struct blkio_policy_node *newpn)
856{
857	switch (oldpn->plid) {
858	case BLKIO_POLICY_PROP:
859		oldpn->val.weight = newpn->val.weight;
860		break;
861	case BLKIO_POLICY_THROTL:
862		switch (newpn->fileid) {
863		case BLKIO_THROTL_read_bps_device:
864		case BLKIO_THROTL_write_bps_device:
865			oldpn->val.bps = newpn->val.bps;
866			break;
867		case BLKIO_THROTL_read_iops_device:
868		case BLKIO_THROTL_write_iops_device:
869			oldpn->val.iops = newpn->val.iops;
870		}
871		break;
872	default:
873		BUG();
874	}
875}
876
877/*
878 * Some rules/values in blkg have changed. Propagate those to respective
879 * policies.
880 */
881static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
882		struct blkio_group *blkg, struct blkio_policy_node *pn)
883{
884	unsigned int weight, iops;
885	u64 bps;
886
887	switch (pn->plid) {
888	case BLKIO_POLICY_PROP:
889		weight = pn->val.weight ? pn->val.weight :
890				blkcg->weight;
891		blkio_update_group_weight(blkg, weight);
892		break;
893	case BLKIO_POLICY_THROTL:
894		switch (pn->fileid) {
895		case BLKIO_THROTL_read_bps_device:
896		case BLKIO_THROTL_write_bps_device:
897			bps = pn->val.bps ? pn->val.bps : (-1);
898			blkio_update_group_bps(blkg, bps, pn->fileid);
899			break;
900		case BLKIO_THROTL_read_iops_device:
901		case BLKIO_THROTL_write_iops_device:
902			iops = pn->val.iops ? pn->val.iops : (-1);
903			blkio_update_group_iops(blkg, iops, pn->fileid);
904			break;
905		}
906		break;
907	default:
908		BUG();
909	}
910}
911
912/*
913 * A policy node rule has been updated. Propagate this update to all the
914 * block groups which might be affected by this update.
915 */
916static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg,
917				struct blkio_policy_node *pn)
918{
919	struct blkio_group *blkg;
920	struct hlist_node *n;
921
922	spin_lock(&blkio_list_lock);
923	spin_lock_irq(&blkcg->lock);
924
925	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
926		if (pn->dev != blkg->dev || pn->plid != blkg->plid)
927			continue;
928		blkio_update_blkg_policy(blkcg, blkg, pn);
929	}
930
931	spin_unlock_irq(&blkcg->lock);
932	spin_unlock(&blkio_list_lock);
933}
934
935static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
936 				       const char *buffer)
937{
938	int ret = 0;
939	char *buf;
940	struct blkio_policy_node *newpn, *pn;
941	struct blkio_cgroup *blkcg;
942	int keep_newpn = 0;
943	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
944	int fileid = BLKIOFILE_ATTR(cft->private);
945
946	buf = kstrdup(buffer, GFP_KERNEL);
947	if (!buf)
948		return -ENOMEM;
949
950	newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
951	if (!newpn) {
952		ret = -ENOMEM;
953		goto free_buf;
954	}
955
956	ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid);
957	if (ret)
958		goto free_newpn;
959
960	blkcg = cgroup_to_blkio_cgroup(cgrp);
961
962	spin_lock_irq(&blkcg->lock);
963
964	pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid);
965	if (!pn) {
966		if (!blkio_delete_rule_command(newpn)) {
967			blkio_policy_insert_node(blkcg, newpn);
968			keep_newpn = 1;
969		}
970		spin_unlock_irq(&blkcg->lock);
971		goto update_io_group;
972	}
973
974	if (blkio_delete_rule_command(newpn)) {
975		blkio_policy_delete_node(pn);
976		spin_unlock_irq(&blkcg->lock);
977		goto update_io_group;
978	}
979	spin_unlock_irq(&blkcg->lock);
980
981	blkio_update_policy_rule(pn, newpn);
982
983update_io_group:
984	blkio_update_policy_node_blkg(blkcg, newpn);
985
986free_newpn:
987	if (!keep_newpn)
988		kfree(newpn);
989free_buf:
990	kfree(buf);
991	return ret;
992}
993
994static void
995blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
996{
997	switch (pn->plid) {
998		case BLKIO_POLICY_PROP:
999			if (pn->fileid == BLKIO_PROP_weight_device)
1000				seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
1001					MINOR(pn->dev), pn->val.weight);
1002			break;
1003		case BLKIO_POLICY_THROTL:
1004			switch (pn->fileid) {
1005			case BLKIO_THROTL_read_bps_device:
1006			case BLKIO_THROTL_write_bps_device:
1007				seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
1008					MINOR(pn->dev), pn->val.bps);
1009				break;
1010			case BLKIO_THROTL_read_iops_device:
1011			case BLKIO_THROTL_write_iops_device:
1012				seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
1013					MINOR(pn->dev), pn->val.iops);
1014				break;
1015			}
1016			break;
1017		default:
1018			BUG();
1019	}
1020}
1021
1022/* cgroup files which read their data from policy nodes end up here */
1023static void blkio_read_policy_node_files(struct cftype *cft,
1024			struct blkio_cgroup *blkcg, struct seq_file *m)
1025{
1026	struct blkio_policy_node *pn;
1027
1028	if (!list_empty(&blkcg->policy_list)) {
1029		spin_lock_irq(&blkcg->lock);
1030		list_for_each_entry(pn, &blkcg->policy_list, node) {
1031			if (!pn_matches_cftype(cft, pn))
1032				continue;
1033			blkio_print_policy_node(m, pn);
1034		}
1035		spin_unlock_irq(&blkcg->lock);
1036	}
1037}
1038
1039static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
1040				struct seq_file *m)
1041{
1042	struct blkio_cgroup *blkcg;
1043	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1044	int name = BLKIOFILE_ATTR(cft->private);
1045
1046	blkcg = cgroup_to_blkio_cgroup(cgrp);
1047
1048	switch (plid) {
1049	case BLKIO_POLICY_PROP:
1050		switch (name) {
1051		case BLKIO_PROP_weight_device:
1052			blkio_read_policy_node_files(cft, blkcg, m);
1053			return 0;
1054		default:
1055			BUG();
1056		}
1057		break;
1058	case BLKIO_POLICY_THROTL:
1059		switch (name) {
1060		case BLKIO_THROTL_read_bps_device:
1061		case BLKIO_THROTL_write_bps_device:
1062		case BLKIO_THROTL_read_iops_device:
1063		case BLKIO_THROTL_write_iops_device:
1064			blkio_read_policy_node_files(cft, blkcg, m);
1065			return 0;
1066		default:
1067			BUG();
1068		}
1069		break;
1070	default:
1071		BUG();
1072	}
1073
1074	return 0;
1075}
1076
1077static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
1078		struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type,
1079		bool show_total)
1080{
1081	struct blkio_group *blkg;
1082	struct hlist_node *n;
1083	uint64_t cgroup_total = 0;
1084
1085	rcu_read_lock();
1086	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
1087		if (blkg->dev) {
1088			if (!cftype_blkg_same_policy(cft, blkg))
1089				continue;
1090			spin_lock_irq(&blkg->stats_lock);
1091			cgroup_total += blkio_get_stat(blkg, cb, blkg->dev,
1092						type);
1093			spin_unlock_irq(&blkg->stats_lock);
1094		}
1095	}
1096	if (show_total)
1097		cb->fill(cb, "Total", cgroup_total);
1098	rcu_read_unlock();
1099	return 0;
1100}
1101
1102/* All map-type cgroup files are serviced by this function */
1103static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
1104				struct cgroup_map_cb *cb)
1105{
1106	struct blkio_cgroup *blkcg;
1107	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1108	int name = BLKIOFILE_ATTR(cft->private);
1109
1110	blkcg = cgroup_to_blkio_cgroup(cgrp);
1111
1112	switch (plid) {
1113	case BLKIO_POLICY_PROP:
1114		switch (name) {
1115		case BLKIO_PROP_time:
1116			return blkio_read_blkg_stats(blkcg, cft, cb,
1117						BLKIO_STAT_TIME, 0);
1118		case BLKIO_PROP_sectors:
1119			return blkio_read_blkg_stats(blkcg, cft, cb,
1120						BLKIO_STAT_SECTORS, 0);
1121		case BLKIO_PROP_io_service_bytes:
1122			return blkio_read_blkg_stats(blkcg, cft, cb,
1123						BLKIO_STAT_SERVICE_BYTES, 1);
1124		case BLKIO_PROP_io_serviced:
1125			return blkio_read_blkg_stats(blkcg, cft, cb,
1126						BLKIO_STAT_SERVICED, 1);
1127		case BLKIO_PROP_io_service_time:
1128			return blkio_read_blkg_stats(blkcg, cft, cb,
1129						BLKIO_STAT_SERVICE_TIME, 1);
1130		case BLKIO_PROP_io_wait_time:
1131			return blkio_read_blkg_stats(blkcg, cft, cb,
1132						BLKIO_STAT_WAIT_TIME, 1);
1133		case BLKIO_PROP_io_merged:
1134			return blkio_read_blkg_stats(blkcg, cft, cb,
1135						BLKIO_STAT_MERGED, 1);
1136		case BLKIO_PROP_io_queued:
1137			return blkio_read_blkg_stats(blkcg, cft, cb,
1138						BLKIO_STAT_QUEUED, 1);
1139#ifdef CONFIG_DEBUG_BLK_CGROUP
1140		case BLKIO_PROP_unaccounted_time:
1141			return blkio_read_blkg_stats(blkcg, cft, cb,
1142						BLKIO_STAT_UNACCOUNTED_TIME, 0);
1143		case BLKIO_PROP_dequeue:
1144			return blkio_read_blkg_stats(blkcg, cft, cb,
1145						BLKIO_STAT_DEQUEUE, 0);
1146		case BLKIO_PROP_avg_queue_size:
1147			return blkio_read_blkg_stats(blkcg, cft, cb,
1148						BLKIO_STAT_AVG_QUEUE_SIZE, 0);
1149		case BLKIO_PROP_group_wait_time:
1150			return blkio_read_blkg_stats(blkcg, cft, cb,
1151						BLKIO_STAT_GROUP_WAIT_TIME, 0);
1152		case BLKIO_PROP_idle_time:
1153			return blkio_read_blkg_stats(blkcg, cft, cb,
1154						BLKIO_STAT_IDLE_TIME, 0);
1155		case BLKIO_PROP_empty_time:
1156			return blkio_read_blkg_stats(blkcg, cft, cb,
1157						BLKIO_STAT_EMPTY_TIME, 0);
1158#endif
1159		default:
1160			BUG();
1161		}
1162		break;
1163	case BLKIO_POLICY_THROTL:
1164		switch (name) {
1165		case BLKIO_THROTL_io_service_bytes:
1166			return blkio_read_blkg_stats(blkcg, cft, cb,
1167						BLKIO_STAT_SERVICE_BYTES, 1);
1168		case BLKIO_THROTL_io_serviced:
1169			return blkio_read_blkg_stats(blkcg, cft, cb,
1170						BLKIO_STAT_SERVICED, 1);
1171		default:
1172			BUG();
1173		}
1174		break;
1175	default:
1176		BUG();
1177	}
1178
1179	return 0;
1180}
1181
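/*
 * A minimal usage sketch (illustrative): writing a single integer in
 * [BLKIO_WEIGHT_MIN, BLKIO_WEIGHT_MAX] to blkio.weight, e.g.
 * "echo 500 > blkio.weight", updates the cgroup's default weight; groups
 * covered by a per-device weight_device rule keep their override.
 */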
1182static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val)
1183{
1184	struct blkio_group *blkg;
1185	struct hlist_node *n;
1186	struct blkio_policy_node *pn;
1187
1188	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
1189		return -EINVAL;
1190
1191	spin_lock(&blkio_list_lock);
1192	spin_lock_irq(&blkcg->lock);
1193	blkcg->weight = (unsigned int)val;
1194
1195	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1196		pn = blkio_policy_search_node(blkcg, blkg->dev,
1197				BLKIO_POLICY_PROP, BLKIO_PROP_weight_device);
1198		if (pn)
1199			continue;
1200
1201		blkio_update_group_weight(blkg, blkcg->weight);
1202	}
1203	spin_unlock_irq(&blkcg->lock);
1204	spin_unlock(&blkio_list_lock);
1205	return 0;
1206}
1207
1208static u64 blkiocg_file_read_u64(struct cgroup *cgrp, struct cftype *cft) {
1209	struct blkio_cgroup *blkcg;
1210	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1211	int name = BLKIOFILE_ATTR(cft->private);
1212
1213	blkcg = cgroup_to_blkio_cgroup(cgrp);
1214
1215	switch (plid) {
1216	case BLKIO_POLICY_PROP:
1217		switch (name) {
1218		case BLKIO_PROP_weight:
1219			return (u64)blkcg->weight;
1220		}
1221		break;
1222	default:
1223		BUG();
1224	}
1225	return 0;
1226}
1227
1228static int
1229blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1230{
1231	struct blkio_cgroup *blkcg;
1232	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1233	int name = BLKIOFILE_ATTR(cft->private);
1234
1235	blkcg = cgroup_to_blkio_cgroup(cgrp);
1236
1237	switch (plid) {
1238	case BLKIO_POLICY_PROP:
1239		switch (name) {
1240		case BLKIO_PROP_weight:
1241			return blkio_weight_write(blkcg, val);
1242		}
1243		break;
1244	default:
1245		BUG();
1246	}
1247
1248	return 0;
1249}
1250
1251struct cftype blkio_files[] = {
1252	{
1253		.name = "weight_device",
1254		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1255				BLKIO_PROP_weight_device),
1256		.read_seq_string = blkiocg_file_read,
1257		.write_string = blkiocg_file_write,
1258		.max_write_len = 256,
1259	},
1260	{
1261		.name = "weight",
1262		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1263				BLKIO_PROP_weight),
1264		.read_u64 = blkiocg_file_read_u64,
1265		.write_u64 = blkiocg_file_write_u64,
1266	},
1267	{
1268		.name = "time",
1269		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1270				BLKIO_PROP_time),
1271		.read_map = blkiocg_file_read_map,
1272	},
1273	{
1274		.name = "sectors",
1275		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1276				BLKIO_PROP_sectors),
1277		.read_map = blkiocg_file_read_map,
1278	},
1279	{
1280		.name = "io_service_bytes",
1281		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1282				BLKIO_PROP_io_service_bytes),
1283		.read_map = blkiocg_file_read_map,
1284	},
1285	{
1286		.name = "io_serviced",
1287		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1288				BLKIO_PROP_io_serviced),
1289		.read_map = blkiocg_file_read_map,
1290	},
1291	{
1292		.name = "io_service_time",
1293		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1294				BLKIO_PROP_io_service_time),
1295		.read_map = blkiocg_file_read_map,
1296	},
1297	{
1298		.name = "io_wait_time",
1299		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1300				BLKIO_PROP_io_wait_time),
1301		.read_map = blkiocg_file_read_map,
1302	},
1303	{
1304		.name = "io_merged",
1305		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1306				BLKIO_PROP_io_merged),
1307		.read_map = blkiocg_file_read_map,
1308	},
1309	{
1310		.name = "io_queued",
1311		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1312				BLKIO_PROP_io_queued),
1313		.read_map = blkiocg_file_read_map,
1314	},
1315	{
1316		.name = "reset_stats",
1317		.write_u64 = blkiocg_reset_stats,
1318	},
1319#ifdef CONFIG_BLK_DEV_THROTTLING
1320	{
1321		.name = "throttle.read_bps_device",
1322		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1323				BLKIO_THROTL_read_bps_device),
1324		.read_seq_string = blkiocg_file_read,
1325		.write_string = blkiocg_file_write,
1326		.max_write_len = 256,
1327	},
1328
1329	{
1330		.name = "throttle.write_bps_device",
1331		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1332				BLKIO_THROTL_write_bps_device),
1333		.read_seq_string = blkiocg_file_read,
1334		.write_string = blkiocg_file_write,
1335		.max_write_len = 256,
1336	},
1337
1338	{
1339		.name = "throttle.read_iops_device",
1340		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1341				BLKIO_THROTL_read_iops_device),
1342		.read_seq_string = blkiocg_file_read,
1343		.write_string = blkiocg_file_write,
1344		.max_write_len = 256,
1345	},
1346
1347	{
1348		.name = "throttle.write_iops_device",
1349		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1350				BLKIO_THROTL_write_iops_device),
1351		.read_seq_string = blkiocg_file_read,
1352		.write_string = blkiocg_file_write,
1353		.max_write_len = 256,
1354	},
1355	{
1356		.name = "throttle.io_service_bytes",
1357		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1358				BLKIO_THROTL_io_service_bytes),
1359		.read_map = blkiocg_file_read_map,
1360	},
1361	{
1362		.name = "throttle.io_serviced",
1363		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1364				BLKIO_THROTL_io_serviced),
1365		.read_map = blkiocg_file_read_map,
1366	},
1367#endif /* CONFIG_BLK_DEV_THROTTLING */
1368
1369#ifdef CONFIG_DEBUG_BLK_CGROUP
1370	{
1371		.name = "avg_queue_size",
1372		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1373				BLKIO_PROP_avg_queue_size),
1374		.read_map = blkiocg_file_read_map,
1375	},
1376	{
1377		.name = "group_wait_time",
1378		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1379				BLKIO_PROP_group_wait_time),
1380		.read_map = blkiocg_file_read_map,
1381	},
1382	{
1383		.name = "idle_time",
1384		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1385				BLKIO_PROP_idle_time),
1386		.read_map = blkiocg_file_read_map,
1387	},
1388	{
1389		.name = "empty_time",
1390		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1391				BLKIO_PROP_empty_time),
1392		.read_map = blkiocg_file_read_map,
1393	},
1394	{
1395		.name = "dequeue",
1396		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1397				BLKIO_PROP_dequeue),
1398		.read_map = blkiocg_file_read_map,
1399	},
1400	{
1401		.name = "unaccounted_time",
1402		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1403				BLKIO_PROP_unaccounted_time),
1404		.read_map = blkiocg_file_read_map,
1405	},
1406#endif
1407};
1408
1409static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1410{
1411	return cgroup_add_files(cgroup, subsys, blkio_files,
1412				ARRAY_SIZE(blkio_files));
1413}
1414
1415static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1416{
1417	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
1418	unsigned long flags;
1419	struct blkio_group *blkg;
1420	void *key;
1421	struct blkio_policy_type *blkiop;
1422	struct blkio_policy_node *pn, *pntmp;
1423
1424	rcu_read_lock();
1425	do {
1426		spin_lock_irqsave(&blkcg->lock, flags);
1427
1428		if (hlist_empty(&blkcg->blkg_list)) {
1429			spin_unlock_irqrestore(&blkcg->lock, flags);
1430			break;
1431		}
1432
1433		blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
1434					blkcg_node);
1435		key = rcu_dereference(blkg->key);
1436		__blkiocg_del_blkio_group(blkg);
1437
1438		spin_unlock_irqrestore(&blkcg->lock, flags);
1439
1440		/*
1441		 * This blkio_group is being unlinked as the associated cgroup is
1442		 * going away. Let all the IO controlling policies know about
1443		 * this event.
1444		 */
1445		spin_lock(&blkio_list_lock);
1446		list_for_each_entry(blkiop, &blkio_list, list) {
1447			if (blkiop->plid != blkg->plid)
1448				continue;
1449			blkiop->ops.blkio_unlink_group_fn(key, blkg);
1450		}
1451		spin_unlock(&blkio_list_lock);
1452	} while (1);
1453
1454	list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
1455		blkio_policy_delete_node(pn);
1456		kfree(pn);
1457	}
1458
1459	free_css_id(&blkio_subsys, &blkcg->css);
1460	rcu_read_unlock();
1461	if (blkcg != &blkio_root_cgroup)
1462		kfree(blkcg);
1463}
1464
1465static struct cgroup_subsys_state *
1466blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1467{
1468	struct blkio_cgroup *blkcg;
1469	struct cgroup *parent = cgroup->parent;
1470
1471	if (!parent) {
1472		blkcg = &blkio_root_cgroup;
1473		goto done;
1474	}
1475
1476	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
1477	if (!blkcg)
1478		return ERR_PTR(-ENOMEM);
1479
1480	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
1481done:
1482	spin_lock_init(&blkcg->lock);
1483	INIT_HLIST_HEAD(&blkcg->blkg_list);
1484
1485	INIT_LIST_HEAD(&blkcg->policy_list);
1486	return &blkcg->css;
1487}
1488
1489/*
1490 * We cannot support shared io contexts, as we have no means to support
1491 * two tasks with the same ioc in two different groups without major rework
1492 * of the main cic data structures.  For now we allow a task to change
1493 * its cgroup only if it's the only owner of its ioc.
1494 */
1495static int blkiocg_can_attach(struct cgroup_subsys *subsys,
1496				struct cgroup *cgroup, struct task_struct *tsk,
1497				bool threadgroup)
1498{
1499	struct io_context *ioc;
1500	int ret = 0;
1501
1502	/* task_lock() is needed to avoid races with exit_io_context() */
1503	task_lock(tsk);
1504	ioc = tsk->io_context;
1505	if (ioc && atomic_read(&ioc->nr_tasks) > 1)
1506		ret = -EINVAL;
1507	task_unlock(tsk);
1508
1509	return ret;
1510}
1511
1512static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
1513				struct cgroup *prev, struct task_struct *tsk,
1514				bool threadgroup)
1515{
1516	struct io_context *ioc;
1517
1518	task_lock(tsk);
1519	ioc = tsk->io_context;
1520	if (ioc)
1521		ioc->cgroup_changed = 1;
1522	task_unlock(tsk);
1523}
1524
1525void blkio_policy_register(struct blkio_policy_type *blkiop)
1526{
1527	spin_lock(&blkio_list_lock);
1528	list_add_tail(&blkiop->list, &blkio_list);
1529	spin_unlock(&blkio_list_lock);
1530}
1531EXPORT_SYMBOL_GPL(blkio_policy_register);
1532
1533void blkio_policy_unregister(struct blkio_policy_type *blkiop)
1534{
1535	spin_lock(&blkio_list_lock);
1536	list_del_init(&blkiop->list);
1537	spin_unlock(&blkio_list_lock);
1538}
1539EXPORT_SYMBOL_GPL(blkio_policy_unregister);
1540
1541static int __init init_cgroup_blkio(void)
1542{
1543	return cgroup_load_subsys(&blkio_subsys);
1544}
1545
1546static void __exit exit_cgroup_blkio(void)
1547{
1548	cgroup_unload_subsys(&blkio_subsys);
1549}
1550
1551module_init(init_cgroup_blkio);
1552module_exit(exit_cgroup_blkio);
1553MODULE_LICENSE("GPL");
1554