blk-cgroup.c revision 72e06c255181537d0b3e1f657a9ed81655d745b1
1/*
2 * Common Block IO controller cgroup interface
3 *
4 * Based on ideas and code from CFQ, CFS and BFQ:
5 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
6 *
7 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
8 *		      Paolo Valente <paolo.valente@unimore.it>
9 *
10 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
11 * 	              Nauman Rafique <nauman@google.com>
12 */
13#include <linux/ioprio.h>
14#include <linux/seq_file.h>
15#include <linux/kdev_t.h>
16#include <linux/module.h>
17#include <linux/err.h>
18#include <linux/blkdev.h>
19#include <linux/slab.h>
20#include <linux/genhd.h>
21#include <linux/delay.h>
22#include "blk-cgroup.h"
23
24#define MAX_KEY_LEN 100
25
26static DEFINE_SPINLOCK(blkio_list_lock);
27static LIST_HEAD(blkio_list);
28
29struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
30EXPORT_SYMBOL_GPL(blkio_root_cgroup);
31
32static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
33						  struct cgroup *);
34static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
35			      struct cgroup_taskset *);
36static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
37			   struct cgroup_taskset *);
38static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
39static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
40
41/* for encoding cft->private value on file */
42#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
43/* What policy owns the file, proportional or throttle */
44#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
45#define BLKIOFILE_ATTR(val)		((val) & 0xffff)
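/*
 * Example: cft->private packs the owning policy in the upper 16 bits and the
 * per-file attribute in the lower 16 bits, so a single read/write handler
 * can recover both:
 *
 *	int priv = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
 *				     BLKIO_PROP_weight_device);
 *
 *	BLKIOFILE_POLICY(priv) then evaluates to BLKIO_POLICY_PROP and
 *	BLKIOFILE_ATTR(priv) to BLKIO_PROP_weight_device.
 */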
46
47struct cgroup_subsys blkio_subsys = {
48	.name = "blkio",
49	.create = blkiocg_create,
50	.can_attach = blkiocg_can_attach,
51	.attach = blkiocg_attach,
52	.destroy = blkiocg_destroy,
53	.populate = blkiocg_populate,
54	.subsys_id = blkio_subsys_id,
55	.use_id = 1,
56	.module = THIS_MODULE,
57};
58EXPORT_SYMBOL_GPL(blkio_subsys);
59
60static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
61					    struct blkio_policy_node *pn)
62{
63	list_add(&pn->node, &blkcg->policy_list);
64}
65
66static inline bool cftype_blkg_same_policy(struct cftype *cft,
67			struct blkio_group *blkg)
68{
69	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
70
71	if (blkg->plid == plid)
72		return 1;
73
74	return 0;
75}
76
77/* Determines if policy node matches cgroup file being accessed */
78static inline bool pn_matches_cftype(struct cftype *cft,
79			struct blkio_policy_node *pn)
80{
81	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
82	int fileid = BLKIOFILE_ATTR(cft->private);
83
84	return (plid == pn->plid && fileid == pn->fileid);
85}
86
87/* Must be called with blkcg->lock held */
88static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
89{
90	list_del(&pn->node);
91}
92
93/* Must be called with blkcg->lock held */
94static struct blkio_policy_node *
95blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev,
96		enum blkio_policy_id plid, int fileid)
97{
98	struct blkio_policy_node *pn;
99
100	list_for_each_entry(pn, &blkcg->policy_list, node) {
101		if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid)
102			return pn;
103	}
104
105	return NULL;
106}
107
108struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
109{
110	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
111			    struct blkio_cgroup, css);
112}
113EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
114
115struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
116{
117	return container_of(task_subsys_state(tsk, blkio_subsys_id),
118			    struct blkio_cgroup, css);
119}
120EXPORT_SYMBOL_GPL(task_blkio_cgroup);
121
122static inline void
123blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
124{
125	struct blkio_policy_type *blkiop;
126
127	list_for_each_entry(blkiop, &blkio_list, list) {
128		/* If this policy does not own the blkg, do not send updates */
129		if (blkiop->plid != blkg->plid)
130			continue;
131		if (blkiop->ops.blkio_update_group_weight_fn)
132			blkiop->ops.blkio_update_group_weight_fn(blkg->key,
133							blkg, weight);
134	}
135}
136
137static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
138				int fileid)
139{
140	struct blkio_policy_type *blkiop;
141
142	list_for_each_entry(blkiop, &blkio_list, list) {
143
144		/* If this policy does not own the blkg, do not send updates */
145		if (blkiop->plid != blkg->plid)
146			continue;
147
148		if (fileid == BLKIO_THROTL_read_bps_device
149		    && blkiop->ops.blkio_update_group_read_bps_fn)
150			blkiop->ops.blkio_update_group_read_bps_fn(blkg->key,
151								blkg, bps);
152
153		if (fileid == BLKIO_THROTL_write_bps_device
154		    && blkiop->ops.blkio_update_group_write_bps_fn)
155			blkiop->ops.blkio_update_group_write_bps_fn(blkg->key,
156								blkg, bps);
157	}
158}
159
160static inline void blkio_update_group_iops(struct blkio_group *blkg,
161			unsigned int iops, int fileid)
162{
163	struct blkio_policy_type *blkiop;
164
165	list_for_each_entry(blkiop, &blkio_list, list) {
166
167		/* If this policy does not own the blkg, do not send updates */
168		if (blkiop->plid != blkg->plid)
169			continue;
170
171		if (fileid == BLKIO_THROTL_read_iops_device
172		    && blkiop->ops.blkio_update_group_read_iops_fn)
173			blkiop->ops.blkio_update_group_read_iops_fn(blkg->key,
174								blkg, iops);
175
176		if (fileid == BLKIO_THROTL_write_iops_device
177		    && blkiop->ops.blkio_update_group_write_iops_fn)
178			blkiop->ops.blkio_update_group_write_iops_fn(blkg->key,
179								blkg, iops);
180	}
181}
182
183/*
184 * Add to the appropriate stat variable depending on the request type.
185 * This should be called with the blkg->stats_lock held.
186 */
187static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
188				bool sync)
189{
190	if (direction)
191		stat[BLKIO_STAT_WRITE] += add;
192	else
193		stat[BLKIO_STAT_READ] += add;
194	if (sync)
195		stat[BLKIO_STAT_SYNC] += add;
196	else
197		stat[BLKIO_STAT_ASYNC] += add;
198}
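/*
 * Example: each request is bucketed twice, once by direction and once by
 * sync/async. For a single synchronous write,
 *
 *	blkio_add_stat(stat, 1, true, true);
 *
 * bumps both stat[BLKIO_STAT_WRITE] and stat[BLKIO_STAT_SYNC], so the
 * Read+Write pair and the Sync+Async pair each sum to the same total.
 */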
199
200/*
201 * Decrements the appropriate stat variable depending on the request type.
202 * Panics (BUG) if the value is already zero.
203 * This should be called with the blkg->stats_lock held.
204 */
205static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
206{
207	if (direction) {
208		BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
209		stat[BLKIO_STAT_WRITE]--;
210	} else {
211		BUG_ON(stat[BLKIO_STAT_READ] == 0);
212		stat[BLKIO_STAT_READ]--;
213	}
214	if (sync) {
215		BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
216		stat[BLKIO_STAT_SYNC]--;
217	} else {
218		BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
219		stat[BLKIO_STAT_ASYNC]--;
220	}
221}
222
223#ifdef CONFIG_DEBUG_BLK_CGROUP
224/* This should be called with the blkg->stats_lock held. */
225static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
226						struct blkio_group *curr_blkg)
227{
228	if (blkio_blkg_waiting(&blkg->stats))
229		return;
230	if (blkg == curr_blkg)
231		return;
232	blkg->stats.start_group_wait_time = sched_clock();
233	blkio_mark_blkg_waiting(&blkg->stats);
234}
235
236/* This should be called with the blkg->stats_lock held. */
237static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
238{
239	unsigned long long now;
240
241	if (!blkio_blkg_waiting(stats))
242		return;
243
244	now = sched_clock();
245	if (time_after64(now, stats->start_group_wait_time))
246		stats->group_wait_time += now - stats->start_group_wait_time;
247	blkio_clear_blkg_waiting(stats);
248}
249
250/* This should be called with the blkg->stats_lock held. */
251static void blkio_end_empty_time(struct blkio_group_stats *stats)
252{
253	unsigned long long now;
254
255	if (!blkio_blkg_empty(stats))
256		return;
257
258	now = sched_clock();
259	if (time_after64(now, stats->start_empty_time))
260		stats->empty_time += now - stats->start_empty_time;
261	blkio_clear_blkg_empty(stats);
262}
263
264void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
265{
266	unsigned long flags;
267
268	spin_lock_irqsave(&blkg->stats_lock, flags);
269	BUG_ON(blkio_blkg_idling(&blkg->stats));
270	blkg->stats.start_idle_time = sched_clock();
271	blkio_mark_blkg_idling(&blkg->stats);
272	spin_unlock_irqrestore(&blkg->stats_lock, flags);
273}
274EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
275
276void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
277{
278	unsigned long flags;
279	unsigned long long now;
280	struct blkio_group_stats *stats;
281
282	spin_lock_irqsave(&blkg->stats_lock, flags);
283	stats = &blkg->stats;
284	if (blkio_blkg_idling(stats)) {
285		now = sched_clock();
286		if (time_after64(now, stats->start_idle_time))
287			stats->idle_time += now - stats->start_idle_time;
288		blkio_clear_blkg_idling(stats);
289	}
290	spin_unlock_irqrestore(&blkg->stats_lock, flags);
291}
292EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
293
294void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
295{
296	unsigned long flags;
297	struct blkio_group_stats *stats;
298
299	spin_lock_irqsave(&blkg->stats_lock, flags);
300	stats = &blkg->stats;
301	stats->avg_queue_size_sum +=
302			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
303			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
304	stats->avg_queue_size_samples++;
305	blkio_update_group_wait_time(stats);
306	spin_unlock_irqrestore(&blkg->stats_lock, flags);
307}
308EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
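/*
 * The sum/samples pair recorded above is turned into the value exposed in
 * blkio.avg_queue_size by blkio_get_stat(), roughly:
 *
 *	avg_queue_size = avg_queue_size_sum / avg_queue_size_samples
 *
 * e.g. samples of 2, 4 and 6 queued requests average out to (2+4+6)/3 = 4.
 */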
309
310void blkiocg_set_start_empty_time(struct blkio_group *blkg)
311{
312	unsigned long flags;
313	struct blkio_group_stats *stats;
314
315	spin_lock_irqsave(&blkg->stats_lock, flags);
316	stats = &blkg->stats;
317
318	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
319			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
320		spin_unlock_irqrestore(&blkg->stats_lock, flags);
321		return;
322	}
323
324	/*
325	 * The group is already marked empty. This can happen if the cfqq got a
326	 * new request in the parent group and moved to this group while being
327	 * added to the service tree. Just ignore the event and move on.
328	 */
329	if (blkio_blkg_empty(stats)) {
330		spin_unlock_irqrestore(&blkg->stats_lock, flags);
331		return;
332	}
333
334	stats->start_empty_time = sched_clock();
335	blkio_mark_blkg_empty(stats);
336	spin_unlock_irqrestore(&blkg->stats_lock, flags);
337}
338EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
339
340void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
341			unsigned long dequeue)
342{
343	blkg->stats.dequeue += dequeue;
344}
345EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
346#else
347static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
348					struct blkio_group *curr_blkg) {}
349static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
350#endif
351
352void blkiocg_update_io_add_stats(struct blkio_group *blkg,
353			struct blkio_group *curr_blkg, bool direction,
354			bool sync)
355{
356	unsigned long flags;
357
358	spin_lock_irqsave(&blkg->stats_lock, flags);
359	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
360			sync);
361	blkio_end_empty_time(&blkg->stats);
362	blkio_set_start_group_wait_time(blkg, curr_blkg);
363	spin_unlock_irqrestore(&blkg->stats_lock, flags);
364}
365EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
366
367void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
368						bool direction, bool sync)
369{
370	unsigned long flags;
371
372	spin_lock_irqsave(&blkg->stats_lock, flags);
373	blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
374					direction, sync);
375	spin_unlock_irqrestore(&blkg->stats_lock, flags);
376}
377EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
378
379void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
380				unsigned long unaccounted_time)
381{
382	unsigned long flags;
383
384	spin_lock_irqsave(&blkg->stats_lock, flags);
385	blkg->stats.time += time;
386#ifdef CONFIG_DEBUG_BLK_CGROUP
387	blkg->stats.unaccounted_time += unaccounted_time;
388#endif
389	spin_unlock_irqrestore(&blkg->stats_lock, flags);
390}
391EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
392
393/*
394 * Should be called under the RCU read lock or the queue lock to make sure the
395 * blkg pointer is valid.
396 */
397void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
398				uint64_t bytes, bool direction, bool sync)
399{
400	struct blkio_group_stats_cpu *stats_cpu;
401	unsigned long flags;
402
403	/*
404	 * Disable interrupts to provide mutual exclusion between two writes
405	 * on the same CPU. This is probably not needed on 64-bit; that case
406	 * is not optimized yet.
407	 */
408	local_irq_save(flags);
409
410	stats_cpu = this_cpu_ptr(blkg->stats_cpu);
411
412	u64_stats_update_begin(&stats_cpu->syncp);
413	stats_cpu->sectors += bytes >> 9;
414	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
415			1, direction, sync);
416	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
417			bytes, direction, sync);
418	u64_stats_update_end(&stats_cpu->syncp);
419	local_irq_restore(flags);
420}
421EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
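/*
 * Example (sketch): a policy dispatching request @rq to the device would
 * typically account it as
 *
 *	blkiocg_update_dispatch_stats(blkg, blk_rq_bytes(rq),
 *				      rq_data_dir(rq), rq_is_sync(rq));
 *
 * where @blkg is the blkio_group the policy keeps for the issuing cgroup.
 */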
422
423void blkiocg_update_completion_stats(struct blkio_group *blkg,
424	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
425{
426	struct blkio_group_stats *stats;
427	unsigned long flags;
428	unsigned long long now = sched_clock();
429
430	spin_lock_irqsave(&blkg->stats_lock, flags);
431	stats = &blkg->stats;
432	if (time_after64(now, io_start_time))
433		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
434				now - io_start_time, direction, sync);
435	if (time_after64(io_start_time, start_time))
436		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
437				io_start_time - start_time, direction, sync);
438	spin_unlock_irqrestore(&blkg->stats_lock, flags);
439}
440EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
441
442/*  Merged stats are per cpu.  */
443void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
444					bool sync)
445{
446	struct blkio_group_stats_cpu *stats_cpu;
447	unsigned long flags;
448
449	/*
450	 * Disable interrupts to provide mutual exclusion between two writes
451	 * on the same CPU. This is probably not needed on 64-bit; that case
452	 * is not optimized yet.
453	 */
454	local_irq_save(flags);
455
456	stats_cpu = this_cpu_ptr(blkg->stats_cpu);
457
458	u64_stats_update_begin(&stats_cpu->syncp);
459	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1,
460				direction, sync);
461	u64_stats_update_end(&stats_cpu->syncp);
462	local_irq_restore(flags);
463}
464EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
465
466/*
467 * This function allocates the per-cpu stats for a blkio_group. It should be
468 * called from a sleepable context, as alloc_percpu() requires that.
469 */
470int blkio_alloc_blkg_stats(struct blkio_group *blkg)
471{
472	/* Allocate memory for per cpu stats */
473	blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
474	if (!blkg->stats_cpu)
475		return -ENOMEM;
476	return 0;
477}
478EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats);
479
480void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
481		struct blkio_group *blkg, void *key, dev_t dev,
482		enum blkio_policy_id plid)
483{
484	unsigned long flags;
485
486	spin_lock_irqsave(&blkcg->lock, flags);
487	spin_lock_init(&blkg->stats_lock);
488	rcu_assign_pointer(blkg->key, key);
489	blkg->blkcg_id = css_id(&blkcg->css);
490	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
491	blkg->plid = plid;
492	spin_unlock_irqrestore(&blkcg->lock, flags);
493	/* Need to take css reference ? */
494	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
495	blkg->dev = dev;
496}
497EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
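/*
 * Example (sketch, "mypol" names are placeholders): a policy embeds a
 * struct blkio_group in its own per-cgroup data and registers it with the
 * key it will later pass to blkiocg_lookup_group(), e.g. its per-queue data:
 *
 *	struct mypol_group *mg = kzalloc(sizeof(*mg), GFP_KERNEL);
 *
 *	if (!mg || blkio_alloc_blkg_stats(&mg->blkg)) {
 *		kfree(mg);
 *		return NULL;
 *	}
 *	blkiocg_add_blkio_group(blkcg, &mg->blkg, mypol_key,
 *				MKDEV(major, minor), BLKIO_POLICY_PROP);
 */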
498
499static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
500{
501	hlist_del_init_rcu(&blkg->blkcg_node);
502	blkg->blkcg_id = 0;
503}
504
505/*
506 * Returns 0 if the blkio_group was still on the cgroup list. Otherwise returns
507 * 1, indicating that the blkio_group was unhashed by the time we got to it.
508 */
509int blkiocg_del_blkio_group(struct blkio_group *blkg)
510{
511	struct blkio_cgroup *blkcg;
512	unsigned long flags;
513	struct cgroup_subsys_state *css;
514	int ret = 1;
515
516	rcu_read_lock();
517	css = css_lookup(&blkio_subsys, blkg->blkcg_id);
518	if (css) {
519		blkcg = container_of(css, struct blkio_cgroup, css);
520		spin_lock_irqsave(&blkcg->lock, flags);
521		if (!hlist_unhashed(&blkg->blkcg_node)) {
522			__blkiocg_del_blkio_group(blkg);
523			ret = 0;
524		}
525		spin_unlock_irqrestore(&blkcg->lock, flags);
526	}
527
528	rcu_read_unlock();
529	return ret;
530}
531EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
532
533/* called under rcu_read_lock(). */
534struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
535{
536	struct blkio_group *blkg;
537	struct hlist_node *n;
538	void *__key;
539
540	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
541		__key = blkg->key;
542		if (__key == key)
543			return blkg;
544	}
545
546	return NULL;
547}
548EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
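/*
 * Example (sketch): the group is then found again by the same key, while
 * holding the RCU read lock:
 *
 *	rcu_read_lock();
 *	blkg = blkiocg_lookup_group(blkcg, key);
 *	if (blkg)
 *		... use blkg inside the RCU section ...
 *	rcu_read_unlock();
 *
 * A NULL return means no group has been added for that key yet.
 */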
549
550void blkg_destroy_all(struct request_queue *q)
551{
552	struct blkio_policy_type *pol;
553
554	while (true) {
555		bool done = true;
556
557		spin_lock(&blkio_list_lock);
558		spin_lock_irq(q->queue_lock);
559
560		/*
561		 * clear_queue_fn() might return with non-empty group list
562		 * if it raced cgroup removal and lost.  cgroup removal is
563		 * guaranteed to make forward progress and retrying after a
564		 * while is enough.  This ugliness is scheduled to be
565		 * removed after locking update.
566		 */
567		list_for_each_entry(pol, &blkio_list, list)
568			if (!pol->ops.blkio_clear_queue_fn(q))
569				done = false;
570
571		spin_unlock_irq(q->queue_lock);
572		spin_unlock(&blkio_list_lock);
573
574		if (done)
575			break;
576
577		msleep(10);	/* just some random duration I like */
578	}
579}
580
581static void blkio_reset_stats_cpu(struct blkio_group *blkg)
582{
583	struct blkio_group_stats_cpu *stats_cpu;
584	int i, j, k;
585	/*
586	 * Note: On 64 bit arch this should not be an issue. This has the
587	 * possibility of returning some inconsistent value on 32bit arch
588	 * as 64bit update on 32bit is non atomic. Taking care of this
589	 * corner case makes code very complicated, like sending IPIs to
590	 * cpus, taking care of stats of offline cpus etc.
591	 *
592	 * reset stats is anyway more of a debug feature and this sounds a
593	 * corner case. So I am not complicating the code yet until and
594	 * unless this becomes a real issue.
595	 */
596	for_each_possible_cpu(i) {
597		stats_cpu = per_cpu_ptr(blkg->stats_cpu, i);
598		stats_cpu->sectors = 0;
599		for (j = 0; j < BLKIO_STAT_CPU_NR; j++)
600			for (k = 0; k < BLKIO_STAT_TOTAL; k++)
601				stats_cpu->stat_arr_cpu[j][k] = 0;
602	}
603}
604
605static int
606blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
607{
608	struct blkio_cgroup *blkcg;
609	struct blkio_group *blkg;
610	struct blkio_group_stats *stats;
611	struct hlist_node *n;
612	uint64_t queued[BLKIO_STAT_TOTAL];
613	int i;
614#ifdef CONFIG_DEBUG_BLK_CGROUP
615	bool idling, waiting, empty;
616	unsigned long long now = sched_clock();
617#endif
618
619	blkcg = cgroup_to_blkio_cgroup(cgroup);
620	spin_lock_irq(&blkcg->lock);
621	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
622		spin_lock(&blkg->stats_lock);
623		stats = &blkg->stats;
624#ifdef CONFIG_DEBUG_BLK_CGROUP
625		idling = blkio_blkg_idling(stats);
626		waiting = blkio_blkg_waiting(stats);
627		empty = blkio_blkg_empty(stats);
628#endif
629		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
630			queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
631		memset(stats, 0, sizeof(struct blkio_group_stats));
632		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
633			stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
634#ifdef CONFIG_DEBUG_BLK_CGROUP
635		if (idling) {
636			blkio_mark_blkg_idling(stats);
637			stats->start_idle_time = now;
638		}
639		if (waiting) {
640			blkio_mark_blkg_waiting(stats);
641			stats->start_group_wait_time = now;
642		}
643		if (empty) {
644			blkio_mark_blkg_empty(stats);
645			stats->start_empty_time = now;
646		}
647#endif
648		spin_unlock(&blkg->stats_lock);
649
650		/* Reset Per cpu stats which don't take blkg->stats_lock */
651		blkio_reset_stats_cpu(blkg);
652	}
653
654	spin_unlock_irq(&blkcg->lock);
655	return 0;
656}
657
658static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
659				int chars_left, bool diskname_only)
660{
661	snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
662	chars_left -= strlen(str);
663	if (chars_left <= 0) {
664		printk(KERN_WARNING
665			"Possibly incorrect cgroup stat display format\n");
666		return;
667	}
668	if (diskname_only)
669		return;
670	switch (type) {
671	case BLKIO_STAT_READ:
672		strlcat(str, " Read", chars_left);
673		break;
674	case BLKIO_STAT_WRITE:
675		strlcat(str, " Write", chars_left);
676		break;
677	case BLKIO_STAT_SYNC:
678		strlcat(str, " Sync", chars_left);
679		break;
680	case BLKIO_STAT_ASYNC:
681		strlcat(str, " Async", chars_left);
682		break;
683	case BLKIO_STAT_TOTAL:
684		strlcat(str, " Total", chars_left);
685		break;
686	default:
687		strlcat(str, " Invalid", chars_left);
688	}
689}
690
691static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
692				struct cgroup_map_cb *cb, dev_t dev)
693{
694	blkio_get_key_name(0, dev, str, chars_left, true);
695	cb->fill(cb, str, val);
696	return val;
697}
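/*
 * For a device 8:16 the keys built above come out as "8:16" (diskname_only)
 * or "8:16 Read", "8:16 Write", "8:16 Sync", "8:16 Async" and "8:16 Total",
 * which is the first column users see in files such as
 * blkio.io_service_bytes.
 */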
698
699
700static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg,
701			enum stat_type_cpu type, enum stat_sub_type sub_type)
702{
703	int cpu;
704	struct blkio_group_stats_cpu *stats_cpu;
705	u64 val = 0, tval;
706
707	for_each_possible_cpu(cpu) {
708		unsigned int start;
709		stats_cpu  = per_cpu_ptr(blkg->stats_cpu, cpu);
710
711		do {
712			start = u64_stats_fetch_begin(&stats_cpu->syncp);
713			if (type == BLKIO_STAT_CPU_SECTORS)
714				tval = stats_cpu->sectors;
715			else
716				tval = stats_cpu->stat_arr_cpu[type][sub_type];
717		} while (u64_stats_fetch_retry(&stats_cpu->syncp, start));
718
719		val += tval;
720	}
721
722	return val;
723}
724
725static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg,
726		struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type)
727{
728	uint64_t disk_total, val;
729	char key_str[MAX_KEY_LEN];
730	enum stat_sub_type sub_type;
731
732	if (type == BLKIO_STAT_CPU_SECTORS) {
733		val = blkio_read_stat_cpu(blkg, type, 0);
734		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev);
735	}
736
737	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
738			sub_type++) {
739		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
740		val = blkio_read_stat_cpu(blkg, type, sub_type);
741		cb->fill(cb, key_str, val);
742	}
743
744	disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) +
745			blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE);
746
747	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
748	cb->fill(cb, key_str, disk_total);
749	return disk_total;
750}
751
752/* This should be called with blkg->stats_lock held */
753static uint64_t blkio_get_stat(struct blkio_group *blkg,
754		struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
755{
756	uint64_t disk_total;
757	char key_str[MAX_KEY_LEN];
758	enum stat_sub_type sub_type;
759
760	if (type == BLKIO_STAT_TIME)
761		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
762					blkg->stats.time, cb, dev);
763#ifdef CONFIG_DEBUG_BLK_CGROUP
764	if (type == BLKIO_STAT_UNACCOUNTED_TIME)
765		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
766					blkg->stats.unaccounted_time, cb, dev);
767	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
768		uint64_t sum = blkg->stats.avg_queue_size_sum;
769		uint64_t samples = blkg->stats.avg_queue_size_samples;
770		if (samples)
771			do_div(sum, samples);
772		else
773			sum = 0;
774		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
775	}
776	if (type == BLKIO_STAT_GROUP_WAIT_TIME)
777		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
778					blkg->stats.group_wait_time, cb, dev);
779	if (type == BLKIO_STAT_IDLE_TIME)
780		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
781					blkg->stats.idle_time, cb, dev);
782	if (type == BLKIO_STAT_EMPTY_TIME)
783		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
784					blkg->stats.empty_time, cb, dev);
785	if (type == BLKIO_STAT_DEQUEUE)
786		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
787					blkg->stats.dequeue, cb, dev);
788#endif
789
790	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
791			sub_type++) {
792		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
793		cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
794	}
795	disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
796			blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
797	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
798	cb->fill(cb, key_str, disk_total);
799	return disk_total;
800}
801
802static int blkio_policy_parse_and_set(char *buf,
803	struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
804{
805	struct gendisk *disk = NULL;
806	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
807	unsigned long major, minor;
808	int i = 0, ret = -EINVAL;
809	int part;
810	dev_t dev;
811	u64 temp;
812
813	memset(s, 0, sizeof(s));
814
815	while ((p = strsep(&buf, " ")) != NULL) {
816		if (!*p)
817			continue;
818
819		s[i++] = p;
820
821		/* Prevent too many fields from being input */
822		if (i == 3)
823			break;
824	}
825
826	if (i != 2)
827		goto out;
828
829	p = strsep(&s[0], ":");
830	if (p != NULL)
831		major_s = p;
832	else
833		goto out;
834
835	minor_s = s[0];
836	if (!minor_s)
837		goto out;
838
839	if (strict_strtoul(major_s, 10, &major))
840		goto out;
841
842	if (strict_strtoul(minor_s, 10, &minor))
843		goto out;
844
845	dev = MKDEV(major, minor);
846
847	if (strict_strtoull(s[1], 10, &temp))
848		goto out;
849
850	/* For rule removal, do not check for device presence. */
851	if (temp) {
852		disk = get_gendisk(dev, &part);
853		if (!disk || part) {
854			ret = -ENODEV;
855			goto out;
856		}
857	}
858
859	newpn->dev = dev;
860
861	switch (plid) {
862	case BLKIO_POLICY_PROP:
863		if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
864		     temp > BLKIO_WEIGHT_MAX)
865			goto out;
866
867		newpn->plid = plid;
868		newpn->fileid = fileid;
869		newpn->val.weight = temp;
870		break;
871	case BLKIO_POLICY_THROTL:
872		switch (fileid) {
873		case BLKIO_THROTL_read_bps_device:
874		case BLKIO_THROTL_write_bps_device:
875			newpn->plid = plid;
876			newpn->fileid = fileid;
877			newpn->val.bps = temp;
878			break;
879		case BLKIO_THROTL_read_iops_device:
880		case BLKIO_THROTL_write_iops_device:
881			if (temp > THROTL_IOPS_MAX)
882				goto out;
883
884			newpn->plid = plid;
885			newpn->fileid = fileid;
886			newpn->val.iops = (unsigned int)temp;
887			break;
888		}
889		break;
890	default:
891		BUG();
892	}
893	ret = 0;
894out:
895	put_disk(disk);
896	return ret;
897}
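/*
 * Example inputs (made-up values): the buffer parsed above is the string a
 * user wrote to the cgroup file, e.g.
 *
 *	"8:16 500"	for blkio.weight_device (weight 500 on device 8:16)
 *	"8:16 1048576"	for blkio.throttle.read_bps_device (1 MB/s)
 *	"8:16 0"	to remove the existing rule for that device
 *
 * Major:minor and the value are whitespace separated; a value of 0 is the
 * delete request later recognized by blkio_delete_rule_command().
 */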
898
899unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
900			      dev_t dev)
901{
902	struct blkio_policy_node *pn;
903	unsigned long flags;
904	unsigned int weight;
905
906	spin_lock_irqsave(&blkcg->lock, flags);
907
908	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
909				BLKIO_PROP_weight_device);
910	if (pn)
911		weight = pn->val.weight;
912	else
913		weight = blkcg->weight;
914
915	spin_unlock_irqrestore(&blkcg->lock, flags);
916
917	return weight;
918}
919EXPORT_SYMBOL_GPL(blkcg_get_weight);
920
921uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
922{
923	struct blkio_policy_node *pn;
924	unsigned long flags;
925	uint64_t bps = -1;
926
927	spin_lock_irqsave(&blkcg->lock, flags);
928	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
929				BLKIO_THROTL_read_bps_device);
930	if (pn)
931		bps = pn->val.bps;
932	spin_unlock_irqrestore(&blkcg->lock, flags);
933
934	return bps;
935}
936
937uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
938{
939	struct blkio_policy_node *pn;
940	unsigned long flags;
941	uint64_t bps = -1;
942
943	spin_lock_irqsave(&blkcg->lock, flags);
944	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
945				BLKIO_THROTL_write_bps_device);
946	if (pn)
947		bps = pn->val.bps;
948	spin_unlock_irqrestore(&blkcg->lock, flags);
949
950	return bps;
951}
952
953unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
954{
955	struct blkio_policy_node *pn;
956	unsigned long flags;
957	unsigned int iops = -1;
958
959	spin_lock_irqsave(&blkcg->lock, flags);
960	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
961				BLKIO_THROTL_read_iops_device);
962	if (pn)
963		iops = pn->val.iops;
964	spin_unlock_irqrestore(&blkcg->lock, flags);
965
966	return iops;
967}
968
969unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
970{
971	struct blkio_policy_node *pn;
972	unsigned long flags;
973	unsigned int iops = -1;
974
975	spin_lock_irqsave(&blkcg->lock, flags);
976	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
977				BLKIO_THROTL_write_iops_device);
978	if (pn)
979		iops = pn->val.iops;
980	spin_unlock_irqrestore(&blkcg->lock, flags);
981
982	return iops;
983}
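/*
 * The four getters above return (uint64_t)-1 or (unsigned int)-1 when no
 * rule exists for the device, which callers can treat as "no limit
 * configured". Sketch (apply_limit() is a placeholder):
 *
 *	uint64_t bps = blkcg_get_read_bps(blkcg, dev);
 *
 *	if (bps != -1)
 *		apply_limit(bps);
 */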
984
985/* Checks whether the user asked to delete a policy rule */
986static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
987{
988	switch (pn->plid) {
989	case BLKIO_POLICY_PROP:
990		if (pn->val.weight == 0)
991			return 1;
992		break;
993	case BLKIO_POLICY_THROTL:
994		switch (pn->fileid) {
995		case BLKIO_THROTL_read_bps_device:
996		case BLKIO_THROTL_write_bps_device:
997			if (pn->val.bps == 0)
998				return 1;
999			break;
1000		case BLKIO_THROTL_read_iops_device:
1001		case BLKIO_THROTL_write_iops_device:
1002			if (pn->val.iops == 0)
1003				return 1;
1004		}
1005		break;
1006	default:
1007		BUG();
1008	}
1009
1010	return 0;
1011}
1012
1013static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
1014					struct blkio_policy_node *newpn)
1015{
1016	switch (oldpn->plid) {
1017	case BLKIO_POLICY_PROP:
1018		oldpn->val.weight = newpn->val.weight;
1019		break;
1020	case BLKIO_POLICY_THROTL:
1021		switch (newpn->fileid) {
1022		case BLKIO_THROTL_read_bps_device:
1023		case BLKIO_THROTL_write_bps_device:
1024			oldpn->val.bps = newpn->val.bps;
1025			break;
1026		case BLKIO_THROTL_read_iops_device:
1027		case BLKIO_THROTL_write_iops_device:
1028			oldpn->val.iops = newpn->val.iops;
1029		}
1030		break;
1031	default:
1032		BUG();
1033	}
1034}
1035
1036/*
1037 * Some rules/values in blkg have changed. Propagate those to respective
1038 * policies.
1039 */
1040static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
1041		struct blkio_group *blkg, struct blkio_policy_node *pn)
1042{
1043	unsigned int weight, iops;
1044	u64 bps;
1045
1046	switch (pn->plid) {
1047	case BLKIO_POLICY_PROP:
1048		weight = pn->val.weight ? pn->val.weight :
1049				blkcg->weight;
1050		blkio_update_group_weight(blkg, weight);
1051		break;
1052	case BLKIO_POLICY_THROTL:
1053		switch (pn->fileid) {
1054		case BLKIO_THROTL_read_bps_device:
1055		case BLKIO_THROTL_write_bps_device:
1056			bps = pn->val.bps ? pn->val.bps : (-1);
1057			blkio_update_group_bps(blkg, bps, pn->fileid);
1058			break;
1059		case BLKIO_THROTL_read_iops_device:
1060		case BLKIO_THROTL_write_iops_device:
1061			iops = pn->val.iops ? pn->val.iops : (-1);
1062			blkio_update_group_iops(blkg, iops, pn->fileid);
1063			break;
1064		}
1065		break;
1066	default:
1067		BUG();
1068	}
1069}
1070
1071/*
1072 * A policy node rule has been updated. Propagate this update to all the
1073 * block groups which might be affected by this update.
1074 */
1075static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg,
1076				struct blkio_policy_node *pn)
1077{
1078	struct blkio_group *blkg;
1079	struct hlist_node *n;
1080
1081	spin_lock(&blkio_list_lock);
1082	spin_lock_irq(&blkcg->lock);
1083
1084	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1085		if (pn->dev != blkg->dev || pn->plid != blkg->plid)
1086			continue;
1087		blkio_update_blkg_policy(blkcg, blkg, pn);
1088	}
1089
1090	spin_unlock_irq(&blkcg->lock);
1091	spin_unlock(&blkio_list_lock);
1092}
1093
1094static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
1095 				       const char *buffer)
1096{
1097	int ret = 0;
1098	char *buf;
1099	struct blkio_policy_node *newpn, *pn;
1100	struct blkio_cgroup *blkcg;
1101	int keep_newpn = 0;
1102	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1103	int fileid = BLKIOFILE_ATTR(cft->private);
1104
1105	buf = kstrdup(buffer, GFP_KERNEL);
1106	if (!buf)
1107		return -ENOMEM;
1108
1109	newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
1110	if (!newpn) {
1111		ret = -ENOMEM;
1112		goto free_buf;
1113	}
1114
1115	ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid);
1116	if (ret)
1117		goto free_newpn;
1118
1119	blkcg = cgroup_to_blkio_cgroup(cgrp);
1120
1121	spin_lock_irq(&blkcg->lock);
1122
1123	pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid);
1124	if (!pn) {
1125		if (!blkio_delete_rule_command(newpn)) {
1126			blkio_policy_insert_node(blkcg, newpn);
1127			keep_newpn = 1;
1128		}
1129		spin_unlock_irq(&blkcg->lock);
1130		goto update_io_group;
1131	}
1132
1133	if (blkio_delete_rule_command(newpn)) {
1134		blkio_policy_delete_node(pn);
1135		kfree(pn);
1136		spin_unlock_irq(&blkcg->lock);
1137		goto update_io_group;
1138	}
1139	spin_unlock_irq(&blkcg->lock);
1140
1141	blkio_update_policy_rule(pn, newpn);
1142
1143update_io_group:
1144	blkio_update_policy_node_blkg(blkcg, newpn);
1145
1146free_newpn:
1147	if (!keep_newpn)
1148		kfree(newpn);
1149free_buf:
1150	kfree(buf);
1151	return ret;
1152}
1153
1154static void
1155blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
1156{
1157	switch (pn->plid) {
1158		case BLKIO_POLICY_PROP:
1159			if (pn->fileid == BLKIO_PROP_weight_device)
1160				seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
1161					MINOR(pn->dev), pn->val.weight);
1162			break;
1163		case BLKIO_POLICY_THROTL:
1164			switch (pn->fileid) {
1165			case BLKIO_THROTL_read_bps_device:
1166			case BLKIO_THROTL_write_bps_device:
1167				seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
1168					MINOR(pn->dev), pn->val.bps);
1169				break;
1170			case BLKIO_THROTL_read_iops_device:
1171			case BLKIO_THROTL_write_iops_device:
1172				seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
1173					MINOR(pn->dev), pn->val.iops);
1174				break;
1175			}
1176			break;
1177		default:
1178			BUG();
1179	}
1180}
1181
1182/* cgroup files which read their data from policy nodes end up here */
1183static void blkio_read_policy_node_files(struct cftype *cft,
1184			struct blkio_cgroup *blkcg, struct seq_file *m)
1185{
1186	struct blkio_policy_node *pn;
1187
1188	if (!list_empty(&blkcg->policy_list)) {
1189		spin_lock_irq(&blkcg->lock);
1190		list_for_each_entry(pn, &blkcg->policy_list, node) {
1191			if (!pn_matches_cftype(cft, pn))
1192				continue;
1193			blkio_print_policy_node(m, pn);
1194		}
1195		spin_unlock_irq(&blkcg->lock);
1196	}
1197}
1198
1199static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
1200				struct seq_file *m)
1201{
1202	struct blkio_cgroup *blkcg;
1203	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1204	int name = BLKIOFILE_ATTR(cft->private);
1205
1206	blkcg = cgroup_to_blkio_cgroup(cgrp);
1207
1208	switch (plid) {
1209	case BLKIO_POLICY_PROP:
1210		switch (name) {
1211		case BLKIO_PROP_weight_device:
1212			blkio_read_policy_node_files(cft, blkcg, m);
1213			return 0;
1214		default:
1215			BUG();
1216		}
1217		break;
1218	case BLKIO_POLICY_THROTL:
1219		switch (name) {
1220		case BLKIO_THROTL_read_bps_device:
1221		case BLKIO_THROTL_write_bps_device:
1222		case BLKIO_THROTL_read_iops_device:
1223		case BLKIO_THROTL_write_iops_device:
1224			blkio_read_policy_node_files(cft, blkcg, m);
1225			return 0;
1226		default:
1227			BUG();
1228		}
1229		break;
1230	default:
1231		BUG();
1232	}
1233
1234	return 0;
1235}
1236
1237static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
1238		struct cftype *cft, struct cgroup_map_cb *cb,
1239		enum stat_type type, bool show_total, bool pcpu)
1240{
1241	struct blkio_group *blkg;
1242	struct hlist_node *n;
1243	uint64_t cgroup_total = 0;
1244
1245	rcu_read_lock();
1246	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
1247		if (blkg->dev) {
1248			if (!cftype_blkg_same_policy(cft, blkg))
1249				continue;
1250			if (pcpu)
1251				cgroup_total += blkio_get_stat_cpu(blkg, cb,
1252						blkg->dev, type);
1253			else {
1254				spin_lock_irq(&blkg->stats_lock);
1255				cgroup_total += blkio_get_stat(blkg, cb,
1256						blkg->dev, type);
1257				spin_unlock_irq(&blkg->stats_lock);
1258			}
1259		}
1260	}
1261	if (show_total)
1262		cb->fill(cb, "Total", cgroup_total);
1263	rcu_read_unlock();
1264	return 0;
1265}
1266
1267/* All map-type cgroup files are serviced by this function */
1268static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
1269				struct cgroup_map_cb *cb)
1270{
1271	struct blkio_cgroup *blkcg;
1272	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1273	int name = BLKIOFILE_ATTR(cft->private);
1274
1275	blkcg = cgroup_to_blkio_cgroup(cgrp);
1276
1277	switch (plid) {
1278	case BLKIO_POLICY_PROP:
1279		switch (name) {
1280		case BLKIO_PROP_time:
1281			return blkio_read_blkg_stats(blkcg, cft, cb,
1282						BLKIO_STAT_TIME, 0, 0);
1283		case BLKIO_PROP_sectors:
1284			return blkio_read_blkg_stats(blkcg, cft, cb,
1285						BLKIO_STAT_CPU_SECTORS, 0, 1);
1286		case BLKIO_PROP_io_service_bytes:
1287			return blkio_read_blkg_stats(blkcg, cft, cb,
1288					BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1289		case BLKIO_PROP_io_serviced:
1290			return blkio_read_blkg_stats(blkcg, cft, cb,
1291						BLKIO_STAT_CPU_SERVICED, 1, 1);
1292		case BLKIO_PROP_io_service_time:
1293			return blkio_read_blkg_stats(blkcg, cft, cb,
1294						BLKIO_STAT_SERVICE_TIME, 1, 0);
1295		case BLKIO_PROP_io_wait_time:
1296			return blkio_read_blkg_stats(blkcg, cft, cb,
1297						BLKIO_STAT_WAIT_TIME, 1, 0);
1298		case BLKIO_PROP_io_merged:
1299			return blkio_read_blkg_stats(blkcg, cft, cb,
1300						BLKIO_STAT_CPU_MERGED, 1, 1);
1301		case BLKIO_PROP_io_queued:
1302			return blkio_read_blkg_stats(blkcg, cft, cb,
1303						BLKIO_STAT_QUEUED, 1, 0);
1304#ifdef CONFIG_DEBUG_BLK_CGROUP
1305		case BLKIO_PROP_unaccounted_time:
1306			return blkio_read_blkg_stats(blkcg, cft, cb,
1307					BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
1308		case BLKIO_PROP_dequeue:
1309			return blkio_read_blkg_stats(blkcg, cft, cb,
1310						BLKIO_STAT_DEQUEUE, 0, 0);
1311		case BLKIO_PROP_avg_queue_size:
1312			return blkio_read_blkg_stats(blkcg, cft, cb,
1313					BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
1314		case BLKIO_PROP_group_wait_time:
1315			return blkio_read_blkg_stats(blkcg, cft, cb,
1316					BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
1317		case BLKIO_PROP_idle_time:
1318			return blkio_read_blkg_stats(blkcg, cft, cb,
1319						BLKIO_STAT_IDLE_TIME, 0, 0);
1320		case BLKIO_PROP_empty_time:
1321			return blkio_read_blkg_stats(blkcg, cft, cb,
1322						BLKIO_STAT_EMPTY_TIME, 0, 0);
1323#endif
1324		default:
1325			BUG();
1326		}
1327		break;
1328	case BLKIO_POLICY_THROTL:
1329		switch (name) {
1330		case BLKIO_THROTL_io_service_bytes:
1331			return blkio_read_blkg_stats(blkcg, cft, cb,
1332						BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1333		case BLKIO_THROTL_io_serviced:
1334			return blkio_read_blkg_stats(blkcg, cft, cb,
1335						BLKIO_STAT_CPU_SERVICED, 1, 1);
1336		default:
1337			BUG();
1338		}
1339		break;
1340	default:
1341		BUG();
1342	}
1343
1344	return 0;
1345}
1346
1347static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val)
1348{
1349	struct blkio_group *blkg;
1350	struct hlist_node *n;
1351	struct blkio_policy_node *pn;
1352
1353	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
1354		return -EINVAL;
1355
1356	spin_lock(&blkio_list_lock);
1357	spin_lock_irq(&blkcg->lock);
1358	blkcg->weight = (unsigned int)val;
1359
1360	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1361		pn = blkio_policy_search_node(blkcg, blkg->dev,
1362				BLKIO_POLICY_PROP, BLKIO_PROP_weight_device);
1363		if (pn)
1364			continue;
1365
1366		blkio_update_group_weight(blkg, blkcg->weight);
1367	}
1368	spin_unlock_irq(&blkcg->lock);
1369	spin_unlock(&blkio_list_lock);
1370	return 0;
1371}
1372
1373static u64 blkiocg_file_read_u64(struct cgroup *cgrp, struct cftype *cft) {
1374	struct blkio_cgroup *blkcg;
1375	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1376	int name = BLKIOFILE_ATTR(cft->private);
1377
1378	blkcg = cgroup_to_blkio_cgroup(cgrp);
1379
1380	switch (plid) {
1381	case BLKIO_POLICY_PROP:
1382		switch (name) {
1383		case BLKIO_PROP_weight:
1384			return (u64)blkcg->weight;
1385		}
1386		break;
1387	default:
1388		BUG();
1389	}
1390	return 0;
1391}
1392
1393static int
1394blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1395{
1396	struct blkio_cgroup *blkcg;
1397	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1398	int name = BLKIOFILE_ATTR(cft->private);
1399
1400	blkcg = cgroup_to_blkio_cgroup(cgrp);
1401
1402	switch (plid) {
1403	case BLKIO_POLICY_PROP:
1404		switch (name) {
1405		case BLKIO_PROP_weight:
1406			return blkio_weight_write(blkcg, val);
1407		}
1408		break;
1409	default:
1410		BUG();
1411	}
1412
1413	return 0;
1414}
1415
1416struct cftype blkio_files[] = {
1417	{
1418		.name = "weight_device",
1419		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1420				BLKIO_PROP_weight_device),
1421		.read_seq_string = blkiocg_file_read,
1422		.write_string = blkiocg_file_write,
1423		.max_write_len = 256,
1424	},
1425	{
1426		.name = "weight",
1427		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1428				BLKIO_PROP_weight),
1429		.read_u64 = blkiocg_file_read_u64,
1430		.write_u64 = blkiocg_file_write_u64,
1431	},
1432	{
1433		.name = "time",
1434		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1435				BLKIO_PROP_time),
1436		.read_map = blkiocg_file_read_map,
1437	},
1438	{
1439		.name = "sectors",
1440		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1441				BLKIO_PROP_sectors),
1442		.read_map = blkiocg_file_read_map,
1443	},
1444	{
1445		.name = "io_service_bytes",
1446		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1447				BLKIO_PROP_io_service_bytes),
1448		.read_map = blkiocg_file_read_map,
1449	},
1450	{
1451		.name = "io_serviced",
1452		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1453				BLKIO_PROP_io_serviced),
1454		.read_map = blkiocg_file_read_map,
1455	},
1456	{
1457		.name = "io_service_time",
1458		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1459				BLKIO_PROP_io_service_time),
1460		.read_map = blkiocg_file_read_map,
1461	},
1462	{
1463		.name = "io_wait_time",
1464		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1465				BLKIO_PROP_io_wait_time),
1466		.read_map = blkiocg_file_read_map,
1467	},
1468	{
1469		.name = "io_merged",
1470		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1471				BLKIO_PROP_io_merged),
1472		.read_map = blkiocg_file_read_map,
1473	},
1474	{
1475		.name = "io_queued",
1476		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1477				BLKIO_PROP_io_queued),
1478		.read_map = blkiocg_file_read_map,
1479	},
1480	{
1481		.name = "reset_stats",
1482		.write_u64 = blkiocg_reset_stats,
1483	},
1484#ifdef CONFIG_BLK_DEV_THROTTLING
1485	{
1486		.name = "throttle.read_bps_device",
1487		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1488				BLKIO_THROTL_read_bps_device),
1489		.read_seq_string = blkiocg_file_read,
1490		.write_string = blkiocg_file_write,
1491		.max_write_len = 256,
1492	},
1493
1494	{
1495		.name = "throttle.write_bps_device",
1496		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1497				BLKIO_THROTL_write_bps_device),
1498		.read_seq_string = blkiocg_file_read,
1499		.write_string = blkiocg_file_write,
1500		.max_write_len = 256,
1501	},
1502
1503	{
1504		.name = "throttle.read_iops_device",
1505		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1506				BLKIO_THROTL_read_iops_device),
1507		.read_seq_string = blkiocg_file_read,
1508		.write_string = blkiocg_file_write,
1509		.max_write_len = 256,
1510	},
1511
1512	{
1513		.name = "throttle.write_iops_device",
1514		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1515				BLKIO_THROTL_write_iops_device),
1516		.read_seq_string = blkiocg_file_read,
1517		.write_string = blkiocg_file_write,
1518		.max_write_len = 256,
1519	},
1520	{
1521		.name = "throttle.io_service_bytes",
1522		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1523				BLKIO_THROTL_io_service_bytes),
1524		.read_map = blkiocg_file_read_map,
1525	},
1526	{
1527		.name = "throttle.io_serviced",
1528		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1529				BLKIO_THROTL_io_serviced),
1530		.read_map = blkiocg_file_read_map,
1531	},
1532#endif /* CONFIG_BLK_DEV_THROTTLING */
1533
1534#ifdef CONFIG_DEBUG_BLK_CGROUP
1535	{
1536		.name = "avg_queue_size",
1537		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1538				BLKIO_PROP_avg_queue_size),
1539		.read_map = blkiocg_file_read_map,
1540	},
1541	{
1542		.name = "group_wait_time",
1543		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1544				BLKIO_PROP_group_wait_time),
1545		.read_map = blkiocg_file_read_map,
1546	},
1547	{
1548		.name = "idle_time",
1549		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1550				BLKIO_PROP_idle_time),
1551		.read_map = blkiocg_file_read_map,
1552	},
1553	{
1554		.name = "empty_time",
1555		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1556				BLKIO_PROP_empty_time),
1557		.read_map = blkiocg_file_read_map,
1558	},
1559	{
1560		.name = "dequeue",
1561		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1562				BLKIO_PROP_dequeue),
1563		.read_map = blkiocg_file_read_map,
1564	},
1565	{
1566		.name = "unaccounted_time",
1567		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1568				BLKIO_PROP_unaccounted_time),
1569		.read_map = blkiocg_file_read_map,
1570	},
1571#endif
1572};
1573
1574static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1575{
1576	return cgroup_add_files(cgroup, subsys, blkio_files,
1577				ARRAY_SIZE(blkio_files));
1578}
1579
1580static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1581{
1582	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
1583	unsigned long flags;
1584	struct blkio_group *blkg;
1585	void *key;
1586	struct blkio_policy_type *blkiop;
1587	struct blkio_policy_node *pn, *pntmp;
1588
1589	rcu_read_lock();
1590	do {
1591		spin_lock_irqsave(&blkcg->lock, flags);
1592
1593		if (hlist_empty(&blkcg->blkg_list)) {
1594			spin_unlock_irqrestore(&blkcg->lock, flags);
1595			break;
1596		}
1597
1598		blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
1599					blkcg_node);
1600		key = rcu_dereference(blkg->key);
1601		__blkiocg_del_blkio_group(blkg);
1602
1603		spin_unlock_irqrestore(&blkcg->lock, flags);
1604
1605		/*
1606		 * This blkio_group is being unlinked as the associated cgroup is
1607		 * going away. Let all the IO controlling policies know about
1608		 * this event.
1609		 */
1610		spin_lock(&blkio_list_lock);
1611		list_for_each_entry(blkiop, &blkio_list, list) {
1612			if (blkiop->plid != blkg->plid)
1613				continue;
1614			blkiop->ops.blkio_unlink_group_fn(key, blkg);
1615		}
1616		spin_unlock(&blkio_list_lock);
1617	} while (1);
1618
1619	list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
1620		blkio_policy_delete_node(pn);
1621		kfree(pn);
1622	}
1623
1624	free_css_id(&blkio_subsys, &blkcg->css);
1625	rcu_read_unlock();
1626	if (blkcg != &blkio_root_cgroup)
1627		kfree(blkcg);
1628}
1629
1630static struct cgroup_subsys_state *
1631blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1632{
1633	struct blkio_cgroup *blkcg;
1634	struct cgroup *parent = cgroup->parent;
1635
1636	if (!parent) {
1637		blkcg = &blkio_root_cgroup;
1638		goto done;
1639	}
1640
1641	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
1642	if (!blkcg)
1643		return ERR_PTR(-ENOMEM);
1644
1645	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
1646done:
1647	spin_lock_init(&blkcg->lock);
1648	INIT_HLIST_HEAD(&blkcg->blkg_list);
1649
1650	INIT_LIST_HEAD(&blkcg->policy_list);
1651	return &blkcg->css;
1652}
1653
1654/*
1655 * We cannot support shared io contexts, as we have no means to support
1656 * two tasks with the same ioc in two different groups without major rework
1657 * of the main cic data structures.  For now we allow a task to change
1658 * its cgroup only if it's the only owner of its ioc.
1659 */
1660static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1661			      struct cgroup_taskset *tset)
1662{
1663	struct task_struct *task;
1664	struct io_context *ioc;
1665	int ret = 0;
1666
1667	/* task_lock() is needed to avoid races with exit_io_context() */
1668	cgroup_taskset_for_each(task, cgrp, tset) {
1669		task_lock(task);
1670		ioc = task->io_context;
1671		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
1672			ret = -EINVAL;
1673		task_unlock(task);
1674		if (ret)
1675			break;
1676	}
1677	return ret;
1678}
1679
1680static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1681			   struct cgroup_taskset *tset)
1682{
1683	struct task_struct *task;
1684	struct io_context *ioc;
1685
1686	cgroup_taskset_for_each(task, cgrp, tset) {
1687		/* we don't lose anything even if ioc allocation fails */
1688		ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
1689		if (ioc) {
1690			ioc_cgroup_changed(ioc);
1691			put_io_context(ioc);
1692		}
1693	}
1694}
1695
1696void blkio_policy_register(struct blkio_policy_type *blkiop)
1697{
1698	spin_lock(&blkio_list_lock);
1699	list_add_tail(&blkiop->list, &blkio_list);
1700	spin_unlock(&blkio_list_lock);
1701}
1702EXPORT_SYMBOL_GPL(blkio_policy_register);
1703
1704void blkio_policy_unregister(struct blkio_policy_type *blkiop)
1705{
1706	spin_lock(&blkio_list_lock);
1707	list_del_init(&blkiop->list);
1708	spin_unlock(&blkio_list_lock);
1709}
1710EXPORT_SYMBOL_GPL(blkio_policy_unregister);
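/*
 * Example (sketch, "mypol" names are placeholders): an IO control policy
 * plugs into this framework by filling a struct blkio_policy_type with its
 * callbacks and registering it, typically from its module init/exit:
 *
 *	static struct blkio_policy_type blkio_policy_mypol = {
 *		.ops = {
 *			.blkio_unlink_group_fn		= mypol_unlink_group,
 *			.blkio_clear_queue_fn		= mypol_clear_queue,
 *			.blkio_update_group_weight_fn	= mypol_update_weight,
 *		},
 *		.plid = BLKIO_POLICY_PROP,
 *	};
 *
 *	blkio_policy_register(&blkio_policy_mypol);	(module init)
 *	blkio_policy_unregister(&blkio_policy_mypol);	(module exit)
 */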
1711