blk-cgroup.c revision 676f7c8f84d15e94065841529016da5ab92e901b
1/*
2 * Common Block IO controller cgroup interface
3 *
4 * Based on ideas and code from CFQ, CFS and BFQ:
5 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
6 *
7 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
8 *		      Paolo Valente <paolo.valente@unimore.it>
9 *
10 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
11 * 	              Nauman Rafique <nauman@google.com>
12 */
13#include <linux/ioprio.h>
14#include <linux/seq_file.h>
15#include <linux/kdev_t.h>
16#include <linux/module.h>
17#include <linux/err.h>
18#include <linux/blkdev.h>
19#include <linux/slab.h>
20#include "blk-cgroup.h"
21#include <linux/genhd.h>
22
23#define MAX_KEY_LEN 100
24
25static DEFINE_SPINLOCK(blkio_list_lock);
26static LIST_HEAD(blkio_list);
27
28struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
29EXPORT_SYMBOL_GPL(blkio_root_cgroup);
30
31/* for encoding cft->private value on file */
32#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
33/* What policy owns the file, proportional or throttle */
34#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
35#define BLKIOFILE_ATTR(val)		((val) & 0xffff)
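
/*
 * Example: for the "throttle.read_bps_device" file defined below,
 * cft->private is built as
 * BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, BLKIO_THROTL_read_bps_device),
 * i.e. the owning policy in the upper 16 bits and the attribute in the
 * lower 16, so that
 *
 *	BLKIOFILE_POLICY(cft->private) == BLKIO_POLICY_THROTL
 *	BLKIOFILE_ATTR(cft->private)   == BLKIO_THROTL_read_bps_device
 */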
36
37static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
38					    struct blkio_policy_node *pn)
39{
40	list_add(&pn->node, &blkcg->policy_list);
41}
42
43static inline bool cftype_blkg_same_policy(struct cftype *cft,
44			struct blkio_group *blkg)
45{
46	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
47
48	if (blkg->plid == plid)
49		return 1;
50
51	return 0;
52}
53
54/* Determines if policy node matches cgroup file being accessed */
55static inline bool pn_matches_cftype(struct cftype *cft,
56			struct blkio_policy_node *pn)
57{
58	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
59	int fileid = BLKIOFILE_ATTR(cft->private);
60
61	return (plid == pn->plid && fileid == pn->fileid);
62}
63
64/* Must be called with blkcg->lock held */
65static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
66{
67	list_del(&pn->node);
68}
69
70/* Must be called with blkcg->lock held */
71static struct blkio_policy_node *
72blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev,
73		enum blkio_policy_id plid, int fileid)
74{
75	struct blkio_policy_node *pn;
76
77	list_for_each_entry(pn, &blkcg->policy_list, node) {
78		if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid)
79			return pn;
80	}
81
82	return NULL;
83}
84
85struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
86{
87	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
88			    struct blkio_cgroup, css);
89}
90EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
91
92struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
93{
94	return container_of(task_subsys_state(tsk, blkio_subsys_id),
95			    struct blkio_cgroup, css);
96}
97EXPORT_SYMBOL_GPL(task_blkio_cgroup);
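
/*
 * Illustrative use: a policy resolving the blkio cgroup of the task issuing
 * an IO does so under rcu_read_lock(), roughly:
 *
 *	rcu_read_lock();
 *	blkcg = task_blkio_cgroup(current);
 *	... look up or create the blkio_group for (blkcg, queue) ...
 *	rcu_read_unlock();
 */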
98
99static inline void
100blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
101{
102	struct blkio_policy_type *blkiop;
103
104	list_for_each_entry(blkiop, &blkio_list, list) {
105		/* If this policy does not own the blkg, do not send updates */
106		if (blkiop->plid != blkg->plid)
107			continue;
108		if (blkiop->ops.blkio_update_group_weight_fn)
109			blkiop->ops.blkio_update_group_weight_fn(blkg->key,
110							blkg, weight);
111	}
112}
113
114static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
115				int fileid)
116{
117	struct blkio_policy_type *blkiop;
118
119	list_for_each_entry(blkiop, &blkio_list, list) {
120
121		/* If this policy does not own the blkg, do not send updates */
122		if (blkiop->plid != blkg->plid)
123			continue;
124
125		if (fileid == BLKIO_THROTL_read_bps_device
126		    && blkiop->ops.blkio_update_group_read_bps_fn)
127			blkiop->ops.blkio_update_group_read_bps_fn(blkg->key,
128								blkg, bps);
129
130		if (fileid == BLKIO_THROTL_write_bps_device
131		    && blkiop->ops.blkio_update_group_write_bps_fn)
132			blkiop->ops.blkio_update_group_write_bps_fn(blkg->key,
133								blkg, bps);
134	}
135}
136
137static inline void blkio_update_group_iops(struct blkio_group *blkg,
138			unsigned int iops, int fileid)
139{
140	struct blkio_policy_type *blkiop;
141
142	list_for_each_entry(blkiop, &blkio_list, list) {
143
144		/* If this policy does not own the blkg, do not send updates */
145		if (blkiop->plid != blkg->plid)
146			continue;
147
148		if (fileid == BLKIO_THROTL_read_iops_device
149		    && blkiop->ops.blkio_update_group_read_iops_fn)
150			blkiop->ops.blkio_update_group_read_iops_fn(blkg->key,
151								blkg, iops);
152
153		if (fileid == BLKIO_THROTL_write_iops_device
154		    && blkiop->ops.blkio_update_group_write_iops_fn)
155			blkiop->ops.blkio_update_group_write_iops_fn(blkg->key,
156								blkg, iops);
157	}
158}
159
160/*
161 * Add to the appropriate stat variable depending on the request type.
162 * This should be called with the blkg->stats_lock held.
163 */
164static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
165				bool sync)
166{
167	if (direction)
168		stat[BLKIO_STAT_WRITE] += add;
169	else
170		stat[BLKIO_STAT_READ] += add;
171	if (sync)
172		stat[BLKIO_STAT_SYNC] += add;
173	else
174		stat[BLKIO_STAT_ASYNC] += add;
175}
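
/*
 * Example: accounting one synchronous write with blkg->stats_lock held,
 *
 *	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, true, true);
 *
 * bumps the BLKIO_STAT_WRITE and BLKIO_STAT_SYNC buckets by one and leaves
 * BLKIO_STAT_READ and BLKIO_STAT_ASYNC untouched.
 */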
176
177/*
178 * Decrements the appropriate stat variable if non-zero depending on the
179 * request type. Panics on value being zero.
180 * This should be called with the blkg->stats_lock held.
181 */
182static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
183{
184	if (direction) {
185		BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
186		stat[BLKIO_STAT_WRITE]--;
187	} else {
188		BUG_ON(stat[BLKIO_STAT_READ] == 0);
189		stat[BLKIO_STAT_READ]--;
190	}
191	if (sync) {
192		BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
193		stat[BLKIO_STAT_SYNC]--;
194	} else {
195		BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
196		stat[BLKIO_STAT_ASYNC]--;
197	}
198}
199
200#ifdef CONFIG_DEBUG_BLK_CGROUP
201/* This should be called with the blkg->stats_lock held. */
202static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
203						struct blkio_group *curr_blkg)
204{
205	if (blkio_blkg_waiting(&blkg->stats))
206		return;
207	if (blkg == curr_blkg)
208		return;
209	blkg->stats.start_group_wait_time = sched_clock();
210	blkio_mark_blkg_waiting(&blkg->stats);
211}
212
213/* This should be called with the blkg->stats_lock held. */
214static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
215{
216	unsigned long long now;
217
218	if (!blkio_blkg_waiting(stats))
219		return;
220
221	now = sched_clock();
222	if (time_after64(now, stats->start_group_wait_time))
223		stats->group_wait_time += now - stats->start_group_wait_time;
224	blkio_clear_blkg_waiting(stats);
225}
226
227/* This should be called with the blkg->stats_lock held. */
228static void blkio_end_empty_time(struct blkio_group_stats *stats)
229{
230	unsigned long long now;
231
232	if (!blkio_blkg_empty(stats))
233		return;
234
235	now = sched_clock();
236	if (time_after64(now, stats->start_empty_time))
237		stats->empty_time += now - stats->start_empty_time;
238	blkio_clear_blkg_empty(stats);
239}
240
241void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
242{
243	unsigned long flags;
244
245	spin_lock_irqsave(&blkg->stats_lock, flags);
246	BUG_ON(blkio_blkg_idling(&blkg->stats));
247	blkg->stats.start_idle_time = sched_clock();
248	blkio_mark_blkg_idling(&blkg->stats);
249	spin_unlock_irqrestore(&blkg->stats_lock, flags);
250}
251EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
252
253void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
254{
255	unsigned long flags;
256	unsigned long long now;
257	struct blkio_group_stats *stats;
258
259	spin_lock_irqsave(&blkg->stats_lock, flags);
260	stats = &blkg->stats;
261	if (blkio_blkg_idling(stats)) {
262		now = sched_clock();
263		if (time_after64(now, stats->start_idle_time))
264			stats->idle_time += now - stats->start_idle_time;
265		blkio_clear_blkg_idling(stats);
266	}
267	spin_unlock_irqrestore(&blkg->stats_lock, flags);
268}
269EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
270
271void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
272{
273	unsigned long flags;
274	struct blkio_group_stats *stats;
275
276	spin_lock_irqsave(&blkg->stats_lock, flags);
277	stats = &blkg->stats;
278	stats->avg_queue_size_sum +=
279			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
280			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
281	stats->avg_queue_size_samples++;
282	blkio_update_group_wait_time(stats);
283	spin_unlock_irqrestore(&blkg->stats_lock, flags);
284}
285EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
286
287void blkiocg_set_start_empty_time(struct blkio_group *blkg)
288{
289	unsigned long flags;
290	struct blkio_group_stats *stats;
291
292	spin_lock_irqsave(&blkg->stats_lock, flags);
293	stats = &blkg->stats;
294
295	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
296			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
297		spin_unlock_irqrestore(&blkg->stats_lock, flags);
298		return;
299	}
300
301	/*
302	 * The group is already marked empty. This can happen if the cfqq got a
303	 * new request in the parent group and moved to this group while being
304	 * added to the service tree. Just ignore the event and move on.
305	 */
306	if (blkio_blkg_empty(stats)) {
307		spin_unlock_irqrestore(&blkg->stats_lock, flags);
308		return;
309	}
310
311	stats->start_empty_time = sched_clock();
312	blkio_mark_blkg_empty(stats);
313	spin_unlock_irqrestore(&blkg->stats_lock, flags);
314}
315EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
316
317void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
318			unsigned long dequeue)
319{
320	blkg->stats.dequeue += dequeue;
321}
322EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
323#else
324static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
325					struct blkio_group *curr_blkg) {}
326static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
327#endif
328
329void blkiocg_update_io_add_stats(struct blkio_group *blkg,
330			struct blkio_group *curr_blkg, bool direction,
331			bool sync)
332{
333	unsigned long flags;
334
335	spin_lock_irqsave(&blkg->stats_lock, flags);
336	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
337			sync);
338	blkio_end_empty_time(&blkg->stats);
339	blkio_set_start_group_wait_time(blkg, curr_blkg);
340	spin_unlock_irqrestore(&blkg->stats_lock, flags);
341}
342EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
343
344void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
345						bool direction, bool sync)
346{
347	unsigned long flags;
348
349	spin_lock_irqsave(&blkg->stats_lock, flags);
350	blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
351					direction, sync);
352	spin_unlock_irqrestore(&blkg->stats_lock, flags);
353}
354EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
355
356void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
357				unsigned long unaccounted_time)
358{
359	unsigned long flags;
360
361	spin_lock_irqsave(&blkg->stats_lock, flags);
362	blkg->stats.time += time;
363#ifdef CONFIG_DEBUG_BLK_CGROUP
364	blkg->stats.unaccounted_time += unaccounted_time;
365#endif
366	spin_unlock_irqrestore(&blkg->stats_lock, flags);
367}
368EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
369
370/*
371 * Should be called under the rcu read lock or the queue lock to make sure
372 * the blkg pointer is valid.
373 */
374void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
375				uint64_t bytes, bool direction, bool sync)
376{
377	struct blkio_group_stats_cpu *stats_cpu;
378	unsigned long flags;
379
380	/*
381	 * Disabling interrupts to provide mutual exclusion between two
382	 * writes on same cpu. It probably is not needed for 64bit. Not
383	 * optimizing that case yet.
384	 */
385	local_irq_save(flags);
386
387	stats_cpu = this_cpu_ptr(blkg->stats_cpu);
388
389	u64_stats_update_begin(&stats_cpu->syncp);
390	stats_cpu->sectors += bytes >> 9;
391	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
392			1, direction, sync);
393	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
394			bytes, direction, sync);
395	u64_stats_update_end(&stats_cpu->syncp);
396	local_irq_restore(flags);
397}
398EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
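
/*
 * Illustrative caller: when a policy dispatches a request to the device it
 * would account it roughly as
 *
 *	blkiocg_update_dispatch_stats(&pg->blkg, blk_rq_bytes(rq),
 *				      rq_data_dir(rq), rq_is_sync(rq));
 *
 * (pg being the policy's per-group structure) with the queue lock or an rcu
 * read lock held so that the blkg stays valid.
 */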
399
400void blkiocg_update_completion_stats(struct blkio_group *blkg,
401	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
402{
403	struct blkio_group_stats *stats;
404	unsigned long flags;
405	unsigned long long now = sched_clock();
406
407	spin_lock_irqsave(&blkg->stats_lock, flags);
408	stats = &blkg->stats;
409	if (time_after64(now, io_start_time))
410		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
411				now - io_start_time, direction, sync);
412	if (time_after64(io_start_time, start_time))
413		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
414				io_start_time - start_time, direction, sync);
415	spin_unlock_irqrestore(&blkg->stats_lock, flags);
416}
417EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
418
419/*  Merged stats are per cpu.  */
420void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
421					bool sync)
422{
423	struct blkio_group_stats_cpu *stats_cpu;
424	unsigned long flags;
425
426	/*
427	 * Disabling interrupts to provide mutual exclusion between two
428	 * writes on same cpu. It probably is not needed for 64bit. Not
429	 * optimizing that case yet.
430	 */
431	local_irq_save(flags);
432
433	stats_cpu = this_cpu_ptr(blkg->stats_cpu);
434
435	u64_stats_update_begin(&stats_cpu->syncp);
436	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1,
437				direction, sync);
438	u64_stats_update_end(&stats_cpu->syncp);
439	local_irq_restore(flags);
440}
441EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
442
443/*
444 * This function allocates the per cpu stats for a blkio_group. Should be
445 * called from a sleepable context as alloc_percpu() requires that.
446 */
447int blkio_alloc_blkg_stats(struct blkio_group *blkg)
448{
449	/* Allocate memory for per cpu stats */
450	blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
451	if (!blkg->stats_cpu)
452		return -ENOMEM;
453	return 0;
454}
455EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats);
456
457void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
458		struct blkio_group *blkg, void *key, dev_t dev,
459		enum blkio_policy_id plid)
460{
461	unsigned long flags;
462
463	spin_lock_irqsave(&blkcg->lock, flags);
464	spin_lock_init(&blkg->stats_lock);
465	rcu_assign_pointer(blkg->key, key);
466	blkg->blkcg_id = css_id(&blkcg->css);
467	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
468	blkg->plid = plid;
469	spin_unlock_irqrestore(&blkcg->lock, flags);
470	/* Need to take css reference? */
471	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
472	blkg->dev = dev;
473}
474EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
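
/*
 * Typical registration sequence for a policy instantiating a new group
 * (sketch, error handling trimmed; policies normally embed struct
 * blkio_group in their own per-group structure, called pg here):
 *
 *	if (blkio_alloc_blkg_stats(&pg->blkg))
 *		goto err;
 *	rcu_read_lock();
 *	blkcg = task_blkio_cgroup(current);
 *	blkiocg_add_blkio_group(blkcg, &pg->blkg, key, dev,
 *				BLKIO_POLICY_THROTL);
 *	rcu_read_unlock();
 *
 * "key" is an opaque per-queue cookie (typically the policy's per-queue
 * data). The same pointer is later handed to blkiocg_lookup_group() and to
 * the unlink callback, and blkiocg_del_blkio_group() takes the group off
 * the cgroup's list when it is torn down.
 */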
475
476static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
477{
478	hlist_del_init_rcu(&blkg->blkcg_node);
479	blkg->blkcg_id = 0;
480}
481
482/*
483 * Returns 0 if the blkio_group was still on the cgroup list. Otherwise
484 * returns 1, meaning the blkio_group was unhashed by the time we got to it.
485 */
486int blkiocg_del_blkio_group(struct blkio_group *blkg)
487{
488	struct blkio_cgroup *blkcg;
489	unsigned long flags;
490	struct cgroup_subsys_state *css;
491	int ret = 1;
492
493	rcu_read_lock();
494	css = css_lookup(&blkio_subsys, blkg->blkcg_id);
495	if (css) {
496		blkcg = container_of(css, struct blkio_cgroup, css);
497		spin_lock_irqsave(&blkcg->lock, flags);
498		if (!hlist_unhashed(&blkg->blkcg_node)) {
499			__blkiocg_del_blkio_group(blkg);
500			ret = 0;
501		}
502		spin_unlock_irqrestore(&blkcg->lock, flags);
503	}
504
505	rcu_read_unlock();
506	return ret;
507}
508EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
509
510/* called under rcu_read_lock(). */
511struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
512{
513	struct blkio_group *blkg;
514	struct hlist_node *n;
515	void *__key;
516
517	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
518		__key = blkg->key;
519		if (__key == key)
520			return blkg;
521	}
522
523	return NULL;
524}
525EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
526
527static void blkio_reset_stats_cpu(struct blkio_group *blkg)
528{
529	struct blkio_group_stats_cpu *stats_cpu;
530	int i, j, k;
531	/*
532	 * Note: on a 64 bit arch this should not be an issue. On a 32 bit arch
533	 * this can return an inconsistent value, as a 64 bit update on 32 bit
534	 * is not atomic. Taking care of that corner case makes the code very
535	 * complicated, like sending IPIs to cpus, taking care of stats of
536	 * offline cpus etc.
537	 *
538	 * Resetting stats is anyway more of a debug feature and this sounds
539	 * like a corner case. So I am not complicating the code until and
540	 * unless this becomes a real issue.
541	 */
542	for_each_possible_cpu(i) {
543		stats_cpu = per_cpu_ptr(blkg->stats_cpu, i);
544		stats_cpu->sectors = 0;
545		for (j = 0; j < BLKIO_STAT_CPU_NR; j++)
546			for (k = 0; k < BLKIO_STAT_TOTAL; k++)
547				stats_cpu->stat_arr_cpu[j][k] = 0;
548	}
549}
550
551static int
552blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
553{
554	struct blkio_cgroup *blkcg;
555	struct blkio_group *blkg;
556	struct blkio_group_stats *stats;
557	struct hlist_node *n;
558	uint64_t queued[BLKIO_STAT_TOTAL];
559	int i;
560#ifdef CONFIG_DEBUG_BLK_CGROUP
561	bool idling, waiting, empty;
562	unsigned long long now = sched_clock();
563#endif
564
565	blkcg = cgroup_to_blkio_cgroup(cgroup);
566	spin_lock_irq(&blkcg->lock);
567	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
568		spin_lock(&blkg->stats_lock);
569		stats = &blkg->stats;
570#ifdef CONFIG_DEBUG_BLK_CGROUP
571		idling = blkio_blkg_idling(stats);
572		waiting = blkio_blkg_waiting(stats);
573		empty = blkio_blkg_empty(stats);
574#endif
575		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
576			queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
577		memset(stats, 0, sizeof(struct blkio_group_stats));
578		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
579			stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
580#ifdef CONFIG_DEBUG_BLK_CGROUP
581		if (idling) {
582			blkio_mark_blkg_idling(stats);
583			stats->start_idle_time = now;
584		}
585		if (waiting) {
586			blkio_mark_blkg_waiting(stats);
587			stats->start_group_wait_time = now;
588		}
589		if (empty) {
590			blkio_mark_blkg_empty(stats);
591			stats->start_empty_time = now;
592		}
593#endif
594		spin_unlock(&blkg->stats_lock);
595
596		/* Reset the per-cpu stats which don't take blkg->stats_lock */
597		blkio_reset_stats_cpu(blkg);
598	}
599
600	spin_unlock_irq(&blkcg->lock);
601	return 0;
602}
603
604static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
605				int chars_left, bool diskname_only)
606{
607	snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
608	chars_left -= strlen(str);
609	if (chars_left <= 0) {
610		printk(KERN_WARNING
611			"Possibly incorrect cgroup stat display format\n");
612		return;
613	}
614	if (diskname_only)
615		return;
616	switch (type) {
617	case BLKIO_STAT_READ:
618		strlcat(str, " Read", chars_left);
619		break;
620	case BLKIO_STAT_WRITE:
621		strlcat(str, " Write", chars_left);
622		break;
623	case BLKIO_STAT_SYNC:
624		strlcat(str, " Sync", chars_left);
625		break;
626	case BLKIO_STAT_ASYNC:
627		strlcat(str, " Async", chars_left);
628		break;
629	case BLKIO_STAT_TOTAL:
630		strlcat(str, " Total", chars_left);
631		break;
632	default:
633		strlcat(str, " Invalid", chars_left);
634	}
635}
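
/*
 * Example keys built here for device 8:16: "8:16 Read", "8:16 Write",
 * "8:16 Sync", "8:16 Async" and "8:16 Total". With diskname_only the key is
 * just "8:16", as used for the time and sectors stats.
 */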
636
637static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
638				struct cgroup_map_cb *cb, dev_t dev)
639{
640	blkio_get_key_name(0, dev, str, chars_left, true);
641	cb->fill(cb, str, val);
642	return val;
643}
644
645
646static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg,
647			enum stat_type_cpu type, enum stat_sub_type sub_type)
648{
649	int cpu;
650	struct blkio_group_stats_cpu *stats_cpu;
651	u64 val = 0, tval;
652
653	for_each_possible_cpu(cpu) {
654		unsigned int start;
655		stats_cpu  = per_cpu_ptr(blkg->stats_cpu, cpu);
656
657		do {
658			start = u64_stats_fetch_begin(&stats_cpu->syncp);
659			if (type == BLKIO_STAT_CPU_SECTORS)
660				tval = stats_cpu->sectors;
661			else
662				tval = stats_cpu->stat_arr_cpu[type][sub_type];
663		} while (u64_stats_fetch_retry(&stats_cpu->syncp, start));
664
665		val += tval;
666	}
667
668	return val;
669}
670
671static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg,
672		struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type)
673{
674	uint64_t disk_total, val;
675	char key_str[MAX_KEY_LEN];
676	enum stat_sub_type sub_type;
677
678	if (type == BLKIO_STAT_CPU_SECTORS) {
679		val = blkio_read_stat_cpu(blkg, type, 0);
680		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev);
681	}
682
683	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
684			sub_type++) {
685		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
686		val = blkio_read_stat_cpu(blkg, type, sub_type);
687		cb->fill(cb, key_str, val);
688	}
689
690	disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) +
691			blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE);
692
693	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
694	cb->fill(cb, key_str, disk_total);
695	return disk_total;
696}
697
698/* This should be called with blkg->stats_lock held */
699static uint64_t blkio_get_stat(struct blkio_group *blkg,
700		struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
701{
702	uint64_t disk_total;
703	char key_str[MAX_KEY_LEN];
704	enum stat_sub_type sub_type;
705
706	if (type == BLKIO_STAT_TIME)
707		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
708					blkg->stats.time, cb, dev);
709#ifdef CONFIG_DEBUG_BLK_CGROUP
710	if (type == BLKIO_STAT_UNACCOUNTED_TIME)
711		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
712					blkg->stats.unaccounted_time, cb, dev);
713	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
714		uint64_t sum = blkg->stats.avg_queue_size_sum;
715		uint64_t samples = blkg->stats.avg_queue_size_samples;
716		if (samples)
717			do_div(sum, samples);
718		else
719			sum = 0;
720		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
721	}
722	if (type == BLKIO_STAT_GROUP_WAIT_TIME)
723		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
724					blkg->stats.group_wait_time, cb, dev);
725	if (type == BLKIO_STAT_IDLE_TIME)
726		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
727					blkg->stats.idle_time, cb, dev);
728	if (type == BLKIO_STAT_EMPTY_TIME)
729		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
730					blkg->stats.empty_time, cb, dev);
731	if (type == BLKIO_STAT_DEQUEUE)
732		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
733					blkg->stats.dequeue, cb, dev);
734#endif
735
736	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
737			sub_type++) {
738		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
739		cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
740	}
741	disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
742			blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
743	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
744	cb->fill(cb, key_str, disk_total);
745	return disk_total;
746}
747
748static int blkio_policy_parse_and_set(char *buf,
749	struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
750{
751	struct gendisk *disk = NULL;
752	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
753	unsigned long major, minor;
754	int i = 0, ret = -EINVAL;
755	int part;
756	dev_t dev;
757	u64 temp;
758
759	memset(s, 0, sizeof(s));
760
761	while ((p = strsep(&buf, " ")) != NULL) {
762		if (!*p)
763			continue;
764
765		s[i++] = p;
766
767		/* Prevent too many fields from being input */
768		if (i == 3)
769			break;
770	}
771
772	if (i != 2)
773		goto out;
774
775	p = strsep(&s[0], ":");
776	if (p != NULL)
777		major_s = p;
778	else
779		goto out;
780
781	minor_s = s[0];
782	if (!minor_s)
783		goto out;
784
785	if (strict_strtoul(major_s, 10, &major))
786		goto out;
787
788	if (strict_strtoul(minor_s, 10, &minor))
789		goto out;
790
791	dev = MKDEV(major, minor);
792
793	if (strict_strtoull(s[1], 10, &temp))
794		goto out;
795
796	/* For rule removal, do not check for device presence. */
797	if (temp) {
798		disk = get_gendisk(dev, &part);
799		if (!disk || part) {
800			ret = -ENODEV;
801			goto out;
802		}
803	}
804
805	newpn->dev = dev;
806
807	switch (plid) {
808	case BLKIO_POLICY_PROP:
809		if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
810		     temp > BLKIO_WEIGHT_MAX)
811			goto out;
812
813		newpn->plid = plid;
814		newpn->fileid = fileid;
815		newpn->val.weight = temp;
816		break;
817	case BLKIO_POLICY_THROTL:
818		switch(fileid) {
819		case BLKIO_THROTL_read_bps_device:
820		case BLKIO_THROTL_write_bps_device:
821			newpn->plid = plid;
822			newpn->fileid = fileid;
823			newpn->val.bps = temp;
824			break;
825		case BLKIO_THROTL_read_iops_device:
826		case BLKIO_THROTL_write_iops_device:
827			if (temp > THROTL_IOPS_MAX)
828				goto out;
829
830			newpn->plid = plid;
831			newpn->fileid = fileid;
832			newpn->val.iops = (unsigned int)temp;
833			break;
834		}
835		break;
836	default:
837		BUG();
838	}
839	ret = 0;
840out:
841	put_disk(disk);
842	return ret;
843}
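
/*
 * The buffer parsed above is the string written to one of the rule files,
 * in "major:minor value" form, e.g. (device numbers illustrative):
 *
 *	"8:16 500"	for blkio.weight_device (proportional weight)
 *	"8:16 1048576"	for blkio.throttle.read_bps_device (bytes per second)
 *	"8:16 100"	for blkio.throttle.read_iops_device (ios per second)
 *	"8:16 0"	removes the matching rule, which is why device
 *			presence is not checked when the value is zero.
 */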
844
845unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
846			      dev_t dev)
847{
848	struct blkio_policy_node *pn;
849	unsigned long flags;
850	unsigned int weight;
851
852	spin_lock_irqsave(&blkcg->lock, flags);
853
854	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
855				BLKIO_PROP_weight_device);
856	if (pn)
857		weight = pn->val.weight;
858	else
859		weight = blkcg->weight;
860
861	spin_unlock_irqrestore(&blkcg->lock, flags);
862
863	return weight;
864}
865EXPORT_SYMBOL_GPL(blkcg_get_weight);
866
867uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
868{
869	struct blkio_policy_node *pn;
870	unsigned long flags;
871	uint64_t bps = -1;
872
873	spin_lock_irqsave(&blkcg->lock, flags);
874	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
875				BLKIO_THROTL_read_bps_device);
876	if (pn)
877		bps = pn->val.bps;
878	spin_unlock_irqrestore(&blkcg->lock, flags);
879
880	return bps;
881}
882
883uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
884{
885	struct blkio_policy_node *pn;
886	unsigned long flags;
887	uint64_t bps = -1;
888
889	spin_lock_irqsave(&blkcg->lock, flags);
890	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
891				BLKIO_THROTL_write_bps_device);
892	if (pn)
893		bps = pn->val.bps;
894	spin_unlock_irqrestore(&blkcg->lock, flags);
895
896	return bps;
897}
898
899unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
900{
901	struct blkio_policy_node *pn;
902	unsigned long flags;
903	unsigned int iops = -1;
904
905	spin_lock_irqsave(&blkcg->lock, flags);
906	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
907				BLKIO_THROTL_read_iops_device);
908	if (pn)
909		iops = pn->val.iops;
910	spin_unlock_irqrestore(&blkcg->lock, flags);
911
912	return iops;
913}
914
915unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
916{
917	struct blkio_policy_node *pn;
918	unsigned long flags;
919	unsigned int iops = -1;
920
921	spin_lock_irqsave(&blkcg->lock, flags);
922	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
923				BLKIO_THROTL_write_iops_device);
924	if (pn)
925		iops = pn->val.iops;
926	spin_unlock_irqrestore(&blkcg->lock, flags);
927
928	return iops;
929}
930
931/* Checks whether the user asked to delete a policy rule */
932static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
933{
934	switch(pn->plid) {
935	case BLKIO_POLICY_PROP:
936		if (pn->val.weight == 0)
937			return 1;
938		break;
939	case BLKIO_POLICY_THROTL:
940		switch(pn->fileid) {
941		case BLKIO_THROTL_read_bps_device:
942		case BLKIO_THROTL_write_bps_device:
943			if (pn->val.bps == 0)
944				return 1;
945			break;
946		case BLKIO_THROTL_read_iops_device:
947		case BLKIO_THROTL_write_iops_device:
948			if (pn->val.iops == 0)
949				return 1;
950		}
951		break;
952	default:
953		BUG();
954	}
955
956	return 0;
957}
958
959static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
960					struct blkio_policy_node *newpn)
961{
962	switch(oldpn->plid) {
963	case BLKIO_POLICY_PROP:
964		oldpn->val.weight = newpn->val.weight;
965		break;
966	case BLKIO_POLICY_THROTL:
967		switch(newpn->fileid) {
968		case BLKIO_THROTL_read_bps_device:
969		case BLKIO_THROTL_write_bps_device:
970			oldpn->val.bps = newpn->val.bps;
971			break;
972		case BLKIO_THROTL_read_iops_device:
973		case BLKIO_THROTL_write_iops_device:
974			oldpn->val.iops = newpn->val.iops;
975		}
976		break;
977	default:
978		BUG();
979	}
980}
981
982/*
983 * Some rules/values in blkg have changed. Propagate those to respective
984 * policies.
985 */
986static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
987		struct blkio_group *blkg, struct blkio_policy_node *pn)
988{
989	unsigned int weight, iops;
990	u64 bps;
991
992	switch(pn->plid) {
993	case BLKIO_POLICY_PROP:
994		weight = pn->val.weight ? pn->val.weight :
995				blkcg->weight;
996		blkio_update_group_weight(blkg, weight);
997		break;
998	case BLKIO_POLICY_THROTL:
999		switch(pn->fileid) {
1000		case BLKIO_THROTL_read_bps_device:
1001		case BLKIO_THROTL_write_bps_device:
1002			bps = pn->val.bps ? pn->val.bps : (-1);
1003			blkio_update_group_bps(blkg, bps, pn->fileid);
1004			break;
1005		case BLKIO_THROTL_read_iops_device:
1006		case BLKIO_THROTL_write_iops_device:
1007			iops = pn->val.iops ? pn->val.iops : (-1);
1008			blkio_update_group_iops(blkg, iops, pn->fileid);
1009			break;
1010		}
1011		break;
1012	default:
1013		BUG();
1014	}
1015}
1016
1017/*
1018 * A policy node rule has been updated. Propagate the update to all the
1019 * block groups which might be affected by it.
1020 */
1021static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg,
1022				struct blkio_policy_node *pn)
1023{
1024	struct blkio_group *blkg;
1025	struct hlist_node *n;
1026
1027	spin_lock(&blkio_list_lock);
1028	spin_lock_irq(&blkcg->lock);
1029
1030	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1031		if (pn->dev != blkg->dev || pn->plid != blkg->plid)
1032			continue;
1033		blkio_update_blkg_policy(blkcg, blkg, pn);
1034	}
1035
1036	spin_unlock_irq(&blkcg->lock);
1037	spin_unlock(&blkio_list_lock);
1038}
1039
1040static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
1041 				       const char *buffer)
1042{
1043	int ret = 0;
1044	char *buf;
1045	struct blkio_policy_node *newpn, *pn;
1046	struct blkio_cgroup *blkcg;
1047	int keep_newpn = 0;
1048	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1049	int fileid = BLKIOFILE_ATTR(cft->private);
1050
1051	buf = kstrdup(buffer, GFP_KERNEL);
1052	if (!buf)
1053		return -ENOMEM;
1054
1055	newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
1056	if (!newpn) {
1057		ret = -ENOMEM;
1058		goto free_buf;
1059	}
1060
1061	ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid);
1062	if (ret)
1063		goto free_newpn;
1064
1065	blkcg = cgroup_to_blkio_cgroup(cgrp);
1066
1067	spin_lock_irq(&blkcg->lock);
1068
1069	pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid);
1070	if (!pn) {
1071		if (!blkio_delete_rule_command(newpn)) {
1072			blkio_policy_insert_node(blkcg, newpn);
1073			keep_newpn = 1;
1074		}
1075		spin_unlock_irq(&blkcg->lock);
1076		goto update_io_group;
1077	}
1078
1079	if (blkio_delete_rule_command(newpn)) {
1080		blkio_policy_delete_node(pn);
1081		kfree(pn);
1082		spin_unlock_irq(&blkcg->lock);
1083		goto update_io_group;
1084	}
1085	spin_unlock_irq(&blkcg->lock);
1086
1087	blkio_update_policy_rule(pn, newpn);
1088
1089update_io_group:
1090	blkio_update_policy_node_blkg(blkcg, newpn);
1091
1092free_newpn:
1093	if (!keep_newpn)
1094		kfree(newpn);
1095free_buf:
1096	kfree(buf);
1097	return ret;
1098}
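
/*
 * So, for example, writing "8:16 1048576" to blkio.throttle.read_bps_device
 * arrives here with plid == BLKIO_POLICY_THROTL and
 * fileid == BLKIO_THROTL_read_bps_device: the string is parsed into a
 * blkio_policy_node, an existing rule for 8:16 is updated (or a new one
 * inserted, or the rule deleted when the value is 0), and
 * blkio_update_policy_node_blkg() pushes the new limit to every matching
 * blkio_group through the registered policy ops.
 */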
1099
1100static void
1101blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
1102{
1103	switch(pn->plid) {
1104		case BLKIO_POLICY_PROP:
1105			if (pn->fileid == BLKIO_PROP_weight_device)
1106				seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
1107					MINOR(pn->dev), pn->val.weight);
1108			break;
1109		case BLKIO_POLICY_THROTL:
1110			switch(pn->fileid) {
1111			case BLKIO_THROTL_read_bps_device:
1112			case BLKIO_THROTL_write_bps_device:
1113				seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
1114					MINOR(pn->dev), pn->val.bps);
1115				break;
1116			case BLKIO_THROTL_read_iops_device:
1117			case BLKIO_THROTL_write_iops_device:
1118				seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
1119					MINOR(pn->dev), pn->val.iops);
1120				break;
1121			}
1122			break;
1123		default:
1124			BUG();
1125	}
1126}
1127
1128/* cgroup files which read their data from policy nodes end up here */
1129static void blkio_read_policy_node_files(struct cftype *cft,
1130			struct blkio_cgroup *blkcg, struct seq_file *m)
1131{
1132	struct blkio_policy_node *pn;
1133
1134	if (!list_empty(&blkcg->policy_list)) {
1135		spin_lock_irq(&blkcg->lock);
1136		list_for_each_entry(pn, &blkcg->policy_list, node) {
1137			if (!pn_matches_cftype(cft, pn))
1138				continue;
1139			blkio_print_policy_node(m, pn);
1140		}
1141		spin_unlock_irq(&blkcg->lock);
1142	}
1143}
1144
1145static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
1146				struct seq_file *m)
1147{
1148	struct blkio_cgroup *blkcg;
1149	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1150	int name = BLKIOFILE_ATTR(cft->private);
1151
1152	blkcg = cgroup_to_blkio_cgroup(cgrp);
1153
1154	switch(plid) {
1155	case BLKIO_POLICY_PROP:
1156		switch(name) {
1157		case BLKIO_PROP_weight_device:
1158			blkio_read_policy_node_files(cft, blkcg, m);
1159			return 0;
1160		default:
1161			BUG();
1162		}
1163		break;
1164	case BLKIO_POLICY_THROTL:
1165		switch (name) {
1166		case BLKIO_THROTL_read_bps_device:
1167		case BLKIO_THROTL_write_bps_device:
1168		case BLKIO_THROTL_read_iops_device:
1169		case BLKIO_THROTL_write_iops_device:
1170			blkio_read_policy_node_files(cft, blkcg, m);
1171			return 0;
1172		default:
1173			BUG();
1174		}
1175		break;
1176	default:
1177		BUG();
1178	}
1179
1180	return 0;
1181}
1182
1183static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
1184		struct cftype *cft, struct cgroup_map_cb *cb,
1185		enum stat_type type, bool show_total, bool pcpu)
1186{
1187	struct blkio_group *blkg;
1188	struct hlist_node *n;
1189	uint64_t cgroup_total = 0;
1190
1191	rcu_read_lock();
1192	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
1193		if (blkg->dev) {
1194			if (!cftype_blkg_same_policy(cft, blkg))
1195				continue;
1196			if (pcpu)
1197				cgroup_total += blkio_get_stat_cpu(blkg, cb,
1198						blkg->dev, type);
1199			else {
1200				spin_lock_irq(&blkg->stats_lock);
1201				cgroup_total += blkio_get_stat(blkg, cb,
1202						blkg->dev, type);
1203				spin_unlock_irq(&blkg->stats_lock);
1204			}
1205		}
1206	}
1207	if (show_total)
1208		cb->fill(cb, "Total", cgroup_total);
1209	rcu_read_unlock();
1210	return 0;
1211}
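
/*
 * Example map output for a per-device stat such as blkio.io_service_bytes
 * (values illustrative):
 *
 *	8:16 Read 1310720
 *	8:16 Write 0
 *	8:16 Sync 1310720
 *	8:16 Async 0
 *	8:16 Total 1310720
 *	Total 1310720
 *
 * The final "Total" line is the cgroup-wide sum and only appears for stats
 * read with show_total set.
 */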
1212
1213/* All map-type cgroup files are serviced by this function */
1214static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
1215				struct cgroup_map_cb *cb)
1216{
1217	struct blkio_cgroup *blkcg;
1218	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1219	int name = BLKIOFILE_ATTR(cft->private);
1220
1221	blkcg = cgroup_to_blkio_cgroup(cgrp);
1222
1223	switch(plid) {
1224	case BLKIO_POLICY_PROP:
1225		switch(name) {
1226		case BLKIO_PROP_time:
1227			return blkio_read_blkg_stats(blkcg, cft, cb,
1228						BLKIO_STAT_TIME, 0, 0);
1229		case BLKIO_PROP_sectors:
1230			return blkio_read_blkg_stats(blkcg, cft, cb,
1231						BLKIO_STAT_CPU_SECTORS, 0, 1);
1232		case BLKIO_PROP_io_service_bytes:
1233			return blkio_read_blkg_stats(blkcg, cft, cb,
1234					BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1235		case BLKIO_PROP_io_serviced:
1236			return blkio_read_blkg_stats(blkcg, cft, cb,
1237						BLKIO_STAT_CPU_SERVICED, 1, 1);
1238		case BLKIO_PROP_io_service_time:
1239			return blkio_read_blkg_stats(blkcg, cft, cb,
1240						BLKIO_STAT_SERVICE_TIME, 1, 0);
1241		case BLKIO_PROP_io_wait_time:
1242			return blkio_read_blkg_stats(blkcg, cft, cb,
1243						BLKIO_STAT_WAIT_TIME, 1, 0);
1244		case BLKIO_PROP_io_merged:
1245			return blkio_read_blkg_stats(blkcg, cft, cb,
1246						BLKIO_STAT_CPU_MERGED, 1, 1);
1247		case BLKIO_PROP_io_queued:
1248			return blkio_read_blkg_stats(blkcg, cft, cb,
1249						BLKIO_STAT_QUEUED, 1, 0);
1250#ifdef CONFIG_DEBUG_BLK_CGROUP
1251		case BLKIO_PROP_unaccounted_time:
1252			return blkio_read_blkg_stats(blkcg, cft, cb,
1253					BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
1254		case BLKIO_PROP_dequeue:
1255			return blkio_read_blkg_stats(blkcg, cft, cb,
1256						BLKIO_STAT_DEQUEUE, 0, 0);
1257		case BLKIO_PROP_avg_queue_size:
1258			return blkio_read_blkg_stats(blkcg, cft, cb,
1259					BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
1260		case BLKIO_PROP_group_wait_time:
1261			return blkio_read_blkg_stats(blkcg, cft, cb,
1262					BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
1263		case BLKIO_PROP_idle_time:
1264			return blkio_read_blkg_stats(blkcg, cft, cb,
1265						BLKIO_STAT_IDLE_TIME, 0, 0);
1266		case BLKIO_PROP_empty_time:
1267			return blkio_read_blkg_stats(blkcg, cft, cb,
1268						BLKIO_STAT_EMPTY_TIME, 0, 0);
1269#endif
1270		default:
1271			BUG();
1272		}
1273		break;
1274	case BLKIO_POLICY_THROTL:
1275		switch (name) {
1276		case BLKIO_THROTL_io_service_bytes:
1277			return blkio_read_blkg_stats(blkcg, cft, cb,
1278						BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1279		case BLKIO_THROTL_io_serviced:
1280			return blkio_read_blkg_stats(blkcg, cft, cb,
1281						BLKIO_STAT_CPU_SERVICED, 1, 1);
1282		default:
1283			BUG();
1284		}
1285		break;
1286	default:
1287		BUG();
1288	}
1289
1290	return 0;
1291}
1292
1293static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val)
1294{
1295	struct blkio_group *blkg;
1296	struct hlist_node *n;
1297	struct blkio_policy_node *pn;
1298
1299	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
1300		return -EINVAL;
1301
1302	spin_lock(&blkio_list_lock);
1303	spin_lock_irq(&blkcg->lock);
1304	blkcg->weight = (unsigned int)val;
1305
1306	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1307		pn = blkio_policy_search_node(blkcg, blkg->dev,
1308				BLKIO_POLICY_PROP, BLKIO_PROP_weight_device);
1309		if (pn)
1310			continue;
1311
1312		blkio_update_group_weight(blkg, blkcg->weight);
1313	}
1314	spin_unlock_irq(&blkcg->lock);
1315	spin_unlock(&blkio_list_lock);
1316	return 0;
1317}
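
/*
 * Example: "echo 600 > blkio.weight" ends up here and updates every group
 * in the cgroup except those with a per-device override in
 * blkio.weight_device; those keep the weight stored in their policy node.
 */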
1318
1319static u64 blkiocg_file_read_u64(struct cgroup *cgrp, struct cftype *cft) {
1320	struct blkio_cgroup *blkcg;
1321	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1322	int name = BLKIOFILE_ATTR(cft->private);
1323
1324	blkcg = cgroup_to_blkio_cgroup(cgrp);
1325
1326	switch(plid) {
1327	case BLKIO_POLICY_PROP:
1328		switch(name) {
1329		case BLKIO_PROP_weight:
1330			return (u64)blkcg->weight;
1331		}
1332		break;
1333	default:
1334		BUG();
1335	}
1336	return 0;
1337}
1338
1339static int
1340blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1341{
1342	struct blkio_cgroup *blkcg;
1343	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1344	int name = BLKIOFILE_ATTR(cft->private);
1345
1346	blkcg = cgroup_to_blkio_cgroup(cgrp);
1347
1348	switch(plid) {
1349	case BLKIO_POLICY_PROP:
1350		switch(name) {
1351		case BLKIO_PROP_weight:
1352			return blkio_weight_write(blkcg, val);
1353		}
1354		break;
1355	default:
1356		BUG();
1357	}
1358
1359	return 0;
1360}
1361
1362struct cftype blkio_files[] = {
1363	{
1364		.name = "weight_device",
1365		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1366				BLKIO_PROP_weight_device),
1367		.read_seq_string = blkiocg_file_read,
1368		.write_string = blkiocg_file_write,
1369		.max_write_len = 256,
1370	},
1371	{
1372		.name = "weight",
1373		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1374				BLKIO_PROP_weight),
1375		.read_u64 = blkiocg_file_read_u64,
1376		.write_u64 = blkiocg_file_write_u64,
1377	},
1378	{
1379		.name = "time",
1380		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1381				BLKIO_PROP_time),
1382		.read_map = blkiocg_file_read_map,
1383	},
1384	{
1385		.name = "sectors",
1386		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1387				BLKIO_PROP_sectors),
1388		.read_map = blkiocg_file_read_map,
1389	},
1390	{
1391		.name = "io_service_bytes",
1392		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1393				BLKIO_PROP_io_service_bytes),
1394		.read_map = blkiocg_file_read_map,
1395	},
1396	{
1397		.name = "io_serviced",
1398		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1399				BLKIO_PROP_io_serviced),
1400		.read_map = blkiocg_file_read_map,
1401	},
1402	{
1403		.name = "io_service_time",
1404		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1405				BLKIO_PROP_io_service_time),
1406		.read_map = blkiocg_file_read_map,
1407	},
1408	{
1409		.name = "io_wait_time",
1410		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1411				BLKIO_PROP_io_wait_time),
1412		.read_map = blkiocg_file_read_map,
1413	},
1414	{
1415		.name = "io_merged",
1416		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1417				BLKIO_PROP_io_merged),
1418		.read_map = blkiocg_file_read_map,
1419	},
1420	{
1421		.name = "io_queued",
1422		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1423				BLKIO_PROP_io_queued),
1424		.read_map = blkiocg_file_read_map,
1425	},
1426	{
1427		.name = "reset_stats",
1428		.write_u64 = blkiocg_reset_stats,
1429	},
1430#ifdef CONFIG_BLK_DEV_THROTTLING
1431	{
1432		.name = "throttle.read_bps_device",
1433		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1434				BLKIO_THROTL_read_bps_device),
1435		.read_seq_string = blkiocg_file_read,
1436		.write_string = blkiocg_file_write,
1437		.max_write_len = 256,
1438	},
1439
1440	{
1441		.name = "throttle.write_bps_device",
1442		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1443				BLKIO_THROTL_write_bps_device),
1444		.read_seq_string = blkiocg_file_read,
1445		.write_string = blkiocg_file_write,
1446		.max_write_len = 256,
1447	},
1448
1449	{
1450		.name = "throttle.read_iops_device",
1451		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1452				BLKIO_THROTL_read_iops_device),
1453		.read_seq_string = blkiocg_file_read,
1454		.write_string = blkiocg_file_write,
1455		.max_write_len = 256,
1456	},
1457
1458	{
1459		.name = "throttle.write_iops_device",
1460		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1461				BLKIO_THROTL_write_iops_device),
1462		.read_seq_string = blkiocg_file_read,
1463		.write_string = blkiocg_file_write,
1464		.max_write_len = 256,
1465	},
1466	{
1467		.name = "throttle.io_service_bytes",
1468		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1469				BLKIO_THROTL_io_service_bytes),
1470		.read_map = blkiocg_file_read_map,
1471	},
1472	{
1473		.name = "throttle.io_serviced",
1474		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1475				BLKIO_THROTL_io_serviced),
1476		.read_map = blkiocg_file_read_map,
1477	},
1478#endif /* CONFIG_BLK_DEV_THROTTLING */
1479
1480#ifdef CONFIG_DEBUG_BLK_CGROUP
1481	{
1482		.name = "avg_queue_size",
1483		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1484				BLKIO_PROP_avg_queue_size),
1485		.read_map = blkiocg_file_read_map,
1486	},
1487	{
1488		.name = "group_wait_time",
1489		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1490				BLKIO_PROP_group_wait_time),
1491		.read_map = blkiocg_file_read_map,
1492	},
1493	{
1494		.name = "idle_time",
1495		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1496				BLKIO_PROP_idle_time),
1497		.read_map = blkiocg_file_read_map,
1498	},
1499	{
1500		.name = "empty_time",
1501		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1502				BLKIO_PROP_empty_time),
1503		.read_map = blkiocg_file_read_map,
1504	},
1505	{
1506		.name = "dequeue",
1507		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1508				BLKIO_PROP_dequeue),
1509		.read_map = blkiocg_file_read_map,
1510	},
1511	{
1512		.name = "unaccounted_time",
1513		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1514				BLKIO_PROP_unaccounted_time),
1515		.read_map = blkiocg_file_read_map,
1516	},
1517#endif
1518};
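
/*
 * These cftypes appear in each cgroup directory as blkio.weight,
 * blkio.weight_device, blkio.time, blkio.sectors, blkio.io_service_bytes,
 * blkio.io_serviced, blkio.io_service_time, blkio.io_wait_time,
 * blkio.io_merged, blkio.io_queued and blkio.reset_stats, plus the
 * blkio.throttle.* files when CONFIG_BLK_DEV_THROTTLING is enabled and the
 * debug files when CONFIG_DEBUG_BLK_CGROUP is set.
 */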
1519
1520static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1521{
1522	return cgroup_add_files(cgroup, subsys, blkio_files,
1523				ARRAY_SIZE(blkio_files));
1524}
1525
1526static void blkiocg_destroy(struct cgroup *cgroup)
1527{
1528	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
1529	unsigned long flags;
1530	struct blkio_group *blkg;
1531	void *key;
1532	struct blkio_policy_type *blkiop;
1533	struct blkio_policy_node *pn, *pntmp;
1534
1535	rcu_read_lock();
1536	do {
1537		spin_lock_irqsave(&blkcg->lock, flags);
1538
1539		if (hlist_empty(&blkcg->blkg_list)) {
1540			spin_unlock_irqrestore(&blkcg->lock, flags);
1541			break;
1542		}
1543
1544		blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
1545					blkcg_node);
1546		key = rcu_dereference(blkg->key);
1547		__blkiocg_del_blkio_group(blkg);
1548
1549		spin_unlock_irqrestore(&blkcg->lock, flags);
1550
1551		/*
1552		 * This blkio_group is being unlinked as the associated cgroup
1553		 * is going away. Let all the IO controlling policies know about
1554		 * this event.
1555		 */
1556		spin_lock(&blkio_list_lock);
1557		list_for_each_entry(blkiop, &blkio_list, list) {
1558			if (blkiop->plid != blkg->plid)
1559				continue;
1560			blkiop->ops.blkio_unlink_group_fn(key, blkg);
1561		}
1562		spin_unlock(&blkio_list_lock);
1563	} while (1);
1564
1565	list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
1566		blkio_policy_delete_node(pn);
1567		kfree(pn);
1568	}
1569
1570	free_css_id(&blkio_subsys, &blkcg->css);
1571	rcu_read_unlock();
1572	if (blkcg != &blkio_root_cgroup)
1573		kfree(blkcg);
1574}
1575
1576static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup)
1577{
1578	struct blkio_cgroup *blkcg;
1579	struct cgroup *parent = cgroup->parent;
1580
1581	if (!parent) {
1582		blkcg = &blkio_root_cgroup;
1583		goto done;
1584	}
1585
1586	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
1587	if (!blkcg)
1588		return ERR_PTR(-ENOMEM);
1589
1590	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
1591done:
1592	spin_lock_init(&blkcg->lock);
1593	INIT_HLIST_HEAD(&blkcg->blkg_list);
1594
1595	INIT_LIST_HEAD(&blkcg->policy_list);
1596	return &blkcg->css;
1597}
1598
1599/*
1600 * We cannot support shared io contexts, as we have no means to support
1601 * two tasks with the same ioc in two different groups without major rework
1602 * of the main cic data structures.  For now we allow a task to change
1603 * its cgroup only if it's the only owner of its ioc.
1604 */
1605static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1606{
1607	struct task_struct *task;
1608	struct io_context *ioc;
1609	int ret = 0;
1610
1611	/* task_lock() is needed to avoid races with exit_io_context() */
1612	cgroup_taskset_for_each(task, cgrp, tset) {
1613		task_lock(task);
1614		ioc = task->io_context;
1615		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
1616			ret = -EINVAL;
1617		task_unlock(task);
1618		if (ret)
1619			break;
1620	}
1621	return ret;
1622}
1623
1624static void blkiocg_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1625{
1626	struct task_struct *task;
1627	struct io_context *ioc;
1628
1629	cgroup_taskset_for_each(task, cgrp, tset) {
1630		/* we don't lose anything even if ioc allocation fails */
1631		ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
1632		if (ioc) {
1633			ioc_cgroup_changed(ioc);
1634			put_io_context(ioc);
1635		}
1636	}
1637}
1638
1639struct cgroup_subsys blkio_subsys = {
1640	.name = "blkio",
1641	.create = blkiocg_create,
1642	.can_attach = blkiocg_can_attach,
1643	.attach = blkiocg_attach,
1644	.destroy = blkiocg_destroy,
1645	.populate = blkiocg_populate,
1646#ifdef CONFIG_BLK_CGROUP
1647	/* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
1648	.subsys_id = blkio_subsys_id,
1649#endif
1650	.use_id = 1,
1651	.module = THIS_MODULE,
1652};
1653EXPORT_SYMBOL_GPL(blkio_subsys);
1654
1655void blkio_policy_register(struct blkio_policy_type *blkiop)
1656{
1657	spin_lock(&blkio_list_lock);
1658	list_add_tail(&blkiop->list, &blkio_list);
1659	spin_unlock(&blkio_list_lock);
1660}
1661EXPORT_SYMBOL_GPL(blkio_policy_register);
1662
1663void blkio_policy_unregister(struct blkio_policy_type *blkiop)
1664{
1665	spin_lock(&blkio_list_lock);
1666	list_del_init(&blkiop->list);
1667	spin_unlock(&blkio_list_lock);
1668}
1669EXPORT_SYMBOL_GPL(blkio_policy_unregister);
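
/*
 * A policy module (CFQ's group scheduling or blk-throttle, for instance)
 * hooks into this framework roughly like so (sketch, names illustrative):
 *
 *	static struct blkio_policy_type blkio_policy_example = {
 *		.ops = {
 *			.blkio_unlink_group_fn = example_unlink_group,
 *			.blkio_update_group_weight_fn = example_update_weight,
 *		},
 *		.plid = BLKIO_POLICY_PROP,
 *	};
 *
 *	blkio_policy_register(&blkio_policy_example);
 *	...
 *	blkio_policy_unregister(&blkio_policy_example);
 */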
1670
1671static int __init init_cgroup_blkio(void)
1672{
1673	return cgroup_load_subsys(&blkio_subsys);
1674}
1675
1676static void __exit exit_cgroup_blkio(void)
1677{
1678	cgroup_unload_subsys(&blkio_subsys);
1679}
1680
1681module_init(init_cgroup_blkio);
1682module_exit(exit_cgroup_blkio);
1683MODULE_LICENSE("GPL");
1684