/*
 *  drivers/cpufreq/cpufreq_ondemand.c
 *
 *  Copyright (C)  2001 Russell King
 *            (C)  2003 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>.
 *                      Jun Nakajima <jun.nakajima@intel.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/cpufreq.h>
#include <linux/cpu.h>
#include <linux/jiffies.h>
#include <linux/kernel_stat.h>
#include <linux/mutex.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/ktime.h>
#include <linux/sched.h>

/*
 * dbs is used in this file as a short form for demand-based switching.
 * It helps to keep variable names shorter and simpler.
 */

#define DEF_FREQUENCY_DOWN_DIFFERENTIAL		(10)
#define DEF_FREQUENCY_UP_THRESHOLD		(80)
#define MICRO_FREQUENCY_DOWN_DIFFERENTIAL	(3)
#define MICRO_FREQUENCY_UP_THRESHOLD		(95)
#define MICRO_FREQUENCY_MIN_SAMPLE_RATE		(10000)
#define MIN_FREQUENCY_UP_THRESHOLD		(11)
#define MAX_FREQUENCY_UP_THRESHOLD		(100)

/*
 * The polling frequency of this governor depends on the capability of
 * the processor. The default polling frequency is 1000 times the transition
 * latency of the processor. The governor will work on any processor with
 * transition latency <= 10 ms, using an appropriate sampling rate.
 * For CPUs with transition latency > 10 ms (mostly drivers with
 * CPUFREQ_ETERNAL), this governor will not work.
 * All times here are in us.
 */
#define MIN_SAMPLING_RATE_RATIO			(2)

static unsigned int min_sampling_rate;

#define LATENCY_MULTIPLIER			(1000)
#define MIN_LATENCY_MULTIPLIER			(100)
#define TRANSITION_LATENCY_LIMIT		(10 * 1000 * 1000)
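
/*
 * Worked example (assumed numbers): for a driver that reports a transition
 * latency of 10,000 ns, cpufreq_governor_dbs() converts this to 10 us and
 * picks a default sampling_rate of 10 us * LATENCY_MULTIPLIER = 10,000 us,
 * but never less than min_sampling_rate, which is itself bumped to at least
 * MIN_LATENCY_MULTIPLIER * 10 us = 1,000 us.
 */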

static void do_dbs_timer(struct work_struct *work);
static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
				unsigned int event);

#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
static
#endif
struct cpufreq_governor cpufreq_gov_ondemand = {
       .name                   = "ondemand",
       .governor               = cpufreq_governor_dbs,
       .max_transition_latency = TRANSITION_LATENCY_LIMIT,
       .owner                  = THIS_MODULE,
};

/* Sampling types */
enum {DBS_NORMAL_SAMPLE, DBS_SUB_SAMPLE};

struct cpu_dbs_info_s {
	cputime64_t prev_cpu_idle;
	cputime64_t prev_cpu_iowait;
	cputime64_t prev_cpu_wall;
	cputime64_t prev_cpu_nice;
	struct cpufreq_policy *cur_policy;
	struct delayed_work work;
	struct cpufreq_frequency_table *freq_table;
	unsigned int freq_lo;
	unsigned int freq_lo_jiffies;
	unsigned int freq_hi_jiffies;
	int cpu;
	unsigned int sample_type:1;
	/*
	 * Per-CPU mutex that serializes governor limit changes with
	 * do_dbs_timer invocation. We do not want do_dbs_timer to run
	 * when the user is changing the governor or limits.
	 */
	struct mutex timer_mutex;
};
static DEFINE_PER_CPU(struct cpu_dbs_info_s, od_cpu_dbs_info);

static unsigned int dbs_enable;	/* number of CPUs using this governor */

/*
 * dbs_mutex protects data in dbs_tuners_ins from concurrent changes on
 * different CPUs. It protects dbs_enable in governor start/stop.
 */
static DEFINE_MUTEX(dbs_mutex);

static struct workqueue_struct	*kondemand_wq;

static struct dbs_tuners {
	unsigned int sampling_rate;
	unsigned int up_threshold;
	unsigned int down_differential;
	unsigned int ignore_nice;
	unsigned int powersave_bias;
} dbs_tuners_ins = {
	.up_threshold = DEF_FREQUENCY_UP_THRESHOLD,
	.down_differential = DEF_FREQUENCY_DOWN_DIFFERENTIAL,
	.ignore_nice = 0,
	.powersave_bias = 0,
};
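
/*
 * Note: sampling_rate is deliberately left unset here; it is derived from
 * the driver's transition latency (and min_sampling_rate) the first time
 * the governor is started, in cpufreq_governor_dbs(CPUFREQ_GOV_START).
 */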

static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu,
							cputime64_t *wall)
{
	cputime64_t idle_time;
	cputime64_t cur_wall_time;
	cputime64_t busy_time;

	cur_wall_time = jiffies64_to_cputime64(get_jiffies_64());
	busy_time = cputime64_add(kstat_cpu(cpu).cpustat.user,
			kstat_cpu(cpu).cpustat.system);

	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.irq);
	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.softirq);
	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.steal);
	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.nice);

	idle_time = cputime64_sub(cur_wall_time, busy_time);
	if (wall)
		*wall = (cputime64_t)jiffies_to_usecs(cur_wall_time);

	return (cputime64_t)jiffies_to_usecs(idle_time);
}

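/*
 * Prefer the NO_HZ micro-accounted idle time; get_cpu_idle_time_us()
 * returns -1ULL when that accounting is not available, in which case we
 * fall back to the coarser jiffy-based estimate above.
 */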
static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
{
	u64 idle_time = get_cpu_idle_time_us(cpu, wall);

	if (idle_time == -1ULL)
		return get_cpu_idle_time_jiffy(cpu, wall);

	return idle_time;
}

static inline cputime64_t get_cpu_iowait_time(unsigned int cpu, cputime64_t *wall)
{
	u64 iowait_time = get_cpu_iowait_time_us(cpu, wall);

	if (iowait_time == -1ULL)
		return 0;

	return iowait_time;
}

/*
 * Find the right frequency to set now with powersave_bias on.
 * Returns the freq_hi to be used right now and sets freq_hi_jiffies,
 * freq_lo, and freq_lo_jiffies in the per-CPU area for averaging freqs.
 */
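/*
 * Worked example (assumed numbers): with powersave_bias = 100 (i.e. 10%)
 * and a requested frequency of 2,000,000 kHz, freq_avg = 1,800,000 kHz.
 * If the nearest table entries are 1,600,000 and 2,000,000 kHz, then
 * freq_lo = 1,600,000, freq_hi = 2,000,000, and jiffies_hi works out to
 * half the sampling period, so the CPU averages 1,800,000 kHz by
 * alternating between the two table frequencies.
 */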
static unsigned int powersave_bias_target(struct cpufreq_policy *policy,
					  unsigned int freq_next,
					  unsigned int relation)
{
	unsigned int freq_req, freq_reduc, freq_avg;
	unsigned int freq_hi, freq_lo;
	unsigned int index = 0;
	unsigned int jiffies_total, jiffies_hi, jiffies_lo;
	struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info,
						   policy->cpu);

	if (!dbs_info->freq_table) {
		dbs_info->freq_lo = 0;
		dbs_info->freq_lo_jiffies = 0;
		return freq_next;
	}

	cpufreq_frequency_table_target(policy, dbs_info->freq_table, freq_next,
			relation, &index);
	freq_req = dbs_info->freq_table[index].frequency;
	freq_reduc = freq_req * dbs_tuners_ins.powersave_bias / 1000;
	freq_avg = freq_req - freq_reduc;

	/* Find freq bounds for freq_avg in freq_table */
	index = 0;
	cpufreq_frequency_table_target(policy, dbs_info->freq_table, freq_avg,
			CPUFREQ_RELATION_H, &index);
	freq_lo = dbs_info->freq_table[index].frequency;
	index = 0;
	cpufreq_frequency_table_target(policy, dbs_info->freq_table, freq_avg,
			CPUFREQ_RELATION_L, &index);
	freq_hi = dbs_info->freq_table[index].frequency;

	/* Find out how long we have to be in hi and lo freqs */
	if (freq_hi == freq_lo) {
		dbs_info->freq_lo = 0;
		dbs_info->freq_lo_jiffies = 0;
		return freq_lo;
	}
	jiffies_total = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
	jiffies_hi = (freq_avg - freq_lo) * jiffies_total;
	jiffies_hi += ((freq_hi - freq_lo) / 2);
	jiffies_hi /= (freq_hi - freq_lo);
	jiffies_lo = jiffies_total - jiffies_hi;
	dbs_info->freq_lo = freq_lo;
	dbs_info->freq_lo_jiffies = jiffies_lo;
	dbs_info->freq_hi_jiffies = jiffies_hi;
	return freq_hi;
}

static void ondemand_powersave_bias_init_cpu(int cpu)
{
	struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
	dbs_info->freq_table = cpufreq_frequency_get_table(cpu);
	dbs_info->freq_lo = 0;
}

static void ondemand_powersave_bias_init(void)
{
	int i;
	for_each_online_cpu(i) {
		ondemand_powersave_bias_init_cpu(i);
	}
}

/************************** sysfs interface ************************/

static ssize_t show_sampling_rate_max(struct kobject *kobj,
				      struct attribute *attr, char *buf)
{
	printk_once(KERN_INFO "CPUFREQ: ondemand sampling_rate_max "
	       "sysfs file is deprecated - used by: %s\n", current->comm);
	return sprintf(buf, "%u\n", -1U);
}

static ssize_t show_sampling_rate_min(struct kobject *kobj,
				      struct attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", min_sampling_rate);
}

#define define_one_ro(_name)		\
static struct global_attr _name =	\
__ATTR(_name, 0444, show_##_name, NULL)

define_one_ro(sampling_rate_max);
define_one_ro(sampling_rate_min);

/* cpufreq_ondemand Governor Tunables */
#define show_one(file_name, object)					\
static ssize_t show_##file_name						\
(struct kobject *kobj, struct attribute *attr, char *buf)              \
{									\
	return sprintf(buf, "%u\n", dbs_tuners_ins.object);		\
}
show_one(sampling_rate, sampling_rate);
show_one(up_threshold, up_threshold);
show_one(ignore_nice_load, ignore_nice);
show_one(powersave_bias, powersave_bias);

/*** delete after deprecation time ***/

#define DEPRECATION_MSG(file_name)					\
	printk_once(KERN_INFO "CPUFREQ: Per core ondemand sysfs "	\
		    "interface is deprecated - " #file_name "\n");

#define show_one_old(file_name)						\
static ssize_t show_##file_name##_old					\
(struct cpufreq_policy *unused, char *buf)				\
{									\
	printk_once(KERN_INFO "CPUFREQ: Per core ondemand sysfs "	\
		    "interface is deprecated - " #file_name "\n");	\
	return show_##file_name(NULL, NULL, buf);			\
}
show_one_old(sampling_rate);
show_one_old(up_threshold);
show_one_old(ignore_nice_load);
show_one_old(powersave_bias);
show_one_old(sampling_rate_min);
show_one_old(sampling_rate_max);

#define define_one_ro_old(object, _name)       \
static struct freq_attr object =               \
__ATTR(_name, 0444, show_##_name##_old, NULL)

define_one_ro_old(sampling_rate_min_old, sampling_rate_min);
define_one_ro_old(sampling_rate_max_old, sampling_rate_max);

/*** delete after deprecation time ***/

static ssize_t store_sampling_rate(struct kobject *a, struct attribute *b,
				   const char *buf, size_t count)
{
	unsigned int input;
	int ret;
	ret = sscanf(buf, "%u", &input);
	if (ret != 1)
		return -EINVAL;

	mutex_lock(&dbs_mutex);
	dbs_tuners_ins.sampling_rate = max(input, min_sampling_rate);
	mutex_unlock(&dbs_mutex);

	return count;
}

static ssize_t store_up_threshold(struct kobject *a, struct attribute *b,
				  const char *buf, size_t count)
{
	unsigned int input;
	int ret;
	ret = sscanf(buf, "%u", &input);

	if (ret != 1 || input > MAX_FREQUENCY_UP_THRESHOLD ||
			input < MIN_FREQUENCY_UP_THRESHOLD) {
		return -EINVAL;
	}

	mutex_lock(&dbs_mutex);
	dbs_tuners_ins.up_threshold = input;
	mutex_unlock(&dbs_mutex);

	return count;
}

static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b,
				      const char *buf, size_t count)
{
	unsigned int input;
	int ret;

	unsigned int j;

	ret = sscanf(buf, "%u", &input);
	if (ret != 1)
		return -EINVAL;

	if (input > 1)
		input = 1;

	mutex_lock(&dbs_mutex);
	if (input == dbs_tuners_ins.ignore_nice) { /* nothing to do */
		mutex_unlock(&dbs_mutex);
		return count;
	}
	dbs_tuners_ins.ignore_nice = input;

	/* we need to re-evaluate prev_cpu_idle */
	for_each_online_cpu(j) {
		struct cpu_dbs_info_s *dbs_info;
		dbs_info = &per_cpu(od_cpu_dbs_info, j);
		dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
						&dbs_info->prev_cpu_wall);
		if (dbs_tuners_ins.ignore_nice)
			dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;

	}
	mutex_unlock(&dbs_mutex);

	return count;
}

static ssize_t store_powersave_bias(struct kobject *a, struct attribute *b,
				    const char *buf, size_t count)
{
	unsigned int input;
	int ret;
	ret = sscanf(buf, "%u", &input);

	if (ret != 1)
		return -EINVAL;

	if (input > 1000)
		input = 1000;

	mutex_lock(&dbs_mutex);
	dbs_tuners_ins.powersave_bias = input;
	ondemand_powersave_bias_init();
	mutex_unlock(&dbs_mutex);

	return count;
}

#define define_one_rw(_name) \
static struct global_attr _name = \
__ATTR(_name, 0644, show_##_name, store_##_name)

define_one_rw(sampling_rate);
define_one_rw(up_threshold);
define_one_rw(ignore_nice_load);
define_one_rw(powersave_bias);

static struct attribute *dbs_attributes[] = {
	&sampling_rate_max.attr,
	&sampling_rate_min.attr,
	&sampling_rate.attr,
	&up_threshold.attr,
	&ignore_nice_load.attr,
	&powersave_bias.attr,
	NULL
};

static struct attribute_group dbs_attr_group = {
	.attrs = dbs_attributes,
	.name = "ondemand",
};

/*** delete after deprecation time ***/

#define write_one_old(file_name)					\
static ssize_t store_##file_name##_old					\
(struct cpufreq_policy *unused, const char *buf, size_t count)		\
{									\
       printk_once(KERN_INFO "CPUFREQ: Per core ondemand sysfs "	\
		   "interface is deprecated - " #file_name "\n");	\
       return store_##file_name(NULL, NULL, buf, count);		\
}
write_one_old(sampling_rate);
write_one_old(up_threshold);
write_one_old(ignore_nice_load);
write_one_old(powersave_bias);

#define define_one_rw_old(object, _name)       \
static struct freq_attr object =               \
__ATTR(_name, 0644, show_##_name##_old, store_##_name##_old)

define_one_rw_old(sampling_rate_old, sampling_rate);
define_one_rw_old(up_threshold_old, up_threshold);
define_one_rw_old(ignore_nice_load_old, ignore_nice_load);
define_one_rw_old(powersave_bias_old, powersave_bias);

static struct attribute *dbs_attributes_old[] = {
       &sampling_rate_max_old.attr,
       &sampling_rate_min_old.attr,
       &sampling_rate_old.attr,
       &up_threshold_old.attr,
       &ignore_nice_load_old.attr,
       &powersave_bias_old.attr,
       NULL
};

static struct attribute_group dbs_attr_group_old = {
       .attrs = dbs_attributes_old,
       .name = "ondemand",
};

/*** delete after deprecation time ***/

/************************** sysfs end ************************/
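
/*
 * With the usual sysfs layout, these tunables typically appear globally
 * under /sys/devices/system/cpu/cpufreq/ondemand/ (dbs_attr_group on
 * cpufreq_global_kobject) and, for the deprecated per-core interface,
 * under each policy's cpufreq/ondemand/ directory (dbs_attr_group_old).
 */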

static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
{
	unsigned int max_load_freq;

	struct cpufreq_policy *policy;
	unsigned int j;

	this_dbs_info->freq_lo = 0;
	policy = this_dbs_info->cur_policy;

	/*
	 * Every sampling_rate we check whether the current idle time is less
	 * than 20% (default); if it is, we try to increase the frequency.
	 * Every sampling_rate we also look for the lowest frequency which can
	 * sustain the load while keeping the idle time over 30%. If such a
	 * frequency exists, we try to decrease to it.
	 *
	 * Any frequency increase takes it to the maximum frequency.
	 * Frequency reduction happens in minimum steps of
	 * 5% (default) of the current frequency.
	 */
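
	/*
	 * Worked example with the defaults (up_threshold = 80,
	 * down_differential = 10): a CPU that was busy for more than 80% of
	 * the last sampling period (scaled by frequency, i.e. max_load_freq >
	 * 80 * policy->cur) is bumped to policy->max (or the powersave_bias
	 * target); the frequency is only lowered again once max_load_freq
	 * falls below 70 * policy->cur, and then to roughly
	 * max_load_freq / 70.
	 */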

	/* Get Absolute Load - in terms of freq */
	max_load_freq = 0;

	for_each_cpu(j, policy->cpus) {
		struct cpu_dbs_info_s *j_dbs_info;
		cputime64_t cur_wall_time, cur_idle_time, cur_iowait_time;
		unsigned int idle_time, wall_time, iowait_time;
		unsigned int load, load_freq;
		int freq_avg;

		j_dbs_info = &per_cpu(od_cpu_dbs_info, j);

		cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);
		cur_iowait_time = get_cpu_iowait_time(j, &cur_wall_time);

		wall_time = (unsigned int) cputime64_sub(cur_wall_time,
				j_dbs_info->prev_cpu_wall);
		j_dbs_info->prev_cpu_wall = cur_wall_time;

		idle_time = (unsigned int) cputime64_sub(cur_idle_time,
				j_dbs_info->prev_cpu_idle);
		j_dbs_info->prev_cpu_idle = cur_idle_time;

		iowait_time = (unsigned int) cputime64_sub(cur_iowait_time,
				j_dbs_info->prev_cpu_iowait);
		j_dbs_info->prev_cpu_iowait = cur_iowait_time;

		if (dbs_tuners_ins.ignore_nice) {
			cputime64_t cur_nice;
			unsigned long cur_nice_jiffies;

			cur_nice = cputime64_sub(kstat_cpu(j).cpustat.nice,
					 j_dbs_info->prev_cpu_nice);
			/*
			 * Assumption: nice time between sampling periods will
			 * be less than 2^32 jiffies on a 32-bit system
			 */
			cur_nice_jiffies = (unsigned long)
					cputime64_to_jiffies64(cur_nice);

			j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
			idle_time += jiffies_to_usecs(cur_nice_jiffies);
		}

		/*
		 * For the purpose of ondemand, waiting for disk IO is an
		 * indication that the task is performance-critical, not that
		 * the system is actually idle. So subtract the iowait time
		 * from the cpu idle time.
		 */

		if (idle_time >= iowait_time)
			idle_time -= iowait_time;

		if (unlikely(!wall_time || wall_time < idle_time))
			continue;

		load = 100 * (wall_time - idle_time) / wall_time;

		freq_avg = __cpufreq_driver_getavg(policy, j);
		if (freq_avg <= 0)
			freq_avg = policy->cur;

		load_freq = load * freq_avg;
		if (load_freq > max_load_freq)
			max_load_freq = load_freq;
	}
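
	/*
	 * Example (assumed numbers): over a 100,000 us sampling period with
	 * 25,000 us of (non-iowait) idle time, load = 75. With freq_avg =
	 * 1,600,000 kHz this gives load_freq = 120,000,000, which is then
	 * compared against the thresholds below scaled by policy->cur.
	 */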

	/* Check for frequency increase */
	if (max_load_freq > dbs_tuners_ins.up_threshold * policy->cur) {
		/* if we are already at full speed then break out early */
		if (!dbs_tuners_ins.powersave_bias) {
			if (policy->cur == policy->max)
				return;

			__cpufreq_driver_target(policy, policy->max,
				CPUFREQ_RELATION_H);
		} else {
			int freq = powersave_bias_target(policy, policy->max,
					CPUFREQ_RELATION_H);
			__cpufreq_driver_target(policy, freq,
				CPUFREQ_RELATION_L);
		}
		return;
	}

	/* Check for frequency decrease */
	/* if we cannot reduce the frequency anymore, break out early */
	if (policy->cur == policy->min)
		return;

	/*
	 * The optimal frequency is the lowest frequency that can support the
	 * current CPU usage without triggering the up policy. To be safe,
	 * we aim 10 points under the threshold.
	 */
	if (max_load_freq <
	    (dbs_tuners_ins.up_threshold - dbs_tuners_ins.down_differential) *
	     policy->cur) {
		unsigned int freq_next;
		freq_next = max_load_freq /
				(dbs_tuners_ins.up_threshold -
				 dbs_tuners_ins.down_differential);

		if (freq_next < policy->min)
			freq_next = policy->min;

		if (!dbs_tuners_ins.powersave_bias) {
			__cpufreq_driver_target(policy, freq_next,
					CPUFREQ_RELATION_L);
		} else {
			int freq = powersave_bias_target(policy, freq_next,
					CPUFREQ_RELATION_L);
			__cpufreq_driver_target(policy, freq,
				CPUFREQ_RELATION_L);
		}
	}
}

static void do_dbs_timer(struct work_struct *work)
{
	struct cpu_dbs_info_s *dbs_info =
		container_of(work, struct cpu_dbs_info_s, work.work);
	unsigned int cpu = dbs_info->cpu;
	int sample_type = dbs_info->sample_type;

	/* We want all CPUs to do sampling nearly on the same jiffy */
	int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);

	delay -= jiffies % delay;
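
	/*
	 * Example (assuming HZ = 1000 and sampling_rate = 10,000 us): delay
	 * starts out as 10 jiffies; subtracting jiffies % delay shortens it
	 * so the work expires on (roughly) a multiple of 10 jiffies, which
	 * keeps the per-CPU timers in phase with each other.
	 */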
	mutex_lock(&dbs_info->timer_mutex);

	/* Common NORMAL_SAMPLE setup */
	dbs_info->sample_type = DBS_NORMAL_SAMPLE;
	if (!dbs_tuners_ins.powersave_bias ||
	    sample_type == DBS_NORMAL_SAMPLE) {
		dbs_check_cpu(dbs_info);
		if (dbs_info->freq_lo) {
			/* Setup timer for SUB_SAMPLE */
			dbs_info->sample_type = DBS_SUB_SAMPLE;
			delay = dbs_info->freq_hi_jiffies;
		}
	} else {
		__cpufreq_driver_target(dbs_info->cur_policy,
			dbs_info->freq_lo, CPUFREQ_RELATION_H);
	}
	queue_delayed_work_on(cpu, kondemand_wq, &dbs_info->work, delay);
	mutex_unlock(&dbs_info->timer_mutex);
}

static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
{
	/* We want all CPUs to do sampling nearly on the same jiffy */
	int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
	delay -= jiffies % delay;

	dbs_info->sample_type = DBS_NORMAL_SAMPLE;
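	/*
	 * The work item is deferrable, so an otherwise idle CPU is not woken
	 * up just to take a sample; see the nohz note in
	 * cpufreq_gov_dbs_init().
	 */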
	INIT_DELAYED_WORK_DEFERRABLE(&dbs_info->work, do_dbs_timer);
	queue_delayed_work_on(dbs_info->cpu, kondemand_wq, &dbs_info->work,
		delay);
}

static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info)
{
	cancel_delayed_work_sync(&dbs_info->work);
}

static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
				   unsigned int event)
{
	unsigned int cpu = policy->cpu;
	struct cpu_dbs_info_s *this_dbs_info;
	unsigned int j;
	int rc;

	this_dbs_info = &per_cpu(od_cpu_dbs_info, cpu);

	switch (event) {
	case CPUFREQ_GOV_START:
		if ((!cpu_online(cpu)) || (!policy->cur))
			return -EINVAL;

		mutex_lock(&dbs_mutex);

		rc = sysfs_create_group(&policy->kobj, &dbs_attr_group_old);
		if (rc) {
			mutex_unlock(&dbs_mutex);
			return rc;
		}

		dbs_enable++;
		for_each_cpu(j, policy->cpus) {
			struct cpu_dbs_info_s *j_dbs_info;
			j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
			j_dbs_info->cur_policy = policy;

			j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
						&j_dbs_info->prev_cpu_wall);
			if (dbs_tuners_ins.ignore_nice) {
				j_dbs_info->prev_cpu_nice =
						kstat_cpu(j).cpustat.nice;
			}
		}
		this_dbs_info->cpu = cpu;
		ondemand_powersave_bias_init_cpu(cpu);
		/*
		 * Start the timer schedule work when this governor
		 * is used for the first time.
		 */
		if (dbs_enable == 1) {
			unsigned int latency;

			rc = sysfs_create_group(cpufreq_global_kobject,
						&dbs_attr_group);
			if (rc) {
				mutex_unlock(&dbs_mutex);
				return rc;
			}

			/* policy latency is in ns; convert it to us first */
			latency = policy->cpuinfo.transition_latency / 1000;
			if (latency == 0)
				latency = 1;
			/* Bring kernel and HW constraints together */
			min_sampling_rate = max(min_sampling_rate,
					MIN_LATENCY_MULTIPLIER * latency);
			dbs_tuners_ins.sampling_rate =
				max(min_sampling_rate,
				    latency * LATENCY_MULTIPLIER);
		}
		mutex_unlock(&dbs_mutex);

		mutex_init(&this_dbs_info->timer_mutex);
		dbs_timer_init(this_dbs_info);
		break;

	case CPUFREQ_GOV_STOP:
		dbs_timer_exit(this_dbs_info);

		mutex_lock(&dbs_mutex);
		sysfs_remove_group(&policy->kobj, &dbs_attr_group_old);
		mutex_destroy(&this_dbs_info->timer_mutex);
		dbs_enable--;
		mutex_unlock(&dbs_mutex);
		if (!dbs_enable)
			sysfs_remove_group(cpufreq_global_kobject,
					   &dbs_attr_group);

		break;

	case CPUFREQ_GOV_LIMITS:
		mutex_lock(&this_dbs_info->timer_mutex);
		if (policy->max < this_dbs_info->cur_policy->cur)
			__cpufreq_driver_target(this_dbs_info->cur_policy,
				policy->max, CPUFREQ_RELATION_H);
		else if (policy->min > this_dbs_info->cur_policy->cur)
			__cpufreq_driver_target(this_dbs_info->cur_policy,
				policy->min, CPUFREQ_RELATION_L);
		mutex_unlock(&this_dbs_info->timer_mutex);
		break;
	}
	return 0;
}

static int __init cpufreq_gov_dbs_init(void)
{
	int err;
	cputime64_t wall;
	u64 idle_time;
	int cpu = get_cpu();

	idle_time = get_cpu_idle_time_us(cpu, &wall);
	put_cpu();
	if (idle_time != -1ULL) {
		/* Idle micro accounting is supported. Use finer thresholds */
		dbs_tuners_ins.up_threshold = MICRO_FREQUENCY_UP_THRESHOLD;
		dbs_tuners_ins.down_differential =
					MICRO_FREQUENCY_DOWN_DIFFERENTIAL;
		/*
		 * In the nohz/micro-accounting case we set the minimum
		 * sampling rate to a fixed (very low) value that does not
		 * depend on HZ. The deferrable timer may skip some samples
		 * while the CPU is idle/sleeping, as intended.
		 */
		min_sampling_rate = MICRO_FREQUENCY_MIN_SAMPLE_RATE;
	} else {
		/* For correct statistics, we need 10 ticks for each measurement */
		min_sampling_rate =
			MIN_SAMPLING_RATE_RATIO * jiffies_to_usecs(10);
	}
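
	/*
	 * Example (HZ is configuration dependent): with HZ = 250,
	 * jiffies_to_usecs(10) = 40,000 us, so min_sampling_rate becomes
	 * 2 * 40,000 = 80,000 us; with HZ = 1000 it is 20,000 us.
	 */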

	kondemand_wq = create_workqueue("kondemand");
	if (!kondemand_wq) {
		printk(KERN_ERR "Creation of kondemand failed\n");
		return -EFAULT;
	}
	err = cpufreq_register_governor(&cpufreq_gov_ondemand);
	if (err)
		destroy_workqueue(kondemand_wq);

	return err;
}

static void __exit cpufreq_gov_dbs_exit(void)
{
	cpufreq_unregister_governor(&cpufreq_gov_ondemand);
	destroy_workqueue(kondemand_wq);
}

MODULE_AUTHOR("Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>");
MODULE_AUTHOR("Alexey Starikovskiy <alexey.y.starikovskiy@intel.com>");
MODULE_DESCRIPTION("'cpufreq_ondemand' - A dynamic cpufreq governor for "
	"Low Latency Frequency Transition capable processors");
MODULE_LICENSE("GPL");

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
fs_initcall(cpufreq_gov_dbs_init);
#else
module_init(cpufreq_gov_dbs_init);
#endif
module_exit(cpufreq_gov_dbs_exit);