/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 *
 *
 *	TODO:
 *           1. better handle wakeups from external interrupts. currently a
 *              fixed compensation is added to the clamping duration when an
 *              excessive number of wakeups is observed during idle time. the
 *              reason is that for external interrupts which need no ack,
 *              clamping down the cpu in non-irq context does not reduce the
 *              irq rate. for the majority of cases, clamping down the cpu
 *              does help reduce irqs as well, so we should be able to
 *              differentiate the two cases and give a quantitative solution
 *              for the irqs that we can control. perhaps based on
 *              get_cpu_iowait_time_us()
 *
 *	     2. synchronization with other hw blocks
 *
 *
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/idle.h>
#include <asm/hardirq.h>

#define MAX_TARGET_RATIO (50U)
/* For each undisturbed clamping period (no extra wakeups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level at which runtime calibration results are
 * considered valid.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration; the driver adjusts the sleep time to meet
 * the target idle ratio, similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)

static unsigned int target_mwait;
static struct dentry *debug_dir;

/* user selected target */
static unsigned int set_target_ratio;
static unsigned int current_ratio;
static bool should_skip;
static bool reduce_irq;
static atomic_t idle_wakeup_counter;
static unsigned int control_cpu; /* The cpu assigned to collect stats and
				  * update control parameters. defaults to the
				  * BSP, but the BSP can be offlined.
				  */
static bool clamping;


static struct task_struct * __percpu *powerclamp_thread;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask;  /* bitmap for tracking per-cpu
					   * clamping threads
					   */

static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;

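/*
 * Module parameter setter for the idle injection duration. Values outside
 * the recommended 6-25ms range return -EINVAL; the stored duration is
 * clamped to that range regardless.
 */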
static int duration_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_duration;

	ret = kstrtoul(arg, 10, &new_duration);
	if (ret)
		goto exit;
	if (new_duration > 25 || new_duration < 6) {
		pr_err("Out of recommended range %lu, between 6-25ms\n",
			new_duration);
		ret = -EINVAL;
	}

	duration = clamp(new_duration, 6ul, 25ul);
	smp_mb();

exit:

	return ret;
}

static struct kernel_param_ops duration_ops = {
	.set = duration_set,
	.get = param_get_int,
};


module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");

struct powerclamp_calibration_data {
	unsigned long confidence;  /* used for calibration: a counter that
				    * gets incremented each time a clamping
				    * period completes without extra wakeups.
				    * once the counter reaches a given level,
				    * the compensation is deemed usable.
				    */
	unsigned long steady_comp; /* steady state compensation used when
				    * no extra wakeups occurred.
				    */
	unsigned long dynamic_comp; /* compensation for excessive wakeups from
				     * idle, mostly from external interrupts.
				     */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

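/*
 * Module parameter setter for the sliding window size, in number of clamping
 * cycles. Values outside the recommended 2-10 range return -EINVAL; the
 * stored window size is clamped to that range regardless.
 */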
static int window_size_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_window_size;

	ret = kstrtoul(arg, 10, &new_window_size);
	if (ret)
		goto exit_win;
	if (new_window_size > 10 || new_window_size < 2) {
		pr_err("Out of recommended window size %lu, between 2-10\n",
			new_window_size);
		ret = -EINVAL;
	}

	window_size = clamp(new_window_size, 2ul, 10ul);
	smp_mb();

exit_win:

	return ret;
}

static struct kernel_param_ops window_size_ops = {
	.set = window_size_set,
	.get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
	"\tpowerclamp controls idle ratio within this window. a larger\n"
	"\twindow size results in slower response time but smoother\n"
	"\tclamping results. defaults to 2.");

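/*
 * Query the CPUID MWAIT leaf for the deepest supported C-state and its
 * deepest sub-state, and record the corresponding MWAIT hint in target_mwait.
 * The hint is only set when the MWAIT extensions and the interrupt-break
 * capability are advertised.
 */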
static void find_target_mwait(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0;
	unsigned int highest_subcstate = 0;
	int i;

	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return;

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
		return;

	edx >>= MWAIT_SUBSTATE_SIZE;
	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
		if (edx & MWAIT_SUBSTATE_MASK) {
			highest_cstate = i;
			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
		}
	}
	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		(highest_subcstate - 1);

}

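/*
 * Returns true if at least one package C-state residency MSR (C2/C3/C6/C7)
 * can be read on this CPU.
 */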
static bool has_pkg_state_counter(void)
{
	u64 tmp;
	return !rdmsrl_safe(MSR_PKG_C2_RESIDENCY, &tmp) ||
	       !rdmsrl_safe(MSR_PKG_C3_RESIDENCY, &tmp) ||
	       !rdmsrl_safe(MSR_PKG_C6_RESIDENCY, &tmp) ||
	       !rdmsrl_safe(MSR_PKG_C7_RESIDENCY, &tmp);
}

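/*
 * Sum of all readable package C-state residency counters. MSRs that fault on
 * the first read are remembered and skipped on subsequent calls.
 */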
static u64 pkg_state_counter(void)
{
	u64 val;
	u64 count = 0;

	static bool skip_c2;
	static bool skip_c3;
	static bool skip_c6;
	static bool skip_c7;

	if (!skip_c2) {
		if (!rdmsrl_safe(MSR_PKG_C2_RESIDENCY, &val))
			count += val;
		else
			skip_c2 = true;
	}

	if (!skip_c3) {
		if (!rdmsrl_safe(MSR_PKG_C3_RESIDENCY, &val))
			count += val;
		else
			skip_c3 = true;
	}

	if (!skip_c6) {
		if (!rdmsrl_safe(MSR_PKG_C6_RESIDENCY, &val))
			count += val;
		else
			skip_c6 = true;
	}

	if (!skip_c7) {
		if (!rdmsrl_safe(MSR_PKG_C7_RESIDENCY, &val))
			count += val;
		else
			skip_c7 = true;
	}

	return count;
}

static void noop_timer(unsigned long foo)
{
	/* empty... just the fact that we get the interrupt wakes us up */
}

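/*
 * Look up the extra idle percentage to inject for a given target ratio. The
 * steady-state compensation is averaged over the target ratio and its
 * neighbors, but only when all of them have reached CONFIDENCE_OK. When
 * reduce_irq is set, a simple doubling penalty is used instead, and the
 * result is capped so ratio plus compensation stays below MAX_TARGET_RATIO.
 */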
static unsigned int get_compensation(int ratio)
{
	unsigned int comp = 0;

	/* we only use compensation if all adjacent ones are good */
	if (ratio == 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio + 1].steady_comp +
			cal_data[ratio + 2].steady_comp) / 3;
	} else if (ratio == MAX_TARGET_RATIO - 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio - 2].steady_comp) / 3;
	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio + 1].steady_comp) / 3;
	}

	/* REVISIT: simple penalty of double idle injection */
	if (reduce_irq)
		comp = ratio;
	/* do not exceed limit */
	if (comp + ratio >= MAX_TARGET_RATIO)
		comp = MAX_TARGET_RATIO - ratio - 1;

	return comp;
}

static void adjust_compensation(int target_ratio, unsigned int win)
{
	int delta;
	struct powerclamp_calibration_data *d = &cal_data[target_ratio];

	/*
	 * do not adjust compensation if the confidence level has already
	 * been reached, or if there were too many wakeups during the last
	 * idle injection period, since then we cannot trust the data.
	 */
	if (d->confidence >= CONFIDENCE_OK ||
		atomic_read(&idle_wakeup_counter) >
		win * num_online_cpus())
		return;

	delta = set_target_ratio - current_ratio;
	/* filter out bad data */
	if (delta >= 0 && delta <= (1+target_ratio/10)) {
		if (d->steady_comp)
			d->steady_comp =
				roundup(delta+d->steady_comp, 2)/2;
		else
			d->steady_comp = delta;
		d->confidence++;
	}
}

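/*
 * Check the result of the last control window: compute the package C-state
 * residency vs. TSC ratio, update the calibration data, flag excessive
 * external wakeups (reduce_irq), and decide whether the next injection
 * period can be skipped because we are already at or above the target
 * ratio plus the guard band.
 */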
static bool powerclamp_adjust_controls(unsigned int target_ratio,
				unsigned int guard, unsigned int win)
{
	static u64 msr_last, tsc_last;
	u64 msr_now, tsc_now;
	u64 val64;

	/* check result for the last window */
	msr_now = pkg_state_counter();
	rdtscll(tsc_now);

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		current_ratio = 1;
	else if (tsc_now-tsc_last) {
		val64 = 100*(msr_now-msr_last);
		do_div(val64, (tsc_now-tsc_last));
		current_ratio = val64;
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	adjust_compensation(target_ratio, win);
	/*
	 * too many external interrupts, set the flag so that we can
	 * take measures later.
	 */
	reduce_irq = atomic_read(&idle_wakeup_counter) >=
		2 * win * num_online_cpus();

	atomic_set(&idle_wakeup_counter, 0);
	/* if we are above target+guard, skip */
	return set_target_ratio + guard <= current_ratio;
}

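/*
 * Per-CPU idle injection thread. Runs as a SCHED_FIFO kthread bound to one
 * CPU: it aligns injection periods on a common jiffies boundary so all CPUs
 * enter idle together, then repeatedly mwaits with the deepest hint until
 * the injection duration has elapsed; an on-stack timer guarantees a wakeup
 * at the end of each period.
 */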
static int clamp_thread(void *arg)
{
	int cpunr = (unsigned long)arg;
	DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0);
	static const struct sched_param param = {
		.sched_priority = MAX_USER_RT_PRIO/2,
	};
	unsigned int count = 0;
	unsigned int target_ratio;

	set_bit(cpunr, cpu_clamping_mask);
	set_freezable();
	init_timer_on_stack(&wakeup_timer);
	sched_setscheduler(current, SCHED_FIFO, &param);

	while (true == clamping && !kthread_should_stop() &&
		cpu_online(cpunr)) {
		int sleeptime;
		unsigned long target_jiffies;
		unsigned int guard;
		unsigned int compensation = 0;
		int interval; /* jiffies to sleep for each attempt */
		unsigned int duration_jiffies = msecs_to_jiffies(duration);
		unsigned int window_size_now;

		try_to_freeze();
		/*
		 * take a snapshot so that a user-selected ratio change does
		 * not take effect until the next round; re-reading it every
		 * round lets us converge quickly when the target changes.
		 */
		target_ratio = set_target_ratio;
		guard = 1 + target_ratio/20;
		window_size_now = window_size;
		count++;

		/*
		 * systems may have different ability to enter package level
		 * c-states, thus we need to compensate the injected idle ratio
		 * to achieve the actual target reported by the HW.
		 */
		compensation = get_compensation(target_ratio);
		interval = duration_jiffies*100/(target_ratio+compensation);

		/* align idle time */
		target_jiffies = roundup(jiffies, interval);
		sleeptime = target_jiffies - jiffies;
		if (sleeptime <= 0)
			sleeptime = 1;
		schedule_timeout_interruptible(sleeptime);
		/*
		 * only elected controlling cpu can collect stats and update
		 * control parameters.
		 */
		if (cpunr == control_cpu && !(count%window_size_now)) {
			should_skip =
				powerclamp_adjust_controls(target_ratio,
							guard, window_size_now);
			smp_mb();
		}

		if (should_skip)
			continue;

		target_jiffies = jiffies + duration_jiffies;
		mod_timer(&wakeup_timer, target_jiffies);
		if (unlikely(local_softirq_pending()))
			continue;
		/*
		 * stop the tick scheduler during idle time; interrupts are
		 * still allowed, so jiffies keep being updated properly.
		 */
		preempt_disable();
		tick_nohz_idle_enter();
		/* mwait until target jiffies is reached */
		while (time_before(jiffies, target_jiffies)) {
			unsigned long ecx = 1;
			unsigned long eax = target_mwait;

			/*
			 * REVISIT: may call enter_idle() to notify drivers who
			 * can save power during cpu idle. same for exit_idle()
			 */
			local_touch_nmi();
			stop_critical_timings();
			mwait_idle_with_hints(eax, ecx);
			start_critical_timings();
			atomic_inc(&idle_wakeup_counter);
		}
		tick_nohz_idle_exit();
		preempt_enable();
	}
	del_timer_sync(&wakeup_timer);
	clear_bit(cpunr, cpu_clamping_mask);

	return 0;
}

/*
 * 1 HZ polling while clamping is active, useful for userspace
 * to monitor actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
	static u64 msr_last;
	static u64 tsc_last;
	static unsigned long jiffies_last;

	u64 msr_now;
	unsigned long jiffies_now;
	u64 tsc_now;
	u64 val64;

	msr_now = pkg_state_counter();
	rdtscll(tsc_now);
	jiffies_now = jiffies;

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		pkg_cstate_ratio_cur = 1;
	else {
		if (tsc_now - tsc_last) {
			val64 = 100 * (msr_now - msr_last);
			do_div(val64, (tsc_now - tsc_last));
			pkg_cstate_ratio_cur = val64;
		}
	}

	/* update record */
	msr_last = msr_now;
	jiffies_last = jiffies_now;
	tsc_last = tsc_now;

	if (true == clamping)
		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}

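/*
 * Start clamping: verify that package C-state residency counters work, elect
 * the controlling CPU, kick off the 1 Hz polling work and spawn one injection
 * kthread per online CPU, bound to that CPU.
 */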
static int start_power_clamp(void)
{
	unsigned long cpu;
	struct task_struct *thread;

	/* abort if no package cstate residency counter is available */
	if (!has_pkg_state_counter()) {
		pr_err("pkg cstate counter not functional, abort\n");
		return -EINVAL;
	}

	set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
	/* prevent cpu hotplug */
	get_online_cpus();

	/* prefer BSP */
	control_cpu = 0;
	if (!cpu_online(control_cpu))
		control_cpu = smp_processor_id();

	clamping = true;
	schedule_delayed_work(&poll_pkg_cstate_work, 0);

	/* start one thread per online cpu */
	for_each_online_cpu(cpu) {
		struct task_struct **p =
			per_cpu_ptr(powerclamp_thread, cpu);

		thread = kthread_create_on_node(clamp_thread,
						(void *) cpu,
						cpu_to_node(cpu),
						"kidle_inject/%ld", cpu);
		/* bind to cpu here */
		if (likely(!IS_ERR(thread))) {
			kthread_bind(thread, cpu);
			wake_up_process(thread);
			*p = thread;
		}

	}
	put_online_cpus();

	return 0;
}

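/*
 * Stop clamping: clear the flag so the per-cpu threads exit on their own,
 * then explicitly stop any thread that is still marked alive in the clamping
 * bitmap.
 */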
static void end_power_clamp(void)
{
	int i;
	struct task_struct *thread;

	clamping = false;
	/*
	 * make clamping visible to other cpus and give the per cpu clamping
	 * threads some time to exit, otherwise they are stopped explicitly
	 * below.
	 */
	smp_mb();
	msleep(20);
	if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
		for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
			pr_debug("clamping thread for cpu %d alive, kill\n", i);
			thread = *per_cpu_ptr(powerclamp_thread, i);
			kthread_stop(thread);
		}
	}
}

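/*
 * CPU hotplug notifier: while clamping is active, start an injection thread
 * on a newly onlined CPU and stop the thread of a dead CPU. The controlling
 * CPU is re-elected when it goes away and handed back to the BSP when CPU 0
 * comes back online.
 */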
static int powerclamp_cpu_callback(struct notifier_block *nfb,
				unsigned long action, void *hcpu)
{
	unsigned long cpu = (unsigned long)hcpu;
	struct task_struct *thread;
	struct task_struct **percpu_thread =
		per_cpu_ptr(powerclamp_thread, cpu);

	if (false == clamping)
		goto exit_ok;

	switch (action) {
	case CPU_ONLINE:
		thread = kthread_create_on_node(clamp_thread,
						(void *) cpu,
						cpu_to_node(cpu),
						"kidle_inject/%lu", cpu);
		if (likely(!IS_ERR(thread))) {
			kthread_bind(thread, cpu);
			wake_up_process(thread);
			*percpu_thread = thread;
		}
		/* prefer BSP as controlling CPU */
		if (cpu == 0) {
			control_cpu = 0;
			smp_mb();
		}
		break;
	case CPU_DEAD:
		if (test_bit(cpu, cpu_clamping_mask)) {
			pr_err("cpu %lu dead but powerclamping thread is not\n",
				cpu);
			kthread_stop(*percpu_thread);
		}
		if (cpu == control_cpu) {
			control_cpu = smp_processor_id();
			smp_mb();
		}
	}

exit_ok:
	return NOTIFY_OK;
}

static struct notifier_block powerclamp_cpu_notifier = {
	.notifier_call = powerclamp_cpu_callback,
};

static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	*state = MAX_TARGET_RATIO;

	return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	if (true == clamping)
		*state = pkg_cstate_ratio_cur;
	else
		/* to save power, do not poll idle ratio while not clamping */
		*state = -1; /* indicates invalid state */

	return 0;
}

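/*
 * Cooling device set_cur_state callback. The state is the target idle ratio
 * in percent: a transition from 0 starts idle injection, a transition to 0
 * stops it, and any other change just updates the target of the running
 * threads.
 */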
static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long new_target_ratio)
{
	int ret = 0;

	new_target_ratio = clamp(new_target_ratio, 0UL,
				(unsigned long) (MAX_TARGET_RATIO-1));
	if (set_target_ratio == 0 && new_target_ratio > 0) {
		pr_info("Start idle injection to reduce power\n");
		set_target_ratio = new_target_ratio;
		ret = start_power_clamp();
		goto exit_set;
	} else	if (set_target_ratio > 0 && new_target_ratio == 0) {
		pr_info("Stop forced idle injection\n");
		set_target_ratio = 0;
		end_power_clamp();
	} else	/* adjust currently running */ {
		set_target_ratio = new_target_ratio;
		/* make new set_target_ratio visible to other cpus */
		smp_mb();
	}

exit_set:
	return ret;
}

/* bind to generic thermal layer as cooling device */
static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
	.get_max_state = powerclamp_get_max_state,
	.get_cur_state = powerclamp_get_cur_state,
	.set_cur_state = powerclamp_set_cur_state,
};

/* runs on Nehalem and later */
static const struct x86_cpu_id intel_powerclamp_ids[] = {
	{ X86_VENDOR_INTEL, 6, 0x1a},
	{ X86_VENDOR_INTEL, 6, 0x1c},
	{ X86_VENDOR_INTEL, 6, 0x1e},
	{ X86_VENDOR_INTEL, 6, 0x1f},
	{ X86_VENDOR_INTEL, 6, 0x25},
	{ X86_VENDOR_INTEL, 6, 0x26},
	{ X86_VENDOR_INTEL, 6, 0x2a},
	{ X86_VENDOR_INTEL, 6, 0x2c},
	{ X86_VENDOR_INTEL, 6, 0x2d},
	{ X86_VENDOR_INTEL, 6, 0x2e},
	{ X86_VENDOR_INTEL, 6, 0x2f},
	{ X86_VENDOR_INTEL, 6, 0x37},
	{ X86_VENDOR_INTEL, 6, 0x3a},
	{ X86_VENDOR_INTEL, 6, 0x3c},
	{ X86_VENDOR_INTEL, 6, 0x3d},
	{ X86_VENDOR_INTEL, 6, 0x3e},
	{ X86_VENDOR_INTEL, 6, 0x3f},
	{ X86_VENDOR_INTEL, 6, 0x45},
	{ X86_VENDOR_INTEL, 6, 0x46},
	{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

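/*
 * Check whether this CPU is supported: it must appear in the model table
 * above and provide an invariant/constant TSC, MWAIT and an always-running
 * APIC timer (ARAT). Also look up the deepest MWAIT hint to use for
 * injection.
 */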
static int powerclamp_probe(void)
{
	if (!x86_match_cpu(intel_powerclamp_ids)) {
		pr_err("Intel powerclamp does not run on family %d model %d\n",
				boot_cpu_data.x86, boot_cpu_data.x86_model);
		return -ENODEV;
	}
	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
		!boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ||
		!boot_cpu_has(X86_FEATURE_MWAIT) ||
		!boot_cpu_has(X86_FEATURE_ARAT))
		return -ENODEV;

	/* find the deepest mwait value */
	find_target_mwait();

	return 0;
}

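/*
 * debugfs interface: dump the controlling CPU and the per-ratio calibration
 * table (confidence, steady and dynamic compensation), exposed as
 * intel_powerclamp/powerclamp_calib in debugfs.
 */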
static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
	int i = 0;

	seq_printf(m, "controlling cpu: %d\n", control_cpu);
	seq_printf(m, "pct confidence steady dynamic (compensation)\n");
	for (i = 0; i < MAX_TARGET_RATIO; i++) {
		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
			i,
			cal_data[i].confidence,
			cal_data[i].steady_comp,
			cal_data[i].dynamic_comp);
	}

	return 0;
}

static int powerclamp_debug_open(struct inode *inode,
			struct file *file)
{
	return single_open(file, powerclamp_debug_show, inode->i_private);
}

static const struct file_operations powerclamp_debug_fops = {
	.open		= powerclamp_debug_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
	.owner		= THIS_MODULE,
};

static inline void powerclamp_create_debug_files(void)
{
	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
	if (!debug_dir)
		return;

	if (!debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir,
					cal_data, &powerclamp_debug_fops))
		goto file_error;

	return;

file_error:
	debugfs_remove_recursive(debug_dir);
}

static int powerclamp_init(void)
{
	int retval;
	int bitmap_size;

	bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
	cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
	if (!cpu_clamping_mask)
		return -ENOMEM;

	/* probe cpu features and ids here */
	retval = powerclamp_probe();
	if (retval)
		goto exit_free;

	/* set default limit, maybe adjusted during runtime based on feedback */
	window_size = 2;
	register_hotcpu_notifier(&powerclamp_cpu_notifier);

	powerclamp_thread = alloc_percpu(struct task_struct *);
	if (!powerclamp_thread) {
		retval = -ENOMEM;
		goto exit_unregister;
	}

	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
						&powerclamp_cooling_ops);
	if (IS_ERR(cooling_dev)) {
		retval = -ENODEV;
		goto exit_free_thread;
	}

	if (!duration)
		duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

	powerclamp_create_debug_files();

	return 0;

exit_free_thread:
	free_percpu(powerclamp_thread);
exit_unregister:
	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
exit_free:
	kfree(cpu_clamping_mask);
	return retval;
}
module_init(powerclamp_init);

static void powerclamp_exit(void)
{
	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
	end_power_clamp();
	free_percpu(powerclamp_thread);
	thermal_cooling_device_unregister(cooling_dev);
	kfree(cpu_clamping_mask);

	cancel_delayed_work_sync(&poll_pkg_cstate_work);
	debugfs_remove_recursive(debug_dir);
}
module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");
