memory-failure.c revision c9fbdd5f131440981b124883656ea21fb12cde4a
1/*
2 * Copyright (C) 2008, 2009 Intel Corporation
3 * Authors: Andi Kleen, Fengguang Wu
4 *
5 * This software may be redistributed and/or modified under the terms of
6 * the GNU General Public License ("GPL") version 2 only as published by the
7 * Free Software Foundation.
8 *
9 * High level machine check handler. Handles pages reported by the
10 * hardware as being corrupted, usually due to a 2-bit ECC memory or cache
11 * failure.
12 *
13 * Handles page cache pages in various states.	The tricky part
14 * here is that we can access any page asynchronously with respect to
15 * other VM users, because memory failures could happen anytime and anywhere,
16 * possibly violating some of their assumptions. This is why this code
17 * has to be extremely careful. Generally it tries to use normal locking
18 * rules, i.e. it takes the standard locks, even if that means the
19 * error handling takes potentially a long time.
20 *
21 * The operation to map back from RMAP chains to processes has to walk
22 * the complete process list and has non-linear complexity in the number
23 * of mappings. In short it can be quite slow. But since memory corruptions
24 * are rare we hope to get away with this.
25 */
26
27/*
28 * Notebook:
29 * - hugetlb needs more code
30 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
31 * - pass bad pages to kdump next kernel
32 */
33#define DEBUG 1		/* remove me in 2.6.34 */
34#include <linux/kernel.h>
35#include <linux/mm.h>
36#include <linux/page-flags.h>
37#include <linux/kernel-page-flags.h>
38#include <linux/sched.h>
39#include <linux/ksm.h>
40#include <linux/rmap.h>
41#include <linux/pagemap.h>
42#include <linux/swap.h>
43#include <linux/backing-dev.h>
44#include <linux/migrate.h>
45#include <linux/page-isolation.h>
46#include <linux/suspend.h>
47#include <linux/slab.h>
48#include <linux/hugetlb.h>
49#include "internal.h"
50
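/*
 * When set, processes that map a poisoned page are signalled as soon as the
 * error is handled ("early kill") instead of only when they touch the page.
 */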
51int sysctl_memory_failure_early_kill __read_mostly = 0;
52
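/*
 * When clear, a memory failure triggers an immediate panic instead of an
 * attempt at recovery (see __memory_failure()).
 */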
53int sysctl_memory_failure_recovery __read_mostly = 1;
54
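/* Number of pages currently accounted as hardware poisoned. */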
55atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
56
57#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
58
59u32 hwpoison_filter_enable = 0;
60u32 hwpoison_filter_dev_major = ~0U;
61u32 hwpoison_filter_dev_minor = ~0U;
62u64 hwpoison_filter_flags_mask;
63u64 hwpoison_filter_flags_value;
64EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
65EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
66EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
67EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
68EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
69
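/*
 * Filter by block device: skip pages that are not backed by the configured
 * device major/minor, when one is configured.
 */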
70static int hwpoison_filter_dev(struct page *p)
71{
72	struct address_space *mapping;
73	dev_t dev;
74
75	if (hwpoison_filter_dev_major == ~0U &&
76	    hwpoison_filter_dev_minor == ~0U)
77		return 0;
78
79	/*
80	 * page_mapping() does not accept slab page
81	 */
82	if (PageSlab(p))
83		return -EINVAL;
84
85	mapping = page_mapping(p);
86	if (mapping == NULL || mapping->host == NULL)
87		return -EINVAL;
88
89	dev = mapping->host->i_sb->s_dev;
90	if (hwpoison_filter_dev_major != ~0U &&
91	    hwpoison_filter_dev_major != MAJOR(dev))
92		return -EINVAL;
93	if (hwpoison_filter_dev_minor != ~0U &&
94	    hwpoison_filter_dev_minor != MINOR(dev))
95		return -EINVAL;
96
97	return 0;
98}
99
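/*
 * Filter by page flags: only handle pages whose stable flags, masked with
 * hwpoison_filter_flags_mask, match hwpoison_filter_flags_value.
 */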
100static int hwpoison_filter_flags(struct page *p)
101{
102	if (!hwpoison_filter_flags_mask)
103		return 0;
104
105	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
106				    hwpoison_filter_flags_value)
107		return 0;
108	else
109		return -EINVAL;
110}
111
112/*
113 * This allows stress tests to limit test scope to a collection of tasks
114 * by putting them under some memcg. This prevents killing unrelated/important
115 * processes such as /sbin/init. Note that the target task may share clean
116 * pages with init (e.g. libc text), which is harmless. If the target task
117 * shares _dirty_ pages with another task B, the test scheme must make sure B
118 * is also included in the memcg. Lastly, due to race conditions this filter
119 * can only guarantee that the page either belongs to the memcg tasks, or is
120 * a freed page.
121 */
122#ifdef	CONFIG_CGROUP_MEM_RES_CTLR_SWAP
123u64 hwpoison_filter_memcg;
124EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
125static int hwpoison_filter_task(struct page *p)
126{
127	struct mem_cgroup *mem;
128	struct cgroup_subsys_state *css;
129	unsigned long ino;
130
131	if (!hwpoison_filter_memcg)
132		return 0;
133
134	mem = try_get_mem_cgroup_from_page(p);
135	if (!mem)
136		return -EINVAL;
137
138	css = mem_cgroup_css(mem);
139	/* root_mem_cgroup has NULL dentries */
140	if (!css->cgroup->dentry)
141		return -EINVAL;
142
143	ino = css->cgroup->dentry->d_inode->i_ino;
144	css_put(css);
145
146	if (ino != hwpoison_filter_memcg)
147		return -EINVAL;
148
149	return 0;
150}
151#else
152static int hwpoison_filter_task(struct page *p) { return 0; }
153#endif
154
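/*
 * Returns 0 if the page should be handled, -EINVAL if any enabled filter
 * (device, page flags, memcg) excludes it.
 */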
155int hwpoison_filter(struct page *p)
156{
157	if (!hwpoison_filter_enable)
158		return 0;
159
160	if (hwpoison_filter_dev(p))
161		return -EINVAL;
162
163	if (hwpoison_filter_flags(p))
164		return -EINVAL;
165
166	if (hwpoison_filter_task(p))
167		return -EINVAL;
168
169	return 0;
170}
171#else
172int hwpoison_filter(struct page *p)
173{
174	return 0;
175}
176#endif
177
178EXPORT_SYMBOL_GPL(hwpoison_filter);
179
180/*
181 * Send all the processes that have the page mapped an ``action optional''
182 * signal.
183 */
184static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
185			unsigned long pfn)
186{
187	struct siginfo si;
188	int ret;
189
190	printk(KERN_ERR
191		"MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
192		pfn, t->comm, t->pid);
193	si.si_signo = SIGBUS;
194	si.si_errno = 0;
195	si.si_code = BUS_MCEERR_AO;
196	si.si_addr = (void *)addr;
197#ifdef __ARCH_SI_TRAPNO
198	si.si_trapno = trapno;
199#endif
200	si.si_addr_lsb = PAGE_SHIFT;
201	/*
202	 * Don't use force here, it's convenient if the signal
203	 * can be temporarily blocked.
204	 * This could cause a loop when the user sets SIGBUS
205	 * to SIG_IGN, but hopefully no one will do that?
206	 */
207	ret = send_sig_info(SIGBUS, &si, t);  /* synchronous? */
208	if (ret < 0)
209		printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
210		       t->comm, t->pid, ret);
211	return ret;
212}
213
214/*
215 * When an unknown page type is encountered, drain as many buffers as possible
216 * in the hope of turning the page into an LRU or free page, which we can handle.
217 */
218void shake_page(struct page *p, int access)
219{
220	if (!PageSlab(p)) {
221		lru_add_drain_all();
222		if (PageLRU(p))
223			return;
224		drain_all_pages();
225		if (PageLRU(p) || is_free_buddy_page(p))
226			return;
227	}
228
229	/*
230	 * Only call shrink_slab here (which would also
231	 * shrink other caches) if access is not potentially fatal.
232	 */
233	if (access) {
234		int nr;
235		do {
236			nr = shrink_slab(1000, GFP_KERNEL, 1000);
237			if (page_count(p) == 0)
238				break;
239		} while (nr > 10);
240	}
241}
242EXPORT_SYMBOL_GPL(shake_page);
243
244/*
245 * Kill all processes that have a poisoned page mapped and then isolate
246 * the page.
247 *
248 * General strategy:
249 * Find all processes having the page mapped and kill them.
250 * But we keep a page reference around so that the page is not
251 * actually freed yet.
252 * Then stash the page away
253 *
254 * There's no convenient way to get back to mapped processes
255 * from the VMAs. So do a brute-force search over all
256 * running processes.
257 *
258 * Remember that machine checks are not common (or rather
259 * if they are common you have other problems), so this shouldn't
260 * be a performance issue.
261 *
262 * Also there are some races possible between error detection
263 * and actually handling the error.
264 */
265
266struct to_kill {
267	struct list_head nd;
268	struct task_struct *tsk;
269	unsigned long addr;
270	unsigned addr_valid:1;
271};
272
273/*
274 * Failure handling: if we can't find or can't kill a process there's
275 * not much we can do.	We just print a message and otherwise ignore it.
276 */
277
278/*
279 * Schedule a process for later kill.
280 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
281 * TBD would GFP_NOIO be enough?
282 */
283static void add_to_kill(struct task_struct *tsk, struct page *p,
284		       struct vm_area_struct *vma,
285		       struct list_head *to_kill,
286		       struct to_kill **tkc)
287{
288	struct to_kill *tk;
289
290	if (*tkc) {
291		tk = *tkc;
292		*tkc = NULL;
293	} else {
294		tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
295		if (!tk) {
296			printk(KERN_ERR
297		"MCE: Out of memory while machine check handling\n");
298			return;
299		}
300	}
301	tk->addr = page_address_in_vma(p, vma);
302	tk->addr_valid = 1;
303
304	/*
305	 * In theory we don't have to kill when the page was
306	 * munmapped. But it could also be a mremap. Since that's
307	 * likely very rare, kill anyway just out of paranoia, but use
308	 * a SIGKILL because the error is not contained anymore.
309	 */
310	if (tk->addr == -EFAULT) {
311		pr_debug("MCE: Unable to find user space address %lx in %s\n",
312			page_to_pfn(p), tsk->comm);
313		tk->addr_valid = 0;
314	}
315	get_task_struct(tsk);
316	tk->tsk = tsk;
317	list_add_tail(&tk->nd, to_kill);
318}
319
320/*
321 * Kill the processes that have been collected earlier.
322 *
323 * Only do anything when DOIT is set, otherwise just free the list
324 * (this is used for clean pages which do not need killing).
325 * Also when FAIL is set, do a force kill because something went
326 * wrong earlier.
327 */
328static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
329			  int fail, unsigned long pfn)
330{
331	struct to_kill *tk, *next;
332
333	list_for_each_entry_safe (tk, next, to_kill, nd) {
334		if (doit) {
335			/*
336			 * In case something went wrong with munmapping
337			 * make sure the process doesn't catch the
338			 * signal and then access the memory. Just kill it.
339			 */
340			if (fail || tk->addr_valid == 0) {
341				printk(KERN_ERR
342		"MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
343					pfn, tk->tsk->comm, tk->tsk->pid);
344				force_sig(SIGKILL, tk->tsk);
345			}
346
347			/*
348			 * In theory the process could have mapped
349			 * something else on the address in-between. We could
350			 * check for that, but we need to tell the
351			 * process anyway.
352			 */
353			else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
354					      pfn) < 0)
355				printk(KERN_ERR
356		"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
357					pfn, tk->tsk->comm, tk->tsk->pid);
358		}
359		put_task_struct(tk->tsk);
360		kfree(tk);
361	}
362}
363
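/*
 * Should this task be killed early, i.e. at error handling time?  The
 * per-process PF_MCE_EARLY flag overrides the global sysctl; kernel
 * threads (no mm) are never killed early.
 */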
364static int task_early_kill(struct task_struct *tsk)
365{
366	if (!tsk->mm)
367		return 0;
368	if (tsk->flags & PF_MCE_PROCESS)
369		return !!(tsk->flags & PF_MCE_EARLY);
370	return sysctl_memory_failure_early_kill;
371}
372
373/*
374 * Collect processes when the error hit an anonymous page.
375 */
376static void collect_procs_anon(struct page *page, struct list_head *to_kill,
377			      struct to_kill **tkc)
378{
379	struct vm_area_struct *vma;
380	struct task_struct *tsk;
381	struct anon_vma *av;
382
383	read_lock(&tasklist_lock);
384	av = page_lock_anon_vma(page);
385	if (av == NULL)	/* Not actually mapped anymore */
386		goto out;
387	for_each_process (tsk) {
388		struct anon_vma_chain *vmac;
389
390		if (!task_early_kill(tsk))
391			continue;
392		list_for_each_entry(vmac, &av->head, same_anon_vma) {
393			vma = vmac->vma;
394			if (!page_mapped_in_vma(page, vma))
395				continue;
396			if (vma->vm_mm == tsk->mm)
397				add_to_kill(tsk, page, vma, to_kill, tkc);
398		}
399	}
400	page_unlock_anon_vma(av);
401out:
402	read_unlock(&tasklist_lock);
403}
404
405/*
406 * Collect processes when the error hit a file mapped page.
407 */
408static void collect_procs_file(struct page *page, struct list_head *to_kill,
409			      struct to_kill **tkc)
410{
411	struct vm_area_struct *vma;
412	struct task_struct *tsk;
413	struct prio_tree_iter iter;
414	struct address_space *mapping = page->mapping;
415
416	/*
417	 * A note on the locking order between the two locks.
418	 * We don't rely on this particular order.
419	 * If you have some other code that needs a different order
420	 * feel free to switch them around. Or add a reverse link
421	 * from mm_struct to task_struct, then this could be all
422	 * done without taking tasklist_lock and looping over all tasks.
423	 */
424
425	read_lock(&tasklist_lock);
426	spin_lock(&mapping->i_mmap_lock);
427	for_each_process(tsk) {
428		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
429
430		if (!task_early_kill(tsk))
431			continue;
432
433		vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
434				      pgoff) {
435			/*
436			 * Send early kill signal to tasks where a vma covers
437			 * the page but the corrupted page is not necessarily
438			 * mapped in its pte.
439			 * Assume applications that requested early kill want
440			 * to be informed of all such data corruptions.
441			 */
442			if (vma->vm_mm == tsk->mm)
443				add_to_kill(tsk, page, vma, to_kill, tkc);
444		}
445	}
446	spin_unlock(&mapping->i_mmap_lock);
447	read_unlock(&tasklist_lock);
448}
449
450/*
451 * Collect the processes that have the corrupted page mapped, to kill them.
452 * This is done in two steps for locking reasons.
453 * First preallocate one tokill structure outside the spin locks,
454 * so that we can kill at least one process reasonably reliably.
455 */
456static void collect_procs(struct page *page, struct list_head *tokill)
457{
458	struct to_kill *tk;
459
460	if (!page->mapping)
461		return;
462
463	tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
464	if (!tk)
465		return;
466	if (PageAnon(page))
467		collect_procs_anon(page, tokill, &tk);
468	else
469		collect_procs_file(page, tokill, &tk);
470	kfree(tk);
471}
472
473/*
474 * Error handlers for various types of pages.
475 */
476
477enum outcome {
478	IGNORED,	/* Error: cannot be handled */
479	FAILED,		/* Error: handling failed */
480	DELAYED,	/* Will be handled later */
481	RECOVERED,	/* Successfully recovered */
482};
483
484static const char *action_name[] = {
485	[IGNORED] = "Ignored",
486	[FAILED] = "Failed",
487	[DELAYED] = "Delayed",
488	[RECOVERED] = "Recovered",
489};
490
491/*
492 * XXX: It is possible that a page is isolated from LRU cache,
493 * and then kept in swap cache or fails to be removed from page cache.
494 * The page count will stop it from being freed by unpoison.
495 * Stress tests should be aware of this memory leak problem.
496 */
497static int delete_from_lru_cache(struct page *p)
498{
499	if (!isolate_lru_page(p)) {
500		/*
501		 * Clear page flags the buddy system is sensitive to, so that it
502		 * won't complain when the page is unpoisoned and freed.
503		 */
504		ClearPageActive(p);
505		ClearPageUnevictable(p);
506		/*
507		 * drop the page count elevated by isolate_lru_page()
508		 */
509		page_cache_release(p);
510		return 0;
511	}
512	return -EIO;
513}
514
515/*
516 * Error hit kernel page.
517 * Do nothing; try to be lucky by not touching it. For a few cases we
518 * could be more sophisticated.
519 */
520static int me_kernel(struct page *p, unsigned long pfn)
521{
522	return IGNORED;
523}
524
525/*
526 * Page in unknown state. Do nothing.
527 */
528static int me_unknown(struct page *p, unsigned long pfn)
529{
530	printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
531	return FAILED;
532}
533
534/*
535 * Clean (or cleaned) page cache page.
536 */
537static int me_pagecache_clean(struct page *p, unsigned long pfn)
538{
539	int err;
540	int ret = FAILED;
541	struct address_space *mapping;
542
543	delete_from_lru_cache(p);
544
545	/*
546	 * For anonymous pages we're done; the only reference left
547	 * should be the one m_f() holds.
548	 */
549	if (PageAnon(p))
550		return RECOVERED;
551
552	/*
553	 * Now truncate the page in the page cache. This is really
554	 * more like a "temporary hole punch"
555	 * Don't do this for block devices when someone else
556	 * has a reference, because it could be file system metadata
557	 * and that's not safe to truncate.
558	 */
559	mapping = page_mapping(p);
560	if (!mapping) {
561		/*
562		 * Page has been torn down in the meantime.
563		 */
564		return FAILED;
565	}
566
567	/*
568	 * Truncation is a bit tricky. Enable it per file system for now.
569	 *
570	 * Open: to take i_mutex or not for this? Right now we don't.
571	 */
572	if (mapping->a_ops->error_remove_page) {
573		err = mapping->a_ops->error_remove_page(mapping, p);
574		if (err != 0) {
575			printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
576					pfn, err);
577		} else if (page_has_private(p) &&
578				!try_to_release_page(p, GFP_NOIO)) {
579			pr_debug("MCE %#lx: failed to release buffers\n", pfn);
580		} else {
581			ret = RECOVERED;
582		}
583	} else {
584		/*
585		 * If the file system doesn't support it, just invalidate.
586		 * This fails on dirty pages or pages with private data.
587		 */
588		if (invalidate_inode_page(p))
589			ret = RECOVERED;
590		else
591			printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
592				pfn);
593	}
594	return ret;
595}
596
597/*
598 * Dirty page cache page.
599 * Issues: when the error hits a hole page the error is not properly
600 * propagated.
601 */
602static int me_pagecache_dirty(struct page *p, unsigned long pfn)
603{
604	struct address_space *mapping = page_mapping(p);
605
606	SetPageError(p);
607	/* TBD: print more information about the file. */
608	if (mapping) {
609		/*
610		 * The IO error will be reported by write(), fsync(), etc.,
611		 * which check the mapping.
612		 * This way the application knows that something went
613		 * wrong with its dirty file data.
614		 *
615		 * There's one open issue:
616		 *
617		 * The EIO will be only reported on the next IO
618		 * operation and then cleared through the IO map.
619		 * Normally Linux has two mechanisms to pass IO errors:
620		 * first through the AS_EIO flag in the address space
621		 * and then through the PageError flag in the page.
622		 * Since we drop pages on memory failure handling the
623		 * only mechanism open to use is through AS_EIO.
624		 *
625		 * This has the disadvantage that it gets cleared on
626		 * the first operation that returns an error, while
627		 * the PageError bit is more sticky and only cleared
628		 * when the page is reread or dropped.  If an
629		 * application assumes it will always get an error on
630		 * fsync, but does other operations on the fd first
631		 * and the page is dropped in between, then the error
632		 * will not be properly reported.
633		 *
634		 * This can already happen even without hwpoisoned
635		 * pages: first on metadata IO errors (which only
636		 * report through AS_EIO) or when the page is dropped
637		 * at the wrong time.
638		 *
639		 * So right now we assume that the application DTRT on
640		 * the first EIO, but we're not worse than other parts
641		 * of the kernel.
642		 */
643		mapping_set_error(mapping, EIO);
644	}
645
646	return me_pagecache_clean(p, pfn);
647}
648
649/*
650 * Clean and dirty swap cache.
651 *
652 * Dirty swap cache page is tricky to handle. The page could live both in page
653 * cache and swap cache (i.e. the page is freshly swapped in). So it could be
654 * referenced concurrently by 2 types of PTEs:
655 * normal PTEs and swap PTEs. We try to handle them consistently by calling
656 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
657 * and then
658 *      - clear dirty bit to prevent IO
659 *      - remove from LRU
660 *      - but keep it in the swap cache, so that when we return to it on
661 *        a later page fault, we know the application is accessing
662 *        corrupted data and shall be killed (we installed simple
663 *        interception code in do_swap_page to catch it).
664 *
665 * Clean swap cache pages can be directly isolated. A later page fault will
666 * bring in the known good data from disk.
667 */
668static int me_swapcache_dirty(struct page *p, unsigned long pfn)
669{
670	ClearPageDirty(p);
671	/* Trigger EIO in shmem: */
672	ClearPageUptodate(p);
673
674	if (!delete_from_lru_cache(p))
675		return DELAYED;
676	else
677		return FAILED;
678}
679
680static int me_swapcache_clean(struct page *p, unsigned long pfn)
681{
682	delete_from_swap_cache(p);
683
684	if (!delete_from_lru_cache(p))
685		return RECOVERED;
686	else
687		return FAILED;
688}
689
690/*
691 * Huge pages. Needs work.
692 * Issues:
693 * No rmap support, so we cannot find the original mapper. In theory we could
694 * walk all MMs and look for the mappings, but that would be non-atomic and racy.
695 * Need rmap for hugepages for this. Alternatively we could employ a heuristic,
696 * like just walking the current process and hoping it has it mapped (that
697 * should usually be true for the common "shared database cache" case).
698 * Should handle free huge pages and dequeue them too, but this needs to
699 * handle huge page accounting correctly.
700 */
701static int me_huge_page(struct page *p, unsigned long pfn)
702{
703	return FAILED;
704}
705
706/*
707 * Various page states we can handle.
708 *
709 * A page state is defined by its current page->flags bits.
710 * The table matches them in order and calls the right handler.
711 *
712 * This is quite tricky because we can access the page at any time
713 * in its life cycle, so all accesses have to be extremely careful.
714 *
715 * This is not complete. More states could be added.
716 * For any missing state don't attempt recovery.
717 */
718
719#define dirty		(1UL << PG_dirty)
720#define sc		(1UL << PG_swapcache)
721#define unevict		(1UL << PG_unevictable)
722#define mlock		(1UL << PG_mlocked)
723#define writeback	(1UL << PG_writeback)
724#define lru		(1UL << PG_lru)
725#define swapbacked	(1UL << PG_swapbacked)
726#define head		(1UL << PG_head)
727#define tail		(1UL << PG_tail)
728#define compound	(1UL << PG_compound)
729#define slab		(1UL << PG_slab)
730#define reserved	(1UL << PG_reserved)
731
732static struct page_state {
733	unsigned long mask;
734	unsigned long res;
735	char *msg;
736	int (*action)(struct page *p, unsigned long pfn);
737} error_states[] = {
738	{ reserved,	reserved,	"reserved kernel",	me_kernel },
739	/*
740	 * free pages are specially detected outside this table:
741	 * PG_buddy pages only make up a small fraction of all free pages.
742	 */
743
744	/*
745	 * Could in theory check if the slab page is free or if we can drop
746	 * currently unused objects without touching them. But just
747	 * treat it as a standard kernel page for now.
748	 */
749	{ slab,		slab,		"kernel slab",	me_kernel },
750
751#ifdef CONFIG_PAGEFLAGS_EXTENDED
752	{ head,		head,		"huge",		me_huge_page },
753	{ tail,		tail,		"huge",		me_huge_page },
754#else
755	{ compound,	compound,	"huge",		me_huge_page },
756#endif
757
758	{ sc|dirty,	sc|dirty,	"swapcache",	me_swapcache_dirty },
759	{ sc|dirty,	sc,		"swapcache",	me_swapcache_clean },
760
761	{ unevict|dirty, unevict|dirty,	"unevictable LRU", me_pagecache_dirty},
762	{ unevict,	unevict,	"unevictable LRU", me_pagecache_clean},
763
764	{ mlock|dirty,	mlock|dirty,	"mlocked LRU",	me_pagecache_dirty },
765	{ mlock,	mlock,		"mlocked LRU",	me_pagecache_clean },
766
767	{ lru|dirty,	lru|dirty,	"LRU",		me_pagecache_dirty },
768	{ lru|dirty,	lru,		"clean LRU",	me_pagecache_clean },
769
770	/*
771	 * Catchall entry: must be at end.
772	 */
773	{ 0,		0,		"unknown page state",	me_unknown },
774};
775
776#undef dirty
777#undef sc
778#undef unevict
779#undef mlock
780#undef writeback
781#undef lru
782#undef swapbacked
783#undef head
784#undef tail
785#undef compound
786#undef slab
787#undef reserved
788
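/* Report the outcome of handling a poisoned page to the kernel log. */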
789static void action_result(unsigned long pfn, char *msg, int result)
790{
791	struct page *page = pfn_to_page(pfn);
792
793	printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
794		pfn,
795		PageDirty(page) ? "dirty " : "",
796		msg, action_name[result]);
797}
798
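/*
 * Run the handler for the matched page state, report the result and check
 * that no unexpected references to the page remain.
 */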
799static int page_action(struct page_state *ps, struct page *p,
800			unsigned long pfn)
801{
802	int result;
803	int count;
804
805	result = ps->action(p, pfn);
806	action_result(pfn, ps->msg, result);
807
808	count = page_count(p) - 1;
809	if (ps->action == me_swapcache_dirty && result == DELAYED)
810		count--;
811	if (count != 0) {
812		printk(KERN_ERR
813		       "MCE %#lx: %s page still referenced by %d users\n",
814		       pfn, ps->msg, count);
815		result = FAILED;
816	}
817
818	/* Could do more checks here if page looks ok */
819	/*
820	 * Could adjust zone counters here to correct for the missing page.
821	 */
822
823	return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
824}
825
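/* How many times to retry try_to_unmap() before giving up. */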
826#define N_UNMAP_TRIES 5
827
828/*
829 * Do all that is necessary to remove user space mappings. Unmap
830 * the pages and send SIGBUS to the processes if the data was dirty.
831 */
832static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
833				  int trapno)
834{
835	enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
836	struct address_space *mapping;
837	LIST_HEAD(tokill);
838	int ret;
839	int i;
840	int kill = 1;
841	struct page *hpage = compound_head(p);
842
843	if (PageReserved(p) || PageSlab(p))
844		return SWAP_SUCCESS;
845
846	/*
847	 * This check implies we don't kill processes early if their pages
848	 * are in the swap cache. Those are always late kills.
849	 */
850	if (!page_mapped(hpage))
851		return SWAP_SUCCESS;
852
853	if (PageKsm(p))
854		return SWAP_FAIL;
855
856	if (PageSwapCache(p)) {
857		printk(KERN_ERR
858		       "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
859		ttu |= TTU_IGNORE_HWPOISON;
860	}
861
862	/*
863	 * Propagate the dirty bit from PTEs to struct page first, because we
864	 * need this to decide if we should kill or just drop the page.
865	 * XXX: the dirty test could be racy: set_page_dirty() may not always
866	 * be called inside page lock (it's recommended but not enforced).
867	 */
868	mapping = page_mapping(hpage);
869	if (!PageDirty(hpage) && mapping &&
870	    mapping_cap_writeback_dirty(mapping)) {
871		if (page_mkclean(hpage)) {
872			SetPageDirty(hpage);
873		} else {
874			kill = 0;
875			ttu |= TTU_IGNORE_HWPOISON;
876			printk(KERN_INFO
877	"MCE %#lx: corrupted page was clean: dropped without side effects\n",
878				pfn);
879		}
880	}
881
882	/*
883	 * First collect all the processes that have the page
884	 * mapped in dirty form.  This has to be done before try_to_unmap,
885	 * because ttu takes the rmap data structures down.
886	 *
887	 * Error handling: We ignore errors here because
888	 * there's nothing that can be done.
889	 */
890	if (kill)
891		collect_procs(hpage, &tokill);
892
893	/*
894	 * try_to_unmap can fail temporarily due to races.
895	 * Try a few times (RED-PEN better strategy?)
896	 */
897	for (i = 0; i < N_UNMAP_TRIES; i++) {
898		ret = try_to_unmap(hpage, ttu);
899		if (ret == SWAP_SUCCESS)
900			break;
901		pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn,  ret);
902	}
903
904	if (ret != SWAP_SUCCESS)
905		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
906				pfn, page_mapcount(hpage));
907
908	/*
909	 * Now that the dirty bit has been propagated to the
910	 * struct page and all unmaps are done, we can decide if
911	 * killing is needed or not.  Only kill when the page
912	 * was dirty, otherwise the tokill list is merely
913	 * freed.  When there was a problem unmapping earlier,
914	 * use a more forceful, uncatchable kill to prevent
915	 * any accesses to the poisoned memory.
916	 */
917	kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
918		      ret != SWAP_SUCCESS, pfn);
919
920	return ret;
921}
922
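/*
 * Poison containment is done at hugepage granularity: mark or clear
 * PG_hwpoison on every subpage of the compound page.
 */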
923static void set_page_hwpoison_huge_page(struct page *hpage)
924{
925	int i;
926	int nr_pages = 1 << compound_order(hpage);
927	for (i = 0; i < nr_pages; i++)
928		SetPageHWPoison(hpage + i);
929}
930
931static void clear_page_hwpoison_huge_page(struct page *hpage)
932{
933	int i;
934	int nr_pages = 1 << compound_order(hpage);
935	for (i = 0; i < nr_pages; i++)
936		ClearPageHWPoison(hpage + i);
937}
938
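/*
 * Core of memory_failure().  @flags may contain MF_COUNT_INCREASED when the
 * caller already holds a reference on the page.  Returns 0 when the page was
 * handled (recovered, delayed, or already poisoned), otherwise a negative
 * error code.
 */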
939int __memory_failure(unsigned long pfn, int trapno, int flags)
940{
941	struct page_state *ps;
942	struct page *p;
943	struct page *hpage;
944	int res;
945	unsigned int nr_pages;
946
947	if (!sysctl_memory_failure_recovery)
948		panic("Memory failure from trap %d on page %lx", trapno, pfn);
949
950	if (!pfn_valid(pfn)) {
951		printk(KERN_ERR
952		       "MCE %#lx: memory outside kernel control\n",
953		       pfn);
954		return -ENXIO;
955	}
956
957	p = pfn_to_page(pfn);
958	hpage = compound_head(p);
959	if (TestSetPageHWPoison(p)) {
960		printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
961		return 0;
962	}
963
964	nr_pages = 1 << compound_order(hpage);
965	atomic_long_add(nr_pages, &mce_bad_pages);
966
967	/*
968	 * We need/can do nothing about count=0 pages.
969	 * 1) it's a free page, and therefore in safe hands:
970	 *    prep_new_page() will be the gatekeeper.
971	 * 2) it's part of a non-compound high order page.
972	 *    Implies some kernel user: cannot stop them from
973	 *    R/W the page; let's pray that the page has been
974	 *    used and will be freed some time later.
975	 * In fact it's dangerous to directly bump up page count from 0,
976	 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
977	 */
978	if (!(flags & MF_COUNT_INCREASED) &&
979		!get_page_unless_zero(hpage)) {
980		if (is_free_buddy_page(p)) {
981			action_result(pfn, "free buddy", DELAYED);
982			return 0;
983		} else {
984			action_result(pfn, "high order kernel", IGNORED);
985			return -EBUSY;
986		}
987	}
988
989	/*
990	 * We ignore non-LRU pages for good reasons.
991	 * - PG_locked is only well defined for LRU pages and a few others
992	 * - to avoid races with __set_page_locked()
993	 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
994	 * The check (unnecessarily) ignores LRU pages being isolated and
995	 * walked by the page reclaim code, however that's not a big loss.
996	 */
997	if (!PageLRU(p) && !PageHuge(p))
998		shake_page(p, 0);
999	if (!PageLRU(p) && !PageHuge(p)) {
1000		/*
1001		 * shake_page could have turned it free.
1002		 */
1003		if (is_free_buddy_page(p)) {
1004			action_result(pfn, "free buddy, 2nd try", DELAYED);
1005			return 0;
1006		}
1007		action_result(pfn, "non LRU", IGNORED);
1008		put_page(p);
1009		return -EBUSY;
1010	}
1011
1012	/*
1013	 * Lock the page and wait for writeback to finish.
1014	 * It's very difficult to mess with pages currently under IO
1015	 * and in many cases impossible, so we just avoid it here.
1016	 */
1017	lock_page_nosync(hpage);
1018
1019	/*
1020	 * unpoison always clears PG_hwpoison inside the page lock
1021	 */
1022	if (!PageHWPoison(p)) {
1023		printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
1024		res = 0;
1025		goto out;
1026	}
1027	if (hwpoison_filter(p)) {
1028		if (TestClearPageHWPoison(p))
1029			atomic_long_sub(nr_pages, &mce_bad_pages);
1030		unlock_page(hpage);
1031		put_page(hpage);
1032		return 0;
1033	}
1034
1035	/*
1036	 * For an error on a tail page, we should set PG_hwpoison
1037	 * on the head page to show that the hugepage is hwpoisoned.
1038	 */
1039	if (PageTail(p) && TestSetPageHWPoison(hpage)) {
1040		action_result(pfn, "hugepage already hardware poisoned",
1041				IGNORED);
1042		unlock_page(hpage);
1043		put_page(hpage);
1044		return 0;
1045	}
1046	/*
1047	 * Set PG_hwpoison on all pages in an error hugepage,
1048	 * because containment is done in hugepage unit for now.
1049	 * Since we have done TestSetPageHWPoison() for the head page with
1050	 * page lock held, we can safely set PG_hwpoison bits on tail pages.
1051	 */
1052	if (PageHuge(p))
1053		set_page_hwpoison_huge_page(hpage);
1054
1055	wait_on_page_writeback(p);
1056
1057	/*
1058	 * Now take care of user space mappings.
1059	 * Abort on failure: __remove_from_page_cache() assumes an unmapped page.
1060	 */
1061	if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
1062		printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
1063		res = -EBUSY;
1064		goto out;
1065	}
1066
1067	/*
1068	 * Torn down by someone else?
1069	 */
1070	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
1071		action_result(pfn, "already truncated LRU", IGNORED);
1072		res = -EBUSY;
1073		goto out;
1074	}
1075
1076	res = -EBUSY;
1077	for (ps = error_states;; ps++) {
1078		if ((p->flags & ps->mask) == ps->res) {
1079			res = page_action(ps, p, pfn);
1080			break;
1081		}
1082	}
1083out:
1084	unlock_page(hpage);
1085	return res;
1086}
1087EXPORT_SYMBOL_GPL(__memory_failure);
1088
1089/**
1090 * memory_failure - Handle memory failure of a page.
1091 * @pfn: Page Number of the corrupted page
1092 * @trapno: Trap number reported in the signal to user space.
1093 *
1094 * This function is called by the low level machine check code
1095 * of an architecture when it detects hardware memory corruption
1096 * of a page. It tries its best to recover, which includes
1097 * dropping pages, killing processes etc.
1098 *
1099 * The function is primarily of use for corruptions that
1100 * happen outside the current execution context (e.g. when
1101 * detected by a background scrubber).
1102 *
1103 * Must run in process context (e.g. a work queue) with interrupts
1104 * enabled and no spinlocks held.
1105 */
1106void memory_failure(unsigned long pfn, int trapno)
1107{
1108	__memory_failure(pfn, trapno, 0);
1109}
1110
1111/**
1112 * unpoison_memory - Unpoison a previously poisoned page
1113 * @pfn: Page number of the to be unpoisoned page
1114 *
1115 * Software-unpoison a page that has been poisoned by
1116 * memory_failure() earlier.
1117 *
1118 * This is only done on the software level, so it only works
1119 * for Linux-injected failures, not real hardware failures.
1120 *
1121 * Returns 0 for success, otherwise -errno.
1122 */
1123int unpoison_memory(unsigned long pfn)
1124{
1125	struct page *page;
1126	struct page *p;
1127	int freeit = 0;
1128	unsigned int nr_pages;
1129
1130	if (!pfn_valid(pfn))
1131		return -ENXIO;
1132
1133	p = pfn_to_page(pfn);
1134	page = compound_head(p);
1135
1136	if (!PageHWPoison(p)) {
1137		pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn);
1138		return 0;
1139	}
1140
1141	nr_pages = 1 << compound_order(page);
1142
1143	if (!get_page_unless_zero(page)) {
1144		if (TestClearPageHWPoison(p))
1145			atomic_long_sub(nr_pages, &mce_bad_pages);
1146		pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
1147		return 0;
1148	}
1149
1150	lock_page_nosync(page);
1151	/*
1152	 * This test is racy because PG_hwpoison is set outside of page lock.
1153	 * That's acceptable because that won't trigger kernel panic. Instead,
1154	 * the PG_hwpoison page will be caught and isolated on the entrance to
1155	 * the free buddy page pool.
1156	 */
1157	if (TestClearPageHWPoison(page)) {
1158		pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
1159		atomic_long_sub(nr_pages, &mce_bad_pages);
1160		freeit = 1;
1161	}
1162	if (PageHuge(p))
1163		clear_page_hwpoison_huge_page(page);
1164	unlock_page(page);
1165
1166	put_page(page);
1167	if (freeit)
1168		put_page(page);
1169
1170	return 0;
1171}
1172EXPORT_SYMBOL(unpoison_memory);
1173
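/*
 * Allocation callback for migrate_pages(): allocate the replacement page on
 * the same node as the page being offlined.
 */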
1174static struct page *new_page(struct page *p, unsigned long private, int **x)
1175{
1176	int nid = page_to_nid(p);
1177	return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1178}
1179
1180/*
1181 * Safely get reference count of an arbitrary page.
1182 * Returns 0 for a free page, -EIO for a zero refcount page
1183 * that is not free, and 1 for any other page type.
1184 * For 1 the page is returned with increased page count, otherwise not.
1185 */
1186static int get_any_page(struct page *p, unsigned long pfn, int flags)
1187{
1188	int ret;
1189
1190	if (flags & MF_COUNT_INCREASED)
1191		return 1;
1192
1193	/*
1194	 * The lock_system_sleep prevents a race with memory hotplug,
1195	 * because the isolation assumes there's only a single user.
1196	 * This is a big hammer; a better solution would be nicer.
1197	 */
1198	lock_system_sleep();
1199
1200	/*
1201	 * Isolate the page, so that it doesn't get reallocated if it
1202	 * was free.
1203	 */
1204	set_migratetype_isolate(p);
1205	if (!get_page_unless_zero(compound_head(p))) {
1206		if (is_free_buddy_page(p)) {
1207			pr_debug("get_any_page: %#lx free buddy page\n", pfn);
1208			/* Set hwpoison bit while page is still isolated */
1209			SetPageHWPoison(p);
1210			ret = 0;
1211		} else {
1212			pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n",
1213				pfn, p->flags);
1214			ret = -EIO;
1215		}
1216	} else {
1217		/* Not a free page */
1218		ret = 1;
1219	}
1220	unset_migratetype_isolate(p);
1221	unlock_system_sleep();
1222	return ret;
1223}
1224
1225/**
1226 * soft_offline_page - Soft offline a page.
1227 * @page: page to offline
1228 * @flags: flags. Same as memory_failure().
1229 *
1230 * Returns 0 on success, otherwise negated errno.
1231 *
1232 * Soft offline a page, by migration or invalidation,
1233 * without killing anything. This is for the case when
1234 * a page is not corrupted yet (so it's still valid to access),
1235 * but has had a number of corrected errors and is better taken
1236 * out.
1237 *
1238 * The actual policy on when to do that is maintained by
1239 * user space.
1240 *
1241 * This should never impact any application or cause data loss,
1242 * however it might take some time.
1243 *
1244 * This is not a 100% solution for all memory, but tries to be
1245 * ``good enough'' for the majority of memory.
1246 */
1247int soft_offline_page(struct page *page, int flags)
1248{
1249	int ret;
1250	unsigned long pfn = page_to_pfn(page);
1251
1252	ret = get_any_page(page, pfn, flags);
1253	if (ret < 0)
1254		return ret;
1255	if (ret == 0)
1256		goto done;
1257
1258	/*
1259	 * Page cache page we can handle?
1260	 */
1261	if (!PageLRU(page)) {
1262		/*
1263		 * Try to free it.
1264		 */
1265		put_page(page);
1266		shake_page(page, 1);
1267
1268		/*
1269		 * Did it turn free?
1270		 */
1271		ret = get_any_page(page, pfn, 0);
1272		if (ret < 0)
1273			return ret;
1274		if (ret == 0)
1275			goto done;
1276	}
1277	if (!PageLRU(page)) {
1278		pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n",
1279				pfn, page->flags);
1280		return -EIO;
1281	}
1282
1283	lock_page(page);
1284	wait_on_page_writeback(page);
1285
1286	/*
1287	 * Synchronized using the page lock with memory_failure()
1288	 */
1289	if (PageHWPoison(page)) {
1290		unlock_page(page);
1291		put_page(page);
1292		pr_debug("soft offline: %#lx page already poisoned\n", pfn);
1293		return -EBUSY;
1294	}
1295
1296	/*
1297	 * Try to invalidate first. This should work for
1298	 * non dirty unmapped page cache pages.
1299	 */
1300	ret = invalidate_inode_page(page);
1301	unlock_page(page);
1302
1303	/*
1304	 * Drop count because page migration doesn't like raised
1305	 * counts. The page could get re-allocated, but if it becomes
1306	 * LRU the isolation will just fail.
1307	 * RED-PEN: it would be better to keep it isolated here, but we
1308	 * would need to fix isolation locking first.
1309	 */
1310	put_page(page);
1311	if (ret == 1) {
1312		ret = 0;
1313		pr_debug("soft_offline: %#lx: invalidated\n", pfn);
1314		goto done;
1315	}
1316
1317	/*
1318	 * Simple invalidation didn't work.
1319	 * Try to migrate to a new page instead. migrate.c
1320	 * handles a large number of cases for us.
1321	 */
1322	ret = isolate_lru_page(page);
1323	if (!ret) {
1324		LIST_HEAD(pagelist);
1325
1326		list_add(&page->lru, &pagelist);
1327		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
1328		if (ret) {
1329			pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1330				pfn, ret, page->flags);
1331			if (ret > 0)
1332				ret = -EIO;
1333		}
1334	} else {
1335		pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1336				pfn, ret, page_count(page), page->flags);
1337	}
1338	if (ret)
1339		return ret;
1340
1341done:
1342	atomic_long_add(1, &mce_bad_pages);
1343	SetPageHWPoison(page);
1344	/* keep elevated page count for bad page */
1345	return ret;
1346}
1347