memory-failure.c revision 7c116f2b0dbac4a1dd051c7a5e8cef37701cafd4
1/*
2 * Copyright (C) 2008, 2009 Intel Corporation
3 * Authors: Andi Kleen, Fengguang Wu
4 *
5 * This software may be redistributed and/or modified under the terms of
6 * the GNU General Public License ("GPL") version 2 only as published by the
7 * Free Software Foundation.
8 *
9 * High level machine check handler. Handles pages reported by the
10 * hardware as being corrupted, usually due to a 2-bit ECC memory or cache
11 * failure.
12 *
13 * Handles page cache pages in various states. The tricky part
14 * here is that we can access any page asynchronously with respect to
15 * other VM users, because memory failures could happen anytime and
16 * anywhere, possibly violating some of their assumptions. This is why
17 * this code has to be extremely careful. Generally it tries to use
18 * normal locking rules, i.e. it takes the standard locks, even if that
19 * means the error handling potentially takes a long time.
20 *
21 * The operation to map back from RMAP chains to processes has to walk
22 * the complete process list and has non-linear complexity in the number
23 * of mappings. In short it can be quite slow. But since memory corruptions
24 * are rare we hope to get away with this.
25 */
26
27/*
28 * Notebook:
29 * - hugetlb needs more code
30 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
31 * - pass bad pages to kdump next kernel
32 */
33#define DEBUG 1		/* remove me in 2.6.34 */
34#include <linux/kernel.h>
35#include <linux/mm.h>
36#include <linux/page-flags.h>
37#include <linux/sched.h>
38#include <linux/ksm.h>
39#include <linux/rmap.h>
40#include <linux/pagemap.h>
41#include <linux/swap.h>
42#include <linux/backing-dev.h>
43#include "internal.h"
44
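/*
 * When set, processes that map a poisoned page are signalled as soon as the
 * corruption is detected instead of when they next touch the page
 * (per-task PF_MCE_* flags can override this, see task_early_kill()).
 */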
45int sysctl_memory_failure_early_kill __read_mostly = 0;
46
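/*
 * When clear, __memory_failure() panics instead of attempting any recovery.
 */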
47int sysctl_memory_failure_recovery __read_mostly = 1;
48
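/*
 * Number of pages currently marked HWPoison: bumped in __memory_failure()
 * and dropped again by unpoison_memory() or the hwpoison filter path.
 */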
49atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
50
51u32 hwpoison_filter_dev_major = ~0U;
52u32 hwpoison_filter_dev_minor = ~0U;
53EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
54EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
55
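/*
 * Optional device filter: when the major/minor numbers above are set to
 * anything other than ~0U, only pages whose backing inode lives on the
 * matching block device are handled and all other poisoned pages are
 * skipped. This is meant for targeted testing, e.g. with a poison injector.
 */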
56static int hwpoison_filter_dev(struct page *p)
57{
58	struct address_space *mapping;
59	dev_t dev;
60
61	if (hwpoison_filter_dev_major == ~0U &&
62	    hwpoison_filter_dev_minor == ~0U)
63		return 0;
64
65	/*
66	 * page_mapping() does not accept slab pages.
67	 */
68	if (PageSlab(p))
69		return -EINVAL;
70
71	mapping = page_mapping(p);
72	if (mapping == NULL || mapping->host == NULL)
73		return -EINVAL;
74
75	dev = mapping->host->i_sb->s_dev;
76	if (hwpoison_filter_dev_major != ~0U &&
77	    hwpoison_filter_dev_major != MAJOR(dev))
78		return -EINVAL;
79	if (hwpoison_filter_dev_minor != ~0U &&
80	    hwpoison_filter_dev_minor != MINOR(dev))
81		return -EINVAL;
82
83	return 0;
84}
85
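/*
 * Top-level filter hook: a non-zero return tells the caller to ignore this
 * poisoned page. Currently only the device filter above is consulted.
 */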
86int hwpoison_filter(struct page *p)
87{
88	if (hwpoison_filter_dev(p))
89		return -EINVAL;
90
91	return 0;
92}
93EXPORT_SYMBOL_GPL(hwpoison_filter);
94
95/*
96 * Send all the processes that have the page mapped an ``action optional''
97 * signal.
98 */
99static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
100			unsigned long pfn)
101{
102	struct siginfo si;
103	int ret;
104
105	printk(KERN_ERR
106		"MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
107		pfn, t->comm, t->pid);
108	si.si_signo = SIGBUS;
109	si.si_errno = 0;
110	si.si_code = BUS_MCEERR_AO;
111	si.si_addr = (void *)addr;
112#ifdef __ARCH_SI_TRAPNO
113	si.si_trapno = trapno;
114#endif
115	si.si_addr_lsb = PAGE_SHIFT;
116	/*
117	 * Don't use force here; it's convenient if the signal
118	 * can be temporarily blocked.
119	 * This could cause a loop when the user sets SIGBUS
120	 * to SIG_IGN, but hopefully no one will do that?
121	 */
122	ret = send_sig_info(SIGBUS, &si, t);  /* synchronous? */
123	if (ret < 0)
124		printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
125		       t->comm, t->pid, ret);
126	return ret;
127}
128
129/*
130 * When an unknown page type is encountered, drain as many buffers as possible
131 * in the hope of turning the page into an LRU or free page, which we can handle.
132 */
133void shake_page(struct page *p)
134{
135	if (!PageSlab(p)) {
136		lru_add_drain_all();
137		if (PageLRU(p))
138			return;
139		drain_all_pages();
140		if (PageLRU(p) || is_free_buddy_page(p))
141			return;
142	}
143	/*
144	 * Could call shrink_slab here (which would also
145	 * shrink other caches). Unfortunately that might
146	 * also access the corrupted page, which could be fatal.
147	 */
148}
149EXPORT_SYMBOL_GPL(shake_page);
150
151/*
152 * Kill all processes that have a poisoned page mapped and then isolate
153 * the page.
154 *
155 * General strategy:
156 * Find all processes having the page mapped and kill them.
157 * But we keep a page reference around so that the page is not
158 * actually freed yet.
159 * Then stash the page away
160 *
161 * There's no convenient way to get back to mapped processes
162 * from the VMAs. So do a brute-force search over all
163 * running processes.
164 *
165 * Remember that machine checks are not common (or rather
166 * if they are common you have other problems), so this shouldn't
167 * be a performance issue.
168 *
169 * Also there are some races possible while we get from the
170 * error detection to actually handle it.
171 */
172
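/*
 * One pending kill: the task to signal and the user address at which it maps
 * the poisoned page. addr_valid is cleared when the reverse lookup failed,
 * in which case a plain SIGKILL is used instead of a targeted SIGBUS.
 */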
173struct to_kill {
174	struct list_head nd;
175	struct task_struct *tsk;
176	unsigned long addr;
177	unsigned addr_valid:1;
178};
179
180/*
181 * Failure handling: if we can't find or can't kill a process there's
182 * not much we can do.	We just print a message and otherwise ignore it.
183 */
184
185/*
186 * Schedule a process for later kill.
187 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
188 * TBD would GFP_NOIO be enough?
189 */
190static void add_to_kill(struct task_struct *tsk, struct page *p,
191		       struct vm_area_struct *vma,
192		       struct list_head *to_kill,
193		       struct to_kill **tkc)
194{
195	struct to_kill *tk;
196
197	if (*tkc) {
198		tk = *tkc;
199		*tkc = NULL;
200	} else {
201		tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
202		if (!tk) {
203			printk(KERN_ERR
204		"MCE: Out of memory while machine check handling\n");
205			return;
206		}
207	}
208	tk->addr = page_address_in_vma(p, vma);
209	tk->addr_valid = 1;
210
211	/*
212	 * In theory we don't have to kill when the page was
213	 * munmapped. But it could also have been mremapped. Since that's
214	 * likely very rare, kill anyway just out of paranoia, but use
215	 * a SIGKILL because the error is not contained anymore.
216	 */
217	if (tk->addr == -EFAULT) {
218		pr_debug("MCE: Unable to find user space address %lx in %s\n",
219			page_to_pfn(p), tsk->comm);
220		tk->addr_valid = 0;
221	}
222	get_task_struct(tsk);
223	tk->tsk = tsk;
224	list_add_tail(&tk->nd, to_kill);
225}
226
227/*
228 * Kill the processes that have been collected earlier.
229 *
230 * Only do anything when DOIT is set, otherwise just free the list
231 * (this is used for clean pages, which do not need killing).
232 * Also, when FAIL is set, do a force kill because something went
233 * wrong earlier.
234 */
235static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
236			  int fail, unsigned long pfn)
237{
238	struct to_kill *tk, *next;
239
240	list_for_each_entry_safe (tk, next, to_kill, nd) {
241		if (doit) {
242			/*
243			 * In case something went wrong with unmapping,
244			 * make sure the process doesn't catch the
245			 * signal in a handler and then access the
246			 * memory anyway. Just kill it.
247			 */
248			if (fail || tk->addr_valid == 0) {
249				printk(KERN_ERR
250		"MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
251					pfn, tk->tsk->comm, tk->tsk->pid);
252				force_sig(SIGKILL, tk->tsk);
253			}
254
255			/*
256			 * In theory the process could have mapped
257			 * something else at the address in between. We could
258			 * check for that, but we need to tell the
259			 * process anyway.
260			 */
261			else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
262					      pfn) < 0)
263				printk(KERN_ERR
264		"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
265					pfn, tk->tsk->comm, tk->tsk->pid);
266		}
267		put_task_struct(tk->tsk);
268		kfree(tk);
269	}
270}
271
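/*
 * Decide whether this task wants to be killed early: the per-process
 * PF_MCE_PROCESS/PF_MCE_EARLY flags take precedence over the global sysctl,
 * and tasks without an mm are never signalled early.
 */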
272static int task_early_kill(struct task_struct *tsk)
273{
274	if (!tsk->mm)
275		return 0;
276	if (tsk->flags & PF_MCE_PROCESS)
277		return !!(tsk->flags & PF_MCE_EARLY);
278	return sysctl_memory_failure_early_kill;
279}
280
281/*
282 * Collect processes when the error hit an anonymous page.
283 */
284static void collect_procs_anon(struct page *page, struct list_head *to_kill,
285			      struct to_kill **tkc)
286{
287	struct vm_area_struct *vma;
288	struct task_struct *tsk;
289	struct anon_vma *av;
290
291	read_lock(&tasklist_lock);
292	av = page_lock_anon_vma(page);
293	if (av == NULL)	/* Not actually mapped anymore */
294		goto out;
295	for_each_process (tsk) {
296		if (!task_early_kill(tsk))
297			continue;
298		list_for_each_entry (vma, &av->head, anon_vma_node) {
299			if (!page_mapped_in_vma(page, vma))
300				continue;
301			if (vma->vm_mm == tsk->mm)
302				add_to_kill(tsk, page, vma, to_kill, tkc);
303		}
304	}
305	page_unlock_anon_vma(av);
306out:
307	read_unlock(&tasklist_lock);
308}
309
310/*
311 * Collect processes when the error hit a file mapped page.
312 */
313static void collect_procs_file(struct page *page, struct list_head *to_kill,
314			      struct to_kill **tkc)
315{
316	struct vm_area_struct *vma;
317	struct task_struct *tsk;
318	struct prio_tree_iter iter;
319	struct address_space *mapping = page->mapping;
320
321	/*
322	 * A note on the locking order between the two locks.
323	 * We don't rely on this particular order.
324	 * If you have some other code that needs a different order
325	 * feel free to switch them around. Or add a reverse link
326	 * from mm_struct to task_struct, then this could be all
327	 * done without taking tasklist_lock and looping over all tasks.
328	 */
329
330	read_lock(&tasklist_lock);
331	spin_lock(&mapping->i_mmap_lock);
332	for_each_process(tsk) {
333		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
334
335		if (!task_early_kill(tsk))
336			continue;
337
338		vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
339				      pgoff) {
340			/*
341			 * Send early kill signal to tasks where a vma covers
342			 * the page but the corrupted page is not necessarily
343			 * mapped in its pte.
344			 * Assume applications that requested early kill want
345			 * to be informed of all such data corruptions.
346			 */
347			if (vma->vm_mm == tsk->mm)
348				add_to_kill(tsk, page, vma, to_kill, tkc);
349		}
350	}
351	spin_unlock(&mapping->i_mmap_lock);
352	read_unlock(&tasklist_lock);
353}
354
355/*
356 * Collect the processes that have the corrupted page mapped, to kill them.
357 * This is done in two steps for locking reasons:
358 * first preallocate one to_kill structure outside the spin locks,
359 * so that we can kill at least one process reasonably reliably.
360 */
361static void collect_procs(struct page *page, struct list_head *tokill)
362{
363	struct to_kill *tk;
364
365	if (!page->mapping)
366		return;
367
368	tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
369	if (!tk)
370		return;
371	if (PageAnon(page))
372		collect_procs_anon(page, tokill, &tk);
373	else
374		collect_procs_file(page, tokill, &tk);
375	kfree(tk);
376}
377
378/*
379 * Error handlers for various types of pages.
380 */
381
382enum outcome {
383	IGNORED,	/* Error: cannot be handled */
384	FAILED,		/* Error: handling failed */
385	DELAYED,	/* Will be handled later */
386	RECOVERED,	/* Successfully recovered */
387};
388
389static const char *action_name[] = {
390	[IGNORED] = "Ignored",
391	[FAILED] = "Failed",
392	[DELAYED] = "Delayed",
393	[RECOVERED] = "Recovered",
394};
395
396/*
397 * XXX: It is possible that a page is isolated from LRU cache,
398 * and then kept in swap cache or fails to be removed from page cache.
399 * The page count will stop it from being freed by unpoison.
400 * Stress tests should be aware of this memory leak problem.
401 */
402static int delete_from_lru_cache(struct page *p)
403{
404	if (!isolate_lru_page(p)) {
405		/*
406		 * Clear the page flags the buddy allocator checks, so that it
407		 * won't complain when the page is later unpoisoned and freed.
408		 */
409		ClearPageActive(p);
410		ClearPageUnevictable(p);
411		/*
412		 * drop the page count elevated by isolate_lru_page()
413		 */
414		page_cache_release(p);
415		return 0;
416	}
417	return -EIO;
418}
419
420/*
421 * Error hit kernel page.
422 * Do nothing; try to be lucky and not touch the page. For a few cases we
423 * could be more sophisticated.
424 */
425static int me_kernel(struct page *p, unsigned long pfn)
426{
427	return IGNORED;
428}
429
430/*
431 * Page in unknown state. Do nothing.
432 */
433static int me_unknown(struct page *p, unsigned long pfn)
434{
435	printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
436	return FAILED;
437}
438
439/*
440 * Clean (or cleaned) page cache page.
441 */
442static int me_pagecache_clean(struct page *p, unsigned long pfn)
443{
444	int err;
445	int ret = FAILED;
446	struct address_space *mapping;
447
448	delete_from_lru_cache(p);
449
450	/*
451	 * For anonymous pages we're done; the only reference left
452	 * should be the one memory_failure() holds.
453	 */
454	if (PageAnon(p))
455		return RECOVERED;
456
457	/*
458	 * Now truncate the page in the page cache. This is really
459	 * more like a "temporary hole punch".
460	 * Don't do this for block devices when someone else
461	 * has a reference, because it could be file system metadata
462	 * and that's not safe to truncate.
463	 */
464	mapping = page_mapping(p);
465	if (!mapping) {
466		/*
467		 * Page has been torn down in the meantime.
468		 */
469		return FAILED;
470	}
471
472	/*
473	 * Truncation is a bit tricky. Enable it per file system for now.
474	 *
475	 * Open: to take i_mutex or not for this? Right now we don't.
476	 */
477	if (mapping->a_ops->error_remove_page) {
478		err = mapping->a_ops->error_remove_page(mapping, p);
479		if (err != 0) {
480			printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
481					pfn, err);
482		} else if (page_has_private(p) &&
483				!try_to_release_page(p, GFP_NOIO)) {
484			pr_debug("MCE %#lx: failed to release buffers\n", pfn);
485		} else {
486			ret = RECOVERED;
487		}
488	} else {
489		/*
490		 * If the file system doesn't support it, just invalidate.
491		 * This fails on dirty pages or anything with private data.
492		 */
493		if (invalidate_inode_page(p))
494			ret = RECOVERED;
495		else
496			printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
497				pfn);
498	}
499	return ret;
500}
501
502/*
503 * Dirty pagecache page.
504 * Issues: when the error hits a hole page the error is not properly
505 * propagated.
506 */
507static int me_pagecache_dirty(struct page *p, unsigned long pfn)
508{
509	struct address_space *mapping = page_mapping(p);
510
511	SetPageError(p);
512	/* TBD: print more information about the file. */
513	if (mapping) {
514		/*
515		 * An IO error will be reported by write(), fsync(), etc.,
516		 * which check the mapping.
517		 * This way the application knows that something went
518		 * wrong with its dirty file data.
519		 *
520		 * There's one open issue:
521		 *
522		 * The EIO will only be reported on the next IO
523		 * operation and is then cleared from the mapping.
524		 * Normally Linux has two mechanisms to pass an IO error:
525		 * first through the AS_EIO flag in the address space
526		 * and then through the PageError flag on the page.
527		 * Since we drop pages on memory failure handling, the
528		 * only mechanism open to us is AS_EIO.
529		 *
530		 * This has the disadvantage that it gets cleared on
531		 * the first operation that returns an error, while
532		 * the PageError bit is more sticky and only cleared
533		 * when the page is reread or dropped.  If an
534		 * application assumes it will always get an error on
535		 * fsync, but does other operations on the fd before
536		 * and the page is dropped in between, then the error
537		 * will not be properly reported.
538		 *
539		 * This can already happen even without hwpoisoned
540		 * pages: first on metadata IO errors (which only
541		 * report through AS_EIO) or when the page is dropped
542		 * at the wrong time.
543		 *
544		 * So right now we assume that the application does the
545		 * right thing on the first EIO, but we're not worse
546		 * than other parts of the kernel.
547		 */
548		mapping_set_error(mapping, EIO);
549	}
550
551	return me_pagecache_clean(p, pfn);
552}
553
554/*
555 * Clean and dirty swap cache.
556 *
557 * Dirty swap cache page is tricky to handle. The page could live both in page
558 * cache and swap cache (i.e. the page was freshly swapped in). So it could be
559 * referenced concurrently by two types of PTEs:
560 * normal PTEs and swap PTEs. We try to handle them consistently by calling
561 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
562 * and then
563 *      - clear dirty bit to prevent IO
564 *      - remove from LRU
565 *      - but keep in the swap cache, so that when we return to it on
566 *        a later page fault, we know the application is accessing
567 *        corrupted data and shall be killed (we installed simple
568 *        interception code in do_swap_page to catch it).
569 *
570 * Clean swap cache pages can be directly isolated. A later page fault will
571 * bring in the known good data from disk.
572 */
573static int me_swapcache_dirty(struct page *p, unsigned long pfn)
574{
575	ClearPageDirty(p);
576	/* Trigger EIO in shmem: */
577	ClearPageUptodate(p);
578
579	if (!delete_from_lru_cache(p))
580		return DELAYED;
581	else
582		return FAILED;
583}
584
585static int me_swapcache_clean(struct page *p, unsigned long pfn)
586{
587	delete_from_swap_cache(p);
588
589	if (!delete_from_lru_cache(p))
590		return RECOVERED;
591	else
592		return FAILED;
593}
594
595/*
596 * Huge pages. Needs work.
597 * Issues:
598 * No rmap support, so we cannot find the original mapper. In theory we could
599 * walk all MMs and look for the mappings, but that would be non-atomic and racy.
600 * We need rmap for hugepages for this. Alternatively we could employ a heuristic,
601 * like just walking the current process and hoping it has the page mapped (that
602 * should usually be true for the common "shared database cache" case).
603 * We should also handle free huge pages and dequeue them, but this needs to
604 * handle huge page accounting correctly.
605 */
606static int me_huge_page(struct page *p, unsigned long pfn)
607{
608	return FAILED;
609}
610
611/*
612 * Various page states we can handle.
613 *
614 * A page state is defined by its current page->flags bits.
615 * The table matches them in order and calls the right handler.
616 *
617 * This is quite tricky because we can access the page at any time
618 * in its life cycle, so all accesses have to be extremely careful.
619 *
620 * This is not complete. More states could be added.
621 * For any missing state don't attempt recovery.
622 */
623
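/* Shorthand for the page flag bits tested in the error_states table below. */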
624#define dirty		(1UL << PG_dirty)
625#define sc		(1UL << PG_swapcache)
626#define unevict		(1UL << PG_unevictable)
627#define mlock		(1UL << PG_mlocked)
628#define writeback	(1UL << PG_writeback)
629#define lru		(1UL << PG_lru)
630#define swapbacked	(1UL << PG_swapbacked)
631#define head		(1UL << PG_head)
632#define tail		(1UL << PG_tail)
633#define compound	(1UL << PG_compound)
634#define slab		(1UL << PG_slab)
635#define reserved	(1UL << PG_reserved)
636
637static struct page_state {
638	unsigned long mask;
639	unsigned long res;
640	char *msg;
641	int (*action)(struct page *p, unsigned long pfn);
642} error_states[] = {
643	{ reserved,	reserved,	"reserved kernel",	me_kernel },
644	/*
645	 * free pages are specially detected outside this table:
646	 * PG_buddy pages make up only a small fraction of all free pages.
647	 */
648
649	/*
650	 * Could in theory check if slab page is free or if we can drop
651	 * currently unused objects without touching them. But just
652	 * treat it as standard kernel for now.
653	 */
654	{ slab,		slab,		"kernel slab",	me_kernel },
655
656#ifdef CONFIG_PAGEFLAGS_EXTENDED
657	{ head,		head,		"huge",		me_huge_page },
658	{ tail,		tail,		"huge",		me_huge_page },
659#else
660	{ compound,	compound,	"huge",		me_huge_page },
661#endif
662
663	{ sc|dirty,	sc|dirty,	"swapcache",	me_swapcache_dirty },
664	{ sc|dirty,	sc,		"swapcache",	me_swapcache_clean },
665
666	{ unevict|dirty, unevict|dirty,	"unevictable LRU", me_pagecache_dirty},
667	{ unevict,	unevict,	"unevictable LRU", me_pagecache_clean},
668
669	{ mlock|dirty,	mlock|dirty,	"mlocked LRU",	me_pagecache_dirty },
670	{ mlock,	mlock,		"mlocked LRU",	me_pagecache_clean },
671
672	{ lru|dirty,	lru|dirty,	"LRU",		me_pagecache_dirty },
673	{ lru|dirty,	lru,		"clean LRU",	me_pagecache_clean },
674
675	/*
676	 * Catchall entry: must be at end.
677	 */
678	{ 0,		0,		"unknown page state",	me_unknown },
679};
680
681static void action_result(unsigned long pfn, char *msg, int result)
682{
683	struct page *page = pfn_to_page(pfn);
684
685	printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
686		pfn,
687		PageDirty(page) ? "dirty " : "",
688		msg, action_name[result]);
689}
690
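/*
 * Run the handler for the matched page state and report the outcome. Any
 * reference beyond the one held by the caller (plus the swap cache reference
 * in the delayed dirty-swapcache case) means somebody else may still touch
 * the page, so the result is downgraded to FAILED.
 */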
691static int page_action(struct page_state *ps, struct page *p,
692			unsigned long pfn)
693{
694	int result;
695	int count;
696
697	result = ps->action(p, pfn);
698	action_result(pfn, ps->msg, result);
699
700	count = page_count(p) - 1;
701	if (ps->action == me_swapcache_dirty && result == DELAYED)
702		count--;
703	if (count != 0) {
704		printk(KERN_ERR
705		       "MCE %#lx: %s page still referenced by %d users\n",
706		       pfn, ps->msg, count);
707		result = FAILED;
708	}
709
710	/* Could do more checks here if page looks ok */
711	/*
712	 * Could adjust zone counters here to correct for the missing page.
713	 */
714
715	return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
716}
717
718#define N_UNMAP_TRIES 5
719
720/*
721 * Do all that is necessary to remove user space mappings. Unmap
722 * the pages and send SIGBUS to the processes if the data was dirty.
723 */
724static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
725				  int trapno)
726{
727	enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
728	struct address_space *mapping;
729	LIST_HEAD(tokill);
730	int ret;
731	int i;
732	int kill = 1;
733
734	if (PageReserved(p) || PageSlab(p))
735		return SWAP_SUCCESS;
736
737	/*
738	 * This check implies we don't kill processes early if their pages
739	 * are in the swap cache. Those are always late kills.
740	 */
741	if (!page_mapped(p))
742		return SWAP_SUCCESS;
743
744	if (PageCompound(p) || PageKsm(p))
745		return SWAP_FAIL;
746
747	if (PageSwapCache(p)) {
748		printk(KERN_ERR
749		       "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
750		ttu |= TTU_IGNORE_HWPOISON;
751	}
752
753	/*
754	 * Propagate the dirty bit from PTEs to struct page first, because we
755	 * need this to decide if we should kill or just drop the page.
756	 * XXX: the dirty test could be racy: set_page_dirty() may not always
757	 * be called inside page lock (it's recommended but not enforced).
758	 */
759	mapping = page_mapping(p);
760	if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
761		if (page_mkclean(p)) {
762			SetPageDirty(p);
763		} else {
764			kill = 0;
765			ttu |= TTU_IGNORE_HWPOISON;
766			printk(KERN_INFO
767	"MCE %#lx: corrupted page was clean: dropped without side effects\n",
768				pfn);
769		}
770	}
771
772	/*
773	 * First collect all the processes that have the page
774	 * mapped in dirty form.  This has to be done before try_to_unmap,
775	 * because ttu takes the rmap data structures down.
776	 *
777	 * Error handling: We ignore errors here because
778	 * there's nothing that can be done.
779	 */
780	if (kill)
781		collect_procs(p, &tokill);
782
783	/*
784	 * try_to_unmap can fail temporarily due to races.
785	 * Try a few times (RED-PEN better strategy?)
786	 */
787	for (i = 0; i < N_UNMAP_TRIES; i++) {
788		ret = try_to_unmap(p, ttu);
789		if (ret == SWAP_SUCCESS)
790			break;
791		pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn,  ret);
792	}
793
794	if (ret != SWAP_SUCCESS)
795		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
796				pfn, page_mapcount(p));
797
798	/*
799	 * Now that the dirty bit has been propagated to the
800	 * struct page and all unmaps done we can decide if
801	 * killing is needed or not.  Only kill when the page
802	 * was dirty, otherwise the tokill list is merely
803	 * freed.  When there was a problem unmapping earlier
804	 * use a more forceful, uncatchable kill to prevent
805	 * any accesses to the poisoned memory.
806	 */
807	kill_procs_ao(&tokill, !!PageDirty(p), trapno,
808		      ret != SWAP_SUCCESS, pfn);
809
810	return ret;
811}
812
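/*
 * Core of the recovery path. @flags may contain MF_COUNT_INCREASED to
 * indicate that the caller already holds an extra reference on the page,
 * in which case no additional reference is taken here.
 */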
813int __memory_failure(unsigned long pfn, int trapno, int flags)
814{
815	struct page_state *ps;
816	struct page *p;
817	int res;
818
819	if (!sysctl_memory_failure_recovery)
820		panic("Memory failure from trap %d on page %lx", trapno, pfn);
821
822	if (!pfn_valid(pfn)) {
823		printk(KERN_ERR
824		       "MCE %#lx: memory outside kernel control\n",
825		       pfn);
826		return -ENXIO;
827	}
828
829	p = pfn_to_page(pfn);
830	if (TestSetPageHWPoison(p)) {
831		printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
832		return 0;
833	}
834
835	atomic_long_add(1, &mce_bad_pages);
836
837	/*
838	 * We neither need to nor can do anything about count=0 pages.
839	 * 1) it's a free page, and therefore in safe hands:
840	 *    prep_new_page() will be the gate keeper.
841	 * 2) it's part of a non-compound high order page.
842	 *    Implies some kernel user: we cannot stop them from
843	 *    reading/writing the page; let's pray that the page has been
844	 *    used and will be freed some time later.
845	 * In fact it's dangerous to directly bump up page count from 0,
846	 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
847	 */
848	if (!(flags & MF_COUNT_INCREASED) &&
849		!get_page_unless_zero(compound_head(p))) {
850		if (is_free_buddy_page(p)) {
851			action_result(pfn, "free buddy", DELAYED);
852			return 0;
853		} else {
854			action_result(pfn, "high order kernel", IGNORED);
855			return -EBUSY;
856		}
857	}
858
859	/*
860	 * We ignore non-LRU pages for good reasons.
861	 * - PG_locked is only well defined for LRU pages and a few others
862	 * - to avoid races with __set_page_locked()
863	 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
864	 * The check (unnecessarily) ignores LRU pages being isolated and
865	 * walked by the page reclaim code; however, that's not a big loss.
866	 */
867	if (!PageLRU(p))
868		lru_add_drain_all();
869	if (!PageLRU(p)) {
870		action_result(pfn, "non LRU", IGNORED);
871		put_page(p);
872		return -EBUSY;
873	}
874
875	/*
876	 * Lock the page and wait for writeback to finish.
877	 * It's very difficult to mess with pages currently under IO
878	 * and in many cases impossible, so we just avoid it here.
879	 */
880	lock_page_nosync(p);
881
882	/*
883	 * unpoison always clears PG_hwpoison inside the page lock
884	 */
885	if (!PageHWPoison(p)) {
886		printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
887		res = 0;
888		goto out;
889	}
890	if (hwpoison_filter(p)) {
891		if (TestClearPageHWPoison(p))
892			atomic_long_dec(&mce_bad_pages);
893		unlock_page(p);
894		put_page(p);
895		return 0;
896	}
897
898	wait_on_page_writeback(p);
899
900	/*
901	 * Now take care of user space mappings.
902	 * Abort on fail: __remove_from_page_cache() assumes unmapped page.
903	 */
904	if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
905		printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
906		res = -EBUSY;
907		goto out;
908	}
909
910	/*
911	 * Torn down by someone else?
912	 */
913	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
914		action_result(pfn, "already truncated LRU", IGNORED);
915		res = -EBUSY;
916		goto out;
917	}
918
919	res = -EBUSY;
920	for (ps = error_states;; ps++) {
921		if ((p->flags & ps->mask) == ps->res) {
922			res = page_action(ps, p, pfn);
923			break;
924		}
925	}
926out:
927	unlock_page(p);
928	return res;
929}
930EXPORT_SYMBOL_GPL(__memory_failure);
931
932/**
933 * memory_failure - Handle memory failure of a page.
934 * @pfn: Page Number of the corrupted page
935 * @trapno: Trap number reported in the signal to user space.
936 *
937 * This function is called by the low level machine check code
938 * of an architecture when it detects hardware memory corruption
939 * of a page. It tries its best to recover, which includes
940 * dropping pages, killing processes etc.
941 *
942 * The function is primarily of use for corruptions that
943 * happen outside the current execution context (e.g. when
944 * detected by a background scrubber).
945 *
946 * Must run in process context (e.g. a work queue) with interrupts
947 * enabled and no spinlocks held.
948 */
949void memory_failure(unsigned long pfn, int trapno)
950{
951	__memory_failure(pfn, trapno, 0);
952}
953
954/**
955 * unpoison_memory - Unpoison a previously poisoned page
956 * @pfn: Page number of the to be unpoisoned page
957 *
958 * Software-unpoison a page that has been poisoned by
959 * memory_failure() earlier.
960 *
961 * This is only done on the software level, so it only works
962 * for Linux-injected failures, not real hardware failures.
963 *
964 * Returns 0 for success, otherwise -errno.
965 */
966int unpoison_memory(unsigned long pfn)
967{
968	struct page *page;
969	struct page *p;
970	int freeit = 0;
971
972	if (!pfn_valid(pfn))
973		return -ENXIO;
974
975	p = pfn_to_page(pfn);
976	page = compound_head(p);
977
978	if (!PageHWPoison(p)) {
979		pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn);
980		return 0;
981	}
982
983	if (!get_page_unless_zero(page)) {
984		if (TestClearPageHWPoison(p))
985			atomic_long_dec(&mce_bad_pages);
986		pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
987		return 0;
988	}
989
990	lock_page_nosync(page);
991	/*
992	 * This test is racy because PG_hwpoison is set outside of page lock.
993	 * That's acceptable because that won't trigger kernel panic. Instead,
994	 * the PG_hwpoison page will be caught and isolated on the entrance to
995	 * the free buddy page pool.
996	 */
997	if (TestClearPageHWPoison(p)) {
998		pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
999		atomic_long_dec(&mce_bad_pages);
1000		freeit = 1;
1001	}
1002	unlock_page(page);
1003
1004	put_page(page);
1005	if (freeit)
1006		put_page(page);
1007
1008	return 0;
1009}
1010EXPORT_SYMBOL(unpoison_memory);
1011