swapfile.c revision b27256439568950f30864ccecaeb6dfb588089d5
1/*
2 *  linux/mm/swapfile.c
3 *
4 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
5 *  Swap reorganised 29.12.95, Stephen Tweedie
6 */
7
8#include <linux/mm.h>
9#include <linux/hugetlb.h>
10#include <linux/mman.h>
11#include <linux/slab.h>
12#include <linux/kernel_stat.h>
13#include <linux/swap.h>
14#include <linux/vmalloc.h>
15#include <linux/pagemap.h>
16#include <linux/namei.h>
17#include <linux/shm.h>
18#include <linux/blkdev.h>
19#include <linux/random.h>
20#include <linux/writeback.h>
21#include <linux/proc_fs.h>
22#include <linux/seq_file.h>
23#include <linux/init.h>
24#include <linux/module.h>
25#include <linux/ksm.h>
26#include <linux/rmap.h>
27#include <linux/security.h>
28#include <linux/backing-dev.h>
29#include <linux/mutex.h>
30#include <linux/capability.h>
31#include <linux/syscalls.h>
32#include <linux/memcontrol.h>
33
34#include <asm/pgtable.h>
35#include <asm/tlbflush.h>
36#include <linux/swapops.h>
37#include <linux/page_cgroup.h>
38
39static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
40				 unsigned char);
41static void free_swap_count_continuations(struct swap_info_struct *);
42static sector_t map_swap_entry(swp_entry_t, struct block_device**);
43
44static DEFINE_SPINLOCK(swap_lock);
45static unsigned int nr_swapfiles;
46long nr_swap_pages;
47long total_swap_pages;
48static int least_priority;
49
50static const char Bad_file[] = "Bad swap file entry ";
51static const char Unused_file[] = "Unused swap file entry ";
52static const char Bad_offset[] = "Bad swap offset entry ";
53static const char Unused_offset[] = "Unused swap offset entry ";
54
55static struct swap_list_t swap_list = {-1, -1};
56
57static struct swap_info_struct *swap_info[MAX_SWAPFILES];
58
59static DEFINE_MUTEX(swapon_mutex);
60
61static inline unsigned char swap_count(unsigned char ent)
62{
63	return ent & ~SWAP_HAS_CACHE;	/* may include SWAP_HAS_CONT flag */
64}
65
66/* returns 1 if swap entry is freed */
67static int
68__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
69{
70	swp_entry_t entry = swp_entry(si->type, offset);
71	struct page *page;
72	int ret = 0;
73
74	page = find_get_page(&swapper_space, entry.val);
75	if (!page)
76		return 0;
77	/*
78	 * This function is called from scan_swap_map(), which is reached from
79	 * vmscan.c during page reclaim, so the caller may already hold a page
80	 * lock.  We have to use trylock to avoid deadlock.  This is a special
81	 * case; in usual operations use try_to_free_swap() with an explicit
82	 * lock_page().
83	 */
84	if (trylock_page(page)) {
85		ret = try_to_free_swap(page);
86		unlock_page(page);
87	}
88	page_cache_release(page);
89	return ret;
90}
91
92/*
93 * We need this because the bdev->unplug_fn can sleep and we cannot
94 * hold swap_lock while calling the unplug_fn. And swap_lock
95 * cannot be turned into a mutex.
96 */
97static DECLARE_RWSEM(swap_unplug_sem);
98
99void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
100{
101	swp_entry_t entry;
102
103	down_read(&swap_unplug_sem);
104	entry.val = page_private(page);
105	if (PageSwapCache(page)) {
106		struct block_device *bdev = swap_info[swp_type(entry)]->bdev;
107		struct backing_dev_info *bdi;
108
109		/*
110		 * If the page is removed from swapcache from under us (with a
111		 * racy try_to_unuse/swapoff) we need an additional reference
112		 * count to avoid reading garbage from page_private(page) above.
113		 * If the WARN_ON triggers during a swapoff it may be the race
114		 * condition and it's harmless. However if it triggers without
115		 * swapoff it signals a problem.
116		 */
117		WARN_ON(page_count(page) <= 1);
118
119		bdi = bdev->bd_inode->i_mapping->backing_dev_info;
120		blk_run_backing_dev(bdi, page);
121	}
122	up_read(&swap_unplug_sem);
123}
124
125/*
126 * swapon tells the device that all the old swap contents can be discarded,
127 * to allow the swap device to optimize its wear-levelling.
128 */
129static int discard_swap(struct swap_info_struct *si)
130{
131	struct swap_extent *se;
132	sector_t start_block;
133	sector_t nr_blocks;
134	int err = 0;
135
136	/* Do not discard the swap header page! */
137	se = &si->first_swap_extent;
138	start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
139	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
140	if (nr_blocks) {
141		err = blkdev_issue_discard(si->bdev, start_block,
142				nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
143		if (err)
144			return err;
145		cond_resched();
146	}
147
148	list_for_each_entry(se, &si->first_swap_extent.list, list) {
149		start_block = se->start_block << (PAGE_SHIFT - 9);
150		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
151
152		err = blkdev_issue_discard(si->bdev, start_block,
153				nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
154		if (err)
155			break;
156
157		cond_resched();
158	}
159	return err;		/* That will often be -EOPNOTSUPP */
160}
161
162/*
163 * swap allocation tells the device that a cluster of swap can now be discarded,
164 * to allow the swap device to optimize its wear-levelling.
165 */
166static void discard_swap_cluster(struct swap_info_struct *si,
167				 pgoff_t start_page, pgoff_t nr_pages)
168{
169	struct swap_extent *se = si->curr_swap_extent;
170	int found_extent = 0;
171
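	/*
	 * Walk the extent list starting from the cached curr_swap_extent,
	 * translating the page range into block ranges one extent at a time
	 * and issuing a discard for each piece.  The first matching extent
	 * is cached back into curr_swap_extent for the next caller.
	 */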
172	while (nr_pages) {
173		struct list_head *lh;
174
175		if (se->start_page <= start_page &&
176		    start_page < se->start_page + se->nr_pages) {
177			pgoff_t offset = start_page - se->start_page;
178			sector_t start_block = se->start_block + offset;
179			sector_t nr_blocks = se->nr_pages - offset;
180
181			if (nr_blocks > nr_pages)
182				nr_blocks = nr_pages;
183			start_page += nr_blocks;
184			nr_pages -= nr_blocks;
185
186			if (!found_extent++)
187				si->curr_swap_extent = se;
188
189			start_block <<= PAGE_SHIFT - 9;
190			nr_blocks <<= PAGE_SHIFT - 9;
191			if (blkdev_issue_discard(si->bdev, start_block,
192				    nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER))
193				break;
194		}
195
196		lh = se->list.next;
197		se = list_entry(lh, struct swap_extent, list);
198	}
199}
200
201static int wait_for_discard(void *word)
202{
203	schedule();
204	return 0;
205}
206
207#define SWAPFILE_CLUSTER	256
208#define LATENCY_LIMIT		256
209
210static inline unsigned long scan_swap_map(struct swap_info_struct *si,
211					  unsigned char usage)
212{
213	unsigned long offset;
214	unsigned long scan_base;
215	unsigned long last_in_cluster = 0;
216	int latency_ration = LATENCY_LIMIT;
217	int found_free_cluster = 0;
218
219	/*
220	 * We try to cluster swap pages by allocating them sequentially
221	 * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
222	 * way, however, we resort to first-free allocation, starting
223	 * a new cluster.  This prevents us from scattering swap pages
224	 * all over the entire swap partition, so that we reduce
225	 * overall disk seek times between swap pages.  -- sct
226	 * But we do now try to find an empty cluster.  -Andrea
227	 * And we let swap pages go all over an SSD partition.  Hugh
228	 */
229
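	/*
	 * SWP_SCANNING is used as a count of concurrent scanners rather than
	 * a simple flag (hence += and -= instead of |= and &=): swapoff
	 * waits for all scanners to leave by spinning while
	 * p->flags >= SWP_SCANNING (SWP_SCANNING is expected to be the
	 * highest-valued flag bit; see its definition in swap.h).
	 */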
230	si->flags += SWP_SCANNING;
231	scan_base = offset = si->cluster_next;
232
233	if (unlikely(!si->cluster_nr--)) {
234		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
235			si->cluster_nr = SWAPFILE_CLUSTER - 1;
236			goto checks;
237		}
238		if (si->flags & SWP_DISCARDABLE) {
239			/*
240			 * Start range check on racing allocations, in case
241			 * they overlap the cluster we eventually decide on
242			 * (we scan without swap_lock to allow preemption).
243			 * It's hardly conceivable that cluster_nr could be
244			 * wrapped during our scan, but don't depend on it.
245			 */
246			if (si->lowest_alloc)
247				goto checks;
248			si->lowest_alloc = si->max;
249			si->highest_alloc = 0;
250		}
251		spin_unlock(&swap_lock);
252
253		/*
254		 * If seek is expensive, start searching for new cluster from
255		 * start of partition, to minimize the span of allocated swap.
256		 * But if seek is cheap, search from our current position, so
257		 * that swap is allocated from all over the partition: if the
258		 * Flash Translation Layer only remaps within limited zones,
259		 * we don't want to wear out the first zone too quickly.
260		 */
261		if (!(si->flags & SWP_SOLIDSTATE))
262			scan_base = offset = si->lowest_bit;
263		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
264
265		/* Locate the first empty (unaligned) cluster */
266		for (; last_in_cluster <= si->highest_bit; offset++) {
267			if (si->swap_map[offset])
268				last_in_cluster = offset + SWAPFILE_CLUSTER;
269			else if (offset == last_in_cluster) {
270				spin_lock(&swap_lock);
271				offset -= SWAPFILE_CLUSTER - 1;
272				si->cluster_next = offset;
273				si->cluster_nr = SWAPFILE_CLUSTER - 1;
274				found_free_cluster = 1;
275				goto checks;
276			}
277			if (unlikely(--latency_ration < 0)) {
278				cond_resched();
279				latency_ration = LATENCY_LIMIT;
280			}
281		}
282
283		offset = si->lowest_bit;
284		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
285
286		/* Locate the first empty (unaligned) cluster */
287		for (; last_in_cluster < scan_base; offset++) {
288			if (si->swap_map[offset])
289				last_in_cluster = offset + SWAPFILE_CLUSTER;
290			else if (offset == last_in_cluster) {
291				spin_lock(&swap_lock);
292				offset -= SWAPFILE_CLUSTER - 1;
293				si->cluster_next = offset;
294				si->cluster_nr = SWAPFILE_CLUSTER - 1;
295				found_free_cluster = 1;
296				goto checks;
297			}
298			if (unlikely(--latency_ration < 0)) {
299				cond_resched();
300				latency_ration = LATENCY_LIMIT;
301			}
302		}
303
304		offset = scan_base;
305		spin_lock(&swap_lock);
306		si->cluster_nr = SWAPFILE_CLUSTER - 1;
307		si->lowest_alloc = 0;
308	}
309
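	/*
	 * Re-checked under swap_lock each time a candidate offset is found:
	 * the device may have been swapped off (SWP_WRITEOK cleared) or the
	 * slot taken by a racing allocation, in which case we jump to the
	 * slow linear scan below.
	 */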
310checks:
311	if (!(si->flags & SWP_WRITEOK))
312		goto no_page;
313	if (!si->highest_bit)
314		goto no_page;
315	if (offset > si->highest_bit)
316		scan_base = offset = si->lowest_bit;
317
318	/* reuse swap entry of cache-only swap if not busy. */
319	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
320		int swap_was_freed;
321		spin_unlock(&swap_lock);
322		swap_was_freed = __try_to_reclaim_swap(si, offset);
323		spin_lock(&swap_lock);
324		/* entry was freed successfully, try to use this again */
325		if (swap_was_freed)
326			goto checks;
327		goto scan; /* check next one */
328	}
329
330	if (si->swap_map[offset])
331		goto scan;
332
333	if (offset == si->lowest_bit)
334		si->lowest_bit++;
335	if (offset == si->highest_bit)
336		si->highest_bit--;
337	si->inuse_pages++;
338	if (si->inuse_pages == si->pages) {
339		si->lowest_bit = si->max;
340		si->highest_bit = 0;
341	}
342	si->swap_map[offset] = usage;
343	si->cluster_next = offset + 1;
344	si->flags -= SWP_SCANNING;
345
346	if (si->lowest_alloc) {
347		/*
348		 * Only set when SWP_DISCARDABLE, and there's a scan
349		 * for a free cluster in progress or just completed.
350		 */
351		if (found_free_cluster) {
352			/*
353			 * To optimize wear-levelling, discard the
354			 * old data of the cluster, taking care not to
355			 * discard any of its pages that have already
356			 * been allocated by racing tasks (offset has
357			 * already stepped over any at the beginning).
358			 */
359			if (offset < si->highest_alloc &&
360			    si->lowest_alloc <= last_in_cluster)
361				last_in_cluster = si->lowest_alloc - 1;
362			si->flags |= SWP_DISCARDING;
363			spin_unlock(&swap_lock);
364
365			if (offset < last_in_cluster)
366				discard_swap_cluster(si, offset,
367					last_in_cluster - offset + 1);
368
369			spin_lock(&swap_lock);
370			si->lowest_alloc = 0;
371			si->flags &= ~SWP_DISCARDING;
372
373			smp_mb();	/* wake_up_bit advises this */
374			wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
375
376		} else if (si->flags & SWP_DISCARDING) {
377			/*
378			 * Delay using pages allocated by racing tasks
379			 * until the whole discard has been issued. We
380			 * could defer that delay until swap_writepage,
381			 * but it's easier to keep this self-contained.
382			 */
383			spin_unlock(&swap_lock);
384			wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
385				wait_for_discard, TASK_UNINTERRUPTIBLE);
386			spin_lock(&swap_lock);
387		} else {
388			/*
389			 * Note pages allocated by racing tasks while
390			 * scan for a free cluster is in progress, so
391			 * that its final discard can exclude them.
392			 */
393			if (offset < si->lowest_alloc)
394				si->lowest_alloc = offset;
395			if (offset > si->highest_alloc)
396				si->highest_alloc = offset;
397		}
398	}
399	return offset;
400
401scan:
402	spin_unlock(&swap_lock);
403	while (++offset <= si->highest_bit) {
404		if (!si->swap_map[offset]) {
405			spin_lock(&swap_lock);
406			goto checks;
407		}
408		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
409			spin_lock(&swap_lock);
410			goto checks;
411		}
412		if (unlikely(--latency_ration < 0)) {
413			cond_resched();
414			latency_ration = LATENCY_LIMIT;
415		}
416	}
417	offset = si->lowest_bit;
418	while (++offset < scan_base) {
419		if (!si->swap_map[offset]) {
420			spin_lock(&swap_lock);
421			goto checks;
422		}
423		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
424			spin_lock(&swap_lock);
425			goto checks;
426		}
427		if (unlikely(--latency_ration < 0)) {
428			cond_resched();
429			latency_ration = LATENCY_LIMIT;
430		}
431	}
432	spin_lock(&swap_lock);
433
434no_page:
435	si->flags -= SWP_SCANNING;
436	return 0;
437}
438
439swp_entry_t get_swap_page(void)
440{
441	struct swap_info_struct *si;
442	pgoff_t offset;
443	int type, next;
444	int wrapped = 0;
445
446	spin_lock(&swap_lock);
447	if (nr_swap_pages <= 0)
448		goto noswap;
449	nr_swap_pages--;
450
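	/*
	 * swap_list keeps the areas in descending priority order.
	 * swap_list.next is rotated so that areas of equal priority are used
	 * round robin; when the end of the current priority group (or of the
	 * list) is reached, the search wraps to the head, and after wrapping
	 * once every remaining area is considered, giving up after a second
	 * wrap.
	 */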
451	for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
452		si = swap_info[type];
453		next = si->next;
454		if (next < 0 ||
455		    (!wrapped && si->prio != swap_info[next]->prio)) {
456			next = swap_list.head;
457			wrapped++;
458		}
459
460		if (!si->highest_bit)
461			continue;
462		if (!(si->flags & SWP_WRITEOK))
463			continue;
464
465		swap_list.next = next;
466		/* This is called to allocate a swap entry for the swap cache */
467		offset = scan_swap_map(si, SWAP_HAS_CACHE);
468		if (offset) {
469			spin_unlock(&swap_lock);
470			return swp_entry(type, offset);
471		}
472		next = swap_list.next;
473	}
474
475	nr_swap_pages++;
476noswap:
477	spin_unlock(&swap_lock);
478	return (swp_entry_t) {0};
479}
480
481/* The only caller of this function is now the suspend routine */
482swp_entry_t get_swap_page_of_type(int type)
483{
484	struct swap_info_struct *si;
485	pgoff_t offset;
486
487	spin_lock(&swap_lock);
488	si = swap_info[type];
489	if (si && (si->flags & SWP_WRITEOK)) {
490		nr_swap_pages--;
491		/* This is called to allocate a swap entry, not for the swap cache */
492		offset = scan_swap_map(si, 1);
493		if (offset) {
494			spin_unlock(&swap_lock);
495			return swp_entry(type, offset);
496		}
497		nr_swap_pages++;
498	}
499	spin_unlock(&swap_lock);
500	return (swp_entry_t) {0};
501}
502
503static struct swap_info_struct *swap_info_get(swp_entry_t entry)
504{
505	struct swap_info_struct *p;
506	unsigned long offset, type;
507
508	if (!entry.val)
509		goto out;
510	type = swp_type(entry);
511	if (type >= nr_swapfiles)
512		goto bad_nofile;
513	p = swap_info[type];
514	if (!(p->flags & SWP_USED))
515		goto bad_device;
516	offset = swp_offset(entry);
517	if (offset >= p->max)
518		goto bad_offset;
519	if (!p->swap_map[offset])
520		goto bad_free;
521	spin_lock(&swap_lock);
522	return p;
523
524bad_free:
525	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
526	goto out;
527bad_offset:
528	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
529	goto out;
530bad_device:
531	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
532	goto out;
533bad_nofile:
534	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
535out:
536	return NULL;
537}
538
539static unsigned char swap_entry_free(struct swap_info_struct *p,
540				     swp_entry_t entry, unsigned char usage)
541{
542	unsigned long offset = swp_offset(entry);
543	unsigned char count;
544	unsigned char has_cache;
545
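	/*
	 * Each swap_map byte packs a reference count in its low bits
	 * (possibly extended via COUNT_CONTINUED into continuation pages)
	 * together with SWAP_HAS_CACHE, which records that the swap cache
	 * holds this entry.  Split the two parts before dropping the
	 * reference described by @usage.
	 */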
546	count = p->swap_map[offset];
547	has_cache = count & SWAP_HAS_CACHE;
548	count &= ~SWAP_HAS_CACHE;
549
550	if (usage == SWAP_HAS_CACHE) {
551		VM_BUG_ON(!has_cache);
552		has_cache = 0;
553	} else if (count == SWAP_MAP_SHMEM) {
554		/*
555		 * Or we could insist on shmem.c using a special
556		 * swap_shmem_free() and free_shmem_swap_and_cache()...
557		 */
558		count = 0;
559	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
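		/*
		 * count == COUNT_CONTINUED means the in-place count has
		 * wrapped and the remainder lives in continuation pages:
		 * judging by how its return value is used here,
		 * swap_count_continued() borrows one reference back from the
		 * continuation and reports whether any continuation remains.
		 */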
560		if (count == COUNT_CONTINUED) {
561			if (swap_count_continued(p, offset, count))
562				count = SWAP_MAP_MAX | COUNT_CONTINUED;
563			else
564				count = SWAP_MAP_MAX;
565		} else
566			count--;
567	}
568
569	if (!count)
570		mem_cgroup_uncharge_swap(entry);
571
572	usage = count | has_cache;
573	p->swap_map[offset] = usage;
574
575	/* free if no reference */
576	if (!usage) {
577		if (offset < p->lowest_bit)
578			p->lowest_bit = offset;
579		if (offset > p->highest_bit)
580			p->highest_bit = offset;
581		if (swap_list.next >= 0 &&
582		    p->prio > swap_info[swap_list.next]->prio)
583			swap_list.next = p->type;
584		nr_swap_pages++;
585		p->inuse_pages--;
586	}
587
588	return usage;
589}
590
591/*
592 * Caller has made sure that the swapdevice corresponding to entry
593 * is still around or has not been recycled.
594 */
595void swap_free(swp_entry_t entry)
596{
597	struct swap_info_struct *p;
598
599	p = swap_info_get(entry);
600	if (p) {
601		swap_entry_free(p, entry, 1);
602		spin_unlock(&swap_lock);
603	}
604}
605
606/*
607 * Called after dropping swapcache to decrease refcnt to swap entries.
608 */
609void swapcache_free(swp_entry_t entry, struct page *page)
610{
611	struct swap_info_struct *p;
612	unsigned char count;
613
614	p = swap_info_get(entry);
615	if (p) {
616		count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
617		if (page)
618			mem_cgroup_uncharge_swapcache(page, entry, count != 0);
619		spin_unlock(&swap_lock);
620	}
621}
622
623/*
624 * How many references to page are currently swapped out?
625 * This does not give an exact answer when swap count is continued,
626 * but does include the high COUNT_CONTINUED flag to allow for that.
627 */
628static inline int page_swapcount(struct page *page)
629{
630	int count = 0;
631	struct swap_info_struct *p;
632	swp_entry_t entry;
633
634	entry.val = page_private(page);
635	p = swap_info_get(entry);
636	if (p) {
637		count = swap_count(p->swap_map[swp_offset(entry)]);
638		spin_unlock(&swap_lock);
639	}
640	return count;
641}
642
643/*
644 * We can write to an anon page without COW if there are no other references
645 * to it.  And as a side-effect, free up its swap: because the old content
646 * on disk will never be read, and seeking back there to write new content
647 * later would only waste time away from clustering.
648 */
649int reuse_swap_page(struct page *page)
650{
651	int count;
652
653	VM_BUG_ON(!PageLocked(page));
654	if (unlikely(PageKsm(page)))
655		return 0;
656	count = page_mapcount(page);
657	if (count <= 1 && PageSwapCache(page)) {
658		count += page_swapcount(page);
659		if (count == 1 && !PageWriteback(page)) {
660			delete_from_swap_cache(page);
661			SetPageDirty(page);
662		}
663	}
664	return count <= 1;
665}
666
667/*
668 * If swap is getting full, or if there are no more mappings of this page,
669 * then try_to_free_swap is called to free its swap space.
670 */
671int try_to_free_swap(struct page *page)
672{
673	VM_BUG_ON(!PageLocked(page));
674
675	if (!PageSwapCache(page))
676		return 0;
677	if (PageWriteback(page))
678		return 0;
679	if (page_swapcount(page))
680		return 0;
681
682	delete_from_swap_cache(page);
683	SetPageDirty(page);
684	return 1;
685}
686
687/*
688 * Free the swap entry like above, but also try to
689 * free the page cache entry if it is the last user.
690 */
691int free_swap_and_cache(swp_entry_t entry)
692{
693	struct swap_info_struct *p;
694	struct page *page = NULL;
695
696	if (non_swap_entry(entry))
697		return 1;
698
699	p = swap_info_get(entry);
700	if (p) {
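		/*
		 * swap_entry_free() returns what is left in the map byte:
		 * exactly SWAP_HAS_CACHE means the last real reference just
		 * went away and only the swap cache still pins this entry,
		 * so try to reclaim the cache page as well (trylock only,
		 * since swap_lock is held).
		 */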
701		if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
702			page = find_get_page(&swapper_space, entry.val);
703			if (page && !trylock_page(page)) {
704				page_cache_release(page);
705				page = NULL;
706			}
707		}
708		spin_unlock(&swap_lock);
709	}
710	if (page) {
711		/*
712		 * Not mapped elsewhere, or swap space full? Free it!
713		 * Also recheck PageSwapCache now page is locked (above).
714		 */
715		if (PageSwapCache(page) && !PageWriteback(page) &&
716				(!page_mapped(page) || vm_swap_full())) {
717			delete_from_swap_cache(page);
718			SetPageDirty(page);
719		}
720		unlock_page(page);
721		page_cache_release(page);
722	}
723	return p != NULL;
724}
725
726#ifdef CONFIG_CGROUP_MEM_RES_CTLR
727/**
728 * mem_cgroup_count_swap_user - count the users of a swap entry
729 * @ent: the swap entry to be checked
730 * @pagep: pointer through which the entry's swap cache page, if any, is returned
731 *
732 * Returns the number of users of the swap entry.  The count is valid only
733 * for swap entries of anonymous pages.
734 * If the entry is found in the swap cache, the page is stored in *pagep
735 * with its refcount incremented.
736 */
737int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
738{
739	struct page *page;
740	struct swap_info_struct *p;
741	int count = 0;
742
743	page = find_get_page(&swapper_space, ent.val);
744	if (page)
745		count += page_mapcount(page);
746	p = swap_info_get(ent);
747	if (p) {
748		count += swap_count(p->swap_map[swp_offset(ent)]);
749		spin_unlock(&swap_lock);
750	}
751
752	*pagep = page;
753	return count;
754}
755#endif
756
757#ifdef CONFIG_HIBERNATION
758/*
759 * Find the swap type that corresponds to given device (if any).
760 *
761 * @offset - number of the PAGE_SIZE-sized block of the device, starting
762 * from 0, in which the swap header is expected to be located.
763 *
764 * This is needed for the suspend to disk (aka swsusp).
765 */
766int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
767{
768	struct block_device *bdev = NULL;
769	int type;
770
771	if (device)
772		bdev = bdget(device);
773
774	spin_lock(&swap_lock);
775	for (type = 0; type < nr_swapfiles; type++) {
776		struct swap_info_struct *sis = swap_info[type];
777
778		if (!(sis->flags & SWP_WRITEOK))
779			continue;
780
781		if (!bdev) {
782			if (bdev_p)
783				*bdev_p = bdgrab(sis->bdev);
784
785			spin_unlock(&swap_lock);
786			return type;
787		}
788		if (bdev == sis->bdev) {
789			struct swap_extent *se = &sis->first_swap_extent;
790
791			if (se->start_block == offset) {
792				if (bdev_p)
793					*bdev_p = bdgrab(sis->bdev);
794
795				spin_unlock(&swap_lock);
796				bdput(bdev);
797				return type;
798			}
799		}
800	}
801	spin_unlock(&swap_lock);
802	if (bdev)
803		bdput(bdev);
804
805	return -ENODEV;
806}
807
808/*
809 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
810 * corresponding to given index in swap_info (swap type).
811 */
812sector_t swapdev_block(int type, pgoff_t offset)
813{
814	struct block_device *bdev;
815
816	if ((unsigned int)type >= nr_swapfiles)
817		return 0;
818	if (!(swap_info[type]->flags & SWP_WRITEOK))
819		return 0;
820	return map_swap_entry(swp_entry(type, offset), &bdev);
821}
822
823/*
824 * Return either the total number of swap pages of given type, or the number
825 * of free pages of that type (depending on @free)
826 *
827 * This is needed for software suspend
828 */
829unsigned int count_swap_pages(int type, int free)
830{
831	unsigned int n = 0;
832
833	spin_lock(&swap_lock);
834	if ((unsigned int)type < nr_swapfiles) {
835		struct swap_info_struct *sis = swap_info[type];
836
837		if (sis->flags & SWP_WRITEOK) {
838			n = sis->pages;
839			if (free)
840				n -= sis->inuse_pages;
841		}
842	}
843	spin_unlock(&swap_lock);
844	return n;
845}
846#endif /* CONFIG_HIBERNATION */
847
848/*
849 * No need to decide whether this PTE shares the swap entry with others,
850 * just let do_wp_page work it out if a write is requested later - to
851 * force COW, vm_page_prot omits write permission from any private vma.
852 */
853static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
854		unsigned long addr, swp_entry_t entry, struct page *page)
855{
856	struct mem_cgroup *ptr = NULL;
857	spinlock_t *ptl;
858	pte_t *pte;
859	int ret = 1;
860
861	if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) {
862		ret = -ENOMEM;
863		goto out_nolock;
864	}
865
866	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
867	if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
868		if (ret > 0)
869			mem_cgroup_cancel_charge_swapin(ptr);
870		ret = 0;
871		goto out;
872	}
873
874	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
875	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
876	get_page(page);
877	set_pte_at(vma->vm_mm, addr, pte,
878		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
879	page_add_anon_rmap(page, vma, addr);
880	mem_cgroup_commit_charge_swapin(page, ptr);
881	swap_free(entry);
882	/*
883	 * Move the page to the active list so it is not
884	 * immediately swapped out again after swapon.
885	 */
886	activate_page(page);
887out:
888	pte_unmap_unlock(pte, ptl);
889out_nolock:
890	return ret;
891}
892
893static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
894				unsigned long addr, unsigned long end,
895				swp_entry_t entry, struct page *page)
896{
897	pte_t swp_pte = swp_entry_to_pte(entry);
898	pte_t *pte;
899	int ret = 0;
900
901	/*
902	 * We don't actually need pte lock while scanning for swp_pte: since
903	 * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
904	 * page table while we're scanning; though it could get zapped, and on
905	 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
906	 * of unmatched parts which look like swp_pte, so unuse_pte must
907	 * recheck under pte lock.  Scanning without pte lock lets it be
908	 * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
909	 */
910	pte = pte_offset_map(pmd, addr);
911	do {
912		/*
913		 * swapoff spends a _lot_ of time in this loop!
914		 * Test inline before going to call unuse_pte.
915		 */
916		if (unlikely(pte_same(*pte, swp_pte))) {
917			pte_unmap(pte);
918			ret = unuse_pte(vma, pmd, addr, entry, page);
919			if (ret)
920				goto out;
921			pte = pte_offset_map(pmd, addr);
922		}
923	} while (pte++, addr += PAGE_SIZE, addr != end);
924	pte_unmap(pte - 1);
925out:
926	return ret;
927}
928
929static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
930				unsigned long addr, unsigned long end,
931				swp_entry_t entry, struct page *page)
932{
933	pmd_t *pmd;
934	unsigned long next;
935	int ret;
936
937	pmd = pmd_offset(pud, addr);
938	do {
939		next = pmd_addr_end(addr, end);
940		if (pmd_none_or_clear_bad(pmd))
941			continue;
942		ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
943		if (ret)
944			return ret;
945	} while (pmd++, addr = next, addr != end);
946	return 0;
947}
948
949static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
950				unsigned long addr, unsigned long end,
951				swp_entry_t entry, struct page *page)
952{
953	pud_t *pud;
954	unsigned long next;
955	int ret;
956
957	pud = pud_offset(pgd, addr);
958	do {
959		next = pud_addr_end(addr, end);
960		if (pud_none_or_clear_bad(pud))
961			continue;
962		ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
963		if (ret)
964			return ret;
965	} while (pud++, addr = next, addr != end);
966	return 0;
967}
968
969static int unuse_vma(struct vm_area_struct *vma,
970				swp_entry_t entry, struct page *page)
971{
972	pgd_t *pgd;
973	unsigned long addr, end, next;
974	int ret;
975
976	if (page_anon_vma(page)) {
977		addr = page_address_in_vma(page, vma);
978		if (addr == -EFAULT)
979			return 0;
980		else
981			end = addr + PAGE_SIZE;
982	} else {
983		addr = vma->vm_start;
984		end = vma->vm_end;
985	}
986
987	pgd = pgd_offset(vma->vm_mm, addr);
988	do {
989		next = pgd_addr_end(addr, end);
990		if (pgd_none_or_clear_bad(pgd))
991			continue;
992		ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
993		if (ret)
994			return ret;
995	} while (pgd++, addr = next, addr != end);
996	return 0;
997}
998
999static int unuse_mm(struct mm_struct *mm,
1000				swp_entry_t entry, struct page *page)
1001{
1002	struct vm_area_struct *vma;
1003	int ret = 0;
1004
1005	if (!down_read_trylock(&mm->mmap_sem)) {
1006		/*
1007		 * Activate page so shrink_inactive_list is unlikely to unmap
1008		 * its ptes while lock is dropped, so swapoff can make progress.
1009		 */
1010		activate_page(page);
1011		unlock_page(page);
1012		down_read(&mm->mmap_sem);
1013		lock_page(page);
1014	}
1015	for (vma = mm->mmap; vma; vma = vma->vm_next) {
1016		if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
1017			break;
1018	}
1019	up_read(&mm->mmap_sem);
1020	return (ret < 0)? ret: 0;
1021}
1022
1023/*
1024 * Scan swap_map from current position to next entry still in use.
1025 * Recycle to start on reaching the end, returning 0 when empty.
1026 */
1027static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1028					unsigned int prev)
1029{
1030	unsigned int max = si->max;
1031	unsigned int i = prev;
1032	unsigned char count;
1033
1034	/*
1035	 * No need for swap_lock here: we're just looking
1036	 * for whether an entry is in use, not modifying it; false
1037	 * hits are okay, and sys_swapoff() has already prevented new
1038	 * allocations from this area (while holding swap_lock).
1039	 */
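	/*
	 * On reaching the top of swap_map, rescan from the start (skipping
	 * the header page at offset 0) up to and including the starting
	 * point; if that second pass also finds nothing in use, return 0.
	 */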
1040	for (;;) {
1041		if (++i >= max) {
1042			if (!prev) {
1043				i = 0;
1044				break;
1045			}
1046			/*
1047			 * No entries in use at top of swap_map,
1048			 * loop back to start and recheck there.
1049			 */
1050			max = prev + 1;
1051			prev = 0;
1052			i = 1;
1053		}
1054		count = si->swap_map[i];
1055		if (count && swap_count(count) != SWAP_MAP_BAD)
1056			break;
1057	}
1058	return i;
1059}
1060
1061/*
1062 * We completely avoid races by reading each swap page in advance,
1063 * and then search for the process using it.  All the necessary
1064 * page table adjustments can then be made atomically.
1065 */
1066static int try_to_unuse(unsigned int type)
1067{
1068	struct swap_info_struct *si = swap_info[type];
1069	struct mm_struct *start_mm;
1070	unsigned char *swap_map;
1071	unsigned char swcount;
1072	struct page *page;
1073	swp_entry_t entry;
1074	unsigned int i = 0;
1075	int retval = 0;
1076
1077	/*
1078	 * When searching mms for an entry, a good strategy is to
1079	 * start at the first mm we freed the previous entry from
1080	 * (though actually we don't notice whether we or coincidence
1081	 * freed the entry).  Initialize this start_mm with a hold.
1082	 *
1083	 * A simpler strategy would be to start at the last mm we
1084	 * freed the previous entry from; but that would take less
1085	 * advantage of mmlist ordering, which clusters forked mms
1086	 * together, child after parent.  If we race with dup_mmap(), we
1087	 * prefer to resolve parent before child, lest we miss entries
1088	 * duplicated after we scanned child: using last mm would invert
1089	 * that.
1090	 */
1091	start_mm = &init_mm;
1092	atomic_inc(&init_mm.mm_users);
1093
1094	/*
1095	 * Keep on scanning until all entries have gone.  Usually,
1096	 * one pass through swap_map is enough, but not necessarily:
1097	 * there are races when an instance of an entry might be missed.
1098	 */
1099	while ((i = find_next_to_unuse(si, i)) != 0) {
1100		if (signal_pending(current)) {
1101			retval = -EINTR;
1102			break;
1103		}
1104
1105		/*
1106		 * Get a page for the entry, using the existing swap
1107		 * cache page if there is one.  Otherwise, get a clean
1108		 * page and read the swap into it.
1109		 */
1110		swap_map = &si->swap_map[i];
1111		entry = swp_entry(type, i);
1112		page = read_swap_cache_async(entry,
1113					GFP_HIGHUSER_MOVABLE, NULL, 0);
1114		if (!page) {
1115			/*
1116			 * Either swap_duplicate() failed because entry
1117			 * has been freed independently, and will not be
1118			 * reused since sys_swapoff() already disabled
1119			 * allocation from here, or alloc_page() failed.
1120			 */
1121			if (!*swap_map)
1122				continue;
1123			retval = -ENOMEM;
1124			break;
1125		}
1126
1127		/*
1128		 * Don't hold on to start_mm if it looks like exiting.
1129		 */
1130		if (atomic_read(&start_mm->mm_users) == 1) {
1131			mmput(start_mm);
1132			start_mm = &init_mm;
1133			atomic_inc(&init_mm.mm_users);
1134		}
1135
1136		/*
1137		 * Wait for and lock page.  When do_swap_page races with
1138		 * try_to_unuse, do_swap_page can handle the fault much
1139		 * faster than try_to_unuse can locate the entry.  This
1140		 * apparently redundant "wait_on_page_locked" lets try_to_unuse
1141		 * defer to do_swap_page in such a case - in some tests,
1142		 * do_swap_page and try_to_unuse repeatedly compete.
1143		 */
1144		wait_on_page_locked(page);
1145		wait_on_page_writeback(page);
1146		lock_page(page);
1147		wait_on_page_writeback(page);
1148
1149		/*
1150		 * Remove all references to entry.
1151		 */
1152		swcount = *swap_map;
1153		if (swap_count(swcount) == SWAP_MAP_SHMEM) {
1154			retval = shmem_unuse(entry, page);
1155			/* page has already been unlocked and released */
1156			if (retval < 0)
1157				break;
1158			continue;
1159		}
1160		if (swap_count(swcount) && start_mm != &init_mm)
1161			retval = unuse_mm(start_mm, entry, page);
1162
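		/*
		 * References remain after trying start_mm: walk every mm on
		 * init_mm.mmlist, pinning each with mm_users
		 * (atomic_inc_not_zero) so it cannot be freed while
		 * mmlist_lock is dropped, and remember the mm after whose
		 * processing the count actually dropped as the start_mm for
		 * the next entry (see the strategy comment above).
		 */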
1163		if (swap_count(*swap_map)) {
1164			int set_start_mm = (*swap_map >= swcount);
1165			struct list_head *p = &start_mm->mmlist;
1166			struct mm_struct *new_start_mm = start_mm;
1167			struct mm_struct *prev_mm = start_mm;
1168			struct mm_struct *mm;
1169
1170			atomic_inc(&new_start_mm->mm_users);
1171			atomic_inc(&prev_mm->mm_users);
1172			spin_lock(&mmlist_lock);
1173			while (swap_count(*swap_map) && !retval &&
1174					(p = p->next) != &start_mm->mmlist) {
1175				mm = list_entry(p, struct mm_struct, mmlist);
1176				if (!atomic_inc_not_zero(&mm->mm_users))
1177					continue;
1178				spin_unlock(&mmlist_lock);
1179				mmput(prev_mm);
1180				prev_mm = mm;
1181
1182				cond_resched();
1183
1184				swcount = *swap_map;
1185				if (!swap_count(swcount)) /* any usage ? */
1186					;
1187				else if (mm == &init_mm)
1188					set_start_mm = 1;
1189				else
1190					retval = unuse_mm(mm, entry, page);
1191
1192				if (set_start_mm && *swap_map < swcount) {
1193					mmput(new_start_mm);
1194					atomic_inc(&mm->mm_users);
1195					new_start_mm = mm;
1196					set_start_mm = 0;
1197				}
1198				spin_lock(&mmlist_lock);
1199			}
1200			spin_unlock(&mmlist_lock);
1201			mmput(prev_mm);
1202			mmput(start_mm);
1203			start_mm = new_start_mm;
1204		}
1205		if (retval) {
1206			unlock_page(page);
1207			page_cache_release(page);
1208			break;
1209		}
1210
1211		/*
1212		 * If a reference remains (rare), we would like to leave
1213		 * the page in the swap cache; but try_to_unmap could
1214		 * then re-duplicate the entry once we drop page lock,
1215		 * so we might loop indefinitely; also, that page could
1216		 * not be swapped out to other storage meanwhile.  So:
1217		 * delete from cache even if there's another reference,
1218		 * after ensuring that the data has been saved to disk -
1219		 * since if the reference remains (rarer), it will be
1220		 * read from disk into another page.  Splitting into two
1221		 * pages would be incorrect if swap supported "shared
1222		 * private" pages, but they are handled by tmpfs files.
1223		 *
1224		 * Given how unuse_vma() targets one particular offset
1225		 * in an anon_vma, once the anon_vma has been determined,
1226		 * this splitting happens to be just what is needed to
1227		 * handle where KSM pages have been swapped out: re-reading
1228		 * is unnecessarily slow, but we can fix that later on.
1229		 */
1230		if (swap_count(*swap_map) &&
1231		     PageDirty(page) && PageSwapCache(page)) {
1232			struct writeback_control wbc = {
1233				.sync_mode = WB_SYNC_NONE,
1234			};
1235
1236			swap_writepage(page, &wbc);
1237			lock_page(page);
1238			wait_on_page_writeback(page);
1239		}
1240
1241		/*
1242		 * It is conceivable that a racing task removed this page from
1243		 * swap cache just before we acquired the page lock at the top,
1244		 * or while we dropped it in unuse_mm().  The page might even
1245		 * be back in swap cache on another swap area: that we must not
1246		 * delete, since it may not have been written out to swap yet.
1247		 */
1248		if (PageSwapCache(page) &&
1249		    likely(page_private(page) == entry.val))
1250			delete_from_swap_cache(page);
1251
1252		/*
1253		 * So that we could skip searching mms once the swap count
1254		 * went to 1, we did not mark any present ptes as dirty: we
1255		 * must mark the page dirty so shrink_page_list preserves it.
1256		 */
1257		SetPageDirty(page);
1258		unlock_page(page);
1259		page_cache_release(page);
1260
1261		/*
1262		 * Make sure that we aren't completely killing
1263		 * interactive performance.
1264		 */
1265		cond_resched();
1266	}
1267
1268	mmput(start_mm);
1269	return retval;
1270}
1271
1272/*
1273 * After a successful try_to_unuse, if no swap is now in use, we know
1274 * we can empty the mmlist.  swap_lock must be held on entry and exit.
1275 * Note that mmlist_lock nests inside swap_lock, and an mm must be
1276 * added to the mmlist just after page_duplicate - before would be racy.
1277 */
1278static void drain_mmlist(void)
1279{
1280	struct list_head *p, *next;
1281	unsigned int type;
1282
1283	for (type = 0; type < nr_swapfiles; type++)
1284		if (swap_info[type]->inuse_pages)
1285			return;
1286	spin_lock(&mmlist_lock);
1287	list_for_each_safe(p, next, &init_mm.mmlist)
1288		list_del_init(p);
1289	spin_unlock(&mmlist_lock);
1290}
1291
1292/*
1293 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
1294 * corresponds to page offset for the specified swap entry.
1295 * Note that the return type of this function is sector_t, but it returns the
1296 * page offset into the bdev, not a sector offset.
1297 */
1298static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
1299{
1300	struct swap_info_struct *sis;
1301	struct swap_extent *start_se;
1302	struct swap_extent *se;
1303	pgoff_t offset;
1304
1305	sis = swap_info[swp_type(entry)];
1306	*bdev = sis->bdev;
1307
1308	offset = swp_offset(entry);
1309	start_se = sis->curr_swap_extent;
1310	se = start_se;
1311
1312	for ( ; ; ) {
1313		struct list_head *lh;
1314
1315		if (se->start_page <= offset &&
1316				offset < (se->start_page + se->nr_pages)) {
1317			return se->start_block + (offset - se->start_page);
1318		}
1319		lh = se->list.next;
1320		se = list_entry(lh, struct swap_extent, list);
1321		sis->curr_swap_extent = se;
1322		BUG_ON(se == start_se);		/* It *must* be present */
1323	}
1324}
1325
1326/*
1327 * Returns the page offset into bdev for the specified page's swap entry.
1328 */
1329sector_t map_swap_page(struct page *page, struct block_device **bdev)
1330{
1331	swp_entry_t entry;
1332	entry.val = page_private(page);
1333	return map_swap_entry(entry, bdev);
1334}
1335
1336/*
1337 * Free all of a swapdev's extent information
1338 */
1339static void destroy_swap_extents(struct swap_info_struct *sis)
1340{
1341	while (!list_empty(&sis->first_swap_extent.list)) {
1342		struct swap_extent *se;
1343
1344		se = list_entry(sis->first_swap_extent.list.next,
1345				struct swap_extent, list);
1346		list_del(&se->list);
1347		kfree(se);
1348	}
1349}
1350
1351/*
1352 * Add a block range (and the corresponding page range) into this swapdev's
1353 * extent list.  The extent list is kept sorted in page order.
1354 *
1355 * This function rather assumes that it is called in ascending page order.
1356 */
1357static int
1358add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1359		unsigned long nr_pages, sector_t start_block)
1360{
1361	struct swap_extent *se;
1362	struct swap_extent *new_se;
1363	struct list_head *lh;
1364
1365	if (start_page == 0) {
1366		se = &sis->first_swap_extent;
1367		sis->curr_swap_extent = se;
1368		se->start_page = 0;
1369		se->nr_pages = nr_pages;
1370		se->start_block = start_block;
1371		return 1;
1372	} else {
1373		lh = sis->first_swap_extent.list.prev;	/* Highest extent */
1374		se = list_entry(lh, struct swap_extent, list);
1375		BUG_ON(se->start_page + se->nr_pages != start_page);
1376		if (se->start_block + se->nr_pages == start_block) {
1377			/* Merge it */
1378			se->nr_pages += nr_pages;
1379			return 0;
1380		}
1381	}
1382
1383	/*
1384	 * No merge.  Insert a new extent, preserving ordering.
1385	 */
1386	new_se = kmalloc(sizeof(*se), GFP_KERNEL);
1387	if (new_se == NULL)
1388		return -ENOMEM;
1389	new_se->start_page = start_page;
1390	new_se->nr_pages = nr_pages;
1391	new_se->start_block = start_block;
1392
1393	list_add_tail(&new_se->list, &sis->first_swap_extent.list);
1394	return 1;
1395}
1396
1397/*
1398 * A `swap extent' is a simple thing which maps a contiguous range of pages
1399 * onto a contiguous range of disk blocks.  An ordered list of swap extents
1400 * is built at swapon time and is then used at swap_writepage/swap_readpage
1401 * time for locating where on disk a page belongs.
1402 *
1403 * If the swapfile is an S_ISBLK block device, a single extent is installed.
1404 * This is done so that the main operating code can treat S_ISBLK and S_ISREG
1405 * swap files identically.
1406 *
1407 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
1408 * extent list operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
1409 * swapfiles are handled *identically* after swapon time.
1410 *
1411 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
1412 * and will parse them into an ordered extent list, in PAGE_SIZE chunks.  If
1413 * some stray blocks are found which do not fall within the PAGE_SIZE alignment
1414 * requirements, they are simply tossed out - we will never use those blocks
1415 * for swapping.
1416 *
1417 * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon.  This
1418 * prevents root from shooting her foot off by ftruncating an in-use swapfile,
1419 * which will scribble on the fs.
1420 *
1421 * The amount of disk space which a single swap extent represents varies.
1422 * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
1423 * extents in the list.  To avoid much list walking, we cache the previous
1424 * search location in `curr_swap_extent', and start new searches from there.
1425 * This is extremely effective.  The average number of iterations in
1426 * map_swap_page() has been measured at about 0.3 per page.  - akpm.
1427 */
1428static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1429{
1430	struct inode *inode;
1431	unsigned blocks_per_page;
1432	unsigned long page_no;
1433	unsigned blkbits;
1434	sector_t probe_block;
1435	sector_t last_block;
1436	sector_t lowest_block = -1;
1437	sector_t highest_block = 0;
1438	int nr_extents = 0;
1439	int ret;
1440
1441	inode = sis->swap_file->f_mapping->host;
1442	if (S_ISBLK(inode->i_mode)) {
1443		ret = add_swap_extent(sis, 0, sis->max, 0);
1444		*span = sis->pages;
1445		goto out;
1446	}
1447
1448	blkbits = inode->i_blkbits;
1449	blocks_per_page = PAGE_SIZE >> blkbits;
1450
1451	/*
1452	 * Map all the blocks into the extent list.  This code doesn't try
1453	 * to be very smart.
1454	 */
1455	probe_block = 0;
1456	page_no = 0;
1457	last_block = i_size_read(inode) >> blkbits;
1458	while ((probe_block + blocks_per_page) <= last_block &&
1459			page_no < sis->max) {
1460		unsigned block_in_page;
1461		sector_t first_block;
1462
1463		first_block = bmap(inode, probe_block);
1464		if (first_block == 0)
1465			goto bad_bmap;
1466
1467		/*
1468		 * It must be PAGE_SIZE aligned on-disk
1469		 */
1470		if (first_block & (blocks_per_page - 1)) {
1471			probe_block++;
1472			goto reprobe;
1473		}
1474
1475		for (block_in_page = 1; block_in_page < blocks_per_page;
1476					block_in_page++) {
1477			sector_t block;
1478
1479			block = bmap(inode, probe_block + block_in_page);
1480			if (block == 0)
1481				goto bad_bmap;
1482			if (block != first_block + block_in_page) {
1483				/* Discontiguity */
1484				probe_block++;
1485				goto reprobe;
1486			}
1487		}
1488
1489		first_block >>= (PAGE_SHIFT - blkbits);
1490		if (page_no) {	/* exclude the header page */
1491			if (first_block < lowest_block)
1492				lowest_block = first_block;
1493			if (first_block > highest_block)
1494				highest_block = first_block;
1495		}
1496
1497		/*
1498		 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
1499		 */
1500		ret = add_swap_extent(sis, page_no, 1, first_block);
1501		if (ret < 0)
1502			goto out;
1503		nr_extents += ret;
1504		page_no++;
1505		probe_block += blocks_per_page;
1506reprobe:
1507		continue;
1508	}
1509	ret = nr_extents;
1510	*span = 1 + highest_block - lowest_block;
1511	if (page_no == 0)
1512		page_no = 1;	/* force Empty message */
1513	sis->max = page_no;
1514	sis->pages = page_no - 1;
1515	sis->highest_bit = page_no - 1;
1516out:
1517	return ret;
1518bad_bmap:
1519	printk(KERN_ERR "swapon: swapfile has holes\n");
1520	ret = -EINVAL;
1521	goto out;
1522}
1523
1524SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1525{
1526	struct swap_info_struct *p = NULL;
1527	unsigned char *swap_map;
1528	struct file *swap_file, *victim;
1529	struct address_space *mapping;
1530	struct inode *inode;
1531	char *pathname;
1532	int i, type, prev;
1533	int err;
1534
1535	if (!capable(CAP_SYS_ADMIN))
1536		return -EPERM;
1537
1538	pathname = getname(specialfile);
1539	err = PTR_ERR(pathname);
1540	if (IS_ERR(pathname))
1541		goto out;
1542
1543	victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0);
1544	putname(pathname);
1545	err = PTR_ERR(victim);
1546	if (IS_ERR(victim))
1547		goto out;
1548
1549	mapping = victim->f_mapping;
1550	prev = -1;
1551	spin_lock(&swap_lock);
1552	for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
1553		p = swap_info[type];
1554		if (p->flags & SWP_WRITEOK) {
1555			if (p->swap_file->f_mapping == mapping)
1556				break;
1557		}
1558		prev = type;
1559	}
1560	if (type < 0) {
1561		err = -EINVAL;
1562		spin_unlock(&swap_lock);
1563		goto out_dput;
1564	}
1565	if (!security_vm_enough_memory(p->pages))
1566		vm_unacct_memory(p->pages);
1567	else {
1568		err = -ENOMEM;
1569		spin_unlock(&swap_lock);
1570		goto out_dput;
1571	}
1572	if (prev < 0)
1573		swap_list.head = p->next;
1574	else
1575		swap_info[prev]->next = p->next;
1576	if (type == swap_list.next) {
1577		/* just pick something that's safe... */
1578		swap_list.next = swap_list.head;
1579	}
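	/*
	 * Negative priorities are the automatically assigned ones.  When
	 * removing such an area, renumber the (lower-priority) areas behind
	 * it so the sequence stays dense, and give the freed slot back by
	 * bumping least_priority.
	 */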
1580	if (p->prio < 0) {
1581		for (i = p->next; i >= 0; i = swap_info[i]->next)
1582			swap_info[i]->prio = p->prio--;
1583		least_priority++;
1584	}
1585	nr_swap_pages -= p->pages;
1586	total_swap_pages -= p->pages;
1587	p->flags &= ~SWP_WRITEOK;
1588	spin_unlock(&swap_lock);
1589
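	/*
	 * try_to_unuse() can pull a large amount of data back into memory.
	 * PF_OOM_ORIGIN marks this task as the preferred victim should that
	 * trigger the OOM killer (this relies on how the OOM killer treats
	 * PF_OOM_ORIGIN, which is defined outside this file).
	 */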
1590	current->flags |= PF_OOM_ORIGIN;
1591	err = try_to_unuse(type);
1592	current->flags &= ~PF_OOM_ORIGIN;
1593
1594	if (err) {
1595		/* re-insert the swap space into swap_list */
1596		spin_lock(&swap_lock);
1597		if (p->prio < 0)
1598			p->prio = --least_priority;
1599		prev = -1;
1600		for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
1601			if (p->prio >= swap_info[i]->prio)
1602				break;
1603			prev = i;
1604		}
1605		p->next = i;
1606		if (prev < 0)
1607			swap_list.head = swap_list.next = type;
1608		else
1609			swap_info[prev]->next = type;
1610		nr_swap_pages += p->pages;
1611		total_swap_pages += p->pages;
1612		p->flags |= SWP_WRITEOK;
1613		spin_unlock(&swap_lock);
1614		goto out_dput;
1615	}
1616
1617	/* wait for any unplug function to finish */
1618	down_write(&swap_unplug_sem);
1619	up_write(&swap_unplug_sem);
1620
1621	destroy_swap_extents(p);
1622	if (p->flags & SWP_CONTINUED)
1623		free_swap_count_continuations(p);
1624
1625	mutex_lock(&swapon_mutex);
1626	spin_lock(&swap_lock);
1627	drain_mmlist();
1628
1629	/* wait for anyone still in scan_swap_map */
1630	p->highest_bit = 0;		/* cuts scans short */
1631	while (p->flags >= SWP_SCANNING) {
1632		spin_unlock(&swap_lock);
1633		schedule_timeout_uninterruptible(1);
1634		spin_lock(&swap_lock);
1635	}
1636
1637	swap_file = p->swap_file;
1638	p->swap_file = NULL;
1639	p->max = 0;
1640	swap_map = p->swap_map;
1641	p->swap_map = NULL;
1642	p->flags = 0;
1643	spin_unlock(&swap_lock);
1644	mutex_unlock(&swapon_mutex);
1645	vfree(swap_map);
1646	/* Destroy swap account information */
1647	swap_cgroup_swapoff(type);
1648
1649	inode = mapping->host;
1650	if (S_ISBLK(inode->i_mode)) {
1651		struct block_device *bdev = I_BDEV(inode);
1652		set_blocksize(bdev, p->old_block_size);
1653		bd_release(bdev);
1654	} else {
1655		mutex_lock(&inode->i_mutex);
1656		inode->i_flags &= ~S_SWAPFILE;
1657		mutex_unlock(&inode->i_mutex);
1658	}
1659	filp_close(swap_file, NULL);
1660	err = 0;
1661
1662out_dput:
1663	filp_close(victim, NULL);
1664out:
1665	return err;
1666}
1667
1668#ifdef CONFIG_PROC_FS
1669/* iterator */
1670static void *swap_start(struct seq_file *swap, loff_t *pos)
1671{
1672	struct swap_info_struct *si;
1673	int type;
1674	loff_t l = *pos;
1675
1676	mutex_lock(&swapon_mutex);
1677
1678	if (!l)
1679		return SEQ_START_TOKEN;
1680
1681	for (type = 0; type < nr_swapfiles; type++) {
1682		smp_rmb();	/* read nr_swapfiles before swap_info[type] */
1683		si = swap_info[type];
1684		if (!(si->flags & SWP_USED) || !si->swap_map)
1685			continue;
1686		if (!--l)
1687			return si;
1688	}
1689
1690	return NULL;
1691}
1692
1693static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
1694{
1695	struct swap_info_struct *si = v;
1696	int type;
1697
1698	if (v == SEQ_START_TOKEN)
1699		type = 0;
1700	else
1701		type = si->type + 1;
1702
1703	for (; type < nr_swapfiles; type++) {
1704		smp_rmb();	/* read nr_swapfiles before swap_info[type] */
1705		si = swap_info[type];
1706		if (!(si->flags & SWP_USED) || !si->swap_map)
1707			continue;
1708		++*pos;
1709		return si;
1710	}
1711
1712	return NULL;
1713}
1714
1715static void swap_stop(struct seq_file *swap, void *v)
1716{
1717	mutex_unlock(&swapon_mutex);
1718}
1719
1720static int swap_show(struct seq_file *swap, void *v)
1721{
1722	struct swap_info_struct *si = v;
1723	struct file *file;
1724	int len;
1725
1726	if (si == SEQ_START_TOKEN) {
1727		seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
1728		return 0;
1729	}
1730
1731	file = si->swap_file;
1732	len = seq_path(swap, &file->f_path, " \t\n\\");
1733	seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1734			len < 40 ? 40 - len : 1, " ",
1735			S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
1736				"partition" : "file\t",
1737			si->pages << (PAGE_SHIFT - 10),
1738			si->inuse_pages << (PAGE_SHIFT - 10),
1739			si->prio);
1740	return 0;
1741}
1742
1743static const struct seq_operations swaps_op = {
1744	.start =	swap_start,
1745	.next =		swap_next,
1746	.stop =		swap_stop,
1747	.show =		swap_show
1748};
1749
1750static int swaps_open(struct inode *inode, struct file *file)
1751{
1752	return seq_open(file, &swaps_op);
1753}
1754
1755static const struct file_operations proc_swaps_operations = {
1756	.open		= swaps_open,
1757	.read		= seq_read,
1758	.llseek		= seq_lseek,
1759	.release	= seq_release,
1760};
1761
1762static int __init procswaps_init(void)
1763{
1764	proc_create("swaps", 0, NULL, &proc_swaps_operations);
1765	return 0;
1766}
1767__initcall(procswaps_init);
1768#endif /* CONFIG_PROC_FS */
1769
1770#ifdef MAX_SWAPFILES_CHECK
1771static int __init max_swapfiles_check(void)
1772{
1773	MAX_SWAPFILES_CHECK();
1774	return 0;
1775}
1776late_initcall(max_swapfiles_check);
1777#endif
1778
1779/*
1780 * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
1781 *
1782 * The swapon system call
1783 */
1784SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1785{
1786	struct swap_info_struct *p;
1787	char *name = NULL;
1788	struct block_device *bdev = NULL;
1789	struct file *swap_file = NULL;
1790	struct address_space *mapping;
1791	unsigned int type;
1792	int i, prev;
1793	int error;
1794	union swap_header *swap_header;
1795	unsigned int nr_good_pages;
1796	int nr_extents = 0;
1797	sector_t span;
1798	unsigned long maxpages;
1799	unsigned long swapfilepages;
1800	unsigned char *swap_map = NULL;
1801	struct page *page = NULL;
1802	struct inode *inode = NULL;
1803	int did_down = 0;
1804
1805	if (!capable(CAP_SYS_ADMIN))
1806		return -EPERM;
1807
1808	p = kzalloc(sizeof(*p), GFP_KERNEL);
1809	if (!p)
1810		return -ENOMEM;
1811
1812	spin_lock(&swap_lock);
1813	for (type = 0; type < nr_swapfiles; type++) {
1814		if (!(swap_info[type]->flags & SWP_USED))
1815			break;
1816	}
1817	error = -EPERM;
1818	if (type >= MAX_SWAPFILES) {
1819		spin_unlock(&swap_lock);
1820		kfree(p);
1821		goto out;
1822	}
1823	if (type >= nr_swapfiles) {
1824		p->type = type;
1825		swap_info[type] = p;
1826		/*
1827		 * Write swap_info[type] before nr_swapfiles, in case a
1828		 * racing procfs swap_start() or swap_next() is reading them.
1829		 * (We never shrink nr_swapfiles, we never free this entry.)
1830		 */
1831		smp_wmb();
1832		nr_swapfiles++;
1833	} else {
1834		kfree(p);
1835		p = swap_info[type];
1836		/*
1837		 * Do not memset this entry: a racing procfs swap_next()
1838		 * would be relying on p->type to remain valid.
1839		 */
1840	}
1841	INIT_LIST_HEAD(&p->first_swap_extent.list);
1842	p->flags = SWP_USED;
1843	p->next = -1;
1844	spin_unlock(&swap_lock);
1845
1846	name = getname(specialfile);
1847	error = PTR_ERR(name);
1848	if (IS_ERR(name)) {
1849		name = NULL;
1850		goto bad_swap_2;
1851	}
1852	swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
1853	error = PTR_ERR(swap_file);
1854	if (IS_ERR(swap_file)) {
1855		swap_file = NULL;
1856		goto bad_swap_2;
1857	}
1858
1859	p->swap_file = swap_file;
1860	mapping = swap_file->f_mapping;
1861	inode = mapping->host;
1862
1863	error = -EBUSY;
1864	for (i = 0; i < nr_swapfiles; i++) {
1865		struct swap_info_struct *q = swap_info[i];
1866
1867		if (i == type || !q->swap_file)
1868			continue;
1869		if (mapping == q->swap_file->f_mapping)
1870			goto bad_swap;
1871	}
1872
1873	error = -EINVAL;
1874	if (S_ISBLK(inode->i_mode)) {
1875		bdev = I_BDEV(inode);
1876		error = bd_claim(bdev, sys_swapon);
1877		if (error < 0) {
1878			bdev = NULL;
1879			error = -EINVAL;
1880			goto bad_swap;
1881		}
1882		p->old_block_size = block_size(bdev);
1883		error = set_blocksize(bdev, PAGE_SIZE);
1884		if (error < 0)
1885			goto bad_swap;
1886		p->bdev = bdev;
1887		p->flags |= SWP_BLKDEV;
1888	} else if (S_ISREG(inode->i_mode)) {
1889		p->bdev = inode->i_sb->s_bdev;
1890		mutex_lock(&inode->i_mutex);
1891		did_down = 1;
1892		if (IS_SWAPFILE(inode)) {
1893			error = -EBUSY;
1894			goto bad_swap;
1895		}
1896	} else {
1897		goto bad_swap;
1898	}
1899
1900	swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
1901
1902	/*
1903	 * Read the swap header.
1904	 */
1905	if (!mapping->a_ops->readpage) {
1906		error = -EINVAL;
1907		goto bad_swap;
1908	}
1909	page = read_mapping_page(mapping, 0, swap_file);
1910	if (IS_ERR(page)) {
1911		error = PTR_ERR(page);
1912		goto bad_swap;
1913	}
1914	swap_header = kmap(page);
1915
1916	if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
1917		printk(KERN_ERR "Unable to find swap-space signature\n");
1918		error = -EINVAL;
1919		goto bad_swap;
1920	}
1921
1922	/* swap partition endianness hack... */
1923	if (swab32(swap_header->info.version) == 1) {
1924		swab32s(&swap_header->info.version);
1925		swab32s(&swap_header->info.last_page);
1926		swab32s(&swap_header->info.nr_badpages);
1927		for (i = 0; i < swap_header->info.nr_badpages; i++)
1928			swab32s(&swap_header->info.badpages[i]);
1929	}
1930	/* Check the swap header's sub-version */
1931	if (swap_header->info.version != 1) {
1932		printk(KERN_WARNING
1933		       "Unable to handle swap header version %d\n",
1934		       swap_header->info.version);
1935		error = -EINVAL;
1936		goto bad_swap;
1937	}
1938
1939	p->lowest_bit  = 1;
1940	p->cluster_next = 1;
1941	p->cluster_nr = 0;
1942
1943	/*
1944	 * Find out how many pages are allowed for a single swap
1945	 * device. There are two limiting factors: 1) the number of
1946	 * bits for the swap offset in the swp_entry_t type and
1947	 * 2) the number of bits in a swap pte as defined by
1948	 * the different architectures. In order to find the
1949	 * largest possible bit mask a swap entry with swap type 0
1950	 * and swap offset ~0UL is created, encoded to a swap pte,
1951	 * decoded to a swp_entry_t again and finally the swap
1952	 * offset is extracted. This will mask all the bits from
1953	 * the initial ~0UL mask that can't be encoded in either
1954	 * the swp_entry_t or the architecture definition of a
1955	 * swap pte.
1956	 */
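	/*
	 * Worked example (assumed widths, not from any particular arch):
	 * if the swap pte left 27 bits for the offset, the round trip
	 * below would truncate ~0UL to 2^27 - 1, giving maxpages = 2^27,
	 * i.e. a 512GB ceiling per swap area with 4K pages.
	 */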
1957	maxpages = swp_offset(pte_to_swp_entry(
1958			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
1959	if (maxpages > swap_header->info.last_page) {
1960		maxpages = swap_header->info.last_page + 1;
1961		/* p->max is an unsigned int: don't overflow it */
1962		if ((unsigned int)maxpages == 0)
1963			maxpages = UINT_MAX;
1964	}
1965	p->highest_bit = maxpages - 1;
1966
1967	error = -EINVAL;
1968	if (!maxpages)
1969		goto bad_swap;
1970	if (swapfilepages && maxpages > swapfilepages) {
1971		printk(KERN_WARNING
1972		       "Swap area shorter than signature indicates\n");
1973		goto bad_swap;
1974	}
1975	if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
1976		goto bad_swap;
1977	if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
1978		goto bad_swap;
1979
1980	/* OK, set up the swap map and apply the bad block list */
1981	swap_map = vmalloc(maxpages);
1982	if (!swap_map) {
1983		error = -ENOMEM;
1984		goto bad_swap;
1985	}
1986
1987	memset(swap_map, 0, maxpages);
1988	nr_good_pages = maxpages - 1;	/* omit header page */
1989
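	/*
	 * Apply the header's bad-block table: each listed page is marked
	 * SWAP_MAP_BAD so the allocator never hands out that slot.
	 */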
1990	for (i = 0; i < swap_header->info.nr_badpages; i++) {
1991		unsigned int page_nr = swap_header->info.badpages[i];
1992		if (page_nr == 0 || page_nr > swap_header->info.last_page) {
1993			error = -EINVAL;
1994			goto bad_swap;
1995		}
1996		if (page_nr < maxpages) {
1997			swap_map[page_nr] = SWAP_MAP_BAD;
1998			nr_good_pages--;
1999		}
2000	}
2001
2002	error = swap_cgroup_swapon(type, maxpages);
2003	if (error)
2004		goto bad_swap;
2005
2006	if (nr_good_pages) {
2007		swap_map[0] = SWAP_MAP_BAD;
2008		p->max = maxpages;
2009		p->pages = nr_good_pages;
2010		nr_extents = setup_swap_extents(p, &span);
2011		if (nr_extents < 0) {
2012			error = nr_extents;
2013			goto bad_swap;
2014		}
2015		nr_good_pages = p->pages;
2016	}
2017	if (!nr_good_pages) {
2018		printk(KERN_WARNING "Empty swap-file\n");
2019		error = -EINVAL;
2020		goto bad_swap;
2021	}
2022
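	/*
	 * On non-rotational (SSD) backing, start allocation at a random
	 * cluster, since seek locality is irrelevant there; and if
	 * discarding the still-unused area succeeds, remember that freed
	 * swap clusters may be discarded later.
	 */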
2023	if (p->bdev) {
2024		if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
2025			p->flags |= SWP_SOLIDSTATE;
2026			p->cluster_next = 1 + (random32() % p->highest_bit);
2027		}
2028		if (discard_swap(p) == 0)
2029			p->flags |= SWP_DISCARDABLE;
2030	}
2031
2032	mutex_lock(&swapon_mutex);
2033	spin_lock(&swap_lock);
2034	if (swap_flags & SWAP_FLAG_PREFER)
2035		p->prio =
2036		  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2037	else
2038		p->prio = --least_priority;
2039	p->swap_map = swap_map;
2040	p->flags |= SWP_WRITEOK;
2041	nr_swap_pages += nr_good_pages;
2042	total_swap_pages += nr_good_pages;
2043
2044	printk(KERN_INFO "Adding %uk swap on %s.  "
2045			"Priority:%d extents:%d across:%lluk %s%s\n",
2046		nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
2047		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2048		(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2049		(p->flags & SWP_DISCARDABLE) ? "D" : "");
2050
2051	/* insert swap space into swap_list, kept sorted by decreasing priority: */
2052	prev = -1;
2053	for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
2054		if (p->prio >= swap_info[i]->prio)
2055			break;
2056		prev = i;
2057	}
2058	p->next = i;
2059	if (prev < 0)
2060		swap_list.head = swap_list.next = type;
2061	else
2062		swap_info[prev]->next = type;
2063	spin_unlock(&swap_lock);
2064	mutex_unlock(&swapon_mutex);
2065	error = 0;
2066	goto out;
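	/*
	 * Error paths: undo in reverse order. Restore and release the block
	 * device if one was claimed, tear down extents and the swap cgroup
	 * record, then clear the swap_info slot under swap_lock, free the
	 * swap_map and close the file.
	 */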
2067bad_swap:
2068	if (bdev) {
2069		set_blocksize(bdev, p->old_block_size);
2070		bd_release(bdev);
2071	}
2072	destroy_swap_extents(p);
2073	swap_cgroup_swapoff(type);
2074bad_swap_2:
2075	spin_lock(&swap_lock);
2076	p->swap_file = NULL;
2077	p->flags = 0;
2078	spin_unlock(&swap_lock);
2079	vfree(swap_map);
2080	if (swap_file)
2081		filp_close(swap_file, NULL);
2082out:
2083	if (page && !IS_ERR(page)) {
2084		kunmap(page);
2085		page_cache_release(page);
2086	}
2087	if (name)
2088		putname(name);
2089	if (did_down) {
2090		if (!error)
2091			inode->i_flags |= S_SWAPFILE;
2092		mutex_unlock(&inode->i_mutex);
2093	}
2094	return error;
2095}
2096
2097void si_swapinfo(struct sysinfo *val)
2098{
2099	unsigned int type;
2100	unsigned long nr_to_be_unused = 0;
2101
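	/*
	 * An area that is SWP_USED but no longer SWP_WRITEOK is in the
	 * middle of swapoff: its in-use pages are added back into both
	 * the free and total figures reported to userspace.
	 */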
2102	spin_lock(&swap_lock);
2103	for (type = 0; type < nr_swapfiles; type++) {
2104		struct swap_info_struct *si = swap_info[type];
2105
2106		if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
2107			nr_to_be_unused += si->inuse_pages;
2108	}
2109	val->freeswap = nr_swap_pages + nr_to_be_unused;
2110	val->totalswap = total_swap_pages + nr_to_be_unused;
2111	spin_unlock(&swap_lock);
2112}
2113
2114/*
2115 * Verify that a swap entry is valid and increment its swap map count.
2116 *
2117	 * Returns one of the following, depending on the case:
2118	 * - success -> 0
2119 * - swp_entry is invalid -> EINVAL
2120 * - swp_entry is migration entry -> EINVAL
2121 * - swap-cache reference is requested but there is already one. -> EEXIST
2122 * - swap-cache reference is requested but the entry is not used. -> ENOENT
2123 * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
2124 */
2125static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2126{
2127	struct swap_info_struct *p;
2128	unsigned long offset, type;
2129	unsigned char count;
2130	unsigned char has_cache;
2131	int err = -EINVAL;
2132
2133	if (non_swap_entry(entry))
2134		goto out;
2135
2136	type = swp_type(entry);
2137	if (type >= nr_swapfiles)
2138		goto bad_file;
2139	p = swap_info[type];
2140	offset = swp_offset(entry);
2141
2142	spin_lock(&swap_lock);
2143	if (unlikely(offset >= p->max))
2144		goto unlock_out;
2145
2146	count = p->swap_map[offset];
2147	has_cache = count & SWAP_HAS_CACHE;
2148	count &= ~SWAP_HAS_CACHE;
2149	err = 0;
2150
2151	if (usage == SWAP_HAS_CACHE) {
2152
2153		/* set SWAP_HAS_CACHE if there is no cache and entry is used */
2154		if (!has_cache && count)
2155			has_cache = SWAP_HAS_CACHE;
2156		else if (has_cache)		/* someone else added cache */
2157			err = -EEXIST;
2158		else				/* no users remaining */
2159			err = -ENOENT;
2160
2161	} else if (count || has_cache) {
2162
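		/*
		 * Take another map reference: bump the count while it is
		 * below SWAP_MAP_MAX, otherwise spill into a continuation
		 * page (or fail with -ENOMEM if none can be attached).
		 */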
2163		if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
2164			count += usage;
2165		else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
2166			err = -EINVAL;
2167		else if (swap_count_continued(p, offset, count))
2168			count = COUNT_CONTINUED;
2169		else
2170			err = -ENOMEM;
2171	} else
2172		err = -ENOENT;			/* unused swap entry */
2173
2174	p->swap_map[offset] = count | has_cache;
2175
2176unlock_out:
2177	spin_unlock(&swap_lock);
2178out:
2179	return err;
2180
2181bad_file:
2182	printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
2183	goto out;
2184}
2185
2186/*
2187	 * Help swapoff by noting that the swap entry belongs to shmem/tmpfs
2188 * (in which case its reference count is never incremented).
2189 */
2190void swap_shmem_alloc(swp_entry_t entry)
2191{
2192	__swap_duplicate(entry, SWAP_MAP_SHMEM);
2193}
2194
2195/*
2196 * Increase reference count of swap entry by 1.
2197 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
2198 * but could not be atomically allocated.  Returns 0, just as if it succeeded,
2199 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
2200 * might occur if a page table entry has got corrupted.
2201 */
2202int swap_duplicate(swp_entry_t entry)
2203{
2204	int err = 0;
2205
2206	while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
2207		err = add_swap_count_continuation(entry, GFP_ATOMIC);
2208	return err;
2209}
2210
2211/*
2212 * @entry: swap entry for which we allocate swap cache.
2213 *
2214	 * Called when allocating swap cache for an existing swap entry.
2215	 * This can return error codes; it returns 0 on success.
2216	 * -EEXIST means there is already a swap cache for the entry.
2217	 * Note: the return codes differ from those of swap_duplicate().
2218 */
2219int swapcache_prepare(swp_entry_t entry)
2220{
2221	return __swap_duplicate(entry, SWAP_HAS_CACHE);
2222}
2223
2224/*
2225	 * swap_lock prevents the swap_map from being freed. Don't grab an extra
2226	 * reference on the swaphandle; it doesn't matter if it becomes unused.
2227 */
2228int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2229{
2230	struct swap_info_struct *si;
2231	int our_page_cluster = page_cluster;
2232	pgoff_t target, toff;
2233	pgoff_t base, end;
2234	int nr_pages = 0;
2235
2236	if (!our_page_cluster)	/* no readahead */
2237		return 0;
2238
2239	si = swap_info[swp_type(entry)];
2240	target = swp_offset(entry);
2241	base = (target >> our_page_cluster) << our_page_cluster;
2242	end = base + (1 << our_page_cluster);
2243	if (!base)		/* first page is swap header */
2244		base++;
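	/*
	 * Example (assumed values): with page_cluster == 3 and a target
	 * offset of 53, base is 48 and end is 56, so at most offsets
	 * 48..55 around the target are considered for readahead.
	 */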
2245
2246	spin_lock(&swap_lock);
2247	if (end > si->max)	/* don't go beyond end of map */
2248		end = si->max;
2249
2250	/* Count contiguous allocated slots above our target */
2251	for (toff = target; ++toff < end; nr_pages++) {
2252		/* Don't read in free or bad pages */
2253		if (!si->swap_map[toff])
2254			break;
2255		if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2256			break;
2257	}
2258	/* Count contiguous allocated slots below our target */
2259	for (toff = target; --toff >= base; nr_pages++) {
2260		/* Don't read in free or bad pages */
2261		if (!si->swap_map[toff])
2262			break;
2263		if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2264			break;
2265	}
2266	spin_unlock(&swap_lock);
2267
2268	/*
2269	 * Indicate starting offset, and return number of pages to get:
2270	 * if only 1, say 0, since there's then no readahead to be done.
2271	 */
2272	*offset = ++toff;
2273	return nr_pages ? ++nr_pages : 0;
2274}
2275
2276/*
2277 * add_swap_count_continuation - called when a swap count is duplicated
2278	 * beyond SWAP_MAP_MAX: it allocates a new page and links that to the entry's
2279 * page of the original vmalloc'ed swap_map, to hold the continuation count
2280 * (for that entry and for its neighbouring PAGE_SIZE swap entries).  Called
2281 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
2282 *
2283 * These continuation pages are seldom referenced: the common paths all work
2284 * on the original swap_map, only referring to a continuation page when the
2285 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
2286 *
2287 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
2288 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
2289 * can be called after dropping locks.
2290 */
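/*
 * A sketch of the expected call pattern (callers live elsewhere, e.g. the
 * fork-time page table copy): try swap_duplicate() under the page table
 * lock; if it returns -ENOMEM, drop the lock, call
 * add_swap_count_continuation(entry, GFP_KERNEL), and retry the duplicate.
 */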
2291int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2292{
2293	struct swap_info_struct *si;
2294	struct page *head;
2295	struct page *page;
2296	struct page *list_page;
2297	pgoff_t offset;
2298	unsigned char count;
2299
2300	/*
2301	 * When debugging, it's easier to use __GFP_ZERO here; but it's better
2302	 * for latency not to zero a page while GFP_ATOMIC and holding locks.
2303	 */
2304	page = alloc_page(gfp_mask | __GFP_HIGHMEM);
2305
2306	si = swap_info_get(entry);
2307	if (!si) {
2308		/*
2309		 * An acceptable race has occurred since the failing
2310		 * __swap_duplicate(): the swap entry has been freed,
2311		 * perhaps even the whole swap_map cleared for swapoff.
2312		 */
2313		goto outer;
2314	}
2315
2316	offset = swp_offset(entry);
2317	count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
2318
2319	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
2320		/*
2321		 * The higher the swap count, the more likely it is that tasks
2322		 * will race to add swap count continuation: we need to avoid
2323		 * over-provisioning.
2324		 */
2325		goto out;
2326	}
2327
2328	if (!page) {
2329		spin_unlock(&swap_lock);
2330		return -ENOMEM;
2331	}
2332
2333	/*
2334	 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
2335	 * no architecture is using highmem pages for kernel pagetables: so it
2336	 * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps.
2337	 */
2338	head = vmalloc_to_page(si->swap_map + offset);
2339	offset &= ~PAGE_MASK;
2340
2341	/*
2342	 * Page allocation does not initialize the page's lru field,
2343	 * but it does always reset its private field.
2344	 */
2345	if (!page_private(head)) {
2346		BUG_ON(count & COUNT_CONTINUED);
2347		INIT_LIST_HEAD(&head->lru);
2348		set_page_private(head, SWP_CONTINUED);
2349		si->flags |= SWP_CONTINUED;
2350	}
2351
2352	list_for_each_entry(list_page, &head->lru, lru) {
2353		unsigned char *map;
2354
2355		/*
2356		 * If the previous map said no continuation, but we've found
2357		 * a continuation page, free our allocation and use this one.
2358		 */
2359		if (!(count & COUNT_CONTINUED))
2360			goto out;
2361
2362		map = kmap_atomic(list_page, KM_USER0) + offset;
2363		count = *map;
2364		kunmap_atomic(map, KM_USER0);
2365
2366		/*
2367		 * If this continuation count now has some space in it,
2368		 * free our allocation and use this one.
2369		 */
2370		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
2371			goto out;
2372	}
2373
2374	list_add_tail(&page->lru, &head->lru);
2375	page = NULL;			/* now it's attached, don't free it */
2376out:
2377	spin_unlock(&swap_lock);
2378outer:
2379	if (page)
2380		__free_page(page);
2381	return 0;
2382}
2383
2384/*
2385 * swap_count_continued - when the original swap_map count is incremented
2386 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
2387 * into, carry if so, or else fail until a new continuation page is allocated;
2388 * when the original swap_map count is decremented from 0 with continuation,
2389 * borrow from the continuation and report whether it still holds more.
2390 * Called while __swap_duplicate() or swap_entry_free() holds swap_lock.
2391 */
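/*
 * In effect each offset's count is a multi-digit number stored lowest digit
 * first: the swap_map byte is the low digit, each continuation page in the
 * list supplies the next higher digit, and COUNT_CONTINUED means "more
 * digits follow".
 */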
2392static bool swap_count_continued(struct swap_info_struct *si,
2393				 pgoff_t offset, unsigned char count)
2394{
2395	struct page *head;
2396	struct page *page;
2397	unsigned char *map;
2398
2399	head = vmalloc_to_page(si->swap_map + offset);
2400	if (page_private(head) != SWP_CONTINUED) {
2401		BUG_ON(count & COUNT_CONTINUED);
2402		return false;		/* need to add count continuation */
2403	}
2404
2405	offset &= ~PAGE_MASK;
2406	page = list_entry(head->lru.next, struct page, lru);
2407	map = kmap_atomic(page, KM_USER0) + offset;
2408
2409	if (count == SWAP_MAP_MAX)	/* initial increment from swap_map */
2410		goto init_map;		/* jump over SWAP_CONT_MAX checks */
2411
2412	if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
2413		/*
2414		 * Think of how you add 1 to 999
2415		 */
2416		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
2417			kunmap_atomic(map, KM_USER0);
2418			page = list_entry(page->lru.next, struct page, lru);
2419			BUG_ON(page == head);
2420			map = kmap_atomic(page, KM_USER0) + offset;
2421		}
2422		if (*map == SWAP_CONT_MAX) {
2423			kunmap_atomic(map, KM_USER0);
2424			page = list_entry(page->lru.next, struct page, lru);
2425			if (page == head)
2426				return false;	/* add count continuation */
2427			map = kmap_atomic(page, KM_USER0) + offset;
2428init_map:		*map = 0;		/* we didn't zero the page */
2429		}
2430		*map += 1;
2431		kunmap_atomic(map, KM_USER0);
2432		page = list_entry(page->lru.prev, struct page, lru);
2433		while (page != head) {
2434			map = kmap_atomic(page, KM_USER0) + offset;
2435			*map = COUNT_CONTINUED;
2436			kunmap_atomic(map, KM_USER0);
2437			page = list_entry(page->lru.prev, struct page, lru);
2438		}
2439		return true;			/* incremented */
2440
2441	} else {				/* decrementing */
2442		/*
2443		 * Think of how you subtract 1 from 1000
2444		 */
2445		BUG_ON(count != COUNT_CONTINUED);
2446		while (*map == COUNT_CONTINUED) {
2447			kunmap_atomic(map, KM_USER0);
2448			page = list_entry(page->lru.next, struct page, lru);
2449			BUG_ON(page == head);
2450			map = kmap_atomic(page, KM_USER0) + offset;
2451		}
2452		BUG_ON(*map == 0);
2453		*map -= 1;
2454		if (*map == 0)
2455			count = 0;
2456		kunmap_atomic(map, KM_USER0);
2457		page = list_entry(page->lru.prev, struct page, lru);
2458		while (page != head) {
2459			map = kmap_atomic(page, KM_USER0) + offset;
2460			*map = SWAP_CONT_MAX | count;
2461			count = COUNT_CONTINUED;
2462			kunmap_atomic(map, KM_USER0);
2463			page = list_entry(page->lru.prev, struct page, lru);
2464		}
2465		return count == COUNT_CONTINUED;
2466	}
2467}
2468
2469/*
2470	 * free_swap_count_continuations - called by swapoff to free all continuation
2471	 * pages appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
2472 */
2473static void free_swap_count_continuations(struct swap_info_struct *si)
2474{
2475	pgoff_t offset;
2476
2477	for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
2478		struct page *head;
2479		head = vmalloc_to_page(si->swap_map + offset);
2480		if (page_private(head)) {
2481			struct list_head *this, *next;
2482			list_for_each_safe(this, next, &head->lru) {
2483				struct page *page;
2484				page = list_entry(this, struct page, lru);
2485				list_del(this);
2486				__free_page(page);
2487			}
2488		}
2489	}
2490}
2491