swapfile.c revision f29ad6a99b596b8169744d107bf088e8be9e8d0d
1/* 2 * linux/mm/swapfile.c 3 * 4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 5 * Swap reorganised 29.12.95, Stephen Tweedie 6 */ 7 8#include <linux/mm.h> 9#include <linux/hugetlb.h> 10#include <linux/mman.h> 11#include <linux/slab.h> 12#include <linux/kernel_stat.h> 13#include <linux/swap.h> 14#include <linux/vmalloc.h> 15#include <linux/pagemap.h> 16#include <linux/namei.h> 17#include <linux/shm.h> 18#include <linux/blkdev.h> 19#include <linux/random.h> 20#include <linux/writeback.h> 21#include <linux/proc_fs.h> 22#include <linux/seq_file.h> 23#include <linux/init.h> 24#include <linux/module.h> 25#include <linux/rmap.h> 26#include <linux/security.h> 27#include <linux/backing-dev.h> 28#include <linux/mutex.h> 29#include <linux/capability.h> 30#include <linux/syscalls.h> 31#include <linux/memcontrol.h> 32 33#include <asm/pgtable.h> 34#include <asm/tlbflush.h> 35#include <linux/swapops.h> 36#include <linux/page_cgroup.h> 37 38static DEFINE_SPINLOCK(swap_lock); 39static unsigned int nr_swapfiles; 40long nr_swap_pages; 41long total_swap_pages; 42static int swap_overflow; 43static int least_priority; 44 45static const char Bad_file[] = "Bad swap file entry "; 46static const char Unused_file[] = "Unused swap file entry "; 47static const char Bad_offset[] = "Bad swap offset entry "; 48static const char Unused_offset[] = "Unused swap offset entry "; 49 50static struct swap_list_t swap_list = {-1, -1}; 51 52static struct swap_info_struct swap_info[MAX_SWAPFILES]; 53 54static DEFINE_MUTEX(swapon_mutex); 55 56/* For reference count accounting in swap_map */ 57/* enum for swap_map[] handling. 
internal use only */ 58enum { 59 SWAP_MAP = 0, /* ops for reference from swap users */ 60 SWAP_CACHE, /* ops for reference from swap cache */ 61}; 62 63static inline int swap_count(unsigned short ent) 64{ 65 return ent & SWAP_COUNT_MASK; 66} 67 68static inline bool swap_has_cache(unsigned short ent) 69{ 70 return !!(ent & SWAP_HAS_CACHE); 71} 72 73static inline unsigned short encode_swapmap(int count, bool has_cache) 74{ 75 unsigned short ret = count; 76 77 if (has_cache) 78 return SWAP_HAS_CACHE | ret; 79 return ret; 80} 81 82/* returnes 1 if swap entry is freed */ 83static int 84__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) 85{ 86 int type = si - swap_info; 87 swp_entry_t entry = swp_entry(type, offset); 88 struct page *page; 89 int ret = 0; 90 91 page = find_get_page(&swapper_space, entry.val); 92 if (!page) 93 return 0; 94 /* 95 * This function is called from scan_swap_map() and it's called 96 * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here. 97 * We have to use trylock for avoiding deadlock. This is a special 98 * case and you should use try_to_free_swap() with explicit lock_page() 99 * in usual operations. 100 */ 101 if (trylock_page(page)) { 102 ret = try_to_free_swap(page); 103 unlock_page(page); 104 } 105 page_cache_release(page); 106 return ret; 107} 108 109/* 110 * We need this because the bdev->unplug_fn can sleep and we cannot 111 * hold swap_lock while calling the unplug_fn. And swap_lock 112 * cannot be turned into a mutex. 
113 */ 114static DECLARE_RWSEM(swap_unplug_sem); 115 116void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) 117{ 118 swp_entry_t entry; 119 120 down_read(&swap_unplug_sem); 121 entry.val = page_private(page); 122 if (PageSwapCache(page)) { 123 struct block_device *bdev = swap_info[swp_type(entry)].bdev; 124 struct backing_dev_info *bdi; 125 126 /* 127 * If the page is removed from swapcache from under us (with a 128 * racy try_to_unuse/swapoff) we need an additional reference 129 * count to avoid reading garbage from page_private(page) above. 130 * If the WARN_ON triggers during a swapoff it maybe the race 131 * condition and it's harmless. However if it triggers without 132 * swapoff it signals a problem. 133 */ 134 WARN_ON(page_count(page) <= 1); 135 136 bdi = bdev->bd_inode->i_mapping->backing_dev_info; 137 blk_run_backing_dev(bdi, page); 138 } 139 up_read(&swap_unplug_sem); 140} 141 142/* 143 * swapon tell device that all the old swap contents can be discarded, 144 * to allow the swap device to optimize its wear-levelling. 145 */ 146static int discard_swap(struct swap_info_struct *si) 147{ 148 struct swap_extent *se; 149 int err = 0; 150 151 list_for_each_entry(se, &si->extent_list, list) { 152 sector_t start_block = se->start_block << (PAGE_SHIFT - 9); 153 sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); 154 155 if (se->start_page == 0) { 156 /* Do not discard the swap header page! */ 157 start_block += 1 << (PAGE_SHIFT - 9); 158 nr_blocks -= 1 << (PAGE_SHIFT - 9); 159 if (!nr_blocks) 160 continue; 161 } 162 163 err = blkdev_issue_discard(si->bdev, start_block, 164 nr_blocks, GFP_KERNEL, 165 DISCARD_FL_BARRIER); 166 if (err) 167 break; 168 169 cond_resched(); 170 } 171 return err; /* That will often be -EOPNOTSUPP */ 172} 173 174/* 175 * swap allocation tell device that a cluster of swap can now be discarded, 176 * to allow the swap device to optimize its wear-levelling. 
177 */ 178static void discard_swap_cluster(struct swap_info_struct *si, 179 pgoff_t start_page, pgoff_t nr_pages) 180{ 181 struct swap_extent *se = si->curr_swap_extent; 182 int found_extent = 0; 183 184 while (nr_pages) { 185 struct list_head *lh; 186 187 if (se->start_page <= start_page && 188 start_page < se->start_page + se->nr_pages) { 189 pgoff_t offset = start_page - se->start_page; 190 sector_t start_block = se->start_block + offset; 191 sector_t nr_blocks = se->nr_pages - offset; 192 193 if (nr_blocks > nr_pages) 194 nr_blocks = nr_pages; 195 start_page += nr_blocks; 196 nr_pages -= nr_blocks; 197 198 if (!found_extent++) 199 si->curr_swap_extent = se; 200 201 start_block <<= PAGE_SHIFT - 9; 202 nr_blocks <<= PAGE_SHIFT - 9; 203 if (blkdev_issue_discard(si->bdev, start_block, 204 nr_blocks, GFP_NOIO, 205 DISCARD_FL_BARRIER)) 206 break; 207 } 208 209 lh = se->list.next; 210 if (lh == &si->extent_list) 211 lh = lh->next; 212 se = list_entry(lh, struct swap_extent, list); 213 } 214} 215 216static int wait_for_discard(void *word) 217{ 218 schedule(); 219 return 0; 220} 221 222#define SWAPFILE_CLUSTER 256 223#define LATENCY_LIMIT 256 224 225static inline unsigned long scan_swap_map(struct swap_info_struct *si, 226 int cache) 227{ 228 unsigned long offset; 229 unsigned long scan_base; 230 unsigned long last_in_cluster = 0; 231 int latency_ration = LATENCY_LIMIT; 232 int found_free_cluster = 0; 233 234 /* 235 * We try to cluster swap pages by allocating them sequentially 236 * in swap. Once we've allocated SWAPFILE_CLUSTER pages this 237 * way, however, we resort to first-free allocation, starting 238 * a new cluster. This prevents us from scattering swap pages 239 * all over the entire swap partition, so that we reduce 240 * overall disk seek times between swap pages. -- sct 241 * But we do now try to find an empty cluster. -Andrea 242 * And we let swap pages go all over an SSD partition. 
Hugh 243 */ 244 245 si->flags += SWP_SCANNING; 246 scan_base = offset = si->cluster_next; 247 248 if (unlikely(!si->cluster_nr--)) { 249 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { 250 si->cluster_nr = SWAPFILE_CLUSTER - 1; 251 goto checks; 252 } 253 if (si->flags & SWP_DISCARDABLE) { 254 /* 255 * Start range check on racing allocations, in case 256 * they overlap the cluster we eventually decide on 257 * (we scan without swap_lock to allow preemption). 258 * It's hardly conceivable that cluster_nr could be 259 * wrapped during our scan, but don't depend on it. 260 */ 261 if (si->lowest_alloc) 262 goto checks; 263 si->lowest_alloc = si->max; 264 si->highest_alloc = 0; 265 } 266 spin_unlock(&swap_lock); 267 268 /* 269 * If seek is expensive, start searching for new cluster from 270 * start of partition, to minimize the span of allocated swap. 271 * But if seek is cheap, search from our current position, so 272 * that swap is allocated from all over the partition: if the 273 * Flash Translation Layer only remaps within limited zones, 274 * we don't want to wear out the first zone too quickly. 
275 */ 276 if (!(si->flags & SWP_SOLIDSTATE)) 277 scan_base = offset = si->lowest_bit; 278 last_in_cluster = offset + SWAPFILE_CLUSTER - 1; 279 280 /* Locate the first empty (unaligned) cluster */ 281 for (; last_in_cluster <= si->highest_bit; offset++) { 282 if (si->swap_map[offset]) 283 last_in_cluster = offset + SWAPFILE_CLUSTER; 284 else if (offset == last_in_cluster) { 285 spin_lock(&swap_lock); 286 offset -= SWAPFILE_CLUSTER - 1; 287 si->cluster_next = offset; 288 si->cluster_nr = SWAPFILE_CLUSTER - 1; 289 found_free_cluster = 1; 290 goto checks; 291 } 292 if (unlikely(--latency_ration < 0)) { 293 cond_resched(); 294 latency_ration = LATENCY_LIMIT; 295 } 296 } 297 298 offset = si->lowest_bit; 299 last_in_cluster = offset + SWAPFILE_CLUSTER - 1; 300 301 /* Locate the first empty (unaligned) cluster */ 302 for (; last_in_cluster < scan_base; offset++) { 303 if (si->swap_map[offset]) 304 last_in_cluster = offset + SWAPFILE_CLUSTER; 305 else if (offset == last_in_cluster) { 306 spin_lock(&swap_lock); 307 offset -= SWAPFILE_CLUSTER - 1; 308 si->cluster_next = offset; 309 si->cluster_nr = SWAPFILE_CLUSTER - 1; 310 found_free_cluster = 1; 311 goto checks; 312 } 313 if (unlikely(--latency_ration < 0)) { 314 cond_resched(); 315 latency_ration = LATENCY_LIMIT; 316 } 317 } 318 319 offset = scan_base; 320 spin_lock(&swap_lock); 321 si->cluster_nr = SWAPFILE_CLUSTER - 1; 322 si->lowest_alloc = 0; 323 } 324 325checks: 326 if (!(si->flags & SWP_WRITEOK)) 327 goto no_page; 328 if (!si->highest_bit) 329 goto no_page; 330 if (offset > si->highest_bit) 331 scan_base = offset = si->lowest_bit; 332 333 /* reuse swap entry of cache-only swap if not busy. 
*/ 334 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 335 int swap_was_freed; 336 spin_unlock(&swap_lock); 337 swap_was_freed = __try_to_reclaim_swap(si, offset); 338 spin_lock(&swap_lock); 339 /* entry was freed successfully, try to use this again */ 340 if (swap_was_freed) 341 goto checks; 342 goto scan; /* check next one */ 343 } 344 345 if (si->swap_map[offset]) 346 goto scan; 347 348 if (offset == si->lowest_bit) 349 si->lowest_bit++; 350 if (offset == si->highest_bit) 351 si->highest_bit--; 352 si->inuse_pages++; 353 if (si->inuse_pages == si->pages) { 354 si->lowest_bit = si->max; 355 si->highest_bit = 0; 356 } 357 if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ 358 si->swap_map[offset] = encode_swapmap(0, true); 359 else /* at suspend */ 360 si->swap_map[offset] = encode_swapmap(1, false); 361 si->cluster_next = offset + 1; 362 si->flags -= SWP_SCANNING; 363 364 if (si->lowest_alloc) { 365 /* 366 * Only set when SWP_DISCARDABLE, and there's a scan 367 * for a free cluster in progress or just completed. 368 */ 369 if (found_free_cluster) { 370 /* 371 * To optimize wear-levelling, discard the 372 * old data of the cluster, taking care not to 373 * discard any of its pages that have already 374 * been allocated by racing tasks (offset has 375 * already stepped over any at the beginning). 376 */ 377 if (offset < si->highest_alloc && 378 si->lowest_alloc <= last_in_cluster) 379 last_in_cluster = si->lowest_alloc - 1; 380 si->flags |= SWP_DISCARDING; 381 spin_unlock(&swap_lock); 382 383 if (offset < last_in_cluster) 384 discard_swap_cluster(si, offset, 385 last_in_cluster - offset + 1); 386 387 spin_lock(&swap_lock); 388 si->lowest_alloc = 0; 389 si->flags &= ~SWP_DISCARDING; 390 391 smp_mb(); /* wake_up_bit advises this */ 392 wake_up_bit(&si->flags, ilog2(SWP_DISCARDING)); 393 394 } else if (si->flags & SWP_DISCARDING) { 395 /* 396 * Delay using pages allocated by racing tasks 397 * until the whole discard has been issued. 
We 398 * could defer that delay until swap_writepage, 399 * but it's easier to keep this self-contained. 400 */ 401 spin_unlock(&swap_lock); 402 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), 403 wait_for_discard, TASK_UNINTERRUPTIBLE); 404 spin_lock(&swap_lock); 405 } else { 406 /* 407 * Note pages allocated by racing tasks while 408 * scan for a free cluster is in progress, so 409 * that its final discard can exclude them. 410 */ 411 if (offset < si->lowest_alloc) 412 si->lowest_alloc = offset; 413 if (offset > si->highest_alloc) 414 si->highest_alloc = offset; 415 } 416 } 417 return offset; 418 419scan: 420 spin_unlock(&swap_lock); 421 while (++offset <= si->highest_bit) { 422 if (!si->swap_map[offset]) { 423 spin_lock(&swap_lock); 424 goto checks; 425 } 426 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 427 spin_lock(&swap_lock); 428 goto checks; 429 } 430 if (unlikely(--latency_ration < 0)) { 431 cond_resched(); 432 latency_ration = LATENCY_LIMIT; 433 } 434 } 435 offset = si->lowest_bit; 436 while (++offset < scan_base) { 437 if (!si->swap_map[offset]) { 438 spin_lock(&swap_lock); 439 goto checks; 440 } 441 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 442 spin_lock(&swap_lock); 443 goto checks; 444 } 445 if (unlikely(--latency_ration < 0)) { 446 cond_resched(); 447 latency_ration = LATENCY_LIMIT; 448 } 449 } 450 spin_lock(&swap_lock); 451 452no_page: 453 si->flags -= SWP_SCANNING; 454 return 0; 455} 456 457swp_entry_t get_swap_page(void) 458{ 459 struct swap_info_struct *si; 460 pgoff_t offset; 461 int type, next; 462 int wrapped = 0; 463 464 spin_lock(&swap_lock); 465 if (nr_swap_pages <= 0) 466 goto noswap; 467 nr_swap_pages--; 468 469 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { 470 si = swap_info + type; 471 next = si->next; 472 if (next < 0 || 473 (!wrapped && si->prio != swap_info[next].prio)) { 474 next = swap_list.head; 475 wrapped++; 476 } 477 478 if (!si->highest_bit) 479 continue; 480 if 
(!(si->flags & SWP_WRITEOK)) 481 continue; 482 483 swap_list.next = next; 484 /* This is called for allocating swap entry for cache */ 485 offset = scan_swap_map(si, SWAP_CACHE); 486 if (offset) { 487 spin_unlock(&swap_lock); 488 return swp_entry(type, offset); 489 } 490 next = swap_list.next; 491 } 492 493 nr_swap_pages++; 494noswap: 495 spin_unlock(&swap_lock); 496 return (swp_entry_t) {0}; 497} 498 499/* The only caller of this function is now susupend routine */ 500swp_entry_t get_swap_page_of_type(int type) 501{ 502 struct swap_info_struct *si; 503 pgoff_t offset; 504 505 spin_lock(&swap_lock); 506 si = swap_info + type; 507 if (si->flags & SWP_WRITEOK) { 508 nr_swap_pages--; 509 /* This is called for allocating swap entry, not cache */ 510 offset = scan_swap_map(si, SWAP_MAP); 511 if (offset) { 512 spin_unlock(&swap_lock); 513 return swp_entry(type, offset); 514 } 515 nr_swap_pages++; 516 } 517 spin_unlock(&swap_lock); 518 return (swp_entry_t) {0}; 519} 520 521static struct swap_info_struct * swap_info_get(swp_entry_t entry) 522{ 523 struct swap_info_struct * p; 524 unsigned long offset, type; 525 526 if (!entry.val) 527 goto out; 528 type = swp_type(entry); 529 if (type >= nr_swapfiles) 530 goto bad_nofile; 531 p = & swap_info[type]; 532 if (!(p->flags & SWP_USED)) 533 goto bad_device; 534 offset = swp_offset(entry); 535 if (offset >= p->max) 536 goto bad_offset; 537 if (!p->swap_map[offset]) 538 goto bad_free; 539 spin_lock(&swap_lock); 540 return p; 541 542bad_free: 543 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); 544 goto out; 545bad_offset: 546 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); 547 goto out; 548bad_device: 549 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); 550 goto out; 551bad_nofile: 552 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); 553out: 554 return NULL; 555} 556 557static int swap_entry_free(struct swap_info_struct *p, 558 swp_entry_t ent, int cache) 559{ 560 
unsigned long offset = swp_offset(ent); 561 int count = swap_count(p->swap_map[offset]); 562 bool has_cache; 563 564 has_cache = swap_has_cache(p->swap_map[offset]); 565 566 if (cache == SWAP_MAP) { /* dropping usage count of swap */ 567 if (count < SWAP_MAP_MAX) { 568 count--; 569 p->swap_map[offset] = encode_swapmap(count, has_cache); 570 } 571 } else { /* dropping swap cache flag */ 572 VM_BUG_ON(!has_cache); 573 p->swap_map[offset] = encode_swapmap(count, false); 574 575 } 576 /* return code. */ 577 count = p->swap_map[offset]; 578 /* free if no reference */ 579 if (!count) { 580 if (offset < p->lowest_bit) 581 p->lowest_bit = offset; 582 if (offset > p->highest_bit) 583 p->highest_bit = offset; 584 if (p->prio > swap_info[swap_list.next].prio) 585 swap_list.next = p - swap_info; 586 nr_swap_pages++; 587 p->inuse_pages--; 588 } 589 if (!swap_count(count)) 590 mem_cgroup_uncharge_swap(ent); 591 return count; 592} 593 594/* 595 * Caller has made sure that the swapdevice corresponding to entry 596 * is still around or has not been recycled. 597 */ 598void swap_free(swp_entry_t entry) 599{ 600 struct swap_info_struct * p; 601 602 p = swap_info_get(entry); 603 if (p) { 604 swap_entry_free(p, entry, SWAP_MAP); 605 spin_unlock(&swap_lock); 606 } 607} 608 609/* 610 * Called after dropping swapcache to decrease refcnt to swap entries. 611 */ 612void swapcache_free(swp_entry_t entry, struct page *page) 613{ 614 struct swap_info_struct *p; 615 int ret; 616 617 p = swap_info_get(entry); 618 if (p) { 619 ret = swap_entry_free(p, entry, SWAP_CACHE); 620 if (page) { 621 bool swapout; 622 if (ret) 623 swapout = true; /* the end of swap out */ 624 else 625 swapout = false; /* no more swap users! */ 626 mem_cgroup_uncharge_swapcache(page, entry, swapout); 627 } 628 spin_unlock(&swap_lock); 629 } 630 return; 631} 632 633/* 634 * How many references to page are currently swapped out? 
635 */ 636static inline int page_swapcount(struct page *page) 637{ 638 int count = 0; 639 struct swap_info_struct *p; 640 swp_entry_t entry; 641 642 entry.val = page_private(page); 643 p = swap_info_get(entry); 644 if (p) { 645 count = swap_count(p->swap_map[swp_offset(entry)]); 646 spin_unlock(&swap_lock); 647 } 648 return count; 649} 650 651/* 652 * We can write to an anon page without COW if there are no other references 653 * to it. And as a side-effect, free up its swap: because the old content 654 * on disk will never be read, and seeking back there to write new content 655 * later would only waste time away from clustering. 656 */ 657int reuse_swap_page(struct page *page) 658{ 659 int count; 660 661 VM_BUG_ON(!PageLocked(page)); 662 count = page_mapcount(page); 663 if (count <= 1 && PageSwapCache(page)) { 664 count += page_swapcount(page); 665 if (count == 1 && !PageWriteback(page)) { 666 delete_from_swap_cache(page); 667 SetPageDirty(page); 668 } 669 } 670 return count == 1; 671} 672 673/* 674 * If swap is getting full, or if there are no more mappings of this page, 675 * then try_to_free_swap is called to free its swap space. 676 */ 677int try_to_free_swap(struct page *page) 678{ 679 VM_BUG_ON(!PageLocked(page)); 680 681 if (!PageSwapCache(page)) 682 return 0; 683 if (PageWriteback(page)) 684 return 0; 685 if (page_swapcount(page)) 686 return 0; 687 688 delete_from_swap_cache(page); 689 SetPageDirty(page); 690 return 1; 691} 692 693/* 694 * Free the swap entry like above, but also try to 695 * free the page cache entry if it is the last user. 
696 */ 697int free_swap_and_cache(swp_entry_t entry) 698{ 699 struct swap_info_struct *p; 700 struct page *page = NULL; 701 702 if (non_swap_entry(entry)) 703 return 1; 704 705 p = swap_info_get(entry); 706 if (p) { 707 if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { 708 page = find_get_page(&swapper_space, entry.val); 709 if (page && !trylock_page(page)) { 710 page_cache_release(page); 711 page = NULL; 712 } 713 } 714 spin_unlock(&swap_lock); 715 } 716 if (page) { 717 /* 718 * Not mapped elsewhere, or swap space full? Free it! 719 * Also recheck PageSwapCache now page is locked (above). 720 */ 721 if (PageSwapCache(page) && !PageWriteback(page) && 722 (!page_mapped(page) || vm_swap_full())) { 723 delete_from_swap_cache(page); 724 SetPageDirty(page); 725 } 726 unlock_page(page); 727 page_cache_release(page); 728 } 729 return p != NULL; 730} 731 732#ifdef CONFIG_HIBERNATION 733/* 734 * Find the swap type that corresponds to given device (if any). 735 * 736 * @offset - number of the PAGE_SIZE-sized block of the device, starting 737 * from 0, in which the swap header is expected to be located. 738 * 739 * This is needed for the suspend to disk (aka swsusp). 
740 */ 741int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) 742{ 743 struct block_device *bdev = NULL; 744 int i; 745 746 if (device) 747 bdev = bdget(device); 748 749 spin_lock(&swap_lock); 750 for (i = 0; i < nr_swapfiles; i++) { 751 struct swap_info_struct *sis = swap_info + i; 752 753 if (!(sis->flags & SWP_WRITEOK)) 754 continue; 755 756 if (!bdev) { 757 if (bdev_p) 758 *bdev_p = bdgrab(sis->bdev); 759 760 spin_unlock(&swap_lock); 761 return i; 762 } 763 if (bdev == sis->bdev) { 764 struct swap_extent *se; 765 766 se = list_entry(sis->extent_list.next, 767 struct swap_extent, list); 768 if (se->start_block == offset) { 769 if (bdev_p) 770 *bdev_p = bdgrab(sis->bdev); 771 772 spin_unlock(&swap_lock); 773 bdput(bdev); 774 return i; 775 } 776 } 777 } 778 spin_unlock(&swap_lock); 779 if (bdev) 780 bdput(bdev); 781 782 return -ENODEV; 783} 784 785/* 786 * Return either the total number of swap pages of given type, or the number 787 * of free pages of that type (depending on @free) 788 * 789 * This is needed for software suspend 790 */ 791unsigned int count_swap_pages(int type, int free) 792{ 793 unsigned int n = 0; 794 795 if (type < nr_swapfiles) { 796 spin_lock(&swap_lock); 797 if (swap_info[type].flags & SWP_WRITEOK) { 798 n = swap_info[type].pages; 799 if (free) 800 n -= swap_info[type].inuse_pages; 801 } 802 spin_unlock(&swap_lock); 803 } 804 return n; 805} 806#endif 807 808/* 809 * No need to decide whether this PTE shares the swap entry with others, 810 * just let do_wp_page work it out if a write is requested later - to 811 * force COW, vm_page_prot omits write permission from any private vma. 
812 */ 813static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 814 unsigned long addr, swp_entry_t entry, struct page *page) 815{ 816 struct mem_cgroup *ptr = NULL; 817 spinlock_t *ptl; 818 pte_t *pte; 819 int ret = 1; 820 821 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) { 822 ret = -ENOMEM; 823 goto out_nolock; 824 } 825 826 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 827 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { 828 if (ret > 0) 829 mem_cgroup_cancel_charge_swapin(ptr); 830 ret = 0; 831 goto out; 832 } 833 834 inc_mm_counter(vma->vm_mm, anon_rss); 835 get_page(page); 836 set_pte_at(vma->vm_mm, addr, pte, 837 pte_mkold(mk_pte(page, vma->vm_page_prot))); 838 page_add_anon_rmap(page, vma, addr); 839 mem_cgroup_commit_charge_swapin(page, ptr); 840 swap_free(entry); 841 /* 842 * Move the page to the active list so it is not 843 * immediately swapped out again after swapon. 844 */ 845 activate_page(page); 846out: 847 pte_unmap_unlock(pte, ptl); 848out_nolock: 849 return ret; 850} 851 852static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 853 unsigned long addr, unsigned long end, 854 swp_entry_t entry, struct page *page) 855{ 856 pte_t swp_pte = swp_entry_to_pte(entry); 857 pte_t *pte; 858 int ret = 0; 859 860 /* 861 * We don't actually need pte lock while scanning for swp_pte: since 862 * we hold page lock and mmap_sem, swp_pte cannot be inserted into the 863 * page table while we're scanning; though it could get zapped, and on 864 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse 865 * of unmatched parts which look like swp_pte, so unuse_pte must 866 * recheck under pte lock. Scanning without pte lock lets it be 867 * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. 868 */ 869 pte = pte_offset_map(pmd, addr); 870 do { 871 /* 872 * swapoff spends a _lot_ of time in this loop! 873 * Test inline before going to call unuse_pte. 
874 */ 875 if (unlikely(pte_same(*pte, swp_pte))) { 876 pte_unmap(pte); 877 ret = unuse_pte(vma, pmd, addr, entry, page); 878 if (ret) 879 goto out; 880 pte = pte_offset_map(pmd, addr); 881 } 882 } while (pte++, addr += PAGE_SIZE, addr != end); 883 pte_unmap(pte - 1); 884out: 885 return ret; 886} 887 888static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, 889 unsigned long addr, unsigned long end, 890 swp_entry_t entry, struct page *page) 891{ 892 pmd_t *pmd; 893 unsigned long next; 894 int ret; 895 896 pmd = pmd_offset(pud, addr); 897 do { 898 next = pmd_addr_end(addr, end); 899 if (pmd_none_or_clear_bad(pmd)) 900 continue; 901 ret = unuse_pte_range(vma, pmd, addr, next, entry, page); 902 if (ret) 903 return ret; 904 } while (pmd++, addr = next, addr != end); 905 return 0; 906} 907 908static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 909 unsigned long addr, unsigned long end, 910 swp_entry_t entry, struct page *page) 911{ 912 pud_t *pud; 913 unsigned long next; 914 int ret; 915 916 pud = pud_offset(pgd, addr); 917 do { 918 next = pud_addr_end(addr, end); 919 if (pud_none_or_clear_bad(pud)) 920 continue; 921 ret = unuse_pmd_range(vma, pud, addr, next, entry, page); 922 if (ret) 923 return ret; 924 } while (pud++, addr = next, addr != end); 925 return 0; 926} 927 928static int unuse_vma(struct vm_area_struct *vma, 929 swp_entry_t entry, struct page *page) 930{ 931 pgd_t *pgd; 932 unsigned long addr, end, next; 933 int ret; 934 935 if (page->mapping) { 936 addr = page_address_in_vma(page, vma); 937 if (addr == -EFAULT) 938 return 0; 939 else 940 end = addr + PAGE_SIZE; 941 } else { 942 addr = vma->vm_start; 943 end = vma->vm_end; 944 } 945 946 pgd = pgd_offset(vma->vm_mm, addr); 947 do { 948 next = pgd_addr_end(addr, end); 949 if (pgd_none_or_clear_bad(pgd)) 950 continue; 951 ret = unuse_pud_range(vma, pgd, addr, next, entry, page); 952 if (ret) 953 return ret; 954 } while (pgd++, addr = next, addr != end); 955 return 0; 
956} 957 958static int unuse_mm(struct mm_struct *mm, 959 swp_entry_t entry, struct page *page) 960{ 961 struct vm_area_struct *vma; 962 int ret = 0; 963 964 if (!down_read_trylock(&mm->mmap_sem)) { 965 /* 966 * Activate page so shrink_inactive_list is unlikely to unmap 967 * its ptes while lock is dropped, so swapoff can make progress. 968 */ 969 activate_page(page); 970 unlock_page(page); 971 down_read(&mm->mmap_sem); 972 lock_page(page); 973 } 974 for (vma = mm->mmap; vma; vma = vma->vm_next) { 975 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) 976 break; 977 } 978 up_read(&mm->mmap_sem); 979 return (ret < 0)? ret: 0; 980} 981 982/* 983 * Scan swap_map from current position to next entry still in use. 984 * Recycle to start on reaching the end, returning 0 when empty. 985 */ 986static unsigned int find_next_to_unuse(struct swap_info_struct *si, 987 unsigned int prev) 988{ 989 unsigned int max = si->max; 990 unsigned int i = prev; 991 int count; 992 993 /* 994 * No need for swap_lock here: we're just looking 995 * for whether an entry is in use, not modifying it; false 996 * hits are okay, and sys_swapoff() has already prevented new 997 * allocations from this area (while holding swap_lock). 998 */ 999 for (;;) { 1000 if (++i >= max) { 1001 if (!prev) { 1002 i = 0; 1003 break; 1004 } 1005 /* 1006 * No entries in use at top of swap_map, 1007 * loop back to start and recheck there. 1008 */ 1009 max = prev + 1; 1010 prev = 0; 1011 i = 1; 1012 } 1013 count = si->swap_map[i]; 1014 if (count && swap_count(count) != SWAP_MAP_BAD) 1015 break; 1016 } 1017 return i; 1018} 1019 1020/* 1021 * We completely avoid races by reading each swap page in advance, 1022 * and then search for the process using it. All the necessary 1023 * page table adjustments can then be made atomically. 
 *
 * Returns 0 once find_next_to_unuse() reports no entry left in use,
 * -EINTR if a signal is pending, -ENOMEM if a page could not be obtained
 * for an entry that is still in use, or the error reported by
 * unuse_mm() or shmem_unuse().
 */
static int try_to_unuse(unsigned int type)
{
	struct swap_info_struct * si = &swap_info[type];
	struct mm_struct *start_mm;
	unsigned short *swap_map;
	unsigned short swcount;
	struct page *page;
	swp_entry_t entry;
	unsigned int i = 0;
	int retval = 0;
	int reset_overflow = 0;
	int shmem;

	/*
	 * When searching mms for an entry, a good strategy is to
	 * start at the first mm we freed the previous entry from
	 * (though actually we don't notice whether we or coincidence
	 * freed the entry).  Initialize this start_mm with a hold.
	 *
	 * A simpler strategy would be to start at the last mm we
	 * freed the previous entry from; but that would take less
	 * advantage of mmlist ordering, which clusters forked mms
	 * together, child after parent.  If we race with dup_mmap(), we
	 * prefer to resolve parent before child, lest we miss entries
	 * duplicated after we scanned child: using last mm would invert
	 * that.  Though it's only a serious concern when an overflowed
	 * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
	 */
	start_mm = &init_mm;
	atomic_inc(&init_mm.mm_users);

	/*
	 * Keep on scanning until all entries have gone.  Usually,
	 * one pass through swap_map is enough, but not necessarily:
	 * there are races when an instance of an entry might be missed.
	 */
	while ((i = find_next_to_unuse(si, i)) != 0) {
		if (signal_pending(current)) {
			retval = -EINTR;
			break;
		}

		/*
		 * Get a page for the entry, using the existing swap
		 * cache page if there is one.  Otherwise, get a clean
		 * page and read the swap into it.
		 */
		swap_map = &si->swap_map[i];
		entry = swp_entry(type, i);
		page = read_swap_cache_async(entry,
					GFP_HIGHUSER_MOVABLE, NULL, 0);
		if (!page) {
			/*
			 * Either swap_duplicate() failed because entry
			 * has been freed independently, and will not be
			 * reused since sys_swapoff() already disabled
			 * allocation from here, or alloc_page() failed.
			 */
			if (!*swap_map)
				continue;
			retval = -ENOMEM;
			break;
		}

		/*
		 * Don't hold on to start_mm if it looks like exiting.
		 */
		if (atomic_read(&start_mm->mm_users) == 1) {
			mmput(start_mm);
			start_mm = &init_mm;
			atomic_inc(&init_mm.mm_users);
		}

		/*
		 * Wait for and lock page.  When do_swap_page races with
		 * try_to_unuse, do_swap_page can handle the fault much
		 * faster than try_to_unuse can locate the entry.  This
		 * apparently redundant "wait_on_page_locked" lets try_to_unuse
		 * defer to do_swap_page in such a case - in some tests,
		 * do_swap_page and try_to_unuse repeatedly compete.
		 */
		wait_on_page_locked(page);
		wait_on_page_writeback(page);
		lock_page(page);
		wait_on_page_writeback(page);

		/*
		 * Remove all references to entry.
		 * Whenever we reach init_mm, there's no address space
		 * to search, but use it as a reminder to search shmem.
		 */
		shmem = 0;
		swcount = *swap_map;
		if (swap_count(swcount)) {
			if (start_mm == &init_mm)
				shmem = shmem_unuse(entry, page);
			else
				retval = unuse_mm(start_mm, entry, page);
		}
		/* Users remain: walk the rest of the mmlist from start_mm */
		if (swap_count(*swap_map)) {
			int set_start_mm = (*swap_map >= swcount);
			struct list_head *p = &start_mm->mmlist;
			struct mm_struct *new_start_mm = start_mm;
			struct mm_struct *prev_mm = start_mm;
			struct mm_struct *mm;

			/* One hold each for new_start_mm and prev_mm */
			atomic_inc(&new_start_mm->mm_users);
			atomic_inc(&prev_mm->mm_users);
			spin_lock(&mmlist_lock);
			while (swap_count(*swap_map) && !retval && !shmem &&
					(p = p->next) != &start_mm->mmlist) {
				mm = list_entry(p, struct mm_struct, mmlist);
				/* Skip an mm whose mm_users is already zero */
				if (!atomic_inc_not_zero(&mm->mm_users))
					continue;
				spin_unlock(&mmlist_lock);
				mmput(prev_mm);
				prev_mm = mm;

				cond_resched();

				swcount = *swap_map;
				if (!swap_count(swcount)) /* any usage ? */
					;
				else if (mm == &init_mm) {
					set_start_mm = 1;
					shmem = shmem_unuse(entry, page);
				} else
					retval = unuse_mm(mm, entry, page);

				/* The count dropped here: start from this mm next */
				if (set_start_mm && *swap_map < swcount) {
					mmput(new_start_mm);
					atomic_inc(&mm->mm_users);
					new_start_mm = mm;
					set_start_mm = 0;
				}
				spin_lock(&mmlist_lock);
			}
			spin_unlock(&mmlist_lock);
			mmput(prev_mm);
			mmput(start_mm);
			start_mm = new_start_mm;
		}
		if (shmem) {
			/* page has already been unlocked and released */
			if (shmem > 0)
				continue;
			retval = shmem;
			break;
		}
		if (retval) {
			unlock_page(page);
			page_cache_release(page);
			break;
		}

		/*
		 * How could swap count reach 0x7ffe ?
		 * There's no way to repeat a swap page within an mm
		 * (except in shmem, where it's the shared object which takes
		 * the reference count)?
		 * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned
		 * short is too small....)
		 * If that's wrong, then we should worry more about
		 * exit_mmap() and do_munmap() cases described above:
		 * we might be resetting SWAP_MAP_MAX too early here.
		 * We know "Undead"s can happen, they're okay, so don't
		 * report them; but do report if we reset SWAP_MAP_MAX.
		 */
		/* We might release the lock_page() in unuse_mm(). */
		if (!PageSwapCache(page) || page_private(page) != entry.val)
			goto retry;

		if (swap_count(*swap_map) == SWAP_MAP_MAX) {
			spin_lock(&swap_lock);
			*swap_map = encode_swapmap(0, true);
			spin_unlock(&swap_lock);
			reset_overflow = 1;
		}

		/*
		 * If a reference remains (rare), we would like to leave
		 * the page in the swap cache; but try_to_unmap could
		 * then re-duplicate the entry once we drop page lock,
		 * so we might loop indefinitely; also, that page could
		 * not be swapped out to other storage meanwhile.  So:
		 * delete from cache even if there's another reference,
		 * after ensuring that the data has been saved to disk -
		 * since if the reference remains (rarer), it will be
		 * read from disk into another page.  Splitting into two
		 * pages would be incorrect if swap supported "shared
		 * private" pages, but they are handled by tmpfs files.
		 */
		if (swap_count(*swap_map) &&
		     PageDirty(page) && PageSwapCache(page)) {
			struct writeback_control wbc = {
				.sync_mode = WB_SYNC_NONE,
			};

			swap_writepage(page, &wbc);
			lock_page(page);
			wait_on_page_writeback(page);
		}

		/*
		 * It is conceivable that a racing task removed this page from
		 * swap cache just before we acquired the page lock at the top,
		 * or while we dropped it in unuse_mm().  The page might even
		 * be back in swap cache on another swap area: that we must not
		 * delete, since it may not have been written out to swap yet.
		 */
		if (PageSwapCache(page) &&
		    likely(page_private(page) == entry.val))
			delete_from_swap_cache(page);

		/*
		 * So we could skip searching mms once swap count went
		 * to 1, we did not mark any present ptes as dirty: must
		 * mark page dirty so shrink_page_list will preserve it.
		 */
		SetPageDirty(page);
retry:
		unlock_page(page);
		page_cache_release(page);

		/*
		 * Make sure that we aren't completely killing
		 * interactive performance.
		 */
		cond_resched();
	}

	/* Drop the hold on whichever mm start_mm ended up as */
	mmput(start_mm);
	if (reset_overflow) {
		printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
		swap_overflow = 0;
	}
	return retval;
}

/*
 * After a successful try_to_unuse, if no swap is now in use, we know
 * we can empty the mmlist.  swap_lock must be held on entry and exit.
 * Note that mmlist_lock nests inside swap_lock, and an mm must be
 * added to the mmlist just after page_duplicate - before would be racy.
 */
static void drain_mmlist(void)
{
	struct list_head *p, *next;
	unsigned int i;

	/* Leave the mmlist alone while any swap area still has pages in use */
	for (i = 0; i < nr_swapfiles; i++)
		if (swap_info[i].inuse_pages)
			return;
	spin_lock(&mmlist_lock);
	list_for_each_safe(p, next, &init_mm.mmlist)
		list_del_init(p);
	spin_unlock(&mmlist_lock);
}

/*
 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
 * corresponds to page offset `offset'. Note that the type of this function
 * is sector_t, but it returns page offset into the bdev, not sector offset.
1288 */ 1289sector_t map_swap_page(swp_entry_t entry, struct block_device **bdev) 1290{ 1291 struct swap_info_struct *sis; 1292 struct swap_extent *start_se; 1293 struct swap_extent *se; 1294 pgoff_t offset; 1295 1296 sis = swap_info + swp_type(entry); 1297 *bdev = sis->bdev; 1298 1299 offset = swp_offset(entry); 1300 start_se = sis->curr_swap_extent; 1301 se = start_se; 1302 1303 for ( ; ; ) { 1304 struct list_head *lh; 1305 1306 if (se->start_page <= offset && 1307 offset < (se->start_page + se->nr_pages)) { 1308 return se->start_block + (offset - se->start_page); 1309 } 1310 lh = se->list.next; 1311 if (lh == &sis->extent_list) 1312 lh = lh->next; 1313 se = list_entry(lh, struct swap_extent, list); 1314 sis->curr_swap_extent = se; 1315 BUG_ON(se == start_se); /* It *must* be present */ 1316 } 1317} 1318 1319#ifdef CONFIG_HIBERNATION 1320/* 1321 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev 1322 * corresponding to given index in swap_info (swap type). 1323 */ 1324sector_t swapdev_block(int swap_type, pgoff_t offset) 1325{ 1326 struct swap_info_struct *sis; 1327 struct block_device *bdev; 1328 1329 if (swap_type >= nr_swapfiles) 1330 return 0; 1331 1332 sis = swap_info + swap_type; 1333 return (sis->flags & SWP_WRITEOK) ? 1334 map_swap_page(swp_entry(swap_type, offset), &bdev) : 0; 1335} 1336#endif /* CONFIG_HIBERNATION */ 1337 1338/* 1339 * Free all of a swapdev's extent information 1340 */ 1341static void destroy_swap_extents(struct swap_info_struct *sis) 1342{ 1343 while (!list_empty(&sis->extent_list)) { 1344 struct swap_extent *se; 1345 1346 se = list_entry(sis->extent_list.next, 1347 struct swap_extent, list); 1348 list_del(&se->list); 1349 kfree(se); 1350 } 1351} 1352 1353/* 1354 * Add a block range (and the corresponding page range) into this swapdev's 1355 * extent list. The extent list is kept sorted in page order. 1356 * 1357 * This function rather assumes that it is called in ascending page order. 
1358 */ 1359static int 1360add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, 1361 unsigned long nr_pages, sector_t start_block) 1362{ 1363 struct swap_extent *se; 1364 struct swap_extent *new_se; 1365 struct list_head *lh; 1366 1367 lh = sis->extent_list.prev; /* The highest page extent */ 1368 if (lh != &sis->extent_list) { 1369 se = list_entry(lh, struct swap_extent, list); 1370 BUG_ON(se->start_page + se->nr_pages != start_page); 1371 if (se->start_block + se->nr_pages == start_block) { 1372 /* Merge it */ 1373 se->nr_pages += nr_pages; 1374 return 0; 1375 } 1376 } 1377 1378 /* 1379 * No merge. Insert a new extent, preserving ordering. 1380 */ 1381 new_se = kmalloc(sizeof(*se), GFP_KERNEL); 1382 if (new_se == NULL) 1383 return -ENOMEM; 1384 new_se->start_page = start_page; 1385 new_se->nr_pages = nr_pages; 1386 new_se->start_block = start_block; 1387 1388 list_add_tail(&new_se->list, &sis->extent_list); 1389 return 1; 1390} 1391 1392/* 1393 * A `swap extent' is a simple thing which maps a contiguous range of pages 1394 * onto a contiguous range of disk blocks. An ordered list of swap extents 1395 * is built at swapon time and is then used at swap_writepage/swap_readpage 1396 * time for locating where on disk a page belongs. 1397 * 1398 * If the swapfile is an S_ISBLK block device, a single extent is installed. 1399 * This is done so that the main operating code can treat S_ISBLK and S_ISREG 1400 * swap files identically. 1401 * 1402 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap 1403 * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK 1404 * swapfiles are handled *identically* after swapon time. 1405 * 1406 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks 1407 * and will parse them into an ordered extent list, in PAGE_SIZE chunks. 
If 1408 * some stray blocks are found which do not fall within the PAGE_SIZE alignment 1409 * requirements, they are simply tossed out - we will never use those blocks 1410 * for swapping. 1411 * 1412 * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon. This 1413 * prevents root from shooting her foot off by ftruncating an in-use swapfile, 1414 * which will scribble on the fs. 1415 * 1416 * The amount of disk space which a single swap extent represents varies. 1417 * Typically it is in the 1-4 megabyte range. So we can have hundreds of 1418 * extents in the list. To avoid much list walking, we cache the previous 1419 * search location in `curr_swap_extent', and start new searches from there. 1420 * This is extremely effective. The average number of iterations in 1421 * map_swap_page() has been measured at about 0.3 per page. - akpm. 1422 */ 1423static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) 1424{ 1425 struct inode *inode; 1426 unsigned blocks_per_page; 1427 unsigned long page_no; 1428 unsigned blkbits; 1429 sector_t probe_block; 1430 sector_t last_block; 1431 sector_t lowest_block = -1; 1432 sector_t highest_block = 0; 1433 int nr_extents = 0; 1434 int ret; 1435 1436 inode = sis->swap_file->f_mapping->host; 1437 if (S_ISBLK(inode->i_mode)) { 1438 ret = add_swap_extent(sis, 0, sis->max, 0); 1439 *span = sis->pages; 1440 goto done; 1441 } 1442 1443 blkbits = inode->i_blkbits; 1444 blocks_per_page = PAGE_SIZE >> blkbits; 1445 1446 /* 1447 * Map all the blocks into the extent list. This code doesn't try 1448 * to be very smart. 
1449 */ 1450 probe_block = 0; 1451 page_no = 0; 1452 last_block = i_size_read(inode) >> blkbits; 1453 while ((probe_block + blocks_per_page) <= last_block && 1454 page_no < sis->max) { 1455 unsigned block_in_page; 1456 sector_t first_block; 1457 1458 first_block = bmap(inode, probe_block); 1459 if (first_block == 0) 1460 goto bad_bmap; 1461 1462 /* 1463 * It must be PAGE_SIZE aligned on-disk 1464 */ 1465 if (first_block & (blocks_per_page - 1)) { 1466 probe_block++; 1467 goto reprobe; 1468 } 1469 1470 for (block_in_page = 1; block_in_page < blocks_per_page; 1471 block_in_page++) { 1472 sector_t block; 1473 1474 block = bmap(inode, probe_block + block_in_page); 1475 if (block == 0) 1476 goto bad_bmap; 1477 if (block != first_block + block_in_page) { 1478 /* Discontiguity */ 1479 probe_block++; 1480 goto reprobe; 1481 } 1482 } 1483 1484 first_block >>= (PAGE_SHIFT - blkbits); 1485 if (page_no) { /* exclude the header page */ 1486 if (first_block < lowest_block) 1487 lowest_block = first_block; 1488 if (first_block > highest_block) 1489 highest_block = first_block; 1490 } 1491 1492 /* 1493 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks 1494 */ 1495 ret = add_swap_extent(sis, page_no, 1, first_block); 1496 if (ret < 0) 1497 goto out; 1498 nr_extents += ret; 1499 page_no++; 1500 probe_block += blocks_per_page; 1501reprobe: 1502 continue; 1503 } 1504 ret = nr_extents; 1505 *span = 1 + highest_block - lowest_block; 1506 if (page_no == 0) 1507 page_no = 1; /* force Empty message */ 1508 sis->max = page_no; 1509 sis->pages = page_no - 1; 1510 sis->highest_bit = page_no - 1; 1511done: 1512 sis->curr_swap_extent = list_entry(sis->extent_list.prev, 1513 struct swap_extent, list); 1514 goto out; 1515bad_bmap: 1516 printk(KERN_ERR "swapon: swapfile has holes\n"); 1517 ret = -EINVAL; 1518out: 1519 return ret; 1520} 1521 1522SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 1523{ 1524 struct swap_info_struct * p = NULL; 1525 unsigned short *swap_map; 1526 
struct file *swap_file, *victim; 1527 struct address_space *mapping; 1528 struct inode *inode; 1529 char * pathname; 1530 int i, type, prev; 1531 int err; 1532 1533 if (!capable(CAP_SYS_ADMIN)) 1534 return -EPERM; 1535 1536 pathname = getname(specialfile); 1537 err = PTR_ERR(pathname); 1538 if (IS_ERR(pathname)) 1539 goto out; 1540 1541 victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0); 1542 putname(pathname); 1543 err = PTR_ERR(victim); 1544 if (IS_ERR(victim)) 1545 goto out; 1546 1547 mapping = victim->f_mapping; 1548 prev = -1; 1549 spin_lock(&swap_lock); 1550 for (type = swap_list.head; type >= 0; type = swap_info[type].next) { 1551 p = swap_info + type; 1552 if (p->flags & SWP_WRITEOK) { 1553 if (p->swap_file->f_mapping == mapping) 1554 break; 1555 } 1556 prev = type; 1557 } 1558 if (type < 0) { 1559 err = -EINVAL; 1560 spin_unlock(&swap_lock); 1561 goto out_dput; 1562 } 1563 if (!security_vm_enough_memory(p->pages)) 1564 vm_unacct_memory(p->pages); 1565 else { 1566 err = -ENOMEM; 1567 spin_unlock(&swap_lock); 1568 goto out_dput; 1569 } 1570 if (prev < 0) { 1571 swap_list.head = p->next; 1572 } else { 1573 swap_info[prev].next = p->next; 1574 } 1575 if (type == swap_list.next) { 1576 /* just pick something that's safe... 
*/ 1577 swap_list.next = swap_list.head; 1578 } 1579 if (p->prio < 0) { 1580 for (i = p->next; i >= 0; i = swap_info[i].next) 1581 swap_info[i].prio = p->prio--; 1582 least_priority++; 1583 } 1584 nr_swap_pages -= p->pages; 1585 total_swap_pages -= p->pages; 1586 p->flags &= ~SWP_WRITEOK; 1587 spin_unlock(&swap_lock); 1588 1589 current->flags |= PF_OOM_ORIGIN; 1590 err = try_to_unuse(type); 1591 current->flags &= ~PF_OOM_ORIGIN; 1592 1593 if (err) { 1594 /* re-insert swap space back into swap_list */ 1595 spin_lock(&swap_lock); 1596 if (p->prio < 0) 1597 p->prio = --least_priority; 1598 prev = -1; 1599 for (i = swap_list.head; i >= 0; i = swap_info[i].next) { 1600 if (p->prio >= swap_info[i].prio) 1601 break; 1602 prev = i; 1603 } 1604 p->next = i; 1605 if (prev < 0) 1606 swap_list.head = swap_list.next = p - swap_info; 1607 else 1608 swap_info[prev].next = p - swap_info; 1609 nr_swap_pages += p->pages; 1610 total_swap_pages += p->pages; 1611 p->flags |= SWP_WRITEOK; 1612 spin_unlock(&swap_lock); 1613 goto out_dput; 1614 } 1615 1616 /* wait for any unplug function to finish */ 1617 down_write(&swap_unplug_sem); 1618 up_write(&swap_unplug_sem); 1619 1620 destroy_swap_extents(p); 1621 mutex_lock(&swapon_mutex); 1622 spin_lock(&swap_lock); 1623 drain_mmlist(); 1624 1625 /* wait for anyone still in scan_swap_map */ 1626 p->highest_bit = 0; /* cuts scans short */ 1627 while (p->flags >= SWP_SCANNING) { 1628 spin_unlock(&swap_lock); 1629 schedule_timeout_uninterruptible(1); 1630 spin_lock(&swap_lock); 1631 } 1632 1633 swap_file = p->swap_file; 1634 p->swap_file = NULL; 1635 p->max = 0; 1636 swap_map = p->swap_map; 1637 p->swap_map = NULL; 1638 p->flags = 0; 1639 spin_unlock(&swap_lock); 1640 mutex_unlock(&swapon_mutex); 1641 vfree(swap_map); 1642 /* Destroy swap account informatin */ 1643 swap_cgroup_swapoff(type); 1644 1645 inode = mapping->host; 1646 if (S_ISBLK(inode->i_mode)) { 1647 struct block_device *bdev = I_BDEV(inode); 1648 set_blocksize(bdev, 
p->old_block_size); 1649 bd_release(bdev); 1650 } else { 1651 mutex_lock(&inode->i_mutex); 1652 inode->i_flags &= ~S_SWAPFILE; 1653 mutex_unlock(&inode->i_mutex); 1654 } 1655 filp_close(swap_file, NULL); 1656 err = 0; 1657 1658out_dput: 1659 filp_close(victim, NULL); 1660out: 1661 return err; 1662} 1663 1664#ifdef CONFIG_PROC_FS 1665/* iterator */ 1666static void *swap_start(struct seq_file *swap, loff_t *pos) 1667{ 1668 struct swap_info_struct *ptr = swap_info; 1669 int i; 1670 loff_t l = *pos; 1671 1672 mutex_lock(&swapon_mutex); 1673 1674 if (!l) 1675 return SEQ_START_TOKEN; 1676 1677 for (i = 0; i < nr_swapfiles; i++, ptr++) { 1678 if (!(ptr->flags & SWP_USED) || !ptr->swap_map) 1679 continue; 1680 if (!--l) 1681 return ptr; 1682 } 1683 1684 return NULL; 1685} 1686 1687static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) 1688{ 1689 struct swap_info_struct *ptr; 1690 struct swap_info_struct *endptr = swap_info + nr_swapfiles; 1691 1692 if (v == SEQ_START_TOKEN) 1693 ptr = swap_info; 1694 else { 1695 ptr = v; 1696 ptr++; 1697 } 1698 1699 for (; ptr < endptr; ptr++) { 1700 if (!(ptr->flags & SWP_USED) || !ptr->swap_map) 1701 continue; 1702 ++*pos; 1703 return ptr; 1704 } 1705 1706 return NULL; 1707} 1708 1709static void swap_stop(struct seq_file *swap, void *v) 1710{ 1711 mutex_unlock(&swapon_mutex); 1712} 1713 1714static int swap_show(struct seq_file *swap, void *v) 1715{ 1716 struct swap_info_struct *ptr = v; 1717 struct file *file; 1718 int len; 1719 1720 if (ptr == SEQ_START_TOKEN) { 1721 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); 1722 return 0; 1723 } 1724 1725 file = ptr->swap_file; 1726 len = seq_path(swap, &file->f_path, " \t\n\\"); 1727 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", 1728 len < 40 ? 40 - len : 1, " ", 1729 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? 
1730 "partition" : "file\t", 1731 ptr->pages << (PAGE_SHIFT - 10), 1732 ptr->inuse_pages << (PAGE_SHIFT - 10), 1733 ptr->prio); 1734 return 0; 1735} 1736 1737static const struct seq_operations swaps_op = { 1738 .start = swap_start, 1739 .next = swap_next, 1740 .stop = swap_stop, 1741 .show = swap_show 1742}; 1743 1744static int swaps_open(struct inode *inode, struct file *file) 1745{ 1746 return seq_open(file, &swaps_op); 1747} 1748 1749static const struct file_operations proc_swaps_operations = { 1750 .open = swaps_open, 1751 .read = seq_read, 1752 .llseek = seq_lseek, 1753 .release = seq_release, 1754}; 1755 1756static int __init procswaps_init(void) 1757{ 1758 proc_create("swaps", 0, NULL, &proc_swaps_operations); 1759 return 0; 1760} 1761__initcall(procswaps_init); 1762#endif /* CONFIG_PROC_FS */ 1763 1764#ifdef MAX_SWAPFILES_CHECK 1765static int __init max_swapfiles_check(void) 1766{ 1767 MAX_SWAPFILES_CHECK(); 1768 return 0; 1769} 1770late_initcall(max_swapfiles_check); 1771#endif 1772 1773/* 1774 * Written 01/25/92 by Simmule Turner, heavily changed by Linus. 
1775 * 1776 * The swapon system call 1777 */ 1778SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) 1779{ 1780 struct swap_info_struct * p; 1781 char *name = NULL; 1782 struct block_device *bdev = NULL; 1783 struct file *swap_file = NULL; 1784 struct address_space *mapping; 1785 unsigned int type; 1786 int i, prev; 1787 int error; 1788 union swap_header *swap_header = NULL; 1789 unsigned int nr_good_pages = 0; 1790 int nr_extents = 0; 1791 sector_t span; 1792 unsigned long maxpages = 1; 1793 unsigned long swapfilepages; 1794 unsigned short *swap_map = NULL; 1795 struct page *page = NULL; 1796 struct inode *inode = NULL; 1797 int did_down = 0; 1798 1799 if (!capable(CAP_SYS_ADMIN)) 1800 return -EPERM; 1801 spin_lock(&swap_lock); 1802 p = swap_info; 1803 for (type = 0 ; type < nr_swapfiles ; type++,p++) 1804 if (!(p->flags & SWP_USED)) 1805 break; 1806 error = -EPERM; 1807 if (type >= MAX_SWAPFILES) { 1808 spin_unlock(&swap_lock); 1809 goto out; 1810 } 1811 if (type >= nr_swapfiles) 1812 nr_swapfiles = type+1; 1813 memset(p, 0, sizeof(*p)); 1814 INIT_LIST_HEAD(&p->extent_list); 1815 p->flags = SWP_USED; 1816 p->next = -1; 1817 spin_unlock(&swap_lock); 1818 name = getname(specialfile); 1819 error = PTR_ERR(name); 1820 if (IS_ERR(name)) { 1821 name = NULL; 1822 goto bad_swap_2; 1823 } 1824 swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); 1825 error = PTR_ERR(swap_file); 1826 if (IS_ERR(swap_file)) { 1827 swap_file = NULL; 1828 goto bad_swap_2; 1829 } 1830 1831 p->swap_file = swap_file; 1832 mapping = swap_file->f_mapping; 1833 inode = mapping->host; 1834 1835 error = -EBUSY; 1836 for (i = 0; i < nr_swapfiles; i++) { 1837 struct swap_info_struct *q = &swap_info[i]; 1838 1839 if (i == type || !q->swap_file) 1840 continue; 1841 if (mapping == q->swap_file->f_mapping) 1842 goto bad_swap; 1843 } 1844 1845 error = -EINVAL; 1846 if (S_ISBLK(inode->i_mode)) { 1847 bdev = I_BDEV(inode); 1848 error = bd_claim(bdev, sys_swapon); 1849 if (error < 0) { 
1850 bdev = NULL; 1851 error = -EINVAL; 1852 goto bad_swap; 1853 } 1854 p->old_block_size = block_size(bdev); 1855 error = set_blocksize(bdev, PAGE_SIZE); 1856 if (error < 0) 1857 goto bad_swap; 1858 p->bdev = bdev; 1859 } else if (S_ISREG(inode->i_mode)) { 1860 p->bdev = inode->i_sb->s_bdev; 1861 mutex_lock(&inode->i_mutex); 1862 did_down = 1; 1863 if (IS_SWAPFILE(inode)) { 1864 error = -EBUSY; 1865 goto bad_swap; 1866 } 1867 } else { 1868 goto bad_swap; 1869 } 1870 1871 swapfilepages = i_size_read(inode) >> PAGE_SHIFT; 1872 1873 /* 1874 * Read the swap header. 1875 */ 1876 if (!mapping->a_ops->readpage) { 1877 error = -EINVAL; 1878 goto bad_swap; 1879 } 1880 page = read_mapping_page(mapping, 0, swap_file); 1881 if (IS_ERR(page)) { 1882 error = PTR_ERR(page); 1883 goto bad_swap; 1884 } 1885 swap_header = kmap(page); 1886 1887 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { 1888 printk(KERN_ERR "Unable to find swap-space signature\n"); 1889 error = -EINVAL; 1890 goto bad_swap; 1891 } 1892 1893 /* swap partition endianess hack... */ 1894 if (swab32(swap_header->info.version) == 1) { 1895 swab32s(&swap_header->info.version); 1896 swab32s(&swap_header->info.last_page); 1897 swab32s(&swap_header->info.nr_badpages); 1898 for (i = 0; i < swap_header->info.nr_badpages; i++) 1899 swab32s(&swap_header->info.badpages[i]); 1900 } 1901 /* Check the swap header's sub-version */ 1902 if (swap_header->info.version != 1) { 1903 printk(KERN_WARNING 1904 "Unable to handle swap header version %d\n", 1905 swap_header->info.version); 1906 error = -EINVAL; 1907 goto bad_swap; 1908 } 1909 1910 p->lowest_bit = 1; 1911 p->cluster_next = 1; 1912 1913 /* 1914 * Find out how many pages are allowed for a single swap 1915 * device. There are two limiting factors: 1) the number of 1916 * bits for the swap offset in the swp_entry_t type and 1917 * 2) the number of bits in the a swap pte as defined by 1918 * the different architectures. 
In order to find the 1919 * largest possible bit mask a swap entry with swap type 0 1920 * and swap offset ~0UL is created, encoded to a swap pte, 1921 * decoded to a swp_entry_t again and finally the swap 1922 * offset is extracted. This will mask all the bits from 1923 * the initial ~0UL mask that can't be encoded in either 1924 * the swp_entry_t or the architecture definition of a 1925 * swap pte. 1926 */ 1927 maxpages = swp_offset(pte_to_swp_entry( 1928 swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; 1929 if (maxpages > swap_header->info.last_page) 1930 maxpages = swap_header->info.last_page; 1931 p->highest_bit = maxpages - 1; 1932 1933 error = -EINVAL; 1934 if (!maxpages) 1935 goto bad_swap; 1936 if (swapfilepages && maxpages > swapfilepages) { 1937 printk(KERN_WARNING 1938 "Swap area shorter than signature indicates\n"); 1939 goto bad_swap; 1940 } 1941 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) 1942 goto bad_swap; 1943 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 1944 goto bad_swap; 1945 1946 /* OK, set up the swap map and apply the bad block list */ 1947 swap_map = vmalloc(maxpages * sizeof(short)); 1948 if (!swap_map) { 1949 error = -ENOMEM; 1950 goto bad_swap; 1951 } 1952 1953 memset(swap_map, 0, maxpages * sizeof(short)); 1954 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1955 int page_nr = swap_header->info.badpages[i]; 1956 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { 1957 error = -EINVAL; 1958 goto bad_swap; 1959 } 1960 swap_map[page_nr] = SWAP_MAP_BAD; 1961 } 1962 1963 error = swap_cgroup_swapon(type, maxpages); 1964 if (error) 1965 goto bad_swap; 1966 1967 nr_good_pages = swap_header->info.last_page - 1968 swap_header->info.nr_badpages - 1969 1 /* header page */; 1970 1971 if (nr_good_pages) { 1972 swap_map[0] = SWAP_MAP_BAD; 1973 p->max = maxpages; 1974 p->pages = nr_good_pages; 1975 nr_extents = setup_swap_extents(p, &span); 1976 if (nr_extents < 0) { 1977 error = nr_extents; 1978 goto bad_swap; 
1979 } 1980 nr_good_pages = p->pages; 1981 } 1982 if (!nr_good_pages) { 1983 printk(KERN_WARNING "Empty swap-file\n"); 1984 error = -EINVAL; 1985 goto bad_swap; 1986 } 1987 1988 if (p->bdev) { 1989 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { 1990 p->flags |= SWP_SOLIDSTATE; 1991 p->cluster_next = 1 + (random32() % p->highest_bit); 1992 } 1993 if (discard_swap(p) == 0) 1994 p->flags |= SWP_DISCARDABLE; 1995 } 1996 1997 mutex_lock(&swapon_mutex); 1998 spin_lock(&swap_lock); 1999 if (swap_flags & SWAP_FLAG_PREFER) 2000 p->prio = 2001 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 2002 else 2003 p->prio = --least_priority; 2004 p->swap_map = swap_map; 2005 p->flags |= SWP_WRITEOK; 2006 nr_swap_pages += nr_good_pages; 2007 total_swap_pages += nr_good_pages; 2008 2009 printk(KERN_INFO "Adding %uk swap on %s. " 2010 "Priority:%d extents:%d across:%lluk %s%s\n", 2011 nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, 2012 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2013 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 2014 (p->flags & SWP_DISCARDABLE) ? 
"D" : ""); 2015 2016 /* insert swap space into swap_list: */ 2017 prev = -1; 2018 for (i = swap_list.head; i >= 0; i = swap_info[i].next) { 2019 if (p->prio >= swap_info[i].prio) { 2020 break; 2021 } 2022 prev = i; 2023 } 2024 p->next = i; 2025 if (prev < 0) { 2026 swap_list.head = swap_list.next = p - swap_info; 2027 } else { 2028 swap_info[prev].next = p - swap_info; 2029 } 2030 spin_unlock(&swap_lock); 2031 mutex_unlock(&swapon_mutex); 2032 error = 0; 2033 goto out; 2034bad_swap: 2035 if (bdev) { 2036 set_blocksize(bdev, p->old_block_size); 2037 bd_release(bdev); 2038 } 2039 destroy_swap_extents(p); 2040 swap_cgroup_swapoff(type); 2041bad_swap_2: 2042 spin_lock(&swap_lock); 2043 p->swap_file = NULL; 2044 p->flags = 0; 2045 spin_unlock(&swap_lock); 2046 vfree(swap_map); 2047 if (swap_file) 2048 filp_close(swap_file, NULL); 2049out: 2050 if (page && !IS_ERR(page)) { 2051 kunmap(page); 2052 page_cache_release(page); 2053 } 2054 if (name) 2055 putname(name); 2056 if (did_down) { 2057 if (!error) 2058 inode->i_flags |= S_SWAPFILE; 2059 mutex_unlock(&inode->i_mutex); 2060 } 2061 return error; 2062} 2063 2064void si_swapinfo(struct sysinfo *val) 2065{ 2066 unsigned int i; 2067 unsigned long nr_to_be_unused = 0; 2068 2069 spin_lock(&swap_lock); 2070 for (i = 0; i < nr_swapfiles; i++) { 2071 if (!(swap_info[i].flags & SWP_USED) || 2072 (swap_info[i].flags & SWP_WRITEOK)) 2073 continue; 2074 nr_to_be_unused += swap_info[i].inuse_pages; 2075 } 2076 val->freeswap = nr_swap_pages + nr_to_be_unused; 2077 val->totalswap = total_swap_pages + nr_to_be_unused; 2078 spin_unlock(&swap_lock); 2079} 2080 2081/* 2082 * Verify that a swap entry is valid and increment its swap map count. 2083 * 2084 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as 2085 * "permanent", but will be reclaimed by the next swapoff. 2086 * Returns error code in following case. 
2087 * - success -> 0 2088 * - swp_entry is invalid -> EINVAL 2089 * - swp_entry is migration entry -> EINVAL 2090 * - swap-cache reference is requested but there is already one. -> EEXIST 2091 * - swap-cache reference is requested but the entry is not used. -> ENOENT 2092 */ 2093static int __swap_duplicate(swp_entry_t entry, bool cache) 2094{ 2095 struct swap_info_struct * p; 2096 unsigned long offset, type; 2097 int result = -EINVAL; 2098 int count; 2099 bool has_cache; 2100 2101 if (non_swap_entry(entry)) 2102 return -EINVAL; 2103 2104 type = swp_type(entry); 2105 if (type >= nr_swapfiles) 2106 goto bad_file; 2107 p = type + swap_info; 2108 offset = swp_offset(entry); 2109 2110 spin_lock(&swap_lock); 2111 2112 if (unlikely(offset >= p->max)) 2113 goto unlock_out; 2114 2115 count = swap_count(p->swap_map[offset]); 2116 has_cache = swap_has_cache(p->swap_map[offset]); 2117 2118 if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ 2119 2120 /* set SWAP_HAS_CACHE if there is no cache and entry is used */ 2121 if (!has_cache && count) { 2122 p->swap_map[offset] = encode_swapmap(count, true); 2123 result = 0; 2124 } else if (has_cache) /* someone added cache */ 2125 result = -EEXIST; 2126 else if (!count) /* no users */ 2127 result = -ENOENT; 2128 2129 } else if (count || has_cache) { 2130 if (count < SWAP_MAP_MAX - 1) { 2131 p->swap_map[offset] = encode_swapmap(count + 1, 2132 has_cache); 2133 result = 0; 2134 } else if (count <= SWAP_MAP_MAX) { 2135 if (swap_overflow++ < 5) 2136 printk(KERN_WARNING 2137 "swap_dup: swap entry overflow\n"); 2138 p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, 2139 has_cache); 2140 result = 0; 2141 } 2142 } else 2143 result = -ENOENT; /* unused swap entry */ 2144unlock_out: 2145 spin_unlock(&swap_lock); 2146out: 2147 return result; 2148 2149bad_file: 2150 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); 2151 goto out; 2152} 2153/* 2154 * increase reference count of swap entry by 1. 
2155 */ 2156void swap_duplicate(swp_entry_t entry) 2157{ 2158 __swap_duplicate(entry, SWAP_MAP); 2159} 2160 2161/* 2162 * @entry: swap entry for which we allocate swap cache. 2163 * 2164 * Called when allocating swap cache for exising swap entry, 2165 * This can return error codes. Returns 0 at success. 2166 * -EBUSY means there is a swap cache. 2167 * Note: return code is different from swap_duplicate(). 2168 */ 2169int swapcache_prepare(swp_entry_t entry) 2170{ 2171 return __swap_duplicate(entry, SWAP_CACHE); 2172} 2173 2174/* 2175 * swap_lock prevents swap_map being freed. Don't grab an extra 2176 * reference on the swaphandle, it doesn't matter if it becomes unused. 2177 */ 2178int valid_swaphandles(swp_entry_t entry, unsigned long *offset) 2179{ 2180 struct swap_info_struct *si; 2181 int our_page_cluster = page_cluster; 2182 pgoff_t target, toff; 2183 pgoff_t base, end; 2184 int nr_pages = 0; 2185 2186 if (!our_page_cluster) /* no readahead */ 2187 return 0; 2188 2189 si = &swap_info[swp_type(entry)]; 2190 target = swp_offset(entry); 2191 base = (target >> our_page_cluster) << our_page_cluster; 2192 end = base + (1 << our_page_cluster); 2193 if (!base) /* first page is swap header */ 2194 base++; 2195 2196 spin_lock(&swap_lock); 2197 if (end > si->max) /* don't go beyond end of map */ 2198 end = si->max; 2199 2200 /* Count contiguous allocated slots above our target */ 2201 for (toff = target; ++toff < end; nr_pages++) { 2202 /* Don't read in free or bad pages */ 2203 if (!si->swap_map[toff]) 2204 break; 2205 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) 2206 break; 2207 } 2208 /* Count contiguous allocated slots below our target */ 2209 for (toff = target; --toff >= base; nr_pages++) { 2210 /* Don't read in free or bad pages */ 2211 if (!si->swap_map[toff]) 2212 break; 2213 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) 2214 break; 2215 } 2216 spin_unlock(&swap_lock); 2217 2218 /* 2219 * Indicate starting offset, and return number of pages to get: 
2220 * if only 1, say 0, since there's then no readahead to be done. 2221 */ 2222 *offset = ++toff; 2223 return nr_pages? ++nr_pages: 0; 2224} 2225