ksm.c revision 26465d3ea5a62d59efb3796b9e0e2b0656d02cb1
/*
 * Memory merging support.
 *
 * This code enables dynamic sharing of identical pages found in different
 * memory areas, even if they are not shared by fork()
 *
 * Copyright (C) 2008-2009 Red Hat, Inc.
 * Authors:
 *	Izik Eidus
 *	Andrea Arcangeli
 *	Chris Wright
 *	Hugh Dickins
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 */

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
#include <linux/jhash.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mmu_notifier.h>
#include <linux/ksm.h>

#include <asm/tlbflush.h>

/*
 * A few notes about the KSM scanning process,
 * to make it easier to understand the data structures below:
 *
 * In order to reduce excessive scanning, KSM sorts the memory pages by their
 * contents into a data structure that holds pointers to the pages' locations.
 *
 * Since the contents of the pages may change at any moment, KSM cannot just
 * insert the pages into a normal sorted tree and expect it to find anything.
 * Therefore KSM uses two data structures - the stable and the unstable tree.
 *
 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
 * by their contents.  Because each such page is write-protected, searching on
 * this tree is fully assured to be working (except when pages are unmapped),
 * and therefore this tree is called the stable tree.
 *
 * In addition to the stable tree, KSM uses a second data structure called the
 * unstable tree: this tree holds pointers to pages which have been found to
 * be "unchanged for a period of time".  The unstable tree sorts these pages
 * by their contents, but since they are not write-protected, KSM cannot rely
 * upon the unstable tree to work correctly - the unstable tree is liable to
 * be corrupted as its contents are modified, and so it is called unstable.
 *
 * KSM solves this problem by several techniques:
 *
 * 1) The unstable tree is flushed every time KSM completes scanning all
 *    memory areas, and then the tree is rebuilt again from the beginning.
 * 2) KSM will only insert into the unstable tree, pages whose hash value
 *    has not changed since the previous scan of all memory areas.
 * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
 *    colors of the nodes and not on their contents, assuring that even when
 *    the tree gets "corrupted" it won't get out of balance, so scanning time
 *    remains the same (also, searching and inserting nodes in an rbtree uses
 *    the same algorithm, so we have no overhead when we flush and rebuild).
 * 4) KSM never flushes the stable tree, which means that even if it were to
 *    take 10 attempts to find a page in the unstable tree, once it is found,
 *    it is secured in the stable tree.  (When we scan a new page, we first
 *    compare it against the stable tree, and then against the unstable tree.)
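 *
 * A page found in neither tree only becomes a candidate for the unstable
 * tree once its checksum has been seen unchanged since the previous scan:
 * see cmp_and_merge_page() below for the per-page decision sequence.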
 */

/**
 * struct mm_slot - ksm information per mm that is being scanned
 * @link: link to the mm_slots hash list
 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
 * @rmap_list: head for this mm_slot's list of rmap_items
 * @mm: the mm that this information is valid for
 */
struct mm_slot {
	struct hlist_node link;
	struct list_head mm_list;
	struct list_head rmap_list;
	struct mm_struct *mm;
};

/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_item: the current rmap that we are scanning inside the rmap_list
 * @seqnr: count of completed full scans (needed when removing unstable node)
 *
 * There is only the one ksm_scan instance of this cursor structure.
 */
struct ksm_scan {
	struct mm_slot *mm_slot;
	unsigned long address;
	struct rmap_item *rmap_item;
	unsigned long seqnr;
};

/**
 * struct rmap_item - reverse mapping item for virtual addresses
 * @link: link into mm_slot's rmap_list (rmap_list is per mm)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb_node of this rmap_item in either unstable or stable tree
 * @next: next rmap_item hanging off the same node of the stable tree
 * @prev: previous rmap_item hanging off the same node of the stable tree
 */
struct rmap_item {
	struct list_head link;
	struct mm_struct *mm;
	unsigned long address;		/* + low bits used for flags below */
	union {
		unsigned int oldchecksum;	/* when unstable */
		struct rmap_item *next;		/* when stable */
	};
	union {
		struct rb_node node;		/* when tree node */
		struct rmap_item *prev;		/* in stable list */
	};
};

#define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
#define NODE_FLAG	0x100	/* is a node of unstable or stable tree */
#define STABLE_FLAG	0x200	/* is a node or list item of stable tree */

/* The stable and unstable tree heads */
static struct rb_root root_stable_tree = RB_ROOT;
static struct rb_root root_unstable_tree = RB_ROOT;

#define MM_SLOTS_HASH_HEADS 1024
static struct hlist_head *mm_slots_hash;

static struct mm_slot ksm_mm_head = {
	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
static struct ksm_scan ksm_scan = {
	.mm_slot = &ksm_mm_head,
};

static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *mm_slot_cache;

/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;

/* The number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;

/* The number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;

/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;

/* Limit on the number of unswappable pages used */
static unsigned long ksm_max_kernel_pages;

/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan;

/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs;

#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
static unsigned int ksm_run;

static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);
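
/*
 * ksmd sleeps on ksm_thread_wait whenever it is not set running.
 * ksm_thread_mutex serializes a whole scan batch against __ksm_exit and
 * against changes to ksm_run; ksm_mmlist_lock protects the mm_slots list
 * and the ksm_scan cursor within it.
 */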

#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
		sizeof(struct __struct), __alignof__(struct __struct),\
		(__flags), NULL)

static int __init ksm_slab_init(void)
{
	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
	if (!rmap_item_cache)
		goto out;

	mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
	if (!mm_slot_cache)
		goto out_free;

	return 0;

out_free:
	kmem_cache_destroy(rmap_item_cache);
out:
	return -ENOMEM;
}

static void __init ksm_slab_free(void)
{
	kmem_cache_destroy(mm_slot_cache);
	kmem_cache_destroy(rmap_item_cache);
	mm_slot_cache = NULL;
}

static inline struct rmap_item *alloc_rmap_item(void)
{
	struct rmap_item *rmap_item;

	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
	if (rmap_item)
		ksm_rmap_items++;
	return rmap_item;
}

static inline void free_rmap_item(struct rmap_item *rmap_item)
{
	ksm_rmap_items--;
	rmap_item->mm = NULL;	/* debug safety */
	kmem_cache_free(rmap_item_cache, rmap_item);
}

static inline struct mm_slot *alloc_mm_slot(void)
{
	if (!mm_slot_cache)	/* initialization failed */
		return NULL;
	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}

static inline void free_mm_slot(struct mm_slot *mm_slot)
{
	kmem_cache_free(mm_slot_cache, mm_slot);
}

static int __init mm_slots_hash_init(void)
{
	mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
				GFP_KERNEL);
	if (!mm_slots_hash)
		return -ENOMEM;
	return 0;
}

static void __init mm_slots_hash_free(void)
{
	kfree(mm_slots_hash);
}

static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	struct hlist_head *bucket;
	struct hlist_node *node;

	bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
				% MM_SLOTS_HASH_HEADS];
	hlist_for_each_entry(mm_slot, node, bucket, link) {
		if (mm == mm_slot->mm)
			return mm_slot;
	}
	return NULL;
}

static void insert_to_mm_slots_hash(struct mm_struct *mm,
				    struct mm_slot *mm_slot)
{
	struct hlist_head *bucket;

	bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
				% MM_SLOTS_HASH_HEADS];
	mm_slot->mm = mm;
	INIT_LIST_HEAD(&mm_slot->rmap_list);
	hlist_add_head(&mm_slot->link, bucket);
}

static inline int in_stable_tree(struct rmap_item *rmap_item)
{
	return rmap_item->address & STABLE_FLAG;
}

/*
 * We use break_ksm to break COW on a ksm page: it's a stripped down
 *
 *	if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
 *		put_page(page);
 *
 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
 * in case the application has unmapped and remapped mm,addr meanwhile.
 * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
 * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
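 *
 * break_ksm therefore loops: follow_page() then fault in write mode, until
 * the page it sees is no longer a ksm page (VM_FAULT_WRITE) or is gone
 * (VM_FAULT_SIGBUS).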
 */
static void break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	int ret;

	do {
		cond_resched();
		page = follow_page(vma, addr, FOLL_GET);
		if (!page)
			break;
		if (PageKsm(page))
			ret = handle_mm_fault(vma->vm_mm, vma, addr,
							FAULT_FLAG_WRITE);
		else
			ret = VM_FAULT_WRITE;
		put_page(page);
	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS)));

	/* Which leaves us looping there if VM_FAULT_OOM: hmmm... */
}

static void __break_cow(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma;

	vma = find_vma(mm, addr);
	if (!vma || vma->vm_start > addr)
		return;
	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		return;
	break_ksm(vma, addr);
}

static void break_cow(struct mm_struct *mm, unsigned long addr)
{
	down_read(&mm->mmap_sem);
	__break_cow(mm, addr);
	up_read(&mm->mmap_sem);
}

static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;
	struct page *page;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, addr);
	if (!vma || vma->vm_start > addr)
		goto out;
	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		goto out;

	page = follow_page(vma, addr, FOLL_GET);
	if (!page)
		goto out;
	if (PageAnon(page)) {
		flush_anon_page(vma, page, addr);
		flush_dcache_page(page);
	} else {
		put_page(page);
out:		page = NULL;
	}
	up_read(&mm->mmap_sem);
	return page;
}

/*
 * get_ksm_page: checks if the page at the virtual address in rmap_item
 * is still PageKsm, in which case we can trust the content of the page,
 * and it returns the page it got; but NULL if the page has been zapped.
 */
static struct page *get_ksm_page(struct rmap_item *rmap_item)
{
	struct page *page;

	page = get_mergeable_page(rmap_item);
	if (page && !PageKsm(page)) {
		put_page(page);
		page = NULL;
	}
	return page;
}

/*
 * Removing rmap_item from stable or unstable tree.
 * This function will clean the information from the stable/unstable tree.
 */
static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
{
	if (in_stable_tree(rmap_item)) {
		struct rmap_item *next_item = rmap_item->next;

		if (rmap_item->address & NODE_FLAG) {
			if (next_item) {
				rb_replace_node(&rmap_item->node,
						&next_item->node,
						&root_stable_tree);
				next_item->address |= NODE_FLAG;
				ksm_pages_sharing--;
			} else {
				rb_erase(&rmap_item->node, &root_stable_tree);
				ksm_pages_shared--;
			}
		} else {
			struct rmap_item *prev_item = rmap_item->prev;

			BUG_ON(prev_item->next != rmap_item);
			prev_item->next = next_item;
			if (next_item) {
				BUG_ON(next_item->prev != rmap_item);
				next_item->prev = rmap_item->prev;
			}
			ksm_pages_sharing--;
		}

		rmap_item->next = NULL;

	} else if (rmap_item->address & NODE_FLAG) {
		unsigned char age;
		/*
		 * ksm_thread can and must skip the rb_erase, because
		 * root_unstable_tree was already reset to RB_ROOT.
		 * But __ksm_exit has to be careful: do the rb_erase
		 * if it's interrupting a scan, and this rmap_item was
		 * inserted by this scan rather than left from before.
		 *
		 * Because of the case in which remove_mm_from_lists
		 * increments seqnr before removing rmaps, unstable_nr
		 * may even be 2 behind seqnr, but should never be
		 * further behind.  Yes, I did have trouble with this!
		 */
		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
		BUG_ON(age > 2);
		if (!age)
			rb_erase(&rmap_item->node, &root_unstable_tree);
		ksm_pages_unshared--;
	}

	rmap_item->address &= PAGE_MASK;

	cond_resched();		/* we're called from many long loops */
}

static void remove_all_slot_rmap_items(struct mm_slot *mm_slot)
{
	struct rmap_item *rmap_item, *node;

	list_for_each_entry_safe(rmap_item, node, &mm_slot->rmap_list, link) {
		remove_rmap_item_from_tree(rmap_item);
		list_del(&rmap_item->link);
		free_rmap_item(rmap_item);
	}
}

static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
				       struct list_head *cur)
{
	struct rmap_item *rmap_item;

	while (cur != &mm_slot->rmap_list) {
		rmap_item = list_entry(cur, struct rmap_item, link);
		cur = cur->next;
		remove_rmap_item_from_tree(rmap_item);
		list_del(&rmap_item->link);
		free_rmap_item(rmap_item);
	}
}

/*
 * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
 * than check every pte of a given vma, the locking doesn't quite work for
 * that - an rmap_item is assigned to the stable tree after inserting ksm
 * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
 * rmap_items from parent to child at fork time (so as not to waste time
 * if exit comes before the next scan reaches it).
 */
static void unmerge_ksm_pages(struct vm_area_struct *vma,
			      unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE)
		break_ksm(vma, addr);
}

static void unmerge_and_remove_all_rmap_items(void)
{
	struct mm_slot *mm_slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;

	list_for_each_entry(mm_slot, &ksm_mm_head.mm_list, mm_list) {
		mm = mm_slot->mm;
		down_read(&mm->mmap_sem);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				continue;
			unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end);
		}
		remove_all_slot_rmap_items(mm_slot);
		up_read(&mm->mmap_sem);
	}

	spin_lock(&ksm_mmlist_lock);
	if (ksm_scan.mm_slot != &ksm_mm_head) {
		ksm_scan.mm_slot = &ksm_mm_head;
		ksm_scan.seqnr++;
	}
	spin_unlock(&ksm_mmlist_lock);
}

static void remove_mm_from_lists(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;

	spin_lock(&ksm_mmlist_lock);
	mm_slot = get_mm_slot(mm);

	/*
	 * This mm_slot is always at the scanning cursor when we're
	 * called from scan_get_next_rmap_item; but it's a special
	 * case when we're called from __ksm_exit.
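	 * Either way, if the cursor is at this mm_slot, advance it to the
	 * next slot before this one is freed.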
	 */
	if (ksm_scan.mm_slot == mm_slot) {
		ksm_scan.mm_slot = list_entry(
			mm_slot->mm_list.next, struct mm_slot, mm_list);
		ksm_scan.address = 0;
		ksm_scan.rmap_item = list_entry(
			&ksm_scan.mm_slot->rmap_list, struct rmap_item, link);
		if (ksm_scan.mm_slot == &ksm_mm_head)
			ksm_scan.seqnr++;
	}

	hlist_del(&mm_slot->link);
	list_del(&mm_slot->mm_list);
	spin_unlock(&ksm_mmlist_lock);

	remove_all_slot_rmap_items(mm_slot);
	free_mm_slot(mm_slot);
	clear_bit(MMF_VM_MERGEABLE, &mm->flags);
}

static u32 calc_checksum(struct page *page)
{
	u32 checksum;
	void *addr = kmap_atomic(page, KM_USER0);
	checksum = jhash2(addr, PAGE_SIZE / 4, 17);
	kunmap_atomic(addr, KM_USER0);
	return checksum;
}

static int memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_atomic(page1, KM_USER0);
	addr2 = kmap_atomic(page2, KM_USER1);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_atomic(addr2, KM_USER1);
	kunmap_atomic(addr1, KM_USER0);
	return ret;
}

static inline int pages_identical(struct page *page1, struct page *page2)
{
	return !memcmp_pages(page1, page2);
}

static int write_protect_page(struct vm_area_struct *vma, struct page *page,
			      pte_t *orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long addr;
	pte_t *ptep;
	spinlock_t *ptl;
	int swapped;
	int err = -EFAULT;

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	ptep = page_check_address(page, mm, addr, &ptl, 0);
	if (!ptep)
		goto out;

	if (pte_write(*ptep)) {
		pte_t entry;

		swapped = PageSwapCache(page);
		flush_cache_page(vma, addr, page_to_pfn(page));
		/*
		 * Ok this is tricky: when get_user_pages_fast() runs it
		 * doesn't take any lock, so the check we are about to make,
		 * comparing the page count against the map count, is racy,
		 * and O_DIRECT can start right after the check.
		 * So we clear the pte and flush the TLB before the check:
		 * this assures us that no O_DIRECT can start after the
		 * check, or in the middle of the check.
		 */
		entry = ptep_clear_flush(vma, addr, ptep);
		/*
		 * Check that no O_DIRECT or similar I/O is in progress on the
		 * page
		 */
		if ((page_mapcount(page) + 2 + swapped) != page_count(page)) {
			set_pte_at_notify(mm, addr, ptep, entry);
			goto out_unlock;
		}
		entry = pte_wrprotect(entry);
		set_pte_at_notify(mm, addr, ptep, entry);
	}
	*orig_pte = *ptep;
	err = 0;

out_unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return err;
}

/**
 * replace_page - replace page in vma by new ksm page
 * @vma:      vma that holds the pte pointing to oldpage
 * @oldpage:  the page we are replacing by newpage
 * @newpage:  the ksm page we replace oldpage by
 * @orig_pte: the original value of the pte
 *
 * Returns 0 on success, -EFAULT on failure.
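 *
 * The caller holds mmap_sem for read and has already write-protected
 * oldpage; the pte is re-checked against @orig_pte under the page table
 * lock before being pointed at newpage.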
 */
static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
			struct page *newpage, pte_t orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep;
	spinlock_t *ptl;
	unsigned long addr;
	pgprot_t prot;
	int err = -EFAULT;

	prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE);

	addr = page_address_in_vma(oldpage, vma);
	if (addr == -EFAULT)
		goto out;

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		goto out;

	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		goto out;

	pmd = pmd_offset(pud, addr);
	if (!pmd_present(*pmd))
		goto out;

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte_same(*ptep, orig_pte)) {
		pte_unmap_unlock(ptep, ptl);
		goto out;
	}

	get_page(newpage);
	page_add_ksm_rmap(newpage);

	flush_cache_page(vma, addr, pte_pfn(*ptep));
	ptep_clear_flush(vma, addr, ptep);
	set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot));

	page_remove_rmap(oldpage);
	put_page(oldpage);

	pte_unmap_unlock(ptep, ptl);
	err = 0;
out:
	return err;
}

/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing into oldpage
 * @oldpage: the page that we want to replace with newpage
 * @newpage: the page that we want to map instead of oldpage
 *
 * Note:
 * oldpage should be a PageAnon page, while newpage should be a PageKsm page,
 * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
				 struct page *oldpage,
				 struct page *newpage)
{
	pte_t orig_pte = __pte(0);
	int err = -EFAULT;

	if (!(vma->vm_flags & VM_MERGEABLE))
		goto out;

	if (!PageAnon(oldpage))
		goto out;

	get_page(newpage);
	get_page(oldpage);

	/*
	 * We need the page lock to read a stable PageSwapCache in
	 * write_protect_page().  We use trylock_page() instead of
	 * lock_page() because we don't want to wait here - we
	 * prefer to continue scanning and merging different pages,
	 * then come back to this page when it is unlocked.
	 */
	if (!trylock_page(oldpage))
		goto out_putpage;
	/*
	 * If this anonymous page is mapped only here, its pte may need
	 * to be write-protected.  If it's mapped elsewhere, all of its
	 * ptes are necessarily already write-protected.  But in either
	 * case, we need to lock and check page_count is not raised.
	 */
	if (write_protect_page(vma, oldpage, &orig_pte)) {
		unlock_page(oldpage);
		goto out_putpage;
	}
	unlock_page(oldpage);

	if (pages_identical(oldpage, newpage))
		err = replace_page(vma, oldpage, newpage, orig_pte);

out_putpage:
	put_page(oldpage);
	put_page(newpage);
out:
	return err;
}

/*
 * try_to_merge_two_pages - take two identical pages and prepare them
 * to be merged into one page.
 *
 * This function returns 0 if we successfully mapped two identical pages
 * into one page, -EFAULT otherwise.
 *
 * Note that this function allocates a new kernel page: if one of the pages
 * is already a ksm page, try_to_merge_with_ksm_page should be used.
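 *
 * If only the first of the two merges succeeds, break_cow() is used to
 * undo it, so that we are not left with a ksm page mapped by just one pte.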
 */
static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1,
				  struct page *page1, struct mm_struct *mm2,
				  unsigned long addr2, struct page *page2)
{
	struct vm_area_struct *vma;
	struct page *kpage;
	int err = -EFAULT;

	/*
	 * The number of nodes in the stable tree
	 * is the number of kernel pages that we hold.
	 */
	if (ksm_max_kernel_pages &&
	    ksm_max_kernel_pages <= ksm_pages_shared)
		return err;

	kpage = alloc_page(GFP_HIGHUSER);
	if (!kpage)
		return err;

	down_read(&mm1->mmap_sem);
	vma = find_vma(mm1, addr1);
	if (!vma || vma->vm_start > addr1) {
		put_page(kpage);
		up_read(&mm1->mmap_sem);
		return err;
	}

	copy_user_highpage(kpage, page1, addr1, vma);
	err = try_to_merge_one_page(vma, page1, kpage);
	up_read(&mm1->mmap_sem);

	if (!err) {
		down_read(&mm2->mmap_sem);
		vma = find_vma(mm2, addr2);
		if (!vma || vma->vm_start > addr2) {
			put_page(kpage);
			up_read(&mm2->mmap_sem);
			break_cow(mm1, addr1);
			return -EFAULT;
		}

		err = try_to_merge_one_page(vma, page2, kpage);
		up_read(&mm2->mmap_sem);

		/*
		 * If the second try_to_merge_one_page failed, we have a
		 * ksm page with just one pte pointing to it, so break it.
		 */
		if (err)
			break_cow(mm1, addr1);
	}

	put_page(kpage);
	return err;
}

/*
 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
 * but no new kernel page is allocated: kpage must already be a ksm page.
 */
static int try_to_merge_with_ksm_page(struct mm_struct *mm1,
				      unsigned long addr1,
				      struct page *page1,
				      struct page *kpage)
{
	struct vm_area_struct *vma;
	int err = -EFAULT;

	down_read(&mm1->mmap_sem);
	vma = find_vma(mm1, addr1);
	if (!vma || vma->vm_start > addr1) {
		up_read(&mm1->mmap_sem);
		return err;
	}

	err = try_to_merge_one_page(vma, page1, kpage);
	up_read(&mm1->mmap_sem);

	return err;
}

/*
 * stable_tree_search - search for a page inside the stable tree
 * @page: the page that we are searching for identical pages to
 * @page2: pointer to the identical page that we hold inside the stable
 *	   tree, if found
 * @rmap_item: the reverse mapping item
 *
 * This function checks if there is a page inside the stable tree
 * with identical content to the page that we are scanning right now.
 *
 * This function returns a pointer to the rmap_item of the identical
 * item if found, NULL otherwise.
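 *
 * While walking the tree, nodes whose ksm page has meanwhile been zapped
 * are removed via remove_rmap_item_from_tree().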
 */
static struct rmap_item *stable_tree_search(struct page *page,
					    struct page **page2,
					    struct rmap_item *rmap_item)
{
	struct rb_node *node = root_stable_tree.rb_node;

	while (node) {
		struct rmap_item *tree_rmap_item, *next_rmap_item;
		int ret;

		tree_rmap_item = rb_entry(node, struct rmap_item, node);
		while (tree_rmap_item) {
			BUG_ON(!in_stable_tree(tree_rmap_item));
			cond_resched();
			page2[0] = get_ksm_page(tree_rmap_item);
			if (page2[0])
				break;
			next_rmap_item = tree_rmap_item->next;
			remove_rmap_item_from_tree(tree_rmap_item);
			tree_rmap_item = next_rmap_item;
		}
		if (!tree_rmap_item)
			return NULL;

		ret = memcmp_pages(page, page2[0]);

		if (ret < 0) {
			put_page(page2[0]);
			node = node->rb_left;
		} else if (ret > 0) {
			put_page(page2[0]);
			node = node->rb_right;
		} else {
			return tree_rmap_item;
		}
	}

	return NULL;
}

/*
 * stable_tree_insert - insert rmap_item pointing to new ksm page
 * into the stable tree.
 *
 * @page: the page that we are searching for an identical page to, inside
 *	  the stable tree
 * @rmap_item: pointer to the reverse mapping item
 *
 * This function returns rmap_item on success, NULL otherwise.
 */
static struct rmap_item *stable_tree_insert(struct page *page,
					    struct rmap_item *rmap_item)
{
	struct rb_node **new = &root_stable_tree.rb_node;
	struct rb_node *parent = NULL;

	while (*new) {
		struct rmap_item *tree_rmap_item, *next_rmap_item;
		struct page *tree_page;
		int ret;

		tree_rmap_item = rb_entry(*new, struct rmap_item, node);
		while (tree_rmap_item) {
			BUG_ON(!in_stable_tree(tree_rmap_item));
			cond_resched();
			tree_page = get_ksm_page(tree_rmap_item);
			if (tree_page)
				break;
			next_rmap_item = tree_rmap_item->next;
			remove_rmap_item_from_tree(tree_rmap_item);
			tree_rmap_item = next_rmap_item;
		}
		if (!tree_rmap_item)
			return NULL;

		ret = memcmp_pages(page, tree_page);
		put_page(tree_page);

		parent = *new;
		if (ret < 0)
			new = &parent->rb_left;
		else if (ret > 0)
			new = &parent->rb_right;
		else {
			/*
			 * It is not a bug that stable_tree_search() didn't
			 * find this node: because at that time our page was
			 * not yet write-protected, so may have changed since.
			 */
			return NULL;
		}
	}

	rmap_item->address |= NODE_FLAG | STABLE_FLAG;
	rmap_item->next = NULL;
	rb_link_node(&rmap_item->node, parent, new);
	rb_insert_color(&rmap_item->node, &root_stable_tree);

	ksm_pages_shared++;
	return rmap_item;
}

/*
 * unstable_tree_search_insert - search and insert items into the unstable tree.
 *
 * @page: the page that we are going to search for an identical page to, or to
 *	  insert into the unstable tree
 * @page2: pointer to the identical page that was found inside the unstable tree
 * @rmap_item: the reverse mapping item of page
 *
 * This function searches for a page in the unstable tree identical to the
 * page currently being scanned; and if no identical page is found in the
 * tree, we insert rmap_item as a new object into the unstable tree.
 *
 * This function returns a pointer to the rmap_item found to be identical
 * to the currently scanned page, NULL otherwise.
 *
 * This function does both searching and inserting, because they share
 * the same walking algorithm in an rbtree.
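 *
 * The unstable tree is rebuilt from scratch on every full scan: the root
 * is reset to RB_ROOT in scan_get_next_rmap_item(), so stale nodes are
 * simply skipped rather than erased (see remove_rmap_item_from_tree()).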
 */
static struct rmap_item *unstable_tree_search_insert(struct page *page,
						     struct page **page2,
						     struct rmap_item *rmap_item)
{
	struct rb_node **new = &root_unstable_tree.rb_node;
	struct rb_node *parent = NULL;

	while (*new) {
		struct rmap_item *tree_rmap_item;
		int ret;

		tree_rmap_item = rb_entry(*new, struct rmap_item, node);
		page2[0] = get_mergeable_page(tree_rmap_item);
		if (!page2[0])
			return NULL;

		/*
		 * Don't substitute an unswappable ksm page
		 * just for one good swappable forked page.
		 */
		if (page == page2[0]) {
			put_page(page2[0]);
			return NULL;
		}

		ret = memcmp_pages(page, page2[0]);

		parent = *new;
		if (ret < 0) {
			put_page(page2[0]);
			new = &parent->rb_left;
		} else if (ret > 0) {
			put_page(page2[0]);
			new = &parent->rb_right;
		} else {
			return tree_rmap_item;
		}
	}

	rmap_item->address |= NODE_FLAG;
	rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
	rb_link_node(&rmap_item->node, parent, new);
	rb_insert_color(&rmap_item->node, &root_unstable_tree);

	ksm_pages_unshared++;
	return NULL;
}

/*
 * stable_tree_append - add another rmap_item to the linked list of
 * rmap_items hanging off a given node of the stable tree, all sharing
 * the same ksm page.
 */
static void stable_tree_append(struct rmap_item *rmap_item,
			       struct rmap_item *tree_rmap_item)
{
	rmap_item->next = tree_rmap_item->next;
	rmap_item->prev = tree_rmap_item;

	if (tree_rmap_item->next)
		tree_rmap_item->next->prev = rmap_item;

	tree_rmap_item->next = rmap_item;
	rmap_item->address |= STABLE_FLAG;

	ksm_pages_sharing++;
}

/*
 * cmp_and_merge_page - take a page, compute its hash value, and check
 * whether a different page has a similar hash value; if we find such a
 * page, we call try_to_merge_two_pages().
 *
 * @page: the page that we are searching for an identical page to
 * @rmap_item: the reverse mapping into the virtual address of this page
 */
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
{
	struct page *page2[1];
	struct rmap_item *tree_rmap_item;
	unsigned int checksum;
	int err;

	if (in_stable_tree(rmap_item))
		remove_rmap_item_from_tree(rmap_item);

	/* We first start with searching the page inside the stable tree */
	tree_rmap_item = stable_tree_search(page, page2, rmap_item);
	if (tree_rmap_item) {
		if (page == page2[0])			/* forked */
			err = 0;
		else
			err = try_to_merge_with_ksm_page(rmap_item->mm,
							 rmap_item->address,
							 page, page2[0]);
		put_page(page2[0]);

		if (!err) {
			/*
			 * The page was successfully merged:
			 * add its rmap_item to the stable tree.
			 */
			stable_tree_append(rmap_item, tree_rmap_item);
		}
		return;
	}

	/*
	 * A ksm page might have got here by fork, but its other
	 * references have already been removed from the stable tree.
	 */
	if (PageKsm(page))
		break_cow(rmap_item->mm, rmap_item->address);

	/*
	 * If the hash value of the page changed since the last time we
	 * calculated it, this page is changing frequently; therefore we
	 * don't want to insert it into the unstable tree, and we don't want
	 * to waste our time searching for something identical to it there.
	 */
	checksum = calc_checksum(page);
	if (rmap_item->oldchecksum != checksum) {
		rmap_item->oldchecksum = checksum;
		return;
	}

	tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item);
	if (tree_rmap_item) {
		err = try_to_merge_two_pages(rmap_item->mm,
					     rmap_item->address, page,
					     tree_rmap_item->mm,
					     tree_rmap_item->address, page2[0]);
		/*
		 * As soon as we merge this page, we want to remove the
		 * rmap_item of the page we have merged with from the unstable
		 * tree, and insert it instead as new node in the stable tree.
		 */
		if (!err) {
			rb_erase(&tree_rmap_item->node, &root_unstable_tree);
			tree_rmap_item->address &= ~NODE_FLAG;
			ksm_pages_unshared--;

			/*
			 * If we fail to insert the page into the stable tree,
			 * we will have 2 virtual addresses that are pointing
			 * to a ksm page left outside the stable tree,
			 * in which case we need to break_cow on both.
			 */
			if (stable_tree_insert(page2[0], tree_rmap_item))
				stable_tree_append(rmap_item, tree_rmap_item);
			else {
				break_cow(tree_rmap_item->mm,
						tree_rmap_item->address);
				break_cow(rmap_item->mm, rmap_item->address);
			}
		}

		put_page(page2[0]);
	}
}

static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
					    struct list_head *cur,
					    unsigned long addr)
{
	struct rmap_item *rmap_item;

	while (cur != &mm_slot->rmap_list) {
		rmap_item = list_entry(cur, struct rmap_item, link);
		if ((rmap_item->address & PAGE_MASK) == addr) {
			if (!in_stable_tree(rmap_item))
				remove_rmap_item_from_tree(rmap_item);
			return rmap_item;
		}
		if (rmap_item->address > addr)
			break;
		cur = cur->next;
		remove_rmap_item_from_tree(rmap_item);
		list_del(&rmap_item->link);
		free_rmap_item(rmap_item);
	}

	rmap_item = alloc_rmap_item();
	if (rmap_item) {
		/* It has already been zeroed */
		rmap_item->mm = mm_slot->mm;
		rmap_item->address = addr;
		list_add_tail(&rmap_item->link, cur);
	}
	return rmap_item;
}

static struct rmap_item *scan_get_next_rmap_item(struct page **page)
{
	struct mm_struct *mm;
	struct mm_slot *slot;
	struct vm_area_struct *vma;
	struct rmap_item *rmap_item;

	if (list_empty(&ksm_mm_head.mm_list))
		return NULL;

	slot = ksm_scan.mm_slot;
	if (slot == &ksm_mm_head) {
		root_unstable_tree = RB_ROOT;

		spin_lock(&ksm_mmlist_lock);
		slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
		ksm_scan.mm_slot = slot;
		spin_unlock(&ksm_mmlist_lock);
next_mm:
		ksm_scan.address = 0;
		ksm_scan.rmap_item = list_entry(&slot->rmap_list,
						struct rmap_item, link);
	}

	mm = slot->mm;
	down_read(&mm->mmap_sem);
	for (vma = find_vma(mm, ksm_scan.address); vma; vma = vma->vm_next) {
		if (!(vma->vm_flags & VM_MERGEABLE))
			continue;
		if (ksm_scan.address < vma->vm_start)
			ksm_scan.address = vma->vm_start;
		if (!vma->anon_vma)
			ksm_scan.address = vma->vm_end;
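
		/*
		 * Walk this vma a page at a time, handing back the next
		 * anonymous page found, together with its rmap_item.
		 */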
		while (ksm_scan.address < vma->vm_end) {
			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
			if (*page && PageAnon(*page)) {
				flush_anon_page(vma, *page, ksm_scan.address);
				flush_dcache_page(*page);
				rmap_item = get_next_rmap_item(slot,
					ksm_scan.rmap_item->link.next,
					ksm_scan.address);
				if (rmap_item) {
					ksm_scan.rmap_item = rmap_item;
					ksm_scan.address += PAGE_SIZE;
				} else
					put_page(*page);
				up_read(&mm->mmap_sem);
				return rmap_item;
			}
			if (*page)
				put_page(*page);
			ksm_scan.address += PAGE_SIZE;
			cond_resched();
		}
	}

	if (!ksm_scan.address) {
		/*
		 * We've completed a full scan of all vmas, holding mmap_sem
		 * throughout, and found no VM_MERGEABLE: so do the same as
		 * __ksm_exit does to remove this mm from all our lists now.
		 */
		remove_mm_from_lists(mm);
		up_read(&mm->mmap_sem);
		slot = ksm_scan.mm_slot;
		if (slot != &ksm_mm_head)
			goto next_mm;
		return NULL;
	}

	/*
	 * Nuke all the rmap_items that are above this current rmap:
	 * because there were no VM_MERGEABLE vmas with such addresses.
	 */
	remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next);
	up_read(&mm->mmap_sem);

	spin_lock(&ksm_mmlist_lock);
	slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
	ksm_scan.mm_slot = slot;
	spin_unlock(&ksm_mmlist_lock);

	/* Repeat until we've completed scanning the whole list */
	if (slot != &ksm_mm_head)
		goto next_mm;

	/*
	 * Bump seqnr here rather than at top, so that __ksm_exit
	 * can skip rb_erase on unstable tree until we run again.
	 */
	ksm_scan.seqnr++;
	return NULL;
}

/**
 * ksm_do_scan - the ksm scanner main worker function.
 * @scan_npages - number of pages we want to scan before we return.
 */
static void ksm_do_scan(unsigned int scan_npages)
{
	struct rmap_item *rmap_item;
	struct page *page;

	while (scan_npages--) {
		cond_resched();
		rmap_item = scan_get_next_rmap_item(&page);
		if (!rmap_item)
			return;
		if (!PageKsm(page) || !in_stable_tree(rmap_item))
			cmp_and_merge_page(page, rmap_item);
		else if (page_mapcount(page) == 1) {
			/*
			 * Replace now-unshared ksm page by ordinary page.
			 */
			break_cow(rmap_item->mm, rmap_item->address);
			remove_rmap_item_from_tree(rmap_item);
			rmap_item->oldchecksum = calc_checksum(page);
		}
		put_page(page);
	}
}

static int ksm_scan_thread(void *nothing)
{
	set_user_nice(current, 5);

	while (!kthread_should_stop()) {
		if (ksm_run & KSM_RUN_MERGE) {
			mutex_lock(&ksm_thread_mutex);
			ksm_do_scan(ksm_thread_pages_to_scan);
			mutex_unlock(&ksm_thread_mutex);
			schedule_timeout_interruptible(
				msecs_to_jiffies(ksm_thread_sleep_millisecs));
		} else {
			wait_event_interruptible(ksm_thread_wait,
				(ksm_run & KSM_RUN_MERGE) ||
				kthread_should_stop());
		}
	}
	return 0;
}

int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
		unsigned long end, int advice, unsigned long *vm_flags)
{
	struct mm_struct *mm = vma->vm_mm;

	switch (advice) {
	case MADV_MERGEABLE:
		/*
		 * Be somewhat over-protective for now!
		 */
		if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
				 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
				 VM_RESERVED  | VM_HUGETLB | VM_INSERTPAGE |
				 VM_MIXEDMAP  | VM_SAO))
			return 0;		/* just ignore the advice */

		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
			if (__ksm_enter(mm) < 0)
				return -EAGAIN;

		*vm_flags |= VM_MERGEABLE;
		break;

	case MADV_UNMERGEABLE:
		if (!(*vm_flags & VM_MERGEABLE))
			return 0;		/* just ignore the advice */

		if (vma->anon_vma)
			unmerge_ksm_pages(vma, start, end);

		*vm_flags &= ~VM_MERGEABLE;
		break;
	}

	return 0;
}

int __ksm_enter(struct mm_struct *mm)
{
	struct mm_slot *mm_slot = alloc_mm_slot();
	if (!mm_slot)
		return -ENOMEM;

	spin_lock(&ksm_mmlist_lock);
	insert_to_mm_slots_hash(mm, mm_slot);
	/*
	 * Insert just behind the scanning cursor, to let the area settle
	 * down a little; when fork is followed by immediate exec, we don't
	 * want ksmd to waste time setting up and tearing down an rmap_list.
	 */
	list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
	spin_unlock(&ksm_mmlist_lock);

	set_bit(MMF_VM_MERGEABLE, &mm->flags);
	return 0;
}

void __ksm_exit(struct mm_struct *mm)
{
	/*
	 * This process is exiting: doesn't hold and doesn't need mmap_sem;
	 * but we do need to exclude ksmd and other exiters while we modify
	 * the various lists and trees.
	 */
	mutex_lock(&ksm_thread_mutex);
	remove_mm_from_lists(mm);
	mutex_unlock(&ksm_thread_mutex);
}

#define KSM_ATTR_RO(_name) \
	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KSM_ATTR(_name) \
	static struct kobj_attribute _name##_attr = \
		__ATTR(_name, 0644, _name##_show, _name##_store)

static ssize_t sleep_millisecs_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
}

static ssize_t sleep_millisecs_store(struct kobject *kobj,
				     struct kobj_attribute *attr,
				     const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = strict_strtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	ksm_thread_sleep_millisecs = msecs;

	return count;
}
KSM_ATTR(sleep_millisecs);

static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
}

static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long nr_pages;

	err = strict_strtoul(buf, 10, &nr_pages);
	if (err || nr_pages > UINT_MAX)
		return -EINVAL;

	ksm_thread_pages_to_scan = nr_pages;

	return count;
}
KSM_ATTR(pages_to_scan);

static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf)
{
	return sprintf(buf, "%u\n", ksm_run);
}

static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
			 const char *buf, size_t count)
{
	int err;
	unsigned long flags;

	err = strict_strtoul(buf, 10, &flags);
	if (err || flags > UINT_MAX)
		return -EINVAL;
	if (flags > KSM_RUN_UNMERGE)
		return -EINVAL;

	/*
	 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
	 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
	 * breaking COW to free the unswappable pages_shared (but leaves
	 * mm_slots on the list for when ksmd may be set running again).
	 */

	mutex_lock(&ksm_thread_mutex);
	if (ksm_run != flags) {
		ksm_run = flags;
		if (flags & KSM_RUN_UNMERGE)
			unmerge_and_remove_all_rmap_items();
	}
	mutex_unlock(&ksm_thread_mutex);

	if (flags & KSM_RUN_MERGE)
		wake_up_interruptible(&ksm_thread_wait);

	return count;
}
KSM_ATTR(run);

static ssize_t max_kernel_pages_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	int err;
	unsigned long nr_pages;

	err = strict_strtoul(buf, 10, &nr_pages);
	if (err)
		return -EINVAL;

	ksm_max_kernel_pages = nr_pages;

	return count;
}

static ssize_t max_kernel_pages_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_max_kernel_pages);
}
KSM_ATTR(max_kernel_pages);

static ssize_t pages_shared_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_shared);
}
KSM_ATTR_RO(pages_shared);

static ssize_t pages_sharing_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_sharing);
}
KSM_ATTR_RO(pages_sharing);

static ssize_t pages_unshared_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_unshared);
}
KSM_ATTR_RO(pages_unshared);

static ssize_t pages_volatile_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	long ksm_pages_volatile;

	ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
			   - ksm_pages_sharing - ksm_pages_unshared;
	/*
	 * It was not worth any locking to calculate that statistic,
	 * but it might therefore sometimes be negative: conceal that.
	 */
	if (ksm_pages_volatile < 0)
		ksm_pages_volatile = 0;
	return sprintf(buf, "%ld\n", ksm_pages_volatile);
}
KSM_ATTR_RO(pages_volatile);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_scan.seqnr);
}
KSM_ATTR_RO(full_scans);

static struct attribute *ksm_attrs[] = {
	&sleep_millisecs_attr.attr,
	&pages_to_scan_attr.attr,
	&run_attr.attr,
	&max_kernel_pages_attr.attr,
	&pages_shared_attr.attr,
	&pages_sharing_attr.attr,
	&pages_unshared_attr.attr,
	&pages_volatile_attr.attr,
	&full_scans_attr.attr,
	NULL,
};

static struct attribute_group ksm_attr_group = {
	.attrs = ksm_attrs,
	.name = "ksm",
};

static int __init ksm_init(void)
{
	struct task_struct *ksm_thread;
	int err;

	err = ksm_slab_init();
	if (err)
		goto out;

	err = mm_slots_hash_init();
	if (err)
		goto out_free1;

	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
	if (IS_ERR(ksm_thread)) {
		printk(KERN_ERR "ksm: creating kthread failed\n");
		err = PTR_ERR(ksm_thread);
		goto out_free2;
	}

	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
	if (err) {
		printk(KERN_ERR "ksm: register sysfs failed\n");
		goto out_free3;
	}

	return 0;

out_free3:
	kthread_stop(ksm_thread);
out_free2:
	mm_slots_hash_free();
out_free1:
	ksm_slab_free();
out:
	return err;
}
module_init(ksm_init)