ksm.c revision 35451beecbd7c86ce3249d543594517a5fe9a0cd
/*
 * Memory merging support.
 *
 * This code enables dynamic sharing of identical pages found in different
 * memory areas, even if they are not shared by fork()
 *
 * Copyright (C) 2008-2009 Red Hat, Inc.
 * Authors:
 *	Izik Eidus
 *	Andrea Arcangeli
 *	Chris Wright
 *	Hugh Dickins
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 */

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
#include <linux/jhash.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mmu_notifier.h>
#include <linux/ksm.h>

#include <asm/tlbflush.h>

/*
 * A few notes about the KSM scanning process,
 * to make it easier to understand the data structures below:
 *
 * In order to reduce excessive scanning, KSM sorts the memory pages by their
 * contents into a data structure that holds pointers to the pages' locations.
 *
 * Since the contents of the pages may change at any moment, KSM cannot just
 * insert the pages into a normal sorted tree and expect it to find anything.
 * Therefore KSM uses two data structures - the stable and the unstable tree.
 *
 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
 * by their contents.  Because each such page is write-protected, searching on
 * this tree is fully assured to be working (except when pages are unmapped),
 * and therefore this tree is called the stable tree.
 *
 * In addition to the stable tree, KSM uses a second data structure called the
 * unstable tree: this tree holds pointers to pages which have been found to
 * be "unchanged for a period of time".  The unstable tree sorts these pages
 * by their contents, but since they are not write-protected, KSM cannot rely
 * upon the unstable tree to work correctly - the unstable tree is liable to
 * be corrupted as its contents are modified, and so it is called unstable.
 *
 * KSM solves this problem by several techniques:
 *
 * 1) The unstable tree is flushed every time KSM completes scanning all
 *    memory areas, and then the tree is rebuilt again from the beginning.
 * 2) KSM will only insert into the unstable tree, pages whose hash value
 *    has not changed since the previous scan of all memory areas.
 * 3) The unstable tree is a red-black tree - so its balancing is based on the
 *    colors of the nodes and not on their contents, assuring that even when
 *    the tree gets "corrupted" it won't get out of balance, so scanning time
 *    remains the same (also, searching and inserting nodes in an rbtree uses
 *    the same algorithm, so we have no overhead when we flush and rebuild).
 * 4) KSM never flushes the stable tree, which means that even if it were to
 *    take 10 attempts to find a page in the unstable tree, once it is found,
 *    it is secured in the stable tree.  (When we scan a new page, we first
 *    compare it against the stable tree, and then against the unstable tree.)
 */
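
/*
 * In outline, the per-page decision taken by cmp_and_merge_page() below
 * is this (a simplified sketch, eliding locking and error handling):
 *
 *	if (page matches a node of the stable tree)
 *		merge into that ksm page		- pages_sharing++
 *	else if (checksum != oldchecksum)
 *		oldchecksum = checksum			- too volatile, skip
 *	else if (page matches a node of the unstable tree)
 *		merge both pages, move the node
 *		into the stable tree			- pages_shared++
 *	else
 *		insert page into the unstable tree	- pages_unshared++
 */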

/**
 * struct mm_slot - ksm information per mm that is being scanned
 * @link: link to the mm_slots hash list
 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
 * @rmap_list: head for this mm_slot's list of rmap_items
 * @mm: the mm that this information is valid for
 */
struct mm_slot {
	struct hlist_node link;
	struct list_head mm_list;
	struct list_head rmap_list;
	struct mm_struct *mm;
};

/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_item: the current rmap that we are scanning inside the rmap_list
 * @seqnr: count of completed full scans (needed when removing unstable node)
 *
 * There is only the one ksm_scan instance of this cursor structure.
 */
struct ksm_scan {
	struct mm_slot *mm_slot;
	unsigned long address;
	struct rmap_item *rmap_item;
	unsigned long seqnr;
};

/**
 * struct rmap_item - reverse mapping item for virtual addresses
 * @link: link into mm_slot's rmap_list (rmap_list is per mm)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb_node of this rmap_item in either unstable or stable tree
 * @next: next rmap_item hanging off the same node of the stable tree
 * @prev: previous rmap_item hanging off the same node of the stable tree
 */
struct rmap_item {
	struct list_head link;
	struct mm_struct *mm;
	unsigned long address;		/* + low bits used for flags below */
	union {
		unsigned int oldchecksum;	/* when unstable */
		struct rmap_item *next;		/* when stable */
	};
	union {
		struct rb_node node;		/* when tree node */
		struct rmap_item *prev;		/* in stable list */
	};
};

#define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
#define NODE_FLAG	0x100	/* is a node of unstable or stable tree */
#define STABLE_FLAG	0x200	/* is a node or list item of stable tree */

/* The stable and unstable tree heads */
static struct rb_root root_stable_tree = RB_ROOT;
static struct rb_root root_unstable_tree = RB_ROOT;

#define MM_SLOTS_HASH_HEADS 1024
static struct hlist_head *mm_slots_hash;

static struct mm_slot ksm_mm_head = {
	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
static struct ksm_scan ksm_scan = {
	.mm_slot = &ksm_mm_head,
};

static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *mm_slot_cache;

/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;

/* The number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;

/* The number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;

/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;

/* Limit on the number of unswappable pages used */
static unsigned long ksm_max_kernel_pages = 2000;

/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 200;

/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;

#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
static unsigned int ksm_run = KSM_RUN_MERGE;
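
/*
 * So an rmap_item's address field packs several things (a recap of the
 * flag definitions above, assuming 4K pages, i.e. PAGE_SHIFT == 12):
 *
 *	bits 12 and up:	the page-aligned virtual address (PAGE_MASK)
 *	bit  9:		STABLE_FLAG - node or list item of stable tree
 *	bit  8:		NODE_FLAG   - rb_node of stable or unstable tree
 *	bits 0-7:	SEQNR_MASK  - scan seqnr when inserted (unstable only)
 */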

static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);

#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
		sizeof(struct __struct), __alignof__(struct __struct),\
		(__flags), NULL)

static int __init ksm_slab_init(void)
{
	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
	if (!rmap_item_cache)
		goto out;

	mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
	if (!mm_slot_cache)
		goto out_free;

	return 0;

out_free:
	kmem_cache_destroy(rmap_item_cache);
out:
	return -ENOMEM;
}

static void __init ksm_slab_free(void)
{
	kmem_cache_destroy(mm_slot_cache);
	kmem_cache_destroy(rmap_item_cache);
	mm_slot_cache = NULL;
}

static inline struct rmap_item *alloc_rmap_item(void)
{
	struct rmap_item *rmap_item;

	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
	if (rmap_item)
		ksm_rmap_items++;
	return rmap_item;
}

static inline void free_rmap_item(struct rmap_item *rmap_item)
{
	ksm_rmap_items--;
	rmap_item->mm = NULL;	/* debug safety */
	kmem_cache_free(rmap_item_cache, rmap_item);
}

static inline struct mm_slot *alloc_mm_slot(void)
{
	if (!mm_slot_cache)	/* initialization failed */
		return NULL;
	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}

static inline void free_mm_slot(struct mm_slot *mm_slot)
{
	kmem_cache_free(mm_slot_cache, mm_slot);
}

static int __init mm_slots_hash_init(void)
{
	mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
				GFP_KERNEL);
	if (!mm_slots_hash)
		return -ENOMEM;
	return 0;
}

static void __init mm_slots_hash_free(void)
{
	kfree(mm_slots_hash);
}

static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	struct hlist_head *bucket;
	struct hlist_node *node;

	bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
				% MM_SLOTS_HASH_HEADS];
	hlist_for_each_entry(mm_slot, node, bucket, link) {
		if (mm == mm_slot->mm)
			return mm_slot;
	}
	return NULL;
}

static void insert_to_mm_slots_hash(struct mm_struct *mm,
				    struct mm_slot *mm_slot)
{
	struct hlist_head *bucket;

	bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
				% MM_SLOTS_HASH_HEADS];
	mm_slot->mm = mm;
	INIT_LIST_HEAD(&mm_slot->rmap_list);
	hlist_add_head(&mm_slot->link, bucket);
}

static inline int in_stable_tree(struct rmap_item *rmap_item)
{
	return rmap_item->address & STABLE_FLAG;
}

/*
 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
 * page tables after it has passed through ksm_exit() - which, if necessary,
 * takes mmap_sem briefly to serialize against them.  ksm_exit() does not set
 * a special flag: they can just back out as soon as mm_users goes to zero.
 * ksm_test_exit() is used throughout to make this test for exit: in some
 * places for correctness, in some places just to avoid unnecessary work.
 */
static inline bool ksm_test_exit(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) == 0;
}

/*
 * We use break_ksm to break COW on a ksm page: it's a stripped down
 *
 *	if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
 *		put_page(page);
 *
 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
 * in case the application has unmapped and remapped mm,addr meanwhile.
 * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
 * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
 */
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	int ret = 0;

	do {
		cond_resched();
		page = follow_page(vma, addr, FOLL_GET);
		if (!page)
			break;
		if (PageKsm(page))
			ret = handle_mm_fault(vma->vm_mm, vma, addr,
							FAULT_FLAG_WRITE);
		else
			ret = VM_FAULT_WRITE;
		put_page(page);
	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
	/*
	 * We must loop because handle_mm_fault() may back out if there's
	 * any difficulty e.g. if pte accessed bit gets updated concurrently.
	 *
	 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
	 * COW has been broken, even if the vma does not permit VM_WRITE;
	 * but note that a concurrent fault might break PageKsm for us.
	 *
	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
	 * backing file, which also invalidates anonymous pages: that's
	 * okay, that truncation will have unmapped the PageKsm for us.
	 *
	 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
	 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
	 * current task has TIF_MEMDIE set, and will be OOM killed on return
	 * to user; and ksmd, having no mm, would never be chosen for that.
	 *
	 * But if the mm is in a limited mem_cgroup, then the fault may fail
	 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
	 * even ksmd can fail in this way - though it's usually breaking ksm
	 * just to undo a merge it made a moment before, so unlikely to oom.
	 *
	 * That's a pity: we might therefore have more kernel pages allocated
	 * than we're counting as nodes in the stable tree; but ksm_do_scan
	 * will retry to break_cow on each pass, so should recover the page
	 * in due course.  The important thing is to not let VM_MERGEABLE
	 * be cleared while any such pages might remain in the area.
	 */
	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}
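
/*
 * break_cow() is called to undo a merge: it revalidates that mm,addr
 * still lies in a VM_MERGEABLE vma before letting break_ksm() fault in
 * a fresh anonymous copy in place of the shared ksm page.
 */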
static void break_cow(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma;

	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		goto out;
	vma = find_vma(mm, addr);
	if (!vma || vma->vm_start > addr)
		goto out;
	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		goto out;
	break_ksm(vma, addr);
out:
	up_read(&mm->mmap_sem);
}

static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;
	struct page *page;

	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		goto out;
	vma = find_vma(mm, addr);
	if (!vma || vma->vm_start > addr)
		goto out;
	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		goto out;

	page = follow_page(vma, addr, FOLL_GET);
	if (!page)
		goto out;
	if (PageAnon(page)) {
		flush_anon_page(vma, page, addr);
		flush_dcache_page(page);
	} else {
		put_page(page);
out:		page = NULL;
	}
	up_read(&mm->mmap_sem);
	return page;
}

/*
 * get_ksm_page: checks if the page at the virtual address in rmap_item
 * is still PageKsm, in which case we can trust the content of the page,
 * and it returns the gotten page; but NULL if the page has been zapped.
 */
static struct page *get_ksm_page(struct rmap_item *rmap_item)
{
	struct page *page;

	page = get_mergeable_page(rmap_item);
	if (page && !PageKsm(page)) {
		put_page(page);
		page = NULL;
	}
	return page;
}

/*
 * Removing rmap_item from stable or unstable tree.
 * This function will clean the information from the stable/unstable tree.
 */
static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
{
	if (in_stable_tree(rmap_item)) {
		struct rmap_item *next_item = rmap_item->next;

		if (rmap_item->address & NODE_FLAG) {
			if (next_item) {
				rb_replace_node(&rmap_item->node,
						&next_item->node,
						&root_stable_tree);
				next_item->address |= NODE_FLAG;
				ksm_pages_sharing--;
			} else {
				rb_erase(&rmap_item->node, &root_stable_tree);
				ksm_pages_shared--;
			}
		} else {
			struct rmap_item *prev_item = rmap_item->prev;

			BUG_ON(prev_item->next != rmap_item);
			prev_item->next = next_item;
			if (next_item) {
				BUG_ON(next_item->prev != rmap_item);
				next_item->prev = rmap_item->prev;
			}
			ksm_pages_sharing--;
		}

		rmap_item->next = NULL;

	} else if (rmap_item->address & NODE_FLAG) {
		unsigned char age;
		/*
		 * Usually ksmd can and must skip the rb_erase, because
		 * root_unstable_tree was already reset to RB_ROOT.
		 * But be careful when an mm is exiting: do the rb_erase
		 * if this rmap_item was inserted by this scan, rather
		 * than left over from before.
		 */
		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
		BUG_ON(age > 1);
		if (!age)
			rb_erase(&rmap_item->node, &root_unstable_tree);
		ksm_pages_unshared--;
	}

	rmap_item->address &= PAGE_MASK;

	cond_resched();		/* we're called from many long loops */
}

static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
				       struct list_head *cur)
{
	struct rmap_item *rmap_item;

	while (cur != &mm_slot->rmap_list) {
		rmap_item = list_entry(cur, struct rmap_item, link);
		cur = cur->next;
		remove_rmap_item_from_tree(rmap_item);
		list_del(&rmap_item->link);
		free_rmap_item(rmap_item);
	}
}

/*
 * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
 * than check every pte of a given vma, the locking doesn't quite work for
 * that - an rmap_item is assigned to the stable tree after inserting ksm
 * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
 * rmap_items from parent to child at fork time (so as not to waste time
 * if exit comes before the next scan reaches it).
 *
 * Similarly, although we'd like to remove rmap_items (so updating counts
 * and freeing memory) when unmerging an area, it's easier to leave that
 * to the next pass of ksmd - consider, for example, how ksmd might be
 * in cmp_and_merge_page on one of the rmap_items we would be removing.
 */
static int unmerge_ksm_pages(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
{
	unsigned long addr;
	int err = 0;

	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
		if (ksm_test_exit(vma->vm_mm))
			break;
		if (signal_pending(current))
			err = -ERESTARTSYS;
		else
			err = break_ksm(vma, addr);
	}
	return err;
}

#ifdef CONFIG_SYSFS
/*
 * Only called through the sysfs control interface:
 */
static int unmerge_and_remove_all_rmap_items(void)
{
	struct mm_slot *mm_slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int err = 0;

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
						struct mm_slot, mm_list);
	spin_unlock(&ksm_mmlist_lock);

	for (mm_slot = ksm_scan.mm_slot;
			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
		mm = mm_slot->mm;
		down_read(&mm->mmap_sem);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			if (ksm_test_exit(mm))
				break;
			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				continue;
			err = unmerge_ksm_pages(vma,
						vma->vm_start, vma->vm_end);
			if (err)
				goto error;
		}

		remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next);

		spin_lock(&ksm_mmlist_lock);
		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
						struct mm_slot, mm_list);
		if (ksm_test_exit(mm)) {
			hlist_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			spin_unlock(&ksm_mmlist_lock);

			free_mm_slot(mm_slot);
			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
			up_read(&mm->mmap_sem);
			mmdrop(mm);
		} else {
			spin_unlock(&ksm_mmlist_lock);
			up_read(&mm->mmap_sem);
		}
	}

	ksm_scan.seqnr = 0;
	return 0;

error:
	up_read(&mm->mmap_sem);
	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = &ksm_mm_head;
	spin_unlock(&ksm_mmlist_lock);
	return err;
}
#endif /* CONFIG_SYSFS */
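
/*
 * A cheap content fingerprint: hash the page as PAGE_SIZE/4 32-bit
 * words with jhash2 (seed 17).  The checksum is only used to judge
 * whether a page kept the same contents between two full scans; a
 * collision can at worst cause a volatile page to be inserted into the
 * unstable tree, never a false merge, since actual merging is always
 * gated on memcmp_pages() below.
 */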
static u32 calc_checksum(struct page *page)
{
	u32 checksum;
	void *addr = kmap_atomic(page, KM_USER0);
	checksum = jhash2(addr, PAGE_SIZE / 4, 17);
	kunmap_atomic(addr, KM_USER0);
	return checksum;
}

static int memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_atomic(page1, KM_USER0);
	addr2 = kmap_atomic(page2, KM_USER1);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_atomic(addr2, KM_USER1);
	kunmap_atomic(addr1, KM_USER0);
	return ret;
}

static inline int pages_identical(struct page *page1, struct page *page2)
{
	return !memcmp_pages(page1, page2);
}

static int write_protect_page(struct vm_area_struct *vma, struct page *page,
			      pte_t *orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long addr;
	pte_t *ptep;
	spinlock_t *ptl;
	int swapped;
	int err = -EFAULT;

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	ptep = page_check_address(page, mm, addr, &ptl, 0);
	if (!ptep)
		goto out;

	if (pte_write(*ptep)) {
		pte_t entry;

		swapped = PageSwapCache(page);
		flush_cache_page(vma, addr, page_to_pfn(page));
		/*
		 * Ok this is tricky: when get_user_pages_fast() runs it
		 * doesn't take any lock, therefore the check that we are
		 * going to make with the page count against the map count
		 * is racy, and O_DIRECT can happen right after the check.
		 * So we clear the pte and flush the tlb before the check;
		 * this assures us that no O_DIRECT can happen after the
		 * check or in the middle of the check.
		 */
		entry = ptep_clear_flush(vma, addr, ptep);
		/*
		 * Check that no O_DIRECT or similar I/O is in progress on the
		 * page
		 */
		if ((page_mapcount(page) + 2 + swapped) != page_count(page)) {
			set_pte_at_notify(mm, addr, ptep, entry);
			goto out_unlock;
		}
		entry = pte_wrprotect(entry);
		set_pte_at_notify(mm, addr, ptep, entry);
	}
	*orig_pte = *ptep;
	err = 0;

out_unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return err;
}
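
/*
 * A note on write_protect_page()'s count check above: for an anonymous
 * page, page_count is expected to be one per pte mapping it (that is,
 * page_mapcount), plus one if it sits in the swap cache, plus two for
 * the references our caller already holds - one taken by follow_page()
 * with FOLL_GET when the page was scanned, one by get_page() in
 * try_to_merge_one_page().  Any count beyond that betrays some other
 * user of the page, such as O_DIRECT, and the merge is abandoned.
 */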

/**
 * replace_page - replace page in vma by new ksm page
 * @vma:      vma that holds the pte pointing to oldpage
 * @oldpage:  the page we are replacing by newpage
 * @newpage:  the ksm page we replace oldpage by
 * @orig_pte: the original value of the pte
 *
 * Returns 0 on success, -EFAULT on failure.
 */
static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
			struct page *newpage, pte_t orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep;
	spinlock_t *ptl;
	unsigned long addr;
	pgprot_t prot;
	int err = -EFAULT;

	prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE);

	addr = page_address_in_vma(oldpage, vma);
	if (addr == -EFAULT)
		goto out;

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		goto out;

	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		goto out;

	pmd = pmd_offset(pud, addr);
	if (!pmd_present(*pmd))
		goto out;

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte_same(*ptep, orig_pte)) {
		pte_unmap_unlock(ptep, ptl);
		goto out;
	}

	get_page(newpage);
	page_add_ksm_rmap(newpage);

	flush_cache_page(vma, addr, pte_pfn(*ptep));
	ptep_clear_flush(vma, addr, ptep);
	set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot));

	page_remove_rmap(oldpage);
	put_page(oldpage);

	pte_unmap_unlock(ptep, ptl);
	err = 0;
out:
	return err;
}

/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing into oldpage
 * @oldpage: the page that we want to replace with newpage
 * @newpage: the page that we want to map instead of oldpage
 *
 * Note:
 * oldpage should be a PageAnon page, while newpage should be a PageKsm page,
 * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
				 struct page *oldpage,
				 struct page *newpage)
{
	pte_t orig_pte = __pte(0);
	int err = -EFAULT;

	if (!(vma->vm_flags & VM_MERGEABLE))
		goto out;

	if (!PageAnon(oldpage))
		goto out;

	get_page(newpage);
	get_page(oldpage);

	/*
	 * We need the page lock to read a stable PageSwapCache in
	 * write_protect_page().  We use trylock_page() instead of
	 * lock_page() because we don't want to wait here - we
	 * prefer to continue scanning and merging different pages,
	 * then come back to this page when it is unlocked.
	 */
	if (!trylock_page(oldpage))
		goto out_putpage;
	/*
	 * If this anonymous page is mapped only here, its pte may need
	 * to be write-protected.  If it's mapped elsewhere, all of its
	 * ptes are necessarily already write-protected.  But in either
	 * case, we need to lock and check page_count is not raised.
	 */
	if (write_protect_page(vma, oldpage, &orig_pte)) {
		unlock_page(oldpage);
		goto out_putpage;
	}
	unlock_page(oldpage);

	if (pages_identical(oldpage, newpage))
		err = replace_page(vma, oldpage, newpage, orig_pte);

out_putpage:
	put_page(oldpage);
	put_page(newpage);
out:
	return err;
}
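
/*
 * Putting the last three routines together: a successful merge, as
 * driven by try_to_merge_one_page() above, is in essence (a simplified
 * sketch of the calls, without the locking and reference counting)
 *
 *	write_protect_page(vma, oldpage, &orig_pte);
 *	if (pages_identical(oldpage, newpage))
 *		replace_page(vma, oldpage, newpage, orig_pte);
 *
 * first freeze the source page against further writes, then prove that
 * its contents (still) match, and only then switch the pte over.
 */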

/*
 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
 * but no new kernel page is allocated: kpage must already be a ksm page.
 */
static int try_to_merge_with_ksm_page(struct mm_struct *mm1,
				      unsigned long addr1,
				      struct page *page1,
				      struct page *kpage)
{
	struct vm_area_struct *vma;
	int err = -EFAULT;

	down_read(&mm1->mmap_sem);
	if (ksm_test_exit(mm1))
		goto out;

	vma = find_vma(mm1, addr1);
	if (!vma || vma->vm_start > addr1)
		goto out;

	err = try_to_merge_one_page(vma, page1, kpage);
out:
	up_read(&mm1->mmap_sem);
	return err;
}

/*
 * try_to_merge_two_pages - take two identical pages and prepare them
 * to be merged into one page.
 *
 * This function returns 0 if we successfully mapped two identical pages
 * into one page, -EFAULT otherwise.
 *
 * Note that this function allocates a new kernel page: if one of the pages
 * is already a ksm page, try_to_merge_with_ksm_page should be used.
 */
static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1,
				  struct page *page1, struct mm_struct *mm2,
				  unsigned long addr2, struct page *page2)
{
	struct vm_area_struct *vma;
	struct page *kpage;
	int err = -EFAULT;

	/*
	 * The number of nodes in the stable tree
	 * is the number of kernel pages that we hold.
	 */
	if (ksm_max_kernel_pages &&
	    ksm_max_kernel_pages <= ksm_pages_shared)
		return err;

	kpage = alloc_page(GFP_HIGHUSER);
	if (!kpage)
		return err;

	down_read(&mm1->mmap_sem);
	if (ksm_test_exit(mm1)) {
		up_read(&mm1->mmap_sem);
		goto out;
	}
	vma = find_vma(mm1, addr1);
	if (!vma || vma->vm_start > addr1) {
		up_read(&mm1->mmap_sem);
		goto out;
	}

	copy_user_highpage(kpage, page1, addr1, vma);
	err = try_to_merge_one_page(vma, page1, kpage);
	up_read(&mm1->mmap_sem);

	if (!err) {
		err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage);
		/*
		 * If that fails, we have a ksm page with only one pte
		 * pointing to it: so break it.
		 */
		if (err)
			break_cow(mm1, addr1);
	}
out:
	put_page(kpage);
	return err;
}
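
/*
 * Note the asymmetry in try_to_merge_two_pages() above: page1's
 * contents are copied into a freshly allocated kernel page, and it is
 * that copy - not either of the original anonymous pages - which
 * becomes the shared ksm page.  This is why ksm_pages_shared counts
 * unswappable kernel pages, and why ksm_max_kernel_pages caps them.
 */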

/*
 * stable_tree_search - search page inside the stable tree
 * @page: the page that we are searching identical pages to.
 * @page2: pointer into identical page that we are holding inside the stable
 *	   tree that we have found.
 * @rmap_item: the reverse mapping item
 *
 * This function checks if there is a page inside the stable tree
 * with identical content to the page that we are scanning right now.
 *
 * This function returns a pointer to the identical rmap_item if found,
 * NULL otherwise.
 */
static struct rmap_item *stable_tree_search(struct page *page,
					    struct page **page2,
					    struct rmap_item *rmap_item)
{
	struct rb_node *node = root_stable_tree.rb_node;

	while (node) {
		struct rmap_item *tree_rmap_item, *next_rmap_item;
		int ret;

		tree_rmap_item = rb_entry(node, struct rmap_item, node);
		while (tree_rmap_item) {
			BUG_ON(!in_stable_tree(tree_rmap_item));
			cond_resched();
			page2[0] = get_ksm_page(tree_rmap_item);
			if (page2[0])
				break;
			next_rmap_item = tree_rmap_item->next;
			remove_rmap_item_from_tree(tree_rmap_item);
			tree_rmap_item = next_rmap_item;
		}
		if (!tree_rmap_item)
			return NULL;

		ret = memcmp_pages(page, page2[0]);

		if (ret < 0) {
			put_page(page2[0]);
			node = node->rb_left;
		} else if (ret > 0) {
			put_page(page2[0]);
			node = node->rb_right;
		} else {
			return tree_rmap_item;
		}
	}

	return NULL;
}

/*
 * stable_tree_insert - insert rmap_item pointing to new ksm page
 * into the stable tree.
 *
 * @page: the page that we are searching identical page to inside the stable
 *	  tree.
 * @rmap_item: pointer to the reverse mapping item.
 *
 * This function returns rmap_item on success, NULL otherwise.
 */
static struct rmap_item *stable_tree_insert(struct page *page,
					    struct rmap_item *rmap_item)
{
	struct rb_node **new = &root_stable_tree.rb_node;
	struct rb_node *parent = NULL;

	while (*new) {
		struct rmap_item *tree_rmap_item, *next_rmap_item;
		struct page *tree_page;
		int ret;

		tree_rmap_item = rb_entry(*new, struct rmap_item, node);
		while (tree_rmap_item) {
			BUG_ON(!in_stable_tree(tree_rmap_item));
			cond_resched();
			tree_page = get_ksm_page(tree_rmap_item);
			if (tree_page)
				break;
			next_rmap_item = tree_rmap_item->next;
			remove_rmap_item_from_tree(tree_rmap_item);
			tree_rmap_item = next_rmap_item;
		}
		if (!tree_rmap_item)
			return NULL;

		ret = memcmp_pages(page, tree_page);
		put_page(tree_page);

		parent = *new;
		if (ret < 0)
			new = &parent->rb_left;
		else if (ret > 0)
			new = &parent->rb_right;
		else {
			/*
			 * It is not a bug that stable_tree_search() didn't
			 * find this node: because at that time our page was
			 * not yet write-protected, so may have changed since.
			 */
			return NULL;
		}
	}

	rmap_item->address |= NODE_FLAG | STABLE_FLAG;
	rmap_item->next = NULL;
	rb_link_node(&rmap_item->node, parent, new);
	rb_insert_color(&rmap_item->node, &root_stable_tree);

	ksm_pages_shared++;
	return rmap_item;
}
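
/*
 * Both stable tree walks above share a pruning idiom: at each node,
 * get_ksm_page() is tried on the node's rmap_item and then on each
 * successor hanging off it, removing any whose ksm page has been
 * zapped.  If none of them still holds its page, the node itself has
 * been erased, so the walk just gives up rather than continue over a
 * tree whose shape may have changed beneath it; the next pass will
 * search again.
 */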

/*
 * unstable_tree_search_insert - search and insert items into the unstable tree.
 *
 * @page: the page that we are going to search for identical page or to insert
 *	  into the unstable tree
 * @page2: pointer into identical page that was found inside the unstable tree
 * @rmap_item: the reverse mapping item of page
 *
 * This function searches for a page in the unstable tree identical to the
 * page currently being scanned; and if no identical page is found in the
 * tree, we insert rmap_item as a new object into the unstable tree.
 *
 * This function returns a pointer to the rmap_item found to be identical
 * to the currently scanned page, NULL otherwise.
 *
 * This function does both searching and inserting, because they share
 * the same walking algorithm in an rbtree.
 */
static struct rmap_item *unstable_tree_search_insert(struct page *page,
					struct page **page2,
					struct rmap_item *rmap_item)
{
	struct rb_node **new = &root_unstable_tree.rb_node;
	struct rb_node *parent = NULL;

	while (*new) {
		struct rmap_item *tree_rmap_item;
		int ret;

		tree_rmap_item = rb_entry(*new, struct rmap_item, node);
		page2[0] = get_mergeable_page(tree_rmap_item);
		if (!page2[0])
			return NULL;

		/*
		 * Don't substitute an unswappable ksm page
		 * just for one good swappable forked page.
		 */
		if (page == page2[0]) {
			put_page(page2[0]);
			return NULL;
		}

		ret = memcmp_pages(page, page2[0]);

		parent = *new;
		if (ret < 0) {
			put_page(page2[0]);
			new = &parent->rb_left;
		} else if (ret > 0) {
			put_page(page2[0]);
			new = &parent->rb_right;
		} else {
			return tree_rmap_item;
		}
	}

	rmap_item->address |= NODE_FLAG;
	rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
	rb_link_node(&rmap_item->node, parent, new);
	rb_insert_color(&rmap_item->node, &root_unstable_tree);

	ksm_pages_unshared++;
	return NULL;
}

/*
 * stable_tree_append - add another rmap_item to the linked list of
 * rmap_items hanging off a given node of the stable tree, all sharing
 * the same ksm page.
 */
static void stable_tree_append(struct rmap_item *rmap_item,
			       struct rmap_item *tree_rmap_item)
{
	rmap_item->next = tree_rmap_item->next;
	rmap_item->prev = tree_rmap_item;

	if (tree_rmap_item->next)
		tree_rmap_item->next->prev = rmap_item;

	tree_rmap_item->next = rmap_item;
	rmap_item->address |= STABLE_FLAG;

	ksm_pages_sharing++;
}

/*
 * cmp_and_merge_page - first see if page can be merged into the stable tree;
 * if not, compare checksum to previous and if it's the same, see if page can
 * be inserted into the unstable tree, or merged with a page already there and
 * both transferred to the stable tree.
 *
 * @page: the page that we are searching identical page to.
 * @rmap_item: the reverse mapping into the virtual address of this page
 */
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
{
	struct page *page2[1];
	struct rmap_item *tree_rmap_item;
	unsigned int checksum;
	int err;

	if (in_stable_tree(rmap_item))
		remove_rmap_item_from_tree(rmap_item);

	/* We first start with searching the page inside the stable tree */
	tree_rmap_item = stable_tree_search(page, page2, rmap_item);
	if (tree_rmap_item) {
		if (page == page2[0])			/* forked */
			err = 0;
		else
			err = try_to_merge_with_ksm_page(rmap_item->mm,
							 rmap_item->address,
							 page, page2[0]);
		put_page(page2[0]);

		if (!err) {
			/*
			 * The page was successfully merged:
			 * add its rmap_item to the stable tree.
			 */
			stable_tree_append(rmap_item, tree_rmap_item);
		}
		return;
	}

	/*
	 * A ksm page might have got here by fork, but its other
	 * references have already been removed from the stable tree.
	 * Or it might be left over from a break_ksm which failed
	 * when the mem_cgroup had reached its limit: try again now.
	 */
	if (PageKsm(page))
		break_cow(rmap_item->mm, rmap_item->address);

	/*
	 * If the hash value of the page has changed since the last time we
	 * calculated it, this page is likely to be changing frequently:
	 * therefore we don't want to insert it into the unstable tree, and
	 * we don't want to waste our time searching for something identical
	 * to it there.
	 */
	checksum = calc_checksum(page);
	if (rmap_item->oldchecksum != checksum) {
		rmap_item->oldchecksum = checksum;
		return;
	}

	tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item);
	if (tree_rmap_item) {
		err = try_to_merge_two_pages(rmap_item->mm,
					     rmap_item->address, page,
					     tree_rmap_item->mm,
					     tree_rmap_item->address, page2[0]);
		/*
		 * As soon as we merge this page, we want to remove the
		 * rmap_item of the page we have merged with from the unstable
		 * tree, and insert it instead as a new node in the stable tree.
		 */
		if (!err) {
			rb_erase(&tree_rmap_item->node, &root_unstable_tree);
			tree_rmap_item->address &= ~NODE_FLAG;
			ksm_pages_unshared--;

			/*
			 * If we fail to insert the page into the stable tree,
			 * we will have 2 virtual addresses that are pointing
			 * to a ksm page left outside the stable tree,
			 * in which case we need to break_cow on both.
			 */
			if (stable_tree_insert(page2[0], tree_rmap_item))
				stable_tree_append(rmap_item, tree_rmap_item);
			else {
				break_cow(tree_rmap_item->mm,
						tree_rmap_item->address);
				break_cow(rmap_item->mm, rmap_item->address);
			}
		}

		put_page(page2[0]);
	}
}

static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
					    struct list_head *cur,
					    unsigned long addr)
{
	struct rmap_item *rmap_item;

	while (cur != &mm_slot->rmap_list) {
		rmap_item = list_entry(cur, struct rmap_item, link);
		if ((rmap_item->address & PAGE_MASK) == addr) {
			if (!in_stable_tree(rmap_item))
				remove_rmap_item_from_tree(rmap_item);
			return rmap_item;
		}
		if (rmap_item->address > addr)
			break;
		cur = cur->next;
		remove_rmap_item_from_tree(rmap_item);
		list_del(&rmap_item->link);
		free_rmap_item(rmap_item);
	}

	rmap_item = alloc_rmap_item();
	if (rmap_item) {
		/* It has already been zeroed */
		rmap_item->mm = mm_slot->mm;
		rmap_item->address = addr;
		list_add_tail(&rmap_item->link, cur);
	}
	return rmap_item;
}

static struct rmap_item *scan_get_next_rmap_item(struct page **page)
{
	struct mm_struct *mm;
	struct mm_slot *slot;
	struct vm_area_struct *vma;
	struct rmap_item *rmap_item;

	if (list_empty(&ksm_mm_head.mm_list))
		return NULL;

	slot = ksm_scan.mm_slot;
	if (slot == &ksm_mm_head) {
		root_unstable_tree = RB_ROOT;

		spin_lock(&ksm_mmlist_lock);
		slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
		ksm_scan.mm_slot = slot;
		spin_unlock(&ksm_mmlist_lock);
next_mm:
		ksm_scan.address = 0;
		ksm_scan.rmap_item = list_entry(&slot->rmap_list,
						struct rmap_item, link);
	}

	mm = slot->mm;
	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		vma = NULL;
	else
		vma = find_vma(mm, ksm_scan.address);

	for (; vma; vma = vma->vm_next) {
		if (!(vma->vm_flags & VM_MERGEABLE))
			continue;
		if (ksm_scan.address < vma->vm_start)
			ksm_scan.address = vma->vm_start;
		if (!vma->anon_vma)
			ksm_scan.address = vma->vm_end;

		while (ksm_scan.address < vma->vm_end) {
			if (ksm_test_exit(mm))
				break;
			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
			if (*page && PageAnon(*page)) {
				flush_anon_page(vma, *page, ksm_scan.address);
				flush_dcache_page(*page);
				rmap_item = get_next_rmap_item(slot,
					ksm_scan.rmap_item->link.next,
					ksm_scan.address);
				if (rmap_item) {
					ksm_scan.rmap_item = rmap_item;
					ksm_scan.address += PAGE_SIZE;
				} else
					put_page(*page);
				up_read(&mm->mmap_sem);
				return rmap_item;
			}
			if (*page)
				put_page(*page);
			ksm_scan.address += PAGE_SIZE;
			cond_resched();
		}
	}

	if (ksm_test_exit(mm)) {
		ksm_scan.address = 0;
		ksm_scan.rmap_item = list_entry(&slot->rmap_list,
						struct rmap_item, link);
	}
	/*
	 * Nuke all the rmap_items that are above this current rmap:
	 * because there were no VM_MERGEABLE vmas with such addresses.
	 */
	remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next);

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(slot->mm_list.next,
						struct mm_slot, mm_list);
	if (ksm_scan.address == 0) {
		/*
		 * We've completed a full scan of all vmas, holding mmap_sem
		 * throughout, and found no VM_MERGEABLE: so do the same as
		 * __ksm_exit does to remove this mm from all our lists now.
		 * This applies either when cleaning up after __ksm_exit
		 * (but beware: we can reach here even before __ksm_exit),
		 * or when all VM_MERGEABLE areas have been unmapped (and
		 * mmap_sem then protects against race with MADV_MERGEABLE).
		 */
		hlist_del(&slot->link);
		list_del(&slot->mm_list);
		spin_unlock(&ksm_mmlist_lock);

		free_mm_slot(slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		up_read(&mm->mmap_sem);
		mmdrop(mm);
	} else {
		spin_unlock(&ksm_mmlist_lock);
		up_read(&mm->mmap_sem);
	}

	/* Repeat until we've completed scanning the whole list */
	slot = ksm_scan.mm_slot;
	if (slot != &ksm_mm_head)
		goto next_mm;

	ksm_scan.seqnr++;
	return NULL;
}

/**
 * ksm_do_scan  - the ksm scanner main worker function.
 * @scan_npages - number of pages we want to scan before we return.
 */
static void ksm_do_scan(unsigned int scan_npages)
{
	struct rmap_item *rmap_item;
	struct page *page;

	while (scan_npages--) {
		cond_resched();
		rmap_item = scan_get_next_rmap_item(&page);
		if (!rmap_item)
			return;
		if (!PageKsm(page) || !in_stable_tree(rmap_item))
			cmp_and_merge_page(page, rmap_item);
		else if (page_mapcount(page) == 1) {
			/*
			 * Replace now-unshared ksm page by ordinary page.
			 */
			break_cow(rmap_item->mm, rmap_item->address);
			remove_rmap_item_from_tree(rmap_item);
			rmap_item->oldchecksum = calc_checksum(page);
		}
		put_page(page);
	}
}

static int ksmd_should_run(void)
{
	return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
}

static int ksm_scan_thread(void *nothing)
{
	set_user_nice(current, 5);

	while (!kthread_should_stop()) {
		mutex_lock(&ksm_thread_mutex);
		if (ksmd_should_run())
			ksm_do_scan(ksm_thread_pages_to_scan);
		mutex_unlock(&ksm_thread_mutex);

		if (ksmd_should_run()) {
			schedule_timeout_interruptible(
				msecs_to_jiffies(ksm_thread_sleep_millisecs));
		} else {
			wait_event_interruptible(ksm_thread_wait,
				ksmd_should_run() || kthread_should_stop());
		}
	}
	return 0;
}

int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
		unsigned long end, int advice, unsigned long *vm_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	int err;

	switch (advice) {
	case MADV_MERGEABLE:
		/*
		 * Be somewhat over-protective for now!
		 */
		if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
				 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
				 VM_RESERVED  | VM_HUGETLB | VM_INSERTPAGE |
				 VM_MIXEDMAP  | VM_SAO))
			return 0;		/* just ignore the advice */

		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
			err = __ksm_enter(mm);
			if (err)
				return err;
		}

		*vm_flags |= VM_MERGEABLE;
		break;

	case MADV_UNMERGEABLE:
		if (!(*vm_flags & VM_MERGEABLE))
			return 0;		/* just ignore the advice */

		if (vma->anon_vma) {
			err = unmerge_ksm_pages(vma, start, end);
			if (err)
				return err;
		}

		*vm_flags &= ~VM_MERGEABLE;
		break;
	}

	return 0;
}

int __ksm_enter(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int needs_wakeup;

	mm_slot = alloc_mm_slot();
	if (!mm_slot)
		return -ENOMEM;

	/* Check ksm_run too?  Would need tighter locking */
	needs_wakeup = list_empty(&ksm_mm_head.mm_list);

	spin_lock(&ksm_mmlist_lock);
	insert_to_mm_slots_hash(mm, mm_slot);
	/*
	 * Insert just behind the scanning cursor, to let the area settle
	 * down a little; when fork is followed by immediate exec, we don't
	 * want ksmd to waste time setting up and tearing down an rmap_list.
	 */
	list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
	spin_unlock(&ksm_mmlist_lock);

	set_bit(MMF_VM_MERGEABLE, &mm->flags);
	atomic_inc(&mm->mm_count);

	if (needs_wakeup)
		wake_up_interruptible(&ksm_thread_wait);

	return 0;
}

void __ksm_exit(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int easy_to_free = 0;

	/*
	 * This process is exiting: if it's straightforward (as is the
	 * case when ksmd was never running), free mm_slot immediately.
	 * But if it's at the cursor or has rmap_items linked to it, use
	 * mmap_sem to synchronize with any break_cows before pagetables
	 * are freed, and leave the mm_slot on the list for ksmd to free.
	 * Beware: ksm may already have noticed it exiting and freed the slot.
	 */

	spin_lock(&ksm_mmlist_lock);
	mm_slot = get_mm_slot(mm);
	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
		if (list_empty(&mm_slot->rmap_list)) {
			hlist_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			easy_to_free = 1;
		} else {
			list_move(&mm_slot->mm_list,
				  &ksm_scan.mm_slot->mm_list);
		}
	}
	spin_unlock(&ksm_mmlist_lock);

	if (easy_to_free) {
		free_mm_slot(mm_slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		mmdrop(mm);
	} else if (mm_slot) {
		down_write(&mm->mmap_sem);
		up_write(&mm->mmap_sem);
	}
}

#ifdef CONFIG_SYSFS
/*
 * This all compiles without CONFIG_SYSFS, but is a waste of space.
 */

#define KSM_ATTR_RO(_name) \
	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KSM_ATTR(_name) \
	static struct kobj_attribute _name##_attr = \
		__ATTR(_name, 0644, _name##_show, _name##_store)

static ssize_t sleep_millisecs_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
}

static ssize_t sleep_millisecs_store(struct kobject *kobj,
				     struct kobj_attribute *attr,
				     const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = strict_strtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	ksm_thread_sleep_millisecs = msecs;

	return count;
}
KSM_ATTR(sleep_millisecs);

static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
}

static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long nr_pages;

	err = strict_strtoul(buf, 10, &nr_pages);
	if (err || nr_pages > UINT_MAX)
		return -EINVAL;

	ksm_thread_pages_to_scan = nr_pages;

	return count;
}
KSM_ATTR(pages_to_scan);

static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf)
{
	return sprintf(buf, "%u\n", ksm_run);
}

static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
			 const char *buf, size_t count)
{
	int err;
	unsigned long flags;

	err = strict_strtoul(buf, 10, &flags);
	if (err || flags > UINT_MAX)
		return -EINVAL;
	if (flags > KSM_RUN_UNMERGE)
		return -EINVAL;

	/*
	 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
	 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
	 * breaking COW to free the unswappable pages_shared (but leaves
	 * mm_slots on the list for when ksmd may be set running again).
	 */

	mutex_lock(&ksm_thread_mutex);
	if (ksm_run != flags) {
		ksm_run = flags;
		if (flags & KSM_RUN_UNMERGE) {
			current->flags |= PF_OOM_ORIGIN;
			err = unmerge_and_remove_all_rmap_items();
			current->flags &= ~PF_OOM_ORIGIN;
			if (err) {
				ksm_run = KSM_RUN_STOP;
				count = err;
			}
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	if (flags & KSM_RUN_MERGE)
		wake_up_interruptible(&ksm_thread_wait);

	return count;
}
KSM_ATTR(run);

static ssize_t max_kernel_pages_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	int err;
	unsigned long nr_pages;

	err = strict_strtoul(buf, 10, &nr_pages);
	if (err)
		return -EINVAL;

	ksm_max_kernel_pages = nr_pages;

	return count;
}

static ssize_t max_kernel_pages_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_max_kernel_pages);
}
KSM_ATTR(max_kernel_pages);

static ssize_t pages_shared_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_shared);
}
KSM_ATTR_RO(pages_shared);

static ssize_t pages_sharing_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_sharing);
}
KSM_ATTR_RO(pages_sharing);

static ssize_t pages_unshared_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_unshared);
}
KSM_ATTR_RO(pages_unshared);

static ssize_t pages_volatile_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	long ksm_pages_volatile;

	ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
			   - ksm_pages_sharing - ksm_pages_unshared;
	/*
	 * It was not worth any locking to calculate that statistic,
	 * but it might therefore sometimes be negative: conceal that.
	 */
	if (ksm_pages_volatile < 0)
		ksm_pages_volatile = 0;
	return sprintf(buf, "%ld\n", ksm_pages_volatile);
}
KSM_ATTR_RO(pages_volatile);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_scan.seqnr);
}
KSM_ATTR_RO(full_scans);

static struct attribute *ksm_attrs[] = {
	&sleep_millisecs_attr.attr,
	&pages_to_scan_attr.attr,
	&run_attr.attr,
	&max_kernel_pages_attr.attr,
	&pages_shared_attr.attr,
	&pages_sharing_attr.attr,
	&pages_unshared_attr.attr,
	&pages_volatile_attr.attr,
	&full_scans_attr.attr,
	NULL,
};

static struct attribute_group ksm_attr_group = {
	.attrs = ksm_attrs,
	.name = "ksm",
};
#endif /* CONFIG_SYSFS */

static int __init ksm_init(void)
{
	struct task_struct *ksm_thread;
	int err;

	err = ksm_slab_init();
	if (err)
		goto out;

	err = mm_slots_hash_init();
	if (err)
		goto out_free1;

	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
	if (IS_ERR(ksm_thread)) {
		printk(KERN_ERR "ksm: creating kthread failed\n");
		err = PTR_ERR(ksm_thread);
		goto out_free2;
	}

#ifdef CONFIG_SYSFS
	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
	if (err) {
		printk(KERN_ERR "ksm: register sysfs failed\n");
		kthread_stop(ksm_thread);
		goto out_free2;
	}
#endif /* CONFIG_SYSFS */

	return 0;

out_free2:
	mm_slots_hash_free();
out_free1:
	ksm_slab_free();
out:
	return err;
}
module_init(ksm_init)
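
/*
 * With CONFIG_SYSFS, the attribute group above appears under
 * /sys/kernel/mm/ksm/.  A typical session from userspace (using the
 * default values set earlier in this file):
 *
 *	echo 200 > /sys/kernel/mm/ksm/pages_to_scan
 *	echo 20  > /sys/kernel/mm/ksm/sleep_millisecs
 *	echo 1   > /sys/kernel/mm/ksm/run	(KSM_RUN_MERGE: start ksmd)
 *	cat /sys/kernel/mm/ksm/pages_sharing	(pages saved by merging)
 *	echo 2   > /sys/kernel/mm/ksm/run	(KSM_RUN_UNMERGE: undo all)
 *
 * Memory areas are opted in per-vma with madvise(addr, length,
 * MADV_MERGEABLE), which reaches ksm_madvise() above.
 */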