slub.c revision 2cfb7455d223ab24b23df44be430faf92e12390f
/*
 * SLUB: A slab allocator that limits cache line use instead of queuing
 * objects in per cpu and per node lists.
 *
 * The allocator synchronizes using per slab locks and only
 * uses a centralized lock to manage a pool of partial slabs.
 *
 * (C) 2007 SGI, Christoph Lameter
 */

#include <linux/mm.h>
#include <linux/swap.h> /* struct reclaim_state */
#include <linux/module.h>
#include <linux/bit_spinlock.h>
#include <linux/interrupt.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmemcheck.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
#include <linux/ctype.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/memory.h>
#include <linux/math64.h>
#include <linux/fault-inject.h>

#include <trace/events/kmem.h>

/*
 * Lock order:
 *   1. slab_lock(page)
 *   2. slab->list_lock
 *
 * The slab_lock protects operations on the object of a particular
 * slab and its metadata in the page struct. If the slab lock
 * has been taken then no allocations nor frees can be performed
 * on the objects in the slab nor can the slab be added or removed
 * from the partial or full lists since this would mean modifying
 * the page_struct of the slab.
 *
 * The list_lock protects the partial and full list on each node and
 * the partial slab counter. If taken then no new slabs may be added or
 * removed from the lists nor may the number of partial slabs be modified.
 * (Note that the total number of slabs is an atomic value that may be
 * modified without taking the list lock).
 *
 * The list_lock is a centralized lock and thus we avoid taking it as
 * much as possible. As long as SLUB does not have to handle partial
 * slabs, operations can continue without any centralized lock. F.e.
 * allocating a long series of objects that fill up slabs does not require
 * the list lock.
 *
 * The lock order is sometimes inverted when we are trying to get a slab
 * off a list. We take the list_lock and then look for a page on the list
 * to use. While we do that objects in the slabs may be freed. We can
 * only operate on the slab if we have also taken the slab_lock. So we use
 * a slab_trylock() on the slab. If trylock was successful then no frees
 * can occur anymore and we can use the slab for allocations etc. If the
 * slab_trylock() does not succeed then frees are in progress in the slab and
 * we must stay away from it for a while since we may cause a bouncing
 * cacheline if we try to acquire the lock. So go onto the next slab.
 * If all pages are busy then we may allocate a new slab instead of reusing
 * a partial slab. A new slab has no one operating on it and thus there is
 * no danger of cacheline contention.
 *
 * Interrupts are disabled during allocation and deallocation in order to
 * make the slab allocator safe to use in the context of an irq. In addition
 * interrupts are disabled to ensure that the processor does not change
 * while handling per_cpu slabs, due to kernel preemption.
 *
 * SLUB assigns one slab for allocation to each processor.
 * Allocations only occur from these slabs called cpu slabs.
 *
 * Slabs with free elements are kept on a partial list and during regular
 * operations no list for full slabs is used. If an object in a full slab is
 * freed then the slab will show up again on the partial lists.
 * We track full slabs for debugging purposes though because otherwise we
 * cannot scan all objects.
 *
 * Slabs are freed when they become empty. Teardown and setup is
 * minimal so we rely on the page allocator's per cpu caches for
 * fast frees and allocs.
 *
 * Overloading of page flags that are otherwise used for LRU management.
 *
 * PageActive		The slab is frozen and exempt from list processing.
 * 			This means that the slab is dedicated to a purpose
 * 			such as satisfying allocations for a specific
 * 			processor. Objects may be freed in the slab while
 * 			it is frozen but slab_free will then skip the usual
 * 			list operations. It is up to the processor holding
 * 			the slab to integrate the slab into the slab lists
 * 			when the slab is no longer needed.
 *
 * 			One use of this flag is to mark slabs that are
 * 			used for allocations. Then such a slab becomes a cpu
 * 			slab. The cpu slab may be equipped with an additional
 * 			freelist that allows lockless access to
 * 			free objects in addition to the regular freelist
 * 			that requires the slab lock.
 *
 * PageError		Slab requires special handling due to debug
 * 			options set. This moves slab handling out of
 * 			the fast path and disables lockless freelists.
 */

#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
		SLAB_TRACE | SLAB_DEBUG_FREE)

static inline int kmem_cache_debug(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_DEBUG
	return unlikely(s->flags & SLAB_DEBUG_FLAGS);
#else
	return 0;
#endif
}
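/*
 * Illustrative sketch (documentation only, not part of the allocator):
 * the hot paths use kmem_cache_debug() as their single gate into the
 * checked, slower handling, roughly:
 *
 *	if (kmem_cache_debug(s))
 *		goto debug;
 *
 * so a cache without any SLAB_DEBUG_FLAGS set pays only a flag test
 * for the debug machinery on allocation and free.
 */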
/*
 * Issues still to be resolved:
 *
 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
 *
 * - Variable sizing of the per node arrays
 */

/* Enable to test recovery from slab corruption on boot */
#undef SLUB_RESILIENCY_TEST

/* Enable to log cmpxchg failures */
#undef SLUB_DEBUG_CMPXCHG

/*
 * Minimum number of partial slabs. These will be left on the partial
 * lists even if they are empty. kmem_cache_shrink may reclaim them.
 */
#define MIN_PARTIAL 5

/*
 * Maximum number of desirable partial slabs.
 * The existence of more partial slabs makes kmem_cache_shrink
 * sort the partial list by the number of objects in use.
 */
#define MAX_PARTIAL 10

#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
				SLAB_POISON | SLAB_STORE_USER)

/*
 * Debugging flags that require metadata to be stored in the slab. These get
 * disabled when slub_debug=O is used and a cache's min order increases with
 * metadata.
 */
#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)

/*
 * Set of flags that will prevent slab merging
 */
#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
		SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
		SLAB_FAILSLAB)

#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
		SLAB_CACHE_DMA | SLAB_NOTRACK)

#define OO_SHIFT	16
#define OO_MASK		((1 << OO_SHIFT) - 1)
#define MAX_OBJS_PER_PAGE	32767 /* since page.objects is u15 */

/* Internal SLUB flags */
#define __OBJECT_POISON		0x80000000UL /* Poison object */
#define __CMPXCHG_DOUBLE	0x40000000UL /* Use cmpxchg_double */

static int kmem_size = sizeof(struct kmem_cache);

#ifdef CONFIG_SMP
static struct notifier_block slab_notifier;
#endif

static enum {
	DOWN,		/* No slab functionality available */
	PARTIAL,	/* Kmem_cache_node works */
	UP,		/* Everything works but does not show up in sysfs */
	SYSFS		/* Sysfs up */
} slab_state = DOWN;

/* A list of all slab caches on the system */
static DECLARE_RWSEM(slub_lock);
static LIST_HEAD(slab_caches);

/*
 * Tracking user of a slab.
 */
struct track {
	unsigned long addr;	/* Called from address */
	int cpu;		/* Was running on cpu */
	int pid;		/* Pid context */
	unsigned long when;	/* When did the operation occur */
};

enum track_item { TRACK_ALLOC, TRACK_FREE };

#ifdef CONFIG_SYSFS
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
static void sysfs_slab_remove(struct kmem_cache *);

#else
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
							{ return 0; }
static inline void sysfs_slab_remove(struct kmem_cache *s)
{
	kfree(s->name);
	kfree(s);
}

#endif

static inline void stat(const struct kmem_cache *s, enum stat_item si)
{
#ifdef CONFIG_SLUB_STATS
	__this_cpu_inc(s->cpu_slab->stat[si]);
#endif
}

/********************************************************************
 * 			Core slab cache functions
 *******************************************************************/

int slab_is_available(void)
{
	return slab_state >= UP;
}

static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
{
	return s->node[node];
}

/* Verify that a pointer has an address that is valid within a slab page */
static inline int check_valid_pointer(struct kmem_cache *s,
				struct page *page, const void *object)
{
	void *base;

	if (!object)
		return 1;

	base = page_address(page);
	if (object < base || object >= base + page->objects * s->size ||
		(object - base) % s->size) {
		return 0;
	}

	return 1;
}

static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
	return *(void **)(object + s->offset);
}

static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
{
	void *p;

#ifdef CONFIG_DEBUG_PAGEALLOC
	probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p));
#else
	p = get_freepointer(s, object);
#endif
	return p;
}

static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
{
	*(void **)(object + s->offset) = fp;
}
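/*
 * Illustrative sketch (hypothetical helper, not used by the allocator):
 * the free objects of a slab form a singly linked chain threaded through
 * the objects themselves at s->offset, which is exactly what
 * get_freepointer()/set_freepointer() encapsulate. Counting the chain
 * assumes the slab cannot change underneath us (slab lock held):
 */
static inline int slub_doc_count_freelist(struct kmem_cache *s,
						struct page *page)
{
	void *p;
	int nr = 0;

	for (p = page->freelist; p; p = get_freepointer(s, p))
		nr++;
	return nr;
}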
/* Loop over all objects in a slab */
#define for_each_object(__p, __s, __addr, __objects) \
	for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
			__p += (__s)->size)

/* Determine object index from a given position */
static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
{
	return (p - addr) / s->size;
}

static inline size_t slab_ksize(const struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_DEBUG
	/*
	 * Debugging requires use of the padding between object
	 * and whatever may come after it.
	 */
	if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
		return s->objsize;

#endif
	/*
	 * If we have the need to store the freelist pointer
	 * back there or track user information then we can
	 * only use the space before that information.
	 */
	if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
		return s->inuse;
	/*
	 * Else we can use all the padding etc for the allocation
	 */
	return s->size;
}

static inline int order_objects(int order, unsigned long size, int reserved)
{
	return ((PAGE_SIZE << order) - reserved) / size;
}

static inline struct kmem_cache_order_objects oo_make(int order,
		unsigned long size, int reserved)
{
	struct kmem_cache_order_objects x = {
		(order << OO_SHIFT) + order_objects(order, size, reserved)
	};

	return x;
}

static inline int oo_order(struct kmem_cache_order_objects x)
{
	return x.x >> OO_SHIFT;
}

static inline int oo_objects(struct kmem_cache_order_objects x)
{
	return x.x & OO_MASK;
}

static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
		void *freelist_old, unsigned long counters_old,
		void *freelist_new, unsigned long counters_new,
		const char *n)
{
#ifdef CONFIG_CMPXCHG_DOUBLE
	if (s->flags & __CMPXCHG_DOUBLE) {
		if (cmpxchg_double(&page->freelist,
			freelist_old, counters_old,
			freelist_new, counters_new))
			return 1;
	} else
#endif
	{
		if (page->freelist == freelist_old && page->counters == counters_old) {
			page->freelist = freelist_new;
			page->counters = counters_new;
			return 1;
		}
	}

	cpu_relax();
	stat(s, CMPXCHG_DOUBLE_FAIL);

#ifdef SLUB_DEBUG_CMPXCHG
	printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
#endif

	return 0;
}

#ifdef CONFIG_SLUB_DEBUG
/*
 * Determine a map of objects in use on a page.
 *
 * Slab lock or node listlock must be held to guarantee that the page does
 * not vanish from under us.
 */
static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
{
	void *p;
	void *addr = page_address(page);

	for (p = page->freelist; p; p = get_freepointer(s, p))
		set_bit(slab_index(p, s, addr), map);
}
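/*
 * Illustrative sketch (hypothetical helper): get_map() sets one bit per
 * free object, so an object is in use iff its bit stays clear. Assumes
 * @map was zeroed by the caller, covers at least page->objects bits,
 * and that the slab lock or the node's list_lock is held.
 */
static inline int slub_doc_object_in_use(struct kmem_cache *s,
		struct page *page, void *object, unsigned long *map)
{
	get_map(s, page, map);
	return !test_bit(slab_index(object, s, page_address(page)), map);
}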
/*
 * Debug settings:
 */
#ifdef CONFIG_SLUB_DEBUG_ON
static int slub_debug = DEBUG_DEFAULT_FLAGS;
#else
static int slub_debug;
#endif

static char *slub_debug_slabs;
static int disable_higher_order_debug;

/*
 * Object debugging
 */
static void print_section(char *text, u8 *addr, unsigned int length)
{
	int i, offset;
	int newline = 1;
	char ascii[17];

	ascii[16] = 0;

	for (i = 0; i < length; i++) {
		if (newline) {
			printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
			newline = 0;
		}
		printk(KERN_CONT " %02x", addr[i]);
		offset = i % 16;
		ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
		if (offset == 15) {
			printk(KERN_CONT " %s\n", ascii);
			newline = 1;
		}
	}
	if (!newline) {
		i %= 16;
		while (i < 16) {
			printk(KERN_CONT "   ");
			ascii[i] = ' ';
			i++;
		}
		printk(KERN_CONT " %s\n", ascii);
	}
}

static struct track *get_track(struct kmem_cache *s, void *object,
	enum track_item alloc)
{
	struct track *p;

	if (s->offset)
		p = object + s->offset + sizeof(void *);
	else
		p = object + s->inuse;

	return p + alloc;
}

static void set_track(struct kmem_cache *s, void *object,
			enum track_item alloc, unsigned long addr)
{
	struct track *p = get_track(s, object, alloc);

	if (addr) {
		p->addr = addr;
		p->cpu = smp_processor_id();
		p->pid = current->pid;
		p->when = jiffies;
	} else
		memset(p, 0, sizeof(struct track));
}

static void init_tracking(struct kmem_cache *s, void *object)
{
	if (!(s->flags & SLAB_STORE_USER))
		return;

	set_track(s, object, TRACK_FREE, 0UL);
	set_track(s, object, TRACK_ALLOC, 0UL);
}

static void print_track(const char *s, struct track *t)
{
	if (!t->addr)
		return;

	printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
		s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
}

static void print_tracking(struct kmem_cache *s, void *object)
{
	if (!(s->flags & SLAB_STORE_USER))
		return;

	print_track("Allocated", get_track(s, object, TRACK_ALLOC));
	print_track("Freed", get_track(s, object, TRACK_FREE));
}

static void print_page_info(struct page *page)
{
	printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
		page, page->objects, page->inuse, page->freelist, page->flags);

}

static void slab_bug(struct kmem_cache *s, char *fmt, ...)
{
	va_list args;
	char buf[100];

	va_start(args, fmt);
	vsnprintf(buf, sizeof(buf), fmt, args);
	va_end(args);
	printk(KERN_ERR "========================================"
			"=====================================\n");
	printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
	printk(KERN_ERR "----------------------------------------"
			"-------------------------------------\n\n");
}
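/*
 * Illustrative sketch (hypothetical helper): set_track() stamps cpu, pid
 * and jiffies into the tracking area behind the object, so the age of
 * the most recent allocation can be recovered. Assumes SLAB_STORE_USER
 * is set for @s, otherwise no tracking data exists.
 */
static inline unsigned long slub_doc_alloc_age(struct kmem_cache *s,
						void *object)
{
	struct track *t = get_track(s, object, TRACK_ALLOC);

	return t->addr ? jiffies - t->when : 0;
}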
static void slab_fix(struct kmem_cache *s, char *fmt, ...)
{
	va_list args;
	char buf[100];

	va_start(args, fmt);
	vsnprintf(buf, sizeof(buf), fmt, args);
	va_end(args);
	printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
}

static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
{
	unsigned int off;	/* Offset of last byte */
	u8 *addr = page_address(page);

	print_tracking(s, p);

	print_page_info(page);

	printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
			p, p - addr, get_freepointer(s, p));

	if (p > addr + 16)
		print_section("Bytes b4", p - 16, 16);

	print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE));

	if (s->flags & SLAB_RED_ZONE)
		print_section("Redzone", p + s->objsize,
			s->inuse - s->objsize);

	if (s->offset)
		off = s->offset + sizeof(void *);
	else
		off = s->inuse;

	if (s->flags & SLAB_STORE_USER)
		off += 2 * sizeof(struct track);

	if (off != s->size)
		/* Beginning of the filler is the free pointer */
		print_section("Padding", p + off, s->size - off);

	dump_stack();
}

static void object_err(struct kmem_cache *s, struct page *page,
			u8 *object, char *reason)
{
	slab_bug(s, "%s", reason);
	print_trailer(s, page, object);
}

static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
{
	va_list args;
	char buf[100];

	va_start(args, fmt);
	vsnprintf(buf, sizeof(buf), fmt, args);
	va_end(args);
	slab_bug(s, "%s", buf);
	print_page_info(page);
	dump_stack();
}

static void init_object(struct kmem_cache *s, void *object, u8 val)
{
	u8 *p = object;

	if (s->flags & __OBJECT_POISON) {
		memset(p, POISON_FREE, s->objsize - 1);
		p[s->objsize - 1] = POISON_END;
	}

	if (s->flags & SLAB_RED_ZONE)
		memset(p + s->objsize, val, s->inuse - s->objsize);
}

static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
{
	while (bytes) {
		if (*start != (u8)value)
			return start;
		start++;
		bytes--;
	}
	return NULL;
}

static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
						void *from, void *to)
{
	slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
	memset(from, data, to - from);
}

static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
			u8 *object, char *what,
			u8 *start, unsigned int value, unsigned int bytes)
{
	u8 *fault;
	u8 *end;

	fault = check_bytes(start, value, bytes);
	if (!fault)
		return 1;

	end = start + bytes;
	while (end > fault && end[-1] == value)
		end--;

	slab_bug(s, "%s overwritten", what);
	printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
					fault, end - 1, fault[0], value);
	print_trailer(s, page, object);

	restore_bytes(s, what, value, fault, end);
	return 0;
}
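/*
 * Illustrative sketch (hypothetical helper): check_bytes() returns the
 * first deviating byte, so verifying that a freed object still carries
 * its poison pattern (POISON_FREE payload, POISON_END terminator) looks
 * like this. Assumes __OBJECT_POISON is active for @s.
 */
static inline int slub_doc_poison_intact(struct kmem_cache *s, void *object)
{
	u8 *p = object;

	return !check_bytes(p, POISON_FREE, s->objsize - 1) &&
		p[s->objsize - 1] == POISON_END;
}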
/*
 * Object layout:
 *
 * object address
 * 	Bytes of the object to be managed.
 * 	If the freepointer may overlay the object then the free
 * 	pointer is the first word of the object.
 *
 * 	Poisoning uses 0x6b (POISON_FREE) and the last byte is
 * 	0xa5 (POISON_END)
 *
 * object + s->objsize
 * 	Padding to reach word boundary. This is also used for Redzoning.
 * 	Padding is extended by another word if Redzoning is enabled and
 * 	objsize == inuse.
 *
 * 	We fill with 0xbb (RED_INACTIVE) for inactive objects and with
 * 	0xcc (RED_ACTIVE) for objects in use.
 *
 * object + s->inuse
 * 	Meta data starts here.
 *
 * 	A. Free pointer (if we cannot overwrite object on free)
 * 	B. Tracking data for SLAB_STORE_USER
 * 	C. Padding to reach required alignment boundary or at minimum
 * 		one word if debugging is on to be able to detect writes
 * 		before the word boundary.
 *
 *	Padding is done using 0x5a (POISON_INUSE)
 *
 * object + s->size
 * 	Nothing is used beyond s->size.
 *
 * If slabcaches are merged then the objsize and inuse boundaries are mostly
 * ignored. And therefore no slab options that rely on these boundaries
 * may be used with merged slabcaches.
 */

static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
{
	unsigned long off = s->inuse;	/* The end of info */

	if (s->offset)
		/* Freepointer is placed after the object. */
		off += sizeof(void *);

	if (s->flags & SLAB_STORE_USER)
		/* We also have user information there */
		off += 2 * sizeof(struct track);

	if (s->size == off)
		return 1;

	return check_bytes_and_report(s, page, p, "Object padding",
				p + off, POISON_INUSE, s->size - off);
}

/* Check the pad bytes at the end of a slab page */
static int slab_pad_check(struct kmem_cache *s, struct page *page)
{
	u8 *start;
	u8 *fault;
	u8 *end;
	int length;
	int remainder;

	if (!(s->flags & SLAB_POISON))
		return 1;

	start = page_address(page);
	length = (PAGE_SIZE << compound_order(page)) - s->reserved;
	end = start + length;
	remainder = length % s->size;
	if (!remainder)
		return 1;

	fault = check_bytes(end - remainder, POISON_INUSE, remainder);
	if (!fault)
		return 1;
	while (end > fault && end[-1] == POISON_INUSE)
		end--;

	slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
	print_section("Padding", end - remainder, remainder);

	restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
	return 0;
}

static int check_object(struct kmem_cache *s, struct page *page,
					void *object, u8 val)
{
	u8 *p = object;
	u8 *endobject = object + s->objsize;

	if (s->flags & SLAB_RED_ZONE) {
		if (!check_bytes_and_report(s, page, object, "Redzone",
			endobject, val, s->inuse - s->objsize))
			return 0;
	} else {
		if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
			check_bytes_and_report(s, page, p, "Alignment padding",
				endobject, POISON_INUSE, s->inuse - s->objsize);
		}
	}

	if (s->flags & SLAB_POISON) {
		if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
			(!check_bytes_and_report(s, page, p, "Poison", p,
					POISON_FREE, s->objsize - 1) ||
			 !check_bytes_and_report(s, page, p, "Poison",
				p + s->objsize - 1, POISON_END, 1)))
			return 0;
		/*
		 * check_pad_bytes cleans up on its own.
		 */
		check_pad_bytes(s, page, p);
	}

	if (!s->offset && val == SLUB_RED_ACTIVE)
		/*
		 * Object and freepointer overlap. Cannot check
		 * freepointer while object is allocated.
		 */
		return 1;

	/* Check free pointer validity */
	if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
		object_err(s, page, p, "Freepointer corrupt");
		/*
		 * No choice but to zap it and thus lose the remainder
		 * of the free objects in this slab. May cause
		 * another error because the object count is now wrong.
		 */
		set_freepointer(s, p, NULL);
		return 0;
	}
	return 1;
}

static int check_slab(struct kmem_cache *s, struct page *page)
{
	int maxobj;

	VM_BUG_ON(!irqs_disabled());

	if (!PageSlab(page)) {
		slab_err(s, page, "Not a valid slab page");
		return 0;
	}

	maxobj = order_objects(compound_order(page), s->size, s->reserved);
	if (page->objects > maxobj) {
		slab_err(s, page, "objects %u > max %u",
			page->objects, maxobj);
		return 0;
	}
	if (page->inuse > page->objects) {
		slab_err(s, page, "inuse %u > max %u",
			page->inuse, page->objects);
		return 0;
	}
	/* Slab_pad_check fixes things up after itself */
	slab_pad_check(s, page);
	return 1;
}

/*
 * Determine if a certain object on a page is on the freelist. Must hold the
 * slab lock to guarantee that the chains are in a consistent state.
 */
static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
{
	int nr = 0;
	void *fp = page->freelist;
	void *object = NULL;
	unsigned long max_objects;

	while (fp && nr <= page->objects) {
		if (fp == search)
			return 1;
		if (!check_valid_pointer(s, page, fp)) {
			if (object) {
				object_err(s, page, object,
					"Freechain corrupt");
				set_freepointer(s, object, NULL);
				break;
			} else {
				slab_err(s, page, "Freepointer corrupt");
				page->freelist = NULL;
				page->inuse = page->objects;
				slab_fix(s, "Freelist cleared");
				return 0;
			}
			break;
		}
		object = fp;
		fp = get_freepointer(s, object);
		nr++;
	}

	max_objects = order_objects(compound_order(page), s->size, s->reserved);
	if (max_objects > MAX_OBJS_PER_PAGE)
		max_objects = MAX_OBJS_PER_PAGE;

	if (page->objects != max_objects) {
		slab_err(s, page, "Wrong number of objects. Found %d but "
			"should be %d", page->objects, max_objects);
		page->objects = max_objects;
		slab_fix(s, "Number of objects adjusted.");
	}
	if (page->inuse != page->objects - nr) {
		slab_err(s, page, "Wrong object count. Counter is %d but "
			"counted were %d", page->inuse, page->objects - nr);
		page->inuse = page->objects - nr;
		slab_fix(s, "Object count adjusted.");
	}
	return search == NULL;
}

static void trace(struct kmem_cache *s, struct page *page, void *object,
								int alloc)
{
	if (s->flags & SLAB_TRACE) {
		printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
			s->name,
			alloc ? "alloc" : "free",
			object, page->inuse,
			page->freelist);

		if (!alloc)
			print_section("Object", (void *)object, s->objsize);

		dump_stack();
	}
}
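/*
 * Example output (illustrative values only) emitted by trace() above
 * when a cache is created with SLAB_TRACE:
 *
 *	TRACE kmalloc-64 alloc 0xffff88003c4a8020 inuse=12 fp=0xffff88003c4a8060
 *
 * followed by a stack dump, and additionally a hex dump of the object
 * contents on free.
 */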
/*
 * Hooks for other subsystems that check memory allocations. In a typical
 * production configuration these hooks all should produce no code at all.
 */
static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
{
	flags &= gfp_allowed_mask;
	lockdep_trace_alloc(flags);
	might_sleep_if(flags & __GFP_WAIT);

	return should_failslab(s->objsize, flags, s->flags);
}

static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
{
	flags &= gfp_allowed_mask;
	kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
	kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags);
}

static inline void slab_free_hook(struct kmem_cache *s, void *x)
{
	kmemleak_free_recursive(x, s->flags);

	/*
	 * Trouble is that we may no longer disable interrupts in the fast path
	 * So in order to make the debug calls that expect irqs to be
	 * disabled we need to disable interrupts temporarily.
	 */
#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
	{
		unsigned long flags;

		local_irq_save(flags);
		kmemcheck_slab_free(s, x, s->objsize);
		debug_check_no_locks_freed(x, s->objsize);
		local_irq_restore(flags);
	}
#endif
	if (!(s->flags & SLAB_DEBUG_OBJECTS))
		debug_check_no_obj_freed(x, s->objsize);
}

/*
 * Tracking of fully allocated slabs for debugging purposes.
 *
 * list_lock must be held.
 */
static void add_full(struct kmem_cache *s,
	struct kmem_cache_node *n, struct page *page)
{
	if (!(s->flags & SLAB_STORE_USER))
		return;

	list_add(&page->lru, &n->full);
}

/*
 * list_lock must be held.
 */
static void remove_full(struct kmem_cache *s, struct page *page)
{
	if (!(s->flags & SLAB_STORE_USER))
		return;

	list_del(&page->lru);
}

/* Tracking of the number of slabs for debugging purposes */
static inline unsigned long slabs_node(struct kmem_cache *s, int node)
{
	struct kmem_cache_node *n = get_node(s, node);

	return atomic_long_read(&n->nr_slabs);
}

static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
{
	return atomic_long_read(&n->nr_slabs);
}

static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
{
	struct kmem_cache_node *n = get_node(s, node);

	/*
	 * May be called early in order to allocate a slab for the
	 * kmem_cache_node structure. Solve the chicken-egg
	 * dilemma by deferring the increment of the count during
	 * bootstrap (see early_kmem_cache_node_alloc).
	 */
	if (n) {
		atomic_long_inc(&n->nr_slabs);
		atomic_long_add(objects, &n->total_objects);
	}
}
static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
{
	struct kmem_cache_node *n = get_node(s, node);

	atomic_long_dec(&n->nr_slabs);
	atomic_long_sub(objects, &n->total_objects);
}

/* Object debug checks for alloc/free paths */
static void setup_object_debug(struct kmem_cache *s, struct page *page,
								void *object)
{
	if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
		return;

	init_object(s, object, SLUB_RED_INACTIVE);
	init_tracking(s, object);
}

static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *page,
					void *object, unsigned long addr)
{
	if (!check_slab(s, page))
		goto bad;

	if (!check_valid_pointer(s, page, object)) {
		object_err(s, page, object, "Freelist Pointer check fails");
		goto bad;
	}

	if (!check_object(s, page, object, SLUB_RED_INACTIVE))
		goto bad;

	/* Success. Perform special debug activities for allocs */
	if (s->flags & SLAB_STORE_USER)
		set_track(s, object, TRACK_ALLOC, addr);
	trace(s, page, object, 1);
	init_object(s, object, SLUB_RED_ACTIVE);
	return 1;

bad:
	if (PageSlab(page)) {
		/*
		 * If this is a slab page then lets do the best we can
		 * to avoid issues in the future. Marking all objects
		 * as used avoids touching the remaining objects.
		 */
		slab_fix(s, "Marking all objects used");
		page->inuse = page->objects;
		page->freelist = NULL;
	}
	return 0;
}

static noinline int free_debug_processing(struct kmem_cache *s,
		 struct page *page, void *object, unsigned long addr)
{
	if (!check_slab(s, page))
		goto fail;

	if (!check_valid_pointer(s, page, object)) {
		slab_err(s, page, "Invalid object pointer 0x%p", object);
		goto fail;
	}

	if (on_freelist(s, page, object)) {
		object_err(s, page, object, "Object already free");
		goto fail;
	}

	if (!check_object(s, page, object, SLUB_RED_ACTIVE))
		return 0;

	if (unlikely(s != page->slab)) {
		if (!PageSlab(page)) {
			slab_err(s, page, "Attempt to free object(0x%p) "
				"outside of slab", object);
		} else if (!page->slab) {
			printk(KERN_ERR
				"SLUB <none>: no slab for object 0x%p.\n",
						object);
			dump_stack();
		} else
			object_err(s, page, object,
					"page slab pointer corrupt.");
		goto fail;
	}

	if (s->flags & SLAB_STORE_USER)
		set_track(s, object, TRACK_FREE, addr);
	trace(s, page, object, 0);
	init_object(s, object, SLUB_RED_INACTIVE);
	return 1;

fail:
	slab_fix(s, "Object at 0x%p not freed", object);
	return 0;
}

static int __init setup_slub_debug(char *str)
{
	slub_debug = DEBUG_DEFAULT_FLAGS;
	if (*str++ != '=' || !*str)
		/*
		 * No options specified. Switch on full debugging.
		 */
		goto out;

	if (*str == ',')
		/*
		 * No options but restriction on slabs. This means full
		 * debugging for slabs matching a pattern.
		 */
		goto check_slabs;

	if (tolower(*str) == 'o') {
		/*
		 * Avoid enabling debugging on caches if its minimum order
		 * would increase as a result.
		 */
		disable_higher_order_debug = 1;
		goto out;
	}

	slub_debug = 0;
	if (*str == '-')
		/*
		 * Switch off all debugging measures.
		 */
		goto out;

	/*
	 * Determine which debug features should be switched on
	 */
	for (; *str && *str != ','; str++) {
		switch (tolower(*str)) {
		case 'f':
			slub_debug |= SLAB_DEBUG_FREE;
			break;
		case 'z':
			slub_debug |= SLAB_RED_ZONE;
			break;
		case 'p':
			slub_debug |= SLAB_POISON;
			break;
		case 'u':
			slub_debug |= SLAB_STORE_USER;
			break;
		case 't':
			slub_debug |= SLAB_TRACE;
			break;
		case 'a':
			slub_debug |= SLAB_FAILSLAB;
			break;
		default:
			printk(KERN_ERR "slub_debug option '%c' "
				"unknown. skipped\n", *str);
		}
	}

check_slabs:
	if (*str == ',')
		slub_debug_slabs = str + 1;
out:
	return 1;
}

__setup("slub_debug", setup_slub_debug);
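/*
 * Usage examples (documentation only) for the parser above, as given on
 * the kernel command line:
 *
 *	slub_debug		enable all default debug options
 *	slub_debug=FZ		sanity checks plus red zoning, all caches
 *	slub_debug=,dentry	full debugging, but only for the dentry cache
 *	slub_debug=O		enable debugging, but switch it off for
 *				caches where the metadata would raise the
 *				minimum slab order
 */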
static unsigned long kmem_cache_flags(unsigned long objsize,
	unsigned long flags, const char *name,
	void (*ctor)(void *))
{
	/*
	 * Enable debugging if selected on the kernel command line.
	 */
	if (slub_debug && (!slub_debug_slabs ||
		!strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))
		flags |= slub_debug;

	return flags;
}
#else
static inline void setup_object_debug(struct kmem_cache *s,
			struct page *page, void *object) {}

static inline int alloc_debug_processing(struct kmem_cache *s,
	struct page *page, void *object, unsigned long addr) { return 0; }

static inline int free_debug_processing(struct kmem_cache *s,
	struct page *page, void *object, unsigned long addr) { return 0; }

static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
			{ return 1; }
static inline int check_object(struct kmem_cache *s, struct page *page,
			void *object, u8 val) { return 1; }
static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
			struct page *page) {}
static inline void remove_full(struct kmem_cache *s, struct page *page) {}
static inline unsigned long kmem_cache_flags(unsigned long objsize,
	unsigned long flags, const char *name,
	void (*ctor)(void *))
{
	return flags;
}
#define slub_debug 0

#define disable_higher_order_debug 0

static inline unsigned long slabs_node(struct kmem_cache *s, int node)
							{ return 0; }
static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
							{ return 0; }
static inline void inc_slabs_node(struct kmem_cache *s, int node,
							int objects) {}
static inline void dec_slabs_node(struct kmem_cache *s, int node,
							int objects) {}

static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
							{ return 0; }

static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
		void *object) {}

static inline void slab_free_hook(struct kmem_cache *s, void *x) {}

#endif /* CONFIG_SLUB_DEBUG */

/*
 * Slab allocation and freeing
 */
static inline struct page *alloc_slab_page(gfp_t flags, int node,
					struct kmem_cache_order_objects oo)
{
	int order = oo_order(oo);

	flags |= __GFP_NOTRACK;

	if (node == NUMA_NO_NODE)
		return alloc_pages(flags, order);
	else
		return alloc_pages_exact_node(node, flags, order);
}

static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
{
	struct page *page;
	struct kmem_cache_order_objects oo = s->oo;
	gfp_t alloc_gfp;

	flags &= gfp_allowed_mask;

	if (flags & __GFP_WAIT)
		local_irq_enable();

	flags |= s->allocflags;

	/*
	 * Let the initial higher-order allocation fail under memory pressure
	 * so we fall back to the minimum order allocation.
	 */
	alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;

	page = alloc_slab_page(alloc_gfp, node, oo);
	if (unlikely(!page)) {
		oo = s->min;
		/*
		 * Allocation may have failed due to fragmentation.
		 * Try a lower order alloc if possible
		 */
		page = alloc_slab_page(flags, node, oo);

		if (page)
			stat(s, ORDER_FALLBACK);
	}

	if (flags & __GFP_WAIT)
		local_irq_disable();

	if (!page)
		return NULL;

	if (kmemcheck_enabled
		&& !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
		int pages = 1 << oo_order(oo);

		kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);

		/*
		 * Objects from caches that have a constructor don't get
		 * cleared when they're allocated, so we need to do it here.
		 */
		if (s->ctor)
			kmemcheck_mark_uninitialized_pages(page, pages);
		else
			kmemcheck_mark_unallocated_pages(page, pages);
	}

	page->objects = oo_objects(oo);
	mod_zone_page_state(page_zone(page),
		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
		1 << oo_order(oo));

	return page;
}

static void setup_object(struct kmem_cache *s, struct page *page,
				void *object)
{
	setup_object_debug(s, page, object);
	if (unlikely(s->ctor))
		s->ctor(object);
}

static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
	struct page *page;
	void *start;
	void *last;
	void *p;

	BUG_ON(flags & GFP_SLAB_BUG_MASK);

	page = allocate_slab(s,
		flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
	if (!page)
		goto out;

	inc_slabs_node(s, page_to_nid(page), page->objects);
	page->slab = s;
	page->flags |= 1 << PG_slab;

	start = page_address(page);

	if (unlikely(s->flags & SLAB_POISON))
		memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page));

	last = start;
	for_each_object(p, s, start, page->objects) {
		setup_object(s, page, last);
		set_freepointer(s, last, p);
		last = p;
	}
	setup_object(s, page, last);
	set_freepointer(s, last, NULL);

	page->freelist = start;
	page->inuse = 0;
	page->frozen = 1;
out:
	return page;
}
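/*
 * Illustrative sketch (hypothetical helper, not used anywhere): new_slab()
 * above leaves the objects chained through their free pointers with
 * page->freelist pointing at the first one. Handing out an object is then
 * a matter of popping the chain, assuming the caller holds the slab lock
 * so the page is not concurrently modified:
 */
static inline void *slub_doc_pop_object(struct kmem_cache *s,
					struct page *page)
{
	void *object = page->freelist;

	if (object) {
		page->freelist = get_freepointer(s, object);
		page->inuse++;
	}
	return object;
}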
static void __free_slab(struct kmem_cache *s, struct page *page)
{
	int order = compound_order(page);
	int pages = 1 << order;

	if (kmem_cache_debug(s)) {
		void *p;

		slab_pad_check(s, page);
		for_each_object(p, s, page_address(page),
						page->objects)
			check_object(s, page, p, SLUB_RED_INACTIVE);
	}

	kmemcheck_free_shadow(page, compound_order(page));

	mod_zone_page_state(page_zone(page),
		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
		-pages);

	__ClearPageSlab(page);
	reset_page_mapcount(page);
	if (current->reclaim_state)
		current->reclaim_state->reclaimed_slab += pages;
	__free_pages(page, order);
}

#define need_reserve_slab_rcu						\
	(sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))

static void rcu_free_slab(struct rcu_head *h)
{
	struct page *page;

	if (need_reserve_slab_rcu)
		page = virt_to_head_page(h);
	else
		page = container_of((struct list_head *)h, struct page, lru);

	__free_slab(page->slab, page);
}

static void free_slab(struct kmem_cache *s, struct page *page)
{
	if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
		struct rcu_head *head;

		if (need_reserve_slab_rcu) {
			int order = compound_order(page);
			int offset = (PAGE_SIZE << order) - s->reserved;

			VM_BUG_ON(s->reserved != sizeof(*head));
			head = page_address(page) + offset;
		} else {
			/*
			 * RCU free overloads the RCU head over the LRU
			 */
			head = (void *)&page->lru;
		}

		call_rcu(head, rcu_free_slab);
	} else
		__free_slab(s, page);
}

static void discard_slab(struct kmem_cache *s, struct page *page)
{
	dec_slabs_node(s, page_to_nid(page), page->objects);
	free_slab(s, page);
}

/*
 * Per slab locking using the pagelock
 */
static __always_inline void slab_lock(struct page *page)
{
	bit_spin_lock(PG_locked, &page->flags);
}

static __always_inline void slab_unlock(struct page *page)
{
	__bit_spin_unlock(PG_locked, &page->flags);
}

static __always_inline int slab_trylock(struct page *page)
{
	int rc = 1;

	rc = bit_spin_trylock(PG_locked, &page->flags);
	return rc;
}
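/*
 * Illustrative sketch (documentation only): the per-slab lock is a bit
 * spinlock on the page's PG_locked flag, so a critical section over
 * slab metadata takes the form:
 *
 *	slab_lock(page);
 *	... inspect or modify page->freelist, page->inuse ...
 *	slab_unlock(page);
 *
 * with slab_trylock() as the non-blocking variant used when walking the
 * partial lists.
 */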
/*
 * Management of partially allocated slabs.
 *
 * list_lock must be held.
 */
static inline void add_partial(struct kmem_cache_node *n,
				struct page *page, int tail)
{
	n->nr_partial++;
	if (tail)
		list_add_tail(&page->lru, &n->partial);
	else
		list_add(&page->lru, &n->partial);
}

/*
 * list_lock must be held.
 */
static inline void remove_partial(struct kmem_cache_node *n,
					struct page *page)
{
	list_del(&page->lru);
	n->nr_partial--;
}

/*
 * Lock slab, remove from the partial list and put the object into the
 * per cpu freelist.
 *
 * Must hold list_lock.
 */
static inline int lock_and_freeze_slab(struct kmem_cache *s,
		struct kmem_cache_node *n, struct page *page)
{
	void *freelist;
	unsigned long counters;
	struct page new;


	if (!slab_trylock(page))
		return 0;

	/*
	 * Zap the freelist and set the frozen bit.
	 * The old freelist is the list of objects for the
	 * per cpu allocation list.
	 */
	do {
		freelist = page->freelist;
		counters = page->counters;
		new.counters = counters;
		new.inuse = page->objects;

		VM_BUG_ON(new.frozen);
		new.frozen = 1;

	} while (!cmpxchg_double_slab(s, page,
			freelist, counters,
			NULL, new.counters,
			"lock and freeze"));

	remove_partial(n, page);

	if (freelist) {
		/* Populate the per cpu freelist */
		this_cpu_write(s->cpu_slab->freelist, freelist);
		this_cpu_write(s->cpu_slab->page, page);
		this_cpu_write(s->cpu_slab->node, page_to_nid(page));
		return 1;
	} else {
		/*
		 * Slab page came from the wrong list. No object to allocate
		 * from. Put it onto the correct list and continue partial
		 * scan.
		 */
		printk(KERN_ERR "SLUB: %s : Page without available objects on"
			" partial list\n", s->name);
		slab_unlock(page);
		return 0;
	}
}

/*
 * Try to allocate a partial slab from a specific node.
 */
static struct page *get_partial_node(struct kmem_cache *s,
					struct kmem_cache_node *n)
{
	struct page *page;

	/*
	 * Racy check. If we mistakenly see no partial slabs then we
	 * just allocate an empty slab. If we mistakenly try to get a
	 * partial slab and there is none available then get_partial()
	 * will return NULL.
	 */
	if (!n || !n->nr_partial)
		return NULL;

	spin_lock(&n->list_lock);
	list_for_each_entry(page, &n->partial, lru)
		if (lock_and_freeze_slab(s, n, page))
			goto out;
	page = NULL;
out:
	spin_unlock(&n->list_lock);
	return page;
}

/*
 * Get a page from somewhere. Search in increasing NUMA distances.
 */
static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
{
#ifdef CONFIG_NUMA
	struct zonelist *zonelist;
	struct zoneref *z;
	struct zone *zone;
	enum zone_type high_zoneidx = gfp_zone(flags);
	struct page *page;

	/*
	 * The defrag ratio allows a configuration of the tradeoffs between
	 * inter node defragmentation and node local allocations. A lower
	 * defrag_ratio increases the tendency to do local allocations
	 * instead of attempting to obtain partial slabs from other nodes.
	 *
	 * If the defrag_ratio is set to 0 then kmalloc() always
	 * returns node local objects. If the ratio is higher then kmalloc()
	 * may return off node objects because partial slabs are obtained
	 * from other nodes and filled up.
	 *
	 * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes
	 * defrag_ratio = 1000) then every (well almost) allocation will
	 * first attempt to defrag slab caches on other nodes. This means
	 * scanning over all nodes to look for partial slabs which may be
	 * expensive if we do it every time we are trying to find a slab
	 * with available objects.
	 */
	if (!s->remote_node_defrag_ratio ||
			get_cycles() % 1024 > s->remote_node_defrag_ratio)
		return NULL;

	get_mems_allowed();
	zonelist = node_zonelist(slab_node(current->mempolicy), flags);
	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
		struct kmem_cache_node *n;

		n = get_node(s, zone_to_nid(zone));

		if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
				n->nr_partial > s->min_partial) {
			page = get_partial_node(s, n);
			if (page) {
				put_mems_allowed();
				return page;
			}
		}
	}
	put_mems_allowed();
#endif
	return NULL;
}

/*
 * Get a partial page, lock it and return it.
 */
static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
{
	struct page *page;
	int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;

	page = get_partial_node(s, get_node(s, searchnode));
	if (page || node != NUMA_NO_NODE)
		return page;

	return get_any_partial(s, flags);
}

#ifdef CONFIG_PREEMPT
/*
 * Calculate the next globally unique transaction for disambiguation
 * during cmpxchg. The transactions start with the cpu number and are then
 * incremented by CONFIG_NR_CPUS.
 */
#define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
#else
/*
 * No preemption supported therefore also no need to check for
 * different cpus.
 */
#define TID_STEP 1
#endif

static inline unsigned long next_tid(unsigned long tid)
{
	return tid + TID_STEP;
}

static inline unsigned int tid_to_cpu(unsigned long tid)
{
	return tid % TID_STEP;
}

static inline unsigned long tid_to_event(unsigned long tid)
{
	return tid / TID_STEP;
}

static inline unsigned int init_tid(int cpu)
{
	return cpu;
}
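/*
 * Illustrative sketch (hypothetical helper): under CONFIG_PREEMPT a tid
 * encodes the owning cpu in its low bits and an event counter above
 * them, e.g. with TID_STEP 256 the tids of cpu 3 run 3, 259, 515, ...
 * The helpers above recover both parts:
 */
static inline int slub_doc_tid_sane(unsigned long tid, int cpu)
{
	return tid_to_cpu(tid) == (unsigned int)cpu &&
		tid_to_event(next_tid(tid)) == tid_to_event(tid) + 1;
}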
static inline void note_cmpxchg_failure(const char *n,
		const struct kmem_cache *s, unsigned long tid)
{
#ifdef SLUB_DEBUG_CMPXCHG
	unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);

	printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name);

#ifdef CONFIG_PREEMPT
	if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
		printk("due to cpu change %d -> %d\n",
			tid_to_cpu(tid), tid_to_cpu(actual_tid));
	else
#endif
	if (tid_to_event(tid) != tid_to_event(actual_tid))
		printk("due to cpu running other code. Event %ld->%ld\n",
			tid_to_event(tid), tid_to_event(actual_tid));
	else
		printk("for unknown reason: actual=%lx was=%lx target=%lx\n",
			actual_tid, tid, next_tid(tid));
#endif
	stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
}

void init_kmem_cache_cpus(struct kmem_cache *s)
{
	int cpu;

	for_each_possible_cpu(cpu)
		per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
}

/*
 * Remove the cpu slab
 */
static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
	enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
	struct page *page = c->page;
	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
	int lock = 0;
	enum slab_modes l = M_NONE, m = M_NONE;
	void *freelist;
	void *nextfree;
	int tail = 0;
	struct page new;
	struct page old;

	if (page->freelist) {
		stat(s, DEACTIVATE_REMOTE_FREES);
		tail = 1;
	}

	c->tid = next_tid(c->tid);
	c->page = NULL;
	freelist = c->freelist;
	c->freelist = NULL;

	/*
	 * Stage one: Free all available per cpu objects back
	 * to the page freelist while it is still frozen. Leave the
	 * last one.
	 *
	 * There is no need to take the list->lock because the page
	 * is still frozen.
	 */
	while (freelist && (nextfree = get_freepointer(s, freelist))) {
		void *prior;
		unsigned long counters;

		do {
			prior = page->freelist;
			counters = page->counters;
			set_freepointer(s, freelist, prior);
			new.counters = counters;
			new.inuse--;
			VM_BUG_ON(!new.frozen);

		} while (!cmpxchg_double_slab(s, page,
			prior, counters,
			freelist, new.counters,
			"drain percpu freelist"));

		freelist = nextfree;
	}

	/*
	 * Stage two: Ensure that the page is unfrozen while the
	 * list presence reflects the actual number of objects
	 * during unfreeze.
	 *
	 * We setup the list membership and then perform a cmpxchg
	 * with the count. If there is a mismatch then the page
	 * is not unfrozen but the page is on the wrong list.
	 *
	 * Then we restart the process which may have to remove
	 * the page from the list that we just put it on again
	 * because the number of objects in the slab may have
	 * changed.
	 */
redo:

	old.freelist = page->freelist;
	old.counters = page->counters;
	VM_BUG_ON(!old.frozen);

	/* Determine target state of the slab */
	new.counters = old.counters;
	if (freelist) {
		new.inuse--;
		set_freepointer(s, freelist, old.freelist);
		new.freelist = freelist;
	} else
		new.freelist = old.freelist;

	new.frozen = 0;

	if (!new.inuse && n->nr_partial > s->min_partial)
		m = M_FREE;
	else if (new.freelist) {
		m = M_PARTIAL;
		if (!lock) {
			lock = 1;
			/*
			 * Taking the spinlock removes the possibility
			 * that lock_and_freeze_slab() will see a slab page
			 * that is frozen
			 */
			spin_lock(&n->list_lock);
		}
	} else {
		m = M_FULL;
		if (kmem_cache_debug(s) && !lock) {
			lock = 1;
			/*
			 * This also ensures that the scanning of full
			 * slabs from diagnostic functions will not see
			 * any frozen slabs.
			 */
			spin_lock(&n->list_lock);
		}
	}

	if (l != m) {

		if (l == M_PARTIAL)

			remove_partial(n, page);

		else if (l == M_FULL)

			remove_full(s, page);

		if (m == M_PARTIAL) {

			add_partial(n, page, tail);
			stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);

		} else if (m == M_FULL) {

			stat(s, DEACTIVATE_FULL);
			add_full(s, n, page);

		}
	}

	l = m;
	if (!cmpxchg_double_slab(s, page,
				old.freelist, old.counters,
				new.freelist, new.counters,
				"unfreezing slab"))
		goto redo;

	slab_unlock(page);

	if (lock)
		spin_unlock(&n->list_lock);

	if (m == M_FREE) {
		stat(s, DEACTIVATE_EMPTY);
		discard_slab(s, page);
		stat(s, FREE_SLAB);
	}
}

static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
	stat(s, CPUSLAB_FLUSH);
	slab_lock(c->page);
	deactivate_slab(s, c);
}

/*
 * Flush cpu slab.
 *
 * Called from IPI handler with interrupts disabled.
 */
static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
{
	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);

	if (likely(c && c->page))
		flush_slab(s, c);
}

static void flush_cpu_slab(void *d)
{
	struct kmem_cache *s = d;

	__flush_cpu_slab(s, smp_processor_id());
}

static void flush_all(struct kmem_cache *s)
{
	on_each_cpu(flush_cpu_slab, s, 1);
}

/*
 * Check if the objects in a per cpu structure fit numa
 * locality expectations.
 */
static inline int node_match(struct kmem_cache_cpu *c, int node)
{
#ifdef CONFIG_NUMA
	if (node != NUMA_NO_NODE && c->node != node)
		return 0;
#endif
	return 1;
}

static int count_free(struct page *page)
{
	return page->objects - page->inuse;
}

static unsigned long count_partial(struct kmem_cache_node *n,
					int (*get_count)(struct page *))
{
	unsigned long flags;
	unsigned long x = 0;
	struct page *page;

	spin_lock_irqsave(&n->list_lock, flags);
	list_for_each_entry(page, &n->partial, lru)
		x += get_count(page);
	spin_unlock_irqrestore(&n->list_lock, flags);
	return x;
}
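/*
 * Illustrative sketch (hypothetical helper): count_partial() walks one
 * node's partial list under its list_lock, so the free objects held by
 * a node's partial slabs can be summed with the callback above:
 */
static inline unsigned long slub_doc_node_free_objects(struct kmem_cache *s,
							int node)
{
	return count_partial(get_node(s, node), count_free);
}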
static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
{
#ifdef CONFIG_SLUB_DEBUG
	return atomic_long_read(&n->total_objects);
#else
	return 0;
#endif
}

static noinline void
slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
{
	int node;

	printk(KERN_WARNING
		"SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
		nid, gfpflags);
	printk(KERN_WARNING "  cache: %s, object size: %d, buffer size: %d, "
		"default order: %d, min order: %d\n", s->name, s->objsize,
		s->size, oo_order(s->oo), oo_order(s->min));

	if (oo_order(s->min) > get_order(s->objsize))
		printk(KERN_WARNING "  %s debugging increased min order, use "
		       "slub_debug=O to disable.\n", s->name);

	for_each_online_node(node) {
		struct kmem_cache_node *n = get_node(s, node);
		unsigned long nr_slabs;
		unsigned long nr_objs;
		unsigned long nr_free;

		if (!n)
			continue;

		nr_free  = count_partial(n, count_free);
		nr_slabs = node_nr_slabs(n);
		nr_objs  = node_nr_objs(n);

		printk(KERN_WARNING
			"  node %d: slabs: %ld, objs: %ld, free: %ld\n",
			node, nr_slabs, nr_objs, nr_free);
	}
}

/*
 * Slow path. The lockless freelist is empty or we need to perform
 * debugging duties.
 *
 * Interrupts are disabled.
 *
 * Processing is still very fast if new objects have been freed to the
 * regular freelist. In that case we simply take over the regular freelist
 * as the lockless freelist and zap the regular freelist.
 *
 * If that is not working then we fall back to the partial lists. We take the
 * first element of the freelist as the object to allocate now and move the
 * rest of the freelist to the lockless freelist.
 *
 * And if we were unable to get a new slab from the partial slab lists then
 * we need to allocate a new slab. This is the slowest path since it involves
 * a call to the page allocator and the setup of a new slab.
 */
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c)
{
	void **object;
	struct page *page;
	unsigned long flags;
	struct page new;
	unsigned long counters;

	local_irq_save(flags);
#ifdef CONFIG_PREEMPT
	/*
	 * We may have been preempted and rescheduled on a different
	 * cpu before disabling interrupts. Need to reload cpu area
	 * pointer.
	 */
	c = this_cpu_ptr(s->cpu_slab);
#endif

	/* We handle __GFP_ZERO in the caller */
	gfpflags &= ~__GFP_ZERO;

	page = c->page;
	if (!page)
		goto new_slab;

	slab_lock(page);
	if (unlikely(!node_match(c, node)))
		goto another_slab;

	stat(s, ALLOC_SLOWPATH);

	do {
		object = page->freelist;
		counters = page->counters;
		new.counters = counters;
		new.inuse = page->objects;
		VM_BUG_ON(!new.frozen);

	} while (!cmpxchg_double_slab(s, page,
			object, counters,
			NULL, new.counters,
			"__slab_alloc"));

load_freelist:
	VM_BUG_ON(!page->frozen);

	if (unlikely(!object))
		goto another_slab;

	stat(s, ALLOC_REFILL);

	slab_unlock(page);

	c->freelist = get_freepointer(s, object);
	c->tid = next_tid(c->tid);
	local_irq_restore(flags);
	return object;

another_slab:
	deactivate_slab(s, c);

new_slab:
	page = get_partial(s, gfpflags, node);
	if (page) {
		stat(s, ALLOC_FROM_PARTIAL);
		object = c->freelist;

		if (kmem_cache_debug(s))
			goto debug;
		goto load_freelist;
	}

	page = new_slab(s, gfpflags, node);

	if (page) {
		c = __this_cpu_ptr(s->cpu_slab);
		if (c->page)
			flush_slab(s, c);

		/*
		 * No other reference to the page yet so we can
		 * muck around with it freely without cmpxchg
		 */
		object = page->freelist;
		page->freelist = NULL;
		page->inuse = page->objects;

		stat(s, ALLOC_SLAB);
		slab_lock(page);
		c->node = page_to_nid(page);
		c->page = page;
		goto load_freelist;
	}
	if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
		slab_out_of_memory(s, gfpflags, node);
	local_irq_restore(flags);
	return NULL;

debug:
	if (!object || !alloc_debug_processing(s, page, object, addr))
		goto new_slab;

	c->freelist = get_freepointer(s, object);
	deactivate_slab(s, c);
	c->page = NULL;
	c->node = NUMA_NO_NODE;
	local_irq_restore(flags);
	return object;
}
that allocation functions (kmalloc, kmem_cache_alloc)
2058 * have the fastpath folded into their functions. So no function call
2059 * overhead for requests that can be satisfied on the fastpath.
2060 *
2061 * The fastpath works by first checking if the lockless freelist can be used.
2062 * If not then __slab_alloc is called for slow processing.
2063 *
2064 * Otherwise we can simply pick the next object from the lockless free list.
2065 */
2066static __always_inline void *slab_alloc(struct kmem_cache *s,
2067 gfp_t gfpflags, int node, unsigned long addr)
2068{
2069 void **object;
2070 struct kmem_cache_cpu *c;
2071 unsigned long tid;
2072
2073 if (slab_pre_alloc_hook(s, gfpflags))
2074 return NULL;
2075
2076redo:
2077
2078 /*
2079 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
2080 * enabled. We may switch back and forth between cpus while
2081 * reading from one cpu area. That does not matter as long
2082 * as we end up on the original cpu again when doing the cmpxchg.
2083 */
2084 c = __this_cpu_ptr(s->cpu_slab);
2085
2086 /*
2087 * The transaction ids are globally unique per cpu and per operation on
2088 * a per cpu queue. Thus they guarantee that the cmpxchg_double
2089 * occurs on the right processor and that there was no operation on the
2090 * linked list in between.
2091 */
2092 tid = c->tid;
2093 barrier();
2094
2095 object = c->freelist;
2096 if (unlikely(!object || !node_match(c, node)))
2097
2098 object = __slab_alloc(s, gfpflags, node, addr, c);
2099
2100 else {
2101 /*
2102 * The cmpxchg will only match if there was no additional
2103 * operation and if we are on the right processor.
2104 *
2105 * The cmpxchg does the following atomically (without lock semantics!)
2106 * 1. Relocate first pointer to the current per cpu area.
2107 * 2. Verify that tid and freelist have not been changed
2108 * 3. If they were not changed replace tid and freelist
2109 *
2110 * Since this is without lock semantics the protection is only against
2111 * code executing on this cpu *not* from access by other cpus.
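 *
 * For illustration, assuming the tid scheme defined earlier in this
 * file (a cpu's tids start at its cpu number and advance in steps of
 * at least the number of possible cpus): on a 4-cpu preemptible kernel
 * cpu 1 issues tids 1, 5, 9, ... while cpu 2 issues 2, 6, 10, ...
 * A tid sampled on one cpu can thus never match a tid produced on
 * another cpu, and a stale tid on the same cpu means another
 * allocation or free slipped in; either way the cmpxchg below fails
 * and we retry from redo.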
2112 */
2113 if (unlikely(!irqsafe_cpu_cmpxchg_double(
2114 s->cpu_slab->freelist, s->cpu_slab->tid,
2115 object, tid,
2116 get_freepointer_safe(s, object), next_tid(tid)))) {
2117
2118 note_cmpxchg_failure("slab_alloc", s, tid);
2119 goto redo;
2120 }
2121 stat(s, ALLOC_FASTPATH);
2122 }
2123
2124 if (unlikely(gfpflags & __GFP_ZERO) && object)
2125 memset(object, 0, s->objsize);
2126
2127 slab_post_alloc_hook(s, gfpflags, object);
2128
2129 return object;
2130}
2131
2132void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
2133{
2134 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
2135
2136 trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags);
2137
2138 return ret;
2139}
2140EXPORT_SYMBOL(kmem_cache_alloc);
2141
2142#ifdef CONFIG_TRACING
2143void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
2144{
2145 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
2146 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
2147 return ret;
2148}
2149EXPORT_SYMBOL(kmem_cache_alloc_trace);
2150
2151void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
2152{
2153 void *ret = kmalloc_order(size, flags, order);
2154 trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
2155 return ret;
2156}
2157EXPORT_SYMBOL(kmalloc_order_trace);
2158#endif
2159
2160#ifdef CONFIG_NUMA
2161void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
2162{
2163 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
2164
2165 trace_kmem_cache_alloc_node(_RET_IP_, ret,
2166 s->objsize, s->size, gfpflags, node);
2167
2168 return ret;
2169}
2170EXPORT_SYMBOL(kmem_cache_alloc_node);
2171
2172#ifdef CONFIG_TRACING
2173void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
2174 gfp_t gfpflags,
2175 int node, size_t size)
2176{
2177 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
2178
2179 trace_kmalloc_node(_RET_IP_, ret,
2180 size, s->size, gfpflags, node);
2181 return ret;
2182}
2183EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
2184#endif
2185#endif
2186
2187/*
2188 * Slow path handling. This may still be called frequently since objects
2189 * have a longer lifetime than the cpu slabs in most processing loads.
2190 *
2191 * So we still attempt to reduce cache line usage. Just take the slab
2192 * lock and free the item. If there is no additional partial page
2193 * handling required then we can return immediately.
2194 */
2195static void __slab_free(struct kmem_cache *s, struct page *page,
2196 void *x, unsigned long addr)
2197{
2198 void *prior;
2199 void **object = (void *)x;
2200 int was_frozen;
2201 int inuse;
2202 struct page new;
2203 unsigned long counters;
2204 struct kmem_cache_node *n = NULL;
2205 unsigned long uninitialized_var(flags);
2206
2207 local_irq_save(flags);
2208 slab_lock(page);
2209 stat(s, FREE_SLOWPATH);
2210
2211 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr))
2212 goto out_unlock;
2213
2214 do {
2215 prior = page->freelist;
2216 counters = page->counters;
2217 set_freepointer(s, object, prior);
2218 new.counters = counters;
2219 was_frozen = new.frozen;
2220 new.inuse--;
2221 if ((!new.inuse || !prior) && !was_frozen && !n) {
2222 n = get_node(s, page_to_nid(page));
2223 /*
2224 * Speculatively acquire the list_lock.
2225 * If the cmpxchg does not succeed then we may
2226 * drop the list_lock without any processing.
2227 *
2228 * Otherwise the list_lock will synchronize with
2229 * other processors updating the list of slabs.
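 *
 * Illustrative scenario (two concurrent frees): cpu A frees the last
 * object of a slab while cpu B frees the second-to-last one. Both can
 * compute a "new" state that needs list handling and both may reach
 * this point and serialize on the list_lock, but whoever loses the
 * cmpxchg_double_slab() race below loops back with "n" already set,
 * so the lock is taken at most once per caller and is simply dropped
 * again if no list work turns out to be needed.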
2230 */
2231 spin_lock(&n->list_lock);
2232 }
2233 inuse = new.inuse;
2234
2235 } while (!cmpxchg_double_slab(s, page,
2236 prior, counters,
2237 object, new.counters,
2238 "__slab_free"));
2239
2240 if (likely(!n)) {
2241 /*
2242 * The list lock was not taken therefore no list
2243 * activity can be necessary.
2244 */
2245 if (was_frozen)
2246 stat(s, FREE_FROZEN);
2247 goto out_unlock;
2248 }
2249
2250 /*
2251 * was_frozen may have been set after we acquired the list_lock in
2252 * an earlier loop. So we need to check it here again.
2253 */
2254 if (was_frozen)
2255 stat(s, FREE_FROZEN);
2256 else {
2257 if (unlikely(!inuse && n->nr_partial > s->min_partial))
2258 goto slab_empty;
2259
2260 /*
2261 * Objects left in the slab. If it was not on the partial list before
2262 * then add it.
2263 */
2264 if (unlikely(!prior)) {
2265 remove_full(s, page);
2266 add_partial(n, page, 0);
2267 stat(s, FREE_ADD_PARTIAL);
2268 }
2269 }
2270
2271 spin_unlock(&n->list_lock);
2272
2273out_unlock:
2274 slab_unlock(page);
2275 local_irq_restore(flags);
2276 return;
2277
2278slab_empty:
2279 if (prior) {
2280 /*
2281 * Slab still on the partial list.
2282 */
2283 remove_partial(n, page);
2284 stat(s, FREE_REMOVE_PARTIAL);
2285 }
2286
2287 spin_unlock(&n->list_lock);
2288 slab_unlock(page);
2289 local_irq_restore(flags);
2290 stat(s, FREE_SLAB);
2291 discard_slab(s, page);
2292}
2293
2294/*
2295 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
2296 * can perform fastpath freeing without additional function calls.
2297 *
2298 * The fastpath is only possible if we are freeing to the current cpu slab
2299 * of this processor. This is typically the case if we have just allocated
2300 * the item before.
2301 *
2302 * If fastpath is not possible then fall back to __slab_free where we deal
2303 * with all sorts of special processing.
2304 */
2305static __always_inline void slab_free(struct kmem_cache *s,
2306 struct page *page, void *x, unsigned long addr)
2307{
2308 void **object = (void *)x;
2309 struct kmem_cache_cpu *c;
2310 unsigned long tid;
2311
2312 slab_free_hook(s, x);
2313
2314redo:
2315
2316 /*
2317 * Determine the current cpu's per cpu slab.
2318 * The cpu may change afterward. However that does not matter since
2319 * data is retrieved via this pointer. If we are on the same cpu
2320 * during the cmpxchg then the free will succeed.
2321 */
2322 c = __this_cpu_ptr(s->cpu_slab);
2323
2324 tid = c->tid;
2325 barrier();
2326
2327 if (likely(page == c->page)) {
2328 set_freepointer(s, object, c->freelist);
2329
2330 if (unlikely(!irqsafe_cpu_cmpxchg_double(
2331 s->cpu_slab->freelist, s->cpu_slab->tid,
2332 c->freelist, tid,
2333 object, next_tid(tid)))) {
2334
2335 note_cmpxchg_failure("slab_free", s, tid);
2336 goto redo;
2337 }
2338 stat(s, FREE_FASTPATH);
2339 } else
2340 __slab_free(s, page, x, addr);
2341
2342}
2343
2344void kmem_cache_free(struct kmem_cache *s, void *x)
2345{
2346 struct page *page;
2347
2348 page = virt_to_head_page(x);
2349
2350 slab_free(s, page, x, _RET_IP_);
2351
2352 trace_kmem_cache_free(_RET_IP_, x);
2353}
2354EXPORT_SYMBOL(kmem_cache_free);
2355
2356/*
2357 * Object placement in a slab is made very easy because we always start at
2358 * offset 0. If we tune the size of the object to the alignment then we can
2359 * get the required alignment by putting one properly sized object after
2360 * another.
2361 *
2362 * Notice that the allocation order determines the sizes of the per cpu
2363 * caches.
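 *
 * (For instance, assuming 4K pages: an order-0 slab of 256-byte
 * objects gives each cpu a private stock of 16 objects to hand out
 * without any locking; an order-1 slab doubles that to 32.)
 *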
Each processor has always one slab available for allocations.
2364 * Increasing the allocation order reduces the number of times that slabs
2365 * must be moved on and off the partial lists and is therefore a factor in
2366 * locking overhead.
2367 */
2368
2369/*
2370 * Minimum / Maximum order of slab pages. This influences locking overhead
2371 * and slab fragmentation. A higher order reduces the number of partial slabs
2372 * and increases the number of allocations possible without having to
2373 * take the list_lock.
2374 */
2375static int slub_min_order;
2376static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
2377static int slub_min_objects;
2378
2379/*
2380 * Merge control. If this is set then no merging of slab caches will occur.
2381 * (Could be removed. This was introduced to pacify the merge skeptics.)
2382 */
2383static int slub_nomerge;
2384
2385/*
2386 * Calculate the order of allocation given a slab object size.
2387 *
2388 * The order of allocation has significant impact on performance and other
2389 * system components. Generally order 0 allocations should be preferred since
2390 * order 0 does not cause fragmentation in the page allocator. Larger objects
2391 * can be problematic to put into order 0 slabs because there may be too much
2392 * unused space left. We go to a higher order if more than 1/16th of the slab
2393 * would be wasted.
2394 *
2395 * In order to reach satisfactory performance we must ensure that a minimum
2396 * number of objects is in one slab. Otherwise we may generate too much
2397 * activity on the partial lists which requires taking the list_lock. This is
2398 * less a concern for large slabs though which are rarely used.
2399 *
2400 * slub_max_order specifies the order where we begin to stop considering the
2401 * number of objects in a slab as critical. If we reach slub_max_order then
2402 * we try to keep the page order as low as possible. So we accept more waste
2403 * of space in favor of a small page order.
2404 *
2405 * Higher order allocations also allow the placement of more objects in a
2406 * slab and thereby reduce object handling overhead. If the user has
2407 * requested a higher minimum order then we start with that one instead of
2408 * the smallest order which will fit the object.
2409 */
2410static inline int slab_order(int size, int min_objects,
2411 int max_order, int fract_leftover, int reserved)
2412{
2413 int order;
2414 int rem;
2415 int min_order = slub_min_order;
2416
2417 if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
2418 return get_order(size * MAX_OBJS_PER_PAGE) - 1;
2419
2420 for (order = max(min_order,
2421 fls(min_objects * size - 1) - PAGE_SHIFT);
2422 order <= max_order; order++) {
2423
2424 unsigned long slab_size = PAGE_SIZE << order;
2425
2426 if (slab_size < min_objects * size + reserved)
2427 continue;
2428
2429 rem = (slab_size - reserved) % size;
2430
2431 if (rem <= slab_size / fract_leftover)
2432 break;
2433
2434 }
2435
2436 return order;
2437}
2438
2439static inline int calculate_order(int size, int reserved)
2440{
2441 int order;
2442 int min_objects;
2443 int fraction;
2444 int max_objects;
2445
2446 /*
2447 * Attempt to find best configuration for a slab. This
2448 * works by first attempting to generate a layout with
2449 * the best configuration and backing off gradually.
2450 *
2451 * First we reduce the acceptable waste in a slab. Then
2452 * we reduce the minimum objects required in a slab.
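 *
 * Worked example (illustrative numbers, assuming 4K pages and
 * reserved == 0): for a 700-byte object with min_objects == 8 and
 * fraction == 16, slab_order() starts at order 1, the smallest order
 * that fits 8 objects. An 8K slab holds 11 such objects and leaves
 * 8192 - 11 * 700 = 492 bytes over, which is within 8192 / 16 = 512,
 * so order 1 is accepted. Only when every fraction down to 4 fails
 * does the loop below lower min_objects.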
2453 */ 2454 min_objects = slub_min_objects; 2455 if (!min_objects) 2456 min_objects = 4 * (fls(nr_cpu_ids) + 1); 2457 max_objects = order_objects(slub_max_order, size, reserved); 2458 min_objects = min(min_objects, max_objects); 2459 2460 while (min_objects > 1) { 2461 fraction = 16; 2462 while (fraction >= 4) { 2463 order = slab_order(size, min_objects, 2464 slub_max_order, fraction, reserved); 2465 if (order <= slub_max_order) 2466 return order; 2467 fraction /= 2; 2468 } 2469 min_objects--; 2470 } 2471 2472 /* 2473 * We were unable to place multiple objects in a slab. Now 2474 * lets see if we can place a single object there. 2475 */ 2476 order = slab_order(size, 1, slub_max_order, 1, reserved); 2477 if (order <= slub_max_order) 2478 return order; 2479 2480 /* 2481 * Doh this slab cannot be placed using slub_max_order. 2482 */ 2483 order = slab_order(size, 1, MAX_ORDER, 1, reserved); 2484 if (order < MAX_ORDER) 2485 return order; 2486 return -ENOSYS; 2487} 2488 2489/* 2490 * Figure out what the alignment of the objects will be. 2491 */ 2492static unsigned long calculate_alignment(unsigned long flags, 2493 unsigned long align, unsigned long size) 2494{ 2495 /* 2496 * If the user wants hardware cache aligned objects then follow that 2497 * suggestion if the object is sufficiently large. 2498 * 2499 * The hardware cache alignment cannot override the specified 2500 * alignment though. If that is greater then use it. 2501 */ 2502 if (flags & SLAB_HWCACHE_ALIGN) { 2503 unsigned long ralign = cache_line_size(); 2504 while (size <= ralign / 2) 2505 ralign /= 2; 2506 align = max(align, ralign); 2507 } 2508 2509 if (align < ARCH_SLAB_MINALIGN) 2510 align = ARCH_SLAB_MINALIGN; 2511 2512 return ALIGN(align, sizeof(void *)); 2513} 2514 2515static void 2516init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) 2517{ 2518 n->nr_partial = 0; 2519 spin_lock_init(&n->list_lock); 2520 INIT_LIST_HEAD(&n->partial); 2521#ifdef CONFIG_SLUB_DEBUG 2522 atomic_long_set(&n->nr_slabs, 0); 2523 atomic_long_set(&n->total_objects, 0); 2524 INIT_LIST_HEAD(&n->full); 2525#endif 2526} 2527 2528static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) 2529{ 2530 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < 2531 SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); 2532 2533 /* 2534 * Must align to double word boundary for the double cmpxchg 2535 * instructions to work; see __pcpu_double_call_return_bool(). 2536 */ 2537 s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2538 2 * sizeof(void *)); 2539 2540 if (!s->cpu_slab) 2541 return 0; 2542 2543 init_kmem_cache_cpus(s); 2544 2545 return 1; 2546} 2547 2548static struct kmem_cache *kmem_cache_node; 2549 2550/* 2551 * No kmalloc_node yet so do it by hand. We know that this is the first 2552 * slab on the node for this slabcache. There are no concurrent accesses 2553 * possible. 2554 * 2555 * Note that this function only works on the kmalloc_node_cache 2556 * when allocating for the kmalloc_node_cache. This is used for bootstrapping 2557 * memory on a fresh node that has no slab structures yet. 
2558 */ 2559static void early_kmem_cache_node_alloc(int node) 2560{ 2561 struct page *page; 2562 struct kmem_cache_node *n; 2563 2564 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); 2565 2566 page = new_slab(kmem_cache_node, GFP_NOWAIT, node); 2567 2568 BUG_ON(!page); 2569 if (page_to_nid(page) != node) { 2570 printk(KERN_ERR "SLUB: Unable to allocate memory from " 2571 "node %d\n", node); 2572 printk(KERN_ERR "SLUB: Allocating a useless per node structure " 2573 "in order to be able to continue\n"); 2574 } 2575 2576 n = page->freelist; 2577 BUG_ON(!n); 2578 page->freelist = get_freepointer(kmem_cache_node, n); 2579 page->inuse++; 2580 page->frozen = 0; 2581 kmem_cache_node->node[node] = n; 2582#ifdef CONFIG_SLUB_DEBUG 2583 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); 2584 init_tracking(kmem_cache_node, n); 2585#endif 2586 init_kmem_cache_node(n, kmem_cache_node); 2587 inc_slabs_node(kmem_cache_node, node, page->objects); 2588 2589 add_partial(n, page, 0); 2590} 2591 2592static void free_kmem_cache_nodes(struct kmem_cache *s) 2593{ 2594 int node; 2595 2596 for_each_node_state(node, N_NORMAL_MEMORY) { 2597 struct kmem_cache_node *n = s->node[node]; 2598 2599 if (n) 2600 kmem_cache_free(kmem_cache_node, n); 2601 2602 s->node[node] = NULL; 2603 } 2604} 2605 2606static int init_kmem_cache_nodes(struct kmem_cache *s) 2607{ 2608 int node; 2609 2610 for_each_node_state(node, N_NORMAL_MEMORY) { 2611 struct kmem_cache_node *n; 2612 2613 if (slab_state == DOWN) { 2614 early_kmem_cache_node_alloc(node); 2615 continue; 2616 } 2617 n = kmem_cache_alloc_node(kmem_cache_node, 2618 GFP_KERNEL, node); 2619 2620 if (!n) { 2621 free_kmem_cache_nodes(s); 2622 return 0; 2623 } 2624 2625 s->node[node] = n; 2626 init_kmem_cache_node(n, s); 2627 } 2628 return 1; 2629} 2630 2631static void set_min_partial(struct kmem_cache *s, unsigned long min) 2632{ 2633 if (min < MIN_PARTIAL) 2634 min = MIN_PARTIAL; 2635 else if (min > MAX_PARTIAL) 2636 min = MAX_PARTIAL; 2637 s->min_partial = min; 2638} 2639 2640/* 2641 * calculate_sizes() determines the order and the distribution of data within 2642 * a slab object. 2643 */ 2644static int calculate_sizes(struct kmem_cache *s, int forced_order) 2645{ 2646 unsigned long flags = s->flags; 2647 unsigned long size = s->objsize; 2648 unsigned long align = s->align; 2649 int order; 2650 2651 /* 2652 * Round up object size to the next word boundary. We can only 2653 * place the free pointer at word boundaries and this determines 2654 * the possible location of the free pointer. 2655 */ 2656 size = ALIGN(size, sizeof(void *)); 2657 2658#ifdef CONFIG_SLUB_DEBUG 2659 /* 2660 * Determine if we can poison the object itself. If the user of 2661 * the slab may touch the object after free or before allocation 2662 * then we should never poison the object itself. 2663 */ 2664 if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && 2665 !s->ctor) 2666 s->flags |= __OBJECT_POISON; 2667 else 2668 s->flags &= ~__OBJECT_POISON; 2669 2670 2671 /* 2672 * If we are Redzoning then check if there is some space between the 2673 * end of the object and the free pointer. If not then add an 2674 * additional word to have some bytes to store Redzone information. 2675 */ 2676 if ((flags & SLAB_RED_ZONE) && size == s->objsize) 2677 size += sizeof(void *); 2678#endif 2679 2680 /* 2681 * With that we have determined the number of bytes in actual use 2682 * by the object. This is the potential offset to the free pointer. 
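 *
 * Worked example (illustrative, 64-bit, SLAB_POISON | SLAB_STORE_USER,
 * no red zone): a 20-byte object is first rounded up to 24 bytes.
 * Poisoning forbids reusing the freed object itself, so the code below
 * relocates the free pointer to s->offset == 24 (8 more bytes) and
 * appends two struct track records for the alloc and free callers;
 * s->size thus ends up at 32 + 2 * sizeof(struct track), rounded up
 * to the computed alignment.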
2683 */ 2684 s->inuse = size; 2685 2686 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || 2687 s->ctor)) { 2688 /* 2689 * Relocate free pointer after the object if it is not 2690 * permitted to overwrite the first word of the object on 2691 * kmem_cache_free. 2692 * 2693 * This is the case if we do RCU, have a constructor or 2694 * destructor or are poisoning the objects. 2695 */ 2696 s->offset = size; 2697 size += sizeof(void *); 2698 } 2699 2700#ifdef CONFIG_SLUB_DEBUG 2701 if (flags & SLAB_STORE_USER) 2702 /* 2703 * Need to store information about allocs and frees after 2704 * the object. 2705 */ 2706 size += 2 * sizeof(struct track); 2707 2708 if (flags & SLAB_RED_ZONE) 2709 /* 2710 * Add some empty padding so that we can catch 2711 * overwrites from earlier objects rather than let 2712 * tracking information or the free pointer be 2713 * corrupted if a user writes before the start 2714 * of the object. 2715 */ 2716 size += sizeof(void *); 2717#endif 2718 2719 /* 2720 * Determine the alignment based on various parameters that the 2721 * user specified and the dynamic determination of cache line size 2722 * on bootup. 2723 */ 2724 align = calculate_alignment(flags, align, s->objsize); 2725 s->align = align; 2726 2727 /* 2728 * SLUB stores one object immediately after another beginning from 2729 * offset 0. In order to align the objects we have to simply size 2730 * each object to conform to the alignment. 2731 */ 2732 size = ALIGN(size, align); 2733 s->size = size; 2734 if (forced_order >= 0) 2735 order = forced_order; 2736 else 2737 order = calculate_order(size, s->reserved); 2738 2739 if (order < 0) 2740 return 0; 2741 2742 s->allocflags = 0; 2743 if (order) 2744 s->allocflags |= __GFP_COMP; 2745 2746 if (s->flags & SLAB_CACHE_DMA) 2747 s->allocflags |= SLUB_DMA; 2748 2749 if (s->flags & SLAB_RECLAIM_ACCOUNT) 2750 s->allocflags |= __GFP_RECLAIMABLE; 2751 2752 /* 2753 * Determine the number of objects per slab 2754 */ 2755 s->oo = oo_make(order, size, s->reserved); 2756 s->min = oo_make(get_order(size), size, s->reserved); 2757 if (oo_objects(s->oo) > oo_objects(s->max)) 2758 s->max = s->oo; 2759 2760 return !!oo_objects(s->oo); 2761 2762} 2763 2764static int kmem_cache_open(struct kmem_cache *s, 2765 const char *name, size_t size, 2766 size_t align, unsigned long flags, 2767 void (*ctor)(void *)) 2768{ 2769 memset(s, 0, kmem_size); 2770 s->name = name; 2771 s->ctor = ctor; 2772 s->objsize = size; 2773 s->align = align; 2774 s->flags = kmem_cache_flags(size, flags, name, ctor); 2775 s->reserved = 0; 2776 2777 if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) 2778 s->reserved = sizeof(struct rcu_head); 2779 2780 if (!calculate_sizes(s, -1)) 2781 goto error; 2782 if (disable_higher_order_debug) { 2783 /* 2784 * Disable debugging flags that store metadata if the min slab 2785 * order increased. 2786 */ 2787 if (get_order(s->size) > get_order(s->objsize)) { 2788 s->flags &= ~DEBUG_METADATA_FLAGS; 2789 s->offset = 0; 2790 if (!calculate_sizes(s, -1)) 2791 goto error; 2792 } 2793 } 2794 2795#ifdef CONFIG_CMPXCHG_DOUBLE 2796 if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) 2797 /* Enable fast mode */ 2798 s->flags |= __CMPXCHG_DOUBLE; 2799#endif 2800 2801 /* 2802 * The larger the object size is, the more pages we want on the partial 2803 * list to avoid pounding the page allocator excessively. 
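 *
 * (Illustration: with the MIN_PARTIAL/MAX_PARTIAL clamp in
 * set_min_partial() above, a 64-byte cache keeps ilog2(64) == 6
 * partial slabs per node, a 16-byte cache is raised to the minimum
 * and a 4096-byte cache is capped at the maximum.)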
2804 */ 2805 set_min_partial(s, ilog2(s->size)); 2806 s->refcount = 1; 2807#ifdef CONFIG_NUMA 2808 s->remote_node_defrag_ratio = 1000; 2809#endif 2810 if (!init_kmem_cache_nodes(s)) 2811 goto error; 2812 2813 if (alloc_kmem_cache_cpus(s)) 2814 return 1; 2815 2816 free_kmem_cache_nodes(s); 2817error: 2818 if (flags & SLAB_PANIC) 2819 panic("Cannot create slab %s size=%lu realsize=%u " 2820 "order=%u offset=%u flags=%lx\n", 2821 s->name, (unsigned long)size, s->size, oo_order(s->oo), 2822 s->offset, flags); 2823 return 0; 2824} 2825 2826/* 2827 * Determine the size of a slab object 2828 */ 2829unsigned int kmem_cache_size(struct kmem_cache *s) 2830{ 2831 return s->objsize; 2832} 2833EXPORT_SYMBOL(kmem_cache_size); 2834 2835static void list_slab_objects(struct kmem_cache *s, struct page *page, 2836 const char *text) 2837{ 2838#ifdef CONFIG_SLUB_DEBUG 2839 void *addr = page_address(page); 2840 void *p; 2841 unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) * 2842 sizeof(long), GFP_ATOMIC); 2843 if (!map) 2844 return; 2845 slab_err(s, page, "%s", text); 2846 slab_lock(page); 2847 2848 get_map(s, page, map); 2849 for_each_object(p, s, addr, page->objects) { 2850 2851 if (!test_bit(slab_index(p, s, addr), map)) { 2852 printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n", 2853 p, p - addr); 2854 print_tracking(s, p); 2855 } 2856 } 2857 slab_unlock(page); 2858 kfree(map); 2859#endif 2860} 2861 2862/* 2863 * Attempt to free all partial slabs on a node. 2864 */ 2865static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) 2866{ 2867 unsigned long flags; 2868 struct page *page, *h; 2869 2870 spin_lock_irqsave(&n->list_lock, flags); 2871 list_for_each_entry_safe(page, h, &n->partial, lru) { 2872 if (!page->inuse) { 2873 remove_partial(n, page); 2874 discard_slab(s, page); 2875 } else { 2876 list_slab_objects(s, page, 2877 "Objects remaining on kmem_cache_close()"); 2878 } 2879 } 2880 spin_unlock_irqrestore(&n->list_lock, flags); 2881} 2882 2883/* 2884 * Release all resources used by a slab cache. 
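 *
 * Typical use, for illustration (foo_cache is a made-up example):
 *
 *	static struct kmem_cache *foo_cache;
 *	...
 *	kmem_cache_destroy(foo_cache);
 *
 * kmem_cache_destroy() below takes slub_lock, drops the refcount and
 * calls this function only once the last user of the cache is gone.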
2885 */ 2886static inline int kmem_cache_close(struct kmem_cache *s) 2887{ 2888 int node; 2889 2890 flush_all(s); 2891 free_percpu(s->cpu_slab); 2892 /* Attempt to free all objects */ 2893 for_each_node_state(node, N_NORMAL_MEMORY) { 2894 struct kmem_cache_node *n = get_node(s, node); 2895 2896 free_partial(s, n); 2897 if (n->nr_partial || slabs_node(s, node)) 2898 return 1; 2899 } 2900 free_kmem_cache_nodes(s); 2901 return 0; 2902} 2903 2904/* 2905 * Close a cache and release the kmem_cache structure 2906 * (must be used for caches created using kmem_cache_create) 2907 */ 2908void kmem_cache_destroy(struct kmem_cache *s) 2909{ 2910 down_write(&slub_lock); 2911 s->refcount--; 2912 if (!s->refcount) { 2913 list_del(&s->list); 2914 if (kmem_cache_close(s)) { 2915 printk(KERN_ERR "SLUB %s: %s called for cache that " 2916 "still has objects.\n", s->name, __func__); 2917 dump_stack(); 2918 } 2919 if (s->flags & SLAB_DESTROY_BY_RCU) 2920 rcu_barrier(); 2921 sysfs_slab_remove(s); 2922 } 2923 up_write(&slub_lock); 2924} 2925EXPORT_SYMBOL(kmem_cache_destroy); 2926 2927/******************************************************************** 2928 * Kmalloc subsystem 2929 *******************************************************************/ 2930 2931struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT]; 2932EXPORT_SYMBOL(kmalloc_caches); 2933 2934static struct kmem_cache *kmem_cache; 2935 2936#ifdef CONFIG_ZONE_DMA 2937static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT]; 2938#endif 2939 2940static int __init setup_slub_min_order(char *str) 2941{ 2942 get_option(&str, &slub_min_order); 2943 2944 return 1; 2945} 2946 2947__setup("slub_min_order=", setup_slub_min_order); 2948 2949static int __init setup_slub_max_order(char *str) 2950{ 2951 get_option(&str, &slub_max_order); 2952 slub_max_order = min(slub_max_order, MAX_ORDER - 1); 2953 2954 return 1; 2955} 2956 2957__setup("slub_max_order=", setup_slub_max_order); 2958 2959static int __init setup_slub_min_objects(char *str) 2960{ 2961 get_option(&str, &slub_min_objects); 2962 2963 return 1; 2964} 2965 2966__setup("slub_min_objects=", setup_slub_min_objects); 2967 2968static int __init setup_slub_nomerge(char *str) 2969{ 2970 slub_nomerge = 1; 2971 return 1; 2972} 2973 2974__setup("slub_nomerge", setup_slub_nomerge); 2975 2976static struct kmem_cache *__init create_kmalloc_cache(const char *name, 2977 int size, unsigned int flags) 2978{ 2979 struct kmem_cache *s; 2980 2981 s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); 2982 2983 /* 2984 * This function is called with IRQs disabled during early-boot on 2985 * single CPU so there's no need to take slub_lock here. 2986 */ 2987 if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, 2988 flags, NULL)) 2989 goto panic; 2990 2991 list_add(&s->list, &slab_caches); 2992 return s; 2993 2994panic: 2995 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); 2996 return NULL; 2997} 2998 2999/* 3000 * Conversion table for small slabs sizes / 8 to the index in the 3001 * kmalloc array. This is necessary for slabs < 192 since we have non power 3002 * of two cache sizes there. The size of larger slabs can be determined using 3003 * fls. 
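 *
 * Example lookups (illustrative): kmalloc(100) reads
 * size_index[(100 - 1) / 8] == size_index[12] == 7 and is served from
 * kmalloc-128, while kmalloc(300) is above 192 and uses
 * fls(300 - 1) == 9, i.e. kmalloc-512.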
3004 */ 3005static s8 size_index[24] = { 3006 3, /* 8 */ 3007 4, /* 16 */ 3008 5, /* 24 */ 3009 5, /* 32 */ 3010 6, /* 40 */ 3011 6, /* 48 */ 3012 6, /* 56 */ 3013 6, /* 64 */ 3014 1, /* 72 */ 3015 1, /* 80 */ 3016 1, /* 88 */ 3017 1, /* 96 */ 3018 7, /* 104 */ 3019 7, /* 112 */ 3020 7, /* 120 */ 3021 7, /* 128 */ 3022 2, /* 136 */ 3023 2, /* 144 */ 3024 2, /* 152 */ 3025 2, /* 160 */ 3026 2, /* 168 */ 3027 2, /* 176 */ 3028 2, /* 184 */ 3029 2 /* 192 */ 3030}; 3031 3032static inline int size_index_elem(size_t bytes) 3033{ 3034 return (bytes - 1) / 8; 3035} 3036 3037static struct kmem_cache *get_slab(size_t size, gfp_t flags) 3038{ 3039 int index; 3040 3041 if (size <= 192) { 3042 if (!size) 3043 return ZERO_SIZE_PTR; 3044 3045 index = size_index[size_index_elem(size)]; 3046 } else 3047 index = fls(size - 1); 3048 3049#ifdef CONFIG_ZONE_DMA 3050 if (unlikely((flags & SLUB_DMA))) 3051 return kmalloc_dma_caches[index]; 3052 3053#endif 3054 return kmalloc_caches[index]; 3055} 3056 3057void *__kmalloc(size_t size, gfp_t flags) 3058{ 3059 struct kmem_cache *s; 3060 void *ret; 3061 3062 if (unlikely(size > SLUB_MAX_SIZE)) 3063 return kmalloc_large(size, flags); 3064 3065 s = get_slab(size, flags); 3066 3067 if (unlikely(ZERO_OR_NULL_PTR(s))) 3068 return s; 3069 3070 ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_); 3071 3072 trace_kmalloc(_RET_IP_, ret, size, s->size, flags); 3073 3074 return ret; 3075} 3076EXPORT_SYMBOL(__kmalloc); 3077 3078#ifdef CONFIG_NUMA 3079static void *kmalloc_large_node(size_t size, gfp_t flags, int node) 3080{ 3081 struct page *page; 3082 void *ptr = NULL; 3083 3084 flags |= __GFP_COMP | __GFP_NOTRACK; 3085 page = alloc_pages_node(node, flags, get_order(size)); 3086 if (page) 3087 ptr = page_address(page); 3088 3089 kmemleak_alloc(ptr, size, 1, flags); 3090 return ptr; 3091} 3092 3093void *__kmalloc_node(size_t size, gfp_t flags, int node) 3094{ 3095 struct kmem_cache *s; 3096 void *ret; 3097 3098 if (unlikely(size > SLUB_MAX_SIZE)) { 3099 ret = kmalloc_large_node(size, flags, node); 3100 3101 trace_kmalloc_node(_RET_IP_, ret, 3102 size, PAGE_SIZE << get_order(size), 3103 flags, node); 3104 3105 return ret; 3106 } 3107 3108 s = get_slab(size, flags); 3109 3110 if (unlikely(ZERO_OR_NULL_PTR(s))) 3111 return s; 3112 3113 ret = slab_alloc(s, flags, node, _RET_IP_); 3114 3115 trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); 3116 3117 return ret; 3118} 3119EXPORT_SYMBOL(__kmalloc_node); 3120#endif 3121 3122size_t ksize(const void *object) 3123{ 3124 struct page *page; 3125 3126 if (unlikely(object == ZERO_SIZE_PTR)) 3127 return 0; 3128 3129 page = virt_to_head_page(object); 3130 3131 if (unlikely(!PageSlab(page))) { 3132 WARN_ON(!PageCompound(page)); 3133 return PAGE_SIZE << compound_order(page); 3134 } 3135 3136 return slab_ksize(page->slab); 3137} 3138EXPORT_SYMBOL(ksize); 3139 3140void kfree(const void *x) 3141{ 3142 struct page *page; 3143 void *object = (void *)x; 3144 3145 trace_kfree(_RET_IP_, x); 3146 3147 if (unlikely(ZERO_OR_NULL_PTR(x))) 3148 return; 3149 3150 page = virt_to_head_page(x); 3151 if (unlikely(!PageSlab(page))) { 3152 BUG_ON(!PageCompound(page)); 3153 kmemleak_free(x); 3154 put_page(page); 3155 return; 3156 } 3157 slab_free(page->slab, page, object, _RET_IP_); 3158} 3159EXPORT_SYMBOL(kfree); 3160 3161/* 3162 * kmem_cache_shrink removes empty slabs from the partial lists and sorts 3163 * the remaining slabs by the number of items in use. The slabs with the 3164 * most items in use come first. 
New allocations will then fill those up
3165 * and thus they can be removed from the partial lists.
3166 *
3167 * The slabs with the least items are placed last. This results in them
3168 * being allocated from last, increasing the chance that the last objects
3169 * are freed in them.
3170 */
3171int kmem_cache_shrink(struct kmem_cache *s)
3172{
3173 int node;
3174 int i;
3175 struct kmem_cache_node *n;
3176 struct page *page;
3177 struct page *t;
3178 int objects = oo_objects(s->max);
3179 struct list_head *slabs_by_inuse =
3180 kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
3181 unsigned long flags;
3182
3183 if (!slabs_by_inuse)
3184 return -ENOMEM;
3185
3186 flush_all(s);
3187 for_each_node_state(node, N_NORMAL_MEMORY) {
3188 n = get_node(s, node);
3189
3190 if (!n->nr_partial)
3191 continue;
3192
3193 for (i = 0; i < objects; i++)
3194 INIT_LIST_HEAD(slabs_by_inuse + i);
3195
3196 spin_lock_irqsave(&n->list_lock, flags);
3197
3198 /*
3199 * Build lists indexed by the items in use in each slab.
3200 *
3201 * Note that concurrent frees may occur while we hold the
3202 * list_lock. page->inuse here is the upper limit.
3203 */
3204 list_for_each_entry_safe(page, t, &n->partial, lru) {
3205 if (!page->inuse && slab_trylock(page)) {
3206 /*
3207 * Must hold slab lock here because slab_free
3208 * may have freed the last object and be
3209 * waiting to release the slab.
3210 */
3211 remove_partial(n, page);
3212 slab_unlock(page);
3213 discard_slab(s, page);
3214 } else {
3215 list_move(&page->lru,
3216 slabs_by_inuse + page->inuse);
3217 }
3218 }
3219
3220 /*
3221 * Rebuild the partial list with the slabs filled up most
3222 * first and the least used slabs at the end.
3223 */
3224 for (i = objects - 1; i >= 0; i--)
3225 list_splice(slabs_by_inuse + i, n->partial.prev);
3226
3227 spin_unlock_irqrestore(&n->list_lock, flags);
3228 }
3229
3230 kfree(slabs_by_inuse);
3231 return 0;
3232}
3233EXPORT_SYMBOL(kmem_cache_shrink);
3234
3235#if defined(CONFIG_MEMORY_HOTPLUG)
3236static int slab_mem_going_offline_callback(void *arg)
3237{
3238 struct kmem_cache *s;
3239
3240 down_read(&slub_lock);
3241 list_for_each_entry(s, &slab_caches, list)
3242 kmem_cache_shrink(s);
3243 up_read(&slub_lock);
3244
3245 return 0;
3246}
3247
3248static void slab_mem_offline_callback(void *arg)
3249{
3250 struct kmem_cache_node *n;
3251 struct kmem_cache *s;
3252 struct memory_notify *marg = arg;
3253 int offline_node;
3254
3255 offline_node = marg->status_change_nid;
3256
3257 /*
3258 * If the node still has available memory, we still need its
3259 * kmem_cache_node, so there is nothing to do here.
3260 */
3261 if (offline_node < 0)
3262 return;
3263
3264 down_read(&slub_lock);
3265 list_for_each_entry(s, &slab_caches, list) {
3266 n = get_node(s, offline_node);
3267 if (n) {
3268 /*
3269 * if n->nr_slabs > 0, slabs still exist on the node
3270 * that is going down. We were unable to free them,
3271 * and the offline_pages() function shouldn't call this
3272 * callback. So, we must fail.
3273 */
3274 BUG_ON(slabs_node(s, offline_node));
3275
3276 s->node[offline_node] = NULL;
3277 kmem_cache_free(kmem_cache_node, n);
3278 }
3279 }
3280 up_read(&slub_lock);
3281}
3282
3283static int slab_mem_going_online_callback(void *arg)
3284{
3285 struct kmem_cache_node *n;
3286 struct kmem_cache *s;
3287 struct memory_notify *marg = arg;
3288 int nid = marg->status_change_nid;
3289 int ret = 0;
3290
3291 /*
3292 * If the node's memory is already available, then kmem_cache_node is
3293 * already created. Nothing to do.
3294 */ 3295 if (nid < 0) 3296 return 0; 3297 3298 /* 3299 * We are bringing a node online. No memory is available yet. We must 3300 * allocate a kmem_cache_node structure in order to bring the node 3301 * online. 3302 */ 3303 down_read(&slub_lock); 3304 list_for_each_entry(s, &slab_caches, list) { 3305 /* 3306 * XXX: kmem_cache_alloc_node will fallback to other nodes 3307 * since memory is not yet available from the node that 3308 * is brought up. 3309 */ 3310 n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL); 3311 if (!n) { 3312 ret = -ENOMEM; 3313 goto out; 3314 } 3315 init_kmem_cache_node(n, s); 3316 s->node[nid] = n; 3317 } 3318out: 3319 up_read(&slub_lock); 3320 return ret; 3321} 3322 3323static int slab_memory_callback(struct notifier_block *self, 3324 unsigned long action, void *arg) 3325{ 3326 int ret = 0; 3327 3328 switch (action) { 3329 case MEM_GOING_ONLINE: 3330 ret = slab_mem_going_online_callback(arg); 3331 break; 3332 case MEM_GOING_OFFLINE: 3333 ret = slab_mem_going_offline_callback(arg); 3334 break; 3335 case MEM_OFFLINE: 3336 case MEM_CANCEL_ONLINE: 3337 slab_mem_offline_callback(arg); 3338 break; 3339 case MEM_ONLINE: 3340 case MEM_CANCEL_OFFLINE: 3341 break; 3342 } 3343 if (ret) 3344 ret = notifier_from_errno(ret); 3345 else 3346 ret = NOTIFY_OK; 3347 return ret; 3348} 3349 3350#endif /* CONFIG_MEMORY_HOTPLUG */ 3351 3352/******************************************************************** 3353 * Basic setup of slabs 3354 *******************************************************************/ 3355 3356/* 3357 * Used for early kmem_cache structures that were allocated using 3358 * the page allocator 3359 */ 3360 3361static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) 3362{ 3363 int node; 3364 3365 list_add(&s->list, &slab_caches); 3366 s->refcount = -1; 3367 3368 for_each_node_state(node, N_NORMAL_MEMORY) { 3369 struct kmem_cache_node *n = get_node(s, node); 3370 struct page *p; 3371 3372 if (n) { 3373 list_for_each_entry(p, &n->partial, lru) 3374 p->slab = s; 3375 3376#ifdef CONFIG_SLUB_DEBUG 3377 list_for_each_entry(p, &n->full, lru) 3378 p->slab = s; 3379#endif 3380 } 3381 } 3382} 3383 3384void __init kmem_cache_init(void) 3385{ 3386 int i; 3387 int caches = 0; 3388 struct kmem_cache *temp_kmem_cache; 3389 int order; 3390 struct kmem_cache *temp_kmem_cache_node; 3391 unsigned long kmalloc_size; 3392 3393 kmem_size = offsetof(struct kmem_cache, node) + 3394 nr_node_ids * sizeof(struct kmem_cache_node *); 3395 3396 /* Allocate two kmem_caches from the page allocator */ 3397 kmalloc_size = ALIGN(kmem_size, cache_line_size()); 3398 order = get_order(2 * kmalloc_size); 3399 kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order); 3400 3401 /* 3402 * Must first have the slab cache available for the allocations of the 3403 * struct kmem_cache_node's. There is special bootstrap code in 3404 * kmem_cache_open for slab_state == DOWN. 
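 *
 * The bootstrap sequence, in outline:
 *	1. carve boot copies of kmem_cache and kmem_cache_node out of
 *	   pages taken straight from the page allocator (done above),
 *	2. open kmem_cache_node first (slab_state == DOWN makes
 *	   init_kmem_cache_nodes() use early_kmem_cache_node_alloc()),
 *	3. open kmem_cache itself, reallocate both structures from the
 *	   now working allocator and let kmem_cache_bootstrap_fixup()
 *	   repair the list and page back pointers.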
3405 */ 3406 kmem_cache_node = (void *)kmem_cache + kmalloc_size; 3407 3408 kmem_cache_open(kmem_cache_node, "kmem_cache_node", 3409 sizeof(struct kmem_cache_node), 3410 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); 3411 3412 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); 3413 3414 /* Able to allocate the per node structures */ 3415 slab_state = PARTIAL; 3416 3417 temp_kmem_cache = kmem_cache; 3418 kmem_cache_open(kmem_cache, "kmem_cache", kmem_size, 3419 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); 3420 kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); 3421 memcpy(kmem_cache, temp_kmem_cache, kmem_size); 3422 3423 /* 3424 * Allocate kmem_cache_node properly from the kmem_cache slab. 3425 * kmem_cache_node is separately allocated so no need to 3426 * update any list pointers. 3427 */ 3428 temp_kmem_cache_node = kmem_cache_node; 3429 3430 kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); 3431 memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size); 3432 3433 kmem_cache_bootstrap_fixup(kmem_cache_node); 3434 3435 caches++; 3436 kmem_cache_bootstrap_fixup(kmem_cache); 3437 caches++; 3438 /* Free temporary boot structure */ 3439 free_pages((unsigned long)temp_kmem_cache, order); 3440 3441 /* Now we can use the kmem_cache to allocate kmalloc slabs */ 3442 3443 /* 3444 * Patch up the size_index table if we have strange large alignment 3445 * requirements for the kmalloc array. This is only the case for 3446 * MIPS it seems. The standard arches will not generate any code here. 3447 * 3448 * Largest permitted alignment is 256 bytes due to the way we 3449 * handle the index determination for the smaller caches. 3450 * 3451 * Make sure that nothing crazy happens if someone starts tinkering 3452 * around with ARCH_KMALLOC_MINALIGN 3453 */ 3454 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || 3455 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); 3456 3457 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { 3458 int elem = size_index_elem(i); 3459 if (elem >= ARRAY_SIZE(size_index)) 3460 break; 3461 size_index[elem] = KMALLOC_SHIFT_LOW; 3462 } 3463 3464 if (KMALLOC_MIN_SIZE == 64) { 3465 /* 3466 * The 96 byte size cache is not used if the alignment 3467 * is 64 byte. 3468 */ 3469 for (i = 64 + 8; i <= 96; i += 8) 3470 size_index[size_index_elem(i)] = 7; 3471 } else if (KMALLOC_MIN_SIZE == 128) { 3472 /* 3473 * The 192 byte sized cache is not used if the alignment 3474 * is 128 byte. Redirect kmalloc to use the 256 byte cache 3475 * instead. 
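 *
 * (So with a 128-byte minimum alignment a kmalloc(150), say, is
 * served from kmalloc-256: a 192-byte cache could not keep every
 * object 128-byte aligned.)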
3476 */ 3477 for (i = 128 + 8; i <= 192; i += 8) 3478 size_index[size_index_elem(i)] = 8; 3479 } 3480 3481 /* Caches that are not of the two-to-the-power-of size */ 3482 if (KMALLOC_MIN_SIZE <= 32) { 3483 kmalloc_caches[1] = create_kmalloc_cache("kmalloc-96", 96, 0); 3484 caches++; 3485 } 3486 3487 if (KMALLOC_MIN_SIZE <= 64) { 3488 kmalloc_caches[2] = create_kmalloc_cache("kmalloc-192", 192, 0); 3489 caches++; 3490 } 3491 3492 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { 3493 kmalloc_caches[i] = create_kmalloc_cache("kmalloc", 1 << i, 0); 3494 caches++; 3495 } 3496 3497 slab_state = UP; 3498 3499 /* Provide the correct kmalloc names now that the caches are up */ 3500 if (KMALLOC_MIN_SIZE <= 32) { 3501 kmalloc_caches[1]->name = kstrdup(kmalloc_caches[1]->name, GFP_NOWAIT); 3502 BUG_ON(!kmalloc_caches[1]->name); 3503 } 3504 3505 if (KMALLOC_MIN_SIZE <= 64) { 3506 kmalloc_caches[2]->name = kstrdup(kmalloc_caches[2]->name, GFP_NOWAIT); 3507 BUG_ON(!kmalloc_caches[2]->name); 3508 } 3509 3510 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { 3511 char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); 3512 3513 BUG_ON(!s); 3514 kmalloc_caches[i]->name = s; 3515 } 3516 3517#ifdef CONFIG_SMP 3518 register_cpu_notifier(&slab_notifier); 3519#endif 3520 3521#ifdef CONFIG_ZONE_DMA 3522 for (i = 0; i < SLUB_PAGE_SHIFT; i++) { 3523 struct kmem_cache *s = kmalloc_caches[i]; 3524 3525 if (s && s->size) { 3526 char *name = kasprintf(GFP_NOWAIT, 3527 "dma-kmalloc-%d", s->objsize); 3528 3529 BUG_ON(!name); 3530 kmalloc_dma_caches[i] = create_kmalloc_cache(name, 3531 s->objsize, SLAB_CACHE_DMA); 3532 } 3533 } 3534#endif 3535 printk(KERN_INFO 3536 "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," 3537 " CPUs=%d, Nodes=%d\n", 3538 caches, cache_line_size(), 3539 slub_min_order, slub_max_order, slub_min_objects, 3540 nr_cpu_ids, nr_node_ids); 3541} 3542 3543void __init kmem_cache_init_late(void) 3544{ 3545} 3546 3547/* 3548 * Find a mergeable slab cache 3549 */ 3550static int slab_unmergeable(struct kmem_cache *s) 3551{ 3552 if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) 3553 return 1; 3554 3555 if (s->ctor) 3556 return 1; 3557 3558 /* 3559 * We may have set a slab to be unmergeable during bootstrap. 3560 */ 3561 if (s->refcount < 0) 3562 return 1; 3563 3564 return 0; 3565} 3566 3567static struct kmem_cache *find_mergeable(size_t size, 3568 size_t align, unsigned long flags, const char *name, 3569 void (*ctor)(void *)) 3570{ 3571 struct kmem_cache *s; 3572 3573 if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) 3574 return NULL; 3575 3576 if (ctor) 3577 return NULL; 3578 3579 size = ALIGN(size, sizeof(void *)); 3580 align = calculate_alignment(flags, align, size); 3581 size = ALIGN(size, align); 3582 flags = kmem_cache_flags(size, flags, name, NULL); 3583 3584 list_for_each_entry(s, &slab_caches, list) { 3585 if (slab_unmergeable(s)) 3586 continue; 3587 3588 if (size > s->size) 3589 continue; 3590 3591 if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) 3592 continue; 3593 /* 3594 * Check if alignment is compatible. 
3595 * Courtesy of Adrian Drzewiecki 3596 */ 3597 if ((s->size & ~(align - 1)) != s->size) 3598 continue; 3599 3600 if (s->size - size >= sizeof(void *)) 3601 continue; 3602 3603 return s; 3604 } 3605 return NULL; 3606} 3607 3608struct kmem_cache *kmem_cache_create(const char *name, size_t size, 3609 size_t align, unsigned long flags, void (*ctor)(void *)) 3610{ 3611 struct kmem_cache *s; 3612 char *n; 3613 3614 if (WARN_ON(!name)) 3615 return NULL; 3616 3617 down_write(&slub_lock); 3618 s = find_mergeable(size, align, flags, name, ctor); 3619 if (s) { 3620 s->refcount++; 3621 /* 3622 * Adjust the object sizes so that we clear 3623 * the complete object on kzalloc. 3624 */ 3625 s->objsize = max(s->objsize, (int)size); 3626 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3627 3628 if (sysfs_slab_alias(s, name)) { 3629 s->refcount--; 3630 goto err; 3631 } 3632 up_write(&slub_lock); 3633 return s; 3634 } 3635 3636 n = kstrdup(name, GFP_KERNEL); 3637 if (!n) 3638 goto err; 3639 3640 s = kmalloc(kmem_size, GFP_KERNEL); 3641 if (s) { 3642 if (kmem_cache_open(s, n, 3643 size, align, flags, ctor)) { 3644 list_add(&s->list, &slab_caches); 3645 if (sysfs_slab_add(s)) { 3646 list_del(&s->list); 3647 kfree(n); 3648 kfree(s); 3649 goto err; 3650 } 3651 up_write(&slub_lock); 3652 return s; 3653 } 3654 kfree(n); 3655 kfree(s); 3656 } 3657err: 3658 up_write(&slub_lock); 3659 3660 if (flags & SLAB_PANIC) 3661 panic("Cannot create slabcache %s\n", name); 3662 else 3663 s = NULL; 3664 return s; 3665} 3666EXPORT_SYMBOL(kmem_cache_create); 3667 3668#ifdef CONFIG_SMP 3669/* 3670 * Use the cpu notifier to insure that the cpu slabs are flushed when 3671 * necessary. 3672 */ 3673static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, 3674 unsigned long action, void *hcpu) 3675{ 3676 long cpu = (long)hcpu; 3677 struct kmem_cache *s; 3678 unsigned long flags; 3679 3680 switch (action) { 3681 case CPU_UP_CANCELED: 3682 case CPU_UP_CANCELED_FROZEN: 3683 case CPU_DEAD: 3684 case CPU_DEAD_FROZEN: 3685 down_read(&slub_lock); 3686 list_for_each_entry(s, &slab_caches, list) { 3687 local_irq_save(flags); 3688 __flush_cpu_slab(s, cpu); 3689 local_irq_restore(flags); 3690 } 3691 up_read(&slub_lock); 3692 break; 3693 default: 3694 break; 3695 } 3696 return NOTIFY_OK; 3697} 3698 3699static struct notifier_block __cpuinitdata slab_notifier = { 3700 .notifier_call = slab_cpuup_callback 3701}; 3702 3703#endif 3704 3705void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) 3706{ 3707 struct kmem_cache *s; 3708 void *ret; 3709 3710 if (unlikely(size > SLUB_MAX_SIZE)) 3711 return kmalloc_large(size, gfpflags); 3712 3713 s = get_slab(size, gfpflags); 3714 3715 if (unlikely(ZERO_OR_NULL_PTR(s))) 3716 return s; 3717 3718 ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller); 3719 3720 /* Honor the call site pointer we received. 
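 * (Callers such as kmalloc_track_caller() pass their own caller's
 * _RET_IP_ down as "caller", so the tracepoint below reports the real
 * call site rather than this helper.)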
*/ 3721 trace_kmalloc(caller, ret, size, s->size, gfpflags); 3722 3723 return ret; 3724} 3725 3726#ifdef CONFIG_NUMA 3727void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, 3728 int node, unsigned long caller) 3729{ 3730 struct kmem_cache *s; 3731 void *ret; 3732 3733 if (unlikely(size > SLUB_MAX_SIZE)) { 3734 ret = kmalloc_large_node(size, gfpflags, node); 3735 3736 trace_kmalloc_node(caller, ret, 3737 size, PAGE_SIZE << get_order(size), 3738 gfpflags, node); 3739 3740 return ret; 3741 } 3742 3743 s = get_slab(size, gfpflags); 3744 3745 if (unlikely(ZERO_OR_NULL_PTR(s))) 3746 return s; 3747 3748 ret = slab_alloc(s, gfpflags, node, caller); 3749 3750 /* Honor the call site pointer we received. */ 3751 trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); 3752 3753 return ret; 3754} 3755#endif 3756 3757#ifdef CONFIG_SYSFS 3758static int count_inuse(struct page *page) 3759{ 3760 return page->inuse; 3761} 3762 3763static int count_total(struct page *page) 3764{ 3765 return page->objects; 3766} 3767#endif 3768 3769#ifdef CONFIG_SLUB_DEBUG 3770static int validate_slab(struct kmem_cache *s, struct page *page, 3771 unsigned long *map) 3772{ 3773 void *p; 3774 void *addr = page_address(page); 3775 3776 if (!check_slab(s, page) || 3777 !on_freelist(s, page, NULL)) 3778 return 0; 3779 3780 /* Now we know that a valid freelist exists */ 3781 bitmap_zero(map, page->objects); 3782 3783 get_map(s, page, map); 3784 for_each_object(p, s, addr, page->objects) { 3785 if (test_bit(slab_index(p, s, addr), map)) 3786 if (!check_object(s, page, p, SLUB_RED_INACTIVE)) 3787 return 0; 3788 } 3789 3790 for_each_object(p, s, addr, page->objects) 3791 if (!test_bit(slab_index(p, s, addr), map)) 3792 if (!check_object(s, page, p, SLUB_RED_ACTIVE)) 3793 return 0; 3794 return 1; 3795} 3796 3797static void validate_slab_slab(struct kmem_cache *s, struct page *page, 3798 unsigned long *map) 3799{ 3800 if (slab_trylock(page)) { 3801 validate_slab(s, page, map); 3802 slab_unlock(page); 3803 } else 3804 printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", 3805 s->name, page); 3806} 3807 3808static int validate_slab_node(struct kmem_cache *s, 3809 struct kmem_cache_node *n, unsigned long *map) 3810{ 3811 unsigned long count = 0; 3812 struct page *page; 3813 unsigned long flags; 3814 3815 spin_lock_irqsave(&n->list_lock, flags); 3816 3817 list_for_each_entry(page, &n->partial, lru) { 3818 validate_slab_slab(s, page, map); 3819 count++; 3820 } 3821 if (count != n->nr_partial) 3822 printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " 3823 "counter=%ld\n", s->name, count, n->nr_partial); 3824 3825 if (!(s->flags & SLAB_STORE_USER)) 3826 goto out; 3827 3828 list_for_each_entry(page, &n->full, lru) { 3829 validate_slab_slab(s, page, map); 3830 count++; 3831 } 3832 if (count != atomic_long_read(&n->nr_slabs)) 3833 printk(KERN_ERR "SLUB: %s %ld slabs counted but " 3834 "counter=%ld\n", s->name, count, 3835 atomic_long_read(&n->nr_slabs)); 3836 3837out: 3838 spin_unlock_irqrestore(&n->list_lock, flags); 3839 return count; 3840} 3841 3842static long validate_slab_cache(struct kmem_cache *s) 3843{ 3844 int node; 3845 unsigned long count = 0; 3846 unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * 3847 sizeof(unsigned long), GFP_KERNEL); 3848 3849 if (!map) 3850 return -ENOMEM; 3851 3852 flush_all(s); 3853 for_each_node_state(node, N_NORMAL_MEMORY) { 3854 struct kmem_cache_node *n = get_node(s, node); 3855 3856 count += validate_slab_node(s, n, map); 3857 } 3858 kfree(map); 3859 return count; 
3860} 3861/* 3862 * Generate lists of code addresses where slabcache objects are allocated 3863 * and freed. 3864 */ 3865 3866struct location { 3867 unsigned long count; 3868 unsigned long addr; 3869 long long sum_time; 3870 long min_time; 3871 long max_time; 3872 long min_pid; 3873 long max_pid; 3874 DECLARE_BITMAP(cpus, NR_CPUS); 3875 nodemask_t nodes; 3876}; 3877 3878struct loc_track { 3879 unsigned long max; 3880 unsigned long count; 3881 struct location *loc; 3882}; 3883 3884static void free_loc_track(struct loc_track *t) 3885{ 3886 if (t->max) 3887 free_pages((unsigned long)t->loc, 3888 get_order(sizeof(struct location) * t->max)); 3889} 3890 3891static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) 3892{ 3893 struct location *l; 3894 int order; 3895 3896 order = get_order(sizeof(struct location) * max); 3897 3898 l = (void *)__get_free_pages(flags, order); 3899 if (!l) 3900 return 0; 3901 3902 if (t->count) { 3903 memcpy(l, t->loc, sizeof(struct location) * t->count); 3904 free_loc_track(t); 3905 } 3906 t->max = max; 3907 t->loc = l; 3908 return 1; 3909} 3910 3911static int add_location(struct loc_track *t, struct kmem_cache *s, 3912 const struct track *track) 3913{ 3914 long start, end, pos; 3915 struct location *l; 3916 unsigned long caddr; 3917 unsigned long age = jiffies - track->when; 3918 3919 start = -1; 3920 end = t->count; 3921 3922 for ( ; ; ) { 3923 pos = start + (end - start + 1) / 2; 3924 3925 /* 3926 * There is nothing at "end". If we end up there 3927 * we need to add something to before end. 3928 */ 3929 if (pos == end) 3930 break; 3931 3932 caddr = t->loc[pos].addr; 3933 if (track->addr == caddr) { 3934 3935 l = &t->loc[pos]; 3936 l->count++; 3937 if (track->when) { 3938 l->sum_time += age; 3939 if (age < l->min_time) 3940 l->min_time = age; 3941 if (age > l->max_time) 3942 l->max_time = age; 3943 3944 if (track->pid < l->min_pid) 3945 l->min_pid = track->pid; 3946 if (track->pid > l->max_pid) 3947 l->max_pid = track->pid; 3948 3949 cpumask_set_cpu(track->cpu, 3950 to_cpumask(l->cpus)); 3951 } 3952 node_set(page_to_nid(virt_to_page(track)), l->nodes); 3953 return 1; 3954 } 3955 3956 if (track->addr < caddr) 3957 end = pos; 3958 else 3959 start = pos; 3960 } 3961 3962 /* 3963 * Not found. Insert new tracking element. 
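 *
 * (The loop above is a binary search over the address-sorted array.
 * Illustration: inserting addr 0x30 into { 0x10, 0x20, 0x40 } probes
 * index 1 (0x20), then index 2 (0x40), and breaks with pos == 2; the
 * memmove below then shifts 0x40 up so the array stays sorted.)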
3964 */ 3965 if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC)) 3966 return 0; 3967 3968 l = t->loc + pos; 3969 if (pos < t->count) 3970 memmove(l + 1, l, 3971 (t->count - pos) * sizeof(struct location)); 3972 t->count++; 3973 l->count = 1; 3974 l->addr = track->addr; 3975 l->sum_time = age; 3976 l->min_time = age; 3977 l->max_time = age; 3978 l->min_pid = track->pid; 3979 l->max_pid = track->pid; 3980 cpumask_clear(to_cpumask(l->cpus)); 3981 cpumask_set_cpu(track->cpu, to_cpumask(l->cpus)); 3982 nodes_clear(l->nodes); 3983 node_set(page_to_nid(virt_to_page(track)), l->nodes); 3984 return 1; 3985} 3986 3987static void process_slab(struct loc_track *t, struct kmem_cache *s, 3988 struct page *page, enum track_item alloc, 3989 unsigned long *map) 3990{ 3991 void *addr = page_address(page); 3992 void *p; 3993 3994 bitmap_zero(map, page->objects); 3995 get_map(s, page, map); 3996 3997 for_each_object(p, s, addr, page->objects) 3998 if (!test_bit(slab_index(p, s, addr), map)) 3999 add_location(t, s, get_track(s, p, alloc)); 4000} 4001 4002static int list_locations(struct kmem_cache *s, char *buf, 4003 enum track_item alloc) 4004{ 4005 int len = 0; 4006 unsigned long i; 4007 struct loc_track t = { 0, 0, NULL }; 4008 int node; 4009 unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * 4010 sizeof(unsigned long), GFP_KERNEL); 4011 4012 if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), 4013 GFP_TEMPORARY)) { 4014 kfree(map); 4015 return sprintf(buf, "Out of memory\n"); 4016 } 4017 /* Push back cpu slabs */ 4018 flush_all(s); 4019 4020 for_each_node_state(node, N_NORMAL_MEMORY) { 4021 struct kmem_cache_node *n = get_node(s, node); 4022 unsigned long flags; 4023 struct page *page; 4024 4025 if (!atomic_long_read(&n->nr_slabs)) 4026 continue; 4027 4028 spin_lock_irqsave(&n->list_lock, flags); 4029 list_for_each_entry(page, &n->partial, lru) 4030 process_slab(&t, s, page, alloc, map); 4031 list_for_each_entry(page, &n->full, lru) 4032 process_slab(&t, s, page, alloc, map); 4033 spin_unlock_irqrestore(&n->list_lock, flags); 4034 } 4035 4036 for (i = 0; i < t.count; i++) { 4037 struct location *l = &t.loc[i]; 4038 4039 if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100) 4040 break; 4041 len += sprintf(buf + len, "%7ld ", l->count); 4042 4043 if (l->addr) 4044 len += sprintf(buf + len, "%pS", (void *)l->addr); 4045 else 4046 len += sprintf(buf + len, "<not-available>"); 4047 4048 if (l->sum_time != l->min_time) { 4049 len += sprintf(buf + len, " age=%ld/%ld/%ld", 4050 l->min_time, 4051 (long)div_u64(l->sum_time, l->count), 4052 l->max_time); 4053 } else 4054 len += sprintf(buf + len, " age=%ld", 4055 l->min_time); 4056 4057 if (l->min_pid != l->max_pid) 4058 len += sprintf(buf + len, " pid=%ld-%ld", 4059 l->min_pid, l->max_pid); 4060 else 4061 len += sprintf(buf + len, " pid=%ld", 4062 l->min_pid); 4063 4064 if (num_online_cpus() > 1 && 4065 !cpumask_empty(to_cpumask(l->cpus)) && 4066 len < PAGE_SIZE - 60) { 4067 len += sprintf(buf + len, " cpus="); 4068 len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50, 4069 to_cpumask(l->cpus)); 4070 } 4071 4072 if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && 4073 len < PAGE_SIZE - 60) { 4074 len += sprintf(buf + len, " nodes="); 4075 len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, 4076 l->nodes); 4077 } 4078 4079 len += sprintf(buf + len, "\n"); 4080 } 4081 4082 free_loc_track(&t); 4083 kfree(map); 4084 if (!t.count) 4085 len += sprintf(buf, "No data\n"); 4086 return len; 4087} 4088#endif 4089 4090#ifdef 
SLUB_RESILIENCY_TEST 4091static void resiliency_test(void) 4092{ 4093 u8 *p; 4094 4095 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || SLUB_PAGE_SHIFT < 10); 4096 4097 printk(KERN_ERR "SLUB resiliency testing\n"); 4098 printk(KERN_ERR "-----------------------\n"); 4099 printk(KERN_ERR "A. Corruption after allocation\n"); 4100 4101 p = kzalloc(16, GFP_KERNEL); 4102 p[16] = 0x12; 4103 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" 4104 " 0x12->0x%p\n\n", p + 16); 4105 4106 validate_slab_cache(kmalloc_caches[4]); 4107 4108 /* Hmmm... The next two are dangerous */ 4109 p = kzalloc(32, GFP_KERNEL); 4110 p[32 + sizeof(void *)] = 0x34; 4111 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" 4112 " 0x34 -> -0x%p\n", p); 4113 printk(KERN_ERR 4114 "If allocated object is overwritten then not detectable\n\n"); 4115 4116 validate_slab_cache(kmalloc_caches[5]); 4117 p = kzalloc(64, GFP_KERNEL); 4118 p += 64 + (get_cycles() & 0xff) * sizeof(void *); 4119 *p = 0x56; 4120 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", 4121 p); 4122 printk(KERN_ERR 4123 "If allocated object is overwritten then not detectable\n\n"); 4124 validate_slab_cache(kmalloc_caches[6]); 4125 4126 printk(KERN_ERR "\nB. Corruption after free\n"); 4127 p = kzalloc(128, GFP_KERNEL); 4128 kfree(p); 4129 *p = 0x78; 4130 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); 4131 validate_slab_cache(kmalloc_caches[7]); 4132 4133 p = kzalloc(256, GFP_KERNEL); 4134 kfree(p); 4135 p[50] = 0x9a; 4136 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", 4137 p); 4138 validate_slab_cache(kmalloc_caches[8]); 4139 4140 p = kzalloc(512, GFP_KERNEL); 4141 kfree(p); 4142 p[512] = 0xab; 4143 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); 4144 validate_slab_cache(kmalloc_caches[9]); 4145} 4146#else 4147#ifdef CONFIG_SYSFS 4148static void resiliency_test(void) {}; 4149#endif 4150#endif 4151 4152#ifdef CONFIG_SYSFS 4153enum slab_stat_type { 4154 SL_ALL, /* All slabs */ 4155 SL_PARTIAL, /* Only partially allocated slabs */ 4156 SL_CPU, /* Only slabs used for cpu caches */ 4157 SL_OBJECTS, /* Determine allocated objects not slabs */ 4158 SL_TOTAL /* Determine object capacity not slabs */ 4159}; 4160 4161#define SO_ALL (1 << SL_ALL) 4162#define SO_PARTIAL (1 << SL_PARTIAL) 4163#define SO_CPU (1 << SL_CPU) 4164#define SO_OBJECTS (1 << SL_OBJECTS) 4165#define SO_TOTAL (1 << SL_TOTAL) 4166 4167static ssize_t show_slab_objects(struct kmem_cache *s, 4168 char *buf, unsigned long flags) 4169{ 4170 unsigned long total = 0; 4171 int node; 4172 int x; 4173 unsigned long *nodes; 4174 unsigned long *per_cpu; 4175 4176 nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); 4177 if (!nodes) 4178 return -ENOMEM; 4179 per_cpu = nodes + nr_node_ids; 4180 4181 if (flags & SO_CPU) { 4182 int cpu; 4183 4184 for_each_possible_cpu(cpu) { 4185 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 4186 4187 if (!c || c->node < 0) 4188 continue; 4189 4190 if (c->page) { 4191 if (flags & SO_TOTAL) 4192 x = c->page->objects; 4193 else if (flags & SO_OBJECTS) 4194 x = c->page->inuse; 4195 else 4196 x = 1; 4197 4198 total += x; 4199 nodes[c->node] += x; 4200 } 4201 per_cpu[c->node]++; 4202 } 4203 } 4204 4205 lock_memory_hotplug(); 4206#ifdef CONFIG_SLUB_DEBUG 4207 if (flags & SO_ALL) { 4208 for_each_node_state(node, N_NORMAL_MEMORY) { 4209 struct kmem_cache_node *n = get_node(s, node); 4210 4211 if (flags & SO_TOTAL) 4212 x = atomic_long_read(&n->total_objects); 
static ssize_t show_slab_objects(struct kmem_cache *s,
                            char *buf, unsigned long flags)
{
        unsigned long total = 0;
        int node;
        int x;
        unsigned long *nodes;
        unsigned long *per_cpu;

        nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
        if (!nodes)
                return -ENOMEM;
        per_cpu = nodes + nr_node_ids;

        if (flags & SO_CPU) {
                int cpu;

                for_each_possible_cpu(cpu) {
                        struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);

                        if (!c || c->node < 0)
                                continue;

                        if (c->page) {
                                if (flags & SO_TOTAL)
                                        x = c->page->objects;
                                else if (flags & SO_OBJECTS)
                                        x = c->page->inuse;
                                else
                                        x = 1;

                                total += x;
                                nodes[c->node] += x;
                        }
                        per_cpu[c->node]++;
                }
        }

        lock_memory_hotplug();
#ifdef CONFIG_SLUB_DEBUG
        if (flags & SO_ALL) {
                for_each_node_state(node, N_NORMAL_MEMORY) {
                        struct kmem_cache_node *n = get_node(s, node);

                        if (flags & SO_TOTAL)
                                x = atomic_long_read(&n->total_objects);
                        else if (flags & SO_OBJECTS)
                                x = atomic_long_read(&n->total_objects) -
                                        count_partial(n, count_free);
                        else
                                x = atomic_long_read(&n->nr_slabs);
                        total += x;
                        nodes[node] += x;
                }

        } else
#endif
        if (flags & SO_PARTIAL) {
                for_each_node_state(node, N_NORMAL_MEMORY) {
                        struct kmem_cache_node *n = get_node(s, node);

                        if (flags & SO_TOTAL)
                                x = count_partial(n, count_total);
                        else if (flags & SO_OBJECTS)
                                x = count_partial(n, count_inuse);
                        else
                                x = n->nr_partial;
                        total += x;
                        nodes[node] += x;
                }
        }
        x = sprintf(buf, "%lu", total);
#ifdef CONFIG_NUMA
        for_each_node_state(node, N_NORMAL_MEMORY)
                if (nodes[node])
                        x += sprintf(buf + x, " N%d=%lu",
                                        node, nodes[node]);
#endif
        unlock_memory_hotplug();
        kfree(nodes);
        return x + sprintf(buf + x, "\n");
}

#ifdef CONFIG_SLUB_DEBUG
static int any_slab_objects(struct kmem_cache *s)
{
        int node;

        for_each_online_node(node) {
                struct kmem_cache_node *n = get_node(s, node);

                if (!n)
                        continue;

                if (atomic_long_read(&n->total_objects))
                        return 1;
        }
        return 0;
}
#endif

#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
#define to_slab(n) container_of(n, struct kmem_cache, kobj)

struct slab_attribute {
        struct attribute attr;
        ssize_t (*show)(struct kmem_cache *s, char *buf);
        ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
};

#define SLAB_ATTR_RO(_name) \
        static struct slab_attribute _name##_attr = __ATTR_RO(_name)

#define SLAB_ATTR(_name) \
        static struct slab_attribute _name##_attr = \
        __ATTR(_name, 0644, _name##_show, _name##_store)

static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
{
        return sprintf(buf, "%d\n", s->size);
}
SLAB_ATTR_RO(slab_size);

static ssize_t align_show(struct kmem_cache *s, char *buf)
{
        return sprintf(buf, "%d\n", s->align);
}
SLAB_ATTR_RO(align);

static ssize_t object_size_show(struct kmem_cache *s, char *buf)
{
        return sprintf(buf, "%d\n", s->objsize);
}
SLAB_ATTR_RO(object_size);

static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
{
        return sprintf(buf, "%d\n", oo_objects(s->oo));
}
SLAB_ATTR_RO(objs_per_slab);

static ssize_t order_store(struct kmem_cache *s,
                                const char *buf, size_t length)
{
        unsigned long order;
        int err;

        err = strict_strtoul(buf, 10, &order);
        if (err)
                return err;

        if (order > slub_max_order || order < slub_min_order)
                return -EINVAL;

        calculate_sizes(s, order);
        return length;
}

static ssize_t order_show(struct kmem_cache *s, char *buf)
{
        return sprintf(buf, "%d\n", oo_order(s->oo));
}
SLAB_ATTR(order);

static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
{
        return sprintf(buf, "%lu\n", s->min_partial);
}

static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
                                 size_t length)
{
        unsigned long min;
        int err;

        err = strict_strtoul(buf, 10, &min);
        if (err)
                return err;

        set_min_partial(s, min);
        return length;
}
SLAB_ATTR(min_partial);
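
/*
 * A sketch of the attribute plumbing (illustrative, not verbatim):
 * SLAB_ATTR_RO(align) expands to roughly
 *
 *      static struct slab_attribute align_attr = {
 *              .attr = { .name = "align", .mode = 0444 },
 *              .show = align_show,
 *      };
 *
 * Each attribute then appears as one file in the per-cache sysfs
 * directory, e.g. (example output):
 *
 *      # cat /sys/kernel/slab/kmalloc-64/order
 *      0
 */
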
static ssize_t ctor_show(struct kmem_cache *s, char *buf)
{
        if (!s->ctor)
                return 0;
        return sprintf(buf, "%pS\n", s->ctor);
}
SLAB_ATTR_RO(ctor);

static ssize_t aliases_show(struct kmem_cache *s, char *buf)
{
        return sprintf(buf, "%d\n", s->refcount - 1);
}
SLAB_ATTR_RO(aliases);

static ssize_t partial_show(struct kmem_cache *s, char *buf)
{
        return show_slab_objects(s, buf, SO_PARTIAL);
}
SLAB_ATTR_RO(partial);

static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
{
        return show_slab_objects(s, buf, SO_CPU);
}
SLAB_ATTR_RO(cpu_slabs);

static ssize_t objects_show(struct kmem_cache *s, char *buf)
{
        return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
}
SLAB_ATTR_RO(objects);

static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
{
        return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
}
SLAB_ATTR_RO(objects_partial);

static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
{
        return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
}

static ssize_t reclaim_account_store(struct kmem_cache *s,
                                const char *buf, size_t length)
{
        s->flags &= ~SLAB_RECLAIM_ACCOUNT;
        if (buf[0] == '1')
                s->flags |= SLAB_RECLAIM_ACCOUNT;
        return length;
}
SLAB_ATTR(reclaim_account);

static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
{
        return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
}
SLAB_ATTR_RO(hwcache_align);

#ifdef CONFIG_ZONE_DMA
static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
{
        return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
}
SLAB_ATTR_RO(cache_dma);
#endif

static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
{
        return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
}
SLAB_ATTR_RO(destroy_by_rcu);

static ssize_t reserved_show(struct kmem_cache *s, char *buf)
{
        return sprintf(buf, "%d\n", s->reserved);
}
SLAB_ATTR_RO(reserved);

#ifdef CONFIG_SLUB_DEBUG
static ssize_t slabs_show(struct kmem_cache *s, char *buf)
{
        return show_slab_objects(s, buf, SO_ALL);
}
SLAB_ATTR_RO(slabs);

static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
{
        return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
}
SLAB_ATTR_RO(total_objects);

static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
{
        return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
}

static ssize_t sanity_checks_store(struct kmem_cache *s,
                                const char *buf, size_t length)
{
        s->flags &= ~SLAB_DEBUG_FREE;
        if (buf[0] == '1') {
                s->flags &= ~__CMPXCHG_DOUBLE;
                s->flags |= SLAB_DEBUG_FREE;
        }
        return length;
}
SLAB_ATTR(sanity_checks);

static ssize_t trace_show(struct kmem_cache *s, char *buf)
{
        return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
}

static ssize_t trace_store(struct kmem_cache *s, const char *buf,
                                size_t length)
{
        s->flags &= ~SLAB_TRACE;
        if (buf[0] == '1') {
                s->flags &= ~__CMPXCHG_DOUBLE;
                s->flags |= SLAB_TRACE;
        }
        return length;
}
SLAB_ATTR(trace);
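
/*
 * Note on the __CMPXCHG_DOUBLE handling in the debug stores above and
 * below: enabling a debug option at runtime also clears __CMPXCHG_DOUBLE,
 * since the debug checks cannot be carried out under the lockless
 * cmpxchg_double update of freelist and counters; the cache then falls
 * back to the slab_lock based slow path.
 */
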
static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
{
        return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
}

static ssize_t red_zone_store(struct kmem_cache *s,
                                const char *buf, size_t length)
{
        if (any_slab_objects(s))
                return -EBUSY;

        s->flags &= ~SLAB_RED_ZONE;
        if (buf[0] == '1') {
                s->flags &= ~__CMPXCHG_DOUBLE;
                s->flags |= SLAB_RED_ZONE;
        }
        calculate_sizes(s, -1);
        return length;
}
SLAB_ATTR(red_zone);

static ssize_t poison_show(struct kmem_cache *s, char *buf)
{
        return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
}

static ssize_t poison_store(struct kmem_cache *s,
                                const char *buf, size_t length)
{
        if (any_slab_objects(s))
                return -EBUSY;

        s->flags &= ~SLAB_POISON;
        if (buf[0] == '1') {
                s->flags &= ~__CMPXCHG_DOUBLE;
                s->flags |= SLAB_POISON;
        }
        calculate_sizes(s, -1);
        return length;
}
SLAB_ATTR(poison);

static ssize_t store_user_show(struct kmem_cache *s, char *buf)
{
        return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
}

static ssize_t store_user_store(struct kmem_cache *s,
                                const char *buf, size_t length)
{
        if (any_slab_objects(s))
                return -EBUSY;

        s->flags &= ~SLAB_STORE_USER;
        if (buf[0] == '1') {
                s->flags &= ~__CMPXCHG_DOUBLE;
                s->flags |= SLAB_STORE_USER;
        }
        calculate_sizes(s, -1);
        return length;
}
SLAB_ATTR(store_user);

static ssize_t validate_show(struct kmem_cache *s, char *buf)
{
        return 0;
}

static ssize_t validate_store(struct kmem_cache *s,
                        const char *buf, size_t length)
{
        int ret = -EINVAL;

        if (buf[0] == '1') {
                ret = validate_slab_cache(s);
                if (ret >= 0)
                        ret = length;
        }
        return ret;
}
SLAB_ATTR(validate);

static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
{
        if (!(s->flags & SLAB_STORE_USER))
                return -ENOSYS;
        return list_locations(s, buf, TRACK_ALLOC);
}
SLAB_ATTR_RO(alloc_calls);

static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
{
        if (!(s->flags & SLAB_STORE_USER))
                return -ENOSYS;
        return list_locations(s, buf, TRACK_FREE);
}
SLAB_ATTR_RO(free_calls);
#endif /* CONFIG_SLUB_DEBUG */

#ifdef CONFIG_FAILSLAB
static ssize_t failslab_show(struct kmem_cache *s, char *buf)
{
        return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
}

static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
                                size_t length)
{
        s->flags &= ~SLAB_FAILSLAB;
        if (buf[0] == '1')
                s->flags |= SLAB_FAILSLAB;
        return length;
}
SLAB_ATTR(failslab);
#endif

static ssize_t shrink_show(struct kmem_cache *s, char *buf)
{
        return 0;
}

static ssize_t shrink_store(struct kmem_cache *s,
                        const char *buf, size_t length)
{
        if (buf[0] == '1') {
                int rc = kmem_cache_shrink(s);

                if (rc)
                        return rc;
        } else
                return -EINVAL;
        return length;
}
SLAB_ATTR(shrink);

#ifdef CONFIG_NUMA
static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
{
        return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10);
}

static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
                                const char *buf, size_t length)
{
        unsigned long ratio;
        int err;

        err = strict_strtoul(buf, 10, &ratio);
        if (err)
                return err;

        if (ratio <= 100)
                s->remote_node_defrag_ratio = ratio * 10;

        return length;
}
SLAB_ATTR(remote_node_defrag_ratio);
#endif
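
/*
 * remote_node_defrag_ratio is kept internally scaled by ten, so the sysfs
 * file works in percent. E.g. (illustrative):
 *
 *      # echo 50 > /sys/kernel/slab/kmalloc-64/remote_node_defrag_ratio
 *      # cat /sys/kernel/slab/kmalloc-64/remote_node_defrag_ratio
 *      50
 *
 * which stores 500 in s->remote_node_defrag_ratio.
 */
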
#ifdef CONFIG_SLUB_STATS
static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
{
        unsigned long sum = 0;
        int cpu;
        int len;
        int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);

        if (!data)
                return -ENOMEM;

        for_each_online_cpu(cpu) {
                unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];

                data[cpu] = x;
                sum += x;
        }

        len = sprintf(buf, "%lu", sum);

#ifdef CONFIG_SMP
        for_each_online_cpu(cpu) {
                if (data[cpu] && len < PAGE_SIZE - 20)
                        len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]);
        }
#endif
        kfree(data);
        return len + sprintf(buf + len, "\n");
}

static void clear_stat(struct kmem_cache *s, enum stat_item si)
{
        int cpu;

        for_each_online_cpu(cpu)
                per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
}

#define STAT_ATTR(si, text)                                     \
static ssize_t text##_show(struct kmem_cache *s, char *buf)     \
{                                                               \
        return show_stat(s, buf, si);                           \
}                                                               \
static ssize_t text##_store(struct kmem_cache *s,               \
                                const char *buf, size_t length) \
{                                                               \
        if (buf[0] != '0')                                      \
                return -EINVAL;                                 \
        clear_stat(s, si);                                      \
        return length;                                          \
}                                                               \
SLAB_ATTR(text)

STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
STAT_ATTR(FREE_FASTPATH, free_fastpath);
STAT_ATTR(FREE_SLOWPATH, free_slowpath);
STAT_ATTR(FREE_FROZEN, free_frozen);
STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
STAT_ATTR(ALLOC_SLAB, alloc_slab);
STAT_ATTR(ALLOC_REFILL, alloc_refill);
STAT_ATTR(FREE_SLAB, free_slab);
STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
STAT_ATTR(ORDER_FALLBACK, order_fallback);
STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
#endif
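
/*
 * As a sketch, STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath) above generates
 * alloc_fastpath_show(), which prints the sum plus a per-cpu breakdown
 * via show_stat(), and alloc_fastpath_store(), which accepts only "0"
 * and resets the counter. Example session (values illustrative):
 *
 *      # cat /sys/kernel/slab/kmalloc-64/alloc_fastpath
 *      4162 C0=2081 C1=2081
 *      # echo 0 > /sys/kernel/slab/kmalloc-64/alloc_fastpath
 */
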
static struct attribute *slab_attrs[] = {
        &slab_size_attr.attr,
        &object_size_attr.attr,
        &objs_per_slab_attr.attr,
        &order_attr.attr,
        &min_partial_attr.attr,
        &objects_attr.attr,
        &objects_partial_attr.attr,
        &partial_attr.attr,
        &cpu_slabs_attr.attr,
        &ctor_attr.attr,
        &aliases_attr.attr,
        &align_attr.attr,
        &hwcache_align_attr.attr,
        &reclaim_account_attr.attr,
        &destroy_by_rcu_attr.attr,
        &shrink_attr.attr,
        &reserved_attr.attr,
#ifdef CONFIG_SLUB_DEBUG
        &total_objects_attr.attr,
        &slabs_attr.attr,
        &sanity_checks_attr.attr,
        &trace_attr.attr,
        &red_zone_attr.attr,
        &poison_attr.attr,
        &store_user_attr.attr,
        &validate_attr.attr,
        &alloc_calls_attr.attr,
        &free_calls_attr.attr,
#endif
#ifdef CONFIG_ZONE_DMA
        &cache_dma_attr.attr,
#endif
#ifdef CONFIG_NUMA
        &remote_node_defrag_ratio_attr.attr,
#endif
#ifdef CONFIG_SLUB_STATS
        &alloc_fastpath_attr.attr,
        &alloc_slowpath_attr.attr,
        &free_fastpath_attr.attr,
        &free_slowpath_attr.attr,
        &free_frozen_attr.attr,
        &free_add_partial_attr.attr,
        &free_remove_partial_attr.attr,
        &alloc_from_partial_attr.attr,
        &alloc_slab_attr.attr,
        &alloc_refill_attr.attr,
        &free_slab_attr.attr,
        &cpuslab_flush_attr.attr,
        &deactivate_full_attr.attr,
        &deactivate_empty_attr.attr,
        &deactivate_to_head_attr.attr,
        &deactivate_to_tail_attr.attr,
        &deactivate_remote_frees_attr.attr,
        &order_fallback_attr.attr,
        &cmpxchg_double_fail_attr.attr,
        &cmpxchg_double_cpu_fail_attr.attr,
#endif
#ifdef CONFIG_FAILSLAB
        &failslab_attr.attr,
#endif

        NULL
};

static struct attribute_group slab_attr_group = {
        .attrs = slab_attrs,
};

static ssize_t slab_attr_show(struct kobject *kobj,
                                struct attribute *attr,
                                char *buf)
{
        struct slab_attribute *attribute;
        struct kmem_cache *s;
        int err;

        attribute = to_slab_attr(attr);
        s = to_slab(kobj);

        if (!attribute->show)
                return -EIO;

        err = attribute->show(s, buf);

        return err;
}

static ssize_t slab_attr_store(struct kobject *kobj,
                                struct attribute *attr,
                                const char *buf, size_t len)
{
        struct slab_attribute *attribute;
        struct kmem_cache *s;
        int err;

        attribute = to_slab_attr(attr);
        s = to_slab(kobj);

        if (!attribute->store)
                return -EIO;

        err = attribute->store(s, buf, len);

        return err;
}

static void kmem_cache_release(struct kobject *kobj)
{
        struct kmem_cache *s = to_slab(kobj);

        kfree(s->name);
        kfree(s);
}

static const struct sysfs_ops slab_sysfs_ops = {
        .show = slab_attr_show,
        .store = slab_attr_store,
};

static struct kobj_type slab_ktype = {
        .sysfs_ops = &slab_sysfs_ops,
        .release = kmem_cache_release
};

static int uevent_filter(struct kset *kset, struct kobject *kobj)
{
        struct kobj_type *ktype = get_ktype(kobj);

        if (ktype == &slab_ktype)
                return 1;
        return 0;
}

static const struct kset_uevent_ops slab_uevent_ops = {
        .filter = uevent_filter,
};

static struct kset *slab_kset;

#define ID_STR_LENGTH 64

/*
 * Create a unique string id for a slab cache:
 *
 * Format       :[flags-]size
 */
static char *create_unique_id(struct kmem_cache *s)
{
        char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
        char *p = name;

        BUG_ON(!name);

        *p++ = ':';
        /*
         * First flags affecting slabcache operations. We will only
         * get here for aliasable slabs so we do not need to support
         * too many flags. The flags here must cover all flags that
         * are matched during merging to guarantee that the id is
         * unique.
         */
        if (s->flags & SLAB_CACHE_DMA)
                *p++ = 'd';
        if (s->flags & SLAB_RECLAIM_ACCOUNT)
                *p++ = 'a';
        if (s->flags & SLAB_DEBUG_FREE)
                *p++ = 'F';
        if (!(s->flags & SLAB_NOTRACK))
                *p++ = 't';
        if (p != name + 1)
                *p++ = '-';
        p += sprintf(p, "%07d", s->size);
        BUG_ON(p > name + ID_STR_LENGTH - 1);
        return name;
}
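
/*
 * Example ids (derived from the code above, sizes illustrative): a cache
 * of size 64 with tracking enabled yields ":t-0000064"; the same size
 * with SLAB_CACHE_DMA additionally set yields ":dt-0000064". The unique
 * id names the real sysfs directory and the human-readable cache names
 * become symlinks to it (see sysfs_slab_add()/sysfs_slab_alias() below).
 */
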
static int sysfs_slab_add(struct kmem_cache *s)
{
        int err;
        const char *name;
        int unmergeable;

        if (slab_state < SYSFS)
                /* Defer until later */
                return 0;

        unmergeable = slab_unmergeable(s);
        if (unmergeable) {
                /*
                 * Slabcache can never be merged so we can use the name proper.
                 * This is typically the case for debug situations. In that
                 * case we can catch duplicate names easily.
                 */
                sysfs_remove_link(&slab_kset->kobj, s->name);
                name = s->name;
        } else {
                /*
                 * Create a unique name for the slab as a target
                 * for the symlinks.
                 */
                name = create_unique_id(s);
        }

        s->kobj.kset = slab_kset;
        err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name);
        if (err) {
                kobject_put(&s->kobj);
                return err;
        }

        err = sysfs_create_group(&s->kobj, &slab_attr_group);
        if (err) {
                kobject_del(&s->kobj);
                kobject_put(&s->kobj);
                return err;
        }
        kobject_uevent(&s->kobj, KOBJ_ADD);
        if (!unmergeable) {
                /* Setup first alias */
                sysfs_slab_alias(s, s->name);
                kfree(name);
        }
        return 0;
}

static void sysfs_slab_remove(struct kmem_cache *s)
{
        if (slab_state < SYSFS)
                /*
                 * Sysfs has not been setup yet so no need to remove the
                 * cache from sysfs.
                 */
                return;

        kobject_uevent(&s->kobj, KOBJ_REMOVE);
        kobject_del(&s->kobj);
        kobject_put(&s->kobj);
}

/*
 * Need to buffer aliases during bootup until sysfs becomes
 * available lest we lose that information.
 */
struct saved_alias {
        struct kmem_cache *s;
        const char *name;
        struct saved_alias *next;
};

static struct saved_alias *alias_list;

static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
{
        struct saved_alias *al;

        if (slab_state == SYSFS) {
                /*
                 * If we have a leftover link then remove it.
                 */
                sysfs_remove_link(&slab_kset->kobj, name);
                return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
        }

        al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
        if (!al)
                return -ENOMEM;

        al->s = s;
        al->name = name;
        al->next = alias_list;
        alias_list = al;
        return 0;
}

static int __init slab_sysfs_init(void)
{
        struct kmem_cache *s;
        int err;

        down_write(&slub_lock);

        slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
        if (!slab_kset) {
                up_write(&slub_lock);
                printk(KERN_ERR "Cannot register slab subsystem.\n");
                return -ENOSYS;
        }

        slab_state = SYSFS;

        list_for_each_entry(s, &slab_caches, list) {
                err = sysfs_slab_add(s);
                if (err)
                        printk(KERN_ERR "SLUB: Unable to add boot slab %s"
                                                " to sysfs\n", s->name);
        }

        while (alias_list) {
                struct saved_alias *al = alias_list;

                alias_list = alias_list->next;
                err = sysfs_slab_alias(al->s, al->name);
                if (err)
                        printk(KERN_ERR "SLUB: Unable to add boot slab alias"
                                        " %s to sysfs\n", al->name);
                kfree(al);
        }

        up_write(&slub_lock);
        resiliency_test();
        return 0;
}

__initcall(slab_sysfs_init);
#endif /* CONFIG_SYSFS */

/*
 * The /proc/slabinfo ABI
 */
#ifdef CONFIG_SLABINFO
static void print_slabinfo_header(struct seq_file *m)
{
        seq_puts(m, "slabinfo - version: 2.1\n");
        seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
                 "<objperslab> <pagesperslab>");
        seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
        seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
        seq_putc(m, '\n');
}
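
/*
 * The seq_file iterator below walks slab_caches under slub_lock. Each
 * s_show() emits one line in slabinfo 2.1 format; SLUB has no tunables
 * or shared caches, so those columns read as zero. Sample line (values
 * illustrative):
 *
 * kmalloc-64          2980   3136     64   64    1 : tunables    0    0    0 : slabdata     49     49      0
 */
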
static void *s_start(struct seq_file *m, loff_t *pos)
{
        loff_t n = *pos;

        down_read(&slub_lock);
        if (!n)
                print_slabinfo_header(m);

        return seq_list_start(&slab_caches, *pos);
}

static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
        return seq_list_next(p, &slab_caches, pos);
}

static void s_stop(struct seq_file *m, void *p)
{
        up_read(&slub_lock);
}

static int s_show(struct seq_file *m, void *p)
{
        unsigned long nr_partials = 0;
        unsigned long nr_slabs = 0;
        unsigned long nr_inuse = 0;
        unsigned long nr_objs = 0;
        unsigned long nr_free = 0;
        struct kmem_cache *s;
        int node;

        s = list_entry(p, struct kmem_cache, list);

        for_each_online_node(node) {
                struct kmem_cache_node *n = get_node(s, node);

                if (!n)
                        continue;

                nr_partials += n->nr_partial;
                nr_slabs += atomic_long_read(&n->nr_slabs);
                nr_objs += atomic_long_read(&n->total_objects);
                nr_free += count_partial(n, count_free);
        }

        nr_inuse = nr_objs - nr_free;

        seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse,
                   nr_objs, s->size, oo_objects(s->oo),
                   (1 << oo_order(s->oo)));
        seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0);
        seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs,
                   0UL);
        seq_putc(m, '\n');
        return 0;
}

static const struct seq_operations slabinfo_op = {
        .start = s_start,
        .next = s_next,
        .stop = s_stop,
        .show = s_show,
};

static int slabinfo_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &slabinfo_op);
}

static const struct file_operations proc_slabinfo_operations = {
        .open           = slabinfo_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release,
};

static int __init slab_proc_init(void)
{
        proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations);
        return 0;
}
module_init(slab_proc_init);
#endif /* CONFIG_SLABINFO */