slub.c revision 8ff12cfc009a2a38d87fa7058226fe197bb2696f
1/* 2 * SLUB: A slab allocator that limits cache line use instead of queuing 3 * objects in per cpu and per node lists. 4 * 5 * The allocator synchronizes using per slab locks and only 6 * uses a centralized lock to manage a pool of partial slabs. 7 * 8 * (C) 2007 SGI, Christoph Lameter <clameter@sgi.com> 9 */ 10 11#include <linux/mm.h> 12#include <linux/module.h> 13#include <linux/bit_spinlock.h> 14#include <linux/interrupt.h> 15#include <linux/bitops.h> 16#include <linux/slab.h> 17#include <linux/seq_file.h> 18#include <linux/cpu.h> 19#include <linux/cpuset.h> 20#include <linux/mempolicy.h> 21#include <linux/ctype.h> 22#include <linux/kallsyms.h> 23#include <linux/memory.h> 24 25/* 26 * Lock order: 27 * 1. slab_lock(page) 28 * 2. slab->list_lock 29 * 30 * The slab_lock protects operations on the objects of a particular 31 * slab and its metadata in the page struct. If the slab lock 32 * has been taken then no allocations nor frees can be performed 33 * on the objects in the slab nor can the slab be added or removed 34 * from the partial or full lists since this would mean modifying 35 * the page struct of the slab. 36 * 37 * The list_lock protects the partial and full list on each node and 38 * the partial slab counter. If taken then no new slabs may be added or 39 * removed from the lists nor can the number of partial slabs be modified. 40 * (Note that the total number of slabs is an atomic value that may be 41 * modified without taking the list lock). 42 * 43 * The list_lock is a centralized lock and thus we avoid taking it as 44 * much as possible. As long as SLUB does not have to handle partial 45 * slabs, operations can continue without any centralized lock. F.e. 46 * allocating a long series of objects that fill up slabs does not require 47 * the list lock. 48 * 49 * The lock order is sometimes inverted when we are trying to get a slab 50 * off a list. We take the list_lock and then look for a page on the list 51 * to use. While we do that objects in the slabs may be freed. We can 52 * only operate on the slab if we have also taken the slab_lock. So we use 53 * a slab_trylock() on the slab. If trylock was successful then no frees 54 * can occur anymore and we can use the slab for allocations etc. If the 55 * slab_trylock() does not succeed then frees are in progress in the slab and 56 * we must stay away from it for a while since we may cause a bouncing 57 * cacheline if we try to acquire the lock. So go onto the next slab. 58 * If all pages are busy then we may allocate a new slab instead of reusing 59 * a partial slab. A new slab has no one operating on it and thus there is 60 * no danger of cacheline contention. 61 * 62 * Interrupts are disabled during allocation and deallocation in order to 63 * make the slab allocator safe to use in the context of an irq. In addition 64 * interrupts are disabled to ensure that the processor does not change 65 * while handling per_cpu slabs, due to kernel preemption. 66 * 67 * SLUB assigns one slab for allocation to each processor. 68 * Allocations only occur from these slabs called cpu slabs. 69 * 70 * Slabs with free elements are kept on a partial list and during regular 71 * operations no list for full slabs is used. If an object in a full slab is 72 * freed then the slab will show up again on the partial lists. 73 * We track full slabs for debugging purposes though because otherwise we 74 * cannot scan all objects. 75 * 76 * Slabs are freed when they become empty.
Teardown and setup are 77 * minimal so we rely on the page allocator's per cpu caches for 78 * fast frees and allocs. 79 * 80 * Overloading of page flags that are otherwise used for LRU management. 81 * 82 * PageActive The slab is frozen and exempt from list processing. 83 * This means that the slab is dedicated to a purpose 84 * such as satisfying allocations for a specific 85 * processor. Objects may be freed in the slab while 86 * it is frozen but slab_free will then skip the usual 87 * list operations. It is up to the processor holding 88 * the slab to integrate the slab into the slab lists 89 * when the slab is no longer needed. 90 * 91 * One use of this flag is to mark slabs that are 92 * used for allocations. Then such a slab becomes a cpu 93 * slab. The cpu slab may be equipped with an additional 94 * freelist that allows lockless access to 95 * free objects in addition to the regular freelist 96 * that requires the slab lock. 97 * 98 * PageError Slab requires special handling due to debug 99 * options set. This moves slab handling out of 100 * the fast path and disables lockless freelists. 101 */ 102 103#define FROZEN (1 << PG_active) 104 105#ifdef CONFIG_SLUB_DEBUG 106#define SLABDEBUG (1 << PG_error) 107#else 108#define SLABDEBUG 0 109#endif 110 111static inline int SlabFrozen(struct page *page) 112{ 113 return page->flags & FROZEN; 114} 115 116static inline void SetSlabFrozen(struct page *page) 117{ 118 page->flags |= FROZEN; 119} 120 121static inline void ClearSlabFrozen(struct page *page) 122{ 123 page->flags &= ~FROZEN; 124} 125 126static inline int SlabDebug(struct page *page) 127{ 128 return page->flags & SLABDEBUG; 129} 130 131static inline void SetSlabDebug(struct page *page) 132{ 133 page->flags |= SLABDEBUG; 134} 135 136static inline void ClearSlabDebug(struct page *page) 137{ 138 page->flags &= ~SLABDEBUG; 139} 140 141/* 142 * Issues still to be resolved: 143 * 144 * - Support PAGE_ALLOC_DEBUG. Should be easy to do. 145 * 146 * - Variable sizing of the per node arrays 147 */ 148 149/* Enable to test recovery from slab corruption on boot */ 150#undef SLUB_RESILIENCY_TEST 151 152/* 153 * Currently the fastpath is not supported if preemption is enabled. 154 */ 155#if defined(CONFIG_FAST_CMPXCHG_LOCAL) && !defined(CONFIG_PREEMPT) 156#define SLUB_FASTPATH 157#endif 158 159#if PAGE_SHIFT <= 12 160 161/* 162 * Small page size. Make sure that we do not fragment memory 163 */ 164#define DEFAULT_MAX_ORDER 1 165#define DEFAULT_MIN_OBJECTS 4 166 167#else 168 169/* 170 * Large page machines are customarily able to handle larger 171 * page orders. 172 */ 173#define DEFAULT_MAX_ORDER 2 174#define DEFAULT_MIN_OBJECTS 8 175 176#endif 177 178/* 179 * Minimum number of partial slabs. These will be left on the partial 180 * lists even if they are empty. kmem_cache_shrink may reclaim them. 181 */ 182#define MIN_PARTIAL 5 183 184/* 185 * Maximum number of desirable partial slabs. 186 * The existence of more partial slabs makes kmem_cache_shrink 187 * sort the partial list by the number of objects in them.
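 *
 * A sketch of the intended interplay of the two limits: a node holding
 * twelve partial slabs exceeds MAX_PARTIAL, so kmem_cache_shrink will
 * sort the list to put the almost-full slabs first; the nearly empty
 * slabs at the tail then drain out and can be reclaimed.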
188 */ 189#define MAX_PARTIAL 10 190 191#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ 192 SLAB_POISON | SLAB_STORE_USER) 193 194/* 195 * Set of flags that will prevent slab merging 196 */ 197#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 198 SLAB_TRACE | SLAB_DESTROY_BY_RCU) 199 200#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ 201 SLAB_CACHE_DMA) 202 203#ifndef ARCH_KMALLOC_MINALIGN 204#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) 205#endif 206 207#ifndef ARCH_SLAB_MINALIGN 208#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) 209#endif 210 211/* Internal SLUB flags */ 212#define __OBJECT_POISON 0x80000000 /* Poison object */ 213#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ 214 215/* Not all arches define cache_line_size */ 216#ifndef cache_line_size 217#define cache_line_size() L1_CACHE_BYTES 218#endif 219 220static int kmem_size = sizeof(struct kmem_cache); 221 222#ifdef CONFIG_SMP 223static struct notifier_block slab_notifier; 224#endif 225 226static enum { 227 DOWN, /* No slab functionality available */ 228 PARTIAL, /* kmem_cache_open() works but kmalloc does not */ 229 UP, /* Everything works but does not show up in sysfs */ 230 SYSFS /* Sysfs up */ 231} slab_state = DOWN; 232 233/* A list of all slab caches on the system */ 234static DECLARE_RWSEM(slub_lock); 235static LIST_HEAD(slab_caches); 236 237/* 238 * Tracking user of a slab. 239 */ 240struct track { 241 void *addr; /* Called from address */ 242 int cpu; /* Was running on cpu */ 243 int pid; /* Pid context */ 244 unsigned long when; /* When did the operation occur */ 245}; 246 247enum track_item { TRACK_ALLOC, TRACK_FREE }; 248 249#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) 250static int sysfs_slab_add(struct kmem_cache *); 251static int sysfs_slab_alias(struct kmem_cache *, const char *); 252static void sysfs_slab_remove(struct kmem_cache *); 253 254#else 255static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } 256static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) 257 { return 0; } 258static inline void sysfs_slab_remove(struct kmem_cache *s) 259{ 260 kfree(s); 261} 262 263#endif 264 265static inline void stat(struct kmem_cache_cpu *c, enum stat_item si) 266{ 267#ifdef CONFIG_SLUB_STATS 268 c->stat[si]++; 269#endif 270} 271 272/******************************************************************** 273 * Core slab cache functions 274 *******************************************************************/ 275 276int slab_is_available(void) 277{ 278 return slab_state >= UP; 279} 280 281static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) 282{ 283#ifdef CONFIG_NUMA 284 return s->node[node]; 285#else 286 return &s->local_node; 287#endif 288} 289 290static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu) 291{ 292#ifdef CONFIG_SMP 293 return s->cpu_slab[cpu]; 294#else 295 return &s->cpu_slab; 296#endif 297} 298 299/* 300 * The end pointer in a slab is special. It points to the first object in the 301 * slab but has bit 0 set to mark it. 302 * 303 * Note that SLUB relies on page_mapping returning NULL for pages with bit 0 304 * in the mapping set. 
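 *
 * Illustrative arithmetic (using PAGE_MAPPING_ANON == 1 as the bit 0
 * marker, and page->end = start + 1 as set up in new_slab() below):
 *
 *	is_end(page->end)  -> 1			(bit 0 is set)
 *	slab_address(page) -> page->end - 1	(the slab's base address)
 *
 * so the end marker doubles as an encoded copy of the slab's start.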
305 */ 306static inline int is_end(void *addr) 307{ 308 return (unsigned long)addr & PAGE_MAPPING_ANON; 309} 310 311void *slab_address(struct page *page) 312{ 313 return page->end - PAGE_MAPPING_ANON; 314} 315 316static inline int check_valid_pointer(struct kmem_cache *s, 317 struct page *page, const void *object) 318{ 319 void *base; 320 321 if (object == page->end) 322 return 1; 323 324 base = slab_address(page); 325 if (object < base || object >= base + s->objects * s->size || 326 (object - base) % s->size) { 327 return 0; 328 } 329 330 return 1; 331} 332 333/* 334 * Slow version of get and set free pointer. 335 * 336 * This version requires touching the cache lines of kmem_cache which 337 * we avoid doing in the fast alloc/free paths. There we obtain the offset 338 * from the page struct. 339 */ 340static inline void *get_freepointer(struct kmem_cache *s, void *object) 341{ 342 return *(void **)(object + s->offset); 343} 344 345static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) 346{ 347 *(void **)(object + s->offset) = fp; 348} 349 350/* Loop over all objects in a slab */ 351#define for_each_object(__p, __s, __addr) \ 352 for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\ 353 __p += (__s)->size) 354 355/* Scan freelist */ 356#define for_each_free_object(__p, __s, __free) \ 357 for (__p = (__free); (__p) != page->end; __p = get_freepointer((__s),\ 358 __p)) 359 360/* Determine object index from a given position */ 361static inline int slab_index(void *p, struct kmem_cache *s, void *addr) 362{ 363 return (p - addr) / s->size; 364} 365 366#ifdef CONFIG_SLUB_DEBUG 367/* 368 * Debug settings: 369 */ 370#ifdef CONFIG_SLUB_DEBUG_ON 371static int slub_debug = DEBUG_DEFAULT_FLAGS; 372#else 373static int slub_debug; 374#endif 375 376static char *slub_debug_slabs; 377 378/* 379 * Object debugging 380 */ 381static void print_section(char *text, u8 *addr, unsigned int length) 382{ 383 int i, offset; 384 int newline = 1; 385 char ascii[17]; 386 387 ascii[16] = 0; 388 389 for (i = 0; i < length; i++) { 390 if (newline) { 391 printk(KERN_ERR "%8s 0x%p: ", text, addr + i); 392 newline = 0; 393 } 394 printk(KERN_CONT " %02x", addr[i]); 395 offset = i % 16; 396 ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; 397 if (offset == 15) { 398 printk(KERN_CONT " %s\n", ascii); 399 newline = 1; 400 } 401 } 402 if (!newline) { 403 i %= 16; 404 while (i < 16) { 405 printk(KERN_CONT " "); 406 ascii[i] = ' '; 407 i++; 408 } 409 printk(KERN_CONT " %s\n", ascii); 410 } 411} 412 413static struct track *get_track(struct kmem_cache *s, void *object, 414 enum track_item alloc) 415{ 416 struct track *p; 417 418 if (s->offset) 419 p = object + s->offset + sizeof(void *); 420 else 421 p = object + s->inuse; 422 423 return p + alloc; 424} 425 426static void set_track(struct kmem_cache *s, void *object, 427 enum track_item alloc, void *addr) 428{ 429 struct track *p; 430 431 if (s->offset) 432 p = object + s->offset + sizeof(void *); 433 else 434 p = object + s->inuse; 435 436 p += alloc; 437 if (addr) { 438 p->addr = addr; 439 p->cpu = smp_processor_id(); 440 p->pid = current ? 
current->pid : -1; 441 p->when = jiffies; 442 } else 443 memset(p, 0, sizeof(struct track)); 444} 445 446static void init_tracking(struct kmem_cache *s, void *object) 447{ 448 if (!(s->flags & SLAB_STORE_USER)) 449 return; 450 451 set_track(s, object, TRACK_FREE, NULL); 452 set_track(s, object, TRACK_ALLOC, NULL); 453} 454 455static void print_track(const char *s, struct track *t) 456{ 457 if (!t->addr) 458 return; 459 460 printk(KERN_ERR "INFO: %s in ", s); 461 __print_symbol("%s", (unsigned long)t->addr); 462 printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid); 463} 464 465static void print_tracking(struct kmem_cache *s, void *object) 466{ 467 if (!(s->flags & SLAB_STORE_USER)) 468 return; 469 470 print_track("Allocated", get_track(s, object, TRACK_ALLOC)); 471 print_track("Freed", get_track(s, object, TRACK_FREE)); 472} 473 474static void print_page_info(struct page *page) 475{ 476 printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n", 477 page, page->inuse, page->freelist, page->flags); 478 479} 480 481static void slab_bug(struct kmem_cache *s, char *fmt, ...) 482{ 483 va_list args; 484 char buf[100]; 485 486 va_start(args, fmt); 487 vsnprintf(buf, sizeof(buf), fmt, args); 488 va_end(args); 489 printk(KERN_ERR "========================================" 490 "=====================================\n"); 491 printk(KERN_ERR "BUG %s: %s\n", s->name, buf); 492 printk(KERN_ERR "----------------------------------------" 493 "-------------------------------------\n\n"); 494} 495 496static void slab_fix(struct kmem_cache *s, char *fmt, ...) 497{ 498 va_list args; 499 char buf[100]; 500 501 va_start(args, fmt); 502 vsnprintf(buf, sizeof(buf), fmt, args); 503 va_end(args); 504 printk(KERN_ERR "FIX %s: %s\n", s->name, buf); 505} 506 507static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) 508{ 509 unsigned int off; /* Offset of last byte */ 510 u8 *addr = slab_address(page); 511 512 print_tracking(s, p); 513 514 print_page_info(page); 515 516 printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", 517 p, p - addr, get_freepointer(s, p)); 518 519 if (p > addr + 16) 520 print_section("Bytes b4", p - 16, 16); 521 522 print_section("Object", p, min(s->objsize, 128)); 523 524 if (s->flags & SLAB_RED_ZONE) 525 print_section("Redzone", p + s->objsize, 526 s->inuse - s->objsize); 527 528 if (s->offset) 529 off = s->offset + sizeof(void *); 530 else 531 off = s->inuse; 532 533 if (s->flags & SLAB_STORE_USER) 534 off += 2 * sizeof(struct track); 535 536 if (off != s->size) 537 /* Beginning of the filler is the free pointer */ 538 print_section("Padding", p + off, s->size - off); 539 540 dump_stack(); 541} 542 543static void object_err(struct kmem_cache *s, struct page *page, 544 u8 *object, char *reason) 545{ 546 slab_bug(s, reason); 547 print_trailer(s, page, object); 548} 549 550static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) 551{ 552 va_list args; 553 char buf[100]; 554 555 va_start(args, fmt); 556 vsnprintf(buf, sizeof(buf), fmt, args); 557 va_end(args); 558 slab_bug(s, "%s", buf); 559 print_page_info(page); 560 dump_stack(); 561} 562 563static void init_object(struct kmem_cache *s, void *object, int active) 564{ 565 u8 *p = object; 566 567 if (s->flags & __OBJECT_POISON) { 568 memset(p, POISON_FREE, s->objsize - 1); 569 p[s->objsize - 1] = POISON_END; 570 } 571 572 if (s->flags & SLAB_RED_ZONE) 573 memset(p + s->objsize, 574 active ? 
SLUB_RED_ACTIVE : SLUB_RED_INACTIVE, 575 s->inuse - s->objsize); 576} 577 578static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) 579{ 580 while (bytes) { 581 if (*start != (u8)value) 582 return start; 583 start++; 584 bytes--; 585 } 586 return NULL; 587} 588 589static void restore_bytes(struct kmem_cache *s, char *message, u8 data, 590 void *from, void *to) 591{ 592 slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data); 593 memset(from, data, to - from); 594} 595 596static int check_bytes_and_report(struct kmem_cache *s, struct page *page, 597 u8 *object, char *what, 598 u8 *start, unsigned int value, unsigned int bytes) 599{ 600 u8 *fault; 601 u8 *end; 602 603 fault = check_bytes(start, value, bytes); 604 if (!fault) 605 return 1; 606 607 end = start + bytes; 608 while (end > fault && end[-1] == value) 609 end--; 610 611 slab_bug(s, "%s overwritten", what); 612 printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", 613 fault, end - 1, fault[0], value); 614 print_trailer(s, page, object); 615 616 restore_bytes(s, what, value, fault, end); 617 return 0; 618} 619 620/* 621 * Object layout: 622 * 623 * object address 624 * Bytes of the object to be managed. 625 * If the freepointer may overlay the object then the free 626 * pointer is the first word of the object. 627 * 628 * Poisoning uses 0x6b (POISON_FREE) and the last byte is 629 * 0xa5 (POISON_END) 630 * 631 * object + s->objsize 632 * Padding to reach word boundary. This is also used for Redzoning. 633 * Padding is extended by another word if Redzoning is enabled and 634 * objsize == inuse. 635 * 636 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with 637 * 0xcc (RED_ACTIVE) for objects in use. 638 * 639 * object + s->inuse 640 * Meta data starts here. 641 * 642 * A. Free pointer (if we cannot overwrite object on free) 643 * B. Tracking data for SLAB_STORE_USER 644 * C. Padding to reach required alignment boundary or at minimum 645 * one word if debugging is on to be able to detect writes 646 * before the word boundary. 647 * 648 * Padding is done using 0x5a (POISON_INUSE) 649 * 650 * object + s->size 651 * Nothing is used beyond s->size. 652 * 653 * If slabcaches are merged then the objsize and inuse boundaries are mostly 654 * ignored. And therefore no slab options that rely on these boundaries 655 * may be used with merged slabcaches. 656 */ 657 658static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) 659{ 660 unsigned long off = s->inuse; /* The end of info */ 661 662 if (s->offset) 663 /* Freepointer is placed after the object. */ 664 off += sizeof(void *); 665 666 if (s->flags & SLAB_STORE_USER) 667 /* We also have user information there */ 668 off += 2 * sizeof(struct track); 669 670 if (s->size == off) 671 return 1; 672 673 return check_bytes_and_report(s, page, p, "Object padding", 674 p + off, POISON_INUSE, s->size - off); 675} 676 677static int slab_pad_check(struct kmem_cache *s, struct page *page) 678{ 679 u8 *start; 680 u8 *fault; 681 u8 *end; 682 int length; 683 int remainder; 684 685 if (!(s->flags & SLAB_POISON)) 686 return 1; 687 688 start = slab_address(page); 689 end = start + (PAGE_SIZE << s->order); 690 length = s->objects * s->size; 691 remainder = end - (start + length); 692 if (!remainder) 693 return 1; 694 695 fault = check_bytes(start + length, POISON_INUSE, remainder); 696 if (!fault) 697 return 1; 698 while (end > fault && end[-1] == POISON_INUSE) 699 end--; 700 701 slab_err(s, page, "Padding overwritten. 
0x%p-0x%p", fault, end - 1); 702 print_section("Padding", start, length); 703 704 restore_bytes(s, "slab padding", POISON_INUSE, start, end); 705 return 0; 706} 707 708static int check_object(struct kmem_cache *s, struct page *page, 709 void *object, int active) 710{ 711 u8 *p = object; 712 u8 *endobject = object + s->objsize; 713 714 if (s->flags & SLAB_RED_ZONE) { 715 unsigned int red = 716 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE; 717 718 if (!check_bytes_and_report(s, page, object, "Redzone", 719 endobject, red, s->inuse - s->objsize)) 720 return 0; 721 } else { 722 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) 723 check_bytes_and_report(s, page, p, "Alignment padding", endobject, 724 POISON_INUSE, s->inuse - s->objsize); 725 } 726 727 if (s->flags & SLAB_POISON) { 728 if (!active && (s->flags & __OBJECT_POISON) && 729 (!check_bytes_and_report(s, page, p, "Poison", p, 730 POISON_FREE, s->objsize - 1) || 731 !check_bytes_and_report(s, page, p, "Poison", 732 p + s->objsize - 1, POISON_END, 1))) 733 return 0; 734 /* 735 * check_pad_bytes cleans up on its own. 736 */ 737 check_pad_bytes(s, page, p); 738 } 739 740 if (!s->offset && active) 741 /* 742 * Object and freepointer overlap. Cannot check 743 * freepointer while object is allocated. 744 */ 745 return 1; 746 747 /* Check free pointer validity */ 748 if (!check_valid_pointer(s, page, get_freepointer(s, p))) { 749 object_err(s, page, p, "Freepointer corrupt"); 750 /* 751 * No choice but to zap it and thus loose the remainder 752 * of the free objects in this slab. May cause 753 * another error because the object count is now wrong. 754 */ 755 set_freepointer(s, p, page->end); 756 return 0; 757 } 758 return 1; 759} 760 761static int check_slab(struct kmem_cache *s, struct page *page) 762{ 763 VM_BUG_ON(!irqs_disabled()); 764 765 if (!PageSlab(page)) { 766 slab_err(s, page, "Not a valid slab page"); 767 return 0; 768 } 769 if (page->inuse > s->objects) { 770 slab_err(s, page, "inuse %u > max %u", 771 s->name, page->inuse, s->objects); 772 return 0; 773 } 774 /* Slab_pad_check fixes things up after itself */ 775 slab_pad_check(s, page); 776 return 1; 777} 778 779/* 780 * Determine if a certain object on a page is on the freelist. Must hold the 781 * slab lock to guarantee that the chains are in a consistent state. 782 */ 783static int on_freelist(struct kmem_cache *s, struct page *page, void *search) 784{ 785 int nr = 0; 786 void *fp = page->freelist; 787 void *object = NULL; 788 789 while (fp != page->end && nr <= s->objects) { 790 if (fp == search) 791 return 1; 792 if (!check_valid_pointer(s, page, fp)) { 793 if (object) { 794 object_err(s, page, object, 795 "Freechain corrupt"); 796 set_freepointer(s, object, page->end); 797 break; 798 } else { 799 slab_err(s, page, "Freepointer corrupt"); 800 page->freelist = page->end; 801 page->inuse = s->objects; 802 slab_fix(s, "Freelist cleared"); 803 return 0; 804 } 805 break; 806 } 807 object = fp; 808 fp = get_freepointer(s, object); 809 nr++; 810 } 811 812 if (page->inuse != s->objects - nr) { 813 slab_err(s, page, "Wrong object count. Counter is %d but " 814 "counted were %d", page->inuse, s->objects - nr); 815 page->inuse = s->objects - nr; 816 slab_fix(s, "Object count adjusted."); 817 } 818 return search == NULL; 819} 820 821static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc) 822{ 823 if (s->flags & SLAB_TRACE) { 824 printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", 825 s->name, 826 alloc ? 
"alloc" : "free", 827 object, page->inuse, 828 page->freelist); 829 830 if (!alloc) 831 print_section("Object", (void *)object, s->objsize); 832 833 dump_stack(); 834 } 835} 836 837/* 838 * Tracking of fully allocated slabs for debugging purposes. 839 */ 840static void add_full(struct kmem_cache_node *n, struct page *page) 841{ 842 spin_lock(&n->list_lock); 843 list_add(&page->lru, &n->full); 844 spin_unlock(&n->list_lock); 845} 846 847static void remove_full(struct kmem_cache *s, struct page *page) 848{ 849 struct kmem_cache_node *n; 850 851 if (!(s->flags & SLAB_STORE_USER)) 852 return; 853 854 n = get_node(s, page_to_nid(page)); 855 856 spin_lock(&n->list_lock); 857 list_del(&page->lru); 858 spin_unlock(&n->list_lock); 859} 860 861static void setup_object_debug(struct kmem_cache *s, struct page *page, 862 void *object) 863{ 864 if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) 865 return; 866 867 init_object(s, object, 0); 868 init_tracking(s, object); 869} 870 871static int alloc_debug_processing(struct kmem_cache *s, struct page *page, 872 void *object, void *addr) 873{ 874 if (!check_slab(s, page)) 875 goto bad; 876 877 if (object && !on_freelist(s, page, object)) { 878 object_err(s, page, object, "Object already allocated"); 879 goto bad; 880 } 881 882 if (!check_valid_pointer(s, page, object)) { 883 object_err(s, page, object, "Freelist Pointer check fails"); 884 goto bad; 885 } 886 887 if (object && !check_object(s, page, object, 0)) 888 goto bad; 889 890 /* Success perform special debug activities for allocs */ 891 if (s->flags & SLAB_STORE_USER) 892 set_track(s, object, TRACK_ALLOC, addr); 893 trace(s, page, object, 1); 894 init_object(s, object, 1); 895 return 1; 896 897bad: 898 if (PageSlab(page)) { 899 /* 900 * If this is a slab page then lets do the best we can 901 * to avoid issues in the future. Marking all objects 902 * as used avoids touching the remaining objects. 903 */ 904 slab_fix(s, "Marking all objects used"); 905 page->inuse = s->objects; 906 page->freelist = page->end; 907 } 908 return 0; 909} 910 911static int free_debug_processing(struct kmem_cache *s, struct page *page, 912 void *object, void *addr) 913{ 914 if (!check_slab(s, page)) 915 goto fail; 916 917 if (!check_valid_pointer(s, page, object)) { 918 slab_err(s, page, "Invalid object pointer 0x%p", object); 919 goto fail; 920 } 921 922 if (on_freelist(s, page, object)) { 923 object_err(s, page, object, "Object already free"); 924 goto fail; 925 } 926 927 if (!check_object(s, page, object, 1)) 928 return 0; 929 930 if (unlikely(s != page->slab)) { 931 if (!PageSlab(page)) 932 slab_err(s, page, "Attempt to free object(0x%p) " 933 "outside of slab", object); 934 else 935 if (!page->slab) { 936 printk(KERN_ERR 937 "SLUB <none>: no slab for object 0x%p.\n", 938 object); 939 dump_stack(); 940 } else 941 object_err(s, page, object, 942 "page slab pointer corrupt."); 943 goto fail; 944 } 945 946 /* Special debug activities for freeing objects */ 947 if (!SlabFrozen(page) && page->freelist == page->end) 948 remove_full(s, page); 949 if (s->flags & SLAB_STORE_USER) 950 set_track(s, object, TRACK_FREE, addr); 951 trace(s, page, object, 0); 952 init_object(s, object, 0); 953 return 1; 954 955fail: 956 slab_fix(s, "Object at 0x%p not freed", object); 957 return 0; 958} 959 960static int __init setup_slub_debug(char *str) 961{ 962 slub_debug = DEBUG_DEFAULT_FLAGS; 963 if (*str++ != '=' || !*str) 964 /* 965 * No options specified. Switch on full debugging. 
966 */ 967 goto out; 968 969 if (*str == ',') 970 /* 971 * No options but restriction on slabs. This means full 972 * debugging for slabs matching a pattern. 973 */ 974 goto check_slabs; 975 976 slub_debug = 0; 977 if (*str == '-') 978 /* 979 * Switch off all debugging measures. 980 */ 981 goto out; 982 983 /* 984 * Determine which debug features should be switched on 985 */ 986 for (; *str && *str != ','; str++) { 987 switch (tolower(*str)) { 988 case 'f': 989 slub_debug |= SLAB_DEBUG_FREE; 990 break; 991 case 'z': 992 slub_debug |= SLAB_RED_ZONE; 993 break; 994 case 'p': 995 slub_debug |= SLAB_POISON; 996 break; 997 case 'u': 998 slub_debug |= SLAB_STORE_USER; 999 break; 1000 case 't': 1001 slub_debug |= SLAB_TRACE; 1002 break; 1003 default: 1004 printk(KERN_ERR "slub_debug option '%c' " 1005 "unknown. skipped\n", *str); 1006 } 1007 } 1008 1009check_slabs: 1010 if (*str == ',') 1011 slub_debug_slabs = str + 1; 1012out: 1013 return 1; 1014} 1015 1016__setup("slub_debug", setup_slub_debug); 1017 1018static unsigned long kmem_cache_flags(unsigned long objsize, 1019 unsigned long flags, const char *name, 1020 void (*ctor)(struct kmem_cache *, void *)) 1021{ 1022 /* 1023 * The page->offset field is only 16 bit wide. This is an offset 1024 * in units of words from the beginning of an object. If the slab 1025 * size is bigger than that, we cannot move the free pointer behind the 1026 * object anymore. 1027 * 1028 * On 32 bit platforms the limit is 256k (65535 words of 4 bytes). 1029 * On 64 bit platforms the limit is 512k. 1030 * 1031 * Debugging or ctor may create a need to move the free 1032 * pointer. Fail if this happens. 1033 */ 1034 if (objsize >= 65535 * sizeof(void *)) { 1035 BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON | 1036 SLAB_STORE_USER | SLAB_DESTROY_BY_RCU)); 1037 BUG_ON(ctor); 1038 } else { 1039 /* 1040 * Enable debugging if selected on the kernel command line. 
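 * The match against slub_debug_slabs below is a prefix strncmp, so
 * e.g. slub_debug=U,kmalloc- would (illustratively) store user
 * tracking for every cache whose name begins with "kmalloc-".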
1041 */ 1042 if (slub_debug && (!slub_debug_slabs || 1043 strncmp(slub_debug_slabs, name, 1044 strlen(slub_debug_slabs)) == 0)) 1045 flags |= slub_debug; 1046 } 1047 1048 return flags; 1049} 1050#else 1051static inline void setup_object_debug(struct kmem_cache *s, 1052 struct page *page, void *object) {} 1053 1054static inline int alloc_debug_processing(struct kmem_cache *s, 1055 struct page *page, void *object, void *addr) { return 0; } 1056 1057static inline int free_debug_processing(struct kmem_cache *s, 1058 struct page *page, void *object, void *addr) { return 0; } 1059 1060static inline int slab_pad_check(struct kmem_cache *s, struct page *page) 1061 { return 1; } 1062static inline int check_object(struct kmem_cache *s, struct page *page, 1063 void *object, int active) { return 1; } 1064static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1065static inline unsigned long kmem_cache_flags(unsigned long objsize, 1066 unsigned long flags, const char *name, 1067 void (*ctor)(struct kmem_cache *, void *)) 1068{ 1069 return flags; 1070} 1071#define slub_debug 0 1072#endif 1073/* 1074 * Slab allocation and freeing 1075 */ 1076static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) 1077{ 1078 struct page *page; 1079 int pages = 1 << s->order; 1080 1081 if (s->order) 1082 flags |= __GFP_COMP; 1083 1084 if (s->flags & SLAB_CACHE_DMA) 1085 flags |= SLUB_DMA; 1086 1087 if (s->flags & SLAB_RECLAIM_ACCOUNT) 1088 flags |= __GFP_RECLAIMABLE; 1089 1090 if (node == -1) 1091 page = alloc_pages(flags, s->order); 1092 else 1093 page = alloc_pages_node(node, flags, s->order); 1094 1095 if (!page) 1096 return NULL; 1097 1098 mod_zone_page_state(page_zone(page), 1099 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1100 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1101 pages); 1102 1103 return page; 1104} 1105 1106static void setup_object(struct kmem_cache *s, struct page *page, 1107 void *object) 1108{ 1109 setup_object_debug(s, page, object); 1110 if (unlikely(s->ctor)) 1111 s->ctor(s, object); 1112} 1113 1114static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) 1115{ 1116 struct page *page; 1117 struct kmem_cache_node *n; 1118 void *start; 1119 void *last; 1120 void *p; 1121 1122 BUG_ON(flags & GFP_SLAB_BUG_MASK); 1123 1124 page = allocate_slab(s, 1125 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); 1126 if (!page) 1127 goto out; 1128 1129 n = get_node(s, page_to_nid(page)); 1130 if (n) 1131 atomic_long_inc(&n->nr_slabs); 1132 page->slab = s; 1133 page->flags |= 1 << PG_slab; 1134 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | 1135 SLAB_STORE_USER | SLAB_TRACE)) 1136 SetSlabDebug(page); 1137 1138 start = page_address(page); 1139 page->end = start + 1; 1140 1141 if (unlikely(s->flags & SLAB_POISON)) 1142 memset(start, POISON_INUSE, PAGE_SIZE << s->order); 1143 1144 last = start; 1145 for_each_object(p, s, start) { 1146 setup_object(s, page, last); 1147 set_freepointer(s, last, p); 1148 last = p; 1149 } 1150 setup_object(s, page, last); 1151 set_freepointer(s, last, page->end); 1152 1153 page->freelist = start; 1154 page->inuse = 0; 1155out: 1156 return page; 1157} 1158 1159static void __free_slab(struct kmem_cache *s, struct page *page) 1160{ 1161 int pages = 1 << s->order; 1162 1163 if (unlikely(SlabDebug(page))) { 1164 void *p; 1165 1166 slab_pad_check(s, page); 1167 for_each_object(p, s, slab_address(page)) 1168 check_object(s, page, p, 0); 1169 ClearSlabDebug(page); 1170 } 1171 1172 mod_zone_page_state(page_zone(page), 1173 
(s->flags & SLAB_RECLAIM_ACCOUNT) ? 1174 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1175 -pages); 1176 1177 page->mapping = NULL; 1178 __free_pages(page, s->order); 1179} 1180 1181static void rcu_free_slab(struct rcu_head *h) 1182{ 1183 struct page *page; 1184 1185 page = container_of((struct list_head *)h, struct page, lru); 1186 __free_slab(page->slab, page); 1187} 1188 1189static void free_slab(struct kmem_cache *s, struct page *page) 1190{ 1191 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { 1192 /* 1193 * RCU free overloads the RCU head over the LRU 1194 */ 1195 struct rcu_head *head = (void *)&page->lru; 1196 1197 call_rcu(head, rcu_free_slab); 1198 } else 1199 __free_slab(s, page); 1200} 1201 1202static void discard_slab(struct kmem_cache *s, struct page *page) 1203{ 1204 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1205 1206 atomic_long_dec(&n->nr_slabs); 1207 reset_page_mapcount(page); 1208 __ClearPageSlab(page); 1209 free_slab(s, page); 1210} 1211 1212/* 1213 * Per slab locking using the pagelock 1214 */ 1215static __always_inline void slab_lock(struct page *page) 1216{ 1217 bit_spin_lock(PG_locked, &page->flags); 1218} 1219 1220static __always_inline void slab_unlock(struct page *page) 1221{ 1222 bit_spin_unlock(PG_locked, &page->flags); 1223} 1224 1225static __always_inline int slab_trylock(struct page *page) 1226{ 1227 int rc = 1; 1228 1229 rc = bit_spin_trylock(PG_locked, &page->flags); 1230 return rc; 1231} 1232 1233/* 1234 * Management of partially allocated slabs 1235 */ 1236static void add_partial(struct kmem_cache_node *n, 1237 struct page *page, int tail) 1238{ 1239 spin_lock(&n->list_lock); 1240 n->nr_partial++; 1241 if (tail) 1242 list_add_tail(&page->lru, &n->partial); 1243 else 1244 list_add(&page->lru, &n->partial); 1245 spin_unlock(&n->list_lock); 1246} 1247 1248static void remove_partial(struct kmem_cache *s, 1249 struct page *page) 1250{ 1251 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1252 1253 spin_lock(&n->list_lock); 1254 list_del(&page->lru); 1255 n->nr_partial--; 1256 spin_unlock(&n->list_lock); 1257} 1258 1259/* 1260 * Lock slab and remove from the partial list. 1261 * 1262 * Must hold list_lock. 1263 */ 1264static inline int lock_and_freeze_slab(struct kmem_cache_node *n, struct page *page) 1265{ 1266 if (slab_trylock(page)) { 1267 list_del(&page->lru); 1268 n->nr_partial--; 1269 SetSlabFrozen(page); 1270 return 1; 1271 } 1272 return 0; 1273} 1274 1275/* 1276 * Try to allocate a partial slab from a specific node. 1277 */ 1278static struct page *get_partial_node(struct kmem_cache_node *n) 1279{ 1280 struct page *page; 1281 1282 /* 1283 * Racy check. If we mistakenly see no partial slabs then we 1284 * just allocate an empty slab. If we mistakenly try to get a 1285 * partial slab and there is none available then get_partials() 1286 * will return NULL. 1287 */ 1288 if (!n || !n->nr_partial) 1289 return NULL; 1290 1291 spin_lock(&n->list_lock); 1292 list_for_each_entry(page, &n->partial, lru) 1293 if (lock_and_freeze_slab(n, page)) 1294 goto out; 1295 page = NULL; 1296out: 1297 spin_unlock(&n->list_lock); 1298 return page; 1299} 1300 1301/* 1302 * Get a page from somewhere. Search in increasing NUMA distances. 
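 *
 * Rough numbers for the defrag_ratio check in the body below (a
 * sketch, assuming get_cycles() is roughly uniform mod 1024): with
 * remote_node_defrag_ratio stored as 1000 the test
 * "get_cycles() % 1024 > 1000" cuts off only ~2% of calls, so nearly
 * all of them go on to search remote nodes; with a stored ratio of
 * 100, only about one call in ten does.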
1303 */ 1304static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) 1305{ 1306#ifdef CONFIG_NUMA 1307 struct zonelist *zonelist; 1308 struct zone **z; 1309 struct page *page; 1310 1311 /* 1312 * The defrag ratio allows a configuration of the tradeoffs between 1313 * inter node defragmentation and node local allocations. A lower 1314 * defrag_ratio increases the tendency to do local allocations 1315 * instead of attempting to obtain partial slabs from other nodes. 1316 * 1317 * If the defrag_ratio is set to 0 then kmalloc() always 1318 * returns node local objects. If the ratio is higher then kmalloc() 1319 * may return off node objects because partial slabs are obtained 1320 * from other nodes and filled up. 1321 * 1322 * If /sys/slab/xx/defrag_ratio is set to 100 (which makes 1323 * defrag_ratio = 1000) then every (well almost) allocation will 1324 * first attempt to defrag slab caches on other nodes. This means 1325 * scanning over all nodes to look for partial slabs which may be 1326 * expensive if we do it every time we are trying to find a slab 1327 * with available objects. 1328 */ 1329 if (!s->remote_node_defrag_ratio || 1330 get_cycles() % 1024 > s->remote_node_defrag_ratio) 1331 return NULL; 1332 1333 zonelist = &NODE_DATA(slab_node(current->mempolicy)) 1334 ->node_zonelists[gfp_zone(flags)]; 1335 for (z = zonelist->zones; *z; z++) { 1336 struct kmem_cache_node *n; 1337 1338 n = get_node(s, zone_to_nid(*z)); 1339 1340 if (n && cpuset_zone_allowed_hardwall(*z, flags) && 1341 n->nr_partial > MIN_PARTIAL) { 1342 page = get_partial_node(n); 1343 if (page) 1344 return page; 1345 } 1346 } 1347#endif 1348 return NULL; 1349} 1350 1351/* 1352 * Get a partial page, lock it and return it. 1353 */ 1354static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) 1355{ 1356 struct page *page; 1357 int searchnode = (node == -1) ? numa_node_id() : node; 1358 1359 page = get_partial_node(get_node(s, searchnode)); 1360 if (page || (flags & __GFP_THISNODE)) 1361 return page; 1362 1363 return get_any_partial(s, flags); 1364} 1365 1366/* 1367 * Move a page back to the lists. 1368 * 1369 * Must be called with the slab lock held. 1370 * 1371 * On exit the slab lock will have been dropped. 1372 */ 1373static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) 1374{ 1375 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1376 struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); 1377 1378 ClearSlabFrozen(page); 1379 if (page->inuse) { 1380 1381 if (page->freelist != page->end) { 1382 add_partial(n, page, tail); 1383 stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); 1384 } else { 1385 stat(c, DEACTIVATE_FULL); 1386 if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) 1387 add_full(n, page); 1388 } 1389 slab_unlock(page); 1390 } else { 1391 stat(c, DEACTIVATE_EMPTY); 1392 if (n->nr_partial < MIN_PARTIAL) { 1393 /* 1394 * Adding an empty slab to the partial slabs in order 1395 * to avoid page allocator overhead. This slab needs 1396 * to come after the other slabs with objects in 1397 * order to fill them up. That way the size of the 1398 * partial list stays small. kmem_cache_shrink can 1399 * reclaim empty slabs from the partial list. 
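 * Concretely: with MIN_PARTIAL == 5, the first few empty slabs on a
 * node are parked at the tail of the partial list rather than handed
 * back to the page allocator; only when nr_partial has reached
 * MIN_PARTIAL is an empty slab actually discarded.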
1400 */ 1401 add_partial(n, page, 1); 1402 slab_unlock(page); 1403 } else { 1404 slab_unlock(page); 1405 stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB); 1406 discard_slab(s, page); 1407 } 1408 } 1409} 1410 1411/* 1412 * Remove the cpu slab 1413 */ 1414static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1415{ 1416 struct page *page = c->page; 1417 int tail = 1; 1418 1419 if (c->freelist) 1420 stat(c, DEACTIVATE_REMOTE_FREES); 1421 /* 1422 * Merge cpu freelist into slab freelist. Typically we get here 1423 * because both freelists are empty. So this is unlikely 1424 * to occur. 1425 * 1426 * We need to use is_end() here because deactivate_slab() may 1427 * be called for a debug slab. Then c->freelist may contain 1428 * a dummy pointer. 1429 */ 1430 while (unlikely(!is_end(c->freelist))) { 1431 void **object; 1432 1433 tail = 0; /* Hot objects. Put the slab first */ 1434 1435 /* Retrieve object from cpu_freelist */ 1436 object = c->freelist; 1437 c->freelist = c->freelist[c->offset]; 1438 1439 /* And put onto the regular freelist */ 1440 object[c->offset] = page->freelist; 1441 page->freelist = object; 1442 page->inuse--; 1443 } 1444 c->page = NULL; 1445 unfreeze_slab(s, page, tail); 1446} 1447 1448static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1449{ 1450 stat(c, CPUSLAB_FLUSH); 1451 slab_lock(c->page); 1452 deactivate_slab(s, c); 1453} 1454 1455/* 1456 * Flush cpu slab. 1457 * Called from IPI handler with interrupts disabled. 1458 */ 1459static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) 1460{ 1461 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); 1462 1463 if (likely(c && c->page)) 1464 flush_slab(s, c); 1465} 1466 1467static void flush_cpu_slab(void *d) 1468{ 1469 struct kmem_cache *s = d; 1470 1471 __flush_cpu_slab(s, smp_processor_id()); 1472} 1473 1474static void flush_all(struct kmem_cache *s) 1475{ 1476#ifdef CONFIG_SMP 1477 on_each_cpu(flush_cpu_slab, s, 1, 1); 1478#else 1479 unsigned long flags; 1480 1481 local_irq_save(flags); 1482 flush_cpu_slab(s); 1483 local_irq_restore(flags); 1484#endif 1485} 1486 1487/* 1488 * Check if the objects in a per cpu structure fit numa 1489 * locality expectations. 1490 */ 1491static inline int node_match(struct kmem_cache_cpu *c, int node) 1492{ 1493#ifdef CONFIG_NUMA 1494 if (node != -1 && c->node != node) 1495 return 0; 1496#endif 1497 return 1; 1498} 1499 1500/* 1501 * Slow path. The lockless freelist is empty or we need to perform 1502 * debugging duties. 1503 * 1504 * Interrupts are disabled. 1505 * 1506 * Processing is still very fast if new objects have been freed to the 1507 * regular freelist. In that case we simply take over the regular freelist 1508 * as the lockless freelist and zap the regular freelist. 1509 * 1510 * If that is not working then we fall back to the partial lists. We take the 1511 * first element of the freelist as the object to allocate now and move the 1512 * rest of the freelist to the lockless freelist. 1513 * 1514 * And if we were unable to get a new slab from the partial slab lists then 1515 * we need to allocate a new slab. This is the slowest path since we may sleep. 
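 *
 * In short, the slow path escalates through three levels:
 *
 *	c->page->freelist	take over the page's regular freelist
 *	get_partial()		pick a partial slab (needs list_lock)
 *	new_slab()		go to the page allocator (may sleep)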
1516 */ 1517static void *__slab_alloc(struct kmem_cache *s, 1518 gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c) 1519{ 1520 void **object; 1521 struct page *new; 1522#ifdef SLUB_FASTPATH 1523 unsigned long flags; 1524 1525 local_irq_save(flags); 1526#endif 1527 if (!c->page) 1528 goto new_slab; 1529 1530 slab_lock(c->page); 1531 if (unlikely(!node_match(c, node))) 1532 goto another_slab; 1533 stat(c, ALLOC_REFILL); 1534load_freelist: 1535 object = c->page->freelist; 1536 if (unlikely(object == c->page->end)) 1537 goto another_slab; 1538 if (unlikely(SlabDebug(c->page))) 1539 goto debug; 1540 1541 object = c->page->freelist; 1542 c->freelist = object[c->offset]; 1543 c->page->inuse = s->objects; 1544 c->page->freelist = c->page->end; 1545 c->node = page_to_nid(c->page); 1546unlock_out: 1547 slab_unlock(c->page); 1548 stat(c, ALLOC_SLOWPATH); 1549out: 1550#ifdef SLUB_FASTPATH 1551 local_irq_restore(flags); 1552#endif 1553 return object; 1554 1555another_slab: 1556 deactivate_slab(s, c); 1557 1558new_slab: 1559 new = get_partial(s, gfpflags, node); 1560 if (new) { 1561 c->page = new; 1562 stat(c, ALLOC_FROM_PARTIAL); 1563 goto load_freelist; 1564 } 1565 1566 if (gfpflags & __GFP_WAIT) 1567 local_irq_enable(); 1568 1569 new = new_slab(s, gfpflags, node); 1570 1571 if (gfpflags & __GFP_WAIT) 1572 local_irq_disable(); 1573 1574 if (new) { 1575 c = get_cpu_slab(s, smp_processor_id()); 1576 stat(c, ALLOC_SLAB); 1577 if (c->page) 1578 flush_slab(s, c); 1579 slab_lock(new); 1580 SetSlabFrozen(new); 1581 c->page = new; 1582 goto load_freelist; 1583 } 1584 object = NULL; 1585 goto out; 1586debug: 1587 object = c->page->freelist; 1588 if (!alloc_debug_processing(s, c->page, object, addr)) 1589 goto another_slab; 1590 1591 c->page->inuse++; 1592 c->page->freelist = object[c->offset]; 1593 c->node = -1; 1594 goto unlock_out; 1595} 1596 1597/* 1598 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) 1599 * have the fastpath folded into their functions. So no function call 1600 * overhead for requests that can be satisfied on the fastpath. 1601 * 1602 * The fastpath works by first checking if the lockless freelist can be used. 1603 * If not then __slab_alloc is called for slow processing. 1604 * 1605 * Otherwise we can simply pick the next object from the lockless free list. 1606 */ 1607static __always_inline void *slab_alloc(struct kmem_cache *s, 1608 gfp_t gfpflags, int node, void *addr) 1609{ 1610 void **object; 1611 struct kmem_cache_cpu *c; 1612 1613/* 1614 * The SLUB_FASTPATH path is provisional and is currently disabled if the 1615 * kernel is compiled with preemption or if the arch does not support 1616 * fast cmpxchg operations. There are a couple of coming changes that will 1617 * simplify matters and allow preemption. Ultimately we may end up making 1618 * SLUB_FASTPATH the default. 1619 * 1620 * 1. The introduction of the per cpu allocator will avoid array lookups 1621 * through get_cpu_slab(). A special register can be used instead. 1622 * 1623 * 2. The introduction of per cpu atomic operations (cpu_ops) means that 1624 * we can realize the logic here entirely with per cpu atomics. The 1625 * per cpu atomic ops will take care of the preemption issues. 
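 *
 * The SLUB_FASTPATH loop below leans on the semantics of
 * cmpxchg_local(): the new head is committed only if c->freelist is
 * still the object we sampled. If an interrupt allocates or frees on
 * this cpu between the read and the cmpxchg, the head has changed,
 * the cmpxchg fails and the loop simply retries.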
1626 */ 1627 1628#ifdef SLUB_FASTPATH 1629 c = get_cpu_slab(s, raw_smp_processor_id()); 1630 do { 1631 object = c->freelist; 1632 if (unlikely(is_end(object) || !node_match(c, node))) { 1633 object = __slab_alloc(s, gfpflags, node, addr, c); 1634 break; 1635 } 1636 stat(c, ALLOC_FASTPATH); 1637 } while (cmpxchg_local(&c->freelist, object, object[c->offset]) 1638 != object); 1639#else 1640 unsigned long flags; 1641 1642 local_irq_save(flags); 1643 c = get_cpu_slab(s, smp_processor_id()); 1644 if (unlikely(is_end(c->freelist) || !node_match(c, node))) 1645 1646 object = __slab_alloc(s, gfpflags, node, addr, c); 1647 1648 else { 1649 object = c->freelist; 1650 c->freelist = object[c->offset]; 1651 stat(c, ALLOC_FASTPATH); 1652 } 1653 local_irq_restore(flags); 1654#endif 1655 1656 if (unlikely((gfpflags & __GFP_ZERO) && object)) 1657 memset(object, 0, c->objsize); 1658 1659 return object; 1660} 1661 1662void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) 1663{ 1664 return slab_alloc(s, gfpflags, -1, __builtin_return_address(0)); 1665} 1666EXPORT_SYMBOL(kmem_cache_alloc); 1667 1668#ifdef CONFIG_NUMA 1669void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) 1670{ 1671 return slab_alloc(s, gfpflags, node, __builtin_return_address(0)); 1672} 1673EXPORT_SYMBOL(kmem_cache_alloc_node); 1674#endif 1675 1676/* 1677 * Slow path handling. This may still be called frequently since objects 1678 * have a longer lifetime than the cpu slabs in most processing loads. 1679 * 1680 * So we still attempt to reduce cache line usage. Just take the slab 1681 * lock and free the item. If there is no additional partial page 1682 * handling required then we can return immediately. 1683 */ 1684static void __slab_free(struct kmem_cache *s, struct page *page, 1685 void *x, void *addr, unsigned int offset) 1686{ 1687 void *prior; 1688 void **object = (void *)x; 1689 struct kmem_cache_cpu *c; 1690 1691#ifdef SLUB_FASTPATH 1692 unsigned long flags; 1693 1694 local_irq_save(flags); 1695#endif 1696 c = get_cpu_slab(s, raw_smp_processor_id()); 1697 stat(c, FREE_SLOWPATH); 1698 slab_lock(page); 1699 1700 if (unlikely(SlabDebug(page))) 1701 goto debug; 1702checks_ok: 1703 prior = object[offset] = page->freelist; 1704 page->freelist = object; 1705 page->inuse--; 1706 1707 if (unlikely(SlabFrozen(page))) { 1708 stat(c, FREE_FROZEN); 1709 goto out_unlock; 1710 } 1711 1712 if (unlikely(!page->inuse)) 1713 goto slab_empty; 1714 1715 /* 1716 * Objects left in the slab. If it 1717 * was not on the partial list before 1718 * then add it. 1719 */ 1720 if (unlikely(prior == page->end)) { 1721 add_partial(get_node(s, page_to_nid(page)), page, 1); 1722 stat(c, FREE_ADD_PARTIAL); 1723 } 1724 1725out_unlock: 1726 slab_unlock(page); 1727#ifdef SLUB_FASTPATH 1728 local_irq_restore(flags); 1729#endif 1730 return; 1731 1732slab_empty: 1733 if (prior != page->end) { 1734 /* 1735 * Slab still on the partial list. 1736 */ 1737 remove_partial(s, page); 1738 stat(c, FREE_REMOVE_PARTIAL); 1739 } 1740 slab_unlock(page); 1741 stat(c, FREE_SLAB); 1742#ifdef SLUB_FASTPATH 1743 local_irq_restore(flags); 1744#endif 1745 discard_slab(s, page); 1746 return; 1747 1748debug: 1749 if (!free_debug_processing(s, page, x, addr)) 1750 goto out_unlock; 1751 goto checks_ok; 1752} 1753 1754/* 1755 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that 1756 * can perform fastpath freeing without additional function calls. 
1757 * 1758 * The fastpath is only possible if we are freeing to the current cpu slab 1759 * of this processor. This is typically the case if we have just allocated 1760 * the item before. 1761 * 1762 * If fastpath is not possible then fall back to __slab_free where we deal 1763 * with all sorts of special processing. 1764 */ 1765static __always_inline void slab_free(struct kmem_cache *s, 1766 struct page *page, void *x, void *addr) 1767{ 1768 void **object = (void *)x; 1769 struct kmem_cache_cpu *c; 1770 1771#ifdef SLUB_FASTPATH 1772 void **freelist; 1773 1774 c = get_cpu_slab(s, raw_smp_processor_id()); 1775 debug_check_no_locks_freed(object, s->objsize); 1776 do { 1777 freelist = c->freelist; 1778 barrier(); 1779 /* 1780 * If the compiler would reorder the retrieval of c->page to 1781 * come before c->freelist then an interrupt could 1782 * change the cpu slab before we retrieve c->freelist. We 1783 * could be matching on a page no longer active and put the 1784 * object onto the freelist of the wrong slab. 1785 * 1786 * On the other hand: If we already have the freelist pointer 1787 * then any change of cpu_slab will cause the cmpxchg to fail 1788 * since the freelist pointers are unique per slab. 1789 */ 1790 if (unlikely(page != c->page || c->node < 0)) { 1791 __slab_free(s, page, x, addr, c->offset); 1792 break; 1793 } 1794 object[c->offset] = freelist; 1795 stat(c, FREE_FASTPATH); 1796 } while (cmpxchg_local(&c->freelist, freelist, object) != freelist); 1797#else 1798 unsigned long flags; 1799 1800 local_irq_save(flags); 1801 debug_check_no_locks_freed(object, s->objsize); 1802 c = get_cpu_slab(s, smp_processor_id()); 1803 if (likely(page == c->page && c->node >= 0)) { 1804 object[c->offset] = c->freelist; 1805 c->freelist = object; 1806 stat(c, FREE_FASTPATH); 1807 } else 1808 __slab_free(s, page, x, addr, c->offset); 1809 1810 local_irq_restore(flags); 1811#endif 1812} 1813 1814void kmem_cache_free(struct kmem_cache *s, void *x) 1815{ 1816 struct page *page; 1817 1818 page = virt_to_head_page(x); 1819 1820 slab_free(s, page, x, __builtin_return_address(0)); 1821} 1822EXPORT_SYMBOL(kmem_cache_free); 1823 1824/* Figure out on which slab page the object resides */ 1825static struct page *get_object_page(const void *x) 1826{ 1827 struct page *page = virt_to_head_page(x); 1828 1829 if (!PageSlab(page)) 1830 return NULL; 1831 1832 return page; 1833} 1834 1835/* 1836 * Object placement in a slab is made very easy because we always start at 1837 * offset 0. If we tune the size of the object to the alignment then we can 1838 * get the required alignment by putting one properly sized object after 1839 * another. 1840 * 1841 * Notice that the allocation order determines the sizes of the per cpu 1842 * caches. Each processor always has one slab available for allocations. 1843 * Increasing the allocation order reduces the number of times that slabs 1844 * must be moved on and off the partial lists and is therefore a factor in 1845 * locking overhead. 1846 */ 1847 1848/* 1849 * Minimum / Maximum order of slab pages. This influences locking overhead 1850 * and slab fragmentation. A higher order reduces the number of partial slabs 1851 * and increases the number of allocations possible without having to 1852 * take the list_lock. 1853 */ 1854static int slub_min_order; 1855static int slub_max_order = DEFAULT_MAX_ORDER; 1856static int slub_min_objects = DEFAULT_MIN_OBJECTS; 1857 1858/* 1859 * Merge control. If this is set then no merging of slab caches will occur. 1860 * (Could be removed. 
This was introduced to pacify the merge skeptics.) 1861 */ 1862static int slub_nomerge; 1863 1864/* 1865 * Calculate the order of allocation given a slab object size. 1866 * 1867 * The order of allocation has significant impact on performance and other 1868 * system components. Generally order 0 allocations should be preferred since 1869 * order 0 does not cause fragmentation in the page allocator. Larger objects 1870 * can be problematic to put into order 0 slabs because there may be too much 1871 * unused space left. We go to a higher order if more than 1/8th of the slab 1872 * would be wasted. 1873 * 1874 * In order to reach satisfactory performance we must ensure that a minimum 1875 * number of objects is in one slab. Otherwise we may generate too much 1876 * activity on the partial lists which requires taking the list_lock. This is 1877 * less a concern for large slabs though which are rarely used. 1878 * 1879 * slub_max_order specifies the order where we begin to stop considering the 1880 * number of objects in a slab as critical. If we reach slub_max_order then 1881 * we try to keep the page order as low as possible. So we accept more waste 1882 * of space in favor of a small page order. 1883 * 1884 * Higher order allocations also allow the placement of more objects in a 1885 * slab and thereby reduce object handling overhead. If the user has 1886 * requested a higher minimum order then we start with that one instead of 1887 * the smallest order which will fit the object. 1888 */ 1889static inline int slab_order(int size, int min_objects, 1890 int max_order, int fract_leftover) 1891{ 1892 int order; 1893 int rem; 1894 int min_order = slub_min_order; 1895 1896 for (order = max(min_order, 1897 fls(min_objects * size - 1) - PAGE_SHIFT); 1898 order <= max_order; order++) { 1899 1900 unsigned long slab_size = PAGE_SIZE << order; 1901 1902 if (slab_size < min_objects * size) 1903 continue; 1904 1905 rem = slab_size % size; 1906 1907 if (rem <= slab_size / fract_leftover) 1908 break; 1909 1910 } 1911 1912 return order; 1913} 1914 1915static inline int calculate_order(int size) 1916{ 1917 int order; 1918 int min_objects; 1919 int fraction; 1920 1921 /* 1922 * Attempt to find best configuration for a slab. This 1923 * works by first attempting to generate a layout with 1924 * the best configuration and backing off gradually. 1925 * 1926 * First we reduce the acceptable waste in a slab. Then 1927 * we reduce the minimum objects required in a slab. 1928 */ 1929 min_objects = slub_min_objects; 1930 while (min_objects > 1) { 1931 fraction = 8; 1932 while (fraction >= 4) { 1933 order = slab_order(size, min_objects, 1934 slub_max_order, fraction); 1935 if (order <= slub_max_order) 1936 return order; 1937 fraction /= 2; 1938 } 1939 min_objects /= 2; 1940 } 1941 1942 /* 1943 * We were unable to place multiple objects in a slab. Now 1944 * let's see if we can place a single object there. 1945 */ 1946 order = slab_order(size, 1, slub_max_order, 1); 1947 if (order <= slub_max_order) 1948 return order; 1949 1950 /* 1951 * Doh, this slab cannot be placed using slub_max_order. 1952 */ 1953 order = slab_order(size, 1, MAX_ORDER, 1); 1954 if (order <= MAX_ORDER) 1955 return order; 1956 return -ENOSYS; 1957} 1958 1959/* 1960 * Figure out what the alignment of the objects will be. 
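 *
 * A worked example (assuming 64 byte cache lines): a 40 byte object
 * created with SLAB_HWCACHE_ALIGN is larger than cache_line_size() / 2
 * and so is pushed to 64 byte alignment, while a 16 byte object keeps
 * its requested alignment, merely rounded up to a word boundary.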
1961 */ 1962static unsigned long calculate_alignment(unsigned long flags, 1963 unsigned long align, unsigned long size) 1964{ 1965 /* 1966 * If the user wants hardware cache aligned objects then 1967 * follow that suggestion if the object is sufficiently 1968 * large. 1969 * 1970 * The hardware cache alignment cannot override the 1971 * specified alignment though. If that is greater, 1972 * then use it. 1973 */ 1974 if ((flags & SLAB_HWCACHE_ALIGN) && 1975 size > cache_line_size() / 2) 1976 return max_t(unsigned long, align, cache_line_size()); 1977 1978 if (align < ARCH_SLAB_MINALIGN) 1979 return ARCH_SLAB_MINALIGN; 1980 1981 return ALIGN(align, sizeof(void *)); 1982} 1983 1984static void init_kmem_cache_cpu(struct kmem_cache *s, 1985 struct kmem_cache_cpu *c) 1986{ 1987 c->page = NULL; 1988 c->freelist = (void *)PAGE_MAPPING_ANON; 1989 c->node = 0; 1990 c->offset = s->offset / sizeof(void *); 1991 c->objsize = s->objsize; 1992} 1993 1994static void init_kmem_cache_node(struct kmem_cache_node *n) 1995{ 1996 n->nr_partial = 0; 1997 atomic_long_set(&n->nr_slabs, 0); 1998 spin_lock_init(&n->list_lock); 1999 INIT_LIST_HEAD(&n->partial); 2000#ifdef CONFIG_SLUB_DEBUG 2001 INIT_LIST_HEAD(&n->full); 2002#endif 2003} 2004 2005#ifdef CONFIG_SMP 2006/* 2007 * Per cpu array for per cpu structures. 2008 * 2009 * The per cpu array places all kmem_cache_cpu structures from one processor 2010 * close together meaning that it becomes possible that multiple per cpu 2011 * structures are contained in one cacheline. This may be particularly 2012 * beneficial for the kmalloc caches. 2013 * 2014 * A desktop system typically has around 60-80 slabs. With 100 here we are 2015 * likely able to get per cpu structures for all caches from the array defined 2016 * here. We must be able to cover all kmalloc caches during bootstrap. 2017 * 2018 * If the per cpu array is exhausted then fall back to kmalloc 2019 * of individual cachelines. No sharing is possible then. 
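 *
 * The free pool is threaded through the structures themselves:
 * kmem_cache_cpu_free points at the first unused kmem_cache_cpu and
 * each entry reuses its ->freelist field as the link to the next free
 * entry, as alloc_kmem_cache_cpu() and free_kmem_cache_cpu() below
 * show.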
2020 */
2021#define NR_KMEM_CACHE_CPU 100
2022
2023static DEFINE_PER_CPU(struct kmem_cache_cpu,
2024				kmem_cache_cpu)[NR_KMEM_CACHE_CPU];
2025
2026static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
2027static cpumask_t kmem_cache_cpu_free_init_once = CPU_MASK_NONE;
2028
2029static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
2030							int cpu, gfp_t flags)
2031{
2032	struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu);
2033
2034	if (c)
2035		per_cpu(kmem_cache_cpu_free, cpu) =
2036				(void *)c->freelist;
2037	else {
2038		/* Table overflow: so allocate one ourselves */
2039		c = kmalloc_node(
2040			ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()),
2041			flags, cpu_to_node(cpu));
2042		if (!c)
2043			return NULL;
2044	}
2045
2046	init_kmem_cache_cpu(s, c);
2047	return c;
2048}
2049
2050static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu)
2051{
2052	if (c < per_cpu(kmem_cache_cpu, cpu) ||
2053			c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
2054		kfree(c);
2055		return;
2056	}
2057	c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu);
2058	per_cpu(kmem_cache_cpu_free, cpu) = c;
2059}
2060
2061static void free_kmem_cache_cpus(struct kmem_cache *s)
2062{
2063	int cpu;
2064
2065	for_each_online_cpu(cpu) {
2066		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2067
2068		if (c) {
2069			s->cpu_slab[cpu] = NULL;
2070			free_kmem_cache_cpu(c, cpu);
2071		}
2072	}
2073}
2074
2075static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2076{
2077	int cpu;
2078
2079	for_each_online_cpu(cpu) {
2080		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2081
2082		if (c)
2083			continue;
2084
2085		c = alloc_kmem_cache_cpu(s, cpu, flags);
2086		if (!c) {
2087			free_kmem_cache_cpus(s);
2088			return 0;
2089		}
2090		s->cpu_slab[cpu] = c;
2091	}
2092	return 1;
2093}
2094
2095/*
2096 * Initialize the per cpu array.
2097 */
2098static void init_alloc_cpu_cpu(int cpu)
2099{
2100	int i;
2101
2102	if (cpu_isset(cpu, kmem_cache_cpu_free_init_once))
2103		return;
2104
2105	for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--)
2106		free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu);
2107
2108	cpu_set(cpu, kmem_cache_cpu_free_init_once);
2109}
2110
2111static void __init init_alloc_cpu(void)
2112{
2113	int cpu;
2114
2115	for_each_online_cpu(cpu)
2116		init_alloc_cpu_cpu(cpu);
2117}
2118
2119#else
2120static inline void free_kmem_cache_cpus(struct kmem_cache *s) {}
2121static inline void init_alloc_cpu(void) {}
2122
2123static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2124{
2125	init_kmem_cache_cpu(s, &s->cpu_slab);
2126	return 1;
2127}
2128#endif
2129
2130#ifdef CONFIG_NUMA
2131/*
2132 * No kmalloc_node yet so do it by hand. We know that this is the first
2133 * slab on the node for this slabcache. There are no concurrent accesses
2134 * possible.
2135 *
2136 * Note that this function only works on the kmalloc_node_cache
2137 * when allocating for the kmalloc_node_cache. This is used for bootstrapping
2138 * memory on a fresh node that has no slab structures yet.
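 *
 * The situation is deliberately circular: kmem_cache_node structures
 * normally come from kmalloc_caches[0], but that cache cannot operate
 * without per node structures of its own. So a fresh slab page is
 * taken straight from new_slab() and its first object is carved out
 * by hand, advancing page->freelist and page->inuse exactly as a
 * regular allocation would, and that object becomes the node's
 * kmem_cache_node.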
2139 */ 2140static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, 2141 int node) 2142{ 2143 struct page *page; 2144 struct kmem_cache_node *n; 2145 unsigned long flags; 2146 2147 BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); 2148 2149 page = new_slab(kmalloc_caches, gfpflags, node); 2150 2151 BUG_ON(!page); 2152 if (page_to_nid(page) != node) { 2153 printk(KERN_ERR "SLUB: Unable to allocate memory from " 2154 "node %d\n", node); 2155 printk(KERN_ERR "SLUB: Allocating a useless per node structure " 2156 "in order to be able to continue\n"); 2157 } 2158 2159 n = page->freelist; 2160 BUG_ON(!n); 2161 page->freelist = get_freepointer(kmalloc_caches, n); 2162 page->inuse++; 2163 kmalloc_caches->node[node] = n; 2164#ifdef CONFIG_SLUB_DEBUG 2165 init_object(kmalloc_caches, n, 1); 2166 init_tracking(kmalloc_caches, n); 2167#endif 2168 init_kmem_cache_node(n); 2169 atomic_long_inc(&n->nr_slabs); 2170 /* 2171 * lockdep requires consistent irq usage for each lock 2172 * so even though there cannot be a race this early in 2173 * the boot sequence, we still disable irqs. 2174 */ 2175 local_irq_save(flags); 2176 add_partial(n, page, 0); 2177 local_irq_restore(flags); 2178 return n; 2179} 2180 2181static void free_kmem_cache_nodes(struct kmem_cache *s) 2182{ 2183 int node; 2184 2185 for_each_node_state(node, N_NORMAL_MEMORY) { 2186 struct kmem_cache_node *n = s->node[node]; 2187 if (n && n != &s->local_node) 2188 kmem_cache_free(kmalloc_caches, n); 2189 s->node[node] = NULL; 2190 } 2191} 2192 2193static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) 2194{ 2195 int node; 2196 int local_node; 2197 2198 if (slab_state >= UP) 2199 local_node = page_to_nid(virt_to_page(s)); 2200 else 2201 local_node = 0; 2202 2203 for_each_node_state(node, N_NORMAL_MEMORY) { 2204 struct kmem_cache_node *n; 2205 2206 if (local_node == node) 2207 n = &s->local_node; 2208 else { 2209 if (slab_state == DOWN) { 2210 n = early_kmem_cache_node_alloc(gfpflags, 2211 node); 2212 continue; 2213 } 2214 n = kmem_cache_alloc_node(kmalloc_caches, 2215 gfpflags, node); 2216 2217 if (!n) { 2218 free_kmem_cache_nodes(s); 2219 return 0; 2220 } 2221 2222 } 2223 s->node[node] = n; 2224 init_kmem_cache_node(n); 2225 } 2226 return 1; 2227} 2228#else 2229static void free_kmem_cache_nodes(struct kmem_cache *s) 2230{ 2231} 2232 2233static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) 2234{ 2235 init_kmem_cache_node(&s->local_node); 2236 return 1; 2237} 2238#endif 2239 2240/* 2241 * calculate_sizes() determines the order and the distribution of data within 2242 * a slab object. 2243 */ 2244static int calculate_sizes(struct kmem_cache *s) 2245{ 2246 unsigned long flags = s->flags; 2247 unsigned long size = s->objsize; 2248 unsigned long align = s->align; 2249 2250 /* 2251 * Determine if we can poison the object itself. If the user of 2252 * the slab may touch the object after free or before allocation 2253 * then we should never poison the object itself. 2254 */ 2255 if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && 2256 !s->ctor) 2257 s->flags |= __OBJECT_POISON; 2258 else 2259 s->flags &= ~__OBJECT_POISON; 2260 2261 /* 2262 * Round up object size to the next word boundary. We can only 2263 * place the free pointer at word boundaries and this determines 2264 * the possible location of the free pointer. 
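	 *
	 * One possible layout with full debugging (SLAB_POISON |
	 * SLAB_RED_ZONE | SLAB_STORE_USER) for a 41 byte object on a
	 * 64 bit machine (a sketch; the size of the tracking records
	 * depends on struct track):
	 *
	 *	bytes  0..40	the object itself (poisoned while free)
	 *	bytes 41..47	rounding that doubles as red zone bytes
	 *	bytes 48..55	free pointer (s->offset == s->inuse == 48)
	 *	bytes 56..	two struct track records (alloc and free)
	 *	then		one trailing word of padding that catches
	 *			writes before the start of the next object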
2265 */ 2266 size = ALIGN(size, sizeof(void *)); 2267 2268#ifdef CONFIG_SLUB_DEBUG 2269 /* 2270 * If we are Redzoning then check if there is some space between the 2271 * end of the object and the free pointer. If not then add an 2272 * additional word to have some bytes to store Redzone information. 2273 */ 2274 if ((flags & SLAB_RED_ZONE) && size == s->objsize) 2275 size += sizeof(void *); 2276#endif 2277 2278 /* 2279 * With that we have determined the number of bytes in actual use 2280 * by the object. This is the potential offset to the free pointer. 2281 */ 2282 s->inuse = size; 2283 2284 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || 2285 s->ctor)) { 2286 /* 2287 * Relocate free pointer after the object if it is not 2288 * permitted to overwrite the first word of the object on 2289 * kmem_cache_free. 2290 * 2291 * This is the case if we do RCU, have a constructor or 2292 * destructor or are poisoning the objects. 2293 */ 2294 s->offset = size; 2295 size += sizeof(void *); 2296 } 2297 2298#ifdef CONFIG_SLUB_DEBUG 2299 if (flags & SLAB_STORE_USER) 2300 /* 2301 * Need to store information about allocs and frees after 2302 * the object. 2303 */ 2304 size += 2 * sizeof(struct track); 2305 2306 if (flags & SLAB_RED_ZONE) 2307 /* 2308 * Add some empty padding so that we can catch 2309 * overwrites from earlier objects rather than let 2310 * tracking information or the free pointer be 2311 * corrupted if an user writes before the start 2312 * of the object. 2313 */ 2314 size += sizeof(void *); 2315#endif 2316 2317 /* 2318 * Determine the alignment based on various parameters that the 2319 * user specified and the dynamic determination of cache line size 2320 * on bootup. 2321 */ 2322 align = calculate_alignment(flags, align, s->objsize); 2323 2324 /* 2325 * SLUB stores one object immediately after another beginning from 2326 * offset 0. In order to align the objects we have to simply size 2327 * each object to conform to the alignment. 
2328 */ 2329 size = ALIGN(size, align); 2330 s->size = size; 2331 2332 s->order = calculate_order(size); 2333 if (s->order < 0) 2334 return 0; 2335 2336 /* 2337 * Determine the number of objects per slab 2338 */ 2339 s->objects = (PAGE_SIZE << s->order) / size; 2340 2341 return !!s->objects; 2342 2343} 2344 2345static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, 2346 const char *name, size_t size, 2347 size_t align, unsigned long flags, 2348 void (*ctor)(struct kmem_cache *, void *)) 2349{ 2350 memset(s, 0, kmem_size); 2351 s->name = name; 2352 s->ctor = ctor; 2353 s->objsize = size; 2354 s->align = align; 2355 s->flags = kmem_cache_flags(size, flags, name, ctor); 2356 2357 if (!calculate_sizes(s)) 2358 goto error; 2359 2360 s->refcount = 1; 2361#ifdef CONFIG_NUMA 2362 s->remote_node_defrag_ratio = 100; 2363#endif 2364 if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) 2365 goto error; 2366 2367 if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) 2368 return 1; 2369 free_kmem_cache_nodes(s); 2370error: 2371 if (flags & SLAB_PANIC) 2372 panic("Cannot create slab %s size=%lu realsize=%u " 2373 "order=%u offset=%u flags=%lx\n", 2374 s->name, (unsigned long)size, s->size, s->order, 2375 s->offset, flags); 2376 return 0; 2377} 2378 2379/* 2380 * Check if a given pointer is valid 2381 */ 2382int kmem_ptr_validate(struct kmem_cache *s, const void *object) 2383{ 2384 struct page *page; 2385 2386 page = get_object_page(object); 2387 2388 if (!page || s != page->slab) 2389 /* No slab or wrong slab */ 2390 return 0; 2391 2392 if (!check_valid_pointer(s, page, object)) 2393 return 0; 2394 2395 /* 2396 * We could also check if the object is on the slabs freelist. 2397 * But this would be too expensive and it seems that the main 2398 * purpose of kmem_ptr_valid is to check if the object belongs 2399 * to a certain slab. 2400 */ 2401 return 1; 2402} 2403EXPORT_SYMBOL(kmem_ptr_validate); 2404 2405/* 2406 * Determine the size of a slab object 2407 */ 2408unsigned int kmem_cache_size(struct kmem_cache *s) 2409{ 2410 return s->objsize; 2411} 2412EXPORT_SYMBOL(kmem_cache_size); 2413 2414const char *kmem_cache_name(struct kmem_cache *s) 2415{ 2416 return s->name; 2417} 2418EXPORT_SYMBOL(kmem_cache_name); 2419 2420/* 2421 * Attempt to free all slabs on a node. Return the number of slabs we 2422 * were unable to free. 2423 */ 2424static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, 2425 struct list_head *list) 2426{ 2427 int slabs_inuse = 0; 2428 unsigned long flags; 2429 struct page *page, *h; 2430 2431 spin_lock_irqsave(&n->list_lock, flags); 2432 list_for_each_entry_safe(page, h, list, lru) 2433 if (!page->inuse) { 2434 list_del(&page->lru); 2435 discard_slab(s, page); 2436 } else 2437 slabs_inuse++; 2438 spin_unlock_irqrestore(&n->list_lock, flags); 2439 return slabs_inuse; 2440} 2441 2442/* 2443 * Release all resources used by a slab cache. 
2444 */ 2445static inline int kmem_cache_close(struct kmem_cache *s) 2446{ 2447 int node; 2448 2449 flush_all(s); 2450 2451 /* Attempt to free all objects */ 2452 free_kmem_cache_cpus(s); 2453 for_each_node_state(node, N_NORMAL_MEMORY) { 2454 struct kmem_cache_node *n = get_node(s, node); 2455 2456 n->nr_partial -= free_list(s, n, &n->partial); 2457 if (atomic_long_read(&n->nr_slabs)) 2458 return 1; 2459 } 2460 free_kmem_cache_nodes(s); 2461 return 0; 2462} 2463 2464/* 2465 * Close a cache and release the kmem_cache structure 2466 * (must be used for caches created using kmem_cache_create) 2467 */ 2468void kmem_cache_destroy(struct kmem_cache *s) 2469{ 2470 down_write(&slub_lock); 2471 s->refcount--; 2472 if (!s->refcount) { 2473 list_del(&s->list); 2474 up_write(&slub_lock); 2475 if (kmem_cache_close(s)) 2476 WARN_ON(1); 2477 sysfs_slab_remove(s); 2478 } else 2479 up_write(&slub_lock); 2480} 2481EXPORT_SYMBOL(kmem_cache_destroy); 2482 2483/******************************************************************** 2484 * Kmalloc subsystem 2485 *******************************************************************/ 2486 2487struct kmem_cache kmalloc_caches[PAGE_SHIFT] __cacheline_aligned; 2488EXPORT_SYMBOL(kmalloc_caches); 2489 2490#ifdef CONFIG_ZONE_DMA 2491static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT]; 2492#endif 2493 2494static int __init setup_slub_min_order(char *str) 2495{ 2496 get_option(&str, &slub_min_order); 2497 2498 return 1; 2499} 2500 2501__setup("slub_min_order=", setup_slub_min_order); 2502 2503static int __init setup_slub_max_order(char *str) 2504{ 2505 get_option(&str, &slub_max_order); 2506 2507 return 1; 2508} 2509 2510__setup("slub_max_order=", setup_slub_max_order); 2511 2512static int __init setup_slub_min_objects(char *str) 2513{ 2514 get_option(&str, &slub_min_objects); 2515 2516 return 1; 2517} 2518 2519__setup("slub_min_objects=", setup_slub_min_objects); 2520 2521static int __init setup_slub_nomerge(char *str) 2522{ 2523 slub_nomerge = 1; 2524 return 1; 2525} 2526 2527__setup("slub_nomerge", setup_slub_nomerge); 2528 2529static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, 2530 const char *name, int size, gfp_t gfp_flags) 2531{ 2532 unsigned int flags = 0; 2533 2534 if (gfp_flags & SLUB_DMA) 2535 flags = SLAB_CACHE_DMA; 2536 2537 down_write(&slub_lock); 2538 if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, 2539 flags, NULL)) 2540 goto panic; 2541 2542 list_add(&s->list, &slab_caches); 2543 up_write(&slub_lock); 2544 if (sysfs_slab_add(s)) 2545 goto panic; 2546 return s; 2547 2548panic: 2549 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); 2550} 2551 2552#ifdef CONFIG_ZONE_DMA 2553 2554static void sysfs_add_func(struct work_struct *w) 2555{ 2556 struct kmem_cache *s; 2557 2558 down_write(&slub_lock); 2559 list_for_each_entry(s, &slab_caches, list) { 2560 if (s->flags & __SYSFS_ADD_DEFERRED) { 2561 s->flags &= ~__SYSFS_ADD_DEFERRED; 2562 sysfs_slab_add(s); 2563 } 2564 } 2565 up_write(&slub_lock); 2566} 2567 2568static DECLARE_WORK(sysfs_add_work, sysfs_add_func); 2569 2570static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) 2571{ 2572 struct kmem_cache *s; 2573 char *text; 2574 size_t realsize; 2575 2576 s = kmalloc_caches_dma[index]; 2577 if (s) 2578 return s; 2579 2580 /* Dynamically create dma cache */ 2581 if (flags & __GFP_WAIT) 2582 down_write(&slub_lock); 2583 else { 2584 if (!down_write_trylock(&slub_lock)) 2585 goto out; 2586 } 2587 2588 if (kmalloc_caches_dma[index]) 2589 
goto unlock_out; 2590 2591 realsize = kmalloc_caches[index].objsize; 2592 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", (unsigned int)realsize), 2593 s = kmalloc(kmem_size, flags & ~SLUB_DMA); 2594 2595 if (!s || !text || !kmem_cache_open(s, flags, text, 2596 realsize, ARCH_KMALLOC_MINALIGN, 2597 SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) { 2598 kfree(s); 2599 kfree(text); 2600 goto unlock_out; 2601 } 2602 2603 list_add(&s->list, &slab_caches); 2604 kmalloc_caches_dma[index] = s; 2605 2606 schedule_work(&sysfs_add_work); 2607 2608unlock_out: 2609 up_write(&slub_lock); 2610out: 2611 return kmalloc_caches_dma[index]; 2612} 2613#endif 2614 2615/* 2616 * Conversion table for small slabs sizes / 8 to the index in the 2617 * kmalloc array. This is necessary for slabs < 192 since we have non power 2618 * of two cache sizes there. The size of larger slabs can be determined using 2619 * fls. 2620 */ 2621static s8 size_index[24] = { 2622 3, /* 8 */ 2623 4, /* 16 */ 2624 5, /* 24 */ 2625 5, /* 32 */ 2626 6, /* 40 */ 2627 6, /* 48 */ 2628 6, /* 56 */ 2629 6, /* 64 */ 2630 1, /* 72 */ 2631 1, /* 80 */ 2632 1, /* 88 */ 2633 1, /* 96 */ 2634 7, /* 104 */ 2635 7, /* 112 */ 2636 7, /* 120 */ 2637 7, /* 128 */ 2638 2, /* 136 */ 2639 2, /* 144 */ 2640 2, /* 152 */ 2641 2, /* 160 */ 2642 2, /* 168 */ 2643 2, /* 176 */ 2644 2, /* 184 */ 2645 2 /* 192 */ 2646}; 2647 2648static struct kmem_cache *get_slab(size_t size, gfp_t flags) 2649{ 2650 int index; 2651 2652 if (size <= 192) { 2653 if (!size) 2654 return ZERO_SIZE_PTR; 2655 2656 index = size_index[(size - 1) / 8]; 2657 } else 2658 index = fls(size - 1); 2659 2660#ifdef CONFIG_ZONE_DMA 2661 if (unlikely((flags & SLUB_DMA))) 2662 return dma_kmalloc_cache(index, flags); 2663 2664#endif 2665 return &kmalloc_caches[index]; 2666} 2667 2668void *__kmalloc(size_t size, gfp_t flags) 2669{ 2670 struct kmem_cache *s; 2671 2672 if (unlikely(size > PAGE_SIZE / 2)) 2673 return (void *)__get_free_pages(flags | __GFP_COMP, 2674 get_order(size)); 2675 2676 s = get_slab(size, flags); 2677 2678 if (unlikely(ZERO_OR_NULL_PTR(s))) 2679 return s; 2680 2681 return slab_alloc(s, flags, -1, __builtin_return_address(0)); 2682} 2683EXPORT_SYMBOL(__kmalloc); 2684 2685#ifdef CONFIG_NUMA 2686void *__kmalloc_node(size_t size, gfp_t flags, int node) 2687{ 2688 struct kmem_cache *s; 2689 2690 if (unlikely(size > PAGE_SIZE / 2)) 2691 return (void *)__get_free_pages(flags | __GFP_COMP, 2692 get_order(size)); 2693 2694 s = get_slab(size, flags); 2695 2696 if (unlikely(ZERO_OR_NULL_PTR(s))) 2697 return s; 2698 2699 return slab_alloc(s, flags, node, __builtin_return_address(0)); 2700} 2701EXPORT_SYMBOL(__kmalloc_node); 2702#endif 2703 2704size_t ksize(const void *object) 2705{ 2706 struct page *page; 2707 struct kmem_cache *s; 2708 2709 BUG_ON(!object); 2710 if (unlikely(object == ZERO_SIZE_PTR)) 2711 return 0; 2712 2713 page = virt_to_head_page(object); 2714 BUG_ON(!page); 2715 2716 if (unlikely(!PageSlab(page))) 2717 return PAGE_SIZE << compound_order(page); 2718 2719 s = page->slab; 2720 BUG_ON(!s); 2721 2722 /* 2723 * Debugging requires use of the padding between object 2724 * and whatever may come after it. 2725 */ 2726 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) 2727 return s->objsize; 2728 2729 /* 2730 * If we have the need to store the freelist pointer 2731 * back there or track user information then we can 2732 * only use the space before that information. 
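	 *
	 * F.e. kmalloc(100, GFP_KERNEL) is served from the kmalloc-128
	 * cache because size_index[(100 - 1) / 8] == 7, so without any
	 * debug flags ksize() returns 128 and the caller may use all of
	 * the rounded up space. With red zoning or poisoning active only
	 * the original object size is usable and reported.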
2733 */ 2734 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) 2735 return s->inuse; 2736 2737 /* 2738 * Else we can use all the padding etc for the allocation 2739 */ 2740 return s->size; 2741} 2742EXPORT_SYMBOL(ksize); 2743 2744void kfree(const void *x) 2745{ 2746 struct page *page; 2747 void *object = (void *)x; 2748 2749 if (unlikely(ZERO_OR_NULL_PTR(x))) 2750 return; 2751 2752 page = virt_to_head_page(x); 2753 if (unlikely(!PageSlab(page))) { 2754 put_page(page); 2755 return; 2756 } 2757 slab_free(page->slab, page, object, __builtin_return_address(0)); 2758} 2759EXPORT_SYMBOL(kfree); 2760 2761static unsigned long count_partial(struct kmem_cache_node *n) 2762{ 2763 unsigned long flags; 2764 unsigned long x = 0; 2765 struct page *page; 2766 2767 spin_lock_irqsave(&n->list_lock, flags); 2768 list_for_each_entry(page, &n->partial, lru) 2769 x += page->inuse; 2770 spin_unlock_irqrestore(&n->list_lock, flags); 2771 return x; 2772} 2773 2774/* 2775 * kmem_cache_shrink removes empty slabs from the partial lists and sorts 2776 * the remaining slabs by the number of items in use. The slabs with the 2777 * most items in use come first. New allocations will then fill those up 2778 * and thus they can be removed from the partial lists. 2779 * 2780 * The slabs with the least items are placed last. This results in them 2781 * being allocated from last increasing the chance that the last objects 2782 * are freed in them. 2783 */ 2784int kmem_cache_shrink(struct kmem_cache *s) 2785{ 2786 int node; 2787 int i; 2788 struct kmem_cache_node *n; 2789 struct page *page; 2790 struct page *t; 2791 struct list_head *slabs_by_inuse = 2792 kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL); 2793 unsigned long flags; 2794 2795 if (!slabs_by_inuse) 2796 return -ENOMEM; 2797 2798 flush_all(s); 2799 for_each_node_state(node, N_NORMAL_MEMORY) { 2800 n = get_node(s, node); 2801 2802 if (!n->nr_partial) 2803 continue; 2804 2805 for (i = 0; i < s->objects; i++) 2806 INIT_LIST_HEAD(slabs_by_inuse + i); 2807 2808 spin_lock_irqsave(&n->list_lock, flags); 2809 2810 /* 2811 * Build lists indexed by the items in use in each slab. 2812 * 2813 * Note that concurrent frees may occur while we hold the 2814 * list_lock. page->inuse here is the upper limit. 2815 */ 2816 list_for_each_entry_safe(page, t, &n->partial, lru) { 2817 if (!page->inuse && slab_trylock(page)) { 2818 /* 2819 * Must hold slab lock here because slab_free 2820 * may have freed the last object and be 2821 * waiting to release the slab. 2822 */ 2823 list_del(&page->lru); 2824 n->nr_partial--; 2825 slab_unlock(page); 2826 discard_slab(s, page); 2827 } else { 2828 list_move(&page->lru, 2829 slabs_by_inuse + page->inuse); 2830 } 2831 } 2832 2833 /* 2834 * Rebuild the partial list with the slabs filled up most 2835 * first and the least used slabs at the end. 
2836		 */
2837	for (i = s->objects - 1; i >= 0; i--)
2838		list_splice(slabs_by_inuse + i, n->partial.prev);
2839
2840		spin_unlock_irqrestore(&n->list_lock, flags);
2841	}
2842
2843	kfree(slabs_by_inuse);
2844	return 0;
2845}
2846EXPORT_SYMBOL(kmem_cache_shrink);
2847
2848#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
2849static int slab_mem_going_offline_callback(void *arg)
2850{
2851	struct kmem_cache *s;
2852
2853	down_read(&slub_lock);
2854	list_for_each_entry(s, &slab_caches, list)
2855		kmem_cache_shrink(s);
2856	up_read(&slub_lock);
2857
2858	return 0;
2859}
2860
2861static void slab_mem_offline_callback(void *arg)
2862{
2863	struct kmem_cache_node *n;
2864	struct kmem_cache *s;
2865	struct memory_notify *marg = arg;
2866	int offline_node;
2867
2868	offline_node = marg->status_change_nid;
2869
2870	/*
2871	 * If the node still has available memory then we still need its
2872	 * kmem_cache_node structure, so there is nothing to do here.
2873	 */
2874	if (offline_node < 0)
2875		return;
2876
2877	down_read(&slub_lock);
2878	list_for_each_entry(s, &slab_caches, list) {
2879		n = get_node(s, offline_node);
2880		if (n) {
2881			/*
2882			 * If n->nr_slabs > 0, slabs still exist on the node
2883			 * that is going down. We were unable to free them,
2884			 * and offline_pages() shouldn't have called this
2885			 * callback. So, we must fail.
2886			 */
2887			BUG_ON(atomic_long_read(&n->nr_slabs));
2888
2889			s->node[offline_node] = NULL;
2890			kmem_cache_free(kmalloc_caches, n);
2891		}
2892	}
2893	up_read(&slub_lock);
2894}
2895
2896static int slab_mem_going_online_callback(void *arg)
2897{
2898	struct kmem_cache_node *n;
2899	struct kmem_cache *s;
2900	struct memory_notify *marg = arg;
2901	int nid = marg->status_change_nid;
2902	int ret = 0;
2903
2904	/*
2905	 * If the node's memory is already available, then kmem_cache_node is
2906	 * already created. Nothing to do.
2907	 */
2908	if (nid < 0)
2909		return 0;
2910
2911	/*
2912	 * We are bringing a node online. No memory is available yet. We must
2913	 * allocate a kmem_cache_node structure in order to bring the node
2914	 * online.
2915	 */
2916	down_read(&slub_lock);
2917	list_for_each_entry(s, &slab_caches, list) {
2918		/*
2919		 * XXX: kmem_cache_alloc_node will fall back to other nodes
2920		 * since memory is not yet available from the node that
2921		 * is being brought up.
2922 */ 2923 n = kmem_cache_alloc(kmalloc_caches, GFP_KERNEL); 2924 if (!n) { 2925 ret = -ENOMEM; 2926 goto out; 2927 } 2928 init_kmem_cache_node(n); 2929 s->node[nid] = n; 2930 } 2931out: 2932 up_read(&slub_lock); 2933 return ret; 2934} 2935 2936static int slab_memory_callback(struct notifier_block *self, 2937 unsigned long action, void *arg) 2938{ 2939 int ret = 0; 2940 2941 switch (action) { 2942 case MEM_GOING_ONLINE: 2943 ret = slab_mem_going_online_callback(arg); 2944 break; 2945 case MEM_GOING_OFFLINE: 2946 ret = slab_mem_going_offline_callback(arg); 2947 break; 2948 case MEM_OFFLINE: 2949 case MEM_CANCEL_ONLINE: 2950 slab_mem_offline_callback(arg); 2951 break; 2952 case MEM_ONLINE: 2953 case MEM_CANCEL_OFFLINE: 2954 break; 2955 } 2956 2957 ret = notifier_from_errno(ret); 2958 return ret; 2959} 2960 2961#endif /* CONFIG_MEMORY_HOTPLUG */ 2962 2963/******************************************************************** 2964 * Basic setup of slabs 2965 *******************************************************************/ 2966 2967void __init kmem_cache_init(void) 2968{ 2969 int i; 2970 int caches = 0; 2971 2972 init_alloc_cpu(); 2973 2974#ifdef CONFIG_NUMA 2975 /* 2976 * Must first have the slab cache available for the allocations of the 2977 * struct kmem_cache_node's. There is special bootstrap code in 2978 * kmem_cache_open for slab_state == DOWN. 2979 */ 2980 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", 2981 sizeof(struct kmem_cache_node), GFP_KERNEL); 2982 kmalloc_caches[0].refcount = -1; 2983 caches++; 2984 2985 hotplug_memory_notifier(slab_memory_callback, 1); 2986#endif 2987 2988 /* Able to allocate the per node structures */ 2989 slab_state = PARTIAL; 2990 2991 /* Caches that are not of the two-to-the-power-of size */ 2992 if (KMALLOC_MIN_SIZE <= 64) { 2993 create_kmalloc_cache(&kmalloc_caches[1], 2994 "kmalloc-96", 96, GFP_KERNEL); 2995 caches++; 2996 } 2997 if (KMALLOC_MIN_SIZE <= 128) { 2998 create_kmalloc_cache(&kmalloc_caches[2], 2999 "kmalloc-192", 192, GFP_KERNEL); 3000 caches++; 3001 } 3002 3003 for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) { 3004 create_kmalloc_cache(&kmalloc_caches[i], 3005 "kmalloc", 1 << i, GFP_KERNEL); 3006 caches++; 3007 } 3008 3009 3010 /* 3011 * Patch up the size_index table if we have strange large alignment 3012 * requirements for the kmalloc array. This is only the case for 3013 * mips it seems. The standard arches will not generate any code here. 3014 * 3015 * Largest permitted alignment is 256 bytes due to the way we 3016 * handle the index determination for the smaller caches. 3017 * 3018 * Make sure that nothing crazy happens if someone starts tinkering 3019 * around with ARCH_KMALLOC_MINALIGN 3020 */ 3021 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || 3022 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); 3023 3024 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) 3025 size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW; 3026 3027 slab_state = UP; 3028 3029 /* Provide the correct kmalloc names now that the caches are up */ 3030 for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) 3031 kmalloc_caches[i]. 
name = 3032 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); 3033 3034#ifdef CONFIG_SMP 3035 register_cpu_notifier(&slab_notifier); 3036 kmem_size = offsetof(struct kmem_cache, cpu_slab) + 3037 nr_cpu_ids * sizeof(struct kmem_cache_cpu *); 3038#else 3039 kmem_size = sizeof(struct kmem_cache); 3040#endif 3041 3042 3043 printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," 3044 " CPUs=%d, Nodes=%d\n", 3045 caches, cache_line_size(), 3046 slub_min_order, slub_max_order, slub_min_objects, 3047 nr_cpu_ids, nr_node_ids); 3048} 3049 3050/* 3051 * Find a mergeable slab cache 3052 */ 3053static int slab_unmergeable(struct kmem_cache *s) 3054{ 3055 if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) 3056 return 1; 3057 3058 if (s->ctor) 3059 return 1; 3060 3061 /* 3062 * We may have set a slab to be unmergeable during bootstrap. 3063 */ 3064 if (s->refcount < 0) 3065 return 1; 3066 3067 return 0; 3068} 3069 3070static struct kmem_cache *find_mergeable(size_t size, 3071 size_t align, unsigned long flags, const char *name, 3072 void (*ctor)(struct kmem_cache *, void *)) 3073{ 3074 struct kmem_cache *s; 3075 3076 if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) 3077 return NULL; 3078 3079 if (ctor) 3080 return NULL; 3081 3082 size = ALIGN(size, sizeof(void *)); 3083 align = calculate_alignment(flags, align, size); 3084 size = ALIGN(size, align); 3085 flags = kmem_cache_flags(size, flags, name, NULL); 3086 3087 list_for_each_entry(s, &slab_caches, list) { 3088 if (slab_unmergeable(s)) 3089 continue; 3090 3091 if (size > s->size) 3092 continue; 3093 3094 if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) 3095 continue; 3096 /* 3097 * Check if alignment is compatible. 3098 * Courtesy of Adrian Drzewiecki 3099 */ 3100 if ((s->size & ~(align - 1)) != s->size) 3101 continue; 3102 3103 if (s->size - size >= sizeof(void *)) 3104 continue; 3105 3106 return s; 3107 } 3108 return NULL; 3109} 3110 3111struct kmem_cache *kmem_cache_create(const char *name, size_t size, 3112 size_t align, unsigned long flags, 3113 void (*ctor)(struct kmem_cache *, void *)) 3114{ 3115 struct kmem_cache *s; 3116 3117 down_write(&slub_lock); 3118 s = find_mergeable(size, align, flags, name, ctor); 3119 if (s) { 3120 int cpu; 3121 3122 s->refcount++; 3123 /* 3124 * Adjust the object sizes so that we clear 3125 * the complete object on kzalloc. 3126 */ 3127 s->objsize = max(s->objsize, (int)size); 3128 3129 /* 3130 * And then we need to update the object size in the 3131 * per cpu structures 3132 */ 3133 for_each_online_cpu(cpu) 3134 get_cpu_slab(s, cpu)->objsize = s->objsize; 3135 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3136 up_write(&slub_lock); 3137 if (sysfs_slab_alias(s, name)) 3138 goto err; 3139 return s; 3140 } 3141 s = kmalloc(kmem_size, GFP_KERNEL); 3142 if (s) { 3143 if (kmem_cache_open(s, GFP_KERNEL, name, 3144 size, align, flags, ctor)) { 3145 list_add(&s->list, &slab_caches); 3146 up_write(&slub_lock); 3147 if (sysfs_slab_add(s)) 3148 goto err; 3149 return s; 3150 } 3151 kfree(s); 3152 } 3153 up_write(&slub_lock); 3154 3155err: 3156 if (flags & SLAB_PANIC) 3157 panic("Cannot create slabcache %s\n", name); 3158 else 3159 s = NULL; 3160 return s; 3161} 3162EXPORT_SYMBOL(kmem_cache_create); 3163 3164#ifdef CONFIG_SMP 3165/* 3166 * Use the cpu notifier to insure that the cpu slabs are flushed when 3167 * necessary. 
3168 */ 3169static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, 3170 unsigned long action, void *hcpu) 3171{ 3172 long cpu = (long)hcpu; 3173 struct kmem_cache *s; 3174 unsigned long flags; 3175 3176 switch (action) { 3177 case CPU_UP_PREPARE: 3178 case CPU_UP_PREPARE_FROZEN: 3179 init_alloc_cpu_cpu(cpu); 3180 down_read(&slub_lock); 3181 list_for_each_entry(s, &slab_caches, list) 3182 s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu, 3183 GFP_KERNEL); 3184 up_read(&slub_lock); 3185 break; 3186 3187 case CPU_UP_CANCELED: 3188 case CPU_UP_CANCELED_FROZEN: 3189 case CPU_DEAD: 3190 case CPU_DEAD_FROZEN: 3191 down_read(&slub_lock); 3192 list_for_each_entry(s, &slab_caches, list) { 3193 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); 3194 3195 local_irq_save(flags); 3196 __flush_cpu_slab(s, cpu); 3197 local_irq_restore(flags); 3198 free_kmem_cache_cpu(c, cpu); 3199 s->cpu_slab[cpu] = NULL; 3200 } 3201 up_read(&slub_lock); 3202 break; 3203 default: 3204 break; 3205 } 3206 return NOTIFY_OK; 3207} 3208 3209static struct notifier_block __cpuinitdata slab_notifier = { 3210 &slab_cpuup_callback, NULL, 0 3211}; 3212 3213#endif 3214 3215void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) 3216{ 3217 struct kmem_cache *s; 3218 3219 if (unlikely(size > PAGE_SIZE / 2)) 3220 return (void *)__get_free_pages(gfpflags | __GFP_COMP, 3221 get_order(size)); 3222 s = get_slab(size, gfpflags); 3223 3224 if (unlikely(ZERO_OR_NULL_PTR(s))) 3225 return s; 3226 3227 return slab_alloc(s, gfpflags, -1, caller); 3228} 3229 3230void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, 3231 int node, void *caller) 3232{ 3233 struct kmem_cache *s; 3234 3235 if (unlikely(size > PAGE_SIZE / 2)) 3236 return (void *)__get_free_pages(gfpflags | __GFP_COMP, 3237 get_order(size)); 3238 s = get_slab(size, gfpflags); 3239 3240 if (unlikely(ZERO_OR_NULL_PTR(s))) 3241 return s; 3242 3243 return slab_alloc(s, gfpflags, node, caller); 3244} 3245 3246#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) 3247static int validate_slab(struct kmem_cache *s, struct page *page, 3248 unsigned long *map) 3249{ 3250 void *p; 3251 void *addr = slab_address(page); 3252 3253 if (!check_slab(s, page) || 3254 !on_freelist(s, page, NULL)) 3255 return 0; 3256 3257 /* Now we know that a valid freelist exists */ 3258 bitmap_zero(map, s->objects); 3259 3260 for_each_free_object(p, s, page->freelist) { 3261 set_bit(slab_index(p, s, addr), map); 3262 if (!check_object(s, page, p, 0)) 3263 return 0; 3264 } 3265 3266 for_each_object(p, s, addr) 3267 if (!test_bit(slab_index(p, s, addr), map)) 3268 if (!check_object(s, page, p, 1)) 3269 return 0; 3270 return 1; 3271} 3272 3273static void validate_slab_slab(struct kmem_cache *s, struct page *page, 3274 unsigned long *map) 3275{ 3276 if (slab_trylock(page)) { 3277 validate_slab(s, page, map); 3278 slab_unlock(page); 3279 } else 3280 printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", 3281 s->name, page); 3282 3283 if (s->flags & DEBUG_DEFAULT_FLAGS) { 3284 if (!SlabDebug(page)) 3285 printk(KERN_ERR "SLUB %s: SlabDebug not set " 3286 "on slab 0x%p\n", s->name, page); 3287 } else { 3288 if (SlabDebug(page)) 3289 printk(KERN_ERR "SLUB %s: SlabDebug set on " 3290 "slab 0x%p\n", s->name, page); 3291 } 3292} 3293 3294static int validate_slab_node(struct kmem_cache *s, 3295 struct kmem_cache_node *n, unsigned long *map) 3296{ 3297 unsigned long count = 0; 3298 struct page *page; 3299 unsigned long flags; 3300 3301 spin_lock_irqsave(&n->list_lock, flags); 3302 3303 
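	/* Walk the partial list first; each slab is trylocked and validated. */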
list_for_each_entry(page, &n->partial, lru) { 3304 validate_slab_slab(s, page, map); 3305 count++; 3306 } 3307 if (count != n->nr_partial) 3308 printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " 3309 "counter=%ld\n", s->name, count, n->nr_partial); 3310 3311 if (!(s->flags & SLAB_STORE_USER)) 3312 goto out; 3313 3314 list_for_each_entry(page, &n->full, lru) { 3315 validate_slab_slab(s, page, map); 3316 count++; 3317 } 3318 if (count != atomic_long_read(&n->nr_slabs)) 3319 printk(KERN_ERR "SLUB: %s %ld slabs counted but " 3320 "counter=%ld\n", s->name, count, 3321 atomic_long_read(&n->nr_slabs)); 3322 3323out: 3324 spin_unlock_irqrestore(&n->list_lock, flags); 3325 return count; 3326} 3327 3328static long validate_slab_cache(struct kmem_cache *s) 3329{ 3330 int node; 3331 unsigned long count = 0; 3332 unsigned long *map = kmalloc(BITS_TO_LONGS(s->objects) * 3333 sizeof(unsigned long), GFP_KERNEL); 3334 3335 if (!map) 3336 return -ENOMEM; 3337 3338 flush_all(s); 3339 for_each_node_state(node, N_NORMAL_MEMORY) { 3340 struct kmem_cache_node *n = get_node(s, node); 3341 3342 count += validate_slab_node(s, n, map); 3343 } 3344 kfree(map); 3345 return count; 3346} 3347 3348#ifdef SLUB_RESILIENCY_TEST 3349static void resiliency_test(void) 3350{ 3351 u8 *p; 3352 3353 printk(KERN_ERR "SLUB resiliency testing\n"); 3354 printk(KERN_ERR "-----------------------\n"); 3355 printk(KERN_ERR "A. Corruption after allocation\n"); 3356 3357 p = kzalloc(16, GFP_KERNEL); 3358 p[16] = 0x12; 3359 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" 3360 " 0x12->0x%p\n\n", p + 16); 3361 3362 validate_slab_cache(kmalloc_caches + 4); 3363 3364 /* Hmmm... The next two are dangerous */ 3365 p = kzalloc(32, GFP_KERNEL); 3366 p[32 + sizeof(void *)] = 0x34; 3367 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" 3368 " 0x34 -> -0x%p\n", p); 3369 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); 3370 3371 validate_slab_cache(kmalloc_caches + 5); 3372 p = kzalloc(64, GFP_KERNEL); 3373 p += 64 + (get_cycles() & 0xff) * sizeof(void *); 3374 *p = 0x56; 3375 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", 3376 p); 3377 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); 3378 validate_slab_cache(kmalloc_caches + 6); 3379 3380 printk(KERN_ERR "\nB. Corruption after free\n"); 3381 p = kzalloc(128, GFP_KERNEL); 3382 kfree(p); 3383 *p = 0x78; 3384 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); 3385 validate_slab_cache(kmalloc_caches + 7); 3386 3387 p = kzalloc(256, GFP_KERNEL); 3388 kfree(p); 3389 p[50] = 0x9a; 3390 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); 3391 validate_slab_cache(kmalloc_caches + 8); 3392 3393 p = kzalloc(512, GFP_KERNEL); 3394 kfree(p); 3395 p[512] = 0xab; 3396 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); 3397 validate_slab_cache(kmalloc_caches + 9); 3398} 3399#else 3400static void resiliency_test(void) {}; 3401#endif 3402 3403/* 3404 * Generate lists of code addresses where slabcache objects are allocated 3405 * and freed. 
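 *
 * The list is kept sorted by caller address, so finding a caller is a
 * binary search and a new caller is inserted in place via memmove().
 * A sketch of the search step in add_location() below, for an
 * illustrative sorted array addr[0..count-1] and a lookup key:
 *
 *	long start = -1, end = count, pos;
 *
 *	for ( ; ; ) {
 *		pos = start + (end - start + 1) / 2;
 *		if (pos == end)
 *			break;		// not found: insert at pos
 *		if (key == addr[pos])
 *			return pos;	// found: just update counters
 *		if (key < addr[pos])
 *			end = pos;
 *		else
 *			start = pos;
 *	}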
3406 */ 3407 3408struct location { 3409 unsigned long count; 3410 void *addr; 3411 long long sum_time; 3412 long min_time; 3413 long max_time; 3414 long min_pid; 3415 long max_pid; 3416 cpumask_t cpus; 3417 nodemask_t nodes; 3418}; 3419 3420struct loc_track { 3421 unsigned long max; 3422 unsigned long count; 3423 struct location *loc; 3424}; 3425 3426static void free_loc_track(struct loc_track *t) 3427{ 3428 if (t->max) 3429 free_pages((unsigned long)t->loc, 3430 get_order(sizeof(struct location) * t->max)); 3431} 3432 3433static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) 3434{ 3435 struct location *l; 3436 int order; 3437 3438 order = get_order(sizeof(struct location) * max); 3439 3440 l = (void *)__get_free_pages(flags, order); 3441 if (!l) 3442 return 0; 3443 3444 if (t->count) { 3445 memcpy(l, t->loc, sizeof(struct location) * t->count); 3446 free_loc_track(t); 3447 } 3448 t->max = max; 3449 t->loc = l; 3450 return 1; 3451} 3452 3453static int add_location(struct loc_track *t, struct kmem_cache *s, 3454 const struct track *track) 3455{ 3456 long start, end, pos; 3457 struct location *l; 3458 void *caddr; 3459 unsigned long age = jiffies - track->when; 3460 3461 start = -1; 3462 end = t->count; 3463 3464 for ( ; ; ) { 3465 pos = start + (end - start + 1) / 2; 3466 3467 /* 3468 * There is nothing at "end". If we end up there 3469 * we need to add something to before end. 3470 */ 3471 if (pos == end) 3472 break; 3473 3474 caddr = t->loc[pos].addr; 3475 if (track->addr == caddr) { 3476 3477 l = &t->loc[pos]; 3478 l->count++; 3479 if (track->when) { 3480 l->sum_time += age; 3481 if (age < l->min_time) 3482 l->min_time = age; 3483 if (age > l->max_time) 3484 l->max_time = age; 3485 3486 if (track->pid < l->min_pid) 3487 l->min_pid = track->pid; 3488 if (track->pid > l->max_pid) 3489 l->max_pid = track->pid; 3490 3491 cpu_set(track->cpu, l->cpus); 3492 } 3493 node_set(page_to_nid(virt_to_page(track)), l->nodes); 3494 return 1; 3495 } 3496 3497 if (track->addr < caddr) 3498 end = pos; 3499 else 3500 start = pos; 3501 } 3502 3503 /* 3504 * Not found. Insert new tracking element. 
3505 */ 3506 if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC)) 3507 return 0; 3508 3509 l = t->loc + pos; 3510 if (pos < t->count) 3511 memmove(l + 1, l, 3512 (t->count - pos) * sizeof(struct location)); 3513 t->count++; 3514 l->count = 1; 3515 l->addr = track->addr; 3516 l->sum_time = age; 3517 l->min_time = age; 3518 l->max_time = age; 3519 l->min_pid = track->pid; 3520 l->max_pid = track->pid; 3521 cpus_clear(l->cpus); 3522 cpu_set(track->cpu, l->cpus); 3523 nodes_clear(l->nodes); 3524 node_set(page_to_nid(virt_to_page(track)), l->nodes); 3525 return 1; 3526} 3527 3528static void process_slab(struct loc_track *t, struct kmem_cache *s, 3529 struct page *page, enum track_item alloc) 3530{ 3531 void *addr = slab_address(page); 3532 DECLARE_BITMAP(map, s->objects); 3533 void *p; 3534 3535 bitmap_zero(map, s->objects); 3536 for_each_free_object(p, s, page->freelist) 3537 set_bit(slab_index(p, s, addr), map); 3538 3539 for_each_object(p, s, addr) 3540 if (!test_bit(slab_index(p, s, addr), map)) 3541 add_location(t, s, get_track(s, p, alloc)); 3542} 3543 3544static int list_locations(struct kmem_cache *s, char *buf, 3545 enum track_item alloc) 3546{ 3547 int len = 0; 3548 unsigned long i; 3549 struct loc_track t = { 0, 0, NULL }; 3550 int node; 3551 3552 if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), 3553 GFP_TEMPORARY)) 3554 return sprintf(buf, "Out of memory\n"); 3555 3556 /* Push back cpu slabs */ 3557 flush_all(s); 3558 3559 for_each_node_state(node, N_NORMAL_MEMORY) { 3560 struct kmem_cache_node *n = get_node(s, node); 3561 unsigned long flags; 3562 struct page *page; 3563 3564 if (!atomic_long_read(&n->nr_slabs)) 3565 continue; 3566 3567 spin_lock_irqsave(&n->list_lock, flags); 3568 list_for_each_entry(page, &n->partial, lru) 3569 process_slab(&t, s, page, alloc); 3570 list_for_each_entry(page, &n->full, lru) 3571 process_slab(&t, s, page, alloc); 3572 spin_unlock_irqrestore(&n->list_lock, flags); 3573 } 3574 3575 for (i = 0; i < t.count; i++) { 3576 struct location *l = &t.loc[i]; 3577 3578 if (len > PAGE_SIZE - 100) 3579 break; 3580 len += sprintf(buf + len, "%7ld ", l->count); 3581 3582 if (l->addr) 3583 len += sprint_symbol(buf + len, (unsigned long)l->addr); 3584 else 3585 len += sprintf(buf + len, "<not-available>"); 3586 3587 if (l->sum_time != l->min_time) { 3588 unsigned long remainder; 3589 3590 len += sprintf(buf + len, " age=%ld/%ld/%ld", 3591 l->min_time, 3592 div_long_long_rem(l->sum_time, l->count, &remainder), 3593 l->max_time); 3594 } else 3595 len += sprintf(buf + len, " age=%ld", 3596 l->min_time); 3597 3598 if (l->min_pid != l->max_pid) 3599 len += sprintf(buf + len, " pid=%ld-%ld", 3600 l->min_pid, l->max_pid); 3601 else 3602 len += sprintf(buf + len, " pid=%ld", 3603 l->min_pid); 3604 3605 if (num_online_cpus() > 1 && !cpus_empty(l->cpus) && 3606 len < PAGE_SIZE - 60) { 3607 len += sprintf(buf + len, " cpus="); 3608 len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50, 3609 l->cpus); 3610 } 3611 3612 if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && 3613 len < PAGE_SIZE - 60) { 3614 len += sprintf(buf + len, " nodes="); 3615 len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, 3616 l->nodes); 3617 } 3618 3619 len += sprintf(buf + len, "\n"); 3620 } 3621 3622 free_loc_track(&t); 3623 if (!t.count) 3624 len += sprintf(buf, "No data\n"); 3625 return len; 3626} 3627 3628enum slab_stat_type { 3629 SL_FULL, 3630 SL_PARTIAL, 3631 SL_CPU, 3632 SL_OBJECTS 3633}; 3634 3635#define SO_FULL (1 << SL_FULL) 3636#define SO_PARTIAL 
(1 << SL_PARTIAL) 3637#define SO_CPU (1 << SL_CPU) 3638#define SO_OBJECTS (1 << SL_OBJECTS) 3639 3640static unsigned long slab_objects(struct kmem_cache *s, 3641 char *buf, unsigned long flags) 3642{ 3643 unsigned long total = 0; 3644 int cpu; 3645 int node; 3646 int x; 3647 unsigned long *nodes; 3648 unsigned long *per_cpu; 3649 3650 nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); 3651 per_cpu = nodes + nr_node_ids; 3652 3653 for_each_possible_cpu(cpu) { 3654 struct page *page; 3655 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); 3656 3657 if (!c) 3658 continue; 3659 3660 page = c->page; 3661 node = c->node; 3662 if (node < 0) 3663 continue; 3664 if (page) { 3665 if (flags & SO_CPU) { 3666 if (flags & SO_OBJECTS) 3667 x = page->inuse; 3668 else 3669 x = 1; 3670 total += x; 3671 nodes[node] += x; 3672 } 3673 per_cpu[node]++; 3674 } 3675 } 3676 3677 for_each_node_state(node, N_NORMAL_MEMORY) { 3678 struct kmem_cache_node *n = get_node(s, node); 3679 3680 if (flags & SO_PARTIAL) { 3681 if (flags & SO_OBJECTS) 3682 x = count_partial(n); 3683 else 3684 x = n->nr_partial; 3685 total += x; 3686 nodes[node] += x; 3687 } 3688 3689 if (flags & SO_FULL) { 3690 int full_slabs = atomic_long_read(&n->nr_slabs) 3691 - per_cpu[node] 3692 - n->nr_partial; 3693 3694 if (flags & SO_OBJECTS) 3695 x = full_slabs * s->objects; 3696 else 3697 x = full_slabs; 3698 total += x; 3699 nodes[node] += x; 3700 } 3701 } 3702 3703 x = sprintf(buf, "%lu", total); 3704#ifdef CONFIG_NUMA 3705 for_each_node_state(node, N_NORMAL_MEMORY) 3706 if (nodes[node]) 3707 x += sprintf(buf + x, " N%d=%lu", 3708 node, nodes[node]); 3709#endif 3710 kfree(nodes); 3711 return x + sprintf(buf + x, "\n"); 3712} 3713 3714static int any_slab_objects(struct kmem_cache *s) 3715{ 3716 int node; 3717 int cpu; 3718 3719 for_each_possible_cpu(cpu) { 3720 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); 3721 3722 if (c && c->page) 3723 return 1; 3724 } 3725 3726 for_each_online_node(node) { 3727 struct kmem_cache_node *n = get_node(s, node); 3728 3729 if (!n) 3730 continue; 3731 3732 if (n->nr_partial || atomic_long_read(&n->nr_slabs)) 3733 return 1; 3734 } 3735 return 0; 3736} 3737 3738#define to_slab_attr(n) container_of(n, struct slab_attribute, attr) 3739#define to_slab(n) container_of(n, struct kmem_cache, kobj); 3740 3741struct slab_attribute { 3742 struct attribute attr; 3743 ssize_t (*show)(struct kmem_cache *s, char *buf); 3744 ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count); 3745}; 3746 3747#define SLAB_ATTR_RO(_name) \ 3748 static struct slab_attribute _name##_attr = __ATTR_RO(_name) 3749 3750#define SLAB_ATTR(_name) \ 3751 static struct slab_attribute _name##_attr = \ 3752 __ATTR(_name, 0644, _name##_show, _name##_store) 3753 3754static ssize_t slab_size_show(struct kmem_cache *s, char *buf) 3755{ 3756 return sprintf(buf, "%d\n", s->size); 3757} 3758SLAB_ATTR_RO(slab_size); 3759 3760static ssize_t align_show(struct kmem_cache *s, char *buf) 3761{ 3762 return sprintf(buf, "%d\n", s->align); 3763} 3764SLAB_ATTR_RO(align); 3765 3766static ssize_t object_size_show(struct kmem_cache *s, char *buf) 3767{ 3768 return sprintf(buf, "%d\n", s->objsize); 3769} 3770SLAB_ATTR_RO(object_size); 3771 3772static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) 3773{ 3774 return sprintf(buf, "%d\n", s->objects); 3775} 3776SLAB_ATTR_RO(objs_per_slab); 3777 3778static ssize_t order_show(struct kmem_cache *s, char *buf) 3779{ 3780 return sprintf(buf, "%d\n", s->order); 3781} 3782SLAB_ATTR_RO(order); 3783 
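/*
 * Each sysfs file under /sys/slab/<cache>/ is one slab_attribute wired
 * up through the SLAB_ATTR()/SLAB_ATTR_RO() macros above. As a sketch,
 * a hypothetical read-only "refcount" attribute (not actually wired
 * into slab_attrs[] below) would look like:
 *
 *	static ssize_t refcount_show(struct kmem_cache *s, char *buf)
 *	{
 *		return sprintf(buf, "%d\n", s->refcount);
 *	}
 *	SLAB_ATTR_RO(refcount);
 *
 * A writable attribute additionally needs a <name>_store() function
 * and uses SLAB_ATTR() instead; either way &<name>_attr.attr must be
 * added to the slab_attrs[] array for the file to appear.
 */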
3784static ssize_t ctor_show(struct kmem_cache *s, char *buf) 3785{ 3786 if (s->ctor) { 3787 int n = sprint_symbol(buf, (unsigned long)s->ctor); 3788 3789 return n + sprintf(buf + n, "\n"); 3790 } 3791 return 0; 3792} 3793SLAB_ATTR_RO(ctor); 3794 3795static ssize_t aliases_show(struct kmem_cache *s, char *buf) 3796{ 3797 return sprintf(buf, "%d\n", s->refcount - 1); 3798} 3799SLAB_ATTR_RO(aliases); 3800 3801static ssize_t slabs_show(struct kmem_cache *s, char *buf) 3802{ 3803 return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU); 3804} 3805SLAB_ATTR_RO(slabs); 3806 3807static ssize_t partial_show(struct kmem_cache *s, char *buf) 3808{ 3809 return slab_objects(s, buf, SO_PARTIAL); 3810} 3811SLAB_ATTR_RO(partial); 3812 3813static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf) 3814{ 3815 return slab_objects(s, buf, SO_CPU); 3816} 3817SLAB_ATTR_RO(cpu_slabs); 3818 3819static ssize_t objects_show(struct kmem_cache *s, char *buf) 3820{ 3821 return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS); 3822} 3823SLAB_ATTR_RO(objects); 3824 3825static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) 3826{ 3827 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); 3828} 3829 3830static ssize_t sanity_checks_store(struct kmem_cache *s, 3831 const char *buf, size_t length) 3832{ 3833 s->flags &= ~SLAB_DEBUG_FREE; 3834 if (buf[0] == '1') 3835 s->flags |= SLAB_DEBUG_FREE; 3836 return length; 3837} 3838SLAB_ATTR(sanity_checks); 3839 3840static ssize_t trace_show(struct kmem_cache *s, char *buf) 3841{ 3842 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); 3843} 3844 3845static ssize_t trace_store(struct kmem_cache *s, const char *buf, 3846 size_t length) 3847{ 3848 s->flags &= ~SLAB_TRACE; 3849 if (buf[0] == '1') 3850 s->flags |= SLAB_TRACE; 3851 return length; 3852} 3853SLAB_ATTR(trace); 3854 3855static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 3856{ 3857 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 3858} 3859 3860static ssize_t reclaim_account_store(struct kmem_cache *s, 3861 const char *buf, size_t length) 3862{ 3863 s->flags &= ~SLAB_RECLAIM_ACCOUNT; 3864 if (buf[0] == '1') 3865 s->flags |= SLAB_RECLAIM_ACCOUNT; 3866 return length; 3867} 3868SLAB_ATTR(reclaim_account); 3869 3870static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) 3871{ 3872 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); 3873} 3874SLAB_ATTR_RO(hwcache_align); 3875 3876#ifdef CONFIG_ZONE_DMA 3877static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) 3878{ 3879 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); 3880} 3881SLAB_ATTR_RO(cache_dma); 3882#endif 3883 3884static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) 3885{ 3886 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); 3887} 3888SLAB_ATTR_RO(destroy_by_rcu); 3889 3890static ssize_t red_zone_show(struct kmem_cache *s, char *buf) 3891{ 3892 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); 3893} 3894 3895static ssize_t red_zone_store(struct kmem_cache *s, 3896 const char *buf, size_t length) 3897{ 3898 if (any_slab_objects(s)) 3899 return -EBUSY; 3900 3901 s->flags &= ~SLAB_RED_ZONE; 3902 if (buf[0] == '1') 3903 s->flags |= SLAB_RED_ZONE; 3904 calculate_sizes(s); 3905 return length; 3906} 3907SLAB_ATTR(red_zone); 3908 3909static ssize_t poison_show(struct kmem_cache *s, char *buf) 3910{ 3911 return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON)); 3912} 3913 3914static ssize_t poison_store(struct kmem_cache *s, 3915 
const char *buf, size_t length) 3916{ 3917 if (any_slab_objects(s)) 3918 return -EBUSY; 3919 3920 s->flags &= ~SLAB_POISON; 3921 if (buf[0] == '1') 3922 s->flags |= SLAB_POISON; 3923 calculate_sizes(s); 3924 return length; 3925} 3926SLAB_ATTR(poison); 3927 3928static ssize_t store_user_show(struct kmem_cache *s, char *buf) 3929{ 3930 return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); 3931} 3932 3933static ssize_t store_user_store(struct kmem_cache *s, 3934 const char *buf, size_t length) 3935{ 3936 if (any_slab_objects(s)) 3937 return -EBUSY; 3938 3939 s->flags &= ~SLAB_STORE_USER; 3940 if (buf[0] == '1') 3941 s->flags |= SLAB_STORE_USER; 3942 calculate_sizes(s); 3943 return length; 3944} 3945SLAB_ATTR(store_user); 3946 3947static ssize_t validate_show(struct kmem_cache *s, char *buf) 3948{ 3949 return 0; 3950} 3951 3952static ssize_t validate_store(struct kmem_cache *s, 3953 const char *buf, size_t length) 3954{ 3955 int ret = -EINVAL; 3956 3957 if (buf[0] == '1') { 3958 ret = validate_slab_cache(s); 3959 if (ret >= 0) 3960 ret = length; 3961 } 3962 return ret; 3963} 3964SLAB_ATTR(validate); 3965 3966static ssize_t shrink_show(struct kmem_cache *s, char *buf) 3967{ 3968 return 0; 3969} 3970 3971static ssize_t shrink_store(struct kmem_cache *s, 3972 const char *buf, size_t length) 3973{ 3974 if (buf[0] == '1') { 3975 int rc = kmem_cache_shrink(s); 3976 3977 if (rc) 3978 return rc; 3979 } else 3980 return -EINVAL; 3981 return length; 3982} 3983SLAB_ATTR(shrink); 3984 3985static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) 3986{ 3987 if (!(s->flags & SLAB_STORE_USER)) 3988 return -ENOSYS; 3989 return list_locations(s, buf, TRACK_ALLOC); 3990} 3991SLAB_ATTR_RO(alloc_calls); 3992 3993static ssize_t free_calls_show(struct kmem_cache *s, char *buf) 3994{ 3995 if (!(s->flags & SLAB_STORE_USER)) 3996 return -ENOSYS; 3997 return list_locations(s, buf, TRACK_FREE); 3998} 3999SLAB_ATTR_RO(free_calls); 4000 4001#ifdef CONFIG_NUMA 4002static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) 4003{ 4004 return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10); 4005} 4006 4007static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, 4008 const char *buf, size_t length) 4009{ 4010 int n = simple_strtoul(buf, NULL, 10); 4011 4012 if (n < 100) 4013 s->remote_node_defrag_ratio = n * 10; 4014 return length; 4015} 4016SLAB_ATTR(remote_node_defrag_ratio); 4017#endif 4018 4019#ifdef CONFIG_SLUB_STATS 4020 4021static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) 4022{ 4023 unsigned long sum = 0; 4024 int cpu; 4025 int len; 4026 int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL); 4027 4028 if (!data) 4029 return -ENOMEM; 4030 4031 for_each_online_cpu(cpu) { 4032 unsigned x = get_cpu_slab(s, cpu)->stat[si]; 4033 4034 data[cpu] = x; 4035 sum += x; 4036 } 4037 4038 len = sprintf(buf, "%lu", sum); 4039 4040 for_each_online_cpu(cpu) { 4041 if (data[cpu] && len < PAGE_SIZE - 20) 4042 len += sprintf(buf + len, " c%d=%u", cpu, data[cpu]); 4043 } 4044 kfree(data); 4045 return len + sprintf(buf + len, "\n"); 4046} 4047 4048#define STAT_ATTR(si, text) \ 4049static ssize_t text##_show(struct kmem_cache *s, char *buf) \ 4050{ \ 4051 return show_stat(s, buf, si); \ 4052} \ 4053SLAB_ATTR_RO(text); \ 4054 4055STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); 4056STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); 4057STAT_ATTR(FREE_FASTPATH, free_fastpath); 4058STAT_ATTR(FREE_SLOWPATH, free_slowpath); 4059STAT_ATTR(FREE_FROZEN, free_frozen); 
4060STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial); 4061STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); 4062STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); 4063STAT_ATTR(ALLOC_SLAB, alloc_slab); 4064STAT_ATTR(ALLOC_REFILL, alloc_refill); 4065STAT_ATTR(FREE_SLAB, free_slab); 4066STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); 4067STAT_ATTR(DEACTIVATE_FULL, deactivate_full); 4068STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); 4069STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); 4070STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); 4071STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); 4072 4073#endif 4074 4075static struct attribute *slab_attrs[] = { 4076 &slab_size_attr.attr, 4077 &object_size_attr.attr, 4078 &objs_per_slab_attr.attr, 4079 &order_attr.attr, 4080 &objects_attr.attr, 4081 &slabs_attr.attr, 4082 &partial_attr.attr, 4083 &cpu_slabs_attr.attr, 4084 &ctor_attr.attr, 4085 &aliases_attr.attr, 4086 &align_attr.attr, 4087 &sanity_checks_attr.attr, 4088 &trace_attr.attr, 4089 &hwcache_align_attr.attr, 4090 &reclaim_account_attr.attr, 4091 &destroy_by_rcu_attr.attr, 4092 &red_zone_attr.attr, 4093 &poison_attr.attr, 4094 &store_user_attr.attr, 4095 &validate_attr.attr, 4096 &shrink_attr.attr, 4097 &alloc_calls_attr.attr, 4098 &free_calls_attr.attr, 4099#ifdef CONFIG_ZONE_DMA 4100 &cache_dma_attr.attr, 4101#endif 4102#ifdef CONFIG_NUMA 4103 &remote_node_defrag_ratio_attr.attr, 4104#endif 4105#ifdef CONFIG_SLUB_STATS 4106 &alloc_fastpath_attr.attr, 4107 &alloc_slowpath_attr.attr, 4108 &free_fastpath_attr.attr, 4109 &free_slowpath_attr.attr, 4110 &free_frozen_attr.attr, 4111 &free_add_partial_attr.attr, 4112 &free_remove_partial_attr.attr, 4113 &alloc_from_partial_attr.attr, 4114 &alloc_slab_attr.attr, 4115 &alloc_refill_attr.attr, 4116 &free_slab_attr.attr, 4117 &cpuslab_flush_attr.attr, 4118 &deactivate_full_attr.attr, 4119 &deactivate_empty_attr.attr, 4120 &deactivate_to_head_attr.attr, 4121 &deactivate_to_tail_attr.attr, 4122 &deactivate_remote_frees_attr.attr, 4123#endif 4124 NULL 4125}; 4126 4127static struct attribute_group slab_attr_group = { 4128 .attrs = slab_attrs, 4129}; 4130 4131static ssize_t slab_attr_show(struct kobject *kobj, 4132 struct attribute *attr, 4133 char *buf) 4134{ 4135 struct slab_attribute *attribute; 4136 struct kmem_cache *s; 4137 int err; 4138 4139 attribute = to_slab_attr(attr); 4140 s = to_slab(kobj); 4141 4142 if (!attribute->show) 4143 return -EIO; 4144 4145 err = attribute->show(s, buf); 4146 4147 return err; 4148} 4149 4150static ssize_t slab_attr_store(struct kobject *kobj, 4151 struct attribute *attr, 4152 const char *buf, size_t len) 4153{ 4154 struct slab_attribute *attribute; 4155 struct kmem_cache *s; 4156 int err; 4157 4158 attribute = to_slab_attr(attr); 4159 s = to_slab(kobj); 4160 4161 if (!attribute->store) 4162 return -EIO; 4163 4164 err = attribute->store(s, buf, len); 4165 4166 return err; 4167} 4168 4169static void kmem_cache_release(struct kobject *kobj) 4170{ 4171 struct kmem_cache *s = to_slab(kobj); 4172 4173 kfree(s); 4174} 4175 4176static struct sysfs_ops slab_sysfs_ops = { 4177 .show = slab_attr_show, 4178 .store = slab_attr_store, 4179}; 4180 4181static struct kobj_type slab_ktype = { 4182 .sysfs_ops = &slab_sysfs_ops, 4183 .release = kmem_cache_release 4184}; 4185 4186static int uevent_filter(struct kset *kset, struct kobject *kobj) 4187{ 4188 struct kobj_type *ktype = get_ktype(kobj); 4189 4190 if (ktype == &slab_ktype) 4191 return 1; 4192 return 0; 4193} 4194 4195static struct kset_uevent_ops slab_uevent_ops = { 
4196	.filter = uevent_filter,
4197};
4198
4199static struct kset *slab_kset;
4200
4201#define ID_STR_LENGTH 64
4202
4203/* Create a unique string id for a slab cache:
4204 * format
4205 * :[flags-]size
4206 */
4207static char *create_unique_id(struct kmem_cache *s)
4208{
4209	char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
4210	char *p = name;
4211
4212	BUG_ON(!name);
4213
4214	*p++ = ':';
4215	/*
4216	 * First flags affecting slabcache operations. We will only
4217	 * get here for aliasable slabs so we do not need to support
4218	 * too many flags. The flags here must cover all flags that
4219	 * are matched during merging to guarantee that the id is
4220	 * unique.
4221	 */
4222	if (s->flags & SLAB_CACHE_DMA)
4223		*p++ = 'd';
4224	if (s->flags & SLAB_RECLAIM_ACCOUNT)
4225		*p++ = 'a';
4226	if (s->flags & SLAB_DEBUG_FREE)
4227		*p++ = 'F';
4228	if (p != name + 1)
4229		*p++ = '-';
4230	p += sprintf(p, "%07d", s->size);
4231	BUG_ON(p > name + ID_STR_LENGTH - 1);
4232	return name;
4233}
4234
4235static int sysfs_slab_add(struct kmem_cache *s)
4236{
4237	int err;
4238	const char *name;
4239	int unmergeable;
4240
4241	if (slab_state < SYSFS)
4242		/* Defer until later */
4243		return 0;
4244
4245	unmergeable = slab_unmergeable(s);
4246	if (unmergeable) {
4247		/*
4248		 * Slabcache can never be merged so we can use the name proper.
4249		 * This is typically the case for debug situations. In that
4250		 * case we can catch duplicate names easily.
4251		 */
4252		sysfs_remove_link(&slab_kset->kobj, s->name);
4253		name = s->name;
4254	} else {
4255		/*
4256		 * Create a unique name for the slab as a target
4257		 * for the symlinks.
4258		 */
4259		name = create_unique_id(s);
4260	}
4261
4262	s->kobj.kset = slab_kset;
4263	err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name);
4264	if (err) {
4265		kobject_put(&s->kobj);
4266		return err;
4267	}
4268
4269	err = sysfs_create_group(&s->kobj, &slab_attr_group);
4270	if (err)
4271		return err;
4272	kobject_uevent(&s->kobj, KOBJ_ADD);
4273	if (!unmergeable) {
4274		/* Setup first alias */
4275		sysfs_slab_alias(s, s->name);
4276		kfree(name);
4277	}
4278	return 0;
4279}
4280
4281static void sysfs_slab_remove(struct kmem_cache *s)
4282{
4283	kobject_uevent(&s->kobj, KOBJ_REMOVE);
4284	kobject_del(&s->kobj);
4285	kobject_put(&s->kobj);
4286}
4287
4288/*
4289 * Need to buffer aliases during bootup until sysfs becomes
4290 * available lest we lose that information.
4291 */
4292struct saved_alias {
4293	struct kmem_cache *s;
4294	const char *name;
4295	struct saved_alias *next;
4296};
4297
4298static struct saved_alias *alias_list;
4299
4300static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
4301{
4302	struct saved_alias *al;
4303
4304	if (slab_state == SYSFS) {
4305		/*
4306		 * If we have a leftover link then remove it.
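		 *
		 * Aliases show up in sysfs as symlinks pointing at the
		 * unique id of the real cache, f.e. (illustrative name
		 * and size):
		 *
		 *	/sys/slab/biovec-16 -> :0000256
		 *
		 * where the target name comes from create_unique_id()
		 * above.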
4307		 */
4308		sysfs_remove_link(&slab_kset->kobj, name);
4309		return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
4310	}
4311
4312	al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
4313	if (!al)
4314		return -ENOMEM;
4315
4316	al->s = s;
4317	al->name = name;
4318	al->next = alias_list;
4319	alias_list = al;
4320	return 0;
4321}
4322
4323static int __init slab_sysfs_init(void)
4324{
4325	struct kmem_cache *s;
4326	int err;
4327
4328	slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
4329	if (!slab_kset) {
4330		printk(KERN_ERR "Cannot register slab subsystem.\n");
4331		return -ENOSYS;
4332	}
4333
4334	slab_state = SYSFS;
4335
4336	list_for_each_entry(s, &slab_caches, list) {
4337		err = sysfs_slab_add(s);
4338		if (err)
4339			printk(KERN_ERR "SLUB: Unable to add boot slab %s"
4340						" to sysfs\n", s->name);
4341	}
4342
4343	while (alias_list) {
4344		struct saved_alias *al = alias_list;
4345
4346		alias_list = alias_list->next;
4347		err = sysfs_slab_alias(al->s, al->name);
4348		if (err)
4349			printk(KERN_ERR "SLUB: Unable to add boot slab alias"
4350					" %s to sysfs\n", al->name);
4351		kfree(al);
4352	}
4353
4354	resiliency_test();
4355	return 0;
4356}
4357
4358__initcall(slab_sysfs_init);
4359#endif
4360
4361/*
4362 * The /proc/slabinfo ABI
4363 */
4364#ifdef CONFIG_SLABINFO
4365
4366ssize_t slabinfo_write(struct file *file, const char __user * buffer,
4367                       size_t count, loff_t *ppos)
4368{
4369	return -EINVAL;
4370}
4371
4372
4373static void print_slabinfo_header(struct seq_file *m)
4374{
4375	seq_puts(m, "slabinfo - version: 2.1\n");
4376	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
4377		 "<objperslab> <pagesperslab>");
4378	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
4379	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
4380	seq_putc(m, '\n');
4381}
4382
4383static void *s_start(struct seq_file *m, loff_t *pos)
4384{
4385	loff_t n = *pos;
4386
4387	down_read(&slub_lock);
4388	if (!n)
4389		print_slabinfo_header(m);
4390
4391	return seq_list_start(&slab_caches, *pos);
4392}
4393
4394static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4395{
4396	return seq_list_next(p, &slab_caches, pos);
4397}
4398
4399static void s_stop(struct seq_file *m, void *p)
4400{
4401	up_read(&slub_lock);
4402}
4403
4404static int s_show(struct seq_file *m, void *p)
4405{
4406	unsigned long nr_partials = 0;
4407	unsigned long nr_slabs = 0;
4408	unsigned long nr_inuse = 0;
4409	unsigned long nr_objs;
4410	struct kmem_cache *s;
4411	int node;
4412
4413	s = list_entry(p, struct kmem_cache, list);
4414
4415	for_each_online_node(node) {
4416		struct kmem_cache_node *n = get_node(s, node);
4417
4418		if (!n)
4419			continue;
4420
4421		nr_partials += n->nr_partial;
4422		nr_slabs += atomic_long_read(&n->nr_slabs);
4423		nr_inuse += count_partial(n);
4424	}
4425
4426	nr_objs = nr_slabs * s->objects;
4427	nr_inuse += (nr_slabs - nr_partials) * s->objects;
4428
4429	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse,
4430		   nr_objs, s->size, s->objects, (1 << s->order));
4431	seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0);
4432	seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs,
4433		   0UL);
4434	seq_putc(m, '\n');
4435	return 0;
4436}
4437
4438const struct seq_operations slabinfo_op = {
4439	.start = s_start,
4440	.next = s_next,
4441	.stop = s_stop,
4442	.show = s_show,
4443};
4444
4445#endif /* CONFIG_SLABINFO */
4446
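/*
 * Sample output (illustrative values for a kmalloc-128 cache; column
 * widths follow the seq_printf() format strings in s_show() above):
 *
 *	slabinfo - version: 2.1
 *	# name            <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab> : tunables <limit> <batchcount> <sharedfactor> : slabdata <active_slabs> <num_slabs> <sharedavail>
 *	kmalloc-128         1536   1664    128   32    1 : tunables    0    0    0 : slabdata     52     52      0
 *
 * The tunables are always reported as zero since SLUB has no queues to
 * tune, and writes to /proc/slabinfo are rejected with -EINVAL by
 * slabinfo_write() above.
 */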