slub.c revision ba84c73c7ae21fc891a3c2576fa3be42752fce53
1/* 2 * SLUB: A slab allocator that limits cache line use instead of queuing 3 * objects in per cpu and per node lists. 4 * 5 * The allocator synchronizes using per slab locks and only 6 * uses a centralized lock to manage a pool of partial slabs. 7 * 8 * (C) 2007 SGI, Christoph Lameter <clameter@sgi.com> 9 */ 10 11#include <linux/mm.h> 12#include <linux/module.h> 13#include <linux/bit_spinlock.h> 14#include <linux/interrupt.h> 15#include <linux/bitops.h> 16#include <linux/slab.h> 17#include <linux/seq_file.h> 18#include <linux/cpu.h> 19#include <linux/cpuset.h> 20#include <linux/mempolicy.h> 21#include <linux/ctype.h> 22#include <linux/kallsyms.h> 23#include <linux/memory.h> 24 25/* 26 * Lock order: 27 * 1. slab_lock(page) 28 * 2. slab->list_lock 29 * 30 * The slab_lock protects operations on the object of a particular 31 * slab and its metadata in the page struct. If the slab lock 32 * has been taken then no allocations nor frees can be performed 33 * on the objects in the slab nor can the slab be added or removed 34 * from the partial or full lists since this would mean modifying 35 * the page_struct of the slab. 36 * 37 * The list_lock protects the partial and full list on each node and 38 * the partial slab counter. If taken then no new slabs may be added or 39 * removed from the lists nor can the number of partial slabs be modified. 40 * (Note that the total number of slabs is an atomic value that may be 41 * modified without taking the list lock). 42 * 43 * The list_lock is a centralized lock and thus we avoid taking it as 44 * much as possible. As long as SLUB does not have to handle partial 45 * slabs, operations can continue without any centralized lock. F.e. 46 * allocating a long series of objects that fill up slabs does not require 47 * the list lock. 48 * 49 * The lock order is sometimes inverted when we are trying to get a slab 50 * off a list. We take the list_lock and then look for a page on the list 51 * to use. While we do that objects in the slabs may be freed. We can 52 * only operate on the slab if we have also taken the slab_lock. So we use 53 * a slab_trylock() on the slab. If trylock was successful then no frees 54 * can occur anymore and we can use the slab for allocations etc. If the 55 * slab_trylock() does not succeed then frees are in progress in the slab and 56 * we must stay away from it for a while since we may cause a bouncing 57 * cacheline if we try to acquire the lock. So go onto the next slab. 58 * If all pages are busy then we may allocate a new slab instead of reusing 59 * a partial slab. A new slab has no one operating on it and thus there is 60 * no danger of cacheline contention. 61 * 62 * Interrupts are disabled during allocation and deallocation in order to 63 * make the slab allocator safe to use in the context of an irq. In addition 64 * interrupts are disabled to ensure that the processor does not change 65 * while handling per_cpu slabs, due to kernel preemption. 66 * 67 * SLUB assigns one slab for allocation to each processor. 68 * Allocations only occur from these slabs called cpu slabs. 69 * 70 * Slabs with free elements are kept on a partial list and during regular 71 * operations no list for full slabs is used. If an object in a full slab is 72 * freed then the slab will show up again on the partial lists. 73 * We track full slabs for debugging purposes though because otherwise we 74 * cannot scan all objects. 75 * 76 * Slabs are freed when they become empty.
Teardown and setup is 77 * minimal so we rely on the page allocator's per cpu caches for 78 * fast frees and allocs. 79 * 80 * Overloading of page flags that are otherwise used for LRU management. 81 * 82 * PageActive The slab is frozen and exempt from list processing. 83 * This means that the slab is dedicated to a purpose 84 * such as satisfying allocations for a specific 85 * processor. Objects may be freed in the slab while 86 * it is frozen but slab_free will then skip the usual 87 * list operations. It is up to the processor holding 88 * the slab to integrate the slab into the slab lists 89 * when the slab is no longer needed. 90 * 91 * One use of this flag is to mark slabs that are 92 * used for allocations. Then such a slab becomes a cpu 93 * slab. The cpu slab may be equipped with an additional 94 * freelist that allows lockless access to 95 * free objects in addition to the regular freelist 96 * that requires the slab lock. 97 * 98 * PageError Slab requires special handling due to debug 99 * options set. This moves slab handling out of 100 * the fast path and disables lockless freelists. 101 */ 102 103#define FROZEN (1 << PG_active) 104 105#ifdef CONFIG_SLUB_DEBUG 106#define SLABDEBUG (1 << PG_error) 107#else 108#define SLABDEBUG 0 109#endif 110 111static inline int SlabFrozen(struct page *page) 112{ 113 return page->flags & FROZEN; 114} 115 116static inline void SetSlabFrozen(struct page *page) 117{ 118 page->flags |= FROZEN; 119} 120 121static inline void ClearSlabFrozen(struct page *page) 122{ 123 page->flags &= ~FROZEN; 124} 125 126static inline int SlabDebug(struct page *page) 127{ 128 return page->flags & SLABDEBUG; 129} 130 131static inline void SetSlabDebug(struct page *page) 132{ 133 page->flags |= SLABDEBUG; 134} 135 136static inline void ClearSlabDebug(struct page *page) 137{ 138 page->flags &= ~SLABDEBUG; 139} 140 141/* 142 * Issues still to be resolved: 143 * 144 * - Support PAGE_ALLOC_DEBUG. Should be easy to do. 145 * 146 * - Variable sizing of the per node arrays 147 */ 148 149/* Enable to test recovery from slab corruption on boot */ 150#undef SLUB_RESILIENCY_TEST 151 152#if PAGE_SHIFT <= 12 153 154/* 155 * Small page size. Make sure that we do not fragment memory 156 */ 157#define DEFAULT_MAX_ORDER 1 158#define DEFAULT_MIN_OBJECTS 4 159 160#else 161 162/* 163 * Large page machines are customarily able to handle larger 164 * page orders. 165 */ 166#define DEFAULT_MAX_ORDER 2 167#define DEFAULT_MIN_OBJECTS 8 168 169#endif 170 171/* 172 * Minimum number of partial slabs. These will be left on the partial 173 * lists even if they are empty. kmem_cache_shrink may reclaim them. 174 */ 175#define MIN_PARTIAL 5 176 177/* 178 * Maximum number of desirable partial slabs. 179 * The existence of more partial slabs makes kmem_cache_shrink 180 * sort the partial list by the number of objects in them.
181 */ 182#define MAX_PARTIAL 10 183 184#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ 185 SLAB_POISON | SLAB_STORE_USER) 186 187/* 188 * Set of flags that will prevent slab merging 189 */ 190#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 191 SLAB_TRACE | SLAB_DESTROY_BY_RCU) 192 193#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ 194 SLAB_CACHE_DMA) 195 196#ifndef ARCH_KMALLOC_MINALIGN 197#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) 198#endif 199 200#ifndef ARCH_SLAB_MINALIGN 201#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) 202#endif 203 204/* Internal SLUB flags */ 205#define __OBJECT_POISON 0x80000000 /* Poison object */ 206#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ 207 208/* Not all arches define cache_line_size */ 209#ifndef cache_line_size 210#define cache_line_size() L1_CACHE_BYTES 211#endif 212 213static int kmem_size = sizeof(struct kmem_cache); 214 215#ifdef CONFIG_SMP 216static struct notifier_block slab_notifier; 217#endif 218 219static enum { 220 DOWN, /* No slab functionality available */ 221 PARTIAL, /* kmem_cache_open() works but kmalloc does not */ 222 UP, /* Everything works but does not show up in sysfs */ 223 SYSFS /* Sysfs up */ 224} slab_state = DOWN; 225 226/* A list of all slab caches on the system */ 227static DECLARE_RWSEM(slub_lock); 228static LIST_HEAD(slab_caches); 229 230/* 231 * Tracking user of a slab. 232 */ 233struct track { 234 void *addr; /* Called from address */ 235 int cpu; /* Was running on cpu */ 236 int pid; /* Pid context */ 237 unsigned long when; /* When did the operation occur */ 238}; 239 240enum track_item { TRACK_ALLOC, TRACK_FREE }; 241 242#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) 243static int sysfs_slab_add(struct kmem_cache *); 244static int sysfs_slab_alias(struct kmem_cache *, const char *); 245static void sysfs_slab_remove(struct kmem_cache *); 246#else 247static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } 248static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) 249 { return 0; } 250static inline void sysfs_slab_remove(struct kmem_cache *s) 251{ 252 kfree(s); 253} 254#endif 255 256/******************************************************************** 257 * Core slab cache functions 258 *******************************************************************/ 259 260int slab_is_available(void) 261{ 262 return slab_state >= UP; 263} 264 265static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) 266{ 267#ifdef CONFIG_NUMA 268 return s->node[node]; 269#else 270 return &s->local_node; 271#endif 272} 273 274static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu) 275{ 276#ifdef CONFIG_SMP 277 return s->cpu_slab[cpu]; 278#else 279 return &s->cpu_slab; 280#endif 281} 282 283static inline int check_valid_pointer(struct kmem_cache *s, 284 struct page *page, const void *object) 285{ 286 void *base; 287 288 if (!object) 289 return 1; 290 291 base = page_address(page); 292 if (object < base || object >= base + s->objects * s->size || 293 (object - base) % s->size) { 294 return 0; 295 } 296 297 return 1; 298} 299 300/* 301 * Slow version of get and set free pointer. 302 * 303 * This version requires touching the cache lines of kmem_cache which 304 * we avoid to do in the fast alloc free paths. There we obtain the offset 305 * from the page struct. 
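 * (init_kmem_cache_cpu() caches this offset, together with objsize, in the
 * per cpu kmem_cache_cpu structure for exactly that purpose.)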
306 */ 307static inline void *get_freepointer(struct kmem_cache *s, void *object) 308{ 309 return *(void **)(object + s->offset); 310} 311 312static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) 313{ 314 *(void **)(object + s->offset) = fp; 315} 316 317/* Loop over all objects in a slab */ 318#define for_each_object(__p, __s, __addr) \ 319 for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\ 320 __p += (__s)->size) 321 322/* Scan freelist */ 323#define for_each_free_object(__p, __s, __free) \ 324 for (__p = (__free); __p; __p = get_freepointer((__s), __p)) 325 326/* Determine object index from a given position */ 327static inline int slab_index(void *p, struct kmem_cache *s, void *addr) 328{ 329 return (p - addr) / s->size; 330} 331 332#ifdef CONFIG_SLUB_DEBUG 333/* 334 * Debug settings: 335 */ 336#ifdef CONFIG_SLUB_DEBUG_ON 337static int slub_debug = DEBUG_DEFAULT_FLAGS; 338#else 339static int slub_debug; 340#endif 341 342static char *slub_debug_slabs; 343 344/* 345 * Object debugging 346 */ 347static void print_section(char *text, u8 *addr, unsigned int length) 348{ 349 int i, offset; 350 int newline = 1; 351 char ascii[17]; 352 353 ascii[16] = 0; 354 355 for (i = 0; i < length; i++) { 356 if (newline) { 357 printk(KERN_ERR "%8s 0x%p: ", text, addr + i); 358 newline = 0; 359 } 360 printk(KERN_CONT " %02x", addr[i]); 361 offset = i % 16; 362 ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; 363 if (offset == 15) { 364 printk(KERN_CONT " %s\n", ascii); 365 newline = 1; 366 } 367 } 368 if (!newline) { 369 i %= 16; 370 while (i < 16) { 371 printk(KERN_CONT " "); 372 ascii[i] = ' '; 373 i++; 374 } 375 printk(KERN_CONT " %s\n", ascii); 376 } 377} 378 379static struct track *get_track(struct kmem_cache *s, void *object, 380 enum track_item alloc) 381{ 382 struct track *p; 383 384 if (s->offset) 385 p = object + s->offset + sizeof(void *); 386 else 387 p = object + s->inuse; 388 389 return p + alloc; 390} 391 392static void set_track(struct kmem_cache *s, void *object, 393 enum track_item alloc, void *addr) 394{ 395 struct track *p; 396 397 if (s->offset) 398 p = object + s->offset + sizeof(void *); 399 else 400 p = object + s->inuse; 401 402 p += alloc; 403 if (addr) { 404 p->addr = addr; 405 p->cpu = smp_processor_id(); 406 p->pid = current ? current->pid : -1; 407 p->when = jiffies; 408 } else 409 memset(p, 0, sizeof(struct track)); 410} 411 412static void init_tracking(struct kmem_cache *s, void *object) 413{ 414 if (!(s->flags & SLAB_STORE_USER)) 415 return; 416 417 set_track(s, object, TRACK_FREE, NULL); 418 set_track(s, object, TRACK_ALLOC, NULL); 419} 420 421static void print_track(const char *s, struct track *t) 422{ 423 if (!t->addr) 424 return; 425 426 printk(KERN_ERR "INFO: %s in ", s); 427 __print_symbol("%s", (unsigned long)t->addr); 428 printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid); 429} 430 431static void print_tracking(struct kmem_cache *s, void *object) 432{ 433 if (!(s->flags & SLAB_STORE_USER)) 434 return; 435 436 print_track("Allocated", get_track(s, object, TRACK_ALLOC)); 437 print_track("Freed", get_track(s, object, TRACK_FREE)); 438} 439 440static void print_page_info(struct page *page) 441{ 442 printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n", 443 page, page->inuse, page->freelist, page->flags); 444 445} 446 447static void slab_bug(struct kmem_cache *s, char *fmt, ...) 
448{ 449 va_list args; 450 char buf[100]; 451 452 va_start(args, fmt); 453 vsnprintf(buf, sizeof(buf), fmt, args); 454 va_end(args); 455 printk(KERN_ERR "========================================" 456 "=====================================\n"); 457 printk(KERN_ERR "BUG %s: %s\n", s->name, buf); 458 printk(KERN_ERR "----------------------------------------" 459 "-------------------------------------\n\n"); 460} 461 462static void slab_fix(struct kmem_cache *s, char *fmt, ...) 463{ 464 va_list args; 465 char buf[100]; 466 467 va_start(args, fmt); 468 vsnprintf(buf, sizeof(buf), fmt, args); 469 va_end(args); 470 printk(KERN_ERR "FIX %s: %s\n", s->name, buf); 471} 472 473static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) 474{ 475 unsigned int off; /* Offset of last byte */ 476 u8 *addr = page_address(page); 477 478 print_tracking(s, p); 479 480 print_page_info(page); 481 482 printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", 483 p, p - addr, get_freepointer(s, p)); 484 485 if (p > addr + 16) 486 print_section("Bytes b4", p - 16, 16); 487 488 print_section("Object", p, min(s->objsize, 128)); 489 490 if (s->flags & SLAB_RED_ZONE) 491 print_section("Redzone", p + s->objsize, 492 s->inuse - s->objsize); 493 494 if (s->offset) 495 off = s->offset + sizeof(void *); 496 else 497 off = s->inuse; 498 499 if (s->flags & SLAB_STORE_USER) 500 off += 2 * sizeof(struct track); 501 502 if (off != s->size) 503 /* Beginning of the filler is the free pointer */ 504 print_section("Padding", p + off, s->size - off); 505 506 dump_stack(); 507} 508 509static void object_err(struct kmem_cache *s, struct page *page, 510 u8 *object, char *reason) 511{ 512 slab_bug(s, reason); 513 print_trailer(s, page, object); 514} 515 516static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) 517{ 518 va_list args; 519 char buf[100]; 520 521 va_start(args, fmt); 522 vsnprintf(buf, sizeof(buf), fmt, args); 523 va_end(args); 524 slab_bug(s, fmt); 525 print_page_info(page); 526 dump_stack(); 527} 528 529static void init_object(struct kmem_cache *s, void *object, int active) 530{ 531 u8 *p = object; 532 533 if (s->flags & __OBJECT_POISON) { 534 memset(p, POISON_FREE, s->objsize - 1); 535 p[s->objsize - 1] = POISON_END; 536 } 537 538 if (s->flags & SLAB_RED_ZONE) 539 memset(p + s->objsize, 540 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE, 541 s->inuse - s->objsize); 542} 543 544static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) 545{ 546 while (bytes) { 547 if (*start != (u8)value) 548 return start; 549 start++; 550 bytes--; 551 } 552 return NULL; 553} 554 555static void restore_bytes(struct kmem_cache *s, char *message, u8 data, 556 void *from, void *to) 557{ 558 slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data); 559 memset(from, data, to - from); 560} 561 562static int check_bytes_and_report(struct kmem_cache *s, struct page *page, 563 u8 *object, char *what, 564 u8 *start, unsigned int value, unsigned int bytes) 565{ 566 u8 *fault; 567 u8 *end; 568 569 fault = check_bytes(start, value, bytes); 570 if (!fault) 571 return 1; 572 573 end = start + bytes; 574 while (end > fault && end[-1] == value) 575 end--; 576 577 slab_bug(s, "%s overwritten", what); 578 printk(KERN_ERR "INFO: 0x%p-0x%p. 
First byte 0x%x instead of 0x%x\n", 579 fault, end - 1, fault[0], value); 580 print_trailer(s, page, object); 581 582 restore_bytes(s, what, value, fault, end); 583 return 0; 584} 585 586/* 587 * Object layout: 588 * 589 * object address 590 * Bytes of the object to be managed. 591 * If the freepointer may overlay the object then the free 592 * pointer is the first word of the object. 593 * 594 * Poisoning uses 0x6b (POISON_FREE) and the last byte is 595 * 0xa5 (POISON_END) 596 * 597 * object + s->objsize 598 * Padding to reach word boundary. This is also used for Redzoning. 599 * Padding is extended by another word if Redzoning is enabled and 600 * objsize == inuse. 601 * 602 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with 603 * 0xcc (RED_ACTIVE) for objects in use. 604 * 605 * object + s->inuse 606 * Meta data starts here. 607 * 608 * A. Free pointer (if we cannot overwrite object on free) 609 * B. Tracking data for SLAB_STORE_USER 610 * C. Padding to reach required alignment boundary or at mininum 611 * one word if debuggin is on to be able to detect writes 612 * before the word boundary. 613 * 614 * Padding is done using 0x5a (POISON_INUSE) 615 * 616 * object + s->size 617 * Nothing is used beyond s->size. 618 * 619 * If slabcaches are merged then the objsize and inuse boundaries are mostly 620 * ignored. And therefore no slab options that rely on these boundaries 621 * may be used with merged slabcaches. 622 */ 623 624static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) 625{ 626 unsigned long off = s->inuse; /* The end of info */ 627 628 if (s->offset) 629 /* Freepointer is placed after the object. */ 630 off += sizeof(void *); 631 632 if (s->flags & SLAB_STORE_USER) 633 /* We also have user information there */ 634 off += 2 * sizeof(struct track); 635 636 if (s->size == off) 637 return 1; 638 639 return check_bytes_and_report(s, page, p, "Object padding", 640 p + off, POISON_INUSE, s->size - off); 641} 642 643static int slab_pad_check(struct kmem_cache *s, struct page *page) 644{ 645 u8 *start; 646 u8 *fault; 647 u8 *end; 648 int length; 649 int remainder; 650 651 if (!(s->flags & SLAB_POISON)) 652 return 1; 653 654 start = page_address(page); 655 end = start + (PAGE_SIZE << s->order); 656 length = s->objects * s->size; 657 remainder = end - (start + length); 658 if (!remainder) 659 return 1; 660 661 fault = check_bytes(start + length, POISON_INUSE, remainder); 662 if (!fault) 663 return 1; 664 while (end > fault && end[-1] == POISON_INUSE) 665 end--; 666 667 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); 668 print_section("Padding", start, length); 669 670 restore_bytes(s, "slab padding", POISON_INUSE, start, end); 671 return 0; 672} 673 674static int check_object(struct kmem_cache *s, struct page *page, 675 void *object, int active) 676{ 677 u8 *p = object; 678 u8 *endobject = object + s->objsize; 679 680 if (s->flags & SLAB_RED_ZONE) { 681 unsigned int red = 682 active ? 
SLUB_RED_ACTIVE : SLUB_RED_INACTIVE; 683 684 if (!check_bytes_and_report(s, page, object, "Redzone", 685 endobject, red, s->inuse - s->objsize)) 686 return 0; 687 } else { 688 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) 689 check_bytes_and_report(s, page, p, "Alignment padding", endobject, 690 POISON_INUSE, s->inuse - s->objsize); 691 } 692 693 if (s->flags & SLAB_POISON) { 694 if (!active && (s->flags & __OBJECT_POISON) && 695 (!check_bytes_and_report(s, page, p, "Poison", p, 696 POISON_FREE, s->objsize - 1) || 697 !check_bytes_and_report(s, page, p, "Poison", 698 p + s->objsize - 1, POISON_END, 1))) 699 return 0; 700 /* 701 * check_pad_bytes cleans up on its own. 702 */ 703 check_pad_bytes(s, page, p); 704 } 705 706 if (!s->offset && active) 707 /* 708 * Object and freepointer overlap. Cannot check 709 * freepointer while object is allocated. 710 */ 711 return 1; 712 713 /* Check free pointer validity */ 714 if (!check_valid_pointer(s, page, get_freepointer(s, p))) { 715 object_err(s, page, p, "Freepointer corrupt"); 716 /* 717 * No choice but to zap it and thus loose the remainder 718 * of the free objects in this slab. May cause 719 * another error because the object count is now wrong. 720 */ 721 set_freepointer(s, p, NULL); 722 return 0; 723 } 724 return 1; 725} 726 727static int check_slab(struct kmem_cache *s, struct page *page) 728{ 729 VM_BUG_ON(!irqs_disabled()); 730 731 if (!PageSlab(page)) { 732 slab_err(s, page, "Not a valid slab page"); 733 return 0; 734 } 735 if (page->inuse > s->objects) { 736 slab_err(s, page, "inuse %u > max %u", 737 s->name, page->inuse, s->objects); 738 return 0; 739 } 740 /* Slab_pad_check fixes things up after itself */ 741 slab_pad_check(s, page); 742 return 1; 743} 744 745/* 746 * Determine if a certain object on a page is on the freelist. Must hold the 747 * slab lock to guarantee that the chains are in a consistent state. 748 */ 749static int on_freelist(struct kmem_cache *s, struct page *page, void *search) 750{ 751 int nr = 0; 752 void *fp = page->freelist; 753 void *object = NULL; 754 755 while (fp && nr <= s->objects) { 756 if (fp == search) 757 return 1; 758 if (!check_valid_pointer(s, page, fp)) { 759 if (object) { 760 object_err(s, page, object, 761 "Freechain corrupt"); 762 set_freepointer(s, object, NULL); 763 break; 764 } else { 765 slab_err(s, page, "Freepointer corrupt"); 766 page->freelist = NULL; 767 page->inuse = s->objects; 768 slab_fix(s, "Freelist cleared"); 769 return 0; 770 } 771 break; 772 } 773 object = fp; 774 fp = get_freepointer(s, object); 775 nr++; 776 } 777 778 if (page->inuse != s->objects - nr) { 779 slab_err(s, page, "Wrong object count. Counter is %d but " 780 "counted were %d", page->inuse, s->objects - nr); 781 page->inuse = s->objects - nr; 782 slab_fix(s, "Object count adjusted."); 783 } 784 return search == NULL; 785} 786 787static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc) 788{ 789 if (s->flags & SLAB_TRACE) { 790 printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", 791 s->name, 792 alloc ? "alloc" : "free", 793 object, page->inuse, 794 page->freelist); 795 796 if (!alloc) 797 print_section("Object", (void *)object, s->objsize); 798 799 dump_stack(); 800 } 801} 802 803/* 804 * Tracking of fully allocated slabs for debugging purposes. 
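 * Fully allocated slabs are put on the per node full list by unfreeze_slab()
 * when SLAB_STORE_USER is set. free_debug_processing() removes a slab from
 * the full list again as soon as one of its objects is freed.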
805 */ 806static void add_full(struct kmem_cache_node *n, struct page *page) 807{ 808 spin_lock(&n->list_lock); 809 list_add(&page->lru, &n->full); 810 spin_unlock(&n->list_lock); 811} 812 813static void remove_full(struct kmem_cache *s, struct page *page) 814{ 815 struct kmem_cache_node *n; 816 817 if (!(s->flags & SLAB_STORE_USER)) 818 return; 819 820 n = get_node(s, page_to_nid(page)); 821 822 spin_lock(&n->list_lock); 823 list_del(&page->lru); 824 spin_unlock(&n->list_lock); 825} 826 827static void setup_object_debug(struct kmem_cache *s, struct page *page, 828 void *object) 829{ 830 if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) 831 return; 832 833 init_object(s, object, 0); 834 init_tracking(s, object); 835} 836 837static int alloc_debug_processing(struct kmem_cache *s, struct page *page, 838 void *object, void *addr) 839{ 840 if (!check_slab(s, page)) 841 goto bad; 842 843 if (object && !on_freelist(s, page, object)) { 844 object_err(s, page, object, "Object already allocated"); 845 goto bad; 846 } 847 848 if (!check_valid_pointer(s, page, object)) { 849 object_err(s, page, object, "Freelist Pointer check fails"); 850 goto bad; 851 } 852 853 if (object && !check_object(s, page, object, 0)) 854 goto bad; 855 856 /* Success perform special debug activities for allocs */ 857 if (s->flags & SLAB_STORE_USER) 858 set_track(s, object, TRACK_ALLOC, addr); 859 trace(s, page, object, 1); 860 init_object(s, object, 1); 861 return 1; 862 863bad: 864 if (PageSlab(page)) { 865 /* 866 * If this is a slab page then lets do the best we can 867 * to avoid issues in the future. Marking all objects 868 * as used avoids touching the remaining objects. 869 */ 870 slab_fix(s, "Marking all objects used"); 871 page->inuse = s->objects; 872 page->freelist = NULL; 873 } 874 return 0; 875} 876 877static int free_debug_processing(struct kmem_cache *s, struct page *page, 878 void *object, void *addr) 879{ 880 if (!check_slab(s, page)) 881 goto fail; 882 883 if (!check_valid_pointer(s, page, object)) { 884 slab_err(s, page, "Invalid object pointer 0x%p", object); 885 goto fail; 886 } 887 888 if (on_freelist(s, page, object)) { 889 object_err(s, page, object, "Object already free"); 890 goto fail; 891 } 892 893 if (!check_object(s, page, object, 1)) 894 return 0; 895 896 if (unlikely(s != page->slab)) { 897 if (!PageSlab(page)) 898 slab_err(s, page, "Attempt to free object(0x%p) " 899 "outside of slab", object); 900 else 901 if (!page->slab) { 902 printk(KERN_ERR 903 "SLUB <none>: no slab for object 0x%p.\n", 904 object); 905 dump_stack(); 906 } else 907 object_err(s, page, object, 908 "page slab pointer corrupt."); 909 goto fail; 910 } 911 912 /* Special debug activities for freeing objects */ 913 if (!SlabFrozen(page) && !page->freelist) 914 remove_full(s, page); 915 if (s->flags & SLAB_STORE_USER) 916 set_track(s, object, TRACK_FREE, addr); 917 trace(s, page, object, 0); 918 init_object(s, object, 0); 919 return 1; 920 921fail: 922 slab_fix(s, "Object at 0x%p not freed", object); 923 return 0; 924} 925 926static int __init setup_slub_debug(char *str) 927{ 928 slub_debug = DEBUG_DEFAULT_FLAGS; 929 if (*str++ != '=' || !*str) 930 /* 931 * No options specified. Switch on full debugging. 932 */ 933 goto out; 934 935 if (*str == ',') 936 /* 937 * No options but restriction on slabs. This means full 938 * debugging for slabs matching a pattern. 939 */ 940 goto check_slabs; 941 942 slub_debug = 0; 943 if (*str == '-') 944 /* 945 * Switch off all debugging measures. 
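 * (slub_debug was already cleared above, so nothing further needs to be done.)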
946 */ 947 goto out; 948 949 /* 950 * Determine which debug features should be switched on 951 */ 952 for (; *str && *str != ','; str++) { 953 switch (tolower(*str)) { 954 case 'f': 955 slub_debug |= SLAB_DEBUG_FREE; 956 break; 957 case 'z': 958 slub_debug |= SLAB_RED_ZONE; 959 break; 960 case 'p': 961 slub_debug |= SLAB_POISON; 962 break; 963 case 'u': 964 slub_debug |= SLAB_STORE_USER; 965 break; 966 case 't': 967 slub_debug |= SLAB_TRACE; 968 break; 969 default: 970 printk(KERN_ERR "slub_debug option '%c' " 971 "unknown. skipped\n", *str); 972 } 973 } 974 975check_slabs: 976 if (*str == ',') 977 slub_debug_slabs = str + 1; 978out: 979 return 1; 980} 981 982__setup("slub_debug", setup_slub_debug); 983 984static unsigned long kmem_cache_flags(unsigned long objsize, 985 unsigned long flags, const char *name, 986 void (*ctor)(struct kmem_cache *, void *)) 987{ 988 /* 989 * The page->offset field is only 16 bit wide. This is an offset 990 * in units of words from the beginning of an object. If the slab 991 * size is bigger then we cannot move the free pointer behind the 992 * object anymore. 993 * 994 * On 32 bit platforms the limit is 256k. On 64bit platforms 995 * the limit is 512k. 996 * 997 * Debugging or ctor may create a need to move the free 998 * pointer. Fail if this happens. 999 */ 1000 if (objsize >= 65535 * sizeof(void *)) { 1001 BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON | 1002 SLAB_STORE_USER | SLAB_DESTROY_BY_RCU)); 1003 BUG_ON(ctor); 1004 } else { 1005 /* 1006 * Enable debugging if selected on the kernel commandline. 1007 */ 1008 if (slub_debug && (!slub_debug_slabs || 1009 strncmp(slub_debug_slabs, name, 1010 strlen(slub_debug_slabs)) == 0)) 1011 flags |= slub_debug; 1012 } 1013 1014 return flags; 1015} 1016#else 1017static inline void setup_object_debug(struct kmem_cache *s, 1018 struct page *page, void *object) {} 1019 1020static inline int alloc_debug_processing(struct kmem_cache *s, 1021 struct page *page, void *object, void *addr) { return 0; } 1022 1023static inline int free_debug_processing(struct kmem_cache *s, 1024 struct page *page, void *object, void *addr) { return 0; } 1025 1026static inline int slab_pad_check(struct kmem_cache *s, struct page *page) 1027 { return 1; } 1028static inline int check_object(struct kmem_cache *s, struct page *page, 1029 void *object, int active) { return 1; } 1030static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1031static inline unsigned long kmem_cache_flags(unsigned long objsize, 1032 unsigned long flags, const char *name, 1033 void (*ctor)(struct kmem_cache *, void *)) 1034{ 1035 return flags; 1036} 1037#define slub_debug 0 1038#endif 1039/* 1040 * Slab allocation and freeing 1041 */ 1042static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) 1043{ 1044 struct page *page; 1045 int pages = 1 << s->order; 1046 1047 if (s->order) 1048 flags |= __GFP_COMP; 1049 1050 if (s->flags & SLAB_CACHE_DMA) 1051 flags |= SLUB_DMA; 1052 1053 if (s->flags & SLAB_RECLAIM_ACCOUNT) 1054 flags |= __GFP_RECLAIMABLE; 1055 1056 if (node == -1) 1057 page = alloc_pages(flags, s->order); 1058 else 1059 page = alloc_pages_node(node, flags, s->order); 1060 1061 if (!page) 1062 return NULL; 1063 1064 mod_zone_page_state(page_zone(page), 1065 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
1066 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1067 pages); 1068 1069 return page; 1070} 1071 1072static void setup_object(struct kmem_cache *s, struct page *page, 1073 void *object) 1074{ 1075 setup_object_debug(s, page, object); 1076 if (unlikely(s->ctor)) 1077 s->ctor(s, object); 1078} 1079 1080static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) 1081{ 1082 struct page *page; 1083 struct kmem_cache_node *n; 1084 void *start; 1085 void *last; 1086 void *p; 1087 1088 BUG_ON(flags & GFP_SLAB_BUG_MASK); 1089 1090 page = allocate_slab(s, 1091 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); 1092 if (!page) 1093 goto out; 1094 1095 n = get_node(s, page_to_nid(page)); 1096 if (n) 1097 atomic_long_inc(&n->nr_slabs); 1098 page->slab = s; 1099 page->flags |= 1 << PG_slab; 1100 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | 1101 SLAB_STORE_USER | SLAB_TRACE)) 1102 SetSlabDebug(page); 1103 1104 start = page_address(page); 1105 1106 if (unlikely(s->flags & SLAB_POISON)) 1107 memset(start, POISON_INUSE, PAGE_SIZE << s->order); 1108 1109 last = start; 1110 for_each_object(p, s, start) { 1111 setup_object(s, page, last); 1112 set_freepointer(s, last, p); 1113 last = p; 1114 } 1115 setup_object(s, page, last); 1116 set_freepointer(s, last, NULL); 1117 1118 page->freelist = start; 1119 page->inuse = 0; 1120out: 1121 return page; 1122} 1123 1124static void __free_slab(struct kmem_cache *s, struct page *page) 1125{ 1126 int pages = 1 << s->order; 1127 1128 if (unlikely(SlabDebug(page))) { 1129 void *p; 1130 1131 slab_pad_check(s, page); 1132 for_each_object(p, s, page_address(page)) 1133 check_object(s, page, p, 0); 1134 ClearSlabDebug(page); 1135 } 1136 1137 mod_zone_page_state(page_zone(page), 1138 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
1139 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1140 -pages); 1141 1142 __free_pages(page, s->order); 1143} 1144 1145static void rcu_free_slab(struct rcu_head *h) 1146{ 1147 struct page *page; 1148 1149 page = container_of((struct list_head *)h, struct page, lru); 1150 __free_slab(page->slab, page); 1151} 1152 1153static void free_slab(struct kmem_cache *s, struct page *page) 1154{ 1155 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { 1156 /* 1157 * RCU free overloads the RCU head over the LRU 1158 */ 1159 struct rcu_head *head = (void *)&page->lru; 1160 1161 call_rcu(head, rcu_free_slab); 1162 } else 1163 __free_slab(s, page); 1164} 1165 1166static void discard_slab(struct kmem_cache *s, struct page *page) 1167{ 1168 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1169 1170 atomic_long_dec(&n->nr_slabs); 1171 reset_page_mapcount(page); 1172 __ClearPageSlab(page); 1173 free_slab(s, page); 1174} 1175 1176/* 1177 * Per slab locking using the pagelock 1178 */ 1179static __always_inline void slab_lock(struct page *page) 1180{ 1181 bit_spin_lock(PG_locked, &page->flags); 1182} 1183 1184static __always_inline void slab_unlock(struct page *page) 1185{ 1186 bit_spin_unlock(PG_locked, &page->flags); 1187} 1188 1189static __always_inline int slab_trylock(struct page *page) 1190{ 1191 int rc = 1; 1192 1193 rc = bit_spin_trylock(PG_locked, &page->flags); 1194 return rc; 1195} 1196 1197/* 1198 * Management of partially allocated slabs 1199 */ 1200static void add_partial(struct kmem_cache_node *n, 1201 struct page *page, int tail) 1202{ 1203 spin_lock(&n->list_lock); 1204 n->nr_partial++; 1205 if (tail) 1206 list_add_tail(&page->lru, &n->partial); 1207 else 1208 list_add(&page->lru, &n->partial); 1209 spin_unlock(&n->list_lock); 1210} 1211 1212static void remove_partial(struct kmem_cache *s, 1213 struct page *page) 1214{ 1215 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1216 1217 spin_lock(&n->list_lock); 1218 list_del(&page->lru); 1219 n->nr_partial--; 1220 spin_unlock(&n->list_lock); 1221} 1222 1223/* 1224 * Lock slab and remove from the partial list. 1225 * 1226 * Must hold list_lock. 1227 */ 1228static inline int lock_and_freeze_slab(struct kmem_cache_node *n, struct page *page) 1229{ 1230 if (slab_trylock(page)) { 1231 list_del(&page->lru); 1232 n->nr_partial--; 1233 SetSlabFrozen(page); 1234 return 1; 1235 } 1236 return 0; 1237} 1238 1239/* 1240 * Try to allocate a partial slab from a specific node. 1241 */ 1242static struct page *get_partial_node(struct kmem_cache_node *n) 1243{ 1244 struct page *page; 1245 1246 /* 1247 * Racy check. If we mistakenly see no partial slabs then we 1248 * just allocate an empty slab. If we mistakenly try to get a 1249 * partial slab and there is none available then get_partials() 1250 * will return NULL. 1251 */ 1252 if (!n || !n->nr_partial) 1253 return NULL; 1254 1255 spin_lock(&n->list_lock); 1256 list_for_each_entry(page, &n->partial, lru) 1257 if (lock_and_freeze_slab(n, page)) 1258 goto out; 1259 page = NULL; 1260out: 1261 spin_unlock(&n->list_lock); 1262 return page; 1263} 1264 1265/* 1266 * Get a page from somewhere. Search in increasing NUMA distances. 1267 */ 1268static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) 1269{ 1270#ifdef CONFIG_NUMA 1271 struct zonelist *zonelist; 1272 struct zone **z; 1273 struct page *page; 1274 1275 /* 1276 * The defrag ratio allows a configuration of the tradeoffs between 1277 * inter node defragmentation and node local allocations. 
A lower 1278 * defrag_ratio increases the tendency to do local allocations 1279 * instead of attempting to obtain partial slabs from other nodes. 1280 * 1281 * If the defrag_ratio is set to 0 then kmalloc() always 1282 * returns node local objects. If the ratio is higher then kmalloc() 1283 * may return off node objects because partial slabs are obtained 1284 * from other nodes and filled up. 1285 * 1286 * If /sys/slab/xx/defrag_ratio is set to 100 (which makes 1287 * defrag_ratio = 1000) then every (well almost) allocation will 1288 * first attempt to defrag slab caches on other nodes. This means 1289 * scanning over all nodes to look for partial slabs which may be 1290 * expensive if we do it every time we are trying to find a slab 1291 * with available objects. 1292 */ 1293 if (!s->remote_node_defrag_ratio || 1294 get_cycles() % 1024 > s->remote_node_defrag_ratio) 1295 return NULL; 1296 1297 zonelist = &NODE_DATA(slab_node(current->mempolicy)) 1298 ->node_zonelists[gfp_zone(flags)]; 1299 for (z = zonelist->zones; *z; z++) { 1300 struct kmem_cache_node *n; 1301 1302 n = get_node(s, zone_to_nid(*z)); 1303 1304 if (n && cpuset_zone_allowed_hardwall(*z, flags) && 1305 n->nr_partial > MIN_PARTIAL) { 1306 page = get_partial_node(n); 1307 if (page) 1308 return page; 1309 } 1310 } 1311#endif 1312 return NULL; 1313} 1314 1315/* 1316 * Get a partial page, lock it and return it. 1317 */ 1318static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) 1319{ 1320 struct page *page; 1321 int searchnode = (node == -1) ? numa_node_id() : node; 1322 1323 page = get_partial_node(get_node(s, searchnode)); 1324 if (page || (flags & __GFP_THISNODE)) 1325 return page; 1326 1327 return get_any_partial(s, flags); 1328} 1329 1330/* 1331 * Move a page back to the lists. 1332 * 1333 * Must be called with the slab lock held. 1334 * 1335 * On exit the slab lock will have been dropped. 1336 */ 1337static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) 1338{ 1339 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1340 1341 ClearSlabFrozen(page); 1342 if (page->inuse) { 1343 1344 if (page->freelist) 1345 add_partial(n, page, tail); 1346 else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) 1347 add_full(n, page); 1348 slab_unlock(page); 1349 1350 } else { 1351 if (n->nr_partial < MIN_PARTIAL) { 1352 /* 1353 * Adding an empty slab to the partial slabs in order 1354 * to avoid page allocator overhead. This slab needs 1355 * to come after the other slabs with objects in 1356 * order to fill them up. That way the size of the 1357 * partial list stays small. kmem_cache_shrink can 1358 * reclaim empty slabs from the partial list. 1359 */ 1360 add_partial(n, page, 1); 1361 slab_unlock(page); 1362 } else { 1363 slab_unlock(page); 1364 discard_slab(s, page); 1365 } 1366 } 1367} 1368 1369/* 1370 * Remove the cpu slab 1371 */ 1372static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1373{ 1374 struct page *page = c->page; 1375 int tail = 1; 1376 /* 1377 * Merge cpu freelist into freelist. Typically we get here 1378 * because both freelists are empty. So this is unlikely 1379 * to occur. 1380 */ 1381 while (unlikely(c->freelist)) { 1382 void **object; 1383 1384 tail = 0; /* Hot objects. 
Put the slab first */ 1385 1386 /* Retrieve object from cpu_freelist */ 1387 object = c->freelist; 1388 c->freelist = c->freelist[c->offset]; 1389 1390 /* And put onto the regular freelist */ 1391 object[c->offset] = page->freelist; 1392 page->freelist = object; 1393 page->inuse--; 1394 } 1395 c->page = NULL; 1396 unfreeze_slab(s, page, tail); 1397} 1398 1399static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1400{ 1401 slab_lock(c->page); 1402 deactivate_slab(s, c); 1403} 1404 1405/* 1406 * Flush cpu slab. 1407 * Called from IPI handler with interrupts disabled. 1408 */ 1409static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) 1410{ 1411 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); 1412 1413 if (likely(c && c->page)) 1414 flush_slab(s, c); 1415} 1416 1417static void flush_cpu_slab(void *d) 1418{ 1419 struct kmem_cache *s = d; 1420 1421 __flush_cpu_slab(s, smp_processor_id()); 1422} 1423 1424static void flush_all(struct kmem_cache *s) 1425{ 1426#ifdef CONFIG_SMP 1427 on_each_cpu(flush_cpu_slab, s, 1, 1); 1428#else 1429 unsigned long flags; 1430 1431 local_irq_save(flags); 1432 flush_cpu_slab(s); 1433 local_irq_restore(flags); 1434#endif 1435} 1436 1437/* 1438 * Check if the objects in a per cpu structure fit numa 1439 * locality expectations. 1440 */ 1441static inline int node_match(struct kmem_cache_cpu *c, int node) 1442{ 1443#ifdef CONFIG_NUMA 1444 if (node != -1 && c->node != node) 1445 return 0; 1446#endif 1447 return 1; 1448} 1449 1450/* 1451 * Slow path. The lockless freelist is empty or we need to perform 1452 * debugging duties. 1453 * 1454 * Interrupts are disabled. 1455 * 1456 * Processing is still very fast if new objects have been freed to the 1457 * regular freelist. In that case we simply take over the regular freelist 1458 * as the lockless freelist and zap the regular freelist. 1459 * 1460 * If that is not working then we fall back to the partial lists. We take the 1461 * first element of the freelist as the object to allocate now and move the 1462 * rest of the freelist to the lockless freelist. 1463 * 1464 * And if we were unable to get a new slab from the partial slab lists then 1465 * we need to allocate a new slab. This is slowest path since we may sleep. 
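 * Note that if __GFP_WAIT is set, interrupts are re-enabled around the call
 * to new_slab() below and the per cpu slab pointer is re-fetched afterwards
 * since we may have moved to another processor in the meantime.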
1466 */ 1467static void *__slab_alloc(struct kmem_cache *s, 1468 gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c) 1469{ 1470 void **object; 1471 struct page *new; 1472 1473 if (!c->page) 1474 goto new_slab; 1475 1476 slab_lock(c->page); 1477 if (unlikely(!node_match(c, node))) 1478 goto another_slab; 1479load_freelist: 1480 object = c->page->freelist; 1481 if (unlikely(!object)) 1482 goto another_slab; 1483 if (unlikely(SlabDebug(c->page))) 1484 goto debug; 1485 1486 object = c->page->freelist; 1487 c->freelist = object[c->offset]; 1488 c->page->inuse = s->objects; 1489 c->page->freelist = NULL; 1490 c->node = page_to_nid(c->page); 1491 slab_unlock(c->page); 1492 return object; 1493 1494another_slab: 1495 deactivate_slab(s, c); 1496 1497new_slab: 1498 new = get_partial(s, gfpflags, node); 1499 if (new) { 1500 c->page = new; 1501 goto load_freelist; 1502 } 1503 1504 if (gfpflags & __GFP_WAIT) 1505 local_irq_enable(); 1506 1507 new = new_slab(s, gfpflags, node); 1508 1509 if (gfpflags & __GFP_WAIT) 1510 local_irq_disable(); 1511 1512 if (new) { 1513 c = get_cpu_slab(s, smp_processor_id()); 1514 if (c->page) 1515 flush_slab(s, c); 1516 slab_lock(new); 1517 SetSlabFrozen(new); 1518 c->page = new; 1519 goto load_freelist; 1520 } 1521 return NULL; 1522debug: 1523 object = c->page->freelist; 1524 if (!alloc_debug_processing(s, c->page, object, addr)) 1525 goto another_slab; 1526 1527 c->page->inuse++; 1528 c->page->freelist = object[c->offset]; 1529 c->node = -1; 1530 slab_unlock(c->page); 1531 return object; 1532} 1533 1534/* 1535 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) 1536 * have the fastpath folded into their functions. So no function call 1537 * overhead for requests that can be satisfied on the fastpath. 1538 * 1539 * The fastpath works by first checking if the lockless freelist can be used. 1540 * If not then __slab_alloc is called for slow processing. 1541 * 1542 * Otherwise we can simply pick the next object from the lockless free list. 1543 */ 1544static __always_inline void *slab_alloc(struct kmem_cache *s, 1545 gfp_t gfpflags, int node, void *addr) 1546{ 1547 void **object; 1548 unsigned long flags; 1549 struct kmem_cache_cpu *c; 1550 1551 local_irq_save(flags); 1552 c = get_cpu_slab(s, smp_processor_id()); 1553 if (unlikely(!c->freelist || !node_match(c, node))) 1554 1555 object = __slab_alloc(s, gfpflags, node, addr, c); 1556 1557 else { 1558 object = c->freelist; 1559 c->freelist = object[c->offset]; 1560 } 1561 local_irq_restore(flags); 1562 1563 if (unlikely((gfpflags & __GFP_ZERO) && object)) 1564 memset(object, 0, c->objsize); 1565 1566 return object; 1567} 1568 1569void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) 1570{ 1571 return slab_alloc(s, gfpflags, -1, __builtin_return_address(0)); 1572} 1573EXPORT_SYMBOL(kmem_cache_alloc); 1574 1575#ifdef CONFIG_NUMA 1576void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) 1577{ 1578 return slab_alloc(s, gfpflags, node, __builtin_return_address(0)); 1579} 1580EXPORT_SYMBOL(kmem_cache_alloc_node); 1581#endif 1582 1583/* 1584 * Slow patch handling. This may still be called frequently since objects 1585 * have a longer lifetime than the cpu slabs in most processing loads. 1586 * 1587 * So we still attempt to reduce cache line usage. Just take the slab 1588 * lock and free the item. If there is no additional partial page 1589 * handling required then we can return immediately. 
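 * Interrupts are already disabled at this point; slab_free() takes care of
 * that before choosing between the fastpath and this slow path.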
1590 */ 1591static void __slab_free(struct kmem_cache *s, struct page *page, 1592 void *x, void *addr, unsigned int offset) 1593{ 1594 void *prior; 1595 void **object = (void *)x; 1596 1597 slab_lock(page); 1598 1599 if (unlikely(SlabDebug(page))) 1600 goto debug; 1601checks_ok: 1602 prior = object[offset] = page->freelist; 1603 page->freelist = object; 1604 page->inuse--; 1605 1606 if (unlikely(SlabFrozen(page))) 1607 goto out_unlock; 1608 1609 if (unlikely(!page->inuse)) 1610 goto slab_empty; 1611 1612 /* 1613 * Objects left in the slab. If it 1614 * was not on the partial list before 1615 * then add it. 1616 */ 1617 if (unlikely(!prior)) 1618 add_partial(get_node(s, page_to_nid(page)), page, 1); 1619 1620out_unlock: 1621 slab_unlock(page); 1622 return; 1623 1624slab_empty: 1625 if (prior) 1626 /* 1627 * Slab still on the partial list. 1628 */ 1629 remove_partial(s, page); 1630 1631 slab_unlock(page); 1632 discard_slab(s, page); 1633 return; 1634 1635debug: 1636 if (!free_debug_processing(s, page, x, addr)) 1637 goto out_unlock; 1638 goto checks_ok; 1639} 1640 1641/* 1642 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that 1643 * can perform fastpath freeing without additional function calls. 1644 * 1645 * The fastpath is only possible if we are freeing to the current cpu slab 1646 * of this processor. This is typically the case if we have just allocated 1647 * the item before. 1648 * 1649 * If fastpath is not possible then fall back to __slab_free where we deal 1650 * with all sorts of special processing. 1651 */ 1652static __always_inline void slab_free(struct kmem_cache *s, 1653 struct page *page, void *x, void *addr) 1654{ 1655 void **object = (void *)x; 1656 unsigned long flags; 1657 struct kmem_cache_cpu *c; 1658 1659 local_irq_save(flags); 1660 debug_check_no_locks_freed(object, s->objsize); 1661 c = get_cpu_slab(s, smp_processor_id()); 1662 if (likely(page == c->page && c->node >= 0)) { 1663 object[c->offset] = c->freelist; 1664 c->freelist = object; 1665 } else 1666 __slab_free(s, page, x, addr, c->offset); 1667 1668 local_irq_restore(flags); 1669} 1670 1671void kmem_cache_free(struct kmem_cache *s, void *x) 1672{ 1673 struct page *page; 1674 1675 page = virt_to_head_page(x); 1676 1677 slab_free(s, page, x, __builtin_return_address(0)); 1678} 1679EXPORT_SYMBOL(kmem_cache_free); 1680 1681/* Figure out on which slab page the object resides */ 1682static struct page *get_object_page(const void *x) 1683{ 1684 struct page *page = virt_to_head_page(x); 1685 1686 if (!PageSlab(page)) 1687 return NULL; 1688 1689 return page; 1690} 1691 1692/* 1693 * Object placement in a slab is made very easy because we always start at 1694 * offset 0. If we tune the size of the object to the alignment then we can 1695 * get the required alignment by putting one properly sized object after 1696 * another. 1697 * 1698 * Notice that the allocation order determines the sizes of the per cpu 1699 * caches. Each processor has always one slab available for allocations. 1700 * Increasing the allocation order reduces the number of times that slabs 1701 * must be moved on and off the partial lists and is therefore a factor in 1702 * locking overhead. 1703 */ 1704 1705/* 1706 * Minimum / Maximum order of slab pages. This influences locking overhead 1707 * and slab fragmentation. A higher order reduces the number of partial slabs 1708 * and increases the number of allocations possible without having to 1709 * take the list_lock.
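 * The defaults are derived from DEFAULT_MAX_ORDER and DEFAULT_MIN_OBJECTS
 * above. E.g. with 4K pages (max order 1, minimum 4 objects) a 700 byte
 * object ends up in an order 1 slab: an order 0 slab would leave 596 of
 * 4096 bytes unused (more than 1/8th), while an order 1 slab holds 11
 * objects and wastes only 492 bytes. See calculate_order() below.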
1710 */ 1711static int slub_min_order; 1712static int slub_max_order = DEFAULT_MAX_ORDER; 1713static int slub_min_objects = DEFAULT_MIN_OBJECTS; 1714 1715/* 1716 * Merge control. If this is set then no merging of slab caches will occur. 1717 * (Could be removed. This was introduced to pacify the merge skeptics.) 1718 */ 1719static int slub_nomerge; 1720 1721/* 1722 * Calculate the order of allocation given a slab object size. 1723 * 1724 * The order of allocation has significant impact on performance and other 1725 * system components. Generally order 0 allocations should be preferred since 1726 * order 0 does not cause fragmentation in the page allocator. Larger objects 1727 * can be problematic to put into order 0 slabs because there may be too much 1728 * unused space left. We go to a higher order if more than 1/8th of the slab 1729 * would be wasted. 1730 * 1731 * In order to reach satisfactory performance we must ensure that a minimum 1732 * number of objects is in one slab. Otherwise we may generate too much 1733 * activity on the partial lists which requires taking the list_lock. This is 1734 * less a concern for large slabs though which are rarely used. 1735 * 1736 * slub_max_order specifies the order where we begin to stop considering the 1737 * number of objects in a slab as critical. If we reach slub_max_order then 1738 * we try to keep the page order as low as possible. So we accept more waste 1739 * of space in favor of a small page order. 1740 * 1741 * Higher order allocations also allow the placement of more objects in a 1742 * slab and thereby reduce object handling overhead. If the user has 1743 * requested a higher minimum order then we start with that one instead of 1744 * the smallest order which will fit the object. 1745 */ 1746static inline int slab_order(int size, int min_objects, 1747 int max_order, int fract_leftover) 1748{ 1749 int order; 1750 int rem; 1751 int min_order = slub_min_order; 1752 1753 for (order = max(min_order, 1754 fls(min_objects * size - 1) - PAGE_SHIFT); 1755 order <= max_order; order++) { 1756 1757 unsigned long slab_size = PAGE_SIZE << order; 1758 1759 if (slab_size < min_objects * size) 1760 continue; 1761 1762 rem = slab_size % size; 1763 1764 if (rem <= slab_size / fract_leftover) 1765 break; 1766 1767 } 1768 1769 return order; 1770} 1771 1772static inline int calculate_order(int size) 1773{ 1774 int order; 1775 int min_objects; 1776 int fraction; 1777 1778 /* 1779 * Attempt to find best configuration for a slab. This 1780 * works by first attempting to generate a layout with 1781 * the best configuration and backing off gradually. 1782 * 1783 * First we reduce the acceptable waste in a slab. Then 1784 * we reduce the minimum objects required in a slab. 1785 */ 1786 min_objects = slub_min_objects; 1787 while (min_objects > 1) { 1788 fraction = 8; 1789 while (fraction >= 4) { 1790 order = slab_order(size, min_objects, 1791 slub_max_order, fraction); 1792 if (order <= slub_max_order) 1793 return order; 1794 fraction /= 2; 1795 } 1796 min_objects /= 2; 1797 } 1798 1799 /* 1800 * We were unable to place multiple objects in a slab. Now 1801 * let's see if we can place a single object there. 1802 */ 1803 order = slab_order(size, 1, slub_max_order, 1); 1804 if (order <= slub_max_order) 1805 return order; 1806 1807 /* 1808 * Doh this slab cannot be placed using slub_max_order.
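 * Fall back to the largest order the page allocator supports (MAX_ORDER)
 * before giving up entirely.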
1809 */ 1810 order = slab_order(size, 1, MAX_ORDER, 1); 1811 if (order <= MAX_ORDER) 1812 return order; 1813 return -ENOSYS; 1814} 1815 1816/* 1817 * Figure out what the alignment of the objects will be. 1818 */ 1819static unsigned long calculate_alignment(unsigned long flags, 1820 unsigned long align, unsigned long size) 1821{ 1822 /* 1823 * If the user wants hardware cache aligned objects then 1824 * follow that suggestion if the object is sufficiently 1825 * large. 1826 * 1827 * The hardware cache alignment cannot override the 1828 * specified alignment though. If that is greater 1829 * then use it. 1830 */ 1831 if ((flags & SLAB_HWCACHE_ALIGN) && 1832 size > cache_line_size() / 2) 1833 return max_t(unsigned long, align, cache_line_size()); 1834 1835 if (align < ARCH_SLAB_MINALIGN) 1836 return ARCH_SLAB_MINALIGN; 1837 1838 return ALIGN(align, sizeof(void *)); 1839} 1840 1841static void init_kmem_cache_cpu(struct kmem_cache *s, 1842 struct kmem_cache_cpu *c) 1843{ 1844 c->page = NULL; 1845 c->freelist = NULL; 1846 c->node = 0; 1847 c->offset = s->offset / sizeof(void *); 1848 c->objsize = s->objsize; 1849} 1850 1851static void init_kmem_cache_node(struct kmem_cache_node *n) 1852{ 1853 n->nr_partial = 0; 1854 atomic_long_set(&n->nr_slabs, 0); 1855 spin_lock_init(&n->list_lock); 1856 INIT_LIST_HEAD(&n->partial); 1857#ifdef CONFIG_SLUB_DEBUG 1858 INIT_LIST_HEAD(&n->full); 1859#endif 1860} 1861 1862#ifdef CONFIG_SMP 1863/* 1864 * Per cpu array for per cpu structures. 1865 * 1866 * The per cpu array places all kmem_cache_cpu structures from one processor 1867 * close together meaning that it becomes possible that multiple per cpu 1868 * structures are contained in one cacheline. This may be particularly 1869 * beneficial for the kmalloc caches. 1870 * 1871 * A desktop system typically has around 60-80 slabs. With 100 here we are 1872 * likely able to get per cpu structures for all caches from the array defined 1873 * here. We must be able to cover all kmalloc caches during bootstrap. 1874 * 1875 * If the per cpu array is exhausted then fall back to kmalloc 1876 * of individual cachelines. No sharing is possible then. 
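 * free_kmem_cache_cpu() checks whether a structure lies inside the static
 * array: those are returned to the kmem_cache_cpu_free list, anything else
 * is simply kfree()d.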
1877 */ 1878#define NR_KMEM_CACHE_CPU 100 1879 1880static DEFINE_PER_CPU(struct kmem_cache_cpu, 1881 kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; 1882 1883static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); 1884static cpumask_t kmem_cach_cpu_free_init_once = CPU_MASK_NONE; 1885 1886static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, 1887 int cpu, gfp_t flags) 1888{ 1889 struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu); 1890 1891 if (c) 1892 per_cpu(kmem_cache_cpu_free, cpu) = 1893 (void *)c->freelist; 1894 else { 1895 /* Table overflow: So allocate ourselves */ 1896 c = kmalloc_node( 1897 ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()), 1898 flags, cpu_to_node(cpu)); 1899 if (!c) 1900 return NULL; 1901 } 1902 1903 init_kmem_cache_cpu(s, c); 1904 return c; 1905} 1906 1907static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) 1908{ 1909 if (c < per_cpu(kmem_cache_cpu, cpu) || 1910 c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { 1911 kfree(c); 1912 return; 1913 } 1914 c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu); 1915 per_cpu(kmem_cache_cpu_free, cpu) = c; 1916} 1917 1918static void free_kmem_cache_cpus(struct kmem_cache *s) 1919{ 1920 int cpu; 1921 1922 for_each_online_cpu(cpu) { 1923 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); 1924 1925 if (c) { 1926 s->cpu_slab[cpu] = NULL; 1927 free_kmem_cache_cpu(c, cpu); 1928 } 1929 } 1930} 1931 1932static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) 1933{ 1934 int cpu; 1935 1936 for_each_online_cpu(cpu) { 1937 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); 1938 1939 if (c) 1940 continue; 1941 1942 c = alloc_kmem_cache_cpu(s, cpu, flags); 1943 if (!c) { 1944 free_kmem_cache_cpus(s); 1945 return 0; 1946 } 1947 s->cpu_slab[cpu] = c; 1948 } 1949 return 1; 1950} 1951 1952/* 1953 * Initialize the per cpu array. 1954 */ 1955static void init_alloc_cpu_cpu(int cpu) 1956{ 1957 int i; 1958 1959 if (cpu_isset(cpu, kmem_cach_cpu_free_init_once)) 1960 return; 1961 1962 for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) 1963 free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); 1964 1965 cpu_set(cpu, kmem_cach_cpu_free_init_once); 1966} 1967 1968static void __init init_alloc_cpu(void) 1969{ 1970 int cpu; 1971 1972 for_each_online_cpu(cpu) 1973 init_alloc_cpu_cpu(cpu); 1974 } 1975 1976#else 1977static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} 1978static inline void init_alloc_cpu(void) {} 1979 1980static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) 1981{ 1982 init_kmem_cache_cpu(s, &s->cpu_slab); 1983 return 1; 1984} 1985#endif 1986 1987#ifdef CONFIG_NUMA 1988/* 1989 * No kmalloc_node yet so do it by hand. We know that this is the first 1990 * slab on the node for this slabcache. There are no concurrent accesses 1991 * possible. 1992 * 1993 * Note that this function only works on the kmalloc_node_cache 1994 * when allocating for the kmalloc_node_cache. This is used for bootstrapping 1995 * memory on a fresh node that has no slab structures yet. 
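 * The kmem_cache_node structure is carved out of the first object of the
 * newly allocated slab, which is why the freelist and the inuse counter of
 * the page are adjusted by hand below.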
1996 */ 1997static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, 1998 int node) 1999{ 2000 struct page *page; 2001 struct kmem_cache_node *n; 2002 unsigned long flags; 2003 2004 BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); 2005 2006 page = new_slab(kmalloc_caches, gfpflags, node); 2007 2008 BUG_ON(!page); 2009 if (page_to_nid(page) != node) { 2010 printk(KERN_ERR "SLUB: Unable to allocate memory from " 2011 "node %d\n", node); 2012 printk(KERN_ERR "SLUB: Allocating a useless per node structure " 2013 "in order to be able to continue\n"); 2014 } 2015 2016 n = page->freelist; 2017 BUG_ON(!n); 2018 page->freelist = get_freepointer(kmalloc_caches, n); 2019 page->inuse++; 2020 kmalloc_caches->node[node] = n; 2021#ifdef CONFIG_SLUB_DEBUG 2022 init_object(kmalloc_caches, n, 1); 2023 init_tracking(kmalloc_caches, n); 2024#endif 2025 init_kmem_cache_node(n); 2026 atomic_long_inc(&n->nr_slabs); 2027 /* 2028 * lockdep requires consistent irq usage for each lock 2029 * so even though there cannot be a race this early in 2030 * the boot sequence, we still disable irqs. 2031 */ 2032 local_irq_save(flags); 2033 add_partial(n, page, 0); 2034 local_irq_restore(flags); 2035 return n; 2036} 2037 2038static void free_kmem_cache_nodes(struct kmem_cache *s) 2039{ 2040 int node; 2041 2042 for_each_node_state(node, N_NORMAL_MEMORY) { 2043 struct kmem_cache_node *n = s->node[node]; 2044 if (n && n != &s->local_node) 2045 kmem_cache_free(kmalloc_caches, n); 2046 s->node[node] = NULL; 2047 } 2048} 2049 2050static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) 2051{ 2052 int node; 2053 int local_node; 2054 2055 if (slab_state >= UP) 2056 local_node = page_to_nid(virt_to_page(s)); 2057 else 2058 local_node = 0; 2059 2060 for_each_node_state(node, N_NORMAL_MEMORY) { 2061 struct kmem_cache_node *n; 2062 2063 if (local_node == node) 2064 n = &s->local_node; 2065 else { 2066 if (slab_state == DOWN) { 2067 n = early_kmem_cache_node_alloc(gfpflags, 2068 node); 2069 continue; 2070 } 2071 n = kmem_cache_alloc_node(kmalloc_caches, 2072 gfpflags, node); 2073 2074 if (!n) { 2075 free_kmem_cache_nodes(s); 2076 return 0; 2077 } 2078 2079 } 2080 s->node[node] = n; 2081 init_kmem_cache_node(n); 2082 } 2083 return 1; 2084} 2085#else 2086static void free_kmem_cache_nodes(struct kmem_cache *s) 2087{ 2088} 2089 2090static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) 2091{ 2092 init_kmem_cache_node(&s->local_node); 2093 return 1; 2094} 2095#endif 2096 2097/* 2098 * calculate_sizes() determines the order and the distribution of data within 2099 * a slab object. 2100 */ 2101static int calculate_sizes(struct kmem_cache *s) 2102{ 2103 unsigned long flags = s->flags; 2104 unsigned long size = s->objsize; 2105 unsigned long align = s->align; 2106 2107 /* 2108 * Determine if we can poison the object itself. If the user of 2109 * the slab may touch the object after free or before allocation 2110 * then we should never poison the object itself. 2111 */ 2112 if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && 2113 !s->ctor) 2114 s->flags |= __OBJECT_POISON; 2115 else 2116 s->flags &= ~__OBJECT_POISON; 2117 2118 /* 2119 * Round up object size to the next word boundary. We can only 2120 * place the free pointer at word boundaries and this determines 2121 * the possible location of the free pointer. 
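 * E.g. on a 64 bit machine a 13 byte object is rounded up to 16 bytes
 * since sizeof(void *) is 8.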
2122 */
2123 size = ALIGN(size, sizeof(void *));
2124
2125#ifdef CONFIG_SLUB_DEBUG
2126 /*
2127 * If we are Redzoning then check if there is some space between the
2128 * end of the object and the free pointer. If not then add an
2129 * additional word to have some bytes to store Redzone information.
2130 */
2131 if ((flags & SLAB_RED_ZONE) && size == s->objsize)
2132 size += sizeof(void *);
2133#endif
2134
2135 /*
2136 * With that we have determined the number of bytes in actual use
2137 * by the object. This is the potential offset to the free pointer.
2138 */
2139 s->inuse = size;
2140
2141 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
2142 s->ctor)) {
2143 /*
2144 * Relocate free pointer after the object if it is not
2145 * permitted to overwrite the first word of the object on
2146 * kmem_cache_free.
2147 *
2148 * This is the case if we do RCU, have a constructor or
2149 * are poisoning the objects.
2150 */
2151 s->offset = size;
2152 size += sizeof(void *);
2153 }
2154
2155#ifdef CONFIG_SLUB_DEBUG
2156 if (flags & SLAB_STORE_USER)
2157 /*
2158 * Need to store information about allocs and frees after
2159 * the object.
2160 */
2161 size += 2 * sizeof(struct track);
2162
2163 if (flags & SLAB_RED_ZONE)
2164 /*
2165 * Add some empty padding so that we can catch
2166 * overwrites from earlier objects rather than let
2167 * tracking information or the free pointer be
2168 * corrupted if a user writes before the start
2169 * of the object.
2170 */
2171 size += sizeof(void *);
2172#endif
2173
2174 /*
2175 * Determine the alignment based on various parameters that the
2176 * user specified and the dynamic determination of cache line size
2177 * on bootup.
2178 */
2179 align = calculate_alignment(flags, align, s->objsize);
2180
2181 /*
2182 * SLUB stores one object immediately after another beginning from
2183 * offset 0. In order to align the objects we have to simply size
2184 * each object to conform to the alignment.
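 *
 * Continuing the example above: if calculate_alignment() settles on a
 * 64 byte cacheline, the 64 byte footprint already conforms and
 * s->size stays 64; on a 4K page machine an order 0 slab would then
 * hold 4096 / 64 = 64 objects (the s->objects calculation below).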
2185 */ 2186 size = ALIGN(size, align); 2187 s->size = size; 2188 2189 s->order = calculate_order(size); 2190 if (s->order < 0) 2191 return 0; 2192 2193 /* 2194 * Determine the number of objects per slab 2195 */ 2196 s->objects = (PAGE_SIZE << s->order) / size; 2197 2198 return !!s->objects; 2199 2200} 2201 2202static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, 2203 const char *name, size_t size, 2204 size_t align, unsigned long flags, 2205 void (*ctor)(struct kmem_cache *, void *)) 2206{ 2207 memset(s, 0, kmem_size); 2208 s->name = name; 2209 s->ctor = ctor; 2210 s->objsize = size; 2211 s->align = align; 2212 s->flags = kmem_cache_flags(size, flags, name, ctor); 2213 2214 if (!calculate_sizes(s)) 2215 goto error; 2216 2217 s->refcount = 1; 2218#ifdef CONFIG_NUMA 2219 s->remote_node_defrag_ratio = 100; 2220#endif 2221 if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) 2222 goto error; 2223 2224 if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) 2225 return 1; 2226 free_kmem_cache_nodes(s); 2227error: 2228 if (flags & SLAB_PANIC) 2229 panic("Cannot create slab %s size=%lu realsize=%u " 2230 "order=%u offset=%u flags=%lx\n", 2231 s->name, (unsigned long)size, s->size, s->order, 2232 s->offset, flags); 2233 return 0; 2234} 2235 2236/* 2237 * Check if a given pointer is valid 2238 */ 2239int kmem_ptr_validate(struct kmem_cache *s, const void *object) 2240{ 2241 struct page *page; 2242 2243 page = get_object_page(object); 2244 2245 if (!page || s != page->slab) 2246 /* No slab or wrong slab */ 2247 return 0; 2248 2249 if (!check_valid_pointer(s, page, object)) 2250 return 0; 2251 2252 /* 2253 * We could also check if the object is on the slabs freelist. 2254 * But this would be too expensive and it seems that the main 2255 * purpose of kmem_ptr_valid is to check if the object belongs 2256 * to a certain slab. 2257 */ 2258 return 1; 2259} 2260EXPORT_SYMBOL(kmem_ptr_validate); 2261 2262/* 2263 * Determine the size of a slab object 2264 */ 2265unsigned int kmem_cache_size(struct kmem_cache *s) 2266{ 2267 return s->objsize; 2268} 2269EXPORT_SYMBOL(kmem_cache_size); 2270 2271const char *kmem_cache_name(struct kmem_cache *s) 2272{ 2273 return s->name; 2274} 2275EXPORT_SYMBOL(kmem_cache_name); 2276 2277/* 2278 * Attempt to free all slabs on a node. Return the number of slabs we 2279 * were unable to free. 2280 */ 2281static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, 2282 struct list_head *list) 2283{ 2284 int slabs_inuse = 0; 2285 unsigned long flags; 2286 struct page *page, *h; 2287 2288 spin_lock_irqsave(&n->list_lock, flags); 2289 list_for_each_entry_safe(page, h, list, lru) 2290 if (!page->inuse) { 2291 list_del(&page->lru); 2292 discard_slab(s, page); 2293 } else 2294 slabs_inuse++; 2295 spin_unlock_irqrestore(&n->list_lock, flags); 2296 return slabs_inuse; 2297} 2298 2299/* 2300 * Release all resources used by a slab cache. 
2301 */ 2302static inline int kmem_cache_close(struct kmem_cache *s) 2303{ 2304 int node; 2305 2306 flush_all(s); 2307 2308 /* Attempt to free all objects */ 2309 free_kmem_cache_cpus(s); 2310 for_each_node_state(node, N_NORMAL_MEMORY) { 2311 struct kmem_cache_node *n = get_node(s, node); 2312 2313 n->nr_partial -= free_list(s, n, &n->partial); 2314 if (atomic_long_read(&n->nr_slabs)) 2315 return 1; 2316 } 2317 free_kmem_cache_nodes(s); 2318 return 0; 2319} 2320 2321/* 2322 * Close a cache and release the kmem_cache structure 2323 * (must be used for caches created using kmem_cache_create) 2324 */ 2325void kmem_cache_destroy(struct kmem_cache *s) 2326{ 2327 down_write(&slub_lock); 2328 s->refcount--; 2329 if (!s->refcount) { 2330 list_del(&s->list); 2331 up_write(&slub_lock); 2332 if (kmem_cache_close(s)) 2333 WARN_ON(1); 2334 sysfs_slab_remove(s); 2335 } else 2336 up_write(&slub_lock); 2337} 2338EXPORT_SYMBOL(kmem_cache_destroy); 2339 2340/******************************************************************** 2341 * Kmalloc subsystem 2342 *******************************************************************/ 2343 2344struct kmem_cache kmalloc_caches[PAGE_SHIFT] __cacheline_aligned; 2345EXPORT_SYMBOL(kmalloc_caches); 2346 2347#ifdef CONFIG_ZONE_DMA 2348static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT]; 2349#endif 2350 2351static int __init setup_slub_min_order(char *str) 2352{ 2353 get_option(&str, &slub_min_order); 2354 2355 return 1; 2356} 2357 2358__setup("slub_min_order=", setup_slub_min_order); 2359 2360static int __init setup_slub_max_order(char *str) 2361{ 2362 get_option(&str, &slub_max_order); 2363 2364 return 1; 2365} 2366 2367__setup("slub_max_order=", setup_slub_max_order); 2368 2369static int __init setup_slub_min_objects(char *str) 2370{ 2371 get_option(&str, &slub_min_objects); 2372 2373 return 1; 2374} 2375 2376__setup("slub_min_objects=", setup_slub_min_objects); 2377 2378static int __init setup_slub_nomerge(char *str) 2379{ 2380 slub_nomerge = 1; 2381 return 1; 2382} 2383 2384__setup("slub_nomerge", setup_slub_nomerge); 2385 2386static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, 2387 const char *name, int size, gfp_t gfp_flags) 2388{ 2389 unsigned int flags = 0; 2390 2391 if (gfp_flags & SLUB_DMA) 2392 flags = SLAB_CACHE_DMA; 2393 2394 down_write(&slub_lock); 2395 if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, 2396 flags, NULL)) 2397 goto panic; 2398 2399 list_add(&s->list, &slab_caches); 2400 up_write(&slub_lock); 2401 if (sysfs_slab_add(s)) 2402 goto panic; 2403 return s; 2404 2405panic: 2406 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); 2407} 2408 2409#ifdef CONFIG_ZONE_DMA 2410 2411static void sysfs_add_func(struct work_struct *w) 2412{ 2413 struct kmem_cache *s; 2414 2415 down_write(&slub_lock); 2416 list_for_each_entry(s, &slab_caches, list) { 2417 if (s->flags & __SYSFS_ADD_DEFERRED) { 2418 s->flags &= ~__SYSFS_ADD_DEFERRED; 2419 sysfs_slab_add(s); 2420 } 2421 } 2422 up_write(&slub_lock); 2423} 2424 2425static DECLARE_WORK(sysfs_add_work, sysfs_add_func); 2426 2427static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) 2428{ 2429 struct kmem_cache *s; 2430 char *text; 2431 size_t realsize; 2432 2433 s = kmalloc_caches_dma[index]; 2434 if (s) 2435 return s; 2436 2437 /* Dynamically create dma cache */ 2438 if (flags & __GFP_WAIT) 2439 down_write(&slub_lock); 2440 else { 2441 if (!down_write_trylock(&slub_lock)) 2442 goto out; 2443 } 2444 2445 if (kmalloc_caches_dma[index]) 2446 
goto unlock_out; 2447 2448 realsize = kmalloc_caches[index].objsize; 2449 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", (unsigned int)realsize), 2450 s = kmalloc(kmem_size, flags & ~SLUB_DMA); 2451 2452 if (!s || !text || !kmem_cache_open(s, flags, text, 2453 realsize, ARCH_KMALLOC_MINALIGN, 2454 SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) { 2455 kfree(s); 2456 kfree(text); 2457 goto unlock_out; 2458 } 2459 2460 list_add(&s->list, &slab_caches); 2461 kmalloc_caches_dma[index] = s; 2462 2463 schedule_work(&sysfs_add_work); 2464 2465unlock_out: 2466 up_write(&slub_lock); 2467out: 2468 return kmalloc_caches_dma[index]; 2469} 2470#endif 2471 2472/* 2473 * Conversion table for small slabs sizes / 8 to the index in the 2474 * kmalloc array. This is necessary for slabs < 192 since we have non power 2475 * of two cache sizes there. The size of larger slabs can be determined using 2476 * fls. 2477 */ 2478static s8 size_index[24] = { 2479 3, /* 8 */ 2480 4, /* 16 */ 2481 5, /* 24 */ 2482 5, /* 32 */ 2483 6, /* 40 */ 2484 6, /* 48 */ 2485 6, /* 56 */ 2486 6, /* 64 */ 2487 1, /* 72 */ 2488 1, /* 80 */ 2489 1, /* 88 */ 2490 1, /* 96 */ 2491 7, /* 104 */ 2492 7, /* 112 */ 2493 7, /* 120 */ 2494 7, /* 128 */ 2495 2, /* 136 */ 2496 2, /* 144 */ 2497 2, /* 152 */ 2498 2, /* 160 */ 2499 2, /* 168 */ 2500 2, /* 176 */ 2501 2, /* 184 */ 2502 2 /* 192 */ 2503}; 2504 2505static struct kmem_cache *get_slab(size_t size, gfp_t flags) 2506{ 2507 int index; 2508 2509 if (size <= 192) { 2510 if (!size) 2511 return ZERO_SIZE_PTR; 2512 2513 index = size_index[(size - 1) / 8]; 2514 } else 2515 index = fls(size - 1); 2516 2517#ifdef CONFIG_ZONE_DMA 2518 if (unlikely((flags & SLUB_DMA))) 2519 return dma_kmalloc_cache(index, flags); 2520 2521#endif 2522 return &kmalloc_caches[index]; 2523} 2524 2525void *__kmalloc(size_t size, gfp_t flags) 2526{ 2527 struct kmem_cache *s; 2528 2529 if (unlikely(size > PAGE_SIZE / 2)) 2530 return (void *)__get_free_pages(flags | __GFP_COMP, 2531 get_order(size)); 2532 2533 s = get_slab(size, flags); 2534 2535 if (unlikely(ZERO_OR_NULL_PTR(s))) 2536 return s; 2537 2538 return slab_alloc(s, flags, -1, __builtin_return_address(0)); 2539} 2540EXPORT_SYMBOL(__kmalloc); 2541 2542#ifdef CONFIG_NUMA 2543void *__kmalloc_node(size_t size, gfp_t flags, int node) 2544{ 2545 struct kmem_cache *s; 2546 2547 if (unlikely(size > PAGE_SIZE / 2)) 2548 return (void *)__get_free_pages(flags | __GFP_COMP, 2549 get_order(size)); 2550 2551 s = get_slab(size, flags); 2552 2553 if (unlikely(ZERO_OR_NULL_PTR(s))) 2554 return s; 2555 2556 return slab_alloc(s, flags, node, __builtin_return_address(0)); 2557} 2558EXPORT_SYMBOL(__kmalloc_node); 2559#endif 2560 2561size_t ksize(const void *object) 2562{ 2563 struct page *page; 2564 struct kmem_cache *s; 2565 2566 BUG_ON(!object); 2567 if (unlikely(object == ZERO_SIZE_PTR)) 2568 return 0; 2569 2570 page = virt_to_head_page(object); 2571 BUG_ON(!page); 2572 2573 if (unlikely(!PageSlab(page))) 2574 return PAGE_SIZE << compound_order(page); 2575 2576 s = page->slab; 2577 BUG_ON(!s); 2578 2579 /* 2580 * Debugging requires use of the padding between object 2581 * and whatever may come after it. 2582 */ 2583 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) 2584 return s->objsize; 2585 2586 /* 2587 * If we have the need to store the freelist pointer 2588 * back there or track user information then we can 2589 * only use the space before that information. 
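 *
 * Concretely (illustrative): kmalloc(100) is served from the 128 byte
 * cache (size_index[(100 - 1) / 8] == 7), so without debug options
 * ksize() reports the full 128 bytes.  With SLAB_STORE_USER or
 * SLAB_DESTROY_BY_RCU only s->inuse bytes are reported, because the
 * area behind it holds the relocated free pointer and/or the alloc
 * and free tracking records.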
2590 */ 2591 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) 2592 return s->inuse; 2593 2594 /* 2595 * Else we can use all the padding etc for the allocation 2596 */ 2597 return s->size; 2598} 2599EXPORT_SYMBOL(ksize); 2600 2601void kfree(const void *x) 2602{ 2603 struct page *page; 2604 2605 if (unlikely(ZERO_OR_NULL_PTR(x))) 2606 return; 2607 2608 page = virt_to_head_page(x); 2609 if (unlikely(!PageSlab(page))) { 2610 put_page(page); 2611 return; 2612 } 2613 slab_free(page->slab, page, (void *)x, __builtin_return_address(0)); 2614} 2615EXPORT_SYMBOL(kfree); 2616 2617static unsigned long count_partial(struct kmem_cache_node *n) 2618{ 2619 unsigned long flags; 2620 unsigned long x = 0; 2621 struct page *page; 2622 2623 spin_lock_irqsave(&n->list_lock, flags); 2624 list_for_each_entry(page, &n->partial, lru) 2625 x += page->inuse; 2626 spin_unlock_irqrestore(&n->list_lock, flags); 2627 return x; 2628} 2629 2630/* 2631 * kmem_cache_shrink removes empty slabs from the partial lists and sorts 2632 * the remaining slabs by the number of items in use. The slabs with the 2633 * most items in use come first. New allocations will then fill those up 2634 * and thus they can be removed from the partial lists. 2635 * 2636 * The slabs with the least items are placed last. This results in them 2637 * being allocated from last increasing the chance that the last objects 2638 * are freed in them. 2639 */ 2640int kmem_cache_shrink(struct kmem_cache *s) 2641{ 2642 int node; 2643 int i; 2644 struct kmem_cache_node *n; 2645 struct page *page; 2646 struct page *t; 2647 struct list_head *slabs_by_inuse = 2648 kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL); 2649 unsigned long flags; 2650 2651 if (!slabs_by_inuse) 2652 return -ENOMEM; 2653 2654 flush_all(s); 2655 for_each_node_state(node, N_NORMAL_MEMORY) { 2656 n = get_node(s, node); 2657 2658 if (!n->nr_partial) 2659 continue; 2660 2661 for (i = 0; i < s->objects; i++) 2662 INIT_LIST_HEAD(slabs_by_inuse + i); 2663 2664 spin_lock_irqsave(&n->list_lock, flags); 2665 2666 /* 2667 * Build lists indexed by the items in use in each slab. 2668 * 2669 * Note that concurrent frees may occur while we hold the 2670 * list_lock. page->inuse here is the upper limit. 2671 */ 2672 list_for_each_entry_safe(page, t, &n->partial, lru) { 2673 if (!page->inuse && slab_trylock(page)) { 2674 /* 2675 * Must hold slab lock here because slab_free 2676 * may have freed the last object and be 2677 * waiting to release the slab. 2678 */ 2679 list_del(&page->lru); 2680 n->nr_partial--; 2681 slab_unlock(page); 2682 discard_slab(s, page); 2683 } else { 2684 list_move(&page->lru, 2685 slabs_by_inuse + page->inuse); 2686 } 2687 } 2688 2689 /* 2690 * Rebuild the partial list with the slabs filled up most 2691 * first and the least used slabs at the end. 
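 *
 * Example: with 4 objects per slab and partial slabs holding 3, 0, 1
 * and 3 objects respectively, the empty slab is discarded and the
 * partial list is rebuilt in the order 3, 3, 1.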
2692 */
2693 for (i = s->objects - 1; i >= 0; i--)
2694 list_splice(slabs_by_inuse + i, n->partial.prev);
2695
2696 spin_unlock_irqrestore(&n->list_lock, flags);
2697 }
2698
2699 kfree(slabs_by_inuse);
2700 return 0;
2701}
2702EXPORT_SYMBOL(kmem_cache_shrink);
2703
2704#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
2705static int slab_mem_going_offline_callback(void *arg)
2706{
2707 struct kmem_cache *s;
2708
2709 down_read(&slub_lock);
2710 list_for_each_entry(s, &slab_caches, list)
2711 kmem_cache_shrink(s);
2712 up_read(&slub_lock);
2713
2714 return 0;
2715}
2716
2717static void slab_mem_offline_callback(void *arg)
2718{
2719 struct kmem_cache_node *n;
2720 struct kmem_cache *s;
2721 struct memory_notify *marg = arg;
2722 int offline_node;
2723
2724 offline_node = marg->status_change_nid;
2725
2726 /*
2727 * If the node still has available memory, we still need its
2728 * kmem_cache_node and there is nothing to tear down.
2729 */
2730 if (offline_node < 0)
2731 return;
2732
2733 down_read(&slub_lock);
2734 list_for_each_entry(s, &slab_caches, list) {
2735 n = get_node(s, offline_node);
2736 if (n) {
2737 /*
2738 * if n->nr_slabs > 0, slabs still exist on the node
2739 * that is going down. We were unable to free them,
2740 * and offline_pages() shouldn't call this
2741 * callback. So, we must fail.
2742 */
2743 BUG_ON(atomic_long_read(&n->nr_slabs));
2744
2745 s->node[offline_node] = NULL;
2746 kmem_cache_free(kmalloc_caches, n);
2747 }
2748 }
2749 up_read(&slub_lock);
2750}
2751
2752static int slab_mem_going_online_callback(void *arg)
2753{
2754 struct kmem_cache_node *n;
2755 struct kmem_cache *s;
2756 struct memory_notify *marg = arg;
2757 int nid = marg->status_change_nid;
2758 int ret = 0;
2759
2760 /*
2761 * If the node's memory is already available, then kmem_cache_node is
2762 * already created. Nothing to do.
2763 */
2764 if (nid < 0)
2765 return 0;
2766
2767 /*
2768 * We are bringing a node online. No memory is available yet. We must
2769 * allocate a kmem_cache_node structure in order to bring the node
2770 * online.
2771 */
2772 down_read(&slub_lock);
2773 list_for_each_entry(s, &slab_caches, list) {
2774 /*
2775 * XXX: kmem_cache_alloc_node will fall back to other nodes
2776 * since memory is not yet available from the node that
2777 * is brought up.
2778 */ 2779 n = kmem_cache_alloc(kmalloc_caches, GFP_KERNEL); 2780 if (!n) { 2781 ret = -ENOMEM; 2782 goto out; 2783 } 2784 init_kmem_cache_node(n); 2785 s->node[nid] = n; 2786 } 2787out: 2788 up_read(&slub_lock); 2789 return ret; 2790} 2791 2792static int slab_memory_callback(struct notifier_block *self, 2793 unsigned long action, void *arg) 2794{ 2795 int ret = 0; 2796 2797 switch (action) { 2798 case MEM_GOING_ONLINE: 2799 ret = slab_mem_going_online_callback(arg); 2800 break; 2801 case MEM_GOING_OFFLINE: 2802 ret = slab_mem_going_offline_callback(arg); 2803 break; 2804 case MEM_OFFLINE: 2805 case MEM_CANCEL_ONLINE: 2806 slab_mem_offline_callback(arg); 2807 break; 2808 case MEM_ONLINE: 2809 case MEM_CANCEL_OFFLINE: 2810 break; 2811 } 2812 2813 ret = notifier_from_errno(ret); 2814 return ret; 2815} 2816 2817#endif /* CONFIG_MEMORY_HOTPLUG */ 2818 2819/******************************************************************** 2820 * Basic setup of slabs 2821 *******************************************************************/ 2822 2823void __init kmem_cache_init(void) 2824{ 2825 int i; 2826 int caches = 0; 2827 2828 init_alloc_cpu(); 2829 2830#ifdef CONFIG_NUMA 2831 /* 2832 * Must first have the slab cache available for the allocations of the 2833 * struct kmem_cache_node's. There is special bootstrap code in 2834 * kmem_cache_open for slab_state == DOWN. 2835 */ 2836 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", 2837 sizeof(struct kmem_cache_node), GFP_KERNEL); 2838 kmalloc_caches[0].refcount = -1; 2839 caches++; 2840 2841 hotplug_memory_notifier(slab_memory_callback, 1); 2842#endif 2843 2844 /* Able to allocate the per node structures */ 2845 slab_state = PARTIAL; 2846 2847 /* Caches that are not of the two-to-the-power-of size */ 2848 if (KMALLOC_MIN_SIZE <= 64) { 2849 create_kmalloc_cache(&kmalloc_caches[1], 2850 "kmalloc-96", 96, GFP_KERNEL); 2851 caches++; 2852 } 2853 if (KMALLOC_MIN_SIZE <= 128) { 2854 create_kmalloc_cache(&kmalloc_caches[2], 2855 "kmalloc-192", 192, GFP_KERNEL); 2856 caches++; 2857 } 2858 2859 for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) { 2860 create_kmalloc_cache(&kmalloc_caches[i], 2861 "kmalloc", 1 << i, GFP_KERNEL); 2862 caches++; 2863 } 2864 2865 2866 /* 2867 * Patch up the size_index table if we have strange large alignment 2868 * requirements for the kmalloc array. This is only the case for 2869 * mips it seems. The standard arches will not generate any code here. 2870 * 2871 * Largest permitted alignment is 256 bytes due to the way we 2872 * handle the index determination for the smaller caches. 2873 * 2874 * Make sure that nothing crazy happens if someone starts tinkering 2875 * around with ARCH_KMALLOC_MINALIGN 2876 */ 2877 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || 2878 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); 2879 2880 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) 2881 size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW; 2882 2883 slab_state = UP; 2884 2885 /* Provide the correct kmalloc names now that the caches are up */ 2886 for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) 2887 kmalloc_caches[i]. 
name = 2888 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); 2889 2890#ifdef CONFIG_SMP 2891 register_cpu_notifier(&slab_notifier); 2892 kmem_size = offsetof(struct kmem_cache, cpu_slab) + 2893 nr_cpu_ids * sizeof(struct kmem_cache_cpu *); 2894#else 2895 kmem_size = sizeof(struct kmem_cache); 2896#endif 2897 2898 2899 printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," 2900 " CPUs=%d, Nodes=%d\n", 2901 caches, cache_line_size(), 2902 slub_min_order, slub_max_order, slub_min_objects, 2903 nr_cpu_ids, nr_node_ids); 2904} 2905 2906/* 2907 * Find a mergeable slab cache 2908 */ 2909static int slab_unmergeable(struct kmem_cache *s) 2910{ 2911 if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) 2912 return 1; 2913 2914 if (s->ctor) 2915 return 1; 2916 2917 /* 2918 * We may have set a slab to be unmergeable during bootstrap. 2919 */ 2920 if (s->refcount < 0) 2921 return 1; 2922 2923 return 0; 2924} 2925 2926static struct kmem_cache *find_mergeable(size_t size, 2927 size_t align, unsigned long flags, const char *name, 2928 void (*ctor)(struct kmem_cache *, void *)) 2929{ 2930 struct kmem_cache *s; 2931 2932 if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) 2933 return NULL; 2934 2935 if (ctor) 2936 return NULL; 2937 2938 size = ALIGN(size, sizeof(void *)); 2939 align = calculate_alignment(flags, align, size); 2940 size = ALIGN(size, align); 2941 flags = kmem_cache_flags(size, flags, name, NULL); 2942 2943 list_for_each_entry(s, &slab_caches, list) { 2944 if (slab_unmergeable(s)) 2945 continue; 2946 2947 if (size > s->size) 2948 continue; 2949 2950 if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) 2951 continue; 2952 /* 2953 * Check if alignment is compatible. 2954 * Courtesy of Adrian Drzewiecki 2955 */ 2956 if ((s->size & ~(align - 1)) != s->size) 2957 continue; 2958 2959 if (s->size - size >= sizeof(void *)) 2960 continue; 2961 2962 return s; 2963 } 2964 return NULL; 2965} 2966 2967struct kmem_cache *kmem_cache_create(const char *name, size_t size, 2968 size_t align, unsigned long flags, 2969 void (*ctor)(struct kmem_cache *, void *)) 2970{ 2971 struct kmem_cache *s; 2972 2973 down_write(&slub_lock); 2974 s = find_mergeable(size, align, flags, name, ctor); 2975 if (s) { 2976 int cpu; 2977 2978 s->refcount++; 2979 /* 2980 * Adjust the object sizes so that we clear 2981 * the complete object on kzalloc. 2982 */ 2983 s->objsize = max(s->objsize, (int)size); 2984 2985 /* 2986 * And then we need to update the object size in the 2987 * per cpu structures 2988 */ 2989 for_each_online_cpu(cpu) 2990 get_cpu_slab(s, cpu)->objsize = s->objsize; 2991 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 2992 up_write(&slub_lock); 2993 if (sysfs_slab_alias(s, name)) 2994 goto err; 2995 return s; 2996 } 2997 s = kmalloc(kmem_size, GFP_KERNEL); 2998 if (s) { 2999 if (kmem_cache_open(s, GFP_KERNEL, name, 3000 size, align, flags, ctor)) { 3001 list_add(&s->list, &slab_caches); 3002 up_write(&slub_lock); 3003 if (sysfs_slab_add(s)) 3004 goto err; 3005 return s; 3006 } 3007 kfree(s); 3008 } 3009 up_write(&slub_lock); 3010 3011err: 3012 if (flags & SLAB_PANIC) 3013 panic("Cannot create slabcache %s\n", name); 3014 else 3015 s = NULL; 3016 return s; 3017} 3018EXPORT_SYMBOL(kmem_cache_create); 3019 3020#ifdef CONFIG_SMP 3021/* 3022 * Use the cpu notifier to insure that the cpu slabs are flushed when 3023 * necessary. 
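 *
 * CPU_UP_PREPARE allocates a kmem_cache_cpu for every cache on the
 * incoming cpu; CPU_DEAD and CPU_UP_CANCELED flush that cpu's slab
 * back to the node partial lists and return the kmem_cache_cpu to the
 * per cpu free pool (or kfree() it if it came from the table overflow
 * path).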
3024 */ 3025static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, 3026 unsigned long action, void *hcpu) 3027{ 3028 long cpu = (long)hcpu; 3029 struct kmem_cache *s; 3030 unsigned long flags; 3031 3032 switch (action) { 3033 case CPU_UP_PREPARE: 3034 case CPU_UP_PREPARE_FROZEN: 3035 init_alloc_cpu_cpu(cpu); 3036 down_read(&slub_lock); 3037 list_for_each_entry(s, &slab_caches, list) 3038 s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu, 3039 GFP_KERNEL); 3040 up_read(&slub_lock); 3041 break; 3042 3043 case CPU_UP_CANCELED: 3044 case CPU_UP_CANCELED_FROZEN: 3045 case CPU_DEAD: 3046 case CPU_DEAD_FROZEN: 3047 down_read(&slub_lock); 3048 list_for_each_entry(s, &slab_caches, list) { 3049 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); 3050 3051 local_irq_save(flags); 3052 __flush_cpu_slab(s, cpu); 3053 local_irq_restore(flags); 3054 free_kmem_cache_cpu(c, cpu); 3055 s->cpu_slab[cpu] = NULL; 3056 } 3057 up_read(&slub_lock); 3058 break; 3059 default: 3060 break; 3061 } 3062 return NOTIFY_OK; 3063} 3064 3065static struct notifier_block __cpuinitdata slab_notifier = { 3066 &slab_cpuup_callback, NULL, 0 3067}; 3068 3069#endif 3070 3071void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) 3072{ 3073 struct kmem_cache *s; 3074 3075 if (unlikely(size > PAGE_SIZE / 2)) 3076 return (void *)__get_free_pages(gfpflags | __GFP_COMP, 3077 get_order(size)); 3078 s = get_slab(size, gfpflags); 3079 3080 if (unlikely(ZERO_OR_NULL_PTR(s))) 3081 return s; 3082 3083 return slab_alloc(s, gfpflags, -1, caller); 3084} 3085 3086void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, 3087 int node, void *caller) 3088{ 3089 struct kmem_cache *s; 3090 3091 if (unlikely(size > PAGE_SIZE / 2)) 3092 return (void *)__get_free_pages(gfpflags | __GFP_COMP, 3093 get_order(size)); 3094 s = get_slab(size, gfpflags); 3095 3096 if (unlikely(ZERO_OR_NULL_PTR(s))) 3097 return s; 3098 3099 return slab_alloc(s, gfpflags, node, caller); 3100} 3101 3102#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) 3103static int validate_slab(struct kmem_cache *s, struct page *page, 3104 unsigned long *map) 3105{ 3106 void *p; 3107 void *addr = page_address(page); 3108 3109 if (!check_slab(s, page) || 3110 !on_freelist(s, page, NULL)) 3111 return 0; 3112 3113 /* Now we know that a valid freelist exists */ 3114 bitmap_zero(map, s->objects); 3115 3116 for_each_free_object(p, s, page->freelist) { 3117 set_bit(slab_index(p, s, addr), map); 3118 if (!check_object(s, page, p, 0)) 3119 return 0; 3120 } 3121 3122 for_each_object(p, s, addr) 3123 if (!test_bit(slab_index(p, s, addr), map)) 3124 if (!check_object(s, page, p, 1)) 3125 return 0; 3126 return 1; 3127} 3128 3129static void validate_slab_slab(struct kmem_cache *s, struct page *page, 3130 unsigned long *map) 3131{ 3132 if (slab_trylock(page)) { 3133 validate_slab(s, page, map); 3134 slab_unlock(page); 3135 } else 3136 printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", 3137 s->name, page); 3138 3139 if (s->flags & DEBUG_DEFAULT_FLAGS) { 3140 if (!SlabDebug(page)) 3141 printk(KERN_ERR "SLUB %s: SlabDebug not set " 3142 "on slab 0x%p\n", s->name, page); 3143 } else { 3144 if (SlabDebug(page)) 3145 printk(KERN_ERR "SLUB %s: SlabDebug set on " 3146 "slab 0x%p\n", s->name, page); 3147 } 3148} 3149 3150static int validate_slab_node(struct kmem_cache *s, 3151 struct kmem_cache_node *n, unsigned long *map) 3152{ 3153 unsigned long count = 0; 3154 struct page *page; 3155 unsigned long flags; 3156 3157 spin_lock_irqsave(&n->list_lock, flags); 3158 3159 
list_for_each_entry(page, &n->partial, lru) { 3160 validate_slab_slab(s, page, map); 3161 count++; 3162 } 3163 if (count != n->nr_partial) 3164 printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " 3165 "counter=%ld\n", s->name, count, n->nr_partial); 3166 3167 if (!(s->flags & SLAB_STORE_USER)) 3168 goto out; 3169 3170 list_for_each_entry(page, &n->full, lru) { 3171 validate_slab_slab(s, page, map); 3172 count++; 3173 } 3174 if (count != atomic_long_read(&n->nr_slabs)) 3175 printk(KERN_ERR "SLUB: %s %ld slabs counted but " 3176 "counter=%ld\n", s->name, count, 3177 atomic_long_read(&n->nr_slabs)); 3178 3179out: 3180 spin_unlock_irqrestore(&n->list_lock, flags); 3181 return count; 3182} 3183 3184static long validate_slab_cache(struct kmem_cache *s) 3185{ 3186 int node; 3187 unsigned long count = 0; 3188 unsigned long *map = kmalloc(BITS_TO_LONGS(s->objects) * 3189 sizeof(unsigned long), GFP_KERNEL); 3190 3191 if (!map) 3192 return -ENOMEM; 3193 3194 flush_all(s); 3195 for_each_node_state(node, N_NORMAL_MEMORY) { 3196 struct kmem_cache_node *n = get_node(s, node); 3197 3198 count += validate_slab_node(s, n, map); 3199 } 3200 kfree(map); 3201 return count; 3202} 3203 3204#ifdef SLUB_RESILIENCY_TEST 3205static void resiliency_test(void) 3206{ 3207 u8 *p; 3208 3209 printk(KERN_ERR "SLUB resiliency testing\n"); 3210 printk(KERN_ERR "-----------------------\n"); 3211 printk(KERN_ERR "A. Corruption after allocation\n"); 3212 3213 p = kzalloc(16, GFP_KERNEL); 3214 p[16] = 0x12; 3215 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" 3216 " 0x12->0x%p\n\n", p + 16); 3217 3218 validate_slab_cache(kmalloc_caches + 4); 3219 3220 /* Hmmm... The next two are dangerous */ 3221 p = kzalloc(32, GFP_KERNEL); 3222 p[32 + sizeof(void *)] = 0x34; 3223 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" 3224 " 0x34 -> -0x%p\n", p); 3225 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); 3226 3227 validate_slab_cache(kmalloc_caches + 5); 3228 p = kzalloc(64, GFP_KERNEL); 3229 p += 64 + (get_cycles() & 0xff) * sizeof(void *); 3230 *p = 0x56; 3231 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", 3232 p); 3233 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); 3234 validate_slab_cache(kmalloc_caches + 6); 3235 3236 printk(KERN_ERR "\nB. Corruption after free\n"); 3237 p = kzalloc(128, GFP_KERNEL); 3238 kfree(p); 3239 *p = 0x78; 3240 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); 3241 validate_slab_cache(kmalloc_caches + 7); 3242 3243 p = kzalloc(256, GFP_KERNEL); 3244 kfree(p); 3245 p[50] = 0x9a; 3246 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); 3247 validate_slab_cache(kmalloc_caches + 8); 3248 3249 p = kzalloc(512, GFP_KERNEL); 3250 kfree(p); 3251 p[512] = 0xab; 3252 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); 3253 validate_slab_cache(kmalloc_caches + 9); 3254} 3255#else 3256static void resiliency_test(void) {}; 3257#endif 3258 3259/* 3260 * Generate lists of code addresses where slabcache objects are allocated 3261 * and freed. 
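 *
 * The result is exposed through the alloc_calls and free_calls sysfs
 * files; each line that list_locations() emits below looks roughly
 * like this (values illustrative):
 *
 *     137 do_sys_open+0x8b/0x1a0 age=4/630/1200 pid=1-213 cpus=0-3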
3262 */ 3263 3264struct location { 3265 unsigned long count; 3266 void *addr; 3267 long long sum_time; 3268 long min_time; 3269 long max_time; 3270 long min_pid; 3271 long max_pid; 3272 cpumask_t cpus; 3273 nodemask_t nodes; 3274}; 3275 3276struct loc_track { 3277 unsigned long max; 3278 unsigned long count; 3279 struct location *loc; 3280}; 3281 3282static void free_loc_track(struct loc_track *t) 3283{ 3284 if (t->max) 3285 free_pages((unsigned long)t->loc, 3286 get_order(sizeof(struct location) * t->max)); 3287} 3288 3289static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) 3290{ 3291 struct location *l; 3292 int order; 3293 3294 order = get_order(sizeof(struct location) * max); 3295 3296 l = (void *)__get_free_pages(flags, order); 3297 if (!l) 3298 return 0; 3299 3300 if (t->count) { 3301 memcpy(l, t->loc, sizeof(struct location) * t->count); 3302 free_loc_track(t); 3303 } 3304 t->max = max; 3305 t->loc = l; 3306 return 1; 3307} 3308 3309static int add_location(struct loc_track *t, struct kmem_cache *s, 3310 const struct track *track) 3311{ 3312 long start, end, pos; 3313 struct location *l; 3314 void *caddr; 3315 unsigned long age = jiffies - track->when; 3316 3317 start = -1; 3318 end = t->count; 3319 3320 for ( ; ; ) { 3321 pos = start + (end - start + 1) / 2; 3322 3323 /* 3324 * There is nothing at "end". If we end up there 3325 * we need to add something to before end. 3326 */ 3327 if (pos == end) 3328 break; 3329 3330 caddr = t->loc[pos].addr; 3331 if (track->addr == caddr) { 3332 3333 l = &t->loc[pos]; 3334 l->count++; 3335 if (track->when) { 3336 l->sum_time += age; 3337 if (age < l->min_time) 3338 l->min_time = age; 3339 if (age > l->max_time) 3340 l->max_time = age; 3341 3342 if (track->pid < l->min_pid) 3343 l->min_pid = track->pid; 3344 if (track->pid > l->max_pid) 3345 l->max_pid = track->pid; 3346 3347 cpu_set(track->cpu, l->cpus); 3348 } 3349 node_set(page_to_nid(virt_to_page(track)), l->nodes); 3350 return 1; 3351 } 3352 3353 if (track->addr < caddr) 3354 end = pos; 3355 else 3356 start = pos; 3357 } 3358 3359 /* 3360 * Not found. Insert new tracking element. 
3361 */ 3362 if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC)) 3363 return 0; 3364 3365 l = t->loc + pos; 3366 if (pos < t->count) 3367 memmove(l + 1, l, 3368 (t->count - pos) * sizeof(struct location)); 3369 t->count++; 3370 l->count = 1; 3371 l->addr = track->addr; 3372 l->sum_time = age; 3373 l->min_time = age; 3374 l->max_time = age; 3375 l->min_pid = track->pid; 3376 l->max_pid = track->pid; 3377 cpus_clear(l->cpus); 3378 cpu_set(track->cpu, l->cpus); 3379 nodes_clear(l->nodes); 3380 node_set(page_to_nid(virt_to_page(track)), l->nodes); 3381 return 1; 3382} 3383 3384static void process_slab(struct loc_track *t, struct kmem_cache *s, 3385 struct page *page, enum track_item alloc) 3386{ 3387 void *addr = page_address(page); 3388 DECLARE_BITMAP(map, s->objects); 3389 void *p; 3390 3391 bitmap_zero(map, s->objects); 3392 for_each_free_object(p, s, page->freelist) 3393 set_bit(slab_index(p, s, addr), map); 3394 3395 for_each_object(p, s, addr) 3396 if (!test_bit(slab_index(p, s, addr), map)) 3397 add_location(t, s, get_track(s, p, alloc)); 3398} 3399 3400static int list_locations(struct kmem_cache *s, char *buf, 3401 enum track_item alloc) 3402{ 3403 int len = 0; 3404 unsigned long i; 3405 struct loc_track t = { 0, 0, NULL }; 3406 int node; 3407 3408 if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), 3409 GFP_TEMPORARY)) 3410 return sprintf(buf, "Out of memory\n"); 3411 3412 /* Push back cpu slabs */ 3413 flush_all(s); 3414 3415 for_each_node_state(node, N_NORMAL_MEMORY) { 3416 struct kmem_cache_node *n = get_node(s, node); 3417 unsigned long flags; 3418 struct page *page; 3419 3420 if (!atomic_long_read(&n->nr_slabs)) 3421 continue; 3422 3423 spin_lock_irqsave(&n->list_lock, flags); 3424 list_for_each_entry(page, &n->partial, lru) 3425 process_slab(&t, s, page, alloc); 3426 list_for_each_entry(page, &n->full, lru) 3427 process_slab(&t, s, page, alloc); 3428 spin_unlock_irqrestore(&n->list_lock, flags); 3429 } 3430 3431 for (i = 0; i < t.count; i++) { 3432 struct location *l = &t.loc[i]; 3433 3434 if (len > PAGE_SIZE - 100) 3435 break; 3436 len += sprintf(buf + len, "%7ld ", l->count); 3437 3438 if (l->addr) 3439 len += sprint_symbol(buf + len, (unsigned long)l->addr); 3440 else 3441 len += sprintf(buf + len, "<not-available>"); 3442 3443 if (l->sum_time != l->min_time) { 3444 unsigned long remainder; 3445 3446 len += sprintf(buf + len, " age=%ld/%ld/%ld", 3447 l->min_time, 3448 div_long_long_rem(l->sum_time, l->count, &remainder), 3449 l->max_time); 3450 } else 3451 len += sprintf(buf + len, " age=%ld", 3452 l->min_time); 3453 3454 if (l->min_pid != l->max_pid) 3455 len += sprintf(buf + len, " pid=%ld-%ld", 3456 l->min_pid, l->max_pid); 3457 else 3458 len += sprintf(buf + len, " pid=%ld", 3459 l->min_pid); 3460 3461 if (num_online_cpus() > 1 && !cpus_empty(l->cpus) && 3462 len < PAGE_SIZE - 60) { 3463 len += sprintf(buf + len, " cpus="); 3464 len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50, 3465 l->cpus); 3466 } 3467 3468 if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && 3469 len < PAGE_SIZE - 60) { 3470 len += sprintf(buf + len, " nodes="); 3471 len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, 3472 l->nodes); 3473 } 3474 3475 len += sprintf(buf + len, "\n"); 3476 } 3477 3478 free_loc_track(&t); 3479 if (!t.count) 3480 len += sprintf(buf, "No data\n"); 3481 return len; 3482} 3483 3484enum slab_stat_type { 3485 SL_FULL, 3486 SL_PARTIAL, 3487 SL_CPU, 3488 SL_OBJECTS 3489}; 3490 3491#define SO_FULL (1 << SL_FULL) 3492#define SO_PARTIAL 
(1 << SL_PARTIAL) 3493#define SO_CPU (1 << SL_CPU) 3494#define SO_OBJECTS (1 << SL_OBJECTS) 3495 3496static unsigned long slab_objects(struct kmem_cache *s, 3497 char *buf, unsigned long flags) 3498{ 3499 unsigned long total = 0; 3500 int cpu; 3501 int node; 3502 int x; 3503 unsigned long *nodes; 3504 unsigned long *per_cpu; 3505 3506 nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); 3507 per_cpu = nodes + nr_node_ids; 3508 3509 for_each_possible_cpu(cpu) { 3510 struct page *page; 3511 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); 3512 3513 if (!c) 3514 continue; 3515 3516 page = c->page; 3517 node = c->node; 3518 if (node < 0) 3519 continue; 3520 if (page) { 3521 if (flags & SO_CPU) { 3522 if (flags & SO_OBJECTS) 3523 x = page->inuse; 3524 else 3525 x = 1; 3526 total += x; 3527 nodes[node] += x; 3528 } 3529 per_cpu[node]++; 3530 } 3531 } 3532 3533 for_each_node_state(node, N_NORMAL_MEMORY) { 3534 struct kmem_cache_node *n = get_node(s, node); 3535 3536 if (flags & SO_PARTIAL) { 3537 if (flags & SO_OBJECTS) 3538 x = count_partial(n); 3539 else 3540 x = n->nr_partial; 3541 total += x; 3542 nodes[node] += x; 3543 } 3544 3545 if (flags & SO_FULL) { 3546 int full_slabs = atomic_long_read(&n->nr_slabs) 3547 - per_cpu[node] 3548 - n->nr_partial; 3549 3550 if (flags & SO_OBJECTS) 3551 x = full_slabs * s->objects; 3552 else 3553 x = full_slabs; 3554 total += x; 3555 nodes[node] += x; 3556 } 3557 } 3558 3559 x = sprintf(buf, "%lu", total); 3560#ifdef CONFIG_NUMA 3561 for_each_node_state(node, N_NORMAL_MEMORY) 3562 if (nodes[node]) 3563 x += sprintf(buf + x, " N%d=%lu", 3564 node, nodes[node]); 3565#endif 3566 kfree(nodes); 3567 return x + sprintf(buf + x, "\n"); 3568} 3569 3570static int any_slab_objects(struct kmem_cache *s) 3571{ 3572 int node; 3573 int cpu; 3574 3575 for_each_possible_cpu(cpu) { 3576 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); 3577 3578 if (c && c->page) 3579 return 1; 3580 } 3581 3582 for_each_online_node(node) { 3583 struct kmem_cache_node *n = get_node(s, node); 3584 3585 if (!n) 3586 continue; 3587 3588 if (n->nr_partial || atomic_long_read(&n->nr_slabs)) 3589 return 1; 3590 } 3591 return 0; 3592} 3593 3594#define to_slab_attr(n) container_of(n, struct slab_attribute, attr) 3595#define to_slab(n) container_of(n, struct kmem_cache, kobj); 3596 3597struct slab_attribute { 3598 struct attribute attr; 3599 ssize_t (*show)(struct kmem_cache *s, char *buf); 3600 ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count); 3601}; 3602 3603#define SLAB_ATTR_RO(_name) \ 3604 static struct slab_attribute _name##_attr = __ATTR_RO(_name) 3605 3606#define SLAB_ATTR(_name) \ 3607 static struct slab_attribute _name##_attr = \ 3608 __ATTR(_name, 0644, _name##_show, _name##_store) 3609 3610static ssize_t slab_size_show(struct kmem_cache *s, char *buf) 3611{ 3612 return sprintf(buf, "%d\n", s->size); 3613} 3614SLAB_ATTR_RO(slab_size); 3615 3616static ssize_t align_show(struct kmem_cache *s, char *buf) 3617{ 3618 return sprintf(buf, "%d\n", s->align); 3619} 3620SLAB_ATTR_RO(align); 3621 3622static ssize_t object_size_show(struct kmem_cache *s, char *buf) 3623{ 3624 return sprintf(buf, "%d\n", s->objsize); 3625} 3626SLAB_ATTR_RO(object_size); 3627 3628static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) 3629{ 3630 return sprintf(buf, "%d\n", s->objects); 3631} 3632SLAB_ATTR_RO(objs_per_slab); 3633 3634static ssize_t order_show(struct kmem_cache *s, char *buf) 3635{ 3636 return sprintf(buf, "%d\n", s->order); 3637} 3638SLAB_ATTR_RO(order); 3639 
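/*
 * Every SLAB_ATTR/SLAB_ATTR_RO in this file becomes a file underneath
 * /sys/kernel/slab/<cache>/ once sysfs_slab_add() has run.  An
 * illustrative session:
 *
 *	# cat /sys/kernel/slab/kmalloc-192/order
 *	0
 *	# echo 1 > /sys/kernel/slab/kmalloc-192/trace
 */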
3640static ssize_t ctor_show(struct kmem_cache *s, char *buf) 3641{ 3642 if (s->ctor) { 3643 int n = sprint_symbol(buf, (unsigned long)s->ctor); 3644 3645 return n + sprintf(buf + n, "\n"); 3646 } 3647 return 0; 3648} 3649SLAB_ATTR_RO(ctor); 3650 3651static ssize_t aliases_show(struct kmem_cache *s, char *buf) 3652{ 3653 return sprintf(buf, "%d\n", s->refcount - 1); 3654} 3655SLAB_ATTR_RO(aliases); 3656 3657static ssize_t slabs_show(struct kmem_cache *s, char *buf) 3658{ 3659 return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU); 3660} 3661SLAB_ATTR_RO(slabs); 3662 3663static ssize_t partial_show(struct kmem_cache *s, char *buf) 3664{ 3665 return slab_objects(s, buf, SO_PARTIAL); 3666} 3667SLAB_ATTR_RO(partial); 3668 3669static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf) 3670{ 3671 return slab_objects(s, buf, SO_CPU); 3672} 3673SLAB_ATTR_RO(cpu_slabs); 3674 3675static ssize_t objects_show(struct kmem_cache *s, char *buf) 3676{ 3677 return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS); 3678} 3679SLAB_ATTR_RO(objects); 3680 3681static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) 3682{ 3683 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); 3684} 3685 3686static ssize_t sanity_checks_store(struct kmem_cache *s, 3687 const char *buf, size_t length) 3688{ 3689 s->flags &= ~SLAB_DEBUG_FREE; 3690 if (buf[0] == '1') 3691 s->flags |= SLAB_DEBUG_FREE; 3692 return length; 3693} 3694SLAB_ATTR(sanity_checks); 3695 3696static ssize_t trace_show(struct kmem_cache *s, char *buf) 3697{ 3698 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); 3699} 3700 3701static ssize_t trace_store(struct kmem_cache *s, const char *buf, 3702 size_t length) 3703{ 3704 s->flags &= ~SLAB_TRACE; 3705 if (buf[0] == '1') 3706 s->flags |= SLAB_TRACE; 3707 return length; 3708} 3709SLAB_ATTR(trace); 3710 3711static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 3712{ 3713 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 3714} 3715 3716static ssize_t reclaim_account_store(struct kmem_cache *s, 3717 const char *buf, size_t length) 3718{ 3719 s->flags &= ~SLAB_RECLAIM_ACCOUNT; 3720 if (buf[0] == '1') 3721 s->flags |= SLAB_RECLAIM_ACCOUNT; 3722 return length; 3723} 3724SLAB_ATTR(reclaim_account); 3725 3726static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) 3727{ 3728 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); 3729} 3730SLAB_ATTR_RO(hwcache_align); 3731 3732#ifdef CONFIG_ZONE_DMA 3733static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) 3734{ 3735 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); 3736} 3737SLAB_ATTR_RO(cache_dma); 3738#endif 3739 3740static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) 3741{ 3742 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); 3743} 3744SLAB_ATTR_RO(destroy_by_rcu); 3745 3746static ssize_t red_zone_show(struct kmem_cache *s, char *buf) 3747{ 3748 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); 3749} 3750 3751static ssize_t red_zone_store(struct kmem_cache *s, 3752 const char *buf, size_t length) 3753{ 3754 if (any_slab_objects(s)) 3755 return -EBUSY; 3756 3757 s->flags &= ~SLAB_RED_ZONE; 3758 if (buf[0] == '1') 3759 s->flags |= SLAB_RED_ZONE; 3760 calculate_sizes(s); 3761 return length; 3762} 3763SLAB_ATTR(red_zone); 3764 3765static ssize_t poison_show(struct kmem_cache *s, char *buf) 3766{ 3767 return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON)); 3768} 3769 3770static ssize_t poison_store(struct kmem_cache *s, 3771 
const char *buf, size_t length) 3772{ 3773 if (any_slab_objects(s)) 3774 return -EBUSY; 3775 3776 s->flags &= ~SLAB_POISON; 3777 if (buf[0] == '1') 3778 s->flags |= SLAB_POISON; 3779 calculate_sizes(s); 3780 return length; 3781} 3782SLAB_ATTR(poison); 3783 3784static ssize_t store_user_show(struct kmem_cache *s, char *buf) 3785{ 3786 return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); 3787} 3788 3789static ssize_t store_user_store(struct kmem_cache *s, 3790 const char *buf, size_t length) 3791{ 3792 if (any_slab_objects(s)) 3793 return -EBUSY; 3794 3795 s->flags &= ~SLAB_STORE_USER; 3796 if (buf[0] == '1') 3797 s->flags |= SLAB_STORE_USER; 3798 calculate_sizes(s); 3799 return length; 3800} 3801SLAB_ATTR(store_user); 3802 3803static ssize_t validate_show(struct kmem_cache *s, char *buf) 3804{ 3805 return 0; 3806} 3807 3808static ssize_t validate_store(struct kmem_cache *s, 3809 const char *buf, size_t length) 3810{ 3811 int ret = -EINVAL; 3812 3813 if (buf[0] == '1') { 3814 ret = validate_slab_cache(s); 3815 if (ret >= 0) 3816 ret = length; 3817 } 3818 return ret; 3819} 3820SLAB_ATTR(validate); 3821 3822static ssize_t shrink_show(struct kmem_cache *s, char *buf) 3823{ 3824 return 0; 3825} 3826 3827static ssize_t shrink_store(struct kmem_cache *s, 3828 const char *buf, size_t length) 3829{ 3830 if (buf[0] == '1') { 3831 int rc = kmem_cache_shrink(s); 3832 3833 if (rc) 3834 return rc; 3835 } else 3836 return -EINVAL; 3837 return length; 3838} 3839SLAB_ATTR(shrink); 3840 3841static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) 3842{ 3843 if (!(s->flags & SLAB_STORE_USER)) 3844 return -ENOSYS; 3845 return list_locations(s, buf, TRACK_ALLOC); 3846} 3847SLAB_ATTR_RO(alloc_calls); 3848 3849static ssize_t free_calls_show(struct kmem_cache *s, char *buf) 3850{ 3851 if (!(s->flags & SLAB_STORE_USER)) 3852 return -ENOSYS; 3853 return list_locations(s, buf, TRACK_FREE); 3854} 3855SLAB_ATTR_RO(free_calls); 3856 3857#ifdef CONFIG_NUMA 3858static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) 3859{ 3860 return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10); 3861} 3862 3863static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, 3864 const char *buf, size_t length) 3865{ 3866 int n = simple_strtoul(buf, NULL, 10); 3867 3868 if (n < 100) 3869 s->remote_node_defrag_ratio = n * 10; 3870 return length; 3871} 3872SLAB_ATTR(remote_node_defrag_ratio); 3873#endif 3874 3875static struct attribute *slab_attrs[] = { 3876 &slab_size_attr.attr, 3877 &object_size_attr.attr, 3878 &objs_per_slab_attr.attr, 3879 &order_attr.attr, 3880 &objects_attr.attr, 3881 &slabs_attr.attr, 3882 &partial_attr.attr, 3883 &cpu_slabs_attr.attr, 3884 &ctor_attr.attr, 3885 &aliases_attr.attr, 3886 &align_attr.attr, 3887 &sanity_checks_attr.attr, 3888 &trace_attr.attr, 3889 &hwcache_align_attr.attr, 3890 &reclaim_account_attr.attr, 3891 &destroy_by_rcu_attr.attr, 3892 &red_zone_attr.attr, 3893 &poison_attr.attr, 3894 &store_user_attr.attr, 3895 &validate_attr.attr, 3896 &shrink_attr.attr, 3897 &alloc_calls_attr.attr, 3898 &free_calls_attr.attr, 3899#ifdef CONFIG_ZONE_DMA 3900 &cache_dma_attr.attr, 3901#endif 3902#ifdef CONFIG_NUMA 3903 &remote_node_defrag_ratio_attr.attr, 3904#endif 3905 NULL 3906}; 3907 3908static struct attribute_group slab_attr_group = { 3909 .attrs = slab_attrs, 3910}; 3911 3912static ssize_t slab_attr_show(struct kobject *kobj, 3913 struct attribute *attr, 3914 char *buf) 3915{ 3916 struct slab_attribute *attribute; 3917 struct kmem_cache *s; 3918 int err; 
3919 3920 attribute = to_slab_attr(attr); 3921 s = to_slab(kobj); 3922 3923 if (!attribute->show) 3924 return -EIO; 3925 3926 err = attribute->show(s, buf); 3927 3928 return err; 3929} 3930 3931static ssize_t slab_attr_store(struct kobject *kobj, 3932 struct attribute *attr, 3933 const char *buf, size_t len) 3934{ 3935 struct slab_attribute *attribute; 3936 struct kmem_cache *s; 3937 int err; 3938 3939 attribute = to_slab_attr(attr); 3940 s = to_slab(kobj); 3941 3942 if (!attribute->store) 3943 return -EIO; 3944 3945 err = attribute->store(s, buf, len); 3946 3947 return err; 3948} 3949 3950static void kmem_cache_release(struct kobject *kobj) 3951{ 3952 struct kmem_cache *s = to_slab(kobj); 3953 3954 kfree(s); 3955} 3956 3957static struct sysfs_ops slab_sysfs_ops = { 3958 .show = slab_attr_show, 3959 .store = slab_attr_store, 3960}; 3961 3962static struct kobj_type slab_ktype = { 3963 .sysfs_ops = &slab_sysfs_ops, 3964 .release = kmem_cache_release 3965}; 3966 3967static int uevent_filter(struct kset *kset, struct kobject *kobj) 3968{ 3969 struct kobj_type *ktype = get_ktype(kobj); 3970 3971 if (ktype == &slab_ktype) 3972 return 1; 3973 return 0; 3974} 3975 3976static struct kset_uevent_ops slab_uevent_ops = { 3977 .filter = uevent_filter, 3978}; 3979 3980static struct kset *slab_kset; 3981 3982#define ID_STR_LENGTH 64 3983 3984/* Create a unique string id for a slab cache: 3985 * format 3986 * :[flags-]size:[memory address of kmemcache] 3987 */ 3988static char *create_unique_id(struct kmem_cache *s) 3989{ 3990 char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); 3991 char *p = name; 3992 3993 BUG_ON(!name); 3994 3995 *p++ = ':'; 3996 /* 3997 * First flags affecting slabcache operations. We will only 3998 * get here for aliasable slabs so we do not need to support 3999 * too many flags. The flags here must cover all flags that 4000 * are matched during merging to guarantee that the id is 4001 * unique. 4002 */ 4003 if (s->flags & SLAB_CACHE_DMA) 4004 *p++ = 'd'; 4005 if (s->flags & SLAB_RECLAIM_ACCOUNT) 4006 *p++ = 'a'; 4007 if (s->flags & SLAB_DEBUG_FREE) 4008 *p++ = 'F'; 4009 if (p != name + 1) 4010 *p++ = '-'; 4011 p += sprintf(p, "%07d", s->size); 4012 BUG_ON(p > name + ID_STR_LENGTH - 1); 4013 return name; 4014} 4015 4016static int sysfs_slab_add(struct kmem_cache *s) 4017{ 4018 int err; 4019 const char *name; 4020 int unmergeable; 4021 4022 if (slab_state < SYSFS) 4023 /* Defer until later */ 4024 return 0; 4025 4026 unmergeable = slab_unmergeable(s); 4027 if (unmergeable) { 4028 /* 4029 * Slabcache can never be merged so we can use the name proper. 4030 * This is typically the case for debug situations. In that 4031 * case we can catch duplicate names easily. 4032 */ 4033 sysfs_remove_link(&slab_kset->kobj, s->name); 4034 name = s->name; 4035 } else { 4036 /* 4037 * Create a unique name for the slab as a target 4038 * for the symlinks. 
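 *
 * create_unique_id() above encodes the merge relevant flags and the
 * object size, so a mergeable 192 byte DMA cache, for example, gets a
 * directory named ":d-0000192" while the requested cache name becomes
 * a symlink pointing at it.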
4039 */ 4040 name = create_unique_id(s); 4041 } 4042 4043 s->kobj.kset = slab_kset; 4044 err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name); 4045 if (err) { 4046 kobject_put(&s->kobj); 4047 return err; 4048 } 4049 4050 err = sysfs_create_group(&s->kobj, &slab_attr_group); 4051 if (err) 4052 return err; 4053 kobject_uevent(&s->kobj, KOBJ_ADD); 4054 if (!unmergeable) { 4055 /* Setup first alias */ 4056 sysfs_slab_alias(s, s->name); 4057 kfree(name); 4058 } 4059 return 0; 4060} 4061 4062static void sysfs_slab_remove(struct kmem_cache *s) 4063{ 4064 kobject_uevent(&s->kobj, KOBJ_REMOVE); 4065 kobject_del(&s->kobj); 4066 kobject_put(&s->kobj); 4067} 4068 4069/* 4070 * Need to buffer aliases during bootup until sysfs becomes 4071 * available lest we loose that information. 4072 */ 4073struct saved_alias { 4074 struct kmem_cache *s; 4075 const char *name; 4076 struct saved_alias *next; 4077}; 4078 4079static struct saved_alias *alias_list; 4080 4081static int sysfs_slab_alias(struct kmem_cache *s, const char *name) 4082{ 4083 struct saved_alias *al; 4084 4085 if (slab_state == SYSFS) { 4086 /* 4087 * If we have a leftover link then remove it. 4088 */ 4089 sysfs_remove_link(&slab_kset->kobj, name); 4090 return sysfs_create_link(&slab_kset->kobj, &s->kobj, name); 4091 } 4092 4093 al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL); 4094 if (!al) 4095 return -ENOMEM; 4096 4097 al->s = s; 4098 al->name = name; 4099 al->next = alias_list; 4100 alias_list = al; 4101 return 0; 4102} 4103 4104static int __init slab_sysfs_init(void) 4105{ 4106 struct kmem_cache *s; 4107 int err; 4108 4109 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); 4110 if (!slab_kset) { 4111 printk(KERN_ERR "Cannot register slab subsystem.\n"); 4112 return -ENOSYS; 4113 } 4114 4115 slab_state = SYSFS; 4116 4117 list_for_each_entry(s, &slab_caches, list) { 4118 err = sysfs_slab_add(s); 4119 if (err) 4120 printk(KERN_ERR "SLUB: Unable to add boot slab %s" 4121 " to sysfs\n", s->name); 4122 } 4123 4124 while (alias_list) { 4125 struct saved_alias *al = alias_list; 4126 4127 alias_list = alias_list->next; 4128 err = sysfs_slab_alias(al->s, al->name); 4129 if (err) 4130 printk(KERN_ERR "SLUB: Unable to add boot slab alias" 4131 " %s to sysfs\n", s->name); 4132 kfree(al); 4133 } 4134 4135 resiliency_test(); 4136 return 0; 4137} 4138 4139__initcall(slab_sysfs_init); 4140#endif 4141 4142/* 4143 * The /proc/slabinfo ABI 4144 */ 4145#ifdef CONFIG_SLABINFO 4146 4147ssize_t slabinfo_write(struct file *file, const char __user * buffer, 4148 size_t count, loff_t *ppos) 4149{ 4150 return -EINVAL; 4151} 4152 4153 4154static void print_slabinfo_header(struct seq_file *m) 4155{ 4156 seq_puts(m, "slabinfo - version: 2.1\n"); 4157 seq_puts(m, "# name <active_objs> <num_objs> <objsize> " 4158 "<objperslab> <pagesperslab>"); 4159 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); 4160 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 4161 seq_putc(m, '\n'); 4162} 4163 4164static void *s_start(struct seq_file *m, loff_t *pos) 4165{ 4166 loff_t n = *pos; 4167 4168 down_read(&slub_lock); 4169 if (!n) 4170 print_slabinfo_header(m); 4171 4172 return seq_list_start(&slab_caches, *pos); 4173} 4174 4175static void *s_next(struct seq_file *m, void *p, loff_t *pos) 4176{ 4177 return seq_list_next(p, &slab_caches, pos); 4178} 4179 4180static void s_stop(struct seq_file *m, void *p) 4181{ 4182 up_read(&slub_lock); 4183} 4184 4185static int s_show(struct seq_file *m, void *p) 4186{ 4187 unsigned long 
nr_partials = 0; 4188 unsigned long nr_slabs = 0; 4189 unsigned long nr_inuse = 0; 4190 unsigned long nr_objs; 4191 struct kmem_cache *s; 4192 int node; 4193 4194 s = list_entry(p, struct kmem_cache, list); 4195 4196 for_each_online_node(node) { 4197 struct kmem_cache_node *n = get_node(s, node); 4198 4199 if (!n) 4200 continue; 4201 4202 nr_partials += n->nr_partial; 4203 nr_slabs += atomic_long_read(&n->nr_slabs); 4204 nr_inuse += count_partial(n); 4205 } 4206 4207 nr_objs = nr_slabs * s->objects; 4208 nr_inuse += (nr_slabs - nr_partials) * s->objects; 4209 4210 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse, 4211 nr_objs, s->size, s->objects, (1 << s->order)); 4212 seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0); 4213 seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs, 4214 0UL); 4215 seq_putc(m, '\n'); 4216 return 0; 4217} 4218 4219const struct seq_operations slabinfo_op = { 4220 .start = s_start, 4221 .next = s_next, 4222 .stop = s_stop, 4223 .show = s_show, 4224}; 4225 4226#endif /* CONFIG_SLABINFO */ 4227
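
/*
 * For reference, a line produced by s_show() above looks like this
 * (illustrative numbers for a 192 byte cache on a 4K page machine):
 *
 * kmalloc-192        2041   2184    192   21    1 : tunables    0    0    0 : slabdata    104    104      0
 */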