slub.c revision 4f104934591ed98534b3a4c3d17d972b790e9c42
1/* 2 * SLUB: A slab allocator that limits cache line use instead of queuing 3 * objects in per cpu and per node lists. 4 * 5 * The allocator synchronizes using per slab locks and only 6 * uses a centralized lock to manage a pool of partial slabs. 7 * 8 * (C) 2007 SGI, Christoph Lameter <clameter@sgi.com> 9 */ 10 11#include <linux/mm.h> 12#include <linux/module.h> 13#include <linux/bit_spinlock.h> 14#include <linux/interrupt.h> 15#include <linux/bitops.h> 16#include <linux/slab.h> 17#include <linux/seq_file.h> 18#include <linux/cpu.h> 19#include <linux/cpuset.h> 20#include <linux/mempolicy.h> 21#include <linux/ctype.h> 22#include <linux/kallsyms.h> 23 24/* 25 * Lock order: 26 * 1. slab_lock(page) 27 * 2. slab->list_lock 28 * 29 * The slab_lock protects operations on the object of a particular 30 * slab and its metadata in the page struct. If the slab lock 31 * has been taken then no allocations nor frees can be performed 32 * on the objects in the slab nor can the slab be added or removed 33 * from the partial or full lists since this would mean modifying 34 * the page_struct of the slab. 35 * 36 * The list_lock protects the partial and full list on each node and 37 * the partial slab counter. If taken then no new slabs may be added or 38 * removed from the lists nor make the number of partial slabs be modified. 39 * (Note that the total number of slabs is an atomic value that may be 40 * modified without taking the list lock). 41 * 42 * The list_lock is a centralized lock and thus we avoid taking it as 43 * much as possible. As long as SLUB does not have to handle partial 44 * slabs, operations can continue without any centralized lock. F.e. 45 * allocating a long series of objects that fill up slabs does not require 46 * the list lock. 47 * 48 * The lock order is sometimes inverted when we are trying to get a slab 49 * off a list. We take the list_lock and then look for a page on the list 50 * to use. While we do that objects in the slabs may be freed. We can 51 * only operate on the slab if we have also taken the slab_lock. So we use 52 * a slab_trylock() on the slab. If trylock was successful then no frees 53 * can occur anymore and we can use the slab for allocations etc. If the 54 * slab_trylock() does not succeed then frees are in progress in the slab and 55 * we must stay away from it for a while since we may cause a bouncing 56 * cacheline if we try to acquire the lock. So go onto the next slab. 57 * If all pages are busy then we may allocate a new slab instead of reusing 58 * a partial slab. A new slab has noone operating on it and thus there is 59 * no danger of cacheline contention. 60 * 61 * Interrupts are disabled during allocation and deallocation in order to 62 * make the slab allocator safe to use in the context of an irq. In addition 63 * interrupts are disabled to ensure that the processor does not change 64 * while handling per_cpu slabs, due to kernel preemption. 65 * 66 * SLUB assigns one slab for allocation to each processor. 67 * Allocations only occur from these slabs called cpu slabs. 68 * 69 * Slabs with free elements are kept on a partial list. 70 * There is no list for full slabs. If an object in a full slab is 71 * freed then the slab will show up again on the partial lists. 72 * Otherwise there is no need to track full slabs unless we have to 73 * track full slabs for debugging purposes. 74 * 75 * Slabs are freed when they become empty. Teardown and setup is 76 * minimal so we rely on the page allocators per cpu caches for 77 * fast frees and allocs. 
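 *
 * For illustration, the inverted-order path described above boils down to a
 * pattern of the following shape (a condensed sketch of lock_and_del_slab()
 * and get_partial_node() further down):
 *
 *	spin_lock(&n->list_lock);
 *	list_for_each_entry(page, &n->partial, lru)
 *		if (slab_trylock(page)) {
 *			list_del(&page->lru);
 *			n->nr_partial--;
 *			break;
 *		}
 *	spin_unlock(&n->list_lock);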
78 * 79 * Overloading of page flags that are otherwise used for LRU management. 80 * 81 * PageActive The slab is used as a cpu cache. Allocations 82 * may be performed from the slab. The slab is not 83 * on any slab list and cannot be moved onto one. 84 * 85 * PageError Slab requires special handling due to debug 86 * options set. This moves slab handling out of 87 * the fast path. 88 */ 89 90/* 91 * Issues still to be resolved: 92 * 93 * - The per cpu array is updated for each new slab and and is a remote 94 * cacheline for most nodes. This could become a bouncing cacheline given 95 * enough frequent updates. There are 16 pointers in a cacheline.so at 96 * max 16 cpus could compete. Likely okay. 97 * 98 * - Support PAGE_ALLOC_DEBUG. Should be easy to do. 99 * 100 * - Variable sizing of the per node arrays 101 */ 102 103/* Enable to test recovery from slab corruption on boot */ 104#undef SLUB_RESILIENCY_TEST 105 106#if PAGE_SHIFT <= 12 107 108/* 109 * Small page size. Make sure that we do not fragment memory 110 */ 111#define DEFAULT_MAX_ORDER 1 112#define DEFAULT_MIN_OBJECTS 4 113 114#else 115 116/* 117 * Large page machines are customarily able to handle larger 118 * page orders. 119 */ 120#define DEFAULT_MAX_ORDER 2 121#define DEFAULT_MIN_OBJECTS 8 122 123#endif 124 125/* 126 * Mininum number of partial slabs. These will be left on the partial 127 * lists even if they are empty. kmem_cache_shrink may reclaim them. 128 */ 129#define MIN_PARTIAL 2 130 131/* 132 * Maximum number of desirable partial slabs. 133 * The existence of more partial slabs makes kmem_cache_shrink 134 * sort the partial list by the number of objects in the. 135 */ 136#define MAX_PARTIAL 10 137 138#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ 139 SLAB_POISON | SLAB_STORE_USER) 140/* 141 * Set of flags that will prevent slab merging 142 */ 143#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 144 SLAB_TRACE | SLAB_DESTROY_BY_RCU) 145 146#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ 147 SLAB_CACHE_DMA) 148 149#ifndef ARCH_KMALLOC_MINALIGN 150#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) 151#endif 152 153#ifndef ARCH_SLAB_MINALIGN 154#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) 155#endif 156 157/* Internal SLUB flags */ 158#define __OBJECT_POISON 0x80000000 /* Poison object */ 159 160static int kmem_size = sizeof(struct kmem_cache); 161 162#ifdef CONFIG_SMP 163static struct notifier_block slab_notifier; 164#endif 165 166static enum { 167 DOWN, /* No slab functionality available */ 168 PARTIAL, /* kmem_cache_open() works but kmalloc does not */ 169 UP, /* Everything works */ 170 SYSFS /* Sysfs up */ 171} slab_state = DOWN; 172 173/* A list of all slab caches on the system */ 174static DECLARE_RWSEM(slub_lock); 175LIST_HEAD(slab_caches); 176 177#ifdef CONFIG_SYSFS 178static int sysfs_slab_add(struct kmem_cache *); 179static int sysfs_slab_alias(struct kmem_cache *, const char *); 180static void sysfs_slab_remove(struct kmem_cache *); 181#else 182static int sysfs_slab_add(struct kmem_cache *s) { return 0; } 183static int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } 184static void sysfs_slab_remove(struct kmem_cache *s) {} 185#endif 186 187/******************************************************************** 188 * Core slab cache functions 189 *******************************************************************/ 190 191int slab_is_available(void) 192{ 193 return slab_state >= UP; 194} 195 196static inline 
struct kmem_cache_node *get_node(struct kmem_cache *s, int node) 197{ 198#ifdef CONFIG_NUMA 199 return s->node[node]; 200#else 201 return &s->local_node; 202#endif 203} 204 205/* 206 * Object debugging 207 */ 208static void print_section(char *text, u8 *addr, unsigned int length) 209{ 210 int i, offset; 211 int newline = 1; 212 char ascii[17]; 213 214 ascii[16] = 0; 215 216 for (i = 0; i < length; i++) { 217 if (newline) { 218 printk(KERN_ERR "%10s 0x%p: ", text, addr + i); 219 newline = 0; 220 } 221 printk(" %02x", addr[i]); 222 offset = i % 16; 223 ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; 224 if (offset == 15) { 225 printk(" %s\n",ascii); 226 newline = 1; 227 } 228 } 229 if (!newline) { 230 i %= 16; 231 while (i < 16) { 232 printk(" "); 233 ascii[i] = ' '; 234 i++; 235 } 236 printk(" %s\n", ascii); 237 } 238} 239 240/* 241 * Slow version of get and set free pointer. 242 * 243 * This requires touching the cache lines of kmem_cache. 244 * The offset can also be obtained from the page. In that 245 * case it is in the cacheline that we already need to touch. 246 */ 247static void *get_freepointer(struct kmem_cache *s, void *object) 248{ 249 return *(void **)(object + s->offset); 250} 251 252static void set_freepointer(struct kmem_cache *s, void *object, void *fp) 253{ 254 *(void **)(object + s->offset) = fp; 255} 256 257/* 258 * Tracking user of a slab. 259 */ 260struct track { 261 void *addr; /* Called from address */ 262 int cpu; /* Was running on cpu */ 263 int pid; /* Pid context */ 264 unsigned long when; /* When did the operation occur */ 265}; 266 267enum track_item { TRACK_ALLOC, TRACK_FREE }; 268 269static struct track *get_track(struct kmem_cache *s, void *object, 270 enum track_item alloc) 271{ 272 struct track *p; 273 274 if (s->offset) 275 p = object + s->offset + sizeof(void *); 276 else 277 p = object + s->inuse; 278 279 return p + alloc; 280} 281 282static void set_track(struct kmem_cache *s, void *object, 283 enum track_item alloc, void *addr) 284{ 285 struct track *p; 286 287 if (s->offset) 288 p = object + s->offset + sizeof(void *); 289 else 290 p = object + s->inuse; 291 292 p += alloc; 293 if (addr) { 294 p->addr = addr; 295 p->cpu = smp_processor_id(); 296 p->pid = current ? 
current->pid : -1; 297 p->when = jiffies; 298 } else 299 memset(p, 0, sizeof(struct track)); 300} 301 302static void init_tracking(struct kmem_cache *s, void *object) 303{ 304 if (s->flags & SLAB_STORE_USER) { 305 set_track(s, object, TRACK_FREE, NULL); 306 set_track(s, object, TRACK_ALLOC, NULL); 307 } 308} 309 310static void print_track(const char *s, struct track *t) 311{ 312 if (!t->addr) 313 return; 314 315 printk(KERN_ERR "%s: ", s); 316 __print_symbol("%s", (unsigned long)t->addr); 317 printk(" jiffies_ago=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid); 318} 319 320static void print_trailer(struct kmem_cache *s, u8 *p) 321{ 322 unsigned int off; /* Offset of last byte */ 323 324 if (s->flags & SLAB_RED_ZONE) 325 print_section("Redzone", p + s->objsize, 326 s->inuse - s->objsize); 327 328 printk(KERN_ERR "FreePointer 0x%p -> 0x%p\n", 329 p + s->offset, 330 get_freepointer(s, p)); 331 332 if (s->offset) 333 off = s->offset + sizeof(void *); 334 else 335 off = s->inuse; 336 337 if (s->flags & SLAB_STORE_USER) { 338 print_track("Last alloc", get_track(s, p, TRACK_ALLOC)); 339 print_track("Last free ", get_track(s, p, TRACK_FREE)); 340 off += 2 * sizeof(struct track); 341 } 342 343 if (off != s->size) 344 /* Beginning of the filler is the free pointer */ 345 print_section("Filler", p + off, s->size - off); 346} 347 348static void object_err(struct kmem_cache *s, struct page *page, 349 u8 *object, char *reason) 350{ 351 u8 *addr = page_address(page); 352 353 printk(KERN_ERR "*** SLUB %s: %s@0x%p slab 0x%p\n", 354 s->name, reason, object, page); 355 printk(KERN_ERR " offset=%tu flags=0x%04lx inuse=%u freelist=0x%p\n", 356 object - addr, page->flags, page->inuse, page->freelist); 357 if (object > addr + 16) 358 print_section("Bytes b4", object - 16, 16); 359 print_section("Object", object, min(s->objsize, 128)); 360 print_trailer(s, object); 361 dump_stack(); 362} 363 364static void slab_err(struct kmem_cache *s, struct page *page, char *reason, ...) 365{ 366 va_list args; 367 char buf[100]; 368 369 va_start(args, reason); 370 vsnprintf(buf, sizeof(buf), reason, args); 371 va_end(args); 372 printk(KERN_ERR "*** SLUB %s: %s in slab @0x%p\n", s->name, buf, 373 page); 374 dump_stack(); 375} 376 377static void init_object(struct kmem_cache *s, void *object, int active) 378{ 379 u8 *p = object; 380 381 if (s->flags & __OBJECT_POISON) { 382 memset(p, POISON_FREE, s->objsize - 1); 383 p[s->objsize -1] = POISON_END; 384 } 385 386 if (s->flags & SLAB_RED_ZONE) 387 memset(p + s->objsize, 388 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE, 389 s->inuse - s->objsize); 390} 391 392static int check_bytes(u8 *start, unsigned int value, unsigned int bytes) 393{ 394 while (bytes) { 395 if (*start != (u8)value) 396 return 0; 397 start++; 398 bytes--; 399 } 400 return 1; 401} 402 403 404static int check_valid_pointer(struct kmem_cache *s, struct page *page, 405 void *object) 406{ 407 void *base; 408 409 if (!object) 410 return 1; 411 412 base = page_address(page); 413 if (object < base || object >= base + s->objects * s->size || 414 (object - base) % s->size) { 415 return 0; 416 } 417 418 return 1; 419} 420 421/* 422 * Object layout: 423 * 424 * object address 425 * Bytes of the object to be managed. 426 * If the freepointer may overlay the object then the free 427 * pointer is the first word of the object. 428 * Poisoning uses 0x6b (POISON_FREE) and the last byte is 429 * 0xa5 (POISON_END) 430 * 431 * object + s->objsize 432 * Padding to reach word boundary. This is also used for Redzoning. 
433 * Padding is extended to word size if Redzoning is enabled 434 * and objsize == inuse. 435 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with 436 * 0xcc (RED_ACTIVE) for objects in use. 437 * 438 * object + s->inuse 439 * A. Free pointer (if we cannot overwrite object on free) 440 * B. Tracking data for SLAB_STORE_USER 441 * C. Padding to reach required alignment boundary 442 * Padding is done using 0x5a (POISON_INUSE) 443 * 444 * object + s->size 445 * 446 * If slabcaches are merged then the objsize and inuse boundaries are to 447 * be ignored. And therefore no slab options that rely on these boundaries 448 * may be used with merged slabcaches. 449 */ 450 451static void restore_bytes(struct kmem_cache *s, char *message, u8 data, 452 void *from, void *to) 453{ 454 printk(KERN_ERR "@@@ SLUB %s: Restoring %s (0x%x) from 0x%p-0x%p\n", 455 s->name, message, data, from, to - 1); 456 memset(from, data, to - from); 457} 458 459static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) 460{ 461 unsigned long off = s->inuse; /* The end of info */ 462 463 if (s->offset) 464 /* Freepointer is placed after the object. */ 465 off += sizeof(void *); 466 467 if (s->flags & SLAB_STORE_USER) 468 /* We also have user information there */ 469 off += 2 * sizeof(struct track); 470 471 if (s->size == off) 472 return 1; 473 474 if (check_bytes(p + off, POISON_INUSE, s->size - off)) 475 return 1; 476 477 object_err(s, page, p, "Object padding check fails"); 478 479 /* 480 * Restore padding 481 */ 482 restore_bytes(s, "object padding", POISON_INUSE, p + off, p + s->size); 483 return 0; 484} 485 486static int slab_pad_check(struct kmem_cache *s, struct page *page) 487{ 488 u8 *p; 489 int length, remainder; 490 491 if (!(s->flags & SLAB_POISON)) 492 return 1; 493 494 p = page_address(page); 495 length = s->objects * s->size; 496 remainder = (PAGE_SIZE << s->order) - length; 497 if (!remainder) 498 return 1; 499 500 if (!check_bytes(p + length, POISON_INUSE, remainder)) { 501 slab_err(s, page, "Padding check failed"); 502 restore_bytes(s, "slab padding", POISON_INUSE, p + length, 503 p + length + remainder); 504 return 0; 505 } 506 return 1; 507} 508 509static int check_object(struct kmem_cache *s, struct page *page, 510 void *object, int active) 511{ 512 u8 *p = object; 513 u8 *endobject = object + s->objsize; 514 515 if (s->flags & SLAB_RED_ZONE) { 516 unsigned int red = 517 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE; 518 519 if (!check_bytes(endobject, red, s->inuse - s->objsize)) { 520 object_err(s, page, object, 521 active ? "Redzone Active" : "Redzone Inactive"); 522 restore_bytes(s, "redzone", red, 523 endobject, object + s->inuse); 524 return 0; 525 } 526 } else { 527 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse && 528 !check_bytes(endobject, POISON_INUSE, 529 s->inuse - s->objsize)) { 530 object_err(s, page, p, "Alignment padding check fails"); 531 /* 532 * Fix it so that there will not be another report. 533 * 534 * Hmmm... We may be corrupting an object that now expects 535 * to be longer than allowed. 
536 */ 537 restore_bytes(s, "alignment padding", POISON_INUSE, 538 endobject, object + s->inuse); 539 } 540 } 541 542 if (s->flags & SLAB_POISON) { 543 if (!active && (s->flags & __OBJECT_POISON) && 544 (!check_bytes(p, POISON_FREE, s->objsize - 1) || 545 p[s->objsize - 1] != POISON_END)) { 546 547 object_err(s, page, p, "Poison check failed"); 548 restore_bytes(s, "Poison", POISON_FREE, 549 p, p + s->objsize -1); 550 restore_bytes(s, "Poison", POISON_END, 551 p + s->objsize - 1, p + s->objsize); 552 return 0; 553 } 554 /* 555 * check_pad_bytes cleans up on its own. 556 */ 557 check_pad_bytes(s, page, p); 558 } 559 560 if (!s->offset && active) 561 /* 562 * Object and freepointer overlap. Cannot check 563 * freepointer while object is allocated. 564 */ 565 return 1; 566 567 /* Check free pointer validity */ 568 if (!check_valid_pointer(s, page, get_freepointer(s, p))) { 569 object_err(s, page, p, "Freepointer corrupt"); 570 /* 571 * No choice but to zap it and thus loose the remainder 572 * of the free objects in this slab. May cause 573 * another error because the object count maybe 574 * wrong now. 575 */ 576 set_freepointer(s, p, NULL); 577 return 0; 578 } 579 return 1; 580} 581 582static int check_slab(struct kmem_cache *s, struct page *page) 583{ 584 VM_BUG_ON(!irqs_disabled()); 585 586 if (!PageSlab(page)) { 587 slab_err(s, page, "Not a valid slab page flags=%lx " 588 "mapping=0x%p count=%d", page->flags, page->mapping, 589 page_count(page)); 590 return 0; 591 } 592 if (page->offset * sizeof(void *) != s->offset) { 593 slab_err(s, page, "Corrupted offset %lu flags=0x%lx " 594 "mapping=0x%p count=%d", 595 (unsigned long)(page->offset * sizeof(void *)), 596 page->flags, 597 page->mapping, 598 page_count(page)); 599 return 0; 600 } 601 if (page->inuse > s->objects) { 602 slab_err(s, page, "inuse %u > max %u @0x%p flags=%lx " 603 "mapping=0x%p count=%d", 604 s->name, page->inuse, s->objects, page->flags, 605 page->mapping, page_count(page)); 606 return 0; 607 } 608 /* Slab_pad_check fixes things up after itself */ 609 slab_pad_check(s, page); 610 return 1; 611} 612 613/* 614 * Determine if a certain object on a page is on the freelist and 615 * therefore free. Must hold the slab lock for cpu slabs to 616 * guarantee that the chains are consistent. 617 */ 618static int on_freelist(struct kmem_cache *s, struct page *page, void *search) 619{ 620 int nr = 0; 621 void *fp = page->freelist; 622 void *object = NULL; 623 624 while (fp && nr <= s->objects) { 625 if (fp == search) 626 return 1; 627 if (!check_valid_pointer(s, page, fp)) { 628 if (object) { 629 object_err(s, page, object, 630 "Freechain corrupt"); 631 set_freepointer(s, object, NULL); 632 break; 633 } else { 634 slab_err(s, page, "Freepointer 0x%p corrupt", 635 fp); 636 page->freelist = NULL; 637 page->inuse = s->objects; 638 printk(KERN_ERR "@@@ SLUB %s: Freelist " 639 "cleared. Slab 0x%p\n", 640 s->name, page); 641 return 0; 642 } 643 break; 644 } 645 object = fp; 646 fp = get_freepointer(s, object); 647 nr++; 648 } 649 650 if (page->inuse != s->objects - nr) { 651 slab_err(s, page, "Wrong object count. Counter is %d but " 652 "counted were %d", s, page, page->inuse, 653 s->objects - nr); 654 page->inuse = s->objects - nr; 655 printk(KERN_ERR "@@@ SLUB %s: Object count adjusted. 
" 656 "Slab @0x%p\n", s->name, page); 657 } 658 return search == NULL; 659} 660 661/* 662 * Tracking of fully allocated slabs for debugging 663 */ 664static void add_full(struct kmem_cache_node *n, struct page *page) 665{ 666 spin_lock(&n->list_lock); 667 list_add(&page->lru, &n->full); 668 spin_unlock(&n->list_lock); 669} 670 671static void remove_full(struct kmem_cache *s, struct page *page) 672{ 673 struct kmem_cache_node *n; 674 675 if (!(s->flags & SLAB_STORE_USER)) 676 return; 677 678 n = get_node(s, page_to_nid(page)); 679 680 spin_lock(&n->list_lock); 681 list_del(&page->lru); 682 spin_unlock(&n->list_lock); 683} 684 685static int alloc_object_checks(struct kmem_cache *s, struct page *page, 686 void *object) 687{ 688 if (!check_slab(s, page)) 689 goto bad; 690 691 if (object && !on_freelist(s, page, object)) { 692 slab_err(s, page, "Object 0x%p already allocated", object); 693 goto bad; 694 } 695 696 if (!check_valid_pointer(s, page, object)) { 697 object_err(s, page, object, "Freelist Pointer check fails"); 698 goto bad; 699 } 700 701 if (!object) 702 return 1; 703 704 if (!check_object(s, page, object, 0)) 705 goto bad; 706 707 return 1; 708bad: 709 if (PageSlab(page)) { 710 /* 711 * If this is a slab page then lets do the best we can 712 * to avoid issues in the future. Marking all objects 713 * as used avoids touching the remainder. 714 */ 715 printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n", 716 s->name, page); 717 page->inuse = s->objects; 718 page->freelist = NULL; 719 /* Fix up fields that may be corrupted */ 720 page->offset = s->offset / sizeof(void *); 721 } 722 return 0; 723} 724 725static int free_object_checks(struct kmem_cache *s, struct page *page, 726 void *object) 727{ 728 if (!check_slab(s, page)) 729 goto fail; 730 731 if (!check_valid_pointer(s, page, object)) { 732 slab_err(s, page, "Invalid object pointer 0x%p", object); 733 goto fail; 734 } 735 736 if (on_freelist(s, page, object)) { 737 slab_err(s, page, "Object 0x%p already free", object); 738 goto fail; 739 } 740 741 if (!check_object(s, page, object, 1)) 742 return 0; 743 744 if (unlikely(s != page->slab)) { 745 if (!PageSlab(page)) 746 slab_err(s, page, "Attempt to free object(0x%p) " 747 "outside of slab", object); 748 else 749 if (!page->slab) { 750 printk(KERN_ERR 751 "SLUB <none>: no slab for object 0x%p.\n", 752 object); 753 dump_stack(); 754 } 755 else 756 slab_err(s, page, "object at 0x%p belongs " 757 "to slab %s", object, page->slab->name); 758 goto fail; 759 } 760 return 1; 761fail: 762 printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n", 763 s->name, page, object); 764 return 0; 765} 766 767/* 768 * Slab allocation and freeing 769 */ 770static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) 771{ 772 struct page * page; 773 int pages = 1 << s->order; 774 775 if (s->order) 776 flags |= __GFP_COMP; 777 778 if (s->flags & SLAB_CACHE_DMA) 779 flags |= SLUB_DMA; 780 781 if (node == -1) 782 page = alloc_pages(flags, s->order); 783 else 784 page = alloc_pages_node(node, flags, s->order); 785 786 if (!page) 787 return NULL; 788 789 mod_zone_page_state(page_zone(page), 790 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
791 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 792 pages); 793 794 return page; 795} 796 797static void setup_object(struct kmem_cache *s, struct page *page, 798 void *object) 799{ 800 if (PageError(page)) { 801 init_object(s, object, 0); 802 init_tracking(s, object); 803 } 804 805 if (unlikely(s->ctor)) 806 s->ctor(object, s, SLAB_CTOR_CONSTRUCTOR); 807} 808 809static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) 810{ 811 struct page *page; 812 struct kmem_cache_node *n; 813 void *start; 814 void *end; 815 void *last; 816 void *p; 817 818 if (flags & __GFP_NO_GROW) 819 return NULL; 820 821 BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK)); 822 823 if (flags & __GFP_WAIT) 824 local_irq_enable(); 825 826 page = allocate_slab(s, flags & GFP_LEVEL_MASK, node); 827 if (!page) 828 goto out; 829 830 n = get_node(s, page_to_nid(page)); 831 if (n) 832 atomic_long_inc(&n->nr_slabs); 833 page->offset = s->offset / sizeof(void *); 834 page->slab = s; 835 page->flags |= 1 << PG_slab; 836 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | 837 SLAB_STORE_USER | SLAB_TRACE)) 838 page->flags |= 1 << PG_error; 839 840 start = page_address(page); 841 end = start + s->objects * s->size; 842 843 if (unlikely(s->flags & SLAB_POISON)) 844 memset(start, POISON_INUSE, PAGE_SIZE << s->order); 845 846 last = start; 847 for (p = start + s->size; p < end; p += s->size) { 848 setup_object(s, page, last); 849 set_freepointer(s, last, p); 850 last = p; 851 } 852 setup_object(s, page, last); 853 set_freepointer(s, last, NULL); 854 855 page->freelist = start; 856 page->inuse = 0; 857out: 858 if (flags & __GFP_WAIT) 859 local_irq_disable(); 860 return page; 861} 862 863static void __free_slab(struct kmem_cache *s, struct page *page) 864{ 865 int pages = 1 << s->order; 866 867 if (unlikely(PageError(page) || s->dtor)) { 868 void *start = page_address(page); 869 void *end = start + (pages << PAGE_SHIFT); 870 void *p; 871 872 slab_pad_check(s, page); 873 for (p = start; p <= end - s->size; p += s->size) { 874 if (s->dtor) 875 s->dtor(p, s, 0); 876 check_object(s, page, p, 0); 877 } 878 } 879 880 mod_zone_page_state(page_zone(page), 881 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
882 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 883 - pages); 884 885 page->mapping = NULL; 886 __free_pages(page, s->order); 887} 888 889static void rcu_free_slab(struct rcu_head *h) 890{ 891 struct page *page; 892 893 page = container_of((struct list_head *)h, struct page, lru); 894 __free_slab(page->slab, page); 895} 896 897static void free_slab(struct kmem_cache *s, struct page *page) 898{ 899 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { 900 /* 901 * RCU free overloads the RCU head over the LRU 902 */ 903 struct rcu_head *head = (void *)&page->lru; 904 905 call_rcu(head, rcu_free_slab); 906 } else 907 __free_slab(s, page); 908} 909 910static void discard_slab(struct kmem_cache *s, struct page *page) 911{ 912 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 913 914 atomic_long_dec(&n->nr_slabs); 915 reset_page_mapcount(page); 916 page->flags &= ~(1 << PG_slab | 1 << PG_error); 917 free_slab(s, page); 918} 919 920/* 921 * Per slab locking using the pagelock 922 */ 923static __always_inline void slab_lock(struct page *page) 924{ 925 bit_spin_lock(PG_locked, &page->flags); 926} 927 928static __always_inline void slab_unlock(struct page *page) 929{ 930 bit_spin_unlock(PG_locked, &page->flags); 931} 932 933static __always_inline int slab_trylock(struct page *page) 934{ 935 int rc = 1; 936 937 rc = bit_spin_trylock(PG_locked, &page->flags); 938 return rc; 939} 940 941/* 942 * Management of partially allocated slabs 943 */ 944static void add_partial_tail(struct kmem_cache_node *n, struct page *page) 945{ 946 spin_lock(&n->list_lock); 947 n->nr_partial++; 948 list_add_tail(&page->lru, &n->partial); 949 spin_unlock(&n->list_lock); 950} 951 952static void add_partial(struct kmem_cache_node *n, struct page *page) 953{ 954 spin_lock(&n->list_lock); 955 n->nr_partial++; 956 list_add(&page->lru, &n->partial); 957 spin_unlock(&n->list_lock); 958} 959 960static void remove_partial(struct kmem_cache *s, 961 struct page *page) 962{ 963 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 964 965 spin_lock(&n->list_lock); 966 list_del(&page->lru); 967 n->nr_partial--; 968 spin_unlock(&n->list_lock); 969} 970 971/* 972 * Lock page and remove it from the partial list 973 * 974 * Must hold list_lock 975 */ 976static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page) 977{ 978 if (slab_trylock(page)) { 979 list_del(&page->lru); 980 n->nr_partial--; 981 return 1; 982 } 983 return 0; 984} 985 986/* 987 * Try to get a partial slab from a specific node 988 */ 989static struct page *get_partial_node(struct kmem_cache_node *n) 990{ 991 struct page *page; 992 993 /* 994 * Racy check. If we mistakenly see no partial slabs then we 995 * just allocate an empty slab. If we mistakenly try to get a 996 * partial slab then get_partials() will return NULL. 997 */ 998 if (!n || !n->nr_partial) 999 return NULL; 1000 1001 spin_lock(&n->list_lock); 1002 list_for_each_entry(page, &n->partial, lru) 1003 if (lock_and_del_slab(n, page)) 1004 goto out; 1005 page = NULL; 1006out: 1007 spin_unlock(&n->list_lock); 1008 return page; 1009} 1010 1011/* 1012 * Get a page from somewhere. Search in increasing NUMA 1013 * distances. 1014 */ 1015static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) 1016{ 1017#ifdef CONFIG_NUMA 1018 struct zonelist *zonelist; 1019 struct zone **z; 1020 struct page *page; 1021 1022 /* 1023 * The defrag ratio allows to configure the tradeoffs between 1024 * inter node defragmentation and node local allocations. 
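 *
 * (With the default defrag_ratio of 100 set by kmem_cache_open(), the
 * get_cycles() check below lets roughly one in ten of these allocations
 * go on to scan the partial lists of remote nodes.)
 *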
1025 * A lower defrag_ratio increases the tendency to do local 1026 * allocations instead of scanning throught the partial 1027 * lists on other nodes. 1028 * 1029 * If defrag_ratio is set to 0 then kmalloc() always 1030 * returns node local objects. If its higher then kmalloc() 1031 * may return off node objects in order to avoid fragmentation. 1032 * 1033 * A higher ratio means slabs may be taken from other nodes 1034 * thus reducing the number of partial slabs on those nodes. 1035 * 1036 * If /sys/slab/xx/defrag_ratio is set to 100 (which makes 1037 * defrag_ratio = 1000) then every (well almost) allocation 1038 * will first attempt to defrag slab caches on other nodes. This 1039 * means scanning over all nodes to look for partial slabs which 1040 * may be a bit expensive to do on every slab allocation. 1041 */ 1042 if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio) 1043 return NULL; 1044 1045 zonelist = &NODE_DATA(slab_node(current->mempolicy)) 1046 ->node_zonelists[gfp_zone(flags)]; 1047 for (z = zonelist->zones; *z; z++) { 1048 struct kmem_cache_node *n; 1049 1050 n = get_node(s, zone_to_nid(*z)); 1051 1052 if (n && cpuset_zone_allowed_hardwall(*z, flags) && 1053 n->nr_partial > MIN_PARTIAL) { 1054 page = get_partial_node(n); 1055 if (page) 1056 return page; 1057 } 1058 } 1059#endif 1060 return NULL; 1061} 1062 1063/* 1064 * Get a partial page, lock it and return it. 1065 */ 1066static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) 1067{ 1068 struct page *page; 1069 int searchnode = (node == -1) ? numa_node_id() : node; 1070 1071 page = get_partial_node(get_node(s, searchnode)); 1072 if (page || (flags & __GFP_THISNODE)) 1073 return page; 1074 1075 return get_any_partial(s, flags); 1076} 1077 1078/* 1079 * Move a page back to the lists. 1080 * 1081 * Must be called with the slab lock held. 1082 * 1083 * On exit the slab lock will have been dropped. 1084 */ 1085static void putback_slab(struct kmem_cache *s, struct page *page) 1086{ 1087 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1088 1089 if (page->inuse) { 1090 1091 if (page->freelist) 1092 add_partial(n, page); 1093 else if (PageError(page) && (s->flags & SLAB_STORE_USER)) 1094 add_full(n, page); 1095 slab_unlock(page); 1096 1097 } else { 1098 if (n->nr_partial < MIN_PARTIAL) { 1099 /* 1100 * Adding an empty page to the partial slabs in order 1101 * to avoid page allocator overhead. This page needs to 1102 * come after all the others that are not fully empty 1103 * in order to make sure that we do maximum 1104 * defragmentation. 1105 */ 1106 add_partial_tail(n, page); 1107 slab_unlock(page); 1108 } else { 1109 slab_unlock(page); 1110 discard_slab(s, page); 1111 } 1112 } 1113} 1114 1115/* 1116 * Remove the cpu slab 1117 */ 1118static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) 1119{ 1120 s->cpu_slab[cpu] = NULL; 1121 ClearPageActive(page); 1122 1123 putback_slab(s, page); 1124} 1125 1126static void flush_slab(struct kmem_cache *s, struct page *page, int cpu) 1127{ 1128 slab_lock(page); 1129 deactivate_slab(s, page, cpu); 1130} 1131 1132/* 1133 * Flush cpu slab. 1134 * Called from IPI handler with interrupts disabled. 
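 *
 * flush_all() below runs this on every processor; kmem_cache_shrink() and
 * kmem_cache_close() use it to drain the cpu slabs before they walk the
 * per node lists.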
1135 */ 1136static void __flush_cpu_slab(struct kmem_cache *s, int cpu) 1137{ 1138 struct page *page = s->cpu_slab[cpu]; 1139 1140 if (likely(page)) 1141 flush_slab(s, page, cpu); 1142} 1143 1144static void flush_cpu_slab(void *d) 1145{ 1146 struct kmem_cache *s = d; 1147 int cpu = smp_processor_id(); 1148 1149 __flush_cpu_slab(s, cpu); 1150} 1151 1152static void flush_all(struct kmem_cache *s) 1153{ 1154#ifdef CONFIG_SMP 1155 on_each_cpu(flush_cpu_slab, s, 1, 1); 1156#else 1157 unsigned long flags; 1158 1159 local_irq_save(flags); 1160 flush_cpu_slab(s); 1161 local_irq_restore(flags); 1162#endif 1163} 1164 1165/* 1166 * slab_alloc is optimized to only modify two cachelines on the fast path 1167 * (aside from the stack): 1168 * 1169 * 1. The page struct 1170 * 2. The first cacheline of the object to be allocated. 1171 * 1172 * The only cache lines that are read (apart from code) is the 1173 * per cpu array in the kmem_cache struct. 1174 * 1175 * Fastpath is not possible if we need to get a new slab or have 1176 * debugging enabled (which means all slabs are marked with PageError) 1177 */ 1178static void *slab_alloc(struct kmem_cache *s, 1179 gfp_t gfpflags, int node, void *addr) 1180{ 1181 struct page *page; 1182 void **object; 1183 unsigned long flags; 1184 int cpu; 1185 1186 local_irq_save(flags); 1187 cpu = smp_processor_id(); 1188 page = s->cpu_slab[cpu]; 1189 if (!page) 1190 goto new_slab; 1191 1192 slab_lock(page); 1193 if (unlikely(node != -1 && page_to_nid(page) != node)) 1194 goto another_slab; 1195redo: 1196 object = page->freelist; 1197 if (unlikely(!object)) 1198 goto another_slab; 1199 if (unlikely(PageError(page))) 1200 goto debug; 1201 1202have_object: 1203 page->inuse++; 1204 page->freelist = object[page->offset]; 1205 slab_unlock(page); 1206 local_irq_restore(flags); 1207 return object; 1208 1209another_slab: 1210 deactivate_slab(s, page, cpu); 1211 1212new_slab: 1213 page = get_partial(s, gfpflags, node); 1214 if (likely(page)) { 1215have_slab: 1216 s->cpu_slab[cpu] = page; 1217 SetPageActive(page); 1218 goto redo; 1219 } 1220 1221 page = new_slab(s, gfpflags, node); 1222 if (page) { 1223 cpu = smp_processor_id(); 1224 if (s->cpu_slab[cpu]) { 1225 /* 1226 * Someone else populated the cpu_slab while we enabled 1227 * interrupts, or we have got scheduled on another cpu. 1228 * The page may not be on the requested node. 
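 * If that slab is usable (no specific node was requested or it sits on
 * the right node) we keep it because it is cache hot and discard the
 * slab we just allocated; otherwise the current cpu slab is flushed
 * and the new slab installed.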
1229 */ 1230 if (node == -1 || 1231 page_to_nid(s->cpu_slab[cpu]) == node) { 1232 /* 1233 * Current cpuslab is acceptable and we 1234 * want the current one since its cache hot 1235 */ 1236 discard_slab(s, page); 1237 page = s->cpu_slab[cpu]; 1238 slab_lock(page); 1239 goto redo; 1240 } 1241 /* Dump the current slab */ 1242 flush_slab(s, s->cpu_slab[cpu], cpu); 1243 } 1244 slab_lock(page); 1245 goto have_slab; 1246 } 1247 local_irq_restore(flags); 1248 return NULL; 1249debug: 1250 if (!alloc_object_checks(s, page, object)) 1251 goto another_slab; 1252 if (s->flags & SLAB_STORE_USER) 1253 set_track(s, object, TRACK_ALLOC, addr); 1254 if (s->flags & SLAB_TRACE) { 1255 printk(KERN_INFO "TRACE %s alloc 0x%p inuse=%d fp=0x%p\n", 1256 s->name, object, page->inuse, 1257 page->freelist); 1258 dump_stack(); 1259 } 1260 init_object(s, object, 1); 1261 goto have_object; 1262} 1263 1264void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) 1265{ 1266 return slab_alloc(s, gfpflags, -1, __builtin_return_address(0)); 1267} 1268EXPORT_SYMBOL(kmem_cache_alloc); 1269 1270#ifdef CONFIG_NUMA 1271void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) 1272{ 1273 return slab_alloc(s, gfpflags, node, __builtin_return_address(0)); 1274} 1275EXPORT_SYMBOL(kmem_cache_alloc_node); 1276#endif 1277 1278/* 1279 * The fastpath only writes the cacheline of the page struct and the first 1280 * cacheline of the object. 1281 * 1282 * No special cachelines need to be read 1283 */ 1284static void slab_free(struct kmem_cache *s, struct page *page, 1285 void *x, void *addr) 1286{ 1287 void *prior; 1288 void **object = (void *)x; 1289 unsigned long flags; 1290 1291 local_irq_save(flags); 1292 slab_lock(page); 1293 1294 if (unlikely(PageError(page))) 1295 goto debug; 1296checks_ok: 1297 prior = object[page->offset] = page->freelist; 1298 page->freelist = object; 1299 page->inuse--; 1300 1301 if (unlikely(PageActive(page))) 1302 /* 1303 * Cpu slabs are never on partial lists and are 1304 * never freed. 1305 */ 1306 goto out_unlock; 1307 1308 if (unlikely(!page->inuse)) 1309 goto slab_empty; 1310 1311 /* 1312 * Objects left in the slab. If it 1313 * was not on the partial list before 1314 * then add it. 1315 */ 1316 if (unlikely(!prior)) 1317 add_partial(get_node(s, page_to_nid(page)), page); 1318 1319out_unlock: 1320 slab_unlock(page); 1321 local_irq_restore(flags); 1322 return; 1323 1324slab_empty: 1325 if (prior) 1326 /* 1327 * Slab on the partial list. 
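 * (prior is the freelist as it stood before this free; if it is
 * non-NULL the slab already had free objects and therefore sat on
 * the partial list, so take it off before discarding the slab.)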
1328 */ 1329 remove_partial(s, page); 1330 1331 slab_unlock(page); 1332 discard_slab(s, page); 1333 local_irq_restore(flags); 1334 return; 1335 1336debug: 1337 if (!free_object_checks(s, page, x)) 1338 goto out_unlock; 1339 if (!PageActive(page) && !page->freelist) 1340 remove_full(s, page); 1341 if (s->flags & SLAB_STORE_USER) 1342 set_track(s, x, TRACK_FREE, addr); 1343 if (s->flags & SLAB_TRACE) { 1344 printk(KERN_INFO "TRACE %s free 0x%p inuse=%d fp=0x%p\n", 1345 s->name, object, page->inuse, 1346 page->freelist); 1347 print_section("Object", (void *)object, s->objsize); 1348 dump_stack(); 1349 } 1350 init_object(s, object, 0); 1351 goto checks_ok; 1352} 1353 1354void kmem_cache_free(struct kmem_cache *s, void *x) 1355{ 1356 struct page *page; 1357 1358 page = virt_to_head_page(x); 1359 1360 slab_free(s, page, x, __builtin_return_address(0)); 1361} 1362EXPORT_SYMBOL(kmem_cache_free); 1363 1364/* Figure out on which slab object the object resides */ 1365static struct page *get_object_page(const void *x) 1366{ 1367 struct page *page = virt_to_head_page(x); 1368 1369 if (!PageSlab(page)) 1370 return NULL; 1371 1372 return page; 1373} 1374 1375/* 1376 * kmem_cache_open produces objects aligned at "size" and the first object 1377 * is placed at offset 0 in the slab (We have no metainformation on the 1378 * slab, all slabs are in essence "off slab"). 1379 * 1380 * In order to get the desired alignment one just needs to align the 1381 * size. 1382 * 1383 * Notice that the allocation order determines the sizes of the per cpu 1384 * caches. Each processor has always one slab available for allocations. 1385 * Increasing the allocation order reduces the number of times that slabs 1386 * must be moved on and off the partial lists and therefore may influence 1387 * locking overhead. 1388 * 1389 * The offset is used to relocate the free list link in each object. It is 1390 * therefore possible to move the free list link behind the object. This 1391 * is necessary for RCU to work properly and also useful for debugging. 1392 */ 1393 1394/* 1395 * Mininum / Maximum order of slab pages. This influences locking overhead 1396 * and slab fragmentation. A higher order reduces the number of partial slabs 1397 * and increases the number of allocations possible without having to 1398 * take the list_lock. 1399 */ 1400static int slub_min_order; 1401static int slub_max_order = DEFAULT_MAX_ORDER; 1402 1403/* 1404 * Minimum number of objects per slab. This is necessary in order to 1405 * reduce locking overhead. Similar to the queue size in SLAB. 1406 */ 1407static int slub_min_objects = DEFAULT_MIN_OBJECTS; 1408 1409/* 1410 * Merge control. If this is set then no merging of slab caches will occur. 1411 */ 1412static int slub_nomerge; 1413 1414/* 1415 * Debug settings: 1416 */ 1417static int slub_debug; 1418 1419static char *slub_debug_slabs; 1420 1421/* 1422 * Calculate the order of allocation given an slab object size. 1423 * 1424 * The order of allocation has significant impact on other elements 1425 * of the system. Generally order 0 allocations should be preferred 1426 * since they do not cause fragmentation in the page allocator. Larger 1427 * objects may have problems with order 0 because there may be too much 1428 * space left unused in a slab. We go to a higher order if more than 1/8th 1429 * of the slab would be wasted. 1430 * 1431 * In order to reach satisfactory performance we must ensure that 1432 * a minimum number of objects is in one slab. 
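 *
 * (Worked example, assuming 4KiB pages and the default boot parameters:
 * for a 700 byte object, order 0 leaves 4096 % 700 = 596 unused bytes,
 * more than the 512 byte limit of one eighth of the slab, so we move on
 * to order 1 where 8192 % 700 = 492 bytes of waste is acceptable and 11
 * objects fit per slab.)
 *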
Otherwise we may 1433 * generate too much activity on the partial lists. This is less a 1434 * concern for large slabs though. slub_max_order specifies the order 1435 * where we begin to stop considering the number of objects in a slab. 1436 * 1437 * Higher order allocations also allow the placement of more objects 1438 * in a slab and thereby reduce object handling overhead. If the user 1439 * has requested a higher mininum order then we start with that one 1440 * instead of zero. 1441 */ 1442static int calculate_order(int size) 1443{ 1444 int order; 1445 int rem; 1446 1447 for (order = max(slub_min_order, fls(size - 1) - PAGE_SHIFT); 1448 order < MAX_ORDER; order++) { 1449 unsigned long slab_size = PAGE_SIZE << order; 1450 1451 if (slub_max_order > order && 1452 slab_size < slub_min_objects * size) 1453 continue; 1454 1455 if (slab_size < size) 1456 continue; 1457 1458 rem = slab_size % size; 1459 1460 if (rem <= (PAGE_SIZE << order) / 8) 1461 break; 1462 1463 } 1464 if (order >= MAX_ORDER) 1465 return -E2BIG; 1466 return order; 1467} 1468 1469/* 1470 * Function to figure out which alignment to use from the 1471 * various ways of specifying it. 1472 */ 1473static unsigned long calculate_alignment(unsigned long flags, 1474 unsigned long align, unsigned long size) 1475{ 1476 /* 1477 * If the user wants hardware cache aligned objects then 1478 * follow that suggestion if the object is sufficiently 1479 * large. 1480 * 1481 * The hardware cache alignment cannot override the 1482 * specified alignment though. If that is greater 1483 * then use it. 1484 */ 1485 if ((flags & SLAB_HWCACHE_ALIGN) && 1486 size > L1_CACHE_BYTES / 2) 1487 return max_t(unsigned long, align, L1_CACHE_BYTES); 1488 1489 if (align < ARCH_SLAB_MINALIGN) 1490 return ARCH_SLAB_MINALIGN; 1491 1492 return ALIGN(align, sizeof(void *)); 1493} 1494 1495static void init_kmem_cache_node(struct kmem_cache_node *n) 1496{ 1497 n->nr_partial = 0; 1498 atomic_long_set(&n->nr_slabs, 0); 1499 spin_lock_init(&n->list_lock); 1500 INIT_LIST_HEAD(&n->partial); 1501 INIT_LIST_HEAD(&n->full); 1502} 1503 1504#ifdef CONFIG_NUMA 1505/* 1506 * No kmalloc_node yet so do it by hand. We know that this is the first 1507 * slab on the node for this slabcache. There are no concurrent accesses 1508 * possible. 1509 * 1510 * Note that this function only works on the kmalloc_node_cache 1511 * when allocating for the kmalloc_node_cache. 
1512 */ 1513static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflags, 1514 int node) 1515{ 1516 struct page *page; 1517 struct kmem_cache_node *n; 1518 1519 BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); 1520 1521 page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node); 1522 /* new_slab() disables interupts */ 1523 local_irq_enable(); 1524 1525 BUG_ON(!page); 1526 n = page->freelist; 1527 BUG_ON(!n); 1528 page->freelist = get_freepointer(kmalloc_caches, n); 1529 page->inuse++; 1530 kmalloc_caches->node[node] = n; 1531 init_object(kmalloc_caches, n, 1); 1532 init_kmem_cache_node(n); 1533 atomic_long_inc(&n->nr_slabs); 1534 add_partial(n, page); 1535 return n; 1536} 1537 1538static void free_kmem_cache_nodes(struct kmem_cache *s) 1539{ 1540 int node; 1541 1542 for_each_online_node(node) { 1543 struct kmem_cache_node *n = s->node[node]; 1544 if (n && n != &s->local_node) 1545 kmem_cache_free(kmalloc_caches, n); 1546 s->node[node] = NULL; 1547 } 1548} 1549 1550static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) 1551{ 1552 int node; 1553 int local_node; 1554 1555 if (slab_state >= UP) 1556 local_node = page_to_nid(virt_to_page(s)); 1557 else 1558 local_node = 0; 1559 1560 for_each_online_node(node) { 1561 struct kmem_cache_node *n; 1562 1563 if (local_node == node) 1564 n = &s->local_node; 1565 else { 1566 if (slab_state == DOWN) { 1567 n = early_kmem_cache_node_alloc(gfpflags, 1568 node); 1569 continue; 1570 } 1571 n = kmem_cache_alloc_node(kmalloc_caches, 1572 gfpflags, node); 1573 1574 if (!n) { 1575 free_kmem_cache_nodes(s); 1576 return 0; 1577 } 1578 1579 } 1580 s->node[node] = n; 1581 init_kmem_cache_node(n); 1582 } 1583 return 1; 1584} 1585#else 1586static void free_kmem_cache_nodes(struct kmem_cache *s) 1587{ 1588} 1589 1590static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) 1591{ 1592 init_kmem_cache_node(&s->local_node); 1593 return 1; 1594} 1595#endif 1596 1597/* 1598 * calculate_sizes() determines the order and the distribution of data within 1599 * a slab object. 1600 */ 1601static int calculate_sizes(struct kmem_cache *s) 1602{ 1603 unsigned long flags = s->flags; 1604 unsigned long size = s->objsize; 1605 unsigned long align = s->align; 1606 1607 /* 1608 * Determine if we can poison the object itself. If the user of 1609 * the slab may touch the object after free or before allocation 1610 * then we should never poison the object itself. 1611 */ 1612 if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && 1613 !s->ctor && !s->dtor) 1614 s->flags |= __OBJECT_POISON; 1615 else 1616 s->flags &= ~__OBJECT_POISON; 1617 1618 /* 1619 * Round up object size to the next word boundary. We can only 1620 * place the free pointer at word boundaries and this determines 1621 * the possible location of the free pointer. 1622 */ 1623 size = ALIGN(size, sizeof(void *)); 1624 1625 /* 1626 * If we are redzoning then check if there is some space between the 1627 * end of the object and the free pointer. If not then add an 1628 * additional word, so that we can establish a redzone between 1629 * the object and the freepointer to be able to check for overwrites. 1630 */ 1631 if ((flags & SLAB_RED_ZONE) && size == s->objsize) 1632 size += sizeof(void *); 1633 1634 /* 1635 * With that we have determined how much of the slab is in actual 1636 * use by the object. This is the potential offset to the free 1637 * pointer. 
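 *
 * Taking the steps below together, for example, a 64 byte object with
 * SLAB_RED_ZONE and SLAB_STORE_USER on a 64 bit build ends up with
 * inuse = 72 (one extra word of redzone since objsize is already word
 * aligned), two 24 byte struct track records and one word of debug
 * padding behind it, for a final size of 128 and 32 objects per order 0
 * slab.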
1638 */ 1639 s->inuse = size; 1640 1641 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || 1642 s->ctor || s->dtor)) { 1643 /* 1644 * Relocate free pointer after the object if it is not 1645 * permitted to overwrite the first word of the object on 1646 * kmem_cache_free. 1647 * 1648 * This is the case if we do RCU, have a constructor or 1649 * destructor or are poisoning the objects. 1650 */ 1651 s->offset = size; 1652 size += sizeof(void *); 1653 } 1654 1655 if (flags & SLAB_STORE_USER) 1656 /* 1657 * Need to store information about allocs and frees after 1658 * the object. 1659 */ 1660 size += 2 * sizeof(struct track); 1661 1662 if (flags & DEBUG_DEFAULT_FLAGS) 1663 /* 1664 * Add some empty padding so that we can catch 1665 * overwrites from earlier objects rather than let 1666 * tracking information or the free pointer be 1667 * corrupted if an user writes before the start 1668 * of the object. 1669 */ 1670 size += sizeof(void *); 1671 /* 1672 * Determine the alignment based on various parameters that the 1673 * user specified (this is unecessarily complex due to the attempt 1674 * to be compatible with SLAB. Should be cleaned up some day). 1675 */ 1676 align = calculate_alignment(flags, align, s->objsize); 1677 1678 /* 1679 * SLUB stores one object immediately after another beginning from 1680 * offset 0. In order to align the objects we have to simply size 1681 * each object to conform to the alignment. 1682 */ 1683 size = ALIGN(size, align); 1684 s->size = size; 1685 1686 s->order = calculate_order(size); 1687 if (s->order < 0) 1688 return 0; 1689 1690 /* 1691 * Determine the number of objects per slab 1692 */ 1693 s->objects = (PAGE_SIZE << s->order) / size; 1694 1695 /* 1696 * Verify that the number of objects is within permitted limits. 1697 * The page->inuse field is only 16 bit wide! So we cannot have 1698 * more than 64k objects per slab. 1699 */ 1700 if (!s->objects || s->objects > 65535) 1701 return 0; 1702 return 1; 1703 1704} 1705 1706static int __init finish_bootstrap(void) 1707{ 1708 struct list_head *h; 1709 int err; 1710 1711 slab_state = SYSFS; 1712 1713 list_for_each(h, &slab_caches) { 1714 struct kmem_cache *s = 1715 container_of(h, struct kmem_cache, list); 1716 1717 err = sysfs_slab_add(s); 1718 BUG_ON(err); 1719 } 1720 return 0; 1721} 1722 1723static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, 1724 const char *name, size_t size, 1725 size_t align, unsigned long flags, 1726 void (*ctor)(void *, struct kmem_cache *, unsigned long), 1727 void (*dtor)(void *, struct kmem_cache *, unsigned long)) 1728{ 1729 memset(s, 0, kmem_size); 1730 s->name = name; 1731 s->ctor = ctor; 1732 s->dtor = dtor; 1733 s->objsize = size; 1734 s->flags = flags; 1735 s->align = align; 1736 1737 /* 1738 * The page->offset field is only 16 bit wide. This is an offset 1739 * in units of words from the beginning of an object. If the slab 1740 * size is bigger then we cannot move the free pointer behind the 1741 * object anymore. 1742 * 1743 * On 32 bit platforms the limit is 256k. On 64bit platforms 1744 * the limit is 512k. 1745 * 1746 * Debugging or ctor/dtors may create a need to move the free 1747 * pointer. Fail if this happens. 1748 */ 1749 if (s->size >= 65535 * sizeof(void *)) { 1750 BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON | 1751 SLAB_STORE_USER | SLAB_DESTROY_BY_RCU)); 1752 BUG_ON(ctor || dtor); 1753 } 1754 else 1755 /* 1756 * Enable debugging if selected on the kernel commandline. 
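 *
 * For example, booting with "slub_debug=FZ,dentry" applies SLAB_DEBUG_FREE
 * and SLAB_RED_ZONE only to caches whose name begins with "dentry", while
 * a plain "slub_debug" selects DEBUG_DEFAULT_FLAGS for every cache (see
 * setup_slub_debug() below).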
1757 */ 1758 if (slub_debug && (!slub_debug_slabs || 1759 strncmp(slub_debug_slabs, name, 1760 strlen(slub_debug_slabs)) == 0)) 1761 s->flags |= slub_debug; 1762 1763 if (!calculate_sizes(s)) 1764 goto error; 1765 1766 s->refcount = 1; 1767#ifdef CONFIG_NUMA 1768 s->defrag_ratio = 100; 1769#endif 1770 1771 if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) 1772 return 1; 1773error: 1774 if (flags & SLAB_PANIC) 1775 panic("Cannot create slab %s size=%lu realsize=%u " 1776 "order=%u offset=%u flags=%lx\n", 1777 s->name, (unsigned long)size, s->size, s->order, 1778 s->offset, flags); 1779 return 0; 1780} 1781EXPORT_SYMBOL(kmem_cache_open); 1782 1783/* 1784 * Check if a given pointer is valid 1785 */ 1786int kmem_ptr_validate(struct kmem_cache *s, const void *object) 1787{ 1788 struct page * page; 1789 void *addr; 1790 1791 page = get_object_page(object); 1792 1793 if (!page || s != page->slab) 1794 /* No slab or wrong slab */ 1795 return 0; 1796 1797 addr = page_address(page); 1798 if (object < addr || object >= addr + s->objects * s->size) 1799 /* Out of bounds */ 1800 return 0; 1801 1802 if ((object - addr) % s->size) 1803 /* Improperly aligned */ 1804 return 0; 1805 1806 /* 1807 * We could also check if the object is on the slabs freelist. 1808 * But this would be too expensive and it seems that the main 1809 * purpose of kmem_ptr_valid is to check if the object belongs 1810 * to a certain slab. 1811 */ 1812 return 1; 1813} 1814EXPORT_SYMBOL(kmem_ptr_validate); 1815 1816/* 1817 * Determine the size of a slab object 1818 */ 1819unsigned int kmem_cache_size(struct kmem_cache *s) 1820{ 1821 return s->objsize; 1822} 1823EXPORT_SYMBOL(kmem_cache_size); 1824 1825const char *kmem_cache_name(struct kmem_cache *s) 1826{ 1827 return s->name; 1828} 1829EXPORT_SYMBOL(kmem_cache_name); 1830 1831/* 1832 * Attempt to free all slabs on a node 1833 */ 1834static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, 1835 struct list_head *list) 1836{ 1837 int slabs_inuse = 0; 1838 unsigned long flags; 1839 struct page *page, *h; 1840 1841 spin_lock_irqsave(&n->list_lock, flags); 1842 list_for_each_entry_safe(page, h, list, lru) 1843 if (!page->inuse) { 1844 list_del(&page->lru); 1845 discard_slab(s, page); 1846 } else 1847 slabs_inuse++; 1848 spin_unlock_irqrestore(&n->list_lock, flags); 1849 return slabs_inuse; 1850} 1851 1852/* 1853 * Release all resources used by slab cache 1854 */ 1855static int kmem_cache_close(struct kmem_cache *s) 1856{ 1857 int node; 1858 1859 flush_all(s); 1860 1861 /* Attempt to free all objects */ 1862 for_each_online_node(node) { 1863 struct kmem_cache_node *n = get_node(s, node); 1864 1865 n->nr_partial -= free_list(s, n, &n->partial); 1866 if (atomic_long_read(&n->nr_slabs)) 1867 return 1; 1868 } 1869 free_kmem_cache_nodes(s); 1870 return 0; 1871} 1872 1873/* 1874 * Close a cache and release the kmem_cache structure 1875 * (must be used for caches created using kmem_cache_create) 1876 */ 1877void kmem_cache_destroy(struct kmem_cache *s) 1878{ 1879 down_write(&slub_lock); 1880 s->refcount--; 1881 if (!s->refcount) { 1882 list_del(&s->list); 1883 if (kmem_cache_close(s)) 1884 WARN_ON(1); 1885 sysfs_slab_remove(s); 1886 kfree(s); 1887 } 1888 up_write(&slub_lock); 1889} 1890EXPORT_SYMBOL(kmem_cache_destroy); 1891 1892/******************************************************************** 1893 * Kmalloc subsystem 1894 *******************************************************************/ 1895 1896struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned; 
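
/*
 * kmalloc_index() (defined in the slub header) picks the slot in this array
 * for a given size. Judging from get_slab() and kmem_cache_init() below,
 * slots 1 and 2 hold the odd sized 96 and 192 byte caches while slot i
 * otherwise holds the (1 << i) byte cache, for i from KMALLOC_SHIFT_LOW to
 * KMALLOC_SHIFT_HIGH.
 */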
1897EXPORT_SYMBOL(kmalloc_caches); 1898 1899#ifdef CONFIG_ZONE_DMA 1900static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1]; 1901#endif 1902 1903static int __init setup_slub_min_order(char *str) 1904{ 1905 get_option (&str, &slub_min_order); 1906 1907 return 1; 1908} 1909 1910__setup("slub_min_order=", setup_slub_min_order); 1911 1912static int __init setup_slub_max_order(char *str) 1913{ 1914 get_option (&str, &slub_max_order); 1915 1916 return 1; 1917} 1918 1919__setup("slub_max_order=", setup_slub_max_order); 1920 1921static int __init setup_slub_min_objects(char *str) 1922{ 1923 get_option (&str, &slub_min_objects); 1924 1925 return 1; 1926} 1927 1928__setup("slub_min_objects=", setup_slub_min_objects); 1929 1930static int __init setup_slub_nomerge(char *str) 1931{ 1932 slub_nomerge = 1; 1933 return 1; 1934} 1935 1936__setup("slub_nomerge", setup_slub_nomerge); 1937 1938static int __init setup_slub_debug(char *str) 1939{ 1940 if (!str || *str != '=') 1941 slub_debug = DEBUG_DEFAULT_FLAGS; 1942 else { 1943 str++; 1944 if (*str == 0 || *str == ',') 1945 slub_debug = DEBUG_DEFAULT_FLAGS; 1946 else 1947 for( ;*str && *str != ','; str++) 1948 switch (*str) { 1949 case 'f' : case 'F' : 1950 slub_debug |= SLAB_DEBUG_FREE; 1951 break; 1952 case 'z' : case 'Z' : 1953 slub_debug |= SLAB_RED_ZONE; 1954 break; 1955 case 'p' : case 'P' : 1956 slub_debug |= SLAB_POISON; 1957 break; 1958 case 'u' : case 'U' : 1959 slub_debug |= SLAB_STORE_USER; 1960 break; 1961 case 't' : case 'T' : 1962 slub_debug |= SLAB_TRACE; 1963 break; 1964 default: 1965 printk(KERN_ERR "slub_debug option '%c' " 1966 "unknown. skipped\n",*str); 1967 } 1968 } 1969 1970 if (*str == ',') 1971 slub_debug_slabs = str + 1; 1972 return 1; 1973} 1974 1975__setup("slub_debug", setup_slub_debug); 1976 1977static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, 1978 const char *name, int size, gfp_t gfp_flags) 1979{ 1980 unsigned int flags = 0; 1981 1982 if (gfp_flags & SLUB_DMA) 1983 flags = SLAB_CACHE_DMA; 1984 1985 down_write(&slub_lock); 1986 if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, 1987 flags, NULL, NULL)) 1988 goto panic; 1989 1990 list_add(&s->list, &slab_caches); 1991 up_write(&slub_lock); 1992 if (sysfs_slab_add(s)) 1993 goto panic; 1994 return s; 1995 1996panic: 1997 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); 1998} 1999 2000static struct kmem_cache *get_slab(size_t size, gfp_t flags) 2001{ 2002 int index = kmalloc_index(size); 2003 2004 if (!index) 2005 return NULL; 2006 2007 /* Allocation too large? 
*/ 2008 BUG_ON(index < 0); 2009 2010#ifdef CONFIG_ZONE_DMA 2011 if ((flags & SLUB_DMA)) { 2012 struct kmem_cache *s; 2013 struct kmem_cache *x; 2014 char *text; 2015 size_t realsize; 2016 2017 s = kmalloc_caches_dma[index]; 2018 if (s) 2019 return s; 2020 2021 /* Dynamically create dma cache */ 2022 x = kmalloc(kmem_size, flags & ~SLUB_DMA); 2023 if (!x) 2024 panic("Unable to allocate memory for dma cache\n"); 2025 2026 if (index <= KMALLOC_SHIFT_HIGH) 2027 realsize = 1 << index; 2028 else { 2029 if (index == 1) 2030 realsize = 96; 2031 else 2032 realsize = 192; 2033 } 2034 2035 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", 2036 (unsigned int)realsize); 2037 s = create_kmalloc_cache(x, text, realsize, flags); 2038 kmalloc_caches_dma[index] = s; 2039 return s; 2040 } 2041#endif 2042 return &kmalloc_caches[index]; 2043} 2044 2045void *__kmalloc(size_t size, gfp_t flags) 2046{ 2047 struct kmem_cache *s = get_slab(size, flags); 2048 2049 if (s) 2050 return slab_alloc(s, flags, -1, __builtin_return_address(0)); 2051 return NULL; 2052} 2053EXPORT_SYMBOL(__kmalloc); 2054 2055#ifdef CONFIG_NUMA 2056void *__kmalloc_node(size_t size, gfp_t flags, int node) 2057{ 2058 struct kmem_cache *s = get_slab(size, flags); 2059 2060 if (s) 2061 return slab_alloc(s, flags, node, __builtin_return_address(0)); 2062 return NULL; 2063} 2064EXPORT_SYMBOL(__kmalloc_node); 2065#endif 2066 2067size_t ksize(const void *object) 2068{ 2069 struct page *page = get_object_page(object); 2070 struct kmem_cache *s; 2071 2072 BUG_ON(!page); 2073 s = page->slab; 2074 BUG_ON(!s); 2075 2076 /* 2077 * Debugging requires use of the padding between object 2078 * and whatever may come after it. 2079 */ 2080 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) 2081 return s->objsize; 2082 2083 /* 2084 * If we have the need to store the freelist pointer 2085 * back there or track user information then we can 2086 * only use the space before that information. 2087 */ 2088 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) 2089 return s->inuse; 2090 2091 /* 2092 * Else we can use all the padding etc for the allocation 2093 */ 2094 return s->size; 2095} 2096EXPORT_SYMBOL(ksize); 2097 2098void kfree(const void *x) 2099{ 2100 struct kmem_cache *s; 2101 struct page *page; 2102 2103 if (!x) 2104 return; 2105 2106 page = virt_to_head_page(x); 2107 s = page->slab; 2108 2109 slab_free(s, page, (void *)x, __builtin_return_address(0)); 2110} 2111EXPORT_SYMBOL(kfree); 2112 2113/* 2114 * kmem_cache_shrink removes empty slabs from the partial lists 2115 * and then sorts the partially allocated slabs by the number 2116 * of items in use. The slabs with the most items in use 2117 * come first. New allocations will remove these from the 2118 * partial list because they are full. The slabs with the 2119 * least items are placed last. If it happens that the objects 2120 * are freed then the page can be returned to the page allocator. 
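 *
 * Empty slabs are freed right away; the re-sort itself is only done for
 * nodes holding more than MAX_PARTIAL partial slabs and works as a bucket
 * sort over s->objects list heads indexed by page->inuse.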
2121 */ 2122int kmem_cache_shrink(struct kmem_cache *s) 2123{ 2124 int node; 2125 int i; 2126 struct kmem_cache_node *n; 2127 struct page *page; 2128 struct page *t; 2129 struct list_head *slabs_by_inuse = 2130 kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL); 2131 unsigned long flags; 2132 2133 if (!slabs_by_inuse) 2134 return -ENOMEM; 2135 2136 flush_all(s); 2137 for_each_online_node(node) { 2138 n = get_node(s, node); 2139 2140 if (!n->nr_partial) 2141 continue; 2142 2143 for (i = 0; i < s->objects; i++) 2144 INIT_LIST_HEAD(slabs_by_inuse + i); 2145 2146 spin_lock_irqsave(&n->list_lock, flags); 2147 2148 /* 2149 * Build lists indexed by the items in use in 2150 * each slab or free slabs if empty. 2151 * 2152 * Note that concurrent frees may occur while 2153 * we hold the list_lock. page->inuse here is 2154 * the upper limit. 2155 */ 2156 list_for_each_entry_safe(page, t, &n->partial, lru) { 2157 if (!page->inuse && slab_trylock(page)) { 2158 /* 2159 * Must hold slab lock here because slab_free 2160 * may have freed the last object and be 2161 * waiting to release the slab. 2162 */ 2163 list_del(&page->lru); 2164 n->nr_partial--; 2165 slab_unlock(page); 2166 discard_slab(s, page); 2167 } else { 2168 if (n->nr_partial > MAX_PARTIAL) 2169 list_move(&page->lru, 2170 slabs_by_inuse + page->inuse); 2171 } 2172 } 2173 2174 if (n->nr_partial <= MAX_PARTIAL) 2175 goto out; 2176 2177 /* 2178 * Rebuild the partial list with the slabs filled up 2179 * most first and the least used slabs at the end. 2180 */ 2181 for (i = s->objects - 1; i >= 0; i--) 2182 list_splice(slabs_by_inuse + i, n->partial.prev); 2183 2184 out: 2185 spin_unlock_irqrestore(&n->list_lock, flags); 2186 } 2187 2188 kfree(slabs_by_inuse); 2189 return 0; 2190} 2191EXPORT_SYMBOL(kmem_cache_shrink); 2192 2193/** 2194 * krealloc - reallocate memory. The contents will remain unchanged. 2195 * 2196 * @p: object to reallocate memory for. 2197 * @new_size: how many bytes of memory are required. 2198 * @flags: the type of memory to allocate. 2199 * 2200 * The contents of the object pointed to are preserved up to the 2201 * lesser of the new and old sizes. If @p is %NULL, krealloc() 2202 * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a 2203 * %NULL pointer, the object pointed to is freed. 2204 */ 2205void *krealloc(const void *p, size_t new_size, gfp_t flags) 2206{ 2207 struct kmem_cache *new_cache; 2208 void *ret; 2209 struct page *page; 2210 2211 if (unlikely(!p)) 2212 return kmalloc(new_size, flags); 2213 2214 if (unlikely(!new_size)) { 2215 kfree(p); 2216 return NULL; 2217 } 2218 2219 page = virt_to_head_page(p); 2220 2221 new_cache = get_slab(new_size, flags); 2222 2223 /* 2224 * If the new size fits in the current cache, bail out. 2225 */ 2226 if (likely(page->slab == new_cache)) 2227 return (void *)p; 2228 2229 ret = kmalloc(new_size, flags); 2230 if (ret) { 2231 memcpy(ret, p, min(new_size, ksize(p))); 2232 kfree(p); 2233 } 2234 return ret; 2235} 2236EXPORT_SYMBOL(krealloc); 2237 2238/******************************************************************** 2239 * Basic setup of slabs 2240 *******************************************************************/ 2241 2242void __init kmem_cache_init(void) 2243{ 2244 int i; 2245 2246#ifdef CONFIG_NUMA 2247 /* 2248 * Must first have the slab cache available for the allocations of the 2249 * struct kmem_cache_node's. There is special bootstrap code in 2250 * kmem_cache_open for slab_state == DOWN.
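 *
 * Slot 0 of kmalloc_caches is available for this purpose because
 * kmalloc_index() only returns 0 for a zero sized request, which
 * get_slab() answers with NULL, so no regular kmalloc() allocation
 * ever reaches kmalloc_caches[0].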
*/ 2252 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", 2253 sizeof(struct kmem_cache_node), GFP_KERNEL); 2254#endif 2255 2256 /* Able to allocate the per node structures */ 2257 slab_state = PARTIAL; 2258 2259 /* Caches that are not power-of-two sized */ 2260 create_kmalloc_cache(&kmalloc_caches[1], 2261 "kmalloc-96", 96, GFP_KERNEL); 2262 create_kmalloc_cache(&kmalloc_caches[2], 2263 "kmalloc-192", 192, GFP_KERNEL); 2264 2265 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) 2266 create_kmalloc_cache(&kmalloc_caches[i], 2267 "kmalloc", 1 << i, GFP_KERNEL); 2268 2269 slab_state = UP; 2270 2271 /* Provide the correct kmalloc names now that the caches are up */ 2272 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) 2273 kmalloc_caches[i].name = 2274 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); 2275 2276#ifdef CONFIG_SMP 2277 register_cpu_notifier(&slab_notifier); 2278#endif 2279 2280 if (nr_cpu_ids) /* Remove when nr_cpu_ids is fixed upstream! */ 2281 kmem_size = offsetof(struct kmem_cache, cpu_slab) 2282 + nr_cpu_ids * sizeof(struct page *); 2283 2284 printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," 2285 " Processors=%d, Nodes=%d\n", 2286 KMALLOC_SHIFT_HIGH, L1_CACHE_BYTES, 2287 slub_min_order, slub_max_order, slub_min_objects, 2288 nr_cpu_ids, nr_node_ids); 2289} 2290 2291/* 2292 * Find a mergeable slab cache 2293 */ 2294static int slab_unmergeable(struct kmem_cache *s) 2295{ 2296 if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) 2297 return 1; 2298 2299 if (s->ctor || s->dtor) 2300 return 1; 2301 2302 return 0; 2303} 2304 2305static struct kmem_cache *find_mergeable(size_t size, 2306 size_t align, unsigned long flags, 2307 void (*ctor)(void *, struct kmem_cache *, unsigned long), 2308 void (*dtor)(void *, struct kmem_cache *, unsigned long)) 2309{ 2310 struct list_head *h; 2311 2312 if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) 2313 return NULL; 2314 2315 if (ctor || dtor) 2316 return NULL; 2317 2318 size = ALIGN(size, sizeof(void *)); 2319 align = calculate_alignment(flags, align, size); 2320 size = ALIGN(size, align); 2321 2322 list_for_each(h, &slab_caches) { 2323 struct kmem_cache *s = 2324 container_of(h, struct kmem_cache, list); 2325 2326 if (slab_unmergeable(s)) 2327 continue; 2328 2329 if (size > s->size) 2330 continue; 2331 2332 if (((flags | slub_debug) & SLUB_MERGE_SAME) != 2333 (s->flags & SLUB_MERGE_SAME)) 2334 continue; 2335 /* 2336 * Check if alignment is compatible. 2337 * Courtesy of Adrian Drzewiecki 2338 */ 2339 if ((s->size & ~(align - 1)) != s->size) 2340 continue; 2341 2342 if (s->size - size >= sizeof(void *)) 2343 continue; 2344 2345 return s; 2346 } 2347 return NULL; 2348} 2349 2350struct kmem_cache *kmem_cache_create(const char *name, size_t size, 2351 size_t align, unsigned long flags, 2352 void (*ctor)(void *, struct kmem_cache *, unsigned long), 2353 void (*dtor)(void *, struct kmem_cache *, unsigned long)) 2354{ 2355 struct kmem_cache *s; 2356 2357 down_write(&slub_lock); 2358 s = find_mergeable(size, align, flags, ctor, dtor); 2359 if (s) { 2360 s->refcount++; 2361 /* 2362 * Adjust the object sizes so that we clear 2363 * the complete object on kzalloc.
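 *
 * For example (sizes purely illustrative): merging a request for
 * 60 byte objects into an existing cache whose objsize is 56 raises
 * objsize to 60, so kmem_cache_zalloc() keeps clearing every byte the
 * new users of the merged cache may touch.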
*/ 2365 s->objsize = max(s->objsize, (int)size); 2366 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 2367 if (sysfs_slab_alias(s, name)) 2368 goto err; 2369 } else { 2370 s = kmalloc(kmem_size, GFP_KERNEL); 2371 if (s && kmem_cache_open(s, GFP_KERNEL, name, 2372 size, align, flags, ctor, dtor)) { 2373 if (sysfs_slab_add(s)) { 2374 kfree(s); 2375 goto err; 2376 } 2377 list_add(&s->list, &slab_caches); 2378 } else 2379 kfree(s); 2380 } 2381 up_write(&slub_lock); 2382 return s; 2383 2384err: 2385 up_write(&slub_lock); 2386 if (flags & SLAB_PANIC) 2387 panic("Cannot create slabcache %s\n", name); 2388 else 2389 s = NULL; 2390 return s; 2391} 2392EXPORT_SYMBOL(kmem_cache_create); 2393 2394void *kmem_cache_zalloc(struct kmem_cache *s, gfp_t flags) 2395{ 2396 void *x; 2397 2398 x = slab_alloc(s, flags, -1, __builtin_return_address(0)); 2399 if (x) 2400 memset(x, 0, s->objsize); 2401 return x; 2402} 2403EXPORT_SYMBOL(kmem_cache_zalloc); 2404 2405#ifdef CONFIG_SMP 2406static void for_all_slabs(void (*func)(struct kmem_cache *, int), int cpu) 2407{ 2408 struct list_head *h; 2409 2410 down_read(&slub_lock); 2411 list_for_each(h, &slab_caches) { 2412 struct kmem_cache *s = 2413 container_of(h, struct kmem_cache, list); 2414 2415 func(s, cpu); 2416 } 2417 up_read(&slub_lock); 2418} 2419 2420/* 2421 * Use the cpu notifier to ensure that the slabs are flushed 2422 * when necessary. 2423 */ 2424static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, 2425 unsigned long action, void *hcpu) 2426{ 2427 long cpu = (long)hcpu; 2428 2429 switch (action) { 2430 case CPU_UP_CANCELED: 2431 case CPU_DEAD: 2432 for_all_slabs(__flush_cpu_slab, cpu); 2433 break; 2434 default: 2435 break; 2436 } 2437 return NOTIFY_OK; 2438} 2439 2440static struct notifier_block __cpuinitdata slab_notifier = 2441 { &slab_cpuup_callback, NULL, 0 }; 2442 2443#endif 2444 2445#ifdef CONFIG_NUMA 2446 2447/***************************************************************** 2448 * Generic reaper used to support the page allocator 2449 * (the cpu slabs are reaped by a per slab workqueue). 2450 * 2451 * Maybe move this to the page allocator?
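 *
 * Roughly: every REAPTIMEOUT_CPUC jiffies each cpu advances its
 * private reap_node to the next online node and, if that node is not
 * the local one, drains that node's per cpu pages. With two online
 * nodes a cpu therefore alternates between skipping its local node and
 * draining the remote one.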
2452 ****************************************************************/ 2453 2454static DEFINE_PER_CPU(unsigned long, reap_node); 2455 2456static void init_reap_node(int cpu) 2457{ 2458 int node; 2459 2460 node = next_node(cpu_to_node(cpu), node_online_map); 2461 if (node == MAX_NUMNODES) 2462 node = first_node(node_online_map); 2463 2464 __get_cpu_var(reap_node) = node; 2465} 2466 2467static void next_reap_node(void) 2468{ 2469 int node = __get_cpu_var(reap_node); 2470 2471 /* 2472 * Also drain per cpu pages on remote zones 2473 */ 2474 if (node != numa_node_id()) 2475 drain_node_pages(node); 2476 2477 node = next_node(node, node_online_map); 2478 if (unlikely(node >= MAX_NUMNODES)) 2479 node = first_node(node_online_map); 2480 __get_cpu_var(reap_node) = node; 2481} 2482#else 2483#define init_reap_node(cpu) do { } while (0) 2484#define next_reap_node(void) do { } while (0) 2485#endif 2486 2487#define REAPTIMEOUT_CPUC (2*HZ) 2488 2489#ifdef CONFIG_SMP 2490static DEFINE_PER_CPU(struct delayed_work, reap_work); 2491 2492static void cache_reap(struct work_struct *unused) 2493{ 2494 next_reap_node(); 2495 refresh_cpu_vm_stats(smp_processor_id()); 2496 schedule_delayed_work(&__get_cpu_var(reap_work), 2497 REAPTIMEOUT_CPUC); 2498} 2499 2500static void __devinit start_cpu_timer(int cpu) 2501{ 2502 struct delayed_work *reap_work = &per_cpu(reap_work, cpu); 2503 2504 /* 2505 * When this gets called from do_initcalls via cpucache_init(), 2506 * init_workqueues() has already run, so keventd will be setup 2507 * at that time. 2508 */ 2509 if (keventd_up() && reap_work->work.func == NULL) { 2510 init_reap_node(cpu); 2511 INIT_DELAYED_WORK(reap_work, cache_reap); 2512 schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); 2513 } 2514} 2515 2516static int __init cpucache_init(void) 2517{ 2518 int cpu; 2519 2520 /* 2521 * Register the timers that drain pcp pages and update vm statistics 2522 */ 2523 for_each_online_cpu(cpu) 2524 start_cpu_timer(cpu); 2525 return 0; 2526} 2527__initcall(cpucache_init); 2528#endif 2529 2530#ifdef SLUB_RESILIENCY_TEST 2531static unsigned long validate_slab_cache(struct kmem_cache *s); 2532 2533static void resiliency_test(void) 2534{ 2535 u8 *p; 2536 2537 printk(KERN_ERR "SLUB resiliency testing\n"); 2538 printk(KERN_ERR "-----------------------\n"); 2539 printk(KERN_ERR "A. Corruption after allocation\n"); 2540 2541 p = kzalloc(16, GFP_KERNEL); 2542 p[16] = 0x12; 2543 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" 2544 " 0x12->0x%p\n\n", p + 16); 2545 2546 validate_slab_cache(kmalloc_caches + 4); 2547 2548 /* Hmmm... The next two are dangerous */ 2549 p = kzalloc(32, GFP_KERNEL); 2550 p[32 + sizeof(void *)] = 0x34; 2551 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" 2552 " 0x34 -> -0x%p\n", p); 2553 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); 2554 2555 validate_slab_cache(kmalloc_caches + 5); 2556 p = kzalloc(64, GFP_KERNEL); 2557 p += 64 + (get_cycles() & 0xff) * sizeof(void *); 2558 *p = 0x56; 2559 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", 2560 p); 2561 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); 2562 validate_slab_cache(kmalloc_caches + 6); 2563 2564 printk(KERN_ERR "\nB. Corruption after free\n"); 2565 p = kzalloc(128, GFP_KERNEL); 2566 kfree(p); 2567 *p = 0x78; 2568 printk(KERN_ERR "1. 
kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); 2569 validate_slab_cache(kmalloc_caches + 7); 2570 2571 p = kzalloc(256, GFP_KERNEL); 2572 kfree(p); 2573 p[50] = 0x9a; 2574 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); 2575 validate_slab_cache(kmalloc_caches + 8); 2576 2577 p = kzalloc(512, GFP_KERNEL); 2578 kfree(p); 2579 p[512] = 0xab; 2580 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); 2581 validate_slab_cache(kmalloc_caches + 9); 2582} 2583#else 2584static void resiliency_test(void) {}; 2585#endif 2586 2587/* 2588 * These are not as efficient as kmalloc for the non debug case. 2589 * We do not have the page struct available so we have to touch one 2590 * cacheline in struct kmem_cache to check slab flags. 2591 */ 2592void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) 2593{ 2594 struct kmem_cache *s = get_slab(size, gfpflags); 2595 2596 if (!s) 2597 return NULL; 2598 2599 return slab_alloc(s, gfpflags, -1, caller); 2600} 2601 2602void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, 2603 int node, void *caller) 2604{ 2605 struct kmem_cache *s = get_slab(size, gfpflags); 2606 2607 if (!s) 2608 return NULL; 2609 2610 return slab_alloc(s, gfpflags, node, caller); 2611} 2612 2613#ifdef CONFIG_SYSFS 2614 2615static int validate_slab(struct kmem_cache *s, struct page *page) 2616{ 2617 void *p; 2618 void *addr = page_address(page); 2619 unsigned long map[BITS_TO_LONGS(s->objects)]; 2620 2621 if (!check_slab(s, page) || 2622 !on_freelist(s, page, NULL)) 2623 return 0; 2624 2625 /* Now we know that a valid freelist exists */ 2626 bitmap_zero(map, s->objects); 2627 2628 for(p = page->freelist; p; p = get_freepointer(s, p)) { 2629 set_bit((p - addr) / s->size, map); 2630 if (!check_object(s, page, p, 0)) 2631 return 0; 2632 } 2633 2634 for(p = addr; p < addr + s->objects * s->size; p += s->size) 2635 if (!test_bit((p - addr) / s->size, map)) 2636 if (!check_object(s, page, p, 1)) 2637 return 0; 2638 return 1; 2639} 2640 2641static void validate_slab_slab(struct kmem_cache *s, struct page *page) 2642{ 2643 if (slab_trylock(page)) { 2644 validate_slab(s, page); 2645 slab_unlock(page); 2646 } else 2647 printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", 2648 s->name, page); 2649 2650 if (s->flags & DEBUG_DEFAULT_FLAGS) { 2651 if (!PageError(page)) 2652 printk(KERN_ERR "SLUB %s: PageError not set " 2653 "on slab 0x%p\n", s->name, page); 2654 } else { 2655 if (PageError(page)) 2656 printk(KERN_ERR "SLUB %s: PageError set on " 2657 "slab 0x%p\n", s->name, page); 2658 } 2659} 2660 2661static int validate_slab_node(struct kmem_cache *s, struct kmem_cache_node *n) 2662{ 2663 unsigned long count = 0; 2664 struct page *page; 2665 unsigned long flags; 2666 2667 spin_lock_irqsave(&n->list_lock, flags); 2668 2669 list_for_each_entry(page, &n->partial, lru) { 2670 validate_slab_slab(s, page); 2671 count++; 2672 } 2673 if (count != n->nr_partial) 2674 printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " 2675 "counter=%ld\n", s->name, count, n->nr_partial); 2676 2677 if (!(s->flags & SLAB_STORE_USER)) 2678 goto out; 2679 2680 list_for_each_entry(page, &n->full, lru) { 2681 validate_slab_slab(s, page); 2682 count++; 2683 } 2684 if (count != atomic_long_read(&n->nr_slabs)) 2685 printk(KERN_ERR "SLUB: %s %ld slabs counted but " 2686 "counter=%ld\n", s->name, count, 2687 atomic_long_read(&n->nr_slabs)); 2688 2689out: 2690 spin_unlock_irqrestore(&n->list_lock, flags); 2691 return count; 2692} 2693 2694static unsigned long 
validate_slab_cache(struct kmem_cache *s) 2695{ 2696 int node; 2697 unsigned long count = 0; 2698 2699 flush_all(s); 2700 for_each_online_node(node) { 2701 struct kmem_cache_node *n = get_node(s, node); 2702 2703 count += validate_slab_node(s, n); 2704 } 2705 return count; 2706} 2707 2708/* 2709 * Generate lists of locations where slabcache objects are allocated 2710 * and freed. 2711 */ 2712 2713struct location { 2714 unsigned long count; 2715 void *addr; 2716}; 2717 2718struct loc_track { 2719 unsigned long max; 2720 unsigned long count; 2721 struct location *loc; 2722}; 2723 2724static void free_loc_track(struct loc_track *t) 2725{ 2726 if (t->max) 2727 free_pages((unsigned long)t->loc, 2728 get_order(sizeof(struct location) * t->max)); 2729} 2730 2731static int alloc_loc_track(struct loc_track *t, unsigned long max) 2732{ 2733 struct location *l; 2734 int order; 2735 2736 if (!max) 2737 max = PAGE_SIZE / sizeof(struct location); 2738 2739 order = get_order(sizeof(struct location) * max); 2740 2741 l = (void *)__get_free_pages(GFP_KERNEL, order); 2742 2743 if (!l) 2744 return 0; 2745 2746 if (t->count) { 2747 memcpy(l, t->loc, sizeof(struct location) * t->count); 2748 free_loc_track(t); 2749 } 2750 t->max = max; 2751 t->loc = l; 2752 return 1; 2753} 2754 2755static int add_location(struct loc_track *t, struct kmem_cache *s, 2756 void *addr) 2757{ 2758 long start, end, pos; 2759 struct location *l; 2760 void *caddr; 2761 2762 start = -1; 2763 end = t->count; 2764 2765 for ( ; ; ) { 2766 pos = start + (end - start + 1) / 2; 2767 2768 /* 2769 * There is nothing at "end". If we end up there 2770 * we need to add something to before end. 2771 */ 2772 if (pos == end) 2773 break; 2774 2775 caddr = t->loc[pos].addr; 2776 if (addr == caddr) { 2777 t->loc[pos].count++; 2778 return 1; 2779 } 2780 2781 if (addr < caddr) 2782 end = pos; 2783 else 2784 start = pos; 2785 } 2786 2787 /* 2788 * Not found. 
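 * The binary search above has left "pos" at the first entry whose
 * address is larger than "addr", so inserting at "pos" keeps the
 * array sorted by address.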
Insert new tracking element 2789 */ 2790 if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max)) 2791 return 0; 2792 2793 l = t->loc + pos; 2794 if (pos < t->count) 2795 memmove(l + 1, l, 2796 (t->count - pos) * sizeof(struct location)); 2797 t->count++; 2798 l->count = 1; 2799 l->addr = addr; 2800 return 1; 2801} 2802 2803static void process_slab(struct loc_track *t, struct kmem_cache *s, 2804 struct page *page, enum track_item alloc) 2805{ 2806 void *addr = page_address(page); 2807 unsigned long map[BITS_TO_LONGS(s->objects)]; 2808 void *p; 2809 2810 bitmap_zero(map, s->objects); 2811 for (p = page->freelist; p; p = get_freepointer(s, p)) 2812 set_bit((p - addr) / s->size, map); 2813 2814 for (p = addr; p < addr + s->objects * s->size; p += s->size) 2815 if (!test_bit((p - addr) / s->size, map)) { 2816 void *addr = get_track(s, p, alloc)->addr; 2817 2818 add_location(t, s, addr); 2819 } 2820} 2821 2822static int list_locations(struct kmem_cache *s, char *buf, 2823 enum track_item alloc) 2824{ 2825 int n = 0; 2826 unsigned long i; 2827 struct loc_track t; 2828 int node; 2829 2830 t.count = 0; 2831 t.max = 0; 2832 2833 /* Push back cpu slabs */ 2834 flush_all(s); 2835 2836 for_each_online_node(node) { 2837 struct kmem_cache_node *n = get_node(s, node); 2838 unsigned long flags; 2839 struct page *page; 2840 2841 if (!atomic_read(&n->nr_slabs)) 2842 continue; 2843 2844 spin_lock_irqsave(&n->list_lock, flags); 2845 list_for_each_entry(page, &n->partial, lru) 2846 process_slab(&t, s, page, alloc); 2847 list_for_each_entry(page, &n->full, lru) 2848 process_slab(&t, s, page, alloc); 2849 spin_unlock_irqrestore(&n->list_lock, flags); 2850 } 2851 2852 for (i = 0; i < t.count; i++) { 2853 void *addr = t.loc[i].addr; 2854 2855 if (n > PAGE_SIZE - 100) 2856 break; 2857 n += sprintf(buf + n, "%7ld ", t.loc[i].count); 2858 if (addr) 2859 n += sprint_symbol(buf + n, (unsigned long)t.loc[i].addr); 2860 else 2861 n += sprintf(buf + n, "<not-available>"); 2862 n += sprintf(buf + n, "\n"); 2863 } 2864 2865 free_loc_track(&t); 2866 if (!t.count) 2867 n += sprintf(buf, "No data\n"); 2868 return n; 2869} 2870 2871static unsigned long count_partial(struct kmem_cache_node *n) 2872{ 2873 unsigned long flags; 2874 unsigned long x = 0; 2875 struct page *page; 2876 2877 spin_lock_irqsave(&n->list_lock, flags); 2878 list_for_each_entry(page, &n->partial, lru) 2879 x += page->inuse; 2880 spin_unlock_irqrestore(&n->list_lock, flags); 2881 return x; 2882} 2883 2884enum slab_stat_type { 2885 SL_FULL, 2886 SL_PARTIAL, 2887 SL_CPU, 2888 SL_OBJECTS 2889}; 2890 2891#define SO_FULL (1 << SL_FULL) 2892#define SO_PARTIAL (1 << SL_PARTIAL) 2893#define SO_CPU (1 << SL_CPU) 2894#define SO_OBJECTS (1 << SL_OBJECTS) 2895 2896static unsigned long slab_objects(struct kmem_cache *s, 2897 char *buf, unsigned long flags) 2898{ 2899 unsigned long total = 0; 2900 int cpu; 2901 int node; 2902 int x; 2903 unsigned long *nodes; 2904 unsigned long *per_cpu; 2905 2906 nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); 2907 per_cpu = nodes + nr_node_ids; 2908 2909 for_each_possible_cpu(cpu) { 2910 struct page *page = s->cpu_slab[cpu]; 2911 int node; 2912 2913 if (page) { 2914 node = page_to_nid(page); 2915 if (flags & SO_CPU) { 2916 int x = 0; 2917 2918 if (flags & SO_OBJECTS) 2919 x = page->inuse; 2920 else 2921 x = 1; 2922 total += x; 2923 nodes[node] += x; 2924 } 2925 per_cpu[node]++; 2926 } 2927 } 2928 2929 for_each_online_node(node) { 2930 struct kmem_cache_node *n = get_node(s, node); 2931 2932 if (flags & SO_PARTIAL) 
{ 2933 if (flags & SO_OBJECTS) 2934 x = count_partial(n); 2935 else 2936 x = n->nr_partial; 2937 total += x; 2938 nodes[node] += x; 2939 } 2940 2941 if (flags & SO_FULL) { 2942 int full_slabs = atomic_read(&n->nr_slabs) 2943 - per_cpu[node] 2944 - n->nr_partial; 2945 2946 if (flags & SO_OBJECTS) 2947 x = full_slabs * s->objects; 2948 else 2949 x = full_slabs; 2950 total += x; 2951 nodes[node] += x; 2952 } 2953 } 2954 2955 x = sprintf(buf, "%lu", total); 2956#ifdef CONFIG_NUMA 2957 for_each_online_node(node) 2958 if (nodes[node]) 2959 x += sprintf(buf + x, " N%d=%lu", 2960 node, nodes[node]); 2961#endif 2962 kfree(nodes); 2963 return x + sprintf(buf + x, "\n"); 2964} 2965 2966static int any_slab_objects(struct kmem_cache *s) 2967{ 2968 int node; 2969 int cpu; 2970 2971 for_each_possible_cpu(cpu) 2972 if (s->cpu_slab[cpu]) 2973 return 1; 2974 2975 for_each_node(node) { 2976 struct kmem_cache_node *n = get_node(s, node); 2977 2978 if (n->nr_partial || atomic_read(&n->nr_slabs)) 2979 return 1; 2980 } 2981 return 0; 2982} 2983 2984#define to_slab_attr(n) container_of(n, struct slab_attribute, attr) 2985#define to_slab(n) container_of(n, struct kmem_cache, kobj); 2986 2987struct slab_attribute { 2988 struct attribute attr; 2989 ssize_t (*show)(struct kmem_cache *s, char *buf); 2990 ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count); 2991}; 2992 2993#define SLAB_ATTR_RO(_name) \ 2994 static struct slab_attribute _name##_attr = __ATTR_RO(_name) 2995 2996#define SLAB_ATTR(_name) \ 2997 static struct slab_attribute _name##_attr = \ 2998 __ATTR(_name, 0644, _name##_show, _name##_store) 2999 3000static ssize_t slab_size_show(struct kmem_cache *s, char *buf) 3001{ 3002 return sprintf(buf, "%d\n", s->size); 3003} 3004SLAB_ATTR_RO(slab_size); 3005 3006static ssize_t align_show(struct kmem_cache *s, char *buf) 3007{ 3008 return sprintf(buf, "%d\n", s->align); 3009} 3010SLAB_ATTR_RO(align); 3011 3012static ssize_t object_size_show(struct kmem_cache *s, char *buf) 3013{ 3014 return sprintf(buf, "%d\n", s->objsize); 3015} 3016SLAB_ATTR_RO(object_size); 3017 3018static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) 3019{ 3020 return sprintf(buf, "%d\n", s->objects); 3021} 3022SLAB_ATTR_RO(objs_per_slab); 3023 3024static ssize_t order_show(struct kmem_cache *s, char *buf) 3025{ 3026 return sprintf(buf, "%d\n", s->order); 3027} 3028SLAB_ATTR_RO(order); 3029 3030static ssize_t ctor_show(struct kmem_cache *s, char *buf) 3031{ 3032 if (s->ctor) { 3033 int n = sprint_symbol(buf, (unsigned long)s->ctor); 3034 3035 return n + sprintf(buf + n, "\n"); 3036 } 3037 return 0; 3038} 3039SLAB_ATTR_RO(ctor); 3040 3041static ssize_t dtor_show(struct kmem_cache *s, char *buf) 3042{ 3043 if (s->dtor) { 3044 int n = sprint_symbol(buf, (unsigned long)s->dtor); 3045 3046 return n + sprintf(buf + n, "\n"); 3047 } 3048 return 0; 3049} 3050SLAB_ATTR_RO(dtor); 3051 3052static ssize_t aliases_show(struct kmem_cache *s, char *buf) 3053{ 3054 return sprintf(buf, "%d\n", s->refcount - 1); 3055} 3056SLAB_ATTR_RO(aliases); 3057 3058static ssize_t slabs_show(struct kmem_cache *s, char *buf) 3059{ 3060 return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU); 3061} 3062SLAB_ATTR_RO(slabs); 3063 3064static ssize_t partial_show(struct kmem_cache *s, char *buf) 3065{ 3066 return slab_objects(s, buf, SO_PARTIAL); 3067} 3068SLAB_ATTR_RO(partial); 3069 3070static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf) 3071{ 3072 return slab_objects(s, buf, SO_CPU); 3073} 3074SLAB_ATTR_RO(cpu_slabs); 3075 3076static 
ssize_t objects_show(struct kmem_cache *s, char *buf) 3077{ 3078 return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS); 3079} 3080SLAB_ATTR_RO(objects); 3081 3082static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) 3083{ 3084 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); 3085} 3086 3087static ssize_t sanity_checks_store(struct kmem_cache *s, 3088 const char *buf, size_t length) 3089{ 3090 s->flags &= ~SLAB_DEBUG_FREE; 3091 if (buf[0] == '1') 3092 s->flags |= SLAB_DEBUG_FREE; 3093 return length; 3094} 3095SLAB_ATTR(sanity_checks); 3096 3097static ssize_t trace_show(struct kmem_cache *s, char *buf) 3098{ 3099 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); 3100} 3101 3102static ssize_t trace_store(struct kmem_cache *s, const char *buf, 3103 size_t length) 3104{ 3105 s->flags &= ~SLAB_TRACE; 3106 if (buf[0] == '1') 3107 s->flags |= SLAB_TRACE; 3108 return length; 3109} 3110SLAB_ATTR(trace); 3111 3112static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 3113{ 3114 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 3115} 3116 3117static ssize_t reclaim_account_store(struct kmem_cache *s, 3118 const char *buf, size_t length) 3119{ 3120 s->flags &= ~SLAB_RECLAIM_ACCOUNT; 3121 if (buf[0] == '1') 3122 s->flags |= SLAB_RECLAIM_ACCOUNT; 3123 return length; 3124} 3125SLAB_ATTR(reclaim_account); 3126 3127static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) 3128{ 3129 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); 3130} 3131SLAB_ATTR_RO(hwcache_align); 3132 3133#ifdef CONFIG_ZONE_DMA 3134static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) 3135{ 3136 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); 3137} 3138SLAB_ATTR_RO(cache_dma); 3139#endif 3140 3141static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) 3142{ 3143 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); 3144} 3145SLAB_ATTR_RO(destroy_by_rcu); 3146 3147static ssize_t red_zone_show(struct kmem_cache *s, char *buf) 3148{ 3149 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); 3150} 3151 3152static ssize_t red_zone_store(struct kmem_cache *s, 3153 const char *buf, size_t length) 3154{ 3155 if (any_slab_objects(s)) 3156 return -EBUSY; 3157 3158 s->flags &= ~SLAB_RED_ZONE; 3159 if (buf[0] == '1') 3160 s->flags |= SLAB_RED_ZONE; 3161 calculate_sizes(s); 3162 return length; 3163} 3164SLAB_ATTR(red_zone); 3165 3166static ssize_t poison_show(struct kmem_cache *s, char *buf) 3167{ 3168 return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON)); 3169} 3170 3171static ssize_t poison_store(struct kmem_cache *s, 3172 const char *buf, size_t length) 3173{ 3174 if (any_slab_objects(s)) 3175 return -EBUSY; 3176 3177 s->flags &= ~SLAB_POISON; 3178 if (buf[0] == '1') 3179 s->flags |= SLAB_POISON; 3180 calculate_sizes(s); 3181 return length; 3182} 3183SLAB_ATTR(poison); 3184 3185static ssize_t store_user_show(struct kmem_cache *s, char *buf) 3186{ 3187 return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); 3188} 3189 3190static ssize_t store_user_store(struct kmem_cache *s, 3191 const char *buf, size_t length) 3192{ 3193 if (any_slab_objects(s)) 3194 return -EBUSY; 3195 3196 s->flags &= ~SLAB_STORE_USER; 3197 if (buf[0] == '1') 3198 s->flags |= SLAB_STORE_USER; 3199 calculate_sizes(s); 3200 return length; 3201} 3202SLAB_ATTR(store_user); 3203 3204static ssize_t validate_show(struct kmem_cache *s, char *buf) 3205{ 3206 return 0; 3207} 3208 3209static ssize_t validate_store(struct kmem_cache *s, 
3210 const char *buf, size_t length) 3211{ 3212 if (buf[0] == '1') 3213 validate_slab_cache(s); 3214 else 3215 return -EINVAL; 3216 return length; 3217} 3218SLAB_ATTR(validate); 3219 3220static ssize_t shrink_show(struct kmem_cache *s, char *buf) 3221{ 3222 return 0; 3223} 3224 3225static ssize_t shrink_store(struct kmem_cache *s, 3226 const char *buf, size_t length) 3227{ 3228 if (buf[0] == '1') { 3229 int rc = kmem_cache_shrink(s); 3230 3231 if (rc) 3232 return rc; 3233 } else 3234 return -EINVAL; 3235 return length; 3236} 3237SLAB_ATTR(shrink); 3238 3239static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) 3240{ 3241 if (!(s->flags & SLAB_STORE_USER)) 3242 return -ENOSYS; 3243 return list_locations(s, buf, TRACK_ALLOC); 3244} 3245SLAB_ATTR_RO(alloc_calls); 3246 3247static ssize_t free_calls_show(struct kmem_cache *s, char *buf) 3248{ 3249 if (!(s->flags & SLAB_STORE_USER)) 3250 return -ENOSYS; 3251 return list_locations(s, buf, TRACK_FREE); 3252} 3253SLAB_ATTR_RO(free_calls); 3254 3255#ifdef CONFIG_NUMA 3256static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf) 3257{ 3258 return sprintf(buf, "%d\n", s->defrag_ratio / 10); 3259} 3260 3261static ssize_t defrag_ratio_store(struct kmem_cache *s, 3262 const char *buf, size_t length) 3263{ 3264 int n = simple_strtoul(buf, NULL, 10); 3265 3266 if (n < 100) 3267 s->defrag_ratio = n * 10; 3268 return length; 3269} 3270SLAB_ATTR(defrag_ratio); 3271#endif 3272 3273static struct attribute * slab_attrs[] = { 3274 &slab_size_attr.attr, 3275 &object_size_attr.attr, 3276 &objs_per_slab_attr.attr, 3277 &order_attr.attr, 3278 &objects_attr.attr, 3279 &slabs_attr.attr, 3280 &partial_attr.attr, 3281 &cpu_slabs_attr.attr, 3282 &ctor_attr.attr, 3283 &dtor_attr.attr, 3284 &aliases_attr.attr, 3285 &align_attr.attr, 3286 &sanity_checks_attr.attr, 3287 &trace_attr.attr, 3288 &hwcache_align_attr.attr, 3289 &reclaim_account_attr.attr, 3290 &destroy_by_rcu_attr.attr, 3291 &red_zone_attr.attr, 3292 &poison_attr.attr, 3293 &store_user_attr.attr, 3294 &validate_attr.attr, 3295 &shrink_attr.attr, 3296 &alloc_calls_attr.attr, 3297 &free_calls_attr.attr, 3298#ifdef CONFIG_ZONE_DMA 3299 &cache_dma_attr.attr, 3300#endif 3301#ifdef CONFIG_NUMA 3302 &defrag_ratio_attr.attr, 3303#endif 3304 NULL 3305}; 3306 3307static struct attribute_group slab_attr_group = { 3308 .attrs = slab_attrs, 3309}; 3310 3311static ssize_t slab_attr_show(struct kobject *kobj, 3312 struct attribute *attr, 3313 char *buf) 3314{ 3315 struct slab_attribute *attribute; 3316 struct kmem_cache *s; 3317 int err; 3318 3319 attribute = to_slab_attr(attr); 3320 s = to_slab(kobj); 3321 3322 if (!attribute->show) 3323 return -EIO; 3324 3325 err = attribute->show(s, buf); 3326 3327 return err; 3328} 3329 3330static ssize_t slab_attr_store(struct kobject *kobj, 3331 struct attribute *attr, 3332 const char *buf, size_t len) 3333{ 3334 struct slab_attribute *attribute; 3335 struct kmem_cache *s; 3336 int err; 3337 3338 attribute = to_slab_attr(attr); 3339 s = to_slab(kobj); 3340 3341 if (!attribute->store) 3342 return -EIO; 3343 3344 err = attribute->store(s, buf, len); 3345 3346 return err; 3347} 3348 3349static struct sysfs_ops slab_sysfs_ops = { 3350 .show = slab_attr_show, 3351 .store = slab_attr_store, 3352}; 3353 3354static struct kobj_type slab_ktype = { 3355 .sysfs_ops = &slab_sysfs_ops, 3356}; 3357 3358static int uevent_filter(struct kset *kset, struct kobject *kobj) 3359{ 3360 struct kobj_type *ktype = get_ktype(kobj); 3361 3362 if (ktype == &slab_ktype) 3363 return 1; 3364 return 0; 
3365} 3366 3367static struct kset_uevent_ops slab_uevent_ops = { 3368 .filter = uevent_filter, 3369}; 3370 3371decl_subsys(slab, &slab_ktype, &slab_uevent_ops); 3372 3373#define ID_STR_LENGTH 64 3374 3375/* Create a unique string id for a slab cache: 3376 * format 3377 * :[flags-]size 3378 */ 3379static char *create_unique_id(struct kmem_cache *s) 3380{ 3381 char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); 3382 char *p = name; 3383 3384 BUG_ON(!name); 3385 3386 *p++ = ':'; 3387 /* 3388 * First flags affecting slabcache operations. We will only 3389 * get here for aliasable slabs so we do not need to support 3390 * too many flags. The flags here must cover all flags that 3391 * are matched during merging to guarantee that the id is 3392 * unique. 3393 */ 3394 if (s->flags & SLAB_CACHE_DMA) 3395 *p++ = 'd'; 3396 if (s->flags & SLAB_RECLAIM_ACCOUNT) 3397 *p++ = 'a'; 3398 if (s->flags & SLAB_DEBUG_FREE) 3399 *p++ = 'F'; 3400 if (p != name + 1) 3401 *p++ = '-'; 3402 p += sprintf(p, "%07d", s->size); 3403 BUG_ON(p > name + ID_STR_LENGTH - 1); 3404 return name; 3405} 3406 3407static int sysfs_slab_add(struct kmem_cache *s) 3408{ 3409 int err; 3410 const char *name; 3411 int unmergeable; 3412 3413 if (slab_state < SYSFS) 3414 /* Defer until later */ 3415 return 0; 3416 3417 unmergeable = slab_unmergeable(s); 3418 if (unmergeable) { 3419 /* 3420 * Slabcache can never be merged so we can use the name proper. 3421 * This is typically the case for debug situations. In that 3422 * case we can catch duplicate names easily. 3423 */ 3424 sysfs_remove_link(&slab_subsys.kset.kobj, s->name); 3425 name = s->name; 3426 } else { 3427 /* 3428 * Create a unique name for the slab as a target 3429 * for the symlinks. 3430 */ 3431 name = create_unique_id(s); 3432 } 3433 3434 kobj_set_kset_s(s, slab_subsys); 3435 kobject_set_name(&s->kobj, name); 3436 kobject_init(&s->kobj); 3437 err = kobject_add(&s->kobj); 3438 if (err) 3439 return err; 3440 3441 err = sysfs_create_group(&s->kobj, &slab_attr_group); 3442 if (err) 3443 return err; 3444 kobject_uevent(&s->kobj, KOBJ_ADD); 3445 if (!unmergeable) { 3446 /* Setup first alias */ 3447 sysfs_slab_alias(s, s->name); 3448 kfree(name); 3449 } 3450 return 0; 3451} 3452 3453static void sysfs_slab_remove(struct kmem_cache *s) 3454{ 3455 kobject_uevent(&s->kobj, KOBJ_REMOVE); 3456 kobject_del(&s->kobj); 3457} 3458 3459/* 3460 * Need to buffer aliases during bootup until sysfs becomes 3461 * available lest we lose that information. 3462 */ 3463struct saved_alias { 3464 struct kmem_cache *s; 3465 const char *name; 3466 struct saved_alias *next; 3467}; 3468 3469struct saved_alias *alias_list; 3470 3471static int sysfs_slab_alias(struct kmem_cache *s, const char *name) 3472{ 3473 struct saved_alias *al; 3474 3475 if (slab_state == SYSFS) { 3476 /* 3477 * If we have a leftover link then remove it.
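 *
 * An alias is just a symlink in the slab sysfs directory that points
 * at the cache the name was merged into, e.g. (path, name and size
 * purely illustrative):
 *
 *	/sys/slab/my_cache -> :0000192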
3478 */ 3479 sysfs_remove_link(&slab_subsys.kset.kobj, name); 3480 return sysfs_create_link(&slab_subsys.kset.kobj, 3481 &s->kobj, name); 3482 } 3483 3484 al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL); 3485 if (!al) 3486 return -ENOMEM; 3487 3488 al->s = s; 3489 al->name = name; 3490 al->next = alias_list; 3491 alias_list = al; 3492 return 0; 3493} 3494 3495static int __init slab_sysfs_init(void) 3496{ 3497 int err; 3498 3499 err = subsystem_register(&slab_subsys); 3500 if (err) { 3501 printk(KERN_ERR "Cannot register slab subsystem.\n"); 3502 return -ENOSYS; 3503 } 3504 3505 finish_bootstrap(); 3506 3507 while (alias_list) { 3508 struct saved_alias *al = alias_list; 3509 3510 alias_list = alias_list->next; 3511 err = sysfs_slab_alias(al->s, al->name); 3512 BUG_ON(err); 3513 kfree(al); 3514 } 3515 3516 resiliency_test(); 3517 return 0; 3518} 3519 3520__initcall(slab_sysfs_init); 3521#else 3522__initcall(finish_bootstrap); 3523#endif 3524
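
/*
 * A short sketch of the sysfs interface defined above as seen from user
 * space once slab_sysfs_init() has run. The cache name, the values and
 * the mount point are illustrative; only the attribute names are
 * defined by this file:
 *
 *	# cat /sys/slab/kmalloc-192/objs_per_slab
 *	21
 *	# cat /sys/slab/kmalloc-192/order
 *	0
 *	# echo 1 > /sys/slab/kmalloc-192/validate
 *
 * Writing "1" to validate triggers validate_slab_cache() on every node;
 * any other value makes validate_store() return -EINVAL.
 */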