slub.c revision 65c02d4cfbbd10188ded3d6577922ab034d943ba
/*
 * SLUB: A slab allocator that limits cache line use instead of queuing
 * objects in per cpu and per node lists.
 *
 * The allocator synchronizes using per slab locks and only
 * uses a centralized lock to manage a pool of partial slabs.
 *
 * (C) 2007 SGI, Christoph Lameter <clameter@sgi.com>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/bit_spinlock.h>
#include <linux/interrupt.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
#include <linux/ctype.h>
#include <linux/kallsyms.h>

/*
 * Lock order:
 *   1. slab_lock(page)
 *   2. node->list_lock
 *
 * The slab_lock protects operations on the objects of a particular
 * slab and its metadata in the page struct. If the slab lock
 * has been taken then no allocations nor frees can be performed
 * on the objects in the slab nor can the slab be added to or removed
 * from the partial or full lists since this would mean modifying
 * the page struct of the slab.
 *
 * The list_lock protects the partial and full list on each node and
 * the partial slab counter. If taken then no new slabs may be added or
 * removed from the lists nor may the number of partial slabs be modified.
 * (Note that the total number of slabs is an atomic value that may be
 * modified without taking the list lock.)
 *
 * The list_lock is a centralized lock and thus we avoid taking it as
 * much as possible. As long as SLUB does not have to handle partial
 * slabs, operations can continue without any centralized lock. F.e.
 * allocating a long series of objects that fill up slabs does not require
 * the list lock.
 *
 * The lock order is sometimes inverted when we are trying to get a slab
 * off a list. We take the list_lock and then look for a page on the list
 * to use. While we do that objects in the slabs may be freed. We can
 * only operate on the slab if we have also taken the slab_lock. So we use
 * a slab_trylock() on the slab. If trylock was successful then no frees
 * can occur anymore and we can use the slab for allocations etc. If the
 * slab_trylock() does not succeed then frees are in progress in the slab
 * and we must stay away from it for a while since we may cause a bouncing
 * cacheline if we try to acquire the lock. So go on to the next slab.
 * If all pages are busy then we may allocate a new slab instead of reusing
 * a partial slab. A new slab has no one operating on it and thus there is
 * no danger of cacheline contention.
 *
 * Interrupts are disabled during allocation and deallocation in order to
 * make the slab allocator safe to use in the context of an irq. In addition
 * interrupts are disabled to ensure that the processor does not change
 * while handling per cpu slabs, due to kernel preemption.
 *
 * SLUB assigns one slab for allocation to each processor.
 * Allocations only occur from these slabs called cpu slabs.
 *
 * Slabs with free elements are kept on a partial list.
 * There is no list for full slabs. If an object in a full slab is
 * freed then the slab will show up again on the partial lists.
 * Otherwise there is no need to track full slabs unless we have to
 * track them for debugging purposes.
 *
 * Slabs are freed when they become empty. Teardown and setup are
 * minimal so we rely on the page allocator's per cpu caches for
 * fast frees and allocs.
 *
 * Overloading of page flags that are otherwise used for LRU management:
 *
 * PageActive		The slab is used as a cpu cache. Allocations
 *			may be performed from the slab. The slab is not
 *			on any slab list and cannot be moved onto one.
 *
 * PageError		Slab requires special handling due to debug
 *			options set. This moves slab handling out of
 *			the fast path.
 */
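/*
 * Illustrative sketch (a simplified outline only, not used by the code in
 * this file; the got_slab label is hypothetical): the inverted lock order
 * described above - take the node's list_lock first, then trylock each
 * slab - looks roughly like this when pulling a slab off a partial list.
 * The real implementation is in lock_and_del_slab() and get_partial_node()
 * below.
 *
 *	spin_lock(&n->list_lock);
 *	list_for_each_entry(page, &n->partial, lru)
 *		if (slab_trylock(page)) {
 *			list_del(&page->lru);
 *			n->nr_partial--;
 *			goto got_slab;		(slab is now locked)
 *		}
 *	page = NULL;				(every slab was busy)
 *	spin_unlock(&n->list_lock);
 */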
/*
 * Issues still to be resolved:
 *
 * - The per cpu array is updated for each new slab and is a remote
 *   cacheline for most nodes. This could become a bouncing cacheline given
 *   enough frequent updates. There are 16 pointers in a cacheline, so at
 *   most 16 cpus could compete. Likely okay.
 *
 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
 *
 * - Variable sizing of the per node arrays
 */

/* Enable to test recovery from slab corruption on boot */
#undef SLUB_RESILIENCY_TEST

#if PAGE_SHIFT <= 12

/*
 * Small page size. Make sure that we do not fragment memory
 */
#define DEFAULT_MAX_ORDER 1
#define DEFAULT_MIN_OBJECTS 4

#else

/*
 * Large page machines are customarily able to handle larger
 * page orders.
 */
#define DEFAULT_MAX_ORDER 2
#define DEFAULT_MIN_OBJECTS 8

#endif

/*
 * Minimum number of partial slabs. These will be left on the partial
 * lists even if they are empty. kmem_cache_shrink may reclaim them.
 */
#define MIN_PARTIAL 2

/*
 * Maximum number of desirable partial slabs.
 * The existence of more partial slabs makes kmem_cache_shrink
 * sort the partial list by the number of objects in them.
 */
#define MAX_PARTIAL 10

#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
				SLAB_POISON | SLAB_STORE_USER)
/*
 * Set of flags that will prevent slab merging
 */
#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
		SLAB_TRACE | SLAB_DESTROY_BY_RCU)

#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
		SLAB_CACHE_DMA)

#ifndef ARCH_KMALLOC_MINALIGN
#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
#endif

#ifndef ARCH_SLAB_MINALIGN
#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
#endif

/* Internal SLUB flags */
#define __OBJECT_POISON 0x80000000	/* Poison object */

/* Not all arches define cache_line_size */
#ifndef cache_line_size
#define cache_line_size() L1_CACHE_BYTES
#endif

static int kmem_size = sizeof(struct kmem_cache);

#ifdef CONFIG_SMP
static struct notifier_block slab_notifier;
#endif

static enum {
	DOWN,		/* No slab functionality available */
	PARTIAL,	/* kmem_cache_open() works but kmalloc does not */
	UP,		/* Everything works */
	SYSFS		/* Sysfs up */
} slab_state = DOWN;

/* A list of all slab caches on the system */
static DECLARE_RWSEM(slub_lock);
LIST_HEAD(slab_caches);

#ifdef CONFIG_SYSFS
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
static void sysfs_slab_remove(struct kmem_cache *);
#else
static int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; }
static void sysfs_slab_remove(struct kmem_cache *s) {}
#endif

/********************************************************************
 *			Core slab cache functions
*******************************************************************/ 195 196int slab_is_available(void) 197{ 198 return slab_state >= UP; 199} 200 201static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) 202{ 203#ifdef CONFIG_NUMA 204 return s->node[node]; 205#else 206 return &s->local_node; 207#endif 208} 209 210/* 211 * Object debugging 212 */ 213static void print_section(char *text, u8 *addr, unsigned int length) 214{ 215 int i, offset; 216 int newline = 1; 217 char ascii[17]; 218 219 ascii[16] = 0; 220 221 for (i = 0; i < length; i++) { 222 if (newline) { 223 printk(KERN_ERR "%10s 0x%p: ", text, addr + i); 224 newline = 0; 225 } 226 printk(" %02x", addr[i]); 227 offset = i % 16; 228 ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; 229 if (offset == 15) { 230 printk(" %s\n",ascii); 231 newline = 1; 232 } 233 } 234 if (!newline) { 235 i %= 16; 236 while (i < 16) { 237 printk(" "); 238 ascii[i] = ' '; 239 i++; 240 } 241 printk(" %s\n", ascii); 242 } 243} 244 245/* 246 * Slow version of get and set free pointer. 247 * 248 * This requires touching the cache lines of kmem_cache. 249 * The offset can also be obtained from the page. In that 250 * case it is in the cacheline that we already need to touch. 251 */ 252static void *get_freepointer(struct kmem_cache *s, void *object) 253{ 254 return *(void **)(object + s->offset); 255} 256 257static void set_freepointer(struct kmem_cache *s, void *object, void *fp) 258{ 259 *(void **)(object + s->offset) = fp; 260} 261 262/* 263 * Tracking user of a slab. 264 */ 265struct track { 266 void *addr; /* Called from address */ 267 int cpu; /* Was running on cpu */ 268 int pid; /* Pid context */ 269 unsigned long when; /* When did the operation occur */ 270}; 271 272enum track_item { TRACK_ALLOC, TRACK_FREE }; 273 274static struct track *get_track(struct kmem_cache *s, void *object, 275 enum track_item alloc) 276{ 277 struct track *p; 278 279 if (s->offset) 280 p = object + s->offset + sizeof(void *); 281 else 282 p = object + s->inuse; 283 284 return p + alloc; 285} 286 287static void set_track(struct kmem_cache *s, void *object, 288 enum track_item alloc, void *addr) 289{ 290 struct track *p; 291 292 if (s->offset) 293 p = object + s->offset + sizeof(void *); 294 else 295 p = object + s->inuse; 296 297 p += alloc; 298 if (addr) { 299 p->addr = addr; 300 p->cpu = smp_processor_id(); 301 p->pid = current ? 
current->pid : -1; 302 p->when = jiffies; 303 } else 304 memset(p, 0, sizeof(struct track)); 305} 306 307static void init_tracking(struct kmem_cache *s, void *object) 308{ 309 if (s->flags & SLAB_STORE_USER) { 310 set_track(s, object, TRACK_FREE, NULL); 311 set_track(s, object, TRACK_ALLOC, NULL); 312 } 313} 314 315static void print_track(const char *s, struct track *t) 316{ 317 if (!t->addr) 318 return; 319 320 printk(KERN_ERR "%s: ", s); 321 __print_symbol("%s", (unsigned long)t->addr); 322 printk(" jiffies_ago=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid); 323} 324 325static void print_trailer(struct kmem_cache *s, u8 *p) 326{ 327 unsigned int off; /* Offset of last byte */ 328 329 if (s->flags & SLAB_RED_ZONE) 330 print_section("Redzone", p + s->objsize, 331 s->inuse - s->objsize); 332 333 printk(KERN_ERR "FreePointer 0x%p -> 0x%p\n", 334 p + s->offset, 335 get_freepointer(s, p)); 336 337 if (s->offset) 338 off = s->offset + sizeof(void *); 339 else 340 off = s->inuse; 341 342 if (s->flags & SLAB_STORE_USER) { 343 print_track("Last alloc", get_track(s, p, TRACK_ALLOC)); 344 print_track("Last free ", get_track(s, p, TRACK_FREE)); 345 off += 2 * sizeof(struct track); 346 } 347 348 if (off != s->size) 349 /* Beginning of the filler is the free pointer */ 350 print_section("Filler", p + off, s->size - off); 351} 352 353static void object_err(struct kmem_cache *s, struct page *page, 354 u8 *object, char *reason) 355{ 356 u8 *addr = page_address(page); 357 358 printk(KERN_ERR "*** SLUB %s: %s@0x%p slab 0x%p\n", 359 s->name, reason, object, page); 360 printk(KERN_ERR " offset=%tu flags=0x%04lx inuse=%u freelist=0x%p\n", 361 object - addr, page->flags, page->inuse, page->freelist); 362 if (object > addr + 16) 363 print_section("Bytes b4", object - 16, 16); 364 print_section("Object", object, min(s->objsize, 128)); 365 print_trailer(s, object); 366 dump_stack(); 367} 368 369static void slab_err(struct kmem_cache *s, struct page *page, char *reason, ...) 370{ 371 va_list args; 372 char buf[100]; 373 374 va_start(args, reason); 375 vsnprintf(buf, sizeof(buf), reason, args); 376 va_end(args); 377 printk(KERN_ERR "*** SLUB %s: %s in slab @0x%p\n", s->name, buf, 378 page); 379 dump_stack(); 380} 381 382static void init_object(struct kmem_cache *s, void *object, int active) 383{ 384 u8 *p = object; 385 386 if (s->flags & __OBJECT_POISON) { 387 memset(p, POISON_FREE, s->objsize - 1); 388 p[s->objsize -1] = POISON_END; 389 } 390 391 if (s->flags & SLAB_RED_ZONE) 392 memset(p + s->objsize, 393 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE, 394 s->inuse - s->objsize); 395} 396 397static int check_bytes(u8 *start, unsigned int value, unsigned int bytes) 398{ 399 while (bytes) { 400 if (*start != (u8)value) 401 return 0; 402 start++; 403 bytes--; 404 } 405 return 1; 406} 407 408 409static int check_valid_pointer(struct kmem_cache *s, struct page *page, 410 void *object) 411{ 412 void *base; 413 414 if (!object) 415 return 1; 416 417 base = page_address(page); 418 if (object < base || object >= base + s->objects * s->size || 419 (object - base) % s->size) { 420 return 0; 421 } 422 423 return 1; 424} 425 426/* 427 * Object layout: 428 * 429 * object address 430 * Bytes of the object to be managed. 431 * If the freepointer may overlay the object then the free 432 * pointer is the first word of the object. 433 * Poisoning uses 0x6b (POISON_FREE) and the last byte is 434 * 0xa5 (POISON_END) 435 * 436 * object + s->objsize 437 * Padding to reach word boundary. This is also used for Redzoning. 
438 * Padding is extended to word size if Redzoning is enabled 439 * and objsize == inuse. 440 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with 441 * 0xcc (RED_ACTIVE) for objects in use. 442 * 443 * object + s->inuse 444 * A. Free pointer (if we cannot overwrite object on free) 445 * B. Tracking data for SLAB_STORE_USER 446 * C. Padding to reach required alignment boundary 447 * Padding is done using 0x5a (POISON_INUSE) 448 * 449 * object + s->size 450 * 451 * If slabcaches are merged then the objsize and inuse boundaries are to 452 * be ignored. And therefore no slab options that rely on these boundaries 453 * may be used with merged slabcaches. 454 */ 455 456static void restore_bytes(struct kmem_cache *s, char *message, u8 data, 457 void *from, void *to) 458{ 459 printk(KERN_ERR "@@@ SLUB %s: Restoring %s (0x%x) from 0x%p-0x%p\n", 460 s->name, message, data, from, to - 1); 461 memset(from, data, to - from); 462} 463 464static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) 465{ 466 unsigned long off = s->inuse; /* The end of info */ 467 468 if (s->offset) 469 /* Freepointer is placed after the object. */ 470 off += sizeof(void *); 471 472 if (s->flags & SLAB_STORE_USER) 473 /* We also have user information there */ 474 off += 2 * sizeof(struct track); 475 476 if (s->size == off) 477 return 1; 478 479 if (check_bytes(p + off, POISON_INUSE, s->size - off)) 480 return 1; 481 482 object_err(s, page, p, "Object padding check fails"); 483 484 /* 485 * Restore padding 486 */ 487 restore_bytes(s, "object padding", POISON_INUSE, p + off, p + s->size); 488 return 0; 489} 490 491static int slab_pad_check(struct kmem_cache *s, struct page *page) 492{ 493 u8 *p; 494 int length, remainder; 495 496 if (!(s->flags & SLAB_POISON)) 497 return 1; 498 499 p = page_address(page); 500 length = s->objects * s->size; 501 remainder = (PAGE_SIZE << s->order) - length; 502 if (!remainder) 503 return 1; 504 505 if (!check_bytes(p + length, POISON_INUSE, remainder)) { 506 slab_err(s, page, "Padding check failed"); 507 restore_bytes(s, "slab padding", POISON_INUSE, p + length, 508 p + length + remainder); 509 return 0; 510 } 511 return 1; 512} 513 514static int check_object(struct kmem_cache *s, struct page *page, 515 void *object, int active) 516{ 517 u8 *p = object; 518 u8 *endobject = object + s->objsize; 519 520 if (s->flags & SLAB_RED_ZONE) { 521 unsigned int red = 522 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE; 523 524 if (!check_bytes(endobject, red, s->inuse - s->objsize)) { 525 object_err(s, page, object, 526 active ? "Redzone Active" : "Redzone Inactive"); 527 restore_bytes(s, "redzone", red, 528 endobject, object + s->inuse); 529 return 0; 530 } 531 } else { 532 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse && 533 !check_bytes(endobject, POISON_INUSE, 534 s->inuse - s->objsize)) { 535 object_err(s, page, p, "Alignment padding check fails"); 536 /* 537 * Fix it so that there will not be another report. 538 * 539 * Hmmm... We may be corrupting an object that now expects 540 * to be longer than allowed. 
		 */
		restore_bytes(s, "alignment padding", POISON_INUSE,
			endobject, object + s->inuse);
		}
	}

	if (s->flags & SLAB_POISON) {
		if (!active && (s->flags & __OBJECT_POISON) &&
			(!check_bytes(p, POISON_FREE, s->objsize - 1) ||
				p[s->objsize - 1] != POISON_END)) {

			object_err(s, page, p, "Poison check failed");
			restore_bytes(s, "Poison", POISON_FREE,
						p, p + s->objsize - 1);
			restore_bytes(s, "Poison", POISON_END,
					p + s->objsize - 1, p + s->objsize);
			return 0;
		}
		/*
		 * check_pad_bytes cleans up on its own.
		 */
		check_pad_bytes(s, page, p);
	}

	if (!s->offset && active)
		/*
		 * Object and freepointer overlap. Cannot check
		 * freepointer while object is allocated.
		 */
		return 1;

	/* Check free pointer validity */
	if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
		object_err(s, page, p, "Freepointer corrupt");
		/*
		 * No choice but to zap it and thus lose the remainder
		 * of the free objects in this slab. May cause
		 * another error because the object count may be
		 * wrong now.
		 */
		set_freepointer(s, p, NULL);
		return 0;
	}
	return 1;
}

static int check_slab(struct kmem_cache *s, struct page *page)
{
	VM_BUG_ON(!irqs_disabled());

	if (!PageSlab(page)) {
		slab_err(s, page, "Not a valid slab page flags=%lx "
			"mapping=0x%p count=%d", page->flags, page->mapping,
			page_count(page));
		return 0;
	}
	if (page->offset * sizeof(void *) != s->offset) {
		slab_err(s, page, "Corrupted offset %lu flags=0x%lx "
			"mapping=0x%p count=%d",
			(unsigned long)(page->offset * sizeof(void *)),
			page->flags,
			page->mapping,
			page_count(page));
		return 0;
	}
	if (page->inuse > s->objects) {
		slab_err(s, page, "inuse %u > max %u flags=%lx "
			"mapping=0x%p count=%d",
			page->inuse, s->objects, page->flags,
			page->mapping, page_count(page));
		return 0;
	}
	/* slab_pad_check() fixes things up after itself */
	slab_pad_check(s, page);
	return 1;
}

/*
 * Determine if a certain object on a page is on the freelist and
 * therefore free. Must hold the slab lock for cpu slabs to
 * guarantee that the chains are consistent.
 */
static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
{
	int nr = 0;
	void *fp = page->freelist;
	void *object = NULL;

	while (fp && nr <= s->objects) {
		if (fp == search)
			return 1;
		if (!check_valid_pointer(s, page, fp)) {
			if (object) {
				object_err(s, page, object,
					"Freechain corrupt");
				set_freepointer(s, object, NULL);
				break;
			} else {
				slab_err(s, page, "Freepointer 0x%p corrupt",
									fp);
				page->freelist = NULL;
				page->inuse = s->objects;
				printk(KERN_ERR "@@@ SLUB %s: Freelist "
					"cleared. Slab 0x%p\n",
					s->name, page);
				return 0;
			}
			break;
		}
		object = fp;
		fp = get_freepointer(s, object);
		nr++;
	}

	if (page->inuse != s->objects - nr) {
		slab_err(s, page, "Wrong object count. Counter is %d but "
			"counted were %d", page->inuse,
							s->objects - nr);
		page->inuse = s->objects - nr;
		printk(KERN_ERR "@@@ SLUB %s: Object count adjusted.
" 661 "Slab @0x%p\n", s->name, page); 662 } 663 return search == NULL; 664} 665 666/* 667 * Tracking of fully allocated slabs for debugging 668 */ 669static void add_full(struct kmem_cache_node *n, struct page *page) 670{ 671 spin_lock(&n->list_lock); 672 list_add(&page->lru, &n->full); 673 spin_unlock(&n->list_lock); 674} 675 676static void remove_full(struct kmem_cache *s, struct page *page) 677{ 678 struct kmem_cache_node *n; 679 680 if (!(s->flags & SLAB_STORE_USER)) 681 return; 682 683 n = get_node(s, page_to_nid(page)); 684 685 spin_lock(&n->list_lock); 686 list_del(&page->lru); 687 spin_unlock(&n->list_lock); 688} 689 690static int alloc_object_checks(struct kmem_cache *s, struct page *page, 691 void *object) 692{ 693 if (!check_slab(s, page)) 694 goto bad; 695 696 if (object && !on_freelist(s, page, object)) { 697 slab_err(s, page, "Object 0x%p already allocated", object); 698 goto bad; 699 } 700 701 if (!check_valid_pointer(s, page, object)) { 702 object_err(s, page, object, "Freelist Pointer check fails"); 703 goto bad; 704 } 705 706 if (!object) 707 return 1; 708 709 if (!check_object(s, page, object, 0)) 710 goto bad; 711 712 return 1; 713bad: 714 if (PageSlab(page)) { 715 /* 716 * If this is a slab page then lets do the best we can 717 * to avoid issues in the future. Marking all objects 718 * as used avoids touching the remainder. 719 */ 720 printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n", 721 s->name, page); 722 page->inuse = s->objects; 723 page->freelist = NULL; 724 /* Fix up fields that may be corrupted */ 725 page->offset = s->offset / sizeof(void *); 726 } 727 return 0; 728} 729 730static int free_object_checks(struct kmem_cache *s, struct page *page, 731 void *object) 732{ 733 if (!check_slab(s, page)) 734 goto fail; 735 736 if (!check_valid_pointer(s, page, object)) { 737 slab_err(s, page, "Invalid object pointer 0x%p", object); 738 goto fail; 739 } 740 741 if (on_freelist(s, page, object)) { 742 slab_err(s, page, "Object 0x%p already free", object); 743 goto fail; 744 } 745 746 if (!check_object(s, page, object, 1)) 747 return 0; 748 749 if (unlikely(s != page->slab)) { 750 if (!PageSlab(page)) 751 slab_err(s, page, "Attempt to free object(0x%p) " 752 "outside of slab", object); 753 else 754 if (!page->slab) { 755 printk(KERN_ERR 756 "SLUB <none>: no slab for object 0x%p.\n", 757 object); 758 dump_stack(); 759 } 760 else 761 slab_err(s, page, "object at 0x%p belongs " 762 "to slab %s", object, page->slab->name); 763 goto fail; 764 } 765 return 1; 766fail: 767 printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n", 768 s->name, page, object); 769 return 0; 770} 771 772/* 773 * Slab allocation and freeing 774 */ 775static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) 776{ 777 struct page * page; 778 int pages = 1 << s->order; 779 780 if (s->order) 781 flags |= __GFP_COMP; 782 783 if (s->flags & SLAB_CACHE_DMA) 784 flags |= SLUB_DMA; 785 786 if (node == -1) 787 page = alloc_pages(flags, s->order); 788 else 789 page = alloc_pages_node(node, flags, s->order); 790 791 if (!page) 792 return NULL; 793 794 mod_zone_page_state(page_zone(page), 795 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
796 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 797 pages); 798 799 return page; 800} 801 802static void setup_object(struct kmem_cache *s, struct page *page, 803 void *object) 804{ 805 if (PageError(page)) { 806 init_object(s, object, 0); 807 init_tracking(s, object); 808 } 809 810 if (unlikely(s->ctor)) 811 s->ctor(object, s, SLAB_CTOR_CONSTRUCTOR); 812} 813 814static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) 815{ 816 struct page *page; 817 struct kmem_cache_node *n; 818 void *start; 819 void *end; 820 void *last; 821 void *p; 822 823 BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK)); 824 825 if (flags & __GFP_WAIT) 826 local_irq_enable(); 827 828 page = allocate_slab(s, flags & GFP_LEVEL_MASK, node); 829 if (!page) 830 goto out; 831 832 n = get_node(s, page_to_nid(page)); 833 if (n) 834 atomic_long_inc(&n->nr_slabs); 835 page->offset = s->offset / sizeof(void *); 836 page->slab = s; 837 page->flags |= 1 << PG_slab; 838 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | 839 SLAB_STORE_USER | SLAB_TRACE)) 840 page->flags |= 1 << PG_error; 841 842 start = page_address(page); 843 end = start + s->objects * s->size; 844 845 if (unlikely(s->flags & SLAB_POISON)) 846 memset(start, POISON_INUSE, PAGE_SIZE << s->order); 847 848 last = start; 849 for (p = start + s->size; p < end; p += s->size) { 850 setup_object(s, page, last); 851 set_freepointer(s, last, p); 852 last = p; 853 } 854 setup_object(s, page, last); 855 set_freepointer(s, last, NULL); 856 857 page->freelist = start; 858 page->inuse = 0; 859out: 860 if (flags & __GFP_WAIT) 861 local_irq_disable(); 862 return page; 863} 864 865static void __free_slab(struct kmem_cache *s, struct page *page) 866{ 867 int pages = 1 << s->order; 868 869 if (unlikely(PageError(page) || s->dtor)) { 870 void *start = page_address(page); 871 void *end = start + (pages << PAGE_SHIFT); 872 void *p; 873 874 slab_pad_check(s, page); 875 for (p = start; p <= end - s->size; p += s->size) { 876 if (s->dtor) 877 s->dtor(p, s, 0); 878 check_object(s, page, p, 0); 879 } 880 } 881 882 mod_zone_page_state(page_zone(page), 883 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
884 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 885 - pages); 886 887 page->mapping = NULL; 888 __free_pages(page, s->order); 889} 890 891static void rcu_free_slab(struct rcu_head *h) 892{ 893 struct page *page; 894 895 page = container_of((struct list_head *)h, struct page, lru); 896 __free_slab(page->slab, page); 897} 898 899static void free_slab(struct kmem_cache *s, struct page *page) 900{ 901 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { 902 /* 903 * RCU free overloads the RCU head over the LRU 904 */ 905 struct rcu_head *head = (void *)&page->lru; 906 907 call_rcu(head, rcu_free_slab); 908 } else 909 __free_slab(s, page); 910} 911 912static void discard_slab(struct kmem_cache *s, struct page *page) 913{ 914 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 915 916 atomic_long_dec(&n->nr_slabs); 917 reset_page_mapcount(page); 918 page->flags &= ~(1 << PG_slab | 1 << PG_error); 919 free_slab(s, page); 920} 921 922/* 923 * Per slab locking using the pagelock 924 */ 925static __always_inline void slab_lock(struct page *page) 926{ 927 bit_spin_lock(PG_locked, &page->flags); 928} 929 930static __always_inline void slab_unlock(struct page *page) 931{ 932 bit_spin_unlock(PG_locked, &page->flags); 933} 934 935static __always_inline int slab_trylock(struct page *page) 936{ 937 int rc = 1; 938 939 rc = bit_spin_trylock(PG_locked, &page->flags); 940 return rc; 941} 942 943/* 944 * Management of partially allocated slabs 945 */ 946static void add_partial_tail(struct kmem_cache_node *n, struct page *page) 947{ 948 spin_lock(&n->list_lock); 949 n->nr_partial++; 950 list_add_tail(&page->lru, &n->partial); 951 spin_unlock(&n->list_lock); 952} 953 954static void add_partial(struct kmem_cache_node *n, struct page *page) 955{ 956 spin_lock(&n->list_lock); 957 n->nr_partial++; 958 list_add(&page->lru, &n->partial); 959 spin_unlock(&n->list_lock); 960} 961 962static void remove_partial(struct kmem_cache *s, 963 struct page *page) 964{ 965 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 966 967 spin_lock(&n->list_lock); 968 list_del(&page->lru); 969 n->nr_partial--; 970 spin_unlock(&n->list_lock); 971} 972 973/* 974 * Lock page and remove it from the partial list 975 * 976 * Must hold list_lock 977 */ 978static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page) 979{ 980 if (slab_trylock(page)) { 981 list_del(&page->lru); 982 n->nr_partial--; 983 return 1; 984 } 985 return 0; 986} 987 988/* 989 * Try to get a partial slab from a specific node 990 */ 991static struct page *get_partial_node(struct kmem_cache_node *n) 992{ 993 struct page *page; 994 995 /* 996 * Racy check. If we mistakenly see no partial slabs then we 997 * just allocate an empty slab. If we mistakenly try to get a 998 * partial slab then get_partials() will return NULL. 999 */ 1000 if (!n || !n->nr_partial) 1001 return NULL; 1002 1003 spin_lock(&n->list_lock); 1004 list_for_each_entry(page, &n->partial, lru) 1005 if (lock_and_del_slab(n, page)) 1006 goto out; 1007 page = NULL; 1008out: 1009 spin_unlock(&n->list_lock); 1010 return page; 1011} 1012 1013/* 1014 * Get a page from somewhere. Search in increasing NUMA 1015 * distances. 1016 */ 1017static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) 1018{ 1019#ifdef CONFIG_NUMA 1020 struct zonelist *zonelist; 1021 struct zone **z; 1022 struct page *page; 1023 1024 /* 1025 * The defrag ratio allows to configure the tradeoffs between 1026 * inter node defragmentation and node local allocations. 
1027 * A lower defrag_ratio increases the tendency to do local 1028 * allocations instead of scanning throught the partial 1029 * lists on other nodes. 1030 * 1031 * If defrag_ratio is set to 0 then kmalloc() always 1032 * returns node local objects. If its higher then kmalloc() 1033 * may return off node objects in order to avoid fragmentation. 1034 * 1035 * A higher ratio means slabs may be taken from other nodes 1036 * thus reducing the number of partial slabs on those nodes. 1037 * 1038 * If /sys/slab/xx/defrag_ratio is set to 100 (which makes 1039 * defrag_ratio = 1000) then every (well almost) allocation 1040 * will first attempt to defrag slab caches on other nodes. This 1041 * means scanning over all nodes to look for partial slabs which 1042 * may be a bit expensive to do on every slab allocation. 1043 */ 1044 if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio) 1045 return NULL; 1046 1047 zonelist = &NODE_DATA(slab_node(current->mempolicy)) 1048 ->node_zonelists[gfp_zone(flags)]; 1049 for (z = zonelist->zones; *z; z++) { 1050 struct kmem_cache_node *n; 1051 1052 n = get_node(s, zone_to_nid(*z)); 1053 1054 if (n && cpuset_zone_allowed_hardwall(*z, flags) && 1055 n->nr_partial > MIN_PARTIAL) { 1056 page = get_partial_node(n); 1057 if (page) 1058 return page; 1059 } 1060 } 1061#endif 1062 return NULL; 1063} 1064 1065/* 1066 * Get a partial page, lock it and return it. 1067 */ 1068static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) 1069{ 1070 struct page *page; 1071 int searchnode = (node == -1) ? numa_node_id() : node; 1072 1073 page = get_partial_node(get_node(s, searchnode)); 1074 if (page || (flags & __GFP_THISNODE)) 1075 return page; 1076 1077 return get_any_partial(s, flags); 1078} 1079 1080/* 1081 * Move a page back to the lists. 1082 * 1083 * Must be called with the slab lock held. 1084 * 1085 * On exit the slab lock will have been dropped. 1086 */ 1087static void putback_slab(struct kmem_cache *s, struct page *page) 1088{ 1089 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1090 1091 if (page->inuse) { 1092 1093 if (page->freelist) 1094 add_partial(n, page); 1095 else if (PageError(page) && (s->flags & SLAB_STORE_USER)) 1096 add_full(n, page); 1097 slab_unlock(page); 1098 1099 } else { 1100 if (n->nr_partial < MIN_PARTIAL) { 1101 /* 1102 * Adding an empty page to the partial slabs in order 1103 * to avoid page allocator overhead. This page needs to 1104 * come after all the others that are not fully empty 1105 * in order to make sure that we do maximum 1106 * defragmentation. 1107 */ 1108 add_partial_tail(n, page); 1109 slab_unlock(page); 1110 } else { 1111 slab_unlock(page); 1112 discard_slab(s, page); 1113 } 1114 } 1115} 1116 1117/* 1118 * Remove the cpu slab 1119 */ 1120static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) 1121{ 1122 s->cpu_slab[cpu] = NULL; 1123 ClearPageActive(page); 1124 1125 putback_slab(s, page); 1126} 1127 1128static void flush_slab(struct kmem_cache *s, struct page *page, int cpu) 1129{ 1130 slab_lock(page); 1131 deactivate_slab(s, page, cpu); 1132} 1133 1134/* 1135 * Flush cpu slab. 1136 * Called from IPI handler with interrupts disabled. 
1137 */ 1138static void __flush_cpu_slab(struct kmem_cache *s, int cpu) 1139{ 1140 struct page *page = s->cpu_slab[cpu]; 1141 1142 if (likely(page)) 1143 flush_slab(s, page, cpu); 1144} 1145 1146static void flush_cpu_slab(void *d) 1147{ 1148 struct kmem_cache *s = d; 1149 int cpu = smp_processor_id(); 1150 1151 __flush_cpu_slab(s, cpu); 1152} 1153 1154static void flush_all(struct kmem_cache *s) 1155{ 1156#ifdef CONFIG_SMP 1157 on_each_cpu(flush_cpu_slab, s, 1, 1); 1158#else 1159 unsigned long flags; 1160 1161 local_irq_save(flags); 1162 flush_cpu_slab(s); 1163 local_irq_restore(flags); 1164#endif 1165} 1166 1167/* 1168 * slab_alloc is optimized to only modify two cachelines on the fast path 1169 * (aside from the stack): 1170 * 1171 * 1. The page struct 1172 * 2. The first cacheline of the object to be allocated. 1173 * 1174 * The only cache lines that are read (apart from code) is the 1175 * per cpu array in the kmem_cache struct. 1176 * 1177 * Fastpath is not possible if we need to get a new slab or have 1178 * debugging enabled (which means all slabs are marked with PageError) 1179 */ 1180static void *slab_alloc(struct kmem_cache *s, 1181 gfp_t gfpflags, int node, void *addr) 1182{ 1183 struct page *page; 1184 void **object; 1185 unsigned long flags; 1186 int cpu; 1187 1188 local_irq_save(flags); 1189 cpu = smp_processor_id(); 1190 page = s->cpu_slab[cpu]; 1191 if (!page) 1192 goto new_slab; 1193 1194 slab_lock(page); 1195 if (unlikely(node != -1 && page_to_nid(page) != node)) 1196 goto another_slab; 1197redo: 1198 object = page->freelist; 1199 if (unlikely(!object)) 1200 goto another_slab; 1201 if (unlikely(PageError(page))) 1202 goto debug; 1203 1204have_object: 1205 page->inuse++; 1206 page->freelist = object[page->offset]; 1207 slab_unlock(page); 1208 local_irq_restore(flags); 1209 return object; 1210 1211another_slab: 1212 deactivate_slab(s, page, cpu); 1213 1214new_slab: 1215 page = get_partial(s, gfpflags, node); 1216 if (likely(page)) { 1217have_slab: 1218 s->cpu_slab[cpu] = page; 1219 SetPageActive(page); 1220 goto redo; 1221 } 1222 1223 page = new_slab(s, gfpflags, node); 1224 if (page) { 1225 cpu = smp_processor_id(); 1226 if (s->cpu_slab[cpu]) { 1227 /* 1228 * Someone else populated the cpu_slab while we enabled 1229 * interrupts, or we have got scheduled on another cpu. 1230 * The page may not be on the requested node. 
1231 */ 1232 if (node == -1 || 1233 page_to_nid(s->cpu_slab[cpu]) == node) { 1234 /* 1235 * Current cpuslab is acceptable and we 1236 * want the current one since its cache hot 1237 */ 1238 discard_slab(s, page); 1239 page = s->cpu_slab[cpu]; 1240 slab_lock(page); 1241 goto redo; 1242 } 1243 /* Dump the current slab */ 1244 flush_slab(s, s->cpu_slab[cpu], cpu); 1245 } 1246 slab_lock(page); 1247 goto have_slab; 1248 } 1249 local_irq_restore(flags); 1250 return NULL; 1251debug: 1252 if (!alloc_object_checks(s, page, object)) 1253 goto another_slab; 1254 if (s->flags & SLAB_STORE_USER) 1255 set_track(s, object, TRACK_ALLOC, addr); 1256 if (s->flags & SLAB_TRACE) { 1257 printk(KERN_INFO "TRACE %s alloc 0x%p inuse=%d fp=0x%p\n", 1258 s->name, object, page->inuse, 1259 page->freelist); 1260 dump_stack(); 1261 } 1262 init_object(s, object, 1); 1263 goto have_object; 1264} 1265 1266void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) 1267{ 1268 return slab_alloc(s, gfpflags, -1, __builtin_return_address(0)); 1269} 1270EXPORT_SYMBOL(kmem_cache_alloc); 1271 1272#ifdef CONFIG_NUMA 1273void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) 1274{ 1275 return slab_alloc(s, gfpflags, node, __builtin_return_address(0)); 1276} 1277EXPORT_SYMBOL(kmem_cache_alloc_node); 1278#endif 1279 1280/* 1281 * The fastpath only writes the cacheline of the page struct and the first 1282 * cacheline of the object. 1283 * 1284 * No special cachelines need to be read 1285 */ 1286static void slab_free(struct kmem_cache *s, struct page *page, 1287 void *x, void *addr) 1288{ 1289 void *prior; 1290 void **object = (void *)x; 1291 unsigned long flags; 1292 1293 local_irq_save(flags); 1294 slab_lock(page); 1295 1296 if (unlikely(PageError(page))) 1297 goto debug; 1298checks_ok: 1299 prior = object[page->offset] = page->freelist; 1300 page->freelist = object; 1301 page->inuse--; 1302 1303 if (unlikely(PageActive(page))) 1304 /* 1305 * Cpu slabs are never on partial lists and are 1306 * never freed. 1307 */ 1308 goto out_unlock; 1309 1310 if (unlikely(!page->inuse)) 1311 goto slab_empty; 1312 1313 /* 1314 * Objects left in the slab. If it 1315 * was not on the partial list before 1316 * then add it. 1317 */ 1318 if (unlikely(!prior)) 1319 add_partial(get_node(s, page_to_nid(page)), page); 1320 1321out_unlock: 1322 slab_unlock(page); 1323 local_irq_restore(flags); 1324 return; 1325 1326slab_empty: 1327 if (prior) 1328 /* 1329 * Slab on the partial list. 
1330 */ 1331 remove_partial(s, page); 1332 1333 slab_unlock(page); 1334 discard_slab(s, page); 1335 local_irq_restore(flags); 1336 return; 1337 1338debug: 1339 if (!free_object_checks(s, page, x)) 1340 goto out_unlock; 1341 if (!PageActive(page) && !page->freelist) 1342 remove_full(s, page); 1343 if (s->flags & SLAB_STORE_USER) 1344 set_track(s, x, TRACK_FREE, addr); 1345 if (s->flags & SLAB_TRACE) { 1346 printk(KERN_INFO "TRACE %s free 0x%p inuse=%d fp=0x%p\n", 1347 s->name, object, page->inuse, 1348 page->freelist); 1349 print_section("Object", (void *)object, s->objsize); 1350 dump_stack(); 1351 } 1352 init_object(s, object, 0); 1353 goto checks_ok; 1354} 1355 1356void kmem_cache_free(struct kmem_cache *s, void *x) 1357{ 1358 struct page *page; 1359 1360 page = virt_to_head_page(x); 1361 1362 slab_free(s, page, x, __builtin_return_address(0)); 1363} 1364EXPORT_SYMBOL(kmem_cache_free); 1365 1366/* Figure out on which slab object the object resides */ 1367static struct page *get_object_page(const void *x) 1368{ 1369 struct page *page = virt_to_head_page(x); 1370 1371 if (!PageSlab(page)) 1372 return NULL; 1373 1374 return page; 1375} 1376 1377/* 1378 * kmem_cache_open produces objects aligned at "size" and the first object 1379 * is placed at offset 0 in the slab (We have no metainformation on the 1380 * slab, all slabs are in essence "off slab"). 1381 * 1382 * In order to get the desired alignment one just needs to align the 1383 * size. 1384 * 1385 * Notice that the allocation order determines the sizes of the per cpu 1386 * caches. Each processor has always one slab available for allocations. 1387 * Increasing the allocation order reduces the number of times that slabs 1388 * must be moved on and off the partial lists and therefore may influence 1389 * locking overhead. 1390 * 1391 * The offset is used to relocate the free list link in each object. It is 1392 * therefore possible to move the free list link behind the object. This 1393 * is necessary for RCU to work properly and also useful for debugging. 1394 */ 1395 1396/* 1397 * Mininum / Maximum order of slab pages. This influences locking overhead 1398 * and slab fragmentation. A higher order reduces the number of partial slabs 1399 * and increases the number of allocations possible without having to 1400 * take the list_lock. 1401 */ 1402static int slub_min_order; 1403static int slub_max_order = DEFAULT_MAX_ORDER; 1404 1405/* 1406 * Minimum number of objects per slab. This is necessary in order to 1407 * reduce locking overhead. Similar to the queue size in SLAB. 1408 */ 1409static int slub_min_objects = DEFAULT_MIN_OBJECTS; 1410 1411/* 1412 * Merge control. If this is set then no merging of slab caches will occur. 1413 */ 1414static int slub_nomerge; 1415 1416/* 1417 * Debug settings: 1418 */ 1419static int slub_debug; 1420 1421static char *slub_debug_slabs; 1422 1423/* 1424 * Calculate the order of allocation given an slab object size. 1425 * 1426 * The order of allocation has significant impact on other elements 1427 * of the system. Generally order 0 allocations should be preferred 1428 * since they do not cause fragmentation in the page allocator. Larger 1429 * objects may have problems with order 0 because there may be too much 1430 * space left unused in a slab. We go to a higher order if more than 1/8th 1431 * of the slab would be wasted. 1432 * 1433 * In order to reach satisfactory performance we must ensure that 1434 * a minimum number of objects is in one slab. 
Otherwise we may
 * generate too much activity on the partial lists. This is less a
 * concern for large slabs though. slub_max_order specifies the order
 * where we begin to stop considering the number of objects in a slab.
 *
 * Higher order allocations also allow the placement of more objects
 * in a slab and thereby reduce object handling overhead. If the user
 * has requested a higher minimum order then we start with that one
 * instead of zero.
 */
static int calculate_order(int size)
{
	int order;
	int rem;

	for (order = max(slub_min_order, fls(size - 1) - PAGE_SHIFT);
			order < MAX_ORDER; order++) {
		unsigned long slab_size = PAGE_SIZE << order;

		if (slub_max_order > order &&
				slab_size < slub_min_objects * size)
			continue;

		if (slab_size < size)
			continue;

		rem = slab_size % size;

		if (rem <= (PAGE_SIZE << order) / 8)
			break;

	}
	if (order >= MAX_ORDER)
		return -E2BIG;
	return order;
}

/*
 * Function to figure out which alignment to use from the
 * various ways of specifying it.
 */
static unsigned long calculate_alignment(unsigned long flags,
		unsigned long align, unsigned long size)
{
	/*
	 * If the user wants hardware cache aligned objects then
	 * follow that suggestion if the object is sufficiently
	 * large.
	 *
	 * The hardware cache alignment cannot override the
	 * specified alignment though. If that is greater, then
	 * use it.
	 */
	if ((flags & SLAB_HWCACHE_ALIGN) &&
			size > cache_line_size() / 2)
		return max_t(unsigned long, align, cache_line_size());

	if (align < ARCH_SLAB_MINALIGN)
		return ARCH_SLAB_MINALIGN;

	return ALIGN(align, sizeof(void *));
}

static void init_kmem_cache_node(struct kmem_cache_node *n)
{
	n->nr_partial = 0;
	atomic_long_set(&n->nr_slabs, 0);
	spin_lock_init(&n->list_lock);
	INIT_LIST_HEAD(&n->partial);
	INIT_LIST_HEAD(&n->full);
}

#ifdef CONFIG_NUMA
/*
 * No kmalloc_node yet so do it by hand. We know that this is the first
 * slab on the node for this slabcache. There are no concurrent accesses
 * possible.
 *
 * Note that this function only works on the kmalloc_node_cache
 * when allocating for the kmalloc_node_cache.
1514 */ 1515static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflags, 1516 int node) 1517{ 1518 struct page *page; 1519 struct kmem_cache_node *n; 1520 1521 BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); 1522 1523 page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node); 1524 /* new_slab() disables interupts */ 1525 local_irq_enable(); 1526 1527 BUG_ON(!page); 1528 n = page->freelist; 1529 BUG_ON(!n); 1530 page->freelist = get_freepointer(kmalloc_caches, n); 1531 page->inuse++; 1532 kmalloc_caches->node[node] = n; 1533 init_object(kmalloc_caches, n, 1); 1534 init_kmem_cache_node(n); 1535 atomic_long_inc(&n->nr_slabs); 1536 add_partial(n, page); 1537 return n; 1538} 1539 1540static void free_kmem_cache_nodes(struct kmem_cache *s) 1541{ 1542 int node; 1543 1544 for_each_online_node(node) { 1545 struct kmem_cache_node *n = s->node[node]; 1546 if (n && n != &s->local_node) 1547 kmem_cache_free(kmalloc_caches, n); 1548 s->node[node] = NULL; 1549 } 1550} 1551 1552static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) 1553{ 1554 int node; 1555 int local_node; 1556 1557 if (slab_state >= UP) 1558 local_node = page_to_nid(virt_to_page(s)); 1559 else 1560 local_node = 0; 1561 1562 for_each_online_node(node) { 1563 struct kmem_cache_node *n; 1564 1565 if (local_node == node) 1566 n = &s->local_node; 1567 else { 1568 if (slab_state == DOWN) { 1569 n = early_kmem_cache_node_alloc(gfpflags, 1570 node); 1571 continue; 1572 } 1573 n = kmem_cache_alloc_node(kmalloc_caches, 1574 gfpflags, node); 1575 1576 if (!n) { 1577 free_kmem_cache_nodes(s); 1578 return 0; 1579 } 1580 1581 } 1582 s->node[node] = n; 1583 init_kmem_cache_node(n); 1584 } 1585 return 1; 1586} 1587#else 1588static void free_kmem_cache_nodes(struct kmem_cache *s) 1589{ 1590} 1591 1592static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) 1593{ 1594 init_kmem_cache_node(&s->local_node); 1595 return 1; 1596} 1597#endif 1598 1599/* 1600 * calculate_sizes() determines the order and the distribution of data within 1601 * a slab object. 1602 */ 1603static int calculate_sizes(struct kmem_cache *s) 1604{ 1605 unsigned long flags = s->flags; 1606 unsigned long size = s->objsize; 1607 unsigned long align = s->align; 1608 1609 /* 1610 * Determine if we can poison the object itself. If the user of 1611 * the slab may touch the object after free or before allocation 1612 * then we should never poison the object itself. 1613 */ 1614 if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && 1615 !s->ctor && !s->dtor) 1616 s->flags |= __OBJECT_POISON; 1617 else 1618 s->flags &= ~__OBJECT_POISON; 1619 1620 /* 1621 * Round up object size to the next word boundary. We can only 1622 * place the free pointer at word boundaries and this determines 1623 * the possible location of the free pointer. 1624 */ 1625 size = ALIGN(size, sizeof(void *)); 1626 1627 /* 1628 * If we are redzoning then check if there is some space between the 1629 * end of the object and the free pointer. If not then add an 1630 * additional word, so that we can establish a redzone between 1631 * the object and the freepointer to be able to check for overwrites. 1632 */ 1633 if ((flags & SLAB_RED_ZONE) && size == s->objsize) 1634 size += sizeof(void *); 1635 1636 /* 1637 * With that we have determined how much of the slab is in actual 1638 * use by the object. This is the potential offset to the free 1639 * pointer. 
1640 */ 1641 s->inuse = size; 1642 1643 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || 1644 s->ctor || s->dtor)) { 1645 /* 1646 * Relocate free pointer after the object if it is not 1647 * permitted to overwrite the first word of the object on 1648 * kmem_cache_free. 1649 * 1650 * This is the case if we do RCU, have a constructor or 1651 * destructor or are poisoning the objects. 1652 */ 1653 s->offset = size; 1654 size += sizeof(void *); 1655 } 1656 1657 if (flags & SLAB_STORE_USER) 1658 /* 1659 * Need to store information about allocs and frees after 1660 * the object. 1661 */ 1662 size += 2 * sizeof(struct track); 1663 1664 if (flags & DEBUG_DEFAULT_FLAGS) 1665 /* 1666 * Add some empty padding so that we can catch 1667 * overwrites from earlier objects rather than let 1668 * tracking information or the free pointer be 1669 * corrupted if an user writes before the start 1670 * of the object. 1671 */ 1672 size += sizeof(void *); 1673 /* 1674 * Determine the alignment based on various parameters that the 1675 * user specified and the dynamic determination of cache line size 1676 * on bootup. 1677 */ 1678 align = calculate_alignment(flags, align, s->objsize); 1679 1680 /* 1681 * SLUB stores one object immediately after another beginning from 1682 * offset 0. In order to align the objects we have to simply size 1683 * each object to conform to the alignment. 1684 */ 1685 size = ALIGN(size, align); 1686 s->size = size; 1687 1688 s->order = calculate_order(size); 1689 if (s->order < 0) 1690 return 0; 1691 1692 /* 1693 * Determine the number of objects per slab 1694 */ 1695 s->objects = (PAGE_SIZE << s->order) / size; 1696 1697 /* 1698 * Verify that the number of objects is within permitted limits. 1699 * The page->inuse field is only 16 bit wide! So we cannot have 1700 * more than 64k objects per slab. 1701 */ 1702 if (!s->objects || s->objects > 65535) 1703 return 0; 1704 return 1; 1705 1706} 1707 1708static int __init finish_bootstrap(void) 1709{ 1710 struct list_head *h; 1711 int err; 1712 1713 slab_state = SYSFS; 1714 1715 list_for_each(h, &slab_caches) { 1716 struct kmem_cache *s = 1717 container_of(h, struct kmem_cache, list); 1718 1719 err = sysfs_slab_add(s); 1720 BUG_ON(err); 1721 } 1722 return 0; 1723} 1724 1725static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, 1726 const char *name, size_t size, 1727 size_t align, unsigned long flags, 1728 void (*ctor)(void *, struct kmem_cache *, unsigned long), 1729 void (*dtor)(void *, struct kmem_cache *, unsigned long)) 1730{ 1731 memset(s, 0, kmem_size); 1732 s->name = name; 1733 s->ctor = ctor; 1734 s->dtor = dtor; 1735 s->objsize = size; 1736 s->flags = flags; 1737 s->align = align; 1738 1739 /* 1740 * The page->offset field is only 16 bit wide. This is an offset 1741 * in units of words from the beginning of an object. If the slab 1742 * size is bigger then we cannot move the free pointer behind the 1743 * object anymore. 1744 * 1745 * On 32 bit platforms the limit is 256k. On 64bit platforms 1746 * the limit is 512k. 1747 * 1748 * Debugging or ctor/dtors may create a need to move the free 1749 * pointer. Fail if this happens. 1750 */ 1751 if (s->size >= 65535 * sizeof(void *)) { 1752 BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON | 1753 SLAB_STORE_USER | SLAB_DESTROY_BY_RCU)); 1754 BUG_ON(ctor || dtor); 1755 } 1756 else 1757 /* 1758 * Enable debugging if selected on the kernel commandline. 
1759 */ 1760 if (slub_debug && (!slub_debug_slabs || 1761 strncmp(slub_debug_slabs, name, 1762 strlen(slub_debug_slabs)) == 0)) 1763 s->flags |= slub_debug; 1764 1765 if (!calculate_sizes(s)) 1766 goto error; 1767 1768 s->refcount = 1; 1769#ifdef CONFIG_NUMA 1770 s->defrag_ratio = 100; 1771#endif 1772 1773 if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) 1774 return 1; 1775error: 1776 if (flags & SLAB_PANIC) 1777 panic("Cannot create slab %s size=%lu realsize=%u " 1778 "order=%u offset=%u flags=%lx\n", 1779 s->name, (unsigned long)size, s->size, s->order, 1780 s->offset, flags); 1781 return 0; 1782} 1783EXPORT_SYMBOL(kmem_cache_open); 1784 1785/* 1786 * Check if a given pointer is valid 1787 */ 1788int kmem_ptr_validate(struct kmem_cache *s, const void *object) 1789{ 1790 struct page * page; 1791 void *addr; 1792 1793 page = get_object_page(object); 1794 1795 if (!page || s != page->slab) 1796 /* No slab or wrong slab */ 1797 return 0; 1798 1799 addr = page_address(page); 1800 if (object < addr || object >= addr + s->objects * s->size) 1801 /* Out of bounds */ 1802 return 0; 1803 1804 if ((object - addr) % s->size) 1805 /* Improperly aligned */ 1806 return 0; 1807 1808 /* 1809 * We could also check if the object is on the slabs freelist. 1810 * But this would be too expensive and it seems that the main 1811 * purpose of kmem_ptr_valid is to check if the object belongs 1812 * to a certain slab. 1813 */ 1814 return 1; 1815} 1816EXPORT_SYMBOL(kmem_ptr_validate); 1817 1818/* 1819 * Determine the size of a slab object 1820 */ 1821unsigned int kmem_cache_size(struct kmem_cache *s) 1822{ 1823 return s->objsize; 1824} 1825EXPORT_SYMBOL(kmem_cache_size); 1826 1827const char *kmem_cache_name(struct kmem_cache *s) 1828{ 1829 return s->name; 1830} 1831EXPORT_SYMBOL(kmem_cache_name); 1832 1833/* 1834 * Attempt to free all slabs on a node 1835 */ 1836static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, 1837 struct list_head *list) 1838{ 1839 int slabs_inuse = 0; 1840 unsigned long flags; 1841 struct page *page, *h; 1842 1843 spin_lock_irqsave(&n->list_lock, flags); 1844 list_for_each_entry_safe(page, h, list, lru) 1845 if (!page->inuse) { 1846 list_del(&page->lru); 1847 discard_slab(s, page); 1848 } else 1849 slabs_inuse++; 1850 spin_unlock_irqrestore(&n->list_lock, flags); 1851 return slabs_inuse; 1852} 1853 1854/* 1855 * Release all resources used by slab cache 1856 */ 1857static int kmem_cache_close(struct kmem_cache *s) 1858{ 1859 int node; 1860 1861 flush_all(s); 1862 1863 /* Attempt to free all objects */ 1864 for_each_online_node(node) { 1865 struct kmem_cache_node *n = get_node(s, node); 1866 1867 n->nr_partial -= free_list(s, n, &n->partial); 1868 if (atomic_long_read(&n->nr_slabs)) 1869 return 1; 1870 } 1871 free_kmem_cache_nodes(s); 1872 return 0; 1873} 1874 1875/* 1876 * Close a cache and release the kmem_cache structure 1877 * (must be used for caches created using kmem_cache_create) 1878 */ 1879void kmem_cache_destroy(struct kmem_cache *s) 1880{ 1881 down_write(&slub_lock); 1882 s->refcount--; 1883 if (!s->refcount) { 1884 list_del(&s->list); 1885 if (kmem_cache_close(s)) 1886 WARN_ON(1); 1887 sysfs_slab_remove(s); 1888 kfree(s); 1889 } 1890 up_write(&slub_lock); 1891} 1892EXPORT_SYMBOL(kmem_cache_destroy); 1893 1894/******************************************************************** 1895 * Kmalloc subsystem 1896 *******************************************************************/ 1897 1898struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned; 
1899EXPORT_SYMBOL(kmalloc_caches); 1900 1901#ifdef CONFIG_ZONE_DMA 1902static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1]; 1903#endif 1904 1905static int __init setup_slub_min_order(char *str) 1906{ 1907 get_option (&str, &slub_min_order); 1908 1909 return 1; 1910} 1911 1912__setup("slub_min_order=", setup_slub_min_order); 1913 1914static int __init setup_slub_max_order(char *str) 1915{ 1916 get_option (&str, &slub_max_order); 1917 1918 return 1; 1919} 1920 1921__setup("slub_max_order=", setup_slub_max_order); 1922 1923static int __init setup_slub_min_objects(char *str) 1924{ 1925 get_option (&str, &slub_min_objects); 1926 1927 return 1; 1928} 1929 1930__setup("slub_min_objects=", setup_slub_min_objects); 1931 1932static int __init setup_slub_nomerge(char *str) 1933{ 1934 slub_nomerge = 1; 1935 return 1; 1936} 1937 1938__setup("slub_nomerge", setup_slub_nomerge); 1939 1940static int __init setup_slub_debug(char *str) 1941{ 1942 if (!str || *str != '=') 1943 slub_debug = DEBUG_DEFAULT_FLAGS; 1944 else { 1945 str++; 1946 if (*str == 0 || *str == ',') 1947 slub_debug = DEBUG_DEFAULT_FLAGS; 1948 else 1949 for( ;*str && *str != ','; str++) 1950 switch (*str) { 1951 case 'f' : case 'F' : 1952 slub_debug |= SLAB_DEBUG_FREE; 1953 break; 1954 case 'z' : case 'Z' : 1955 slub_debug |= SLAB_RED_ZONE; 1956 break; 1957 case 'p' : case 'P' : 1958 slub_debug |= SLAB_POISON; 1959 break; 1960 case 'u' : case 'U' : 1961 slub_debug |= SLAB_STORE_USER; 1962 break; 1963 case 't' : case 'T' : 1964 slub_debug |= SLAB_TRACE; 1965 break; 1966 default: 1967 printk(KERN_ERR "slub_debug option '%c' " 1968 "unknown. skipped\n",*str); 1969 } 1970 } 1971 1972 if (*str == ',') 1973 slub_debug_slabs = str + 1; 1974 return 1; 1975} 1976 1977__setup("slub_debug", setup_slub_debug); 1978 1979static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, 1980 const char *name, int size, gfp_t gfp_flags) 1981{ 1982 unsigned int flags = 0; 1983 1984 if (gfp_flags & SLUB_DMA) 1985 flags = SLAB_CACHE_DMA; 1986 1987 down_write(&slub_lock); 1988 if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, 1989 flags, NULL, NULL)) 1990 goto panic; 1991 1992 list_add(&s->list, &slab_caches); 1993 up_write(&slub_lock); 1994 if (sysfs_slab_add(s)) 1995 goto panic; 1996 return s; 1997 1998panic: 1999 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); 2000} 2001 2002static struct kmem_cache *get_slab(size_t size, gfp_t flags) 2003{ 2004 int index = kmalloc_index(size); 2005 2006 if (!index) 2007 return NULL; 2008 2009 /* Allocation too large? 
*/ 2010 BUG_ON(index < 0); 2011 2012#ifdef CONFIG_ZONE_DMA 2013 if ((flags & SLUB_DMA)) { 2014 struct kmem_cache *s; 2015 struct kmem_cache *x; 2016 char *text; 2017 size_t realsize; 2018 2019 s = kmalloc_caches_dma[index]; 2020 if (s) 2021 return s; 2022 2023 /* Dynamically create dma cache */ 2024 x = kmalloc(kmem_size, flags & ~SLUB_DMA); 2025 if (!x) 2026 panic("Unable to allocate memory for dma cache\n"); 2027 2028 if (index <= KMALLOC_SHIFT_HIGH) 2029 realsize = 1 << index; 2030 else { 2031 if (index == 1) 2032 realsize = 96; 2033 else 2034 realsize = 192; 2035 } 2036 2037 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", 2038 (unsigned int)realsize); 2039 s = create_kmalloc_cache(x, text, realsize, flags); 2040 kmalloc_caches_dma[index] = s; 2041 return s; 2042 } 2043#endif 2044 return &kmalloc_caches[index]; 2045} 2046 2047void *__kmalloc(size_t size, gfp_t flags) 2048{ 2049 struct kmem_cache *s = get_slab(size, flags); 2050 2051 if (s) 2052 return slab_alloc(s, flags, -1, __builtin_return_address(0)); 2053 return NULL; 2054} 2055EXPORT_SYMBOL(__kmalloc); 2056 2057#ifdef CONFIG_NUMA 2058void *__kmalloc_node(size_t size, gfp_t flags, int node) 2059{ 2060 struct kmem_cache *s = get_slab(size, flags); 2061 2062 if (s) 2063 return slab_alloc(s, flags, node, __builtin_return_address(0)); 2064 return NULL; 2065} 2066EXPORT_SYMBOL(__kmalloc_node); 2067#endif 2068 2069size_t ksize(const void *object) 2070{ 2071 struct page *page = get_object_page(object); 2072 struct kmem_cache *s; 2073 2074 BUG_ON(!page); 2075 s = page->slab; 2076 BUG_ON(!s); 2077 2078 /* 2079 * Debugging requires use of the padding between object 2080 * and whatever may come after it. 2081 */ 2082 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) 2083 return s->objsize; 2084 2085 /* 2086 * If we have the need to store the freelist pointer 2087 * back there or track user information then we can 2088 * only use the space before that information. 2089 */ 2090 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) 2091 return s->inuse; 2092 2093 /* 2094 * Else we can use all the padding etc for the allocation 2095 */ 2096 return s->size; 2097} 2098EXPORT_SYMBOL(ksize); 2099 2100void kfree(const void *x) 2101{ 2102 struct kmem_cache *s; 2103 struct page *page; 2104 2105 if (!x) 2106 return; 2107 2108 page = virt_to_head_page(x); 2109 s = page->slab; 2110 2111 slab_free(s, page, (void *)x, __builtin_return_address(0)); 2112} 2113EXPORT_SYMBOL(kfree); 2114 2115/* 2116 * kmem_cache_shrink removes empty slabs from the partial lists 2117 * and then sorts the partially allocated slabs by the number 2118 * of items in use. The slabs with the most items in use 2119 * come first. New allocations will remove these from the 2120 * partial list because they are full. The slabs with the 2121 * least items are placed last. If it happens that the objects 2122 * are freed then the page can be returned to the page allocator. 
2123 */ 2124int kmem_cache_shrink(struct kmem_cache *s) 2125{ 2126 int node; 2127 int i; 2128 struct kmem_cache_node *n; 2129 struct page *page; 2130 struct page *t; 2131 struct list_head *slabs_by_inuse = 2132 kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL); 2133 unsigned long flags; 2134 2135 if (!slabs_by_inuse) 2136 return -ENOMEM; 2137 2138 flush_all(s); 2139 for_each_online_node(node) { 2140 n = get_node(s, node); 2141 2142 if (!n->nr_partial) 2143 continue; 2144 2145 for (i = 0; i < s->objects; i++) 2146 INIT_LIST_HEAD(slabs_by_inuse + i); 2147 2148 spin_lock_irqsave(&n->list_lock, flags); 2149 2150 /* 2151 * Build lists indexed by the items in use in 2152 * each slab or free slabs if empty. 2153 * 2154 * Note that concurrent frees may occur while 2155 * we hold the list_lock. page->inuse here is 2156 * the upper limit. 2157 */ 2158 list_for_each_entry_safe(page, t, &n->partial, lru) { 2159 if (!page->inuse && slab_trylock(page)) { 2160 /* 2161 * Must hold slab lock here because slab_free 2162 * may have freed the last object and be 2163 * waiting to release the slab. 2164 */ 2165 list_del(&page->lru); 2166 n->nr_partial--; 2167 slab_unlock(page); 2168 discard_slab(s, page); 2169 } else { 2170 if (n->nr_partial > MAX_PARTIAL) 2171 list_move(&page->lru, 2172 slabs_by_inuse + page->inuse); 2173 } 2174 } 2175 2176 if (n->nr_partial <= MAX_PARTIAL) 2177 goto out; 2178 2179 /* 2180 * Rebuild the partial list with the slabs filled up 2181 * most first and the least used slabs at the end. 2182 */ 2183 for (i = s->objects - 1; i >= 0; i--) 2184 list_splice(slabs_by_inuse + i, n->partial.prev); 2185 2186 out: 2187 spin_unlock_irqrestore(&n->list_lock, flags); 2188 } 2189 2190 kfree(slabs_by_inuse); 2191 return 0; 2192} 2193EXPORT_SYMBOL(kmem_cache_shrink); 2194 2195/** 2196 * krealloc - reallocate memory. The contents will remain unchanged. 2197 * 2198 * @p: object to reallocate memory for. 2199 * @new_size: how many bytes of memory are required. 2200 * @flags: the type of memory to allocate. 2201 * 2202 * The contents of the object pointed to are preserved up to the 2203 * lesser of the new and old sizes. If @p is %NULL, krealloc() 2204 * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a 2205 * %NULL pointer, the object pointed to is freed. 2206 */ 2207void *krealloc(const void *p, size_t new_size, gfp_t flags) 2208{ 2209 struct kmem_cache *new_cache; 2210 void *ret; 2211 struct page *page; 2212 2213 if (unlikely(!p)) 2214 return kmalloc(new_size, flags); 2215 2216 if (unlikely(!new_size)) { 2217 kfree(p); 2218 return NULL; 2219 } 2220 2221 page = virt_to_head_page(p); 2222 2223 new_cache = get_slab(new_size, flags); 2224 2225 /* 2226 * If new size fits in the current cache, bail out. 2227 */ 2228 if (likely(page->slab == new_cache)) 2229 return (void *)p; 2230 2231 ret = kmalloc(new_size, flags); 2232 if (ret) { 2233 memcpy(ret, p, min(new_size, ksize(p))); 2234 kfree(p); 2235 } 2236 return ret; 2237} 2238EXPORT_SYMBOL(krealloc); 2239 2240/******************************************************************** 2241 * Basic setup of slabs 2242 *******************************************************************/ 2243 2244void __init kmem_cache_init(void) 2245{ 2246 int i; 2247 2248#ifdef CONFIG_NUMA 2249 /* 2250 * Must first have the slab cache available for the allocations of the 2251 * struct kmem_cache_node's. There is special bootstrap code in 2252 * kmem_cache_open for slab_state == DOWN.
2253 */ 2254 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", 2255 sizeof(struct kmem_cache_node), GFP_KERNEL); 2256#endif 2257 2258 /* Able to allocate the per node structures */ 2259 slab_state = PARTIAL; 2260 2261 /* Caches that are not of the two-to-the-power-of size */ 2262 create_kmalloc_cache(&kmalloc_caches[1], 2263 "kmalloc-96", 96, GFP_KERNEL); 2264 create_kmalloc_cache(&kmalloc_caches[2], 2265 "kmalloc-192", 192, GFP_KERNEL); 2266 2267 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) 2268 create_kmalloc_cache(&kmalloc_caches[i], 2269 "kmalloc", 1 << i, GFP_KERNEL); 2270 2271 slab_state = UP; 2272 2273 /* Provide the correct kmalloc names now that the caches are up */ 2274 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) 2275 kmalloc_caches[i]. name = 2276 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); 2277 2278#ifdef CONFIG_SMP 2279 register_cpu_notifier(&slab_notifier); 2280#endif 2281 2282 if (nr_cpu_ids) /* Remove when nr_cpu_ids is fixed upstream ! */ 2283 kmem_size = offsetof(struct kmem_cache, cpu_slab) 2284 + nr_cpu_ids * sizeof(struct page *); 2285 2286 printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," 2287 " Processors=%d, Nodes=%d\n", 2288 KMALLOC_SHIFT_HIGH, cache_line_size(), 2289 slub_min_order, slub_max_order, slub_min_objects, 2290 nr_cpu_ids, nr_node_ids); 2291} 2292 2293/* 2294 * Find a mergeable slab cache 2295 */ 2296static int slab_unmergeable(struct kmem_cache *s) 2297{ 2298 if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) 2299 return 1; 2300 2301 if (s->ctor || s->dtor) 2302 return 1; 2303 2304 return 0; 2305} 2306 2307static struct kmem_cache *find_mergeable(size_t size, 2308 size_t align, unsigned long flags, 2309 void (*ctor)(void *, struct kmem_cache *, unsigned long), 2310 void (*dtor)(void *, struct kmem_cache *, unsigned long)) 2311{ 2312 struct list_head *h; 2313 2314 if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) 2315 return NULL; 2316 2317 if (ctor || dtor) 2318 return NULL; 2319 2320 size = ALIGN(size, sizeof(void *)); 2321 align = calculate_alignment(flags, align, size); 2322 size = ALIGN(size, align); 2323 2324 list_for_each(h, &slab_caches) { 2325 struct kmem_cache *s = 2326 container_of(h, struct kmem_cache, list); 2327 2328 if (slab_unmergeable(s)) 2329 continue; 2330 2331 if (size > s->size) 2332 continue; 2333 2334 if (((flags | slub_debug) & SLUB_MERGE_SAME) != 2335 (s->flags & SLUB_MERGE_SAME)) 2336 continue; 2337 /* 2338 * Check if alignment is compatible. 2339 * Courtesy of Adrian Drzewiecki 2340 */ 2341 if ((s->size & ~(align -1)) != s->size) 2342 continue; 2343 2344 if (s->size - size >= sizeof(void *)) 2345 continue; 2346 2347 return s; 2348 } 2349 return NULL; 2350} 2351 2352struct kmem_cache *kmem_cache_create(const char *name, size_t size, 2353 size_t align, unsigned long flags, 2354 void (*ctor)(void *, struct kmem_cache *, unsigned long), 2355 void (*dtor)(void *, struct kmem_cache *, unsigned long)) 2356{ 2357 struct kmem_cache *s; 2358 2359 down_write(&slub_lock); 2360 s = find_mergeable(size, align, flags, dtor, ctor); 2361 if (s) { 2362 s->refcount++; 2363 /* 2364 * Adjust the object sizes so that we clear 2365 * the complete object on kzalloc. 
2366 */ 2367 s->objsize = max(s->objsize, (int)size); 2368 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 2369 if (sysfs_slab_alias(s, name)) 2370 goto err; 2371 } else { 2372 s = kmalloc(kmem_size, GFP_KERNEL); 2373 if (s && kmem_cache_open(s, GFP_KERNEL, name, 2374 size, align, flags, ctor, dtor)) { 2375 if (sysfs_slab_add(s)) { 2376 kfree(s); 2377 goto err; 2378 } 2379 list_add(&s->list, &slab_caches); 2380 } else 2381 kfree(s); 2382 } 2383 up_write(&slub_lock); 2384 return s; 2385 2386err: 2387 up_write(&slub_lock); 2388 if (flags & SLAB_PANIC) 2389 panic("Cannot create slabcache %s\n", name); 2390 else 2391 s = NULL; 2392 return s; 2393} 2394EXPORT_SYMBOL(kmem_cache_create); 2395 2396void *kmem_cache_zalloc(struct kmem_cache *s, gfp_t flags) 2397{ 2398 void *x; 2399 2400 x = slab_alloc(s, flags, -1, __builtin_return_address(0)); 2401 if (x) 2402 memset(x, 0, s->objsize); 2403 return x; 2404} 2405EXPORT_SYMBOL(kmem_cache_zalloc); 2406 2407#ifdef CONFIG_SMP 2408static void for_all_slabs(void (*func)(struct kmem_cache *, int), int cpu) 2409{ 2410 struct list_head *h; 2411 2412 down_read(&slub_lock); 2413 list_for_each(h, &slab_caches) { 2414 struct kmem_cache *s = 2415 container_of(h, struct kmem_cache, list); 2416 2417 func(s, cpu); 2418 } 2419 up_read(&slub_lock); 2420} 2421 2422/* 2423 * Use the cpu notifier to ensure that the cpu slabs are flushed 2424 * when necessary. 2425 */ 2426static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, 2427 unsigned long action, void *hcpu) 2428{ 2429 long cpu = (long)hcpu; 2430 2431 switch (action) { 2432 case CPU_UP_CANCELED: 2433 case CPU_DEAD: 2434 for_all_slabs(__flush_cpu_slab, cpu); 2435 break; 2436 default: 2437 break; 2438 } 2439 return NOTIFY_OK; 2440} 2441 2442static struct notifier_block __cpuinitdata slab_notifier = 2443 { &slab_cpuup_callback, NULL, 0 }; 2444 2445#endif 2446 2447#ifdef CONFIG_NUMA 2448 2449/***************************************************************** 2450 * Generic reaper used to support the page allocator 2451 * (the cpu slabs are reaped by a per slab workqueue). 2452 * 2453 * Maybe move this to the page allocator?
2454 ****************************************************************/ 2455 2456static DEFINE_PER_CPU(unsigned long, reap_node); 2457 2458static void init_reap_node(int cpu) 2459{ 2460 int node; 2461 2462 node = next_node(cpu_to_node(cpu), node_online_map); 2463 if (node == MAX_NUMNODES) 2464 node = first_node(node_online_map); 2465 2466 __get_cpu_var(reap_node) = node; 2467} 2468 2469static void next_reap_node(void) 2470{ 2471 int node = __get_cpu_var(reap_node); 2472 2473 /* 2474 * Also drain per cpu pages on remote zones 2475 */ 2476 if (node != numa_node_id()) 2477 drain_node_pages(node); 2478 2479 node = next_node(node, node_online_map); 2480 if (unlikely(node >= MAX_NUMNODES)) 2481 node = first_node(node_online_map); 2482 __get_cpu_var(reap_node) = node; 2483} 2484#else 2485#define init_reap_node(cpu) do { } while (0) 2486#define next_reap_node(void) do { } while (0) 2487#endif 2488 2489#define REAPTIMEOUT_CPUC (2*HZ) 2490 2491#ifdef CONFIG_SMP 2492static DEFINE_PER_CPU(struct delayed_work, reap_work); 2493 2494static void cache_reap(struct work_struct *unused) 2495{ 2496 next_reap_node(); 2497 refresh_cpu_vm_stats(smp_processor_id()); 2498 schedule_delayed_work(&__get_cpu_var(reap_work), 2499 REAPTIMEOUT_CPUC); 2500} 2501 2502static void __devinit start_cpu_timer(int cpu) 2503{ 2504 struct delayed_work *reap_work = &per_cpu(reap_work, cpu); 2505 2506 /* 2507 * When this gets called from do_initcalls via cpucache_init(), 2508 * init_workqueues() has already run, so keventd will be setup 2509 * at that time. 2510 */ 2511 if (keventd_up() && reap_work->work.func == NULL) { 2512 init_reap_node(cpu); 2513 INIT_DELAYED_WORK(reap_work, cache_reap); 2514 schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); 2515 } 2516} 2517 2518static int __init cpucache_init(void) 2519{ 2520 int cpu; 2521 2522 /* 2523 * Register the timers that drain pcp pages and update vm statistics 2524 */ 2525 for_each_online_cpu(cpu) 2526 start_cpu_timer(cpu); 2527 return 0; 2528} 2529__initcall(cpucache_init); 2530#endif 2531 2532#ifdef SLUB_RESILIENCY_TEST 2533static unsigned long validate_slab_cache(struct kmem_cache *s); 2534 2535static void resiliency_test(void) 2536{ 2537 u8 *p; 2538 2539 printk(KERN_ERR "SLUB resiliency testing\n"); 2540 printk(KERN_ERR "-----------------------\n"); 2541 printk(KERN_ERR "A. Corruption after allocation\n"); 2542 2543 p = kzalloc(16, GFP_KERNEL); 2544 p[16] = 0x12; 2545 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" 2546 " 0x12->0x%p\n\n", p + 16); 2547 2548 validate_slab_cache(kmalloc_caches + 4); 2549 2550 /* Hmmm... The next two are dangerous */ 2551 p = kzalloc(32, GFP_KERNEL); 2552 p[32 + sizeof(void *)] = 0x34; 2553 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" 2554 " 0x34 -> -0x%p\n", p); 2555 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); 2556 2557 validate_slab_cache(kmalloc_caches + 5); 2558 p = kzalloc(64, GFP_KERNEL); 2559 p += 64 + (get_cycles() & 0xff) * sizeof(void *); 2560 *p = 0x56; 2561 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", 2562 p); 2563 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); 2564 validate_slab_cache(kmalloc_caches + 6); 2565 2566 printk(KERN_ERR "\nB. Corruption after free\n"); 2567 p = kzalloc(128, GFP_KERNEL); 2568 kfree(p); 2569 *p = 0x78; 2570 printk(KERN_ERR "1. 
kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); 2571 validate_slab_cache(kmalloc_caches + 7); 2572 2573 p = kzalloc(256, GFP_KERNEL); 2574 kfree(p); 2575 p[50] = 0x9a; 2576 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); 2577 validate_slab_cache(kmalloc_caches + 8); 2578 2579 p = kzalloc(512, GFP_KERNEL); 2580 kfree(p); 2581 p[512] = 0xab; 2582 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); 2583 validate_slab_cache(kmalloc_caches + 9); 2584} 2585#else 2586static void resiliency_test(void) {}; 2587#endif 2588 2589/* 2590 * These are not as efficient as kmalloc for the non debug case. 2591 * We do not have the page struct available so we have to touch one 2592 * cacheline in struct kmem_cache to check slab flags. 2593 */ 2594void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) 2595{ 2596 struct kmem_cache *s = get_slab(size, gfpflags); 2597 2598 if (!s) 2599 return NULL; 2600 2601 return slab_alloc(s, gfpflags, -1, caller); 2602} 2603 2604void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, 2605 int node, void *caller) 2606{ 2607 struct kmem_cache *s = get_slab(size, gfpflags); 2608 2609 if (!s) 2610 return NULL; 2611 2612 return slab_alloc(s, gfpflags, node, caller); 2613} 2614 2615#ifdef CONFIG_SYSFS 2616 2617static int validate_slab(struct kmem_cache *s, struct page *page) 2618{ 2619 void *p; 2620 void *addr = page_address(page); 2621 unsigned long map[BITS_TO_LONGS(s->objects)]; 2622 2623 if (!check_slab(s, page) || 2624 !on_freelist(s, page, NULL)) 2625 return 0; 2626 2627 /* Now we know that a valid freelist exists */ 2628 bitmap_zero(map, s->objects); 2629 2630 for(p = page->freelist; p; p = get_freepointer(s, p)) { 2631 set_bit((p - addr) / s->size, map); 2632 if (!check_object(s, page, p, 0)) 2633 return 0; 2634 } 2635 2636 for(p = addr; p < addr + s->objects * s->size; p += s->size) 2637 if (!test_bit((p - addr) / s->size, map)) 2638 if (!check_object(s, page, p, 1)) 2639 return 0; 2640 return 1; 2641} 2642 2643static void validate_slab_slab(struct kmem_cache *s, struct page *page) 2644{ 2645 if (slab_trylock(page)) { 2646 validate_slab(s, page); 2647 slab_unlock(page); 2648 } else 2649 printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", 2650 s->name, page); 2651 2652 if (s->flags & DEBUG_DEFAULT_FLAGS) { 2653 if (!PageError(page)) 2654 printk(KERN_ERR "SLUB %s: PageError not set " 2655 "on slab 0x%p\n", s->name, page); 2656 } else { 2657 if (PageError(page)) 2658 printk(KERN_ERR "SLUB %s: PageError set on " 2659 "slab 0x%p\n", s->name, page); 2660 } 2661} 2662 2663static int validate_slab_node(struct kmem_cache *s, struct kmem_cache_node *n) 2664{ 2665 unsigned long count = 0; 2666 struct page *page; 2667 unsigned long flags; 2668 2669 spin_lock_irqsave(&n->list_lock, flags); 2670 2671 list_for_each_entry(page, &n->partial, lru) { 2672 validate_slab_slab(s, page); 2673 count++; 2674 } 2675 if (count != n->nr_partial) 2676 printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " 2677 "counter=%ld\n", s->name, count, n->nr_partial); 2678 2679 if (!(s->flags & SLAB_STORE_USER)) 2680 goto out; 2681 2682 list_for_each_entry(page, &n->full, lru) { 2683 validate_slab_slab(s, page); 2684 count++; 2685 } 2686 if (count != atomic_long_read(&n->nr_slabs)) 2687 printk(KERN_ERR "SLUB: %s %ld slabs counted but " 2688 "counter=%ld\n", s->name, count, 2689 atomic_long_read(&n->nr_slabs)); 2690 2691out: 2692 spin_unlock_irqrestore(&n->list_lock, flags); 2693 return count; 2694} 2695 2696static unsigned long 
validate_slab_cache(struct kmem_cache *s) 2697{ 2698 int node; 2699 unsigned long count = 0; 2700 2701 flush_all(s); 2702 for_each_online_node(node) { 2703 struct kmem_cache_node *n = get_node(s, node); 2704 2705 count += validate_slab_node(s, n); 2706 } 2707 return count; 2708} 2709 2710/* 2711 * Generate lists of locations where slabcache objects are allocated 2712 * and freed. 2713 */ 2714 2715struct location { 2716 unsigned long count; 2717 void *addr; 2718}; 2719 2720struct loc_track { 2721 unsigned long max; 2722 unsigned long count; 2723 struct location *loc; 2724}; 2725 2726static void free_loc_track(struct loc_track *t) 2727{ 2728 if (t->max) 2729 free_pages((unsigned long)t->loc, 2730 get_order(sizeof(struct location) * t->max)); 2731} 2732 2733static int alloc_loc_track(struct loc_track *t, unsigned long max) 2734{ 2735 struct location *l; 2736 int order; 2737 2738 if (!max) 2739 max = PAGE_SIZE / sizeof(struct location); 2740 2741 order = get_order(sizeof(struct location) * max); 2742 2743 l = (void *)__get_free_pages(GFP_KERNEL, order); 2744 2745 if (!l) 2746 return 0; 2747 2748 if (t->count) { 2749 memcpy(l, t->loc, sizeof(struct location) * t->count); 2750 free_loc_track(t); 2751 } 2752 t->max = max; 2753 t->loc = l; 2754 return 1; 2755} 2756 2757static int add_location(struct loc_track *t, struct kmem_cache *s, 2758 void *addr) 2759{ 2760 long start, end, pos; 2761 struct location *l; 2762 void *caddr; 2763 2764 start = -1; 2765 end = t->count; 2766 2767 for ( ; ; ) { 2768 pos = start + (end - start + 1) / 2; 2769 2770 /* 2771 * There is nothing at "end". If we end up there 2772 * we need to add something to before end. 2773 */ 2774 if (pos == end) 2775 break; 2776 2777 caddr = t->loc[pos].addr; 2778 if (addr == caddr) { 2779 t->loc[pos].count++; 2780 return 1; 2781 } 2782 2783 if (addr < caddr) 2784 end = pos; 2785 else 2786 start = pos; 2787 } 2788 2789 /* 2790 * Not found. 
Insert new tracking element 2791 */ 2792 if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max)) 2793 return 0; 2794 2795 l = t->loc + pos; 2796 if (pos < t->count) 2797 memmove(l + 1, l, 2798 (t->count - pos) * sizeof(struct location)); 2799 t->count++; 2800 l->count = 1; 2801 l->addr = addr; 2802 return 1; 2803} 2804 2805static void process_slab(struct loc_track *t, struct kmem_cache *s, 2806 struct page *page, enum track_item alloc) 2807{ 2808 void *addr = page_address(page); 2809 unsigned long map[BITS_TO_LONGS(s->objects)]; 2810 void *p; 2811 2812 bitmap_zero(map, s->objects); 2813 for (p = page->freelist; p; p = get_freepointer(s, p)) 2814 set_bit((p - addr) / s->size, map); 2815 2816 for (p = addr; p < addr + s->objects * s->size; p += s->size) 2817 if (!test_bit((p - addr) / s->size, map)) { 2818 void *addr = get_track(s, p, alloc)->addr; 2819 2820 add_location(t, s, addr); 2821 } 2822} 2823 2824static int list_locations(struct kmem_cache *s, char *buf, 2825 enum track_item alloc) 2826{ 2827 int n = 0; 2828 unsigned long i; 2829 struct loc_track t; 2830 int node; 2831 2832 t.count = 0; 2833 t.max = 0; 2834 2835 /* Push back cpu slabs */ 2836 flush_all(s); 2837 2838 for_each_online_node(node) { 2839 struct kmem_cache_node *n = get_node(s, node); 2840 unsigned long flags; 2841 struct page *page; 2842 2843 if (!atomic_read(&n->nr_slabs)) 2844 continue; 2845 2846 spin_lock_irqsave(&n->list_lock, flags); 2847 list_for_each_entry(page, &n->partial, lru) 2848 process_slab(&t, s, page, alloc); 2849 list_for_each_entry(page, &n->full, lru) 2850 process_slab(&t, s, page, alloc); 2851 spin_unlock_irqrestore(&n->list_lock, flags); 2852 } 2853 2854 for (i = 0; i < t.count; i++) { 2855 void *addr = t.loc[i].addr; 2856 2857 if (n > PAGE_SIZE - 100) 2858 break; 2859 n += sprintf(buf + n, "%7ld ", t.loc[i].count); 2860 if (addr) 2861 n += sprint_symbol(buf + n, (unsigned long)t.loc[i].addr); 2862 else 2863 n += sprintf(buf + n, "<not-available>"); 2864 n += sprintf(buf + n, "\n"); 2865 } 2866 2867 free_loc_track(&t); 2868 if (!t.count) 2869 n += sprintf(buf, "No data\n"); 2870 return n; 2871} 2872 2873static unsigned long count_partial(struct kmem_cache_node *n) 2874{ 2875 unsigned long flags; 2876 unsigned long x = 0; 2877 struct page *page; 2878 2879 spin_lock_irqsave(&n->list_lock, flags); 2880 list_for_each_entry(page, &n->partial, lru) 2881 x += page->inuse; 2882 spin_unlock_irqrestore(&n->list_lock, flags); 2883 return x; 2884} 2885 2886enum slab_stat_type { 2887 SL_FULL, 2888 SL_PARTIAL, 2889 SL_CPU, 2890 SL_OBJECTS 2891}; 2892 2893#define SO_FULL (1 << SL_FULL) 2894#define SO_PARTIAL (1 << SL_PARTIAL) 2895#define SO_CPU (1 << SL_CPU) 2896#define SO_OBJECTS (1 << SL_OBJECTS) 2897 2898static unsigned long slab_objects(struct kmem_cache *s, 2899 char *buf, unsigned long flags) 2900{ 2901 unsigned long total = 0; 2902 int cpu; 2903 int node; 2904 int x; 2905 unsigned long *nodes; 2906 unsigned long *per_cpu; 2907 2908 nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); 2909 per_cpu = nodes + nr_node_ids; 2910 2911 for_each_possible_cpu(cpu) { 2912 struct page *page = s->cpu_slab[cpu]; 2913 int node; 2914 2915 if (page) { 2916 node = page_to_nid(page); 2917 if (flags & SO_CPU) { 2918 int x = 0; 2919 2920 if (flags & SO_OBJECTS) 2921 x = page->inuse; 2922 else 2923 x = 1; 2924 total += x; 2925 nodes[node] += x; 2926 } 2927 per_cpu[node]++; 2928 } 2929 } 2930 2931 for_each_online_node(node) { 2932 struct kmem_cache_node *n = get_node(s, node); 2933 2934 if (flags & SO_PARTIAL) 
{ 2935 if (flags & SO_OBJECTS) 2936 x = count_partial(n); 2937 else 2938 x = n->nr_partial; 2939 total += x; 2940 nodes[node] += x; 2941 } 2942 2943 if (flags & SO_FULL) { 2944 int full_slabs = atomic_read(&n->nr_slabs) 2945 - per_cpu[node] 2946 - n->nr_partial; 2947 2948 if (flags & SO_OBJECTS) 2949 x = full_slabs * s->objects; 2950 else 2951 x = full_slabs; 2952 total += x; 2953 nodes[node] += x; 2954 } 2955 } 2956 2957 x = sprintf(buf, "%lu", total); 2958#ifdef CONFIG_NUMA 2959 for_each_online_node(node) 2960 if (nodes[node]) 2961 x += sprintf(buf + x, " N%d=%lu", 2962 node, nodes[node]); 2963#endif 2964 kfree(nodes); 2965 return x + sprintf(buf + x, "\n"); 2966} 2967 2968static int any_slab_objects(struct kmem_cache *s) 2969{ 2970 int node; 2971 int cpu; 2972 2973 for_each_possible_cpu(cpu) 2974 if (s->cpu_slab[cpu]) 2975 return 1; 2976 2977 for_each_node(node) { 2978 struct kmem_cache_node *n = get_node(s, node); 2979 2980 if (n->nr_partial || atomic_read(&n->nr_slabs)) 2981 return 1; 2982 } 2983 return 0; 2984} 2985 2986#define to_slab_attr(n) container_of(n, struct slab_attribute, attr) 2987#define to_slab(n) container_of(n, struct kmem_cache, kobj); 2988 2989struct slab_attribute { 2990 struct attribute attr; 2991 ssize_t (*show)(struct kmem_cache *s, char *buf); 2992 ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count); 2993}; 2994 2995#define SLAB_ATTR_RO(_name) \ 2996 static struct slab_attribute _name##_attr = __ATTR_RO(_name) 2997 2998#define SLAB_ATTR(_name) \ 2999 static struct slab_attribute _name##_attr = \ 3000 __ATTR(_name, 0644, _name##_show, _name##_store) 3001 3002static ssize_t slab_size_show(struct kmem_cache *s, char *buf) 3003{ 3004 return sprintf(buf, "%d\n", s->size); 3005} 3006SLAB_ATTR_RO(slab_size); 3007 3008static ssize_t align_show(struct kmem_cache *s, char *buf) 3009{ 3010 return sprintf(buf, "%d\n", s->align); 3011} 3012SLAB_ATTR_RO(align); 3013 3014static ssize_t object_size_show(struct kmem_cache *s, char *buf) 3015{ 3016 return sprintf(buf, "%d\n", s->objsize); 3017} 3018SLAB_ATTR_RO(object_size); 3019 3020static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) 3021{ 3022 return sprintf(buf, "%d\n", s->objects); 3023} 3024SLAB_ATTR_RO(objs_per_slab); 3025 3026static ssize_t order_show(struct kmem_cache *s, char *buf) 3027{ 3028 return sprintf(buf, "%d\n", s->order); 3029} 3030SLAB_ATTR_RO(order); 3031 3032static ssize_t ctor_show(struct kmem_cache *s, char *buf) 3033{ 3034 if (s->ctor) { 3035 int n = sprint_symbol(buf, (unsigned long)s->ctor); 3036 3037 return n + sprintf(buf + n, "\n"); 3038 } 3039 return 0; 3040} 3041SLAB_ATTR_RO(ctor); 3042 3043static ssize_t dtor_show(struct kmem_cache *s, char *buf) 3044{ 3045 if (s->dtor) { 3046 int n = sprint_symbol(buf, (unsigned long)s->dtor); 3047 3048 return n + sprintf(buf + n, "\n"); 3049 } 3050 return 0; 3051} 3052SLAB_ATTR_RO(dtor); 3053 3054static ssize_t aliases_show(struct kmem_cache *s, char *buf) 3055{ 3056 return sprintf(buf, "%d\n", s->refcount - 1); 3057} 3058SLAB_ATTR_RO(aliases); 3059 3060static ssize_t slabs_show(struct kmem_cache *s, char *buf) 3061{ 3062 return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU); 3063} 3064SLAB_ATTR_RO(slabs); 3065 3066static ssize_t partial_show(struct kmem_cache *s, char *buf) 3067{ 3068 return slab_objects(s, buf, SO_PARTIAL); 3069} 3070SLAB_ATTR_RO(partial); 3071 3072static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf) 3073{ 3074 return slab_objects(s, buf, SO_CPU); 3075} 3076SLAB_ATTR_RO(cpu_slabs); 3077 3078static 
ssize_t objects_show(struct kmem_cache *s, char *buf) 3079{ 3080 return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS); 3081} 3082SLAB_ATTR_RO(objects); 3083 3084static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) 3085{ 3086 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); 3087} 3088 3089static ssize_t sanity_checks_store(struct kmem_cache *s, 3090 const char *buf, size_t length) 3091{ 3092 s->flags &= ~SLAB_DEBUG_FREE; 3093 if (buf[0] == '1') 3094 s->flags |= SLAB_DEBUG_FREE; 3095 return length; 3096} 3097SLAB_ATTR(sanity_checks); 3098 3099static ssize_t trace_show(struct kmem_cache *s, char *buf) 3100{ 3101 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); 3102} 3103 3104static ssize_t trace_store(struct kmem_cache *s, const char *buf, 3105 size_t length) 3106{ 3107 s->flags &= ~SLAB_TRACE; 3108 if (buf[0] == '1') 3109 s->flags |= SLAB_TRACE; 3110 return length; 3111} 3112SLAB_ATTR(trace); 3113 3114static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 3115{ 3116 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 3117} 3118 3119static ssize_t reclaim_account_store(struct kmem_cache *s, 3120 const char *buf, size_t length) 3121{ 3122 s->flags &= ~SLAB_RECLAIM_ACCOUNT; 3123 if (buf[0] == '1') 3124 s->flags |= SLAB_RECLAIM_ACCOUNT; 3125 return length; 3126} 3127SLAB_ATTR(reclaim_account); 3128 3129static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) 3130{ 3131 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); 3132} 3133SLAB_ATTR_RO(hwcache_align); 3134 3135#ifdef CONFIG_ZONE_DMA 3136static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) 3137{ 3138 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); 3139} 3140SLAB_ATTR_RO(cache_dma); 3141#endif 3142 3143static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) 3144{ 3145 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); 3146} 3147SLAB_ATTR_RO(destroy_by_rcu); 3148 3149static ssize_t red_zone_show(struct kmem_cache *s, char *buf) 3150{ 3151 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); 3152} 3153 3154static ssize_t red_zone_store(struct kmem_cache *s, 3155 const char *buf, size_t length) 3156{ 3157 if (any_slab_objects(s)) 3158 return -EBUSY; 3159 3160 s->flags &= ~SLAB_RED_ZONE; 3161 if (buf[0] == '1') 3162 s->flags |= SLAB_RED_ZONE; 3163 calculate_sizes(s); 3164 return length; 3165} 3166SLAB_ATTR(red_zone); 3167 3168static ssize_t poison_show(struct kmem_cache *s, char *buf) 3169{ 3170 return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON)); 3171} 3172 3173static ssize_t poison_store(struct kmem_cache *s, 3174 const char *buf, size_t length) 3175{ 3176 if (any_slab_objects(s)) 3177 return -EBUSY; 3178 3179 s->flags &= ~SLAB_POISON; 3180 if (buf[0] == '1') 3181 s->flags |= SLAB_POISON; 3182 calculate_sizes(s); 3183 return length; 3184} 3185SLAB_ATTR(poison); 3186 3187static ssize_t store_user_show(struct kmem_cache *s, char *buf) 3188{ 3189 return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); 3190} 3191 3192static ssize_t store_user_store(struct kmem_cache *s, 3193 const char *buf, size_t length) 3194{ 3195 if (any_slab_objects(s)) 3196 return -EBUSY; 3197 3198 s->flags &= ~SLAB_STORE_USER; 3199 if (buf[0] == '1') 3200 s->flags |= SLAB_STORE_USER; 3201 calculate_sizes(s); 3202 return length; 3203} 3204SLAB_ATTR(store_user); 3205 3206static ssize_t validate_show(struct kmem_cache *s, char *buf) 3207{ 3208 return 0; 3209} 3210 3211static ssize_t validate_store(struct kmem_cache *s, 
3212 const char *buf, size_t length) 3213{ 3214 if (buf[0] == '1') 3215 validate_slab_cache(s); 3216 else 3217 return -EINVAL; 3218 return length; 3219} 3220SLAB_ATTR(validate); 3221 3222static ssize_t shrink_show(struct kmem_cache *s, char *buf) 3223{ 3224 return 0; 3225} 3226 3227static ssize_t shrink_store(struct kmem_cache *s, 3228 const char *buf, size_t length) 3229{ 3230 if (buf[0] == '1') { 3231 int rc = kmem_cache_shrink(s); 3232 3233 if (rc) 3234 return rc; 3235 } else 3236 return -EINVAL; 3237 return length; 3238} 3239SLAB_ATTR(shrink); 3240 3241static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) 3242{ 3243 if (!(s->flags & SLAB_STORE_USER)) 3244 return -ENOSYS; 3245 return list_locations(s, buf, TRACK_ALLOC); 3246} 3247SLAB_ATTR_RO(alloc_calls); 3248 3249static ssize_t free_calls_show(struct kmem_cache *s, char *buf) 3250{ 3251 if (!(s->flags & SLAB_STORE_USER)) 3252 return -ENOSYS; 3253 return list_locations(s, buf, TRACK_FREE); 3254} 3255SLAB_ATTR_RO(free_calls); 3256 3257#ifdef CONFIG_NUMA 3258static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf) 3259{ 3260 return sprintf(buf, "%d\n", s->defrag_ratio / 10); 3261} 3262 3263static ssize_t defrag_ratio_store(struct kmem_cache *s, 3264 const char *buf, size_t length) 3265{ 3266 int n = simple_strtoul(buf, NULL, 10); 3267 3268 if (n < 100) 3269 s->defrag_ratio = n * 10; 3270 return length; 3271} 3272SLAB_ATTR(defrag_ratio); 3273#endif 3274 3275static struct attribute * slab_attrs[] = { 3276 &slab_size_attr.attr, 3277 &object_size_attr.attr, 3278 &objs_per_slab_attr.attr, 3279 &order_attr.attr, 3280 &objects_attr.attr, 3281 &slabs_attr.attr, 3282 &partial_attr.attr, 3283 &cpu_slabs_attr.attr, 3284 &ctor_attr.attr, 3285 &dtor_attr.attr, 3286 &aliases_attr.attr, 3287 &align_attr.attr, 3288 &sanity_checks_attr.attr, 3289 &trace_attr.attr, 3290 &hwcache_align_attr.attr, 3291 &reclaim_account_attr.attr, 3292 &destroy_by_rcu_attr.attr, 3293 &red_zone_attr.attr, 3294 &poison_attr.attr, 3295 &store_user_attr.attr, 3296 &validate_attr.attr, 3297 &shrink_attr.attr, 3298 &alloc_calls_attr.attr, 3299 &free_calls_attr.attr, 3300#ifdef CONFIG_ZONE_DMA 3301 &cache_dma_attr.attr, 3302#endif 3303#ifdef CONFIG_NUMA 3304 &defrag_ratio_attr.attr, 3305#endif 3306 NULL 3307}; 3308 3309static struct attribute_group slab_attr_group = { 3310 .attrs = slab_attrs, 3311}; 3312 3313static ssize_t slab_attr_show(struct kobject *kobj, 3314 struct attribute *attr, 3315 char *buf) 3316{ 3317 struct slab_attribute *attribute; 3318 struct kmem_cache *s; 3319 int err; 3320 3321 attribute = to_slab_attr(attr); 3322 s = to_slab(kobj); 3323 3324 if (!attribute->show) 3325 return -EIO; 3326 3327 err = attribute->show(s, buf); 3328 3329 return err; 3330} 3331 3332static ssize_t slab_attr_store(struct kobject *kobj, 3333 struct attribute *attr, 3334 const char *buf, size_t len) 3335{ 3336 struct slab_attribute *attribute; 3337 struct kmem_cache *s; 3338 int err; 3339 3340 attribute = to_slab_attr(attr); 3341 s = to_slab(kobj); 3342 3343 if (!attribute->store) 3344 return -EIO; 3345 3346 err = attribute->store(s, buf, len); 3347 3348 return err; 3349} 3350 3351static struct sysfs_ops slab_sysfs_ops = { 3352 .show = slab_attr_show, 3353 .store = slab_attr_store, 3354}; 3355 3356static struct kobj_type slab_ktype = { 3357 .sysfs_ops = &slab_sysfs_ops, 3358}; 3359 3360static int uevent_filter(struct kset *kset, struct kobject *kobj) 3361{ 3362 struct kobj_type *ktype = get_ktype(kobj); 3363 3364 if (ktype == &slab_ktype) 3365 return 1; 3366 return 0; 
3367} 3368 3369static struct kset_uevent_ops slab_uevent_ops = { 3370 .filter = uevent_filter, 3371}; 3372 3373decl_subsys(slab, &slab_ktype, &slab_uevent_ops); 3374 3375#define ID_STR_LENGTH 64 3376 3377/* Create a unique string id for a slab cache: 3378 * format 3379 * :[flags-]size:[memory address of kmemcache] 3380 */ 3381static char *create_unique_id(struct kmem_cache *s) 3382{ 3383 char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); 3384 char *p = name; 3385 3386 BUG_ON(!name); 3387 3388 *p++ = ':'; 3389 /* 3390 * First flags affecting slabcache operations. We will only 3391 * get here for aliasable slabs so we do not need to support 3392 * too many flags. The flags here must cover all flags that 3393 * are matched during merging to guarantee that the id is 3394 * unique. 3395 */ 3396 if (s->flags & SLAB_CACHE_DMA) 3397 *p++ = 'd'; 3398 if (s->flags & SLAB_RECLAIM_ACCOUNT) 3399 *p++ = 'a'; 3400 if (s->flags & SLAB_DEBUG_FREE) 3401 *p++ = 'F'; 3402 if (p != name + 1) 3403 *p++ = '-'; 3404 p += sprintf(p, "%07d", s->size); 3405 BUG_ON(p > name + ID_STR_LENGTH - 1); 3406 return name; 3407} 3408 3409static int sysfs_slab_add(struct kmem_cache *s) 3410{ 3411 int err; 3412 const char *name; 3413 int unmergeable; 3414 3415 if (slab_state < SYSFS) 3416 /* Defer until later */ 3417 return 0; 3418 3419 unmergeable = slab_unmergeable(s); 3420 if (unmergeable) { 3421 /* 3422 * Slabcache can never be merged so we can use the name proper. 3423 * This is typically the case for debug situations. In that 3424 * case we can catch duplicate names easily. 3425 */ 3426 sysfs_remove_link(&slab_subsys.kobj, s->name); 3427 name = s->name; 3428 } else { 3429 /* 3430 * Create a unique name for the slab as a target 3431 * for the symlinks. 3432 */ 3433 name = create_unique_id(s); 3434 } 3435 3436 kobj_set_kset_s(s, slab_subsys); 3437 kobject_set_name(&s->kobj, name); 3438 kobject_init(&s->kobj); 3439 err = kobject_add(&s->kobj); 3440 if (err) 3441 return err; 3442 3443 err = sysfs_create_group(&s->kobj, &slab_attr_group); 3444 if (err) 3445 return err; 3446 kobject_uevent(&s->kobj, KOBJ_ADD); 3447 if (!unmergeable) { 3448 /* Setup first alias */ 3449 sysfs_slab_alias(s, s->name); 3450 kfree(name); 3451 } 3452 return 0; 3453} 3454 3455static void sysfs_slab_remove(struct kmem_cache *s) 3456{ 3457 kobject_uevent(&s->kobj, KOBJ_REMOVE); 3458 kobject_del(&s->kobj); 3459} 3460 3461/* 3462 * Need to buffer aliases during bootup until sysfs becomes 3463 * available lest we lose that information. 3464 */ 3465struct saved_alias { 3466 struct kmem_cache *s; 3467 const char *name; 3468 struct saved_alias *next; 3469}; 3470 3471struct saved_alias *alias_list; 3472 3473static int sysfs_slab_alias(struct kmem_cache *s, const char *name) 3474{ 3475 struct saved_alias *al; 3476 3477 if (slab_state == SYSFS) { 3478 /* 3479 * If we have a leftover link then remove it.
3480 */ 3481 sysfs_remove_link(&slab_subsys.kobj, name); 3482 return sysfs_create_link(&slab_subsys.kobj, 3483 &s->kobj, name); 3484 } 3485 3486 al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL); 3487 if (!al) 3488 return -ENOMEM; 3489 3490 al->s = s; 3491 al->name = name; 3492 al->next = alias_list; 3493 alias_list = al; 3494 return 0; 3495} 3496 3497static int __init slab_sysfs_init(void) 3498{ 3499 int err; 3500 3501 err = subsystem_register(&slab_subsys); 3502 if (err) { 3503 printk(KERN_ERR "Cannot register slab subsystem.\n"); 3504 return -ENOSYS; 3505 } 3506 3507 finish_bootstrap(); 3508 3509 while (alias_list) { 3510 struct saved_alias *al = alias_list; 3511 3512 alias_list = alias_list->next; 3513 err = sysfs_slab_alias(al->s, al->name); 3514 BUG_ON(err); 3515 kfree(al); 3516 } 3517 3518 resiliency_test(); 3519 return 0; 3520} 3521 3522__initcall(slab_sysfs_init); 3523#else 3524__initcall(finish_bootstrap); 3525#endif 3526
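/*
 * Usage sketch (illustrative only; struct foo, foo_cache, foo_init() and
 * foo_use() are made-up names for this example and not part of the kernel
 * tree).  It shows how the cache API implemented above is normally
 * consumed: create a cache once, then allocate and free objects from it.
 * The block is wrapped in #if 0 so it is never compiled.
 *
 * The debug facilities in this file are driven the same way on a real
 * system: booting with e.g. slub_debug=FZPU,kmalloc-64 enables sanity
 * checks, red zoning, poisoning and user tracking for that cache (see
 * setup_slub_debug() above), and writing '1' to the per cache sysfs
 * "validate" attribute (typically /sys/slab/<cache>/validate) runs
 * validate_slab_cache() on it.
 */
#if 0
struct foo {
	int bar;
	struct list_head list;
};

static struct kmem_cache *foo_cache;

static int __init foo_init(void)
{
	/*
	 * May be merged with a compatible existing cache by
	 * find_mergeable() unless slub_nomerge or debugging is active.
	 */
	foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
					SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!foo_cache)
		return -ENOMEM;
	return 0;
}

static void foo_use(void)
{
	/* Zeroed object, taken from the cpu slab when possible. */
	struct foo *f = kmem_cache_zalloc(foo_cache, GFP_KERNEL);

	if (!f)
		return;
	f->bar = 1;
	kmem_cache_free(foo_cache, f);
}
#endif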