slub.c revision dfb4f09609827301740ef0a11b37530d190f1681
/*
 * SLUB: A slab allocator that limits cache line use instead of queuing
 * objects in per cpu and per node lists.
 *
 * The allocator synchronizes using per slab locks and only
 * uses a centralized lock to manage a pool of partial slabs.
 *
 * (C) 2007 SGI, Christoph Lameter <clameter@sgi.com>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/bit_spinlock.h>
#include <linux/interrupt.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
#include <linux/ctype.h>
#include <linux/kallsyms.h>

/*
 * Lock order:
 *   1. slab_lock(page)
 *   2. slab->list_lock
 *
 * The slab_lock protects operations on the objects of a particular
 * slab and its metadata in the page struct. If the slab lock
 * has been taken then no allocations nor frees can be performed
 * on the objects in the slab nor can the slab be added to or removed
 * from the partial or full lists since this would mean modifying
 * the page struct of the slab.
 *
 * The list_lock protects the partial and full list on each node and
 * the partial slab counter. If taken then no new slabs may be added or
 * removed from the lists nor can the number of partial slabs be modified.
 * (Note that the total number of slabs is an atomic value that may be
 * modified without taking the list lock).
 *
 * The list_lock is a centralized lock and thus we avoid taking it as
 * much as possible. As long as SLUB does not have to handle partial
 * slabs, operations can continue without any centralized lock. E.g.
 * allocating a long series of objects that fill up slabs does not require
 * the list lock.
 *
 * The lock order is sometimes inverted when we are trying to get a slab
 * off a list. We take the list_lock and then look for a page on the list
 * to use. While we do that objects in the slabs may be freed. We can
 * only operate on the slab if we have also taken the slab_lock. So we use
 * a slab_trylock() on the slab. If trylock was successful then no frees
 * can occur anymore and we can use the slab for allocations etc. If the
 * slab_trylock() does not succeed then frees are in progress in the slab and
 * we must stay away from it for a while since we may cause a bouncing
 * cacheline if we try to acquire the lock. So go onto the next slab.
 * If all pages are busy then we may allocate a new slab instead of reusing
 * a partial slab. A new slab has no one operating on it and thus there is
 * no danger of cacheline contention.
 *
 * Interrupts are disabled during allocation and deallocation in order to
 * make the slab allocator safe to use in the context of an irq. In addition
 * interrupts are disabled to ensure that the processor does not change
 * while handling per_cpu slabs, due to kernel preemption.
 *
 * SLUB assigns one slab for allocation to each processor.
 * Allocations only occur from these slabs called cpu slabs.
 *
 * Slabs with free elements are kept on a partial list and during regular
 * operations no list for full slabs is used. If an object in a full slab is
 * freed then the slab will show up again on the partial lists.
 * We track full slabs for debugging purposes though because otherwise we
 * cannot scan all objects.
 *
 * Slabs are freed when they become empty. Teardown and setup is
 * minimal so we rely on the page allocator's per cpu caches for
 * fast frees and allocs.
 *
 * Overloading of page flags that are otherwise used for LRU management.
 *
 * PageActive		The slab is frozen and exempt from list processing.
 * 			This means that the slab is dedicated to a purpose
 * 			such as satisfying allocations for a specific
 * 			processor. Objects may be freed in the slab while
 * 			it is frozen but slab_free will then skip the usual
 * 			list operations. It is up to the processor holding
 * 			the slab to integrate the slab into the slab lists
 * 			when the slab is no longer needed.
 *
 * 			One use of this flag is to mark slabs that are
 * 			used for allocations. Then such a slab becomes a cpu
 * 			slab. The cpu slab may be equipped with an additional
 * 			freelist that allows lockless access to
 * 			free objects in addition to the regular freelist
 * 			that requires the slab lock.
 *
 * PageError		Slab requires special handling due to debug
 * 			options set. This moves slab handling out of
 * 			the fast path and disables lockless freelists.
 */

#define FROZEN (1 << PG_active)

#ifdef CONFIG_SLUB_DEBUG
#define SLABDEBUG (1 << PG_error)
#else
#define SLABDEBUG 0
#endif

static inline int SlabFrozen(struct page *page)
{
	return page->flags & FROZEN;
}

static inline void SetSlabFrozen(struct page *page)
{
	page->flags |= FROZEN;
}

static inline void ClearSlabFrozen(struct page *page)
{
	page->flags &= ~FROZEN;
}

static inline int SlabDebug(struct page *page)
{
	return page->flags & SLABDEBUG;
}

static inline void SetSlabDebug(struct page *page)
{
	page->flags |= SLABDEBUG;
}

static inline void ClearSlabDebug(struct page *page)
{
	page->flags &= ~SLABDEBUG;
}

/*
 * Issues still to be resolved:
 *
 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
 *
 * - Variable sizing of the per node arrays
 */

/* Enable to test recovery from slab corruption on boot */
#undef SLUB_RESILIENCY_TEST

#if PAGE_SHIFT <= 12

/*
 * Small page size. Make sure that we do not fragment memory
 */
#define DEFAULT_MAX_ORDER 1
#define DEFAULT_MIN_OBJECTS 4

#else

/*
 * Large page machines are customarily able to handle larger
 * page orders.
 */
#define DEFAULT_MAX_ORDER 2
#define DEFAULT_MIN_OBJECTS 8

#endif

/*
 * Minimum number of partial slabs. These will be left on the partial
 * lists even if they are empty. kmem_cache_shrink may reclaim them.
 */
#define MIN_PARTIAL 2

/*
 * Maximum number of desirable partial slabs.
 * The existence of more partial slabs makes kmem_cache_shrink
 * sort the partial list by the number of objects in them.
 */
#define MAX_PARTIAL 10

#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
				SLAB_POISON | SLAB_STORE_USER)

/*
 * Set of flags that will prevent slab merging
 */
#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
		SLAB_TRACE | SLAB_DESTROY_BY_RCU)

#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
		SLAB_CACHE_DMA)

#ifndef ARCH_KMALLOC_MINALIGN
#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
#endif

#ifndef ARCH_SLAB_MINALIGN
#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
#endif

/*
 * The page->inuse field is 16 bit thus we have this limitation
 */
#define MAX_OBJECTS_PER_SLAB 65535

/* Internal SLUB flags */
#define __OBJECT_POISON		0x80000000	/* Poison object */
#define __SYSFS_ADD_DEFERRED	0x40000000	/* Not yet visible via sysfs */

/* Not all arches define cache_line_size */
#ifndef cache_line_size
#define cache_line_size() L1_CACHE_BYTES
#endif

static int kmem_size = sizeof(struct kmem_cache);

#ifdef CONFIG_SMP
static struct notifier_block slab_notifier;
#endif

static enum {
	DOWN,		/* No slab functionality available */
	PARTIAL,	/* kmem_cache_open() works but kmalloc does not */
	UP,		/* Everything works but does not show up in sysfs */
	SYSFS		/* Sysfs up */
} slab_state = DOWN;

/* A list of all slab caches on the system */
static DECLARE_RWSEM(slub_lock);
static LIST_HEAD(slab_caches);

/*
 * Tracking user of a slab.
 */
struct track {
	void *addr;		/* Called from address */
	int cpu;		/* Was running on cpu */
	int pid;		/* Pid context */
	unsigned long when;	/* When did the operation occur */
};

enum track_item { TRACK_ALLOC, TRACK_FREE };

#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
static void sysfs_slab_remove(struct kmem_cache *);
#else
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
							{ return 0; }
static inline void sysfs_slab_remove(struct kmem_cache *s) {}
#endif

/********************************************************************
 * 			Core slab cache functions
 *******************************************************************/

int slab_is_available(void)
{
	return slab_state >= UP;
}

static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
{
#ifdef CONFIG_NUMA
	return s->node[node];
#else
	return &s->local_node;
#endif
}

static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
{
	return &s->cpu_slab[cpu];
}

static inline int check_valid_pointer(struct kmem_cache *s,
				struct page *page, const void *object)
{
	void *base;

	if (!object)
		return 1;

	base = page_address(page);
	if (object < base || object >= base + s->objects * s->size ||
		(object - base) % s->size) {
		return 0;
	}

	return 1;
}
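
/*
 * Example (illustrative, values hypothetical): for a cache with
 * s->size = 64 on an order 0 slab, valid object addresses are base,
 * base + 64, base + 128, ... A pointer such as base + 65 fails the
 * (object - base) % s->size test above and is rejected.
 */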

/*
 * Slow version of get and set free pointer.
 *
 * This version requires touching the cache lines of kmem_cache which
 * we avoid to do in the fast alloc free paths. There we obtain the offset
 * from the page struct.
 */
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
	return *(void **)(object + s->offset);
}

static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
{
	*(void **)(object + s->offset) = fp;
}

/* Loop over all objects in a slab */
#define for_each_object(__p, __s, __addr) \
	for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\
			__p += (__s)->size)

/* Scan freelist */
#define for_each_free_object(__p, __s, __free) \
	for (__p = (__free); __p; __p = get_freepointer((__s), __p))

/* Determine object index from a given position */
static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
{
	return (p - addr) / s->size;
}

#ifdef CONFIG_SLUB_DEBUG
/*
 * Debug settings:
 */
#ifdef CONFIG_SLUB_DEBUG_ON
static int slub_debug = DEBUG_DEFAULT_FLAGS;
#else
static int slub_debug;
#endif

static char *slub_debug_slabs;

/*
 * Object debugging
 */
static void print_section(char *text, u8 *addr, unsigned int length)
{
	int i, offset;
	int newline = 1;
	char ascii[17];

	ascii[16] = 0;

	for (i = 0; i < length; i++) {
		if (newline) {
			printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
			newline = 0;
		}
		printk(" %02x", addr[i]);
		offset = i % 16;
		ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
		if (offset == 15) {
			printk(" %s\n", ascii);
			newline = 1;
		}
	}
	if (!newline) {
		i %= 16;
		while (i < 16) {
			printk("   ");
			ascii[i] = ' ';
			i++;
		}
		printk(" %s\n", ascii);
	}
}

static struct track *get_track(struct kmem_cache *s, void *object,
	enum track_item alloc)
{
	struct track *p;

	if (s->offset)
		p = object + s->offset + sizeof(void *);
	else
		p = object + s->inuse;

	return p + alloc;
}

static void set_track(struct kmem_cache *s, void *object,
			enum track_item alloc, void *addr)
{
	struct track *p;

	if (s->offset)
		p = object + s->offset + sizeof(void *);
	else
		p = object + s->inuse;

	p += alloc;
	if (addr) {
		p->addr = addr;
		p->cpu = smp_processor_id();
		p->pid = current ? current->pid : -1;
		p->when = jiffies;
	} else
		memset(p, 0, sizeof(struct track));
}

static void init_tracking(struct kmem_cache *s, void *object)
{
	if (!(s->flags & SLAB_STORE_USER))
		return;

	set_track(s, object, TRACK_FREE, NULL);
	set_track(s, object, TRACK_ALLOC, NULL);
}

static void print_track(const char *s, struct track *t)
{
	if (!t->addr)
		return;

	printk(KERN_ERR "INFO: %s in ", s);
	__print_symbol("%s", (unsigned long)t->addr);
	printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
}

static void print_tracking(struct kmem_cache *s, void *object)
{
	if (!(s->flags & SLAB_STORE_USER))
		return;

	print_track("Allocated", get_track(s, object, TRACK_ALLOC));
	print_track("Freed", get_track(s, object, TRACK_FREE));
}

static void print_page_info(struct page *page)
{
	printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n",
		page, page->inuse, page->freelist, page->flags);
}
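
/*
 * Example (illustrative): with SLAB_STORE_USER set, two struct track
 * records follow the object metadata. For an object at p in a cache
 * without a relocated free pointer (s->offset == 0), the allocation
 * record sits at p + s->inuse and the free record directly after it,
 * which is what get_track() computes via "p + alloc".
 */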

static void slab_bug(struct kmem_cache *s, char *fmt, ...)
{
	va_list args;
	char buf[100];

	va_start(args, fmt);
	vsnprintf(buf, sizeof(buf), fmt, args);
	va_end(args);
	printk(KERN_ERR "========================================"
			"=====================================\n");
	printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
	printk(KERN_ERR "----------------------------------------"
			"-------------------------------------\n\n");
}

static void slab_fix(struct kmem_cache *s, char *fmt, ...)
{
	va_list args;
	char buf[100];

	va_start(args, fmt);
	vsnprintf(buf, sizeof(buf), fmt, args);
	va_end(args);
	printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
}

static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
{
	unsigned int off;	/* Offset of last byte */
	u8 *addr = page_address(page);

	print_tracking(s, p);

	print_page_info(page);

	printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
			p, p - addr, get_freepointer(s, p));

	if (p > addr + 16)
		print_section("Bytes b4", p - 16, 16);

	print_section("Object", p, min(s->objsize, 128));

	if (s->flags & SLAB_RED_ZONE)
		print_section("Redzone", p + s->objsize,
			s->inuse - s->objsize);

	if (s->offset)
		off = s->offset + sizeof(void *);
	else
		off = s->inuse;

	if (s->flags & SLAB_STORE_USER)
		off += 2 * sizeof(struct track);

	if (off != s->size)
		/* Beginning of the filler is the free pointer */
		print_section("Padding", p + off, s->size - off);

	dump_stack();
}

static void object_err(struct kmem_cache *s, struct page *page,
			u8 *object, char *reason)
{
	slab_bug(s, reason);
	print_trailer(s, page, object);
}

static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
{
	va_list args;
	char buf[100];

	va_start(args, fmt);
	vsnprintf(buf, sizeof(buf), fmt, args);
	va_end(args);
	/* Report the formatted message, not the raw format string */
	slab_bug(s, "%s", buf);
	print_page_info(page);
	dump_stack();
}

static void init_object(struct kmem_cache *s, void *object, int active)
{
	u8 *p = object;

	if (s->flags & __OBJECT_POISON) {
		memset(p, POISON_FREE, s->objsize - 1);
		p[s->objsize - 1] = POISON_END;
	}

	if (s->flags & SLAB_RED_ZONE)
		memset(p + s->objsize,
			active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
			s->inuse - s->objsize);
}

static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
{
	while (bytes) {
		if (*start != (u8)value)
			return start;
		start++;
		bytes--;
	}
	return NULL;
}

static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
						void *from, void *to)
{
	slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
	memset(from, data, to - from);
}

static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
			u8 *object, char *what,
			u8 *start, unsigned int value, unsigned int bytes)
{
	u8 *fault;
	u8 *end;

	fault = check_bytes(start, value, bytes);
	if (!fault)
		return 1;

	end = start + bytes;
	while (end > fault && end[-1] == value)
		end--;

	slab_bug(s, "%s overwritten", what);
	printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
					fault, end - 1, fault[0], value);
	print_trailer(s, page, object);

	restore_bytes(s, what, value, fault, end);
	return 0;
}

/*
 * Object layout:
 *
 * object address
 * 	Bytes of the object to be managed.
 * 	If the freepointer may overlay the object then the free
 * 	pointer is the first word of the object.
 *
 * 	Poisoning uses 0x6b (POISON_FREE) and the last byte is
 * 	0xa5 (POISON_END)
 *
 * object + s->objsize
 * 	Padding to reach word boundary. This is also used for Redzoning.
 * 	Padding is extended by another word if Redzoning is enabled and
 * 	objsize == inuse.
 *
 * 	We fill with 0xbb (RED_INACTIVE) for inactive objects and with
 * 	0xcc (RED_ACTIVE) for objects in use.
 *
 * object + s->inuse
 * 	Meta data starts here.
 *
 * 	A. Free pointer (if we cannot overwrite object on free)
 * 	B. Tracking data for SLAB_STORE_USER
 * 	C. Padding to reach required alignment boundary or at minimum
 * 		one word if debugging is on to be able to detect writes
 * 		before the word boundary.
 *
 *	Padding is done using 0x5a (POISON_INUSE)
 *
 * object + s->size
 * 	Nothing is used beyond s->size.
 *
 * If slabcaches are merged then the objsize and inuse boundaries are mostly
 * ignored. And therefore no slab options that rely on these boundaries
 * may be used with merged slabcaches.
 */

static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
{
	unsigned long off = s->inuse;	/* The end of info */

	if (s->offset)
		/* Freepointer is placed after the object. */
		off += sizeof(void *);

	if (s->flags & SLAB_STORE_USER)
		/* We also have user information there */
		off += 2 * sizeof(struct track);

	if (s->size == off)
		return 1;

	return check_bytes_and_report(s, page, p, "Object padding",
				p + off, POISON_INUSE, s->size - off);
}

static int slab_pad_check(struct kmem_cache *s, struct page *page)
{
	u8 *start;
	u8 *fault;
	u8 *end;
	int length;
	int remainder;

	if (!(s->flags & SLAB_POISON))
		return 1;

	start = page_address(page);
	end = start + (PAGE_SIZE << s->order);
	length = s->objects * s->size;
	remainder = end - (start + length);
	if (!remainder)
		return 1;

	fault = check_bytes(start + length, POISON_INUSE, remainder);
	if (!fault)
		return 1;
	while (end > fault && end[-1] == POISON_INUSE)
		end--;

	slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
	print_section("Padding", start, length);

	restore_bytes(s, "slab padding", POISON_INUSE, start, end);
	return 0;
}
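
/*
 * Worked example (hypothetical 64 bit cache, red zoning, poisoning and
 * SLAB_STORE_USER enabled, objsize = 20): the word-aligned layout would be
 *
 *	 0..19	object bytes (poisoned with 0x6b, last byte 0xa5 when free)
 *	20..23	padding to the word boundary, doubling as the red zone
 *		(s->inuse = 24)
 *	24..31	free pointer, relocated behind the object because poisoning
 *		would otherwise overwrite it
 *	32..79	two struct track records (alloc and free)
 *	80..87	one extra word of padding so writes before the following
 *		object can be detected (s->size = 88)
 *
 * The numbers assume 8 byte words and a 24 byte struct track; they only
 * illustrate the layout described above.
 */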

static int check_object(struct kmem_cache *s, struct page *page,
					void *object, int active)
{
	u8 *p = object;
	u8 *endobject = object + s->objsize;

	if (s->flags & SLAB_RED_ZONE) {
		unsigned int red =
			active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;

		if (!check_bytes_and_report(s, page, object, "Redzone",
			endobject, red, s->inuse - s->objsize))
			return 0;
	} else {
		if ((s->flags & SLAB_POISON) && s->objsize < s->inuse)
			check_bytes_and_report(s, page, p, "Alignment padding",
				endobject, POISON_INUSE,
				s->inuse - s->objsize);
	}

	if (s->flags & SLAB_POISON) {
		if (!active && (s->flags & __OBJECT_POISON) &&
			(!check_bytes_and_report(s, page, p, "Poison", p,
					POISON_FREE, s->objsize - 1) ||
			 !check_bytes_and_report(s, page, p, "Poison",
				p + s->objsize - 1, POISON_END, 1)))
			return 0;
		/*
		 * check_pad_bytes cleans up on its own.
		 */
		check_pad_bytes(s, page, p);
	}

	if (!s->offset && active)
		/*
		 * Object and freepointer overlap. Cannot check
		 * freepointer while object is allocated.
		 */
		return 1;

	/* Check free pointer validity */
	if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
		object_err(s, page, p, "Freepointer corrupt");
		/*
		 * No choice but to zap it and thus lose the remainder
		 * of the free objects in this slab. May cause
		 * another error because the object count is now wrong.
		 */
		set_freepointer(s, p, NULL);
		return 0;
	}
	return 1;
}

static int check_slab(struct kmem_cache *s, struct page *page)
{
	VM_BUG_ON(!irqs_disabled());

	if (!PageSlab(page)) {
		slab_err(s, page, "Not a valid slab page");
		return 0;
	}
	if (page->offset * sizeof(void *) != s->offset) {
		slab_err(s, page, "Corrupted offset %lu",
			(unsigned long)(page->offset * sizeof(void *)));
		return 0;
	}
	if (page->inuse > s->objects) {
		slab_err(s, page, "inuse %u > max %u",
			page->inuse, s->objects);
		return 0;
	}
	/* Slab_pad_check fixes things up after itself */
	slab_pad_check(s, page);
	return 1;
}

/*
 * Determine if a certain object on a page is on the freelist. Must hold the
 * slab lock to guarantee that the chains are in a consistent state.
 */
static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
{
	int nr = 0;
	void *fp = page->freelist;
	void *object = NULL;

	while (fp && nr <= s->objects) {
		if (fp == search)
			return 1;
		if (!check_valid_pointer(s, page, fp)) {
			if (object) {
				object_err(s, page, object,
					"Freechain corrupt");
				set_freepointer(s, object, NULL);
				break;
			} else {
				slab_err(s, page, "Freepointer corrupt");
				page->freelist = NULL;
				page->inuse = s->objects;
				slab_fix(s, "Freelist cleared");
				return 0;
			}
		}
		object = fp;
		fp = get_freepointer(s, object);
		nr++;
	}

	if (page->inuse != s->objects - nr) {
		slab_err(s, page, "Wrong object count. Counter is %d but "
			"counted were %d", page->inuse, s->objects - nr);
		page->inuse = s->objects - nr;
		slab_fix(s, "Object count adjusted.");
	}
	return search == NULL;
}

static void trace(struct kmem_cache *s, struct page *page, void *object,
								int alloc)
{
	if (s->flags & SLAB_TRACE) {
		printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
			s->name,
			alloc ? "alloc" : "free",
			object, page->inuse,
			page->freelist);

		if (!alloc)
			print_section("Object", (void *)object, s->objsize);

		dump_stack();
	}
}
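
/*
 * Example (illustrative): on a slab with s->objects = 8 of which three
 * are allocated, walking page->freelist visits nr = 5 free objects, so
 * page->inuse must equal 8 - 5 = 3. Any other value is reported and
 * corrected by on_freelist() above. The nr <= s->objects bound also
 * stops the walk if a corrupted freelist forms a cycle.
 */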

/*
 * Tracking of fully allocated slabs for debugging purposes.
 */
static void add_full(struct kmem_cache_node *n, struct page *page)
{
	spin_lock(&n->list_lock);
	list_add(&page->lru, &n->full);
	spin_unlock(&n->list_lock);
}

static void remove_full(struct kmem_cache *s, struct page *page)
{
	struct kmem_cache_node *n;

	if (!(s->flags & SLAB_STORE_USER))
		return;

	n = get_node(s, page_to_nid(page));

	spin_lock(&n->list_lock);
	list_del(&page->lru);
	spin_unlock(&n->list_lock);
}

static void setup_object_debug(struct kmem_cache *s, struct page *page,
								void *object)
{
	if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
		return;

	init_object(s, object, 0);
	init_tracking(s, object);
}

static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
						void *object, void *addr)
{
	if (!check_slab(s, page))
		goto bad;

	if (object && !on_freelist(s, page, object)) {
		object_err(s, page, object, "Object already allocated");
		goto bad;
	}

	if (!check_valid_pointer(s, page, object)) {
		object_err(s, page, object, "Freelist Pointer check fails");
		goto bad;
	}

	if (object && !check_object(s, page, object, 0))
		goto bad;

	/* Success. Perform special debug activities for allocs */
	if (s->flags & SLAB_STORE_USER)
		set_track(s, object, TRACK_ALLOC, addr);
	trace(s, page, object, 1);
	init_object(s, object, 1);
	return 1;

bad:
	if (PageSlab(page)) {
		/*
		 * If this is a slab page then lets do the best we can
		 * to avoid issues in the future. Marking all objects
		 * as used avoids touching the remaining objects.
		 */
		slab_fix(s, "Marking all objects used");
		page->inuse = s->objects;
		page->freelist = NULL;
		/* Fix up fields that may be corrupted */
		page->offset = s->offset / sizeof(void *);
	}
	return 0;
}

static int free_debug_processing(struct kmem_cache *s, struct page *page,
						void *object, void *addr)
{
	if (!check_slab(s, page))
		goto fail;

	if (!check_valid_pointer(s, page, object)) {
		slab_err(s, page, "Invalid object pointer 0x%p", object);
		goto fail;
	}

	if (on_freelist(s, page, object)) {
		object_err(s, page, object, "Object already free");
		goto fail;
	}

	if (!check_object(s, page, object, 1))
		return 0;

	if (unlikely(s != page->slab)) {
		if (!PageSlab(page))
			slab_err(s, page, "Attempt to free object(0x%p) "
				"outside of slab", object);
		else if (!page->slab) {
			printk(KERN_ERR
				"SLUB <none>: no slab for object 0x%p.\n",
						object);
			dump_stack();
		} else
			object_err(s, page, object,
					"page slab pointer corrupt.");
		goto fail;
	}

	/* Special debug activities for freeing objects */
	if (!SlabFrozen(page) && !page->freelist)
		remove_full(s, page);
	if (s->flags & SLAB_STORE_USER)
		set_track(s, object, TRACK_FREE, addr);
	trace(s, page, object, 0);
	init_object(s, object, 0);
	return 1;

fail:
	slab_fix(s, "Object at 0x%p not freed", object);
	return 0;
}
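
/*
 * Usage sketch for the boot parameter parsed below (derived from the
 * option letters in setup_slub_debug; the cache name is hypothetical):
 *
 *	slub_debug		enable all debug options for all slabs
 *	slub_debug=FZ		sanity checks (F) and red zoning (Z) for
 *				all slabs
 *	slub_debug=,dentry	default debug options, but only for caches
 *				whose name begins with "dentry"
 *	slub_debug=-		switch all debugging off
 */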

static int __init setup_slub_debug(char *str)
{
	slub_debug = DEBUG_DEFAULT_FLAGS;
	if (*str++ != '=' || !*str)
		/*
		 * No options specified. Switch on full debugging.
		 */
		goto out;

	if (*str == ',')
		/*
		 * No options but restriction on slabs. This means full
		 * debugging for slabs matching a pattern.
		 */
		goto check_slabs;

	slub_debug = 0;
	if (*str == '-')
		/*
		 * Switch off all debugging measures.
		 */
		goto out;

	/*
	 * Determine which debug features should be switched on
	 */
	for (; *str && *str != ','; str++) {
		switch (tolower(*str)) {
		case 'f':
			slub_debug |= SLAB_DEBUG_FREE;
			break;
		case 'z':
			slub_debug |= SLAB_RED_ZONE;
			break;
		case 'p':
			slub_debug |= SLAB_POISON;
			break;
		case 'u':
			slub_debug |= SLAB_STORE_USER;
			break;
		case 't':
			slub_debug |= SLAB_TRACE;
			break;
		default:
			printk(KERN_ERR "slub_debug option '%c' "
				"unknown. skipped\n", *str);
		}
	}

check_slabs:
	if (*str == ',')
		slub_debug_slabs = str + 1;
out:
	return 1;
}

__setup("slub_debug", setup_slub_debug);

static unsigned long kmem_cache_flags(unsigned long objsize,
	unsigned long flags, const char *name,
	void (*ctor)(void *, struct kmem_cache *, unsigned long))
{
	/*
	 * The page->offset field is only 16 bit wide. This is an offset
	 * in units of words from the beginning of an object. If the slab
	 * size is bigger than that we cannot move the free pointer behind
	 * the object anymore.
	 *
	 * On 32 bit platforms the limit is 256k. On 64 bit platforms
	 * the limit is 512k.
	 *
	 * Debugging or ctor may create a need to move the free
	 * pointer. Fail if this happens.
	 */
	if (objsize >= 65535 * sizeof(void *)) {
		BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON |
				SLAB_STORE_USER | SLAB_DESTROY_BY_RCU));
		BUG_ON(ctor);
	} else {
		/*
		 * Enable debugging if selected on the kernel commandline.
		 */
		if (slub_debug && (!slub_debug_slabs ||
		    strncmp(slub_debug_slabs, name,
			strlen(slub_debug_slabs)) == 0))
			flags |= slub_debug;
	}

	return flags;
}
#else
static inline void setup_object_debug(struct kmem_cache *s,
			struct page *page, void *object) {}

static inline int alloc_debug_processing(struct kmem_cache *s,
	struct page *page, void *object, void *addr) { return 0; }

static inline int free_debug_processing(struct kmem_cache *s,
	struct page *page, void *object, void *addr) { return 0; }

static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
			{ return 1; }
static inline int check_object(struct kmem_cache *s, struct page *page,
			void *object, int active) { return 1; }
static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
static inline unsigned long kmem_cache_flags(unsigned long objsize,
	unsigned long flags, const char *name,
	void (*ctor)(void *, struct kmem_cache *, unsigned long))
{
	return flags;
}
#define slub_debug 0
#endif

/*
 * Slab allocation and freeing
 */
static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
{
	struct page *page;
	int pages = 1 << s->order;

	if (s->order)
		flags |= __GFP_COMP;

	if (s->flags & SLAB_CACHE_DMA)
		flags |= SLUB_DMA;

	if (s->flags & SLAB_RECLAIM_ACCOUNT)
		flags |= __GFP_RECLAIMABLE;

	if (node == -1)
		page = alloc_pages(flags, s->order);
	else
		page = alloc_pages_node(node, flags, s->order);

	if (!page)
		return NULL;

	mod_zone_page_state(page_zone(page),
		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
		pages);

	return page;
}

static void setup_object(struct kmem_cache *s, struct page *page,
				void *object)
{
	setup_object_debug(s, page, object);
	if (unlikely(s->ctor))
		s->ctor(object, s, 0);
}

static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
	struct page *page;
	struct kmem_cache_node *n;
	void *start;
	void *end;
	void *last;
	void *p;

	BUG_ON(flags & GFP_SLAB_BUG_MASK);

	if (flags & __GFP_WAIT)
		local_irq_enable();

	page = allocate_slab(s,
		flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
	if (!page)
		goto out;

	n = get_node(s, page_to_nid(page));
	if (n)
		atomic_long_inc(&n->nr_slabs);
	page->offset = s->offset / sizeof(void *);
	page->slab = s;
	page->flags |= 1 << PG_slab;
	if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
			SLAB_STORE_USER | SLAB_TRACE))
		SetSlabDebug(page);

	start = page_address(page);
	end = start + s->objects * s->size;

	if (unlikely(s->flags & SLAB_POISON))
		memset(start, POISON_INUSE, PAGE_SIZE << s->order);

	last = start;
	for_each_object(p, s, start) {
		setup_object(s, page, last);
		set_freepointer(s, last, p);
		last = p;
	}
	setup_object(s, page, last);
	set_freepointer(s, last, NULL);

	page->freelist = start;
	page->lockless_freelist = NULL;
	page->inuse = 0;
out:
	if (flags & __GFP_WAIT)
		local_irq_disable();
	return page;
}
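
/*
 * Illustration of the loop above (addresses hypothetical): after
 * new_slab() returns, the objects of a slab at start with s->size = 64
 * form a single chain
 *
 *	page->freelist -> start -> start + 64 -> start + 128 -> ... -> NULL
 *
 * with each link stored at the free pointer offset inside the previous
 * object, and page->inuse == 0.
 */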

static void __free_slab(struct kmem_cache *s, struct page *page)
{
	int pages = 1 << s->order;

	if (unlikely(SlabDebug(page))) {
		void *p;

		slab_pad_check(s, page);
		for_each_object(p, s, page_address(page))
			check_object(s, page, p, 0);
		ClearSlabDebug(page);
	}

	mod_zone_page_state(page_zone(page),
		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
		-pages);

	page->mapping = NULL;
	__free_pages(page, s->order);
}

static void rcu_free_slab(struct rcu_head *h)
{
	struct page *page;

	page = container_of((struct list_head *)h, struct page, lru);
	__free_slab(page->slab, page);
}

static void free_slab(struct kmem_cache *s, struct page *page)
{
	if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
		/*
		 * RCU free overloads the RCU head over the LRU
		 */
		struct rcu_head *head = (void *)&page->lru;

		call_rcu(head, rcu_free_slab);
	} else
		__free_slab(s, page);
}

static void discard_slab(struct kmem_cache *s, struct page *page)
{
	struct kmem_cache_node *n = get_node(s, page_to_nid(page));

	atomic_long_dec(&n->nr_slabs);
	reset_page_mapcount(page);
	__ClearPageSlab(page);
	free_slab(s, page);
}

/*
 * Per slab locking using the pagelock
 */
static __always_inline void slab_lock(struct page *page)
{
	bit_spin_lock(PG_locked, &page->flags);
}

static __always_inline void slab_unlock(struct page *page)
{
	bit_spin_unlock(PG_locked, &page->flags);
}

static __always_inline int slab_trylock(struct page *page)
{
	return bit_spin_trylock(PG_locked, &page->flags);
}

/*
 * Management of partially allocated slabs
 */
static void add_partial_tail(struct kmem_cache_node *n, struct page *page)
{
	spin_lock(&n->list_lock);
	n->nr_partial++;
	list_add_tail(&page->lru, &n->partial);
	spin_unlock(&n->list_lock);
}

static void add_partial(struct kmem_cache_node *n, struct page *page)
{
	spin_lock(&n->list_lock);
	n->nr_partial++;
	list_add(&page->lru, &n->partial);
	spin_unlock(&n->list_lock);
}

static void remove_partial(struct kmem_cache *s,
						struct page *page)
{
	struct kmem_cache_node *n = get_node(s, page_to_nid(page));

	spin_lock(&n->list_lock);
	list_del(&page->lru);
	n->nr_partial--;
	spin_unlock(&n->list_lock);
}

/*
 * Lock slab and remove from the partial list.
 *
 * Must hold list_lock.
 */
static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
							struct page *page)
{
	if (slab_trylock(page)) {
		list_del(&page->lru);
		n->nr_partial--;
		SetSlabFrozen(page);
		return 1;
	}
	return 0;
}

/*
 * Try to allocate a partial slab from a specific node.
 */
static struct page *get_partial_node(struct kmem_cache_node *n)
{
	struct page *page;

	/*
	 * Racy check. If we mistakenly see no partial slabs then we
	 * just allocate an empty slab. If we mistakenly try to get a
	 * partial slab and there is none available then get_partial_node()
	 * will return NULL.
	 */
	if (!n || !n->nr_partial)
		return NULL;

	spin_lock(&n->list_lock);
	list_for_each_entry(page, &n->partial, lru)
		if (lock_and_freeze_slab(n, page))
			goto out;
	page = NULL;
out:
	spin_unlock(&n->list_lock);
	return page;
}

/*
 * Get a page from somewhere. Search in increasing NUMA distances.
 */
static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
{
#ifdef CONFIG_NUMA
	struct zonelist *zonelist;
	struct zone **z;
	struct page *page;

	/*
	 * The defrag ratio allows a configuration of the tradeoffs between
	 * inter node defragmentation and node local allocations. A lower
	 * defrag_ratio increases the tendency to do local allocations
	 * instead of attempting to obtain partial slabs from other nodes.
	 *
	 * If the defrag_ratio is set to 0 then kmalloc() always
	 * returns node local objects. If the ratio is higher then kmalloc()
	 * may return off node objects because partial slabs are obtained
	 * from other nodes and filled up.
	 *
	 * If /sys/slab/xx/defrag_ratio is set to 100 (which makes
	 * defrag_ratio = 1000) then every (well almost) allocation will
	 * first attempt to defrag slab caches on other nodes. This means
	 * scanning over all nodes to look for partial slabs which may be
	 * expensive if we do it every time we are trying to find a slab
	 * with available objects.
	 */
	if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio)
		return NULL;

	zonelist = &NODE_DATA(slab_node(current->mempolicy))
					->node_zonelists[gfp_zone(flags)];
	for (z = zonelist->zones; *z; z++) {
		struct kmem_cache_node *n;

		n = get_node(s, zone_to_nid(*z));

		if (n && cpuset_zone_allowed_hardwall(*z, flags) &&
				n->nr_partial > MIN_PARTIAL) {
			page = get_partial_node(n);
			if (page)
				return page;
		}
	}
#endif
	return NULL;
}

/*
 * Get a partial page, lock it and return it.
 */
static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
{
	struct page *page;
	int searchnode = (node == -1) ? numa_node_id() : node;

	page = get_partial_node(get_node(s, searchnode));
	if (page || (flags & __GFP_THISNODE))
		return page;

	return get_any_partial(s, flags);
}

/*
 * Move a page back to the lists.
 *
 * Must be called with the slab lock held.
 *
 * On exit the slab lock will have been dropped.
 */
static void unfreeze_slab(struct kmem_cache *s, struct page *page)
{
	struct kmem_cache_node *n = get_node(s, page_to_nid(page));

	ClearSlabFrozen(page);
	if (page->inuse) {

		if (page->freelist)
			add_partial(n, page);
		else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER))
			add_full(n, page);
		slab_unlock(page);

	} else {
		if (n->nr_partial < MIN_PARTIAL) {
			/*
			 * Adding an empty slab to the partial slabs in order
			 * to avoid page allocator overhead. This slab needs
			 * to come after the other slabs with objects in
			 * order to fill them up. That way the size of the
			 * partial list stays small. kmem_cache_shrink can
			 * reclaim empty slabs from the partial list.
			 */
			add_partial_tail(n, page);
			slab_unlock(page);
		} else {
			slab_unlock(page);
			discard_slab(s, page);
		}
	}
}

/*
 * Remove the cpu slab
 */
static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
	struct page *page = c->page;
	/*
	 * Merge cpu freelist into freelist. Typically we get here
	 * because both freelists are empty. So this is unlikely
	 * to occur.
	 */
	while (unlikely(c->freelist)) {
		void **object;

		/* Retrieve object from cpu_freelist */
		object = c->freelist;
		c->freelist = c->freelist[page->offset];

		/* And put onto the regular freelist */
		object[page->offset] = page->freelist;
		page->freelist = object;
		page->inuse--;
	}
	c->page = NULL;
	unfreeze_slab(s, page);
}

static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
	slab_lock(c->page);
	deactivate_slab(s, c);
}

/*
 * Flush cpu slab.
 * Called from IPI handler with interrupts disabled.
 */
static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
{
	struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);

	if (likely(c && c->page))
		flush_slab(s, c);
}

static void flush_cpu_slab(void *d)
{
	struct kmem_cache *s = d;

	__flush_cpu_slab(s, smp_processor_id());
}

static void flush_all(struct kmem_cache *s)
{
#ifdef CONFIG_SMP
	on_each_cpu(flush_cpu_slab, s, 1, 1);
#else
	unsigned long flags;

	local_irq_save(flags);
	flush_cpu_slab(s);
	local_irq_restore(flags);
#endif
}

/*
 * Check if the objects in a per cpu structure fit numa
 * locality expectations.
 */
static inline int node_match(struct kmem_cache_cpu *c, int node)
{
#ifdef CONFIG_NUMA
	if (node != -1 && c->node != node)
		return 0;
#endif
	return 1;
}
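
/*
 * State sketch for the slow path below (illustrative): when the lockless
 * freelist is exhausted but objects were freed back to the slab,
 * load_freelist performs, under the slab lock,
 *
 *	before:	c->freelist = NULL,   page->freelist = A -> B -> C
 *	after:	c->freelist = B -> C, page->freelist = NULL,
 *		page->inuse = s->objects (A is returned to the caller)
 *
 * All objects not on the lockless freelist are counted as in use, so
 * remote frees go through __slab_free and the slab lock.
 */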

/*
 * Slow path. The lockless freelist is empty or we need to perform
 * debugging duties.
 *
 * Interrupts are disabled.
 *
 * Processing is still very fast if new objects have been freed to the
 * regular freelist. In that case we simply take over the regular freelist
 * as the lockless freelist and zap the regular freelist.
 *
 * If that is not working then we fall back to the partial lists. We take the
 * first element of the freelist as the object to allocate now and move the
 * rest of the freelist to the lockless freelist.
 *
 * And if we were unable to get a new slab from the partial slab lists then
 * we need to allocate a new slab. This is the slowest path since we may
 * sleep.
 */
static void *__slab_alloc(struct kmem_cache *s,
		gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c)
{
	void **object;
	struct page *new;

	if (!c->page)
		goto new_slab;

	slab_lock(c->page);
	if (unlikely(!node_match(c, node)))
		goto another_slab;
load_freelist:
	object = c->page->freelist;
	if (unlikely(!object))
		goto another_slab;
	if (unlikely(SlabDebug(c->page)))
		goto debug;

	c->freelist = object[c->page->offset];
	c->page->inuse = s->objects;
	c->page->freelist = NULL;
	c->node = page_to_nid(c->page);
	slab_unlock(c->page);
	return object;

another_slab:
	deactivate_slab(s, c);

new_slab:
	new = get_partial(s, gfpflags, node);
	if (new) {
		c->page = new;
		goto load_freelist;
	}

	new = new_slab(s, gfpflags, node);
	if (new) {
		c = get_cpu_slab(s, smp_processor_id());
		if (c->page) {
			/*
			 * Someone else populated the cpu_slab while we
			 * enabled interrupts, or we have gotten scheduled
			 * on another cpu. The page may not be on the
			 * requested node even if __GFP_THISNODE was
			 * specified. So we need to recheck.
			 */
			if (node_match(c, node)) {
				/*
				 * Current cpuslab is acceptable and we
				 * want the current one since it is cache hot
				 */
				discard_slab(s, new);
				slab_lock(c->page);
				goto load_freelist;
			}
			/* New slab does not fit our expectations */
			flush_slab(s, c);
		}
		slab_lock(new);
		SetSlabFrozen(new);
		c->page = new;
		goto load_freelist;
	}
	return NULL;
debug:
	object = c->page->freelist;
	if (!alloc_debug_processing(s, c->page, object, addr))
		goto another_slab;

	c->page->inuse++;
	c->page->freelist = object[c->page->offset];
	slab_unlock(c->page);
	return object;
}

/*
 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
 * have the fastpath folded into their functions. So no function call
 * overhead for requests that can be satisfied on the fastpath.
 *
 * The fastpath works by first checking if the lockless freelist can be used.
 * If not then __slab_alloc is called for slow processing.
 *
 * Otherwise we can simply pick the next object from the lockless free list.
 */
static void __always_inline *slab_alloc(struct kmem_cache *s,
		gfp_t gfpflags, int node, void *addr)
{
	void **object;
	unsigned long flags;
	struct kmem_cache_cpu *c;

	local_irq_save(flags);
	c = get_cpu_slab(s, smp_processor_id());
	if (unlikely(!c->page || !c->freelist ||
					!node_match(c, node)))
		object = __slab_alloc(s, gfpflags, node, addr, c);
	else {
		object = c->freelist;
		c->freelist = object[c->page->offset];
	}
	local_irq_restore(flags);

	if (unlikely((gfpflags & __GFP_ZERO) && object))
		memset(object, 0, s->objsize);

	return object;
}

void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
{
	return slab_alloc(s, gfpflags, -1, __builtin_return_address(0));
}
EXPORT_SYMBOL(kmem_cache_alloc);

#ifdef CONFIG_NUMA
void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
{
	return slab_alloc(s, gfpflags, node, __builtin_return_address(0));
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
#endif

/*
 * Slow path handling. This may still be called frequently since objects
 * have a longer lifetime than the cpu slabs in most processing loads.
 *
 * So we still attempt to reduce cache line usage. Just take the slab
 * lock and free the item. If there is no additional partial page
 * handling required then we can return immediately.
 */
static void __slab_free(struct kmem_cache *s, struct page *page,
					void *x, void *addr)
{
	void *prior;
	void **object = (void *)x;

	slab_lock(page);

	if (unlikely(SlabDebug(page)))
		goto debug;
checks_ok:
	prior = object[page->offset] = page->freelist;
	page->freelist = object;
	page->inuse--;

	if (unlikely(SlabFrozen(page)))
		goto out_unlock;

	if (unlikely(!page->inuse))
		goto slab_empty;

	/*
	 * Objects left in the slab. If it
	 * was not on the partial list before
	 * then add it.
	 */
	if (unlikely(!prior))
		add_partial(get_node(s, page_to_nid(page)), page);

out_unlock:
	slab_unlock(page);
	return;

slab_empty:
	if (prior)
		/*
		 * Slab still on the partial list.
		 */
		remove_partial(s, page);

	slab_unlock(page);
	discard_slab(s, page);
	return;

debug:
	if (!free_debug_processing(s, page, x, addr))
		goto out_unlock;
	goto checks_ok;
}

/*
 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
 * can perform fastpath freeing without additional function calls.
 *
 * The fastpath is only possible if we are freeing to the current cpu slab
 * of this processor. This is typically the case if we have just allocated
 * the item before.
 *
 * If fastpath is not possible then fall back to __slab_free where we deal
 * with all sorts of special processing.
 */
static void __always_inline slab_free(struct kmem_cache *s,
			struct page *page, void *x, void *addr)
{
	void **object = (void *)x;
	unsigned long flags;
	struct kmem_cache_cpu *c;

	local_irq_save(flags);
	debug_check_no_locks_freed(object, s->objsize);
	c = get_cpu_slab(s, smp_processor_id());
	if (likely(page == c->page && !SlabDebug(page))) {
		object[page->offset] = c->freelist;
		c->freelist = object;
	} else
		__slab_free(s, page, x, addr);

	local_irq_restore(flags);
}

void kmem_cache_free(struct kmem_cache *s, void *x)
{
	struct page *page;

	page = virt_to_head_page(x);

	slab_free(s, page, x, __builtin_return_address(0));
}
EXPORT_SYMBOL(kmem_cache_free);
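
/*
 * Usage sketch for the exported API above (illustrative only; the cache
 * name and struct are hypothetical):
 *
 *	struct widget { int id; struct list_head list; };
 *	struct kmem_cache *widget_cache;
 *
 *	widget_cache = kmem_cache_create("widget",
 *				sizeof(struct widget), 0, 0, NULL);
 *	struct widget *w = kmem_cache_alloc(widget_cache, GFP_KERNEL);
 *	...
 *	kmem_cache_free(widget_cache, w);
 */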

/* Figure out on which slab page the object resides */
static struct page *get_object_page(const void *x)
{
	struct page *page = virt_to_head_page(x);

	if (!PageSlab(page))
		return NULL;

	return page;
}

/*
 * Object placement in a slab is made very easy because we always start at
 * offset 0. If we tune the size of the object to the alignment then we can
 * get the required alignment by putting one properly sized object after
 * another.
 *
 * Notice that the allocation order determines the sizes of the per cpu
 * caches. Each processor has always one slab available for allocations.
 * Increasing the allocation order reduces the number of times that slabs
 * must be moved on and off the partial lists and is therefore a factor in
 * locking overhead.
 */

/*
 * Minimum / Maximum order of slab pages. This influences locking overhead
 * and slab fragmentation. A higher order reduces the number of partial slabs
 * and increases the number of allocations possible without having to
 * take the list_lock.
 */
static int slub_min_order;
static int slub_max_order = DEFAULT_MAX_ORDER;
static int slub_min_objects = DEFAULT_MIN_OBJECTS;

/*
 * Merge control. If this is set then no merging of slab caches will occur.
 * (Could be removed. This was introduced to pacify the merge skeptics.)
 */
static int slub_nomerge;

/*
 * Calculate the order of allocation given a slab object size.
 *
 * The order of allocation has significant impact on performance and other
 * system components. Generally order 0 allocations should be preferred since
 * order 0 does not cause fragmentation in the page allocator. Larger objects
 * can be problematic to put into order 0 slabs because there may be too much
 * unused space left. We go to a higher order if more than 1/8th of the slab
 * would be wasted.
 *
 * In order to reach satisfactory performance we must ensure that a minimum
 * number of objects is in one slab. Otherwise we may generate too much
 * activity on the partial lists which requires taking the list_lock. This is
 * less a concern for large slabs though which are rarely used.
 *
 * slub_max_order specifies the order where we begin to stop considering the
 * number of objects in a slab as critical. If we reach slub_max_order then
 * we try to keep the page order as low as possible. So we accept more waste
 * of space in favor of a small page order.
 *
 * Higher order allocations also allow the placement of more objects in a
 * slab and thereby reduce object handling overhead. If the user has
 * requested a higher minimum order then we start with that one instead of
 * the smallest order which will fit the object.
 */
static inline int slab_order(int size, int min_objects,
				int max_order, int fract_leftover)
{
	int order;
	int rem;
	int min_order = slub_min_order;

	/*
	 * If we would create too many objects per slab then reduce
	 * the slab order even if it goes below slub_min_order.
	 */
	while (min_order > 0 &&
		(PAGE_SIZE << min_order) >= MAX_OBJECTS_PER_SLAB * size)
			min_order--;

	for (order = max(min_order,
				fls(min_objects * size - 1) - PAGE_SHIFT);
			order <= max_order; order++) {

		unsigned long slab_size = PAGE_SIZE << order;

		if (slab_size < min_objects * size)
			continue;

		rem = slab_size % size;

		if (rem <= slab_size / fract_leftover)
			break;

		/* If the next size is too high then exit now */
		if (slab_size * 2 >= MAX_OBJECTS_PER_SLAB * size)
			break;
	}

	return order;
}

static inline int calculate_order(int size)
{
	int order;
	int min_objects;
	int fraction;

	/*
	 * Attempt to find best configuration for a slab. This
	 * works by first attempting to generate a layout with
	 * the best configuration and backing off gradually.
	 *
	 * First we reduce the acceptable waste in a slab. Then
	 * we reduce the minimum objects required in a slab.
	 */
	min_objects = slub_min_objects;
	while (min_objects > 1) {
		fraction = 8;
		while (fraction >= 4) {
			order = slab_order(size, min_objects,
						slub_max_order, fraction);
			if (order <= slub_max_order)
				return order;
			fraction /= 2;
		}
		min_objects /= 2;
	}

	/*
	 * We were unable to place multiple objects in a slab. Now
	 * lets see if we can place a single object there.
	 */
	order = slab_order(size, 1, slub_max_order, 1);
	if (order <= slub_max_order)
		return order;

	/*
	 * Doh this slab cannot be placed using slub_max_order.
	 */
	order = slab_order(size, 1, MAX_ORDER, 1);
	if (order <= MAX_ORDER)
		return order;
	return -ENOSYS;
}
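
/*
 * Worked example for slab_order() (4K pages, values hypothetical): for
 * size = 700 and fract_leftover = 8, an order 0 slab leaves
 * 4096 % 700 = 596 bytes unused, more than 4096 / 8 = 512, so the loop
 * moves on. An order 1 slab holds 11 objects and leaves
 * 8192 % 700 = 492 bytes, within 8192 / 8 = 1024, so order 1 is chosen.
 */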

/*
 * Figure out what the alignment of the objects will be.
 */
static unsigned long calculate_alignment(unsigned long flags,
		unsigned long align, unsigned long size)
{
	/*
	 * If the user wants hardware cache aligned objects then
	 * follow that suggestion if the object is sufficiently
	 * large.
	 *
	 * The hardware cache alignment cannot override the
	 * specified alignment though. If that is greater,
	 * then use it.
	 */
	if ((flags & SLAB_HWCACHE_ALIGN) &&
			size > cache_line_size() / 2)
		return max_t(unsigned long, align, cache_line_size());

	if (align < ARCH_SLAB_MINALIGN)
		return ARCH_SLAB_MINALIGN;

	return ALIGN(align, sizeof(void *));
}
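
/*
 * Example (illustrative, assuming a 64 byte cache line): a cache created
 * with SLAB_HWCACHE_ALIGN and a 100 byte object is aligned to 64 bytes
 * since 100 > 64 / 2. A 24 byte object with the same flag is not cache
 * line aligned and falls back to the requested or architecture minimum
 * alignment, because cache aligning such small objects would waste most
 * of the slab.
 */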

static void init_kmem_cache_cpu(struct kmem_cache *s,
			struct kmem_cache_cpu *c)
{
	c->page = NULL;
	c->freelist = NULL;
	c->node = 0;
}

static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
{
	int cpu;

	for_each_possible_cpu(cpu)
		init_kmem_cache_cpu(s, get_cpu_slab(s, cpu));

	return 1;
}

static void init_kmem_cache_node(struct kmem_cache_node *n)
{
	n->nr_partial = 0;
	atomic_long_set(&n->nr_slabs, 0);
	spin_lock_init(&n->list_lock);
	INIT_LIST_HEAD(&n->partial);
#ifdef CONFIG_SLUB_DEBUG
	INIT_LIST_HEAD(&n->full);
#endif
}

#ifdef CONFIG_NUMA
/*
 * No kmalloc_node yet so do it by hand. We know that this is the first
 * slab on the node for this slabcache. There are no concurrent accesses
 * possible.
 *
 * Note that this function only works on the kmalloc_node_cache
 * when allocating for the kmalloc_node_cache.
 */
static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
								int node)
{
	struct page *page;
	struct kmem_cache_node *n;

	BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));

	page = new_slab(kmalloc_caches, gfpflags, node);

	BUG_ON(!page);
	if (page_to_nid(page) != node) {
		printk(KERN_ERR "SLUB: Unable to allocate memory from "
				"node %d\n", node);
		printk(KERN_ERR "SLUB: Allocating a useless per node structure "
				"in order to be able to continue\n");
	}

	n = page->freelist;
	BUG_ON(!n);
	page->freelist = get_freepointer(kmalloc_caches, n);
	page->inuse++;
	kmalloc_caches->node[node] = n;
#ifdef CONFIG_SLUB_DEBUG
	init_object(kmalloc_caches, n, 1);
	init_tracking(kmalloc_caches, n);
#endif
	init_kmem_cache_node(n);
	atomic_long_inc(&n->nr_slabs);
	add_partial(n, page);

	/*
	 * new_slab() disables interrupts. If we do not reenable interrupts
	 * here then bootup would continue with interrupts disabled.
	 */
	local_irq_enable();
	return n;
}

static void free_kmem_cache_nodes(struct kmem_cache *s)
{
	int node;

	for_each_node_state(node, N_NORMAL_MEMORY) {
		struct kmem_cache_node *n = s->node[node];
		if (n && n != &s->local_node)
			kmem_cache_free(kmalloc_caches, n);
		s->node[node] = NULL;
	}
}

static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
{
	int node;
	int local_node;

	if (slab_state >= UP)
		local_node = page_to_nid(virt_to_page(s));
	else
		local_node = 0;

	for_each_node_state(node, N_NORMAL_MEMORY) {
		struct kmem_cache_node *n;

		if (local_node == node)
			n = &s->local_node;
		else {
			if (slab_state == DOWN) {
				n = early_kmem_cache_node_alloc(gfpflags,
								node);
				continue;
			}
			n = kmem_cache_alloc_node(kmalloc_caches,
							gfpflags, node);

			if (!n) {
				free_kmem_cache_nodes(s);
				return 0;
			}

		}
		s->node[node] = n;
		init_kmem_cache_node(n);
	}
	return 1;
}
#else
static void free_kmem_cache_nodes(struct kmem_cache *s)
{
}

static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
{
	init_kmem_cache_node(&s->local_node);
	return 1;
}
#endif
2077	 */
2078	size += 2 * sizeof(struct track);
2079
2080	if (flags & SLAB_RED_ZONE)
2081		/*
2082		 * Add some empty padding so that we can catch
2083		 * overwrites from earlier objects rather than let
2084		 * tracking information or the free pointer be
2085		 * corrupted if a user writes before the start
2086		 * of the object.
2087		 */
2088		size += sizeof(void *);
2089#endif
2090
2091	/*
2092	 * Determine the alignment based on various parameters that the
2093	 * user specified and the dynamic determination of cache line size
2094	 * on bootup.
2095	 */
2096	align = calculate_alignment(flags, align, s->objsize);
2097
2098	/*
2099	 * SLUB stores one object immediately after another beginning from
2100	 * offset 0. In order to align the objects we have to simply size
2101	 * each object to conform to the alignment.
2102	 */
2103	size = ALIGN(size, align);
2104	s->size = size;
2105
2106	s->order = calculate_order(size);
2107	if (s->order < 0)
2108		return 0;
2109
2110	/*
2111	 * Determine the number of objects per slab
2112	 */
2113	s->objects = (PAGE_SIZE << s->order) / size;
2114
2115	/*
2116	 * Verify that the number of objects is within permitted limits.
2117	 * The page->inuse field is only 16 bit wide! So we cannot have
2118	 * more than 64k objects per slab.
2119	 */
2120	if (!s->objects || s->objects > MAX_OBJECTS_PER_SLAB)
2121		return 0;
2122	return 1;
2123
2124}
2125
2126static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2127		const char *name, size_t size,
2128		size_t align, unsigned long flags,
2129		void (*ctor)(void *, struct kmem_cache *, unsigned long))
2130{
2131	memset(s, 0, kmem_size);
2132	s->name = name;
2133	s->ctor = ctor;
2134	s->objsize = size;
2135	s->align = align;
2136	s->flags = kmem_cache_flags(size, flags, name, ctor);
2137
2138	if (!calculate_sizes(s))
2139		goto error;
2140
2141	s->refcount = 1;
2142#ifdef CONFIG_NUMA
2143	s->defrag_ratio = 100;
2144#endif
2145	if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
2146		goto error;
2147
2148	if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA))
2149		return 1;
2150error:
2151	if (flags & SLAB_PANIC)
2152		panic("Cannot create slab %s size=%lu realsize=%u "
2153			"order=%u offset=%u flags=%lx\n",
2154			s->name, (unsigned long)size, s->size, s->order,
2155			s->offset, flags);
2156	return 0;
2157}
2158
2159/*
2160 * Check if a given pointer is valid
2161 */
2162int kmem_ptr_validate(struct kmem_cache *s, const void *object)
2163{
2164	struct page *page;
2165
2166	page = get_object_page(object);
2167
2168	if (!page || s != page->slab)
2169		/* No slab or wrong slab */
2170		return 0;
2171
2172	if (!check_valid_pointer(s, page, object))
2173		return 0;
2174
2175	/*
2176	 * We could also check if the object is on the slab's freelist.
2177	 * But this would be too expensive and it seems that the main
2178	 * purpose of kmem_ptr_validate is to check if the object belongs
2179	 * to a certain slab.
2180	 */
2181	return 1;
2182}
2183EXPORT_SYMBOL(kmem_ptr_validate);
2184
2185/*
2186 * Determine the size of a slab object
2187 */
2188unsigned int kmem_cache_size(struct kmem_cache *s)
2189{
2190	return s->objsize;
2191}
2192EXPORT_SYMBOL(kmem_cache_size);
2193
2194const char *kmem_cache_name(struct kmem_cache *s)
2195{
2196	return s->name;
2197}
2198EXPORT_SYMBOL(kmem_cache_name);
2199
2200/*
2201 * Attempt to free all slabs on a node. Return the number of slabs we
2202 * were unable to free.
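 *
 * Editor's note: kmem_cache_close() below invokes this as
 *
 *	n->nr_partial -= free_list(s, n, &n->partial);
 *
 * so only empty slabs (page->inuse == 0) are discarded; slabs that
 * still hold objects are counted and left on the list.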
2203 */ 2204static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, 2205 struct list_head *list) 2206{ 2207 int slabs_inuse = 0; 2208 unsigned long flags; 2209 struct page *page, *h; 2210 2211 spin_lock_irqsave(&n->list_lock, flags); 2212 list_for_each_entry_safe(page, h, list, lru) 2213 if (!page->inuse) { 2214 list_del(&page->lru); 2215 discard_slab(s, page); 2216 } else 2217 slabs_inuse++; 2218 spin_unlock_irqrestore(&n->list_lock, flags); 2219 return slabs_inuse; 2220} 2221 2222/* 2223 * Release all resources used by a slab cache. 2224 */ 2225static inline int kmem_cache_close(struct kmem_cache *s) 2226{ 2227 int node; 2228 2229 flush_all(s); 2230 2231 /* Attempt to free all objects */ 2232 for_each_node_state(node, N_NORMAL_MEMORY) { 2233 struct kmem_cache_node *n = get_node(s, node); 2234 2235 n->nr_partial -= free_list(s, n, &n->partial); 2236 if (atomic_long_read(&n->nr_slabs)) 2237 return 1; 2238 } 2239 free_kmem_cache_nodes(s); 2240 return 0; 2241} 2242 2243/* 2244 * Close a cache and release the kmem_cache structure 2245 * (must be used for caches created using kmem_cache_create) 2246 */ 2247void kmem_cache_destroy(struct kmem_cache *s) 2248{ 2249 down_write(&slub_lock); 2250 s->refcount--; 2251 if (!s->refcount) { 2252 list_del(&s->list); 2253 up_write(&slub_lock); 2254 if (kmem_cache_close(s)) 2255 WARN_ON(1); 2256 sysfs_slab_remove(s); 2257 kfree(s); 2258 } else 2259 up_write(&slub_lock); 2260} 2261EXPORT_SYMBOL(kmem_cache_destroy); 2262 2263/******************************************************************** 2264 * Kmalloc subsystem 2265 *******************************************************************/ 2266 2267struct kmem_cache kmalloc_caches[PAGE_SHIFT] __cacheline_aligned; 2268EXPORT_SYMBOL(kmalloc_caches); 2269 2270#ifdef CONFIG_ZONE_DMA 2271static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT]; 2272#endif 2273 2274static int __init setup_slub_min_order(char *str) 2275{ 2276 get_option (&str, &slub_min_order); 2277 2278 return 1; 2279} 2280 2281__setup("slub_min_order=", setup_slub_min_order); 2282 2283static int __init setup_slub_max_order(char *str) 2284{ 2285 get_option (&str, &slub_max_order); 2286 2287 return 1; 2288} 2289 2290__setup("slub_max_order=", setup_slub_max_order); 2291 2292static int __init setup_slub_min_objects(char *str) 2293{ 2294 get_option (&str, &slub_min_objects); 2295 2296 return 1; 2297} 2298 2299__setup("slub_min_objects=", setup_slub_min_objects); 2300 2301static int __init setup_slub_nomerge(char *str) 2302{ 2303 slub_nomerge = 1; 2304 return 1; 2305} 2306 2307__setup("slub_nomerge", setup_slub_nomerge); 2308 2309static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, 2310 const char *name, int size, gfp_t gfp_flags) 2311{ 2312 unsigned int flags = 0; 2313 2314 if (gfp_flags & SLUB_DMA) 2315 flags = SLAB_CACHE_DMA; 2316 2317 down_write(&slub_lock); 2318 if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, 2319 flags, NULL)) 2320 goto panic; 2321 2322 list_add(&s->list, &slab_caches); 2323 up_write(&slub_lock); 2324 if (sysfs_slab_add(s)) 2325 goto panic; 2326 return s; 2327 2328panic: 2329 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); 2330} 2331 2332#ifdef CONFIG_ZONE_DMA 2333 2334static void sysfs_add_func(struct work_struct *w) 2335{ 2336 struct kmem_cache *s; 2337 2338 down_write(&slub_lock); 2339 list_for_each_entry(s, &slab_caches, list) { 2340 if (s->flags & __SYSFS_ADD_DEFERRED) { 2341 s->flags &= ~__SYSFS_ADD_DEFERRED; 2342 sysfs_slab_add(s); 2343 } 2344 } 2345 
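	/*
	 * Editor's note: this work function runs in process context;
	 * dma_kmalloc_cache() below may be called in atomic context
	 * where sysfs_slab_add() cannot be used, so such caches are
	 * tagged __SYSFS_ADD_DEFERRED and registered by the loop above.
	 */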
	up_write(&slub_lock);
2346}
2347
2348static DECLARE_WORK(sysfs_add_work, sysfs_add_func);
2349
2350static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2351{
2352	struct kmem_cache *s;
2353	char *text;
2354	size_t realsize;
2355
2356	s = kmalloc_caches_dma[index];
2357	if (s)
2358		return s;
2359
2360	/* Dynamically create dma cache */
2361	if (flags & __GFP_WAIT)
2362		down_write(&slub_lock);
2363	else {
2364		if (!down_write_trylock(&slub_lock))
2365			goto out;
2366	}
2367
2368	if (kmalloc_caches_dma[index])
2369		goto unlock_out;
2370
2371	realsize = kmalloc_caches[index].objsize;
2372	text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", (unsigned int)realsize);
2373	s = kmalloc(kmem_size, flags & ~SLUB_DMA);
2374
2375	if (!s || !text || !kmem_cache_open(s, flags, text,
2376			realsize, ARCH_KMALLOC_MINALIGN,
2377			SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) {
2378		kfree(s);
2379		kfree(text);
2380		goto unlock_out;
2381	}
2382
2383	list_add(&s->list, &slab_caches);
2384	kmalloc_caches_dma[index] = s;
2385
2386	schedule_work(&sysfs_add_work);
2387
2388unlock_out:
2389	up_write(&slub_lock);
2390out:
2391	return kmalloc_caches_dma[index];
2392}
2393#endif
2394
2395/*
2396 * Conversion table for small slab sizes / 8 to the index in the
2397 * kmalloc array. This is necessary for slabs < 192 since we have non power
2398 * of two cache sizes there. The size of larger slabs can be determined using
2399 * fls. E.g. kmalloc(100) maps to size_index[(100 - 1) / 8] = size_index[12] = 7, the 128 byte cache.
2400 */
2401static s8 size_index[24] = {
2402	3,	/* 8 */
2403	4,	/* 16 */
2404	5,	/* 24 */
2405	5,	/* 32 */
2406	6,	/* 40 */
2407	6,	/* 48 */
2408	6,	/* 56 */
2409	6,	/* 64 */
2410	1,	/* 72 */
2411	1,	/* 80 */
2412	1,	/* 88 */
2413	1,	/* 96 */
2414	7,	/* 104 */
2415	7,	/* 112 */
2416	7,	/* 120 */
2417	7,	/* 128 */
2418	2,	/* 136 */
2419	2,	/* 144 */
2420	2,	/* 152 */
2421	2,	/* 160 */
2422	2,	/* 168 */
2423	2,	/* 176 */
2424	2,	/* 184 */
2425	2	/* 192 */
2426};
2427
2428static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2429{
2430	int index;
2431
2432	if (size <= 192) {
2433		if (!size)
2434			return ZERO_SIZE_PTR;
2435
2436		index = size_index[(size - 1) / 8];
2437	} else
2438		index = fls(size - 1);
2439
2440#ifdef CONFIG_ZONE_DMA
2441	if (unlikely((flags & SLUB_DMA)))
2442		return dma_kmalloc_cache(index, flags);
2443
2444#endif
2445	return &kmalloc_caches[index];
2446}
2447
2448void *__kmalloc(size_t size, gfp_t flags)
2449{
2450	struct kmem_cache *s;
2451
2452	if (unlikely(size > PAGE_SIZE / 2))
2453		return (void *)__get_free_pages(flags | __GFP_COMP,
2454							get_order(size));
2455
2456	s = get_slab(size, flags);
2457
2458	if (unlikely(ZERO_OR_NULL_PTR(s)))
2459		return s;
2460
2461	return slab_alloc(s, flags, -1, __builtin_return_address(0));
2462}
2463EXPORT_SYMBOL(__kmalloc);
2464
2465#ifdef CONFIG_NUMA
2466void *__kmalloc_node(size_t size, gfp_t flags, int node)
2467{
2468	struct kmem_cache *s;
2469
2470	if (unlikely(size > PAGE_SIZE / 2))
2471		return (void *)__get_free_pages(flags | __GFP_COMP,
2472							get_order(size));
2473
2474	s = get_slab(size, flags);
2475
2476	if (unlikely(ZERO_OR_NULL_PTR(s)))
2477		return s;
2478
2479	return slab_alloc(s, flags, node, __builtin_return_address(0));
2480}
2481EXPORT_SYMBOL(__kmalloc_node);
2482#endif
2483
2484size_t ksize(const void *object)
2485{
2486	struct page *page;
2487	struct kmem_cache *s;
2488
2489	BUG_ON(!object);
2490	if (unlikely(object == ZERO_SIZE_PTR))
2491		return 0;
2492
2493	page = get_object_page(object);
2494	BUG_ON(!page);
2495	s = page->slab;
2496	BUG_ON(!s);
2497
2498	/*
2499	 * Debugging requires use of
the padding between object 2500 * and whatever may come after it. 2501 */ 2502 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) 2503 return s->objsize; 2504 2505 /* 2506 * If we have the need to store the freelist pointer 2507 * back there or track user information then we can 2508 * only use the space before that information. 2509 */ 2510 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) 2511 return s->inuse; 2512 2513 /* 2514 * Else we can use all the padding etc for the allocation 2515 */ 2516 return s->size; 2517} 2518EXPORT_SYMBOL(ksize); 2519 2520void kfree(const void *x) 2521{ 2522 struct page *page; 2523 2524 if (unlikely(ZERO_OR_NULL_PTR(x))) 2525 return; 2526 2527 page = virt_to_head_page(x); 2528 if (unlikely(!PageSlab(page))) { 2529 put_page(page); 2530 return; 2531 } 2532 slab_free(page->slab, page, (void *)x, __builtin_return_address(0)); 2533} 2534EXPORT_SYMBOL(kfree); 2535 2536/* 2537 * kmem_cache_shrink removes empty slabs from the partial lists and sorts 2538 * the remaining slabs by the number of items in use. The slabs with the 2539 * most items in use come first. New allocations will then fill those up 2540 * and thus they can be removed from the partial lists. 2541 * 2542 * The slabs with the least items are placed last. This results in them 2543 * being allocated from last increasing the chance that the last objects 2544 * are freed in them. 2545 */ 2546int kmem_cache_shrink(struct kmem_cache *s) 2547{ 2548 int node; 2549 int i; 2550 struct kmem_cache_node *n; 2551 struct page *page; 2552 struct page *t; 2553 struct list_head *slabs_by_inuse = 2554 kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL); 2555 unsigned long flags; 2556 2557 if (!slabs_by_inuse) 2558 return -ENOMEM; 2559 2560 flush_all(s); 2561 for_each_node_state(node, N_NORMAL_MEMORY) { 2562 n = get_node(s, node); 2563 2564 if (!n->nr_partial) 2565 continue; 2566 2567 for (i = 0; i < s->objects; i++) 2568 INIT_LIST_HEAD(slabs_by_inuse + i); 2569 2570 spin_lock_irqsave(&n->list_lock, flags); 2571 2572 /* 2573 * Build lists indexed by the items in use in each slab. 2574 * 2575 * Note that concurrent frees may occur while we hold the 2576 * list_lock. page->inuse here is the upper limit. 2577 */ 2578 list_for_each_entry_safe(page, t, &n->partial, lru) { 2579 if (!page->inuse && slab_trylock(page)) { 2580 /* 2581 * Must hold slab lock here because slab_free 2582 * may have freed the last object and be 2583 * waiting to release the slab. 2584 */ 2585 list_del(&page->lru); 2586 n->nr_partial--; 2587 slab_unlock(page); 2588 discard_slab(s, page); 2589 } else { 2590 list_move(&page->lru, 2591 slabs_by_inuse + page->inuse); 2592 } 2593 } 2594 2595 /* 2596 * Rebuild the partial list with the slabs filled up most 2597 * first and the least used slabs at the end. 2598 */ 2599 for (i = s->objects - 1; i >= 0; i--) 2600 list_splice(slabs_by_inuse + i, n->partial.prev); 2601 2602 spin_unlock_irqrestore(&n->list_lock, flags); 2603 } 2604 2605 kfree(slabs_by_inuse); 2606 return 0; 2607} 2608EXPORT_SYMBOL(kmem_cache_shrink); 2609 2610/******************************************************************** 2611 * Basic setup of slabs 2612 *******************************************************************/ 2613 2614void __init kmem_cache_init(void) 2615{ 2616 int i; 2617 int caches = 0; 2618 2619#ifdef CONFIG_NUMA 2620 /* 2621 * Must first have the slab cache available for the allocations of the 2622 * struct kmem_cache_node's. There is special bootstrap code in 2623 * kmem_cache_open for slab_state == DOWN. 
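 *
 * Editor's note: kmalloc_caches[0] doubles as this bootstrap cache.
 * Its refcount is set to -1 below, which slab_unmergeable() treats
 * as "never merge", keeping the bootstrap cache out of the normal
 * cache merging done by find_mergeable().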
2624	 */
2625	create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
2626		sizeof(struct kmem_cache_node), GFP_KERNEL);
2627	kmalloc_caches[0].refcount = -1;
2628	caches++;
2629#endif
2630
2631	/* Able to allocate the per node structures */
2632	slab_state = PARTIAL;
2633
2634	/* Caches that are not of power-of-two size */
2635	if (KMALLOC_MIN_SIZE <= 64) {
2636		create_kmalloc_cache(&kmalloc_caches[1],
2637				"kmalloc-96", 96, GFP_KERNEL);
2638		caches++;
2639	}
2640	if (KMALLOC_MIN_SIZE <= 128) {
2641		create_kmalloc_cache(&kmalloc_caches[2],
2642				"kmalloc-192", 192, GFP_KERNEL);
2643		caches++;
2644	}
2645
2646	for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) {
2647		create_kmalloc_cache(&kmalloc_caches[i],
2648			"kmalloc", 1 << i, GFP_KERNEL);
2649		caches++;
2650	}
2651
2652
2653	/*
2654	 * Patch up the size_index table if we have strange large alignment
2655	 * requirements for the kmalloc array. This is only the case for
2656	 * MIPS, it seems. The standard arches will not generate any code here.
2657	 *
2658	 * Largest permitted alignment is 256 bytes due to the way we
2659	 * handle the index determination for the smaller caches.
2660	 *
2661	 * Make sure that nothing crazy happens if someone starts tinkering
2662	 * around with ARCH_KMALLOC_MINALIGN.
2663	 */
2664	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
2665		(KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
2666
2667	for (i = 8; i < KMALLOC_MIN_SIZE; i += 8)
2668		size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW;
2669
2670	slab_state = UP;
2671
2672	/* Provide the correct kmalloc names now that the caches are up */
2673	for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++)
2674		kmalloc_caches[i].name =
2675			kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
2676
2677#ifdef CONFIG_SMP
2678	register_cpu_notifier(&slab_notifier);
2679#endif
2680
2681	kmem_size = offsetof(struct kmem_cache, cpu_slab) +
2682				nr_cpu_ids * sizeof(struct kmem_cache_cpu);
2683
2684	printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
2685		" CPUs=%d, Nodes=%d\n",
2686		caches, cache_line_size(),
2687		slub_min_order, slub_max_order, slub_min_objects,
2688		nr_cpu_ids, nr_node_ids);
2689}
2690
2691/*
2692 * Find a mergeable slab cache
2693 */
2694static int slab_unmergeable(struct kmem_cache *s)
2695{
2696	if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
2697		return 1;
2698
2699	if (s->ctor)
2700		return 1;
2701
2702	/*
2703	 * We may have set a slab to be unmergeable during bootstrap.
2704	 */
2705	if (s->refcount < 0)
2706		return 1;
2707
2708	return 0;
2709}
2710
2711static struct kmem_cache *find_mergeable(size_t size,
2712		size_t align, unsigned long flags, const char *name,
2713		void (*ctor)(void *, struct kmem_cache *, unsigned long))
2714{
2715	struct kmem_cache *s;
2716
2717	if (slub_nomerge || (flags & SLUB_NEVER_MERGE))
2718		return NULL;
2719
2720	if (ctor)
2721		return NULL;
2722
2723	size = ALIGN(size, sizeof(void *));
2724	align = calculate_alignment(flags, align, size);
2725	size = ALIGN(size, align);
2726	flags = kmem_cache_flags(size, flags, name, NULL);
2727
2728	list_for_each_entry(s, &slab_caches, list) {
2729		if (slab_unmergeable(s))
2730			continue;
2731
2732		if (size > s->size)
2733			continue;
2734
2735		if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
2736			continue;
2737		/*
2738		 * Check if alignment is compatible.
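		 * (i.e. the existing cache's size must already be a
		 * multiple of the requested alignment:
		 * (s->size & ~(align - 1)) == s->size).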
2739		 * Courtesy of Adrian Drzewiecki
2740		 */
2741		if ((s->size & ~(align - 1)) != s->size)
2742			continue;
2743
2744		if (s->size - size >= sizeof(void *))
2745			continue;
2746
2747		return s;
2748	}
2749	return NULL;
2750}
2751
2752struct kmem_cache *kmem_cache_create(const char *name, size_t size,
2753		size_t align, unsigned long flags,
2754		void (*ctor)(void *, struct kmem_cache *, unsigned long))
2755{
2756	struct kmem_cache *s;
2757
2758	down_write(&slub_lock);
2759	s = find_mergeable(size, align, flags, name, ctor);
2760	if (s) {
2761		s->refcount++;
2762		/*
2763		 * Adjust the object sizes so that we clear
2764		 * the complete object on kzalloc.
2765		 */
2766		s->objsize = max(s->objsize, (int)size);
2767		s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
2768		up_write(&slub_lock);
2769		if (sysfs_slab_alias(s, name))
2770			goto err;
2771		return s;
2772	}
2773	s = kmalloc(kmem_size, GFP_KERNEL);
2774	if (s) {
2775		if (kmem_cache_open(s, GFP_KERNEL, name,
2776				size, align, flags, ctor)) {
2777			list_add(&s->list, &slab_caches);
2778			up_write(&slub_lock);
2779			if (sysfs_slab_add(s))
2780				goto err;
2781			return s;
2782		}
2783		kfree(s);
2784	}
2785	up_write(&slub_lock);
2786
2787err:
2788	if (flags & SLAB_PANIC)
2789		panic("Cannot create slabcache %s\n", name);
2790	else
2791		s = NULL;
2792	return s;
2793}
2794EXPORT_SYMBOL(kmem_cache_create);
2795
2796#ifdef CONFIG_SMP
2797/*
2798 * Use the cpu notifier to ensure that the cpu slabs are flushed when
2799 * necessary.
2800 */
2801static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
2802		unsigned long action, void *hcpu)
2803{
2804	long cpu = (long)hcpu;
2805	struct kmem_cache *s;
2806	unsigned long flags;
2807
2808	switch (action) {
2809	case CPU_UP_CANCELED:
2810	case CPU_UP_CANCELED_FROZEN:
2811	case CPU_DEAD:
2812	case CPU_DEAD_FROZEN:
2813		down_read(&slub_lock);
2814		list_for_each_entry(s, &slab_caches, list) {
2815			local_irq_save(flags);
2816			__flush_cpu_slab(s, cpu);
2817			local_irq_restore(flags);
2818		}
2819		up_read(&slub_lock);
2820		break;
2821	default:
2822		break;
2823	}
2824	return NOTIFY_OK;
2825}
2826
2827static struct notifier_block __cpuinitdata slab_notifier =
2828	{ &slab_cpuup_callback, NULL, 0 };
2829
2830#endif
2831
2832void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
2833{
2834	struct kmem_cache *s;
2835
2836	if (unlikely(size > PAGE_SIZE / 2))
2837		return (void *)__get_free_pages(gfpflags | __GFP_COMP,
2838							get_order(size));
2839	s = get_slab(size, gfpflags);
2840
2841	if (unlikely(ZERO_OR_NULL_PTR(s)))
2842		return s;
2843
2844	return slab_alloc(s, gfpflags, -1, caller);
2845}
2846
2847void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
2848					int node, void *caller)
2849{
2850	struct kmem_cache *s;
2851
2852	if (unlikely(size > PAGE_SIZE / 2))
2853		return (void *)__get_free_pages(gfpflags | __GFP_COMP,
2854							get_order(size));
2855	s = get_slab(size, gfpflags);
2856
2857	if (unlikely(ZERO_OR_NULL_PTR(s)))
2858		return s;
2859
2860	return slab_alloc(s, gfpflags, node, caller);
2861}
2862
2863#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
2864static int validate_slab(struct kmem_cache *s, struct page *page,
2865						unsigned long *map)
2866{
2867	void *p;
2868	void *addr = page_address(page);
2869
2870	if (!check_slab(s, page) ||
2871			!on_freelist(s, page, NULL))
2872		return 0;
2873
2874	/* Now we know that a valid freelist exists */
2875	bitmap_zero(map, s->objects);
2876
2877	for_each_free_object(p, s, page->freelist) {
2878		set_bit(slab_index(p, s, addr), map);
2879		if
 (!check_object(s, page, p, 0))
2880			return 0;
2881	}
2882
2883	for_each_object(p, s, addr)
2884		if (!test_bit(slab_index(p, s, addr), map))
2885			if (!check_object(s, page, p, 1))
2886				return 0;
2887	return 1;
2888}
2889
2890static void validate_slab_slab(struct kmem_cache *s, struct page *page,
2891						unsigned long *map)
2892{
2893	if (slab_trylock(page)) {
2894		validate_slab(s, page, map);
2895		slab_unlock(page);
2896	} else
2897		printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
2898			s->name, page);
2899
2900	if (s->flags & DEBUG_DEFAULT_FLAGS) {
2901		if (!SlabDebug(page))
2902			printk(KERN_ERR "SLUB %s: SlabDebug not set "
2903				"on slab 0x%p\n", s->name, page);
2904	} else {
2905		if (SlabDebug(page))
2906			printk(KERN_ERR "SLUB %s: SlabDebug set on "
2907				"slab 0x%p\n", s->name, page);
2908	}
2909}
2910
2911static int validate_slab_node(struct kmem_cache *s,
2912		struct kmem_cache_node *n, unsigned long *map)
2913{
2914	unsigned long count = 0;
2915	struct page *page;
2916	unsigned long flags;
2917
2918	spin_lock_irqsave(&n->list_lock, flags);
2919
2920	list_for_each_entry(page, &n->partial, lru) {
2921		validate_slab_slab(s, page, map);
2922		count++;
2923	}
2924	if (count != n->nr_partial)
2925		printk(KERN_ERR "SLUB %s: %ld partial slabs counted but "
2926			"counter=%ld\n", s->name, count, n->nr_partial);
2927
2928	if (!(s->flags & SLAB_STORE_USER))
2929		goto out;
2930
2931	list_for_each_entry(page, &n->full, lru) {
2932		validate_slab_slab(s, page, map);
2933		count++;
2934	}
2935	if (count != atomic_long_read(&n->nr_slabs))
2936		printk(KERN_ERR "SLUB: %s %ld slabs counted but "
2937			"counter=%ld\n", s->name, count,
2938			atomic_long_read(&n->nr_slabs));
2939
2940out:
2941	spin_unlock_irqrestore(&n->list_lock, flags);
2942	return count;
2943}
2944
2945static long validate_slab_cache(struct kmem_cache *s)
2946{
2947	int node;
2948	unsigned long count = 0;
2949	unsigned long *map = kmalloc(BITS_TO_LONGS(s->objects) *
2950				sizeof(unsigned long), GFP_KERNEL);
2951
2952	if (!map)
2953		return -ENOMEM;
2954
2955	flush_all(s);
2956	for_each_node_state(node, N_NORMAL_MEMORY) {
2957		struct kmem_cache_node *n = get_node(s, node);
2958
2959		count += validate_slab_node(s, n, map);
2960	}
2961	kfree(map);
2962	return count;
2963}
2964
2965#ifdef SLUB_RESILIENCY_TEST
2966static void resiliency_test(void)
2967{
2968	u8 *p;
2969
2970	printk(KERN_ERR "SLUB resiliency testing\n");
2971	printk(KERN_ERR "-----------------------\n");
2972	printk(KERN_ERR "A. Corruption after allocation\n");
2973
2974	p = kzalloc(16, GFP_KERNEL);
2975	p[16] = 0x12;
2976	printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
2977			" 0x12->0x%p\n\n", p + 16);
2978
2979	validate_slab_cache(kmalloc_caches + 4);
2980
2981	/* Hmmm... The next two are dangerous */
2982	p = kzalloc(32, GFP_KERNEL);
2983	p[32 + sizeof(void *)] = 0x34;
2984	printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
2985			" 0x34 -> 0x%p\n", p);
2986	printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
2987
2988	validate_slab_cache(kmalloc_caches + 5);
2989	p = kzalloc(64, GFP_KERNEL);
2990	p += 64 + (get_cycles() & 0xff) * sizeof(void *);
2991	*p = 0x56;
2992	printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
2993			p);
2994	printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
2995	validate_slab_cache(kmalloc_caches + 6);
2996
2997	printk(KERN_ERR "\nB. Corruption after free\n");
2998	p = kzalloc(128, GFP_KERNEL);
2999	kfree(p);
3000	*p = 0x78;
3001	printk(KERN_ERR "1.
 kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
3002	validate_slab_cache(kmalloc_caches + 7);
3003
3004	p = kzalloc(256, GFP_KERNEL);
3005	kfree(p);
3006	p[50] = 0x9a;
3007	printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
3008	validate_slab_cache(kmalloc_caches + 8);
3009
3010	p = kzalloc(512, GFP_KERNEL);
3011	kfree(p);
3012	p[512] = 0xab;
3013	printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
3014	validate_slab_cache(kmalloc_caches + 9);
3015}
3016#else
3017static void resiliency_test(void) {}
3018#endif
3019
3020/*
3021 * Generate lists of code addresses where slabcache objects are allocated
3022 * and freed.
3023 */
3024
3025struct location {
3026	unsigned long count;
3027	void *addr;
3028	long long sum_time;
3029	long min_time;
3030	long max_time;
3031	long min_pid;
3032	long max_pid;
3033	cpumask_t cpus;
3034	nodemask_t nodes;
3035};
3036
3037struct loc_track {
3038	unsigned long max;
3039	unsigned long count;
3040	struct location *loc;
3041};
3042
3043static void free_loc_track(struct loc_track *t)
3044{
3045	if (t->max)
3046		free_pages((unsigned long)t->loc,
3047			get_order(sizeof(struct location) * t->max));
3048}
3049
3050static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
3051{
3052	struct location *l;
3053	int order;
3054
3055	order = get_order(sizeof(struct location) * max);
3056
3057	l = (void *)__get_free_pages(flags, order);
3058	if (!l)
3059		return 0;
3060
3061	if (t->count) {
3062		memcpy(l, t->loc, sizeof(struct location) * t->count);
3063		free_loc_track(t);
3064	}
3065	t->max = max;
3066	t->loc = l;
3067	return 1;
3068}
3069
3070static int add_location(struct loc_track *t, struct kmem_cache *s,
3071				const struct track *track)
3072{
3073	long start, end, pos;
3074	struct location *l;
3075	void *caddr;
3076	unsigned long age = jiffies - track->when;
3077
3078	start = -1;
3079	end = t->count;
3080
3081	for ( ; ; ) {
3082		pos = start + (end - start + 1) / 2;
3083
3084		/*
3085		 * There is nothing at "end". If we end up there
3086		 * we need to insert before end.
3087		 */
3088		if (pos == end)
3089			break;
3090
3091		caddr = t->loc[pos].addr;
3092		if (track->addr == caddr) {
3093
3094			l = &t->loc[pos];
3095			l->count++;
3096			if (track->when) {
3097				l->sum_time += age;
3098				if (age < l->min_time)
3099					l->min_time = age;
3100				if (age > l->max_time)
3101					l->max_time = age;
3102
3103				if (track->pid < l->min_pid)
3104					l->min_pid = track->pid;
3105				if (track->pid > l->max_pid)
3106					l->max_pid = track->pid;
3107
3108				cpu_set(track->cpu, l->cpus);
3109			}
3110			node_set(page_to_nid(virt_to_page(track)), l->nodes);
3111			return 1;
3112		}
3113
3114		if (track->addr < caddr)
3115			end = pos;
3116		else
3117			start = pos;
3118	}
3119
3120	/*
3121	 * Not found. Insert new tracking element.
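	 *
	 * Editor's note: the binary search above terminates with pos as
	 * the first slot whose address is larger than track->addr, so
	 * inserting at pos keeps t->loc sorted by caller address.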
3122 */ 3123 if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC)) 3124 return 0; 3125 3126 l = t->loc + pos; 3127 if (pos < t->count) 3128 memmove(l + 1, l, 3129 (t->count - pos) * sizeof(struct location)); 3130 t->count++; 3131 l->count = 1; 3132 l->addr = track->addr; 3133 l->sum_time = age; 3134 l->min_time = age; 3135 l->max_time = age; 3136 l->min_pid = track->pid; 3137 l->max_pid = track->pid; 3138 cpus_clear(l->cpus); 3139 cpu_set(track->cpu, l->cpus); 3140 nodes_clear(l->nodes); 3141 node_set(page_to_nid(virt_to_page(track)), l->nodes); 3142 return 1; 3143} 3144 3145static void process_slab(struct loc_track *t, struct kmem_cache *s, 3146 struct page *page, enum track_item alloc) 3147{ 3148 void *addr = page_address(page); 3149 DECLARE_BITMAP(map, s->objects); 3150 void *p; 3151 3152 bitmap_zero(map, s->objects); 3153 for_each_free_object(p, s, page->freelist) 3154 set_bit(slab_index(p, s, addr), map); 3155 3156 for_each_object(p, s, addr) 3157 if (!test_bit(slab_index(p, s, addr), map)) 3158 add_location(t, s, get_track(s, p, alloc)); 3159} 3160 3161static int list_locations(struct kmem_cache *s, char *buf, 3162 enum track_item alloc) 3163{ 3164 int n = 0; 3165 unsigned long i; 3166 struct loc_track t = { 0, 0, NULL }; 3167 int node; 3168 3169 if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), 3170 GFP_KERNEL)) 3171 return sprintf(buf, "Out of memory\n"); 3172 3173 /* Push back cpu slabs */ 3174 flush_all(s); 3175 3176 for_each_node_state(node, N_NORMAL_MEMORY) { 3177 struct kmem_cache_node *n = get_node(s, node); 3178 unsigned long flags; 3179 struct page *page; 3180 3181 if (!atomic_long_read(&n->nr_slabs)) 3182 continue; 3183 3184 spin_lock_irqsave(&n->list_lock, flags); 3185 list_for_each_entry(page, &n->partial, lru) 3186 process_slab(&t, s, page, alloc); 3187 list_for_each_entry(page, &n->full, lru) 3188 process_slab(&t, s, page, alloc); 3189 spin_unlock_irqrestore(&n->list_lock, flags); 3190 } 3191 3192 for (i = 0; i < t.count; i++) { 3193 struct location *l = &t.loc[i]; 3194 3195 if (n > PAGE_SIZE - 100) 3196 break; 3197 n += sprintf(buf + n, "%7ld ", l->count); 3198 3199 if (l->addr) 3200 n += sprint_symbol(buf + n, (unsigned long)l->addr); 3201 else 3202 n += sprintf(buf + n, "<not-available>"); 3203 3204 if (l->sum_time != l->min_time) { 3205 unsigned long remainder; 3206 3207 n += sprintf(buf + n, " age=%ld/%ld/%ld", 3208 l->min_time, 3209 div_long_long_rem(l->sum_time, l->count, &remainder), 3210 l->max_time); 3211 } else 3212 n += sprintf(buf + n, " age=%ld", 3213 l->min_time); 3214 3215 if (l->min_pid != l->max_pid) 3216 n += sprintf(buf + n, " pid=%ld-%ld", 3217 l->min_pid, l->max_pid); 3218 else 3219 n += sprintf(buf + n, " pid=%ld", 3220 l->min_pid); 3221 3222 if (num_online_cpus() > 1 && !cpus_empty(l->cpus) && 3223 n < PAGE_SIZE - 60) { 3224 n += sprintf(buf + n, " cpus="); 3225 n += cpulist_scnprintf(buf + n, PAGE_SIZE - n - 50, 3226 l->cpus); 3227 } 3228 3229 if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && 3230 n < PAGE_SIZE - 60) { 3231 n += sprintf(buf + n, " nodes="); 3232 n += nodelist_scnprintf(buf + n, PAGE_SIZE - n - 50, 3233 l->nodes); 3234 } 3235 3236 n += sprintf(buf + n, "\n"); 3237 } 3238 3239 free_loc_track(&t); 3240 if (!t.count) 3241 n += sprintf(buf, "No data\n"); 3242 return n; 3243} 3244 3245static unsigned long count_partial(struct kmem_cache_node *n) 3246{ 3247 unsigned long flags; 3248 unsigned long x = 0; 3249 struct page *page; 3250 3251 spin_lock_irqsave(&n->list_lock, flags); 3252 
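	/* Sum the objects in use over every slab on this node's partial list. */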
	list_for_each_entry(page, &n->partial, lru)
3253		x += page->inuse;
3254	spin_unlock_irqrestore(&n->list_lock, flags);
3255	return x;
3256}
3257
3258enum slab_stat_type {
3259	SL_FULL,
3260	SL_PARTIAL,
3261	SL_CPU,
3262	SL_OBJECTS
3263};
3264
3265#define SO_FULL		(1 << SL_FULL)
3266#define SO_PARTIAL	(1 << SL_PARTIAL)
3267#define SO_CPU		(1 << SL_CPU)
3268#define SO_OBJECTS	(1 << SL_OBJECTS)
3269
3270static unsigned long slab_objects(struct kmem_cache *s,
3271			char *buf, unsigned long flags)
3272{
3273	unsigned long total = 0;
3274	int cpu;
3275	int node;
3276	int x;
3277	unsigned long *nodes;
3278	unsigned long *per_cpu;
3279
3280	nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
3281	per_cpu = nodes + nr_node_ids;
3282
3283	for_each_possible_cpu(cpu) {
3284		struct page *page;
3285		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3286
3287		if (!c)
3288			continue;
3289
3290		page = c->page;
3291		if (page) {
3292			if (flags & SO_CPU) {
3293				int x = 0;
3294
3295				if (flags & SO_OBJECTS)
3296					x = page->inuse;
3297				else
3298					x = 1;
3299				total += x;
3300				nodes[c->node] += x;
3301			}
3302			per_cpu[c->node]++;
3303		}
3304	}
3305
3306	for_each_node_state(node, N_NORMAL_MEMORY) {
3307		struct kmem_cache_node *n = get_node(s, node);
3308
3309		if (flags & SO_PARTIAL) {
3310			if (flags & SO_OBJECTS)
3311				x = count_partial(n);
3312			else
3313				x = n->nr_partial;
3314			total += x;
3315			nodes[node] += x;
3316		}
3317
3318		if (flags & SO_FULL) {
3319			int full_slabs = atomic_long_read(&n->nr_slabs)
3320					- per_cpu[node]
3321					- n->nr_partial;
3322
3323			if (flags & SO_OBJECTS)
3324				x = full_slabs * s->objects;
3325			else
3326				x = full_slabs;
3327			total += x;
3328			nodes[node] += x;
3329		}
3330	}
3331
3332	x = sprintf(buf, "%lu", total);
3333#ifdef CONFIG_NUMA
3334	for_each_node_state(node, N_NORMAL_MEMORY)
3335		if (nodes[node])
3336			x += sprintf(buf + x, " N%d=%lu",
3337					node, nodes[node]);
3338#endif
3339	kfree(nodes);
3340	return x + sprintf(buf + x, "\n");
3341}
3342
3343static int any_slab_objects(struct kmem_cache *s)
3344{
3345	int node;
3346	int cpu;
3347
3348	for_each_possible_cpu(cpu) {
3349		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3350
3351		if (c && c->page)
3352			return 1;
3353	}
3354
3355	for_each_online_node(node) {
3356		struct kmem_cache_node *n = get_node(s, node);
3357
3358		if (!n)
3359			continue;
3360
3361		if (n->nr_partial || atomic_long_read(&n->nr_slabs))
3362			return 1;
3363	}
3364	return 0;
3365}
3366
3367#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
3368#define to_slab(n) container_of(n, struct kmem_cache, kobj)
3369
3370struct slab_attribute {
3371	struct attribute attr;
3372	ssize_t (*show)(struct kmem_cache *s, char *buf);
3373	ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
3374};
3375
3376#define SLAB_ATTR_RO(_name) \
3377	static struct slab_attribute _name##_attr = __ATTR_RO(_name)
3378
3379#define SLAB_ATTR(_name) \
3380	static struct slab_attribute _name##_attr = \
3381	__ATTR(_name, 0644, _name##_show, _name##_store)
3382
3383static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
3384{
3385	return sprintf(buf, "%d\n", s->size);
3386}
3387SLAB_ATTR_RO(slab_size);
3388
3389static ssize_t align_show(struct kmem_cache *s, char *buf)
3390{
3391	return sprintf(buf, "%d\n", s->align);
3392}
3393SLAB_ATTR_RO(align);
3394
3395static ssize_t object_size_show(struct kmem_cache *s, char *buf)
3396{
3397	return sprintf(buf, "%d\n", s->objsize);
3398}
3399SLAB_ATTR_RO(object_size);
3400
3401static ssize_t objs_per_slab_show(struct
kmem_cache *s, char *buf) 3402{ 3403 return sprintf(buf, "%d\n", s->objects); 3404} 3405SLAB_ATTR_RO(objs_per_slab); 3406 3407static ssize_t order_show(struct kmem_cache *s, char *buf) 3408{ 3409 return sprintf(buf, "%d\n", s->order); 3410} 3411SLAB_ATTR_RO(order); 3412 3413static ssize_t ctor_show(struct kmem_cache *s, char *buf) 3414{ 3415 if (s->ctor) { 3416 int n = sprint_symbol(buf, (unsigned long)s->ctor); 3417 3418 return n + sprintf(buf + n, "\n"); 3419 } 3420 return 0; 3421} 3422SLAB_ATTR_RO(ctor); 3423 3424static ssize_t aliases_show(struct kmem_cache *s, char *buf) 3425{ 3426 return sprintf(buf, "%d\n", s->refcount - 1); 3427} 3428SLAB_ATTR_RO(aliases); 3429 3430static ssize_t slabs_show(struct kmem_cache *s, char *buf) 3431{ 3432 return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU); 3433} 3434SLAB_ATTR_RO(slabs); 3435 3436static ssize_t partial_show(struct kmem_cache *s, char *buf) 3437{ 3438 return slab_objects(s, buf, SO_PARTIAL); 3439} 3440SLAB_ATTR_RO(partial); 3441 3442static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf) 3443{ 3444 return slab_objects(s, buf, SO_CPU); 3445} 3446SLAB_ATTR_RO(cpu_slabs); 3447 3448static ssize_t objects_show(struct kmem_cache *s, char *buf) 3449{ 3450 return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS); 3451} 3452SLAB_ATTR_RO(objects); 3453 3454static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) 3455{ 3456 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); 3457} 3458 3459static ssize_t sanity_checks_store(struct kmem_cache *s, 3460 const char *buf, size_t length) 3461{ 3462 s->flags &= ~SLAB_DEBUG_FREE; 3463 if (buf[0] == '1') 3464 s->flags |= SLAB_DEBUG_FREE; 3465 return length; 3466} 3467SLAB_ATTR(sanity_checks); 3468 3469static ssize_t trace_show(struct kmem_cache *s, char *buf) 3470{ 3471 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); 3472} 3473 3474static ssize_t trace_store(struct kmem_cache *s, const char *buf, 3475 size_t length) 3476{ 3477 s->flags &= ~SLAB_TRACE; 3478 if (buf[0] == '1') 3479 s->flags |= SLAB_TRACE; 3480 return length; 3481} 3482SLAB_ATTR(trace); 3483 3484static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 3485{ 3486 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 3487} 3488 3489static ssize_t reclaim_account_store(struct kmem_cache *s, 3490 const char *buf, size_t length) 3491{ 3492 s->flags &= ~SLAB_RECLAIM_ACCOUNT; 3493 if (buf[0] == '1') 3494 s->flags |= SLAB_RECLAIM_ACCOUNT; 3495 return length; 3496} 3497SLAB_ATTR(reclaim_account); 3498 3499static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) 3500{ 3501 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); 3502} 3503SLAB_ATTR_RO(hwcache_align); 3504 3505#ifdef CONFIG_ZONE_DMA 3506static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) 3507{ 3508 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); 3509} 3510SLAB_ATTR_RO(cache_dma); 3511#endif 3512 3513static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) 3514{ 3515 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); 3516} 3517SLAB_ATTR_RO(destroy_by_rcu); 3518 3519static ssize_t red_zone_show(struct kmem_cache *s, char *buf) 3520{ 3521 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); 3522} 3523 3524static ssize_t red_zone_store(struct kmem_cache *s, 3525 const char *buf, size_t length) 3526{ 3527 if (any_slab_objects(s)) 3528 return -EBUSY; 3529 3530 s->flags &= ~SLAB_RED_ZONE; 3531 if (buf[0] == '1') 3532 s->flags |= SLAB_RED_ZONE; 3533 
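	/*
	 * Editor's note: toggling red zoning changes the object layout,
	 * so the cache geometry must be recomputed; this is safe only
	 * because any_slab_objects() above verified the cache is empty.
	 */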
calculate_sizes(s); 3534 return length; 3535} 3536SLAB_ATTR(red_zone); 3537 3538static ssize_t poison_show(struct kmem_cache *s, char *buf) 3539{ 3540 return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON)); 3541} 3542 3543static ssize_t poison_store(struct kmem_cache *s, 3544 const char *buf, size_t length) 3545{ 3546 if (any_slab_objects(s)) 3547 return -EBUSY; 3548 3549 s->flags &= ~SLAB_POISON; 3550 if (buf[0] == '1') 3551 s->flags |= SLAB_POISON; 3552 calculate_sizes(s); 3553 return length; 3554} 3555SLAB_ATTR(poison); 3556 3557static ssize_t store_user_show(struct kmem_cache *s, char *buf) 3558{ 3559 return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); 3560} 3561 3562static ssize_t store_user_store(struct kmem_cache *s, 3563 const char *buf, size_t length) 3564{ 3565 if (any_slab_objects(s)) 3566 return -EBUSY; 3567 3568 s->flags &= ~SLAB_STORE_USER; 3569 if (buf[0] == '1') 3570 s->flags |= SLAB_STORE_USER; 3571 calculate_sizes(s); 3572 return length; 3573} 3574SLAB_ATTR(store_user); 3575 3576static ssize_t validate_show(struct kmem_cache *s, char *buf) 3577{ 3578 return 0; 3579} 3580 3581static ssize_t validate_store(struct kmem_cache *s, 3582 const char *buf, size_t length) 3583{ 3584 int ret = -EINVAL; 3585 3586 if (buf[0] == '1') { 3587 ret = validate_slab_cache(s); 3588 if (ret >= 0) 3589 ret = length; 3590 } 3591 return ret; 3592} 3593SLAB_ATTR(validate); 3594 3595static ssize_t shrink_show(struct kmem_cache *s, char *buf) 3596{ 3597 return 0; 3598} 3599 3600static ssize_t shrink_store(struct kmem_cache *s, 3601 const char *buf, size_t length) 3602{ 3603 if (buf[0] == '1') { 3604 int rc = kmem_cache_shrink(s); 3605 3606 if (rc) 3607 return rc; 3608 } else 3609 return -EINVAL; 3610 return length; 3611} 3612SLAB_ATTR(shrink); 3613 3614static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) 3615{ 3616 if (!(s->flags & SLAB_STORE_USER)) 3617 return -ENOSYS; 3618 return list_locations(s, buf, TRACK_ALLOC); 3619} 3620SLAB_ATTR_RO(alloc_calls); 3621 3622static ssize_t free_calls_show(struct kmem_cache *s, char *buf) 3623{ 3624 if (!(s->flags & SLAB_STORE_USER)) 3625 return -ENOSYS; 3626 return list_locations(s, buf, TRACK_FREE); 3627} 3628SLAB_ATTR_RO(free_calls); 3629 3630#ifdef CONFIG_NUMA 3631static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf) 3632{ 3633 return sprintf(buf, "%d\n", s->defrag_ratio / 10); 3634} 3635 3636static ssize_t defrag_ratio_store(struct kmem_cache *s, 3637 const char *buf, size_t length) 3638{ 3639 int n = simple_strtoul(buf, NULL, 10); 3640 3641 if (n < 100) 3642 s->defrag_ratio = n * 10; 3643 return length; 3644} 3645SLAB_ATTR(defrag_ratio); 3646#endif 3647 3648static struct attribute * slab_attrs[] = { 3649 &slab_size_attr.attr, 3650 &object_size_attr.attr, 3651 &objs_per_slab_attr.attr, 3652 &order_attr.attr, 3653 &objects_attr.attr, 3654 &slabs_attr.attr, 3655 &partial_attr.attr, 3656 &cpu_slabs_attr.attr, 3657 &ctor_attr.attr, 3658 &aliases_attr.attr, 3659 &align_attr.attr, 3660 &sanity_checks_attr.attr, 3661 &trace_attr.attr, 3662 &hwcache_align_attr.attr, 3663 &reclaim_account_attr.attr, 3664 &destroy_by_rcu_attr.attr, 3665 &red_zone_attr.attr, 3666 &poison_attr.attr, 3667 &store_user_attr.attr, 3668 &validate_attr.attr, 3669 &shrink_attr.attr, 3670 &alloc_calls_attr.attr, 3671 &free_calls_attr.attr, 3672#ifdef CONFIG_ZONE_DMA 3673 &cache_dma_attr.attr, 3674#endif 3675#ifdef CONFIG_NUMA 3676 &defrag_ratio_attr.attr, 3677#endif 3678 NULL 3679}; 3680 3681static struct attribute_group slab_attr_group = { 3682 .attrs = 
slab_attrs, 3683}; 3684 3685static ssize_t slab_attr_show(struct kobject *kobj, 3686 struct attribute *attr, 3687 char *buf) 3688{ 3689 struct slab_attribute *attribute; 3690 struct kmem_cache *s; 3691 int err; 3692 3693 attribute = to_slab_attr(attr); 3694 s = to_slab(kobj); 3695 3696 if (!attribute->show) 3697 return -EIO; 3698 3699 err = attribute->show(s, buf); 3700 3701 return err; 3702} 3703 3704static ssize_t slab_attr_store(struct kobject *kobj, 3705 struct attribute *attr, 3706 const char *buf, size_t len) 3707{ 3708 struct slab_attribute *attribute; 3709 struct kmem_cache *s; 3710 int err; 3711 3712 attribute = to_slab_attr(attr); 3713 s = to_slab(kobj); 3714 3715 if (!attribute->store) 3716 return -EIO; 3717 3718 err = attribute->store(s, buf, len); 3719 3720 return err; 3721} 3722 3723static struct sysfs_ops slab_sysfs_ops = { 3724 .show = slab_attr_show, 3725 .store = slab_attr_store, 3726}; 3727 3728static struct kobj_type slab_ktype = { 3729 .sysfs_ops = &slab_sysfs_ops, 3730}; 3731 3732static int uevent_filter(struct kset *kset, struct kobject *kobj) 3733{ 3734 struct kobj_type *ktype = get_ktype(kobj); 3735 3736 if (ktype == &slab_ktype) 3737 return 1; 3738 return 0; 3739} 3740 3741static struct kset_uevent_ops slab_uevent_ops = { 3742 .filter = uevent_filter, 3743}; 3744 3745static decl_subsys(slab, &slab_ktype, &slab_uevent_ops); 3746 3747#define ID_STR_LENGTH 64 3748 3749/* Create a unique string id for a slab cache: 3750 * format 3751 * :[flags-]size:[memory address of kmemcache] 3752 */ 3753static char *create_unique_id(struct kmem_cache *s) 3754{ 3755 char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); 3756 char *p = name; 3757 3758 BUG_ON(!name); 3759 3760 *p++ = ':'; 3761 /* 3762 * First flags affecting slabcache operations. We will only 3763 * get here for aliasable slabs so we do not need to support 3764 * too many flags. The flags here must cover all flags that 3765 * are matched during merging to guarantee that the id is 3766 * unique. 3767 */ 3768 if (s->flags & SLAB_CACHE_DMA) 3769 *p++ = 'd'; 3770 if (s->flags & SLAB_RECLAIM_ACCOUNT) 3771 *p++ = 'a'; 3772 if (s->flags & SLAB_DEBUG_FREE) 3773 *p++ = 'F'; 3774 if (p != name + 1) 3775 *p++ = '-'; 3776 p += sprintf(p, "%07d", s->size); 3777 BUG_ON(p > name + ID_STR_LENGTH - 1); 3778 return name; 3779} 3780 3781static int sysfs_slab_add(struct kmem_cache *s) 3782{ 3783 int err; 3784 const char *name; 3785 int unmergeable; 3786 3787 if (slab_state < SYSFS) 3788 /* Defer until later */ 3789 return 0; 3790 3791 unmergeable = slab_unmergeable(s); 3792 if (unmergeable) { 3793 /* 3794 * Slabcache can never be merged so we can use the name proper. 3795 * This is typically the case for debug situations. In that 3796 * case we can catch duplicate names easily. 3797 */ 3798 sysfs_remove_link(&slab_subsys.kobj, s->name); 3799 name = s->name; 3800 } else { 3801 /* 3802 * Create a unique name for the slab as a target 3803 * for the symlinks. 
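 * Editor's note: create_unique_id() below yields ids such as
 * ":0000072" (no flags, size 72) or ":d-0000192" (a DMA cache of
 * size 192); the flag characters mirror the flags compared during
 * cache merging.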
3804		 */
3805		name = create_unique_id(s);
3806	}
3807
3808	kobj_set_kset_s(s, slab_subsys);
3809	kobject_set_name(&s->kobj, name);
3810	kobject_init(&s->kobj);
3811	err = kobject_add(&s->kobj);
3812	if (err)
3813		return err;
3814
3815	err = sysfs_create_group(&s->kobj, &slab_attr_group);
3816	if (err)
3817		return err;
3818	kobject_uevent(&s->kobj, KOBJ_ADD);
3819	if (!unmergeable) {
3820		/* Setup first alias */
3821		sysfs_slab_alias(s, s->name);
3822		kfree(name);
3823	}
3824	return 0;
3825}
3826
3827static void sysfs_slab_remove(struct kmem_cache *s)
3828{
3829	kobject_uevent(&s->kobj, KOBJ_REMOVE);
3830	kobject_del(&s->kobj);
3831}
3832
3833/*
3834 * Need to buffer aliases during bootup until sysfs becomes
3835 * available lest we lose that information.
3836 */
3837struct saved_alias {
3838	struct kmem_cache *s;
3839	const char *name;
3840	struct saved_alias *next;
3841};
3842
3843static struct saved_alias *alias_list;
3844
3845static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
3846{
3847	struct saved_alias *al;
3848
3849	if (slab_state == SYSFS) {
3850		/*
3851		 * If we have a leftover link then remove it.
3852		 */
3853		sysfs_remove_link(&slab_subsys.kobj, name);
3854		return sysfs_create_link(&slab_subsys.kobj,
3855						&s->kobj, name);
3856	}
3857
3858	al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
3859	if (!al)
3860		return -ENOMEM;
3861
3862	al->s = s;
3863	al->name = name;
3864	al->next = alias_list;
3865	alias_list = al;
3866	return 0;
3867}
3868
3869static int __init slab_sysfs_init(void)
3870{
3871	struct kmem_cache *s;
3872	int err;
3873
3874	err = subsystem_register(&slab_subsys);
3875	if (err) {
3876		printk(KERN_ERR "Cannot register slab subsystem.\n");
3877		return -ENOSYS;
3878	}
3879
3880	slab_state = SYSFS;
3881
3882	list_for_each_entry(s, &slab_caches, list) {
3883		err = sysfs_slab_add(s);
3884		if (err)
3885			printk(KERN_ERR "SLUB: Unable to add boot slab %s"
3886						" to sysfs\n", s->name);
3887	}
3888
3889	while (alias_list) {
3890		struct saved_alias *al = alias_list;
3891
3892		alias_list = alias_list->next;
3893		err = sysfs_slab_alias(al->s, al->name);
3894		if (err)
3895			printk(KERN_ERR "SLUB: Unable to add boot slab alias"
3896					" %s to sysfs\n", al->name);
3897		kfree(al);
3898	}
3899
3900	resiliency_test();
3901	return 0;
3902}
3903
3904__initcall(slab_sysfs_init);
3905#endif
3906
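/*
 * Editor's note: a quick way to exercise the sysfs interface above
 * (a sketch; assumes sysfs mounted at /sys and a kmalloc-128 cache):
 *
 *	# cat /sys/slab/kmalloc-128/objects
 *	# echo 1 > /sys/slab/kmalloc-128/validate
 *	# echo 1 > /sys/slab/kmalloc-128/shrink
 *
 * "validate" runs validate_slab_cache() via validate_store() and
 * "shrink" runs kmem_cache_shrink() via shrink_store().
 */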