slab.c revision 6ed5eb2211204224799b2821656bbbfde26ef200
/*
 * linux/mm/slab.c
 * Written by Mark Hemment, 1996/97.
 * (markhe@nextd.demon.co.uk)
 *
 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
 *
 * Major cleanup, different bufctl logic, per-cpu arrays
 *	(c) 2000 Manfred Spraul
 *
 * Cleanup, make the head arrays unconditional, preparation for NUMA
 *	(c) 2002 Manfred Spraul
 *
 * An implementation of the Slab Allocator as described in outline in;
 *	UNIX Internals: The New Frontiers by Uresh Vahalia
 *	Pub: Prentice Hall	ISBN 0-13-101908-2
 * or with a little more detail in;
 *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
 *	Jeff Bonwick (Sun Microsystems).
 *	Presented at: USENIX Summer 1994 Technical Conference
 *
 * The memory is organized in caches, one cache for each object type.
 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
 * Each cache consists of many slabs (they are small (usually one
 * page long) and always contiguous), and each slab contains multiple
 * initialized objects.
 *
 * This means that your constructor is used only for newly allocated
 * slabs and you must pass objects with the same initializations to
 * kmem_cache_free.
 *
 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
 * normal). If you need a special memory type, then you must create a new
 * cache for that memory type.
 *
 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
 *   full slabs with 0 free objects
 *   partial slabs
 *   empty slabs with no allocated objects
 *
 * If partial slabs exist, then new allocations come from these slabs,
 * otherwise from empty slabs or new slabs are allocated.
 *
 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
 *
 * Each cache has a short per-cpu head array, most allocs
 * and frees go into that array, and if that array overflows, then 1/2
 * of the entries in the array are given back into the global cache.
 * The head array is strictly LIFO and should improve the cache hit rates.
 * On SMP, it additionally reduces the spinlock operations.
 *
 * The c_cpuarray may not be read with enabled local interrupts -
 * it's changed with a smp_call_function().
 *
 * SMP synchronization:
 *  constructors and destructors are called without any locking.
 *  Several members in kmem_cache_t and struct slab never change, they
 *	are accessed without any locking.
 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
 *  and local interrupts are disabled so slab code is preempt-safe.
 *  The non-constant members are protected with a per-cache irq spinlock.
 *
 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
 * in 2000 - many ideas in the current implementation are derived from
 * his patch.
 *
 * Further notes from the original documentation:
 *
 * 11 April '97.  Started multi-threading - markhe
 *	The global cache-chain is protected by the mutex 'cache_chain_mutex'.
 *	The sem is only needed when accessing/extending the cache-chain, which
 *	can never happen inside an interrupt (kmem_cache_create(),
 *	kmem_cache_shrink() and kmem_cache_reap()).
 *
 *	At present, each engine can be growing a cache.  This should be blocked.
 *
 * 15 March 2005. NUMA slab allocator.
 *	Shai Fultheim <shai@scalex86.org>.
 *	Shobhit Dayal <shobhit@calsoftinc.com>
 *	Alok N Kataria <alokk@calsoftinc.com>
 *	Christoph Lameter <christoph@lameter.com>
 *
 *	Modified the slab allocator to be node aware on NUMA systems.
 *	Each node has its own list of partial, free and full slabs.
 *	All object allocations for a node occur from node specific slab lists.
 */

#include	<linux/config.h>
#include	<linux/slab.h>
#include	<linux/mm.h>
#include	<linux/swap.h>
#include	<linux/cache.h>
#include	<linux/interrupt.h>
#include	<linux/init.h>
#include	<linux/compiler.h>
#include	<linux/seq_file.h>
#include	<linux/notifier.h>
#include	<linux/kallsyms.h>
#include	<linux/cpu.h>
#include	<linux/sysctl.h>
#include	<linux/module.h>
#include	<linux/rcupdate.h>
#include	<linux/string.h>
#include	<linux/nodemask.h>
#include	<linux/mempolicy.h>
#include	<linux/mutex.h>

#include	<asm/uaccess.h>
#include	<asm/cacheflush.h>
#include	<asm/tlbflush.h>
#include	<asm/page.h>

/*
 * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
 *		  SLAB_RED_ZONE & SLAB_POISON.
 *		  0 for faster, smaller code (especially in the critical paths).
 *
 * STATS	- 1 to collect stats for /proc/slabinfo.
 *		  0 for faster, smaller code (especially in the critical paths).
 *
 * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
 */

#ifdef CONFIG_DEBUG_SLAB
#define	DEBUG		1
#define	STATS		1
#define	FORCED_DEBUG	1
#else
#define	DEBUG		0
#define	STATS		0
#define	FORCED_DEBUG	0
#endif

/* Shouldn't this be in a header file somewhere? */
#define	BYTES_PER_WORD		sizeof(void *)

#ifndef cache_line_size
#define cache_line_size()	L1_CACHE_BYTES
#endif

#ifndef ARCH_KMALLOC_MINALIGN
/*
 * Enforce a minimum alignment for the kmalloc caches.
 * Usually, the kmalloc caches are cache_line_size() aligned, except when
 * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
 * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that.
 * Note that this flag disables some debug features.
 */
#define ARCH_KMALLOC_MINALIGN 0
#endif

#ifndef ARCH_SLAB_MINALIGN
/*
 * Enforce a minimum alignment for all caches.
 * Intended for archs that get misalignment faults even for BYTES_PER_WORD
 * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
 * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
 * some debug features.
 */
#define ARCH_SLAB_MINALIGN 0
#endif

#ifndef ARCH_KMALLOC_FLAGS
#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
#endif

/* Legal flag mask for kmem_cache_create(). */
#if DEBUG
# define CREATE_MASK	(SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
			 SLAB_NO_REAP | SLAB_CACHE_DMA | \
			 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
			 SLAB_DESTROY_BY_RCU)
#else
# define CREATE_MASK	(SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \
			 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
			 SLAB_DESTROY_BY_RCU)
#endif

/*
 * kmem_bufctl_t:
 *
 * Bufctl's are used for linking objs within a slab
 * linked offsets.
 *
 * This implementation relies on "struct page" for locating the cache &
 * slab an object belongs to.
 * This allows the bufctl structure to be small (one int), but limits
 * the number of objects a slab (not a cache) can contain when off-slab
 * bufctls are used. The limit is the size of the largest general cache
 * that does not use off-slab slabs.
 * For 32bit archs with 4 kB pages, this is 56.
 * This is not serious, as it is only for large objects, when it is unwise
 * to have too many per slab.
 * Note: This limit can be raised by introducing a general cache whose size
 * is less than 512 (PAGE_SIZE<<3), but greater than 256.
 */

typedef unsigned int kmem_bufctl_t;
#define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
#define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
#define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-2)

/* Max number of objs-per-slab for caches which use off-slab slabs.
 * Needed to avoid a possible looping condition in cache_grow().
 */
static unsigned long offslab_limit;

/*
 * struct slab
 *
 * Manages the objs in a slab. Placed either at the beginning of mem allocated
 * for a slab, or allocated from a general cache.
 * Slabs are chained into three lists: fully used, partial, fully free slabs.
 */
struct slab {
	struct list_head list;
	unsigned long colouroff;
	void *s_mem;		/* including colour offset */
	unsigned int inuse;	/* num of objs active in slab */
	kmem_bufctl_t free;
	unsigned short nodeid;
};

/*
 * struct slab_rcu
 *
 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
 * arrange for kmem_freepages to be called via RCU.  This is useful if
 * we need to approach a kernel structure obliquely, from its address
 * obtained without the usual locking.  We can lock the structure to
 * stabilize it and check it's still at the given address, only if we
 * can be sure that the memory has not been meanwhile reused for some
 * other kind of object (which our subsystem's lock might corrupt).
 *
 * rcu_read_lock before reading the address, then rcu_read_unlock after
 * taking the spinlock within the structure expected at that address.
 *
 * We assume struct slab_rcu can overlay struct slab when destroying.
 */
struct slab_rcu {
	struct rcu_head head;
	kmem_cache_t *cachep;
	void *addr;
};

/*
 * struct array_cache
 *
 * Purpose:
 * - LIFO ordering, to hand out cache-warm objects from _alloc
 * - reduce the number of linked list operations
 * - reduce spinlock operations
 *
 * The limit is stored in the per-cpu structure to reduce the data cache
 * footprint.
 *
 */
struct array_cache {
	unsigned int avail;
	unsigned int limit;
	unsigned int batchcount;
	unsigned int touched;
	spinlock_t lock;
	void *entry[0];		/*
				 * Must have this definition in here for the proper
				 * alignment of array_cache. Also simplifies accessing
				 * the entries.
				 * [0] is for gcc 2.95. It should really be [].
				 */
};

/* bootstrap: The caches do not work without cpuarrays anymore,
 * but the cpuarrays are allocated from the generic caches...
 */
#define BOOT_CPUCACHE_ENTRIES	1
struct arraycache_init {
	struct array_cache cache;
	void *entries[BOOT_CPUCACHE_ENTRIES];
};
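
/*
 * Editor's illustration (not part of the original file): the intended
 * fast path of the per-cpu head array described above, reduced to a
 * minimal sketch.  The helper names below are hypothetical; the real
 * fast paths further down in this file additionally handle refill,
 * batch flushing and statistics.
 */
#if 0
static void *example_fast_alloc(struct array_cache *ac)
{
	/* LIFO: hand back the most recently freed (cache-warm) object */
	if (ac->avail) {
		ac->touched = 1;
		return ac->entry[--ac->avail];
	}
	return NULL;	/* empty: the slow path refills from the slab lists */
}

static int example_fast_free(struct array_cache *ac, void *objp)
{
	/* room left: just push, no list walking or node spinlock needed */
	if (ac->avail < ac->limit) {
		ac->entry[ac->avail++] = objp;
		return 1;
	}
	return 0;	/* full: the slow path gives a batch back first */
}
#endif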

/*
 * The slab lists for all objects.
 */
struct kmem_list3 {
	struct list_head slabs_partial;	/* partial list first, better asm code */
	struct list_head slabs_full;
	struct list_head slabs_free;
	unsigned long free_objects;
	unsigned long next_reap;
	int free_touched;
	unsigned int free_limit;
	spinlock_t list_lock;
	struct array_cache *shared;	/* shared per node */
	struct array_cache **alien;	/* on other nodes */
};

/*
 * Need this for bootstrapping a per node allocator.
 */
#define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1)
struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
#define	CACHE_CACHE 0
#define	SIZE_AC 1
#define	SIZE_L3 (1 + MAX_NUMNODES)

/*
 * This function must be completely optimized away if
 * a constant is passed to it. Mostly the same as
 * what is in linux/slab.h except it returns an
 * index.
 */
static __always_inline int index_of(const size_t size)
{
	extern void __bad_size(void);

	if (__builtin_constant_p(size)) {
		int i = 0;

#define CACHE(x) \
	if (size <= x) \
		return i; \
	else \
		i++;
#include "linux/kmalloc_sizes.h"
#undef CACHE
		__bad_size();
	} else
		__bad_size();
	return 0;
}

#define INDEX_AC index_of(sizeof(struct arraycache_init))
#define INDEX_L3 index_of(sizeof(struct kmem_list3))

static void kmem_list3_init(struct kmem_list3 *parent)
{
	INIT_LIST_HEAD(&parent->slabs_full);
	INIT_LIST_HEAD(&parent->slabs_partial);
	INIT_LIST_HEAD(&parent->slabs_free);
	parent->shared = NULL;
	parent->alien = NULL;
	spin_lock_init(&parent->list_lock);
	parent->free_objects = 0;
	parent->free_touched = 0;
}

#define MAKE_LIST(cachep, listp, slab, nodeid)				\
	do {								\
		INIT_LIST_HEAD(listp);					\
		list_splice(&(cachep->nodelists[nodeid]->slab), listp);\
	} while (0)

#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
	do {								\
	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
	} while (0)

/*
 * kmem_cache_t
 *
 * manages a cache.
 */

struct kmem_cache {
/* 1) per-cpu data, touched during every alloc/free */
	struct array_cache *array[NR_CPUS];
	unsigned int batchcount;
	unsigned int limit;
	unsigned int shared;
	unsigned int buffer_size;
/* 2) touched by every alloc & free from the backend */
	struct kmem_list3 *nodelists[MAX_NUMNODES];
	unsigned int flags;	/* constant flags */
	unsigned int num;	/* # of objs per slab */
	spinlock_t spinlock;

/* 3) cache_grow/shrink */
	/* order of pgs per slab (2^n) */
	unsigned int gfporder;

	/* force GFP flags, e.g. GFP_DMA */
	gfp_t gfpflags;

	size_t colour;			/* cache colouring range */
	unsigned int colour_off;	/* colour offset */
	unsigned int colour_next;	/* cache colouring */
	kmem_cache_t *slabp_cache;
	unsigned int slab_size;
	unsigned int dflags;		/* dynamic flags */

	/* constructor func */
	void (*ctor) (void *, kmem_cache_t *, unsigned long);

	/* de-constructor func */
	void (*dtor) (void *, kmem_cache_t *, unsigned long);

/* 4) cache creation/removal */
	const char *name;
	struct list_head next;

/* 5) statistics */
#if STATS
	unsigned long num_active;
	unsigned long num_allocations;
	unsigned long high_mark;
	unsigned long grown;
	unsigned long reaped;
	unsigned long errors;
	unsigned long max_freeable;
	unsigned long node_allocs;
	unsigned long node_frees;
	atomic_t allochit;
	atomic_t allocmiss;
	atomic_t freehit;
	atomic_t freemiss;
#endif
#if DEBUG
	/*
	 * If debugging is enabled, then the allocator can add additional
	 * fields and/or padding to every object. buffer_size contains the total
	 * object size including these internal fields, the following two
	 * variables contain the offset to the user object and its size.
	 */
	int obj_offset;
	int obj_size;
#endif
};

#define CFLGS_OFF_SLAB		(0x80000000UL)
#define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)

#define BATCHREFILL_LIMIT	16
/* Optimization question: fewer reaps means less
 * probability for unnecessary cpucache drain/refill cycles.
 *
 * OTOH the cpuarrays can contain lots of objects,
 * which could lock up otherwise freeable slabs.
 */
#define REAPTIMEOUT_CPUC	(2*HZ)
#define REAPTIMEOUT_LIST3	(4*HZ)

#if STATS
#define	STATS_INC_ACTIVE(x)	((x)->num_active++)
#define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
#define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
#define	STATS_INC_GROWN(x)	((x)->grown++)
#define	STATS_INC_REAPED(x)	((x)->reaped++)
#define	STATS_SET_HIGH(x)	do { if ((x)->num_active > (x)->high_mark) \
					(x)->high_mark = (x)->num_active; \
				} while (0)
#define	STATS_INC_ERR(x)	((x)->errors++)
#define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
#define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
#define	STATS_SET_FREEABLE(x, i) \
	do { if ((x)->max_freeable < i) \
		(x)->max_freeable = i; \
	} while (0)

#define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
#define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
#define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
#define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
#else
#define	STATS_INC_ACTIVE(x)	do { } while (0)
#define	STATS_DEC_ACTIVE(x)	do { } while (0)
#define	STATS_INC_ALLOCED(x)	do { } while (0)
#define	STATS_INC_GROWN(x)	do { } while (0)
#define	STATS_INC_REAPED(x)	do { } while (0)
#define	STATS_SET_HIGH(x)	do { } while (0)
#define	STATS_INC_ERR(x)	do { } while (0)
#define	STATS_INC_NODEALLOCS(x)	do { } while (0)
#define	STATS_INC_NODEFREES(x)	do { } while (0)
#define	STATS_SET_FREEABLE(x, i) \
	do { } while (0)

#define STATS_INC_ALLOCHIT(x)	do { } while (0)
#define STATS_INC_ALLOCMISS(x)	do { } while (0)
#define STATS_INC_FREEHIT(x)	do { } while (0)
#define STATS_INC_FREEMISS(x)	do { } while (0)
#endif

#if DEBUG
/* Magic nums for obj red zoning.
 * Placed in the first word before and the first word after an obj.
 */
#define	RED_INACTIVE	0x5A2CF071UL	/* when obj is inactive */
#define	RED_ACTIVE	0x170FC2A5UL	/* when obj is active */

/* ...and for poisoning */
#define	POISON_INUSE	0x5a	/* for use-uninitialised poisoning */
#define POISON_FREE	0x6b	/* for use-after-free poisoning */
#define	POISON_END	0xa5	/* end-byte of poisoning */

/* memory layout of objects:
 * 0		: objp
 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
 * 		the end of an object is aligned with the end of the real
 * 		allocation. Catches writes behind the end of the allocation.
 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
 * 		redzone word.
 * cachep->obj_offset: The real object.
 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long]
 */
static int obj_offset(kmem_cache_t *cachep)
{
	return cachep->obj_offset;
}

static int obj_size(kmem_cache_t *cachep)
{
	return cachep->obj_size;
}

static unsigned long *dbg_redzone1(kmem_cache_t *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
	return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD);
}

static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
	if (cachep->flags & SLAB_STORE_USER)
		return (unsigned long *)(objp + cachep->buffer_size -
					 2 * BYTES_PER_WORD);
	return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD);
}

static void **dbg_userword(kmem_cache_t *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
	return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
}

#else

#define obj_offset(x)			0
#define obj_size(cachep)		(cachep->buffer_size)
#define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long *)NULL;})
#define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long *)NULL;})
#define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})

#endif

/*
 * Maximum size of an obj (in 2^order pages)
 * and absolute limit for the gfp order.
 */
#if defined(CONFIG_LARGE_ALLOCS)
#define	MAX_OBJ_ORDER	13	/* up to 32Mb */
#define	MAX_GFP_ORDER	13	/* up to 32Mb */
#elif defined(CONFIG_MMU)
#define	MAX_OBJ_ORDER	5	/* 32 pages */
#define	MAX_GFP_ORDER	5	/* 32 pages */
#else
#define	MAX_OBJ_ORDER	8	/* up to 1Mb */
#define	MAX_GFP_ORDER	8	/* up to 1Mb */
#endif

/*
 * Do not go above this order unless 0 objects fit into the slab.
 */
#define	BREAK_GFP_ORDER_HI	1
#define	BREAK_GFP_ORDER_LO	0
static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;

/* Functions for storing/retrieving the cachep and or slab from the
 * global 'mem_map'. These are used to find the slab an obj belongs to.
 * With kfree(), these are used to find the cache which an obj belongs to.
 */
static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
{
	page->lru.next = (struct list_head *)cache;
}

static inline struct kmem_cache *page_get_cache(struct page *page)
{
	return (struct kmem_cache *)page->lru.next;
}

static inline void page_set_slab(struct page *page, struct slab *slab)
{
	page->lru.prev = (struct list_head *)slab;
}

static inline struct slab *page_get_slab(struct page *page)
{
	return (struct slab *)page->lru.prev;
}

static inline struct kmem_cache *virt_to_cache(const void *obj)
{
	struct page *page = virt_to_page(obj);
	return page_get_cache(page);
}

static inline struct slab *virt_to_slab(const void *obj)
{
	struct page *page = virt_to_page(obj);
	return page_get_slab(page);
}

/* These are the default caches for kmalloc. Custom caches can have other sizes. */
struct cache_sizes malloc_sizes[] = {
#define CACHE(x) { .cs_size = (x) },
#include <linux/kmalloc_sizes.h>
	CACHE(ULONG_MAX)
#undef CACHE
};
EXPORT_SYMBOL(malloc_sizes);

/* Must match cache_sizes above. Out of line to keep cache footprint low. */
struct cache_names {
	char *name;
	char *name_dma;
};

static struct cache_names __initdata cache_names[] = {
#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
#include <linux/kmalloc_sizes.h>
	{NULL,}
#undef CACHE
};

static struct arraycache_init initarray_cache __initdata =
    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
static struct arraycache_init initarray_generic =
    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };

/* internal cache of cache description objs */
static kmem_cache_t cache_cache = {
	.batchcount = 1,
	.limit = BOOT_CPUCACHE_ENTRIES,
	.shared = 1,
	.buffer_size = sizeof(kmem_cache_t),
	.flags = SLAB_NO_REAP,
	.spinlock = SPIN_LOCK_UNLOCKED,
	.name = "kmem_cache",
#if DEBUG
	.obj_size = sizeof(kmem_cache_t),
#endif
};

/* Guard access to the cache-chain. */
static DEFINE_MUTEX(cache_chain_mutex);
static struct list_head cache_chain;

/*
 * vm_enough_memory() looks at this to determine how many
 * slab-allocated pages are possibly freeable under pressure
 *
 * SLAB_RECLAIM_ACCOUNT turns this on per-slab
 */
atomic_t slab_reclaim_pages;

/*
 * chicken and egg problem: delay the per-cpu array allocation
 * until the general caches are up.
 */
static enum {
	NONE,
	PARTIAL_AC,
	PARTIAL_L3,
	FULL
} g_cpucache_up;

static DEFINE_PER_CPU(struct work_struct, reap_work);

static void free_block(kmem_cache_t *cachep, void **objpp, int len, int node);
static void enable_cpucache(kmem_cache_t *cachep);
static void cache_reap(void *unused);
static int __node_shrink(kmem_cache_t *cachep, int node);

static inline struct array_cache *ac_data(kmem_cache_t *cachep)
{
	return cachep->array[smp_processor_id()];
}

static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags)
{
	struct cache_sizes *csizep = malloc_sizes;

#if DEBUG
	/* This happens if someone tries to call
	 * kmem_cache_create(), or __kmalloc(), before
	 * the generic caches are initialized.
	 */
	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
#endif
	while (size > csizep->cs_size)
		csizep++;

	/*
	 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
	 * has cs_{dma,}cachep==NULL. Thus no special case
	 * for large kmalloc calls required.
	 */
	if (unlikely(gfpflags & GFP_DMA))
		return csizep->cs_dmacachep;
	return csizep->cs_cachep;
}

kmem_cache_t *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
{
	return __find_general_cachep(size, gfpflags);
}
EXPORT_SYMBOL(kmem_find_general_cachep);

static size_t slab_mgmt_size(size_t nr_objs, size_t align)
{
	return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
}

/* Calculate the number of objects and left-over bytes for a given
   buffer size. */
static void cache_estimate(unsigned long gfporder, size_t buffer_size,
			   size_t align, int flags, size_t *left_over,
			   unsigned int *num)
{
	int nr_objs;
	size_t mgmt_size;
	size_t slab_size = PAGE_SIZE << gfporder;

	/*
	 * The slab management structure can be either off the slab or
	 * on it. For the latter case, the memory allocated for a
	 * slab is used for:
	 *
	 * - The struct slab
	 * - One kmem_bufctl_t for each object
	 * - Padding to respect alignment of @align
	 * - @buffer_size bytes for each object
	 *
	 * If the slab management structure is off the slab, then the
	 * alignment will already be calculated into the size. Because
	 * the slabs are all pages aligned, the objects will be at the
	 * correct alignment when allocated.
	 */
	if (flags & CFLGS_OFF_SLAB) {
		mgmt_size = 0;
		nr_objs = slab_size / buffer_size;

		if (nr_objs > SLAB_LIMIT)
			nr_objs = SLAB_LIMIT;
	} else {
		/*
		 * Ignore padding for the initial guess. The padding
		 * is at most @align-1 bytes, and @buffer_size is at
		 * least @align. In the worst case, this result will
		 * be one greater than the number of objects that fit
		 * into the memory allocation when taking the padding
		 * into account.
		 */
		nr_objs = (slab_size - sizeof(struct slab)) /
			  (buffer_size + sizeof(kmem_bufctl_t));

		/*
		 * This calculated number will be either the right
		 * amount, or one greater than what we want.
		 */
		if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
		       > slab_size)
			nr_objs--;

		if (nr_objs > SLAB_LIMIT)
			nr_objs = SLAB_LIMIT;

		mgmt_size = slab_mgmt_size(nr_objs, align);
	}
	*num = nr_objs;
	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
}
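
/*
 * Editor's note (not part of the original file): a worked example of the
 * on-slab estimate above, assuming a 4096 byte slab, 256 byte objects,
 * 32 byte alignment, a 28 byte struct slab and a 4 byte kmem_bufctl_t
 * (plausible 32-bit values; the exact sizes are arch dependent):
 *
 *   initial guess: (4096 - 28) / (256 + 4)           = 15 objects
 *   management:    ALIGN(28 + 15 * 4, 32)            = 96 bytes
 *   check:         96 + 15 * 256 = 3936 <= 4096      -> 15 objects stand
 *   left over:     4096 - 15 * 256 - 96              = 160 bytes
 *
 * The left-over bytes are what the caller later spends on cache colouring.
 */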

#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)

static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg)
{
	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
	       function, cachep->name, msg);
	dump_stack();
}

/*
 * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
 * via the workqueue/eventd.
 * Add the CPU number into the expiration time to minimize the possibility of
 * the CPUs getting into lockstep and contending for the global cache chain
 * lock.
 */
static void __devinit start_cpu_timer(int cpu)
{
	struct work_struct *reap_work = &per_cpu(reap_work, cpu);

	/*
	 * When this gets called from do_initcalls via cpucache_init(),
	 * init_workqueues() has already run, so keventd will be setup
	 * at that time.
	 */
	if (keventd_up() && reap_work->func == NULL) {
		INIT_WORK(reap_work, cache_reap, NULL);
		schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
	}
}

static struct array_cache *alloc_arraycache(int node, int entries,
					    int batchcount)
{
	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
	struct array_cache *nc = NULL;

	nc = kmalloc_node(memsize, GFP_KERNEL, node);
	if (nc) {
		nc->avail = 0;
		nc->limit = entries;
		nc->batchcount = batchcount;
		nc->touched = 0;
		spin_lock_init(&nc->lock);
	}
	return nc;
}

#ifdef CONFIG_NUMA
static void *__cache_alloc_node(kmem_cache_t *, gfp_t, int);

static struct array_cache **alloc_alien_cache(int node, int limit)
{
	struct array_cache **ac_ptr;
	int memsize = sizeof(void *) * MAX_NUMNODES;
	int i;

	if (limit > 1)
		limit = 12;
	ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
	if (ac_ptr) {
		for_each_node(i) {
			if (i == node || !node_online(i)) {
				ac_ptr[i] = NULL;
				continue;
			}
			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
			if (!ac_ptr[i]) {
				for (i--; i >= 0; i--)
					kfree(ac_ptr[i]);
				kfree(ac_ptr);
				return NULL;
			}
		}
	}
	return ac_ptr;
}

static void free_alien_cache(struct array_cache **ac_ptr)
{
	int i;

	if (!ac_ptr)
		return;

	for_each_node(i)
		kfree(ac_ptr[i]);

	kfree(ac_ptr);
}

static void __drain_alien_cache(kmem_cache_t *cachep,
				struct array_cache *ac, int node)
{
	struct kmem_list3 *rl3 = cachep->nodelists[node];

	if (ac->avail) {
		spin_lock(&rl3->list_lock);
		free_block(cachep, ac->entry, ac->avail, node);
		ac->avail = 0;
		spin_unlock(&rl3->list_lock);
	}
}

static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
{
	int i = 0;
	struct array_cache *ac;
	unsigned long flags;

	for_each_online_node(i) {
		ac = l3->alien[i];
		if (ac) {
			spin_lock_irqsave(&ac->lock, flags);
			__drain_alien_cache(cachep, ac, i);
			spin_unlock_irqrestore(&ac->lock, flags);
		}
	}
}
#else
#define alloc_alien_cache(node, limit) do { } while (0)
#define free_alien_cache(ac_ptr) do { } while (0)
#define drain_alien_cache(cachep, l3) do { } while (0)
#endif

static int __devinit cpuup_callback(struct notifier_block *nfb,
				    unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;
	kmem_cache_t *cachep;
	struct kmem_list3 *l3 = NULL;
	int node = cpu_to_node(cpu);
	int memsize = sizeof(struct kmem_list3);

	switch (action) {
	case CPU_UP_PREPARE:
		mutex_lock(&cache_chain_mutex);
		/* we need to do this right in the beginning since
		 * alloc_arraycache's are going to use this list.
		 * kmalloc_node allows us to add the slab to the right
		 * kmem_list3 and not this cpu's kmem_list3
		 */

		list_for_each_entry(cachep, &cache_chain, next) {
			/* setup the size64 kmemlist for cpu before we can
			 * begin anything. Make sure some other cpu on this
			 * node has not already allocated this
			 */
			if (!cachep->nodelists[node]) {
				if (!(l3 = kmalloc_node(memsize,
							GFP_KERNEL, node)))
					goto bad;
				kmem_list3_init(l3);
				l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
				    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;

				cachep->nodelists[node] = l3;
			}

			spin_lock_irq(&cachep->nodelists[node]->list_lock);
			cachep->nodelists[node]->free_limit =
			    (1 + nr_cpus_node(node)) *
			    cachep->batchcount + cachep->num;
			spin_unlock_irq(&cachep->nodelists[node]->list_lock);
		}

		/* Now we can go ahead with allocating the shared array's
		   & array cache's */
		list_for_each_entry(cachep, &cache_chain, next) {
			struct array_cache *nc;

			nc = alloc_arraycache(node, cachep->limit,
					      cachep->batchcount);
			if (!nc)
				goto bad;
			cachep->array[cpu] = nc;

			l3 = cachep->nodelists[node];
			BUG_ON(!l3);
			if (!l3->shared) {
				if (!(nc = alloc_arraycache(node,
							    cachep->shared *
							    cachep->batchcount,
							    0xbaadf00d)))
					goto bad;

				/* we are serialised from CPU_DEAD or
				   CPU_UP_CANCELLED by the cpucontrol lock */
				l3->shared = nc;
			}
		}
		mutex_unlock(&cache_chain_mutex);
		break;
	case CPU_ONLINE:
		start_cpu_timer(cpu);
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
		/* fall thru */
	case CPU_UP_CANCELED:
		mutex_lock(&cache_chain_mutex);

		list_for_each_entry(cachep, &cache_chain, next) {
			struct array_cache *nc;
			cpumask_t mask;

			mask = node_to_cpumask(node);
			spin_lock_irq(&cachep->spinlock);
			/* cpu is dead; no one can alloc from it. */
			nc = cachep->array[cpu];
			cachep->array[cpu] = NULL;
			l3 = cachep->nodelists[node];

			if (!l3)
				goto unlock_cache;

			spin_lock(&l3->list_lock);

			/* Free limit for this kmem_list3 */
			l3->free_limit -= cachep->batchcount;
			if (nc)
				free_block(cachep, nc->entry, nc->avail, node);

			if (!cpus_empty(mask)) {
				spin_unlock(&l3->list_lock);
				goto unlock_cache;
			}

			if (l3->shared) {
				free_block(cachep, l3->shared->entry,
					   l3->shared->avail, node);
				kfree(l3->shared);
				l3->shared = NULL;
			}
			if (l3->alien) {
				drain_alien_cache(cachep, l3);
				free_alien_cache(l3->alien);
				l3->alien = NULL;
			}

			/* free slabs belonging to this node */
			if (__node_shrink(cachep, node)) {
				cachep->nodelists[node] = NULL;
				spin_unlock(&l3->list_lock);
				kfree(l3);
			} else {
				spin_unlock(&l3->list_lock);
			}
		      unlock_cache:
			spin_unlock_irq(&cachep->spinlock);
			kfree(nc);
		}
		mutex_unlock(&cache_chain_mutex);
		break;
#endif
	}
	return NOTIFY_OK;
      bad:
	mutex_unlock(&cache_chain_mutex);
	return NOTIFY_BAD;
}

static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };

/*
 * swap the static kmem_list3 with kmalloced memory
 */
static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, int nodeid)
{
	struct kmem_list3 *ptr;

	BUG_ON(cachep->nodelists[nodeid] != list);
	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
	BUG_ON(!ptr);

	local_irq_disable();
	memcpy(ptr, list, sizeof(struct kmem_list3));
	MAKE_ALL_LISTS(cachep, ptr, nodeid);
	cachep->nodelists[nodeid] = ptr;
	local_irq_enable();
}

/* Initialisation.
 * Called after the gfp() functions have been enabled, and before smp_init().
 */
void __init kmem_cache_init(void)
{
	size_t left_over;
	struct cache_sizes *sizes;
	struct cache_names *names;
	int i;

	for (i = 0; i < NUM_INIT_LISTS; i++) {
		kmem_list3_init(&initkmem_list3[i]);
		if (i < MAX_NUMNODES)
			cache_cache.nodelists[i] = NULL;
	}

	/*
	 * Fragmentation resistance on low memory - only use bigger
	 * page orders on machines with more than 32MB of memory.
	 */
	if (num_physpages > (32 << 20) >> PAGE_SHIFT)
		slab_break_gfp_order = BREAK_GFP_ORDER_HI;

	/* Bootstrap is tricky, because several objects are allocated
	 * from caches that do not exist yet:
	 * 1) initialize the cache_cache cache: it contains the kmem_cache_t
	 *    structures of all caches, except cache_cache itself: cache_cache
	 *    is statically allocated.
	 *    Initially an __init data area is used for the head array and the
	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
	 *    array at the end of the bootstrap.
	 * 2) Create the first kmalloc cache.
	 *    The kmem_cache_t for the new cache is allocated normally.
	 *    An __init data area is used for the head array.
	 * 3) Create the remaining kmalloc caches, with minimally sized
	 *    head arrays.
	 * 4) Replace the __init data head arrays for cache_cache and the first
	 *    kmalloc cache with kmalloc allocated arrays.
	 * 5) Replace the __init data for kmem_list3 for cache_cache and
	 *    the other cache's with kmalloc allocated memory.
	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
	 */

	/* 1) create the cache_cache */
	INIT_LIST_HEAD(&cache_chain);
	list_add(&cache_cache.next, &cache_chain);
	cache_cache.colour_off = cache_line_size();
	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
	cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];

	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size());

	cache_estimate(0, cache_cache.buffer_size, cache_line_size(), 0,
		       &left_over, &cache_cache.num);
	if (!cache_cache.num)
		BUG();

	cache_cache.colour = left_over / cache_cache.colour_off;
	cache_cache.colour_next = 0;
	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
				      sizeof(struct slab), cache_line_size());

	/* 2+3) create the kmalloc caches */
	sizes = malloc_sizes;
	names = cache_names;

	/* Initialize the caches that provide memory for the array cache
	 * and the kmem_list3 structures first.
	 * Without this, further allocations will bug
	 */

	sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
						      sizes[INDEX_AC].cs_size,
						      ARCH_KMALLOC_MINALIGN,
						      (ARCH_KMALLOC_FLAGS |
						       SLAB_PANIC), NULL, NULL);

	if (INDEX_AC != INDEX_L3)
		sizes[INDEX_L3].cs_cachep =
		    kmem_cache_create(names[INDEX_L3].name,
				      sizes[INDEX_L3].cs_size,
				      ARCH_KMALLOC_MINALIGN,
				      (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL,
				      NULL);

	while (sizes->cs_size != ULONG_MAX) {
		/*
		 * For performance, all the general caches are L1 aligned.
		 * This should be particularly beneficial on SMP boxes, as it
		 * eliminates "false sharing".
		 * Note for systems short on memory removing the alignment will
		 * allow tighter packing of the smaller caches.
		 */
		if (!sizes->cs_cachep)
			sizes->cs_cachep = kmem_cache_create(names->name,
							     sizes->cs_size,
							     ARCH_KMALLOC_MINALIGN,
							     (ARCH_KMALLOC_FLAGS
							      | SLAB_PANIC),
							     NULL, NULL);

		/* Inc off-slab bufctl limit until the ceiling is hit. */
		if (!(OFF_SLAB(sizes->cs_cachep))) {
			offslab_limit = sizes->cs_size - sizeof(struct slab);
			offslab_limit /= sizeof(kmem_bufctl_t);
		}

		sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
							sizes->cs_size,
							ARCH_KMALLOC_MINALIGN,
							(ARCH_KMALLOC_FLAGS |
							 SLAB_CACHE_DMA |
							 SLAB_PANIC), NULL,
							NULL);

		sizes++;
		names++;
	}
	/* 4) Replace the bootstrap head arrays */
	{
		void *ptr;

		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);

		local_irq_disable();
		BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
		memcpy(ptr, ac_data(&cache_cache),
		       sizeof(struct arraycache_init));
		cache_cache.array[smp_processor_id()] = ptr;
		local_irq_enable();

		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);

		local_irq_disable();
		BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep)
		       != &initarray_generic.cache);
		memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep),
		       sizeof(struct arraycache_init));
		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
		    ptr;
		local_irq_enable();
	}
	/* 5) Replace the bootstrap kmem_list3's */
	{
		int node;
		/* Replace the static kmem_list3 structures for the boot cpu */
		init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
			  numa_node_id());

		for_each_online_node(node) {
			init_list(malloc_sizes[INDEX_AC].cs_cachep,
				  &initkmem_list3[SIZE_AC + node], node);

			if (INDEX_AC != INDEX_L3) {
				init_list(malloc_sizes[INDEX_L3].cs_cachep,
					  &initkmem_list3[SIZE_L3 + node],
					  node);
			}
		}
	}

	/* 6) resize the head arrays to their final sizes */
	{
		kmem_cache_t *cachep;
		mutex_lock(&cache_chain_mutex);
		list_for_each_entry(cachep, &cache_chain, next)
		    enable_cpucache(cachep);
		mutex_unlock(&cache_chain_mutex);
	}

	/* Done! */
	g_cpucache_up = FULL;

	/* Register a cpu startup notifier callback
	 * that initializes ac_data for all new cpus
	 */
	register_cpu_notifier(&cpucache_notifier);

	/* The reap timers are started later, with a module init call:
	 * That part of the kernel is not yet operational.
	 */
}

static int __init cpucache_init(void)
{
	int cpu;

	/*
	 * Register the timers that return unneeded
	 * pages to gfp.
	 */
	for_each_online_cpu(cpu)
		start_cpu_timer(cpu);

	return 0;
}

__initcall(cpucache_init);

/*
 * Interface to system's page allocator. No need to hold the cache-lock.
 *
 * If we requested dmaable memory, we will get it. Even if we
 * did not request dmaable memory, we might get it, but that
 * would be relatively rare and ignorable.
 */
static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid)
{
	struct page *page;
	void *addr;
	int i;

	flags |= cachep->gfpflags;
	page = alloc_pages_node(nodeid, flags, cachep->gfporder);
	if (!page)
		return NULL;
	addr = page_address(page);

	i = (1 << cachep->gfporder);
	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
		atomic_add(i, &slab_reclaim_pages);
	add_page_state(nr_slab, i);
	while (i--) {
		SetPageSlab(page);
		page++;
	}
	return addr;
}

/*
 * Interface to system's page release.
 */
static void kmem_freepages(kmem_cache_t *cachep, void *addr)
{
	unsigned long i = (1 << cachep->gfporder);
	struct page *page = virt_to_page(addr);
	const unsigned long nr_freed = i;

	while (i--) {
		if (!TestClearPageSlab(page))
			BUG();
		page++;
	}
	sub_page_state(nr_slab, nr_freed);
	if (current->reclaim_state)
		current->reclaim_state->reclaimed_slab += nr_freed;
	free_pages((unsigned long)addr, cachep->gfporder);
	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
		atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages);
}

static void kmem_rcu_free(struct rcu_head *head)
{
	struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
	kmem_cache_t *cachep = slab_rcu->cachep;

	kmem_freepages(cachep, slab_rcu->addr);
	if (OFF_SLAB(cachep))
		kmem_cache_free(cachep->slabp_cache, slab_rcu);
}

#if DEBUG

#ifdef CONFIG_DEBUG_PAGEALLOC
static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
			    unsigned long caller)
{
	int size = obj_size(cachep);

	addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];

	if (size < 5 * sizeof(unsigned long))
		return;

	*addr++ = 0x12345678;
	*addr++ = caller;
	*addr++ = smp_processor_id();
	size -= 3 * sizeof(unsigned long);
	{
		unsigned long *sptr = &caller;
		unsigned long svalue;

		while (!kstack_end(sptr)) {
			svalue = *sptr++;
			if (kernel_text_address(svalue)) {
				*addr++ = svalue;
				size -= sizeof(unsigned long);
				if (size <= sizeof(unsigned long))
					break;
			}
		}

	}
	*addr++ = 0x87654321;
}
#endif

static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
{
	int size = obj_size(cachep);
	addr = &((char *)addr)[obj_offset(cachep)];

	memset(addr, val, size);
	*(unsigned char *)(addr + size - 1) = POISON_END;
}

static void dump_line(char *data, int offset, int limit)
{
	int i;
	printk(KERN_ERR "%03x:", offset);
	for (i = 0; i < limit; i++) {
		printk(" %02x", (unsigned char)data[offset + i]);
	}
	printk("\n");
}
#endif

#if DEBUG

static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines)
{
	int i, size;
	char *realobj;

	if (cachep->flags & SLAB_RED_ZONE) {
		printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
		       *dbg_redzone1(cachep, objp),
		       *dbg_redzone2(cachep, objp));
	}

	if (cachep->flags & SLAB_STORE_USER) {
		printk(KERN_ERR "Last user: [<%p>]",
		       *dbg_userword(cachep, objp));
		print_symbol("(%s)",
			     (unsigned long)*dbg_userword(cachep, objp));
		printk("\n");
	}
	realobj = (char *)objp + obj_offset(cachep);
	size = obj_size(cachep);
	for (i = 0; i < size && lines; i += 16, lines--) {
		int limit;
		limit = 16;
		if (i + limit > size)
			limit = size - i;
		dump_line(realobj, i, limit);
	}
}

static void check_poison_obj(kmem_cache_t *cachep, void *objp)
{
	char *realobj;
	int size, i;
	int lines = 0;

	realobj = (char *)objp + obj_offset(cachep);
	size = obj_size(cachep);

	for (i = 0; i < size; i++) {
		char exp = POISON_FREE;
		if (i == size - 1)
			exp = POISON_END;
		if (realobj[i] != exp) {
			int limit;
			/* Mismatch ! */
			/* Print header */
			if (lines == 0) {
				printk(KERN_ERR
				       "Slab corruption: start=%p, len=%d\n",
				       realobj, size);
				print_objinfo(cachep, objp, 0);
			}
			/* Hexdump the affected line */
			i = (i / 16) * 16;
			limit = 16;
			if (i + limit > size)
				limit = size - i;
			dump_line(realobj, i, limit);
			i += 16;
			lines++;
			/* Limit to 5 lines */
			if (lines > 5)
				break;
		}
	}
	if (lines != 0) {
		/* Print some data about the neighboring objects, if they
		 * exist:
		 */
		struct slab *slabp = virt_to_slab(objp);
		int objnr;

		objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
		if (objnr) {
			objp = slabp->s_mem + (objnr - 1) * cachep->buffer_size;
			realobj = (char *)objp + obj_offset(cachep);
			printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
			       realobj, size);
			print_objinfo(cachep, objp, 2);
		}
		if (objnr + 1 < cachep->num) {
			objp = slabp->s_mem + (objnr + 1) * cachep->buffer_size;
			realobj = (char *)objp + obj_offset(cachep);
			printk(KERN_ERR "Next obj: start=%p, len=%d\n",
			       realobj, size);
			print_objinfo(cachep, objp, 2);
		}
	}
}
#endif

#if DEBUG
/**
 * slab_destroy_objs - call the registered destructor for each object in
 *      a slab that is to be destroyed.
 */
static void slab_destroy_objs(kmem_cache_t *cachep, struct slab *slabp)
{
	int i;
	for (i = 0; i < cachep->num; i++) {
		void *objp = slabp->s_mem + cachep->buffer_size * i;

		if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
			if ((cachep->buffer_size % PAGE_SIZE) == 0
			    && OFF_SLAB(cachep))
				kernel_map_pages(virt_to_page(objp),
						 cachep->buffer_size / PAGE_SIZE,
						 1);
			else
				check_poison_obj(cachep, objp);
#else
			check_poison_obj(cachep, objp);
#endif
		}
		if (cachep->flags & SLAB_RED_ZONE) {
			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
				slab_error(cachep, "start of a freed object "
					   "was overwritten");
			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
				slab_error(cachep, "end of a freed object "
					   "was overwritten");
		}
		if (cachep->dtor && !(cachep->flags & SLAB_POISON))
			(cachep->dtor) (objp + obj_offset(cachep), cachep, 0);
	}
}
#else
static void slab_destroy_objs(kmem_cache_t *cachep, struct slab *slabp)
{
	if (cachep->dtor) {
		int i;
		for (i = 0; i < cachep->num; i++) {
			void *objp = slabp->s_mem + cachep->buffer_size * i;
			(cachep->dtor) (objp, cachep, 0);
		}
	}
}
#endif

/**
 * Destroy all the objs in a slab, and release the mem back to the system.
 * Before calling the slab must have been unlinked from the cache.
 * The cache-lock is not held/needed.
 */
static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp)
{
	void *addr = slabp->s_mem - slabp->colouroff;

	slab_destroy_objs(cachep, slabp);
	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
		struct slab_rcu *slab_rcu;

		slab_rcu = (struct slab_rcu *)slabp;
		slab_rcu->cachep = cachep;
		slab_rcu->addr = addr;
		call_rcu(&slab_rcu->head, kmem_rcu_free);
	} else {
		kmem_freepages(cachep, addr);
		if (OFF_SLAB(cachep))
			kmem_cache_free(cachep->slabp_cache, slabp);
	}
}

/* For setting up all the kmem_list3s for a cache whose buffer_size is the same
   as the size of kmem_list3. */
static void set_up_list3s(kmem_cache_t *cachep, int index)
{
	int node;

	for_each_online_node(node) {
		cachep->nodelists[node] = &initkmem_list3[index + node];
		cachep->nodelists[node]->next_reap = jiffies +
		    REAPTIMEOUT_LIST3 +
		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
	}
}

/**
 * calculate_slab_order - calculate size (page order) of slabs and the number
 *      of objects per slab.
 *
 * This could be made much more intelligent.  For now, try to avoid using
 * high order pages for slabs.  When the gfp() functions are more friendly
 * towards high-order requests, this should be changed.
 */
static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size,
					  size_t align, gfp_t flags)
{
	size_t left_over = 0;

	for (;; cachep->gfporder++) {
		unsigned int num;
		size_t remainder;

		if (cachep->gfporder > MAX_GFP_ORDER) {
			cachep->num = 0;
			break;
		}

		cache_estimate(cachep->gfporder, size, align, flags,
			       &remainder, &num);
		if (!num)
			continue;
		/* More than offslab_limit objects will cause problems */
		if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit)
			break;

		cachep->num = num;
		left_over = remainder;

		/*
		 * Large number of objects is good, but very large slabs are
		 * currently bad for the gfp()s.
		 */
		if (cachep->gfporder >= slab_break_gfp_order)
			break;

		if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder))
			/* Acceptable internal fragmentation */
			break;
	}
	return left_over;
}

/**
 * kmem_cache_create - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @ctor: A constructor for the objects.
 * @dtor: A destructor for the objects.
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within an interrupt, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache
 * and the @dtor is run before the pages are handed back.
 *
 * @name must be valid until the cache is destroyed. This implies that
 * the module calling this has to destroy the cache before getting
 * unloaded.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
 * memory pressure.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 */
kmem_cache_t *
kmem_cache_create (const char *name, size_t size, size_t align,
	unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
	void (*dtor)(void*, kmem_cache_t *, unsigned long))
{
	size_t left_over, slab_size, ralign;
	kmem_cache_t *cachep = NULL;
	struct list_head *p;

	/*
	 * Sanity checks... these are all serious usage bugs.
	 */
	if ((!name) ||
	    in_interrupt() ||
	    (size < BYTES_PER_WORD) ||
	    (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
		printk(KERN_ERR "%s: Early error in slab %s\n",
		       __FUNCTION__, name);
		BUG();
	}

	mutex_lock(&cache_chain_mutex);

	list_for_each(p, &cache_chain) {
		kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
		mm_segment_t old_fs = get_fs();
		char tmp;
		int res;

		/*
		 * This happens when the module gets unloaded and doesn't
		 * destroy its slab cache and no-one else reuses the vmalloc
		 * area of the module. Print a warning.
		 */
		set_fs(KERNEL_DS);
		res = __get_user(tmp, pc->name);
		set_fs(old_fs);
		if (res) {
			printk("SLAB: cache with size %d has lost its name\n",
			       pc->buffer_size);
			continue;
		}

		if (!strcmp(pc->name, name)) {
			printk("kmem_cache_create: duplicate cache %s\n", name);
			dump_stack();
			goto oops;
		}
	}

#if DEBUG
	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
	if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
		/* No constructor, but initial state check requested */
		printk(KERN_ERR "%s: No con, but init state check "
		       "requested - %s\n", __FUNCTION__, name);
		flags &= ~SLAB_DEBUG_INITIAL;
	}
#if FORCED_DEBUG
	/*
	 * Enable redzoning and last user accounting, except for caches with
	 * large objects, if the increased size would increase the object size
	 * above the next power of two: caches with object sizes just above a
	 * power of two have a significant amount of internal fragmentation.
	 */
	if ((size < 4096
	     || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD)))
		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
	if (!(flags & SLAB_DESTROY_BY_RCU))
		flags |= SLAB_POISON;
#endif
	if (flags & SLAB_DESTROY_BY_RCU)
		BUG_ON(flags & SLAB_POISON);
#endif
	if (flags & SLAB_DESTROY_BY_RCU)
		BUG_ON(dtor);

	/*
	 * Always checks flags, a caller might be expecting debug
	 * support which isn't available.
	 */
	if (flags & ~CREATE_MASK)
		BUG();

	/* Check that size is in terms of words.  This is needed to avoid
	 * unaligned accesses for some archs when redzoning is used, and makes
	 * sure any on-slab bufctl's are also correctly aligned.
	 */
	if (size & (BYTES_PER_WORD - 1)) {
		size += (BYTES_PER_WORD - 1);
		size &= ~(BYTES_PER_WORD - 1);
	}

	/* calculate out the final buffer alignment: */
	/* 1) arch recommendation: can be overridden for debug */
	if (flags & SLAB_HWCACHE_ALIGN) {
		/* Default alignment: as specified by the arch code.
		 * Except if an object is really small, then squeeze multiple
		 * objects into one cacheline.
		 */
		ralign = cache_line_size();
		while (size <= ralign / 2)
			ralign /= 2;
	} else {
		ralign = BYTES_PER_WORD;
	}
	/* 2) arch mandated alignment: disables debug if necessary */
	if (ralign < ARCH_SLAB_MINALIGN) {
		ralign = ARCH_SLAB_MINALIGN;
		if (ralign > BYTES_PER_WORD)
			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
	}
	/* 3) caller mandated alignment: disables debug if necessary */
	if (ralign < align) {
		ralign = align;
		if (ralign > BYTES_PER_WORD)
			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
	}
	/* 4) Store it. Note that the debug code below can reduce
	 *    the alignment to BYTES_PER_WORD.
	 */
	align = ralign;

	/* Get cache's description obj. */
	cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
	if (!cachep)
		goto oops;
	memset(cachep, 0, sizeof(kmem_cache_t));

#if DEBUG
	cachep->obj_size = size;

	if (flags & SLAB_RED_ZONE) {
		/* redzoning only works with word aligned caches */
		align = BYTES_PER_WORD;

		/* add space for red zone words */
		cachep->obj_offset += BYTES_PER_WORD;
		size += 2 * BYTES_PER_WORD;
	}
	if (flags & SLAB_STORE_USER) {
		/* user store requires word alignment and
		 * one word storage behind the end of the real
		 * object.
		 */
		align = BYTES_PER_WORD;
		size += BYTES_PER_WORD;
	}
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
	if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
	    && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
		cachep->obj_offset += PAGE_SIZE - size;
		size = PAGE_SIZE;
	}
#endif
#endif

	/* Determine if the slab management is 'on' or 'off' slab. */
	if (size >= (PAGE_SIZE >> 3))
		/*
		 * Size is large, assume best to place the slab management obj
		 * off-slab (should allow better packing of objs).
		 */
		flags |= CFLGS_OFF_SLAB;

	size = ALIGN(size, align);

	if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) {
		/*
		 * A VFS-reclaimable slab tends to have most allocations
		 * as GFP_NOFS and we really don't want to have to be allocating
		 * higher-order pages when we are unable to shrink dcache.
		 */
		cachep->gfporder = 0;
		cache_estimate(cachep->gfporder, size, align, flags,
			       &left_over, &cachep->num);
	} else
		left_over = calculate_slab_order(cachep, size, align, flags);

	if (!cachep->num) {
		printk("kmem_cache_create: couldn't create cache %s.\n", name);
		kmem_cache_free(&cache_cache, cachep);
		cachep = NULL;
		goto oops;
	}
	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
			  + sizeof(struct slab), align);

	/*
	 * If the slab has been placed off-slab, and we have enough space then
	 * move it on-slab. This is at the expense of any extra colouring.
	 */
	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
		flags &= ~CFLGS_OFF_SLAB;
		left_over -= slab_size;
	}

	if (flags & CFLGS_OFF_SLAB) {
		/* really off slab. No need for manual alignment */
		slab_size =
		    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
	}

	cachep->colour_off = cache_line_size();
	/* Offset must be a multiple of the alignment. */
*/ 1846 if (cachep->colour_off < align) 1847 cachep->colour_off = align; 1848 cachep->colour = left_over / cachep->colour_off; 1849 cachep->slab_size = slab_size; 1850 cachep->flags = flags; 1851 cachep->gfpflags = 0; 1852 if (flags & SLAB_CACHE_DMA) 1853 cachep->gfpflags |= GFP_DMA; 1854 spin_lock_init(&cachep->spinlock); 1855 cachep->buffer_size = size; 1856 1857 if (flags & CFLGS_OFF_SLAB) 1858 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); 1859 cachep->ctor = ctor; 1860 cachep->dtor = dtor; 1861 cachep->name = name; 1862 1863 /* Don't let CPUs to come and go */ 1864 lock_cpu_hotplug(); 1865 1866 if (g_cpucache_up == FULL) { 1867 enable_cpucache(cachep); 1868 } else { 1869 if (g_cpucache_up == NONE) { 1870 /* Note: the first kmem_cache_create must create 1871 * the cache that's used by kmalloc(24), otherwise 1872 * the creation of further caches will BUG(). 1873 */ 1874 cachep->array[smp_processor_id()] = 1875 &initarray_generic.cache; 1876 1877 /* If the cache that's used by 1878 * kmalloc(sizeof(kmem_list3)) is the first cache, 1879 * then we need to set up all its list3s, otherwise 1880 * the creation of further caches will BUG(). 1881 */ 1882 set_up_list3s(cachep, SIZE_AC); 1883 if (INDEX_AC == INDEX_L3) 1884 g_cpucache_up = PARTIAL_L3; 1885 else 1886 g_cpucache_up = PARTIAL_AC; 1887 } else { 1888 cachep->array[smp_processor_id()] = 1889 kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1890 1891 if (g_cpucache_up == PARTIAL_AC) { 1892 set_up_list3s(cachep, SIZE_L3); 1893 g_cpucache_up = PARTIAL_L3; 1894 } else { 1895 int node; 1896 for_each_online_node(node) { 1897 1898 cachep->nodelists[node] = 1899 kmalloc_node(sizeof 1900 (struct kmem_list3), 1901 GFP_KERNEL, node); 1902 BUG_ON(!cachep->nodelists[node]); 1903 kmem_list3_init(cachep-> 1904 nodelists[node]); 1905 } 1906 } 1907 } 1908 cachep->nodelists[numa_node_id()]->next_reap = 1909 jiffies + REAPTIMEOUT_LIST3 + 1910 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 1911 1912 BUG_ON(!ac_data(cachep)); 1913 ac_data(cachep)->avail = 0; 1914 ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES; 1915 ac_data(cachep)->batchcount = 1; 1916 ac_data(cachep)->touched = 0; 1917 cachep->batchcount = 1; 1918 cachep->limit = BOOT_CPUCACHE_ENTRIES; 1919 } 1920 1921 /* cache setup completed, link it into the list */ 1922 list_add(&cachep->next, &cache_chain); 1923 unlock_cpu_hotplug(); 1924 oops: 1925 if (!cachep && (flags & SLAB_PANIC)) 1926 panic("kmem_cache_create(): failed to create slab `%s'\n", 1927 name); 1928 mutex_unlock(&cache_chain_mutex); 1929 return cachep; 1930} 1931EXPORT_SYMBOL(kmem_cache_create); 1932 1933#if DEBUG 1934static void check_irq_off(void) 1935{ 1936 BUG_ON(!irqs_disabled()); 1937} 1938 1939static void check_irq_on(void) 1940{ 1941 BUG_ON(irqs_disabled()); 1942} 1943 1944static void check_spinlock_acquired(kmem_cache_t *cachep) 1945{ 1946#ifdef CONFIG_SMP 1947 check_irq_off(); 1948 assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock); 1949#endif 1950} 1951 1952static void check_spinlock_acquired_node(kmem_cache_t *cachep, int node) 1953{ 1954#ifdef CONFIG_SMP 1955 check_irq_off(); 1956 assert_spin_locked(&cachep->nodelists[node]->list_lock); 1957#endif 1958} 1959 1960#else 1961#define check_irq_off() do { } while(0) 1962#define check_irq_on() do { } while(0) 1963#define check_spinlock_acquired(x) do { } while(0) 1964#define check_spinlock_acquired_node(x, y) do { } while(0) 1965#endif 1966 1967/* 1968 * Waits for all CPUs to execute func(). 
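 *
 * The calling CPU runs func() directly with local interrupts disabled;
 * all other CPUs are reached via smp_call_function(). For example,
 * drain_cpu_caches() below uses this to run do_drain() on every CPU
 * and empty its per-cpu array cache.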
1969 */ 1970static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg) 1971{ 1972 check_irq_on(); 1973 preempt_disable(); 1974 1975 local_irq_disable(); 1976 func(arg); 1977 local_irq_enable(); 1978 1979 if (smp_call_function(func, arg, 1, 1)) 1980 BUG(); 1981 1982 preempt_enable(); 1983} 1984 1985static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac, 1986 int force, int node); 1987 1988static void do_drain(void *arg) 1989{ 1990 kmem_cache_t *cachep = (kmem_cache_t *) arg; 1991 struct array_cache *ac; 1992 int node = numa_node_id(); 1993 1994 check_irq_off(); 1995 ac = ac_data(cachep); 1996 spin_lock(&cachep->nodelists[node]->list_lock); 1997 free_block(cachep, ac->entry, ac->avail, node); 1998 spin_unlock(&cachep->nodelists[node]->list_lock); 1999 ac->avail = 0; 2000} 2001 2002static void drain_cpu_caches(kmem_cache_t *cachep) 2003{ 2004 struct kmem_list3 *l3; 2005 int node; 2006 2007 smp_call_function_all_cpus(do_drain, cachep); 2008 check_irq_on(); 2009 spin_lock_irq(&cachep->spinlock); 2010 for_each_online_node(node) { 2011 l3 = cachep->nodelists[node]; 2012 if (l3) { 2013 spin_lock(&l3->list_lock); 2014 drain_array_locked(cachep, l3->shared, 1, node); 2015 spin_unlock(&l3->list_lock); 2016 if (l3->alien) 2017 drain_alien_cache(cachep, l3); 2018 } 2019 } 2020 spin_unlock_irq(&cachep->spinlock); 2021} 2022 2023static int __node_shrink(kmem_cache_t *cachep, int node) 2024{ 2025 struct slab *slabp; 2026 struct kmem_list3 *l3 = cachep->nodelists[node]; 2027 int ret; 2028 2029 for (;;) { 2030 struct list_head *p; 2031 2032 p = l3->slabs_free.prev; 2033 if (p == &l3->slabs_free) 2034 break; 2035 2036 slabp = list_entry(l3->slabs_free.prev, struct slab, list); 2037#if DEBUG 2038 if (slabp->inuse) 2039 BUG(); 2040#endif 2041 list_del(&slabp->list); 2042 2043 l3->free_objects -= cachep->num; 2044 spin_unlock_irq(&l3->list_lock); 2045 slab_destroy(cachep, slabp); 2046 spin_lock_irq(&l3->list_lock); 2047 } 2048 ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial); 2049 return ret; 2050} 2051 2052static int __cache_shrink(kmem_cache_t *cachep) 2053{ 2054 int ret = 0, i = 0; 2055 struct kmem_list3 *l3; 2056 2057 drain_cpu_caches(cachep); 2058 2059 check_irq_on(); 2060 for_each_online_node(i) { 2061 l3 = cachep->nodelists[i]; 2062 if (l3) { 2063 spin_lock_irq(&l3->list_lock); 2064 ret += __node_shrink(cachep, i); 2065 spin_unlock_irq(&l3->list_lock); 2066 } 2067 } 2068 return (ret ? 1 : 0); 2069} 2070 2071/** 2072 * kmem_cache_shrink - Shrink a cache. 2073 * @cachep: The cache to shrink. 2074 * 2075 * Releases as many slabs as possible for a cache. 2076 * To help debugging, a zero exit status indicates all slabs were released. 2077 */ 2078int kmem_cache_shrink(kmem_cache_t *cachep) 2079{ 2080 if (!cachep || in_interrupt()) 2081 BUG(); 2082 2083 return __cache_shrink(cachep); 2084} 2085EXPORT_SYMBOL(kmem_cache_shrink); 2086 2087/** 2088 * kmem_cache_destroy - delete a cache 2089 * @cachep: the cache to destroy 2090 * 2091 * Remove a kmem_cache_t object from the slab cache. 2092 * Returns 0 on success. 2093 * 2094 * It is expected this function will be called by a module when it is 2095 * unloaded. This will remove the cache completely, and avoid a duplicate 2096 * cache being allocated each time a module is loaded and unloaded, if the 2097 * module doesn't have persistent in-kernel storage across loads and unloads. 2098 * 2099 * The cache must be empty before calling this function. 
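 *
 * Example (illustrative sketch; foo_cache and foo_exit are hypothetical
 * names for a module's cache and exit routine):
 *
 *	static void __exit foo_exit(void)
 *	{
 *		if (kmem_cache_destroy(foo_cache))
 *			printk(KERN_ERR "foo: objects still in use\n");
 *	}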
2100 * 2101 * The caller must guarantee that noone will allocate memory from the cache 2102 * during the kmem_cache_destroy(). 2103 */ 2104int kmem_cache_destroy(kmem_cache_t *cachep) 2105{ 2106 int i; 2107 struct kmem_list3 *l3; 2108 2109 if (!cachep || in_interrupt()) 2110 BUG(); 2111 2112 /* Don't let CPUs to come and go */ 2113 lock_cpu_hotplug(); 2114 2115 /* Find the cache in the chain of caches. */ 2116 mutex_lock(&cache_chain_mutex); 2117 /* 2118 * the chain is never empty, cache_cache is never destroyed 2119 */ 2120 list_del(&cachep->next); 2121 mutex_unlock(&cache_chain_mutex); 2122 2123 if (__cache_shrink(cachep)) { 2124 slab_error(cachep, "Can't free all objects"); 2125 mutex_lock(&cache_chain_mutex); 2126 list_add(&cachep->next, &cache_chain); 2127 mutex_unlock(&cache_chain_mutex); 2128 unlock_cpu_hotplug(); 2129 return 1; 2130 } 2131 2132 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 2133 synchronize_rcu(); 2134 2135 for_each_online_cpu(i) 2136 kfree(cachep->array[i]); 2137 2138 /* NUMA: free the list3 structures */ 2139 for_each_online_node(i) { 2140 if ((l3 = cachep->nodelists[i])) { 2141 kfree(l3->shared); 2142 free_alien_cache(l3->alien); 2143 kfree(l3); 2144 } 2145 } 2146 kmem_cache_free(&cache_cache, cachep); 2147 2148 unlock_cpu_hotplug(); 2149 2150 return 0; 2151} 2152EXPORT_SYMBOL(kmem_cache_destroy); 2153 2154/* Get the memory for a slab management obj. */ 2155static struct slab *alloc_slabmgmt(kmem_cache_t *cachep, void *objp, 2156 int colour_off, gfp_t local_flags) 2157{ 2158 struct slab *slabp; 2159 2160 if (OFF_SLAB(cachep)) { 2161 /* Slab management obj is off-slab. */ 2162 slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags); 2163 if (!slabp) 2164 return NULL; 2165 } else { 2166 slabp = objp + colour_off; 2167 colour_off += cachep->slab_size; 2168 } 2169 slabp->inuse = 0; 2170 slabp->colouroff = colour_off; 2171 slabp->s_mem = objp + colour_off; 2172 2173 return slabp; 2174} 2175 2176static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) 2177{ 2178 return (kmem_bufctl_t *) (slabp + 1); 2179} 2180 2181static void cache_init_objs(kmem_cache_t *cachep, 2182 struct slab *slabp, unsigned long ctor_flags) 2183{ 2184 int i; 2185 2186 for (i = 0; i < cachep->num; i++) { 2187 void *objp = slabp->s_mem + cachep->buffer_size * i; 2188#if DEBUG 2189 /* need to poison the objs? */ 2190 if (cachep->flags & SLAB_POISON) 2191 poison_obj(cachep, objp, POISON_FREE); 2192 if (cachep->flags & SLAB_STORE_USER) 2193 *dbg_userword(cachep, objp) = NULL; 2194 2195 if (cachep->flags & SLAB_RED_ZONE) { 2196 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2197 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2198 } 2199 /* 2200 * Constructors are not allowed to allocate memory from 2201 * the same cache which they are a constructor for. 2202 * Otherwise, deadlock. They must also be threaded. 
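		 *
		 * (Illustrative: a constructor has the form
		 *	void foo_ctor(void *objp, kmem_cache_t *cachep,
		 *		      unsigned long flags);
		 * where foo_ctor is a hypothetical name; it must not call
		 * kmem_cache_alloc() on the cache it constructs for.)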
2203 */ 2204 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2205 cachep->ctor(objp + obj_offset(cachep), cachep, 2206 ctor_flags); 2207 2208 if (cachep->flags & SLAB_RED_ZONE) { 2209 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2210 slab_error(cachep, "constructor overwrote the" 2211 " end of an object"); 2212 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 2213 slab_error(cachep, "constructor overwrote the" 2214 " start of an object"); 2215 } 2216 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep) 2217 && cachep->flags & SLAB_POISON) 2218 kernel_map_pages(virt_to_page(objp), 2219 cachep->buffer_size / PAGE_SIZE, 0); 2220#else 2221 if (cachep->ctor) 2222 cachep->ctor(objp, cachep, ctor_flags); 2223#endif 2224 slab_bufctl(slabp)[i] = i + 1; 2225 } 2226 slab_bufctl(slabp)[i - 1] = BUFCTL_END; 2227 slabp->free = 0; 2228} 2229 2230static void kmem_flagcheck(kmem_cache_t *cachep, gfp_t flags) 2231{ 2232 if (flags & SLAB_DMA) { 2233 if (!(cachep->gfpflags & GFP_DMA)) 2234 BUG(); 2235 } else { 2236 if (cachep->gfpflags & GFP_DMA) 2237 BUG(); 2238 } 2239} 2240 2241static void *slab_get_obj(kmem_cache_t *cachep, struct slab *slabp, int nodeid) 2242{ 2243 void *objp = slabp->s_mem + (slabp->free * cachep->buffer_size); 2244 kmem_bufctl_t next; 2245 2246 slabp->inuse++; 2247 next = slab_bufctl(slabp)[slabp->free]; 2248#if DEBUG 2249 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2250 WARN_ON(slabp->nodeid != nodeid); 2251#endif 2252 slabp->free = next; 2253 2254 return objp; 2255} 2256 2257static void slab_put_obj(kmem_cache_t *cachep, struct slab *slabp, void *objp, 2258 int nodeid) 2259{ 2260 unsigned int objnr = (unsigned)(objp-slabp->s_mem) / cachep->buffer_size; 2261 2262#if DEBUG 2263 /* Verify that the slab belongs to the intended node */ 2264 WARN_ON(slabp->nodeid != nodeid); 2265 2266 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { 2267 printk(KERN_ERR "slab: double free detected in cache " 2268 "'%s', objp %p\n", cachep->name, objp); 2269 BUG(); 2270 } 2271#endif 2272 slab_bufctl(slabp)[objnr] = slabp->free; 2273 slabp->free = objnr; 2274 slabp->inuse--; 2275} 2276 2277static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp) 2278{ 2279 int i; 2280 struct page *page; 2281 2282 /* Nasty!!!!!! I hope this is OK. */ 2283 i = 1 << cachep->gfporder; 2284 page = virt_to_page(objp); 2285 do { 2286 page_set_cache(page, cachep); 2287 page_set_slab(page, slabp); 2288 page++; 2289 } while (--i); 2290} 2291 2292/* 2293 * Grow (by 1) the number of slabs within a cache. This is called by 2294 * kmem_cache_alloc() when there are no active objs left in a cache. 2295 */ 2296static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2297{ 2298 struct slab *slabp; 2299 void *objp; 2300 size_t offset; 2301 gfp_t local_flags; 2302 unsigned long ctor_flags; 2303 struct kmem_list3 *l3; 2304 2305 /* Be lazy and only check for valid flags here, 2306 * keeping it out of the critical path in kmem_cache_alloc(). 2307 */ 2308 if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)) 2309 BUG(); 2310 if (flags & SLAB_NO_GROW) 2311 return 0; 2312 2313 ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2314 local_flags = (flags & SLAB_LEVEL_MASK); 2315 if (!(local_flags & __GFP_WAIT)) 2316 /* 2317 * Not allowed to sleep. Need to tell a constructor about 2318 * this - it might need to know... 2319 */ 2320 ctor_flags |= SLAB_CTOR_ATOMIC; 2321 2322 /* About to mess with non-constant members - lock. 
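	 *
	 * (The colour offset for this slab is colour_next * colour_off,
	 * sampled under cachep->spinlock; colour_next then advances and
	 * wraps back to 0 once it reaches cachep->colour.)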
*/ 2323 check_irq_off(); 2324 spin_lock(&cachep->spinlock); 2325 2326 /* Get colour for the slab, and cal the next value. */ 2327 offset = cachep->colour_next; 2328 cachep->colour_next++; 2329 if (cachep->colour_next >= cachep->colour) 2330 cachep->colour_next = 0; 2331 offset *= cachep->colour_off; 2332 2333 spin_unlock(&cachep->spinlock); 2334 2335 check_irq_off(); 2336 if (local_flags & __GFP_WAIT) 2337 local_irq_enable(); 2338 2339 /* 2340 * The test for missing atomic flag is performed here, rather than 2341 * the more obvious place, simply to reduce the critical path length 2342 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they 2343 * will eventually be caught here (where it matters). 2344 */ 2345 kmem_flagcheck(cachep, flags); 2346 2347 /* Get mem for the objs. 2348 * Attempt to allocate a physical page from 'nodeid', 2349 */ 2350 if (!(objp = kmem_getpages(cachep, flags, nodeid))) 2351 goto failed; 2352 2353 /* Get slab management. */ 2354 if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags))) 2355 goto opps1; 2356 2357 slabp->nodeid = nodeid; 2358 set_slab_attr(cachep, slabp, objp); 2359 2360 cache_init_objs(cachep, slabp, ctor_flags); 2361 2362 if (local_flags & __GFP_WAIT) 2363 local_irq_disable(); 2364 check_irq_off(); 2365 l3 = cachep->nodelists[nodeid]; 2366 spin_lock(&l3->list_lock); 2367 2368 /* Make slab active. */ 2369 list_add_tail(&slabp->list, &(l3->slabs_free)); 2370 STATS_INC_GROWN(cachep); 2371 l3->free_objects += cachep->num; 2372 spin_unlock(&l3->list_lock); 2373 return 1; 2374 opps1: 2375 kmem_freepages(cachep, objp); 2376 failed: 2377 if (local_flags & __GFP_WAIT) 2378 local_irq_disable(); 2379 return 0; 2380} 2381 2382#if DEBUG 2383 2384/* 2385 * Perform extra freeing checks: 2386 * - detect bad pointers. 
2387 * - POISON/RED_ZONE checking 2388 * - destructor calls, for caches with POISON+dtor 2389 */ 2390static void kfree_debugcheck(const void *objp) 2391{ 2392 struct page *page; 2393 2394 if (!virt_addr_valid(objp)) { 2395 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", 2396 (unsigned long)objp); 2397 BUG(); 2398 } 2399 page = virt_to_page(objp); 2400 if (!PageSlab(page)) { 2401 printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", 2402 (unsigned long)objp); 2403 BUG(); 2404 } 2405} 2406 2407static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, 2408 void *caller) 2409{ 2410 struct page *page; 2411 unsigned int objnr; 2412 struct slab *slabp; 2413 2414 objp -= obj_offset(cachep); 2415 kfree_debugcheck(objp); 2416 page = virt_to_page(objp); 2417 2418 if (page_get_cache(page) != cachep) { 2419 printk(KERN_ERR 2420 "mismatch in kmem_cache_free: expected cache %p, got %p\n", 2421 page_get_cache(page), cachep); 2422 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); 2423 printk(KERN_ERR "%p is %s.\n", page_get_cache(page), 2424 page_get_cache(page)->name); 2425 WARN_ON(1); 2426 } 2427 slabp = page_get_slab(page); 2428 2429 if (cachep->flags & SLAB_RED_ZONE) { 2430 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE 2431 || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { 2432 slab_error(cachep, 2433 "double free, or memory outside" 2434 " object was overwritten"); 2435 printk(KERN_ERR 2436 "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 2437 objp, *dbg_redzone1(cachep, objp), 2438 *dbg_redzone2(cachep, objp)); 2439 } 2440 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2441 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2442 } 2443 if (cachep->flags & SLAB_STORE_USER) 2444 *dbg_userword(cachep, objp) = caller; 2445 2446 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; 2447 2448 BUG_ON(objnr >= cachep->num); 2449 BUG_ON(objp != slabp->s_mem + objnr * cachep->buffer_size); 2450 2451 if (cachep->flags & SLAB_DEBUG_INITIAL) { 2452 /* Need to call the slab's constructor so the 2453 * caller can perform a verify of its state (debugging). 2454 * Called without the cache-lock held. 2455 */ 2456 cachep->ctor(objp + obj_offset(cachep), 2457 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); 2458 } 2459 if (cachep->flags & SLAB_POISON && cachep->dtor) { 2460 /* we want to cache poison the object, 2461 * call the destruction callback 2462 */ 2463 cachep->dtor(objp + obj_offset(cachep), cachep, 0); 2464 } 2465 if (cachep->flags & SLAB_POISON) { 2466#ifdef CONFIG_DEBUG_PAGEALLOC 2467 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { 2468 store_stackinfo(cachep, objp, (unsigned long)caller); 2469 kernel_map_pages(virt_to_page(objp), 2470 cachep->buffer_size / PAGE_SIZE, 0); 2471 } else { 2472 poison_obj(cachep, objp, POISON_FREE); 2473 } 2474#else 2475 poison_obj(cachep, objp, POISON_FREE); 2476#endif 2477 } 2478 return objp; 2479} 2480 2481static void check_slabp(kmem_cache_t *cachep, struct slab *slabp) 2482{ 2483 kmem_bufctl_t i; 2484 int entries = 0; 2485 2486 /* Check slab's freelist to see if this obj is there. */ 2487 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { 2488 entries++; 2489 if (entries > cachep->num || i >= cachep->num) 2490 goto bad; 2491 } 2492 if (entries != cachep->num - slabp->inuse) { 2493 bad: 2494 printk(KERN_ERR 2495 "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). 
Hexdump:\n", 2496 cachep->name, cachep->num, slabp, slabp->inuse); 2497 for (i = 0; 2498 i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t); 2499 i++) { 2500 if ((i % 16) == 0) 2501 printk("\n%03x:", i); 2502 printk(" %02x", ((unsigned char *)slabp)[i]); 2503 } 2504 printk("\n"); 2505 BUG(); 2506 } 2507} 2508#else 2509#define kfree_debugcheck(x) do { } while(0) 2510#define cache_free_debugcheck(x,objp,z) (objp) 2511#define check_slabp(x,y) do { } while(0) 2512#endif 2513 2514static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags) 2515{ 2516 int batchcount; 2517 struct kmem_list3 *l3; 2518 struct array_cache *ac; 2519 2520 check_irq_off(); 2521 ac = ac_data(cachep); 2522 retry: 2523 batchcount = ac->batchcount; 2524 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2525 /* if there was little recent activity on this 2526 * cache, then perform only a partial refill. 2527 * Otherwise we could generate refill bouncing. 2528 */ 2529 batchcount = BATCHREFILL_LIMIT; 2530 } 2531 l3 = cachep->nodelists[numa_node_id()]; 2532 2533 BUG_ON(ac->avail > 0 || !l3); 2534 spin_lock(&l3->list_lock); 2535 2536 if (l3->shared) { 2537 struct array_cache *shared_array = l3->shared; 2538 if (shared_array->avail) { 2539 if (batchcount > shared_array->avail) 2540 batchcount = shared_array->avail; 2541 shared_array->avail -= batchcount; 2542 ac->avail = batchcount; 2543 memcpy(ac->entry, 2544 &(shared_array->entry[shared_array->avail]), 2545 sizeof(void *) * batchcount); 2546 shared_array->touched = 1; 2547 goto alloc_done; 2548 } 2549 } 2550 while (batchcount > 0) { 2551 struct list_head *entry; 2552 struct slab *slabp; 2553 /* Get slab alloc is to come from. */ 2554 entry = l3->slabs_partial.next; 2555 if (entry == &l3->slabs_partial) { 2556 l3->free_touched = 1; 2557 entry = l3->slabs_free.next; 2558 if (entry == &l3->slabs_free) 2559 goto must_grow; 2560 } 2561 2562 slabp = list_entry(entry, struct slab, list); 2563 check_slabp(cachep, slabp); 2564 check_spinlock_acquired(cachep); 2565 while (slabp->inuse < cachep->num && batchcount--) { 2566 STATS_INC_ALLOCED(cachep); 2567 STATS_INC_ACTIVE(cachep); 2568 STATS_SET_HIGH(cachep); 2569 2570 ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, 2571 numa_node_id()); 2572 } 2573 check_slabp(cachep, slabp); 2574 2575 /* move slabp to correct slabp list: */ 2576 list_del(&slabp->list); 2577 if (slabp->free == BUFCTL_END) 2578 list_add(&slabp->list, &l3->slabs_full); 2579 else 2580 list_add(&slabp->list, &l3->slabs_partial); 2581 } 2582 2583 must_grow: 2584 l3->free_objects -= ac->avail; 2585 alloc_done: 2586 spin_unlock(&l3->list_lock); 2587 2588 if (unlikely(!ac->avail)) { 2589 int x; 2590 x = cache_grow(cachep, flags, numa_node_id()); 2591 2592 // cache_grow can reenable interrupts, then ac could change. 2593 ac = ac_data(cachep); 2594 if (!x && ac->avail == 0) // no objects in sight? abort 2595 return NULL; 2596 2597 if (!ac->avail) // objects refilled by interrupt? 
2598 goto retry; 2599 } 2600 ac->touched = 1; 2601 return ac->entry[--ac->avail]; 2602} 2603 2604static inline void 2605cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags) 2606{ 2607 might_sleep_if(flags & __GFP_WAIT); 2608#if DEBUG 2609 kmem_flagcheck(cachep, flags); 2610#endif 2611} 2612 2613#if DEBUG 2614static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags, 2615 void *objp, void *caller) 2616{ 2617 if (!objp) 2618 return objp; 2619 if (cachep->flags & SLAB_POISON) { 2620#ifdef CONFIG_DEBUG_PAGEALLOC 2621 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 2622 kernel_map_pages(virt_to_page(objp), 2623 cachep->buffer_size / PAGE_SIZE, 1); 2624 else 2625 check_poison_obj(cachep, objp); 2626#else 2627 check_poison_obj(cachep, objp); 2628#endif 2629 poison_obj(cachep, objp, POISON_INUSE); 2630 } 2631 if (cachep->flags & SLAB_STORE_USER) 2632 *dbg_userword(cachep, objp) = caller; 2633 2634 if (cachep->flags & SLAB_RED_ZONE) { 2635 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE 2636 || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 2637 slab_error(cachep, 2638 "double free, or memory outside" 2639 " object was overwritten"); 2640 printk(KERN_ERR 2641 "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 2642 objp, *dbg_redzone1(cachep, objp), 2643 *dbg_redzone2(cachep, objp)); 2644 } 2645 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2646 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2647 } 2648 objp += obj_offset(cachep); 2649 if (cachep->ctor && cachep->flags & SLAB_POISON) { 2650 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2651 2652 if (!(flags & __GFP_WAIT)) 2653 ctor_flags |= SLAB_CTOR_ATOMIC; 2654 2655 cachep->ctor(objp, cachep, ctor_flags); 2656 } 2657 return objp; 2658} 2659#else 2660#define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 2661#endif 2662 2663static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2664{ 2665 void *objp; 2666 struct array_cache *ac; 2667 2668#ifdef CONFIG_NUMA 2669 if (unlikely(current->mempolicy && !in_interrupt())) { 2670 int nid = slab_node(current->mempolicy); 2671 2672 if (nid != numa_node_id()) 2673 return __cache_alloc_node(cachep, flags, nid); 2674 } 2675#endif 2676 2677 check_irq_off(); 2678 ac = ac_data(cachep); 2679 if (likely(ac->avail)) { 2680 STATS_INC_ALLOCHIT(cachep); 2681 ac->touched = 1; 2682 objp = ac->entry[--ac->avail]; 2683 } else { 2684 STATS_INC_ALLOCMISS(cachep); 2685 objp = cache_alloc_refill(cachep, flags); 2686 } 2687 return objp; 2688} 2689 2690static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2691{ 2692 unsigned long save_flags; 2693 void *objp; 2694 2695 cache_alloc_debugcheck_before(cachep, flags); 2696 2697 local_irq_save(save_flags); 2698 objp = ____cache_alloc(cachep, flags); 2699 local_irq_restore(save_flags); 2700 objp = cache_alloc_debugcheck_after(cachep, flags, objp, 2701 __builtin_return_address(0)); 2702 prefetchw(objp); 2703 return objp; 2704} 2705 2706#ifdef CONFIG_NUMA 2707/* 2708 * A interface to enable slab creation on nodeid 2709 */ 2710static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2711{ 2712 struct list_head *entry; 2713 struct slab *slabp; 2714 struct kmem_list3 *l3; 2715 void *obj; 2716 int x; 2717 2718 l3 = cachep->nodelists[nodeid]; 2719 BUG_ON(!l3); 2720 2721 retry: 2722 spin_lock(&l3->list_lock); 2723 entry = l3->slabs_partial.next; 2724 if (entry == &l3->slabs_partial) { 2725 l3->free_touched = 1; 2726 entry = l3->slabs_free.next; 2727 if (entry == &l3->slabs_free) 2728 goto must_grow; 2729 } 
2730 2731 slabp = list_entry(entry, struct slab, list); 2732 check_spinlock_acquired_node(cachep, nodeid); 2733 check_slabp(cachep, slabp); 2734 2735 STATS_INC_NODEALLOCS(cachep); 2736 STATS_INC_ACTIVE(cachep); 2737 STATS_SET_HIGH(cachep); 2738 2739 BUG_ON(slabp->inuse == cachep->num); 2740 2741 obj = slab_get_obj(cachep, slabp, nodeid); 2742 check_slabp(cachep, slabp); 2743 l3->free_objects--; 2744 /* move slabp to correct slabp list: */ 2745 list_del(&slabp->list); 2746 2747 if (slabp->free == BUFCTL_END) { 2748 list_add(&slabp->list, &l3->slabs_full); 2749 } else { 2750 list_add(&slabp->list, &l3->slabs_partial); 2751 } 2752 2753 spin_unlock(&l3->list_lock); 2754 goto done; 2755 2756 must_grow: 2757 spin_unlock(&l3->list_lock); 2758 x = cache_grow(cachep, flags, nodeid); 2759 2760 if (!x) 2761 return NULL; 2762 2763 goto retry; 2764 done: 2765 return obj; 2766} 2767#endif 2768 2769/* 2770 * Caller needs to acquire correct kmem_list's list_lock 2771 */ 2772static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, 2773 int node) 2774{ 2775 int i; 2776 struct kmem_list3 *l3; 2777 2778 for (i = 0; i < nr_objects; i++) { 2779 void *objp = objpp[i]; 2780 struct slab *slabp; 2781 2782 slabp = virt_to_slab(objp); 2783 l3 = cachep->nodelists[node]; 2784 list_del(&slabp->list); 2785 check_spinlock_acquired_node(cachep, node); 2786 check_slabp(cachep, slabp); 2787 slab_put_obj(cachep, slabp, objp, node); 2788 STATS_DEC_ACTIVE(cachep); 2789 l3->free_objects++; 2790 check_slabp(cachep, slabp); 2791 2792 /* fixup slab chains */ 2793 if (slabp->inuse == 0) { 2794 if (l3->free_objects > l3->free_limit) { 2795 l3->free_objects -= cachep->num; 2796 slab_destroy(cachep, slabp); 2797 } else { 2798 list_add(&slabp->list, &l3->slabs_free); 2799 } 2800 } else { 2801 /* Unconditionally move a slab to the end of the 2802 * partial list on free - maximum time for the 2803 * other objects to be freed, too. 2804 */ 2805 list_add_tail(&slabp->list, &l3->slabs_partial); 2806 } 2807 } 2808} 2809 2810static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac) 2811{ 2812 int batchcount; 2813 struct kmem_list3 *l3; 2814 int node = numa_node_id(); 2815 2816 batchcount = ac->batchcount; 2817#if DEBUG 2818 BUG_ON(!batchcount || batchcount > ac->avail); 2819#endif 2820 check_irq_off(); 2821 l3 = cachep->nodelists[node]; 2822 spin_lock(&l3->list_lock); 2823 if (l3->shared) { 2824 struct array_cache *shared_array = l3->shared; 2825 int max = shared_array->limit - shared_array->avail; 2826 if (max) { 2827 if (batchcount > max) 2828 batchcount = max; 2829 memcpy(&(shared_array->entry[shared_array->avail]), 2830 ac->entry, sizeof(void *) * batchcount); 2831 shared_array->avail += batchcount; 2832 goto free_done; 2833 } 2834 } 2835 2836 free_block(cachep, ac->entry, batchcount, node); 2837 free_done: 2838#if STATS 2839 { 2840 int i = 0; 2841 struct list_head *p; 2842 2843 p = l3->slabs_free.next; 2844 while (p != &(l3->slabs_free)) { 2845 struct slab *slabp; 2846 2847 slabp = list_entry(p, struct slab, list); 2848 BUG_ON(slabp->inuse); 2849 2850 i++; 2851 p = p->next; 2852 } 2853 STATS_SET_FREEABLE(cachep, i); 2854 } 2855#endif 2856 spin_unlock(&l3->list_lock); 2857 ac->avail -= batchcount; 2858 memmove(ac->entry, &(ac->entry[batchcount]), 2859 sizeof(void *) * ac->avail); 2860} 2861 2862/* 2863 * __cache_free 2864 * Release an obj back to its cache. If the obj has a constructed 2865 * state, it must be in this state _before_ it is released. 2866 * 2867 * Called with disabled ints. 
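 *
 * For objects that belong to the local node the common case just
 * stores objp in the per-cpu array (ac->entry[ac->avail++]); once
 * ac->avail reaches ac->limit, cache_flusharray() first returns a
 * batch of objects to the shared array or the node lists.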
2868 */ 2869static inline void __cache_free(kmem_cache_t *cachep, void *objp) 2870{ 2871 struct array_cache *ac = ac_data(cachep); 2872 2873 check_irq_off(); 2874 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 2875 2876 /* Make sure we are not freeing a object from another 2877 * node to the array cache on this cpu. 2878 */ 2879#ifdef CONFIG_NUMA 2880 { 2881 struct slab *slabp; 2882 slabp = virt_to_slab(objp); 2883 if (unlikely(slabp->nodeid != numa_node_id())) { 2884 struct array_cache *alien = NULL; 2885 int nodeid = slabp->nodeid; 2886 struct kmem_list3 *l3 = 2887 cachep->nodelists[numa_node_id()]; 2888 2889 STATS_INC_NODEFREES(cachep); 2890 if (l3->alien && l3->alien[nodeid]) { 2891 alien = l3->alien[nodeid]; 2892 spin_lock(&alien->lock); 2893 if (unlikely(alien->avail == alien->limit)) 2894 __drain_alien_cache(cachep, 2895 alien, nodeid); 2896 alien->entry[alien->avail++] = objp; 2897 spin_unlock(&alien->lock); 2898 } else { 2899 spin_lock(&(cachep->nodelists[nodeid])-> 2900 list_lock); 2901 free_block(cachep, &objp, 1, nodeid); 2902 spin_unlock(&(cachep->nodelists[nodeid])-> 2903 list_lock); 2904 } 2905 return; 2906 } 2907 } 2908#endif 2909 if (likely(ac->avail < ac->limit)) { 2910 STATS_INC_FREEHIT(cachep); 2911 ac->entry[ac->avail++] = objp; 2912 return; 2913 } else { 2914 STATS_INC_FREEMISS(cachep); 2915 cache_flusharray(cachep, ac); 2916 ac->entry[ac->avail++] = objp; 2917 } 2918} 2919 2920/** 2921 * kmem_cache_alloc - Allocate an object 2922 * @cachep: The cache to allocate from. 2923 * @flags: See kmalloc(). 2924 * 2925 * Allocate an object from this cache. The flags are only relevant 2926 * if the cache has no available objects. 2927 */ 2928void *kmem_cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2929{ 2930 return __cache_alloc(cachep, flags); 2931} 2932EXPORT_SYMBOL(kmem_cache_alloc); 2933 2934/** 2935 * kmem_ptr_validate - check if an untrusted pointer might 2936 * be a slab entry. 2937 * @cachep: the cache we're checking against 2938 * @ptr: pointer to validate 2939 * 2940 * This verifies that the untrusted pointer looks sane: 2941 * it is _not_ a guarantee that the pointer is actually 2942 * part of the slab cache in question, but it at least 2943 * validates that the pointer can be dereferenced and 2944 * looks half-way sane. 2945 * 2946 * Currently only used for dentry validation. 2947 */ 2948int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) 2949{ 2950 unsigned long addr = (unsigned long)ptr; 2951 unsigned long min_addr = PAGE_OFFSET; 2952 unsigned long align_mask = BYTES_PER_WORD - 1; 2953 unsigned long size = cachep->buffer_size; 2954 struct page *page; 2955 2956 if (unlikely(addr < min_addr)) 2957 goto out; 2958 if (unlikely(addr > (unsigned long)high_memory - size)) 2959 goto out; 2960 if (unlikely(addr & align_mask)) 2961 goto out; 2962 if (unlikely(!kern_addr_valid(addr))) 2963 goto out; 2964 if (unlikely(!kern_addr_valid(addr + size - 1))) 2965 goto out; 2966 page = virt_to_page(ptr); 2967 if (unlikely(!PageSlab(page))) 2968 goto out; 2969 if (unlikely(page_get_cache(page) != cachep)) 2970 goto out; 2971 return 1; 2972 out: 2973 return 0; 2974} 2975 2976#ifdef CONFIG_NUMA 2977/** 2978 * kmem_cache_alloc_node - Allocate an object on the specified node 2979 * @cachep: The cache to allocate from. 2980 * @flags: See kmalloc(). 2981 * @nodeid: node number of the target node. 2982 * 2983 * Identical to kmem_cache_alloc, except that this function is slow 2984 * and can sleep. 
And it will allocate memory on the given node, which 2985 * can improve the performance for cpu bound structures. 2986 * New and improved: it will now make sure that the object gets 2987 * put on the correct node list so that there is no false sharing. 2988 */ 2989void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2990{ 2991 unsigned long save_flags; 2992 void *ptr; 2993 2994 cache_alloc_debugcheck_before(cachep, flags); 2995 local_irq_save(save_flags); 2996 2997 if (nodeid == -1 || nodeid == numa_node_id() || 2998 !cachep->nodelists[nodeid]) 2999 ptr = ____cache_alloc(cachep, flags); 3000 else 3001 ptr = __cache_alloc_node(cachep, flags, nodeid); 3002 local_irq_restore(save_flags); 3003 3004 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, 3005 __builtin_return_address(0)); 3006 3007 return ptr; 3008} 3009EXPORT_SYMBOL(kmem_cache_alloc_node); 3010 3011void *kmalloc_node(size_t size, gfp_t flags, int node) 3012{ 3013 kmem_cache_t *cachep; 3014 3015 cachep = kmem_find_general_cachep(size, flags); 3016 if (unlikely(cachep == NULL)) 3017 return NULL; 3018 return kmem_cache_alloc_node(cachep, flags, node); 3019} 3020EXPORT_SYMBOL(kmalloc_node); 3021#endif 3022 3023/** 3024 * kmalloc - allocate memory 3025 * @size: how many bytes of memory are required. 3026 * @flags: the type of memory to allocate. 3027 * 3028 * kmalloc is the normal method of allocating memory 3029 * in the kernel. 3030 * 3031 * The @flags argument may be one of: 3032 * 3033 * %GFP_USER - Allocate memory on behalf of user. May sleep. 3034 * 3035 * %GFP_KERNEL - Allocate normal kernel ram. May sleep. 3036 * 3037 * %GFP_ATOMIC - Allocation will not sleep. Use inside interrupt handlers. 3038 * 3039 * Additionally, the %GFP_DMA flag may be set to indicate the memory 3040 * must be suitable for DMA. This can mean different things on different 3041 * platforms. For example, on i386, it means that the memory must come 3042 * from the first 16MB. 3043 */ 3044void *__kmalloc(size_t size, gfp_t flags) 3045{ 3046 kmem_cache_t *cachep; 3047 3048 /* If you want to save a few bytes .text space: replace 3049 * __ with kmem_. 3050 * Then kmalloc uses the uninlined functions instead of the inline 3051 * functions. 3052 */ 3053 cachep = __find_general_cachep(size, flags); 3054 if (unlikely(cachep == NULL)) 3055 return NULL; 3056 return __cache_alloc(cachep, flags); 3057} 3058EXPORT_SYMBOL(__kmalloc); 3059 3060#ifdef CONFIG_SMP 3061/** 3062 * __alloc_percpu - allocate one copy of the object for every present 3063 * cpu in the system, zeroing them. 3064 * Objects should be dereferenced using the per_cpu_ptr macro only. 3065 * 3066 * @size: how many bytes of memory are required. 3067 */ 3068void *__alloc_percpu(size_t size) 3069{ 3070 int i; 3071 struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL); 3072 3073 if (!pdata) 3074 return NULL; 3075 3076 /* 3077 * Cannot use for_each_online_cpu since a cpu may come online 3078 * and we have no way of figuring out how to fix the array 3079 * that we have allocated then.... 
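	 *
	 * So ptrs[] is filled for every possible cpu, using kmalloc_node()
	 * on the cpu's home node when that node is online and plain
	 * kmalloc() otherwise. Illustrative use (struct foo is a
	 * hypothetical type, dereferenced via per_cpu_ptr() as noted above):
	 *	struct foo *p = __alloc_percpu(sizeof(struct foo));
	 *	per_cpu_ptr(p, smp_processor_id())->refs++;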
3080 */ 3081 for_each_cpu(i) { 3082 int node = cpu_to_node(i); 3083 3084 if (node_online(node)) 3085 pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node); 3086 else 3087 pdata->ptrs[i] = kmalloc(size, GFP_KERNEL); 3088 3089 if (!pdata->ptrs[i]) 3090 goto unwind_oom; 3091 memset(pdata->ptrs[i], 0, size); 3092 } 3093 3094 /* Catch derefs w/o wrappers */ 3095 return (void *)(~(unsigned long)pdata); 3096 3097 unwind_oom: 3098 while (--i >= 0) { 3099 if (!cpu_possible(i)) 3100 continue; 3101 kfree(pdata->ptrs[i]); 3102 } 3103 kfree(pdata); 3104 return NULL; 3105} 3106EXPORT_SYMBOL(__alloc_percpu); 3107#endif 3108 3109/** 3110 * kmem_cache_free - Deallocate an object 3111 * @cachep: The cache the allocation was from. 3112 * @objp: The previously allocated object. 3113 * 3114 * Free an object which was previously allocated from this 3115 * cache. 3116 */ 3117void kmem_cache_free(kmem_cache_t *cachep, void *objp) 3118{ 3119 unsigned long flags; 3120 3121 local_irq_save(flags); 3122 __cache_free(cachep, objp); 3123 local_irq_restore(flags); 3124} 3125EXPORT_SYMBOL(kmem_cache_free); 3126 3127/** 3128 * kfree - free previously allocated memory 3129 * @objp: pointer returned by kmalloc. 3130 * 3131 * If @objp is NULL, no operation is performed. 3132 * 3133 * Don't free memory not originally allocated by kmalloc() 3134 * or you will run into trouble. 3135 */ 3136void kfree(const void *objp) 3137{ 3138 kmem_cache_t *c; 3139 unsigned long flags; 3140 3141 if (unlikely(!objp)) 3142 return; 3143 local_irq_save(flags); 3144 kfree_debugcheck(objp); 3145 c = virt_to_cache(objp); 3146 mutex_debug_check_no_locks_freed(objp, obj_size(c)); 3147 __cache_free(c, (void *)objp); 3148 local_irq_restore(flags); 3149} 3150EXPORT_SYMBOL(kfree); 3151 3152#ifdef CONFIG_SMP 3153/** 3154 * free_percpu - free previously allocated percpu memory 3155 * @objp: pointer returned by alloc_percpu. 3156 * 3157 * Don't free memory not originally allocated by alloc_percpu() 3158 * The complemented objp is to check for that. 3159 */ 3160void free_percpu(const void *objp) 3161{ 3162 int i; 3163 struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp); 3164 3165 /* 3166 * We allocate for all cpus so we cannot use for online cpu here. 3167 */ 3168 for_each_cpu(i) 3169 kfree(p->ptrs[i]); 3170 kfree(p); 3171} 3172EXPORT_SYMBOL(free_percpu); 3173#endif 3174 3175unsigned int kmem_cache_size(kmem_cache_t *cachep) 3176{ 3177 return obj_size(cachep); 3178} 3179EXPORT_SYMBOL(kmem_cache_size); 3180 3181const char *kmem_cache_name(kmem_cache_t *cachep) 3182{ 3183 return cachep->name; 3184} 3185EXPORT_SYMBOL_GPL(kmem_cache_name); 3186 3187/* 3188 * This initializes kmem_list3 for all nodes. 
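 *
 * For each online node this allocates a shared array of
 * shared * batchcount entries (plus, on NUMA, a set of alien caches)
 * and sets the node's free_limit to
 *	(1 + nr_cpus_node(node)) * batchcount + num,
 * bounding how many free objects the node keeps around.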
3189 */ 3190static int alloc_kmemlist(kmem_cache_t *cachep) 3191{ 3192 int node; 3193 struct kmem_list3 *l3; 3194 int err = 0; 3195 3196 for_each_online_node(node) { 3197 struct array_cache *nc = NULL, *new; 3198 struct array_cache **new_alien = NULL; 3199#ifdef CONFIG_NUMA 3200 if (!(new_alien = alloc_alien_cache(node, cachep->limit))) 3201 goto fail; 3202#endif 3203 if (!(new = alloc_arraycache(node, (cachep->shared * 3204 cachep->batchcount), 3205 0xbaadf00d))) 3206 goto fail; 3207 if ((l3 = cachep->nodelists[node])) { 3208 3209 spin_lock_irq(&l3->list_lock); 3210 3211 if ((nc = cachep->nodelists[node]->shared)) 3212 free_block(cachep, nc->entry, nc->avail, node); 3213 3214 l3->shared = new; 3215 if (!cachep->nodelists[node]->alien) { 3216 l3->alien = new_alien; 3217 new_alien = NULL; 3218 } 3219 l3->free_limit = (1 + nr_cpus_node(node)) * 3220 cachep->batchcount + cachep->num; 3221 spin_unlock_irq(&l3->list_lock); 3222 kfree(nc); 3223 free_alien_cache(new_alien); 3224 continue; 3225 } 3226 if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), 3227 GFP_KERNEL, node))) 3228 goto fail; 3229 3230 kmem_list3_init(l3); 3231 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 3232 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 3233 l3->shared = new; 3234 l3->alien = new_alien; 3235 l3->free_limit = (1 + nr_cpus_node(node)) * 3236 cachep->batchcount + cachep->num; 3237 cachep->nodelists[node] = l3; 3238 } 3239 return err; 3240 fail: 3241 err = -ENOMEM; 3242 return err; 3243} 3244 3245struct ccupdate_struct { 3246 kmem_cache_t *cachep; 3247 struct array_cache *new[NR_CPUS]; 3248}; 3249 3250static void do_ccupdate_local(void *info) 3251{ 3252 struct ccupdate_struct *new = (struct ccupdate_struct *)info; 3253 struct array_cache *old; 3254 3255 check_irq_off(); 3256 old = ac_data(new->cachep); 3257 3258 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; 3259 new->new[smp_processor_id()] = old; 3260} 3261 3262static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, 3263 int shared) 3264{ 3265 struct ccupdate_struct new; 3266 int i, err; 3267 3268 memset(&new.new, 0, sizeof(new.new)); 3269 for_each_online_cpu(i) { 3270 new.new[i] = 3271 alloc_arraycache(cpu_to_node(i), limit, batchcount); 3272 if (!new.new[i]) { 3273 for (i--; i >= 0; i--) 3274 kfree(new.new[i]); 3275 return -ENOMEM; 3276 } 3277 } 3278 new.cachep = cachep; 3279 3280 smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); 3281 3282 check_irq_on(); 3283 spin_lock_irq(&cachep->spinlock); 3284 cachep->batchcount = batchcount; 3285 cachep->limit = limit; 3286 cachep->shared = shared; 3287 spin_unlock_irq(&cachep->spinlock); 3288 3289 for_each_online_cpu(i) { 3290 struct array_cache *ccold = new.new[i]; 3291 if (!ccold) 3292 continue; 3293 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3294 free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i)); 3295 spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3296 kfree(ccold); 3297 } 3298 3299 err = alloc_kmemlist(cachep); 3300 if (err) { 3301 printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n", 3302 cachep->name, -err); 3303 BUG(); 3304 } 3305 return 0; 3306} 3307 3308static void enable_cpucache(kmem_cache_t *cachep) 3309{ 3310 int err; 3311 int limit, shared; 3312 3313 /* The head array serves three purposes: 3314 * - create a LIFO ordering, i.e. return objects that are cache-warm 3315 * - reduce the number of spinlock operations. 
3316 * - reduce the number of linked list operations on the slab and 3317 * bufctl chains: array operations are cheaper. 3318 * The numbers are guessed, we should auto-tune as described by 3319 * Bonwick. 3320 */ 3321 if (cachep->buffer_size > 131072) 3322 limit = 1; 3323 else if (cachep->buffer_size > PAGE_SIZE) 3324 limit = 8; 3325 else if (cachep->buffer_size > 1024) 3326 limit = 24; 3327 else if (cachep->buffer_size > 256) 3328 limit = 54; 3329 else 3330 limit = 120; 3331 3332 /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound 3333 * allocation behaviour: Most allocs on one cpu, most free operations 3334 * on another cpu. For these cases, an efficient object passing between 3335 * cpus is necessary. This is provided by a shared array. The array 3336 * replaces Bonwick's magazine layer. 3337 * On uniprocessor, it's functionally equivalent (but less efficient) 3338 * to a larger limit. Thus disabled by default. 3339 */ 3340 shared = 0; 3341#ifdef CONFIG_SMP 3342 if (cachep->buffer_size <= PAGE_SIZE) 3343 shared = 8; 3344#endif 3345 3346#if DEBUG 3347 /* With debugging enabled, large batchcount lead to excessively 3348 * long periods with disabled local interrupts. Limit the 3349 * batchcount 3350 */ 3351 if (limit > 32) 3352 limit = 32; 3353#endif 3354 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared); 3355 if (err) 3356 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 3357 cachep->name, -err); 3358} 3359 3360static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac, 3361 int force, int node) 3362{ 3363 int tofree; 3364 3365 check_spinlock_acquired_node(cachep, node); 3366 if (ac->touched && !force) { 3367 ac->touched = 0; 3368 } else if (ac->avail) { 3369 tofree = force ? ac->avail : (ac->limit + 4) / 5; 3370 if (tofree > ac->avail) { 3371 tofree = (ac->avail + 1) / 2; 3372 } 3373 free_block(cachep, ac->entry, tofree, node); 3374 ac->avail -= tofree; 3375 memmove(ac->entry, &(ac->entry[tofree]), 3376 sizeof(void *) * ac->avail); 3377 } 3378} 3379 3380/** 3381 * cache_reap - Reclaim memory from caches. 3382 * @unused: unused parameter 3383 * 3384 * Called from workqueue/eventd every few seconds. 3385 * Purpose: 3386 * - clear the per-cpu caches for this CPU. 3387 * - return freeable pages to the main free memory pool. 3388 * 3389 * If we cannot acquire the cache chain mutex then just give up - we'll 3390 * try again on the next iteration. 3391 */ 3392static void cache_reap(void *unused) 3393{ 3394 struct list_head *walk; 3395 struct kmem_list3 *l3; 3396 3397 if (!mutex_trylock(&cache_chain_mutex)) { 3398 /* Give up. Setup the next iteration. 
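		 * This CPU's reap_work simply fires again REAPTIMEOUT_CPUC
		 * jiffies from now and retries the trylock then.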
*/ 3399 schedule_delayed_work(&__get_cpu_var(reap_work), 3400 REAPTIMEOUT_CPUC); 3401 return; 3402 } 3403 3404 list_for_each(walk, &cache_chain) { 3405 kmem_cache_t *searchp; 3406 struct list_head *p; 3407 int tofree; 3408 struct slab *slabp; 3409 3410 searchp = list_entry(walk, kmem_cache_t, next); 3411 3412 if (searchp->flags & SLAB_NO_REAP) 3413 goto next; 3414 3415 check_irq_on(); 3416 3417 l3 = searchp->nodelists[numa_node_id()]; 3418 if (l3->alien) 3419 drain_alien_cache(searchp, l3); 3420 spin_lock_irq(&l3->list_lock); 3421 3422 drain_array_locked(searchp, ac_data(searchp), 0, 3423 numa_node_id()); 3424 3425 if (time_after(l3->next_reap, jiffies)) 3426 goto next_unlock; 3427 3428 l3->next_reap = jiffies + REAPTIMEOUT_LIST3; 3429 3430 if (l3->shared) 3431 drain_array_locked(searchp, l3->shared, 0, 3432 numa_node_id()); 3433 3434 if (l3->free_touched) { 3435 l3->free_touched = 0; 3436 goto next_unlock; 3437 } 3438 3439 tofree = 3440 (l3->free_limit + 5 * searchp->num - 3441 1) / (5 * searchp->num); 3442 do { 3443 p = l3->slabs_free.next; 3444 if (p == &(l3->slabs_free)) 3445 break; 3446 3447 slabp = list_entry(p, struct slab, list); 3448 BUG_ON(slabp->inuse); 3449 list_del(&slabp->list); 3450 STATS_INC_REAPED(searchp); 3451 3452 /* Safe to drop the lock. The slab is no longer 3453 * linked to the cache. 3454 * searchp cannot disappear, we hold 3455 * cache_chain_lock 3456 */ 3457 l3->free_objects -= searchp->num; 3458 spin_unlock_irq(&l3->list_lock); 3459 slab_destroy(searchp, slabp); 3460 spin_lock_irq(&l3->list_lock); 3461 } while (--tofree > 0); 3462 next_unlock: 3463 spin_unlock_irq(&l3->list_lock); 3464 next: 3465 cond_resched(); 3466 } 3467 check_irq_on(); 3468 mutex_unlock(&cache_chain_mutex); 3469 drain_remote_pages(); 3470 /* Setup the next iteration */ 3471 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); 3472} 3473 3474#ifdef CONFIG_PROC_FS 3475 3476static void print_slabinfo_header(struct seq_file *m) 3477{ 3478 /* 3479 * Output format version, so at least we can change it 3480 * without _too_ many complaints. 3481 */ 3482#if STATS 3483 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); 3484#else 3485 seq_puts(m, "slabinfo - version: 2.1\n"); 3486#endif 3487 seq_puts(m, "# name <active_objs> <num_objs> <objsize> " 3488 "<objperslab> <pagesperslab>"); 3489 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); 3490 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 3491#if STATS 3492 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> " 3493 "<error> <maxfreeable> <nodeallocs> <remotefrees>"); 3494 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); 3495#endif 3496 seq_putc(m, '\n'); 3497} 3498 3499static void *s_start(struct seq_file *m, loff_t *pos) 3500{ 3501 loff_t n = *pos; 3502 struct list_head *p; 3503 3504 mutex_lock(&cache_chain_mutex); 3505 if (!n) 3506 print_slabinfo_header(m); 3507 p = cache_chain.next; 3508 while (n--) { 3509 p = p->next; 3510 if (p == &cache_chain) 3511 return NULL; 3512 } 3513 return list_entry(p, kmem_cache_t, next); 3514} 3515 3516static void *s_next(struct seq_file *m, void *p, loff_t *pos) 3517{ 3518 kmem_cache_t *cachep = p; 3519 ++*pos; 3520 return cachep->next.next == &cache_chain ? 
NULL 3521 : list_entry(cachep->next.next, kmem_cache_t, next); 3522} 3523 3524static void s_stop(struct seq_file *m, void *p) 3525{ 3526 mutex_unlock(&cache_chain_mutex); 3527} 3528 3529static int s_show(struct seq_file *m, void *p) 3530{ 3531 kmem_cache_t *cachep = p; 3532 struct list_head *q; 3533 struct slab *slabp; 3534 unsigned long active_objs; 3535 unsigned long num_objs; 3536 unsigned long active_slabs = 0; 3537 unsigned long num_slabs, free_objects = 0, shared_avail = 0; 3538 const char *name; 3539 char *error = NULL; 3540 int node; 3541 struct kmem_list3 *l3; 3542 3543 check_irq_on(); 3544 spin_lock_irq(&cachep->spinlock); 3545 active_objs = 0; 3546 num_slabs = 0; 3547 for_each_online_node(node) { 3548 l3 = cachep->nodelists[node]; 3549 if (!l3) 3550 continue; 3551 3552 spin_lock(&l3->list_lock); 3553 3554 list_for_each(q, &l3->slabs_full) { 3555 slabp = list_entry(q, struct slab, list); 3556 if (slabp->inuse != cachep->num && !error) 3557 error = "slabs_full accounting error"; 3558 active_objs += cachep->num; 3559 active_slabs++; 3560 } 3561 list_for_each(q, &l3->slabs_partial) { 3562 slabp = list_entry(q, struct slab, list); 3563 if (slabp->inuse == cachep->num && !error) 3564 error = "slabs_partial inuse accounting error"; 3565 if (!slabp->inuse && !error) 3566 error = "slabs_partial/inuse accounting error"; 3567 active_objs += slabp->inuse; 3568 active_slabs++; 3569 } 3570 list_for_each(q, &l3->slabs_free) { 3571 slabp = list_entry(q, struct slab, list); 3572 if (slabp->inuse && !error) 3573 error = "slabs_free/inuse accounting error"; 3574 num_slabs++; 3575 } 3576 free_objects += l3->free_objects; 3577 shared_avail += l3->shared->avail; 3578 3579 spin_unlock(&l3->list_lock); 3580 } 3581 num_slabs += active_slabs; 3582 num_objs = num_slabs * cachep->num; 3583 if (num_objs - active_objs != free_objects && !error) 3584 error = "free_objects accounting error"; 3585 3586 name = cachep->name; 3587 if (error) 3588 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 3589 3590 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 3591 name, active_objs, num_objs, cachep->buffer_size, 3592 cachep->num, (1 << cachep->gfporder)); 3593 seq_printf(m, " : tunables %4u %4u %4u", 3594 cachep->limit, cachep->batchcount, cachep->shared); 3595 seq_printf(m, " : slabdata %6lu %6lu %6lu", 3596 active_slabs, num_slabs, shared_avail); 3597#if STATS 3598 { /* list3 stats */ 3599 unsigned long high = cachep->high_mark; 3600 unsigned long allocs = cachep->num_allocations; 3601 unsigned long grown = cachep->grown; 3602 unsigned long reaped = cachep->reaped; 3603 unsigned long errors = cachep->errors; 3604 unsigned long max_freeable = cachep->max_freeable; 3605 unsigned long node_allocs = cachep->node_allocs; 3606 unsigned long node_frees = cachep->node_frees; 3607 3608 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ 3609 %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees); 3610 } 3611 /* cpu stats */ 3612 { 3613 unsigned long allochit = atomic_read(&cachep->allochit); 3614 unsigned long allocmiss = atomic_read(&cachep->allocmiss); 3615 unsigned long freehit = atomic_read(&cachep->freehit); 3616 unsigned long freemiss = atomic_read(&cachep->freemiss); 3617 3618 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", 3619 allochit, allocmiss, freehit, freemiss); 3620 } 3621#endif 3622 seq_putc(m, '\n'); 3623 spin_unlock_irq(&cachep->spinlock); 3624 return 0; 3625} 3626 3627/* 3628 * slabinfo_op - iterator that generates /proc/slabinfo 3629 * 3630 * Output layout: 
3631 * cache-name 3632 * num-active-objs 3633 * total-objs 3634 * object size 3635 * num-active-slabs 3636 * total-slabs 3637 * num-pages-per-slab 3638 * + further values on SMP and with statistics enabled 3639 */ 3640 3641struct seq_operations slabinfo_op = { 3642 .start = s_start, 3643 .next = s_next, 3644 .stop = s_stop, 3645 .show = s_show, 3646}; 3647 3648#define MAX_SLABINFO_WRITE 128 3649/** 3650 * slabinfo_write - Tuning for the slab allocator 3651 * @file: unused 3652 * @buffer: user buffer 3653 * @count: data length 3654 * @ppos: unused 3655 */ 3656ssize_t slabinfo_write(struct file *file, const char __user * buffer, 3657 size_t count, loff_t *ppos) 3658{ 3659 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; 3660 int limit, batchcount, shared, res; 3661 struct list_head *p; 3662 3663 if (count > MAX_SLABINFO_WRITE) 3664 return -EINVAL; 3665 if (copy_from_user(&kbuf, buffer, count)) 3666 return -EFAULT; 3667 kbuf[MAX_SLABINFO_WRITE] = '\0'; 3668 3669 tmp = strchr(kbuf, ' '); 3670 if (!tmp) 3671 return -EINVAL; 3672 *tmp = '\0'; 3673 tmp++; 3674 if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3) 3675 return -EINVAL; 3676 3677 /* Find the cache in the chain of caches. */ 3678 mutex_lock(&cache_chain_mutex); 3679 res = -EINVAL; 3680 list_for_each(p, &cache_chain) { 3681 kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); 3682 3683 if (!strcmp(cachep->name, kbuf)) { 3684 if (limit < 1 || 3685 batchcount < 1 || 3686 batchcount > limit || shared < 0) { 3687 res = 0; 3688 } else { 3689 res = do_tune_cpucache(cachep, limit, 3690 batchcount, shared); 3691 } 3692 break; 3693 } 3694 } 3695 mutex_unlock(&cache_chain_mutex); 3696 if (res >= 0) 3697 res = count; 3698 return res; 3699} 3700#endif 3701 3702/** 3703 * ksize - get the actual amount of memory allocated for a given object 3704 * @objp: Pointer to the object 3705 * 3706 * kmalloc may internally round up allocations and return more memory 3707 * than requested. ksize() can be used to determine the actual amount of 3708 * memory allocated. The caller may use this additional memory, even though 3709 * a smaller amount of memory was initially specified with the kmalloc call. 3710 * The caller must guarantee that objp points to a valid object previously 3711 * allocated with either kmalloc() or kmem_cache_alloc(). The object 3712 * must not be freed during the duration of the call. 3713 */ 3714unsigned int ksize(const void *objp) 3715{ 3716 if (unlikely(objp == NULL)) 3717 return 0; 3718 3719 return obj_size(virt_to_cache(objp)); 3720} 3721
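/*
 * Editor's illustration (not part of the allocator): a minimal sketch of
 * how client code typically uses the cache interface implemented above.
 * struct foo, foo_cache, foo_new, foo_free, foo_init and foo_exit are
 * hypothetical names; the sketch is compiled out.
 */
#if 0
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>

struct foo {
	int bar;
};

static kmem_cache_t *foo_cache;

static struct foo *foo_new(gfp_t gfp)
{
	/* GFP_KERNEL may sleep; use GFP_ATOMIC from interrupt context */
	return kmem_cache_alloc(foo_cache, gfp);
}

static void foo_free(struct foo *f)
{
	/* objects must go back to the cache they were allocated from */
	kmem_cache_free(foo_cache, f);
}

static int __init foo_init(void)
{
	/* one cache per object type, as described at the top of this file */
	foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
				      0, SLAB_HWCACHE_ALIGN, NULL, NULL);
	return foo_cache ? 0 : -ENOMEM;
}

static void __exit foo_exit(void)
{
	/* the cache must be empty and unused before it is destroyed */
	kmem_cache_destroy(foo_cache);
}

module_init(foo_init);
module_exit(foo_exit);
#endif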