slab.c revision e498be7dafd72fd68848c1eef1575aa7c5d658df
1/*
2 * linux/mm/slab.c
3 * Written by Mark Hemment, 1996/97.
4 * (markhe@nextd.demon.co.uk)
5 *
6 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
7 *
8 * Major cleanup, different bufctl logic, per-cpu arrays
9 * (c) 2000 Manfred Spraul
10 *
11 * Cleanup, make the head arrays unconditional, preparation for NUMA
12 * (c) 2002 Manfred Spraul
13 *
14 * An implementation of the Slab Allocator as described in outline in;
15 * UNIX Internals: The New Frontiers by Uresh Vahalia
16 * Pub: Prentice Hall ISBN 0-13-101908-2
17 * or with a little more detail in;
18 * The Slab Allocator: An Object-Caching Kernel Memory Allocator
19 * Jeff Bonwick (Sun Microsystems).
20 * Presented at: USENIX Summer 1994 Technical Conference
21 *
22 * The memory is organized in caches, one cache for each object type.
23 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
24 * Each cache consists of many slabs (they are small (usually one
25 * page long) and always contiguous), and each slab contains multiple
26 * initialized objects.
27 *
28 * This means that your constructor is used only for newly allocated
29 * slabs and you must pass objects with the same initializations to
30 * kmem_cache_free.
31 *
32 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
33 * normal). If you need a special memory type, then you must create a new
34 * cache for that memory type.
35 *
36 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
37 * full slabs with 0 free objects
38 * partial slabs
39 * empty slabs with no allocated objects
40 *
41 * If partial slabs exist, then new allocations come from these slabs,
42 * otherwise from empty slabs or new slabs are allocated.
43 *
44 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
45 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
46 *
47 * Each cache has a short per-cpu head array, most allocs
48 * and frees go into that array, and if that array overflows, then 1/2
49 * of the entries in the array are given back into the global cache.
50 * The head array is strictly LIFO and should improve the cache hit rates.
51 * On SMP, it additionally reduces the spinlock operations.
52 *
53 * The c_cpuarray may not be read with enabled local interrupts -
54 * it's changed with a smp_call_function().
55 *
56 * SMP synchronization:
57 * constructors and destructors are called without any locking.
58 * Several members in kmem_cache_t and struct slab never change, they
59 * are accessed without any locking.
60 * The per-cpu arrays are never accessed from the wrong cpu, no locking,
61 * and local interrupts are disabled so slab code is preempt-safe.
62 * The non-constant members are protected with a per-cache irq spinlock.
63 *
64 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
65 * in 2000 - many ideas in the current implementation are derived from
66 * his patch.
67 *
68 * Further notes from the original documentation:
69 *
70 * 11 April '97. Started multi-threading - markhe
71 * The global cache-chain is protected by the semaphore 'cache_chain_sem'.
72 * The sem is only needed when accessing/extending the cache-chain, which
73 * can never happen inside an interrupt (kmem_cache_create(),
74 * kmem_cache_shrink() and kmem_cache_reap()).
75 *
76 * At present, each engine can be growing a cache. This should be blocked.
77 *
78 * 15 March 2005. NUMA slab allocator.
79 * Shai Fultheim <shai@scalex86.org>.
80 * Shobhit Dayal <shobhit@calsoftinc.com> 81 * Alok N Kataria <alokk@calsoftinc.com> 82 * Christoph Lameter <christoph@lameter.com> 83 * 84 * Modified the slab allocator to be node aware on NUMA systems. 85 * Each node has its own list of partial, free and full slabs. 86 * All object allocations for a node occur from node specific slab lists. 87 */ 88 89#include <linux/config.h> 90#include <linux/slab.h> 91#include <linux/mm.h> 92#include <linux/swap.h> 93#include <linux/cache.h> 94#include <linux/interrupt.h> 95#include <linux/init.h> 96#include <linux/compiler.h> 97#include <linux/seq_file.h> 98#include <linux/notifier.h> 99#include <linux/kallsyms.h> 100#include <linux/cpu.h> 101#include <linux/sysctl.h> 102#include <linux/module.h> 103#include <linux/rcupdate.h> 104#include <linux/string.h> 105#include <linux/nodemask.h> 106 107#include <asm/uaccess.h> 108#include <asm/cacheflush.h> 109#include <asm/tlbflush.h> 110#include <asm/page.h> 111 112/* 113 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL, 114 * SLAB_RED_ZONE & SLAB_POISON. 115 * 0 for faster, smaller code (especially in the critical paths). 116 * 117 * STATS - 1 to collect stats for /proc/slabinfo. 118 * 0 for faster, smaller code (especially in the critical paths). 119 * 120 * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) 121 */ 122 123#ifdef CONFIG_DEBUG_SLAB 124#define DEBUG 1 125#define STATS 1 126#define FORCED_DEBUG 1 127#else 128#define DEBUG 0 129#define STATS 0 130#define FORCED_DEBUG 0 131#endif 132 133 134/* Shouldn't this be in a header file somewhere? */ 135#define BYTES_PER_WORD sizeof(void *) 136 137#ifndef cache_line_size 138#define cache_line_size() L1_CACHE_BYTES 139#endif 140 141#ifndef ARCH_KMALLOC_MINALIGN 142/* 143 * Enforce a minimum alignment for the kmalloc caches. 144 * Usually, the kmalloc caches are cache_line_size() aligned, except when 145 * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned. 146 * Some archs want to perform DMA into kmalloc caches and need a guaranteed 147 * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that. 148 * Note that this flag disables some debug features. 149 */ 150#define ARCH_KMALLOC_MINALIGN 0 151#endif 152 153#ifndef ARCH_SLAB_MINALIGN 154/* 155 * Enforce a minimum alignment for all caches. 156 * Intended for archs that get misalignment faults even for BYTES_PER_WORD 157 * aligned buffers. Includes ARCH_KMALLOC_MINALIGN. 158 * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables 159 * some debug features. 160 */ 161#define ARCH_SLAB_MINALIGN 0 162#endif 163 164#ifndef ARCH_KMALLOC_FLAGS 165#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN 166#endif 167 168/* Legal flag mask for kmem_cache_create(). */ 169#if DEBUG 170# define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ 171 SLAB_POISON | SLAB_HWCACHE_ALIGN | \ 172 SLAB_NO_REAP | SLAB_CACHE_DMA | \ 173 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ 174 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 175 SLAB_DESTROY_BY_RCU) 176#else 177# define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ 178 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ 179 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 180 SLAB_DESTROY_BY_RCU) 181#endif 182 183/* 184 * kmem_bufctl_t: 185 * 186 * Bufctl's are used for linking objs within a slab 187 * linked offsets. 188 * 189 * This implementation relies on "struct page" for locating the cache & 190 * slab an object belongs to. 
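 *
 * (Editor's illustrative note, not part of the original comment: the cache
 * and slab pointers for a page are stashed in the otherwise unused
 * page->lru.next and page->lru.prev fields via the SET_PAGE_CACHE() and
 * SET_PAGE_SLAB() macros defined below, so kfree() can go from an object
 * pointer to its struct page with virt_to_page() and from there recover
 * the owning cache and slab.)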
191 * This allows the bufctl structure to be small (one int), but limits 192 * the number of objects a slab (not a cache) can contain when off-slab 193 * bufctls are used. The limit is the size of the largest general cache 194 * that does not use off-slab slabs. 195 * For 32bit archs with 4 kB pages, is this 56. 196 * This is not serious, as it is only for large objects, when it is unwise 197 * to have too many per slab. 198 * Note: This limit can be raised by introducing a general cache whose size 199 * is less than 512 (PAGE_SIZE<<3), but greater than 256. 200 */ 201 202typedef unsigned int kmem_bufctl_t; 203#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) 204#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) 205#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2) 206 207/* Max number of objs-per-slab for caches which use off-slab slabs. 208 * Needed to avoid a possible looping condition in cache_grow(). 209 */ 210static unsigned long offslab_limit; 211 212/* 213 * struct slab 214 * 215 * Manages the objs in a slab. Placed either at the beginning of mem allocated 216 * for a slab, or allocated from an general cache. 217 * Slabs are chained into three list: fully used, partial, fully free slabs. 218 */ 219struct slab { 220 struct list_head list; 221 unsigned long colouroff; 222 void *s_mem; /* including colour offset */ 223 unsigned int inuse; /* num of objs active in slab */ 224 kmem_bufctl_t free; 225 unsigned short nodeid; 226}; 227 228/* 229 * struct slab_rcu 230 * 231 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to 232 * arrange for kmem_freepages to be called via RCU. This is useful if 233 * we need to approach a kernel structure obliquely, from its address 234 * obtained without the usual locking. We can lock the structure to 235 * stabilize it and check it's still at the given address, only if we 236 * can be sure that the memory has not been meanwhile reused for some 237 * other kind of object (which our subsystem's lock might corrupt). 238 * 239 * rcu_read_lock before reading the address, then rcu_read_unlock after 240 * taking the spinlock within the structure expected at that address. 241 * 242 * We assume struct slab_rcu can overlay struct slab when destroying. 243 */ 244struct slab_rcu { 245 struct rcu_head head; 246 kmem_cache_t *cachep; 247 void *addr; 248}; 249 250/* 251 * struct array_cache 252 * 253 * Purpose: 254 * - LIFO ordering, to hand out cache-warm objects from _alloc 255 * - reduce the number of linked list operations 256 * - reduce spinlock operations 257 * 258 * The limit is stored in the per-cpu structure to reduce the data cache 259 * footprint. 260 * 261 */ 262struct array_cache { 263 unsigned int avail; 264 unsigned int limit; 265 unsigned int batchcount; 266 unsigned int touched; 267 spinlock_t lock; 268 void *entry[0]; /* 269 * Must have this definition in here for the proper 270 * alignment of array_cache. Also simplifies accessing 271 * the entries. 272 * [0] is for gcc 2.95. It should really be []. 273 */ 274}; 275 276/* bootstrap: The caches do not work without cpuarrays anymore, 277 * but the cpuarrays are allocated from the generic caches... 278 */ 279#define BOOT_CPUCACHE_ENTRIES 1 280struct arraycache_init { 281 struct array_cache cache; 282 void * entries[BOOT_CPUCACHE_ENTRIES]; 283}; 284 285/* 286 * The slab lists for all objects. 
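 *
 * (Editor's illustrative note: allocation falls back through three levels.
 * kmem_cache_alloc() first pops from the per-cpu array_cache; when that is
 * empty it refills from the node's shared array_cache and from the
 * partial/free slab lists kept here; only when those are exhausted does it
 * grow the cache with fresh pages.)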
287 */ 288struct kmem_list3 { 289 struct list_head slabs_partial; /* partial list first, better asm code */ 290 struct list_head slabs_full; 291 struct list_head slabs_free; 292 unsigned long free_objects; 293 unsigned long next_reap; 294 int free_touched; 295 unsigned int free_limit; 296 spinlock_t list_lock; 297 struct array_cache *shared; /* shared per node */ 298 struct array_cache **alien; /* on other nodes */ 299}; 300 301/* 302 * Need this for bootstrapping a per node allocator. 303 */ 304#define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1) 305struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; 306#define CACHE_CACHE 0 307#define SIZE_AC 1 308#define SIZE_L3 (1 + MAX_NUMNODES) 309 310/* 311 * This function may be completely optimized away if 312 * a constant is passed to it. Mostly the same as 313 * what is in linux/slab.h except it returns an 314 * index. 315 */ 316static inline int index_of(const size_t size) 317{ 318 if (__builtin_constant_p(size)) { 319 int i = 0; 320 321#define CACHE(x) \ 322 if (size <=x) \ 323 return i; \ 324 else \ 325 i++; 326#include "linux/kmalloc_sizes.h" 327#undef CACHE 328 { 329 extern void __bad_size(void); 330 __bad_size(); 331 } 332 } 333 return 0; 334} 335 336#define INDEX_AC index_of(sizeof(struct arraycache_init)) 337#define INDEX_L3 index_of(sizeof(struct kmem_list3)) 338 339static inline void kmem_list3_init(struct kmem_list3 *parent) 340{ 341 INIT_LIST_HEAD(&parent->slabs_full); 342 INIT_LIST_HEAD(&parent->slabs_partial); 343 INIT_LIST_HEAD(&parent->slabs_free); 344 parent->shared = NULL; 345 parent->alien = NULL; 346 spin_lock_init(&parent->list_lock); 347 parent->free_objects = 0; 348 parent->free_touched = 0; 349} 350 351#define MAKE_LIST(cachep, listp, slab, nodeid) \ 352 do { \ 353 INIT_LIST_HEAD(listp); \ 354 list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ 355 } while (0) 356 357#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ 358 do { \ 359 MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ 360 MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ 361 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ 362 } while (0) 363 364/* 365 * kmem_cache_t 366 * 367 * manages a cache. 368 */ 369 370struct kmem_cache_s { 371/* 1) per-cpu data, touched during every alloc/free */ 372 struct array_cache *array[NR_CPUS]; 373 unsigned int batchcount; 374 unsigned int limit; 375 unsigned int shared; 376 unsigned int objsize; 377/* 2) touched by every alloc & free from the backend */ 378 struct kmem_list3 *nodelists[MAX_NUMNODES]; 379 unsigned int flags; /* constant flags */ 380 unsigned int num; /* # of objs per slab */ 381 spinlock_t spinlock; 382 383/* 3) cache_grow/shrink */ 384 /* order of pgs per slab (2^n) */ 385 unsigned int gfporder; 386 387 /* force GFP flags, e.g. 
GFP_DMA */ 388 unsigned int gfpflags; 389 390 size_t colour; /* cache colouring range */ 391 unsigned int colour_off; /* colour offset */ 392 unsigned int colour_next; /* cache colouring */ 393 kmem_cache_t *slabp_cache; 394 unsigned int slab_size; 395 unsigned int dflags; /* dynamic flags */ 396 397 /* constructor func */ 398 void (*ctor)(void *, kmem_cache_t *, unsigned long); 399 400 /* de-constructor func */ 401 void (*dtor)(void *, kmem_cache_t *, unsigned long); 402 403/* 4) cache creation/removal */ 404 const char *name; 405 struct list_head next; 406 407/* 5) statistics */ 408#if STATS 409 unsigned long num_active; 410 unsigned long num_allocations; 411 unsigned long high_mark; 412 unsigned long grown; 413 unsigned long reaped; 414 unsigned long errors; 415 unsigned long max_freeable; 416 unsigned long node_allocs; 417 unsigned long node_frees; 418 atomic_t allochit; 419 atomic_t allocmiss; 420 atomic_t freehit; 421 atomic_t freemiss; 422#endif 423#if DEBUG 424 int dbghead; 425 int reallen; 426#endif 427}; 428 429#define CFLGS_OFF_SLAB (0x80000000UL) 430#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) 431 432#define BATCHREFILL_LIMIT 16 433/* Optimization question: fewer reaps means less 434 * probability for unnessary cpucache drain/refill cycles. 435 * 436 * OTHO the cpuarrays can contain lots of objects, 437 * which could lock up otherwise freeable slabs. 438 */ 439#define REAPTIMEOUT_CPUC (2*HZ) 440#define REAPTIMEOUT_LIST3 (4*HZ) 441 442#if STATS 443#define STATS_INC_ACTIVE(x) ((x)->num_active++) 444#define STATS_DEC_ACTIVE(x) ((x)->num_active--) 445#define STATS_INC_ALLOCED(x) ((x)->num_allocations++) 446#define STATS_INC_GROWN(x) ((x)->grown++) 447#define STATS_INC_REAPED(x) ((x)->reaped++) 448#define STATS_SET_HIGH(x) do { if ((x)->num_active > (x)->high_mark) \ 449 (x)->high_mark = (x)->num_active; \ 450 } while (0) 451#define STATS_INC_ERR(x) ((x)->errors++) 452#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) 453#define STATS_INC_NODEFREES(x) ((x)->node_frees++) 454#define STATS_SET_FREEABLE(x, i) \ 455 do { if ((x)->max_freeable < i) \ 456 (x)->max_freeable = i; \ 457 } while (0) 458 459#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) 460#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) 461#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) 462#define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss) 463#else 464#define STATS_INC_ACTIVE(x) do { } while (0) 465#define STATS_DEC_ACTIVE(x) do { } while (0) 466#define STATS_INC_ALLOCED(x) do { } while (0) 467#define STATS_INC_GROWN(x) do { } while (0) 468#define STATS_INC_REAPED(x) do { } while (0) 469#define STATS_SET_HIGH(x) do { } while (0) 470#define STATS_INC_ERR(x) do { } while (0) 471#define STATS_INC_NODEALLOCS(x) do { } while (0) 472#define STATS_INC_NODEFREES(x) do { } while (0) 473#define STATS_SET_FREEABLE(x, i) \ 474 do { } while (0) 475 476#define STATS_INC_ALLOCHIT(x) do { } while (0) 477#define STATS_INC_ALLOCMISS(x) do { } while (0) 478#define STATS_INC_FREEHIT(x) do { } while (0) 479#define STATS_INC_FREEMISS(x) do { } while (0) 480#endif 481 482#if DEBUG 483/* Magic nums for obj red zoning. 484 * Placed in the first word before and the first word after an obj. 
485 */ 486#define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ 487#define RED_ACTIVE 0x170FC2A5UL /* when obj is active */ 488 489/* ...and for poisoning */ 490#define POISON_INUSE 0x5a /* for use-uninitialised poisoning */ 491#define POISON_FREE 0x6b /* for use-after-free poisoning */ 492#define POISON_END 0xa5 /* end-byte of poisoning */ 493 494/* memory layout of objects: 495 * 0 : objp 496 * 0 .. cachep->dbghead - BYTES_PER_WORD - 1: padding. This ensures that 497 * the end of an object is aligned with the end of the real 498 * allocation. Catches writes behind the end of the allocation. 499 * cachep->dbghead - BYTES_PER_WORD .. cachep->dbghead - 1: 500 * redzone word. 501 * cachep->dbghead: The real object. 502 * cachep->objsize - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] 503 * cachep->objsize - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long] 504 */ 505static int obj_dbghead(kmem_cache_t *cachep) 506{ 507 return cachep->dbghead; 508} 509 510static int obj_reallen(kmem_cache_t *cachep) 511{ 512 return cachep->reallen; 513} 514 515static unsigned long *dbg_redzone1(kmem_cache_t *cachep, void *objp) 516{ 517 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 518 return (unsigned long*) (objp+obj_dbghead(cachep)-BYTES_PER_WORD); 519} 520 521static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp) 522{ 523 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 524 if (cachep->flags & SLAB_STORE_USER) 525 return (unsigned long*) (objp+cachep->objsize-2*BYTES_PER_WORD); 526 return (unsigned long*) (objp+cachep->objsize-BYTES_PER_WORD); 527} 528 529static void **dbg_userword(kmem_cache_t *cachep, void *objp) 530{ 531 BUG_ON(!(cachep->flags & SLAB_STORE_USER)); 532 return (void**)(objp+cachep->objsize-BYTES_PER_WORD); 533} 534 535#else 536 537#define obj_dbghead(x) 0 538#define obj_reallen(cachep) (cachep->objsize) 539#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long *)NULL;}) 540#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long *)NULL;}) 541#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) 542 543#endif 544 545/* 546 * Maximum size of an obj (in 2^order pages) 547 * and absolute limit for the gfp order. 548 */ 549#if defined(CONFIG_LARGE_ALLOCS) 550#define MAX_OBJ_ORDER 13 /* up to 32Mb */ 551#define MAX_GFP_ORDER 13 /* up to 32Mb */ 552#elif defined(CONFIG_MMU) 553#define MAX_OBJ_ORDER 5 /* 32 pages */ 554#define MAX_GFP_ORDER 5 /* 32 pages */ 555#else 556#define MAX_OBJ_ORDER 8 /* up to 1Mb */ 557#define MAX_GFP_ORDER 8 /* up to 1Mb */ 558#endif 559 560/* 561 * Do not go above this order unless 0 objects fit into the slab. 562 */ 563#define BREAK_GFP_ORDER_HI 1 564#define BREAK_GFP_ORDER_LO 0 565static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; 566 567/* Macros for storing/retrieving the cachep and or slab from the 568 * global 'mem_map'. These are used to find the slab an obj belongs to. 569 * With kfree(), these are used to find the cache which an obj belongs to. 570 */ 571#define SET_PAGE_CACHE(pg,x) ((pg)->lru.next = (struct list_head *)(x)) 572#define GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->lru.next) 573#define SET_PAGE_SLAB(pg,x) ((pg)->lru.prev = (struct list_head *)(x)) 574#define GET_PAGE_SLAB(pg) ((struct slab *)(pg)->lru.prev) 575 576/* These are the default caches for kmalloc. Custom caches can have other sizes. 
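 *
 * (Editor's illustrative note: __find_general_cachep() below simply walks
 * this table until it hits the first entry whose cs_size is >= the
 * requested size, so e.g. kmalloc(100) is typically served from the
 * size-128 general cache with the stock kmalloc_sizes.h table.)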
*/ 577struct cache_sizes malloc_sizes[] = { 578#define CACHE(x) { .cs_size = (x) }, 579#include <linux/kmalloc_sizes.h> 580 CACHE(ULONG_MAX) 581#undef CACHE 582}; 583EXPORT_SYMBOL(malloc_sizes); 584 585/* Must match cache_sizes above. Out of line to keep cache footprint low. */ 586struct cache_names { 587 char *name; 588 char *name_dma; 589}; 590 591static struct cache_names __initdata cache_names[] = { 592#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, 593#include <linux/kmalloc_sizes.h> 594 { NULL, } 595#undef CACHE 596}; 597 598static struct arraycache_init initarray_cache __initdata = 599 { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 600static struct arraycache_init initarray_generic = 601 { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 602 603/* internal cache of cache description objs */ 604static kmem_cache_t cache_cache = { 605 .batchcount = 1, 606 .limit = BOOT_CPUCACHE_ENTRIES, 607 .shared = 1, 608 .objsize = sizeof(kmem_cache_t), 609 .flags = SLAB_NO_REAP, 610 .spinlock = SPIN_LOCK_UNLOCKED, 611 .name = "kmem_cache", 612#if DEBUG 613 .reallen = sizeof(kmem_cache_t), 614#endif 615}; 616 617/* Guard access to the cache-chain. */ 618static struct semaphore cache_chain_sem; 619static struct list_head cache_chain; 620 621/* 622 * vm_enough_memory() looks at this to determine how many 623 * slab-allocated pages are possibly freeable under pressure 624 * 625 * SLAB_RECLAIM_ACCOUNT turns this on per-slab 626 */ 627atomic_t slab_reclaim_pages; 628 629/* 630 * chicken and egg problem: delay the per-cpu array allocation 631 * until the general caches are up. 632 */ 633static enum { 634 NONE, 635 PARTIAL_AC, 636 PARTIAL_L3, 637 FULL 638} g_cpucache_up; 639 640static DEFINE_PER_CPU(struct work_struct, reap_work); 641 642static void free_block(kmem_cache_t* cachep, void** objpp, int len); 643static void enable_cpucache (kmem_cache_t *cachep); 644static void cache_reap (void *unused); 645static int __node_shrink(kmem_cache_t *cachep, int node); 646 647static inline struct array_cache *ac_data(kmem_cache_t *cachep) 648{ 649 return cachep->array[smp_processor_id()]; 650} 651 652static inline kmem_cache_t *__find_general_cachep(size_t size, 653 unsigned int __nocast gfpflags) 654{ 655 struct cache_sizes *csizep = malloc_sizes; 656 657#if DEBUG 658 /* This happens if someone tries to call 659 * kmem_cache_create(), or __kmalloc(), before 660 * the generic caches are initialized. 661 */ 662 BUG_ON(csizep->cs_cachep == NULL); 663#endif 664 while (size > csizep->cs_size) 665 csizep++; 666 667 /* 668 * Really subtle: The last entry with cs->cs_size==ULONG_MAX 669 * has cs_{dma,}cachep==NULL. Thus no special case 670 * for large kmalloc calls required. 671 */ 672 if (unlikely(gfpflags & GFP_DMA)) 673 return csizep->cs_dmacachep; 674 return csizep->cs_cachep; 675} 676 677kmem_cache_t *kmem_find_general_cachep(size_t size, 678 unsigned int __nocast gfpflags) 679{ 680 return __find_general_cachep(size, gfpflags); 681} 682EXPORT_SYMBOL(kmem_find_general_cachep); 683 684/* Cal the num objs, wastage, and bytes left over for a given slab size. 
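 *
 * (Editor's worked example with illustrative numbers: for an order-0 slab
 * of 4096 bytes with on-slab management, object size 256 and align 32,
 * each object costs 256 bytes plus one kmem_bufctl_t. Assuming the struct
 * slab header plus the 15 bufctls round up to 96 bytes, 15 objects fit
 * (15*256 = 3840, and 3840 + 96 <= 4096), and the remaining 160 bytes are
 * returned in *left_over, which kmem_cache_create() later turns into the
 * cache colouring range.)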
*/ 685static void cache_estimate(unsigned long gfporder, size_t size, size_t align, 686 int flags, size_t *left_over, unsigned int *num) 687{ 688 int i; 689 size_t wastage = PAGE_SIZE<<gfporder; 690 size_t extra = 0; 691 size_t base = 0; 692 693 if (!(flags & CFLGS_OFF_SLAB)) { 694 base = sizeof(struct slab); 695 extra = sizeof(kmem_bufctl_t); 696 } 697 i = 0; 698 while (i*size + ALIGN(base+i*extra, align) <= wastage) 699 i++; 700 if (i > 0) 701 i--; 702 703 if (i > SLAB_LIMIT) 704 i = SLAB_LIMIT; 705 706 *num = i; 707 wastage -= i*size; 708 wastage -= ALIGN(base+i*extra, align); 709 *left_over = wastage; 710} 711 712#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) 713 714static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg) 715{ 716 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 717 function, cachep->name, msg); 718 dump_stack(); 719} 720 721/* 722 * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz 723 * via the workqueue/eventd. 724 * Add the CPU number into the expiration time to minimize the possibility of 725 * the CPUs getting into lockstep and contending for the global cache chain 726 * lock. 727 */ 728static void __devinit start_cpu_timer(int cpu) 729{ 730 struct work_struct *reap_work = &per_cpu(reap_work, cpu); 731 732 /* 733 * When this gets called from do_initcalls via cpucache_init(), 734 * init_workqueues() has already run, so keventd will be setup 735 * at that time. 736 */ 737 if (keventd_up() && reap_work->func == NULL) { 738 INIT_WORK(reap_work, cache_reap, NULL); 739 schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); 740 } 741} 742 743static struct array_cache *alloc_arraycache(int node, int entries, 744 int batchcount) 745{ 746 int memsize = sizeof(void*)*entries+sizeof(struct array_cache); 747 struct array_cache *nc = NULL; 748 749 nc = kmalloc_node(memsize, GFP_KERNEL, node); 750 if (nc) { 751 nc->avail = 0; 752 nc->limit = entries; 753 nc->batchcount = batchcount; 754 nc->touched = 0; 755 spin_lock_init(&nc->lock); 756 } 757 return nc; 758} 759 760#ifdef CONFIG_NUMA 761static inline struct array_cache **alloc_alien_cache(int node, int limit) 762{ 763 struct array_cache **ac_ptr; 764 int memsize = sizeof(void*)*MAX_NUMNODES; 765 int i; 766 767 if (limit > 1) 768 limit = 12; 769 ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node); 770 if (ac_ptr) { 771 for_each_node(i) { 772 if (i == node || !node_online(i)) { 773 ac_ptr[i] = NULL; 774 continue; 775 } 776 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); 777 if (!ac_ptr[i]) { 778 for (i--; i <=0; i--) 779 kfree(ac_ptr[i]); 780 kfree(ac_ptr); 781 return NULL; 782 } 783 } 784 } 785 return ac_ptr; 786} 787 788static inline void free_alien_cache(struct array_cache **ac_ptr) 789{ 790 int i; 791 792 if (!ac_ptr) 793 return; 794 795 for_each_node(i) 796 kfree(ac_ptr[i]); 797 798 kfree(ac_ptr); 799} 800 801static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache *ac, int node) 802{ 803 struct kmem_list3 *rl3 = cachep->nodelists[node]; 804 805 if (ac->avail) { 806 spin_lock(&rl3->list_lock); 807 free_block(cachep, ac->entry, ac->avail); 808 ac->avail = 0; 809 spin_unlock(&rl3->list_lock); 810 } 811} 812 813static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3) 814{ 815 int i=0; 816 struct array_cache *ac; 817 unsigned long flags; 818 819 for_each_online_node(i) { 820 ac = l3->alien[i]; 821 if (ac) { 822 spin_lock_irqsave(&ac->lock, flags); 823 __drain_alien_cache(cachep, ac, i); 824 
spin_unlock_irqrestore(&ac->lock, flags); 825 } 826 } 827} 828#else 829#define alloc_alien_cache(node, limit) do { } while (0) 830#define free_alien_cache(ac_ptr) do { } while (0) 831#define drain_alien_cache(cachep, l3) do { } while (0) 832#endif 833 834static int __devinit cpuup_callback(struct notifier_block *nfb, 835 unsigned long action, void *hcpu) 836{ 837 long cpu = (long)hcpu; 838 kmem_cache_t* cachep; 839 struct kmem_list3 *l3 = NULL; 840 int node = cpu_to_node(cpu); 841 int memsize = sizeof(struct kmem_list3); 842 struct array_cache *nc = NULL; 843 844 switch (action) { 845 case CPU_UP_PREPARE: 846 down(&cache_chain_sem); 847 /* we need to do this right in the beginning since 848 * alloc_arraycache's are going to use this list. 849 * kmalloc_node allows us to add the slab to the right 850 * kmem_list3 and not this cpu's kmem_list3 851 */ 852 853 list_for_each_entry(cachep, &cache_chain, next) { 854 /* setup the size64 kmemlist for cpu before we can 855 * begin anything. Make sure some other cpu on this 856 * node has not already allocated this 857 */ 858 if (!cachep->nodelists[node]) { 859 if (!(l3 = kmalloc_node(memsize, 860 GFP_KERNEL, node))) 861 goto bad; 862 kmem_list3_init(l3); 863 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 864 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 865 866 cachep->nodelists[node] = l3; 867 } 868 869 spin_lock_irq(&cachep->nodelists[node]->list_lock); 870 cachep->nodelists[node]->free_limit = 871 (1 + nr_cpus_node(node)) * 872 cachep->batchcount + cachep->num; 873 spin_unlock_irq(&cachep->nodelists[node]->list_lock); 874 } 875 876 /* Now we can go ahead with allocating the shared array's 877 & array cache's */ 878 list_for_each_entry(cachep, &cache_chain, next) { 879 nc = alloc_arraycache(node, cachep->limit, 880 cachep->batchcount); 881 if (!nc) 882 goto bad; 883 cachep->array[cpu] = nc; 884 885 l3 = cachep->nodelists[node]; 886 BUG_ON(!l3); 887 if (!l3->shared) { 888 if (!(nc = alloc_arraycache(node, 889 cachep->shared*cachep->batchcount, 890 0xbaadf00d))) 891 goto bad; 892 893 /* we are serialised from CPU_DEAD or 894 CPU_UP_CANCELLED by the cpucontrol lock */ 895 l3->shared = nc; 896 } 897 } 898 up(&cache_chain_sem); 899 break; 900 case CPU_ONLINE: 901 start_cpu_timer(cpu); 902 break; 903#ifdef CONFIG_HOTPLUG_CPU 904 case CPU_DEAD: 905 /* fall thru */ 906 case CPU_UP_CANCELED: 907 down(&cache_chain_sem); 908 909 list_for_each_entry(cachep, &cache_chain, next) { 910 struct array_cache *nc; 911 cpumask_t mask; 912 913 mask = node_to_cpumask(node); 914 spin_lock_irq(&cachep->spinlock); 915 /* cpu is dead; no one can alloc from it. 
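 *
 * (Editor's note summarizing the teardown below: the dead cpu's
 * array_cache is detached and freed, the node's free_limit is lowered,
 * and if no other cpu on the node remains online the shared and alien
 * arrays are drained and freed and the node's free slabs are released
 * via __node_shrink().)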
*/ 916 nc = cachep->array[cpu]; 917 cachep->array[cpu] = NULL; 918 l3 = cachep->nodelists[node]; 919 920 if (!l3) 921 goto unlock_cache; 922 923 spin_lock(&l3->list_lock); 924 925 /* Free limit for this kmem_list3 */ 926 l3->free_limit -= cachep->batchcount; 927 if (nc) 928 free_block(cachep, nc->entry, nc->avail); 929 930 if (!cpus_empty(mask)) { 931 spin_unlock(&l3->list_lock); 932 goto unlock_cache; 933 } 934 935 if (l3->shared) { 936 free_block(cachep, l3->shared->entry, 937 l3->shared->avail); 938 kfree(l3->shared); 939 l3->shared = NULL; 940 } 941 if (l3->alien) { 942 drain_alien_cache(cachep, l3); 943 free_alien_cache(l3->alien); 944 l3->alien = NULL; 945 } 946 947 /* free slabs belonging to this node */ 948 if (__node_shrink(cachep, node)) { 949 cachep->nodelists[node] = NULL; 950 spin_unlock(&l3->list_lock); 951 kfree(l3); 952 } else { 953 spin_unlock(&l3->list_lock); 954 } 955unlock_cache: 956 spin_unlock_irq(&cachep->spinlock); 957 kfree(nc); 958 } 959 up(&cache_chain_sem); 960 break; 961#endif 962 } 963 return NOTIFY_OK; 964bad: 965 up(&cache_chain_sem); 966 return NOTIFY_BAD; 967} 968 969static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; 970 971/* 972 * swap the static kmem_list3 with kmalloced memory 973 */ 974static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, 975 int nodeid) 976{ 977 struct kmem_list3 *ptr; 978 979 BUG_ON(cachep->nodelists[nodeid] != list); 980 ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid); 981 BUG_ON(!ptr); 982 983 local_irq_disable(); 984 memcpy(ptr, list, sizeof(struct kmem_list3)); 985 MAKE_ALL_LISTS(cachep, ptr, nodeid); 986 cachep->nodelists[nodeid] = ptr; 987 local_irq_enable(); 988} 989 990/* Initialisation. 991 * Called after the gfp() functions have been enabled, and before smp_init(). 992 */ 993void __init kmem_cache_init(void) 994{ 995 size_t left_over; 996 struct cache_sizes *sizes; 997 struct cache_names *names; 998 int i; 999 1000 for (i = 0; i < NUM_INIT_LISTS; i++) { 1001 kmem_list3_init(&initkmem_list3[i]); 1002 if (i < MAX_NUMNODES) 1003 cache_cache.nodelists[i] = NULL; 1004 } 1005 1006 /* 1007 * Fragmentation resistance on low memory - only use bigger 1008 * page orders on machines with more than 32MB of memory. 1009 */ 1010 if (num_physpages > (32 << 20) >> PAGE_SHIFT) 1011 slab_break_gfp_order = BREAK_GFP_ORDER_HI; 1012 1013 /* Bootstrap is tricky, because several objects are allocated 1014 * from caches that do not exist yet: 1015 * 1) initialize the cache_cache cache: it contains the kmem_cache_t 1016 * structures of all caches, except cache_cache itself: cache_cache 1017 * is statically allocated. 1018 * Initially an __init data area is used for the head array and the 1019 * kmem_list3 structures, it's replaced with a kmalloc allocated 1020 * array at the end of the bootstrap. 1021 * 2) Create the first kmalloc cache. 1022 * The kmem_cache_t for the new cache is allocated normally. 1023 * An __init data area is used for the head array. 1024 * 3) Create the remaining kmalloc caches, with minimally sized 1025 * head arrays. 1026 * 4) Replace the __init data head arrays for cache_cache and the first 1027 * kmalloc cache with kmalloc allocated arrays. 1028 * 5) Replace the __init data for kmem_list3 for cache_cache and 1029 * the other cache's with kmalloc allocated memory. 1030 * 6) Resize the head arrays of the kmalloc caches to their final sizes. 
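 *
 * (Editor's note: the g_cpucache_up enum above tracks this progression,
 * NONE -> PARTIAL_AC -> PARTIAL_L3 -> FULL, so that kmem_cache_create()
 * knows whether per-cpu arrays and kmem_list3 structures can already be
 * kmalloc'ed or must still come from the static __initdata copies.)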
1031 */ 1032 1033 /* 1) create the cache_cache */ 1034 init_MUTEX(&cache_chain_sem); 1035 INIT_LIST_HEAD(&cache_chain); 1036 list_add(&cache_cache.next, &cache_chain); 1037 cache_cache.colour_off = cache_line_size(); 1038 cache_cache.array[smp_processor_id()] = &initarray_cache.cache; 1039 cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE]; 1040 1041 cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); 1042 1043 cache_estimate(0, cache_cache.objsize, cache_line_size(), 0, 1044 &left_over, &cache_cache.num); 1045 if (!cache_cache.num) 1046 BUG(); 1047 1048 cache_cache.colour = left_over/cache_cache.colour_off; 1049 cache_cache.colour_next = 0; 1050 cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) + 1051 sizeof(struct slab), cache_line_size()); 1052 1053 /* 2+3) create the kmalloc caches */ 1054 sizes = malloc_sizes; 1055 names = cache_names; 1056 1057 /* Initialize the caches that provide memory for the array cache 1058 * and the kmem_list3 structures first. 1059 * Without this, further allocations will bug 1060 */ 1061 1062 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, 1063 sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN, 1064 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); 1065 1066 if (INDEX_AC != INDEX_L3) 1067 sizes[INDEX_L3].cs_cachep = 1068 kmem_cache_create(names[INDEX_L3].name, 1069 sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN, 1070 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); 1071 1072 while (sizes->cs_size != ULONG_MAX) { 1073 /* 1074 * For performance, all the general caches are L1 aligned. 1075 * This should be particularly beneficial on SMP boxes, as it 1076 * eliminates "false sharing". 1077 * Note for systems short on memory removing the alignment will 1078 * allow tighter packing of the smaller caches. 1079 */ 1080 if(!sizes->cs_cachep) 1081 sizes->cs_cachep = kmem_cache_create(names->name, 1082 sizes->cs_size, ARCH_KMALLOC_MINALIGN, 1083 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); 1084 1085 /* Inc off-slab bufctl limit until the ceiling is hit. 
*/ 1086 if (!(OFF_SLAB(sizes->cs_cachep))) { 1087 offslab_limit = sizes->cs_size-sizeof(struct slab); 1088 offslab_limit /= sizeof(kmem_bufctl_t); 1089 } 1090 1091 sizes->cs_dmacachep = kmem_cache_create(names->name_dma, 1092 sizes->cs_size, ARCH_KMALLOC_MINALIGN, 1093 (ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC), 1094 NULL, NULL); 1095 1096 sizes++; 1097 names++; 1098 } 1099 /* 4) Replace the bootstrap head arrays */ 1100 { 1101 void * ptr; 1102 1103 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1104 1105 local_irq_disable(); 1106 BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); 1107 memcpy(ptr, ac_data(&cache_cache), 1108 sizeof(struct arraycache_init)); 1109 cache_cache.array[smp_processor_id()] = ptr; 1110 local_irq_enable(); 1111 1112 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1113 1114 local_irq_disable(); 1115 BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep) 1116 != &initarray_generic.cache); 1117 memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep), 1118 sizeof(struct arraycache_init)); 1119 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = 1120 ptr; 1121 local_irq_enable(); 1122 } 1123 /* 5) Replace the bootstrap kmem_list3's */ 1124 { 1125 int node; 1126 /* Replace the static kmem_list3 structures for the boot cpu */ 1127 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], 1128 numa_node_id()); 1129 1130 for_each_online_node(node) { 1131 init_list(malloc_sizes[INDEX_AC].cs_cachep, 1132 &initkmem_list3[SIZE_AC+node], node); 1133 1134 if (INDEX_AC != INDEX_L3) { 1135 init_list(malloc_sizes[INDEX_L3].cs_cachep, 1136 &initkmem_list3[SIZE_L3+node], 1137 node); 1138 } 1139 } 1140 } 1141 1142 /* 6) resize the head arrays to their final sizes */ 1143 { 1144 kmem_cache_t *cachep; 1145 down(&cache_chain_sem); 1146 list_for_each_entry(cachep, &cache_chain, next) 1147 enable_cpucache(cachep); 1148 up(&cache_chain_sem); 1149 } 1150 1151 /* Done! */ 1152 g_cpucache_up = FULL; 1153 1154 /* Register a cpu startup notifier callback 1155 * that initializes ac_data for all new cpus 1156 */ 1157 register_cpu_notifier(&cpucache_notifier); 1158 1159 /* The reap timers are started later, with a module init call: 1160 * That part of the kernel is not yet operational. 1161 */ 1162} 1163 1164static int __init cpucache_init(void) 1165{ 1166 int cpu; 1167 1168 /* 1169 * Register the timers that return unneeded 1170 * pages to gfp. 1171 */ 1172 for_each_online_cpu(cpu) 1173 start_cpu_timer(cpu); 1174 1175 return 0; 1176} 1177 1178__initcall(cpucache_init); 1179 1180/* 1181 * Interface to system's page allocator. No need to hold the cache-lock. 1182 * 1183 * If we requested dmaable memory, we will get it. Even if we 1184 * did not request dmaable memory, we might get it, but that 1185 * would be relatively rare and ignorable. 1186 */ 1187static void *kmem_getpages(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid) 1188{ 1189 struct page *page; 1190 void *addr; 1191 int i; 1192 1193 flags |= cachep->gfpflags; 1194 if (likely(nodeid == -1)) { 1195 page = alloc_pages(flags, cachep->gfporder); 1196 } else { 1197 page = alloc_pages_node(nodeid, flags, cachep->gfporder); 1198 } 1199 if (!page) 1200 return NULL; 1201 addr = page_address(page); 1202 1203 i = (1 << cachep->gfporder); 1204 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1205 atomic_add(i, &slab_reclaim_pages); 1206 add_page_state(nr_slab, i); 1207 while (i--) { 1208 SetPageSlab(page); 1209 page++; 1210 } 1211 return addr; 1212} 1213 1214/* 1215 * Interface to system's page release. 
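 *
 * (Editor's note: this is the inverse of kmem_getpages() above - it clears
 * PG_slab on each page, updates the nr_slab page state and the
 * slab_reclaim_pages counter, credits current->reclaim_state, and hands
 * the pages back with free_pages().)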
1216 */ 1217static void kmem_freepages(kmem_cache_t *cachep, void *addr) 1218{ 1219 unsigned long i = (1<<cachep->gfporder); 1220 struct page *page = virt_to_page(addr); 1221 const unsigned long nr_freed = i; 1222 1223 while (i--) { 1224 if (!TestClearPageSlab(page)) 1225 BUG(); 1226 page++; 1227 } 1228 sub_page_state(nr_slab, nr_freed); 1229 if (current->reclaim_state) 1230 current->reclaim_state->reclaimed_slab += nr_freed; 1231 free_pages((unsigned long)addr, cachep->gfporder); 1232 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1233 atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages); 1234} 1235 1236static void kmem_rcu_free(struct rcu_head *head) 1237{ 1238 struct slab_rcu *slab_rcu = (struct slab_rcu *) head; 1239 kmem_cache_t *cachep = slab_rcu->cachep; 1240 1241 kmem_freepages(cachep, slab_rcu->addr); 1242 if (OFF_SLAB(cachep)) 1243 kmem_cache_free(cachep->slabp_cache, slab_rcu); 1244} 1245 1246#if DEBUG 1247 1248#ifdef CONFIG_DEBUG_PAGEALLOC 1249static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, 1250 unsigned long caller) 1251{ 1252 int size = obj_reallen(cachep); 1253 1254 addr = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)]; 1255 1256 if (size < 5*sizeof(unsigned long)) 1257 return; 1258 1259 *addr++=0x12345678; 1260 *addr++=caller; 1261 *addr++=smp_processor_id(); 1262 size -= 3*sizeof(unsigned long); 1263 { 1264 unsigned long *sptr = &caller; 1265 unsigned long svalue; 1266 1267 while (!kstack_end(sptr)) { 1268 svalue = *sptr++; 1269 if (kernel_text_address(svalue)) { 1270 *addr++=svalue; 1271 size -= sizeof(unsigned long); 1272 if (size <= sizeof(unsigned long)) 1273 break; 1274 } 1275 } 1276 1277 } 1278 *addr++=0x87654321; 1279} 1280#endif 1281 1282static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) 1283{ 1284 int size = obj_reallen(cachep); 1285 addr = &((char*)addr)[obj_dbghead(cachep)]; 1286 1287 memset(addr, val, size); 1288 *(unsigned char *)(addr+size-1) = POISON_END; 1289} 1290 1291static void dump_line(char *data, int offset, int limit) 1292{ 1293 int i; 1294 printk(KERN_ERR "%03x:", offset); 1295 for (i=0;i<limit;i++) { 1296 printk(" %02x", (unsigned char)data[offset+i]); 1297 } 1298 printk("\n"); 1299} 1300#endif 1301 1302#if DEBUG 1303 1304static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines) 1305{ 1306 int i, size; 1307 char *realobj; 1308 1309 if (cachep->flags & SLAB_RED_ZONE) { 1310 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", 1311 *dbg_redzone1(cachep, objp), 1312 *dbg_redzone2(cachep, objp)); 1313 } 1314 1315 if (cachep->flags & SLAB_STORE_USER) { 1316 printk(KERN_ERR "Last user: [<%p>]", 1317 *dbg_userword(cachep, objp)); 1318 print_symbol("(%s)", 1319 (unsigned long)*dbg_userword(cachep, objp)); 1320 printk("\n"); 1321 } 1322 realobj = (char*)objp+obj_dbghead(cachep); 1323 size = obj_reallen(cachep); 1324 for (i=0; i<size && lines;i+=16, lines--) { 1325 int limit; 1326 limit = 16; 1327 if (i+limit > size) 1328 limit = size-i; 1329 dump_line(realobj, i, limit); 1330 } 1331} 1332 1333static void check_poison_obj(kmem_cache_t *cachep, void *objp) 1334{ 1335 char *realobj; 1336 int size, i; 1337 int lines = 0; 1338 1339 realobj = (char*)objp+obj_dbghead(cachep); 1340 size = obj_reallen(cachep); 1341 1342 for (i=0;i<size;i++) { 1343 char exp = POISON_FREE; 1344 if (i == size-1) 1345 exp = POISON_END; 1346 if (realobj[i] != exp) { 1347 int limit; 1348 /* Mismatch ! 
*/ 1349 /* Print header */ 1350 if (lines == 0) { 1351 printk(KERN_ERR "Slab corruption: start=%p, len=%d\n", 1352 realobj, size); 1353 print_objinfo(cachep, objp, 0); 1354 } 1355 /* Hexdump the affected line */ 1356 i = (i/16)*16; 1357 limit = 16; 1358 if (i+limit > size) 1359 limit = size-i; 1360 dump_line(realobj, i, limit); 1361 i += 16; 1362 lines++; 1363 /* Limit to 5 lines */ 1364 if (lines > 5) 1365 break; 1366 } 1367 } 1368 if (lines != 0) { 1369 /* Print some data about the neighboring objects, if they 1370 * exist: 1371 */ 1372 struct slab *slabp = GET_PAGE_SLAB(virt_to_page(objp)); 1373 int objnr; 1374 1375 objnr = (objp-slabp->s_mem)/cachep->objsize; 1376 if (objnr) { 1377 objp = slabp->s_mem+(objnr-1)*cachep->objsize; 1378 realobj = (char*)objp+obj_dbghead(cachep); 1379 printk(KERN_ERR "Prev obj: start=%p, len=%d\n", 1380 realobj, size); 1381 print_objinfo(cachep, objp, 2); 1382 } 1383 if (objnr+1 < cachep->num) { 1384 objp = slabp->s_mem+(objnr+1)*cachep->objsize; 1385 realobj = (char*)objp+obj_dbghead(cachep); 1386 printk(KERN_ERR "Next obj: start=%p, len=%d\n", 1387 realobj, size); 1388 print_objinfo(cachep, objp, 2); 1389 } 1390 } 1391} 1392#endif 1393 1394/* Destroy all the objs in a slab, and release the mem back to the system. 1395 * Before calling the slab must have been unlinked from the cache. 1396 * The cache-lock is not held/needed. 1397 */ 1398static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) 1399{ 1400 void *addr = slabp->s_mem - slabp->colouroff; 1401 1402#if DEBUG 1403 int i; 1404 for (i = 0; i < cachep->num; i++) { 1405 void *objp = slabp->s_mem + cachep->objsize * i; 1406 1407 if (cachep->flags & SLAB_POISON) { 1408#ifdef CONFIG_DEBUG_PAGEALLOC 1409 if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep)) 1410 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1); 1411 else 1412 check_poison_obj(cachep, objp); 1413#else 1414 check_poison_obj(cachep, objp); 1415#endif 1416 } 1417 if (cachep->flags & SLAB_RED_ZONE) { 1418 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 1419 slab_error(cachep, "start of a freed object " 1420 "was overwritten"); 1421 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 1422 slab_error(cachep, "end of a freed object " 1423 "was overwritten"); 1424 } 1425 if (cachep->dtor && !(cachep->flags & SLAB_POISON)) 1426 (cachep->dtor)(objp+obj_dbghead(cachep), cachep, 0); 1427 } 1428#else 1429 if (cachep->dtor) { 1430 int i; 1431 for (i = 0; i < cachep->num; i++) { 1432 void* objp = slabp->s_mem+cachep->objsize*i; 1433 (cachep->dtor)(objp, cachep, 0); 1434 } 1435 } 1436#endif 1437 1438 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { 1439 struct slab_rcu *slab_rcu; 1440 1441 slab_rcu = (struct slab_rcu *) slabp; 1442 slab_rcu->cachep = cachep; 1443 slab_rcu->addr = addr; 1444 call_rcu(&slab_rcu->head, kmem_rcu_free); 1445 } else { 1446 kmem_freepages(cachep, addr); 1447 if (OFF_SLAB(cachep)) 1448 kmem_cache_free(cachep->slabp_cache, slabp); 1449 } 1450} 1451 1452/* For setting up all the kmem_list3s for cache whose objsize is same 1453 as size of kmem_list3. */ 1454static inline void set_up_list3s(kmem_cache_t *cachep, int index) 1455{ 1456 int node; 1457 1458 for_each_online_node(node) { 1459 cachep->nodelists[node] = &initkmem_list3[index+node]; 1460 cachep->nodelists[node]->next_reap = jiffies + 1461 REAPTIMEOUT_LIST3 + 1462 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 1463 } 1464} 1465 1466/** 1467 * kmem_cache_create - Create a cache. 
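 *
 * Editor's illustrative usage sketch (hypothetical cache and names, not
 * part of the original documentation):
 *
 *	static kmem_cache_t *foo_cachep;
 *
 *	static void foo_ctor(void *obj, kmem_cache_t *cachep, unsigned long flags)
 *	{
 *		memset(obj, 0, sizeof(struct foo));
 *	}
 *
 *	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
 *				       SLAB_HWCACHE_ALIGN, foo_ctor, NULL);
 *	struct foo *obj = kmem_cache_alloc(foo_cachep, SLAB_KERNEL);
 *	...
 *	kmem_cache_free(foo_cachep, obj);
 *	kmem_cache_destroy(foo_cachep);
 *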
1468 * @name: A string which is used in /proc/slabinfo to identify this cache. 1469 * @size: The size of objects to be created in this cache. 1470 * @align: The required alignment for the objects. 1471 * @flags: SLAB flags 1472 * @ctor: A constructor for the objects. 1473 * @dtor: A destructor for the objects. 1474 * 1475 * Returns a ptr to the cache on success, NULL on failure. 1476 * Cannot be called within a int, but can be interrupted. 1477 * The @ctor is run when new pages are allocated by the cache 1478 * and the @dtor is run before the pages are handed back. 1479 * 1480 * @name must be valid until the cache is destroyed. This implies that 1481 * the module calling this has to destroy the cache before getting 1482 * unloaded. 1483 * 1484 * The flags are 1485 * 1486 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) 1487 * to catch references to uninitialised memory. 1488 * 1489 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check 1490 * for buffer overruns. 1491 * 1492 * %SLAB_NO_REAP - Don't automatically reap this cache when we're under 1493 * memory pressure. 1494 * 1495 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware 1496 * cacheline. This can be beneficial if you're counting cycles as closely 1497 * as davem. 1498 */ 1499kmem_cache_t * 1500kmem_cache_create (const char *name, size_t size, size_t align, 1501 unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long), 1502 void (*dtor)(void*, kmem_cache_t *, unsigned long)) 1503{ 1504 size_t left_over, slab_size, ralign; 1505 kmem_cache_t *cachep = NULL; 1506 1507 /* 1508 * Sanity checks... these are all serious usage bugs. 1509 */ 1510 if ((!name) || 1511 in_interrupt() || 1512 (size < BYTES_PER_WORD) || 1513 (size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) || 1514 (dtor && !ctor)) { 1515 printk(KERN_ERR "%s: Early error in slab %s\n", 1516 __FUNCTION__, name); 1517 BUG(); 1518 } 1519 1520#if DEBUG 1521 WARN_ON(strchr(name, ' ')); /* It confuses parsers */ 1522 if ((flags & SLAB_DEBUG_INITIAL) && !ctor) { 1523 /* No constructor, but inital state check requested */ 1524 printk(KERN_ERR "%s: No con, but init state check " 1525 "requested - %s\n", __FUNCTION__, name); 1526 flags &= ~SLAB_DEBUG_INITIAL; 1527 } 1528 1529#if FORCED_DEBUG 1530 /* 1531 * Enable redzoning and last user accounting, except for caches with 1532 * large objects, if the increased size would increase the object size 1533 * above the next power of two: caches with object sizes just above a 1534 * power of two have a significant amount of internal fragmentation. 1535 */ 1536 if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD))) 1537 flags |= SLAB_RED_ZONE|SLAB_STORE_USER; 1538 if (!(flags & SLAB_DESTROY_BY_RCU)) 1539 flags |= SLAB_POISON; 1540#endif 1541 if (flags & SLAB_DESTROY_BY_RCU) 1542 BUG_ON(flags & SLAB_POISON); 1543#endif 1544 if (flags & SLAB_DESTROY_BY_RCU) 1545 BUG_ON(dtor); 1546 1547 /* 1548 * Always checks flags, a caller might be expecting debug 1549 * support which isn't available. 1550 */ 1551 if (flags & ~CREATE_MASK) 1552 BUG(); 1553 1554 /* Check that size is in terms of words. This is needed to avoid 1555 * unaligned accesses for some archs when redzoning is used, and makes 1556 * sure any on-slab bufctl's are also correctly aligned. 
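 *
 * (Editor's example: on an arch with 4-byte words a 13-byte object is
 * rounded up to 16 bytes here: size += 3; size &= ~3.)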
1557 */ 1558 if (size & (BYTES_PER_WORD-1)) { 1559 size += (BYTES_PER_WORD-1); 1560 size &= ~(BYTES_PER_WORD-1); 1561 } 1562 1563 /* calculate out the final buffer alignment: */ 1564 /* 1) arch recommendation: can be overridden for debug */ 1565 if (flags & SLAB_HWCACHE_ALIGN) { 1566 /* Default alignment: as specified by the arch code. 1567 * Except if an object is really small, then squeeze multiple 1568 * objects into one cacheline. 1569 */ 1570 ralign = cache_line_size(); 1571 while (size <= ralign/2) 1572 ralign /= 2; 1573 } else { 1574 ralign = BYTES_PER_WORD; 1575 } 1576 /* 2) arch mandated alignment: disables debug if necessary */ 1577 if (ralign < ARCH_SLAB_MINALIGN) { 1578 ralign = ARCH_SLAB_MINALIGN; 1579 if (ralign > BYTES_PER_WORD) 1580 flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); 1581 } 1582 /* 3) caller mandated alignment: disables debug if necessary */ 1583 if (ralign < align) { 1584 ralign = align; 1585 if (ralign > BYTES_PER_WORD) 1586 flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); 1587 } 1588 /* 4) Store it. Note that the debug code below can reduce 1589 * the alignment to BYTES_PER_WORD. 1590 */ 1591 align = ralign; 1592 1593 /* Get cache's description obj. */ 1594 cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL); 1595 if (!cachep) 1596 goto opps; 1597 memset(cachep, 0, sizeof(kmem_cache_t)); 1598 1599#if DEBUG 1600 cachep->reallen = size; 1601 1602 if (flags & SLAB_RED_ZONE) { 1603 /* redzoning only works with word aligned caches */ 1604 align = BYTES_PER_WORD; 1605 1606 /* add space for red zone words */ 1607 cachep->dbghead += BYTES_PER_WORD; 1608 size += 2*BYTES_PER_WORD; 1609 } 1610 if (flags & SLAB_STORE_USER) { 1611 /* user store requires word alignment and 1612 * one word storage behind the end of the real 1613 * object. 1614 */ 1615 align = BYTES_PER_WORD; 1616 size += BYTES_PER_WORD; 1617 } 1618#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 1619 if (size >= malloc_sizes[INDEX_L3+1].cs_size && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { 1620 cachep->dbghead += PAGE_SIZE - size; 1621 size = PAGE_SIZE; 1622 } 1623#endif 1624#endif 1625 1626 /* Determine if the slab management is 'on' or 'off' slab. */ 1627 if (size >= (PAGE_SIZE>>3)) 1628 /* 1629 * Size is large, assume best to place the slab management obj 1630 * off-slab (should allow better packing of objs). 1631 */ 1632 flags |= CFLGS_OFF_SLAB; 1633 1634 size = ALIGN(size, align); 1635 1636 if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) { 1637 /* 1638 * A VFS-reclaimable slab tends to have most allocations 1639 * as GFP_NOFS and we really don't want to have to be allocating 1640 * higher-order pages when we are unable to shrink dcache. 1641 */ 1642 cachep->gfporder = 0; 1643 cache_estimate(cachep->gfporder, size, align, flags, 1644 &left_over, &cachep->num); 1645 } else { 1646 /* 1647 * Calculate size (in pages) of slabs, and the num of objs per 1648 * slab. This could be made much more intelligent. For now, 1649 * try to avoid using high page-orders for slabs. When the 1650 * gfp() funcs are more friendly towards high-order requests, 1651 * this should be changed. 1652 */ 1653 do { 1654 unsigned int break_flag = 0; 1655cal_wastage: 1656 cache_estimate(cachep->gfporder, size, align, flags, 1657 &left_over, &cachep->num); 1658 if (break_flag) 1659 break; 1660 if (cachep->gfporder >= MAX_GFP_ORDER) 1661 break; 1662 if (!cachep->num) 1663 goto next; 1664 if (flags & CFLGS_OFF_SLAB && 1665 cachep->num > offslab_limit) { 1666 /* This num of objs will cause problems. 
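 *
 * (Editor's note: with CFLGS_OFF_SLAB the slab management structure lives
 * in a general cache and can describe at most offslab_limit objects, so
 * drop back one page order and recompute the estimate; see the
 * offslab_limit comment near the top of the file.)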
*/ 1667 cachep->gfporder--; 1668 break_flag++; 1669 goto cal_wastage; 1670 } 1671 1672 /* 1673 * Large num of objs is good, but v. large slabs are 1674 * currently bad for the gfp()s. 1675 */ 1676 if (cachep->gfporder >= slab_break_gfp_order) 1677 break; 1678 1679 if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder)) 1680 break; /* Acceptable internal fragmentation. */ 1681next: 1682 cachep->gfporder++; 1683 } while (1); 1684 } 1685 1686 if (!cachep->num) { 1687 printk("kmem_cache_create: couldn't create cache %s.\n", name); 1688 kmem_cache_free(&cache_cache, cachep); 1689 cachep = NULL; 1690 goto opps; 1691 } 1692 slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t) 1693 + sizeof(struct slab), align); 1694 1695 /* 1696 * If the slab has been placed off-slab, and we have enough space then 1697 * move it on-slab. This is at the expense of any extra colouring. 1698 */ 1699 if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) { 1700 flags &= ~CFLGS_OFF_SLAB; 1701 left_over -= slab_size; 1702 } 1703 1704 if (flags & CFLGS_OFF_SLAB) { 1705 /* really off slab. No need for manual alignment */ 1706 slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab); 1707 } 1708 1709 cachep->colour_off = cache_line_size(); 1710 /* Offset must be a multiple of the alignment. */ 1711 if (cachep->colour_off < align) 1712 cachep->colour_off = align; 1713 cachep->colour = left_over/cachep->colour_off; 1714 cachep->slab_size = slab_size; 1715 cachep->flags = flags; 1716 cachep->gfpflags = 0; 1717 if (flags & SLAB_CACHE_DMA) 1718 cachep->gfpflags |= GFP_DMA; 1719 spin_lock_init(&cachep->spinlock); 1720 cachep->objsize = size; 1721 1722 if (flags & CFLGS_OFF_SLAB) 1723 cachep->slabp_cache = kmem_find_general_cachep(slab_size,0); 1724 cachep->ctor = ctor; 1725 cachep->dtor = dtor; 1726 cachep->name = name; 1727 1728 /* Don't let CPUs to come and go */ 1729 lock_cpu_hotplug(); 1730 1731 if (g_cpucache_up == FULL) { 1732 enable_cpucache(cachep); 1733 } else { 1734 if (g_cpucache_up == NONE) { 1735 /* Note: the first kmem_cache_create must create 1736 * the cache that's used by kmalloc(24), otherwise 1737 * the creation of further caches will BUG(). 1738 */ 1739 cachep->array[smp_processor_id()] = 1740 &initarray_generic.cache; 1741 1742 /* If the cache that's used by 1743 * kmalloc(sizeof(kmem_list3)) is the first cache, 1744 * then we need to set up all its list3s, otherwise 1745 * the creation of further caches will BUG(). 
1746 */ 1747 set_up_list3s(cachep, SIZE_AC); 1748 if (INDEX_AC == INDEX_L3) 1749 g_cpucache_up = PARTIAL_L3; 1750 else 1751 g_cpucache_up = PARTIAL_AC; 1752 } else { 1753 cachep->array[smp_processor_id()] = 1754 kmalloc(sizeof(struct arraycache_init), 1755 GFP_KERNEL); 1756 1757 if (g_cpucache_up == PARTIAL_AC) { 1758 set_up_list3s(cachep, SIZE_L3); 1759 g_cpucache_up = PARTIAL_L3; 1760 } else { 1761 int node; 1762 for_each_online_node(node) { 1763 1764 cachep->nodelists[node] = 1765 kmalloc_node(sizeof(struct kmem_list3), 1766 GFP_KERNEL, node); 1767 BUG_ON(!cachep->nodelists[node]); 1768 kmem_list3_init(cachep->nodelists[node]); 1769 } 1770 } 1771 } 1772 cachep->nodelists[numa_node_id()]->next_reap = 1773 jiffies + REAPTIMEOUT_LIST3 + 1774 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 1775 1776 BUG_ON(!ac_data(cachep)); 1777 ac_data(cachep)->avail = 0; 1778 ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES; 1779 ac_data(cachep)->batchcount = 1; 1780 ac_data(cachep)->touched = 0; 1781 cachep->batchcount = 1; 1782 cachep->limit = BOOT_CPUCACHE_ENTRIES; 1783 } 1784 1785 /* Need the semaphore to access the chain. */ 1786 down(&cache_chain_sem); 1787 { 1788 struct list_head *p; 1789 mm_segment_t old_fs; 1790 1791 old_fs = get_fs(); 1792 set_fs(KERNEL_DS); 1793 list_for_each(p, &cache_chain) { 1794 kmem_cache_t *pc = list_entry(p, kmem_cache_t, next); 1795 char tmp; 1796 /* This happens when the module gets unloaded and doesn't 1797 destroy its slab cache and noone else reuses the vmalloc 1798 area of the module. Print a warning. */ 1799 if (__get_user(tmp,pc->name)) { 1800 printk("SLAB: cache with size %d has lost its name\n", 1801 pc->objsize); 1802 continue; 1803 } 1804 if (!strcmp(pc->name,name)) { 1805 printk("kmem_cache_create: duplicate cache %s\n",name); 1806 up(&cache_chain_sem); 1807 unlock_cpu_hotplug(); 1808 BUG(); 1809 } 1810 } 1811 set_fs(old_fs); 1812 } 1813 1814 /* cache setup completed, link it into the list */ 1815 list_add(&cachep->next, &cache_chain); 1816 up(&cache_chain_sem); 1817 unlock_cpu_hotplug(); 1818opps: 1819 if (!cachep && (flags & SLAB_PANIC)) 1820 panic("kmem_cache_create(): failed to create slab `%s'\n", 1821 name); 1822 return cachep; 1823} 1824EXPORT_SYMBOL(kmem_cache_create); 1825 1826#if DEBUG 1827static void check_irq_off(void) 1828{ 1829 BUG_ON(!irqs_disabled()); 1830} 1831 1832static void check_irq_on(void) 1833{ 1834 BUG_ON(irqs_disabled()); 1835} 1836 1837static void check_spinlock_acquired(kmem_cache_t *cachep) 1838{ 1839#ifdef CONFIG_SMP 1840 check_irq_off(); 1841 assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock); 1842#endif 1843} 1844 1845static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node) 1846{ 1847#ifdef CONFIG_SMP 1848 check_irq_off(); 1849 assert_spin_locked(&cachep->nodelists[node]->list_lock); 1850#endif 1851} 1852 1853#else 1854#define check_irq_off() do { } while(0) 1855#define check_irq_on() do { } while(0) 1856#define check_spinlock_acquired(x) do { } while(0) 1857#define check_spinlock_acquired_node(x, y) do { } while(0) 1858#endif 1859 1860/* 1861 * Waits for all CPUs to execute func(). 
1862 */ 1863static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg) 1864{ 1865 check_irq_on(); 1866 preempt_disable(); 1867 1868 local_irq_disable(); 1869 func(arg); 1870 local_irq_enable(); 1871 1872 if (smp_call_function(func, arg, 1, 1)) 1873 BUG(); 1874 1875 preempt_enable(); 1876} 1877 1878static void drain_array_locked(kmem_cache_t* cachep, 1879 struct array_cache *ac, int force, int node); 1880 1881static void do_drain(void *arg) 1882{ 1883 kmem_cache_t *cachep = (kmem_cache_t*)arg; 1884 struct array_cache *ac; 1885 1886 check_irq_off(); 1887 ac = ac_data(cachep); 1888 spin_lock(&cachep->nodelists[numa_node_id()]->list_lock); 1889 free_block(cachep, ac->entry, ac->avail); 1890 spin_unlock(&cachep->nodelists[numa_node_id()]->list_lock); 1891 ac->avail = 0; 1892} 1893 1894static void drain_cpu_caches(kmem_cache_t *cachep) 1895{ 1896 struct kmem_list3 *l3; 1897 int node; 1898 1899 smp_call_function_all_cpus(do_drain, cachep); 1900 check_irq_on(); 1901 spin_lock_irq(&cachep->spinlock); 1902 for_each_online_node(node) { 1903 l3 = cachep->nodelists[node]; 1904 if (l3) { 1905 spin_lock(&l3->list_lock); 1906 drain_array_locked(cachep, l3->shared, 1, node); 1907 spin_unlock(&l3->list_lock); 1908 if (l3->alien) 1909 drain_alien_cache(cachep, l3); 1910 } 1911 } 1912 spin_unlock_irq(&cachep->spinlock); 1913} 1914 1915static int __node_shrink(kmem_cache_t *cachep, int node) 1916{ 1917 struct slab *slabp; 1918 struct kmem_list3 *l3 = cachep->nodelists[node]; 1919 int ret; 1920 1921 for (;;) { 1922 struct list_head *p; 1923 1924 p = l3->slabs_free.prev; 1925 if (p == &l3->slabs_free) 1926 break; 1927 1928 slabp = list_entry(l3->slabs_free.prev, struct slab, list); 1929#if DEBUG 1930 if (slabp->inuse) 1931 BUG(); 1932#endif 1933 list_del(&slabp->list); 1934 1935 l3->free_objects -= cachep->num; 1936 spin_unlock_irq(&l3->list_lock); 1937 slab_destroy(cachep, slabp); 1938 spin_lock_irq(&l3->list_lock); 1939 } 1940 ret = !list_empty(&l3->slabs_full) || 1941 !list_empty(&l3->slabs_partial); 1942 return ret; 1943} 1944 1945static int __cache_shrink(kmem_cache_t *cachep) 1946{ 1947 int ret = 0, i = 0; 1948 struct kmem_list3 *l3; 1949 1950 drain_cpu_caches(cachep); 1951 1952 check_irq_on(); 1953 for_each_online_node(i) { 1954 l3 = cachep->nodelists[i]; 1955 if (l3) { 1956 spin_lock_irq(&l3->list_lock); 1957 ret += __node_shrink(cachep, i); 1958 spin_unlock_irq(&l3->list_lock); 1959 } 1960 } 1961 return (ret ? 1 : 0); 1962} 1963 1964/** 1965 * kmem_cache_shrink - Shrink a cache. 1966 * @cachep: The cache to shrink. 1967 * 1968 * Releases as many slabs as possible for a cache. 1969 * To help debugging, a zero exit status indicates all slabs were released. 1970 */ 1971int kmem_cache_shrink(kmem_cache_t *cachep) 1972{ 1973 if (!cachep || in_interrupt()) 1974 BUG(); 1975 1976 return __cache_shrink(cachep); 1977} 1978EXPORT_SYMBOL(kmem_cache_shrink); 1979 1980/** 1981 * kmem_cache_destroy - delete a cache 1982 * @cachep: the cache to destroy 1983 * 1984 * Remove a kmem_cache_t object from the slab cache. 1985 * Returns 0 on success. 1986 * 1987 * It is expected this function will be called by a module when it is 1988 * unloaded. This will remove the cache completely, and avoid a duplicate 1989 * cache being allocated each time a module is loaded and unloaded, if the 1990 * module doesn't have persistent in-kernel storage across loads and unloads. 1991 * 1992 * The cache must be empty before calling this function. 
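 * A typical module exit path (sketch; 'foo_cache' stands for a cache the
 * module created in its init function) frees all remaining objects and
 * then calls:
 *
 *	kmem_cache_destroy(foo_cache);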
1993 * 1994 * The caller must guarantee that noone will allocate memory from the cache 1995 * during the kmem_cache_destroy(). 1996 */ 1997int kmem_cache_destroy(kmem_cache_t * cachep) 1998{ 1999 int i; 2000 struct kmem_list3 *l3; 2001 2002 if (!cachep || in_interrupt()) 2003 BUG(); 2004 2005 /* Don't let CPUs to come and go */ 2006 lock_cpu_hotplug(); 2007 2008 /* Find the cache in the chain of caches. */ 2009 down(&cache_chain_sem); 2010 /* 2011 * the chain is never empty, cache_cache is never destroyed 2012 */ 2013 list_del(&cachep->next); 2014 up(&cache_chain_sem); 2015 2016 if (__cache_shrink(cachep)) { 2017 slab_error(cachep, "Can't free all objects"); 2018 down(&cache_chain_sem); 2019 list_add(&cachep->next,&cache_chain); 2020 up(&cache_chain_sem); 2021 unlock_cpu_hotplug(); 2022 return 1; 2023 } 2024 2025 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 2026 synchronize_rcu(); 2027 2028 for_each_online_cpu(i) 2029 kfree(cachep->array[i]); 2030 2031 /* NUMA: free the list3 structures */ 2032 for_each_online_node(i) { 2033 if ((l3 = cachep->nodelists[i])) { 2034 kfree(l3->shared); 2035 free_alien_cache(l3->alien); 2036 kfree(l3); 2037 } 2038 } 2039 kmem_cache_free(&cache_cache, cachep); 2040 2041 unlock_cpu_hotplug(); 2042 2043 return 0; 2044} 2045EXPORT_SYMBOL(kmem_cache_destroy); 2046 2047/* Get the memory for a slab management obj. */ 2048static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, void *objp, 2049 int colour_off, unsigned int __nocast local_flags) 2050{ 2051 struct slab *slabp; 2052 2053 if (OFF_SLAB(cachep)) { 2054 /* Slab management obj is off-slab. */ 2055 slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags); 2056 if (!slabp) 2057 return NULL; 2058 } else { 2059 slabp = objp+colour_off; 2060 colour_off += cachep->slab_size; 2061 } 2062 slabp->inuse = 0; 2063 slabp->colouroff = colour_off; 2064 slabp->s_mem = objp+colour_off; 2065 2066 return slabp; 2067} 2068 2069static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) 2070{ 2071 return (kmem_bufctl_t *)(slabp+1); 2072} 2073 2074static void cache_init_objs(kmem_cache_t *cachep, 2075 struct slab *slabp, unsigned long ctor_flags) 2076{ 2077 int i; 2078 2079 for (i = 0; i < cachep->num; i++) { 2080 void *objp = slabp->s_mem+cachep->objsize*i; 2081#if DEBUG 2082 /* need to poison the objs? */ 2083 if (cachep->flags & SLAB_POISON) 2084 poison_obj(cachep, objp, POISON_FREE); 2085 if (cachep->flags & SLAB_STORE_USER) 2086 *dbg_userword(cachep, objp) = NULL; 2087 2088 if (cachep->flags & SLAB_RED_ZONE) { 2089 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2090 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2091 } 2092 /* 2093 * Constructors are not allowed to allocate memory from 2094 * the same cache which they are a constructor for. 2095 * Otherwise, deadlock. They must also be threaded. 
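		 * With SLAB_POISON the constructor is not run here at all; it
		 * is deferred to allocation time (see
		 * cache_alloc_debugcheck_after()) so that the poison pattern
		 * written above is left intact.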
2096 */ 2097 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2098 cachep->ctor(objp+obj_dbghead(cachep), cachep, ctor_flags); 2099 2100 if (cachep->flags & SLAB_RED_ZONE) { 2101 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2102 slab_error(cachep, "constructor overwrote the" 2103 " end of an object"); 2104 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 2105 slab_error(cachep, "constructor overwrote the" 2106 " start of an object"); 2107 } 2108 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2109 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); 2110#else 2111 if (cachep->ctor) 2112 cachep->ctor(objp, cachep, ctor_flags); 2113#endif 2114 slab_bufctl(slabp)[i] = i+1; 2115 } 2116 slab_bufctl(slabp)[i-1] = BUFCTL_END; 2117 slabp->free = 0; 2118} 2119 2120static void kmem_flagcheck(kmem_cache_t *cachep, unsigned int flags) 2121{ 2122 if (flags & SLAB_DMA) { 2123 if (!(cachep->gfpflags & GFP_DMA)) 2124 BUG(); 2125 } else { 2126 if (cachep->gfpflags & GFP_DMA) 2127 BUG(); 2128 } 2129} 2130 2131static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp) 2132{ 2133 int i; 2134 struct page *page; 2135 2136 /* Nasty!!!!!! I hope this is OK. */ 2137 i = 1 << cachep->gfporder; 2138 page = virt_to_page(objp); 2139 do { 2140 SET_PAGE_CACHE(page, cachep); 2141 SET_PAGE_SLAB(page, slabp); 2142 page++; 2143 } while (--i); 2144} 2145 2146/* 2147 * Grow (by 1) the number of slabs within a cache. This is called by 2148 * kmem_cache_alloc() when there are no active objs left in a cache. 2149 */ 2150static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid) 2151{ 2152 struct slab *slabp; 2153 void *objp; 2154 size_t offset; 2155 unsigned int local_flags; 2156 unsigned long ctor_flags; 2157 struct kmem_list3 *l3; 2158 2159 /* Be lazy and only check for valid flags here, 2160 * keeping it out of the critical path in kmem_cache_alloc(). 2161 */ 2162 if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW)) 2163 BUG(); 2164 if (flags & SLAB_NO_GROW) 2165 return 0; 2166 2167 ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2168 local_flags = (flags & SLAB_LEVEL_MASK); 2169 if (!(local_flags & __GFP_WAIT)) 2170 /* 2171 * Not allowed to sleep. Need to tell a constructor about 2172 * this - it might need to know... 2173 */ 2174 ctor_flags |= SLAB_CTOR_ATOMIC; 2175 2176 /* About to mess with non-constant members - lock. */ 2177 check_irq_off(); 2178 spin_lock(&cachep->spinlock); 2179 2180 /* Get colour for the slab, and cal the next value. */ 2181 offset = cachep->colour_next; 2182 cachep->colour_next++; 2183 if (cachep->colour_next >= cachep->colour) 2184 cachep->colour_next = 0; 2185 offset *= cachep->colour_off; 2186 2187 spin_unlock(&cachep->spinlock); 2188 2189 check_irq_off(); 2190 if (local_flags & __GFP_WAIT) 2191 local_irq_enable(); 2192 2193 /* 2194 * The test for missing atomic flag is performed here, rather than 2195 * the more obvious place, simply to reduce the critical path length 2196 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they 2197 * will eventually be caught here (where it matters). 2198 */ 2199 kmem_flagcheck(cachep, flags); 2200 2201 /* Get mem for the objs. 2202 * Attempt to allocate a physical page from 'nodeid', 2203 */ 2204 if (!(objp = kmem_getpages(cachep, flags, nodeid))) 2205 goto failed; 2206 2207 /* Get slab management. 
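	 * The slab descriptor is either placed inside the slab's own pages,
	 * just after the colour offset (on-slab), or allocated from
	 * cachep->slabp_cache (off-slab); see alloc_slabmgmt() above.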
*/ 2208 if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags))) 2209 goto opps1; 2210 2211 slabp->nodeid = nodeid; 2212 set_slab_attr(cachep, slabp, objp); 2213 2214 cache_init_objs(cachep, slabp, ctor_flags); 2215 2216 if (local_flags & __GFP_WAIT) 2217 local_irq_disable(); 2218 check_irq_off(); 2219 l3 = cachep->nodelists[nodeid]; 2220 spin_lock(&l3->list_lock); 2221 2222 /* Make slab active. */ 2223 list_add_tail(&slabp->list, &(l3->slabs_free)); 2224 STATS_INC_GROWN(cachep); 2225 l3->free_objects += cachep->num; 2226 spin_unlock(&l3->list_lock); 2227 return 1; 2228opps1: 2229 kmem_freepages(cachep, objp); 2230failed: 2231 if (local_flags & __GFP_WAIT) 2232 local_irq_disable(); 2233 return 0; 2234} 2235 2236#if DEBUG 2237 2238/* 2239 * Perform extra freeing checks: 2240 * - detect bad pointers. 2241 * - POISON/RED_ZONE checking 2242 * - destructor calls, for caches with POISON+dtor 2243 */ 2244static void kfree_debugcheck(const void *objp) 2245{ 2246 struct page *page; 2247 2248 if (!virt_addr_valid(objp)) { 2249 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", 2250 (unsigned long)objp); 2251 BUG(); 2252 } 2253 page = virt_to_page(objp); 2254 if (!PageSlab(page)) { 2255 printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", (unsigned long)objp); 2256 BUG(); 2257 } 2258} 2259 2260static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, 2261 void *caller) 2262{ 2263 struct page *page; 2264 unsigned int objnr; 2265 struct slab *slabp; 2266 2267 objp -= obj_dbghead(cachep); 2268 kfree_debugcheck(objp); 2269 page = virt_to_page(objp); 2270 2271 if (GET_PAGE_CACHE(page) != cachep) { 2272 printk(KERN_ERR "mismatch in kmem_cache_free: expected cache %p, got %p\n", 2273 GET_PAGE_CACHE(page),cachep); 2274 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); 2275 printk(KERN_ERR "%p is %s.\n", GET_PAGE_CACHE(page), GET_PAGE_CACHE(page)->name); 2276 WARN_ON(1); 2277 } 2278 slabp = GET_PAGE_SLAB(page); 2279 2280 if (cachep->flags & SLAB_RED_ZONE) { 2281 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { 2282 slab_error(cachep, "double free, or memory outside" 2283 " object was overwritten"); 2284 printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 2285 objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); 2286 } 2287 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2288 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2289 } 2290 if (cachep->flags & SLAB_STORE_USER) 2291 *dbg_userword(cachep, objp) = caller; 2292 2293 objnr = (objp-slabp->s_mem)/cachep->objsize; 2294 2295 BUG_ON(objnr >= cachep->num); 2296 BUG_ON(objp != slabp->s_mem + objnr*cachep->objsize); 2297 2298 if (cachep->flags & SLAB_DEBUG_INITIAL) { 2299 /* Need to call the slab's constructor so the 2300 * caller can perform a verify of its state (debugging). 2301 * Called without the cache-lock held. 
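		 * SLAB_CTOR_VERIFY tells the constructor that this is a
		 * verify call: check the object's state, don't reinitialise it.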
2302 */ 2303 cachep->ctor(objp+obj_dbghead(cachep), 2304 cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY); 2305 } 2306 if (cachep->flags & SLAB_POISON && cachep->dtor) { 2307 /* we want to cache poison the object, 2308 * call the destruction callback 2309 */ 2310 cachep->dtor(objp+obj_dbghead(cachep), cachep, 0); 2311 } 2312 if (cachep->flags & SLAB_POISON) { 2313#ifdef CONFIG_DEBUG_PAGEALLOC 2314 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { 2315 store_stackinfo(cachep, objp, (unsigned long)caller); 2316 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); 2317 } else { 2318 poison_obj(cachep, objp, POISON_FREE); 2319 } 2320#else 2321 poison_obj(cachep, objp, POISON_FREE); 2322#endif 2323 } 2324 return objp; 2325} 2326 2327static void check_slabp(kmem_cache_t *cachep, struct slab *slabp) 2328{ 2329 kmem_bufctl_t i; 2330 int entries = 0; 2331 2332 /* Check slab's freelist to see if this obj is there. */ 2333 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { 2334 entries++; 2335 if (entries > cachep->num || i >= cachep->num) 2336 goto bad; 2337 } 2338 if (entries != cachep->num - slabp->inuse) { 2339bad: 2340 printk(KERN_ERR "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", 2341 cachep->name, cachep->num, slabp, slabp->inuse); 2342 for (i=0;i<sizeof(slabp)+cachep->num*sizeof(kmem_bufctl_t);i++) { 2343 if ((i%16)==0) 2344 printk("\n%03x:", i); 2345 printk(" %02x", ((unsigned char*)slabp)[i]); 2346 } 2347 printk("\n"); 2348 BUG(); 2349 } 2350} 2351#else 2352#define kfree_debugcheck(x) do { } while(0) 2353#define cache_free_debugcheck(x,objp,z) (objp) 2354#define check_slabp(x,y) do { } while(0) 2355#endif 2356 2357static void *cache_alloc_refill(kmem_cache_t *cachep, unsigned int __nocast flags) 2358{ 2359 int batchcount; 2360 struct kmem_list3 *l3; 2361 struct array_cache *ac; 2362 2363 check_irq_off(); 2364 ac = ac_data(cachep); 2365retry: 2366 batchcount = ac->batchcount; 2367 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2368 /* if there was little recent activity on this 2369 * cache, then perform only a partial refill. 2370 * Otherwise we could generate refill bouncing. 2371 */ 2372 batchcount = BATCHREFILL_LIMIT; 2373 } 2374 l3 = cachep->nodelists[numa_node_id()]; 2375 2376 BUG_ON(ac->avail > 0 || !l3); 2377 spin_lock(&l3->list_lock); 2378 2379 if (l3->shared) { 2380 struct array_cache *shared_array = l3->shared; 2381 if (shared_array->avail) { 2382 if (batchcount > shared_array->avail) 2383 batchcount = shared_array->avail; 2384 shared_array->avail -= batchcount; 2385 ac->avail = batchcount; 2386 memcpy(ac->entry, 2387 &(shared_array->entry[shared_array->avail]), 2388 sizeof(void*)*batchcount); 2389 shared_array->touched = 1; 2390 goto alloc_done; 2391 } 2392 } 2393 while (batchcount > 0) { 2394 struct list_head *entry; 2395 struct slab *slabp; 2396 /* Get slab alloc is to come from. 
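		 * Partially used slabs are preferred; a completely free slab
		 * is taken only when no partial slab is left, and cache_grow()
		 * is the last resort once both lists are empty.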
*/ 2397 entry = l3->slabs_partial.next; 2398 if (entry == &l3->slabs_partial) { 2399 l3->free_touched = 1; 2400 entry = l3->slabs_free.next; 2401 if (entry == &l3->slabs_free) 2402 goto must_grow; 2403 } 2404 2405 slabp = list_entry(entry, struct slab, list); 2406 check_slabp(cachep, slabp); 2407 check_spinlock_acquired(cachep); 2408 while (slabp->inuse < cachep->num && batchcount--) { 2409 kmem_bufctl_t next; 2410 STATS_INC_ALLOCED(cachep); 2411 STATS_INC_ACTIVE(cachep); 2412 STATS_SET_HIGH(cachep); 2413 2414 /* get obj pointer */ 2415 ac->entry[ac->avail++] = slabp->s_mem + 2416 slabp->free*cachep->objsize; 2417 2418 slabp->inuse++; 2419 next = slab_bufctl(slabp)[slabp->free]; 2420#if DEBUG 2421 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2422#endif 2423 slabp->free = next; 2424 } 2425 check_slabp(cachep, slabp); 2426 2427 /* move slabp to correct slabp list: */ 2428 list_del(&slabp->list); 2429 if (slabp->free == BUFCTL_END) 2430 list_add(&slabp->list, &l3->slabs_full); 2431 else 2432 list_add(&slabp->list, &l3->slabs_partial); 2433 } 2434 2435must_grow: 2436 l3->free_objects -= ac->avail; 2437alloc_done: 2438 spin_unlock(&l3->list_lock); 2439 2440 if (unlikely(!ac->avail)) { 2441 int x; 2442 x = cache_grow(cachep, flags, numa_node_id()); 2443 2444 // cache_grow can reenable interrupts, then ac could change. 2445 ac = ac_data(cachep); 2446 if (!x && ac->avail == 0) // no objects in sight? abort 2447 return NULL; 2448 2449 if (!ac->avail) // objects refilled by interrupt? 2450 goto retry; 2451 } 2452 ac->touched = 1; 2453 return ac->entry[--ac->avail]; 2454} 2455 2456static inline void 2457cache_alloc_debugcheck_before(kmem_cache_t *cachep, unsigned int __nocast flags) 2458{ 2459 might_sleep_if(flags & __GFP_WAIT); 2460#if DEBUG 2461 kmem_flagcheck(cachep, flags); 2462#endif 2463} 2464 2465#if DEBUG 2466static void * 2467cache_alloc_debugcheck_after(kmem_cache_t *cachep, 2468 unsigned int __nocast flags, void *objp, void *caller) 2469{ 2470 if (!objp) 2471 return objp; 2472 if (cachep->flags & SLAB_POISON) { 2473#ifdef CONFIG_DEBUG_PAGEALLOC 2474 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 2475 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1); 2476 else 2477 check_poison_obj(cachep, objp); 2478#else 2479 check_poison_obj(cachep, objp); 2480#endif 2481 poison_obj(cachep, objp, POISON_INUSE); 2482 } 2483 if (cachep->flags & SLAB_STORE_USER) 2484 *dbg_userword(cachep, objp) = caller; 2485 2486 if (cachep->flags & SLAB_RED_ZONE) { 2487 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 2488 slab_error(cachep, "double free, or memory outside" 2489 " object was overwritten"); 2490 printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 2491 objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); 2492 } 2493 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2494 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2495 } 2496 objp += obj_dbghead(cachep); 2497 if (cachep->ctor && cachep->flags & SLAB_POISON) { 2498 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2499 2500 if (!(flags & __GFP_WAIT)) 2501 ctor_flags |= SLAB_CTOR_ATOMIC; 2502 2503 cachep->ctor(objp, cachep, ctor_flags); 2504 } 2505 return objp; 2506} 2507#else 2508#define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 2509#endif 2510 2511 2512static inline void *__cache_alloc(kmem_cache_t *cachep, unsigned int __nocast flags) 2513{ 2514 unsigned long save_flags; 2515 void* objp; 2516 struct array_cache *ac; 2517 2518 
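	/*
	 * Fast path: with local interrupts disabled, hand out the next
	 * object from this cpu's array cache.  Only when that array is
	 * empty does cache_alloc_refill() take the per-node list_lock.
	 */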
cache_alloc_debugcheck_before(cachep, flags); 2519 2520 local_irq_save(save_flags); 2521 ac = ac_data(cachep); 2522 if (likely(ac->avail)) { 2523 STATS_INC_ALLOCHIT(cachep); 2524 ac->touched = 1; 2525 objp = ac->entry[--ac->avail]; 2526 } else { 2527 STATS_INC_ALLOCMISS(cachep); 2528 objp = cache_alloc_refill(cachep, flags); 2529 } 2530 local_irq_restore(save_flags); 2531 objp = cache_alloc_debugcheck_after(cachep, flags, objp, 2532 __builtin_return_address(0)); 2533 prefetchw(objp); 2534 return objp; 2535} 2536 2537#ifdef CONFIG_NUMA 2538/* 2539 * A interface to enable slab creation on nodeid 2540 */ 2541static void *__cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid) 2542{ 2543 struct list_head *entry; 2544 struct slab *slabp; 2545 struct kmem_list3 *l3; 2546 void *obj; 2547 kmem_bufctl_t next; 2548 int x; 2549 2550 l3 = cachep->nodelists[nodeid]; 2551 BUG_ON(!l3); 2552 2553retry: 2554 spin_lock(&l3->list_lock); 2555 entry = l3->slabs_partial.next; 2556 if (entry == &l3->slabs_partial) { 2557 l3->free_touched = 1; 2558 entry = l3->slabs_free.next; 2559 if (entry == &l3->slabs_free) 2560 goto must_grow; 2561 } 2562 2563 slabp = list_entry(entry, struct slab, list); 2564 check_spinlock_acquired_node(cachep, nodeid); 2565 check_slabp(cachep, slabp); 2566 2567 STATS_INC_NODEALLOCS(cachep); 2568 STATS_INC_ACTIVE(cachep); 2569 STATS_SET_HIGH(cachep); 2570 2571 BUG_ON(slabp->inuse == cachep->num); 2572 2573 /* get obj pointer */ 2574 obj = slabp->s_mem + slabp->free*cachep->objsize; 2575 slabp->inuse++; 2576 next = slab_bufctl(slabp)[slabp->free]; 2577#if DEBUG 2578 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2579#endif 2580 slabp->free = next; 2581 check_slabp(cachep, slabp); 2582 l3->free_objects--; 2583 /* move slabp to correct slabp list: */ 2584 list_del(&slabp->list); 2585 2586 if (slabp->free == BUFCTL_END) { 2587 list_add(&slabp->list, &l3->slabs_full); 2588 } else { 2589 list_add(&slabp->list, &l3->slabs_partial); 2590 } 2591 2592 spin_unlock(&l3->list_lock); 2593 goto done; 2594 2595must_grow: 2596 spin_unlock(&l3->list_lock); 2597 x = cache_grow(cachep, flags, nodeid); 2598 2599 if (!x) 2600 return NULL; 2601 2602 goto retry; 2603done: 2604 return obj; 2605} 2606#endif 2607 2608/* 2609 * Caller needs to acquire correct kmem_list's list_lock 2610 */ 2611static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects) 2612{ 2613 int i; 2614 struct kmem_list3 *l3; 2615 2616 for (i = 0; i < nr_objects; i++) { 2617 void *objp = objpp[i]; 2618 struct slab *slabp; 2619 unsigned int objnr; 2620 int nodeid = 0; 2621 2622 slabp = GET_PAGE_SLAB(virt_to_page(objp)); 2623 nodeid = slabp->nodeid; 2624 l3 = cachep->nodelists[nodeid]; 2625 list_del(&slabp->list); 2626 objnr = (objp - slabp->s_mem) / cachep->objsize; 2627 check_spinlock_acquired_node(cachep, nodeid); 2628 check_slabp(cachep, slabp); 2629 2630 2631#if DEBUG 2632 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { 2633 printk(KERN_ERR "slab: double free detected in cache " 2634 "'%s', objp %p\n", cachep->name, objp); 2635 BUG(); 2636 } 2637#endif 2638 slab_bufctl(slabp)[objnr] = slabp->free; 2639 slabp->free = objnr; 2640 STATS_DEC_ACTIVE(cachep); 2641 slabp->inuse--; 2642 l3->free_objects++; 2643 check_slabp(cachep, slabp); 2644 2645 /* fixup slab chains */ 2646 if (slabp->inuse == 0) { 2647 if (l3->free_objects > l3->free_limit) { 2648 l3->free_objects -= cachep->num; 2649 slab_destroy(cachep, slabp); 2650 } else { 2651 list_add(&slabp->list, &l3->slabs_free); 2652 } 2653 } else { 2654 /* Unconditionally move a 
slab to the end of the 2655 * partial list on free - maximum time for the 2656 * other objects to be freed, too. 2657 */ 2658 list_add_tail(&slabp->list, &l3->slabs_partial); 2659 } 2660 } 2661} 2662 2663static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac) 2664{ 2665 int batchcount; 2666 struct kmem_list3 *l3; 2667 2668 batchcount = ac->batchcount; 2669#if DEBUG 2670 BUG_ON(!batchcount || batchcount > ac->avail); 2671#endif 2672 check_irq_off(); 2673 l3 = cachep->nodelists[numa_node_id()]; 2674 spin_lock(&l3->list_lock); 2675 if (l3->shared) { 2676 struct array_cache *shared_array = l3->shared; 2677 int max = shared_array->limit-shared_array->avail; 2678 if (max) { 2679 if (batchcount > max) 2680 batchcount = max; 2681 memcpy(&(shared_array->entry[shared_array->avail]), 2682 ac->entry, 2683 sizeof(void*)*batchcount); 2684 shared_array->avail += batchcount; 2685 goto free_done; 2686 } 2687 } 2688 2689 free_block(cachep, ac->entry, batchcount); 2690free_done: 2691#if STATS 2692 { 2693 int i = 0; 2694 struct list_head *p; 2695 2696 p = l3->slabs_free.next; 2697 while (p != &(l3->slabs_free)) { 2698 struct slab *slabp; 2699 2700 slabp = list_entry(p, struct slab, list); 2701 BUG_ON(slabp->inuse); 2702 2703 i++; 2704 p = p->next; 2705 } 2706 STATS_SET_FREEABLE(cachep, i); 2707 } 2708#endif 2709 spin_unlock(&l3->list_lock); 2710 ac->avail -= batchcount; 2711 memmove(ac->entry, &(ac->entry[batchcount]), 2712 sizeof(void*)*ac->avail); 2713} 2714 2715 2716/* 2717 * __cache_free 2718 * Release an obj back to its cache. If the obj has a constructed 2719 * state, it must be in this state _before_ it is released. 2720 * 2721 * Called with disabled ints. 2722 */ 2723static inline void __cache_free(kmem_cache_t *cachep, void *objp) 2724{ 2725 struct array_cache *ac = ac_data(cachep); 2726 2727 check_irq_off(); 2728 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 2729 2730 /* Make sure we are not freeing a object from another 2731 * node to the array cache on this cpu. 2732 */ 2733#ifdef CONFIG_NUMA 2734 { 2735 struct slab *slabp; 2736 slabp = GET_PAGE_SLAB(virt_to_page(objp)); 2737 if (unlikely(slabp->nodeid != numa_node_id())) { 2738 struct array_cache *alien = NULL; 2739 int nodeid = slabp->nodeid; 2740 struct kmem_list3 *l3 = cachep->nodelists[numa_node_id()]; 2741 2742 STATS_INC_NODEFREES(cachep); 2743 if (l3->alien && l3->alien[nodeid]) { 2744 alien = l3->alien[nodeid]; 2745 spin_lock(&alien->lock); 2746 if (unlikely(alien->avail == alien->limit)) 2747 __drain_alien_cache(cachep, 2748 alien, nodeid); 2749 alien->entry[alien->avail++] = objp; 2750 spin_unlock(&alien->lock); 2751 } else { 2752 spin_lock(&(cachep->nodelists[nodeid])-> 2753 list_lock); 2754 free_block(cachep, &objp, 1); 2755 spin_unlock(&(cachep->nodelists[nodeid])-> 2756 list_lock); 2757 } 2758 return; 2759 } 2760 } 2761#endif 2762 if (likely(ac->avail < ac->limit)) { 2763 STATS_INC_FREEHIT(cachep); 2764 ac->entry[ac->avail++] = objp; 2765 return; 2766 } else { 2767 STATS_INC_FREEMISS(cachep); 2768 cache_flusharray(cachep, ac); 2769 ac->entry[ac->avail++] = objp; 2770 } 2771} 2772 2773/** 2774 * kmem_cache_alloc - Allocate an object 2775 * @cachep: The cache to allocate from. 2776 * @flags: See kmalloc(). 2777 * 2778 * Allocate an object from this cache. The flags are only relevant 2779 * if the cache has no available objects. 
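 *
 * Minimal usage sketch ('foo_cache' and struct foo are hypothetical,
 * set up earlier with kmem_cache_create()):
 *
 *	struct foo *f = kmem_cache_alloc(foo_cache, GFP_KERNEL);
 *	if (!f)
 *		return -ENOMEM;
 *	...
 *	kmem_cache_free(foo_cache, f);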
2780 */ 2781void *kmem_cache_alloc(kmem_cache_t *cachep, unsigned int __nocast flags) 2782{ 2783 return __cache_alloc(cachep, flags); 2784} 2785EXPORT_SYMBOL(kmem_cache_alloc); 2786 2787/** 2788 * kmem_ptr_validate - check if an untrusted pointer might 2789 * be a slab entry. 2790 * @cachep: the cache we're checking against 2791 * @ptr: pointer to validate 2792 * 2793 * This verifies that the untrusted pointer looks sane: 2794 * it is _not_ a guarantee that the pointer is actually 2795 * part of the slab cache in question, but it at least 2796 * validates that the pointer can be dereferenced and 2797 * looks half-way sane. 2798 * 2799 * Currently only used for dentry validation. 2800 */ 2801int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) 2802{ 2803 unsigned long addr = (unsigned long) ptr; 2804 unsigned long min_addr = PAGE_OFFSET; 2805 unsigned long align_mask = BYTES_PER_WORD-1; 2806 unsigned long size = cachep->objsize; 2807 struct page *page; 2808 2809 if (unlikely(addr < min_addr)) 2810 goto out; 2811 if (unlikely(addr > (unsigned long)high_memory - size)) 2812 goto out; 2813 if (unlikely(addr & align_mask)) 2814 goto out; 2815 if (unlikely(!kern_addr_valid(addr))) 2816 goto out; 2817 if (unlikely(!kern_addr_valid(addr + size - 1))) 2818 goto out; 2819 page = virt_to_page(ptr); 2820 if (unlikely(!PageSlab(page))) 2821 goto out; 2822 if (unlikely(GET_PAGE_CACHE(page) != cachep)) 2823 goto out; 2824 return 1; 2825out: 2826 return 0; 2827} 2828 2829#ifdef CONFIG_NUMA 2830/** 2831 * kmem_cache_alloc_node - Allocate an object on the specified node 2832 * @cachep: The cache to allocate from. 2833 * @flags: See kmalloc(). 2834 * @nodeid: node number of the target node. 2835 * 2836 * Identical to kmem_cache_alloc, except that this function is slow 2837 * and can sleep. And it will allocate memory on the given node, which 2838 * can improve the performance for cpu bound structures. 2839 * New and improved: it will now make sure that the object gets 2840 * put on the correct node list so that there is no false sharing. 2841 */ 2842void *kmem_cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid) 2843{ 2844 unsigned long save_flags; 2845 void *ptr; 2846 2847 if (nodeid == numa_node_id() || nodeid == -1) 2848 return __cache_alloc(cachep, flags); 2849 2850 if (unlikely(!cachep->nodelists[nodeid])) { 2851 /* Fall back to __cache_alloc if we run into trouble */ 2852 printk(KERN_WARNING "slab: not allocating in inactive node %d for cache %s\n", nodeid, cachep->name); 2853 return __cache_alloc(cachep,flags); 2854 } 2855 2856 cache_alloc_debugcheck_before(cachep, flags); 2857 local_irq_save(save_flags); 2858 ptr = __cache_alloc_node(cachep, flags, nodeid); 2859 local_irq_restore(save_flags); 2860 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0)); 2861 2862 return ptr; 2863} 2864EXPORT_SYMBOL(kmem_cache_alloc_node); 2865 2866void *kmalloc_node(size_t size, unsigned int __nocast flags, int node) 2867{ 2868 kmem_cache_t *cachep; 2869 2870 cachep = kmem_find_general_cachep(size, flags); 2871 if (unlikely(cachep == NULL)) 2872 return NULL; 2873 return kmem_cache_alloc_node(cachep, flags, node); 2874} 2875EXPORT_SYMBOL(kmalloc_node); 2876#endif 2877 2878/** 2879 * kmalloc - allocate memory 2880 * @size: how many bytes of memory are required. 2881 * @flags: the type of memory to allocate. 2882 * 2883 * kmalloc is the normal method of allocating memory 2884 * in the kernel. 
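 * Memory obtained with kmalloc() must be released with kfree(); see the
 * short sketch in the kfree() comment below.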
2885 * 2886 * The @flags argument may be one of: 2887 * 2888 * %GFP_USER - Allocate memory on behalf of user. May sleep. 2889 * 2890 * %GFP_KERNEL - Allocate normal kernel ram. May sleep. 2891 * 2892 * %GFP_ATOMIC - Allocation will not sleep. Use inside interrupt handlers. 2893 * 2894 * Additionally, the %GFP_DMA flag may be set to indicate the memory 2895 * must be suitable for DMA. This can mean different things on different 2896 * platforms. For example, on i386, it means that the memory must come 2897 * from the first 16MB. 2898 */ 2899void *__kmalloc(size_t size, unsigned int __nocast flags) 2900{ 2901 kmem_cache_t *cachep; 2902 2903 /* If you want to save a few bytes .text space: replace 2904 * __ with kmem_. 2905 * Then kmalloc uses the uninlined functions instead of the inline 2906 * functions. 2907 */ 2908 cachep = __find_general_cachep(size, flags); 2909 if (unlikely(cachep == NULL)) 2910 return NULL; 2911 return __cache_alloc(cachep, flags); 2912} 2913EXPORT_SYMBOL(__kmalloc); 2914 2915#ifdef CONFIG_SMP 2916/** 2917 * __alloc_percpu - allocate one copy of the object for every present 2918 * cpu in the system, zeroing them. 2919 * Objects should be dereferenced using the per_cpu_ptr macro only. 2920 * 2921 * @size: how many bytes of memory are required. 2922 * @align: the alignment, which can't be greater than SMP_CACHE_BYTES. 2923 */ 2924void *__alloc_percpu(size_t size, size_t align) 2925{ 2926 int i; 2927 struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); 2928 2929 if (!pdata) 2930 return NULL; 2931 2932 /* 2933 * Cannot use for_each_online_cpu since a cpu may come online 2934 * and we have no way of figuring out how to fix the array 2935 * that we have allocated then.... 2936 */ 2937 for_each_cpu(i) { 2938 int node = cpu_to_node(i); 2939 2940 if (node_online(node)) 2941 pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node); 2942 else 2943 pdata->ptrs[i] = kmalloc(size, GFP_KERNEL); 2944 2945 if (!pdata->ptrs[i]) 2946 goto unwind_oom; 2947 memset(pdata->ptrs[i], 0, size); 2948 } 2949 2950 /* Catch derefs w/o wrappers */ 2951 return (void *) (~(unsigned long) pdata); 2952 2953unwind_oom: 2954 while (--i >= 0) { 2955 if (!cpu_possible(i)) 2956 continue; 2957 kfree(pdata->ptrs[i]); 2958 } 2959 kfree(pdata); 2960 return NULL; 2961} 2962EXPORT_SYMBOL(__alloc_percpu); 2963#endif 2964 2965/** 2966 * kmem_cache_free - Deallocate an object 2967 * @cachep: The cache the allocation was from. 2968 * @objp: The previously allocated object. 2969 * 2970 * Free an object which was previously allocated from this 2971 * cache. 2972 */ 2973void kmem_cache_free(kmem_cache_t *cachep, void *objp) 2974{ 2975 unsigned long flags; 2976 2977 local_irq_save(flags); 2978 __cache_free(cachep, objp); 2979 local_irq_restore(flags); 2980} 2981EXPORT_SYMBOL(kmem_cache_free); 2982 2983/** 2984 * kzalloc - allocate memory. The memory is set to zero. 2985 * @size: how many bytes of memory are required. 2986 * @flags: the type of memory to allocate. 2987 */ 2988void *kzalloc(size_t size, unsigned int __nocast flags) 2989{ 2990 void *ret = kmalloc(size, flags); 2991 if (ret) 2992 memset(ret, 0, size); 2993 return ret; 2994} 2995EXPORT_SYMBOL(kzalloc); 2996 2997/** 2998 * kfree - free previously allocated memory 2999 * @objp: pointer returned by kmalloc. 3000 * 3001 * Don't free memory not originally allocated by kmalloc() 3002 * or you will run into trouble. 
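 *
 * Sketch of the usual kmalloc()/kfree() pairing (the size 64 is just an
 * arbitrary example):
 *
 *	char *buf = kmalloc(64, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOMEM;
 *	...
 *	kfree(buf);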
3003 */ 3004void kfree(const void *objp) 3005{ 3006 kmem_cache_t *c; 3007 unsigned long flags; 3008 3009 if (unlikely(!objp)) 3010 return; 3011 local_irq_save(flags); 3012 kfree_debugcheck(objp); 3013 c = GET_PAGE_CACHE(virt_to_page(objp)); 3014 __cache_free(c, (void*)objp); 3015 local_irq_restore(flags); 3016} 3017EXPORT_SYMBOL(kfree); 3018 3019#ifdef CONFIG_SMP 3020/** 3021 * free_percpu - free previously allocated percpu memory 3022 * @objp: pointer returned by alloc_percpu. 3023 * 3024 * Don't free memory not originally allocated by alloc_percpu() 3025 * The complemented objp is to check for that. 3026 */ 3027void 3028free_percpu(const void *objp) 3029{ 3030 int i; 3031 struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); 3032 3033 /* 3034 * We allocate for all cpus so we cannot use for online cpu here. 3035 */ 3036 for_each_cpu(i) 3037 kfree(p->ptrs[i]); 3038 kfree(p); 3039} 3040EXPORT_SYMBOL(free_percpu); 3041#endif 3042 3043unsigned int kmem_cache_size(kmem_cache_t *cachep) 3044{ 3045 return obj_reallen(cachep); 3046} 3047EXPORT_SYMBOL(kmem_cache_size); 3048 3049const char *kmem_cache_name(kmem_cache_t *cachep) 3050{ 3051 return cachep->name; 3052} 3053EXPORT_SYMBOL_GPL(kmem_cache_name); 3054 3055/* 3056 * This initializes kmem_list3 for all nodes. 3057 */ 3058static int alloc_kmemlist(kmem_cache_t *cachep) 3059{ 3060 int node; 3061 struct kmem_list3 *l3; 3062 int err = 0; 3063 3064 for_each_online_node(node) { 3065 struct array_cache *nc = NULL, *new; 3066 struct array_cache **new_alien = NULL; 3067#ifdef CONFIG_NUMA 3068 if (!(new_alien = alloc_alien_cache(node, cachep->limit))) 3069 goto fail; 3070#endif 3071 if (!(new = alloc_arraycache(node, (cachep->shared* 3072 cachep->batchcount), 0xbaadf00d))) 3073 goto fail; 3074 if ((l3 = cachep->nodelists[node])) { 3075 3076 spin_lock_irq(&l3->list_lock); 3077 3078 if ((nc = cachep->nodelists[node]->shared)) 3079 free_block(cachep, nc->entry, 3080 nc->avail); 3081 3082 l3->shared = new; 3083 if (!cachep->nodelists[node]->alien) { 3084 l3->alien = new_alien; 3085 new_alien = NULL; 3086 } 3087 l3->free_limit = (1 + nr_cpus_node(node))* 3088 cachep->batchcount + cachep->num; 3089 spin_unlock_irq(&l3->list_lock); 3090 kfree(nc); 3091 free_alien_cache(new_alien); 3092 continue; 3093 } 3094 if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), 3095 GFP_KERNEL, node))) 3096 goto fail; 3097 3098 kmem_list3_init(l3); 3099 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 3100 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 3101 l3->shared = new; 3102 l3->alien = new_alien; 3103 l3->free_limit = (1 + nr_cpus_node(node))* 3104 cachep->batchcount + cachep->num; 3105 cachep->nodelists[node] = l3; 3106 } 3107 return err; 3108fail: 3109 err = -ENOMEM; 3110 return err; 3111} 3112 3113struct ccupdate_struct { 3114 kmem_cache_t *cachep; 3115 struct array_cache *new[NR_CPUS]; 3116}; 3117 3118static void do_ccupdate_local(void *info) 3119{ 3120 struct ccupdate_struct *new = (struct ccupdate_struct *)info; 3121 struct array_cache *old; 3122 3123 check_irq_off(); 3124 old = ac_data(new->cachep); 3125 3126 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; 3127 new->new[smp_processor_id()] = old; 3128} 3129 3130 3131static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, 3132 int shared) 3133{ 3134 struct ccupdate_struct new; 3135 int i, err; 3136 3137 memset(&new.new,0,sizeof(new.new)); 3138 for_each_online_cpu(i) { 3139 new.new[i] = alloc_arraycache(cpu_to_node(i), limit, batchcount); 3140 if (!new.new[i]) { 
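			/* Allocation failed: free the array caches already
			 * allocated for earlier cpus and give up. */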
3141 for (i--; i >= 0; i--) kfree(new.new[i]); 3142 return -ENOMEM; 3143 } 3144 } 3145 new.cachep = cachep; 3146 3147 smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); 3148 3149 check_irq_on(); 3150 spin_lock_irq(&cachep->spinlock); 3151 cachep->batchcount = batchcount; 3152 cachep->limit = limit; 3153 cachep->shared = shared; 3154 spin_unlock_irq(&cachep->spinlock); 3155 3156 for_each_online_cpu(i) { 3157 struct array_cache *ccold = new.new[i]; 3158 if (!ccold) 3159 continue; 3160 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3161 free_block(cachep, ccold->entry, ccold->avail); 3162 spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3163 kfree(ccold); 3164 } 3165 3166 err = alloc_kmemlist(cachep); 3167 if (err) { 3168 printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n", 3169 cachep->name, -err); 3170 BUG(); 3171 } 3172 return 0; 3173} 3174 3175 3176static void enable_cpucache(kmem_cache_t *cachep) 3177{ 3178 int err; 3179 int limit, shared; 3180 3181 /* The head array serves three purposes: 3182 * - create a LIFO ordering, i.e. return objects that are cache-warm 3183 * - reduce the number of spinlock operations. 3184 * - reduce the number of linked list operations on the slab and 3185 * bufctl chains: array operations are cheaper. 3186 * The numbers are guessed, we should auto-tune as described by 3187 * Bonwick. 3188 */ 3189 if (cachep->objsize > 131072) 3190 limit = 1; 3191 else if (cachep->objsize > PAGE_SIZE) 3192 limit = 8; 3193 else if (cachep->objsize > 1024) 3194 limit = 24; 3195 else if (cachep->objsize > 256) 3196 limit = 54; 3197 else 3198 limit = 120; 3199 3200 /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound 3201 * allocation behaviour: Most allocs on one cpu, most free operations 3202 * on another cpu. For these cases, an efficient object passing between 3203 * cpus is necessary. This is provided by a shared array. The array 3204 * replaces Bonwick's magazine layer. 3205 * On uniprocessor, it's functionally equivalent (but less efficient) 3206 * to a larger limit. Thus disabled by default. 3207 */ 3208 shared = 0; 3209#ifdef CONFIG_SMP 3210 if (cachep->objsize <= PAGE_SIZE) 3211 shared = 8; 3212#endif 3213 3214#if DEBUG 3215 /* With debugging enabled, large batchcount lead to excessively 3216 * long periods with disabled local interrupts. Limit the 3217 * batchcount 3218 */ 3219 if (limit > 32) 3220 limit = 32; 3221#endif 3222 err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared); 3223 if (err) 3224 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 3225 cachep->name, -err); 3226} 3227 3228static void drain_array_locked(kmem_cache_t *cachep, 3229 struct array_cache *ac, int force, int node) 3230{ 3231 int tofree; 3232 3233 check_spinlock_acquired_node(cachep, node); 3234 if (ac->touched && !force) { 3235 ac->touched = 0; 3236 } else if (ac->avail) { 3237 tofree = force ? ac->avail : (ac->limit+4)/5; 3238 if (tofree > ac->avail) { 3239 tofree = (ac->avail+1)/2; 3240 } 3241 free_block(cachep, ac->entry, tofree); 3242 ac->avail -= tofree; 3243 memmove(ac->entry, &(ac->entry[tofree]), 3244 sizeof(void*)*ac->avail); 3245 } 3246} 3247 3248/** 3249 * cache_reap - Reclaim memory from caches. 3250 * 3251 * Called from workqueue/eventd every few seconds. 3252 * Purpose: 3253 * - clear the per-cpu caches for this CPU. 3254 * - return freeable pages to the main free memory pool. 3255 * 3256 * If we cannot acquire the cache chain semaphore then just give up - we'll 3257 * try again on the next iteration. 
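 *
 * Only a bounded number of completely free slabs per cache (derived from
 * l3->free_limit) is released in one pass, and the work re-arms itself
 * with schedule_delayed_work() at the end, so a single run stays cheap.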
3258 */ 3259static void cache_reap(void *unused) 3260{ 3261 struct list_head *walk; 3262 struct kmem_list3 *l3; 3263 3264 if (down_trylock(&cache_chain_sem)) { 3265 /* Give up. Setup the next iteration. */ 3266 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id()); 3267 return; 3268 } 3269 3270 list_for_each(walk, &cache_chain) { 3271 kmem_cache_t *searchp; 3272 struct list_head* p; 3273 int tofree; 3274 struct slab *slabp; 3275 3276 searchp = list_entry(walk, kmem_cache_t, next); 3277 3278 if (searchp->flags & SLAB_NO_REAP) 3279 goto next; 3280 3281 check_irq_on(); 3282 3283 l3 = searchp->nodelists[numa_node_id()]; 3284 if (l3->alien) 3285 drain_alien_cache(searchp, l3); 3286 spin_lock_irq(&l3->list_lock); 3287 3288 drain_array_locked(searchp, ac_data(searchp), 0, 3289 numa_node_id()); 3290 3291 if (time_after(l3->next_reap, jiffies)) 3292 goto next_unlock; 3293 3294 l3->next_reap = jiffies + REAPTIMEOUT_LIST3; 3295 3296 if (l3->shared) 3297 drain_array_locked(searchp, l3->shared, 0, 3298 numa_node_id()); 3299 3300 if (l3->free_touched) { 3301 l3->free_touched = 0; 3302 goto next_unlock; 3303 } 3304 3305 tofree = (l3->free_limit+5*searchp->num-1)/(5*searchp->num); 3306 do { 3307 p = l3->slabs_free.next; 3308 if (p == &(l3->slabs_free)) 3309 break; 3310 3311 slabp = list_entry(p, struct slab, list); 3312 BUG_ON(slabp->inuse); 3313 list_del(&slabp->list); 3314 STATS_INC_REAPED(searchp); 3315 3316 /* Safe to drop the lock. The slab is no longer 3317 * linked to the cache. 3318 * searchp cannot disappear, we hold 3319 * cache_chain_lock 3320 */ 3321 l3->free_objects -= searchp->num; 3322 spin_unlock_irq(&l3->list_lock); 3323 slab_destroy(searchp, slabp); 3324 spin_lock_irq(&l3->list_lock); 3325 } while(--tofree > 0); 3326next_unlock: 3327 spin_unlock_irq(&l3->list_lock); 3328next: 3329 cond_resched(); 3330 } 3331 check_irq_on(); 3332 up(&cache_chain_sem); 3333 drain_remote_pages(); 3334 /* Setup the next iteration */ 3335 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id()); 3336} 3337 3338#ifdef CONFIG_PROC_FS 3339 3340static void *s_start(struct seq_file *m, loff_t *pos) 3341{ 3342 loff_t n = *pos; 3343 struct list_head *p; 3344 3345 down(&cache_chain_sem); 3346 if (!n) { 3347 /* 3348 * Output format version, so at least we can change it 3349 * without _too_ many complaints. 3350 */ 3351#if STATS 3352 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); 3353#else 3354 seq_puts(m, "slabinfo - version: 2.1\n"); 3355#endif 3356 seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>"); 3357 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); 3358 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 3359#if STATS 3360 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>" 3361 " <error> <maxfreeable> <nodeallocs> <remotefrees>"); 3362 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); 3363#endif 3364 seq_putc(m, '\n'); 3365 } 3366 p = cache_chain.next; 3367 while (n--) { 3368 p = p->next; 3369 if (p == &cache_chain) 3370 return NULL; 3371 } 3372 return list_entry(p, kmem_cache_t, next); 3373} 3374 3375static void *s_next(struct seq_file *m, void *p, loff_t *pos) 3376{ 3377 kmem_cache_t *cachep = p; 3378 ++*pos; 3379 return cachep->next.next == &cache_chain ? 
NULL 3380 : list_entry(cachep->next.next, kmem_cache_t, next); 3381} 3382 3383static void s_stop(struct seq_file *m, void *p) 3384{ 3385 up(&cache_chain_sem); 3386} 3387 3388static int s_show(struct seq_file *m, void *p) 3389{ 3390 kmem_cache_t *cachep = p; 3391 struct list_head *q; 3392 struct slab *slabp; 3393 unsigned long active_objs; 3394 unsigned long num_objs; 3395 unsigned long active_slabs = 0; 3396 unsigned long num_slabs, free_objects = 0, shared_avail = 0; 3397 const char *name; 3398 char *error = NULL; 3399 int node; 3400 struct kmem_list3 *l3; 3401 3402 check_irq_on(); 3403 spin_lock_irq(&cachep->spinlock); 3404 active_objs = 0; 3405 num_slabs = 0; 3406 for_each_online_node(node) { 3407 l3 = cachep->nodelists[node]; 3408 if (!l3) 3409 continue; 3410 3411 spin_lock(&l3->list_lock); 3412 3413 list_for_each(q,&l3->slabs_full) { 3414 slabp = list_entry(q, struct slab, list); 3415 if (slabp->inuse != cachep->num && !error) 3416 error = "slabs_full accounting error"; 3417 active_objs += cachep->num; 3418 active_slabs++; 3419 } 3420 list_for_each(q,&l3->slabs_partial) { 3421 slabp = list_entry(q, struct slab, list); 3422 if (slabp->inuse == cachep->num && !error) 3423 error = "slabs_partial inuse accounting error"; 3424 if (!slabp->inuse && !error) 3425 error = "slabs_partial/inuse accounting error"; 3426 active_objs += slabp->inuse; 3427 active_slabs++; 3428 } 3429 list_for_each(q,&l3->slabs_free) { 3430 slabp = list_entry(q, struct slab, list); 3431 if (slabp->inuse && !error) 3432 error = "slabs_free/inuse accounting error"; 3433 num_slabs++; 3434 } 3435 free_objects += l3->free_objects; 3436 shared_avail += l3->shared->avail; 3437 3438 spin_unlock(&l3->list_lock); 3439 } 3440 num_slabs+=active_slabs; 3441 num_objs = num_slabs*cachep->num; 3442 if (num_objs - active_objs != free_objects && !error) 3443 error = "free_objects accounting error"; 3444 3445 name = cachep->name; 3446 if (error) 3447 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 3448 3449 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 3450 name, active_objs, num_objs, cachep->objsize, 3451 cachep->num, (1<<cachep->gfporder)); 3452 seq_printf(m, " : tunables %4u %4u %4u", 3453 cachep->limit, cachep->batchcount, 3454 cachep->shared); 3455 seq_printf(m, " : slabdata %6lu %6lu %6lu", 3456 active_slabs, num_slabs, shared_avail); 3457#if STATS 3458 { /* list3 stats */ 3459 unsigned long high = cachep->high_mark; 3460 unsigned long allocs = cachep->num_allocations; 3461 unsigned long grown = cachep->grown; 3462 unsigned long reaped = cachep->reaped; 3463 unsigned long errors = cachep->errors; 3464 unsigned long max_freeable = cachep->max_freeable; 3465 unsigned long node_allocs = cachep->node_allocs; 3466 unsigned long node_frees = cachep->node_frees; 3467 3468 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ 3469 %4lu %4lu %4lu %4lu", 3470 allocs, high, grown, reaped, errors, 3471 max_freeable, node_allocs, node_frees); 3472 } 3473 /* cpu stats */ 3474 { 3475 unsigned long allochit = atomic_read(&cachep->allochit); 3476 unsigned long allocmiss = atomic_read(&cachep->allocmiss); 3477 unsigned long freehit = atomic_read(&cachep->freehit); 3478 unsigned long freemiss = atomic_read(&cachep->freemiss); 3479 3480 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", 3481 allochit, allocmiss, freehit, freemiss); 3482 } 3483#endif 3484 seq_putc(m, '\n'); 3485 spin_unlock_irq(&cachep->spinlock); 3486 return 0; 3487} 3488 3489/* 3490 * slabinfo_op - iterator that generates /proc/slabinfo 3491 * 3492 * Output layout: 3493 * 
cache-name 3494 * num-active-objs 3495 * total-objs 3496 * object size 3497 * num-active-slabs 3498 * total-slabs 3499 * num-pages-per-slab 3500 * + further values on SMP and with statistics enabled 3501 */ 3502 3503struct seq_operations slabinfo_op = { 3504 .start = s_start, 3505 .next = s_next, 3506 .stop = s_stop, 3507 .show = s_show, 3508}; 3509 3510#define MAX_SLABINFO_WRITE 128 3511/** 3512 * slabinfo_write - Tuning for the slab allocator 3513 * @file: unused 3514 * @buffer: user buffer 3515 * @count: data length 3516 * @ppos: unused 3517 */ 3518ssize_t slabinfo_write(struct file *file, const char __user *buffer, 3519 size_t count, loff_t *ppos) 3520{ 3521 char kbuf[MAX_SLABINFO_WRITE+1], *tmp; 3522 int limit, batchcount, shared, res; 3523 struct list_head *p; 3524 3525 if (count > MAX_SLABINFO_WRITE) 3526 return -EINVAL; 3527 if (copy_from_user(&kbuf, buffer, count)) 3528 return -EFAULT; 3529 kbuf[MAX_SLABINFO_WRITE] = '\0'; 3530 3531 tmp = strchr(kbuf, ' '); 3532 if (!tmp) 3533 return -EINVAL; 3534 *tmp = '\0'; 3535 tmp++; 3536 if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3) 3537 return -EINVAL; 3538 3539 /* Find the cache in the chain of caches. */ 3540 down(&cache_chain_sem); 3541 res = -EINVAL; 3542 list_for_each(p,&cache_chain) { 3543 kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); 3544 3545 if (!strcmp(cachep->name, kbuf)) { 3546 if (limit < 1 || 3547 batchcount < 1 || 3548 batchcount > limit || 3549 shared < 0) { 3550 res = 0; 3551 } else { 3552 res = do_tune_cpucache(cachep, limit, 3553 batchcount, shared); 3554 } 3555 break; 3556 } 3557 } 3558 up(&cache_chain_sem); 3559 if (res >= 0) 3560 res = count; 3561 return res; 3562} 3563#endif 3564 3565/** 3566 * ksize - get the actual amount of memory allocated for a given object 3567 * @objp: Pointer to the object 3568 * 3569 * kmalloc may internally round up allocations and return more memory 3570 * than requested. ksize() can be used to determine the actual amount of 3571 * memory allocated. The caller may use this additional memory, even though 3572 * a smaller amount of memory was initially specified with the kmalloc call. 3573 * The caller must guarantee that objp points to a valid object previously 3574 * allocated with either kmalloc() or kmem_cache_alloc(). The object 3575 * must not be freed during the duration of the call. 3576 */ 3577unsigned int ksize(const void *objp) 3578{ 3579 if (unlikely(objp == NULL)) 3580 return 0; 3581 3582 return obj_reallen(GET_PAGE_CACHE(virt_to_page(objp))); 3583} 3584 3585 3586/* 3587 * kstrdup - allocate space for and copy an existing string 3588 * 3589 * @s: the string to duplicate 3590 * @gfp: the GFP mask used in the kmalloc() call when allocating memory 3591 */ 3592char *kstrdup(const char *s, unsigned int __nocast gfp) 3593{ 3594 size_t len; 3595 char *buf; 3596 3597 if (!s) 3598 return NULL; 3599 3600 len = strlen(s) + 1; 3601 buf = kmalloc(len, gfp); 3602 if (buf) 3603 memcpy(buf, s, len); 3604 return buf; 3605} 3606EXPORT_SYMBOL(kstrdup); 3607
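/*
 * kstrdup() usage sketch ('name' stands for any caller-owned string):
 *
 *	char *copy = kstrdup(name, GFP_KERNEL);
 *	if (!copy)
 *		return -ENOMEM;
 *	...
 *	kfree(copy);
 */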