slab.c revision 7243cc05bafdda4c4de77cba00cf87666bd237f7
/*
 * linux/mm/slab.c
 * Written by Mark Hemment, 1996/97.
 * (markhe@nextd.demon.co.uk)
 *
 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
 *
 * Major cleanup, different bufctl logic, per-cpu arrays
 *	(c) 2000 Manfred Spraul
 *
 * Cleanup, make the head arrays unconditional, preparation for NUMA
 *	(c) 2002 Manfred Spraul
 *
 * An implementation of the Slab Allocator as described in outline in;
 *	UNIX Internals: The New Frontiers by Uresh Vahalia
 *	Pub: Prentice Hall	ISBN 0-13-101908-2
 * or with a little more detail in;
 *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
 *	Jeff Bonwick (Sun Microsystems).
 *	Presented at: USENIX Summer 1994 Technical Conference
 *
 * The memory is organized in caches, one cache for each object type.
 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
 * Each cache consists of many slabs (they are small (usually one
 * page long) and always contiguous), and each slab contains multiple
 * initialized objects.
 *
 * This means that your constructor is used only for newly allocated
 * slabs and you must pass objects with the same initializations to
 * kmem_cache_free.
 *
 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
 * normal). If you need a special memory type, then you must create a new
 * cache for that memory type.
 *
 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
 *   full slabs with 0 free objects
 *   partial slabs
 *   empty slabs with no allocated objects
 *
 * If partial slabs exist, then new allocations come from these slabs,
 * otherwise from empty slabs, or new slabs are allocated.
 *
 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
 *
 * Each cache has a short per-cpu head array, most allocs
 * and frees go into that array, and if that array overflows, then 1/2
 * of the entries in the array are given back into the global cache.
 * The head array is strictly LIFO and should improve the cache hit rates.
 * On SMP, it additionally reduces the spinlock operations.
 *
 * The c_cpuarray may not be read with enabled local interrupts -
 * it's changed with a smp_call_function().
 *
 * SMP synchronization:
 *  constructors and destructors are called without any locking.
 *  Several members in kmem_cache_t and struct slab never change, they
 *	are accessed without any locking.
 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
 *	and local interrupts are disabled so slab code is preempt-safe.
 *  The non-constant members are protected with a per-cache irq spinlock.
 *
 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
 * in 2000 - many ideas in the current implementation are derived from
 * his patch.
 *
 * Further notes from the original documentation:
 *
 * 11 April '97.  Started multi-threading - markhe
 *	The global cache-chain is protected by the semaphore 'cache_chain_sem'.
 *	The sem is only needed when accessing/extending the cache-chain, which
 *	can never happen inside an interrupt (kmem_cache_create(),
 *	kmem_cache_shrink() and kmem_cache_reap()).
 *
 *	At present, each engine can be growing a cache.  This should be blocked.
 *
 * 15 March 2005. NUMA slab allocator.
 *	Shai Fultheim <shai@scalex86.org>.
 *	Shobhit Dayal <shobhit@calsoftinc.com>
 *	Alok N Kataria <alokk@calsoftinc.com>
 *	Christoph Lameter <christoph@lameter.com>
 *
 * Modified the slab allocator to be node aware on NUMA systems.
 * Each node has its own list of partial, free and full slabs.
 * All object allocations for a node occur from node specific slab lists.
 */
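/*
 * Illustrative sketch (not part of this file) of the lifecycle described
 * above, using a hypothetical cache and object type. The constructor runs
 * once per object when a new slab is populated, so objects must be handed
 * back to kmem_cache_free() in their initialized state:
 *
 *	static kmem_cache_t *my_cache;		// hypothetical
 *
 *	static void my_ctor(void *obj, kmem_cache_t *cachep, unsigned long flags)
 *	{
 *		memset(obj, 0, sizeof(struct my_obj));
 *	}
 *
 *	my_cache = kmem_cache_create("my_obj", sizeof(struct my_obj),
 *				     0, SLAB_HWCACHE_ALIGN, my_ctor, NULL);
 *	obj = kmem_cache_alloc(my_cache, GFP_KERNEL);
 *	...
 *	kmem_cache_free(my_cache, obj);	// returns to the cache, still initialized
 */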
#include <linux/config.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/cache.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/kallsyms.h>
#include <linux/cpu.h>
#include <linux/sysctl.h>
#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/string.h>
#include <linux/nodemask.h>

#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>

/*
 * DEBUG	- 1 for kmem_cache_create() to honour: SLAB_DEBUG_INITIAL,
 *		  SLAB_RED_ZONE & SLAB_POISON.
 *		  0 for faster, smaller code (especially in the critical paths).
 *
 * STATS	- 1 to collect stats for /proc/slabinfo.
 *		  0 for faster, smaller code (especially in the critical paths).
 *
 * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
 */

#ifdef CONFIG_DEBUG_SLAB
#define	DEBUG		1
#define	STATS		1
#define	FORCED_DEBUG	1
#else
#define	DEBUG		0
#define	STATS		0
#define	FORCED_DEBUG	0
#endif


/* Shouldn't this be in a header file somewhere? */
#define	BYTES_PER_WORD		sizeof(void *)

#ifndef cache_line_size
#define cache_line_size()	L1_CACHE_BYTES
#endif

#ifndef ARCH_KMALLOC_MINALIGN
/*
 * Enforce a minimum alignment for the kmalloc caches.
 * Usually, the kmalloc caches are cache_line_size() aligned, except when
 * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
 * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that.
 * Note that this flag disables some debug features.
 */
#define ARCH_KMALLOC_MINALIGN 0
#endif

#ifndef ARCH_SLAB_MINALIGN
/*
 * Enforce a minimum alignment for all caches.
 * Intended for archs that get misalignment faults even for BYTES_PER_WORD
 * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
 * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
 * some debug features.
 */
#define ARCH_SLAB_MINALIGN 0
#endif

#ifndef ARCH_KMALLOC_FLAGS
#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
#endif
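/*
 * Example (hypothetical, for illustration only): an architecture whose DMA
 * engine requires 16-byte-aligned buffers could guarantee that kmalloc()
 * returns suitably aligned memory by defining, in its asm/ headers:
 *
 *	#define ARCH_KMALLOC_MINALIGN	16
 *
 * at the cost of the debug features noted above.
 */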
/* Legal flag mask for kmem_cache_create(). */
#if DEBUG
# define CREATE_MASK	(SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
			 SLAB_NO_REAP | SLAB_CACHE_DMA | \
			 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
			 SLAB_DESTROY_BY_RCU)
#else
# define CREATE_MASK	(SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \
			 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
			 SLAB_DESTROY_BY_RCU)
#endif

/*
 * kmem_bufctl_t:
 *
 * Bufctls are used for linking objs within a slab, as linked offsets.
 *
 * This implementation relies on "struct page" for locating the cache &
 * slab an object belongs to.
 * This allows the bufctl structure to be small (one int), but limits
 * the number of objects a slab (not a cache) can contain when off-slab
 * bufctls are used. The limit is the size of the largest general cache
 * that does not use off-slab slabs.
 * For 32bit archs with 4 kB pages, this is 56.
 * This is not serious, as it is only for large objects, when it is unwise
 * to have too many per slab.
 * Note: This limit can be raised by introducing a general cache whose size
 * is less than 512 (PAGE_SIZE<<3), but greater than 256.
 */

typedef unsigned int kmem_bufctl_t;
#define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
#define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
#define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-2)

/* Max number of objs-per-slab for caches which use off-slab slabs.
 * Needed to avoid a possible looping condition in cache_grow().
 */
static unsigned long offslab_limit;

/*
 * struct slab
 *
 * Manages the objs in a slab. Placed either at the beginning of mem allocated
 * for a slab, or allocated from a general cache.
 * Slabs are chained into three lists: fully used, partial, fully free slabs.
 */
struct slab {
	struct list_head	list;
	unsigned long		colouroff;
	void			*s_mem;		/* including colour offset */
	unsigned int		inuse;		/* num of objs active in slab */
	kmem_bufctl_t		free;
	unsigned short		nodeid;
};
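/*
 * Illustrative sketch of the bufctl free list (assumed semantics; names as
 * used later in this file): the kmem_bufctl_t array sits directly behind
 * struct slab, and each entry holds the index of the next free object, so
 * popping the first free object looks roughly like:
 *
 *	kmem_bufctl_t next;
 *
 *	objp = slabp->s_mem + slabp->free * cachep->objsize;
 *	next = slab_bufctl(slabp)[slabp->free];	// index of next free obj
 *	slabp->free = next;			// BUFCTL_END once the slab is full
 *	slabp->inuse++;
 */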
/*
 * struct slab_rcu
 *
 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
 * arrange for kmem_freepages to be called via RCU.  This is useful if
 * we need to approach a kernel structure obliquely, from its address
 * obtained without the usual locking.  We can lock the structure to
 * stabilize it and check it's still at the given address, only if we
 * can be sure that the memory has not been meanwhile reused for some
 * other kind of object (which our subsystem's lock might corrupt).
 *
 * rcu_read_lock before reading the address, then rcu_read_unlock after
 * taking the spinlock within the structure expected at that address.
 *
 * We assume struct slab_rcu can overlay struct slab when destroying.
 */
struct slab_rcu {
	struct rcu_head		head;
	kmem_cache_t		*cachep;
	void			*addr;
};

/*
 * struct array_cache
 *
 * Purpose:
 * - LIFO ordering, to hand out cache-warm objects from _alloc
 * - reduce the number of linked list operations
 * - reduce spinlock operations
 *
 * The limit is stored in the per-cpu structure to reduce the data cache
 * footprint.
 *
 */
struct array_cache {
	unsigned int avail;
	unsigned int limit;
	unsigned int batchcount;
	unsigned int touched;
	spinlock_t lock;
	void *entry[0];	/*
			 * Must have this definition in here for the proper
			 * alignment of array_cache. Also simplifies accessing
			 * the entries.
			 * [0] is for gcc 2.95. It should really be [].
			 */
};

/* bootstrap: The caches do not work without cpuarrays anymore,
 * but the cpuarrays are allocated from the generic caches...
 */
#define BOOT_CPUCACHE_ENTRIES	1
struct arraycache_init {
	struct array_cache cache;
	void * entries[BOOT_CPUCACHE_ENTRIES];
};

/*
 * The slab lists for all objects.
 */
struct kmem_list3 {
	struct list_head	slabs_partial;	/* partial list first, better asm code */
	struct list_head	slabs_full;
	struct list_head	slabs_free;
	unsigned long	free_objects;
	unsigned long	next_reap;
	int		free_touched;
	unsigned int	free_limit;
	spinlock_t	list_lock;
	struct array_cache	*shared;	/* shared per node */
	struct array_cache	**alien;	/* on other nodes */
};

/*
 * Need this for bootstrapping a per node allocator.
 */
#define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1)
struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
#define	CACHE_CACHE 0
#define	SIZE_AC 1
#define	SIZE_L3 (1 + MAX_NUMNODES)

/*
 * This function must be completely optimized away if
 * a constant is passed to it. Mostly the same as
 * what is in linux/slab.h except it returns an
 * index.
 */
static __always_inline int index_of(const size_t size)
{
	if (__builtin_constant_p(size)) {
		int i = 0;

#define CACHE(x) \
	if (size <=x) \
		return i; \
	else \
		i++;
#include "linux/kmalloc_sizes.h"
#undef CACHE
		{
			extern void __bad_size(void);
			__bad_size();
		}
	} else
		BUG();
	return 0;
}

#define INDEX_AC index_of(sizeof(struct arraycache_init))
#define INDEX_L3 index_of(sizeof(struct kmem_list3))

static inline void kmem_list3_init(struct kmem_list3 *parent)
{
	INIT_LIST_HEAD(&parent->slabs_full);
	INIT_LIST_HEAD(&parent->slabs_partial);
	INIT_LIST_HEAD(&parent->slabs_free);
	parent->shared = NULL;
	parent->alien = NULL;
	spin_lock_init(&parent->list_lock);
	parent->free_objects = 0;
	parent->free_touched = 0;
}

#define MAKE_LIST(cachep, listp, slab, nodeid)	\
	do {	\
		INIT_LIST_HEAD(listp);		\
		list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
	} while (0)

#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
	do {								\
	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
	} while (0)
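/*
 * Illustrative sketch for index_of() above (assumed kmalloc size table
 * starting 32, 64, 96, 128, ...): the chain of ifs folds to a constant at
 * compile time, e.g.
 *
 *	index_of(24)  -> 0	(first cache, 32 bytes, fits 24)
 *	index_of(100) -> 3	(128-byte cache; 96 < 100 <= 128)
 *
 * so INDEX_AC and INDEX_L3 are compile-time constants selecting the
 * general caches that back struct arraycache_init and struct kmem_list3.
 */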
/*
 * kmem_cache_t
 *
 * manages a cache.
 */

struct kmem_cache_s {
/* 1) per-cpu data, touched during every alloc/free */
	struct array_cache	*array[NR_CPUS];
	unsigned int		batchcount;
	unsigned int		limit;
	unsigned int		shared;
	unsigned int		objsize;
/* 2) touched by every alloc & free from the backend */
	struct kmem_list3	*nodelists[MAX_NUMNODES];
	unsigned int		flags;	/* constant flags */
	unsigned int		num;	/* # of objs per slab */
	spinlock_t		spinlock;

/* 3) cache_grow/shrink */
	/* order of pgs per slab (2^n) */
	unsigned int		gfporder;

	/* force GFP flags, e.g. GFP_DMA */
	unsigned int		gfpflags;

	size_t			colour;		/* cache colouring range */
	unsigned int		colour_off;	/* colour offset */
	unsigned int		colour_next;	/* cache colouring */
	kmem_cache_t		*slabp_cache;
	unsigned int		slab_size;
	unsigned int		dflags;		/* dynamic flags */

	/* constructor func */
	void (*ctor)(void *, kmem_cache_t *, unsigned long);

	/* de-constructor func */
	void (*dtor)(void *, kmem_cache_t *, unsigned long);

/* 4) cache creation/removal */
	const char		*name;
	struct list_head	next;

/* 5) statistics */
#if STATS
	unsigned long		num_active;
	unsigned long		num_allocations;
	unsigned long		high_mark;
	unsigned long		grown;
	unsigned long		reaped;
	unsigned long		errors;
	unsigned long		max_freeable;
	unsigned long		node_allocs;
	unsigned long		node_frees;
	atomic_t		allochit;
	atomic_t		allocmiss;
	atomic_t		freehit;
	atomic_t		freemiss;
#endif
#if DEBUG
	int			dbghead;
	int			reallen;
#endif
};

#define CFLGS_OFF_SLAB		(0x80000000UL)
#define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)

#define BATCHREFILL_LIMIT	16
/* Optimization question: fewer reaps means less
 * probability for unnecessary cpucache drain/refill cycles.
 *
 * OTOH the cpuarrays can contain lots of objects,
 * which could lock up otherwise freeable slabs.
 */
#define REAPTIMEOUT_CPUC	(2*HZ)
#define REAPTIMEOUT_LIST3	(4*HZ)

#if STATS
#define	STATS_INC_ACTIVE(x)	((x)->num_active++)
#define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
#define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
#define	STATS_INC_GROWN(x)	((x)->grown++)
#define	STATS_INC_REAPED(x)	((x)->reaped++)
#define	STATS_SET_HIGH(x)	do { if ((x)->num_active > (x)->high_mark) \
					(x)->high_mark = (x)->num_active; \
				} while (0)
#define	STATS_INC_ERR(x)	((x)->errors++)
#define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
#define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
#define	STATS_SET_FREEABLE(x, i) \
				do { if ((x)->max_freeable < i) \
					(x)->max_freeable = i; \
				} while (0)

#define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
#define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
#define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
#define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
#else
#define	STATS_INC_ACTIVE(x)	do { } while (0)
#define	STATS_DEC_ACTIVE(x)	do { } while (0)
#define	STATS_INC_ALLOCED(x)	do { } while (0)
#define	STATS_INC_GROWN(x)	do { } while (0)
#define	STATS_INC_REAPED(x)	do { } while (0)
#define	STATS_SET_HIGH(x)	do { } while (0)
#define	STATS_INC_ERR(x)	do { } while (0)
#define	STATS_INC_NODEALLOCS(x)	do { } while (0)
#define	STATS_INC_NODEFREES(x)	do { } while (0)
#define	STATS_SET_FREEABLE(x, i) \
				do { } while (0)

#define STATS_INC_ALLOCHIT(x)	do { } while (0)
#define STATS_INC_ALLOCMISS(x)	do { } while (0)
#define STATS_INC_FREEHIT(x)	do { } while (0)
#define STATS_INC_FREEMISS(x)	do { } while (0)
#endif
#if DEBUG
/* Magic nums for obj red zoning.
 * Placed in the first word before and the first word after an obj.
 */
#define	RED_INACTIVE	0x5A2CF071UL	/* when obj is inactive */
#define	RED_ACTIVE	0x170FC2A5UL	/* when obj is active */

/* ...and for poisoning */
#define	POISON_INUSE	0x5a	/* for use-uninitialised poisoning */
#define POISON_FREE	0x6b	/* for use-after-free poisoning */
#define	POISON_END	0xa5	/* end-byte of poisoning */

/* memory layout of objects:
 * 0		: objp
 * 0 .. cachep->dbghead - BYTES_PER_WORD - 1: padding. This ensures that
 * 		the end of an object is aligned with the end of the real
 * 		allocation. Catches writes behind the end of the allocation.
 * cachep->dbghead - BYTES_PER_WORD .. cachep->dbghead - 1:
 * 		redzone word.
 * cachep->dbghead: The real object.
 * cachep->objsize - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
 * cachep->objsize - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long]
 */
static int obj_dbghead(kmem_cache_t *cachep)
{
	return cachep->dbghead;
}

static int obj_reallen(kmem_cache_t *cachep)
{
	return cachep->reallen;
}

static unsigned long *dbg_redzone1(kmem_cache_t *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
	return (unsigned long*) (objp+obj_dbghead(cachep)-BYTES_PER_WORD);
}

static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
	if (cachep->flags & SLAB_STORE_USER)
		return (unsigned long*) (objp+cachep->objsize-2*BYTES_PER_WORD);
	return (unsigned long*) (objp+cachep->objsize-BYTES_PER_WORD);
}

static void **dbg_userword(kmem_cache_t *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
	return (void**)(objp+cachep->objsize-BYTES_PER_WORD);
}

#else

#define obj_dbghead(x)			0
#define obj_reallen(cachep)		(cachep->objsize)
#define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long *)NULL;})
#define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long *)NULL;})
#define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})

#endif
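/*
 * Worked example of the debug layout above (illustrative numbers): for a
 * cache with reallen = 16, SLAB_RED_ZONE and SLAB_STORE_USER on a 32bit
 * arch (BYTES_PER_WORD = 4), kmem_cache_create() ends up with dbghead = 4
 * and objsize = 28, laid out as:
 *
 *	objp + 0  .. objp + 3	first redzone word   (dbg_redzone1)
 *	objp + 4  .. objp + 19	the real object
 *	objp + 20 .. objp + 23	second redzone word  (dbg_redzone2)
 *	objp + 24 .. objp + 27	last caller address  (dbg_userword)
 */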
/*
 * Maximum size of an obj (in 2^order pages)
 * and absolute limit for the gfp order.
 */
#if defined(CONFIG_LARGE_ALLOCS)
#define	MAX_OBJ_ORDER	13	/* up to 32Mb */
#define	MAX_GFP_ORDER	13	/* up to 32Mb */
#elif defined(CONFIG_MMU)
#define	MAX_OBJ_ORDER	5	/* 32 pages */
#define	MAX_GFP_ORDER	5	/* 32 pages */
#else
#define	MAX_OBJ_ORDER	8	/* up to 1Mb */
#define	MAX_GFP_ORDER	8	/* up to 1Mb */
#endif

/*
 * Do not go above this order unless 0 objects fit into the slab.
 */
#define	BREAK_GFP_ORDER_HI	1
#define	BREAK_GFP_ORDER_LO	0
static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;

/* Macros for storing/retrieving the cachep and or slab from the
 * global 'mem_map'. These are used to find the slab an obj belongs to.
 * With kfree(), these are used to find the cache which an obj belongs to.
 */
#define	SET_PAGE_CACHE(pg,x)  ((pg)->lru.next = (struct list_head *)(x))
#define	GET_PAGE_CACHE(pg)    ((kmem_cache_t *)(pg)->lru.next)
#define	SET_PAGE_SLAB(pg,x)   ((pg)->lru.prev = (struct list_head *)(x))
#define	GET_PAGE_SLAB(pg)     ((struct slab *)(pg)->lru.prev)

/* These are the default caches for kmalloc. Custom caches can have other sizes. */
struct cache_sizes malloc_sizes[] = {
#define CACHE(x) { .cs_size = (x) },
#include <linux/kmalloc_sizes.h>
	CACHE(ULONG_MAX)
#undef CACHE
};
EXPORT_SYMBOL(malloc_sizes);

/* Must match cache_sizes above. Out of line to keep cache footprint low. */
struct cache_names {
	char *name;
	char *name_dma;
};

static struct cache_names __initdata cache_names[] = {
#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
#include <linux/kmalloc_sizes.h>
	{ NULL, }
#undef CACHE
};

static struct arraycache_init initarray_cache __initdata =
	{ { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
static struct arraycache_init initarray_generic =
	{ { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} };

/* internal cache of cache description objs */
static kmem_cache_t cache_cache = {
	.batchcount	= 1,
	.limit		= BOOT_CPUCACHE_ENTRIES,
	.shared		= 1,
	.objsize	= sizeof(kmem_cache_t),
	.flags		= SLAB_NO_REAP,
	.spinlock	= SPIN_LOCK_UNLOCKED,
	.name		= "kmem_cache",
#if DEBUG
	.reallen	= sizeof(kmem_cache_t),
#endif
};

/* Guard access to the cache-chain. */
static struct semaphore	cache_chain_sem;
static struct list_head cache_chain;

/*
 * vm_enough_memory() looks at this to determine how many
 * slab-allocated pages are possibly freeable under pressure
 *
 * SLAB_RECLAIM_ACCOUNT turns this on per-slab
 */
atomic_t slab_reclaim_pages;

/*
 * chicken and egg problem: delay the per-cpu array allocation
 * until the general caches are up.
 */
static enum {
	NONE,
	PARTIAL_AC,
	PARTIAL_L3,
	FULL
} g_cpucache_up;

static DEFINE_PER_CPU(struct work_struct, reap_work);

static void free_block(kmem_cache_t* cachep, void** objpp, int len);
static void enable_cpucache (kmem_cache_t *cachep);
static void cache_reap (void *unused);
static int __node_shrink(kmem_cache_t *cachep, int node);

static inline struct array_cache *ac_data(kmem_cache_t *cachep)
{
	return cachep->array[smp_processor_id()];
}

static inline kmem_cache_t *__find_general_cachep(size_t size,
						unsigned int __nocast gfpflags)
{
	struct cache_sizes *csizep = malloc_sizes;

#if DEBUG
	/* This happens if someone tries to call
	 * kmem_cache_create(), or __kmalloc(), before
	 * the generic caches are initialized.
	 */
	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
#endif
	while (size > csizep->cs_size)
		csizep++;

	/*
	 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
	 * has cs_{dma,}cachep==NULL. Thus no special case
	 * for large kmalloc calls required.
	 */
	if (unlikely(gfpflags & GFP_DMA))
		return csizep->cs_dmacachep;
	return csizep->cs_cachep;
}

kmem_cache_t *kmem_find_general_cachep(size_t size,
		unsigned int __nocast gfpflags)
{
	return __find_general_cachep(size, gfpflags);
}
EXPORT_SYMBOL(kmem_find_general_cachep);
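/*
 * Illustrative sketch (assumed size table 32, 64, 96, 128, ...): the lookup
 * above is a linear scan that stops at the first cache large enough, e.g.
 *
 *	kmem_find_general_cachep(100, GFP_KERNEL) -> "size-128" cache
 *	kmem_find_general_cachep(100, GFP_DMA)    -> "size-128(DMA)" cache
 *
 * while oversized requests land on the ULONG_MAX terminator entry, whose
 * cachep pointers are NULL, so the caller sees NULL without a special case.
 */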
/* Calculate the num objs, wastage, and bytes left over for a given slab size. */
static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
		int flags, size_t *left_over, unsigned int *num)
{
	int i;
	size_t wastage = PAGE_SIZE<<gfporder;
	size_t extra = 0;
	size_t base = 0;

	if (!(flags & CFLGS_OFF_SLAB)) {
		base = sizeof(struct slab);
		extra = sizeof(kmem_bufctl_t);
	}
	i = 0;
	while (i*size + ALIGN(base+i*extra, align) <= wastage)
		i++;
	if (i > 0)
		i--;

	if (i > SLAB_LIMIT)
		i = SLAB_LIMIT;

	*num = i;
	wastage -= i*size;
	wastage -= ALIGN(base+i*extra, align);
	*left_over = wastage;
}
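/*
 * Worked example for cache_estimate() above (illustrative numbers): on-slab
 * management, PAGE_SIZE = 4096, gfporder = 0, size = 1024, align = 32, and
 * assuming sizeof(struct slab) = 28 and sizeof(kmem_bufctl_t) = 4:
 *
 *	i = 3:	3*1024 + ALIGN(28 + 3*4, 32) = 3072 + 64 = 3136 <= 4096
 *	i = 4:	4*1024 + ALIGN(28 + 4*4, 32) = 4096 + 64 = 4160  > 4096
 *
 * so *num = 3 objects per slab and *left_over = 4096 - 3072 - 64 = 960
 * bytes, which kmem_cache_create() later spends on cache colouring.
 */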
#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)

static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg)
{
	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
		function, cachep->name, msg);
	dump_stack();
}

/*
 * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
 * via the workqueue/eventd.
 * Add the CPU number into the expiration time to minimize the possibility of
 * the CPUs getting into lockstep and contending for the global cache chain
 * lock.
 */
static void __devinit start_cpu_timer(int cpu)
{
	struct work_struct *reap_work = &per_cpu(reap_work, cpu);

	/*
	 * When this gets called from do_initcalls via cpucache_init(),
	 * init_workqueues() has already run, so keventd will be setup
	 * at that time.
	 */
	if (keventd_up() && reap_work->func == NULL) {
		INIT_WORK(reap_work, cache_reap, NULL);
		schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
	}
}

static struct array_cache *alloc_arraycache(int node, int entries,
						int batchcount)
{
	int memsize = sizeof(void*)*entries+sizeof(struct array_cache);
	struct array_cache *nc = NULL;

	nc = kmalloc_node(memsize, GFP_KERNEL, node);
	if (nc) {
		nc->avail = 0;
		nc->limit = entries;
		nc->batchcount = batchcount;
		nc->touched = 0;
		spin_lock_init(&nc->lock);
	}
	return nc;
}

#ifdef CONFIG_NUMA
static inline struct array_cache **alloc_alien_cache(int node, int limit)
{
	struct array_cache **ac_ptr;
	int memsize = sizeof(void*)*MAX_NUMNODES;
	int i;

	if (limit > 1)
		limit = 12;
	ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
	if (ac_ptr) {
		for_each_node(i) {
			if (i == node || !node_online(i)) {
				ac_ptr[i] = NULL;
				continue;
			}
			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
			if (!ac_ptr[i]) {
				/* unwind the entries allocated so far */
				for (i--; i >= 0; i--)
					kfree(ac_ptr[i]);
				kfree(ac_ptr);
				return NULL;
			}
		}
	}
	return ac_ptr;
}

static inline void free_alien_cache(struct array_cache **ac_ptr)
{
	int i;

	if (!ac_ptr)
		return;

	for_each_node(i)
		kfree(ac_ptr[i]);

	kfree(ac_ptr);
}

static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache *ac, int node)
{
	struct kmem_list3 *rl3 = cachep->nodelists[node];

	if (ac->avail) {
		spin_lock(&rl3->list_lock);
		free_block(cachep, ac->entry, ac->avail);
		ac->avail = 0;
		spin_unlock(&rl3->list_lock);
	}
}

static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
{
	int i=0;
	struct array_cache *ac;
	unsigned long flags;

	for_each_online_node(i) {
		ac = l3->alien[i];
		if (ac) {
			spin_lock_irqsave(&ac->lock, flags);
			__drain_alien_cache(cachep, ac, i);
			spin_unlock_irqrestore(&ac->lock, flags);
		}
	}
}
#else
#define alloc_alien_cache(node, limit) do { } while (0)
#define free_alien_cache(ac_ptr) do { } while (0)
#define drain_alien_cache(cachep, l3) do { } while (0)
#endif

static int __devinit cpuup_callback(struct notifier_block *nfb,
				  unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;
	kmem_cache_t* cachep;
	struct kmem_list3 *l3 = NULL;
	int node = cpu_to_node(cpu);
	int memsize = sizeof(struct kmem_list3);
	struct array_cache *nc = NULL;

	switch (action) {
	case CPU_UP_PREPARE:
		down(&cache_chain_sem);
		/* we need to do this right in the beginning since
		 * the alloc_arraycache calls are going to use this list.
		 * kmalloc_node allows us to add the slab to the right
		 * kmem_list3 and not this cpu's kmem_list3
		 */

		list_for_each_entry(cachep, &cache_chain, next) {
			/* setup the size64 kmemlist for cpu before we can
			 * begin anything. Make sure some other cpu on this
			 * node has not already allocated this
			 */
			if (!cachep->nodelists[node]) {
				if (!(l3 = kmalloc_node(memsize,
						GFP_KERNEL, node)))
					goto bad;
				kmem_list3_init(l3);
				l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
				  ((unsigned long)cachep)%REAPTIMEOUT_LIST3;

				cachep->nodelists[node] = l3;
			}

			spin_lock_irq(&cachep->nodelists[node]->list_lock);
			cachep->nodelists[node]->free_limit =
				(1 + nr_cpus_node(node)) *
				cachep->batchcount + cachep->num;
			spin_unlock_irq(&cachep->nodelists[node]->list_lock);
		}

		/* Now we can go ahead with allocating the shared arrays
		   & array caches */
		list_for_each_entry(cachep, &cache_chain, next) {
			nc = alloc_arraycache(node, cachep->limit,
					cachep->batchcount);
			if (!nc)
				goto bad;
			cachep->array[cpu] = nc;

			l3 = cachep->nodelists[node];
			BUG_ON(!l3);
			if (!l3->shared) {
				if (!(nc = alloc_arraycache(node,
					cachep->shared*cachep->batchcount,
					0xbaadf00d)))
					goto bad;

				/* we are serialised from CPU_DEAD or
				   CPU_UP_CANCELLED by the cpucontrol lock */
				l3->shared = nc;
			}
		}
		up(&cache_chain_sem);
		break;
	case CPU_ONLINE:
		start_cpu_timer(cpu);
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
		/* fall thru */
	case CPU_UP_CANCELED:
		down(&cache_chain_sem);

		list_for_each_entry(cachep, &cache_chain, next) {
			struct array_cache *nc;
			cpumask_t mask;

			mask = node_to_cpumask(node);
			spin_lock_irq(&cachep->spinlock);
			/* cpu is dead; no one can alloc from it. */
			nc = cachep->array[cpu];
			cachep->array[cpu] = NULL;
			l3 = cachep->nodelists[node];

			if (!l3)
				goto unlock_cache;

			spin_lock(&l3->list_lock);

			/* Free limit for this kmem_list3 */
			l3->free_limit -= cachep->batchcount;
			if (nc)
				free_block(cachep, nc->entry, nc->avail);

			if (!cpus_empty(mask)) {
				spin_unlock(&l3->list_lock);
				goto unlock_cache;
			}

			if (l3->shared) {
				free_block(cachep, l3->shared->entry,
						l3->shared->avail);
				kfree(l3->shared);
				l3->shared = NULL;
			}
			if (l3->alien) {
				drain_alien_cache(cachep, l3);
				free_alien_cache(l3->alien);
				l3->alien = NULL;
			}

			/* free slabs belonging to this node */
			if (__node_shrink(cachep, node)) {
				cachep->nodelists[node] = NULL;
				spin_unlock(&l3->list_lock);
				kfree(l3);
			} else {
				spin_unlock(&l3->list_lock);
			}
unlock_cache:
			spin_unlock_irq(&cachep->spinlock);
			kfree(nc);
		}
		up(&cache_chain_sem);
		break;
#endif
	}
	return NOTIFY_OK;
bad:
	up(&cache_chain_sem);
	return NOTIFY_BAD;
}

static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };

/*
 * swap the static kmem_list3 with kmalloced memory
 */
static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list,
		int nodeid)
{
	struct kmem_list3 *ptr;

	BUG_ON(cachep->nodelists[nodeid] != list);
	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
	BUG_ON(!ptr);

	local_irq_disable();
	memcpy(ptr, list, sizeof(struct kmem_list3));
	MAKE_ALL_LISTS(cachep, ptr, nodeid);
	cachep->nodelists[nodeid] = ptr;
	local_irq_enable();
}

/* Initialisation.
 * Called after the gfp() functions have been enabled, and before smp_init().
 */
void __init kmem_cache_init(void)
{
	size_t left_over;
	struct cache_sizes *sizes;
	struct cache_names *names;
	int i;

	for (i = 0; i < NUM_INIT_LISTS; i++) {
		kmem_list3_init(&initkmem_list3[i]);
		if (i < MAX_NUMNODES)
			cache_cache.nodelists[i] = NULL;
	}

	/*
	 * Fragmentation resistance on low memory - only use bigger
	 * page orders on machines with more than 32MB of memory.
	 */
	if (num_physpages > (32 << 20) >> PAGE_SHIFT)
		slab_break_gfp_order = BREAK_GFP_ORDER_HI;
	/* Bootstrap is tricky, because several objects are allocated
	 * from caches that do not exist yet:
	 * 1) initialize the cache_cache cache: it contains the kmem_cache_t
	 *    structures of all caches, except cache_cache itself: cache_cache
	 *    is statically allocated.
	 *    Initially an __init data area is used for the head array and the
	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
	 *    array at the end of the bootstrap.
	 * 2) Create the first kmalloc cache.
	 *    The kmem_cache_t for the new cache is allocated normally.
	 *    An __init data area is used for the head array.
	 * 3) Create the remaining kmalloc caches, with minimally sized
	 *    head arrays.
	 * 4) Replace the __init data head arrays for cache_cache and the first
	 *    kmalloc cache with kmalloc allocated arrays.
	 * 5) Replace the __init data for kmem_list3 for cache_cache and
	 *    the other cache's with kmalloc allocated memory.
	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
	 */

	/* 1) create the cache_cache */
	init_MUTEX(&cache_chain_sem);
	INIT_LIST_HEAD(&cache_chain);
	list_add(&cache_cache.next, &cache_chain);
	cache_cache.colour_off = cache_line_size();
	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
	cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];

	cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());

	cache_estimate(0, cache_cache.objsize, cache_line_size(), 0,
				&left_over, &cache_cache.num);
	if (!cache_cache.num)
		BUG();

	cache_cache.colour = left_over/cache_cache.colour_off;
	cache_cache.colour_next = 0;
	cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) +
				sizeof(struct slab), cache_line_size());

	/* 2+3) create the kmalloc caches */
	sizes = malloc_sizes;
	names = cache_names;

	/* Initialize the caches that provide memory for the array cache
	 * and the kmem_list3 structures first.
	 * Without this, further allocations will BUG().
	 */

	sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
				sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN,
				(ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);

	if (INDEX_AC != INDEX_L3)
		sizes[INDEX_L3].cs_cachep =
			kmem_cache_create(names[INDEX_L3].name,
				sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN,
				(ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);

	while (sizes->cs_size != ULONG_MAX) {
		/*
		 * For performance, all the general caches are L1 aligned.
		 * This should be particularly beneficial on SMP boxes, as it
		 * eliminates "false sharing".
		 * Note for systems short on memory removing the alignment will
		 * allow tighter packing of the smaller caches.
		 */
		if(!sizes->cs_cachep)
			sizes->cs_cachep = kmem_cache_create(names->name,
				sizes->cs_size, ARCH_KMALLOC_MINALIGN,
				(ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
		/* Inc off-slab bufctl limit until the ceiling is hit. */
		if (!(OFF_SLAB(sizes->cs_cachep))) {
			offslab_limit = sizes->cs_size-sizeof(struct slab);
			offslab_limit /= sizeof(kmem_bufctl_t);
		}

		sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
			sizes->cs_size, ARCH_KMALLOC_MINALIGN,
			(ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC),
			NULL, NULL);

		sizes++;
		names++;
	}
	/* 4) Replace the bootstrap head arrays */
	{
		void * ptr;

		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);

		local_irq_disable();
		BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
		memcpy(ptr, ac_data(&cache_cache),
				sizeof(struct arraycache_init));
		cache_cache.array[smp_processor_id()] = ptr;
		local_irq_enable();

		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);

		local_irq_disable();
		BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep)
				!= &initarray_generic.cache);
		memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep),
				sizeof(struct arraycache_init));
		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
						ptr;
		local_irq_enable();
	}
	/* 5) Replace the bootstrap kmem_list3's */
	{
		int node;
		/* Replace the static kmem_list3 structures for the boot cpu */
		init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
				numa_node_id());

		for_each_online_node(node) {
			init_list(malloc_sizes[INDEX_AC].cs_cachep,
					&initkmem_list3[SIZE_AC+node], node);

			if (INDEX_AC != INDEX_L3) {
				init_list(malloc_sizes[INDEX_L3].cs_cachep,
						&initkmem_list3[SIZE_L3+node],
						node);
			}
		}
	}

	/* 6) resize the head arrays to their final sizes */
	{
		kmem_cache_t *cachep;
		down(&cache_chain_sem);
		list_for_each_entry(cachep, &cache_chain, next)
			enable_cpucache(cachep);
		up(&cache_chain_sem);
	}

	/* Done! */
	g_cpucache_up = FULL;

	/* Register a cpu startup notifier callback
	 * that initializes ac_data for all new cpus
	 */
	register_cpu_notifier(&cpucache_notifier);

	/* The reap timers are started later, with a module init call:
	 * That part of the kernel is not yet operational.
	 */
}

static int __init cpucache_init(void)
{
	int cpu;

	/*
	 * Register the timers that return unneeded
	 * pages to gfp.
	 */
	for_each_online_cpu(cpu)
		start_cpu_timer(cpu);

	return 0;
}

__initcall(cpucache_init);

/*
 * Interface to system's page allocator. No need to hold the cache-lock.
 *
 * If we requested dmaable memory, we will get it. Even if we
 * did not request dmaable memory, we might get it, but that
 * would be relatively rare and ignorable.
 */
static void *kmem_getpages(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid)
{
	struct page *page;
	void *addr;
	int i;

	flags |= cachep->gfpflags;
	if (likely(nodeid == -1)) {
		page = alloc_pages(flags, cachep->gfporder);
	} else {
		page = alloc_pages_node(nodeid, flags, cachep->gfporder);
	}
	if (!page)
		return NULL;
	addr = page_address(page);

	i = (1 << cachep->gfporder);
	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
		atomic_add(i, &slab_reclaim_pages);
	add_page_state(nr_slab, i);
	while (i--) {
		SetPageSlab(page);
		page++;
	}
	return addr;
}
/*
 * Interface to system's page release.
 */
static void kmem_freepages(kmem_cache_t *cachep, void *addr)
{
	unsigned long i = (1<<cachep->gfporder);
	struct page *page = virt_to_page(addr);
	const unsigned long nr_freed = i;

	while (i--) {
		if (!TestClearPageSlab(page))
			BUG();
		page++;
	}
	sub_page_state(nr_slab, nr_freed);
	if (current->reclaim_state)
		current->reclaim_state->reclaimed_slab += nr_freed;
	free_pages((unsigned long)addr, cachep->gfporder);
	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
		atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages);
}

static void kmem_rcu_free(struct rcu_head *head)
{
	struct slab_rcu *slab_rcu = (struct slab_rcu *) head;
	kmem_cache_t *cachep = slab_rcu->cachep;

	kmem_freepages(cachep, slab_rcu->addr);
	if (OFF_SLAB(cachep))
		kmem_cache_free(cachep->slabp_cache, slab_rcu);
}

#if DEBUG

#ifdef CONFIG_DEBUG_PAGEALLOC
static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
				unsigned long caller)
{
	int size = obj_reallen(cachep);

	addr = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)];

	if (size < 5*sizeof(unsigned long))
		return;

	*addr++=0x12345678;
	*addr++=caller;
	*addr++=smp_processor_id();
	size -= 3*sizeof(unsigned long);
	{
		unsigned long *sptr = &caller;
		unsigned long svalue;

		while (!kstack_end(sptr)) {
			svalue = *sptr++;
			if (kernel_text_address(svalue)) {
				*addr++=svalue;
				size -= sizeof(unsigned long);
				if (size <= sizeof(unsigned long))
					break;
			}
		}

	}
	*addr++=0x87654321;
}
#endif

static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
{
	int size = obj_reallen(cachep);
	addr = &((char*)addr)[obj_dbghead(cachep)];

	memset(addr, val, size);
	*(unsigned char *)(addr+size-1) = POISON_END;
}

static void dump_line(char *data, int offset, int limit)
{
	int i;
	printk(KERN_ERR "%03x:", offset);
	for (i=0;i<limit;i++) {
		printk(" %02x", (unsigned char)data[offset+i]);
	}
	printk("\n");
}
#endif

#if DEBUG

static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines)
{
	int i, size;
	char *realobj;

	if (cachep->flags & SLAB_RED_ZONE) {
		printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
			*dbg_redzone1(cachep, objp),
			*dbg_redzone2(cachep, objp));
	}

	if (cachep->flags & SLAB_STORE_USER) {
		printk(KERN_ERR "Last user: [<%p>]",
				*dbg_userword(cachep, objp));
		print_symbol("(%s)",
				(unsigned long)*dbg_userword(cachep, objp));
		printk("\n");
	}
	realobj = (char*)objp+obj_dbghead(cachep);
	size = obj_reallen(cachep);
	for (i=0; i<size && lines;i+=16, lines--) {
		int limit;
		limit = 16;
		if (i+limit > size)
			limit = size-i;
		dump_line(realobj, i, limit);
	}
}
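/*
 * Illustrative sketch of poisoning (assumed 16-byte object): after
 * poison_obj(cachep, objp, POISON_FREE) the object body reads
 *
 *	6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b a5
 *
 * i.e. POISON_FREE bytes terminated by POISON_END. check_poison_obj()
 * below reports any deviation as corruption (a likely use-after-free),
 * while freshly allocated objects are filled with POISON_INUSE (0x5a)
 * to catch use of uninitialised memory.
 */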
static void check_poison_obj(kmem_cache_t *cachep, void *objp)
{
	char *realobj;
	int size, i;
	int lines = 0;

	realobj = (char*)objp+obj_dbghead(cachep);
	size = obj_reallen(cachep);

	for (i=0;i<size;i++) {
		char exp = POISON_FREE;
		if (i == size-1)
			exp = POISON_END;
		if (realobj[i] != exp) {
			int limit;
			/* Mismatch ! */
			/* Print header */
			if (lines == 0) {
				printk(KERN_ERR "Slab corruption: start=%p, len=%d\n",
						realobj, size);
				print_objinfo(cachep, objp, 0);
			}
			/* Hexdump the affected line */
			i = (i/16)*16;
			limit = 16;
			if (i+limit > size)
				limit = size-i;
			dump_line(realobj, i, limit);
			i += 16;
			lines++;
			/* Limit to 5 lines */
			if (lines > 5)
				break;
		}
	}
	if (lines != 0) {
		/* Print some data about the neighboring objects, if they
		 * exist:
		 */
		struct slab *slabp = GET_PAGE_SLAB(virt_to_page(objp));
		int objnr;

		objnr = (objp-slabp->s_mem)/cachep->objsize;
		if (objnr) {
			objp = slabp->s_mem+(objnr-1)*cachep->objsize;
			realobj = (char*)objp+obj_dbghead(cachep);
			printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
					realobj, size);
			print_objinfo(cachep, objp, 2);
		}
		if (objnr+1 < cachep->num) {
			objp = slabp->s_mem+(objnr+1)*cachep->objsize;
			realobj = (char*)objp+obj_dbghead(cachep);
			printk(KERN_ERR "Next obj: start=%p, len=%d\n",
					realobj, size);
			print_objinfo(cachep, objp, 2);
		}
	}
}
#endif

/* Destroy all the objs in a slab, and release the mem back to the system.
 * Before calling the slab must have been unlinked from the cache.
 * The cache-lock is not held/needed.
 */
static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
{
	void *addr = slabp->s_mem - slabp->colouroff;

#if DEBUG
	int i;
	for (i = 0; i < cachep->num; i++) {
		void *objp = slabp->s_mem + cachep->objsize * i;

		if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
			if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep))
				kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1);
			else
				check_poison_obj(cachep, objp);
#else
			check_poison_obj(cachep, objp);
#endif
		}
		if (cachep->flags & SLAB_RED_ZONE) {
			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
				slab_error(cachep, "start of a freed object "
							"was overwritten");
			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
				slab_error(cachep, "end of a freed object "
							"was overwritten");
		}
		if (cachep->dtor && !(cachep->flags & SLAB_POISON))
			(cachep->dtor)(objp+obj_dbghead(cachep), cachep, 0);
	}
#else
	if (cachep->dtor) {
		int i;
		for (i = 0; i < cachep->num; i++) {
			void* objp = slabp->s_mem+cachep->objsize*i;
			(cachep->dtor)(objp, cachep, 0);
		}
	}
#endif

	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
		struct slab_rcu *slab_rcu;

		slab_rcu = (struct slab_rcu *) slabp;
		slab_rcu->cachep = cachep;
		slab_rcu->addr = addr;
		call_rcu(&slab_rcu->head, kmem_rcu_free);
	} else {
		kmem_freepages(cachep, addr);
		if (OFF_SLAB(cachep))
			kmem_cache_free(cachep->slabp_cache, slabp);
	}
}

/* For setting up all the kmem_list3s for cache whose objsize is same
   as size of kmem_list3. */
static inline void set_up_list3s(kmem_cache_t *cachep, int index)
{
	int node;

	for_each_online_node(node) {
		cachep->nodelists[node] = &initkmem_list3[index+node];
		cachep->nodelists[node]->next_reap = jiffies +
			REAPTIMEOUT_LIST3 +
			((unsigned long)cachep)%REAPTIMEOUT_LIST3;
	}
}
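/*
 * Illustrative sketch of the SLAB_DESTROY_BY_RCU pattern that the
 * struct slab_rcu comment above describes (hypothetical reader side):
 * because slab_destroy() only returns the pages after a grace period,
 * a stale pointer still references an object of the expected type, so
 * revalidation under the object's own lock is safe:
 *
 *	rcu_read_lock();
 *	obj = lookup_without_lock();		// hypothetical
 *	if (obj) {
 *		spin_lock(&obj->lock);
 *		if (!still_at_this_address(obj))	// hypothetical re-check
 *			goto retry;
 *		...
 *	}
 *	rcu_read_unlock();
 */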
/**
 * kmem_cache_create - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @ctor: A constructor for the objects.
 * @dtor: A destructor for the objects.
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within an interrupt, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache
 * and the @dtor is run before the pages are handed back.
 *
 * @name must be valid until the cache is destroyed. This implies that
 * the module calling this has to destroy the cache before getting
 * unloaded.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
 * memory pressure.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 */
kmem_cache_t *
kmem_cache_create (const char *name, size_t size, size_t align,
	unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
	void (*dtor)(void*, kmem_cache_t *, unsigned long))
{
	size_t left_over, slab_size, ralign;
	kmem_cache_t *cachep = NULL;

	/*
	 * Sanity checks... these are all serious usage bugs.
	 */
	if ((!name) ||
		in_interrupt() ||
		(size < BYTES_PER_WORD) ||
		(size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) ||
		(dtor && !ctor)) {
			printk(KERN_ERR "%s: Early error in slab %s\n",
					__FUNCTION__, name);
			BUG();
		}

#if DEBUG
	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
	if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
		/* No constructor, but initial state check requested */
		printk(KERN_ERR "%s: No con, but init state check "
				"requested - %s\n", __FUNCTION__, name);
		flags &= ~SLAB_DEBUG_INITIAL;
	}

#if FORCED_DEBUG
	/*
	 * Enable redzoning and last user accounting, except for caches with
	 * large objects, if the increased size would increase the object size
	 * above the next power of two: caches with object sizes just above a
	 * power of two have a significant amount of internal fragmentation.
	 */
	if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD)))
		flags |= SLAB_RED_ZONE|SLAB_STORE_USER;
	if (!(flags & SLAB_DESTROY_BY_RCU))
		flags |= SLAB_POISON;
#endif
	if (flags & SLAB_DESTROY_BY_RCU)
		BUG_ON(flags & SLAB_POISON);
#endif
	if (flags & SLAB_DESTROY_BY_RCU)
		BUG_ON(dtor);

	/*
	 * Always checks flags, a caller might be expecting debug
	 * support which isn't available.
	 */
	if (flags & ~CREATE_MASK)
		BUG();
	/* Check that size is in terms of words.  This is needed to avoid
	 * unaligned accesses for some archs when redzoning is used, and makes
	 * sure any on-slab bufctl's are also correctly aligned.
	 */
	if (size & (BYTES_PER_WORD-1)) {
		size += (BYTES_PER_WORD-1);
		size &= ~(BYTES_PER_WORD-1);
	}

	/* calculate the final buffer alignment: */
	/* 1) arch recommendation: can be overridden for debug */
	if (flags & SLAB_HWCACHE_ALIGN) {
		/* Default alignment: as specified by the arch code.
		 * Except if an object is really small, then squeeze multiple
		 * objects into one cacheline.
		 */
		ralign = cache_line_size();
		while (size <= ralign/2)
			ralign /= 2;
	} else {
		ralign = BYTES_PER_WORD;
	}
	/* 2) arch mandated alignment: disables debug if necessary */
	if (ralign < ARCH_SLAB_MINALIGN) {
		ralign = ARCH_SLAB_MINALIGN;
		if (ralign > BYTES_PER_WORD)
			flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER);
	}
	/* 3) caller mandated alignment: disables debug if necessary */
	if (ralign < align) {
		ralign = align;
		if (ralign > BYTES_PER_WORD)
			flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER);
	}
	/* 4) Store it. Note that the debug code below can reduce
	 *    the alignment to BYTES_PER_WORD.
	 */
	align = ralign;

	/* Get cache's description obj. */
	cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
	if (!cachep)
		goto opps;
	memset(cachep, 0, sizeof(kmem_cache_t));

#if DEBUG
	cachep->reallen = size;

	if (flags & SLAB_RED_ZONE) {
		/* redzoning only works with word aligned caches */
		align = BYTES_PER_WORD;

		/* add space for red zone words */
		cachep->dbghead += BYTES_PER_WORD;
		size += 2*BYTES_PER_WORD;
	}
	if (flags & SLAB_STORE_USER) {
		/* user store requires word alignment and
		 * one word storage behind the end of the real
		 * object.
		 */
		align = BYTES_PER_WORD;
		size += BYTES_PER_WORD;
	}
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
	if (size >= malloc_sizes[INDEX_L3+1].cs_size && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
		cachep->dbghead += PAGE_SIZE - size;
		size = PAGE_SIZE;
	}
#endif
#endif

	/* Determine if the slab management is 'on' or 'off' slab. */
	if (size >= (PAGE_SIZE>>3))
		/*
		 * Size is large, assume best to place the slab management obj
		 * off-slab (should allow better packing of objs).
		 */
		flags |= CFLGS_OFF_SLAB;

	size = ALIGN(size, align);

	if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) {
		/*
		 * A VFS-reclaimable slab tends to have most allocations
		 * as GFP_NOFS and we really don't want to have to be allocating
		 * higher-order pages when we are unable to shrink dcache.
		 */
		cachep->gfporder = 0;
		cache_estimate(cachep->gfporder, size, align, flags,
					&left_over, &cachep->num);
	} else {
		/*
		 * Calculate size (in pages) of slabs, and the num of objs per
		 * slab.  This could be made much more intelligent.  For now,
		 * try to avoid using high page-orders for slabs.  When the
		 * gfp() funcs are more friendly towards high-order requests,
		 * this should be changed.
		 */
		do {
			unsigned int break_flag = 0;
cal_wastage:
			cache_estimate(cachep->gfporder, size, align, flags,
						&left_over, &cachep->num);
			if (break_flag)
				break;
			if (cachep->gfporder >= MAX_GFP_ORDER)
				break;
			if (!cachep->num)
				goto next;
			if (flags & CFLGS_OFF_SLAB &&
					cachep->num > offslab_limit) {
				/* This num of objs will cause problems. */
				cachep->gfporder--;
				break_flag++;
				goto cal_wastage;
			}

			/*
			 * Large num of objs is good, but v. large slabs are
			 * currently bad for the gfp()s.
			 */
			if (cachep->gfporder >= slab_break_gfp_order)
				break;

			if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder))
				break;	/* Acceptable internal fragmentation. */
next:
			cachep->gfporder++;
		} while (1);
	}

	if (!cachep->num) {
		printk("kmem_cache_create: couldn't create cache %s.\n", name);
		kmem_cache_free(&cache_cache, cachep);
		cachep = NULL;
		goto opps;
	}
	slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t)
				+ sizeof(struct slab), align);

	/*
	 * If the slab has been placed off-slab, and we have enough space then
	 * move it on-slab. This is at the expense of any extra colouring.
	 */
	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
		flags &= ~CFLGS_OFF_SLAB;
		left_over -= slab_size;
	}

	if (flags & CFLGS_OFF_SLAB) {
		/* really off slab. No need for manual alignment */
		slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab);
	}

	cachep->colour_off = cache_line_size();
	/* Offset must be a multiple of the alignment. */
	if (cachep->colour_off < align)
		cachep->colour_off = align;
	cachep->colour = left_over/cachep->colour_off;
	cachep->slab_size = slab_size;
	cachep->flags = flags;
	cachep->gfpflags = 0;
	if (flags & SLAB_CACHE_DMA)
		cachep->gfpflags |= GFP_DMA;
	spin_lock_init(&cachep->spinlock);
	cachep->objsize = size;

	if (flags & CFLGS_OFF_SLAB)
		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
	cachep->ctor = ctor;
	cachep->dtor = dtor;
	cachep->name = name;

	/* Don't let CPUs come and go */
	lock_cpu_hotplug();

	if (g_cpucache_up == FULL) {
		enable_cpucache(cachep);
	} else {
		if (g_cpucache_up == NONE) {
			/* Note: the first kmem_cache_create must create
			 * the cache that's used by kmalloc(24), otherwise
			 * the creation of further caches will BUG().
			 */
			cachep->array[smp_processor_id()] =
					&initarray_generic.cache;
			/* If the cache that's used by
			 * kmalloc(sizeof(kmem_list3)) is the first cache,
			 * then we need to set up all its list3s, otherwise
			 * the creation of further caches will BUG().
			 */
			set_up_list3s(cachep, SIZE_AC);
			if (INDEX_AC == INDEX_L3)
				g_cpucache_up = PARTIAL_L3;
			else
				g_cpucache_up = PARTIAL_AC;
		} else {
			cachep->array[smp_processor_id()] =
				kmalloc(sizeof(struct arraycache_init),
						GFP_KERNEL);

			if (g_cpucache_up == PARTIAL_AC) {
				set_up_list3s(cachep, SIZE_L3);
				g_cpucache_up = PARTIAL_L3;
			} else {
				int node;
				for_each_online_node(node) {

					cachep->nodelists[node] =
						kmalloc_node(sizeof(struct kmem_list3),
								GFP_KERNEL, node);
					BUG_ON(!cachep->nodelists[node]);
					kmem_list3_init(cachep->nodelists[node]);
				}
			}
		}
		cachep->nodelists[numa_node_id()]->next_reap =
			jiffies + REAPTIMEOUT_LIST3 +
			((unsigned long)cachep)%REAPTIMEOUT_LIST3;

		BUG_ON(!ac_data(cachep));
		ac_data(cachep)->avail = 0;
		ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
		ac_data(cachep)->batchcount = 1;
		ac_data(cachep)->touched = 0;
		cachep->batchcount = 1;
		cachep->limit = BOOT_CPUCACHE_ENTRIES;
	}

	/* Need the semaphore to access the chain. */
	down(&cache_chain_sem);
	{
		struct list_head *p;
		mm_segment_t old_fs;

		old_fs = get_fs();
		set_fs(KERNEL_DS);
		list_for_each(p, &cache_chain) {
			kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
			char tmp;
			/* This happens when the module gets unloaded and doesn't
			   destroy its slab cache and no one else reuses the vmalloc
			   area of the module. Print a warning. */
			if (__get_user(tmp,pc->name)) {
				printk("SLAB: cache with size %d has lost its name\n",
					pc->objsize);
				continue;
			}
			if (!strcmp(pc->name,name)) {
				printk("kmem_cache_create: duplicate cache %s\n",name);
				up(&cache_chain_sem);
				unlock_cpu_hotplug();
				BUG();
			}
		}
		set_fs(old_fs);
	}

	/* cache setup completed, link it into the list */
	list_add(&cachep->next, &cache_chain);
	up(&cache_chain_sem);
	unlock_cpu_hotplug();
opps:
	if (!cachep && (flags & SLAB_PANIC))
		panic("kmem_cache_create(): failed to create slab `%s'\n",
			name);
	return cachep;
}
EXPORT_SYMBOL(kmem_cache_create);

#if DEBUG
static void check_irq_off(void)
{
	BUG_ON(!irqs_disabled());
}

static void check_irq_on(void)
{
	BUG_ON(irqs_disabled());
}

static void check_spinlock_acquired(kmem_cache_t *cachep)
{
#ifdef CONFIG_SMP
	check_irq_off();
	assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
#endif
}

static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node)
{
#ifdef CONFIG_SMP
	check_irq_off();
	assert_spin_locked(&cachep->nodelists[node]->list_lock);
#endif
}

#else
#define check_irq_off()	do { } while(0)
#define check_irq_on()	do { } while(0)
#define check_spinlock_acquired(x) do { } while(0)
#define check_spinlock_acquired_node(x, y) do { } while(0)
#endif
1863 */ 1864static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg) 1865{ 1866 check_irq_on(); 1867 preempt_disable(); 1868 1869 local_irq_disable(); 1870 func(arg); 1871 local_irq_enable(); 1872 1873 if (smp_call_function(func, arg, 1, 1)) 1874 BUG(); 1875 1876 preempt_enable(); 1877} 1878 1879static void drain_array_locked(kmem_cache_t* cachep, 1880 struct array_cache *ac, int force, int node); 1881 1882static void do_drain(void *arg) 1883{ 1884 kmem_cache_t *cachep = (kmem_cache_t*)arg; 1885 struct array_cache *ac; 1886 1887 check_irq_off(); 1888 ac = ac_data(cachep); 1889 spin_lock(&cachep->nodelists[numa_node_id()]->list_lock); 1890 free_block(cachep, ac->entry, ac->avail); 1891 spin_unlock(&cachep->nodelists[numa_node_id()]->list_lock); 1892 ac->avail = 0; 1893} 1894 1895static void drain_cpu_caches(kmem_cache_t *cachep) 1896{ 1897 struct kmem_list3 *l3; 1898 int node; 1899 1900 smp_call_function_all_cpus(do_drain, cachep); 1901 check_irq_on(); 1902 spin_lock_irq(&cachep->spinlock); 1903 for_each_online_node(node) { 1904 l3 = cachep->nodelists[node]; 1905 if (l3) { 1906 spin_lock(&l3->list_lock); 1907 drain_array_locked(cachep, l3->shared, 1, node); 1908 spin_unlock(&l3->list_lock); 1909 if (l3->alien) 1910 drain_alien_cache(cachep, l3); 1911 } 1912 } 1913 spin_unlock_irq(&cachep->spinlock); 1914} 1915 1916static int __node_shrink(kmem_cache_t *cachep, int node) 1917{ 1918 struct slab *slabp; 1919 struct kmem_list3 *l3 = cachep->nodelists[node]; 1920 int ret; 1921 1922 for (;;) { 1923 struct list_head *p; 1924 1925 p = l3->slabs_free.prev; 1926 if (p == &l3->slabs_free) 1927 break; 1928 1929 slabp = list_entry(l3->slabs_free.prev, struct slab, list); 1930#if DEBUG 1931 if (slabp->inuse) 1932 BUG(); 1933#endif 1934 list_del(&slabp->list); 1935 1936 l3->free_objects -= cachep->num; 1937 spin_unlock_irq(&l3->list_lock); 1938 slab_destroy(cachep, slabp); 1939 spin_lock_irq(&l3->list_lock); 1940 } 1941 ret = !list_empty(&l3->slabs_full) || 1942 !list_empty(&l3->slabs_partial); 1943 return ret; 1944} 1945 1946static int __cache_shrink(kmem_cache_t *cachep) 1947{ 1948 int ret = 0, i = 0; 1949 struct kmem_list3 *l3; 1950 1951 drain_cpu_caches(cachep); 1952 1953 check_irq_on(); 1954 for_each_online_node(i) { 1955 l3 = cachep->nodelists[i]; 1956 if (l3) { 1957 spin_lock_irq(&l3->list_lock); 1958 ret += __node_shrink(cachep, i); 1959 spin_unlock_irq(&l3->list_lock); 1960 } 1961 } 1962 return (ret ? 1 : 0); 1963} 1964 1965/** 1966 * kmem_cache_shrink - Shrink a cache. 1967 * @cachep: The cache to shrink. 1968 * 1969 * Releases as many slabs as possible for a cache. 1970 * To help debugging, a zero exit status indicates all slabs were released. 1971 */ 1972int kmem_cache_shrink(kmem_cache_t *cachep) 1973{ 1974 if (!cachep || in_interrupt()) 1975 BUG(); 1976 1977 return __cache_shrink(cachep); 1978} 1979EXPORT_SYMBOL(kmem_cache_shrink); 1980 1981/** 1982 * kmem_cache_destroy - delete a cache 1983 * @cachep: the cache to destroy 1984 * 1985 * Remove a kmem_cache_t object from the slab cache. 1986 * Returns 0 on success. 1987 * 1988 * It is expected this function will be called by a module when it is 1989 * unloaded. This will remove the cache completely, and avoid a duplicate 1990 * cache being allocated each time a module is loaded and unloaded, if the 1991 * module doesn't have persistent in-kernel storage across loads and unloads. 1992 * 1993 * The cache must be empty before calling this function. 
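 *
 * A minimal module exit path might look like this (a sketch only;
 * foo_cachep is a hypothetical cache created in the module's init
 * function, whose objects have all been freed by this point):
 *
 *	static void __exit foo_exit(void)
 *	{
 *		if (kmem_cache_destroy(foo_cachep))
 *			printk(KERN_ERR "foo: cache still in use\n");
 *	}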
1994 * 1995 * The caller must guarantee that no one will allocate memory from the cache 1996 * during the kmem_cache_destroy(). 1997 */ 1998int kmem_cache_destroy(kmem_cache_t * cachep) 1999{ 2000 int i; 2001 struct kmem_list3 *l3; 2002 2003 if (!cachep || in_interrupt()) 2004 BUG(); 2005 2006 /* Don't let CPUs come and go */ 2007 lock_cpu_hotplug(); 2008 2009 /* Find the cache in the chain of caches. */ 2010 down(&cache_chain_sem); 2011 /* 2012 * the chain is never empty, cache_cache is never destroyed 2013 */ 2014 list_del(&cachep->next); 2015 up(&cache_chain_sem); 2016 2017 if (__cache_shrink(cachep)) { 2018 slab_error(cachep, "Can't free all objects"); 2019 down(&cache_chain_sem); 2020 list_add(&cachep->next,&cache_chain); 2021 up(&cache_chain_sem); 2022 unlock_cpu_hotplug(); 2023 return 1; 2024 } 2025 2026 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 2027 synchronize_rcu(); 2028 2029 for_each_online_cpu(i) 2030 kfree(cachep->array[i]); 2031 2032 /* NUMA: free the list3 structures */ 2033 for_each_online_node(i) { 2034 if ((l3 = cachep->nodelists[i])) { 2035 kfree(l3->shared); 2036 free_alien_cache(l3->alien); 2037 kfree(l3); 2038 } 2039 } 2040 kmem_cache_free(&cache_cache, cachep); 2041 2042 unlock_cpu_hotplug(); 2043 2044 return 0; 2045} 2046EXPORT_SYMBOL(kmem_cache_destroy); 2047 2048/* Get the memory for a slab management obj. */ 2049static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, void *objp, 2050 int colour_off, unsigned int __nocast local_flags) 2051{ 2052 struct slab *slabp; 2053 2054 if (OFF_SLAB(cachep)) { 2055 /* Slab management obj is off-slab. */ 2056 slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags); 2057 if (!slabp) 2058 return NULL; 2059 } else { 2060 slabp = objp+colour_off; 2061 colour_off += cachep->slab_size; 2062 } 2063 slabp->inuse = 0; 2064 slabp->colouroff = colour_off; 2065 slabp->s_mem = objp+colour_off; 2066 2067 return slabp; 2068} 2069 2070static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) 2071{ 2072 return (kmem_bufctl_t *)(slabp+1); 2073} 2074 2075static void cache_init_objs(kmem_cache_t *cachep, 2076 struct slab *slabp, unsigned long ctor_flags) 2077{ 2078 int i; 2079 2080 for (i = 0; i < cachep->num; i++) { 2081 void *objp = slabp->s_mem+cachep->objsize*i; 2082#if DEBUG 2083 /* need to poison the objs? */ 2084 if (cachep->flags & SLAB_POISON) 2085 poison_obj(cachep, objp, POISON_FREE); 2086 if (cachep->flags & SLAB_STORE_USER) 2087 *dbg_userword(cachep, objp) = NULL; 2088 2089 if (cachep->flags & SLAB_RED_ZONE) { 2090 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2091 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2092 } 2093 /* 2094 * Constructors are not allowed to allocate memory from 2095 * the same cache for which they are a constructor. 2096 * Otherwise, deadlock. They must also be threaded.
2097 */ 2098 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2099 cachep->ctor(objp+obj_dbghead(cachep), cachep, ctor_flags); 2100 2101 if (cachep->flags & SLAB_RED_ZONE) { 2102 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2103 slab_error(cachep, "constructor overwrote the" 2104 " end of an object"); 2105 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 2106 slab_error(cachep, "constructor overwrote the" 2107 " start of an object"); 2108 } 2109 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2110 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); 2111#else 2112 if (cachep->ctor) 2113 cachep->ctor(objp, cachep, ctor_flags); 2114#endif 2115 slab_bufctl(slabp)[i] = i+1; 2116 } 2117 slab_bufctl(slabp)[i-1] = BUFCTL_END; 2118 slabp->free = 0; 2119} 2120 2121static void kmem_flagcheck(kmem_cache_t *cachep, unsigned int flags) 2122{ 2123 if (flags & SLAB_DMA) { 2124 if (!(cachep->gfpflags & GFP_DMA)) 2125 BUG(); 2126 } else { 2127 if (cachep->gfpflags & GFP_DMA) 2128 BUG(); 2129 } 2130} 2131 2132static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp) 2133{ 2134 int i; 2135 struct page *page; 2136 2137 /* Nasty!!!!!! I hope this is OK. */ 2138 i = 1 << cachep->gfporder; 2139 page = virt_to_page(objp); 2140 do { 2141 SET_PAGE_CACHE(page, cachep); 2142 SET_PAGE_SLAB(page, slabp); 2143 page++; 2144 } while (--i); 2145} 2146 2147/* 2148 * Grow (by 1) the number of slabs within a cache. This is called by 2149 * kmem_cache_alloc() when there are no active objs left in a cache. 2150 */ 2151static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid) 2152{ 2153 struct slab *slabp; 2154 void *objp; 2155 size_t offset; 2156 unsigned int local_flags; 2157 unsigned long ctor_flags; 2158 struct kmem_list3 *l3; 2159 2160 /* Be lazy and only check for valid flags here, 2161 * keeping it out of the critical path in kmem_cache_alloc(). 2162 */ 2163 if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW)) 2164 BUG(); 2165 if (flags & SLAB_NO_GROW) 2166 return 0; 2167 2168 ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2169 local_flags = (flags & SLAB_LEVEL_MASK); 2170 if (!(local_flags & __GFP_WAIT)) 2171 /* 2172 * Not allowed to sleep. Need to tell a constructor about 2173 * this - it might need to know... 2174 */ 2175 ctor_flags |= SLAB_CTOR_ATOMIC; 2176 2177 /* About to mess with non-constant members - lock. */ 2178 check_irq_off(); 2179 spin_lock(&cachep->spinlock); 2180 2181 /* Get colour for the slab, and cal the next value. */ 2182 offset = cachep->colour_next; 2183 cachep->colour_next++; 2184 if (cachep->colour_next >= cachep->colour) 2185 cachep->colour_next = 0; 2186 offset *= cachep->colour_off; 2187 2188 spin_unlock(&cachep->spinlock); 2189 2190 check_irq_off(); 2191 if (local_flags & __GFP_WAIT) 2192 local_irq_enable(); 2193 2194 /* 2195 * The test for missing atomic flag is performed here, rather than 2196 * the more obvious place, simply to reduce the critical path length 2197 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they 2198 * will eventually be caught here (where it matters). 2199 */ 2200 kmem_flagcheck(cachep, flags); 2201 2202 /* Get mem for the objs. 2203 * Attempt to allocate a physical page from 'nodeid', 2204 */ 2205 if (!(objp = kmem_getpages(cachep, flags, nodeid))) 2206 goto failed; 2207 2208 /* Get slab management. 
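 * The slab management structure is either embedded at the start of
 * the slab itself (just past the colour offset) or, for CFLGS_OFF_SLAB
 * caches, allocated separately from slabp_cache; see alloc_slabmgmt()
 * above.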
*/ 2209 if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags))) 2210 goto opps1; 2211 2212 slabp->nodeid = nodeid; 2213 set_slab_attr(cachep, slabp, objp); 2214 2215 cache_init_objs(cachep, slabp, ctor_flags); 2216 2217 if (local_flags & __GFP_WAIT) 2218 local_irq_disable(); 2219 check_irq_off(); 2220 l3 = cachep->nodelists[nodeid]; 2221 spin_lock(&l3->list_lock); 2222 2223 /* Make slab active. */ 2224 list_add_tail(&slabp->list, &(l3->slabs_free)); 2225 STATS_INC_GROWN(cachep); 2226 l3->free_objects += cachep->num; 2227 spin_unlock(&l3->list_lock); 2228 return 1; 2229opps1: 2230 kmem_freepages(cachep, objp); 2231failed: 2232 if (local_flags & __GFP_WAIT) 2233 local_irq_disable(); 2234 return 0; 2235} 2236 2237#if DEBUG 2238 2239/* 2240 * Perform extra freeing checks: 2241 * - detect bad pointers. 2242 * - POISON/RED_ZONE checking 2243 * - destructor calls, for caches with POISON+dtor 2244 */ 2245static void kfree_debugcheck(const void *objp) 2246{ 2247 struct page *page; 2248 2249 if (!virt_addr_valid(objp)) { 2250 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", 2251 (unsigned long)objp); 2252 BUG(); 2253 } 2254 page = virt_to_page(objp); 2255 if (!PageSlab(page)) { 2256 printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", (unsigned long)objp); 2257 BUG(); 2258 } 2259} 2260 2261static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, 2262 void *caller) 2263{ 2264 struct page *page; 2265 unsigned int objnr; 2266 struct slab *slabp; 2267 2268 objp -= obj_dbghead(cachep); 2269 kfree_debugcheck(objp); 2270 page = virt_to_page(objp); 2271 2272 if (GET_PAGE_CACHE(page) != cachep) { 2273 printk(KERN_ERR "mismatch in kmem_cache_free: expected cache %p, got %p\n", 2274 GET_PAGE_CACHE(page),cachep); 2275 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); 2276 printk(KERN_ERR "%p is %s.\n", GET_PAGE_CACHE(page), GET_PAGE_CACHE(page)->name); 2277 WARN_ON(1); 2278 } 2279 slabp = GET_PAGE_SLAB(page); 2280 2281 if (cachep->flags & SLAB_RED_ZONE) { 2282 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { 2283 slab_error(cachep, "double free, or memory outside" 2284 " object was overwritten"); 2285 printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 2286 objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); 2287 } 2288 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2289 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2290 } 2291 if (cachep->flags & SLAB_STORE_USER) 2292 *dbg_userword(cachep, objp) = caller; 2293 2294 objnr = (objp-slabp->s_mem)/cachep->objsize; 2295 2296 BUG_ON(objnr >= cachep->num); 2297 BUG_ON(objp != slabp->s_mem + objnr*cachep->objsize); 2298 2299 if (cachep->flags & SLAB_DEBUG_INITIAL) { 2300 /* Need to call the slab's constructor so the 2301 * caller can perform a verify of its state (debugging). 2302 * Called without the cache-lock held. 
2303 */ 2304 cachep->ctor(objp+obj_dbghead(cachep), 2305 cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY); 2306 } 2307 if (cachep->flags & SLAB_POISON && cachep->dtor) { 2308 /* we want to cache poison the object, 2309 * call the destruction callback 2310 */ 2311 cachep->dtor(objp+obj_dbghead(cachep), cachep, 0); 2312 } 2313 if (cachep->flags & SLAB_POISON) { 2314#ifdef CONFIG_DEBUG_PAGEALLOC 2315 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { 2316 store_stackinfo(cachep, objp, (unsigned long)caller); 2317 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); 2318 } else { 2319 poison_obj(cachep, objp, POISON_FREE); 2320 } 2321#else 2322 poison_obj(cachep, objp, POISON_FREE); 2323#endif 2324 } 2325 return objp; 2326} 2327 2328static void check_slabp(kmem_cache_t *cachep, struct slab *slabp) 2329{ 2330 kmem_bufctl_t i; 2331 int entries = 0; 2332 2333 /* Check slab's freelist to see if this obj is there. */ 2334 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { 2335 entries++; 2336 if (entries > cachep->num || i >= cachep->num) 2337 goto bad; 2338 } 2339 if (entries != cachep->num - slabp->inuse) { 2340bad: 2341 printk(KERN_ERR "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", 2342 cachep->name, cachep->num, slabp, slabp->inuse); 2343 for (i=0;i<sizeof(slabp)+cachep->num*sizeof(kmem_bufctl_t);i++) { 2344 if ((i%16)==0) 2345 printk("\n%03x:", i); 2346 printk(" %02x", ((unsigned char*)slabp)[i]); 2347 } 2348 printk("\n"); 2349 BUG(); 2350 } 2351} 2352#else 2353#define kfree_debugcheck(x) do { } while(0) 2354#define cache_free_debugcheck(x,objp,z) (objp) 2355#define check_slabp(x,y) do { } while(0) 2356#endif 2357 2358static void *cache_alloc_refill(kmem_cache_t *cachep, unsigned int __nocast flags) 2359{ 2360 int batchcount; 2361 struct kmem_list3 *l3; 2362 struct array_cache *ac; 2363 2364 check_irq_off(); 2365 ac = ac_data(cachep); 2366retry: 2367 batchcount = ac->batchcount; 2368 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2369 /* if there was little recent activity on this 2370 * cache, then perform only a partial refill. 2371 * Otherwise we could generate refill bouncing. 2372 */ 2373 batchcount = BATCHREFILL_LIMIT; 2374 } 2375 l3 = cachep->nodelists[numa_node_id()]; 2376 2377 BUG_ON(ac->avail > 0 || !l3); 2378 spin_lock(&l3->list_lock); 2379 2380 if (l3->shared) { 2381 struct array_cache *shared_array = l3->shared; 2382 if (shared_array->avail) { 2383 if (batchcount > shared_array->avail) 2384 batchcount = shared_array->avail; 2385 shared_array->avail -= batchcount; 2386 ac->avail = batchcount; 2387 memcpy(ac->entry, 2388 &(shared_array->entry[shared_array->avail]), 2389 sizeof(void*)*batchcount); 2390 shared_array->touched = 1; 2391 goto alloc_done; 2392 } 2393 } 2394 while (batchcount > 0) { 2395 struct list_head *entry; 2396 struct slab *slabp; 2397 /* Get slab alloc is to come from. 
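 * Partial slabs are preferred; only when none exist does the
 * allocation fall back to a free slab. This keeps fragmentation down.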
*/ 2398 entry = l3->slabs_partial.next; 2399 if (entry == &l3->slabs_partial) { 2400 l3->free_touched = 1; 2401 entry = l3->slabs_free.next; 2402 if (entry == &l3->slabs_free) 2403 goto must_grow; 2404 } 2405 2406 slabp = list_entry(entry, struct slab, list); 2407 check_slabp(cachep, slabp); 2408 check_spinlock_acquired(cachep); 2409 while (slabp->inuse < cachep->num && batchcount--) { 2410 kmem_bufctl_t next; 2411 STATS_INC_ALLOCED(cachep); 2412 STATS_INC_ACTIVE(cachep); 2413 STATS_SET_HIGH(cachep); 2414 2415 /* get obj pointer */ 2416 ac->entry[ac->avail++] = slabp->s_mem + 2417 slabp->free*cachep->objsize; 2418 2419 slabp->inuse++; 2420 next = slab_bufctl(slabp)[slabp->free]; 2421#if DEBUG 2422 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2423#endif 2424 slabp->free = next; 2425 } 2426 check_slabp(cachep, slabp); 2427 2428 /* move slabp to correct slabp list: */ 2429 list_del(&slabp->list); 2430 if (slabp->free == BUFCTL_END) 2431 list_add(&slabp->list, &l3->slabs_full); 2432 else 2433 list_add(&slabp->list, &l3->slabs_partial); 2434 } 2435 2436must_grow: 2437 l3->free_objects -= ac->avail; 2438alloc_done: 2439 spin_unlock(&l3->list_lock); 2440 2441 if (unlikely(!ac->avail)) { 2442 int x; 2443 x = cache_grow(cachep, flags, numa_node_id()); 2444 2445 // cache_grow can reenable interrupts, then ac could change. 2446 ac = ac_data(cachep); 2447 if (!x && ac->avail == 0) // no objects in sight? abort 2448 return NULL; 2449 2450 if (!ac->avail) // objects refilled by interrupt? 2451 goto retry; 2452 } 2453 ac->touched = 1; 2454 return ac->entry[--ac->avail]; 2455} 2456 2457static inline void 2458cache_alloc_debugcheck_before(kmem_cache_t *cachep, unsigned int __nocast flags) 2459{ 2460 might_sleep_if(flags & __GFP_WAIT); 2461#if DEBUG 2462 kmem_flagcheck(cachep, flags); 2463#endif 2464} 2465 2466#if DEBUG 2467static void * 2468cache_alloc_debugcheck_after(kmem_cache_t *cachep, 2469 unsigned int __nocast flags, void *objp, void *caller) 2470{ 2471 if (!objp) 2472 return objp; 2473 if (cachep->flags & SLAB_POISON) { 2474#ifdef CONFIG_DEBUG_PAGEALLOC 2475 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 2476 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1); 2477 else 2478 check_poison_obj(cachep, objp); 2479#else 2480 check_poison_obj(cachep, objp); 2481#endif 2482 poison_obj(cachep, objp, POISON_INUSE); 2483 } 2484 if (cachep->flags & SLAB_STORE_USER) 2485 *dbg_userword(cachep, objp) = caller; 2486 2487 if (cachep->flags & SLAB_RED_ZONE) { 2488 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 2489 slab_error(cachep, "double free, or memory outside" 2490 " object was overwritten"); 2491 printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 2492 objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); 2493 } 2494 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2495 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2496 } 2497 objp += obj_dbghead(cachep); 2498 if (cachep->ctor && cachep->flags & SLAB_POISON) { 2499 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2500 2501 if (!(flags & __GFP_WAIT)) 2502 ctor_flags |= SLAB_CTOR_ATOMIC; 2503 2504 cachep->ctor(objp, cachep, ctor_flags); 2505 } 2506 return objp; 2507} 2508#else 2509#define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 2510#endif 2511 2512 2513static inline void *__cache_alloc(kmem_cache_t *cachep, unsigned int __nocast flags) 2514{ 2515 unsigned long save_flags; 2516 void* objp; 2517 struct array_cache *ac; 2518 2519 
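	/*
	 * Fast path: once local interrupts are disabled, the per-cpu
	 * array cache can be used without taking any locks; fall back
	 * to cache_alloc_refill() only when the array is empty.
	 */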
cache_alloc_debugcheck_before(cachep, flags); 2520 2521 local_irq_save(save_flags); 2522 ac = ac_data(cachep); 2523 if (likely(ac->avail)) { 2524 STATS_INC_ALLOCHIT(cachep); 2525 ac->touched = 1; 2526 objp = ac->entry[--ac->avail]; 2527 } else { 2528 STATS_INC_ALLOCMISS(cachep); 2529 objp = cache_alloc_refill(cachep, flags); 2530 } 2531 local_irq_restore(save_flags); 2532 objp = cache_alloc_debugcheck_after(cachep, flags, objp, 2533 __builtin_return_address(0)); 2534 prefetchw(objp); 2535 return objp; 2536} 2537 2538#ifdef CONFIG_NUMA 2539/* 2540 * A interface to enable slab creation on nodeid 2541 */ 2542static void *__cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid) 2543{ 2544 struct list_head *entry; 2545 struct slab *slabp; 2546 struct kmem_list3 *l3; 2547 void *obj; 2548 kmem_bufctl_t next; 2549 int x; 2550 2551 l3 = cachep->nodelists[nodeid]; 2552 BUG_ON(!l3); 2553 2554retry: 2555 spin_lock(&l3->list_lock); 2556 entry = l3->slabs_partial.next; 2557 if (entry == &l3->slabs_partial) { 2558 l3->free_touched = 1; 2559 entry = l3->slabs_free.next; 2560 if (entry == &l3->slabs_free) 2561 goto must_grow; 2562 } 2563 2564 slabp = list_entry(entry, struct slab, list); 2565 check_spinlock_acquired_node(cachep, nodeid); 2566 check_slabp(cachep, slabp); 2567 2568 STATS_INC_NODEALLOCS(cachep); 2569 STATS_INC_ACTIVE(cachep); 2570 STATS_SET_HIGH(cachep); 2571 2572 BUG_ON(slabp->inuse == cachep->num); 2573 2574 /* get obj pointer */ 2575 obj = slabp->s_mem + slabp->free*cachep->objsize; 2576 slabp->inuse++; 2577 next = slab_bufctl(slabp)[slabp->free]; 2578#if DEBUG 2579 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2580#endif 2581 slabp->free = next; 2582 check_slabp(cachep, slabp); 2583 l3->free_objects--; 2584 /* move slabp to correct slabp list: */ 2585 list_del(&slabp->list); 2586 2587 if (slabp->free == BUFCTL_END) { 2588 list_add(&slabp->list, &l3->slabs_full); 2589 } else { 2590 list_add(&slabp->list, &l3->slabs_partial); 2591 } 2592 2593 spin_unlock(&l3->list_lock); 2594 goto done; 2595 2596must_grow: 2597 spin_unlock(&l3->list_lock); 2598 x = cache_grow(cachep, flags, nodeid); 2599 2600 if (!x) 2601 return NULL; 2602 2603 goto retry; 2604done: 2605 return obj; 2606} 2607#endif 2608 2609/* 2610 * Caller needs to acquire correct kmem_list's list_lock 2611 */ 2612static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects) 2613{ 2614 int i; 2615 struct kmem_list3 *l3; 2616 2617 for (i = 0; i < nr_objects; i++) { 2618 void *objp = objpp[i]; 2619 struct slab *slabp; 2620 unsigned int objnr; 2621 int nodeid = 0; 2622 2623 slabp = GET_PAGE_SLAB(virt_to_page(objp)); 2624 nodeid = slabp->nodeid; 2625 l3 = cachep->nodelists[nodeid]; 2626 list_del(&slabp->list); 2627 objnr = (objp - slabp->s_mem) / cachep->objsize; 2628 check_spinlock_acquired_node(cachep, nodeid); 2629 check_slabp(cachep, slabp); 2630 2631 2632#if DEBUG 2633 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { 2634 printk(KERN_ERR "slab: double free detected in cache " 2635 "'%s', objp %p\n", cachep->name, objp); 2636 BUG(); 2637 } 2638#endif 2639 slab_bufctl(slabp)[objnr] = slabp->free; 2640 slabp->free = objnr; 2641 STATS_DEC_ACTIVE(cachep); 2642 slabp->inuse--; 2643 l3->free_objects++; 2644 check_slabp(cachep, slabp); 2645 2646 /* fixup slab chains */ 2647 if (slabp->inuse == 0) { 2648 if (l3->free_objects > l3->free_limit) { 2649 l3->free_objects -= cachep->num; 2650 slab_destroy(cachep, slabp); 2651 } else { 2652 list_add(&slabp->list, &l3->slabs_free); 2653 } 2654 } else { 2655 /* Unconditionally move a 
slab to the end of the 2656 * partial list on free - this gives the 2657 * remaining objects maximum time to be freed, too. 2658 */ 2659 list_add_tail(&slabp->list, &l3->slabs_partial); 2660 } 2661 } 2662} 2663 2664static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac) 2665{ 2666 int batchcount; 2667 struct kmem_list3 *l3; 2668 2669 batchcount = ac->batchcount; 2670#if DEBUG 2671 BUG_ON(!batchcount || batchcount > ac->avail); 2672#endif 2673 check_irq_off(); 2674 l3 = cachep->nodelists[numa_node_id()]; 2675 spin_lock(&l3->list_lock); 2676 if (l3->shared) { 2677 struct array_cache *shared_array = l3->shared; 2678 int max = shared_array->limit-shared_array->avail; 2679 if (max) { 2680 if (batchcount > max) 2681 batchcount = max; 2682 memcpy(&(shared_array->entry[shared_array->avail]), 2683 ac->entry, 2684 sizeof(void*)*batchcount); 2685 shared_array->avail += batchcount; 2686 goto free_done; 2687 } 2688 } 2689 2690 free_block(cachep, ac->entry, batchcount); 2691free_done: 2692#if STATS 2693 { 2694 int i = 0; 2695 struct list_head *p; 2696 2697 p = l3->slabs_free.next; 2698 while (p != &(l3->slabs_free)) { 2699 struct slab *slabp; 2700 2701 slabp = list_entry(p, struct slab, list); 2702 BUG_ON(slabp->inuse); 2703 2704 i++; 2705 p = p->next; 2706 } 2707 STATS_SET_FREEABLE(cachep, i); 2708 } 2709#endif 2710 spin_unlock(&l3->list_lock); 2711 ac->avail -= batchcount; 2712 memmove(ac->entry, &(ac->entry[batchcount]), 2713 sizeof(void*)*ac->avail); 2714} 2715 2716 2717/* 2718 * __cache_free 2719 * Release an obj back to its cache. If the obj has a constructed 2720 * state, it must be in this state _before_ it is released. 2721 * 2722 * Called with interrupts disabled. 2723 */ 2724static inline void __cache_free(kmem_cache_t *cachep, void *objp) 2725{ 2726 struct array_cache *ac = ac_data(cachep); 2727 2728 check_irq_off(); 2729 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 2730 2731 /* Make sure we are not freeing an object from another 2732 * node to the array cache on this cpu. 2733 */ 2734#ifdef CONFIG_NUMA 2735 { 2736 struct slab *slabp; 2737 slabp = GET_PAGE_SLAB(virt_to_page(objp)); 2738 if (unlikely(slabp->nodeid != numa_node_id())) { 2739 struct array_cache *alien = NULL; 2740 int nodeid = slabp->nodeid; 2741 struct kmem_list3 *l3 = cachep->nodelists[numa_node_id()]; 2742 2743 STATS_INC_NODEFREES(cachep); 2744 if (l3->alien && l3->alien[nodeid]) { 2745 alien = l3->alien[nodeid]; 2746 spin_lock(&alien->lock); 2747 if (unlikely(alien->avail == alien->limit)) 2748 __drain_alien_cache(cachep, 2749 alien, nodeid); 2750 alien->entry[alien->avail++] = objp; 2751 spin_unlock(&alien->lock); 2752 } else { 2753 spin_lock(&(cachep->nodelists[nodeid])-> 2754 list_lock); 2755 free_block(cachep, &objp, 1); 2756 spin_unlock(&(cachep->nodelists[nodeid])-> 2757 list_lock); 2758 } 2759 return; 2760 } 2761 } 2762#endif 2763 if (likely(ac->avail < ac->limit)) { 2764 STATS_INC_FREEHIT(cachep); 2765 ac->entry[ac->avail++] = objp; 2766 return; 2767 } else { 2768 STATS_INC_FREEMISS(cachep); 2769 cache_flusharray(cachep, ac); 2770 ac->entry[ac->avail++] = objp; 2771 } 2772} 2773 2774/** 2775 * kmem_cache_alloc - Allocate an object 2776 * @cachep: The cache to allocate from. 2777 * @flags: See kmalloc(). 2778 * 2779 * Allocate an object from this cache. The flags are only relevant 2780 * if the cache has no available objects.
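 *
 * Example (a sketch; foo_cachep is a hypothetical cache created with
 * kmem_cache_create() for struct foo objects):
 *
 *	struct foo *p = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *	if (!p)
 *		return -ENOMEM;
 *	...
 *	kmem_cache_free(foo_cachep, p);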
2781 */ 2782void *kmem_cache_alloc(kmem_cache_t *cachep, unsigned int __nocast flags) 2783{ 2784 return __cache_alloc(cachep, flags); 2785} 2786EXPORT_SYMBOL(kmem_cache_alloc); 2787 2788/** 2789 * kmem_ptr_validate - check if an untrusted pointer might 2790 * be a slab entry. 2791 * @cachep: the cache we're checking against 2792 * @ptr: pointer to validate 2793 * 2794 * This verifies that the untrusted pointer looks sane: 2795 * it is _not_ a guarantee that the pointer is actually 2796 * part of the slab cache in question, but it at least 2797 * validates that the pointer can be dereferenced and 2798 * looks half-way sane. 2799 * 2800 * Currently only used for dentry validation. 2801 */ 2802int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) 2803{ 2804 unsigned long addr = (unsigned long) ptr; 2805 unsigned long min_addr = PAGE_OFFSET; 2806 unsigned long align_mask = BYTES_PER_WORD-1; 2807 unsigned long size = cachep->objsize; 2808 struct page *page; 2809 2810 if (unlikely(addr < min_addr)) 2811 goto out; 2812 if (unlikely(addr > (unsigned long)high_memory - size)) 2813 goto out; 2814 if (unlikely(addr & align_mask)) 2815 goto out; 2816 if (unlikely(!kern_addr_valid(addr))) 2817 goto out; 2818 if (unlikely(!kern_addr_valid(addr + size - 1))) 2819 goto out; 2820 page = virt_to_page(ptr); 2821 if (unlikely(!PageSlab(page))) 2822 goto out; 2823 if (unlikely(GET_PAGE_CACHE(page) != cachep)) 2824 goto out; 2825 return 1; 2826out: 2827 return 0; 2828} 2829 2830#ifdef CONFIG_NUMA 2831/** 2832 * kmem_cache_alloc_node - Allocate an object on the specified node 2833 * @cachep: The cache to allocate from. 2834 * @flags: See kmalloc(). 2835 * @nodeid: node number of the target node. 2836 * 2837 * Identical to kmem_cache_alloc, except that this function is slow 2838 * and can sleep. And it will allocate memory on the given node, which 2839 * can improve the performance for cpu bound structures. 2840 * New and improved: it will now make sure that the object gets 2841 * put on the correct node list so that there is no false sharing. 2842 */ 2843void *kmem_cache_alloc_node(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid) 2844{ 2845 unsigned long save_flags; 2846 void *ptr; 2847 2848 if (nodeid == numa_node_id() || nodeid == -1) 2849 return __cache_alloc(cachep, flags); 2850 2851 if (unlikely(!cachep->nodelists[nodeid])) { 2852 /* Fall back to __cache_alloc if we run into trouble */ 2853 printk(KERN_WARNING "slab: not allocating in inactive node %d for cache %s\n", nodeid, cachep->name); 2854 return __cache_alloc(cachep,flags); 2855 } 2856 2857 cache_alloc_debugcheck_before(cachep, flags); 2858 local_irq_save(save_flags); 2859 ptr = __cache_alloc_node(cachep, flags, nodeid); 2860 local_irq_restore(save_flags); 2861 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0)); 2862 2863 return ptr; 2864} 2865EXPORT_SYMBOL(kmem_cache_alloc_node); 2866 2867void *kmalloc_node(size_t size, unsigned int __nocast flags, int node) 2868{ 2869 kmem_cache_t *cachep; 2870 2871 cachep = kmem_find_general_cachep(size, flags); 2872 if (unlikely(cachep == NULL)) 2873 return NULL; 2874 return kmem_cache_alloc_node(cachep, flags, node); 2875} 2876EXPORT_SYMBOL(kmalloc_node); 2877#endif 2878 2879/** 2880 * kmalloc - allocate memory 2881 * @size: how many bytes of memory are required. 2882 * @flags: the type of memory to allocate. 2883 * 2884 * kmalloc is the normal method of allocating memory 2885 * in the kernel. 
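 * Memory obtained with kmalloc() is released with kfree(). A typical
 * sequence (sketch):
 *
 *	buf = kmalloc(len, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOMEM;
 *	...
 *	kfree(buf);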
2886 * 2887 * The @flags argument may be one of: 2888 * 2889 * %GFP_USER - Allocate memory on behalf of user. May sleep. 2890 * 2891 * %GFP_KERNEL - Allocate normal kernel ram. May sleep. 2892 * 2893 * %GFP_ATOMIC - Allocation will not sleep. Use inside interrupt handlers. 2894 * 2895 * Additionally, the %GFP_DMA flag may be set to indicate the memory 2896 * must be suitable for DMA. This can mean different things on different 2897 * platforms. For example, on i386, it means that the memory must come 2898 * from the first 16MB. 2899 */ 2900void *__kmalloc(size_t size, unsigned int __nocast flags) 2901{ 2902 kmem_cache_t *cachep; 2903 2904 /* If you want to save a few bytes .text space: replace 2905 * __ with kmem_. 2906 * Then kmalloc uses the uninlined functions instead of the inline 2907 * functions. 2908 */ 2909 cachep = __find_general_cachep(size, flags); 2910 if (unlikely(cachep == NULL)) 2911 return NULL; 2912 return __cache_alloc(cachep, flags); 2913} 2914EXPORT_SYMBOL(__kmalloc); 2915 2916#ifdef CONFIG_SMP 2917/** 2918 * __alloc_percpu - allocate one copy of the object for every present 2919 * cpu in the system, zeroing them. 2920 * Objects should be dereferenced using the per_cpu_ptr macro only. 2921 * 2922 * @size: how many bytes of memory are required. 2923 * @align: the alignment, which can't be greater than SMP_CACHE_BYTES. 2924 */ 2925void *__alloc_percpu(size_t size, size_t align) 2926{ 2927 int i; 2928 struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); 2929 2930 if (!pdata) 2931 return NULL; 2932 2933 /* 2934 * Cannot use for_each_online_cpu since a cpu may come online 2935 * and we have no way of figuring out how to fix the array 2936 * that we have allocated then.... 2937 */ 2938 for_each_cpu(i) { 2939 int node = cpu_to_node(i); 2940 2941 if (node_online(node)) 2942 pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node); 2943 else 2944 pdata->ptrs[i] = kmalloc(size, GFP_KERNEL); 2945 2946 if (!pdata->ptrs[i]) 2947 goto unwind_oom; 2948 memset(pdata->ptrs[i], 0, size); 2949 } 2950 2951 /* Catch derefs w/o wrappers */ 2952 return (void *) (~(unsigned long) pdata); 2953 2954unwind_oom: 2955 while (--i >= 0) { 2956 if (!cpu_possible(i)) 2957 continue; 2958 kfree(pdata->ptrs[i]); 2959 } 2960 kfree(pdata); 2961 return NULL; 2962} 2963EXPORT_SYMBOL(__alloc_percpu); 2964#endif 2965 2966/** 2967 * kmem_cache_free - Deallocate an object 2968 * @cachep: The cache the allocation was from. 2969 * @objp: The previously allocated object. 2970 * 2971 * Free an object which was previously allocated from this 2972 * cache. 2973 */ 2974void kmem_cache_free(kmem_cache_t *cachep, void *objp) 2975{ 2976 unsigned long flags; 2977 2978 local_irq_save(flags); 2979 __cache_free(cachep, objp); 2980 local_irq_restore(flags); 2981} 2982EXPORT_SYMBOL(kmem_cache_free); 2983 2984/** 2985 * kzalloc - allocate memory. The memory is set to zero. 2986 * @size: how many bytes of memory are required. 2987 * @flags: the type of memory to allocate. 2988 */ 2989void *kzalloc(size_t size, unsigned int __nocast flags) 2990{ 2991 void *ret = kmalloc(size, flags); 2992 if (ret) 2993 memset(ret, 0, size); 2994 return ret; 2995} 2996EXPORT_SYMBOL(kzalloc); 2997 2998/** 2999 * kfree - free previously allocated memory 3000 * @objp: pointer returned by kmalloc. 3001 * 3002 * If @objp is NULL, no operation is performed. 3003 * 3004 * Don't free memory not originally allocated by kmalloc() 3005 * or you will run into trouble. 
3006 */ 3007void kfree(const void *objp) 3008{ 3009 kmem_cache_t *c; 3010 unsigned long flags; 3011 3012 if (unlikely(!objp)) 3013 return; 3014 local_irq_save(flags); 3015 kfree_debugcheck(objp); 3016 c = GET_PAGE_CACHE(virt_to_page(objp)); 3017 __cache_free(c, (void*)objp); 3018 local_irq_restore(flags); 3019} 3020EXPORT_SYMBOL(kfree); 3021 3022#ifdef CONFIG_SMP 3023/** 3024 * free_percpu - free previously allocated percpu memory 3025 * @objp: pointer returned by alloc_percpu. 3026 * 3027 * Don't free memory not originally allocated by alloc_percpu(). 3028 * The complemented @objp is used to detect that. 3029 */ 3030void 3031free_percpu(const void *objp) 3032{ 3033 int i; 3034 struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); 3035 3036 /* 3037 * We allocated for all cpus, so we cannot use for_each_online_cpu() here. 3038 */ 3039 for_each_cpu(i) 3040 kfree(p->ptrs[i]); 3041 kfree(p); 3042} 3043EXPORT_SYMBOL(free_percpu); 3044#endif 3045 3046unsigned int kmem_cache_size(kmem_cache_t *cachep) 3047{ 3048 return obj_reallen(cachep); 3049} 3050EXPORT_SYMBOL(kmem_cache_size); 3051 3052const char *kmem_cache_name(kmem_cache_t *cachep) 3053{ 3054 return cachep->name; 3055} 3056EXPORT_SYMBOL_GPL(kmem_cache_name); 3057 3058/* 3059 * This initializes kmem_list3 for all nodes. 3060 */ 3061static int alloc_kmemlist(kmem_cache_t *cachep) 3062{ 3063 int node; 3064 struct kmem_list3 *l3; 3065 int err = 0; 3066 3067 for_each_online_node(node) { 3068 struct array_cache *nc = NULL, *new; 3069 struct array_cache **new_alien = NULL; 3070#ifdef CONFIG_NUMA 3071 if (!(new_alien = alloc_alien_cache(node, cachep->limit))) 3072 goto fail; 3073#endif 3074 if (!(new = alloc_arraycache(node, (cachep->shared* 3075 cachep->batchcount), 0xbaadf00d))) 3076 goto fail; 3077 if ((l3 = cachep->nodelists[node])) { 3078 3079 spin_lock_irq(&l3->list_lock); 3080 3081 if ((nc = cachep->nodelists[node]->shared)) 3082 free_block(cachep, nc->entry, 3083 nc->avail); 3084 3085 l3->shared = new; 3086 if (!cachep->nodelists[node]->alien) { 3087 l3->alien = new_alien; 3088 new_alien = NULL; 3089 } 3090 l3->free_limit = (1 + nr_cpus_node(node))* 3091 cachep->batchcount + cachep->num; 3092 spin_unlock_irq(&l3->list_lock); 3093 kfree(nc); 3094 free_alien_cache(new_alien); 3095 continue; 3096 } 3097 if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), 3098 GFP_KERNEL, node))) 3099 goto fail; 3100 3101 kmem_list3_init(l3); 3102 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 3103 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 3104 l3->shared = new; 3105 l3->alien = new_alien; 3106 l3->free_limit = (1 + nr_cpus_node(node))* 3107 cachep->batchcount + cachep->num; 3108 cachep->nodelists[node] = l3; 3109 } 3110 return err; 3111fail: 3112 err = -ENOMEM; 3113 return err; 3114} 3115 3116struct ccupdate_struct { 3117 kmem_cache_t *cachep; 3118 struct array_cache *new[NR_CPUS]; 3119}; 3120 3121static void do_ccupdate_local(void *info) 3122{ 3123 struct ccupdate_struct *new = (struct ccupdate_struct *)info; 3124 struct array_cache *old; 3125 3126 check_irq_off(); 3127 old = ac_data(new->cachep); 3128 3129 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; 3130 new->new[smp_processor_id()] = old; 3131} 3132 3133 3134static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, 3135 int shared) 3136{ 3137 struct ccupdate_struct new; 3138 int i, err; 3139 3140 memset(&new.new,0,sizeof(new.new)); 3141 for_each_online_cpu(i) { 3142 new.new[i] = alloc_arraycache(cpu_to_node(i), limit, batchcount); 3143 if (!new.new[i]) {
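			/* Out of memory: unwind the array caches allocated so far. */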
3144 for (i--; i >= 0; i--) kfree(new.new[i]); 3145 return -ENOMEM; 3146 } 3147 } 3148 new.cachep = cachep; 3149 3150 smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); 3151 3152 check_irq_on(); 3153 spin_lock_irq(&cachep->spinlock); 3154 cachep->batchcount = batchcount; 3155 cachep->limit = limit; 3156 cachep->shared = shared; 3157 spin_unlock_irq(&cachep->spinlock); 3158 3159 for_each_online_cpu(i) { 3160 struct array_cache *ccold = new.new[i]; 3161 if (!ccold) 3162 continue; 3163 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3164 free_block(cachep, ccold->entry, ccold->avail); 3165 spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3166 kfree(ccold); 3167 } 3168 3169 err = alloc_kmemlist(cachep); 3170 if (err) { 3171 printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n", 3172 cachep->name, -err); 3173 BUG(); 3174 } 3175 return 0; 3176} 3177 3178 3179static void enable_cpucache(kmem_cache_t *cachep) 3180{ 3181 int err; 3182 int limit, shared; 3183 3184 /* The head array serves three purposes: 3185 * - create a LIFO ordering, i.e. return objects that are cache-warm 3186 * - reduce the number of spinlock operations. 3187 * - reduce the number of linked list operations on the slab and 3188 * bufctl chains: array operations are cheaper. 3189 * The numbers are guessed, we should auto-tune as described by 3190 * Bonwick. 3191 */ 3192 if (cachep->objsize > 131072) 3193 limit = 1; 3194 else if (cachep->objsize > PAGE_SIZE) 3195 limit = 8; 3196 else if (cachep->objsize > 1024) 3197 limit = 24; 3198 else if (cachep->objsize > 256) 3199 limit = 54; 3200 else 3201 limit = 120; 3202 3203 /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound 3204 * allocation behaviour: Most allocs on one cpu, most free operations 3205 * on another cpu. For these cases, an efficient object passing between 3206 * cpus is necessary. This is provided by a shared array. The array 3207 * replaces Bonwick's magazine layer. 3208 * On uniprocessor, it's functionally equivalent (but less efficient) 3209 * to a larger limit. Thus disabled by default. 3210 */ 3211 shared = 0; 3212#ifdef CONFIG_SMP 3213 if (cachep->objsize <= PAGE_SIZE) 3214 shared = 8; 3215#endif 3216 3217#if DEBUG 3218 /* With debugging enabled, large batchcount lead to excessively 3219 * long periods with disabled local interrupts. Limit the 3220 * batchcount 3221 */ 3222 if (limit > 32) 3223 limit = 32; 3224#endif 3225 err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared); 3226 if (err) 3227 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 3228 cachep->name, -err); 3229} 3230 3231static void drain_array_locked(kmem_cache_t *cachep, 3232 struct array_cache *ac, int force, int node) 3233{ 3234 int tofree; 3235 3236 check_spinlock_acquired_node(cachep, node); 3237 if (ac->touched && !force) { 3238 ac->touched = 0; 3239 } else if (ac->avail) { 3240 tofree = force ? ac->avail : (ac->limit+4)/5; 3241 if (tofree > ac->avail) { 3242 tofree = (ac->avail+1)/2; 3243 } 3244 free_block(cachep, ac->entry, tofree); 3245 ac->avail -= tofree; 3246 memmove(ac->entry, &(ac->entry[tofree]), 3247 sizeof(void*)*ac->avail); 3248 } 3249} 3250 3251/** 3252 * cache_reap - Reclaim memory from caches. 3253 * 3254 * Called from workqueue/eventd every few seconds. 3255 * Purpose: 3256 * - clear the per-cpu caches for this CPU. 3257 * - return freeable pages to the main free memory pool. 3258 * 3259 * If we cannot acquire the cache chain semaphore then just give up - we'll 3260 * try again on the next iteration. 
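 *
 * Each pass drains the current CPU's array caches and frees at most
 * about free_limit/(5*num) completely free slabs per cache, so memory
 * is returned to the system gradually rather than in bursts.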
3261 */ 3262static void cache_reap(void *unused) 3263{ 3264 struct list_head *walk; 3265 struct kmem_list3 *l3; 3266 3267 if (down_trylock(&cache_chain_sem)) { 3268 /* Give up. Set up the next iteration. */ 3269 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id()); 3270 return; 3271 } 3272 3273 list_for_each(walk, &cache_chain) { 3274 kmem_cache_t *searchp; 3275 struct list_head* p; 3276 int tofree; 3277 struct slab *slabp; 3278 3279 searchp = list_entry(walk, kmem_cache_t, next); 3280 3281 if (searchp->flags & SLAB_NO_REAP) 3282 goto next; 3283 3284 check_irq_on(); 3285 3286 l3 = searchp->nodelists[numa_node_id()]; 3287 if (l3->alien) 3288 drain_alien_cache(searchp, l3); 3289 spin_lock_irq(&l3->list_lock); 3290 3291 drain_array_locked(searchp, ac_data(searchp), 0, 3292 numa_node_id()); 3293 3294 if (time_after(l3->next_reap, jiffies)) 3295 goto next_unlock; 3296 3297 l3->next_reap = jiffies + REAPTIMEOUT_LIST3; 3298 3299 if (l3->shared) 3300 drain_array_locked(searchp, l3->shared, 0, 3301 numa_node_id()); 3302 3303 if (l3->free_touched) { 3304 l3->free_touched = 0; 3305 goto next_unlock; 3306 } 3307 3308 tofree = (l3->free_limit+5*searchp->num-1)/(5*searchp->num); 3309 do { 3310 p = l3->slabs_free.next; 3311 if (p == &(l3->slabs_free)) 3312 break; 3313 3314 slabp = list_entry(p, struct slab, list); 3315 BUG_ON(slabp->inuse); 3316 list_del(&slabp->list); 3317 STATS_INC_REAPED(searchp); 3318 3319 /* Safe to drop the lock. The slab is no longer 3320 * linked to the cache. 3321 * searchp cannot disappear, we hold 3322 * cache_chain_sem. 3323 */ 3324 l3->free_objects -= searchp->num; 3325 spin_unlock_irq(&l3->list_lock); 3326 slab_destroy(searchp, slabp); 3327 spin_lock_irq(&l3->list_lock); 3328 } while(--tofree > 0); 3329next_unlock: 3330 spin_unlock_irq(&l3->list_lock); 3331next: 3332 cond_resched(); 3333 } 3334 check_irq_on(); 3335 up(&cache_chain_sem); 3336 drain_remote_pages(); 3337 /* Set up the next iteration */ 3338 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id()); 3339} 3340 3341#ifdef CONFIG_PROC_FS 3342 3343static void *s_start(struct seq_file *m, loff_t *pos) 3344{ 3345 loff_t n = *pos; 3346 struct list_head *p; 3347 3348 down(&cache_chain_sem); 3349 if (!n) { 3350 /* 3351 * Output format version, so at least we can change it 3352 * without _too_ many complaints. 3353 */ 3354#if STATS 3355 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); 3356#else 3357 seq_puts(m, "slabinfo - version: 2.1\n"); 3358#endif 3359 seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>"); 3360 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); 3361 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 3362#if STATS 3363 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>" 3364 " <error> <maxfreeable> <nodeallocs> <remotefrees>"); 3365 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); 3366#endif 3367 seq_putc(m, '\n'); 3368 } 3369 p = cache_chain.next; 3370 while (n--) { 3371 p = p->next; 3372 if (p == &cache_chain) 3373 return NULL; 3374 } 3375 return list_entry(p, kmem_cache_t, next); 3376} 3377 3378static void *s_next(struct seq_file *m, void *p, loff_t *pos) 3379{ 3380 kmem_cache_t *cachep = p; 3381 ++*pos; 3382 return cachep->next.next == &cache_chain ?
NULL 3383 : list_entry(cachep->next.next, kmem_cache_t, next); 3384} 3385 3386static void s_stop(struct seq_file *m, void *p) 3387{ 3388 up(&cache_chain_sem); 3389} 3390 3391static int s_show(struct seq_file *m, void *p) 3392{ 3393 kmem_cache_t *cachep = p; 3394 struct list_head *q; 3395 struct slab *slabp; 3396 unsigned long active_objs; 3397 unsigned long num_objs; 3398 unsigned long active_slabs = 0; 3399 unsigned long num_slabs, free_objects = 0, shared_avail = 0; 3400 const char *name; 3401 char *error = NULL; 3402 int node; 3403 struct kmem_list3 *l3; 3404 3405 check_irq_on(); 3406 spin_lock_irq(&cachep->spinlock); 3407 active_objs = 0; 3408 num_slabs = 0; 3409 for_each_online_node(node) { 3410 l3 = cachep->nodelists[node]; 3411 if (!l3) 3412 continue; 3413 3414 spin_lock(&l3->list_lock); 3415 3416 list_for_each(q,&l3->slabs_full) { 3417 slabp = list_entry(q, struct slab, list); 3418 if (slabp->inuse != cachep->num && !error) 3419 error = "slabs_full accounting error"; 3420 active_objs += cachep->num; 3421 active_slabs++; 3422 } 3423 list_for_each(q,&l3->slabs_partial) { 3424 slabp = list_entry(q, struct slab, list); 3425 if (slabp->inuse == cachep->num && !error) 3426 error = "slabs_partial inuse accounting error"; 3427 if (!slabp->inuse && !error) 3428 error = "slabs_partial/inuse accounting error"; 3429 active_objs += slabp->inuse; 3430 active_slabs++; 3431 } 3432 list_for_each(q,&l3->slabs_free) { 3433 slabp = list_entry(q, struct slab, list); 3434 if (slabp->inuse && !error) 3435 error = "slabs_free/inuse accounting error"; 3436 num_slabs++; 3437 } 3438 free_objects += l3->free_objects; 3439 shared_avail += l3->shared->avail; 3440 3441 spin_unlock(&l3->list_lock); 3442 } 3443 num_slabs+=active_slabs; 3444 num_objs = num_slabs*cachep->num; 3445 if (num_objs - active_objs != free_objects && !error) 3446 error = "free_objects accounting error"; 3447 3448 name = cachep->name; 3449 if (error) 3450 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 3451 3452 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 3453 name, active_objs, num_objs, cachep->objsize, 3454 cachep->num, (1<<cachep->gfporder)); 3455 seq_printf(m, " : tunables %4u %4u %4u", 3456 cachep->limit, cachep->batchcount, 3457 cachep->shared); 3458 seq_printf(m, " : slabdata %6lu %6lu %6lu", 3459 active_slabs, num_slabs, shared_avail); 3460#if STATS 3461 { /* list3 stats */ 3462 unsigned long high = cachep->high_mark; 3463 unsigned long allocs = cachep->num_allocations; 3464 unsigned long grown = cachep->grown; 3465 unsigned long reaped = cachep->reaped; 3466 unsigned long errors = cachep->errors; 3467 unsigned long max_freeable = cachep->max_freeable; 3468 unsigned long node_allocs = cachep->node_allocs; 3469 unsigned long node_frees = cachep->node_frees; 3470 3471 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ 3472 %4lu %4lu %4lu %4lu", 3473 allocs, high, grown, reaped, errors, 3474 max_freeable, node_allocs, node_frees); 3475 } 3476 /* cpu stats */ 3477 { 3478 unsigned long allochit = atomic_read(&cachep->allochit); 3479 unsigned long allocmiss = atomic_read(&cachep->allocmiss); 3480 unsigned long freehit = atomic_read(&cachep->freehit); 3481 unsigned long freemiss = atomic_read(&cachep->freemiss); 3482 3483 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", 3484 allochit, allocmiss, freehit, freemiss); 3485 } 3486#endif 3487 seq_putc(m, '\n'); 3488 spin_unlock_irq(&cachep->spinlock); 3489 return 0; 3490} 3491 3492/* 3493 * slabinfo_op - iterator that generates /proc/slabinfo 3494 * 3495 * Output layout: 3496 * 
cache-name 3497 * num-active-objs 3498 * total-objs 3499 * object size 3500 * num-active-slabs 3501 * total-slabs 3502 * num-pages-per-slab 3503 * + further values on SMP and with statistics enabled 3504 */ 3505 3506struct seq_operations slabinfo_op = { 3507 .start = s_start, 3508 .next = s_next, 3509 .stop = s_stop, 3510 .show = s_show, 3511}; 3512 3513#define MAX_SLABINFO_WRITE 128 3514/** 3515 * slabinfo_write - Tuning for the slab allocator 3516 * @file: unused 3517 * @buffer: user buffer 3518 * @count: data length 3519 * @ppos: unused 3520 */ 3521ssize_t slabinfo_write(struct file *file, const char __user *buffer, 3522 size_t count, loff_t *ppos) 3523{ 3524 char kbuf[MAX_SLABINFO_WRITE+1], *tmp; 3525 int limit, batchcount, shared, res; 3526 struct list_head *p; 3527 3528 if (count > MAX_SLABINFO_WRITE) 3529 return -EINVAL; 3530 if (copy_from_user(&kbuf, buffer, count)) 3531 return -EFAULT; 3532 kbuf[MAX_SLABINFO_WRITE] = '\0'; 3533 3534 tmp = strchr(kbuf, ' '); 3535 if (!tmp) 3536 return -EINVAL; 3537 *tmp = '\0'; 3538 tmp++; 3539 if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3) 3540 return -EINVAL; 3541 3542 /* Find the cache in the chain of caches. */ 3543 down(&cache_chain_sem); 3544 res = -EINVAL; 3545 list_for_each(p,&cache_chain) { 3546 kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); 3547 3548 if (!strcmp(cachep->name, kbuf)) { 3549 if (limit < 1 || 3550 batchcount < 1 || 3551 batchcount > limit || 3552 shared < 0) { 3553 res = 0; 3554 } else { 3555 res = do_tune_cpucache(cachep, limit, 3556 batchcount, shared); 3557 } 3558 break; 3559 } 3560 } 3561 up(&cache_chain_sem); 3562 if (res >= 0) 3563 res = count; 3564 return res; 3565} 3566#endif 3567 3568/** 3569 * ksize - get the actual amount of memory allocated for a given object 3570 * @objp: Pointer to the object 3571 * 3572 * kmalloc may internally round up allocations and return more memory 3573 * than requested. ksize() can be used to determine the actual amount of 3574 * memory allocated. The caller may use this additional memory, even though 3575 * a smaller amount of memory was initially specified with the kmalloc call. 3576 * The caller must guarantee that objp points to a valid object previously 3577 * allocated with either kmalloc() or kmem_cache_alloc(). The object 3578 * must not be freed during the duration of the call. 3579 */ 3580unsigned int ksize(const void *objp) 3581{ 3582 if (unlikely(objp == NULL)) 3583 return 0; 3584 3585 return obj_reallen(GET_PAGE_CACHE(virt_to_page(objp))); 3586} 3587 3588 3589/* 3590 * kstrdup - allocate space for and copy an existing string 3591 * 3592 * @s: the string to duplicate 3593 * @gfp: the GFP mask used in the kmalloc() call when allocating memory 3594 */ 3595char *kstrdup(const char *s, unsigned int __nocast gfp) 3596{ 3597 size_t len; 3598 char *buf; 3599 3600 if (!s) 3601 return NULL; 3602 3603 len = strlen(s) + 1; 3604 buf = kmalloc(len, gfp); 3605 if (buf) 3606 memcpy(buf, s, len); 3607 return buf; 3608} 3609EXPORT_SYMBOL(kstrdup); 3610
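/*
 * Example use of kstrdup() (a sketch; tmpl is a hypothetical source
 * string, and the copy is released with kfree() as usual):
 *
 *	name = kstrdup(tmpl, GFP_KERNEL);
 *	if (!name)
 *		return -ENOMEM;
 */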