slab.c revision ddc2e812d592457747c4367fb73edcaa8e1e49ff
/*
 * linux/mm/slab.c
 * Written by Mark Hemment, 1996/97.
 * (markhe@nextd.demon.co.uk)
 *
 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
 *
 * Major cleanup, different bufctl logic, per-cpu arrays
 *      (c) 2000 Manfred Spraul
 *
 * Cleanup, make the head arrays unconditional, preparation for NUMA
 *      (c) 2002 Manfred Spraul
 *
 * An implementation of the Slab Allocator as described in outline in;
 *      UNIX Internals: The New Frontiers by Uresh Vahalia
 *      Pub: Prentice Hall      ISBN 0-13-101908-2
 * or with a little more detail in;
 *      The Slab Allocator: An Object-Caching Kernel Memory Allocator
 *      Jeff Bonwick (Sun Microsystems).
 *      Presented at: USENIX Summer 1994 Technical Conference
 *
 * The memory is organized in caches, one cache for each object type.
 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
 * Each cache consists of many slabs (they are small (usually one
 * page long) and always contiguous), and each slab contains multiple
 * initialized objects.
 *
 * This means that your constructor is used only for newly allocated
 * slabs and you must pass objects with the same initializations to
 * kmem_cache_free.
 *
 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
 * normal). If you need a special memory type, then you must create a new
 * cache for that memory type.
 *
 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
 *   full slabs with 0 free objects
 *   partial slabs
 *   empty slabs with no allocated objects
 *
 * If partial slabs exist, then new allocations come from these slabs,
 * otherwise from empty slabs or new slabs are allocated.
 *
 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
 *
 * Each cache has a short per-cpu head array, most allocs
 * and frees go into that array, and if that array overflows, then 1/2
 * of the entries in the array are given back into the global cache.
 * The head array is strictly LIFO and should improve the cache hit rates.
 * On SMP, it additionally reduces the spinlock operations.
 *
 * The c_cpuarray may not be read with enabled local interrupts -
 * it's changed with a smp_call_function().
 *
 * SMP synchronization:
 *  constructors and destructors are called without any locking.
 *  Several members in struct kmem_cache and struct slab never change, they
 *      are accessed without any locking.
 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
 *      and local interrupts are disabled so slab code is preempt-safe.
 *  The non-constant members are protected with a per-cache irq spinlock.
 *
 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
 * in 2000 - many ideas in the current implementation are derived from
 * his patch.
 *
 * Further notes from the original documentation:
 *
 * 11 April '97.  Started multi-threading - markhe
 *      The global cache-chain is protected by the mutex 'cache_chain_mutex'.
 *      The sem is only needed when accessing/extending the cache-chain, which
 *      can never happen inside an interrupt (kmem_cache_create(),
 *      kmem_cache_shrink() and kmem_cache_reap()).
 *
 *      At present, each engine can be growing a cache.  This should be blocked.
 *
 * 15 March 2005. NUMA slab allocator.
 *      Shai Fultheim <shai@scalex86.org>.
 *      Shobhit Dayal <shobhit@calsoftinc.com>
 *      Alok N Kataria <alokk@calsoftinc.com>
 *      Christoph Lameter <christoph@lameter.com>
 *
 * Modified the slab allocator to be node aware on NUMA systems.
 * Each node has its own list of partial, free and full slabs.
 * All object allocations for a node occur from node specific slab lists.
 */
#include <linux/config.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/cache.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/kallsyms.h>
#include <linux/cpu.h>
#include <linux/sysctl.h>
#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/string.h>
#include <linux/nodemask.h>
#include <linux/mempolicy.h>
#include <linux/mutex.h>

#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>

/*
 * DEBUG        - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
 *                SLAB_RED_ZONE & SLAB_POISON.
 *                0 for faster, smaller code (especially in the critical paths).
 *
 * STATS        - 1 to collect stats for /proc/slabinfo.
 *                0 for faster, smaller code (especially in the critical paths).
 *
 * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
 */

#ifdef CONFIG_DEBUG_SLAB
#define DEBUG           1
#define STATS           1
#define FORCED_DEBUG    1
#else
#define DEBUG           0
#define STATS           0
#define FORCED_DEBUG    0
#endif

/* Shouldn't this be in a header file somewhere? */
#define BYTES_PER_WORD          sizeof(void *)

#ifndef cache_line_size
#define cache_line_size()       L1_CACHE_BYTES
#endif

#ifndef ARCH_KMALLOC_MINALIGN
/*
 * Enforce a minimum alignment for the kmalloc caches.
 * Usually, the kmalloc caches are cache_line_size() aligned, except when
 * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
 * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that.
 * Note that this flag disables some debug features.
 */
#define ARCH_KMALLOC_MINALIGN 0
#endif

#ifndef ARCH_SLAB_MINALIGN
/*
 * Enforce a minimum alignment for all caches.
 * Intended for archs that get misalignment faults even for BYTES_PER_WORD
 * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
 * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
 * some debug features.
 */
#define ARCH_SLAB_MINALIGN 0
#endif

#ifndef ARCH_KMALLOC_FLAGS
#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
#endif

/* Legal flag mask for kmem_cache_create(). */
#if DEBUG
# define CREATE_MASK    (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
                         SLAB_POISON | SLAB_HWCACHE_ALIGN | \
                         SLAB_CACHE_DMA | \
                         SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
                         SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
                         SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
#else
# define CREATE_MASK    (SLAB_HWCACHE_ALIGN | \
                         SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
                         SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
                         SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
#endif
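/*
 * A minimal usage sketch of the cache API described in the header above.
 * Illustrative only, not part of slab.c: struct foo, foo_ctor, foo_cache and
 * foo_example_init are made-up names. The point is the constructor contract:
 * foo_ctor runs once per object when a fresh slab is populated, so every
 * object handed back to kmem_cache_free() must be in its constructed state.
 */
struct foo {
        spinlock_t lock;
        int refcnt;
};

static struct kmem_cache *foo_cache;

static void foo_ctor(void *obj, struct kmem_cache *cachep, unsigned long flags)
{
        struct foo *f = obj;

        spin_lock_init(&f->lock);
        f->refcnt = 0;
}

static int __init foo_example_init(void)
{
        struct foo *f;

        foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
                                      SLAB_HWCACHE_ALIGN, foo_ctor, NULL);
        if (!foo_cache)
                return -ENOMEM;

        f = kmem_cache_alloc(foo_cache, GFP_KERNEL);    /* cache-warm, constructed */
        if (f)
                kmem_cache_free(foo_cache, f);          /* still constructed */
        return 0;
}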
/*
 * kmem_bufctl_t:
 *
 * Bufctl's are used for linking objs within a slab, as linked offsets.
 *
 * This implementation relies on "struct page" for locating the cache &
 * slab an object belongs to.
 * This allows the bufctl structure to be small (one int), but limits
 * the number of objects a slab (not a cache) can contain when off-slab
 * bufctls are used.  The limit is the size of the largest general cache
 * that does not use off-slab slabs.
 * For 32bit archs with 4 kB pages, this is 56.
 * This is not serious, as it is only for large objects, when it is unwise
 * to have too many per slab.
 * Note: This limit can be raised by introducing a general cache whose size
 * is less than 512 (PAGE_SIZE<<3), but greater than 256.
 */

typedef unsigned int kmem_bufctl_t;
#define BUFCTL_END      (((kmem_bufctl_t)(~0U))-0)
#define BUFCTL_FREE     (((kmem_bufctl_t)(~0U))-1)
#define BUFCTL_ACTIVE   (((kmem_bufctl_t)(~0U))-2)
#define SLAB_LIMIT      (((kmem_bufctl_t)(~0U))-3)

/*
 * struct slab
 *
 * Manages the objs in a slab. Placed either at the beginning of mem allocated
 * for a slab, or allocated from a general cache.
 * Slabs are chained into three lists: fully used, partial, fully free slabs.
 */
struct slab {
        struct list_head list;
        unsigned long colouroff;
        void *s_mem;            /* including colour offset */
        unsigned int inuse;     /* num of objs active in slab */
        kmem_bufctl_t free;
        unsigned short nodeid;
};

/*
 * struct slab_rcu
 *
 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
 * arrange for kmem_freepages to be called via RCU.  This is useful if
 * we need to approach a kernel structure obliquely, from its address
 * obtained without the usual locking.  We can lock the structure to
 * stabilize it and check it's still at the given address, only if we
 * can be sure that the memory has not been meanwhile reused for some
 * other kind of object (which our subsystem's lock might corrupt).
 *
 * rcu_read_lock before reading the address, then rcu_read_unlock after
 * taking the spinlock within the structure expected at that address.
 *
 * We assume struct slab_rcu can overlay struct slab when destroying.
 */
struct slab_rcu {
        struct rcu_head head;
        struct kmem_cache *cachep;
        void *addr;
};

/*
 * struct array_cache
 *
 * Purpose:
 * - LIFO ordering, to hand out cache-warm objects from _alloc
 * - reduce the number of linked list operations
 * - reduce spinlock operations
 *
 * The limit is stored in the per-cpu structure to reduce the data cache
 * footprint.
 */
struct array_cache {
        unsigned int avail;
        unsigned int limit;
        unsigned int batchcount;
        unsigned int touched;
        spinlock_t lock;
        void *entry[0];         /*
                                 * Must have this definition in here for the proper
                                 * alignment of array_cache. Also simplifies accessing
                                 * the entries.
                                 * [0] is for gcc 2.95. It should really be [].
                                 */
};

/*
 * bootstrap: The caches do not work without cpuarrays anymore, but the
 * cpuarrays are allocated from the generic caches...
 */
#define BOOT_CPUCACHE_ENTRIES   1
struct arraycache_init {
        struct array_cache cache;
        void *entries[BOOT_CPUCACHE_ENTRIES];
};
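/*
 * Illustrative sketch only (not part of slab.c): the per-cpu array_cache is
 * a plain LIFO stack of object pointers. When the array is neither empty nor
 * full, the alloc and free fast paths later in this file reduce to the two
 * helpers below; the example_* names are made up for this sketch.
 */
static inline void *example_ac_pop(struct array_cache *ac)
{
        /* alloc fast path: the most recently freed object is the cache-warmest */
        return ac->avail ? ac->entry[--ac->avail] : NULL;
}

static inline int example_ac_push(struct array_cache *ac, void *objp)
{
        /*
         * free fast path: stash the object; the caller drains entries back
         * to the slab lists (in batchcount chunks) once avail reaches limit.
         */
        if (ac->avail < ac->limit) {
                ac->entry[ac->avail++] = objp;
                return 1;
        }
        return 0;
}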
/*
 * The slab lists for all objects.
 */
struct kmem_list3 {
        struct list_head slabs_partial; /* partial list first, better asm code */
        struct list_head slabs_full;
        struct list_head slabs_free;
        unsigned long free_objects;
        unsigned int free_limit;
        unsigned int colour_next;       /* Per-node cache coloring */
        spinlock_t list_lock;
        struct array_cache *shared;     /* shared per node */
        struct array_cache **alien;     /* on other nodes */
        unsigned long next_reap;        /* updated without locking */
        int free_touched;               /* updated without locking */
};

/*
 * Need this for bootstrapping a per node allocator.
 */
#define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1)
struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
#define CACHE_CACHE 0
#define SIZE_AC 1
#define SIZE_L3 (1 + MAX_NUMNODES)

/*
 * This function must be completely optimized away if a constant is passed to
 * it. Mostly the same as what is in linux/slab.h except it returns an index.
 */
static __always_inline int index_of(const size_t size)
{
        extern void __bad_size(void);

        if (__builtin_constant_p(size)) {
                int i = 0;

#define CACHE(x) \
        if (size <= x) \
                return i; \
        else \
                i++;
#include "linux/kmalloc_sizes.h"
#undef CACHE
                __bad_size();
        } else
                __bad_size();
        return 0;
}

#define INDEX_AC index_of(sizeof(struct arraycache_init))
#define INDEX_L3 index_of(sizeof(struct kmem_list3))

static void kmem_list3_init(struct kmem_list3 *parent)
{
        INIT_LIST_HEAD(&parent->slabs_full);
        INIT_LIST_HEAD(&parent->slabs_partial);
        INIT_LIST_HEAD(&parent->slabs_free);
        parent->shared = NULL;
        parent->alien = NULL;
        parent->colour_next = 0;
        spin_lock_init(&parent->list_lock);
        parent->free_objects = 0;
        parent->free_touched = 0;
}

#define MAKE_LIST(cachep, listp, slab, nodeid)                          \
        do {                                                            \
                INIT_LIST_HEAD(listp);                                  \
                list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
        } while (0)

#define MAKE_ALL_LISTS(cachep, ptr, nodeid)                                  \
        do {                                                                 \
        MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);      \
        MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid);\
        MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);      \
        } while (0)

/*
 * struct kmem_cache
 *
 * manages a cache.
 */

struct kmem_cache {
/* 1) per-cpu data, touched during every alloc/free */
        struct array_cache *array[NR_CPUS];
/* 2) Cache tunables. Protected by cache_chain_mutex */
        unsigned int batchcount;
        unsigned int limit;
        unsigned int shared;

        unsigned int buffer_size;
/* 3) touched by every alloc & free from the backend */
        struct kmem_list3 *nodelists[MAX_NUMNODES];

        unsigned int flags;             /* constant flags */
        unsigned int num;               /* # of objs per slab */

/* 4) cache_grow/shrink */
        /* order of pgs per slab (2^n) */
        unsigned int gfporder;

        /* force GFP flags, e.g.
GFP_DMA */ 389 gfp_t gfpflags; 390 391 size_t colour; /* cache colouring range */ 392 unsigned int colour_off; /* colour offset */ 393 struct kmem_cache *slabp_cache; 394 unsigned int slab_size; 395 unsigned int dflags; /* dynamic flags */ 396 397 /* constructor func */ 398 void (*ctor) (void *, struct kmem_cache *, unsigned long); 399 400 /* de-constructor func */ 401 void (*dtor) (void *, struct kmem_cache *, unsigned long); 402 403/* 5) cache creation/removal */ 404 const char *name; 405 struct list_head next; 406 407/* 6) statistics */ 408#if STATS 409 unsigned long num_active; 410 unsigned long num_allocations; 411 unsigned long high_mark; 412 unsigned long grown; 413 unsigned long reaped; 414 unsigned long errors; 415 unsigned long max_freeable; 416 unsigned long node_allocs; 417 unsigned long node_frees; 418 unsigned long node_overflow; 419 atomic_t allochit; 420 atomic_t allocmiss; 421 atomic_t freehit; 422 atomic_t freemiss; 423#endif 424#if DEBUG 425 /* 426 * If debugging is enabled, then the allocator can add additional 427 * fields and/or padding to every object. buffer_size contains the total 428 * object size including these internal fields, the following two 429 * variables contain the offset to the user object and its size. 430 */ 431 int obj_offset; 432 int obj_size; 433#endif 434}; 435 436#define CFLGS_OFF_SLAB (0x80000000UL) 437#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) 438 439#define BATCHREFILL_LIMIT 16 440/* 441 * Optimization question: fewer reaps means less probability for unnessary 442 * cpucache drain/refill cycles. 443 * 444 * OTOH the cpuarrays can contain lots of objects, 445 * which could lock up otherwise freeable slabs. 446 */ 447#define REAPTIMEOUT_CPUC (2*HZ) 448#define REAPTIMEOUT_LIST3 (4*HZ) 449 450#if STATS 451#define STATS_INC_ACTIVE(x) ((x)->num_active++) 452#define STATS_DEC_ACTIVE(x) ((x)->num_active--) 453#define STATS_INC_ALLOCED(x) ((x)->num_allocations++) 454#define STATS_INC_GROWN(x) ((x)->grown++) 455#define STATS_INC_REAPED(x) ((x)->reaped++) 456#define STATS_SET_HIGH(x) \ 457 do { \ 458 if ((x)->num_active > (x)->high_mark) \ 459 (x)->high_mark = (x)->num_active; \ 460 } while (0) 461#define STATS_INC_ERR(x) ((x)->errors++) 462#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) 463#define STATS_INC_NODEFREES(x) ((x)->node_frees++) 464#define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++) 465#define STATS_SET_FREEABLE(x, i) \ 466 do { \ 467 if ((x)->max_freeable < i) \ 468 (x)->max_freeable = i; \ 469 } while (0) 470#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) 471#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) 472#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) 473#define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss) 474#else 475#define STATS_INC_ACTIVE(x) do { } while (0) 476#define STATS_DEC_ACTIVE(x) do { } while (0) 477#define STATS_INC_ALLOCED(x) do { } while (0) 478#define STATS_INC_GROWN(x) do { } while (0) 479#define STATS_INC_REAPED(x) do { } while (0) 480#define STATS_SET_HIGH(x) do { } while (0) 481#define STATS_INC_ERR(x) do { } while (0) 482#define STATS_INC_NODEALLOCS(x) do { } while (0) 483#define STATS_INC_NODEFREES(x) do { } while (0) 484#define STATS_INC_ACOVERFLOW(x) do { } while (0) 485#define STATS_SET_FREEABLE(x, i) do { } while (0) 486#define STATS_INC_ALLOCHIT(x) do { } while (0) 487#define STATS_INC_ALLOCMISS(x) do { } while (0) 488#define STATS_INC_FREEHIT(x) do { } while (0) 489#define STATS_INC_FREEMISS(x) do { } while (0) 490#endif 491 492#if DEBUG 493/* 494 * Magic nums 
for obj red zoning. 495 * Placed in the first word before and the first word after an obj. 496 */ 497#define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ 498#define RED_ACTIVE 0x170FC2A5UL /* when obj is active */ 499 500/* ...and for poisoning */ 501#define POISON_INUSE 0x5a /* for use-uninitialised poisoning */ 502#define POISON_FREE 0x6b /* for use-after-free poisoning */ 503#define POISON_END 0xa5 /* end-byte of poisoning */ 504 505/* 506 * memory layout of objects: 507 * 0 : objp 508 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that 509 * the end of an object is aligned with the end of the real 510 * allocation. Catches writes behind the end of the allocation. 511 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: 512 * redzone word. 513 * cachep->obj_offset: The real object. 514 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] 515 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address 516 * [BYTES_PER_WORD long] 517 */ 518static int obj_offset(struct kmem_cache *cachep) 519{ 520 return cachep->obj_offset; 521} 522 523static int obj_size(struct kmem_cache *cachep) 524{ 525 return cachep->obj_size; 526} 527 528static unsigned long *dbg_redzone1(struct kmem_cache *cachep, void *objp) 529{ 530 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 531 return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD); 532} 533 534static unsigned long *dbg_redzone2(struct kmem_cache *cachep, void *objp) 535{ 536 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 537 if (cachep->flags & SLAB_STORE_USER) 538 return (unsigned long *)(objp + cachep->buffer_size - 539 2 * BYTES_PER_WORD); 540 return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD); 541} 542 543static void **dbg_userword(struct kmem_cache *cachep, void *objp) 544{ 545 BUG_ON(!(cachep->flags & SLAB_STORE_USER)); 546 return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD); 547} 548 549#else 550 551#define obj_offset(x) 0 552#define obj_size(cachep) (cachep->buffer_size) 553#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long *)NULL;}) 554#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long *)NULL;}) 555#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) 556 557#endif 558 559/* 560 * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp 561 * order. 562 */ 563#if defined(CONFIG_LARGE_ALLOCS) 564#define MAX_OBJ_ORDER 13 /* up to 32Mb */ 565#define MAX_GFP_ORDER 13 /* up to 32Mb */ 566#elif defined(CONFIG_MMU) 567#define MAX_OBJ_ORDER 5 /* 32 pages */ 568#define MAX_GFP_ORDER 5 /* 32 pages */ 569#else 570#define MAX_OBJ_ORDER 8 /* up to 1Mb */ 571#define MAX_GFP_ORDER 8 /* up to 1Mb */ 572#endif 573 574/* 575 * Do not go above this order unless 0 objects fit into the slab. 576 */ 577#define BREAK_GFP_ORDER_HI 1 578#define BREAK_GFP_ORDER_LO 0 579static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; 580 581/* 582 * Functions for storing/retrieving the cachep and or slab from the page 583 * allocator. These are used to find the slab an obj belongs to. With kfree(), 584 * these are used to find the cache which an obj belongs to. 
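/*
 * Worked example of the debug layout described above (illustrative only;
 * assumes a 32-bit build with BYTES_PER_WORD == 4, a 100-byte object, and
 * both SLAB_RED_ZONE and SLAB_STORE_USER set). kmem_cache_create() then
 * picks obj_offset = 4 and buffer_size = 100 + 2*4 + 4 = 112, so within
 * each 112-byte buffer:
 *
 *   bytes   0 ..   3 : first redzone word   (dbg_redzone1())
 *   bytes   4 .. 103 : the real object      (objp + obj_offset())
 *   bytes 104 .. 107 : second redzone word  (dbg_redzone2())
 *   bytes 108 .. 111 : last caller address  (dbg_userword())
 *
 * RED_INACTIVE/RED_ACTIVE in the two redzone words track whether the object
 * is currently free or allocated; any other value indicates an overrun.
 */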
585 */ 586static inline void page_set_cache(struct page *page, struct kmem_cache *cache) 587{ 588 page->lru.next = (struct list_head *)cache; 589} 590 591static inline struct kmem_cache *page_get_cache(struct page *page) 592{ 593 if (unlikely(PageCompound(page))) 594 page = (struct page *)page_private(page); 595 BUG_ON(!PageSlab(page)); 596 return (struct kmem_cache *)page->lru.next; 597} 598 599static inline void page_set_slab(struct page *page, struct slab *slab) 600{ 601 page->lru.prev = (struct list_head *)slab; 602} 603 604static inline struct slab *page_get_slab(struct page *page) 605{ 606 if (unlikely(PageCompound(page))) 607 page = (struct page *)page_private(page); 608 BUG_ON(!PageSlab(page)); 609 return (struct slab *)page->lru.prev; 610} 611 612static inline struct kmem_cache *virt_to_cache(const void *obj) 613{ 614 struct page *page = virt_to_page(obj); 615 return page_get_cache(page); 616} 617 618static inline struct slab *virt_to_slab(const void *obj) 619{ 620 struct page *page = virt_to_page(obj); 621 return page_get_slab(page); 622} 623 624static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, 625 unsigned int idx) 626{ 627 return slab->s_mem + cache->buffer_size * idx; 628} 629 630static inline unsigned int obj_to_index(struct kmem_cache *cache, 631 struct slab *slab, void *obj) 632{ 633 return (unsigned)(obj - slab->s_mem) / cache->buffer_size; 634} 635 636/* 637 * These are the default caches for kmalloc. Custom caches can have other sizes. 638 */ 639struct cache_sizes malloc_sizes[] = { 640#define CACHE(x) { .cs_size = (x) }, 641#include <linux/kmalloc_sizes.h> 642 CACHE(ULONG_MAX) 643#undef CACHE 644}; 645EXPORT_SYMBOL(malloc_sizes); 646 647/* Must match cache_sizes above. Out of line to keep cache footprint low. */ 648struct cache_names { 649 char *name; 650 char *name_dma; 651}; 652 653static struct cache_names __initdata cache_names[] = { 654#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, 655#include <linux/kmalloc_sizes.h> 656 {NULL,} 657#undef CACHE 658}; 659 660static struct arraycache_init initarray_cache __initdata = 661 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 662static struct arraycache_init initarray_generic = 663 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 664 665/* internal cache of cache description objs */ 666static struct kmem_cache cache_cache = { 667 .batchcount = 1, 668 .limit = BOOT_CPUCACHE_ENTRIES, 669 .shared = 1, 670 .buffer_size = sizeof(struct kmem_cache), 671 .name = "kmem_cache", 672#if DEBUG 673 .obj_size = sizeof(struct kmem_cache), 674#endif 675}; 676 677/* Guard access to the cache-chain. */ 678static DEFINE_MUTEX(cache_chain_mutex); 679static struct list_head cache_chain; 680 681/* 682 * vm_enough_memory() looks at this to determine how many slab-allocated pages 683 * are possibly freeable under pressure 684 * 685 * SLAB_RECLAIM_ACCOUNT turns this on per-slab 686 */ 687atomic_t slab_reclaim_pages; 688 689/* 690 * chicken and egg problem: delay the per-cpu array allocation 691 * until the general caches are up. 
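/*
 * Illustrative sketch only (not part of slab.c): how kfree() and
 * kmem_cache_free() get from a raw pointer back to its cache, slab and
 * object index using the helpers above. example_obj_lookup is a made-up
 * name for this sketch.
 */
static inline unsigned int example_obj_lookup(void *objp,
                                              struct kmem_cache **cachepp,
                                              struct slab **slabpp)
{
        struct page *page = virt_to_page(objp);
        struct kmem_cache *cachep = page_get_cache(page);      /* page->lru.next */
        struct slab *slabp = page_get_slab(page);              /* page->lru.prev */

        *cachepp = cachep;
        *slabpp = slabp;
        /* index = (objp - slabp->s_mem) / cachep->buffer_size */
        return obj_to_index(cachep, slabp, objp);
}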
692 */ 693static enum { 694 NONE, 695 PARTIAL_AC, 696 PARTIAL_L3, 697 FULL 698} g_cpucache_up; 699 700/* 701 * used by boot code to determine if it can use slab based allocator 702 */ 703int slab_is_available(void) 704{ 705 return g_cpucache_up == FULL; 706} 707 708static DEFINE_PER_CPU(struct work_struct, reap_work); 709 710static void free_block(struct kmem_cache *cachep, void **objpp, int len, 711 int node); 712static void enable_cpucache(struct kmem_cache *cachep); 713static void cache_reap(void *unused); 714static int __node_shrink(struct kmem_cache *cachep, int node); 715 716static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 717{ 718 return cachep->array[smp_processor_id()]; 719} 720 721static inline struct kmem_cache *__find_general_cachep(size_t size, 722 gfp_t gfpflags) 723{ 724 struct cache_sizes *csizep = malloc_sizes; 725 726#if DEBUG 727 /* This happens if someone tries to call 728 * kmem_cache_create(), or __kmalloc(), before 729 * the generic caches are initialized. 730 */ 731 BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); 732#endif 733 while (size > csizep->cs_size) 734 csizep++; 735 736 /* 737 * Really subtle: The last entry with cs->cs_size==ULONG_MAX 738 * has cs_{dma,}cachep==NULL. Thus no special case 739 * for large kmalloc calls required. 740 */ 741 if (unlikely(gfpflags & GFP_DMA)) 742 return csizep->cs_dmacachep; 743 return csizep->cs_cachep; 744} 745 746struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags) 747{ 748 return __find_general_cachep(size, gfpflags); 749} 750EXPORT_SYMBOL(kmem_find_general_cachep); 751 752static size_t slab_mgmt_size(size_t nr_objs, size_t align) 753{ 754 return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); 755} 756 757/* 758 * Calculate the number of objects and left-over bytes for a given buffer size. 759 */ 760static void cache_estimate(unsigned long gfporder, size_t buffer_size, 761 size_t align, int flags, size_t *left_over, 762 unsigned int *num) 763{ 764 int nr_objs; 765 size_t mgmt_size; 766 size_t slab_size = PAGE_SIZE << gfporder; 767 768 /* 769 * The slab management structure can be either off the slab or 770 * on it. For the latter case, the memory allocated for a 771 * slab is used for: 772 * 773 * - The struct slab 774 * - One kmem_bufctl_t for each object 775 * - Padding to respect alignment of @align 776 * - @buffer_size bytes for each object 777 * 778 * If the slab management structure is off the slab, then the 779 * alignment will already be calculated into the size. Because 780 * the slabs are all pages aligned, the objects will be at the 781 * correct alignment when allocated. 782 */ 783 if (flags & CFLGS_OFF_SLAB) { 784 mgmt_size = 0; 785 nr_objs = slab_size / buffer_size; 786 787 if (nr_objs > SLAB_LIMIT) 788 nr_objs = SLAB_LIMIT; 789 } else { 790 /* 791 * Ignore padding for the initial guess. The padding 792 * is at most @align-1 bytes, and @buffer_size is at 793 * least @align. In the worst case, this result will 794 * be one greater than the number of objects that fit 795 * into the memory allocation when taking the padding 796 * into account. 797 */ 798 nr_objs = (slab_size - sizeof(struct slab)) / 799 (buffer_size + sizeof(kmem_bufctl_t)); 800 801 /* 802 * This calculated number will be either the right 803 * amount, or one greater than what we want. 
804 */ 805 if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size 806 > slab_size) 807 nr_objs--; 808 809 if (nr_objs > SLAB_LIMIT) 810 nr_objs = SLAB_LIMIT; 811 812 mgmt_size = slab_mgmt_size(nr_objs, align); 813 } 814 *num = nr_objs; 815 *left_over = slab_size - nr_objs*buffer_size - mgmt_size; 816} 817 818#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) 819 820static void __slab_error(const char *function, struct kmem_cache *cachep, 821 char *msg) 822{ 823 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 824 function, cachep->name, msg); 825 dump_stack(); 826} 827 828#ifdef CONFIG_NUMA 829/* 830 * Special reaping functions for NUMA systems called from cache_reap(). 831 * These take care of doing round robin flushing of alien caches (containing 832 * objects freed on different nodes from which they were allocated) and the 833 * flushing of remote pcps by calling drain_node_pages. 834 */ 835static DEFINE_PER_CPU(unsigned long, reap_node); 836 837static void init_reap_node(int cpu) 838{ 839 int node; 840 841 node = next_node(cpu_to_node(cpu), node_online_map); 842 if (node == MAX_NUMNODES) 843 node = first_node(node_online_map); 844 845 __get_cpu_var(reap_node) = node; 846} 847 848static void next_reap_node(void) 849{ 850 int node = __get_cpu_var(reap_node); 851 852 /* 853 * Also drain per cpu pages on remote zones 854 */ 855 if (node != numa_node_id()) 856 drain_node_pages(node); 857 858 node = next_node(node, node_online_map); 859 if (unlikely(node >= MAX_NUMNODES)) 860 node = first_node(node_online_map); 861 __get_cpu_var(reap_node) = node; 862} 863 864#else 865#define init_reap_node(cpu) do { } while (0) 866#define next_reap_node(void) do { } while (0) 867#endif 868 869/* 870 * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz 871 * via the workqueue/eventd. 872 * Add the CPU number into the expiration time to minimize the possibility of 873 * the CPUs getting into lockstep and contending for the global cache chain 874 * lock. 875 */ 876static void __devinit start_cpu_timer(int cpu) 877{ 878 struct work_struct *reap_work = &per_cpu(reap_work, cpu); 879 880 /* 881 * When this gets called from do_initcalls via cpucache_init(), 882 * init_workqueues() has already run, so keventd will be setup 883 * at that time. 884 */ 885 if (keventd_up() && reap_work->func == NULL) { 886 init_reap_node(cpu); 887 INIT_WORK(reap_work, cache_reap, NULL); 888 schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); 889 } 890} 891 892static struct array_cache *alloc_arraycache(int node, int entries, 893 int batchcount) 894{ 895 int memsize = sizeof(void *) * entries + sizeof(struct array_cache); 896 struct array_cache *nc = NULL; 897 898 nc = kmalloc_node(memsize, GFP_KERNEL, node); 899 if (nc) { 900 nc->avail = 0; 901 nc->limit = entries; 902 nc->batchcount = batchcount; 903 nc->touched = 0; 904 spin_lock_init(&nc->lock); 905 } 906 return nc; 907} 908 909/* 910 * Transfer objects in one arraycache to another. 911 * Locking must be handled by the caller. 912 * 913 * Return the number of entries transferred. 
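/*
 * Worked example for cache_estimate() above (illustrative only; assumes a
 * 32-bit build where sizeof(struct slab) == 28 and sizeof(kmem_bufctl_t) == 4):
 *
 *   gfporder = 0, so slab_size = 4096; buffer_size = 256; align = 32;
 *   on-slab management (no CFLGS_OFF_SLAB).
 *
 *   initial guess:  nr_objs = (4096 - 28) / (256 + 4) = 15
 *   slab_mgmt_size: ALIGN(28 + 15*4, 32) = 96
 *   check:          96 + 15*256 = 3936 <= 4096, so 15 objects stand
 *   left_over:      4096 - 15*256 - 96 = 160 bytes
 *
 * Those 160 spare bytes are what the colouring logic turns into
 * cache-line-sized placement offsets for successive slabs.
 */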
914 */ 915static int transfer_objects(struct array_cache *to, 916 struct array_cache *from, unsigned int max) 917{ 918 /* Figure out how many entries to transfer */ 919 int nr = min(min(from->avail, max), to->limit - to->avail); 920 921 if (!nr) 922 return 0; 923 924 memcpy(to->entry + to->avail, from->entry + from->avail -nr, 925 sizeof(void *) *nr); 926 927 from->avail -= nr; 928 to->avail += nr; 929 to->touched = 1; 930 return nr; 931} 932 933#ifdef CONFIG_NUMA 934static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); 935static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 936 937static struct array_cache **alloc_alien_cache(int node, int limit) 938{ 939 struct array_cache **ac_ptr; 940 int memsize = sizeof(void *) * MAX_NUMNODES; 941 int i; 942 943 if (limit > 1) 944 limit = 12; 945 ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node); 946 if (ac_ptr) { 947 for_each_node(i) { 948 if (i == node || !node_online(i)) { 949 ac_ptr[i] = NULL; 950 continue; 951 } 952 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); 953 if (!ac_ptr[i]) { 954 for (i--; i <= 0; i--) 955 kfree(ac_ptr[i]); 956 kfree(ac_ptr); 957 return NULL; 958 } 959 } 960 } 961 return ac_ptr; 962} 963 964static void free_alien_cache(struct array_cache **ac_ptr) 965{ 966 int i; 967 968 if (!ac_ptr) 969 return; 970 for_each_node(i) 971 kfree(ac_ptr[i]); 972 kfree(ac_ptr); 973} 974 975static void __drain_alien_cache(struct kmem_cache *cachep, 976 struct array_cache *ac, int node) 977{ 978 struct kmem_list3 *rl3 = cachep->nodelists[node]; 979 980 if (ac->avail) { 981 spin_lock(&rl3->list_lock); 982 /* 983 * Stuff objects into the remote nodes shared array first. 984 * That way we could avoid the overhead of putting the objects 985 * into the free lists and getting them back later. 986 */ 987 if (rl3->shared) 988 transfer_objects(rl3->shared, ac, ac->limit); 989 990 free_block(cachep, ac->entry, ac->avail, node); 991 ac->avail = 0; 992 spin_unlock(&rl3->list_lock); 993 } 994} 995 996/* 997 * Called from cache_reap() to regularly drain alien caches round robin. 998 */ 999static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) 1000{ 1001 int node = __get_cpu_var(reap_node); 1002 1003 if (l3->alien) { 1004 struct array_cache *ac = l3->alien[node]; 1005 1006 if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { 1007 __drain_alien_cache(cachep, ac, node); 1008 spin_unlock_irq(&ac->lock); 1009 } 1010 } 1011} 1012 1013static void drain_alien_cache(struct kmem_cache *cachep, 1014 struct array_cache **alien) 1015{ 1016 int i = 0; 1017 struct array_cache *ac; 1018 unsigned long flags; 1019 1020 for_each_online_node(i) { 1021 ac = alien[i]; 1022 if (ac) { 1023 spin_lock_irqsave(&ac->lock, flags); 1024 __drain_alien_cache(cachep, ac, i); 1025 spin_unlock_irqrestore(&ac->lock, flags); 1026 } 1027 } 1028} 1029 1030static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 1031{ 1032 struct slab *slabp = virt_to_slab(objp); 1033 int nodeid = slabp->nodeid; 1034 struct kmem_list3 *l3; 1035 struct array_cache *alien = NULL; 1036 1037 /* 1038 * Make sure we are not freeing a object from another node to the array 1039 * cache on this cpu. 
1040 */ 1041 if (likely(slabp->nodeid == numa_node_id())) 1042 return 0; 1043 1044 l3 = cachep->nodelists[numa_node_id()]; 1045 STATS_INC_NODEFREES(cachep); 1046 if (l3->alien && l3->alien[nodeid]) { 1047 alien = l3->alien[nodeid]; 1048 spin_lock(&alien->lock); 1049 if (unlikely(alien->avail == alien->limit)) { 1050 STATS_INC_ACOVERFLOW(cachep); 1051 __drain_alien_cache(cachep, alien, nodeid); 1052 } 1053 alien->entry[alien->avail++] = objp; 1054 spin_unlock(&alien->lock); 1055 } else { 1056 spin_lock(&(cachep->nodelists[nodeid])->list_lock); 1057 free_block(cachep, &objp, 1, nodeid); 1058 spin_unlock(&(cachep->nodelists[nodeid])->list_lock); 1059 } 1060 return 1; 1061} 1062 1063#else 1064 1065#define drain_alien_cache(cachep, alien) do { } while (0) 1066#define reap_alien(cachep, l3) do { } while (0) 1067 1068static inline struct array_cache **alloc_alien_cache(int node, int limit) 1069{ 1070 return (struct array_cache **) 0x01020304ul; 1071} 1072 1073static inline void free_alien_cache(struct array_cache **ac_ptr) 1074{ 1075} 1076 1077static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 1078{ 1079 return 0; 1080} 1081 1082#endif 1083 1084static int cpuup_callback(struct notifier_block *nfb, 1085 unsigned long action, void *hcpu) 1086{ 1087 long cpu = (long)hcpu; 1088 struct kmem_cache *cachep; 1089 struct kmem_list3 *l3 = NULL; 1090 int node = cpu_to_node(cpu); 1091 int memsize = sizeof(struct kmem_list3); 1092 1093 switch (action) { 1094 case CPU_UP_PREPARE: 1095 mutex_lock(&cache_chain_mutex); 1096 /* 1097 * We need to do this right in the beginning since 1098 * alloc_arraycache's are going to use this list. 1099 * kmalloc_node allows us to add the slab to the right 1100 * kmem_list3 and not this cpu's kmem_list3 1101 */ 1102 1103 list_for_each_entry(cachep, &cache_chain, next) { 1104 /* 1105 * Set up the size64 kmemlist for cpu before we can 1106 * begin anything. Make sure some other cpu on this 1107 * node has not already allocated this 1108 */ 1109 if (!cachep->nodelists[node]) { 1110 l3 = kmalloc_node(memsize, GFP_KERNEL, node); 1111 if (!l3) 1112 goto bad; 1113 kmem_list3_init(l3); 1114 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 1115 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 1116 1117 /* 1118 * The l3s don't come and go as CPUs come and 1119 * go. cache_chain_mutex is sufficient 1120 * protection here. 
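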
1121 */ 1122 cachep->nodelists[node] = l3; 1123 } 1124 1125 spin_lock_irq(&cachep->nodelists[node]->list_lock); 1126 cachep->nodelists[node]->free_limit = 1127 (1 + nr_cpus_node(node)) * 1128 cachep->batchcount + cachep->num; 1129 spin_unlock_irq(&cachep->nodelists[node]->list_lock); 1130 } 1131 1132 /* 1133 * Now we can go ahead with allocating the shared arrays and 1134 * array caches 1135 */ 1136 list_for_each_entry(cachep, &cache_chain, next) { 1137 struct array_cache *nc; 1138 struct array_cache *shared; 1139 struct array_cache **alien; 1140 1141 nc = alloc_arraycache(node, cachep->limit, 1142 cachep->batchcount); 1143 if (!nc) 1144 goto bad; 1145 shared = alloc_arraycache(node, 1146 cachep->shared * cachep->batchcount, 1147 0xbaadf00d); 1148 if (!shared) 1149 goto bad; 1150 1151 alien = alloc_alien_cache(node, cachep->limit); 1152 if (!alien) 1153 goto bad; 1154 cachep->array[cpu] = nc; 1155 l3 = cachep->nodelists[node]; 1156 BUG_ON(!l3); 1157 1158 spin_lock_irq(&l3->list_lock); 1159 if (!l3->shared) { 1160 /* 1161 * We are serialised from CPU_DEAD or 1162 * CPU_UP_CANCELLED by the cpucontrol lock 1163 */ 1164 l3->shared = shared; 1165 shared = NULL; 1166 } 1167#ifdef CONFIG_NUMA 1168 if (!l3->alien) { 1169 l3->alien = alien; 1170 alien = NULL; 1171 } 1172#endif 1173 spin_unlock_irq(&l3->list_lock); 1174 kfree(shared); 1175 free_alien_cache(alien); 1176 } 1177 mutex_unlock(&cache_chain_mutex); 1178 break; 1179 case CPU_ONLINE: 1180 start_cpu_timer(cpu); 1181 break; 1182#ifdef CONFIG_HOTPLUG_CPU 1183 case CPU_DEAD: 1184 /* 1185 * Even if all the cpus of a node are down, we don't free the 1186 * kmem_list3 of any cache. This to avoid a race between 1187 * cpu_down, and a kmalloc allocation from another cpu for 1188 * memory from the node of the cpu going down. The list3 1189 * structure is usually allocated from kmem_cache_create() and 1190 * gets destroyed at kmem_cache_destroy(). 1191 */ 1192 /* fall thru */ 1193 case CPU_UP_CANCELED: 1194 mutex_lock(&cache_chain_mutex); 1195 list_for_each_entry(cachep, &cache_chain, next) { 1196 struct array_cache *nc; 1197 struct array_cache *shared; 1198 struct array_cache **alien; 1199 cpumask_t mask; 1200 1201 mask = node_to_cpumask(node); 1202 /* cpu is dead; no one can alloc from it. */ 1203 nc = cachep->array[cpu]; 1204 cachep->array[cpu] = NULL; 1205 l3 = cachep->nodelists[node]; 1206 1207 if (!l3) 1208 goto free_array_cache; 1209 1210 spin_lock_irq(&l3->list_lock); 1211 1212 /* Free limit for this kmem_list3 */ 1213 l3->free_limit -= cachep->batchcount; 1214 if (nc) 1215 free_block(cachep, nc->entry, nc->avail, node); 1216 1217 if (!cpus_empty(mask)) { 1218 spin_unlock_irq(&l3->list_lock); 1219 goto free_array_cache; 1220 } 1221 1222 shared = l3->shared; 1223 if (shared) { 1224 free_block(cachep, l3->shared->entry, 1225 l3->shared->avail, node); 1226 l3->shared = NULL; 1227 } 1228 1229 alien = l3->alien; 1230 l3->alien = NULL; 1231 1232 spin_unlock_irq(&l3->list_lock); 1233 1234 kfree(shared); 1235 if (alien) { 1236 drain_alien_cache(cachep, alien); 1237 free_alien_cache(alien); 1238 } 1239free_array_cache: 1240 kfree(nc); 1241 } 1242 /* 1243 * In the previous loop, all the objects were freed to 1244 * the respective cache's slabs, now we can go ahead and 1245 * shrink each nodelist to its limit. 
1246 */ 1247 list_for_each_entry(cachep, &cache_chain, next) { 1248 l3 = cachep->nodelists[node]; 1249 if (!l3) 1250 continue; 1251 spin_lock_irq(&l3->list_lock); 1252 /* free slabs belonging to this node */ 1253 __node_shrink(cachep, node); 1254 spin_unlock_irq(&l3->list_lock); 1255 } 1256 mutex_unlock(&cache_chain_mutex); 1257 break; 1258#endif 1259 } 1260 return NOTIFY_OK; 1261bad: 1262 mutex_unlock(&cache_chain_mutex); 1263 return NOTIFY_BAD; 1264} 1265 1266static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; 1267 1268/* 1269 * swap the static kmem_list3 with kmalloced memory 1270 */ 1271static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, 1272 int nodeid) 1273{ 1274 struct kmem_list3 *ptr; 1275 1276 BUG_ON(cachep->nodelists[nodeid] != list); 1277 ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid); 1278 BUG_ON(!ptr); 1279 1280 local_irq_disable(); 1281 memcpy(ptr, list, sizeof(struct kmem_list3)); 1282 MAKE_ALL_LISTS(cachep, ptr, nodeid); 1283 cachep->nodelists[nodeid] = ptr; 1284 local_irq_enable(); 1285} 1286 1287/* 1288 * Initialisation. Called after the page allocator have been initialised and 1289 * before smp_init(). 1290 */ 1291void __init kmem_cache_init(void) 1292{ 1293 size_t left_over; 1294 struct cache_sizes *sizes; 1295 struct cache_names *names; 1296 int i; 1297 int order; 1298 1299 for (i = 0; i < NUM_INIT_LISTS; i++) { 1300 kmem_list3_init(&initkmem_list3[i]); 1301 if (i < MAX_NUMNODES) 1302 cache_cache.nodelists[i] = NULL; 1303 } 1304 1305 /* 1306 * Fragmentation resistance on low memory - only use bigger 1307 * page orders on machines with more than 32MB of memory. 1308 */ 1309 if (num_physpages > (32 << 20) >> PAGE_SHIFT) 1310 slab_break_gfp_order = BREAK_GFP_ORDER_HI; 1311 1312 /* Bootstrap is tricky, because several objects are allocated 1313 * from caches that do not exist yet: 1314 * 1) initialize the cache_cache cache: it contains the struct 1315 * kmem_cache structures of all caches, except cache_cache itself: 1316 * cache_cache is statically allocated. 1317 * Initially an __init data area is used for the head array and the 1318 * kmem_list3 structures, it's replaced with a kmalloc allocated 1319 * array at the end of the bootstrap. 1320 * 2) Create the first kmalloc cache. 1321 * The struct kmem_cache for the new cache is allocated normally. 1322 * An __init data area is used for the head array. 1323 * 3) Create the remaining kmalloc caches, with minimally sized 1324 * head arrays. 1325 * 4) Replace the __init data head arrays for cache_cache and the first 1326 * kmalloc cache with kmalloc allocated arrays. 1327 * 5) Replace the __init data for kmem_list3 for cache_cache and 1328 * the other cache's with kmalloc allocated memory. 1329 * 6) Resize the head arrays of the kmalloc caches to their final sizes. 
1330 */ 1331 1332 /* 1) create the cache_cache */ 1333 INIT_LIST_HEAD(&cache_chain); 1334 list_add(&cache_cache.next, &cache_chain); 1335 cache_cache.colour_off = cache_line_size(); 1336 cache_cache.array[smp_processor_id()] = &initarray_cache.cache; 1337 cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE]; 1338 1339 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, 1340 cache_line_size()); 1341 1342 for (order = 0; order < MAX_ORDER; order++) { 1343 cache_estimate(order, cache_cache.buffer_size, 1344 cache_line_size(), 0, &left_over, &cache_cache.num); 1345 if (cache_cache.num) 1346 break; 1347 } 1348 BUG_ON(!cache_cache.num); 1349 cache_cache.gfporder = order; 1350 cache_cache.colour = left_over / cache_cache.colour_off; 1351 cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + 1352 sizeof(struct slab), cache_line_size()); 1353 1354 /* 2+3) create the kmalloc caches */ 1355 sizes = malloc_sizes; 1356 names = cache_names; 1357 1358 /* 1359 * Initialize the caches that provide memory for the array cache and the 1360 * kmem_list3 structures first. Without this, further allocations will 1361 * bug. 1362 */ 1363 1364 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, 1365 sizes[INDEX_AC].cs_size, 1366 ARCH_KMALLOC_MINALIGN, 1367 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1368 NULL, NULL); 1369 1370 if (INDEX_AC != INDEX_L3) { 1371 sizes[INDEX_L3].cs_cachep = 1372 kmem_cache_create(names[INDEX_L3].name, 1373 sizes[INDEX_L3].cs_size, 1374 ARCH_KMALLOC_MINALIGN, 1375 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1376 NULL, NULL); 1377 } 1378 1379 while (sizes->cs_size != ULONG_MAX) { 1380 /* 1381 * For performance, all the general caches are L1 aligned. 1382 * This should be particularly beneficial on SMP boxes, as it 1383 * eliminates "false sharing". 1384 * Note for systems short on memory removing the alignment will 1385 * allow tighter packing of the smaller caches. 
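/*
 * Worked example of the slab colouring set up above (illustrative only,
 * reusing the cache_estimate() numbers from earlier: 160 left-over bytes,
 * colour_off = cache_line_size() = 32):
 *
 *   colour = 160 / 32 = 5
 *
 * so successive slabs of the cache place their first object at offsets
 * 0, 32, 64, 96 and 128 bytes into the slab memory (cycling via the
 * per-node colour_next). Objects at the same index in different slabs then
 * start on different cache lines instead of all competing for the same ones.
 */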
1386 */ 1387 if (!sizes->cs_cachep) { 1388 sizes->cs_cachep = kmem_cache_create(names->name, 1389 sizes->cs_size, 1390 ARCH_KMALLOC_MINALIGN, 1391 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1392 NULL, NULL); 1393 } 1394 1395 sizes->cs_dmacachep = kmem_cache_create(names->name_dma, 1396 sizes->cs_size, 1397 ARCH_KMALLOC_MINALIGN, 1398 ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| 1399 SLAB_PANIC, 1400 NULL, NULL); 1401 sizes++; 1402 names++; 1403 } 1404 /* 4) Replace the bootstrap head arrays */ 1405 { 1406 void *ptr; 1407 1408 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1409 1410 local_irq_disable(); 1411 BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); 1412 memcpy(ptr, cpu_cache_get(&cache_cache), 1413 sizeof(struct arraycache_init)); 1414 cache_cache.array[smp_processor_id()] = ptr; 1415 local_irq_enable(); 1416 1417 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1418 1419 local_irq_disable(); 1420 BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) 1421 != &initarray_generic.cache); 1422 memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), 1423 sizeof(struct arraycache_init)); 1424 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = 1425 ptr; 1426 local_irq_enable(); 1427 } 1428 /* 5) Replace the bootstrap kmem_list3's */ 1429 { 1430 int node; 1431 /* Replace the static kmem_list3 structures for the boot cpu */ 1432 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], 1433 numa_node_id()); 1434 1435 for_each_online_node(node) { 1436 init_list(malloc_sizes[INDEX_AC].cs_cachep, 1437 &initkmem_list3[SIZE_AC + node], node); 1438 1439 if (INDEX_AC != INDEX_L3) { 1440 init_list(malloc_sizes[INDEX_L3].cs_cachep, 1441 &initkmem_list3[SIZE_L3 + node], 1442 node); 1443 } 1444 } 1445 } 1446 1447 /* 6) resize the head arrays to their final sizes */ 1448 { 1449 struct kmem_cache *cachep; 1450 mutex_lock(&cache_chain_mutex); 1451 list_for_each_entry(cachep, &cache_chain, next) 1452 enable_cpucache(cachep); 1453 mutex_unlock(&cache_chain_mutex); 1454 } 1455 1456 /* Done! */ 1457 g_cpucache_up = FULL; 1458 1459 /* 1460 * Register a cpu startup notifier callback that initializes 1461 * cpu_cache_get for all new cpus 1462 */ 1463 register_cpu_notifier(&cpucache_notifier); 1464 1465 /* 1466 * The reap timers are started later, with a module init call: That part 1467 * of the kernel is not yet operational. 1468 */ 1469} 1470 1471static int __init cpucache_init(void) 1472{ 1473 int cpu; 1474 1475 /* 1476 * Register the timers that return unneeded pages to the page allocator 1477 */ 1478 for_each_online_cpu(cpu) 1479 start_cpu_timer(cpu); 1480 return 0; 1481} 1482__initcall(cpucache_init); 1483 1484/* 1485 * Interface to system's page allocator. No need to hold the cache-lock. 1486 * 1487 * If we requested dmaable memory, we will get it. Even if we 1488 * did not request dmaable memory, we might get it, but that 1489 * would be relatively rare and ignorable. 
1490 */ 1491static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) 1492{ 1493 struct page *page; 1494 int nr_pages; 1495 int i; 1496 1497#ifndef CONFIG_MMU 1498 /* 1499 * Nommu uses slab's for process anonymous memory allocations, and thus 1500 * requires __GFP_COMP to properly refcount higher order allocations 1501 */ 1502 flags |= __GFP_COMP; 1503#endif 1504 flags |= cachep->gfpflags; 1505 1506 page = alloc_pages_node(nodeid, flags, cachep->gfporder); 1507 if (!page) 1508 return NULL; 1509 1510 nr_pages = (1 << cachep->gfporder); 1511 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1512 atomic_add(nr_pages, &slab_reclaim_pages); 1513 add_page_state(nr_slab, nr_pages); 1514 for (i = 0; i < nr_pages; i++) 1515 __SetPageSlab(page + i); 1516 return page_address(page); 1517} 1518 1519/* 1520 * Interface to system's page release. 1521 */ 1522static void kmem_freepages(struct kmem_cache *cachep, void *addr) 1523{ 1524 unsigned long i = (1 << cachep->gfporder); 1525 struct page *page = virt_to_page(addr); 1526 const unsigned long nr_freed = i; 1527 1528 while (i--) { 1529 BUG_ON(!PageSlab(page)); 1530 __ClearPageSlab(page); 1531 page++; 1532 } 1533 sub_page_state(nr_slab, nr_freed); 1534 if (current->reclaim_state) 1535 current->reclaim_state->reclaimed_slab += nr_freed; 1536 free_pages((unsigned long)addr, cachep->gfporder); 1537 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1538 atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages); 1539} 1540 1541static void kmem_rcu_free(struct rcu_head *head) 1542{ 1543 struct slab_rcu *slab_rcu = (struct slab_rcu *)head; 1544 struct kmem_cache *cachep = slab_rcu->cachep; 1545 1546 kmem_freepages(cachep, slab_rcu->addr); 1547 if (OFF_SLAB(cachep)) 1548 kmem_cache_free(cachep->slabp_cache, slab_rcu); 1549} 1550 1551#if DEBUG 1552 1553#ifdef CONFIG_DEBUG_PAGEALLOC 1554static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, 1555 unsigned long caller) 1556{ 1557 int size = obj_size(cachep); 1558 1559 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; 1560 1561 if (size < 5 * sizeof(unsigned long)) 1562 return; 1563 1564 *addr++ = 0x12345678; 1565 *addr++ = caller; 1566 *addr++ = smp_processor_id(); 1567 size -= 3 * sizeof(unsigned long); 1568 { 1569 unsigned long *sptr = &caller; 1570 unsigned long svalue; 1571 1572 while (!kstack_end(sptr)) { 1573 svalue = *sptr++; 1574 if (kernel_text_address(svalue)) { 1575 *addr++ = svalue; 1576 size -= sizeof(unsigned long); 1577 if (size <= sizeof(unsigned long)) 1578 break; 1579 } 1580 } 1581 1582 } 1583 *addr++ = 0x87654321; 1584} 1585#endif 1586 1587static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) 1588{ 1589 int size = obj_size(cachep); 1590 addr = &((char *)addr)[obj_offset(cachep)]; 1591 1592 memset(addr, val, size); 1593 *(unsigned char *)(addr + size - 1) = POISON_END; 1594} 1595 1596static void dump_line(char *data, int offset, int limit) 1597{ 1598 int i; 1599 printk(KERN_ERR "%03x:", offset); 1600 for (i = 0; i < limit; i++) 1601 printk(" %02x", (unsigned char)data[offset + i]); 1602 printk("\n"); 1603} 1604#endif 1605 1606#if DEBUG 1607 1608static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) 1609{ 1610 int i, size; 1611 char *realobj; 1612 1613 if (cachep->flags & SLAB_RED_ZONE) { 1614 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", 1615 *dbg_redzone1(cachep, objp), 1616 *dbg_redzone2(cachep, objp)); 1617 } 1618 1619 if (cachep->flags & SLAB_STORE_USER) { 1620 printk(KERN_ERR "Last user: [<%p>]", 1621 
*dbg_userword(cachep, objp)); 1622 print_symbol("(%s)", 1623 (unsigned long)*dbg_userword(cachep, objp)); 1624 printk("\n"); 1625 } 1626 realobj = (char *)objp + obj_offset(cachep); 1627 size = obj_size(cachep); 1628 for (i = 0; i < size && lines; i += 16, lines--) { 1629 int limit; 1630 limit = 16; 1631 if (i + limit > size) 1632 limit = size - i; 1633 dump_line(realobj, i, limit); 1634 } 1635} 1636 1637static void check_poison_obj(struct kmem_cache *cachep, void *objp) 1638{ 1639 char *realobj; 1640 int size, i; 1641 int lines = 0; 1642 1643 realobj = (char *)objp + obj_offset(cachep); 1644 size = obj_size(cachep); 1645 1646 for (i = 0; i < size; i++) { 1647 char exp = POISON_FREE; 1648 if (i == size - 1) 1649 exp = POISON_END; 1650 if (realobj[i] != exp) { 1651 int limit; 1652 /* Mismatch ! */ 1653 /* Print header */ 1654 if (lines == 0) { 1655 printk(KERN_ERR 1656 "Slab corruption: start=%p, len=%d\n", 1657 realobj, size); 1658 print_objinfo(cachep, objp, 0); 1659 } 1660 /* Hexdump the affected line */ 1661 i = (i / 16) * 16; 1662 limit = 16; 1663 if (i + limit > size) 1664 limit = size - i; 1665 dump_line(realobj, i, limit); 1666 i += 16; 1667 lines++; 1668 /* Limit to 5 lines */ 1669 if (lines > 5) 1670 break; 1671 } 1672 } 1673 if (lines != 0) { 1674 /* Print some data about the neighboring objects, if they 1675 * exist: 1676 */ 1677 struct slab *slabp = virt_to_slab(objp); 1678 unsigned int objnr; 1679 1680 objnr = obj_to_index(cachep, slabp, objp); 1681 if (objnr) { 1682 objp = index_to_obj(cachep, slabp, objnr - 1); 1683 realobj = (char *)objp + obj_offset(cachep); 1684 printk(KERN_ERR "Prev obj: start=%p, len=%d\n", 1685 realobj, size); 1686 print_objinfo(cachep, objp, 2); 1687 } 1688 if (objnr + 1 < cachep->num) { 1689 objp = index_to_obj(cachep, slabp, objnr + 1); 1690 realobj = (char *)objp + obj_offset(cachep); 1691 printk(KERN_ERR "Next obj: start=%p, len=%d\n", 1692 realobj, size); 1693 print_objinfo(cachep, objp, 2); 1694 } 1695 } 1696} 1697#endif 1698 1699#if DEBUG 1700/** 1701 * slab_destroy_objs - destroy a slab and its objects 1702 * @cachep: cache pointer being destroyed 1703 * @slabp: slab pointer being destroyed 1704 * 1705 * Call the registered destructor for each object in a slab that is being 1706 * destroyed. 
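/*
 * Illustrative sketch only (not part of slab.c): what check_poison_obj()
 * above expects a "clean" free object to look like. poison_obj() fills the
 * payload with POISON_FREE (0x6b) and ends it with POISON_END (0xa5); any
 * other byte means something wrote to the object after it was freed.
 * example_poison_is_clean is a made-up name for this sketch.
 */
static int example_poison_is_clean(struct kmem_cache *cachep, void *objp)
{
        unsigned char *p = (unsigned char *)objp + obj_offset(cachep);
        int size = obj_size(cachep);
        int i;

        for (i = 0; i < size - 1; i++)
                if (p[i] != POISON_FREE)
                        return 0;
        return p[size - 1] == POISON_END;
}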
1707 */ 1708static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) 1709{ 1710 int i; 1711 for (i = 0; i < cachep->num; i++) { 1712 void *objp = index_to_obj(cachep, slabp, i); 1713 1714 if (cachep->flags & SLAB_POISON) { 1715#ifdef CONFIG_DEBUG_PAGEALLOC 1716 if (cachep->buffer_size % PAGE_SIZE == 0 && 1717 OFF_SLAB(cachep)) 1718 kernel_map_pages(virt_to_page(objp), 1719 cachep->buffer_size / PAGE_SIZE, 1); 1720 else 1721 check_poison_obj(cachep, objp); 1722#else 1723 check_poison_obj(cachep, objp); 1724#endif 1725 } 1726 if (cachep->flags & SLAB_RED_ZONE) { 1727 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 1728 slab_error(cachep, "start of a freed object " 1729 "was overwritten"); 1730 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 1731 slab_error(cachep, "end of a freed object " 1732 "was overwritten"); 1733 } 1734 if (cachep->dtor && !(cachep->flags & SLAB_POISON)) 1735 (cachep->dtor) (objp + obj_offset(cachep), cachep, 0); 1736 } 1737} 1738#else 1739static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) 1740{ 1741 if (cachep->dtor) { 1742 int i; 1743 for (i = 0; i < cachep->num; i++) { 1744 void *objp = index_to_obj(cachep, slabp, i); 1745 (cachep->dtor) (objp, cachep, 0); 1746 } 1747 } 1748} 1749#endif 1750 1751/** 1752 * slab_destroy - destroy and release all objects in a slab 1753 * @cachep: cache pointer being destroyed 1754 * @slabp: slab pointer being destroyed 1755 * 1756 * Destroy all the objs in a slab, and release the mem back to the system. 1757 * Before calling the slab must have been unlinked from the cache. The 1758 * cache-lock is not held/needed. 1759 */ 1760static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) 1761{ 1762 void *addr = slabp->s_mem - slabp->colouroff; 1763 1764 slab_destroy_objs(cachep, slabp); 1765 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { 1766 struct slab_rcu *slab_rcu; 1767 1768 slab_rcu = (struct slab_rcu *)slabp; 1769 slab_rcu->cachep = cachep; 1770 slab_rcu->addr = addr; 1771 call_rcu(&slab_rcu->head, kmem_rcu_free); 1772 } else { 1773 kmem_freepages(cachep, addr); 1774 if (OFF_SLAB(cachep)) 1775 kmem_cache_free(cachep->slabp_cache, slabp); 1776 } 1777} 1778 1779/* 1780 * For setting up all the kmem_list3s for cache whose buffer_size is same as 1781 * size of kmem_list3. 1782 */ 1783static void set_up_list3s(struct kmem_cache *cachep, int index) 1784{ 1785 int node; 1786 1787 for_each_online_node(node) { 1788 cachep->nodelists[node] = &initkmem_list3[index + node]; 1789 cachep->nodelists[node]->next_reap = jiffies + 1790 REAPTIMEOUT_LIST3 + 1791 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 1792 } 1793} 1794 1795/** 1796 * calculate_slab_order - calculate size (page order) of slabs 1797 * @cachep: pointer to the cache that is being created 1798 * @size: size of objects to be created in this cache. 1799 * @align: required alignment for the objects. 1800 * @flags: slab allocation flags 1801 * 1802 * Also calculates the number of objects per slab. 1803 * 1804 * This could be made much more intelligent. For now, try to avoid using 1805 * high order pages for slabs. When the gfp() functions are more friendly 1806 * towards high-order requests, this should be changed. 
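/*
 * Worked example for calculate_slab_order() below (illustrative only;
 * assumes 4 kB pages, a non-debug build, a 700-byte object with default
 * flags, and a machine with more than 32MB so slab_break_gfp_order ==
 * BREAK_GFP_ORDER_HI == 1). Since 700 >= PAGE_SIZE >> 3, the slab
 * management structure goes off-slab:
 *
 *   gfporder 0: 4096 / 700 = 5 objects, 596 bytes left over.
 *               596 * 8 > 4096, i.e. more than 1/8 wasted, so keep going.
 *   gfporder 1: 8192 / 700 = 11 objects, 492 bytes left over.
 *               gfporder has reached slab_break_gfp_order, so stop here.
 *
 * Result: order-1 slabs with 11 objects each.
 */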
1807 */ 1808static size_t calculate_slab_order(struct kmem_cache *cachep, 1809 size_t size, size_t align, unsigned long flags) 1810{ 1811 unsigned long offslab_limit; 1812 size_t left_over = 0; 1813 int gfporder; 1814 1815 for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) { 1816 unsigned int num; 1817 size_t remainder; 1818 1819 cache_estimate(gfporder, size, align, flags, &remainder, &num); 1820 if (!num) 1821 continue; 1822 1823 if (flags & CFLGS_OFF_SLAB) { 1824 /* 1825 * Max number of objs-per-slab for caches which 1826 * use off-slab slabs. Needed to avoid a possible 1827 * looping condition in cache_grow(). 1828 */ 1829 offslab_limit = size - sizeof(struct slab); 1830 offslab_limit /= sizeof(kmem_bufctl_t); 1831 1832 if (num > offslab_limit) 1833 break; 1834 } 1835 1836 /* Found something acceptable - save it away */ 1837 cachep->num = num; 1838 cachep->gfporder = gfporder; 1839 left_over = remainder; 1840 1841 /* 1842 * A VFS-reclaimable slab tends to have most allocations 1843 * as GFP_NOFS and we really don't want to have to be allocating 1844 * higher-order pages when we are unable to shrink dcache. 1845 */ 1846 if (flags & SLAB_RECLAIM_ACCOUNT) 1847 break; 1848 1849 /* 1850 * Large number of objects is good, but very large slabs are 1851 * currently bad for the gfp()s. 1852 */ 1853 if (gfporder >= slab_break_gfp_order) 1854 break; 1855 1856 /* 1857 * Acceptable internal fragmentation? 1858 */ 1859 if (left_over * 8 <= (PAGE_SIZE << gfporder)) 1860 break; 1861 } 1862 return left_over; 1863} 1864 1865static void setup_cpu_cache(struct kmem_cache *cachep) 1866{ 1867 if (g_cpucache_up == FULL) { 1868 enable_cpucache(cachep); 1869 return; 1870 } 1871 if (g_cpucache_up == NONE) { 1872 /* 1873 * Note: the first kmem_cache_create must create the cache 1874 * that's used by kmalloc(24), otherwise the creation of 1875 * further caches will BUG(). 1876 */ 1877 cachep->array[smp_processor_id()] = &initarray_generic.cache; 1878 1879 /* 1880 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is 1881 * the first cache, then we need to set up all its list3s, 1882 * otherwise the creation of further caches will BUG(). 1883 */ 1884 set_up_list3s(cachep, SIZE_AC); 1885 if (INDEX_AC == INDEX_L3) 1886 g_cpucache_up = PARTIAL_L3; 1887 else 1888 g_cpucache_up = PARTIAL_AC; 1889 } else { 1890 cachep->array[smp_processor_id()] = 1891 kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1892 1893 if (g_cpucache_up == PARTIAL_AC) { 1894 set_up_list3s(cachep, SIZE_L3); 1895 g_cpucache_up = PARTIAL_L3; 1896 } else { 1897 int node; 1898 for_each_online_node(node) { 1899 cachep->nodelists[node] = 1900 kmalloc_node(sizeof(struct kmem_list3), 1901 GFP_KERNEL, node); 1902 BUG_ON(!cachep->nodelists[node]); 1903 kmem_list3_init(cachep->nodelists[node]); 1904 } 1905 } 1906 } 1907 cachep->nodelists[numa_node_id()]->next_reap = 1908 jiffies + REAPTIMEOUT_LIST3 + 1909 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 1910 1911 cpu_cache_get(cachep)->avail = 0; 1912 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; 1913 cpu_cache_get(cachep)->batchcount = 1; 1914 cpu_cache_get(cachep)->touched = 0; 1915 cachep->batchcount = 1; 1916 cachep->limit = BOOT_CPUCACHE_ENTRIES; 1917} 1918 1919/** 1920 * kmem_cache_create - Create a cache. 1921 * @name: A string which is used in /proc/slabinfo to identify this cache. 1922 * @size: The size of objects to be created in this cache. 1923 * @align: The required alignment for the objects. 1924 * @flags: SLAB flags 1925 * @ctor: A constructor for the objects. 
1926 * @dtor: A destructor for the objects. 1927 * 1928 * Returns a ptr to the cache on success, NULL on failure. 1929 * Cannot be called within a int, but can be interrupted. 1930 * The @ctor is run when new pages are allocated by the cache 1931 * and the @dtor is run before the pages are handed back. 1932 * 1933 * @name must be valid until the cache is destroyed. This implies that 1934 * the module calling this has to destroy the cache before getting unloaded. 1935 * 1936 * The flags are 1937 * 1938 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) 1939 * to catch references to uninitialised memory. 1940 * 1941 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check 1942 * for buffer overruns. 1943 * 1944 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware 1945 * cacheline. This can be beneficial if you're counting cycles as closely 1946 * as davem. 1947 */ 1948struct kmem_cache * 1949kmem_cache_create (const char *name, size_t size, size_t align, 1950 unsigned long flags, 1951 void (*ctor)(void*, struct kmem_cache *, unsigned long), 1952 void (*dtor)(void*, struct kmem_cache *, unsigned long)) 1953{ 1954 size_t left_over, slab_size, ralign; 1955 struct kmem_cache *cachep = NULL, *pc; 1956 1957 /* 1958 * Sanity checks... these are all serious usage bugs. 1959 */ 1960 if (!name || in_interrupt() || (size < BYTES_PER_WORD) || 1961 (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { 1962 printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__, 1963 name); 1964 BUG(); 1965 } 1966 1967 /* 1968 * Prevent CPUs from coming and going. 1969 * lock_cpu_hotplug() nests outside cache_chain_mutex 1970 */ 1971 lock_cpu_hotplug(); 1972 1973 mutex_lock(&cache_chain_mutex); 1974 1975 list_for_each_entry(pc, &cache_chain, next) { 1976 mm_segment_t old_fs = get_fs(); 1977 char tmp; 1978 int res; 1979 1980 /* 1981 * This happens when the module gets unloaded and doesn't 1982 * destroy its slab cache and no-one else reuses the vmalloc 1983 * area of the module. Print a warning. 1984 */ 1985 set_fs(KERNEL_DS); 1986 res = __get_user(tmp, pc->name); 1987 set_fs(old_fs); 1988 if (res) { 1989 printk("SLAB: cache with size %d has lost its name\n", 1990 pc->buffer_size); 1991 continue; 1992 } 1993 1994 if (!strcmp(pc->name, name)) { 1995 printk("kmem_cache_create: duplicate cache %s\n", name); 1996 dump_stack(); 1997 goto oops; 1998 } 1999 } 2000 2001#if DEBUG 2002 WARN_ON(strchr(name, ' ')); /* It confuses parsers */ 2003 if ((flags & SLAB_DEBUG_INITIAL) && !ctor) { 2004 /* No constructor, but inital state check requested */ 2005 printk(KERN_ERR "%s: No con, but init state check " 2006 "requested - %s\n", __FUNCTION__, name); 2007 flags &= ~SLAB_DEBUG_INITIAL; 2008 } 2009#if FORCED_DEBUG 2010 /* 2011 * Enable redzoning and last user accounting, except for caches with 2012 * large objects, if the increased size would increase the object size 2013 * above the next power of two: caches with object sizes just above a 2014 * power of two have a significant amount of internal fragmentation. 2015 */ 2016 if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD)) 2017 flags |= SLAB_RED_ZONE | SLAB_STORE_USER; 2018 if (!(flags & SLAB_DESTROY_BY_RCU)) 2019 flags |= SLAB_POISON; 2020#endif 2021 if (flags & SLAB_DESTROY_BY_RCU) 2022 BUG_ON(flags & SLAB_POISON); 2023#endif 2024 if (flags & SLAB_DESTROY_BY_RCU) 2025 BUG_ON(dtor); 2026 2027 /* 2028 * Always checks flags, a caller might be expecting debug support which 2029 * isn't available. 
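 *
 * Purely as an illustration (the cache name and object type here are
 * hypothetical, not part of this file), a call that passes this check
 * could look like:
 *
 *	foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
 *				      0, SLAB_HWCACHE_ALIGN, NULL, NULL);
 *
 * while any flag bit outside CREATE_MASK trips the BUG_ON() below.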
2030 */ 2031 BUG_ON(flags & ~CREATE_MASK); 2032 2033 /* 2034 * Check that size is in terms of words. This is needed to avoid 2035 * unaligned accesses for some archs when redzoning is used, and makes 2036 * sure any on-slab bufctl's are also correctly aligned. 2037 */ 2038 if (size & (BYTES_PER_WORD - 1)) { 2039 size += (BYTES_PER_WORD - 1); 2040 size &= ~(BYTES_PER_WORD - 1); 2041 } 2042 2043 /* calculate the final buffer alignment: */ 2044 2045 /* 1) arch recommendation: can be overridden for debug */ 2046 if (flags & SLAB_HWCACHE_ALIGN) { 2047 /* 2048 * Default alignment: as specified by the arch code. Except if 2049 * an object is really small, then squeeze multiple objects into 2050 * one cacheline. 2051 */ 2052 ralign = cache_line_size(); 2053 while (size <= ralign / 2) 2054 ralign /= 2; 2055 } else { 2056 ralign = BYTES_PER_WORD; 2057 } 2058 /* 2) arch mandated alignment: disables debug if necessary */ 2059 if (ralign < ARCH_SLAB_MINALIGN) { 2060 ralign = ARCH_SLAB_MINALIGN; 2061 if (ralign > BYTES_PER_WORD) 2062 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2063 } 2064 /* 3) caller mandated alignment: disables debug if necessary */ 2065 if (ralign < align) { 2066 ralign = align; 2067 if (ralign > BYTES_PER_WORD) 2068 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2069 } 2070 /* 2071 * 4) Store it. Note that the debug code below can reduce 2072 * the alignment to BYTES_PER_WORD. 2073 */ 2074 align = ralign; 2075 2076 /* Get cache's description obj. */ 2077 cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL); 2078 if (!cachep) 2079 goto oops; 2080 2081#if DEBUG 2082 cachep->obj_size = size; 2083 2084 if (flags & SLAB_RED_ZONE) { 2085 /* redzoning only works with word aligned caches */ 2086 align = BYTES_PER_WORD; 2087 2088 /* add space for red zone words */ 2089 cachep->obj_offset += BYTES_PER_WORD; 2090 size += 2 * BYTES_PER_WORD; 2091 } 2092 if (flags & SLAB_STORE_USER) { 2093 /* user store requires word alignment and 2094 * one word storage behind the end of the real 2095 * object. 2096 */ 2097 align = BYTES_PER_WORD; 2098 size += BYTES_PER_WORD; 2099 } 2100#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 2101 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size 2102 && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) { 2103 cachep->obj_offset += PAGE_SIZE - size; 2104 size = PAGE_SIZE; 2105 } 2106#endif 2107#endif 2108 2109 /* Determine if the slab management is 'on' or 'off' slab. */ 2110 if (size >= (PAGE_SIZE >> 3)) 2111 /* 2112 * Size is large, assume best to place the slab management obj 2113 * off-slab (should allow better packing of objs). 2114 */ 2115 flags |= CFLGS_OFF_SLAB; 2116 2117 size = ALIGN(size, align); 2118 2119 left_over = calculate_slab_order(cachep, size, align, flags); 2120 2121 if (!cachep->num) { 2122 printk("kmem_cache_create: couldn't create cache %s.\n", name); 2123 kmem_cache_free(&cache_cache, cachep); 2124 cachep = NULL; 2125 goto oops; 2126 } 2127 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) 2128 + sizeof(struct slab), align); 2129 2130 /* 2131 * If the slab has been placed off-slab, and we have enough space then 2132 * move it on-slab. This is at the expense of any extra colouring. 2133 */ 2134 if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) { 2135 flags &= ~CFLGS_OFF_SLAB; 2136 left_over -= slab_size; 2137 } 2138 2139 if (flags & CFLGS_OFF_SLAB) { 2140 /* really off slab. 
No need for manual alignment */ 2141 slab_size = 2142 cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); 2143 } 2144 2145 cachep->colour_off = cache_line_size(); 2146 /* Offset must be a multiple of the alignment. */ 2147 if (cachep->colour_off < align) 2148 cachep->colour_off = align; 2149 cachep->colour = left_over / cachep->colour_off; 2150 cachep->slab_size = slab_size; 2151 cachep->flags = flags; 2152 cachep->gfpflags = 0; 2153 if (flags & SLAB_CACHE_DMA) 2154 cachep->gfpflags |= GFP_DMA; 2155 cachep->buffer_size = size; 2156 2157 if (flags & CFLGS_OFF_SLAB) 2158 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); 2159 cachep->ctor = ctor; 2160 cachep->dtor = dtor; 2161 cachep->name = name; 2162 2163 2164 setup_cpu_cache(cachep); 2165 2166 /* cache setup completed, link it into the list */ 2167 list_add(&cachep->next, &cache_chain); 2168oops: 2169 if (!cachep && (flags & SLAB_PANIC)) 2170 panic("kmem_cache_create(): failed to create slab `%s'\n", 2171 name); 2172 mutex_unlock(&cache_chain_mutex); 2173 unlock_cpu_hotplug(); 2174 return cachep; 2175} 2176EXPORT_SYMBOL(kmem_cache_create); 2177 2178#if DEBUG 2179static void check_irq_off(void) 2180{ 2181 BUG_ON(!irqs_disabled()); 2182} 2183 2184static void check_irq_on(void) 2185{ 2186 BUG_ON(irqs_disabled()); 2187} 2188 2189static void check_spinlock_acquired(struct kmem_cache *cachep) 2190{ 2191#ifdef CONFIG_SMP 2192 check_irq_off(); 2193 assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock); 2194#endif 2195} 2196 2197static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) 2198{ 2199#ifdef CONFIG_SMP 2200 check_irq_off(); 2201 assert_spin_locked(&cachep->nodelists[node]->list_lock); 2202#endif 2203} 2204 2205#else 2206#define check_irq_off() do { } while(0) 2207#define check_irq_on() do { } while(0) 2208#define check_spinlock_acquired(x) do { } while(0) 2209#define check_spinlock_acquired_node(x, y) do { } while(0) 2210#endif 2211 2212static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, 2213 struct array_cache *ac, 2214 int force, int node); 2215 2216static void do_drain(void *arg) 2217{ 2218 struct kmem_cache *cachep = arg; 2219 struct array_cache *ac; 2220 int node = numa_node_id(); 2221 2222 check_irq_off(); 2223 ac = cpu_cache_get(cachep); 2224 spin_lock(&cachep->nodelists[node]->list_lock); 2225 free_block(cachep, ac->entry, ac->avail, node); 2226 spin_unlock(&cachep->nodelists[node]->list_lock); 2227 ac->avail = 0; 2228} 2229 2230static void drain_cpu_caches(struct kmem_cache *cachep) 2231{ 2232 struct kmem_list3 *l3; 2233 int node; 2234 2235 on_each_cpu(do_drain, cachep, 1, 1); 2236 check_irq_on(); 2237 for_each_online_node(node) { 2238 l3 = cachep->nodelists[node]; 2239 if (l3 && l3->alien) 2240 drain_alien_cache(cachep, l3->alien); 2241 } 2242 2243 for_each_online_node(node) { 2244 l3 = cachep->nodelists[node]; 2245 if (l3) 2246 drain_array(cachep, l3, l3->shared, 1, node); 2247 } 2248} 2249 2250static int __node_shrink(struct kmem_cache *cachep, int node) 2251{ 2252 struct slab *slabp; 2253 struct kmem_list3 *l3 = cachep->nodelists[node]; 2254 int ret; 2255 2256 for (;;) { 2257 struct list_head *p; 2258 2259 p = l3->slabs_free.prev; 2260 if (p == &l3->slabs_free) 2261 break; 2262 2263 slabp = list_entry(l3->slabs_free.prev, struct slab, list); 2264#if DEBUG 2265 BUG_ON(slabp->inuse); 2266#endif 2267 list_del(&slabp->list); 2268 2269 l3->free_objects -= cachep->num; 2270 spin_unlock_irq(&l3->list_lock); 2271 slab_destroy(cachep, slabp); 2272 
spin_lock_irq(&l3->list_lock); 2273 } 2274 ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial); 2275 return ret; 2276} 2277 2278static int __cache_shrink(struct kmem_cache *cachep) 2279{ 2280 int ret = 0, i = 0; 2281 struct kmem_list3 *l3; 2282 2283 drain_cpu_caches(cachep); 2284 2285 check_irq_on(); 2286 for_each_online_node(i) { 2287 l3 = cachep->nodelists[i]; 2288 if (l3) { 2289 spin_lock_irq(&l3->list_lock); 2290 ret += __node_shrink(cachep, i); 2291 spin_unlock_irq(&l3->list_lock); 2292 } 2293 } 2294 return (ret ? 1 : 0); 2295} 2296 2297/** 2298 * kmem_cache_shrink - Shrink a cache. 2299 * @cachep: The cache to shrink. 2300 * 2301 * Releases as many slabs as possible for a cache. 2302 * To help debugging, a zero exit status indicates all slabs were released. 2303 */ 2304int kmem_cache_shrink(struct kmem_cache *cachep) 2305{ 2306 BUG_ON(!cachep || in_interrupt()); 2307 2308 return __cache_shrink(cachep); 2309} 2310EXPORT_SYMBOL(kmem_cache_shrink); 2311 2312/** 2313 * kmem_cache_destroy - delete a cache 2314 * @cachep: the cache to destroy 2315 * 2316 * Remove a struct kmem_cache object from the slab cache. 2317 * Returns 0 on success. 2318 * 2319 * It is expected this function will be called by a module when it is 2320 * unloaded. This will remove the cache completely, and avoid a duplicate 2321 * cache being allocated each time a module is loaded and unloaded, if the 2322 * module doesn't have persistent in-kernel storage across loads and unloads. 2323 * 2324 * The cache must be empty before calling this function. 2325 * 2326 * The caller must guarantee that noone will allocate memory from the cache 2327 * during the kmem_cache_destroy(). 2328 */ 2329int kmem_cache_destroy(struct kmem_cache *cachep) 2330{ 2331 int i; 2332 struct kmem_list3 *l3; 2333 2334 BUG_ON(!cachep || in_interrupt()); 2335 2336 /* Don't let CPUs to come and go */ 2337 lock_cpu_hotplug(); 2338 2339 /* Find the cache in the chain of caches. */ 2340 mutex_lock(&cache_chain_mutex); 2341 /* 2342 * the chain is never empty, cache_cache is never destroyed 2343 */ 2344 list_del(&cachep->next); 2345 mutex_unlock(&cache_chain_mutex); 2346 2347 if (__cache_shrink(cachep)) { 2348 slab_error(cachep, "Can't free all objects"); 2349 mutex_lock(&cache_chain_mutex); 2350 list_add(&cachep->next, &cache_chain); 2351 mutex_unlock(&cache_chain_mutex); 2352 unlock_cpu_hotplug(); 2353 return 1; 2354 } 2355 2356 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 2357 synchronize_rcu(); 2358 2359 for_each_online_cpu(i) 2360 kfree(cachep->array[i]); 2361 2362 /* NUMA: free the list3 structures */ 2363 for_each_online_node(i) { 2364 l3 = cachep->nodelists[i]; 2365 if (l3) { 2366 kfree(l3->shared); 2367 free_alien_cache(l3->alien); 2368 kfree(l3); 2369 } 2370 } 2371 kmem_cache_free(&cache_cache, cachep); 2372 unlock_cpu_hotplug(); 2373 return 0; 2374} 2375EXPORT_SYMBOL(kmem_cache_destroy); 2376 2377/* Get the memory for a slab management obj. */ 2378static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, 2379 int colour_off, gfp_t local_flags, 2380 int nodeid) 2381{ 2382 struct slab *slabp; 2383 2384 if (OFF_SLAB(cachep)) { 2385 /* Slab management obj is off-slab. 
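		 * In that case the struct slab and its kmem_bufctl_t array
		 * come from cachep->slabp_cache and only the objects live in
		 * the slab's own pages.  For the on-slab case the page layout
		 * is roughly (sketch, ignoring any debugging padding):
		 *
		 *	| colour offset | struct slab | kmem_bufctl_t[num] | objects ... |
		 *
		 * which is why slab_bufctl() below is simply
		 * (kmem_bufctl_t *)(slabp + 1).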
*/ 2386 slabp = kmem_cache_alloc_node(cachep->slabp_cache, 2387 local_flags, nodeid); 2388 if (!slabp) 2389 return NULL; 2390 } else { 2391 slabp = objp + colour_off; 2392 colour_off += cachep->slab_size; 2393 } 2394 slabp->inuse = 0; 2395 slabp->colouroff = colour_off; 2396 slabp->s_mem = objp + colour_off; 2397 slabp->nodeid = nodeid; 2398 return slabp; 2399} 2400 2401static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) 2402{ 2403 return (kmem_bufctl_t *) (slabp + 1); 2404} 2405 2406static void cache_init_objs(struct kmem_cache *cachep, 2407 struct slab *slabp, unsigned long ctor_flags) 2408{ 2409 int i; 2410 2411 for (i = 0; i < cachep->num; i++) { 2412 void *objp = index_to_obj(cachep, slabp, i); 2413#if DEBUG 2414 /* need to poison the objs? */ 2415 if (cachep->flags & SLAB_POISON) 2416 poison_obj(cachep, objp, POISON_FREE); 2417 if (cachep->flags & SLAB_STORE_USER) 2418 *dbg_userword(cachep, objp) = NULL; 2419 2420 if (cachep->flags & SLAB_RED_ZONE) { 2421 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2422 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2423 } 2424 /* 2425 * Constructors are not allowed to allocate memory from the same 2426 * cache which they are a constructor for. Otherwise, deadlock. 2427 * They must also be threaded. 2428 */ 2429 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2430 cachep->ctor(objp + obj_offset(cachep), cachep, 2431 ctor_flags); 2432 2433 if (cachep->flags & SLAB_RED_ZONE) { 2434 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2435 slab_error(cachep, "constructor overwrote the" 2436 " end of an object"); 2437 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 2438 slab_error(cachep, "constructor overwrote the" 2439 " start of an object"); 2440 } 2441 if ((cachep->buffer_size % PAGE_SIZE) == 0 && 2442 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2443 kernel_map_pages(virt_to_page(objp), 2444 cachep->buffer_size / PAGE_SIZE, 0); 2445#else 2446 if (cachep->ctor) 2447 cachep->ctor(objp, cachep, ctor_flags); 2448#endif 2449 slab_bufctl(slabp)[i] = i + 1; 2450 } 2451 slab_bufctl(slabp)[i - 1] = BUFCTL_END; 2452 slabp->free = 0; 2453} 2454 2455static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) 2456{ 2457 if (flags & SLAB_DMA) 2458 BUG_ON(!(cachep->gfpflags & GFP_DMA)); 2459 else 2460 BUG_ON(cachep->gfpflags & GFP_DMA); 2461} 2462 2463static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, 2464 int nodeid) 2465{ 2466 void *objp = index_to_obj(cachep, slabp, slabp->free); 2467 kmem_bufctl_t next; 2468 2469 slabp->inuse++; 2470 next = slab_bufctl(slabp)[slabp->free]; 2471#if DEBUG 2472 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2473 WARN_ON(slabp->nodeid != nodeid); 2474#endif 2475 slabp->free = next; 2476 2477 return objp; 2478} 2479 2480static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, 2481 void *objp, int nodeid) 2482{ 2483 unsigned int objnr = obj_to_index(cachep, slabp, objp); 2484 2485#if DEBUG 2486 /* Verify that the slab belongs to the intended node */ 2487 WARN_ON(slabp->nodeid != nodeid); 2488 2489 if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) { 2490 printk(KERN_ERR "slab: double free detected in cache " 2491 "'%s', objp %p\n", cachep->name, objp); 2492 BUG(); 2493 } 2494#endif 2495 slab_bufctl(slabp)[objnr] = slabp->free; 2496 slabp->free = objnr; 2497 slabp->inuse--; 2498} 2499 2500/* 2501 * Map pages beginning at addr to the given cache and slab. 
This is required 2502 * for the slab allocator to be able to lookup the cache and slab of a 2503 * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging. 2504 */ 2505static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, 2506 void *addr) 2507{ 2508 int nr_pages; 2509 struct page *page; 2510 2511 page = virt_to_page(addr); 2512 2513 nr_pages = 1; 2514 if (likely(!PageCompound(page))) 2515 nr_pages <<= cache->gfporder; 2516 2517 do { 2518 page_set_cache(page, cache); 2519 page_set_slab(page, slab); 2520 page++; 2521 } while (--nr_pages); 2522} 2523 2524/* 2525 * Grow (by 1) the number of slabs within a cache. This is called by 2526 * kmem_cache_alloc() when there are no active objs left in a cache. 2527 */ 2528static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) 2529{ 2530 struct slab *slabp; 2531 void *objp; 2532 size_t offset; 2533 gfp_t local_flags; 2534 unsigned long ctor_flags; 2535 struct kmem_list3 *l3; 2536 2537 /* 2538 * Be lazy and only check for valid flags here, keeping it out of the 2539 * critical path in kmem_cache_alloc(). 2540 */ 2541 BUG_ON(flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)); 2542 if (flags & SLAB_NO_GROW) 2543 return 0; 2544 2545 ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2546 local_flags = (flags & SLAB_LEVEL_MASK); 2547 if (!(local_flags & __GFP_WAIT)) 2548 /* 2549 * Not allowed to sleep. Need to tell a constructor about 2550 * this - it might need to know... 2551 */ 2552 ctor_flags |= SLAB_CTOR_ATOMIC; 2553 2554 /* Take the l3 list lock to change the colour_next on this node */ 2555 check_irq_off(); 2556 l3 = cachep->nodelists[nodeid]; 2557 spin_lock(&l3->list_lock); 2558 2559 /* Get colour for the slab, and cal the next value. */ 2560 offset = l3->colour_next; 2561 l3->colour_next++; 2562 if (l3->colour_next >= cachep->colour) 2563 l3->colour_next = 0; 2564 spin_unlock(&l3->list_lock); 2565 2566 offset *= cachep->colour_off; 2567 2568 if (local_flags & __GFP_WAIT) 2569 local_irq_enable(); 2570 2571 /* 2572 * The test for missing atomic flag is performed here, rather than 2573 * the more obvious place, simply to reduce the critical path length 2574 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they 2575 * will eventually be caught here (where it matters). 2576 */ 2577 kmem_flagcheck(cachep, flags); 2578 2579 /* 2580 * Get mem for the objs. Attempt to allocate a physical page from 2581 * 'nodeid'. 2582 */ 2583 objp = kmem_getpages(cachep, flags, nodeid); 2584 if (!objp) 2585 goto failed; 2586 2587 /* Get slab management. */ 2588 slabp = alloc_slabmgmt(cachep, objp, offset, local_flags, nodeid); 2589 if (!slabp) 2590 goto opps1; 2591 2592 slabp->nodeid = nodeid; 2593 slab_map_pages(cachep, slabp, objp); 2594 2595 cache_init_objs(cachep, slabp, ctor_flags); 2596 2597 if (local_flags & __GFP_WAIT) 2598 local_irq_disable(); 2599 check_irq_off(); 2600 spin_lock(&l3->list_lock); 2601 2602 /* Make slab active. */ 2603 list_add_tail(&slabp->list, &(l3->slabs_free)); 2604 STATS_INC_GROWN(cachep); 2605 l3->free_objects += cachep->num; 2606 spin_unlock(&l3->list_lock); 2607 return 1; 2608opps1: 2609 kmem_freepages(cachep, objp); 2610failed: 2611 if (local_flags & __GFP_WAIT) 2612 local_irq_disable(); 2613 return 0; 2614} 2615 2616#if DEBUG 2617 2618/* 2619 * Perform extra freeing checks: 2620 * - detect bad pointers. 
2621 * - POISON/RED_ZONE checking 2622 * - destructor calls, for caches with POISON+dtor 2623 */ 2624static void kfree_debugcheck(const void *objp) 2625{ 2626 struct page *page; 2627 2628 if (!virt_addr_valid(objp)) { 2629 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", 2630 (unsigned long)objp); 2631 BUG(); 2632 } 2633 page = virt_to_page(objp); 2634 if (!PageSlab(page)) { 2635 printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", 2636 (unsigned long)objp); 2637 BUG(); 2638 } 2639} 2640 2641static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) 2642{ 2643 unsigned long redzone1, redzone2; 2644 2645 redzone1 = *dbg_redzone1(cache, obj); 2646 redzone2 = *dbg_redzone2(cache, obj); 2647 2648 /* 2649 * Redzone is ok. 2650 */ 2651 if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE) 2652 return; 2653 2654 if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE) 2655 slab_error(cache, "double free detected"); 2656 else 2657 slab_error(cache, "memory outside object was overwritten"); 2658 2659 printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n", 2660 obj, redzone1, redzone2); 2661} 2662 2663static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, 2664 void *caller) 2665{ 2666 struct page *page; 2667 unsigned int objnr; 2668 struct slab *slabp; 2669 2670 objp -= obj_offset(cachep); 2671 kfree_debugcheck(objp); 2672 page = virt_to_page(objp); 2673 2674 slabp = page_get_slab(page); 2675 2676 if (cachep->flags & SLAB_RED_ZONE) { 2677 verify_redzone_free(cachep, objp); 2678 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2679 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2680 } 2681 if (cachep->flags & SLAB_STORE_USER) 2682 *dbg_userword(cachep, objp) = caller; 2683 2684 objnr = obj_to_index(cachep, slabp, objp); 2685 2686 BUG_ON(objnr >= cachep->num); 2687 BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); 2688 2689 if (cachep->flags & SLAB_DEBUG_INITIAL) { 2690 /* 2691 * Need to call the slab's constructor so the caller can 2692 * perform a verify of its state (debugging). Called without 2693 * the cache-lock held. 2694 */ 2695 cachep->ctor(objp + obj_offset(cachep), 2696 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); 2697 } 2698 if (cachep->flags & SLAB_POISON && cachep->dtor) { 2699 /* we want to cache poison the object, 2700 * call the destruction callback 2701 */ 2702 cachep->dtor(objp + obj_offset(cachep), cachep, 0); 2703 } 2704#ifdef CONFIG_DEBUG_SLAB_LEAK 2705 slab_bufctl(slabp)[objnr] = BUFCTL_FREE; 2706#endif 2707 if (cachep->flags & SLAB_POISON) { 2708#ifdef CONFIG_DEBUG_PAGEALLOC 2709 if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { 2710 store_stackinfo(cachep, objp, (unsigned long)caller); 2711 kernel_map_pages(virt_to_page(objp), 2712 cachep->buffer_size / PAGE_SIZE, 0); 2713 } else { 2714 poison_obj(cachep, objp, POISON_FREE); 2715 } 2716#else 2717 poison_obj(cachep, objp, POISON_FREE); 2718#endif 2719 } 2720 return objp; 2721} 2722 2723static void check_slabp(struct kmem_cache *cachep, struct slab *slabp) 2724{ 2725 kmem_bufctl_t i; 2726 int entries = 0; 2727 2728 /* Check slab's freelist to see if this obj is there. */ 2729 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { 2730 entries++; 2731 if (entries > cachep->num || i >= cachep->num) 2732 goto bad; 2733 } 2734 if (entries != cachep->num - slabp->inuse) { 2735bad: 2736 printk(KERN_ERR "slab: Internal list corruption detected in " 2737 "cache '%s'(%d), slabp %p(%d). 
Hexdump:\n", 2738 cachep->name, cachep->num, slabp, slabp->inuse); 2739 for (i = 0; 2740 i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); 2741 i++) { 2742 if (i % 16 == 0) 2743 printk("\n%03x:", i); 2744 printk(" %02x", ((unsigned char *)slabp)[i]); 2745 } 2746 printk("\n"); 2747 BUG(); 2748 } 2749} 2750#else 2751#define kfree_debugcheck(x) do { } while(0) 2752#define cache_free_debugcheck(x,objp,z) (objp) 2753#define check_slabp(x,y) do { } while(0) 2754#endif 2755 2756static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) 2757{ 2758 int batchcount; 2759 struct kmem_list3 *l3; 2760 struct array_cache *ac; 2761 2762 check_irq_off(); 2763 ac = cpu_cache_get(cachep); 2764retry: 2765 batchcount = ac->batchcount; 2766 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2767 /* 2768 * If there was little recent activity on this cache, then 2769 * perform only a partial refill. Otherwise we could generate 2770 * refill bouncing. 2771 */ 2772 batchcount = BATCHREFILL_LIMIT; 2773 } 2774 l3 = cachep->nodelists[numa_node_id()]; 2775 2776 BUG_ON(ac->avail > 0 || !l3); 2777 spin_lock(&l3->list_lock); 2778 2779 /* See if we can refill from the shared array */ 2780 if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) 2781 goto alloc_done; 2782 2783 while (batchcount > 0) { 2784 struct list_head *entry; 2785 struct slab *slabp; 2786 /* Get slab alloc is to come from. */ 2787 entry = l3->slabs_partial.next; 2788 if (entry == &l3->slabs_partial) { 2789 l3->free_touched = 1; 2790 entry = l3->slabs_free.next; 2791 if (entry == &l3->slabs_free) 2792 goto must_grow; 2793 } 2794 2795 slabp = list_entry(entry, struct slab, list); 2796 check_slabp(cachep, slabp); 2797 check_spinlock_acquired(cachep); 2798 while (slabp->inuse < cachep->num && batchcount--) { 2799 STATS_INC_ALLOCED(cachep); 2800 STATS_INC_ACTIVE(cachep); 2801 STATS_SET_HIGH(cachep); 2802 2803 ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, 2804 numa_node_id()); 2805 } 2806 check_slabp(cachep, slabp); 2807 2808 /* move slabp to correct slabp list: */ 2809 list_del(&slabp->list); 2810 if (slabp->free == BUFCTL_END) 2811 list_add(&slabp->list, &l3->slabs_full); 2812 else 2813 list_add(&slabp->list, &l3->slabs_partial); 2814 } 2815 2816must_grow: 2817 l3->free_objects -= ac->avail; 2818alloc_done: 2819 spin_unlock(&l3->list_lock); 2820 2821 if (unlikely(!ac->avail)) { 2822 int x; 2823 x = cache_grow(cachep, flags, numa_node_id()); 2824 2825 /* cache_grow can reenable interrupts, then ac could change. */ 2826 ac = cpu_cache_get(cachep); 2827 if (!x && ac->avail == 0) /* no objects in sight? abort */ 2828 return NULL; 2829 2830 if (!ac->avail) /* objects refilled by interrupt? 
*/ 2831 goto retry; 2832 } 2833 ac->touched = 1; 2834 return ac->entry[--ac->avail]; 2835} 2836 2837static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, 2838 gfp_t flags) 2839{ 2840 might_sleep_if(flags & __GFP_WAIT); 2841#if DEBUG 2842 kmem_flagcheck(cachep, flags); 2843#endif 2844} 2845 2846#if DEBUG 2847static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, 2848 gfp_t flags, void *objp, void *caller) 2849{ 2850 if (!objp) 2851 return objp; 2852 if (cachep->flags & SLAB_POISON) { 2853#ifdef CONFIG_DEBUG_PAGEALLOC 2854 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 2855 kernel_map_pages(virt_to_page(objp), 2856 cachep->buffer_size / PAGE_SIZE, 1); 2857 else 2858 check_poison_obj(cachep, objp); 2859#else 2860 check_poison_obj(cachep, objp); 2861#endif 2862 poison_obj(cachep, objp, POISON_INUSE); 2863 } 2864 if (cachep->flags & SLAB_STORE_USER) 2865 *dbg_userword(cachep, objp) = caller; 2866 2867 if (cachep->flags & SLAB_RED_ZONE) { 2868 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || 2869 *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 2870 slab_error(cachep, "double free, or memory outside" 2871 " object was overwritten"); 2872 printk(KERN_ERR 2873 "%p: redzone 1:0x%lx, redzone 2:0x%lx\n", 2874 objp, *dbg_redzone1(cachep, objp), 2875 *dbg_redzone2(cachep, objp)); 2876 } 2877 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2878 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2879 } 2880#ifdef CONFIG_DEBUG_SLAB_LEAK 2881 { 2882 struct slab *slabp; 2883 unsigned objnr; 2884 2885 slabp = page_get_slab(virt_to_page(objp)); 2886 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; 2887 slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; 2888 } 2889#endif 2890 objp += obj_offset(cachep); 2891 if (cachep->ctor && cachep->flags & SLAB_POISON) { 2892 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2893 2894 if (!(flags & __GFP_WAIT)) 2895 ctor_flags |= SLAB_CTOR_ATOMIC; 2896 2897 cachep->ctor(objp, cachep, ctor_flags); 2898 } 2899 return objp; 2900} 2901#else 2902#define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 2903#endif 2904 2905static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 2906{ 2907 void *objp; 2908 struct array_cache *ac; 2909 2910#ifdef CONFIG_NUMA 2911 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { 2912 objp = alternate_node_alloc(cachep, flags); 2913 if (objp != NULL) 2914 return objp; 2915 } 2916#endif 2917 2918 check_irq_off(); 2919 ac = cpu_cache_get(cachep); 2920 if (likely(ac->avail)) { 2921 STATS_INC_ALLOCHIT(cachep); 2922 ac->touched = 1; 2923 objp = ac->entry[--ac->avail]; 2924 } else { 2925 STATS_INC_ALLOCMISS(cachep); 2926 objp = cache_alloc_refill(cachep, flags); 2927 } 2928 return objp; 2929} 2930 2931static __always_inline void *__cache_alloc(struct kmem_cache *cachep, 2932 gfp_t flags, void *caller) 2933{ 2934 unsigned long save_flags; 2935 void *objp; 2936 2937 cache_alloc_debugcheck_before(cachep, flags); 2938 2939 local_irq_save(save_flags); 2940 objp = ____cache_alloc(cachep, flags); 2941 local_irq_restore(save_flags); 2942 objp = cache_alloc_debugcheck_after(cachep, flags, objp, 2943 caller); 2944 prefetchw(objp); 2945 return objp; 2946} 2947 2948#ifdef CONFIG_NUMA 2949/* 2950 * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY. 2951 * 2952 * If we are in_interrupt, then process context, including cpusets and 2953 * mempolicy, may not apply and should not be used for allocation policy. 
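 *
 * Returning NULL - which also happens when the preferred node turns out
 * to be the local one - tells ____cache_alloc() to fall back to the
 * normal per-cpu array / local-node path.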
2954 */ 2955static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) 2956{ 2957 int nid_alloc, nid_here; 2958 2959 if (in_interrupt()) 2960 return NULL; 2961 nid_alloc = nid_here = numa_node_id(); 2962 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 2963 nid_alloc = cpuset_mem_spread_node(); 2964 else if (current->mempolicy) 2965 nid_alloc = slab_node(current->mempolicy); 2966 if (nid_alloc != nid_here) 2967 return __cache_alloc_node(cachep, flags, nid_alloc); 2968 return NULL; 2969} 2970 2971/* 2972 * A interface to enable slab creation on nodeid 2973 */ 2974static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 2975 int nodeid) 2976{ 2977 struct list_head *entry; 2978 struct slab *slabp; 2979 struct kmem_list3 *l3; 2980 void *obj; 2981 int x; 2982 2983 l3 = cachep->nodelists[nodeid]; 2984 BUG_ON(!l3); 2985 2986retry: 2987 check_irq_off(); 2988 spin_lock(&l3->list_lock); 2989 entry = l3->slabs_partial.next; 2990 if (entry == &l3->slabs_partial) { 2991 l3->free_touched = 1; 2992 entry = l3->slabs_free.next; 2993 if (entry == &l3->slabs_free) 2994 goto must_grow; 2995 } 2996 2997 slabp = list_entry(entry, struct slab, list); 2998 check_spinlock_acquired_node(cachep, nodeid); 2999 check_slabp(cachep, slabp); 3000 3001 STATS_INC_NODEALLOCS(cachep); 3002 STATS_INC_ACTIVE(cachep); 3003 STATS_SET_HIGH(cachep); 3004 3005 BUG_ON(slabp->inuse == cachep->num); 3006 3007 obj = slab_get_obj(cachep, slabp, nodeid); 3008 check_slabp(cachep, slabp); 3009 l3->free_objects--; 3010 /* move slabp to correct slabp list: */ 3011 list_del(&slabp->list); 3012 3013 if (slabp->free == BUFCTL_END) 3014 list_add(&slabp->list, &l3->slabs_full); 3015 else 3016 list_add(&slabp->list, &l3->slabs_partial); 3017 3018 spin_unlock(&l3->list_lock); 3019 goto done; 3020 3021must_grow: 3022 spin_unlock(&l3->list_lock); 3023 x = cache_grow(cachep, flags, nodeid); 3024 3025 if (!x) 3026 return NULL; 3027 3028 goto retry; 3029done: 3030 return obj; 3031} 3032#endif 3033 3034/* 3035 * Caller needs to acquire correct kmem_list's list_lock 3036 */ 3037static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, 3038 int node) 3039{ 3040 int i; 3041 struct kmem_list3 *l3; 3042 3043 for (i = 0; i < nr_objects; i++) { 3044 void *objp = objpp[i]; 3045 struct slab *slabp; 3046 3047 slabp = virt_to_slab(objp); 3048 l3 = cachep->nodelists[node]; 3049 list_del(&slabp->list); 3050 check_spinlock_acquired_node(cachep, node); 3051 check_slabp(cachep, slabp); 3052 slab_put_obj(cachep, slabp, objp, node); 3053 STATS_DEC_ACTIVE(cachep); 3054 l3->free_objects++; 3055 check_slabp(cachep, slabp); 3056 3057 /* fixup slab chains */ 3058 if (slabp->inuse == 0) { 3059 if (l3->free_objects > l3->free_limit) { 3060 l3->free_objects -= cachep->num; 3061 slab_destroy(cachep, slabp); 3062 } else { 3063 list_add(&slabp->list, &l3->slabs_free); 3064 } 3065 } else { 3066 /* Unconditionally move a slab to the end of the 3067 * partial list on free - maximum time for the 3068 * other objects to be freed, too. 
3069			 */
3070			list_add_tail(&slabp->list, &l3->slabs_partial);
3071		}
3072	}
3073}
3074
3075static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3076{
3077	int batchcount;
3078	struct kmem_list3 *l3;
3079	int node = numa_node_id();
3080
3081	batchcount = ac->batchcount;
3082#if DEBUG
3083	BUG_ON(!batchcount || batchcount > ac->avail);
3084#endif
3085	check_irq_off();
3086	l3 = cachep->nodelists[node];
3087	spin_lock(&l3->list_lock);
3088	if (l3->shared) {
3089		struct array_cache *shared_array = l3->shared;
3090		int max = shared_array->limit - shared_array->avail;
3091		if (max) {
3092			if (batchcount > max)
3093				batchcount = max;
3094			memcpy(&(shared_array->entry[shared_array->avail]),
3095			       ac->entry, sizeof(void *) * batchcount);
3096			shared_array->avail += batchcount;
3097			goto free_done;
3098		}
3099	}
3100
3101	free_block(cachep, ac->entry, batchcount, node);
3102free_done:
3103#if STATS
3104	{
3105		int i = 0;
3106		struct list_head *p;
3107
3108		p = l3->slabs_free.next;
3109		while (p != &(l3->slabs_free)) {
3110			struct slab *slabp;
3111
3112			slabp = list_entry(p, struct slab, list);
3113			BUG_ON(slabp->inuse);
3114
3115			i++;
3116			p = p->next;
3117		}
3118		STATS_SET_FREEABLE(cachep, i);
3119	}
3120#endif
3121	spin_unlock(&l3->list_lock);
3122	ac->avail -= batchcount;
3123	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
3124}
3125
3126/*
3127 * Release an obj back to its cache. If the obj has a constructed state, it must
3128 * be in this state _before_ it is released. Called with interrupts disabled.
3129 */
3130static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3131{
3132	struct array_cache *ac = cpu_cache_get(cachep);
3133
3134	check_irq_off();
3135	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3136
3137	if (cache_free_alien(cachep, objp))
3138		return;
3139
3140	if (likely(ac->avail < ac->limit)) {
3141		STATS_INC_FREEHIT(cachep);
3142		ac->entry[ac->avail++] = objp;
3143		return;
3144	} else {
3145		STATS_INC_FREEMISS(cachep);
3146		cache_flusharray(cachep, ac);
3147		ac->entry[ac->avail++] = objp;
3148	}
3149}
3150
3151/**
3152 * kmem_cache_alloc - Allocate an object
3153 * @cachep: The cache to allocate from.
3154 * @flags: See kmalloc().
3155 *
3156 * Allocate an object from this cache. The flags are only relevant
3157 * if the cache has no available objects.
3158 */
3159void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3160{
3161	return __cache_alloc(cachep, flags, __builtin_return_address(0));
3162}
3163EXPORT_SYMBOL(kmem_cache_alloc);
3164
3165/**
3166 * kmem_cache_zalloc - Allocate an object. The memory is set to zero.
3167 * @cache: The cache to allocate from.
3168 * @flags: See kmalloc().
3169 *
3170 * Allocate an object from this cache and set the allocated memory to zero.
3171 * The flags are only relevant if the cache has no available objects.
3172 */
3173void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags)
3174{
3175	void *ret = __cache_alloc(cache, flags, __builtin_return_address(0));
3176	if (ret)
3177		memset(ret, 0, obj_size(cache));
3178	return ret;
3179}
3180EXPORT_SYMBOL(kmem_cache_zalloc);
3181
3182/**
3183 * kmem_ptr_validate - check if an untrusted pointer might
3184 * be a slab entry.
3185 * @cachep: the cache we're checking against 3186 * @ptr: pointer to validate 3187 * 3188 * This verifies that the untrusted pointer looks sane: 3189 * it is _not_ a guarantee that the pointer is actually 3190 * part of the slab cache in question, but it at least 3191 * validates that the pointer can be dereferenced and 3192 * looks half-way sane. 3193 * 3194 * Currently only used for dentry validation. 3195 */ 3196int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr) 3197{ 3198 unsigned long addr = (unsigned long)ptr; 3199 unsigned long min_addr = PAGE_OFFSET; 3200 unsigned long align_mask = BYTES_PER_WORD - 1; 3201 unsigned long size = cachep->buffer_size; 3202 struct page *page; 3203 3204 if (unlikely(addr < min_addr)) 3205 goto out; 3206 if (unlikely(addr > (unsigned long)high_memory - size)) 3207 goto out; 3208 if (unlikely(addr & align_mask)) 3209 goto out; 3210 if (unlikely(!kern_addr_valid(addr))) 3211 goto out; 3212 if (unlikely(!kern_addr_valid(addr + size - 1))) 3213 goto out; 3214 page = virt_to_page(ptr); 3215 if (unlikely(!PageSlab(page))) 3216 goto out; 3217 if (unlikely(page_get_cache(page) != cachep)) 3218 goto out; 3219 return 1; 3220out: 3221 return 0; 3222} 3223 3224#ifdef CONFIG_NUMA 3225/** 3226 * kmem_cache_alloc_node - Allocate an object on the specified node 3227 * @cachep: The cache to allocate from. 3228 * @flags: See kmalloc(). 3229 * @nodeid: node number of the target node. 3230 * 3231 * Identical to kmem_cache_alloc, except that this function is slow 3232 * and can sleep. And it will allocate memory on the given node, which 3233 * can improve the performance for cpu bound structures. 3234 * New and improved: it will now make sure that the object gets 3235 * put on the correct node list so that there is no false sharing. 3236 */ 3237void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3238{ 3239 unsigned long save_flags; 3240 void *ptr; 3241 3242 cache_alloc_debugcheck_before(cachep, flags); 3243 local_irq_save(save_flags); 3244 3245 if (nodeid == -1 || nodeid == numa_node_id() || 3246 !cachep->nodelists[nodeid]) 3247 ptr = ____cache_alloc(cachep, flags); 3248 else 3249 ptr = __cache_alloc_node(cachep, flags, nodeid); 3250 local_irq_restore(save_flags); 3251 3252 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, 3253 __builtin_return_address(0)); 3254 3255 return ptr; 3256} 3257EXPORT_SYMBOL(kmem_cache_alloc_node); 3258 3259void *kmalloc_node(size_t size, gfp_t flags, int node) 3260{ 3261 struct kmem_cache *cachep; 3262 3263 cachep = kmem_find_general_cachep(size, flags); 3264 if (unlikely(cachep == NULL)) 3265 return NULL; 3266 return kmem_cache_alloc_node(cachep, flags, node); 3267} 3268EXPORT_SYMBOL(kmalloc_node); 3269#endif 3270 3271/** 3272 * kmalloc - allocate memory 3273 * @size: how many bytes of memory are required. 3274 * @flags: the type of memory to allocate. 3275 * @caller: function caller for debug tracking of the caller 3276 * 3277 * kmalloc is the normal method of allocating memory 3278 * in the kernel. 3279 * 3280 * The @flags argument may be one of: 3281 * 3282 * %GFP_USER - Allocate memory on behalf of user. May sleep. 3283 * 3284 * %GFP_KERNEL - Allocate normal kernel ram. May sleep. 3285 * 3286 * %GFP_ATOMIC - Allocation will not sleep. Use inside interrupt handlers. 3287 * 3288 * Additionally, the %GFP_DMA flag may be set to indicate the memory 3289 * must be suitable for DMA. This can mean different things on different 3290 * platforms. 
For example, on i386, it means that the memory must come 3291 * from the first 16MB. 3292 */ 3293static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, 3294 void *caller) 3295{ 3296 struct kmem_cache *cachep; 3297 3298 /* If you want to save a few bytes .text space: replace 3299 * __ with kmem_. 3300 * Then kmalloc uses the uninlined functions instead of the inline 3301 * functions. 3302 */ 3303 cachep = __find_general_cachep(size, flags); 3304 if (unlikely(cachep == NULL)) 3305 return NULL; 3306 return __cache_alloc(cachep, flags, caller); 3307} 3308 3309 3310void *__kmalloc(size_t size, gfp_t flags) 3311{ 3312#ifndef CONFIG_DEBUG_SLAB 3313 return __do_kmalloc(size, flags, NULL); 3314#else 3315 return __do_kmalloc(size, flags, __builtin_return_address(0)); 3316#endif 3317} 3318EXPORT_SYMBOL(__kmalloc); 3319 3320#ifdef CONFIG_DEBUG_SLAB 3321void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller) 3322{ 3323 return __do_kmalloc(size, flags, caller); 3324} 3325EXPORT_SYMBOL(__kmalloc_track_caller); 3326#endif 3327 3328#ifdef CONFIG_SMP 3329/** 3330 * __alloc_percpu - allocate one copy of the object for every present 3331 * cpu in the system, zeroing them. 3332 * Objects should be dereferenced using the per_cpu_ptr macro only. 3333 * 3334 * @size: how many bytes of memory are required. 3335 */ 3336void *__alloc_percpu(size_t size) 3337{ 3338 int i; 3339 struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL); 3340 3341 if (!pdata) 3342 return NULL; 3343 3344 /* 3345 * Cannot use for_each_online_cpu since a cpu may come online 3346 * and we have no way of figuring out how to fix the array 3347 * that we have allocated then.... 3348 */ 3349 for_each_possible_cpu(i) { 3350 int node = cpu_to_node(i); 3351 3352 if (node_online(node)) 3353 pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node); 3354 else 3355 pdata->ptrs[i] = kmalloc(size, GFP_KERNEL); 3356 3357 if (!pdata->ptrs[i]) 3358 goto unwind_oom; 3359 memset(pdata->ptrs[i], 0, size); 3360 } 3361 3362 /* Catch derefs w/o wrappers */ 3363 return (void *)(~(unsigned long)pdata); 3364 3365unwind_oom: 3366 while (--i >= 0) { 3367 if (!cpu_possible(i)) 3368 continue; 3369 kfree(pdata->ptrs[i]); 3370 } 3371 kfree(pdata); 3372 return NULL; 3373} 3374EXPORT_SYMBOL(__alloc_percpu); 3375#endif 3376 3377/** 3378 * kmem_cache_free - Deallocate an object 3379 * @cachep: The cache the allocation was from. 3380 * @objp: The previously allocated object. 3381 * 3382 * Free an object which was previously allocated from this 3383 * cache. 3384 */ 3385void kmem_cache_free(struct kmem_cache *cachep, void *objp) 3386{ 3387 unsigned long flags; 3388 3389 BUG_ON(virt_to_cache(objp) != cachep); 3390 3391 local_irq_save(flags); 3392 __cache_free(cachep, objp); 3393 local_irq_restore(flags); 3394} 3395EXPORT_SYMBOL(kmem_cache_free); 3396 3397/** 3398 * kfree - free previously allocated memory 3399 * @objp: pointer returned by kmalloc. 3400 * 3401 * If @objp is NULL, no operation is performed. 3402 * 3403 * Don't free memory not originally allocated by kmalloc() 3404 * or you will run into trouble. 
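 *
 * A minimal usage sketch (illustrative only; the buffer and its size are
 * hypothetical):
 *
 *	char *buf = kmalloc(64, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOMEM;
 *	...
 *	kfree(buf);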
3405 */
3406void kfree(const void *objp)
3407{
3408	struct kmem_cache *c;
3409	unsigned long flags;
3410
3411	if (unlikely(!objp))
3412		return;
3413	local_irq_save(flags);
3414	kfree_debugcheck(objp);
3415	c = virt_to_cache(objp);
3416	mutex_debug_check_no_locks_freed(objp, obj_size(c));
3417	__cache_free(c, (void *)objp);
3418	local_irq_restore(flags);
3419}
3420EXPORT_SYMBOL(kfree);
3421
3422#ifdef CONFIG_SMP
3423/**
3424 * free_percpu - free previously allocated percpu memory
3425 * @objp: pointer returned by alloc_percpu.
3426 *
3427 * Don't free memory not originally allocated by alloc_percpu().
3428 * The complemented objp is used to check for that.
3429 */
3430void free_percpu(const void *objp)
3431{
3432	int i;
3433	struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp);
3434
3435	/*
3436	 * We allocate for all cpus, so we cannot iterate over only the online cpus here.
3437	 */
3438	for_each_possible_cpu(i)
3439		kfree(p->ptrs[i]);
3440	kfree(p);
3441}
3442EXPORT_SYMBOL(free_percpu);
3443#endif
3444
3445unsigned int kmem_cache_size(struct kmem_cache *cachep)
3446{
3447	return obj_size(cachep);
3448}
3449EXPORT_SYMBOL(kmem_cache_size);
3450
3451const char *kmem_cache_name(struct kmem_cache *cachep)
3452{
3453	return cachep->name;
3454}
3455EXPORT_SYMBOL_GPL(kmem_cache_name);
3456
3457/*
3458 * This initializes kmem_list3 or resizes various caches for all nodes.
3459 */
3460static int alloc_kmemlist(struct kmem_cache *cachep)
3461{
3462	int node;
3463	struct kmem_list3 *l3;
3464	struct array_cache *new_shared;
3465	struct array_cache **new_alien;
3466
3467	for_each_online_node(node) {
3468
3469		new_alien = alloc_alien_cache(node, cachep->limit);
3470		if (!new_alien)
3471			goto fail;
3472
3473		new_shared = alloc_arraycache(node,
3474				cachep->shared*cachep->batchcount,
3475				0xbaadf00d);
3476		if (!new_shared) {
3477			free_alien_cache(new_alien);
3478			goto fail;
3479		}
3480
3481		l3 = cachep->nodelists[node];
3482		if (l3) {
3483			struct array_cache *shared = l3->shared;
3484
3485			spin_lock_irq(&l3->list_lock);
3486
3487			if (shared)
3488				free_block(cachep, shared->entry,
3489						shared->avail, node);
3490
3491			l3->shared = new_shared;
3492			if (!l3->alien) {
3493				l3->alien = new_alien;
3494				new_alien = NULL;
3495			}
3496			l3->free_limit = (1 + nr_cpus_node(node)) *
3497					cachep->batchcount + cachep->num;
3498			spin_unlock_irq(&l3->list_lock);
3499			kfree(shared);
3500			free_alien_cache(new_alien);
3501			continue;
3502		}
3503		l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
3504		if (!l3) {
3505			free_alien_cache(new_alien);
3506			kfree(new_shared);
3507			goto fail;
3508		}
3509
3510		kmem_list3_init(l3);
3511		l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3512				((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3513		l3->shared = new_shared;
3514		l3->alien = new_alien;
3515		l3->free_limit = (1 + nr_cpus_node(node)) *
3516				cachep->batchcount + cachep->num;
3517		cachep->nodelists[node] = l3;
3518	}
3519	return 0;
3520
3521fail:
3522	if (!cachep->next.next) {
3523		/* Cache is not active yet.
Roll back what we did */ 3524 node--; 3525 while (node >= 0) { 3526 if (cachep->nodelists[node]) { 3527 l3 = cachep->nodelists[node]; 3528 3529 kfree(l3->shared); 3530 free_alien_cache(l3->alien); 3531 kfree(l3); 3532 cachep->nodelists[node] = NULL; 3533 } 3534 node--; 3535 } 3536 } 3537 return -ENOMEM; 3538} 3539 3540struct ccupdate_struct { 3541 struct kmem_cache *cachep; 3542 struct array_cache *new[NR_CPUS]; 3543}; 3544 3545static void do_ccupdate_local(void *info) 3546{ 3547 struct ccupdate_struct *new = info; 3548 struct array_cache *old; 3549 3550 check_irq_off(); 3551 old = cpu_cache_get(new->cachep); 3552 3553 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; 3554 new->new[smp_processor_id()] = old; 3555} 3556 3557/* Always called with the cache_chain_mutex held */ 3558static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 3559 int batchcount, int shared) 3560{ 3561 struct ccupdate_struct new; 3562 int i, err; 3563 3564 memset(&new.new, 0, sizeof(new.new)); 3565 for_each_online_cpu(i) { 3566 new.new[i] = alloc_arraycache(cpu_to_node(i), limit, 3567 batchcount); 3568 if (!new.new[i]) { 3569 for (i--; i >= 0; i--) 3570 kfree(new.new[i]); 3571 return -ENOMEM; 3572 } 3573 } 3574 new.cachep = cachep; 3575 3576 on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1); 3577 3578 check_irq_on(); 3579 cachep->batchcount = batchcount; 3580 cachep->limit = limit; 3581 cachep->shared = shared; 3582 3583 for_each_online_cpu(i) { 3584 struct array_cache *ccold = new.new[i]; 3585 if (!ccold) 3586 continue; 3587 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3588 free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i)); 3589 spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3590 kfree(ccold); 3591 } 3592 3593 err = alloc_kmemlist(cachep); 3594 if (err) { 3595 printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n", 3596 cachep->name, -err); 3597 BUG(); 3598 } 3599 return 0; 3600} 3601 3602/* Called with cache_chain_mutex held always */ 3603static void enable_cpucache(struct kmem_cache *cachep) 3604{ 3605 int err; 3606 int limit, shared; 3607 3608 /* 3609 * The head array serves three purposes: 3610 * - create a LIFO ordering, i.e. return objects that are cache-warm 3611 * - reduce the number of spinlock operations. 3612 * - reduce the number of linked list operations on the slab and 3613 * bufctl chains: array operations are cheaper. 3614 * The numbers are guessed, we should auto-tune as described by 3615 * Bonwick. 3616 */ 3617 if (cachep->buffer_size > 131072) 3618 limit = 1; 3619 else if (cachep->buffer_size > PAGE_SIZE) 3620 limit = 8; 3621 else if (cachep->buffer_size > 1024) 3622 limit = 24; 3623 else if (cachep->buffer_size > 256) 3624 limit = 54; 3625 else 3626 limit = 120; 3627 3628 /* 3629 * CPU bound tasks (e.g. network routing) can exhibit cpu bound 3630 * allocation behaviour: Most allocs on one cpu, most free operations 3631 * on another cpu. For these cases, an efficient object passing between 3632 * cpus is necessary. This is provided by a shared array. The array 3633 * replaces Bonwick's magazine layer. 3634 * On uniprocessor, it's functionally equivalent (but less efficient) 3635 * to a larger limit. Thus disabled by default. 3636 */ 3637 shared = 0; 3638#ifdef CONFIG_SMP 3639 if (cachep->buffer_size <= PAGE_SIZE) 3640 shared = 8; 3641#endif 3642 3643#if DEBUG 3644 /* 3645 * With debugging enabled, large batchcount lead to excessively long 3646 * periods with disabled local interrupts. 
Limit the batchcount 3647 */ 3648 if (limit > 32) 3649 limit = 32; 3650#endif 3651 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared); 3652 if (err) 3653 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 3654 cachep->name, -err); 3655} 3656 3657/* 3658 * Drain an array if it contains any elements taking the l3 lock only if 3659 * necessary. Note that the l3 listlock also protects the array_cache 3660 * if drain_array() is used on the shared array. 3661 */ 3662void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, 3663 struct array_cache *ac, int force, int node) 3664{ 3665 int tofree; 3666 3667 if (!ac || !ac->avail) 3668 return; 3669 if (ac->touched && !force) { 3670 ac->touched = 0; 3671 } else { 3672 spin_lock_irq(&l3->list_lock); 3673 if (ac->avail) { 3674 tofree = force ? ac->avail : (ac->limit + 4) / 5; 3675 if (tofree > ac->avail) 3676 tofree = (ac->avail + 1) / 2; 3677 free_block(cachep, ac->entry, tofree, node); 3678 ac->avail -= tofree; 3679 memmove(ac->entry, &(ac->entry[tofree]), 3680 sizeof(void *) * ac->avail); 3681 } 3682 spin_unlock_irq(&l3->list_lock); 3683 } 3684} 3685 3686/** 3687 * cache_reap - Reclaim memory from caches. 3688 * @unused: unused parameter 3689 * 3690 * Called from workqueue/eventd every few seconds. 3691 * Purpose: 3692 * - clear the per-cpu caches for this CPU. 3693 * - return freeable pages to the main free memory pool. 3694 * 3695 * If we cannot acquire the cache chain mutex then just give up - we'll try 3696 * again on the next iteration. 3697 */ 3698static void cache_reap(void *unused) 3699{ 3700 struct kmem_cache *searchp; 3701 struct kmem_list3 *l3; 3702 int node = numa_node_id(); 3703 3704 if (!mutex_trylock(&cache_chain_mutex)) { 3705 /* Give up. Setup the next iteration. */ 3706 schedule_delayed_work(&__get_cpu_var(reap_work), 3707 REAPTIMEOUT_CPUC); 3708 return; 3709 } 3710 3711 list_for_each_entry(searchp, &cache_chain, next) { 3712 struct list_head *p; 3713 int tofree; 3714 struct slab *slabp; 3715 3716 check_irq_on(); 3717 3718 /* 3719 * We only take the l3 lock if absolutely necessary and we 3720 * have established with reasonable certainty that 3721 * we can do some work if the lock was obtained. 3722 */ 3723 l3 = searchp->nodelists[node]; 3724 3725 reap_alien(searchp, l3); 3726 3727 drain_array(searchp, l3, cpu_cache_get(searchp), 0, node); 3728 3729 /* 3730 * These are racy checks but it does not matter 3731 * if we skip one check or scan twice. 3732 */ 3733 if (time_after(l3->next_reap, jiffies)) 3734 goto next; 3735 3736 l3->next_reap = jiffies + REAPTIMEOUT_LIST3; 3737 3738 drain_array(searchp, l3, l3->shared, 0, node); 3739 3740 if (l3->free_touched) { 3741 l3->free_touched = 0; 3742 goto next; 3743 } 3744 3745 tofree = (l3->free_limit + 5 * searchp->num - 1) / 3746 (5 * searchp->num); 3747 do { 3748 /* 3749 * Do not lock if there are no free blocks. 3750 */ 3751 if (list_empty(&l3->slabs_free)) 3752 break; 3753 3754 spin_lock_irq(&l3->list_lock); 3755 p = l3->slabs_free.next; 3756 if (p == &(l3->slabs_free)) { 3757 spin_unlock_irq(&l3->list_lock); 3758 break; 3759 } 3760 3761 slabp = list_entry(p, struct slab, list); 3762 BUG_ON(slabp->inuse); 3763 list_del(&slabp->list); 3764 STATS_INC_REAPED(searchp); 3765 3766 /* 3767 * Safe to drop the lock. The slab is no longer linked 3768 * to the cache. 
searchp cannot disappear, we hold 3769 * cache_chain_lock 3770 */ 3771 l3->free_objects -= searchp->num; 3772 spin_unlock_irq(&l3->list_lock); 3773 slab_destroy(searchp, slabp); 3774 } while (--tofree > 0); 3775next: 3776 cond_resched(); 3777 } 3778 check_irq_on(); 3779 mutex_unlock(&cache_chain_mutex); 3780 next_reap_node(); 3781 /* Set up the next iteration */ 3782 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); 3783} 3784 3785#ifdef CONFIG_PROC_FS 3786 3787static void print_slabinfo_header(struct seq_file *m) 3788{ 3789 /* 3790 * Output format version, so at least we can change it 3791 * without _too_ many complaints. 3792 */ 3793#if STATS 3794 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); 3795#else 3796 seq_puts(m, "slabinfo - version: 2.1\n"); 3797#endif 3798 seq_puts(m, "# name <active_objs> <num_objs> <objsize> " 3799 "<objperslab> <pagesperslab>"); 3800 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); 3801 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 3802#if STATS 3803 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> " 3804 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>"); 3805 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); 3806#endif 3807 seq_putc(m, '\n'); 3808} 3809 3810static void *s_start(struct seq_file *m, loff_t *pos) 3811{ 3812 loff_t n = *pos; 3813 struct list_head *p; 3814 3815 mutex_lock(&cache_chain_mutex); 3816 if (!n) 3817 print_slabinfo_header(m); 3818 p = cache_chain.next; 3819 while (n--) { 3820 p = p->next; 3821 if (p == &cache_chain) 3822 return NULL; 3823 } 3824 return list_entry(p, struct kmem_cache, next); 3825} 3826 3827static void *s_next(struct seq_file *m, void *p, loff_t *pos) 3828{ 3829 struct kmem_cache *cachep = p; 3830 ++*pos; 3831 return cachep->next.next == &cache_chain ? 
3832 NULL : list_entry(cachep->next.next, struct kmem_cache, next); 3833} 3834 3835static void s_stop(struct seq_file *m, void *p) 3836{ 3837 mutex_unlock(&cache_chain_mutex); 3838} 3839 3840static int s_show(struct seq_file *m, void *p) 3841{ 3842 struct kmem_cache *cachep = p; 3843 struct slab *slabp; 3844 unsigned long active_objs; 3845 unsigned long num_objs; 3846 unsigned long active_slabs = 0; 3847 unsigned long num_slabs, free_objects = 0, shared_avail = 0; 3848 const char *name; 3849 char *error = NULL; 3850 int node; 3851 struct kmem_list3 *l3; 3852 3853 active_objs = 0; 3854 num_slabs = 0; 3855 for_each_online_node(node) { 3856 l3 = cachep->nodelists[node]; 3857 if (!l3) 3858 continue; 3859 3860 check_irq_on(); 3861 spin_lock_irq(&l3->list_lock); 3862 3863 list_for_each_entry(slabp, &l3->slabs_full, list) { 3864 if (slabp->inuse != cachep->num && !error) 3865 error = "slabs_full accounting error"; 3866 active_objs += cachep->num; 3867 active_slabs++; 3868 } 3869 list_for_each_entry(slabp, &l3->slabs_partial, list) { 3870 if (slabp->inuse == cachep->num && !error) 3871 error = "slabs_partial inuse accounting error"; 3872 if (!slabp->inuse && !error) 3873 error = "slabs_partial/inuse accounting error"; 3874 active_objs += slabp->inuse; 3875 active_slabs++; 3876 } 3877 list_for_each_entry(slabp, &l3->slabs_free, list) { 3878 if (slabp->inuse && !error) 3879 error = "slabs_free/inuse accounting error"; 3880 num_slabs++; 3881 } 3882 free_objects += l3->free_objects; 3883 if (l3->shared) 3884 shared_avail += l3->shared->avail; 3885 3886 spin_unlock_irq(&l3->list_lock); 3887 } 3888 num_slabs += active_slabs; 3889 num_objs = num_slabs * cachep->num; 3890 if (num_objs - active_objs != free_objects && !error) 3891 error = "free_objects accounting error"; 3892 3893 name = cachep->name; 3894 if (error) 3895 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 3896 3897 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 3898 name, active_objs, num_objs, cachep->buffer_size, 3899 cachep->num, (1 << cachep->gfporder)); 3900 seq_printf(m, " : tunables %4u %4u %4u", 3901 cachep->limit, cachep->batchcount, cachep->shared); 3902 seq_printf(m, " : slabdata %6lu %6lu %6lu", 3903 active_slabs, num_slabs, shared_avail); 3904#if STATS 3905 { /* list3 stats */ 3906 unsigned long high = cachep->high_mark; 3907 unsigned long allocs = cachep->num_allocations; 3908 unsigned long grown = cachep->grown; 3909 unsigned long reaped = cachep->reaped; 3910 unsigned long errors = cachep->errors; 3911 unsigned long max_freeable = cachep->max_freeable; 3912 unsigned long node_allocs = cachep->node_allocs; 3913 unsigned long node_frees = cachep->node_frees; 3914 unsigned long overflows = cachep->node_overflow; 3915 3916 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ 3917 %4lu %4lu %4lu %4lu %4lu", allocs, high, grown, 3918 reaped, errors, max_freeable, node_allocs, 3919 node_frees, overflows); 3920 } 3921 /* cpu stats */ 3922 { 3923 unsigned long allochit = atomic_read(&cachep->allochit); 3924 unsigned long allocmiss = atomic_read(&cachep->allocmiss); 3925 unsigned long freehit = atomic_read(&cachep->freehit); 3926 unsigned long freemiss = atomic_read(&cachep->freemiss); 3927 3928 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", 3929 allochit, allocmiss, freehit, freemiss); 3930 } 3931#endif 3932 seq_putc(m, '\n'); 3933 return 0; 3934} 3935 3936/* 3937 * slabinfo_op - iterator that generates /proc/slabinfo 3938 * 3939 * Output layout: 3940 * cache-name 3941 * num-active-objs 3942 * total-objs 3943 * object size 
3944 * num-active-slabs 3945 * total-slabs 3946 * num-pages-per-slab 3947 * + further values on SMP and with statistics enabled 3948 */ 3949 3950struct seq_operations slabinfo_op = { 3951 .start = s_start, 3952 .next = s_next, 3953 .stop = s_stop, 3954 .show = s_show, 3955}; 3956 3957#define MAX_SLABINFO_WRITE 128 3958/** 3959 * slabinfo_write - Tuning for the slab allocator 3960 * @file: unused 3961 * @buffer: user buffer 3962 * @count: data length 3963 * @ppos: unused 3964 */ 3965ssize_t slabinfo_write(struct file *file, const char __user * buffer, 3966 size_t count, loff_t *ppos) 3967{ 3968 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; 3969 int limit, batchcount, shared, res; 3970 struct kmem_cache *cachep; 3971 3972 if (count > MAX_SLABINFO_WRITE) 3973 return -EINVAL; 3974 if (copy_from_user(&kbuf, buffer, count)) 3975 return -EFAULT; 3976 kbuf[MAX_SLABINFO_WRITE] = '\0'; 3977 3978 tmp = strchr(kbuf, ' '); 3979 if (!tmp) 3980 return -EINVAL; 3981 *tmp = '\0'; 3982 tmp++; 3983 if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3) 3984 return -EINVAL; 3985 3986 /* Find the cache in the chain of caches. */ 3987 mutex_lock(&cache_chain_mutex); 3988 res = -EINVAL; 3989 list_for_each_entry(cachep, &cache_chain, next) { 3990 if (!strcmp(cachep->name, kbuf)) { 3991 if (limit < 1 || batchcount < 1 || 3992 batchcount > limit || shared < 0) { 3993 res = 0; 3994 } else { 3995 res = do_tune_cpucache(cachep, limit, 3996 batchcount, shared); 3997 } 3998 break; 3999 } 4000 } 4001 mutex_unlock(&cache_chain_mutex); 4002 if (res >= 0) 4003 res = count; 4004 return res; 4005} 4006 4007#ifdef CONFIG_DEBUG_SLAB_LEAK 4008 4009static void *leaks_start(struct seq_file *m, loff_t *pos) 4010{ 4011 loff_t n = *pos; 4012 struct list_head *p; 4013 4014 mutex_lock(&cache_chain_mutex); 4015 p = cache_chain.next; 4016 while (n--) { 4017 p = p->next; 4018 if (p == &cache_chain) 4019 return NULL; 4020 } 4021 return list_entry(p, struct kmem_cache, next); 4022} 4023 4024static inline int add_caller(unsigned long *n, unsigned long v) 4025{ 4026 unsigned long *p; 4027 int l; 4028 if (!v) 4029 return 1; 4030 l = n[1]; 4031 p = n + 2; 4032 while (l) { 4033 int i = l/2; 4034 unsigned long *q = p + 2 * i; 4035 if (*q == v) { 4036 q[1]++; 4037 return 1; 4038 } 4039 if (*q > v) { 4040 l = i; 4041 } else { 4042 p = q + 2; 4043 l -= i + 1; 4044 } 4045 } 4046 if (++n[1] == n[0]) 4047 return 0; 4048 memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n)); 4049 p[0] = v; 4050 p[1] = 1; 4051 return 1; 4052} 4053 4054static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) 4055{ 4056 void *p; 4057 int i; 4058 if (n[0] == n[1]) 4059 return; 4060 for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) { 4061 if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) 4062 continue; 4063 if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) 4064 return; 4065 } 4066} 4067 4068static void show_symbol(struct seq_file *m, unsigned long address) 4069{ 4070#ifdef CONFIG_KALLSYMS 4071 char *modname; 4072 const char *name; 4073 unsigned long offset, size; 4074 char namebuf[KSYM_NAME_LEN+1]; 4075 4076 name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); 4077 4078 if (name) { 4079 seq_printf(m, "%s+%#lx/%#lx", name, offset, size); 4080 if (modname) 4081 seq_printf(m, " [%s]", modname); 4082 return; 4083 } 4084#endif 4085 seq_printf(m, "%p", (void *)address); 4086} 4087 4088static int leaks_show(struct seq_file *m, void *p) 4089{ 4090 struct kmem_cache *cachep = p; 4091 struct slab *slabp; 4092 
struct kmem_list3 *l3; 4093 const char *name; 4094 unsigned long *n = m->private; 4095 int node; 4096 int i; 4097 4098 if (!(cachep->flags & SLAB_STORE_USER)) 4099 return 0; 4100 if (!(cachep->flags & SLAB_RED_ZONE)) 4101 return 0; 4102 4103 /* OK, we can do it */ 4104 4105 n[1] = 0; 4106 4107 for_each_online_node(node) { 4108 l3 = cachep->nodelists[node]; 4109 if (!l3) 4110 continue; 4111 4112 check_irq_on(); 4113 spin_lock_irq(&l3->list_lock); 4114 4115 list_for_each_entry(slabp, &l3->slabs_full, list) 4116 handle_slab(n, cachep, slabp); 4117 list_for_each_entry(slabp, &l3->slabs_partial, list) 4118 handle_slab(n, cachep, slabp); 4119 spin_unlock_irq(&l3->list_lock); 4120 } 4121 name = cachep->name; 4122 if (n[0] == n[1]) { 4123 /* Increase the buffer size */ 4124 mutex_unlock(&cache_chain_mutex); 4125 m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); 4126 if (!m->private) { 4127 /* Too bad, we are really out */ 4128 m->private = n; 4129 mutex_lock(&cache_chain_mutex); 4130 return -ENOMEM; 4131 } 4132 *(unsigned long *)m->private = n[0] * 2; 4133 kfree(n); 4134 mutex_lock(&cache_chain_mutex); 4135 /* Now make sure this entry will be retried */ 4136 m->count = m->size; 4137 return 0; 4138 } 4139 for (i = 0; i < n[1]; i++) { 4140 seq_printf(m, "%s: %lu ", name, n[2*i+3]); 4141 show_symbol(m, n[2*i+2]); 4142 seq_putc(m, '\n'); 4143 } 4144 return 0; 4145} 4146 4147struct seq_operations slabstats_op = { 4148 .start = leaks_start, 4149 .next = s_next, 4150 .stop = s_stop, 4151 .show = leaks_show, 4152}; 4153#endif 4154#endif 4155 4156/** 4157 * ksize - get the actual amount of memory allocated for a given object 4158 * @objp: Pointer to the object 4159 * 4160 * kmalloc may internally round up allocations and return more memory 4161 * than requested. ksize() can be used to determine the actual amount of 4162 * memory allocated. The caller may use this additional memory, even though 4163 * a smaller amount of memory was initially specified with the kmalloc call. 4164 * The caller must guarantee that objp points to a valid object previously 4165 * allocated with either kmalloc() or kmem_cache_alloc(). The object 4166 * must not be freed during the duration of the call. 4167 */ 4168unsigned int ksize(const void *objp) 4169{ 4170 if (unlikely(objp == NULL)) 4171 return 0; 4172 4173 return obj_size(virt_to_cache(objp)); 4174} 4175
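
/*
 * Editor's note (not part of revision ddc2e812): the sketch below is a
 * minimal, hypothetical illustration of the ksize() contract documented
 * above -- kmalloc() may round a request up to the size of the backing
 * cache, and the caller is allowed to use the whole rounded-up object.
 * The function name and the 17-byte request are invented for the example.
 */
#if 0	/* illustrative only, never compiled */
static int example_use_full_allocation(void)
{
	char *buf = kmalloc(17, GFP_KERNEL);
	unsigned int usable;

	if (!buf)
		return -ENOMEM;

	usable = ksize(buf);		/* likely 32 here, not 17 */
	memset(buf, 0, usable);		/* safe: the whole object is ours */

	kfree(buf);
	return 0;
}
#endif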
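
/*
 * Editor's note (not part of revision ddc2e812): a hedged userspace
 * sketch that parses the /proc/slabinfo layout produced by
 * print_slabinfo_header() and s_show() above.  It assumes the non-STATS
 * column set; with STATS enabled the extra globalstat/cpustat columns
 * are appended after the "slabdata" fields and are simply ignored here.
 */
#if 0	/* standalone userspace program, not kernel code */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[512], name[64];
	unsigned long active_objs, num_objs, active_slabs, num_slabs,
		      shared_avail;
	unsigned int objsize, objperslab, pagesperslab, limit, batch, shared;
	FILE *f = fopen("/proc/slabinfo", "r");

	if (!f) {
		perror("/proc/slabinfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* Skip the version line and the "# name ..." header. */
		if (line[0] == '#' || !strncmp(line, "slabinfo", 8))
			continue;
		if (sscanf(line,
			   "%63s %lu %lu %u %u %u : tunables %u %u %u"
			   " : slabdata %lu %lu %lu",
			   name, &active_objs, &num_objs, &objsize,
			   &objperslab, &pagesperslab, &limit, &batch,
			   &shared, &active_slabs, &num_slabs,
			   &shared_avail) == 12)
			printf("%-20s %lu/%lu objects of %u bytes\n",
			       name, active_objs, num_objs, objsize);
	}
	fclose(f);
	return 0;
}
#endif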
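
/*
 * Editor's note (not part of revision ddc2e812): slabinfo_write() above
 * accepts one line of the form "<cache-name> <limit> <batchcount> <shared>"
 * (at most MAX_SLABINFO_WRITE bytes).  The cache name and values below are
 * illustrative; batchcount must be between 1 and limit and shared must be
 * >= 0, otherwise the write is accepted but changes nothing (the res = 0
 * branch).  The shell equivalent is simply:
 *	echo "dentry_cache 120 60 8" > /proc/slabinfo
 */
#if 0	/* standalone userspace program, not kernel code; needs root */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

int main(void)
{
	const char *cmd = "dentry_cache 120 60 8\n";
	int fd = open("/proc/slabinfo", O_WRONLY);

	if (fd < 0) {
		perror("open /proc/slabinfo");
		return 1;
	}
	/* The write fails with EINVAL if the cache name is not found. */
	if (write(fd, cmd, strlen(cmd)) < 0)
		perror("write");
	close(fd);
	return 0;
}
#endif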
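
/*
 * Editor's note (not part of revision ddc2e812): add_caller() above keeps
 * a flat array of unsigned longs -- slot 0 holds the capacity, slot 1 the
 * number of entries in use, followed by (caller address, hit count) pairs
 * sorted by address so lookups can binary-search.  The userspace sketch
 * below reimplements that bookkeeping with a struct array purely for
 * readability; names and sample addresses are invented, and it is not the
 * kernel code itself.
 */
#if 0	/* standalone userspace sketch, not kernel code */
#include <stdio.h>
#include <string.h>

struct caller_entry {
	unsigned long addr;	/* allocation call site */
	unsigned long count;	/* active objects allocated from it */
};

/* Return 0 when the table is full (grow and retry), 1 otherwise. */
static int record_caller(struct caller_entry *tab, unsigned long *used,
			 unsigned long capacity, unsigned long addr)
{
	unsigned long lo = 0, hi = *used;

	if (!addr)
		return 1;	/* object has no recorded caller */

	while (lo < hi) {	/* binary search over the sorted table */
		unsigned long mid = lo + (hi - lo) / 2;

		if (tab[mid].addr == addr) {
			tab[mid].count++;
			return 1;
		}
		if (tab[mid].addr > addr)
			hi = mid;
		else
			lo = mid + 1;
	}
	if (*used == capacity)
		return 0;

	/* Insert at position 'lo', keeping the table sorted by address. */
	memmove(&tab[lo + 1], &tab[lo], (*used - lo) * sizeof(tab[0]));
	tab[lo].addr = addr;
	tab[lo].count = 1;
	(*used)++;
	return 1;
}

int main(void)
{
	struct caller_entry tab[4];
	unsigned long used = 0, i;
	unsigned long samples[] = { 0xc015f2a0, 0xc013aa10, 0xc015f2a0 };

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		record_caller(tab, &used, 4, samples[i]);
	for (i = 0; i < used; i++)
		printf("%#lx: %lu\n", tab[i].addr, tab[i].count);
	return 0;
}
#endif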