slab.c revision 7e85ee0c1d15ca5f8bff0f514f158eba1742dd87
/*
 * linux/mm/slab.c
 * Written by Mark Hemment, 1996/97.
 * (markhe@nextd.demon.co.uk)
 *
 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
 *
 * Major cleanup, different bufctl logic, per-cpu arrays
 *	(c) 2000 Manfred Spraul
 *
 * Cleanup, make the head arrays unconditional, preparation for NUMA
 *	(c) 2002 Manfred Spraul
 *
 * An implementation of the Slab Allocator as described in outline in;
 *	UNIX Internals: The New Frontiers by Uresh Vahalia
 *	Pub: Prentice Hall	ISBN 0-13-101908-2
 * or with a little more detail in;
 *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
 *	Jeff Bonwick (Sun Microsystems).
 *	Presented at: USENIX Summer 1994 Technical Conference
 *
 * The memory is organized in caches, one cache for each object type.
 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
 * Each cache consists of many slabs (they are small (usually one
 * page long) and always contiguous), and each slab contains multiple
 * initialized objects.
 *
 * This means, that your constructor is used only for newly allocated
 * slabs and you must pass objects with the same initializations to
 * kmem_cache_free.
 *
 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
 * normal). If you need a special memory type, then you must create a new
 * cache for that memory type.
 *
 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
 *   full slabs with 0 free objects
 *   partial slabs
 *   empty slabs with no allocated objects
 *
 * If partial slabs exist, then new allocations come from these slabs,
 * otherwise from empty slabs or new slabs are allocated.
 *
 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
 *
 * Each cache has a short per-cpu head array, most allocs
 * and frees go into that array, and if that array overflows, then 1/2
 * of the entries in the array are given back into the global cache.
 * The head array is strictly LIFO and should improve the cache hit rates.
 * On SMP, it additionally reduces the spinlock operations.
 *
 * The c_cpuarray may not be read with enabled local interrupts -
 * it's changed with a smp_call_function().
 *
 * SMP synchronization:
 *  constructors and destructors are called without any locking.
 *  Several members in struct kmem_cache and struct slab never change, so
 *  they are accessed without any locking.
 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
 *  and local interrupts are disabled so slab code is preempt-safe.
 *  The non-constant members are protected with a per-cache irq spinlock.
 *
 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
 * in 2000 - many ideas in the current implementation are derived from
 * his patch.
 *
 * Further notes from the original documentation:
 *
 * 11 April '97.  Started multi-threading - markhe
 *	The global cache-chain is protected by the mutex 'cache_chain_mutex'.
 *	The mutex is only needed when accessing/extending the cache-chain, which
 *	can never happen inside an interrupt (kmem_cache_create(),
 *	kmem_cache_shrink() and kmem_cache_reap()).
 *
 *	At present, each engine can be growing a cache.  This should be blocked.
 *
 * 15 March 2005. NUMA slab allocator.
 *	Shai Fultheim <shai@scalex86.org>.
 *	Shobhit Dayal <shobhit@calsoftinc.com>
 *	Alok N Kataria <alokk@calsoftinc.com>
 *	Christoph Lameter <christoph@lameter.com>
 *
 * Modified the slab allocator to be node aware on NUMA systems.
 * Each node has its own list of partial, free and full slabs.
 * All object allocations for a node occur from node specific slab lists.
 */

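/*
 * Illustrative sketch (not part of the original file): typical client use
 * of this allocator, with a hypothetical struct foo. The constructor runs
 * once per object when a new slab is populated, not on every
 * kmem_cache_alloc(), which is why objects must go back to
 * kmem_cache_free() in their initialized state:
 *
 *	static struct kmem_cache *foo_cache;
 *
 *	static void foo_ctor(void *obj)
 *	{
 *		struct foo *f = obj;
 *		spin_lock_init(&f->lock);
 *	}
 *
 *	foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
 *				      SLAB_HWCACHE_ALIGN, foo_ctor);
 *	f = kmem_cache_alloc(foo_cache, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cache, f);
 *	kmem_cache_destroy(foo_cache);
 */
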
#include	<linux/slab.h>
#include	<linux/mm.h>
#include	<linux/poison.h>
#include	<linux/swap.h>
#include	<linux/cache.h>
#include	<linux/interrupt.h>
#include	<linux/init.h>
#include	<linux/compiler.h>
#include	<linux/cpuset.h>
#include	<linux/proc_fs.h>
#include	<linux/seq_file.h>
#include	<linux/notifier.h>
#include	<linux/kallsyms.h>
#include	<linux/cpu.h>
#include	<linux/sysctl.h>
#include	<linux/module.h>
#include	<linux/kmemtrace.h>
#include	<linux/rcupdate.h>
#include	<linux/string.h>
#include	<linux/uaccess.h>
#include	<linux/nodemask.h>
#include	<linux/kmemleak.h>
#include	<linux/mempolicy.h>
#include	<linux/mutex.h>
#include	<linux/fault-inject.h>
#include	<linux/rtmutex.h>
#include	<linux/reciprocal_div.h>
#include	<linux/debugobjects.h>

#include	<asm/cacheflush.h>
#include	<asm/tlbflush.h>
#include	<asm/page.h>

/*
 * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
 *		  0 for faster, smaller code (especially in the critical paths).
 *
 * STATS	- 1 to collect stats for /proc/slabinfo.
 *		  0 for faster, smaller code (especially in the critical paths).
 *
 * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
 */

#ifdef CONFIG_DEBUG_SLAB
#define	DEBUG		1
#define	STATS		1
#define	FORCED_DEBUG	1
#else
#define	DEBUG		0
#define	STATS		0
#define	FORCED_DEBUG	0
#endif

/* Shouldn't this be in a header file somewhere? */
#define	BYTES_PER_WORD		sizeof(void *)
#define	REDZONE_ALIGN		max(BYTES_PER_WORD, __alignof__(unsigned long long))

#ifndef ARCH_KMALLOC_MINALIGN
/*
 * Enforce a minimum alignment for the kmalloc caches.
 * Usually, the kmalloc caches are cache_line_size() aligned, except when
 * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
 * alignment larger than the alignment of a 64-bit integer.
 * ARCH_KMALLOC_MINALIGN allows that.
 * Note that increasing this value may disable some debug features.
 */
#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
#endif

#ifndef ARCH_SLAB_MINALIGN
/*
 * Enforce a minimum alignment for all caches.
 * Intended for archs that get misalignment faults even for BYTES_PER_WORD
 * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
 * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
 * some debug features.
 */
#define ARCH_SLAB_MINALIGN 0
#endif

#ifndef ARCH_KMALLOC_FLAGS
#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
#endif

/* Legal flag mask for kmem_cache_create(). */
#if DEBUG
# define CREATE_MASK	(SLAB_RED_ZONE | \
			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
			 SLAB_CACHE_DMA | \
			 SLAB_STORE_USER | \
			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE)
#else
# define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
			 SLAB_CACHE_DMA | \
			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE)
#endif

/*
 * kmem_bufctl_t:
 *
 * Bufctls are used for linking objs within a slab into a list of
 * free-object indices.
 *
 * This implementation relies on "struct page" for locating the cache &
 * slab an object belongs to.
 * This allows the bufctl structure to be small (one int), but limits
 * the number of objects a slab (not a cache) can contain when off-slab
 * bufctls are used.  The limit is the size of the largest general cache
 * that does not use off-slab slabs.
 * For 32bit archs with 4 kB pages, this is 56.
 * This is not serious, as it is only for large objects, when it is unwise
 * to have too many per slab.
 * Note: This limit can be raised by introducing a general cache whose size
 * is less than 512 (PAGE_SIZE<<3), but greater than 256.
 */

typedef unsigned int kmem_bufctl_t;
#define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
#define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
#define	BUFCTL_ACTIVE	(((kmem_bufctl_t)(~0U))-2)
#define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-3)

/*
 * struct slab
 *
 * Manages the objs in a slab. Placed either at the beginning of mem allocated
 * for a slab, or allocated from a general cache.
 * Slabs are chained into three lists: fully used, partial, fully free slabs.
 */
struct slab {
	struct list_head list;
	unsigned long colouroff;
	void *s_mem;		/* including colour offset */
	unsigned int inuse;	/* num of objs active in slab */
	kmem_bufctl_t free;
	unsigned short nodeid;
};

/*
 * struct slab_rcu
 *
 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
 * arrange for kmem_freepages to be called via RCU.  This is useful if
 * we need to approach a kernel structure obliquely, from its address
 * obtained without the usual locking.  We can lock the structure to
 * stabilize it and check it's still at the given address, only if we
 * can be sure that the memory has not been meanwhile reused for some
 * other kind of object (which our subsystem's lock might corrupt).
 *
 * rcu_read_lock before reading the address, then rcu_read_unlock after
 * taking the spinlock within the structure expected at that address.
 *
 * We assume struct slab_rcu can overlay struct slab when destroying.
 */
struct slab_rcu {
	struct rcu_head head;
	struct kmem_cache *cachep;
	void *addr;
};

/*
 * struct array_cache
 *
 * Purpose:
 * - LIFO ordering, to hand out cache-warm objects from _alloc
 * - reduce the number of linked list operations
 * - reduce spinlock operations
 *
 * The limit is stored in the per-cpu structure to reduce the data cache
 * footprint.
 */
struct array_cache {
	unsigned int avail;
	unsigned int limit;
	unsigned int batchcount;
	unsigned int touched;
	spinlock_t lock;
	void *entry[];	/*
			 * Must have this definition in here for the proper
			 * alignment of array_cache. Also simplifies accessing
			 * the entries.
			 */
};

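/*
 * Illustrative sketch (not in the original source): on the fast path the
 * per-cpu array behaves as a plain LIFO stack of object pointers, e.g.
 *
 *	alloc:	if (ac->avail)
 *			objp = ac->entry[--ac->avail];	// cache-warm object
 *	free:	if (ac->avail < ac->limit)
 *			ac->entry[ac->avail++] = objp;
 *
 * which is why recently freed (cache-hot) objects are handed out first.
 * The real fast paths later in this file add refill and flush handling
 * around exactly this pattern.
 */
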
/*
 * bootstrap: The caches do not work without cpuarrays anymore, but the
 * cpuarrays are allocated from the generic caches...
 */
#define BOOT_CPUCACHE_ENTRIES	1
struct arraycache_init {
	struct array_cache cache;
	void *entries[BOOT_CPUCACHE_ENTRIES];
};

/*
 * The slab lists for all objects.
 */
struct kmem_list3 {
	struct list_head slabs_partial;	/* partial list first, better asm code */
	struct list_head slabs_full;
	struct list_head slabs_free;
	unsigned long free_objects;
	unsigned int free_limit;
	unsigned int colour_next;	/* Per-node cache coloring */
	spinlock_t list_lock;
	struct array_cache *shared;	/* shared per node */
	struct array_cache **alien;	/* on other nodes */
	unsigned long next_reap;	/* updated without locking */
	int free_touched;		/* updated without locking */
};

/*
 * The slab allocator is initialized with interrupts disabled. Therefore, make
 * sure early boot allocations don't accidentally enable interrupts.
 */
static gfp_t slab_gfp_mask __read_mostly = SLAB_GFP_BOOT_MASK;

/*
 * Need this for bootstrapping a per node allocator.
 */
#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
#define	CACHE_CACHE 0
#define	SIZE_AC MAX_NUMNODES
#define	SIZE_L3 (2 * MAX_NUMNODES)

static int drain_freelist(struct kmem_cache *cache,
			struct kmem_list3 *l3, int tofree);
static void free_block(struct kmem_cache *cachep, void **objpp, int len,
			int node);
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
static void cache_reap(struct work_struct *unused);

/*
 * This function must be completely optimized away if a constant is passed to
 * it.  Mostly the same as what is in linux/slab.h except it returns an index.
 */
static __always_inline int index_of(const size_t size)
{
	extern void __bad_size(void);

	if (__builtin_constant_p(size)) {
		int i = 0;

#define CACHE(x) \
	if (size <= x) \
		return i; \
	else \
		i++;
#include <linux/kmalloc_sizes.h>
#undef CACHE
		__bad_size();
	} else
		__bad_size();
	return 0;
}

static int slab_early_init = 1;

#define INDEX_AC index_of(sizeof(struct arraycache_init))
#define INDEX_L3 index_of(sizeof(struct kmem_list3))

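/*
 * Worked example (illustrative, not in the original source): with the
 * usual kmalloc_sizes.h table, whose first entries are 32, 64, 96, 128,
 * ... bytes (the exact table depends on L1 cache line size and page size),
 * index_of(24) expands at compile time into a chain of constant
 * comparisons and folds to 0, i.e. the 32-byte cache. Likewise INDEX_AC
 * resolves to the smallest general cache able to hold a struct
 * arraycache_init. A non-constant argument would leave a call to the
 * undefined __bad_size() in the object file and fail the link, catching
 * misuse at build time.
 */
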
static void kmem_list3_init(struct kmem_list3 *parent)
{
	INIT_LIST_HEAD(&parent->slabs_full);
	INIT_LIST_HEAD(&parent->slabs_partial);
	INIT_LIST_HEAD(&parent->slabs_free);
	parent->shared = NULL;
	parent->alien = NULL;
	parent->colour_next = 0;
	spin_lock_init(&parent->list_lock);
	parent->free_objects = 0;
	parent->free_touched = 0;
}

#define MAKE_LIST(cachep, listp, slab, nodeid)				\
	do {								\
		INIT_LIST_HEAD(listp);					\
		list_splice(&(cachep->nodelists[nodeid]->slab), listp);	\
	} while (0)

#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
	do {								\
	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
	} while (0)

/*
 * struct kmem_cache
 *
 * manages a cache.
 */

struct kmem_cache {
/* 1) per-cpu data, touched during every alloc/free */
	struct array_cache *array[NR_CPUS];
/* 2) Cache tunables. Protected by cache_chain_mutex */
	unsigned int batchcount;
	unsigned int limit;
	unsigned int shared;

	unsigned int buffer_size;
	u32 reciprocal_buffer_size;
/* 3) touched by every alloc & free from the backend */

	unsigned int flags;		/* constant flags */
	unsigned int num;		/* # of objs per slab */

/* 4) cache_grow/shrink */
	/* order of pgs per slab (2^n) */
	unsigned int gfporder;

	/* force GFP flags, e.g. GFP_DMA */
	gfp_t gfpflags;

	size_t colour;			/* cache colouring range */
	unsigned int colour_off;	/* colour offset */
	struct kmem_cache *slabp_cache;
	unsigned int slab_size;
	unsigned int dflags;		/* dynamic flags */

	/* constructor func */
	void (*ctor)(void *obj);

/* 5) cache creation/removal */
	const char *name;
	struct list_head next;

/* 6) statistics */
#if STATS
	unsigned long num_active;
	unsigned long num_allocations;
	unsigned long high_mark;
	unsigned long grown;
	unsigned long reaped;
	unsigned long errors;
	unsigned long max_freeable;
	unsigned long node_allocs;
	unsigned long node_frees;
	unsigned long node_overflow;
	atomic_t allochit;
	atomic_t allocmiss;
	atomic_t freehit;
	atomic_t freemiss;
#endif
#if DEBUG
	/*
	 * If debugging is enabled, then the allocator can add additional
	 * fields and/or padding to every object. buffer_size contains the total
	 * object size including these internal fields, the following two
	 * variables contain the offset to the user object and its size.
	 */
	int obj_offset;
	int obj_size;
#endif
	/*
	 * We put nodelists[] at the end of kmem_cache, because we want to size
	 * this array to nr_node_ids slots instead of MAX_NUMNODES
	 * (see kmem_cache_init())
	 * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache
	 * is statically defined, so we reserve the max number of nodes.
	 */
	struct kmem_list3 *nodelists[MAX_NUMNODES];
	/*
	 * Do not add fields after nodelists[]
	 */
};

#define CFLGS_OFF_SLAB		(0x80000000UL)
#define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)

#define BATCHREFILL_LIMIT	16
/*
 * Optimization question: fewer reaps means a lower probability for unnecessary
 * cpucache drain/refill cycles.
 *
 * OTOH the cpuarrays can contain lots of objects,
 * which could lock up otherwise freeable slabs.
 */
#define REAPTIMEOUT_CPUC	(2*HZ)
#define REAPTIMEOUT_LIST3	(4*HZ)

#if STATS
#define	STATS_INC_ACTIVE(x)	((x)->num_active++)
#define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
#define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
#define	STATS_INC_GROWN(x)	((x)->grown++)
#define	STATS_ADD_REAPED(x,y)	((x)->reaped += (y))
#define	STATS_SET_HIGH(x)						\
	do {								\
		if ((x)->num_active > (x)->high_mark)			\
			(x)->high_mark = (x)->num_active;		\
	} while (0)
#define	STATS_INC_ERR(x)	((x)->errors++)
#define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
#define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
#define STATS_INC_ACOVERFLOW(x)   ((x)->node_overflow++)
#define	STATS_SET_FREEABLE(x, i)					\
	do {								\
		if ((x)->max_freeable < i)				\
			(x)->max_freeable = i;				\
	} while (0)
#define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
#define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
#define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
#define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
#else
#define	STATS_INC_ACTIVE(x)	do { } while (0)
#define	STATS_DEC_ACTIVE(x)	do { } while (0)
#define	STATS_INC_ALLOCED(x)	do { } while (0)
#define	STATS_INC_GROWN(x)	do { } while (0)
#define	STATS_ADD_REAPED(x,y)	do { } while (0)
#define	STATS_SET_HIGH(x)	do { } while (0)
#define	STATS_INC_ERR(x)	do { } while (0)
#define	STATS_INC_NODEALLOCS(x)	do { } while (0)
#define	STATS_INC_NODEFREES(x)	do { } while (0)
#define STATS_INC_ACOVERFLOW(x)   do { } while (0)
#define STATS_SET_FREEABLE(x, i) do { } while (0)
#define STATS_INC_ALLOCHIT(x)	do { } while (0)
#define STATS_INC_ALLOCMISS(x)	do { } while (0)
#define STATS_INC_FREEHIT(x)	do { } while (0)
#define STATS_INC_FREEMISS(x)	do { } while (0)
#endif

#if DEBUG

/*
 * memory layout of objects:
 * 0		: objp
 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
 * 		the end of an object is aligned with the end of the real
 * 		allocation. Catches writes behind the end of the allocation.
 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
 * 		redzone word.
 * cachep->obj_offset: The real object.
 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
 *		[BYTES_PER_WORD long]
 */
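
/*
 * Worked example (illustrative, not in the original source): on a 64-bit
 * box (BYTES_PER_WORD == 8) with SLAB_RED_ZONE and SLAB_STORE_USER set
 * for a 48-byte object, the per-object memory roughly becomes
 *
 *	[ red zone 1 ][ 48-byte object ][ red zone 2 ][ caller address ]
 *	   8 bytes        48 bytes         8 bytes        8 bytes
 *
 * so obj_offset is 8 and buffer_size grows from 48 to 72 (before any
 * cache-line alignment). dbg_redzone1/2() and dbg_userword() below
 * compute pointers into exactly these slots, which is also why enabling
 * debugging changes the effective object size and can defeat the DMA
 * alignment guarantees mentioned at ARCH_KMALLOC_MINALIGN above.
 */
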
static int obj_offset(struct kmem_cache *cachep)
{
	return cachep->obj_offset;
}

static int obj_size(struct kmem_cache *cachep)
{
	return cachep->obj_size;
}

static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
	return (unsigned long long*) (objp + obj_offset(cachep) -
				      sizeof(unsigned long long));
}

static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
	if (cachep->flags & SLAB_STORE_USER)
		return (unsigned long long *)(objp + cachep->buffer_size -
					      sizeof(unsigned long long) -
					      REDZONE_ALIGN);
	return (unsigned long long *) (objp + cachep->buffer_size -
				       sizeof(unsigned long long));
}

static void **dbg_userword(struct kmem_cache *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
	return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
}

#else

#define obj_offset(x)			0
#define obj_size(cachep)		(cachep->buffer_size)
#define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
#define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
#define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})

#endif

#ifdef CONFIG_KMEMTRACE
size_t slab_buffer_size(struct kmem_cache *cachep)
{
	return cachep->buffer_size;
}
EXPORT_SYMBOL(slab_buffer_size);
#endif

/*
 * Do not go above this order unless 0 objects fit into the slab.
 */
#define	BREAK_GFP_ORDER_HI	1
#define	BREAK_GFP_ORDER_LO	0
static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;

/*
 * Functions for storing/retrieving the cachep and or slab from the page
 * allocator.  These are used to find the slab an obj belongs to.  With kfree(),
 * these are used to find the cache which an obj belongs to.
 */
static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
{
	page->lru.next = (struct list_head *)cache;
}

static inline struct kmem_cache *page_get_cache(struct page *page)
{
	page = compound_head(page);
	BUG_ON(!PageSlab(page));
	return (struct kmem_cache *)page->lru.next;
}

static inline void page_set_slab(struct page *page, struct slab *slab)
{
	page->lru.prev = (struct list_head *)slab;
}

static inline struct slab *page_get_slab(struct page *page)
{
	BUG_ON(!PageSlab(page));
	return (struct slab *)page->lru.prev;
}

static inline struct kmem_cache *virt_to_cache(const void *obj)
{
	struct page *page = virt_to_head_page(obj);
	return page_get_cache(page);
}

static inline struct slab *virt_to_slab(const void *obj)
{
	struct page *page = virt_to_head_page(obj);
	return page_get_slab(page);
}

static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
				 unsigned int idx)
{
	return slab->s_mem + cache->buffer_size * idx;
}

/*
 * We want to avoid an expensive divide : (offset / cache->buffer_size)
 *   Using the fact that buffer_size is a constant for a particular cache,
 *   we can replace (offset / cache->buffer_size) by
 *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
 */
static inline unsigned int obj_to_index(const struct kmem_cache *cache,
					const struct slab *slab, void *obj)
{
	u32 offset = (obj - slab->s_mem);
	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
}

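/*
 * Worked example (illustrative, not in the original source): the
 * reciprocal trick replaces a division by a fixed buffer_size B with a
 * multiply and a shift. reciprocal_value(B) precomputes
 *
 *	R = ((1ULL << 32) + B - 1) / B
 *
 * and reciprocal_divide(x, R) evaluates (u32)(((u64)x * R) >> 32). For
 * B = 256, R = 0x01000000, so an offset of 0x1200 yields
 * (0x1200 * 0x01000000) >> 32 = 0x12 = 18, the object's index - the same
 * result as 0x1200 / 256, but without a hardware divide on the hot path.
 */
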
/*
 * These are the default caches for kmalloc. Custom caches can have other sizes.
 */
struct cache_sizes malloc_sizes[] = {
#define CACHE(x) { .cs_size = (x) },
#include <linux/kmalloc_sizes.h>
	CACHE(ULONG_MAX)
#undef CACHE
};
EXPORT_SYMBOL(malloc_sizes);

/* Must match cache_sizes above. Out of line to keep cache footprint low. */
struct cache_names {
	char *name;
	char *name_dma;
};

static struct cache_names __initdata cache_names[] = {
#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
#include <linux/kmalloc_sizes.h>
	{NULL,}
#undef CACHE
};

static struct arraycache_init initarray_cache __initdata =
    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
static struct arraycache_init initarray_generic =
    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };

/* internal cache of cache description objs */
static struct kmem_cache cache_cache = {
	.batchcount = 1,
	.limit = BOOT_CPUCACHE_ENTRIES,
	.shared = 1,
	.buffer_size = sizeof(struct kmem_cache),
	.name = "kmem_cache",
};

#define BAD_ALIEN_MAGIC 0x01020304ul

#ifdef CONFIG_LOCKDEP

/*
 * Slab sometimes uses the kmalloc slabs to store the slab headers
 * for other slabs "off slab".
 * The locking for this is tricky in that it nests within the locks
 * of all other slabs in a few places; to deal with this special
 * locking we put on-slab caches into a separate lock-class.
 *
 * We set lock class for alien array caches which are up during init.
 * The lock annotation will be lost if all cpus of a node go down and
 * then come back up during hotplug.
 */
static struct lock_class_key on_slab_l3_key;
static struct lock_class_key on_slab_alc_key;

static inline void init_lock_keys(void)
{
	int q;
	struct cache_sizes *s = malloc_sizes;

	while (s->cs_size != ULONG_MAX) {
		for_each_node(q) {
			struct array_cache **alc;
			int r;
			struct kmem_list3 *l3 = s->cs_cachep->nodelists[q];
			if (!l3 || OFF_SLAB(s->cs_cachep))
				continue;
			lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
			alc = l3->alien;
			/*
			 * FIXME: This check for BAD_ALIEN_MAGIC
			 * should go away when common slab code is taught to
			 * work even without alien caches.
			 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
			 * for alloc_alien_cache,
			 */
			if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
				continue;
			for_each_node(r) {
				if (alc[r])
					lockdep_set_class(&alc[r]->lock,
					     &on_slab_alc_key);
			}
		}
		s++;
	}
}
#else
static inline void init_lock_keys(void)
{
}
#endif

/*
 * Guard access to the cache-chain.
 */
static DEFINE_MUTEX(cache_chain_mutex);
static struct list_head cache_chain;

/*
 * chicken and egg problem: delay the per-cpu array allocation
 * until the general caches are up.
 */
static enum {
	NONE,
	PARTIAL_AC,
	PARTIAL_L3,
	FULL
} g_cpucache_up;

/*
 * used by boot code to determine if it can use slab based allocator
 */
int slab_is_available(void)
{
	return g_cpucache_up == FULL;
}

static DEFINE_PER_CPU(struct delayed_work, reap_work);

static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
{
	return cachep->array[smp_processor_id()];
}

static inline struct kmem_cache *__find_general_cachep(size_t size,
							gfp_t gfpflags)
{
	struct cache_sizes *csizep = malloc_sizes;

#if DEBUG
	/* This happens if someone tries to call
	 * kmem_cache_create(), or __kmalloc(), before
	 * the generic caches are initialized.
	 */
	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
#endif
	if (!size)
		return ZERO_SIZE_PTR;

	while (size > csizep->cs_size)
		csizep++;

	/*
	 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
	 * has cs_{dma,}cachep==NULL. Thus no special case
	 * for large kmalloc calls required.
	 */
#ifdef CONFIG_ZONE_DMA
	if (unlikely(gfpflags & GFP_DMA))
		return csizep->cs_dmacachep;
#endif
	return csizep->cs_cachep;
}

static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
{
	return __find_general_cachep(size, gfpflags);
}

static size_t slab_mgmt_size(size_t nr_objs, size_t align)
{
	return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
}

/*
 * Calculate the number of objects and left-over bytes for a given buffer size.
 */
static void cache_estimate(unsigned long gfporder, size_t buffer_size,
			   size_t align, int flags, size_t *left_over,
			   unsigned int *num)
{
	int nr_objs;
	size_t mgmt_size;
	size_t slab_size = PAGE_SIZE << gfporder;

	/*
	 * The slab management structure can be either off the slab or
	 * on it. For the latter case, the memory allocated for a
	 * slab is used for:
	 *
	 * - The struct slab
	 * - One kmem_bufctl_t for each object
	 * - Padding to respect alignment of @align
	 * - @buffer_size bytes for each object
	 *
	 * If the slab management structure is off the slab, then the
	 * alignment will already be calculated into the size. Because
	 * the slabs are all pages aligned, the objects will be at the
	 * correct alignment when allocated.
	 */
	if (flags & CFLGS_OFF_SLAB) {
		mgmt_size = 0;
		nr_objs = slab_size / buffer_size;

		if (nr_objs > SLAB_LIMIT)
			nr_objs = SLAB_LIMIT;
	} else {
		/*
		 * Ignore padding for the initial guess. The padding
		 * is at most @align-1 bytes, and @buffer_size is at
		 * least @align. In the worst case, this result will
		 * be one greater than the number of objects that fit
		 * into the memory allocation when taking the padding
		 * into account.
		 */
		nr_objs = (slab_size - sizeof(struct slab)) /
			  (buffer_size + sizeof(kmem_bufctl_t));

		/*
		 * This calculated number will be either the right
		 * amount, or one greater than what we want.
		 */
		if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
		       > slab_size)
			nr_objs--;

		if (nr_objs > SLAB_LIMIT)
			nr_objs = SLAB_LIMIT;

		mgmt_size = slab_mgmt_size(nr_objs, align);
	}
	*num = nr_objs;
	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
}

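/*
 * Worked example (illustrative, not in the original source): take a
 * 4096-byte page, gfporder 0, buffer_size 256, align 32, on-slab
 * management, and assume sizeof(struct slab) == 36 and a 4-byte
 * kmem_bufctl_t for the sake of the arithmetic:
 *
 *	initial guess:	(4096 - 36) / (256 + 4)		= 15 objects
 *	check:		ALIGN(36 + 15*4, 32) + 15*256	= 96 + 3840 = 3936
 *
 * 3936 <= 4096, so the guess stands: *num = 15 and
 * *left_over = 4096 - 3840 - 96 = 160 bytes, which the cache can then
 * spend on colouring offsets.
 */
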
#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)

static void __slab_error(const char *function, struct kmem_cache *cachep,
			char *msg)
{
	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
	       function, cachep->name, msg);
	dump_stack();
}

/*
 * By default on NUMA we use alien caches to stage the freeing of
 * objects allocated from other nodes. This causes massive memory
 * inefficiencies when using fake NUMA setup to split memory into a
 * large number of small nodes, so it can be disabled on the command
 * line.
 */

static int use_alien_caches __read_mostly = 1;
static int numa_platform __read_mostly = 1;
static int __init noaliencache_setup(char *s)
{
	use_alien_caches = 0;
	return 1;
}
__setup("noaliencache", noaliencache_setup);

#ifdef CONFIG_NUMA
/*
 * Special reaping functions for NUMA systems called from cache_reap().
 * These take care of doing round robin flushing of alien caches (containing
 * objects freed on different nodes from which they were allocated) and the
 * flushing of remote pcps by calling drain_node_pages.
 */
static DEFINE_PER_CPU(unsigned long, reap_node);

static void init_reap_node(int cpu)
{
	int node;

	node = next_node(cpu_to_node(cpu), node_online_map);
	if (node == MAX_NUMNODES)
		node = first_node(node_online_map);

	per_cpu(reap_node, cpu) = node;
}

static void next_reap_node(void)
{
	int node = __get_cpu_var(reap_node);

	node = next_node(node, node_online_map);
	if (unlikely(node >= MAX_NUMNODES))
		node = first_node(node_online_map);
	__get_cpu_var(reap_node) = node;
}

#else
#define init_reap_node(cpu) do { } while (0)
#define next_reap_node(void) do { } while (0)
#endif

/*
 * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
 * via the workqueue/eventd.
 * Add the CPU number into the expiration time to minimize the possibility of
 * the CPUs getting into lockstep and contending for the global cache chain
 * lock.
 */
static void __cpuinit start_cpu_timer(int cpu)
{
	struct delayed_work *reap_work = &per_cpu(reap_work, cpu);

	/*
	 * When this gets called from do_initcalls via cpucache_init(),
	 * init_workqueues() has already run, so keventd will be setup
	 * at that time.
	 */
	if (keventd_up() && reap_work->work.func == NULL) {
		init_reap_node(cpu);
		INIT_DELAYED_WORK(reap_work, cache_reap);
		schedule_delayed_work_on(cpu, reap_work,
					__round_jiffies_relative(HZ, cpu));
	}
}

static struct array_cache *alloc_arraycache(int node, int entries,
					    int batchcount, gfp_t gfp)
{
	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
	struct array_cache *nc = NULL;

	nc = kmalloc_node(memsize, gfp, node);
	/*
	 * The array_cache structures contain pointers to free objects.
	 * However, when such objects are allocated or transferred to another
	 * cache the pointers are not cleared and they could be counted as
	 * valid references during a kmemleak scan. Therefore, kmemleak must
	 * not scan such objects.
	 */
	kmemleak_no_scan(nc);
	if (nc) {
		nc->avail = 0;
		nc->limit = entries;
		nc->batchcount = batchcount;
		nc->touched = 0;
		spin_lock_init(&nc->lock);
	}
	return nc;
}

/*
 * Transfer objects in one arraycache to another.
 * Locking must be handled by the caller.
 *
 * Return the number of entries transferred.
 */
static int transfer_objects(struct array_cache *to,
		struct array_cache *from, unsigned int max)
{
	/* Figure out how many entries to transfer */
	int nr = min(min(from->avail, max), to->limit - to->avail);

	if (!nr)
		return 0;

	memcpy(to->entry + to->avail, from->entry + from->avail - nr,
			sizeof(void *) * nr);

	from->avail -= nr;
	to->avail += nr;
	to->touched = 1;
	return nr;
}

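/*
 * Illustrative trace (not in the original source): with from->avail == 5,
 * max == 3 and room for at least 3 more entries in "to", nr is 3 and the
 * memcpy copies from->entry[2..4] - the top of the donor's LIFO stack -
 * onto the top of the receiver's stack:
 *
 *	from: [A B C D E]  ->  from: [A B]
 *	to:   [X Y]        ->  to:   [X Y C D E]
 *
 * Only the counters change otherwise; the object pointers themselves are
 * never touched, which keeps bulk transfers between the per-cpu arrays
 * and the per-node shared array cheap.
 */
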
#ifndef CONFIG_NUMA

#define drain_alien_cache(cachep, alien) do { } while (0)
#define reap_alien(cachep, l3) do { } while (0)

static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
{
	return (struct array_cache **)BAD_ALIEN_MAGIC;
}

static inline void free_alien_cache(struct array_cache **ac_ptr)
{
}

static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
{
	return 0;
}

static inline void *alternate_node_alloc(struct kmem_cache *cachep,
		gfp_t flags)
{
	return NULL;
}

static inline void *____cache_alloc_node(struct kmem_cache *cachep,
		 gfp_t flags, int nodeid)
{
	return NULL;
}

#else	/* CONFIG_NUMA */

static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
static void *alternate_node_alloc(struct kmem_cache *, gfp_t);

static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
{
	struct array_cache **ac_ptr;
	int memsize = sizeof(void *) * nr_node_ids;
	int i;

	if (limit > 1)
		limit = 12;
	ac_ptr = kmalloc_node(memsize, gfp, node);
	if (ac_ptr) {
		for_each_node(i) {
			if (i == node || !node_online(i)) {
				ac_ptr[i] = NULL;
				continue;
			}
			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
			if (!ac_ptr[i]) {
				for (i--; i >= 0; i--)
					kfree(ac_ptr[i]);
				kfree(ac_ptr);
				return NULL;
			}
		}
	}
	return ac_ptr;
}

static void free_alien_cache(struct array_cache **ac_ptr)
{
	int i;

	if (!ac_ptr)
		return;
	for_each_node(i)
		kfree(ac_ptr[i]);
	kfree(ac_ptr);
}

static void __drain_alien_cache(struct kmem_cache *cachep,
				struct array_cache *ac, int node)
{
	struct kmem_list3 *rl3 = cachep->nodelists[node];

	if (ac->avail) {
		spin_lock(&rl3->list_lock);
		/*
		 * Stuff objects into the remote node's shared array first.
		 * That way we could avoid the overhead of putting the objects
		 * into the free lists and getting them back later.
		 */
		if (rl3->shared)
			transfer_objects(rl3->shared, ac, ac->limit);

		free_block(cachep, ac->entry, ac->avail, node);
		ac->avail = 0;
		spin_unlock(&rl3->list_lock);
	}
}

/*
 * Called from cache_reap() to regularly drain alien caches round robin.
 */
static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
{
	int node = __get_cpu_var(reap_node);

	if (l3->alien) {
		struct array_cache *ac = l3->alien[node];

		if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
			__drain_alien_cache(cachep, ac, node);
			spin_unlock_irq(&ac->lock);
		}
	}
}

static void drain_alien_cache(struct kmem_cache *cachep,
				struct array_cache **alien)
{
	int i = 0;
	struct array_cache *ac;
	unsigned long flags;

	for_each_online_node(i) {
		ac = alien[i];
		if (ac) {
			spin_lock_irqsave(&ac->lock, flags);
			__drain_alien_cache(cachep, ac, i);
			spin_unlock_irqrestore(&ac->lock, flags);
		}
	}
}

static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
{
	struct slab *slabp = virt_to_slab(objp);
	int nodeid = slabp->nodeid;
	struct kmem_list3 *l3;
	struct array_cache *alien = NULL;
	int node;

	node = numa_node_id();

	/*
	 * Make sure we are not freeing an object from another node to the
	 * array cache on this cpu.
	 */
	if (likely(slabp->nodeid == node))
		return 0;

	l3 = cachep->nodelists[node];
	STATS_INC_NODEFREES(cachep);
	if (l3->alien && l3->alien[nodeid]) {
		alien = l3->alien[nodeid];
		spin_lock(&alien->lock);
		if (unlikely(alien->avail == alien->limit)) {
			STATS_INC_ACOVERFLOW(cachep);
			__drain_alien_cache(cachep, alien, nodeid);
		}
		alien->entry[alien->avail++] = objp;
		spin_unlock(&alien->lock);
	} else {
		spin_lock(&(cachep->nodelists[nodeid])->list_lock);
		free_block(cachep, &objp, 1, nodeid);
		spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
	}
	return 1;
}
#endif

static void __cpuinit cpuup_canceled(long cpu)
{
	struct kmem_cache *cachep;
	struct kmem_list3 *l3 = NULL;
	int node = cpu_to_node(cpu);
	const struct cpumask *mask = cpumask_of_node(node);

	list_for_each_entry(cachep, &cache_chain, next) {
		struct array_cache *nc;
		struct array_cache *shared;
		struct array_cache **alien;

		/* cpu is dead; no one can alloc from it. */
		nc = cachep->array[cpu];
		cachep->array[cpu] = NULL;
		l3 = cachep->nodelists[node];

		if (!l3)
			goto free_array_cache;

		spin_lock_irq(&l3->list_lock);

		/* Free limit for this kmem_list3 */
		l3->free_limit -= cachep->batchcount;
		if (nc)
			free_block(cachep, nc->entry, nc->avail, node);

		if (!cpus_empty(*mask)) {
			spin_unlock_irq(&l3->list_lock);
			goto free_array_cache;
		}

		shared = l3->shared;
		if (shared) {
			free_block(cachep, shared->entry,
				   shared->avail, node);
			l3->shared = NULL;
		}

		alien = l3->alien;
		l3->alien = NULL;

		spin_unlock_irq(&l3->list_lock);

		kfree(shared);
		if (alien) {
			drain_alien_cache(cachep, alien);
			free_alien_cache(alien);
		}
free_array_cache:
		kfree(nc);
	}
	/*
	 * In the previous loop, all the objects were freed to
	 * the respective cache's slabs, now we can go ahead and
	 * shrink each nodelist to its limit.
	 */
	list_for_each_entry(cachep, &cache_chain, next) {
		l3 = cachep->nodelists[node];
		if (!l3)
			continue;
		drain_freelist(cachep, l3, l3->free_objects);
	}
}

static int __cpuinit cpuup_prepare(long cpu)
{
	struct kmem_cache *cachep;
	struct kmem_list3 *l3 = NULL;
	int node = cpu_to_node(cpu);
	const int memsize = sizeof(struct kmem_list3);

	/*
	 * We need to do this right in the beginning since
	 * alloc_arraycache's are going to use this list.
	 * kmalloc_node allows us to add the slab to the right
	 * kmem_list3 and not this cpu's kmem_list3
	 */

	list_for_each_entry(cachep, &cache_chain, next) {
		/*
		 * Set up the size64 kmemlist for cpu before we can
		 * begin anything. Make sure some other cpu on this
		 * node has not already allocated this
		 */
		if (!cachep->nodelists[node]) {
			l3 = kmalloc_node(memsize, GFP_KERNEL, node);
			if (!l3)
				goto bad;
			kmem_list3_init(l3);
			l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
			    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;

			/*
			 * The l3s don't come and go as CPUs come and
			 * go.  cache_chain_mutex is sufficient
			 * protection here.
			 */
			cachep->nodelists[node] = l3;
		}

		spin_lock_irq(&cachep->nodelists[node]->list_lock);
		cachep->nodelists[node]->free_limit =
			(1 + nr_cpus_node(node)) *
			cachep->batchcount + cachep->num;
		spin_unlock_irq(&cachep->nodelists[node]->list_lock);
	}

	/*
	 * Now we can go ahead with allocating the shared arrays and
	 * array caches
	 */
	list_for_each_entry(cachep, &cache_chain, next) {
		struct array_cache *nc;
		struct array_cache *shared = NULL;
		struct array_cache **alien = NULL;

		nc = alloc_arraycache(node, cachep->limit,
					cachep->batchcount, GFP_KERNEL);
		if (!nc)
			goto bad;
		if (cachep->shared) {
			shared = alloc_arraycache(node,
				cachep->shared * cachep->batchcount,
				0xbaadf00d, GFP_KERNEL);
			if (!shared) {
				kfree(nc);
				goto bad;
			}
		}
		if (use_alien_caches) {
			alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
			if (!alien) {
				kfree(shared);
				kfree(nc);
				goto bad;
			}
		}
		cachep->array[cpu] = nc;
		l3 = cachep->nodelists[node];
		BUG_ON(!l3);

		spin_lock_irq(&l3->list_lock);
		if (!l3->shared) {
			/*
			 * We are serialised from CPU_DEAD or
			 * CPU_UP_CANCELLED by the cpucontrol lock
			 */
			l3->shared = shared;
			shared = NULL;
		}
#ifdef CONFIG_NUMA
		if (!l3->alien) {
			l3->alien = alien;
			alien = NULL;
		}
#endif
		spin_unlock_irq(&l3->list_lock);
		kfree(shared);
		free_alien_cache(alien);
	}
	return 0;
bad:
	cpuup_canceled(cpu);
	return -ENOMEM;
}

static int __cpuinit cpuup_callback(struct notifier_block *nfb,
				    unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;
	int err = 0;

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		mutex_lock(&cache_chain_mutex);
		err = cpuup_prepare(cpu);
		mutex_unlock(&cache_chain_mutex);
		break;
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		start_cpu_timer(cpu);
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		/*
		 * Shutdown cache reaper. Note that the cache_chain_mutex is
		 * held so that if cache_reap() is invoked it cannot do
		 * anything expensive but will only modify reap_work
		 * and reschedule the timer.
		 */
		cancel_rearming_delayed_work(&per_cpu(reap_work, cpu));
		/* Now the cache_reaper is guaranteed to be not running. */
		per_cpu(reap_work, cpu).work.func = NULL;
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		start_cpu_timer(cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		/*
		 * Even if all the cpus of a node are down, we don't free the
		 * kmem_list3 of any cache. This is to avoid a race between
		 * cpu_down, and a kmalloc allocation from another cpu for
		 * memory from the node of the cpu going down.  The list3
		 * structure is usually allocated from kmem_cache_create() and
		 * gets destroyed at kmem_cache_destroy().
		 */
		/* fall through */
#endif
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
		mutex_lock(&cache_chain_mutex);
		cpuup_canceled(cpu);
		mutex_unlock(&cache_chain_mutex);
		break;
	}
	return err ? NOTIFY_BAD : NOTIFY_OK;
}

static struct notifier_block __cpuinitdata cpucache_notifier = {
	&cpuup_callback, NULL, 0
};

/*
 * swap the static kmem_list3 with kmalloced memory
 */
static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
			int nodeid)
{
	struct kmem_list3 *ptr;

	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid);
	BUG_ON(!ptr);

	memcpy(ptr, list, sizeof(struct kmem_list3));
	/*
	 * Do not assume that spinlocks can be initialized via memcpy:
	 */
	spin_lock_init(&ptr->list_lock);

	MAKE_ALL_LISTS(cachep, ptr, nodeid);
	cachep->nodelists[nodeid] = ptr;
}

/*
 * For setting up all the kmem_list3s for a cache whose buffer_size is the
 * same as the size of kmem_list3.
 */
static void __init set_up_list3s(struct kmem_cache *cachep, int index)
{
	int node;

	for_each_online_node(node) {
		cachep->nodelists[node] = &initkmem_list3[index + node];
		cachep->nodelists[node]->next_reap = jiffies +
		    REAPTIMEOUT_LIST3 +
		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
	}
}

/*
 * Initialisation. Called after the page allocator has been initialised and
 * before smp_init().
 */
void __init kmem_cache_init(void)
{
	size_t left_over;
	struct cache_sizes *sizes;
	struct cache_names *names;
	int i;
	int order;
	int node;

	if (num_possible_nodes() == 1) {
		use_alien_caches = 0;
		numa_platform = 0;
	}

	for (i = 0; i < NUM_INIT_LISTS; i++) {
		kmem_list3_init(&initkmem_list3[i]);
		if (i < MAX_NUMNODES)
			cache_cache.nodelists[i] = NULL;
	}
	set_up_list3s(&cache_cache, CACHE_CACHE);

	/*
	 * Fragmentation resistance on low memory - only use bigger
	 * page orders on machines with more than 32MB of memory.
	 */
	if (num_physpages > (32 << 20) >> PAGE_SHIFT)
		slab_break_gfp_order = BREAK_GFP_ORDER_HI;

	/* Bootstrap is tricky, because several objects are allocated
	 * from caches that do not exist yet:
	 * 1) initialize the cache_cache cache: it contains the struct
	 *    kmem_cache structures of all caches, except cache_cache itself:
	 *    cache_cache is statically allocated.
	 *    Initially an __init data area is used for the head array and the
	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
	 *    array at the end of the bootstrap.
	 * 2) Create the first kmalloc cache.
	 *    The struct kmem_cache for the new cache is allocated normally.
	 *    An __init data area is used for the head array.
	 * 3) Create the remaining kmalloc caches, with minimally sized
	 *    head arrays.
	 * 4) Replace the __init data head arrays for cache_cache and the first
	 *    kmalloc cache with kmalloc allocated arrays.
	 * 5) Replace the __init data for kmem_list3 for cache_cache and
	 *    the other caches with kmalloc allocated memory.
	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
	 */
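
	/*
	 * Illustrative note (not in the original source): these steps are
	 * mirrored, roughly, by the g_cpucache_up state machine defined
	 * above:
	 *
	 *	NONE       - nothing usable yet (steps 1-2 in progress)
	 *	PARTIAL_AC - the arraycache_init cache works (after step 2)
	 *	PARTIAL_L3 - the kmem_list3 cache works too (during step 3)
	 *	FULL       - enable_cpucache() may be used (after step 6)
	 *
	 * setup_cpu_cache() below consults this state to decide whether it
	 * may kmalloc its per-cpu arrays and nodelists or must still fall
	 * back to the static __initdata ones.
	 */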

	node = numa_node_id();

	/* 1) create the cache_cache */
	INIT_LIST_HEAD(&cache_chain);
	list_add(&cache_cache.next, &cache_chain);
	cache_cache.colour_off = cache_line_size();
	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
	cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];

	/*
	 * struct kmem_cache size depends on nr_node_ids, which
	 * can be less than MAX_NUMNODES.
	 */
	cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
				 nr_node_ids * sizeof(struct kmem_list3 *);
#if DEBUG
	cache_cache.obj_size = cache_cache.buffer_size;
#endif
	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
					cache_line_size());
	cache_cache.reciprocal_buffer_size =
		reciprocal_value(cache_cache.buffer_size);

	for (order = 0; order < MAX_ORDER; order++) {
		cache_estimate(order, cache_cache.buffer_size,
			cache_line_size(), 0, &left_over, &cache_cache.num);
		if (cache_cache.num)
			break;
	}
	BUG_ON(!cache_cache.num);
	cache_cache.gfporder = order;
	cache_cache.colour = left_over / cache_cache.colour_off;
	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
				      sizeof(struct slab), cache_line_size());

	/* 2+3) create the kmalloc caches */
	sizes = malloc_sizes;
	names = cache_names;

	/*
	 * Initialize the caches that provide memory for the array cache and the
	 * kmem_list3 structures first.  Without this, further allocations will
	 * bug.
	 */

	sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
					sizes[INDEX_AC].cs_size,
					ARCH_KMALLOC_MINALIGN,
					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
					NULL);

	if (INDEX_AC != INDEX_L3) {
		sizes[INDEX_L3].cs_cachep =
			kmem_cache_create(names[INDEX_L3].name,
				sizes[INDEX_L3].cs_size,
				ARCH_KMALLOC_MINALIGN,
				ARCH_KMALLOC_FLAGS|SLAB_PANIC,
				NULL);
	}

	slab_early_init = 0;

	while (sizes->cs_size != ULONG_MAX) {
		/*
		 * For performance, all the general caches are L1 aligned.
		 * This should be particularly beneficial on SMP boxes, as it
		 * eliminates "false sharing".
		 * Note for systems short on memory removing the alignment will
		 * allow tighter packing of the smaller caches.
		 */
1567 */ 1568 if (!sizes->cs_cachep) { 1569 sizes->cs_cachep = kmem_cache_create(names->name, 1570 sizes->cs_size, 1571 ARCH_KMALLOC_MINALIGN, 1572 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1573 NULL); 1574 } 1575#ifdef CONFIG_ZONE_DMA 1576 sizes->cs_dmacachep = kmem_cache_create( 1577 names->name_dma, 1578 sizes->cs_size, 1579 ARCH_KMALLOC_MINALIGN, 1580 ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| 1581 SLAB_PANIC, 1582 NULL); 1583#endif 1584 sizes++; 1585 names++; 1586 } 1587 /* 4) Replace the bootstrap head arrays */ 1588 { 1589 struct array_cache *ptr; 1590 1591 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); 1592 1593 BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); 1594 memcpy(ptr, cpu_cache_get(&cache_cache), 1595 sizeof(struct arraycache_init)); 1596 /* 1597 * Do not assume that spinlocks can be initialized via memcpy: 1598 */ 1599 spin_lock_init(&ptr->lock); 1600 1601 cache_cache.array[smp_processor_id()] = ptr; 1602 1603 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); 1604 1605 BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) 1606 != &initarray_generic.cache); 1607 memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), 1608 sizeof(struct arraycache_init)); 1609 /* 1610 * Do not assume that spinlocks can be initialized via memcpy: 1611 */ 1612 spin_lock_init(&ptr->lock); 1613 1614 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = 1615 ptr; 1616 } 1617 /* 5) Replace the bootstrap kmem_list3's */ 1618 { 1619 int nid; 1620 1621 for_each_online_node(nid) { 1622 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid); 1623 1624 init_list(malloc_sizes[INDEX_AC].cs_cachep, 1625 &initkmem_list3[SIZE_AC + nid], nid); 1626 1627 if (INDEX_AC != INDEX_L3) { 1628 init_list(malloc_sizes[INDEX_L3].cs_cachep, 1629 &initkmem_list3[SIZE_L3 + nid], nid); 1630 } 1631 } 1632 } 1633 1634 /* 6) resize the head arrays to their final sizes */ 1635 { 1636 struct kmem_cache *cachep; 1637 mutex_lock(&cache_chain_mutex); 1638 list_for_each_entry(cachep, &cache_chain, next) 1639 if (enable_cpucache(cachep, GFP_NOWAIT)) 1640 BUG(); 1641 mutex_unlock(&cache_chain_mutex); 1642 } 1643 1644 /* Annotate slab for lockdep -- annotate the malloc caches */ 1645 init_lock_keys(); 1646 1647 1648 /* Done! */ 1649 g_cpucache_up = FULL; 1650 1651 /* 1652 * Register a cpu startup notifier callback that initializes 1653 * cpu_cache_get for all new cpus 1654 */ 1655 register_cpu_notifier(&cpucache_notifier); 1656 1657 /* 1658 * The reap timers are started later, with a module init call: That part 1659 * of the kernel is not yet operational. 1660 */ 1661} 1662 1663void __init kmem_cache_init_late(void) 1664{ 1665 /* 1666 * Interrupts are enabled now so all GFP allocations are safe. 1667 */ 1668 slab_gfp_mask = __GFP_BITS_MASK; 1669} 1670 1671static int __init cpucache_init(void) 1672{ 1673 int cpu; 1674 1675 /* 1676 * Register the timers that return unneeded pages to the page allocator 1677 */ 1678 for_each_online_cpu(cpu) 1679 start_cpu_timer(cpu); 1680 return 0; 1681} 1682__initcall(cpucache_init); 1683 1684/* 1685 * Interface to system's page allocator. No need to hold the cache-lock. 1686 * 1687 * If we requested dmaable memory, we will get it. Even if we 1688 * did not request dmaable memory, we might get it, but that 1689 * would be relatively rare and ignorable. 
static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
	struct page *page;
	int nr_pages;
	int i;

#ifndef CONFIG_MMU
	/*
	 * Nommu uses slabs for process anonymous memory allocations, and thus
	 * requires __GFP_COMP to properly refcount higher order allocations
	 */
	flags |= __GFP_COMP;
#endif

	flags |= cachep->gfpflags;
	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
		flags |= __GFP_RECLAIMABLE;

	page = alloc_pages_node(nodeid, flags, cachep->gfporder);
	if (!page)
		return NULL;

	nr_pages = (1 << cachep->gfporder);
	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
		add_zone_page_state(page_zone(page),
			NR_SLAB_RECLAIMABLE, nr_pages);
	else
		add_zone_page_state(page_zone(page),
			NR_SLAB_UNRECLAIMABLE, nr_pages);
	for (i = 0; i < nr_pages; i++)
		__SetPageSlab(page + i);
	return page_address(page);
}

/*
 * Interface to system's page release.
 */
static void kmem_freepages(struct kmem_cache *cachep, void *addr)
{
	unsigned long i = (1 << cachep->gfporder);
	struct page *page = virt_to_page(addr);
	const unsigned long nr_freed = i;

	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
		sub_zone_page_state(page_zone(page),
				NR_SLAB_RECLAIMABLE, nr_freed);
	else
		sub_zone_page_state(page_zone(page),
				NR_SLAB_UNRECLAIMABLE, nr_freed);
	while (i--) {
		BUG_ON(!PageSlab(page));
		__ClearPageSlab(page);
		page++;
	}
	if (current->reclaim_state)
		current->reclaim_state->reclaimed_slab += nr_freed;
	free_pages((unsigned long)addr, cachep->gfporder);
}

static void kmem_rcu_free(struct rcu_head *head)
{
	struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
	struct kmem_cache *cachep = slab_rcu->cachep;

	kmem_freepages(cachep, slab_rcu->addr);
	if (OFF_SLAB(cachep))
		kmem_cache_free(cachep->slabp_cache, slab_rcu);
}

#if DEBUG

#ifdef CONFIG_DEBUG_PAGEALLOC
static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
			    unsigned long caller)
{
	int size = obj_size(cachep);

	addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];

	if (size < 5 * sizeof(unsigned long))
		return;

	*addr++ = 0x12345678;
	*addr++ = caller;
	*addr++ = smp_processor_id();
	size -= 3 * sizeof(unsigned long);
	{
		unsigned long *sptr = &caller;
		unsigned long svalue;

		while (!kstack_end(sptr)) {
			svalue = *sptr++;
			if (kernel_text_address(svalue)) {
				*addr++ = svalue;
				size -= sizeof(unsigned long);
				if (size <= sizeof(unsigned long))
					break;
			}
		}

	}
	*addr++ = 0x87654321;
}
#endif

static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
{
	int size = obj_size(cachep);
	addr = &((char *)addr)[obj_offset(cachep)];

	memset(addr, val, size);
	*(unsigned char *)(addr + size - 1) = POISON_END;
}

static void dump_line(char *data, int offset, int limit)
{
	int i;
	unsigned char error = 0;
	int bad_count = 0;

	printk(KERN_ERR "%03x:", offset);
	for (i = 0; i < limit; i++) {
		if (data[offset + i] != POISON_FREE) {
			error = data[offset + i];
			bad_count++;
		}
		printk(" %02x", (unsigned char)data[offset + i]);
	}
	printk("\n");

	if (bad_count == 1) {
		error ^= POISON_FREE;
		if (!(error & (error - 1))) {
			printk(KERN_ERR "Single bit error detected. Probably "
					"bad RAM.\n");
#ifdef CONFIG_X86
			printk(KERN_ERR "Run memtest86+ or a similar memory "
					"test tool.\n");
#else
			printk(KERN_ERR "Run a memory test tool.\n");
#endif
		}
	}
}
#endif

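/*
 * Worked example (illustrative, not in the original source): POISON_FREE
 * is 0x6b. If exactly one byte in a dumped line deviates, say 0x6a is
 * read back, then error ^= POISON_FREE leaves 0x6a ^ 0x6b = 0x01, and
 * the power-of-two test !(error & (error - 1)) succeeds because a single
 * flipped bit leaves exactly one bit set - hence the "probably bad RAM"
 * diagnosis above. A software overwrite (e.g. a 0x00 byte, giving
 * error = 0x6b) usually flips several bits and fails the test.
 */
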
#if DEBUG

static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
{
	int i, size;
	char *realobj;

	if (cachep->flags & SLAB_RED_ZONE) {
		printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n",
			*dbg_redzone1(cachep, objp),
			*dbg_redzone2(cachep, objp));
	}

	if (cachep->flags & SLAB_STORE_USER) {
		printk(KERN_ERR "Last user: [<%p>]",
			*dbg_userword(cachep, objp));
		print_symbol("(%s)",
				(unsigned long)*dbg_userword(cachep, objp));
		printk("\n");
	}
	realobj = (char *)objp + obj_offset(cachep);
	size = obj_size(cachep);
	for (i = 0; i < size && lines; i += 16, lines--) {
		int limit;
		limit = 16;
		if (i + limit > size)
			limit = size - i;
		dump_line(realobj, i, limit);
	}
}

static void check_poison_obj(struct kmem_cache *cachep, void *objp)
{
	char *realobj;
	int size, i;
	int lines = 0;

	realobj = (char *)objp + obj_offset(cachep);
	size = obj_size(cachep);

	for (i = 0; i < size; i++) {
		char exp = POISON_FREE;
		if (i == size - 1)
			exp = POISON_END;
		if (realobj[i] != exp) {
			int limit;
			/* Mismatch ! */
			/* Print header */
			if (lines == 0) {
				printk(KERN_ERR
					"Slab corruption: %s start=%p, len=%d\n",
					cachep->name, realobj, size);
				print_objinfo(cachep, objp, 0);
			}
			/* Hexdump the affected line */
			i = (i / 16) * 16;
			limit = 16;
			if (i + limit > size)
				limit = size - i;
			dump_line(realobj, i, limit);
			i += 16;
			lines++;
			/* Limit to 5 lines */
			if (lines > 5)
				break;
		}
	}
	if (lines != 0) {
		/* Print some data about the neighboring objects, if they
		 * exist:
		 */
		struct slab *slabp = virt_to_slab(objp);
		unsigned int objnr;

		objnr = obj_to_index(cachep, slabp, objp);
		if (objnr) {
			objp = index_to_obj(cachep, slabp, objnr - 1);
			realobj = (char *)objp + obj_offset(cachep);
			printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
			       realobj, size);
			print_objinfo(cachep, objp, 2);
		}
		if (objnr + 1 < cachep->num) {
			objp = index_to_obj(cachep, slabp, objnr + 1);
			realobj = (char *)objp + obj_offset(cachep);
			printk(KERN_ERR "Next obj: start=%p, len=%d\n",
			       realobj, size);
			print_objinfo(cachep, objp, 2);
		}
	}
}
#endif

#if DEBUG
static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
{
	int i;
	for (i = 0; i < cachep->num; i++) {
		void *objp = index_to_obj(cachep, slabp, i);

		if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
			if (cachep->buffer_size % PAGE_SIZE == 0 &&
					OFF_SLAB(cachep))
				kernel_map_pages(virt_to_page(objp),
					cachep->buffer_size / PAGE_SIZE, 1);
			else
				check_poison_obj(cachep, objp);
#else
			check_poison_obj(cachep, objp);
#endif
		}
		if (cachep->flags & SLAB_RED_ZONE) {
			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
				slab_error(cachep, "start of a freed object "
					   "was overwritten");
			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
				slab_error(cachep, "end of a freed object "
					   "was overwritten");
		}
	}
}
#else
static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
{
}
#endif

RED_INACTIVE) 1954 slab_error(cachep, "end of a freed object " 1955 "was overwritten"); 1956 } 1957 } 1958} 1959#else 1960static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) 1961{ 1962} 1963#endif 1964 1965/** 1966 * slab_destroy - destroy and release all objects in a slab 1967 * @cachep: cache pointer being destroyed 1968 * @slabp: slab pointer being destroyed 1969 * 1970 * Destroy all the objs in a slab, and release the mem back to the system. 1971 * Before calling the slab must have been unlinked from the cache. The 1972 * cache-lock is not held/needed. 1973 */ 1974static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) 1975{ 1976 void *addr = slabp->s_mem - slabp->colouroff; 1977 1978 slab_destroy_debugcheck(cachep, slabp); 1979 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { 1980 struct slab_rcu *slab_rcu; 1981 1982 slab_rcu = (struct slab_rcu *)slabp; 1983 slab_rcu->cachep = cachep; 1984 slab_rcu->addr = addr; 1985 call_rcu(&slab_rcu->head, kmem_rcu_free); 1986 } else { 1987 kmem_freepages(cachep, addr); 1988 if (OFF_SLAB(cachep)) 1989 kmem_cache_free(cachep->slabp_cache, slabp); 1990 } 1991} 1992 1993static void __kmem_cache_destroy(struct kmem_cache *cachep) 1994{ 1995 int i; 1996 struct kmem_list3 *l3; 1997 1998 for_each_online_cpu(i) 1999 kfree(cachep->array[i]); 2000 2001 /* NUMA: free the list3 structures */ 2002 for_each_online_node(i) { 2003 l3 = cachep->nodelists[i]; 2004 if (l3) { 2005 kfree(l3->shared); 2006 free_alien_cache(l3->alien); 2007 kfree(l3); 2008 } 2009 } 2010 kmem_cache_free(&cache_cache, cachep); 2011} 2012 2013 2014/** 2015 * calculate_slab_order - calculate size (page order) of slabs 2016 * @cachep: pointer to the cache that is being created 2017 * @size: size of objects to be created in this cache. 2018 * @align: required alignment for the objects. 2019 * @flags: slab allocation flags 2020 * 2021 * Also calculates the number of objects per slab. 2022 * 2023 * This could be made much more intelligent. For now, try to avoid using 2024 * high order pages for slabs. When the gfp() functions are more friendly 2025 * towards high-order requests, this should be changed. 2026 */ 2027static size_t calculate_slab_order(struct kmem_cache *cachep, 2028 size_t size, size_t align, unsigned long flags) 2029{ 2030 unsigned long offslab_limit; 2031 size_t left_over = 0; 2032 int gfporder; 2033 2034 for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) { 2035 unsigned int num; 2036 size_t remainder; 2037 2038 cache_estimate(gfporder, size, align, flags, &remainder, &num); 2039 if (!num) 2040 continue; 2041 2042 if (flags & CFLGS_OFF_SLAB) { 2043 /* 2044 * Max number of objs-per-slab for caches which 2045 * use off-slab slabs. Needed to avoid a possible 2046 * looping condition in cache_grow(). 2047 */ 2048 offslab_limit = size - sizeof(struct slab); 2049 offslab_limit /= sizeof(kmem_bufctl_t); 2050 2051 if (num > offslab_limit) 2052 break; 2053 } 2054 2055 /* Found something acceptable - save it away */ 2056 cachep->num = num; 2057 cachep->gfporder = gfporder; 2058 left_over = remainder; 2059 2060 /* 2061 * A VFS-reclaimable slab tends to have most allocations 2062 * as GFP_NOFS and we really don't want to have to be allocating 2063 * higher-order pages when we are unable to shrink dcache. 2064 */ 2065 if (flags & SLAB_RECLAIM_ACCOUNT) 2066 break; 2067 2068 /* 2069 * Large number of objects is good, but very large slabs are 2070 * currently bad for the gfp()s. 
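		 *
		 * (The waste check further below accepts an order once
		 * left_over * 8 <= (PAGE_SIZE << gfporder), i.e. at most
		 * 1/8 of the slab wasted.  Illustratively, assuming
		 * PAGE_SIZE == 4096 and ignoring management overhead:
		 * 700 byte objects at order 0 leave 596 bytes over, and
		 * 596 * 8 > 4096 fails the test, while order 1 fits 11
		 * objects leaving 492 bytes, and 492 * 8 <= 8192 passes.)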
2071		 */
2072		if (gfporder >= slab_break_gfp_order)
2073			break;
2074
2075		/*
2076		 * Acceptable internal fragmentation?
2077		 */
2078		if (left_over * 8 <= (PAGE_SIZE << gfporder))
2079			break;
2080	}
2081	return left_over;
2082}

2083
2084static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2085{
2086	if (g_cpucache_up == FULL)
2087		return enable_cpucache(cachep, gfp);
2088
2089	if (g_cpucache_up == NONE) {
2090		/*
2091		 * Note: the first kmem_cache_create must create the cache
2092		 * that's used by kmalloc(24), otherwise the creation of
2093		 * further caches will BUG().
2094		 */
2095		cachep->array[smp_processor_id()] = &initarray_generic.cache;
2096
2097		/*
2098		 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
2099		 * the first cache, then we need to set up all its list3s,
2100		 * otherwise the creation of further caches will BUG().
2101		 */
2102		set_up_list3s(cachep, SIZE_AC);
2103		if (INDEX_AC == INDEX_L3)
2104			g_cpucache_up = PARTIAL_L3;
2105		else
2106			g_cpucache_up = PARTIAL_AC;
2107	} else {
2108		cachep->array[smp_processor_id()] =
2109			kmalloc(sizeof(struct arraycache_init), gfp);
2110
2111		if (g_cpucache_up == PARTIAL_AC) {
2112			set_up_list3s(cachep, SIZE_L3);
2113			g_cpucache_up = PARTIAL_L3;
2114		} else {
2115			int node;
2116			for_each_online_node(node) {
2117				cachep->nodelists[node] =
2118				    kmalloc_node(sizeof(struct kmem_list3),
2119						gfp, node);
2120				BUG_ON(!cachep->nodelists[node]);
2121				kmem_list3_init(cachep->nodelists[node]);
2122			}
2123		}
2124	}
2125	cachep->nodelists[numa_node_id()]->next_reap =
2126			jiffies + REAPTIMEOUT_LIST3 +
2127			((unsigned long)cachep) % REAPTIMEOUT_LIST3;
2128
2129	cpu_cache_get(cachep)->avail = 0;
2130	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
2131	cpu_cache_get(cachep)->batchcount = 1;
2132	cpu_cache_get(cachep)->touched = 0;
2133	cachep->batchcount = 1;
2134	cachep->limit = BOOT_CPUCACHE_ENTRIES;
2135	return 0;
2136}
2137
2138/**
2139 * kmem_cache_create - Create a cache.
2140 * @name: A string which is used in /proc/slabinfo to identify this cache.
2141 * @size: The size of objects to be created in this cache.
2142 * @align: The required alignment for the objects.
2143 * @flags: SLAB flags
2144 * @ctor: A constructor for the objects.
2145 *
2146 * Returns a ptr to the cache on success, NULL on failure.
2147 * Cannot be called from interrupt context, but can be interrupted.
2148 * The @ctor is run when new pages are allocated by the cache.
2149 *
2150 * @name must be valid until the cache is destroyed. This implies that
2151 * the module calling this has to destroy the cache before getting unloaded.
2152 * Note that kmem_cache_name() is not guaranteed to return the same pointer,
2153 * therefore applications must manage it themselves.
2154 *
2155 * The flags are
2156 *
2157 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
2158 * to catch references to uninitialised memory.
2159 *
2160 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
2161 * for buffer overruns.
2162 *
2163 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
2164 * cacheline. This can be beneficial if you're counting cycles as closely
2165 * as davem.
2166 */
2167struct kmem_cache *
2168kmem_cache_create (const char *name, size_t size, size_t align,
2169	unsigned long flags, void (*ctor)(void *))
2170{
2171	size_t left_over, slab_size, ralign;
2172	struct kmem_cache *cachep = NULL, *pc;
2173	gfp_t gfp;
2174
2175	/*
2176	 * Sanity checks... these are all serious usage bugs.
2177
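	 *
	 * (E.g. calling this from interrupt context, passing a NULL name,
	 * or asking for objects smaller than a word or larger than
	 * KMALLOC_MAX_SIZE all trip the BUG() below.)
	 *
	 * For contrast, a well-formed call, sketched with a hypothetical
	 * struct foo and constructor foo_ctor:
	 *
	 *	foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
	 *				      SLAB_HWCACHE_ALIGN, foo_ctor);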
2177 */ 2178 if (!name || in_interrupt() || (size < BYTES_PER_WORD) || 2179 size > KMALLOC_MAX_SIZE) { 2180 printk(KERN_ERR "%s: Early error in slab %s\n", __func__, 2181 name); 2182 BUG(); 2183 } 2184 2185 /* 2186 * We use cache_chain_mutex to ensure a consistent view of 2187 * cpu_online_mask as well. Please see cpuup_callback 2188 */ 2189 if (slab_is_available()) { 2190 get_online_cpus(); 2191 mutex_lock(&cache_chain_mutex); 2192 } 2193 2194 list_for_each_entry(pc, &cache_chain, next) { 2195 char tmp; 2196 int res; 2197 2198 /* 2199 * This happens when the module gets unloaded and doesn't 2200 * destroy its slab cache and no-one else reuses the vmalloc 2201 * area of the module. Print a warning. 2202 */ 2203 res = probe_kernel_address(pc->name, tmp); 2204 if (res) { 2205 printk(KERN_ERR 2206 "SLAB: cache with size %d has lost its name\n", 2207 pc->buffer_size); 2208 continue; 2209 } 2210 2211 if (!strcmp(pc->name, name)) { 2212 printk(KERN_ERR 2213 "kmem_cache_create: duplicate cache %s\n", name); 2214 dump_stack(); 2215 goto oops; 2216 } 2217 } 2218 2219#if DEBUG 2220 WARN_ON(strchr(name, ' ')); /* It confuses parsers */ 2221#if FORCED_DEBUG 2222 /* 2223 * Enable redzoning and last user accounting, except for caches with 2224 * large objects, if the increased size would increase the object size 2225 * above the next power of two: caches with object sizes just above a 2226 * power of two have a significant amount of internal fragmentation. 2227 */ 2228 if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + 2229 2 * sizeof(unsigned long long))) 2230 flags |= SLAB_RED_ZONE | SLAB_STORE_USER; 2231 if (!(flags & SLAB_DESTROY_BY_RCU)) 2232 flags |= SLAB_POISON; 2233#endif 2234 if (flags & SLAB_DESTROY_BY_RCU) 2235 BUG_ON(flags & SLAB_POISON); 2236#endif 2237 /* 2238 * Always checks flags, a caller might be expecting debug support which 2239 * isn't available. 2240 */ 2241 BUG_ON(flags & ~CREATE_MASK); 2242 2243 /* 2244 * Check that size is in terms of words. This is needed to avoid 2245 * unaligned accesses for some archs when redzoning is used, and makes 2246 * sure any on-slab bufctl's are also correctly aligned. 2247 */ 2248 if (size & (BYTES_PER_WORD - 1)) { 2249 size += (BYTES_PER_WORD - 1); 2250 size &= ~(BYTES_PER_WORD - 1); 2251 } 2252 2253 /* calculate the final buffer alignment: */ 2254 2255 /* 1) arch recommendation: can be overridden for debug */ 2256 if (flags & SLAB_HWCACHE_ALIGN) { 2257 /* 2258 * Default alignment: as specified by the arch code. Except if 2259 * an object is really small, then squeeze multiple objects into 2260 * one cacheline. 2261 */ 2262 ralign = cache_line_size(); 2263 while (size <= ralign / 2) 2264 ralign /= 2; 2265 } else { 2266 ralign = BYTES_PER_WORD; 2267 } 2268 2269 /* 2270 * Redzoning and user store require word alignment or possibly larger. 2271 * Note this will be overridden by architecture or caller mandated 2272 * alignment if either is greater than BYTES_PER_WORD. 2273 */ 2274 if (flags & SLAB_STORE_USER) 2275 ralign = BYTES_PER_WORD; 2276 2277 if (flags & SLAB_RED_ZONE) { 2278 ralign = REDZONE_ALIGN; 2279 /* If redzoning, ensure that the second redzone is suitably 2280 * aligned, by adjusting the object size accordingly. 
*/ 2281 size += REDZONE_ALIGN - 1; 2282 size &= ~(REDZONE_ALIGN - 1); 2283 } 2284 2285 /* 2) arch mandated alignment */ 2286 if (ralign < ARCH_SLAB_MINALIGN) { 2287 ralign = ARCH_SLAB_MINALIGN; 2288 } 2289 /* 3) caller mandated alignment */ 2290 if (ralign < align) { 2291 ralign = align; 2292 } 2293 /* disable debug if necessary */ 2294 if (ralign > __alignof__(unsigned long long)) 2295 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2296 /* 2297 * 4) Store it. 2298 */ 2299 align = ralign; 2300 2301 if (slab_is_available()) 2302 gfp = GFP_KERNEL; 2303 else 2304 gfp = GFP_NOWAIT; 2305 2306 /* Get cache's description obj. */ 2307 cachep = kmem_cache_zalloc(&cache_cache, gfp); 2308 if (!cachep) 2309 goto oops; 2310 2311#if DEBUG 2312 cachep->obj_size = size; 2313 2314 /* 2315 * Both debugging options require word-alignment which is calculated 2316 * into align above. 2317 */ 2318 if (flags & SLAB_RED_ZONE) { 2319 /* add space for red zone words */ 2320 cachep->obj_offset += sizeof(unsigned long long); 2321 size += 2 * sizeof(unsigned long long); 2322 } 2323 if (flags & SLAB_STORE_USER) { 2324 /* user store requires one word storage behind the end of 2325 * the real object. But if the second red zone needs to be 2326 * aligned to 64 bits, we must allow that much space. 2327 */ 2328 if (flags & SLAB_RED_ZONE) 2329 size += REDZONE_ALIGN; 2330 else 2331 size += BYTES_PER_WORD; 2332 } 2333#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 2334 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size 2335 && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) { 2336 cachep->obj_offset += PAGE_SIZE - size; 2337 size = PAGE_SIZE; 2338 } 2339#endif 2340#endif 2341 2342 /* 2343 * Determine if the slab management is 'on' or 'off' slab. 2344 * (bootstrapping cannot cope with offslab caches so don't do 2345 * it too early on.) 2346 */ 2347 if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init) 2348 /* 2349 * Size is large, assume best to place the slab management obj 2350 * off-slab (should allow better packing of objs). 2351 */ 2352 flags |= CFLGS_OFF_SLAB; 2353 2354 size = ALIGN(size, align); 2355 2356 left_over = calculate_slab_order(cachep, size, align, flags); 2357 2358 if (!cachep->num) { 2359 printk(KERN_ERR 2360 "kmem_cache_create: couldn't create cache %s.\n", name); 2361 kmem_cache_free(&cache_cache, cachep); 2362 cachep = NULL; 2363 goto oops; 2364 } 2365 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) 2366 + sizeof(struct slab), align); 2367 2368 /* 2369 * If the slab has been placed off-slab, and we have enough space then 2370 * move it on-slab. This is at the expense of any extra colouring. 2371 */ 2372 if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) { 2373 flags &= ~CFLGS_OFF_SLAB; 2374 left_over -= slab_size; 2375 } 2376 2377 if (flags & CFLGS_OFF_SLAB) { 2378 /* really off slab. No need for manual alignment */ 2379 slab_size = 2380 cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); 2381 } 2382 2383 cachep->colour_off = cache_line_size(); 2384 /* Offset must be a multiple of the alignment. 
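	 * (Illustratively: 128 bytes of left_over with a 64 byte
	 * colour_off gives cachep->colour == 2, so successive slabs
	 * start their objects at offsets 0, 64, 0, 64, ... staggering
	 * them across hardware cache lines.)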
*/ 2385 if (cachep->colour_off < align) 2386 cachep->colour_off = align; 2387 cachep->colour = left_over / cachep->colour_off; 2388 cachep->slab_size = slab_size; 2389 cachep->flags = flags; 2390 cachep->gfpflags = 0; 2391 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) 2392 cachep->gfpflags |= GFP_DMA; 2393 cachep->buffer_size = size; 2394 cachep->reciprocal_buffer_size = reciprocal_value(size); 2395 2396 if (flags & CFLGS_OFF_SLAB) { 2397 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); 2398 /* 2399 * This is a possibility for one of the malloc_sizes caches. 2400 * But since we go off slab only for object size greater than 2401 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order, 2402 * this should not happen at all. 2403 * But leave a BUG_ON for some lucky dude. 2404 */ 2405 BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); 2406 } 2407 cachep->ctor = ctor; 2408 cachep->name = name; 2409 2410 if (setup_cpu_cache(cachep, gfp)) { 2411 __kmem_cache_destroy(cachep); 2412 cachep = NULL; 2413 goto oops; 2414 } 2415 2416 /* cache setup completed, link it into the list */ 2417 list_add(&cachep->next, &cache_chain); 2418oops: 2419 if (!cachep && (flags & SLAB_PANIC)) 2420 panic("kmem_cache_create(): failed to create slab `%s'\n", 2421 name); 2422 if (slab_is_available()) { 2423 mutex_unlock(&cache_chain_mutex); 2424 put_online_cpus(); 2425 } 2426 return cachep; 2427} 2428EXPORT_SYMBOL(kmem_cache_create); 2429 2430#if DEBUG 2431static void check_irq_off(void) 2432{ 2433 BUG_ON(!irqs_disabled()); 2434} 2435 2436static void check_irq_on(void) 2437{ 2438 BUG_ON(irqs_disabled()); 2439} 2440 2441static void check_spinlock_acquired(struct kmem_cache *cachep) 2442{ 2443#ifdef CONFIG_SMP 2444 check_irq_off(); 2445 assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock); 2446#endif 2447} 2448 2449static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) 2450{ 2451#ifdef CONFIG_SMP 2452 check_irq_off(); 2453 assert_spin_locked(&cachep->nodelists[node]->list_lock); 2454#endif 2455} 2456 2457#else 2458#define check_irq_off() do { } while(0) 2459#define check_irq_on() do { } while(0) 2460#define check_spinlock_acquired(x) do { } while(0) 2461#define check_spinlock_acquired_node(x, y) do { } while(0) 2462#endif 2463 2464static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, 2465 struct array_cache *ac, 2466 int force, int node); 2467 2468static void do_drain(void *arg) 2469{ 2470 struct kmem_cache *cachep = arg; 2471 struct array_cache *ac; 2472 int node = numa_node_id(); 2473 2474 check_irq_off(); 2475 ac = cpu_cache_get(cachep); 2476 spin_lock(&cachep->nodelists[node]->list_lock); 2477 free_block(cachep, ac->entry, ac->avail, node); 2478 spin_unlock(&cachep->nodelists[node]->list_lock); 2479 ac->avail = 0; 2480} 2481 2482static void drain_cpu_caches(struct kmem_cache *cachep) 2483{ 2484 struct kmem_list3 *l3; 2485 int node; 2486 2487 on_each_cpu(do_drain, cachep, 1); 2488 check_irq_on(); 2489 for_each_online_node(node) { 2490 l3 = cachep->nodelists[node]; 2491 if (l3 && l3->alien) 2492 drain_alien_cache(cachep, l3->alien); 2493 } 2494 2495 for_each_online_node(node) { 2496 l3 = cachep->nodelists[node]; 2497 if (l3) 2498 drain_array(cachep, l3, l3->shared, 1, node); 2499 } 2500} 2501 2502/* 2503 * Remove slabs from the list of free slabs. 2504 * Specify the number of slabs to drain in tofree. 2505 * 2506 * Returns the actual number of slabs released. 
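 *
 * (Callers that want the free list emptied completely pass an
 * over-estimate; __cache_shrink() below uses l3->free_objects as
 * tofree.)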
2507 */
2508static int drain_freelist(struct kmem_cache *cache,
2509			struct kmem_list3 *l3, int tofree)
2510{
2511	struct list_head *p;
2512	int nr_freed;
2513	struct slab *slabp;
2514
2515	nr_freed = 0;
2516	while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {
2517
2518		spin_lock_irq(&l3->list_lock);
2519		p = l3->slabs_free.prev;
2520		if (p == &l3->slabs_free) {
2521			spin_unlock_irq(&l3->list_lock);
2522			goto out;
2523		}
2524
2525		slabp = list_entry(p, struct slab, list);
2526#if DEBUG
2527		BUG_ON(slabp->inuse);
2528#endif
2529		list_del(&slabp->list);
2530		/*
2531		 * Safe to drop the lock. The slab is no longer linked
2532		 * to the cache.
2533		 */
2534		l3->free_objects -= cache->num;
2535		spin_unlock_irq(&l3->list_lock);
2536		slab_destroy(cache, slabp);
2537		nr_freed++;
2538	}
2539out:
2540	return nr_freed;
2541}
2542
2543/* Called with cache_chain_mutex held to protect against cpu hotplug */
2544static int __cache_shrink(struct kmem_cache *cachep)
2545{
2546	int ret = 0, i = 0;
2547	struct kmem_list3 *l3;
2548
2549	drain_cpu_caches(cachep);
2550
2551	check_irq_on();
2552	for_each_online_node(i) {
2553		l3 = cachep->nodelists[i];
2554		if (!l3)
2555			continue;
2556
2557		drain_freelist(cachep, l3, l3->free_objects);
2558
2559		ret += !list_empty(&l3->slabs_full) ||
2560			!list_empty(&l3->slabs_partial);
2561	}
2562	return (ret ? 1 : 0);
2563}
2564
2565/**
2566 * kmem_cache_shrink - Shrink a cache.
2567 * @cachep: The cache to shrink.
2568 *
2569 * Releases as many slabs as possible for a cache.
2570 * To help debugging, a zero exit status indicates all slabs were released.
2571 */
2572int kmem_cache_shrink(struct kmem_cache *cachep)
2573{
2574	int ret;
2575	BUG_ON(!cachep || in_interrupt());
2576
2577	get_online_cpus();
2578	mutex_lock(&cache_chain_mutex);
2579	ret = __cache_shrink(cachep);
2580	mutex_unlock(&cache_chain_mutex);
2581	put_online_cpus();
2582	return ret;
2583}
2584EXPORT_SYMBOL(kmem_cache_shrink);
2585
2586/**
2587 * kmem_cache_destroy - delete a cache
2588 * @cachep: the cache to destroy
2589 *
2590 * Remove a &struct kmem_cache object from the slab cache.
2591 *
2592 * It is expected this function will be called by a module when it is
2593 * unloaded. This will remove the cache completely, and avoid a duplicate
2594 * cache being allocated each time a module is loaded and unloaded, if the
2595 * module doesn't have persistent in-kernel storage across loads and unloads.
2596 *
2597 * The cache must be empty before calling this function.
2598 *
2599 * The caller must guarantee that no one will allocate memory from the cache
2600 * during the kmem_cache_destroy().
2601 */
2602void kmem_cache_destroy(struct kmem_cache *cachep)
2603{
2604	BUG_ON(!cachep || in_interrupt());
2605
2606	/* Find the cache in the chain of caches. */
2607	get_online_cpus();
2608	mutex_lock(&cache_chain_mutex);
2609	/*
2610	 * the chain is never empty, cache_cache is never destroyed
2611	 */
2612	list_del(&cachep->next);
2613	if (__cache_shrink(cachep)) {
2614		slab_error(cachep, "Can't free all objects");
2615		list_add(&cachep->next, &cache_chain);
2616		mutex_unlock(&cache_chain_mutex);
2617		put_online_cpus();
2618		return;
2619	}
2620
2621	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2622		synchronize_rcu();
2623
2624	__kmem_cache_destroy(cachep);
2625	mutex_unlock(&cache_chain_mutex);
2626	put_online_cpus();
2627}
2628EXPORT_SYMBOL(kmem_cache_destroy);
2629
2630/*
2631 * Get the memory for a slab management obj.
2632 * For a slab cache when the slab descriptor is off-slab, slab descriptors 2633 * always come from malloc_sizes caches. The slab descriptor cannot 2634 * come from the same cache which is getting created because, 2635 * when we are searching for an appropriate cache for these 2636 * descriptors in kmem_cache_create, we search through the malloc_sizes array. 2637 * If we are creating a malloc_sizes cache here it would not be visible to 2638 * kmem_find_general_cachep till the initialization is complete. 2639 * Hence we cannot have slabp_cache same as the original cache. 2640 */ 2641static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, 2642 int colour_off, gfp_t local_flags, 2643 int nodeid) 2644{ 2645 struct slab *slabp; 2646 2647 if (OFF_SLAB(cachep)) { 2648 /* Slab management obj is off-slab. */ 2649 slabp = kmem_cache_alloc_node(cachep->slabp_cache, 2650 local_flags, nodeid); 2651 /* 2652 * If the first object in the slab is leaked (it's allocated 2653 * but no one has a reference to it), we want to make sure 2654 * kmemleak does not treat the ->s_mem pointer as a reference 2655 * to the object. Otherwise we will not report the leak. 2656 */ 2657 kmemleak_scan_area(slabp, offsetof(struct slab, list), 2658 sizeof(struct list_head), local_flags); 2659 if (!slabp) 2660 return NULL; 2661 } else { 2662 slabp = objp + colour_off; 2663 colour_off += cachep->slab_size; 2664 } 2665 slabp->inuse = 0; 2666 slabp->colouroff = colour_off; 2667 slabp->s_mem = objp + colour_off; 2668 slabp->nodeid = nodeid; 2669 slabp->free = 0; 2670 return slabp; 2671} 2672 2673static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) 2674{ 2675 return (kmem_bufctl_t *) (slabp + 1); 2676} 2677 2678static void cache_init_objs(struct kmem_cache *cachep, 2679 struct slab *slabp) 2680{ 2681 int i; 2682 2683 for (i = 0; i < cachep->num; i++) { 2684 void *objp = index_to_obj(cachep, slabp, i); 2685#if DEBUG 2686 /* need to poison the objs? */ 2687 if (cachep->flags & SLAB_POISON) 2688 poison_obj(cachep, objp, POISON_FREE); 2689 if (cachep->flags & SLAB_STORE_USER) 2690 *dbg_userword(cachep, objp) = NULL; 2691 2692 if (cachep->flags & SLAB_RED_ZONE) { 2693 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2694 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2695 } 2696 /* 2697 * Constructors are not allowed to allocate memory from the same 2698 * cache which they are a constructor for. Otherwise, deadlock. 2699 * They must also be threaded. 
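		 *
		 * A minimal constructor, sketched here for a hypothetical
		 * struct foo (it must match the void (*ctor)(void *)
		 * signature used by this allocator):
		 *
		 *	static void foo_ctor(void *obj)
		 *	{
		 *		struct foo *f = obj;
		 *
		 *		spin_lock_init(&f->lock);
		 *		INIT_LIST_HEAD(&f->list);
		 *	}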
2700 */ 2701 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2702 cachep->ctor(objp + obj_offset(cachep)); 2703 2704 if (cachep->flags & SLAB_RED_ZONE) { 2705 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2706 slab_error(cachep, "constructor overwrote the" 2707 " end of an object"); 2708 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 2709 slab_error(cachep, "constructor overwrote the" 2710 " start of an object"); 2711 } 2712 if ((cachep->buffer_size % PAGE_SIZE) == 0 && 2713 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2714 kernel_map_pages(virt_to_page(objp), 2715 cachep->buffer_size / PAGE_SIZE, 0); 2716#else 2717 if (cachep->ctor) 2718 cachep->ctor(objp); 2719#endif 2720 slab_bufctl(slabp)[i] = i + 1; 2721 } 2722 slab_bufctl(slabp)[i - 1] = BUFCTL_END; 2723} 2724 2725static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) 2726{ 2727 if (CONFIG_ZONE_DMA_FLAG) { 2728 if (flags & GFP_DMA) 2729 BUG_ON(!(cachep->gfpflags & GFP_DMA)); 2730 else 2731 BUG_ON(cachep->gfpflags & GFP_DMA); 2732 } 2733} 2734 2735static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, 2736 int nodeid) 2737{ 2738 void *objp = index_to_obj(cachep, slabp, slabp->free); 2739 kmem_bufctl_t next; 2740 2741 slabp->inuse++; 2742 next = slab_bufctl(slabp)[slabp->free]; 2743#if DEBUG 2744 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2745 WARN_ON(slabp->nodeid != nodeid); 2746#endif 2747 slabp->free = next; 2748 2749 return objp; 2750} 2751 2752static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, 2753 void *objp, int nodeid) 2754{ 2755 unsigned int objnr = obj_to_index(cachep, slabp, objp); 2756 2757#if DEBUG 2758 /* Verify that the slab belongs to the intended node */ 2759 WARN_ON(slabp->nodeid != nodeid); 2760 2761 if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) { 2762 printk(KERN_ERR "slab: double free detected in cache " 2763 "'%s', objp %p\n", cachep->name, objp); 2764 BUG(); 2765 } 2766#endif 2767 slab_bufctl(slabp)[objnr] = slabp->free; 2768 slabp->free = objnr; 2769 slabp->inuse--; 2770} 2771 2772/* 2773 * Map pages beginning at addr to the given cache and slab. This is required 2774 * for the slab allocator to be able to lookup the cache and slab of a 2775 * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging. 2776 */ 2777static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, 2778 void *addr) 2779{ 2780 int nr_pages; 2781 struct page *page; 2782 2783 page = virt_to_page(addr); 2784 2785 nr_pages = 1; 2786 if (likely(!PageCompound(page))) 2787 nr_pages <<= cache->gfporder; 2788 2789 do { 2790 page_set_cache(page, cache); 2791 page_set_slab(page, slab); 2792 page++; 2793 } while (--nr_pages); 2794} 2795 2796/* 2797 * Grow (by 1) the number of slabs within a cache. This is called by 2798 * kmem_cache_alloc() when there are no active objs left in a cache. 2799 */ 2800static int cache_grow(struct kmem_cache *cachep, 2801 gfp_t flags, int nodeid, void *objp) 2802{ 2803 struct slab *slabp; 2804 size_t offset; 2805 gfp_t local_flags; 2806 struct kmem_list3 *l3; 2807 2808 /* 2809 * Be lazy and only check for valid flags here, keeping it out of the 2810 * critical path in kmem_cache_alloc(). 
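	 *
	 * (A typical offender would be a kmalloc(size, GFP_HIGHUSER)
	 * caller: slab memory must be directly mappable, so
	 * __GFP_HIGHMEM is part of GFP_SLAB_BUG_MASK and trips the
	 * BUG_ON() below.)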
2811 */ 2812 BUG_ON(flags & GFP_SLAB_BUG_MASK); 2813 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 2814 2815 /* Take the l3 list lock to change the colour_next on this node */ 2816 check_irq_off(); 2817 l3 = cachep->nodelists[nodeid]; 2818 spin_lock(&l3->list_lock); 2819 2820 /* Get colour for the slab, and cal the next value. */ 2821 offset = l3->colour_next; 2822 l3->colour_next++; 2823 if (l3->colour_next >= cachep->colour) 2824 l3->colour_next = 0; 2825 spin_unlock(&l3->list_lock); 2826 2827 offset *= cachep->colour_off; 2828 2829 if (local_flags & __GFP_WAIT) 2830 local_irq_enable(); 2831 2832 /* 2833 * The test for missing atomic flag is performed here, rather than 2834 * the more obvious place, simply to reduce the critical path length 2835 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they 2836 * will eventually be caught here (where it matters). 2837 */ 2838 kmem_flagcheck(cachep, flags); 2839 2840 /* 2841 * Get mem for the objs. Attempt to allocate a physical page from 2842 * 'nodeid'. 2843 */ 2844 if (!objp) 2845 objp = kmem_getpages(cachep, local_flags, nodeid); 2846 if (!objp) 2847 goto failed; 2848 2849 /* Get slab management. */ 2850 slabp = alloc_slabmgmt(cachep, objp, offset, 2851 local_flags & ~GFP_CONSTRAINT_MASK, nodeid); 2852 if (!slabp) 2853 goto opps1; 2854 2855 slab_map_pages(cachep, slabp, objp); 2856 2857 cache_init_objs(cachep, slabp); 2858 2859 if (local_flags & __GFP_WAIT) 2860 local_irq_disable(); 2861 check_irq_off(); 2862 spin_lock(&l3->list_lock); 2863 2864 /* Make slab active. */ 2865 list_add_tail(&slabp->list, &(l3->slabs_free)); 2866 STATS_INC_GROWN(cachep); 2867 l3->free_objects += cachep->num; 2868 spin_unlock(&l3->list_lock); 2869 return 1; 2870opps1: 2871 kmem_freepages(cachep, objp); 2872failed: 2873 if (local_flags & __GFP_WAIT) 2874 local_irq_disable(); 2875 return 0; 2876} 2877 2878#if DEBUG 2879 2880/* 2881 * Perform extra freeing checks: 2882 * - detect bad pointers. 2883 * - POISON/RED_ZONE checking 2884 */ 2885static void kfree_debugcheck(const void *objp) 2886{ 2887 if (!virt_addr_valid(objp)) { 2888 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", 2889 (unsigned long)objp); 2890 BUG(); 2891 } 2892} 2893 2894static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) 2895{ 2896 unsigned long long redzone1, redzone2; 2897 2898 redzone1 = *dbg_redzone1(cache, obj); 2899 redzone2 = *dbg_redzone2(cache, obj); 2900 2901 /* 2902 * Redzone is ok. 
2903 */ 2904 if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE) 2905 return; 2906 2907 if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE) 2908 slab_error(cache, "double free detected"); 2909 else 2910 slab_error(cache, "memory outside object was overwritten"); 2911 2912 printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n", 2913 obj, redzone1, redzone2); 2914} 2915 2916static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, 2917 void *caller) 2918{ 2919 struct page *page; 2920 unsigned int objnr; 2921 struct slab *slabp; 2922 2923 BUG_ON(virt_to_cache(objp) != cachep); 2924 2925 objp -= obj_offset(cachep); 2926 kfree_debugcheck(objp); 2927 page = virt_to_head_page(objp); 2928 2929 slabp = page_get_slab(page); 2930 2931 if (cachep->flags & SLAB_RED_ZONE) { 2932 verify_redzone_free(cachep, objp); 2933 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2934 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2935 } 2936 if (cachep->flags & SLAB_STORE_USER) 2937 *dbg_userword(cachep, objp) = caller; 2938 2939 objnr = obj_to_index(cachep, slabp, objp); 2940 2941 BUG_ON(objnr >= cachep->num); 2942 BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); 2943 2944#ifdef CONFIG_DEBUG_SLAB_LEAK 2945 slab_bufctl(slabp)[objnr] = BUFCTL_FREE; 2946#endif 2947 if (cachep->flags & SLAB_POISON) { 2948#ifdef CONFIG_DEBUG_PAGEALLOC 2949 if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { 2950 store_stackinfo(cachep, objp, (unsigned long)caller); 2951 kernel_map_pages(virt_to_page(objp), 2952 cachep->buffer_size / PAGE_SIZE, 0); 2953 } else { 2954 poison_obj(cachep, objp, POISON_FREE); 2955 } 2956#else 2957 poison_obj(cachep, objp, POISON_FREE); 2958#endif 2959 } 2960 return objp; 2961} 2962 2963static void check_slabp(struct kmem_cache *cachep, struct slab *slabp) 2964{ 2965 kmem_bufctl_t i; 2966 int entries = 0; 2967 2968 /* Check slab's freelist to see if this obj is there. */ 2969 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { 2970 entries++; 2971 if (entries > cachep->num || i >= cachep->num) 2972 goto bad; 2973 } 2974 if (entries != cachep->num - slabp->inuse) { 2975bad: 2976 printk(KERN_ERR "slab: Internal list corruption detected in " 2977 "cache '%s'(%d), slabp %p(%d). Hexdump:\n", 2978 cachep->name, cachep->num, slabp, slabp->inuse); 2979 for (i = 0; 2980 i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); 2981 i++) { 2982 if (i % 16 == 0) 2983 printk("\n%03x:", i); 2984 printk(" %02x", ((unsigned char *)slabp)[i]); 2985 } 2986 printk("\n"); 2987 BUG(); 2988 } 2989} 2990#else 2991#define kfree_debugcheck(x) do { } while(0) 2992#define cache_free_debugcheck(x,objp,z) (objp) 2993#define check_slabp(x,y) do { } while(0) 2994#endif 2995 2996static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) 2997{ 2998 int batchcount; 2999 struct kmem_list3 *l3; 3000 struct array_cache *ac; 3001 int node; 3002 3003retry: 3004 check_irq_off(); 3005 node = numa_node_id(); 3006 ac = cpu_cache_get(cachep); 3007 batchcount = ac->batchcount; 3008 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 3009 /* 3010 * If there was little recent activity on this cache, then 3011 * perform only a partial refill. Otherwise we could generate 3012 * refill bouncing. 
3013		 */
3014		batchcount = BATCHREFILL_LIMIT;
3015	}
3016	l3 = cachep->nodelists[node];
3017
3018	BUG_ON(ac->avail > 0 || !l3);
3019	spin_lock(&l3->list_lock);
3020
3021	/* See if we can refill from the shared array */
3022	if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
3023		goto alloc_done;
3024
3025	while (batchcount > 0) {
3026		struct list_head *entry;
3027		struct slab *slabp;
3028		/* Get the slab the allocation is to come from. */
3029		entry = l3->slabs_partial.next;
3030		if (entry == &l3->slabs_partial) {
3031			l3->free_touched = 1;
3032			entry = l3->slabs_free.next;
3033			if (entry == &l3->slabs_free)
3034				goto must_grow;
3035		}
3036
3037		slabp = list_entry(entry, struct slab, list);
3038		check_slabp(cachep, slabp);
3039		check_spinlock_acquired(cachep);
3040
3041		/*
3042		 * The slab was either on partial or free list so
3043		 * there must be at least one object available for
3044		 * allocation.
3045		 */
3046		BUG_ON(slabp->inuse >= cachep->num);
3047
3048		while (slabp->inuse < cachep->num && batchcount--) {
3049			STATS_INC_ALLOCED(cachep);
3050			STATS_INC_ACTIVE(cachep);
3051			STATS_SET_HIGH(cachep);
3052
3053			ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
3054							    node);
3055		}
3056		check_slabp(cachep, slabp);
3057
3058		/* move slabp to correct slabp list: */
3059		list_del(&slabp->list);
3060		if (slabp->free == BUFCTL_END)
3061			list_add(&slabp->list, &l3->slabs_full);
3062		else
3063			list_add(&slabp->list, &l3->slabs_partial);
3064	}
3065
3066must_grow:
3067	l3->free_objects -= ac->avail;
3068alloc_done:
3069	spin_unlock(&l3->list_lock);
3070
3071	if (unlikely(!ac->avail)) {
3072		int x;
3073		x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
3074
3075		/* cache_grow can reenable interrupts, then ac could change. */
3076		ac = cpu_cache_get(cachep);
3077		if (!x && ac->avail == 0)	/* no objects in sight? abort */
3078			return NULL;
3079
3080		if (!ac->avail)		/* objects refilled by interrupt?
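					 * (cache_grow() can run with
					 * interrupts enabled, so an IRQ may
					 * have refilled ac behind our back;
					 * retry rather than grow again)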
*/ 3081 goto retry; 3082 } 3083 ac->touched = 1; 3084 return ac->entry[--ac->avail]; 3085} 3086 3087static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, 3088 gfp_t flags) 3089{ 3090 might_sleep_if(flags & __GFP_WAIT); 3091#if DEBUG 3092 kmem_flagcheck(cachep, flags); 3093#endif 3094} 3095 3096#if DEBUG 3097static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, 3098 gfp_t flags, void *objp, void *caller) 3099{ 3100 if (!objp) 3101 return objp; 3102 if (cachep->flags & SLAB_POISON) { 3103#ifdef CONFIG_DEBUG_PAGEALLOC 3104 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 3105 kernel_map_pages(virt_to_page(objp), 3106 cachep->buffer_size / PAGE_SIZE, 1); 3107 else 3108 check_poison_obj(cachep, objp); 3109#else 3110 check_poison_obj(cachep, objp); 3111#endif 3112 poison_obj(cachep, objp, POISON_INUSE); 3113 } 3114 if (cachep->flags & SLAB_STORE_USER) 3115 *dbg_userword(cachep, objp) = caller; 3116 3117 if (cachep->flags & SLAB_RED_ZONE) { 3118 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || 3119 *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 3120 slab_error(cachep, "double free, or memory outside" 3121 " object was overwritten"); 3122 printk(KERN_ERR 3123 "%p: redzone 1:0x%llx, redzone 2:0x%llx\n", 3124 objp, *dbg_redzone1(cachep, objp), 3125 *dbg_redzone2(cachep, objp)); 3126 } 3127 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 3128 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 3129 } 3130#ifdef CONFIG_DEBUG_SLAB_LEAK 3131 { 3132 struct slab *slabp; 3133 unsigned objnr; 3134 3135 slabp = page_get_slab(virt_to_head_page(objp)); 3136 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; 3137 slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; 3138 } 3139#endif 3140 objp += obj_offset(cachep); 3141 if (cachep->ctor && cachep->flags & SLAB_POISON) 3142 cachep->ctor(objp); 3143#if ARCH_SLAB_MINALIGN 3144 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { 3145 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 3146 objp, ARCH_SLAB_MINALIGN); 3147 } 3148#endif 3149 return objp; 3150} 3151#else 3152#define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 3153#endif 3154 3155static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) 3156{ 3157 if (cachep == &cache_cache) 3158 return false; 3159 3160 return should_failslab(obj_size(cachep), flags); 3161} 3162 3163static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3164{ 3165 void *objp; 3166 struct array_cache *ac; 3167 3168 check_irq_off(); 3169 3170 ac = cpu_cache_get(cachep); 3171 if (likely(ac->avail)) { 3172 STATS_INC_ALLOCHIT(cachep); 3173 ac->touched = 1; 3174 objp = ac->entry[--ac->avail]; 3175 } else { 3176 STATS_INC_ALLOCMISS(cachep); 3177 objp = cache_alloc_refill(cachep, flags); 3178 } 3179 /* 3180 * To avoid a false negative, if an object that is in one of the 3181 * per-CPU caches is leaked, we need to make sure kmemleak doesn't 3182 * treat the array pointers as a reference to the object. 3183 */ 3184 kmemleak_erase(&ac->entry[ac->avail]); 3185 return objp; 3186} 3187 3188#ifdef CONFIG_NUMA 3189/* 3190 * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY. 3191 * 3192 * If we are in_interrupt, then process context, including cpusets and 3193 * mempolicy, may not apply and should not be used for allocation policy. 
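 *
 * (E.g. a task in a cpuset with memory_spread_slab enabled runs with
 * PF_SPREAD_SLAB set, so its slab pages are spread across the
 * cpuset's nodes via cpuset_mem_spread_node() below.)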
3194 */ 3195static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) 3196{ 3197 int nid_alloc, nid_here; 3198 3199 if (in_interrupt() || (flags & __GFP_THISNODE)) 3200 return NULL; 3201 nid_alloc = nid_here = numa_node_id(); 3202 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3203 nid_alloc = cpuset_mem_spread_node(); 3204 else if (current->mempolicy) 3205 nid_alloc = slab_node(current->mempolicy); 3206 if (nid_alloc != nid_here) 3207 return ____cache_alloc_node(cachep, flags, nid_alloc); 3208 return NULL; 3209} 3210 3211/* 3212 * Fallback function if there was no memory available and no objects on a 3213 * certain node and fall back is permitted. First we scan all the 3214 * available nodelists for available objects. If that fails then we 3215 * perform an allocation without specifying a node. This allows the page 3216 * allocator to do its reclaim / fallback magic. We then insert the 3217 * slab into the proper nodelist and then allocate from it. 3218 */ 3219static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) 3220{ 3221 struct zonelist *zonelist; 3222 gfp_t local_flags; 3223 struct zoneref *z; 3224 struct zone *zone; 3225 enum zone_type high_zoneidx = gfp_zone(flags); 3226 void *obj = NULL; 3227 int nid; 3228 3229 if (flags & __GFP_THISNODE) 3230 return NULL; 3231 3232 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 3233 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 3234 3235retry: 3236 /* 3237 * Look through allowed nodes for objects available 3238 * from existing per node queues. 3239 */ 3240 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 3241 nid = zone_to_nid(zone); 3242 3243 if (cpuset_zone_allowed_hardwall(zone, flags) && 3244 cache->nodelists[nid] && 3245 cache->nodelists[nid]->free_objects) { 3246 obj = ____cache_alloc_node(cache, 3247 flags | GFP_THISNODE, nid); 3248 if (obj) 3249 break; 3250 } 3251 } 3252 3253 if (!obj) { 3254 /* 3255 * This allocation will be performed within the constraints 3256 * of the current cpuset / memory policy requirements. 3257 * We may trigger various forms of reclaim on the allowed 3258 * set and go into memory reserves if necessary. 3259 */ 3260 if (local_flags & __GFP_WAIT) 3261 local_irq_enable(); 3262 kmem_flagcheck(cache, flags); 3263 obj = kmem_getpages(cache, local_flags, -1); 3264 if (local_flags & __GFP_WAIT) 3265 local_irq_disable(); 3266 if (obj) { 3267 /* 3268 * Insert into the appropriate per node queues 3269 */ 3270 nid = page_to_nid(virt_to_page(obj)); 3271 if (cache_grow(cache, flags, nid, obj)) { 3272 obj = ____cache_alloc_node(cache, 3273 flags | GFP_THISNODE, nid); 3274 if (!obj) 3275 /* 3276 * Another processor may allocate the 3277 * objects in the slab since we are 3278 * not holding any locks. 
3279					 */
3280					goto retry;
3281			} else {
3282				/* cache_grow already freed obj */
3283				obj = NULL;
3284			}
3285		}
3286	}
3287	return obj;
3288}
3289
3290/*
3291 * An interface for allocating objects from slabs on the given node.
3292 */
3293static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3294				int nodeid)
3295{
3296	struct list_head *entry;
3297	struct slab *slabp;
3298	struct kmem_list3 *l3;
3299	void *obj;
3300	int x;
3301
3302	l3 = cachep->nodelists[nodeid];
3303	BUG_ON(!l3);
3304
3305retry:
3306	check_irq_off();
3307	spin_lock(&l3->list_lock);
3308	entry = l3->slabs_partial.next;
3309	if (entry == &l3->slabs_partial) {
3310		l3->free_touched = 1;
3311		entry = l3->slabs_free.next;
3312		if (entry == &l3->slabs_free)
3313			goto must_grow;
3314	}
3315
3316	slabp = list_entry(entry, struct slab, list);
3317	check_spinlock_acquired_node(cachep, nodeid);
3318	check_slabp(cachep, slabp);
3319
3320	STATS_INC_NODEALLOCS(cachep);
3321	STATS_INC_ACTIVE(cachep);
3322	STATS_SET_HIGH(cachep);
3323
3324	BUG_ON(slabp->inuse == cachep->num);
3325
3326	obj = slab_get_obj(cachep, slabp, nodeid);
3327	check_slabp(cachep, slabp);
3328	l3->free_objects--;
3329	/* move slabp to correct slabp list: */
3330	list_del(&slabp->list);
3331
3332	if (slabp->free == BUFCTL_END)
3333		list_add(&slabp->list, &l3->slabs_full);
3334	else
3335		list_add(&slabp->list, &l3->slabs_partial);
3336
3337	spin_unlock(&l3->list_lock);
3338	goto done;
3339
3340must_grow:
3341	spin_unlock(&l3->list_lock);
3342	x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
3343	if (x)
3344		goto retry;
3345
3346	return fallback_alloc(cachep, flags);
3347
3348done:
3349	return obj;
3350}
3351
3352/**
3353 * kmem_cache_alloc_node - Allocate an object on the specified node
3354 * @cachep: The cache to allocate from.
3355 * @flags: See kmalloc().
3356 * @nodeid: node number of the target node.
3357 * @caller: return address of caller, used for debug information
3358 *
3359 * Identical to kmem_cache_alloc but it will allocate memory on the given
3360 * node, which can improve the performance for cpu bound structures.
3361 *
3362 * Falling back to another node is possible if __GFP_THISNODE is not set.
3363 */
3364static __always_inline void *
3365__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3366		   void *caller)
3367{
3368	unsigned long save_flags;
3369	void *ptr;
3370
3371	flags &= slab_gfp_mask;
3372
3373	lockdep_trace_alloc(flags);
3374
3375	if (slab_should_failslab(cachep, flags))
3376		return NULL;
3377
3378	cache_alloc_debugcheck_before(cachep, flags);
3379	local_irq_save(save_flags);
3380
3381	if (unlikely(nodeid == -1))
3382		nodeid = numa_node_id();
3383
3384	if (unlikely(!cachep->nodelists[nodeid])) {
3385		/* Node not bootstrapped yet */
3386		ptr = fallback_alloc(cachep, flags);
3387		goto out;
3388	}
3389
3390	if (nodeid == numa_node_id()) {
3391		/*
3392		 * Use the locally cached objects if possible.
3393		 * However ____cache_alloc does not allow fallback
3394		 * to other nodes. It may fail while we still have
3395		 * objects on other nodes available.
3396
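		 * On such a local miss we fall through to
		 * ____cache_alloc_node(), which can in turn fall back to
		 * fallback_alloc() if the node is exhausted.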
3396 */ 3397 ptr = ____cache_alloc(cachep, flags); 3398 if (ptr) 3399 goto out; 3400 } 3401 /* ___cache_alloc_node can fall back to other nodes */ 3402 ptr = ____cache_alloc_node(cachep, flags, nodeid); 3403 out: 3404 local_irq_restore(save_flags); 3405 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); 3406 kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, 3407 flags); 3408 3409 if (unlikely((flags & __GFP_ZERO) && ptr)) 3410 memset(ptr, 0, obj_size(cachep)); 3411 3412 return ptr; 3413} 3414 3415static __always_inline void * 3416__do_cache_alloc(struct kmem_cache *cache, gfp_t flags) 3417{ 3418 void *objp; 3419 3420 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { 3421 objp = alternate_node_alloc(cache, flags); 3422 if (objp) 3423 goto out; 3424 } 3425 objp = ____cache_alloc(cache, flags); 3426 3427 /* 3428 * We may just have run out of memory on the local node. 3429 * ____cache_alloc_node() knows how to locate memory on other nodes 3430 */ 3431 if (!objp) 3432 objp = ____cache_alloc_node(cache, flags, numa_node_id()); 3433 3434 out: 3435 return objp; 3436} 3437#else 3438 3439static __always_inline void * 3440__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3441{ 3442 return ____cache_alloc(cachep, flags); 3443} 3444 3445#endif /* CONFIG_NUMA */ 3446 3447static __always_inline void * 3448__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) 3449{ 3450 unsigned long save_flags; 3451 void *objp; 3452 3453 flags &= slab_gfp_mask; 3454 3455 lockdep_trace_alloc(flags); 3456 3457 if (slab_should_failslab(cachep, flags)) 3458 return NULL; 3459 3460 cache_alloc_debugcheck_before(cachep, flags); 3461 local_irq_save(save_flags); 3462 objp = __do_cache_alloc(cachep, flags); 3463 local_irq_restore(save_flags); 3464 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); 3465 kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags, 3466 flags); 3467 prefetchw(objp); 3468 3469 if (unlikely((flags & __GFP_ZERO) && objp)) 3470 memset(objp, 0, obj_size(cachep)); 3471 3472 return objp; 3473} 3474 3475/* 3476 * Caller needs to acquire correct kmem_list's list_lock 3477 */ 3478static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, 3479 int node) 3480{ 3481 int i; 3482 struct kmem_list3 *l3; 3483 3484 for (i = 0; i < nr_objects; i++) { 3485 void *objp = objpp[i]; 3486 struct slab *slabp; 3487 3488 slabp = virt_to_slab(objp); 3489 l3 = cachep->nodelists[node]; 3490 list_del(&slabp->list); 3491 check_spinlock_acquired_node(cachep, node); 3492 check_slabp(cachep, slabp); 3493 slab_put_obj(cachep, slabp, objp, node); 3494 STATS_DEC_ACTIVE(cachep); 3495 l3->free_objects++; 3496 check_slabp(cachep, slabp); 3497 3498 /* fixup slab chains */ 3499 if (slabp->inuse == 0) { 3500 if (l3->free_objects > l3->free_limit) { 3501 l3->free_objects -= cachep->num; 3502 /* No need to drop any previously held 3503 * lock here, even if we have a off-slab slab 3504 * descriptor it is guaranteed to come from 3505 * a different cache, refer to comments before 3506 * alloc_slabmgmt. 3507 */ 3508 slab_destroy(cachep, slabp); 3509 } else { 3510 list_add(&slabp->list, &l3->slabs_free); 3511 } 3512 } else { 3513 /* Unconditionally move a slab to the end of the 3514 * partial list on free - maximum time for the 3515 * other objects to be freed, too. 
3516 */ 3517 list_add_tail(&slabp->list, &l3->slabs_partial); 3518 } 3519 } 3520} 3521 3522static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) 3523{ 3524 int batchcount; 3525 struct kmem_list3 *l3; 3526 int node = numa_node_id(); 3527 3528 batchcount = ac->batchcount; 3529#if DEBUG 3530 BUG_ON(!batchcount || batchcount > ac->avail); 3531#endif 3532 check_irq_off(); 3533 l3 = cachep->nodelists[node]; 3534 spin_lock(&l3->list_lock); 3535 if (l3->shared) { 3536 struct array_cache *shared_array = l3->shared; 3537 int max = shared_array->limit - shared_array->avail; 3538 if (max) { 3539 if (batchcount > max) 3540 batchcount = max; 3541 memcpy(&(shared_array->entry[shared_array->avail]), 3542 ac->entry, sizeof(void *) * batchcount); 3543 shared_array->avail += batchcount; 3544 goto free_done; 3545 } 3546 } 3547 3548 free_block(cachep, ac->entry, batchcount, node); 3549free_done: 3550#if STATS 3551 { 3552 int i = 0; 3553 struct list_head *p; 3554 3555 p = l3->slabs_free.next; 3556 while (p != &(l3->slabs_free)) { 3557 struct slab *slabp; 3558 3559 slabp = list_entry(p, struct slab, list); 3560 BUG_ON(slabp->inuse); 3561 3562 i++; 3563 p = p->next; 3564 } 3565 STATS_SET_FREEABLE(cachep, i); 3566 } 3567#endif 3568 spin_unlock(&l3->list_lock); 3569 ac->avail -= batchcount; 3570 memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); 3571} 3572 3573/* 3574 * Release an obj back to its cache. If the obj has a constructed state, it must 3575 * be in this state _before_ it is released. Called with disabled ints. 3576 */ 3577static inline void __cache_free(struct kmem_cache *cachep, void *objp) 3578{ 3579 struct array_cache *ac = cpu_cache_get(cachep); 3580 3581 check_irq_off(); 3582 kmemleak_free_recursive(objp, cachep->flags); 3583 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 3584 3585 /* 3586 * Skip calling cache_free_alien() when the platform is not numa. 3587 * This will avoid cache misses that happen while accessing slabp (which 3588 * is per page memory reference) to get nodeid. Instead use a global 3589 * variable to skip the call, which is mostly likely to be present in 3590 * the cache. 3591 */ 3592 if (numa_platform && cache_free_alien(cachep, objp)) 3593 return; 3594 3595 if (likely(ac->avail < ac->limit)) { 3596 STATS_INC_FREEHIT(cachep); 3597 ac->entry[ac->avail++] = objp; 3598 return; 3599 } else { 3600 STATS_INC_FREEMISS(cachep); 3601 cache_flusharray(cachep, ac); 3602 ac->entry[ac->avail++] = objp; 3603 } 3604} 3605 3606/** 3607 * kmem_cache_alloc - Allocate an object 3608 * @cachep: The cache to allocate from. 3609 * @flags: See kmalloc(). 3610 * 3611 * Allocate an object from this cache. The flags are only relevant 3612 * if the cache has no available objects. 3613 */ 3614void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3615{ 3616 void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); 3617 3618 trace_kmem_cache_alloc(_RET_IP_, ret, 3619 obj_size(cachep), cachep->buffer_size, flags); 3620 3621 return ret; 3622} 3623EXPORT_SYMBOL(kmem_cache_alloc); 3624 3625#ifdef CONFIG_KMEMTRACE 3626void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) 3627{ 3628 return __cache_alloc(cachep, flags, __builtin_return_address(0)); 3629} 3630EXPORT_SYMBOL(kmem_cache_alloc_notrace); 3631#endif 3632 3633/** 3634 * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. 
3635 * @cachep: the cache we're checking against 3636 * @ptr: pointer to validate 3637 * 3638 * This verifies that the untrusted pointer looks sane; 3639 * it is _not_ a guarantee that the pointer is actually 3640 * part of the slab cache in question, but it at least 3641 * validates that the pointer can be dereferenced and 3642 * looks half-way sane. 3643 * 3644 * Currently only used for dentry validation. 3645 */ 3646int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr) 3647{ 3648 unsigned long addr = (unsigned long)ptr; 3649 unsigned long min_addr = PAGE_OFFSET; 3650 unsigned long align_mask = BYTES_PER_WORD - 1; 3651 unsigned long size = cachep->buffer_size; 3652 struct page *page; 3653 3654 if (unlikely(addr < min_addr)) 3655 goto out; 3656 if (unlikely(addr > (unsigned long)high_memory - size)) 3657 goto out; 3658 if (unlikely(addr & align_mask)) 3659 goto out; 3660 if (unlikely(!kern_addr_valid(addr))) 3661 goto out; 3662 if (unlikely(!kern_addr_valid(addr + size - 1))) 3663 goto out; 3664 page = virt_to_page(ptr); 3665 if (unlikely(!PageSlab(page))) 3666 goto out; 3667 if (unlikely(page_get_cache(page) != cachep)) 3668 goto out; 3669 return 1; 3670out: 3671 return 0; 3672} 3673 3674#ifdef CONFIG_NUMA 3675void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3676{ 3677 void *ret = __cache_alloc_node(cachep, flags, nodeid, 3678 __builtin_return_address(0)); 3679 3680 trace_kmem_cache_alloc_node(_RET_IP_, ret, 3681 obj_size(cachep), cachep->buffer_size, 3682 flags, nodeid); 3683 3684 return ret; 3685} 3686EXPORT_SYMBOL(kmem_cache_alloc_node); 3687 3688#ifdef CONFIG_KMEMTRACE 3689void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, 3690 gfp_t flags, 3691 int nodeid) 3692{ 3693 return __cache_alloc_node(cachep, flags, nodeid, 3694 __builtin_return_address(0)); 3695} 3696EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); 3697#endif 3698 3699static __always_inline void * 3700__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) 3701{ 3702 struct kmem_cache *cachep; 3703 void *ret; 3704 3705 cachep = kmem_find_general_cachep(size, flags); 3706 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3707 return cachep; 3708 ret = kmem_cache_alloc_node_notrace(cachep, flags, node); 3709 3710 trace_kmalloc_node((unsigned long) caller, ret, 3711 size, cachep->buffer_size, flags, node); 3712 3713 return ret; 3714} 3715 3716#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) 3717void *__kmalloc_node(size_t size, gfp_t flags, int node) 3718{ 3719 return __do_kmalloc_node(size, flags, node, 3720 __builtin_return_address(0)); 3721} 3722EXPORT_SYMBOL(__kmalloc_node); 3723 3724void *__kmalloc_node_track_caller(size_t size, gfp_t flags, 3725 int node, unsigned long caller) 3726{ 3727 return __do_kmalloc_node(size, flags, node, (void *)caller); 3728} 3729EXPORT_SYMBOL(__kmalloc_node_track_caller); 3730#else 3731void *__kmalloc_node(size_t size, gfp_t flags, int node) 3732{ 3733 return __do_kmalloc_node(size, flags, node, NULL); 3734} 3735EXPORT_SYMBOL(__kmalloc_node); 3736#endif /* CONFIG_DEBUG_SLAB */ 3737#endif /* CONFIG_NUMA */ 3738 3739/** 3740 * __do_kmalloc - allocate memory 3741 * @size: how many bytes of memory are required. 3742 * @flags: the type of memory to allocate (see kmalloc). 
3743 * @caller: function caller for debug tracking of the caller 3744 */ 3745static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, 3746 void *caller) 3747{ 3748 struct kmem_cache *cachep; 3749 void *ret; 3750 3751 /* If you want to save a few bytes .text space: replace 3752 * __ with kmem_. 3753 * Then kmalloc uses the uninlined functions instead of the inline 3754 * functions. 3755 */ 3756 cachep = __find_general_cachep(size, flags); 3757 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3758 return cachep; 3759 ret = __cache_alloc(cachep, flags, caller); 3760 3761 trace_kmalloc((unsigned long) caller, ret, 3762 size, cachep->buffer_size, flags); 3763 3764 return ret; 3765} 3766 3767 3768#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) 3769void *__kmalloc(size_t size, gfp_t flags) 3770{ 3771 return __do_kmalloc(size, flags, __builtin_return_address(0)); 3772} 3773EXPORT_SYMBOL(__kmalloc); 3774 3775void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) 3776{ 3777 return __do_kmalloc(size, flags, (void *)caller); 3778} 3779EXPORT_SYMBOL(__kmalloc_track_caller); 3780 3781#else 3782void *__kmalloc(size_t size, gfp_t flags) 3783{ 3784 return __do_kmalloc(size, flags, NULL); 3785} 3786EXPORT_SYMBOL(__kmalloc); 3787#endif 3788 3789/** 3790 * kmem_cache_free - Deallocate an object 3791 * @cachep: The cache the allocation was from. 3792 * @objp: The previously allocated object. 3793 * 3794 * Free an object which was previously allocated from this 3795 * cache. 3796 */ 3797void kmem_cache_free(struct kmem_cache *cachep, void *objp) 3798{ 3799 unsigned long flags; 3800 3801 local_irq_save(flags); 3802 debug_check_no_locks_freed(objp, obj_size(cachep)); 3803 if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) 3804 debug_check_no_obj_freed(objp, obj_size(cachep)); 3805 __cache_free(cachep, objp); 3806 local_irq_restore(flags); 3807 3808 trace_kmem_cache_free(_RET_IP_, objp); 3809} 3810EXPORT_SYMBOL(kmem_cache_free); 3811 3812/** 3813 * kfree - free previously allocated memory 3814 * @objp: pointer returned by kmalloc. 3815 * 3816 * If @objp is NULL, no operation is performed. 3817 * 3818 * Don't free memory not originally allocated by kmalloc() 3819 * or you will run into trouble. 3820 */ 3821void kfree(const void *objp) 3822{ 3823 struct kmem_cache *c; 3824 unsigned long flags; 3825 3826 trace_kfree(_RET_IP_, objp); 3827 3828 if (unlikely(ZERO_OR_NULL_PTR(objp))) 3829 return; 3830 local_irq_save(flags); 3831 kfree_debugcheck(objp); 3832 c = virt_to_cache(objp); 3833 debug_check_no_locks_freed(objp, obj_size(c)); 3834 debug_check_no_obj_freed(objp, obj_size(c)); 3835 __cache_free(c, (void *)objp); 3836 local_irq_restore(flags); 3837} 3838EXPORT_SYMBOL(kfree); 3839 3840unsigned int kmem_cache_size(struct kmem_cache *cachep) 3841{ 3842 return obj_size(cachep); 3843} 3844EXPORT_SYMBOL(kmem_cache_size); 3845 3846const char *kmem_cache_name(struct kmem_cache *cachep) 3847{ 3848 return cachep->name; 3849} 3850EXPORT_SYMBOL_GPL(kmem_cache_name); 3851 3852/* 3853 * This initializes kmem_list3 or resizes various caches for all nodes. 
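 *
 * (The per-node free_limit set below is
 * (1 + nr_cpus_node(node)) * batchcount + num; illustratively, a
 * 4-cpu node with a batchcount of 30 and 8 objects per slab keeps at
 * most 5 * 30 + 8 = 158 free objects before free_block() starts
 * tearing whole free slabs down.)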
3854 */ 3855static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) 3856{ 3857 int node; 3858 struct kmem_list3 *l3; 3859 struct array_cache *new_shared; 3860 struct array_cache **new_alien = NULL; 3861 3862 for_each_online_node(node) { 3863 3864 if (use_alien_caches) { 3865 new_alien = alloc_alien_cache(node, cachep->limit, gfp); 3866 if (!new_alien) 3867 goto fail; 3868 } 3869 3870 new_shared = NULL; 3871 if (cachep->shared) { 3872 new_shared = alloc_arraycache(node, 3873 cachep->shared*cachep->batchcount, 3874 0xbaadf00d, gfp); 3875 if (!new_shared) { 3876 free_alien_cache(new_alien); 3877 goto fail; 3878 } 3879 } 3880 3881 l3 = cachep->nodelists[node]; 3882 if (l3) { 3883 struct array_cache *shared = l3->shared; 3884 3885 spin_lock_irq(&l3->list_lock); 3886 3887 if (shared) 3888 free_block(cachep, shared->entry, 3889 shared->avail, node); 3890 3891 l3->shared = new_shared; 3892 if (!l3->alien) { 3893 l3->alien = new_alien; 3894 new_alien = NULL; 3895 } 3896 l3->free_limit = (1 + nr_cpus_node(node)) * 3897 cachep->batchcount + cachep->num; 3898 spin_unlock_irq(&l3->list_lock); 3899 kfree(shared); 3900 free_alien_cache(new_alien); 3901 continue; 3902 } 3903 l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node); 3904 if (!l3) { 3905 free_alien_cache(new_alien); 3906 kfree(new_shared); 3907 goto fail; 3908 } 3909 3910 kmem_list3_init(l3); 3911 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 3912 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 3913 l3->shared = new_shared; 3914 l3->alien = new_alien; 3915 l3->free_limit = (1 + nr_cpus_node(node)) * 3916 cachep->batchcount + cachep->num; 3917 cachep->nodelists[node] = l3; 3918 } 3919 return 0; 3920 3921fail: 3922 if (!cachep->next.next) { 3923 /* Cache is not active yet. Roll back what we did */ 3924 node--; 3925 while (node >= 0) { 3926 if (cachep->nodelists[node]) { 3927 l3 = cachep->nodelists[node]; 3928 3929 kfree(l3->shared); 3930 free_alien_cache(l3->alien); 3931 kfree(l3); 3932 cachep->nodelists[node] = NULL; 3933 } 3934 node--; 3935 } 3936 } 3937 return -ENOMEM; 3938} 3939 3940struct ccupdate_struct { 3941 struct kmem_cache *cachep; 3942 struct array_cache *new[NR_CPUS]; 3943}; 3944 3945static void do_ccupdate_local(void *info) 3946{ 3947 struct ccupdate_struct *new = info; 3948 struct array_cache *old; 3949 3950 check_irq_off(); 3951 old = cpu_cache_get(new->cachep); 3952 3953 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; 3954 new->new[smp_processor_id()] = old; 3955} 3956 3957/* Always called with the cache_chain_mutex held */ 3958static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 3959 int batchcount, int shared, gfp_t gfp) 3960{ 3961 struct ccupdate_struct *new; 3962 int i; 3963 3964 new = kzalloc(sizeof(*new), gfp); 3965 if (!new) 3966 return -ENOMEM; 3967 3968 for_each_online_cpu(i) { 3969 new->new[i] = alloc_arraycache(cpu_to_node(i), limit, 3970 batchcount, gfp); 3971 if (!new->new[i]) { 3972 for (i--; i >= 0; i--) 3973 kfree(new->new[i]); 3974 kfree(new); 3975 return -ENOMEM; 3976 } 3977 } 3978 new->cachep = cachep; 3979 3980 on_each_cpu(do_ccupdate_local, (void *)new, 1); 3981 3982 check_irq_on(); 3983 cachep->batchcount = batchcount; 3984 cachep->limit = limit; 3985 cachep->shared = shared; 3986 3987 for_each_online_cpu(i) { 3988 struct array_cache *ccold = new->new[i]; 3989 if (!ccold) 3990 continue; 3991 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3992 free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i)); 3993 
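		/*
		 * The retired per-cpu array was swapped out by
		 * do_ccupdate_local() above and its leftover objects have
		 * just been returned to the node lists; drop the list_lock
		 * and free the array itself.
		 */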
unsigned int kmem_cache_size(struct kmem_cache *cachep)
{
        return obj_size(cachep);
}
EXPORT_SYMBOL(kmem_cache_size);

const char *kmem_cache_name(struct kmem_cache *cachep)
{
        return cachep->name;
}
EXPORT_SYMBOL_GPL(kmem_cache_name);

/*
 * This initializes kmem_list3 or resizes various caches for all nodes.
 */
static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
{
        int node;
        struct kmem_list3 *l3;
        struct array_cache *new_shared;
        struct array_cache **new_alien = NULL;

        for_each_online_node(node) {

                if (use_alien_caches) {
                        new_alien = alloc_alien_cache(node, cachep->limit, gfp);
                        if (!new_alien)
                                goto fail;
                }

                new_shared = NULL;
                if (cachep->shared) {
                        new_shared = alloc_arraycache(node,
                                cachep->shared*cachep->batchcount,
                                        0xbaadf00d, gfp);
                        if (!new_shared) {
                                free_alien_cache(new_alien);
                                goto fail;
                        }
                }

                l3 = cachep->nodelists[node];
                if (l3) {
                        struct array_cache *shared = l3->shared;

                        spin_lock_irq(&l3->list_lock);

                        if (shared)
                                free_block(cachep, shared->entry,
                                                shared->avail, node);

                        l3->shared = new_shared;
                        if (!l3->alien) {
                                l3->alien = new_alien;
                                new_alien = NULL;
                        }
                        l3->free_limit = (1 + nr_cpus_node(node)) *
                                        cachep->batchcount + cachep->num;
                        spin_unlock_irq(&l3->list_lock);
                        kfree(shared);
                        free_alien_cache(new_alien);
                        continue;
                }
                l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node);
                if (!l3) {
                        free_alien_cache(new_alien);
                        kfree(new_shared);
                        goto fail;
                }

                kmem_list3_init(l3);
                l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
                                ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
                l3->shared = new_shared;
                l3->alien = new_alien;
                l3->free_limit = (1 + nr_cpus_node(node)) *
                                        cachep->batchcount + cachep->num;
                cachep->nodelists[node] = l3;
        }
        return 0;

fail:
        if (!cachep->next.next) {
                /* Cache is not active yet. Roll back what we did */
                node--;
                while (node >= 0) {
                        if (cachep->nodelists[node]) {
                                l3 = cachep->nodelists[node];

                                kfree(l3->shared);
                                free_alien_cache(l3->alien);
                                kfree(l3);
                                cachep->nodelists[node] = NULL;
                        }
                        node--;
                }
        }
        return -ENOMEM;
}

struct ccupdate_struct {
        struct kmem_cache *cachep;
        struct array_cache *new[NR_CPUS];
};

static void do_ccupdate_local(void *info)
{
        struct ccupdate_struct *new = info;
        struct array_cache *old;

        check_irq_off();
        old = cpu_cache_get(new->cachep);

        new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
        new->new[smp_processor_id()] = old;
}

/* Always called with the cache_chain_mutex held */
static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
                                int batchcount, int shared, gfp_t gfp)
{
        struct ccupdate_struct *new;
        int i;

        new = kzalloc(sizeof(*new), gfp);
        if (!new)
                return -ENOMEM;

        for_each_online_cpu(i) {
                new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
                                                batchcount, gfp);
                if (!new->new[i]) {
                        for (i--; i >= 0; i--)
                                kfree(new->new[i]);
                        kfree(new);
                        return -ENOMEM;
                }
        }
        new->cachep = cachep;

        on_each_cpu(do_ccupdate_local, (void *)new, 1);

        check_irq_on();
        cachep->batchcount = batchcount;
        cachep->limit = limit;
        cachep->shared = shared;

        for_each_online_cpu(i) {
                struct array_cache *ccold = new->new[i];
                if (!ccold)
                        continue;
                spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
                free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
                spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
                kfree(ccold);
        }
        kfree(new);
        return alloc_kmemlist(cachep, gfp);
}

/* Called with cache_chain_mutex held always */
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
        int err;
        int limit, shared;

        /*
         * The head array serves three purposes:
         * - create a LIFO ordering, i.e. return objects that are cache-warm
         * - reduce the number of spinlock operations.
         * - reduce the number of linked list operations on the slab and
         *   bufctl chains: array operations are cheaper.
         * The numbers are guessed, we should auto-tune as described by
         * Bonwick.
         */
        if (cachep->buffer_size > 131072)
                limit = 1;
        else if (cachep->buffer_size > PAGE_SIZE)
                limit = 8;
        else if (cachep->buffer_size > 1024)
                limit = 24;
        else if (cachep->buffer_size > 256)
                limit = 54;
        else
                limit = 120;

        /*
         * CPU bound tasks (e.g. network routing) can exhibit cpu bound
         * allocation behaviour: most allocs on one cpu, most free operations
         * on another cpu. For these cases, an efficient object passing between
         * cpus is necessary. This is provided by a shared array. The array
         * replaces Bonwick's magazine layer.
         * On uniprocessor, it's functionally equivalent (but less efficient)
         * to a larger limit. Thus disabled by default.
         */
        shared = 0;
        if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1)
                shared = 8;

#if DEBUG
        /*
         * With debugging enabled, a large batchcount leads to excessively
         * long periods with disabled local interrupts. Limit the batchcount.
         */
        if (limit > 32)
                limit = 32;
#endif
        err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
        if (err)
                printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
                       cachep->name, -err);
        return err;
}
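
/*
 * Worked example (not part of the original file; the numbers follow
 * directly from the code above): a cache with buffer_size == 512 falls
 * into the "> 256" bracket, so limit = 54 and batchcount =
 * (54 + 1) / 2 = 27. On SMP, alloc_kmemlist() then sizes the per-node
 * shared array at shared * batchcount = 8 * 27 = 216 entries.
 */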
/*
 * Drain an array if it contains any elements, taking the l3 lock only if
 * necessary. Note that the l3 listlock also protects the array_cache
 * if drain_array() is used on the shared array.
 */
void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
                         struct array_cache *ac, int force, int node)
{
        int tofree;

        if (!ac || !ac->avail)
                return;
        if (ac->touched && !force) {
                ac->touched = 0;
        } else {
                spin_lock_irq(&l3->list_lock);
                if (ac->avail) {
                        tofree = force ? ac->avail : (ac->limit + 4) / 5;
                        if (tofree > ac->avail)
                                tofree = (ac->avail + 1) / 2;
                        free_block(cachep, ac->entry, tofree, node);
                        ac->avail -= tofree;
                        memmove(ac->entry, &(ac->entry[tofree]),
                                sizeof(void *) * ac->avail);
                }
                spin_unlock_irq(&l3->list_lock);
        }
}
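
/*
 * Worked example (not part of the original file; derived from the
 * expression above): an unforced drain of an untouched array with
 * limit == 120 and avail == 30 frees tofree = (120 + 4) / 5 = 24 objects
 * and memmoves the remaining 6 pointers to the front of ->entry[].
 * With avail == 10, tofree would be clamped to (10 + 1) / 2 = 5.
 */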
"slabs_partial inuse accounting error"; 4229 if (!slabp->inuse && !error) 4230 error = "slabs_partial/inuse accounting error"; 4231 active_objs += slabp->inuse; 4232 active_slabs++; 4233 } 4234 list_for_each_entry(slabp, &l3->slabs_free, list) { 4235 if (slabp->inuse && !error) 4236 error = "slabs_free/inuse accounting error"; 4237 num_slabs++; 4238 } 4239 free_objects += l3->free_objects; 4240 if (l3->shared) 4241 shared_avail += l3->shared->avail; 4242 4243 spin_unlock_irq(&l3->list_lock); 4244 } 4245 num_slabs += active_slabs; 4246 num_objs = num_slabs * cachep->num; 4247 if (num_objs - active_objs != free_objects && !error) 4248 error = "free_objects accounting error"; 4249 4250 name = cachep->name; 4251 if (error) 4252 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 4253 4254 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 4255 name, active_objs, num_objs, cachep->buffer_size, 4256 cachep->num, (1 << cachep->gfporder)); 4257 seq_printf(m, " : tunables %4u %4u %4u", 4258 cachep->limit, cachep->batchcount, cachep->shared); 4259 seq_printf(m, " : slabdata %6lu %6lu %6lu", 4260 active_slabs, num_slabs, shared_avail); 4261#if STATS 4262 { /* list3 stats */ 4263 unsigned long high = cachep->high_mark; 4264 unsigned long allocs = cachep->num_allocations; 4265 unsigned long grown = cachep->grown; 4266 unsigned long reaped = cachep->reaped; 4267 unsigned long errors = cachep->errors; 4268 unsigned long max_freeable = cachep->max_freeable; 4269 unsigned long node_allocs = cachep->node_allocs; 4270 unsigned long node_frees = cachep->node_frees; 4271 unsigned long overflows = cachep->node_overflow; 4272 4273 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ 4274 %4lu %4lu %4lu %4lu %4lu", allocs, high, grown, 4275 reaped, errors, max_freeable, node_allocs, 4276 node_frees, overflows); 4277 } 4278 /* cpu stats */ 4279 { 4280 unsigned long allochit = atomic_read(&cachep->allochit); 4281 unsigned long allocmiss = atomic_read(&cachep->allocmiss); 4282 unsigned long freehit = atomic_read(&cachep->freehit); 4283 unsigned long freemiss = atomic_read(&cachep->freemiss); 4284 4285 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", 4286 allochit, allocmiss, freehit, freemiss); 4287 } 4288#endif 4289 seq_putc(m, '\n'); 4290 return 0; 4291} 4292 4293/* 4294 * slabinfo_op - iterator that generates /proc/slabinfo 4295 * 4296 * Output layout: 4297 * cache-name 4298 * num-active-objs 4299 * total-objs 4300 * object size 4301 * num-active-slabs 4302 * total-slabs 4303 * num-pages-per-slab 4304 * + further values on SMP and with statistics enabled 4305 */ 4306 4307static const struct seq_operations slabinfo_op = { 4308 .start = s_start, 4309 .next = s_next, 4310 .stop = s_stop, 4311 .show = s_show, 4312}; 4313 4314#define MAX_SLABINFO_WRITE 128 4315/** 4316 * slabinfo_write - Tuning for the slab allocator 4317 * @file: unused 4318 * @buffer: user buffer 4319 * @count: data length 4320 * @ppos: unused 4321 */ 4322ssize_t slabinfo_write(struct file *file, const char __user * buffer, 4323 size_t count, loff_t *ppos) 4324{ 4325 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; 4326 int limit, batchcount, shared, res; 4327 struct kmem_cache *cachep; 4328 4329 if (count > MAX_SLABINFO_WRITE) 4330 return -EINVAL; 4331 if (copy_from_user(&kbuf, buffer, count)) 4332 return -EFAULT; 4333 kbuf[MAX_SLABINFO_WRITE] = '\0'; 4334 4335 tmp = strchr(kbuf, ' '); 4336 if (!tmp) 4337 return -EINVAL; 4338 *tmp = '\0'; 4339 tmp++; 4340 if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3) 4341 return -EINVAL; 4342 4343 /* Find 
static int slabinfo_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &slabinfo_op);
}

static const struct file_operations proc_slabinfo_operations = {
        .open = slabinfo_open,
        .read = seq_read,
        .write = slabinfo_write,
        .llseek = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_DEBUG_SLAB_LEAK

static void *leaks_start(struct seq_file *m, loff_t *pos)
{
        mutex_lock(&cache_chain_mutex);
        return seq_list_start(&cache_chain, *pos);
}

static inline int add_caller(unsigned long *n, unsigned long v)
{
        unsigned long *p;
        int l;
        if (!v)
                return 1;
        l = n[1];
        p = n + 2;
        while (l) {
                int i = l/2;
                unsigned long *q = p + 2 * i;
                if (*q == v) {
                        q[1]++;
                        return 1;
                }
                if (*q > v) {
                        l = i;
                } else {
                        p = q + 2;
                        l -= i + 1;
                }
        }
        if (++n[1] == n[0])
                return 0;
        memmove(p + 2, p,
                n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
        p[0] = v;
        p[1] = 1;
        return 1;
}
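
/*
 * Layout note with a worked example (not part of the original file):
 * n[] above is a caller table, not a plain array. n[0] holds the
 * capacity in (address, count) pairs, n[1] the number of pairs in use,
 * and n[2..] the pairs themselves, kept sorted by address so the loop
 * above can binary-search them. After recording caller 0xc0100000 twice
 * and 0xc0100500 once, the table reads:
 *
 *      n[1] = 2, n[2] = 0xc0100000, n[3] = 2, n[4] = 0xc0100500, n[5] = 1
 */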
static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
{
        void *p;
        int i;
        if (n[0] == n[1])
                return;
        for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
                if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
                        continue;
                if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
                        return;
        }
}

static void show_symbol(struct seq_file *m, unsigned long address)
{
#ifdef CONFIG_KALLSYMS
        unsigned long offset, size;
        char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN];

        if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) {
                seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
                if (modname[0])
                        seq_printf(m, " [%s]", modname);
                return;
        }
#endif
        seq_printf(m, "%p", (void *)address);
}

static int leaks_show(struct seq_file *m, void *p)
{
        struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
        struct slab *slabp;
        struct kmem_list3 *l3;
        const char *name;
        unsigned long *n = m->private;
        int node;
        int i;

        if (!(cachep->flags & SLAB_STORE_USER))
                return 0;
        if (!(cachep->flags & SLAB_RED_ZONE))
                return 0;

        /* OK, we can do it */

        n[1] = 0;

        for_each_online_node(node) {
                l3 = cachep->nodelists[node];
                if (!l3)
                        continue;

                check_irq_on();
                spin_lock_irq(&l3->list_lock);

                list_for_each_entry(slabp, &l3->slabs_full, list)
                        handle_slab(n, cachep, slabp);
                list_for_each_entry(slabp, &l3->slabs_partial, list)
                        handle_slab(n, cachep, slabp);
                spin_unlock_irq(&l3->list_lock);
        }
        name = cachep->name;
        if (n[0] == n[1]) {
                /* Increase the buffer size */
                mutex_unlock(&cache_chain_mutex);
                m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
                if (!m->private) {
                        /* Too bad, we are really out */
                        m->private = n;
                        mutex_lock(&cache_chain_mutex);
                        return -ENOMEM;
                }
                *(unsigned long *)m->private = n[0] * 2;
                kfree(n);
                mutex_lock(&cache_chain_mutex);
                /* Now make sure this entry will be retried */
                m->count = m->size;
                return 0;
        }
        for (i = 0; i < n[1]; i++) {
                seq_printf(m, "%s: %lu ", name, n[2*i+3]);
                show_symbol(m, n[2*i+2]);
                seq_putc(m, '\n');
        }

        return 0;
}

static const struct seq_operations slabstats_op = {
        .start = leaks_start,
        .next = s_next,
        .stop = s_stop,
        .show = leaks_show,
};

static int slabstats_open(struct inode *inode, struct file *file)
{
        unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
        int ret = -ENOMEM;
        if (n) {
                ret = seq_open(file, &slabstats_op);
                if (!ret) {
                        struct seq_file *m = file->private_data;
                        *n = PAGE_SIZE / (2 * sizeof(unsigned long));
                        m->private = n;
                        n = NULL;
                }
                kfree(n);
        }
        return ret;
}

static const struct file_operations proc_slabstats_operations = {
        .open = slabstats_open,
        .read = seq_read,
        .llseek = seq_lseek,
        .release = seq_release_private,
};
#endif

static int __init slab_proc_init(void)
{
        proc_create("slabinfo", S_IWUSR|S_IRUGO, NULL,
                    &proc_slabinfo_operations);
#ifdef CONFIG_DEBUG_SLAB_LEAK
        proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
#endif
        return 0;
}
module_init(slab_proc_init);
#endif

/**
 * ksize - get the actual amount of memory allocated for a given object
 * @objp: Pointer to the object
 *
 * kmalloc may internally round up allocations and return more memory
 * than requested. ksize() can be used to determine the actual amount of
 * memory allocated. The caller may use this additional memory, even though
 * a smaller amount of memory was initially specified with the kmalloc call.
 * The caller must guarantee that objp points to a valid object previously
 * allocated with either kmalloc() or kmem_cache_alloc(). The object
 * must not be freed during the duration of the call.
 */
size_t ksize(const void *objp)
{
        BUG_ON(!objp);
        if (unlikely(objp == ZERO_SIZE_PTR))
                return 0;

        return obj_size(virt_to_cache(objp));
}
EXPORT_SYMBOL(ksize);
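
/*
 * Illustrative sketch (not part of the original file): using the slack
 * that ksize() reports. With the general kmalloc size classes, a 100-byte
 * request is served from the 128-byte cache, so ksize() returns 128 and
 * all 128 bytes are usable. alloc_rounded() is a hypothetical helper.
 */
#if 0
static void *alloc_rounded(size_t want, size_t *got)
{
        void *buf = kmalloc(want, GFP_KERNEL);

        *got = buf ? ksize(buf) : 0;    /* e.g. 128 when want == 100 */
        return buf;
}
#endif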