slab.c revision ca3b9b91735316f0ec7f01976f85842e0bfe5c6e
1/*
2 * linux/mm/slab.c
3 * Written by Mark Hemment, 1996/97.
4 * (markhe@nextd.demon.co.uk)
5 *
6 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
7 *
8 * Major cleanup, different bufctl logic, per-cpu arrays
9 *	(c) 2000 Manfred Spraul
10 *
11 * Cleanup, make the head arrays unconditional, preparation for NUMA
12 * 	(c) 2002 Manfred Spraul
13 *
14 * An implementation of the Slab Allocator as described in outline in;
15 *	UNIX Internals: The New Frontiers by Uresh Vahalia
16 *	Pub: Prentice Hall	ISBN 0-13-101908-2
17 * or with a little more detail in;
18 *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
19 *	Jeff Bonwick (Sun Microsystems).
20 *	Presented at: USENIX Summer 1994 Technical Conference
21 *
22 * The memory is organized in caches, one cache for each object type.
23 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
24 * Each cache consists of many slabs (they are small, usually one
25 * page long, and always contiguous), and each slab contains multiple
26 * initialized objects.
27 *
28 * This means that your constructor is used only for newly allocated
29 * slabs and you must pass objects with the same initializations to
30 * kmem_cache_free.
31 *
32 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
33 * normal). If you need a special memory type, then you must create a new
34 * cache for that memory type.
35 *
36 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
37 *   full slabs with 0 free objects
38 *   partial slabs
39 *   empty slabs with no allocated objects
40 *
41 * If partial slabs exist, then new allocations come from these slabs,
42 * otherwise they come from empty slabs, or new slabs are allocated.
43 *
44 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
45 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
46 *
47 * Each cache has a short per-cpu head array; most allocs
48 * and frees go through that array, and if that array overflows, then half
49 * of the entries in the array are given back to the global cache.
50 * The head array is strictly LIFO and should improve the cache hit rates.
51 * On SMP, it additionally reduces the spinlock operations.
52 *
53 * The c_cpuarray may not be read with local interrupts enabled -
54 * it's changed with a smp_call_function().
55 *
56 * SMP synchronization:
57 *  constructors and destructors are called without any locking.
58 *  Several members in struct kmem_cache and struct slab never change, they
59 *	are accessed without any locking.
60 *  The per-cpu arrays are never accessed from the wrong cpu, so no locking
61 *  	is needed, and local interrupts are disabled so slab code is preempt-safe.
62 *  The non-constant members are protected with a per-cache irq spinlock.
63 *
64 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
65 * in 2000 - many ideas in the current implementation are derived from
66 * his patch.
67 *
68 * Further notes from the original documentation:
69 *
70 * 11 April '97.  Started multi-threading - markhe
71 *	The global cache-chain is protected by the mutex 'cache_chain_mutex'.
72 *	The mutex is only needed when accessing/extending the cache-chain, which
73 *	can never happen inside an interrupt (kmem_cache_create(),
74 *	kmem_cache_shrink() and kmem_cache_reap()).
75 *
76 *	At present, each engine can be growing a cache.  This should be blocked.
77 *
78 * 15 March 2005. NUMA slab allocator.
79 *	Shai Fultheim <shai@scalex86.org>.
80 *	Shobhit Dayal <shobhit@calsoftinc.com>
81 *	Alok N Kataria <alokk@calsoftinc.com>
82 *	Christoph Lameter <christoph@lameter.com>
83 *
84 *	Modified the slab allocator to be node aware on NUMA systems.
85 *	Each node has its own list of partial, free and full slabs.
86 *	All object allocations for a node occur from node specific slab lists.
87 */
88
89#include	<linux/config.h>
90#include	<linux/slab.h>
91#include	<linux/mm.h>
92#include	<linux/swap.h>
93#include	<linux/cache.h>
94#include	<linux/interrupt.h>
95#include	<linux/init.h>
96#include	<linux/compiler.h>
97#include	<linux/seq_file.h>
98#include	<linux/notifier.h>
99#include	<linux/kallsyms.h>
100#include	<linux/cpu.h>
101#include	<linux/sysctl.h>
102#include	<linux/module.h>
103#include	<linux/rcupdate.h>
104#include	<linux/string.h>
105#include	<linux/nodemask.h>
106#include	<linux/mempolicy.h>
107#include	<linux/mutex.h>
108
109#include	<asm/uaccess.h>
110#include	<asm/cacheflush.h>
111#include	<asm/tlbflush.h>
112#include	<asm/page.h>
113
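/*
 * Illustrative sketch, not part of the allocator: minimal use of the
 * cache API described in the header comment above.  The struct and
 * function names here are hypothetical and the function is never
 * called; it only shows the create/alloc/free/destroy life cycle.
 */
struct slab_example_obj {
	struct list_head list;
	int state;
};

static void __attribute__((unused)) slab_example_usage(void)
{
	struct kmem_cache *cachep;
	struct slab_example_obj *obj;

	/* one cache per object type; a ctor (here NULL) would run once per
	 * object when a new slab is populated, not on every allocation */
	cachep = kmem_cache_create("slab_example_obj",
				   sizeof(struct slab_example_obj), 0,
				   SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!cachep)
		return;

	obj = kmem_cache_alloc(cachep, SLAB_KERNEL);
	if (obj) {
		obj->state = 1;
		/* objects must be returned in their constructed state */
		obj->state = 0;
		kmem_cache_free(cachep, obj);
	}

	/* the caller must guarantee that no allocations race with this */
	kmem_cache_destroy(cachep);
}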
114/*
115 * DEBUG	- 1 for kmem_cache_create() to honour SLAB_DEBUG_INITIAL,
116 *		  SLAB_RED_ZONE & SLAB_POISON.
117 *		  0 for faster, smaller code (especially in the critical paths).
118 *
119 * STATS	- 1 to collect stats for /proc/slabinfo.
120 *		  0 for faster, smaller code (especially in the critical paths).
121 *
122 * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
123 */
124
125#ifdef CONFIG_DEBUG_SLAB
126#define	DEBUG		1
127#define	STATS		1
128#define	FORCED_DEBUG	1
129#else
130#define	DEBUG		0
131#define	STATS		0
132#define	FORCED_DEBUG	0
133#endif
134
135/* Shouldn't this be in a header file somewhere? */
136#define	BYTES_PER_WORD		sizeof(void *)
137
138#ifndef cache_line_size
139#define cache_line_size()	L1_CACHE_BYTES
140#endif
141
142#ifndef ARCH_KMALLOC_MINALIGN
143/*
144 * Enforce a minimum alignment for the kmalloc caches.
145 * Usually, the kmalloc caches are cache_line_size() aligned, except when
146 * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
147 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
148 * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that.
149 * Note that this flag disables some debug features.
150 */
151#define ARCH_KMALLOC_MINALIGN 0
152#endif
153
154#ifndef ARCH_SLAB_MINALIGN
155/*
156 * Enforce a minimum alignment for all caches.
157 * Intended for archs that get misalignment faults even for BYTES_PER_WORD
158 * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
159 * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
160 * some debug features.
161 */
162#define ARCH_SLAB_MINALIGN 0
163#endif
164
165#ifndef ARCH_KMALLOC_FLAGS
166#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
167#endif
168
169/* Legal flag mask for kmem_cache_create(). */
170#if DEBUG
171# define CREATE_MASK	(SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
172			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
173			 SLAB_NO_REAP | SLAB_CACHE_DMA | \
174			 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
175			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
176			 SLAB_DESTROY_BY_RCU)
177#else
178# define CREATE_MASK	(SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \
179			 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
180			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
181			 SLAB_DESTROY_BY_RCU)
182#endif
183
184/*
185 * kmem_bufctl_t:
186 *
187 * Bufctls are used for linking objs within a slab, via
188 * linked offsets (object indices).
189 *
190 * This implementation relies on "struct page" for locating the cache &
191 * slab an object belongs to.
192 * This allows the bufctl structure to be small (one int), but limits
193 * the number of objects a slab (not a cache) can contain when off-slab
194 * bufctls are used. The limit is the size of the largest general cache
195 * that does not use off-slab slabs.
196 * For 32bit archs with 4 kB pages, this is 56.
197 * This is not serious, as it is only for large objects, when it is unwise
198 * to have too many per slab.
199 * Note: This limit can be raised by introducing a general cache whose size
200 * is less than 512 (PAGE_SIZE>>3), but greater than 256.
201 */
202
203typedef unsigned int kmem_bufctl_t;
204#define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
205#define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
206#define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-2)
207
208/* Max number of objs-per-slab for caches which use off-slab slabs.
209 * Needed to avoid a possible looping condition in cache_grow().
210 */
211static unsigned long offslab_limit;
212
213/*
214 * struct slab
215 *
216 * Manages the objs in a slab. Placed either at the beginning of mem allocated
217 * for a slab, or allocated from a general cache.
218 * Slabs are chained into three lists: fully used, partial, fully free slabs.
219 */
220struct slab {
221	struct list_head list;
222	unsigned long colouroff;
223	void *s_mem;		/* including colour offset */
224	unsigned int inuse;	/* num of objs active in slab */
225	kmem_bufctl_t free;
226	unsigned short nodeid;
227};
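/*
 * Illustrative sketch (hypothetical helper, not used by the allocator):
 * how the free objects of a slab are chained.  The kmem_bufctl_t array
 * sits directly behind struct slab and holds, for each object, the index
 * of the next free object; slabp->free is the head of that index chain.
 * The real allocation paths further down in this file do the equivalent
 * with the appropriate list_lock held.
 */
static inline void *slab_example_get_obj(struct slab *slabp,
					 size_t buffer_size)
{
	kmem_bufctl_t *bufctl = (kmem_bufctl_t *)(slabp + 1);
	void *objp;

	if (slabp->free == BUFCTL_END)
		return NULL;	/* no free objects left in this slab */

	objp = slabp->s_mem + slabp->free * buffer_size;
	slabp->free = bufctl[slabp->free];	/* pop the index chain */
	slabp->inuse++;
	return objp;
}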
228
229/*
230 * struct slab_rcu
231 *
232 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
233 * arrange for kmem_freepages to be called via RCU.  This is useful if
234 * we need to approach a kernel structure obliquely, from its address
235 * obtained without the usual locking.  We can lock the structure to
236 * stabilize it and check it's still at the given address, only if we
237 * can be sure that the memory has not been meanwhile reused for some
238 * other kind of object (which our subsystem's lock might corrupt).
239 *
240 * rcu_read_lock before reading the address, then rcu_read_unlock after
241 * taking the spinlock within the structure expected at that address.
242 *
243 * We assume struct slab_rcu can overlay struct slab when destroying.
244 */
245struct slab_rcu {
246	struct rcu_head head;
247	struct kmem_cache *cachep;
248	void *addr;
249};
250
251/*
252 * struct array_cache
253 *
254 * Purpose:
255 * - LIFO ordering, to hand out cache-warm objects from _alloc
256 * - reduce the number of linked list operations
257 * - reduce spinlock operations
258 *
259 * The limit is stored in the per-cpu structure to reduce the data cache
260 * footprint.
261 *
262 */
263struct array_cache {
264	unsigned int avail;
265	unsigned int limit;
266	unsigned int batchcount;
267	unsigned int touched;
268	spinlock_t lock;
269	void *entry[0];		/*
270				 * Must have this definition in here for the proper
271				 * alignment of array_cache. Also simplifies accessing
272				 * the entries.
273				 * [0] is for gcc 2.95. It should really be [].
274				 */
275};
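/*
 * Illustrative sketch (hypothetical helpers, not used by the allocator):
 * the LIFO discipline described above is plain index arithmetic on
 * entry[].  The real fast paths further down perform the equivalent
 * operations with local interrupts disabled.
 */
static inline void *ac_example_pop(struct array_cache *ac)
{
	/* hand back the most recently freed, i.e. cache-warm, object */
	return ac->avail ? ac->entry[--ac->avail] : NULL;
}

static inline int ac_example_push(struct array_cache *ac, void *objp)
{
	/* a full array must first be partially flushed to the slab lists */
	if (ac->avail == ac->limit)
		return 0;
	ac->entry[ac->avail++] = objp;
	return 1;
}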
276
277/* bootstrap: The caches do not work without cpuarrays anymore,
278 * but the cpuarrays are allocated from the generic caches...
279 */
280#define BOOT_CPUCACHE_ENTRIES	1
281struct arraycache_init {
282	struct array_cache cache;
283	void *entries[BOOT_CPUCACHE_ENTRIES];
284};
285
286/*
287 * The slab lists for all objects.
288 */
289struct kmem_list3 {
290	struct list_head slabs_partial;	/* partial list first, better asm code */
291	struct list_head slabs_full;
292	struct list_head slabs_free;
293	unsigned long free_objects;
294	unsigned long next_reap;
295	int free_touched;
296	unsigned int free_limit;
297	unsigned int colour_next;	/* Per-node cache coloring */
298	spinlock_t list_lock;
299	struct array_cache *shared;	/* shared per node */
300	struct array_cache **alien;	/* on other nodes */
301};
302
303/*
304 * Need this for bootstrapping a per node allocator.
305 */
306#define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1)
307struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
308#define	CACHE_CACHE 0
309#define	SIZE_AC 1
310#define	SIZE_L3 (1 + MAX_NUMNODES)
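/*
 * initkmem_list3 layout: index CACHE_CACHE holds the boot list for
 * cache_cache, indices SIZE_AC + node hold the per-node lists for the
 * first kmalloc cache (the one sized for struct arraycache_init), and
 * indices SIZE_L3 + node those for the cache that backs struct
 * kmem_list3 - hence the 2 * MAX_NUMNODES + 1 entries above.
 */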
311
312/*
313 * This function must be completely optimized away if
314 * a constant is passed to it. Mostly the same as
315 * what is in linux/slab.h except it returns an
316 * index.
317 */
318static __always_inline int index_of(const size_t size)
319{
320	extern void __bad_size(void);
321
322	if (__builtin_constant_p(size)) {
323		int i = 0;
324
325#define CACHE(x) \
326	if (size <=x) \
327		return i; \
328	else \
329		i++;
330#include <linux/kmalloc_sizes.h>
331#undef CACHE
332		__bad_size();
333	} else
334		__bad_size();
335	return 0;
336}
337
338#define INDEX_AC index_of(sizeof(struct arraycache_init))
339#define INDEX_L3 index_of(sizeof(struct kmem_list3))
340
341static void kmem_list3_init(struct kmem_list3 *parent)
342{
343	INIT_LIST_HEAD(&parent->slabs_full);
344	INIT_LIST_HEAD(&parent->slabs_partial);
345	INIT_LIST_HEAD(&parent->slabs_free);
346	parent->shared = NULL;
347	parent->alien = NULL;
348	parent->colour_next = 0;
349	spin_lock_init(&parent->list_lock);
350	parent->free_objects = 0;
351	parent->free_touched = 0;
352}
353
354#define MAKE_LIST(cachep, listp, slab, nodeid)	\
355	do {	\
356		INIT_LIST_HEAD(listp);		\
357		list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
358	} while (0)
359
360#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)			\
361	do {					\
362	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
363	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
364	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
365	} while (0)
366
367/*
368 * struct kmem_cache
369 *
370 * manages a cache.
371 */
372
373struct kmem_cache {
374/* 1) per-cpu data, touched during every alloc/free */
375	struct array_cache *array[NR_CPUS];
376	unsigned int batchcount;
377	unsigned int limit;
378	unsigned int shared;
379	unsigned int buffer_size;
380/* 2) touched by every alloc & free from the backend */
381	struct kmem_list3 *nodelists[MAX_NUMNODES];
382	unsigned int flags;	/* constant flags */
383	unsigned int num;	/* # of objs per slab */
384	spinlock_t spinlock;
385
386/* 3) cache_grow/shrink */
387	/* order of pgs per slab (2^n) */
388	unsigned int gfporder;
389
390	/* force GFP flags, e.g. GFP_DMA */
391	gfp_t gfpflags;
392
393	size_t colour;		/* cache colouring range */
394	unsigned int colour_off;	/* colour offset */
395	struct kmem_cache *slabp_cache;
396	unsigned int slab_size;
397	unsigned int dflags;	/* dynamic flags */
398
399	/* constructor func */
400	void (*ctor) (void *, struct kmem_cache *, unsigned long);
401
402	/* de-constructor func */
403	void (*dtor) (void *, struct kmem_cache *, unsigned long);
404
405/* 4) cache creation/removal */
406	const char *name;
407	struct list_head next;
408
409/* 5) statistics */
410#if STATS
411	unsigned long num_active;
412	unsigned long num_allocations;
413	unsigned long high_mark;
414	unsigned long grown;
415	unsigned long reaped;
416	unsigned long errors;
417	unsigned long max_freeable;
418	unsigned long node_allocs;
419	unsigned long node_frees;
420	atomic_t allochit;
421	atomic_t allocmiss;
422	atomic_t freehit;
423	atomic_t freemiss;
424#endif
425#if DEBUG
426	/*
427	 * If debugging is enabled, then the allocator can add additional
428	 * fields and/or padding to every object. buffer_size contains the total
429	 * object size including these internal fields, the following two
430	 * variables contain the offset to the user object and its size.
431	 */
432	int obj_offset;
433	int obj_size;
434#endif
435};
436
437#define CFLGS_OFF_SLAB		(0x80000000UL)
438#define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)
439
440#define BATCHREFILL_LIMIT	16
441/* Optimization question: fewer reaps means a lower
442 * probability of unnecessary cpucache drain/refill cycles.
443 *
444 * OTOH the cpuarrays can contain lots of objects,
445 * which could lock up otherwise freeable slabs.
446 */
447#define REAPTIMEOUT_CPUC	(2*HZ)
448#define REAPTIMEOUT_LIST3	(4*HZ)
449
450#if STATS
451#define	STATS_INC_ACTIVE(x)	((x)->num_active++)
452#define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
453#define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
454#define	STATS_INC_GROWN(x)	((x)->grown++)
455#define	STATS_INC_REAPED(x)	((x)->reaped++)
456#define	STATS_SET_HIGH(x)	do { if ((x)->num_active > (x)->high_mark) \
457					(x)->high_mark = (x)->num_active; \
458				} while (0)
459#define	STATS_INC_ERR(x)	((x)->errors++)
460#define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
461#define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
462#define	STATS_SET_FREEABLE(x, i) \
463				do { if ((x)->max_freeable < i) \
464					(x)->max_freeable = i; \
465				} while (0)
466
467#define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
468#define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
469#define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
470#define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
471#else
472#define	STATS_INC_ACTIVE(x)	do { } while (0)
473#define	STATS_DEC_ACTIVE(x)	do { } while (0)
474#define	STATS_INC_ALLOCED(x)	do { } while (0)
475#define	STATS_INC_GROWN(x)	do { } while (0)
476#define	STATS_INC_REAPED(x)	do { } while (0)
477#define	STATS_SET_HIGH(x)	do { } while (0)
478#define	STATS_INC_ERR(x)	do { } while (0)
479#define	STATS_INC_NODEALLOCS(x)	do { } while (0)
480#define	STATS_INC_NODEFREES(x)	do { } while (0)
481#define	STATS_SET_FREEABLE(x, i) \
482				do { } while (0)
483
484#define STATS_INC_ALLOCHIT(x)	do { } while (0)
485#define STATS_INC_ALLOCMISS(x)	do { } while (0)
486#define STATS_INC_FREEHIT(x)	do { } while (0)
487#define STATS_INC_FREEMISS(x)	do { } while (0)
488#endif
489
490#if DEBUG
491/* Magic nums for obj red zoning.
492 * Placed in the first word before and the first word after an obj.
493 */
494#define	RED_INACTIVE	0x5A2CF071UL	/* when obj is inactive */
495#define	RED_ACTIVE	0x170FC2A5UL	/* when obj is active */
496
497/* ...and for poisoning */
498#define	POISON_INUSE	0x5a	/* for use-uninitialised poisoning */
499#define POISON_FREE	0x6b	/* for use-after-free poisoning */
500#define	POISON_END	0xa5	/* end-byte of poisoning */
501
502/* memory layout of objects:
503 * 0		: objp
504 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
505 * 		the end of an object is aligned with the end of the real
506 * 		allocation. Catches writes behind the end of the allocation.
507 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
508 * 		redzone word.
509 * cachep->obj_offset: The real object.
510 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
511 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long]
512 */
513static int obj_offset(struct kmem_cache *cachep)
514{
515	return cachep->obj_offset;
516}
517
518static int obj_size(struct kmem_cache *cachep)
519{
520	return cachep->obj_size;
521}
522
523static unsigned long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
524{
525	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
526	return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD);
527}
528
529static unsigned long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
530{
531	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
532	if (cachep->flags & SLAB_STORE_USER)
533		return (unsigned long *)(objp + cachep->buffer_size -
534					 2 * BYTES_PER_WORD);
535	return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD);
536}
537
538static void **dbg_userword(struct kmem_cache *cachep, void *objp)
539{
540	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
541	return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
542}
543
544#else
545
546#define obj_offset(x)			0
547#define obj_size(cachep)		(cachep->buffer_size)
548#define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long *)NULL;})
549#define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long *)NULL;})
550#define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})
551
552#endif
553
554/*
555 * Maximum size of an obj (in 2^order pages)
556 * and absolute limit for the gfp order.
557 */
558#if defined(CONFIG_LARGE_ALLOCS)
559#define	MAX_OBJ_ORDER	13	/* up to 32Mb */
560#define	MAX_GFP_ORDER	13	/* up to 32Mb */
561#elif defined(CONFIG_MMU)
562#define	MAX_OBJ_ORDER	5	/* 32 pages */
563#define	MAX_GFP_ORDER	5	/* 32 pages */
564#else
565#define	MAX_OBJ_ORDER	8	/* up to 1Mb */
566#define	MAX_GFP_ORDER	8	/* up to 1Mb */
567#endif
568
569/*
570 * Do not go above this order unless 0 objects fit into the slab.
571 */
572#define	BREAK_GFP_ORDER_HI	1
573#define	BREAK_GFP_ORDER_LO	0
574static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
575
576/* Functions for storing/retrieving the cachep and/or slab from the
577 * global 'mem_map'. These are used to find the slab an obj belongs to.
578 * With kfree(), these are used to find the cache to which an obj belongs.
579 */
580static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
581{
582	page->lru.next = (struct list_head *)cache;
583}
584
585static inline struct kmem_cache *page_get_cache(struct page *page)
586{
587	return (struct kmem_cache *)page->lru.next;
588}
589
590static inline void page_set_slab(struct page *page, struct slab *slab)
591{
592	page->lru.prev = (struct list_head *)slab;
593}
594
595static inline struct slab *page_get_slab(struct page *page)
596{
597	return (struct slab *)page->lru.prev;
598}
599
600static inline struct kmem_cache *virt_to_cache(const void *obj)
601{
602	struct page *page = virt_to_page(obj);
603	return page_get_cache(page);
604}
605
606static inline struct slab *virt_to_slab(const void *obj)
607{
608	struct page *page = virt_to_page(obj);
609	return page_get_slab(page);
610}
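/*
 * Illustrative sketch (hypothetical helper, not used by the allocator):
 * when a new slab is built, every page backing it gets the cache and
 * slab pointers stored in its struct page, which is what makes the
 * lookups above work.  cache_grow() does the equivalent for real slabs.
 */
static inline void slab_example_map_pages(struct kmem_cache *cachep,
					  struct slab *slabp,
					  void *addr, int nr_pages)
{
	struct page *page = virt_to_page(addr);

	while (nr_pages--) {
		page_set_cache(page, cachep);
		page_set_slab(page, slabp);
		page++;
	}
}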
611
612/* These are the default caches for kmalloc. Custom caches can have other sizes. */
613struct cache_sizes malloc_sizes[] = {
614#define CACHE(x) { .cs_size = (x) },
615#include <linux/kmalloc_sizes.h>
616	CACHE(ULONG_MAX)
617#undef CACHE
618};
619EXPORT_SYMBOL(malloc_sizes);
620
621/* Must match cache_sizes above. Out of line to keep cache footprint low. */
622struct cache_names {
623	char *name;
624	char *name_dma;
625};
626
627static struct cache_names __initdata cache_names[] = {
628#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
629#include <linux/kmalloc_sizes.h>
630	{NULL,}
631#undef CACHE
632};
633
634static struct arraycache_init initarray_cache __initdata =
635    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
636static struct arraycache_init initarray_generic =
637    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
638
639/* internal cache of cache description objs */
640static struct kmem_cache cache_cache = {
641	.batchcount = 1,
642	.limit = BOOT_CPUCACHE_ENTRIES,
643	.shared = 1,
644	.buffer_size = sizeof(struct kmem_cache),
645	.flags = SLAB_NO_REAP,
646	.spinlock = SPIN_LOCK_UNLOCKED,
647	.name = "kmem_cache",
648#if DEBUG
649	.obj_size = sizeof(struct kmem_cache),
650#endif
651};
652
653/* Guard access to the cache-chain. */
654static DEFINE_MUTEX(cache_chain_mutex);
655static struct list_head cache_chain;
656
657/*
658 * vm_enough_memory() looks at this to determine how many
659 * slab-allocated pages are possibly freeable under pressure
660 *
661 * SLAB_RECLAIM_ACCOUNT turns this on per-slab
662 */
663atomic_t slab_reclaim_pages;
664
665/*
666 * chicken and egg problem: delay the per-cpu array allocation
667 * until the general caches are up.
668 */
669static enum {
670	NONE,
671	PARTIAL_AC,
672	PARTIAL_L3,
673	FULL
674} g_cpucache_up;
675
676static DEFINE_PER_CPU(struct work_struct, reap_work);
677
678static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node);
679static void enable_cpucache(struct kmem_cache *cachep);
680static void cache_reap(void *unused);
681static int __node_shrink(struct kmem_cache *cachep, int node);
682
683static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
684{
685	return cachep->array[smp_processor_id()];
686}
687
688static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags)
689{
690	struct cache_sizes *csizep = malloc_sizes;
691
692#if DEBUG
693	/* This happens if someone tries to call
694	 * kmem_cache_create(), or __kmalloc(), before
695	 * the generic caches are initialized.
696	 */
697	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
698#endif
699	while (size > csizep->cs_size)
700		csizep++;
701
702	/*
703	 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
704	 * has cs_{dma,}cachep==NULL. Thus no special case
705	 * for large kmalloc calls required.
706	 */
707	if (unlikely(gfpflags & GFP_DMA))
708		return csizep->cs_dmacachep;
709	return csizep->cs_cachep;
710}
711
712struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
713{
714	return __find_general_cachep(size, gfpflags);
715}
716EXPORT_SYMBOL(kmem_find_general_cachep);
717
718static size_t slab_mgmt_size(size_t nr_objs, size_t align)
719{
720	return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
721}
722
723/* Calculate the number of objects and left-over bytes for a given
724   buffer size. */
725static void cache_estimate(unsigned long gfporder, size_t buffer_size,
726			   size_t align, int flags, size_t *left_over,
727			   unsigned int *num)
728{
729	int nr_objs;
730	size_t mgmt_size;
731	size_t slab_size = PAGE_SIZE << gfporder;
732
733	/*
734	 * The slab management structure can be either off the slab or
735	 * on it. For the latter case, the memory allocated for a
736	 * slab is used for:
737	 *
738	 * - The struct slab
739	 * - One kmem_bufctl_t for each object
740	 * - Padding to respect alignment of @align
741	 * - @buffer_size bytes for each object
742	 *
743	 * If the slab management structure is off the slab, then the
744	 * alignment will already be calculated into the size. Because
745	 * the slabs are all pages aligned, the objects will be at the
746	 * correct alignment when allocated.
747	 */
748	if (flags & CFLGS_OFF_SLAB) {
749		mgmt_size = 0;
750		nr_objs = slab_size / buffer_size;
751
752		if (nr_objs > SLAB_LIMIT)
753			nr_objs = SLAB_LIMIT;
754	} else {
755		/*
756		 * Ignore padding for the initial guess. The padding
757		 * is at most @align-1 bytes, and @buffer_size is at
758		 * least @align. In the worst case, this result will
759		 * be one greater than the number of objects that fit
760		 * into the memory allocation when taking the padding
761		 * into account.
762		 */
763		nr_objs = (slab_size - sizeof(struct slab)) /
764			  (buffer_size + sizeof(kmem_bufctl_t));
765
766		/*
767		 * This calculated number will be either the right
768		 * amount, or one greater than what we want.
769		 */
770		if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
771		       > slab_size)
772			nr_objs--;
773
774		if (nr_objs > SLAB_LIMIT)
775			nr_objs = SLAB_LIMIT;
776
777		mgmt_size = slab_mgmt_size(nr_objs, align);
778	}
779	*num = nr_objs;
780	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
781}
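/*
 * Worked example for cache_estimate() (illustrative only), assuming
 * PAGE_SIZE == 4096, gfporder == 0, buffer_size == 256, align == 32,
 * on-slab management, sizeof(struct slab) == 32 and
 * sizeof(kmem_bufctl_t) == 4:
 *
 *   initial guess: nr_objs = (4096 - 32) / (256 + 4) = 15
 *   slab_mgmt_size(15, 32) = ALIGN(32 + 15*4, 32) = 96
 *   check: 96 + 15*256 = 3936 <= 4096, so the guess stands
 *   result: *num = 15, *left_over = 4096 - 15*256 - 96 = 160
 *
 * The 160 left-over bytes are later used for cache colouring.
 */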
782
783#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
784
785static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg)
786{
787	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
788	       function, cachep->name, msg);
789	dump_stack();
790}
791
792/*
793 * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
794 * via the workqueue/eventd.
795 * Add the CPU number into the expiration time to minimize the possibility of
796 * the CPUs getting into lockstep and contending for the global cache chain
797 * lock.
798 */
799static void __devinit start_cpu_timer(int cpu)
800{
801	struct work_struct *reap_work = &per_cpu(reap_work, cpu);
802
803	/*
804	 * When this gets called from do_initcalls via cpucache_init(),
805	 * init_workqueues() has already run, so keventd will be setup
806	 * at that time.
807	 */
808	if (keventd_up() && reap_work->func == NULL) {
809		INIT_WORK(reap_work, cache_reap, NULL);
810		schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
811	}
812}
813
814static struct array_cache *alloc_arraycache(int node, int entries,
815					    int batchcount)
816{
817	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
818	struct array_cache *nc = NULL;
819
820	nc = kmalloc_node(memsize, GFP_KERNEL, node);
821	if (nc) {
822		nc->avail = 0;
823		nc->limit = entries;
824		nc->batchcount = batchcount;
825		nc->touched = 0;
826		spin_lock_init(&nc->lock);
827	}
828	return nc;
829}
830
831#ifdef CONFIG_NUMA
832static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
833
834static struct array_cache **alloc_alien_cache(int node, int limit)
835{
836	struct array_cache **ac_ptr;
837	int memsize = sizeof(void *) * MAX_NUMNODES;
838	int i;
839
840	if (limit > 1)
841		limit = 12;
842	ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
843	if (ac_ptr) {
844		for_each_node(i) {
845			if (i == node || !node_online(i)) {
846				ac_ptr[i] = NULL;
847				continue;
848			}
849			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
850			if (!ac_ptr[i]) {
851				for (i--; i >= 0; i--)
852					kfree(ac_ptr[i]);
853				kfree(ac_ptr);
854				return NULL;
855			}
856		}
857	}
858	return ac_ptr;
859}
860
861static void free_alien_cache(struct array_cache **ac_ptr)
862{
863	int i;
864
865	if (!ac_ptr)
866		return;
867
868	for_each_node(i)
869	    kfree(ac_ptr[i]);
870
871	kfree(ac_ptr);
872}
873
874static void __drain_alien_cache(struct kmem_cache *cachep,
875				struct array_cache *ac, int node)
876{
877	struct kmem_list3 *rl3 = cachep->nodelists[node];
878
879	if (ac->avail) {
880		spin_lock(&rl3->list_lock);
881		free_block(cachep, ac->entry, ac->avail, node);
882		ac->avail = 0;
883		spin_unlock(&rl3->list_lock);
884	}
885}
886
887static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3)
888{
889	int i = 0;
890	struct array_cache *ac;
891	unsigned long flags;
892
893	for_each_online_node(i) {
894		ac = l3->alien[i];
895		if (ac) {
896			spin_lock_irqsave(&ac->lock, flags);
897			__drain_alien_cache(cachep, ac, i);
898			spin_unlock_irqrestore(&ac->lock, flags);
899		}
900	}
901}
902#else
903#define alloc_alien_cache(node, limit) do { } while (0)
904#define free_alien_cache(ac_ptr) do { } while (0)
905#define drain_alien_cache(cachep, l3) do { } while (0)
906#endif
907
908static int __devinit cpuup_callback(struct notifier_block *nfb,
909				    unsigned long action, void *hcpu)
910{
911	long cpu = (long)hcpu;
912	struct kmem_cache *cachep;
913	struct kmem_list3 *l3 = NULL;
914	int node = cpu_to_node(cpu);
915	int memsize = sizeof(struct kmem_list3);
916
917	switch (action) {
918	case CPU_UP_PREPARE:
919		mutex_lock(&cache_chain_mutex);
920		/* We need to do this right at the beginning since the
921		 * alloc_arraycache() calls below are going to use this list.
922		 * kmalloc_node allows us to add the slab to the right
923		 * kmem_list3 and not this cpu's kmem_list3.
924		 */
925
926		list_for_each_entry(cachep, &cache_chain, next) {
927			/* set up the node's kmem_list3 before we can
928			 * begin anything. Make sure some other cpu on this
929			 * node has not already allocated it.
930			 */
931			if (!cachep->nodelists[node]) {
932				if (!(l3 = kmalloc_node(memsize,
933							GFP_KERNEL, node)))
934					goto bad;
935				kmem_list3_init(l3);
936				l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
937				    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
938
939				cachep->nodelists[node] = l3;
940			}
941
942			spin_lock_irq(&cachep->nodelists[node]->list_lock);
943			cachep->nodelists[node]->free_limit =
944			    (1 + nr_cpus_node(node)) *
945			    cachep->batchcount + cachep->num;
946			spin_unlock_irq(&cachep->nodelists[node]->list_lock);
947		}
948
949		/* Now we can go ahead with allocating the shared arrays
950		   & array caches */
951		list_for_each_entry(cachep, &cache_chain, next) {
952			struct array_cache *nc;
953
954			nc = alloc_arraycache(node, cachep->limit,
955					      cachep->batchcount);
956			if (!nc)
957				goto bad;
958			cachep->array[cpu] = nc;
959
960			l3 = cachep->nodelists[node];
961			BUG_ON(!l3);
962			if (!l3->shared) {
963				if (!(nc = alloc_arraycache(node,
964							    cachep->shared *
965							    cachep->batchcount,
966							    0xbaadf00d)))
967					goto bad;
968
969				/* we are serialised from CPU_DEAD or
970				   CPU_UP_CANCELLED by the cpucontrol lock */
971				l3->shared = nc;
972			}
973		}
974		mutex_unlock(&cache_chain_mutex);
975		break;
976	case CPU_ONLINE:
977		start_cpu_timer(cpu);
978		break;
979#ifdef CONFIG_HOTPLUG_CPU
980	case CPU_DEAD:
981		/* fall thru */
982	case CPU_UP_CANCELED:
983		mutex_lock(&cache_chain_mutex);
984
985		list_for_each_entry(cachep, &cache_chain, next) {
986			struct array_cache *nc;
987			cpumask_t mask;
988
989			mask = node_to_cpumask(node);
990			spin_lock(&cachep->spinlock);
991			/* cpu is dead; no one can alloc from it. */
992			nc = cachep->array[cpu];
993			cachep->array[cpu] = NULL;
994			l3 = cachep->nodelists[node];
995
996			if (!l3)
997				goto unlock_cache;
998
999			spin_lock_irq(&l3->list_lock);
1000
1001			/* Free limit for this kmem_list3 */
1002			l3->free_limit -= cachep->batchcount;
1003			if (nc)
1004				free_block(cachep, nc->entry, nc->avail, node);
1005
1006			if (!cpus_empty(mask)) {
1007				spin_unlock_irq(&l3->list_lock);
1008				goto unlock_cache;
1009			}
1010
1011			if (l3->shared) {
1012				free_block(cachep, l3->shared->entry,
1013					   l3->shared->avail, node);
1014				kfree(l3->shared);
1015				l3->shared = NULL;
1016			}
1017			if (l3->alien) {
1018				drain_alien_cache(cachep, l3);
1019				free_alien_cache(l3->alien);
1020				l3->alien = NULL;
1021			}
1022
1023			/* free slabs belonging to this node */
1024			if (__node_shrink(cachep, node)) {
1025				cachep->nodelists[node] = NULL;
1026				spin_unlock_irq(&l3->list_lock);
1027				kfree(l3);
1028			} else {
1029				spin_unlock_irq(&l3->list_lock);
1030			}
1031		      unlock_cache:
1032			spin_unlock(&cachep->spinlock);
1033			kfree(nc);
1034		}
1035		mutex_unlock(&cache_chain_mutex);
1036		break;
1037#endif
1038	}
1039	return NOTIFY_OK;
1040      bad:
1041	mutex_unlock(&cache_chain_mutex);
1042	return NOTIFY_BAD;
1043}
1044
1045static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
1046
1047/*
1048 * swap the static kmem_list3 with kmalloced memory
1049 */
1050static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int nodeid)
1051{
1052	struct kmem_list3 *ptr;
1053
1054	BUG_ON(cachep->nodelists[nodeid] != list);
1055	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
1056	BUG_ON(!ptr);
1057
1058	local_irq_disable();
1059	memcpy(ptr, list, sizeof(struct kmem_list3));
1060	MAKE_ALL_LISTS(cachep, ptr, nodeid);
1061	cachep->nodelists[nodeid] = ptr;
1062	local_irq_enable();
1063}
1064
1065/* Initialisation.
1066 * Called after the gfp() functions have been enabled, and before smp_init().
1067 */
1068void __init kmem_cache_init(void)
1069{
1070	size_t left_over;
1071	struct cache_sizes *sizes;
1072	struct cache_names *names;
1073	int i;
1074
1075	for (i = 0; i < NUM_INIT_LISTS; i++) {
1076		kmem_list3_init(&initkmem_list3[i]);
1077		if (i < MAX_NUMNODES)
1078			cache_cache.nodelists[i] = NULL;
1079	}
1080
1081	/*
1082	 * Fragmentation resistance on low memory - only use bigger
1083	 * page orders on machines with more than 32MB of memory.
1084	 */
1085	if (num_physpages > (32 << 20) >> PAGE_SHIFT)
1086		slab_break_gfp_order = BREAK_GFP_ORDER_HI;
1087
1088	/* Bootstrap is tricky, because several objects are allocated
1089	 * from caches that do not exist yet:
1090	 * 1) initialize the cache_cache cache: it contains the struct kmem_cache
1091	 *    structures of all caches, except cache_cache itself: cache_cache
1092	 *    is statically allocated.
1093	 *    Initially an __init data area is used for the head array and the
1094	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
1095	 *    array at the end of the bootstrap.
1096	 * 2) Create the first kmalloc cache.
1097	 *    The struct kmem_cache for the new cache is allocated normally.
1098	 *    An __init data area is used for the head array.
1099	 * 3) Create the remaining kmalloc caches, with minimally sized
1100	 *    head arrays.
1101	 * 4) Replace the __init data head arrays for cache_cache and the first
1102	 *    kmalloc cache with kmalloc allocated arrays.
1103	 * 5) Replace the __init data for kmem_list3 for cache_cache and
1104	 *    the other caches with kmalloc allocated memory.
1105	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1106	 */
1107
1108	/* 1) create the cache_cache */
1109	INIT_LIST_HEAD(&cache_chain);
1110	list_add(&cache_cache.next, &cache_chain);
1111	cache_cache.colour_off = cache_line_size();
1112	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1113	cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
1114
1115	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size());
1116
1117	cache_estimate(0, cache_cache.buffer_size, cache_line_size(), 0,
1118		       &left_over, &cache_cache.num);
1119	if (!cache_cache.num)
1120		BUG();
1121
1122	cache_cache.colour = left_over / cache_cache.colour_off;
1123	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
1124				      sizeof(struct slab), cache_line_size());
1125
1126	/* 2+3) create the kmalloc caches */
1127	sizes = malloc_sizes;
1128	names = cache_names;
1129
1130	/* Initialize the caches that provide memory for the array cache
1131	 * and the kmem_list3 structures first.
1132	 * Without this, further allocations will BUG().
1133	 */
1134
1135	sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1136						      sizes[INDEX_AC].cs_size,
1137						      ARCH_KMALLOC_MINALIGN,
1138						      (ARCH_KMALLOC_FLAGS |
1139						       SLAB_PANIC), NULL, NULL);
1140
1141	if (INDEX_AC != INDEX_L3)
1142		sizes[INDEX_L3].cs_cachep =
1143		    kmem_cache_create(names[INDEX_L3].name,
1144				      sizes[INDEX_L3].cs_size,
1145				      ARCH_KMALLOC_MINALIGN,
1146				      (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL,
1147				      NULL);
1148
1149	while (sizes->cs_size != ULONG_MAX) {
1150		/*
1151		 * For performance, all the general caches are L1 aligned.
1152		 * This should be particularly beneficial on SMP boxes, as it
1153		 * eliminates "false sharing".
1154		 * Note for systems short on memory removing the alignment will
1155		 * allow tighter packing of the smaller caches.
1156		 */
1157		if (!sizes->cs_cachep)
1158			sizes->cs_cachep = kmem_cache_create(names->name,
1159							     sizes->cs_size,
1160							     ARCH_KMALLOC_MINALIGN,
1161							     (ARCH_KMALLOC_FLAGS
1162							      | SLAB_PANIC),
1163							     NULL, NULL);
1164
1165		/* Inc off-slab bufctl limit until the ceiling is hit. */
1166		if (!(OFF_SLAB(sizes->cs_cachep))) {
1167			offslab_limit = sizes->cs_size - sizeof(struct slab);
1168			offslab_limit /= sizeof(kmem_bufctl_t);
1169		}
1170
1171		sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
1172							sizes->cs_size,
1173							ARCH_KMALLOC_MINALIGN,
1174							(ARCH_KMALLOC_FLAGS |
1175							 SLAB_CACHE_DMA |
1176							 SLAB_PANIC), NULL,
1177							NULL);
1178
1179		sizes++;
1180		names++;
1181	}
1182	/* 4) Replace the bootstrap head arrays */
1183	{
1184		void *ptr;
1185
1186		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1187
1188		local_irq_disable();
1189		BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
1190		memcpy(ptr, cpu_cache_get(&cache_cache),
1191		       sizeof(struct arraycache_init));
1192		cache_cache.array[smp_processor_id()] = ptr;
1193		local_irq_enable();
1194
1195		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1196
1197		local_irq_disable();
1198		BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
1199		       != &initarray_generic.cache);
1200		memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
1201		       sizeof(struct arraycache_init));
1202		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1203		    ptr;
1204		local_irq_enable();
1205	}
1206	/* 5) Replace the bootstrap kmem_list3's */
1207	{
1208		int node;
1209		/* Replace the static kmem_list3 structures for the boot cpu */
1210		init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
1211			  numa_node_id());
1212
1213		for_each_online_node(node) {
1214			init_list(malloc_sizes[INDEX_AC].cs_cachep,
1215				  &initkmem_list3[SIZE_AC + node], node);
1216
1217			if (INDEX_AC != INDEX_L3) {
1218				init_list(malloc_sizes[INDEX_L3].cs_cachep,
1219					  &initkmem_list3[SIZE_L3 + node],
1220					  node);
1221			}
1222		}
1223	}
1224
1225	/* 6) resize the head arrays to their final sizes */
1226	{
1227		struct kmem_cache *cachep;
1228		mutex_lock(&cache_chain_mutex);
1229		list_for_each_entry(cachep, &cache_chain, next)
1230		    enable_cpucache(cachep);
1231		mutex_unlock(&cache_chain_mutex);
1232	}
1233
1234	/* Done! */
1235	g_cpucache_up = FULL;
1236
1237	/* Register a cpu startup notifier callback
1238	 * that initializes cpu_cache_get for all new cpus
1239	 */
1240	register_cpu_notifier(&cpucache_notifier);
1241
1242	/* The reap timers are started later, with a module init call:
1243	 * That part of the kernel is not yet operational.
1244	 */
1245}
1246
1247static int __init cpucache_init(void)
1248{
1249	int cpu;
1250
1251	/*
1252	 * Register the timers that return unneeded
1253	 * pages to gfp.
1254	 */
1255	for_each_online_cpu(cpu)
1256	    start_cpu_timer(cpu);
1257
1258	return 0;
1259}
1260
1261__initcall(cpucache_init);
1262
1263/*
1264 * Interface to system's page allocator. No need to hold the cache-lock.
1265 *
1266 * If we requested dmaable memory, we will get it. Even if we
1267 * did not request dmaable memory, we might get it, but that
1268 * would be relatively rare and ignorable.
1269 */
1270static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1271{
1272	struct page *page;
1273	void *addr;
1274	int i;
1275
1276	flags |= cachep->gfpflags;
1277	page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1278	if (!page)
1279		return NULL;
1280	addr = page_address(page);
1281
1282	i = (1 << cachep->gfporder);
1283	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1284		atomic_add(i, &slab_reclaim_pages);
1285	add_page_state(nr_slab, i);
1286	while (i--) {
1287		SetPageSlab(page);
1288		page++;
1289	}
1290	return addr;
1291}
1292
1293/*
1294 * Interface to system's page release.
1295 */
1296static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1297{
1298	unsigned long i = (1 << cachep->gfporder);
1299	struct page *page = virt_to_page(addr);
1300	const unsigned long nr_freed = i;
1301
1302	while (i--) {
1303		if (!TestClearPageSlab(page))
1304			BUG();
1305		page++;
1306	}
1307	sub_page_state(nr_slab, nr_freed);
1308	if (current->reclaim_state)
1309		current->reclaim_state->reclaimed_slab += nr_freed;
1310	free_pages((unsigned long)addr, cachep->gfporder);
1311	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1312		atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages);
1313}
1314
1315static void kmem_rcu_free(struct rcu_head *head)
1316{
1317	struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1318	struct kmem_cache *cachep = slab_rcu->cachep;
1319
1320	kmem_freepages(cachep, slab_rcu->addr);
1321	if (OFF_SLAB(cachep))
1322		kmem_cache_free(cachep->slabp_cache, slab_rcu);
1323}
1324
1325#if DEBUG
1326
1327#ifdef CONFIG_DEBUG_PAGEALLOC
1328static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1329			    unsigned long caller)
1330{
1331	int size = obj_size(cachep);
1332
1333	addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
1334
1335	if (size < 5 * sizeof(unsigned long))
1336		return;
1337
1338	*addr++ = 0x12345678;
1339	*addr++ = caller;
1340	*addr++ = smp_processor_id();
1341	size -= 3 * sizeof(unsigned long);
1342	{
1343		unsigned long *sptr = &caller;
1344		unsigned long svalue;
1345
1346		while (!kstack_end(sptr)) {
1347			svalue = *sptr++;
1348			if (kernel_text_address(svalue)) {
1349				*addr++ = svalue;
1350				size -= sizeof(unsigned long);
1351				if (size <= sizeof(unsigned long))
1352					break;
1353			}
1354		}
1355
1356	}
1357	*addr++ = 0x87654321;
1358}
1359#endif
1360
1361static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1362{
1363	int size = obj_size(cachep);
1364	addr = &((char *)addr)[obj_offset(cachep)];
1365
1366	memset(addr, val, size);
1367	*(unsigned char *)(addr + size - 1) = POISON_END;
1368}
1369
1370static void dump_line(char *data, int offset, int limit)
1371{
1372	int i;
1373	printk(KERN_ERR "%03x:", offset);
1374	for (i = 0; i < limit; i++) {
1375		printk(" %02x", (unsigned char)data[offset + i]);
1376	}
1377	printk("\n");
1378}
1379#endif
1380
1381#if DEBUG
1382
1383static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1384{
1385	int i, size;
1386	char *realobj;
1387
1388	if (cachep->flags & SLAB_RED_ZONE) {
1389		printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
1390		       *dbg_redzone1(cachep, objp),
1391		       *dbg_redzone2(cachep, objp));
1392	}
1393
1394	if (cachep->flags & SLAB_STORE_USER) {
1395		printk(KERN_ERR "Last user: [<%p>]",
1396		       *dbg_userword(cachep, objp));
1397		print_symbol("(%s)",
1398			     (unsigned long)*dbg_userword(cachep, objp));
1399		printk("\n");
1400	}
1401	realobj = (char *)objp + obj_offset(cachep);
1402	size = obj_size(cachep);
1403	for (i = 0; i < size && lines; i += 16, lines--) {
1404		int limit;
1405		limit = 16;
1406		if (i + limit > size)
1407			limit = size - i;
1408		dump_line(realobj, i, limit);
1409	}
1410}
1411
1412static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1413{
1414	char *realobj;
1415	int size, i;
1416	int lines = 0;
1417
1418	realobj = (char *)objp + obj_offset(cachep);
1419	size = obj_size(cachep);
1420
1421	for (i = 0; i < size; i++) {
1422		char exp = POISON_FREE;
1423		if (i == size - 1)
1424			exp = POISON_END;
1425		if (realobj[i] != exp) {
1426			int limit;
1427			/* Mismatch ! */
1428			/* Print header */
1429			if (lines == 0) {
1430				printk(KERN_ERR
1431				       "Slab corruption: start=%p, len=%d\n",
1432				       realobj, size);
1433				print_objinfo(cachep, objp, 0);
1434			}
1435			/* Hexdump the affected line */
1436			i = (i / 16) * 16;
1437			limit = 16;
1438			if (i + limit > size)
1439				limit = size - i;
1440			dump_line(realobj, i, limit);
1441			i += 16;
1442			lines++;
1443			/* Limit to 5 lines */
1444			if (lines > 5)
1445				break;
1446		}
1447	}
1448	if (lines != 0) {
1449		/* Print some data about the neighboring objects, if they
1450		 * exist:
1451		 */
1452		struct slab *slabp = virt_to_slab(objp);
1453		int objnr;
1454
1455		objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
1456		if (objnr) {
1457			objp = slabp->s_mem + (objnr - 1) * cachep->buffer_size;
1458			realobj = (char *)objp + obj_offset(cachep);
1459			printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1460			       realobj, size);
1461			print_objinfo(cachep, objp, 2);
1462		}
1463		if (objnr + 1 < cachep->num) {
1464			objp = slabp->s_mem + (objnr + 1) * cachep->buffer_size;
1465			realobj = (char *)objp + obj_offset(cachep);
1466			printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1467			       realobj, size);
1468			print_objinfo(cachep, objp, 2);
1469		}
1470	}
1471}
1472#endif
1473
1474#if DEBUG
1475/**
1476 * slab_destroy_objs - call the registered destructor for each object in
1477 *      a slab that is to be destroyed.
1478 */
1479static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1480{
1481	int i;
1482	for (i = 0; i < cachep->num; i++) {
1483		void *objp = slabp->s_mem + cachep->buffer_size * i;
1484
1485		if (cachep->flags & SLAB_POISON) {
1486#ifdef CONFIG_DEBUG_PAGEALLOC
1487			if ((cachep->buffer_size % PAGE_SIZE) == 0
1488			    && OFF_SLAB(cachep))
1489				kernel_map_pages(virt_to_page(objp),
1490						 cachep->buffer_size / PAGE_SIZE,
1491						 1);
1492			else
1493				check_poison_obj(cachep, objp);
1494#else
1495			check_poison_obj(cachep, objp);
1496#endif
1497		}
1498		if (cachep->flags & SLAB_RED_ZONE) {
1499			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1500				slab_error(cachep, "start of a freed object "
1501					   "was overwritten");
1502			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1503				slab_error(cachep, "end of a freed object "
1504					   "was overwritten");
1505		}
1506		if (cachep->dtor && !(cachep->flags & SLAB_POISON))
1507			(cachep->dtor) (objp + obj_offset(cachep), cachep, 0);
1508	}
1509}
1510#else
1511static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1512{
1513	if (cachep->dtor) {
1514		int i;
1515		for (i = 0; i < cachep->num; i++) {
1516			void *objp = slabp->s_mem + cachep->buffer_size * i;
1517			(cachep->dtor) (objp, cachep, 0);
1518		}
1519	}
1520}
1521#endif
1522
1523/**
1524 * Destroy all the objs in a slab, and release the mem back to the system.
1525 * Before calling, the slab must have been unlinked from the cache.
1526 * The cache-lock is not held/needed.
1527 */
1528static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1529{
1530	void *addr = slabp->s_mem - slabp->colouroff;
1531
1532	slab_destroy_objs(cachep, slabp);
1533	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1534		struct slab_rcu *slab_rcu;
1535
1536		slab_rcu = (struct slab_rcu *)slabp;
1537		slab_rcu->cachep = cachep;
1538		slab_rcu->addr = addr;
1539		call_rcu(&slab_rcu->head, kmem_rcu_free);
1540	} else {
1541		kmem_freepages(cachep, addr);
1542		if (OFF_SLAB(cachep))
1543			kmem_cache_free(cachep->slabp_cache, slabp);
1544	}
1545}
1546
1547/* For setting up all the kmem_list3s for a cache whose buffer_size is the
1548   same as the size of kmem_list3. */
1549static void set_up_list3s(struct kmem_cache *cachep, int index)
1550{
1551	int node;
1552
1553	for_each_online_node(node) {
1554		cachep->nodelists[node] = &initkmem_list3[index + node];
1555		cachep->nodelists[node]->next_reap = jiffies +
1556		    REAPTIMEOUT_LIST3 +
1557		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1558	}
1559}
1560
1561/**
1562 * calculate_slab_order - calculate size (page order) of slabs
1563 * @cachep: pointer to the cache that is being created
1564 * @size: size of objects to be created in this cache.
1565 * @align: required alignment for the objects.
1566 * @flags: slab allocation flags
1567 *
1568 * Also calculates the number of objects per slab.
1569 *
1570 * This could be made much more intelligent.  For now, try to avoid using
1571 * high order pages for slabs.  When the gfp() functions are more friendly
1572 * towards high-order requests, this should be changed.
1573 */
1574static inline size_t calculate_slab_order(struct kmem_cache *cachep,
1575			size_t size, size_t align, unsigned long flags)
1576{
1577	size_t left_over = 0;
1578
1579	for (;; cachep->gfporder++) {
1580		unsigned int num;
1581		size_t remainder;
1582
1583		if (cachep->gfporder > MAX_GFP_ORDER) {
1584			cachep->num = 0;
1585			break;
1586		}
1587
1588		cache_estimate(cachep->gfporder, size, align, flags,
1589			       &remainder, &num);
1590		if (!num)
1591			continue;
1592		/* More than offslab_limit objects will cause problems */
1593		if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit)
1594			break;
1595
1596		cachep->num = num;
1597		left_over = remainder;
1598
1599		/*
1600		 * Large number of objects is good, but very large slabs are
1601		 * currently bad for the gfp()s.
1602		 */
1603		if (cachep->gfporder >= slab_break_gfp_order)
1604			break;
1605
1606		if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder))
1607			/* Acceptable internal fragmentation */
1608			break;
1609	}
1610	return left_over;
1611}
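/*
 * Worked example for calculate_slab_order() (illustrative only), assuming
 * PAGE_SIZE == 4096, slab_break_gfp_order == 0, align == 32, on-slab
 * management, sizeof(struct slab) == 32, sizeof(kmem_bufctl_t) == 4 and
 * size == 1024:
 *
 *   gfporder 0: cache_estimate() yields num = 3, remainder = 960
 *   num != 0 and gfporder >= slab_break_gfp_order, so the loop stops
 *   result: order-0 slabs with 3 objects and 960 bytes left over
 *
 * A larger slab_break_gfp_order (set on machines with more than 32MB of
 * memory) lets the loop try higher orders to reduce that waste.
 */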
1612
1613/**
1614 * kmem_cache_create - Create a cache.
1615 * @name: A string which is used in /proc/slabinfo to identify this cache.
1616 * @size: The size of objects to be created in this cache.
1617 * @align: The required alignment for the objects.
1618 * @flags: SLAB flags
1619 * @ctor: A constructor for the objects.
1620 * @dtor: A destructor for the objects.
1621 *
1622 * Returns a ptr to the cache on success, NULL on failure.
1623 * Cannot be called within an interrupt, but can be interrupted.
1624 * The @ctor is run when new pages are allocated by the cache
1625 * and the @dtor is run before the pages are handed back.
1626 *
1627 * @name must be valid until the cache is destroyed. This implies that
1628 * the module calling this has to destroy the cache before getting
1629 * unloaded.
1630 *
1631 * The flags are
1632 *
1633 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
1634 * to catch references to uninitialised memory.
1635 *
1636 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
1637 * for buffer overruns.
1638 *
1639 * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
1640 * memory pressure.
1641 *
1642 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
1643 * cacheline.  This can be beneficial if you're counting cycles as closely
1644 * as davem.
1645 */
1646struct kmem_cache *
1647kmem_cache_create (const char *name, size_t size, size_t align,
1648	unsigned long flags, void (*ctor)(void*, struct kmem_cache *, unsigned long),
1649	void (*dtor)(void*, struct kmem_cache *, unsigned long))
1650{
1651	size_t left_over, slab_size, ralign;
1652	struct kmem_cache *cachep = NULL;
1653	struct list_head *p;
1654
1655	/*
1656	 * Sanity checks... these are all serious usage bugs.
1657	 */
1658	if ((!name) ||
1659	    in_interrupt() ||
1660	    (size < BYTES_PER_WORD) ||
1661	    (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
1662		printk(KERN_ERR "%s: Early error in slab %s\n",
1663		       __FUNCTION__, name);
1664		BUG();
1665	}
1666
1667	mutex_lock(&cache_chain_mutex);
1668
1669	list_for_each(p, &cache_chain) {
1670		struct kmem_cache *pc = list_entry(p, struct kmem_cache, next);
1671		mm_segment_t old_fs = get_fs();
1672		char tmp;
1673		int res;
1674
1675		/*
1676		 * This happens when the module gets unloaded and doesn't
1677		 * destroy its slab cache and no-one else reuses the vmalloc
1678		 * area of the module.  Print a warning.
1679		 */
1680		set_fs(KERNEL_DS);
1681		res = __get_user(tmp, pc->name);
1682		set_fs(old_fs);
1683		if (res) {
1684			printk("SLAB: cache with size %d has lost its name\n",
1685			       pc->buffer_size);
1686			continue;
1687		}
1688
1689		if (!strcmp(pc->name, name)) {
1690			printk("kmem_cache_create: duplicate cache %s\n", name);
1691			dump_stack();
1692			goto oops;
1693		}
1694	}
1695
1696#if DEBUG
1697	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
1698	if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
1699		/* No constructor, but initial state check requested */
1700		printk(KERN_ERR "%s: No con, but init state check "
1701		       "requested - %s\n", __FUNCTION__, name);
1702		flags &= ~SLAB_DEBUG_INITIAL;
1703	}
1704#if FORCED_DEBUG
1705	/*
1706	 * Enable redzoning and last user accounting, except for caches with
1707	 * large objects, if the increased size would increase the object size
1708	 * above the next power of two: caches with object sizes just above a
1709	 * power of two have a significant amount of internal fragmentation.
1710	 */
1711	if ((size < 4096
1712	     || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD)))
1713		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
1714	if (!(flags & SLAB_DESTROY_BY_RCU))
1715		flags |= SLAB_POISON;
1716#endif
1717	if (flags & SLAB_DESTROY_BY_RCU)
1718		BUG_ON(flags & SLAB_POISON);
1719#endif
1720	if (flags & SLAB_DESTROY_BY_RCU)
1721		BUG_ON(dtor);
1722
1723	/*
1724	 * Always check flags; a caller might be expecting debug
1725	 * support which isn't available.
1726	 */
1727	if (flags & ~CREATE_MASK)
1728		BUG();
1729
1730	/* Check that size is in terms of words.  This is needed to avoid
1731	 * unaligned accesses for some archs when redzoning is used, and makes
1732	 * sure any on-slab bufctl's are also correctly aligned.
1733	 */
1734	if (size & (BYTES_PER_WORD - 1)) {
1735		size += (BYTES_PER_WORD - 1);
1736		size &= ~(BYTES_PER_WORD - 1);
1737	}
1738
1739	/* calculate the final buffer alignment: */
1740	/* 1) arch recommendation: can be overridden for debug */
1741	if (flags & SLAB_HWCACHE_ALIGN) {
1742		/* Default alignment: as specified by the arch code.
1743		 * Except if an object is really small, then squeeze multiple
1744		 * objects into one cacheline.
1745		 */
1746		ralign = cache_line_size();
1747		while (size <= ralign / 2)
1748			ralign /= 2;
1749	} else {
1750		ralign = BYTES_PER_WORD;
1751	}
1752	/* 2) arch mandated alignment: disables debug if necessary */
1753	if (ralign < ARCH_SLAB_MINALIGN) {
1754		ralign = ARCH_SLAB_MINALIGN;
1755		if (ralign > BYTES_PER_WORD)
1756			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
1757	}
1758	/* 3) caller mandated alignment: disables debug if necessary */
1759	if (ralign < align) {
1760		ralign = align;
1761		if (ralign > BYTES_PER_WORD)
1762			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
1763	}
1764	/* 4) Store it. Note that the debug code below can reduce
1765	 *    the alignment to BYTES_PER_WORD.
1766	 */
1767	align = ralign;
1768
1769	/* Get cache's description obj. */
1770	cachep = kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
1771	if (!cachep)
1772		goto oops;
1773	memset(cachep, 0, sizeof(struct kmem_cache));
1774
1775#if DEBUG
1776	cachep->obj_size = size;
1777
1778	if (flags & SLAB_RED_ZONE) {
1779		/* redzoning only works with word aligned caches */
1780		align = BYTES_PER_WORD;
1781
1782		/* add space for red zone words */
1783		cachep->obj_offset += BYTES_PER_WORD;
1784		size += 2 * BYTES_PER_WORD;
1785	}
1786	if (flags & SLAB_STORE_USER) {
1787		/* user store requires word alignment and
1788		 * one word of storage after the end of the real
1789		 * object.
1790		 */
1791		align = BYTES_PER_WORD;
1792		size += BYTES_PER_WORD;
1793	}
1794#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
1795	if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
1796	    && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
1797		cachep->obj_offset += PAGE_SIZE - size;
1798		size = PAGE_SIZE;
1799	}
1800#endif
1801#endif
1802
1803	/* Determine if the slab management is 'on' or 'off' slab. */
1804	if (size >= (PAGE_SIZE >> 3))
1805		/*
1806		 * Size is large, assume best to place the slab management obj
1807		 * off-slab (should allow better packing of objs).
1808		 */
1809		flags |= CFLGS_OFF_SLAB;
1810
1811	size = ALIGN(size, align);
1812
1813	if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) {
1814		/*
1815		 * A VFS-reclaimable slab tends to have most allocations
1816		 * as GFP_NOFS and we really don't want to have to be allocating
1817		 * higher-order pages when we are unable to shrink dcache.
1818		 */
1819		cachep->gfporder = 0;
1820		cache_estimate(cachep->gfporder, size, align, flags,
1821			       &left_over, &cachep->num);
1822	} else
1823		left_over = calculate_slab_order(cachep, size, align, flags);
1824
1825	if (!cachep->num) {
1826		printk(KERN_ERR "kmem_cache_create: couldn't create cache %s.\n", name);
1827		kmem_cache_free(&cache_cache, cachep);
1828		cachep = NULL;
1829		goto oops;
1830	}
1831	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
1832			  + sizeof(struct slab), align);
1833
1834	/*
1835	 * If the slab has been placed off-slab, and we have enough space then
1836	 * move it on-slab. This is at the expense of any extra colouring.
1837	 */
1838	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
1839		flags &= ~CFLGS_OFF_SLAB;
1840		left_over -= slab_size;
1841	}
1842
1843	if (flags & CFLGS_OFF_SLAB) {
1844		/* really off slab. No need for manual alignment */
1845		slab_size =
1846		    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
1847	}
1848
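	/*
	 * Note on slab colouring: successive slabs of a cache start their
	 * objects at different offsets (multiples of colour_off, cycling
	 * through 'colour' values, see cache_grow()), so that objects from
	 * different slabs are less likely to land on the same cache lines.
	 */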
1849	cachep->colour_off = cache_line_size();
1850	/* Offset must be a multiple of the alignment. */
1851	if (cachep->colour_off < align)
1852		cachep->colour_off = align;
1853	cachep->colour = left_over / cachep->colour_off;
1854	cachep->slab_size = slab_size;
1855	cachep->flags = flags;
1856	cachep->gfpflags = 0;
1857	if (flags & SLAB_CACHE_DMA)
1858		cachep->gfpflags |= GFP_DMA;
1859	spin_lock_init(&cachep->spinlock);
1860	cachep->buffer_size = size;
1861
1862	if (flags & CFLGS_OFF_SLAB)
1863		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
1864	cachep->ctor = ctor;
1865	cachep->dtor = dtor;
1866	cachep->name = name;
1867
1868	/* Don't let CPUs come and go */
1869	lock_cpu_hotplug();
1870
1871	if (g_cpucache_up == FULL) {
1872		enable_cpucache(cachep);
1873	} else {
1874		if (g_cpucache_up == NONE) {
1875			/* Note: the first kmem_cache_create must create
1876			 * the cache that's used by kmalloc(24), otherwise
1877			 * the creation of further caches will BUG().
1878			 */
1879			cachep->array[smp_processor_id()] =
1880			    &initarray_generic.cache;
1881
1882			/* If the cache that's used by
1883			 * kmalloc(sizeof(kmem_list3)) is the first cache,
1884			 * then we need to set up all its list3s, otherwise
1885			 * the creation of further caches will BUG().
1886			 */
1887			set_up_list3s(cachep, SIZE_AC);
1888			if (INDEX_AC == INDEX_L3)
1889				g_cpucache_up = PARTIAL_L3;
1890			else
1891				g_cpucache_up = PARTIAL_AC;
1892		} else {
1893			cachep->array[smp_processor_id()] =
1894			    kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1895
1896			if (g_cpucache_up == PARTIAL_AC) {
1897				set_up_list3s(cachep, SIZE_L3);
1898				g_cpucache_up = PARTIAL_L3;
1899			} else {
1900				int node;
1901				for_each_online_node(node) {
1902
1903				cachep->nodelists[node] =
1904				    kmalloc_node(sizeof(struct kmem_list3),
1905						 GFP_KERNEL, node);
1906				BUG_ON(!cachep->nodelists[node]);
1907				kmem_list3_init(cachep->nodelists[node]);
1910				}
1911			}
1912		}
1913		cachep->nodelists[numa_node_id()]->next_reap =
1914		    jiffies + REAPTIMEOUT_LIST3 +
1915		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1916
1917		BUG_ON(!cpu_cache_get(cachep));
1918		cpu_cache_get(cachep)->avail = 0;
1919		cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
1920		cpu_cache_get(cachep)->batchcount = 1;
1921		cpu_cache_get(cachep)->touched = 0;
1922		cachep->batchcount = 1;
1923		cachep->limit = BOOT_CPUCACHE_ENTRIES;
1924	}
1925
1926	/* cache setup completed, link it into the list */
1927	list_add(&cachep->next, &cache_chain);
1928	unlock_cpu_hotplug();
1929      oops:
1930	if (!cachep && (flags & SLAB_PANIC))
1931		panic("kmem_cache_create(): failed to create slab `%s'\n",
1932		      name);
1933	mutex_unlock(&cache_chain_mutex);
1934	return cachep;
1935}
1936EXPORT_SYMBOL(kmem_cache_create);
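
/*
 * Illustrative usage sketch, assuming a hypothetical module with its own
 * 'struct foo' (none of these names exist in the kernel).  The constructor
 * runs from cache_init_objs() when a slab is populated, not on every
 * allocation.
 *
 *	static struct kmem_cache *foo_cachep;
 *
 *	static void foo_ctor(void *obj, struct kmem_cache *cachep,
 *			     unsigned long flags)
 *	{
 *		if (flags & SLAB_CTOR_CONSTRUCTOR)
 *			memset(obj, 0, sizeof(struct foo));
 *	}
 *
 *	static int __init foo_init(void)
 *	{
 *		foo_cachep = kmem_cache_create("foo_cache",
 *					       sizeof(struct foo), 0,
 *					       SLAB_HWCACHE_ALIGN,
 *					       foo_ctor, NULL);
 *		return foo_cachep ? 0 : -ENOMEM;
 *	}
 */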
1937
1938#if DEBUG
1939static void check_irq_off(void)
1940{
1941	BUG_ON(!irqs_disabled());
1942}
1943
1944static void check_irq_on(void)
1945{
1946	BUG_ON(irqs_disabled());
1947}
1948
1949static void check_spinlock_acquired(struct kmem_cache *cachep)
1950{
1951#ifdef CONFIG_SMP
1952	check_irq_off();
1953	assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
1954#endif
1955}
1956
1957static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
1958{
1959#ifdef CONFIG_SMP
1960	check_irq_off();
1961	assert_spin_locked(&cachep->nodelists[node]->list_lock);
1962#endif
1963}
1964
1965#else
1966#define check_irq_off()	do { } while(0)
1967#define check_irq_on()	do { } while(0)
1968#define check_spinlock_acquired(x) do { } while(0)
1969#define check_spinlock_acquired_node(x, y) do { } while(0)
1970#endif
1971
1972/*
1973 * Waits for all CPUs to execute func().
1974 */
1975static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
1976{
1977	check_irq_on();
1978	preempt_disable();
1979
1980	local_irq_disable();
1981	func(arg);
1982	local_irq_enable();
1983
1984	if (smp_call_function(func, arg, 1, 1))
1985		BUG();
1986
1987	preempt_enable();
1988}
1989
1990static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
1991				int force, int node);
1992
1993static void do_drain(void *arg)
1994{
1995	struct kmem_cache *cachep = (struct kmem_cache *) arg;
1996	struct array_cache *ac;
1997	int node = numa_node_id();
1998
1999	check_irq_off();
2000	ac = cpu_cache_get(cachep);
2001	spin_lock(&cachep->nodelists[node]->list_lock);
2002	free_block(cachep, ac->entry, ac->avail, node);
2003	spin_unlock(&cachep->nodelists[node]->list_lock);
2004	ac->avail = 0;
2005}
2006
2007static void drain_cpu_caches(struct kmem_cache *cachep)
2008{
2009	struct kmem_list3 *l3;
2010	int node;
2011
2012	smp_call_function_all_cpus(do_drain, cachep);
2013	check_irq_on();
2014	spin_lock(&cachep->spinlock);
2015	for_each_online_node(node) {
2016		l3 = cachep->nodelists[node];
2017		if (l3) {
2018			spin_lock_irq(&l3->list_lock);
2019			drain_array_locked(cachep, l3->shared, 1, node);
2020			spin_unlock_irq(&l3->list_lock);
2021			if (l3->alien)
2022				drain_alien_cache(cachep, l3);
2023		}
2024	}
2025	spin_unlock(&cachep->spinlock);
2026}
2027
2028static int __node_shrink(struct kmem_cache *cachep, int node)
2029{
2030	struct slab *slabp;
2031	struct kmem_list3 *l3 = cachep->nodelists[node];
2032	int ret;
2033
2034	for (;;) {
2035		struct list_head *p;
2036
2037		p = l3->slabs_free.prev;
2038		if (p == &l3->slabs_free)
2039			break;
2040
2041		slabp = list_entry(l3->slabs_free.prev, struct slab, list);
2042#if DEBUG
2043		if (slabp->inuse)
2044			BUG();
2045#endif
2046		list_del(&slabp->list);
2047
2048		l3->free_objects -= cachep->num;
2049		spin_unlock_irq(&l3->list_lock);
2050		slab_destroy(cachep, slabp);
2051		spin_lock_irq(&l3->list_lock);
2052	}
2053	ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial);
2054	return ret;
2055}
2056
2057static int __cache_shrink(struct kmem_cache *cachep)
2058{
2059	int ret = 0, i = 0;
2060	struct kmem_list3 *l3;
2061
2062	drain_cpu_caches(cachep);
2063
2064	check_irq_on();
2065	for_each_online_node(i) {
2066		l3 = cachep->nodelists[i];
2067		if (l3) {
2068			spin_lock_irq(&l3->list_lock);
2069			ret += __node_shrink(cachep, i);
2070			spin_unlock_irq(&l3->list_lock);
2071		}
2072	}
2073	return (ret ? 1 : 0);
2074}
2075
2076/**
2077 * kmem_cache_shrink - Shrink a cache.
2078 * @cachep: The cache to shrink.
2079 *
2080 * Releases as many slabs as possible for a cache.
2081 * To help debugging, a zero exit status indicates all slabs were released.
2082 */
2083int kmem_cache_shrink(struct kmem_cache *cachep)
2084{
2085	if (!cachep || in_interrupt())
2086		BUG();
2087
2088	return __cache_shrink(cachep);
2089}
2090EXPORT_SYMBOL(kmem_cache_shrink);
2091
2092/**
2093 * kmem_cache_destroy - delete a cache
2094 * @cachep: the cache to destroy
2095 *
2096 * Remove a struct kmem_cache object from the slab cache.
2097 * Returns 0 on success.
2098 *
2099 * It is expected this function will be called by a module when it is
2100 * unloaded.  This will remove the cache completely, and avoid a duplicate
2101 * cache being allocated each time a module is loaded and unloaded, if the
2102 * module doesn't have persistent in-kernel storage across loads and unloads.
2103 *
2104 * The cache must be empty before calling this function.
2105 *
2106 * The caller must guarantee that no one will allocate memory from the cache
2107 * during the kmem_cache_destroy().
2108 */
2109int kmem_cache_destroy(struct kmem_cache *cachep)
2110{
2111	int i;
2112	struct kmem_list3 *l3;
2113
2114	if (!cachep || in_interrupt())
2115		BUG();
2116
2117	/* Don't let CPUs come and go */
2118	lock_cpu_hotplug();
2119
2120	/* Find the cache in the chain of caches. */
2121	mutex_lock(&cache_chain_mutex);
2122	/*
2123	 * the chain is never empty, cache_cache is never destroyed
2124	 */
2125	list_del(&cachep->next);
2126	mutex_unlock(&cache_chain_mutex);
2127
2128	if (__cache_shrink(cachep)) {
2129		slab_error(cachep, "Can't free all objects");
2130		mutex_lock(&cache_chain_mutex);
2131		list_add(&cachep->next, &cache_chain);
2132		mutex_unlock(&cache_chain_mutex);
2133		unlock_cpu_hotplug();
2134		return 1;
2135	}
2136
2137	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2138		synchronize_rcu();
2139
2140	for_each_online_cpu(i)
2141	    kfree(cachep->array[i]);
2142
2143	/* NUMA: free the list3 structures */
2144	for_each_online_node(i) {
2145		if ((l3 = cachep->nodelists[i])) {
2146			kfree(l3->shared);
2147			free_alien_cache(l3->alien);
2148			kfree(l3);
2149		}
2150	}
2151	kmem_cache_free(&cache_cache, cachep);
2152
2153	unlock_cpu_hotplug();
2154
2155	return 0;
2156}
2157EXPORT_SYMBOL(kmem_cache_destroy);
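
/*
 * Illustrative teardown sketch for the hypothetical cache from the
 * kmem_cache_create() sketch above.  All objects must already have been
 * freed and no allocation may race with the destroy, as documented above
 * kmem_cache_destroy().
 *
 *	static void __exit foo_exit(void)
 *	{
 *		if (kmem_cache_destroy(foo_cachep))
 *			printk(KERN_ERR "foo: cache still has objects\n");
 *	}
 */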
2158
2159/* Get the memory for a slab management obj. */
2160static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2161				   int colour_off, gfp_t local_flags)
2162{
2163	struct slab *slabp;
2164
2165	if (OFF_SLAB(cachep)) {
2166		/* Slab management obj is off-slab. */
2167		slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags);
2168		if (!slabp)
2169			return NULL;
2170	} else {
2171		slabp = objp + colour_off;
2172		colour_off += cachep->slab_size;
2173	}
2174	slabp->inuse = 0;
2175	slabp->colouroff = colour_off;
2176	slabp->s_mem = objp + colour_off;
2177
2178	return slabp;
2179}
2180
2181static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2182{
2183	return (kmem_bufctl_t *) (slabp + 1);
2184}
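
/*
 * Note on the on-slab layout, as set up by alloc_slabmgmt() above and
 * cache_init_objs() below:
 *
 *	| colour padding | struct slab | kmem_bufctl_t[num] | objects ... |
 *
 * slab_bufctl(slabp)[i] holds the index of the free object that follows
 * object i on the freelist; slabp->free is the head of that index-linked
 * list and BUFCTL_END terminates it.
 */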
2185
2186static void cache_init_objs(struct kmem_cache *cachep,
2187			    struct slab *slabp, unsigned long ctor_flags)
2188{
2189	int i;
2190
2191	for (i = 0; i < cachep->num; i++) {
2192		void *objp = slabp->s_mem + cachep->buffer_size * i;
2193#if DEBUG
2194		/* need to poison the objs? */
2195		if (cachep->flags & SLAB_POISON)
2196			poison_obj(cachep, objp, POISON_FREE);
2197		if (cachep->flags & SLAB_STORE_USER)
2198			*dbg_userword(cachep, objp) = NULL;
2199
2200		if (cachep->flags & SLAB_RED_ZONE) {
2201			*dbg_redzone1(cachep, objp) = RED_INACTIVE;
2202			*dbg_redzone2(cachep, objp) = RED_INACTIVE;
2203		}
2204		/*
2205		 * Constructors are not allowed to allocate memory from
2206		 * the same cache which they are a constructor for.
2207		 * Otherwise, deadlock. They must also be threaded.
2208		 */
2209		if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2210			cachep->ctor(objp + obj_offset(cachep), cachep,
2211				     ctor_flags);
2212
2213		if (cachep->flags & SLAB_RED_ZONE) {
2214			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2215				slab_error(cachep, "constructor overwrote the"
2216					   " end of an object");
2217			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2218				slab_error(cachep, "constructor overwrote the"
2219					   " start of an object");
2220		}
2221		if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)
2222		    && cachep->flags & SLAB_POISON)
2223			kernel_map_pages(virt_to_page(objp),
2224					 cachep->buffer_size / PAGE_SIZE, 0);
2225#else
2226		if (cachep->ctor)
2227			cachep->ctor(objp, cachep, ctor_flags);
2228#endif
2229		slab_bufctl(slabp)[i] = i + 1;
2230	}
2231	slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2232	slabp->free = 0;
2233}
2234
2235static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2236{
2237	if (flags & SLAB_DMA) {
2238		if (!(cachep->gfpflags & GFP_DMA))
2239			BUG();
2240	} else {
2241		if (cachep->gfpflags & GFP_DMA)
2242			BUG();
2243	}
2244}
2245
2246static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid)
2247{
2248	void *objp = slabp->s_mem + (slabp->free * cachep->buffer_size);
2249	kmem_bufctl_t next;
2250
2251	slabp->inuse++;
2252	next = slab_bufctl(slabp)[slabp->free];
2253#if DEBUG
2254	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2255	WARN_ON(slabp->nodeid != nodeid);
2256#endif
2257	slabp->free = next;
2258
2259	return objp;
2260}
2261
2262static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp,
2263			  int nodeid)
2264{
2265	unsigned int objnr = (unsigned)(objp-slabp->s_mem) / cachep->buffer_size;
2266
2267#if DEBUG
2268	/* Verify that the slab belongs to the intended node */
2269	WARN_ON(slabp->nodeid != nodeid);
2270
2271	if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
2272		printk(KERN_ERR "slab: double free detected in cache "
2273		       "'%s', objp %p\n", cachep->name, objp);
2274		BUG();
2275	}
2276#endif
2277	slab_bufctl(slabp)[objnr] = slabp->free;
2278	slabp->free = objnr;
2279	slabp->inuse--;
2280}
2281
2282static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, void *objp)
2283{
2284	int i;
2285	struct page *page;
2286
2287	/* Nasty!!!!!! I hope this is OK. */
2288	i = 1 << cachep->gfporder;
2289	page = virt_to_page(objp);
2290	do {
2291		page_set_cache(page, cachep);
2292		page_set_slab(page, slabp);
2293		page++;
2294	} while (--i);
2295}
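
/*
 * Note: set_slab_attr() leaves a back-pointer to the cache and to the
 * struct slab in every page of the slab (page_set_cache() and
 * page_set_slab()); the free and debug paths rely on this through
 * virt_to_cache()/virt_to_slab() and page_get_cache()/page_get_slab().
 */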
2296
2297/*
2298 * Grow (by 1) the number of slabs within a cache.  This is called by
2299 * kmem_cache_alloc() when there are no active objs left in a cache.
2300 */
2301static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2302{
2303	struct slab *slabp;
2304	void *objp;
2305	size_t offset;
2306	gfp_t local_flags;
2307	unsigned long ctor_flags;
2308	struct kmem_list3 *l3;
2309
2310	/* Be lazy and only check for valid flags here,
2311	 * keeping it out of the critical path in kmem_cache_alloc().
2312	 */
2313	if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW))
2314		BUG();
2315	if (flags & SLAB_NO_GROW)
2316		return 0;
2317
2318	ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2319	local_flags = (flags & SLAB_LEVEL_MASK);
2320	if (!(local_flags & __GFP_WAIT))
2321		/*
2322		 * Not allowed to sleep.  Need to tell a constructor about
2323		 * this - it might need to know...
2324		 */
2325		ctor_flags |= SLAB_CTOR_ATOMIC;
2326
2327	/* Take the l3 list lock to change the colour_next on this node */
2328	check_irq_off();
2329	l3 = cachep->nodelists[nodeid];
2330	spin_lock(&l3->list_lock);
2331
2332	/* Get colour for the slab, and calculate the next value. */
2333	offset = l3->colour_next;
2334	l3->colour_next++;
2335	if (l3->colour_next >= cachep->colour)
2336		l3->colour_next = 0;
2337	spin_unlock(&l3->list_lock);
2338
2339	offset *= cachep->colour_off;
2340
2341	if (local_flags & __GFP_WAIT)
2342		local_irq_enable();
2343
2344	/*
2345	 * The test for missing atomic flag is performed here, rather than
2346	 * the more obvious place, simply to reduce the critical path length
2347	 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
2348	 * will eventually be caught here (where it matters).
2349	 */
2350	kmem_flagcheck(cachep, flags);
2351
2352	/* Get memory for the objects.
2353	 * Attempt to allocate a physical page from 'nodeid'.
2354	 */
2355	if (!(objp = kmem_getpages(cachep, flags, nodeid)))
2356		goto failed;
2357
2358	/* Get slab management. */
2359	if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags)))
2360		goto opps1;
2361
2362	slabp->nodeid = nodeid;
2363	set_slab_attr(cachep, slabp, objp);
2364
2365	cache_init_objs(cachep, slabp, ctor_flags);
2366
2367	if (local_flags & __GFP_WAIT)
2368		local_irq_disable();
2369	check_irq_off();
2370	spin_lock(&l3->list_lock);
2371
2372	/* Make slab active. */
2373	list_add_tail(&slabp->list, &(l3->slabs_free));
2374	STATS_INC_GROWN(cachep);
2375	l3->free_objects += cachep->num;
2376	spin_unlock(&l3->list_lock);
2377	return 1;
2378      opps1:
2379	kmem_freepages(cachep, objp);
2380      failed:
2381	if (local_flags & __GFP_WAIT)
2382		local_irq_disable();
2383	return 0;
2384}
2385
2386#if DEBUG
2387
2388/*
2389 * Perform extra freeing checks:
2390 * - detect bad pointers.
2391 * - POISON/RED_ZONE checking
2392 * - destructor calls, for caches with POISON+dtor
2393 */
2394static void kfree_debugcheck(const void *objp)
2395{
2396	struct page *page;
2397
2398	if (!virt_addr_valid(objp)) {
2399		printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2400		       (unsigned long)objp);
2401		BUG();
2402	}
2403	page = virt_to_page(objp);
2404	if (!PageSlab(page)) {
2405		printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n",
2406		       (unsigned long)objp);
2407		BUG();
2408	}
2409}
2410
2411static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2412				   void *caller)
2413{
2414	struct page *page;
2415	unsigned int objnr;
2416	struct slab *slabp;
2417
2418	objp -= obj_offset(cachep);
2419	kfree_debugcheck(objp);
2420	page = virt_to_page(objp);
2421
2422	if (page_get_cache(page) != cachep) {
2423		printk(KERN_ERR
2424		       "mismatch in kmem_cache_free: expected cache %p, got %p\n",
2425		       page_get_cache(page), cachep);
2426		printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
2427		printk(KERN_ERR "%p is %s.\n", page_get_cache(page),
2428		       page_get_cache(page)->name);
2429		WARN_ON(1);
2430	}
2431	slabp = page_get_slab(page);
2432
2433	if (cachep->flags & SLAB_RED_ZONE) {
2434		if (*dbg_redzone1(cachep, objp) != RED_ACTIVE
2435		    || *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
2436			slab_error(cachep,
2437				   "double free, or memory outside"
2438				   " object was overwritten");
2439			printk(KERN_ERR
2440			       "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
2441			       objp, *dbg_redzone1(cachep, objp),
2442			       *dbg_redzone2(cachep, objp));
2443		}
2444		*dbg_redzone1(cachep, objp) = RED_INACTIVE;
2445		*dbg_redzone2(cachep, objp) = RED_INACTIVE;
2446	}
2447	if (cachep->flags & SLAB_STORE_USER)
2448		*dbg_userword(cachep, objp) = caller;
2449
2450	objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
2451
2452	BUG_ON(objnr >= cachep->num);
2453	BUG_ON(objp != slabp->s_mem + objnr * cachep->buffer_size);
2454
2455	if (cachep->flags & SLAB_DEBUG_INITIAL) {
2456		 * caller can verify its state (debugging).
2457		 * caller can perform a verify of its state (debugging).
2458		 * Called without the cache-lock held.
2459		 */
2460		cachep->ctor(objp + obj_offset(cachep),
2461			     cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
2462	}
2463	if (cachep->flags & SLAB_POISON && cachep->dtor) {
2464		/* we want to cache poison the object,
2465		 * call the destruction callback
2466		 */
2467		cachep->dtor(objp + obj_offset(cachep), cachep, 0);
2468	}
2469	if (cachep->flags & SLAB_POISON) {
2470#ifdef CONFIG_DEBUG_PAGEALLOC
2471		if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) {
2472			store_stackinfo(cachep, objp, (unsigned long)caller);
2473			kernel_map_pages(virt_to_page(objp),
2474					 cachep->buffer_size / PAGE_SIZE, 0);
2475		} else {
2476			poison_obj(cachep, objp, POISON_FREE);
2477		}
2478#else
2479		poison_obj(cachep, objp, POISON_FREE);
2480#endif
2481	}
2482	return objp;
2483}
2484
2485static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
2486{
2487	kmem_bufctl_t i;
2488	int entries = 0;
2489
2490	/* Check slab's freelist to see if this obj is there. */
2491	for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
2492		entries++;
2493		if (entries > cachep->num || i >= cachep->num)
2494			goto bad;
2495	}
2496	if (entries != cachep->num - slabp->inuse) {
2497	      bad:
2498		printk(KERN_ERR
2499		       "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2500		       cachep->name, cachep->num, slabp, slabp->inuse);
2501		for (i = 0;
2502		     i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
2503		     i++) {
2504			if ((i % 16) == 0)
2505				printk("\n%03x:", i);
2506			printk(" %02x", ((unsigned char *)slabp)[i]);
2507		}
2508		printk("\n");
2509		BUG();
2510	}
2511}
2512#else
2513#define kfree_debugcheck(x) do { } while(0)
2514#define cache_free_debugcheck(x,objp,z) (objp)
2515#define check_slabp(x,y) do { } while(0)
2516#endif
2517
2518static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2519{
2520	int batchcount;
2521	struct kmem_list3 *l3;
2522	struct array_cache *ac;
2523
2524	check_irq_off();
2525	ac = cpu_cache_get(cachep);
2526      retry:
2527	batchcount = ac->batchcount;
2528	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2529		/* if there was little recent activity on this
2530		 * cache, then perform only a partial refill.
2531		 * Otherwise we could generate refill bouncing.
2532		 */
2533		batchcount = BATCHREFILL_LIMIT;
2534	}
2535	l3 = cachep->nodelists[numa_node_id()];
2536
2537	BUG_ON(ac->avail > 0 || !l3);
2538	spin_lock(&l3->list_lock);
2539
2540	if (l3->shared) {
2541		struct array_cache *shared_array = l3->shared;
2542		if (shared_array->avail) {
2543			if (batchcount > shared_array->avail)
2544				batchcount = shared_array->avail;
2545			shared_array->avail -= batchcount;
2546			ac->avail = batchcount;
2547			memcpy(ac->entry,
2548			       &(shared_array->entry[shared_array->avail]),
2549			       sizeof(void *) * batchcount);
2550			shared_array->touched = 1;
2551			goto alloc_done;
2552		}
2553	}
2554	while (batchcount > 0) {
2555		struct list_head *entry;
2556		struct slab *slabp;
2557		/* Get the slab the allocation is to come from. */
2558		entry = l3->slabs_partial.next;
2559		if (entry == &l3->slabs_partial) {
2560			l3->free_touched = 1;
2561			entry = l3->slabs_free.next;
2562			if (entry == &l3->slabs_free)
2563				goto must_grow;
2564		}
2565
2566		slabp = list_entry(entry, struct slab, list);
2567		check_slabp(cachep, slabp);
2568		check_spinlock_acquired(cachep);
2569		while (slabp->inuse < cachep->num && batchcount--) {
2570			STATS_INC_ALLOCED(cachep);
2571			STATS_INC_ACTIVE(cachep);
2572			STATS_SET_HIGH(cachep);
2573
2574			ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
2575							    numa_node_id());
2576		}
2577		check_slabp(cachep, slabp);
2578
2579		/* move slabp to correct slabp list: */
2580		list_del(&slabp->list);
2581		if (slabp->free == BUFCTL_END)
2582			list_add(&slabp->list, &l3->slabs_full);
2583		else
2584			list_add(&slabp->list, &l3->slabs_partial);
2585	}
2586
2587      must_grow:
2588	l3->free_objects -= ac->avail;
2589      alloc_done:
2590	spin_unlock(&l3->list_lock);
2591
2592	if (unlikely(!ac->avail)) {
2593		int x;
2594		x = cache_grow(cachep, flags, numa_node_id());
2595
2596		/* cache_grow can re-enable interrupts, then ac could change. */
2597		ac = cpu_cache_get(cachep);
2598		if (!x && ac->avail == 0)	/* no objects in sight? abort */
2599			return NULL;
2600
2601		if (!ac->avail)	/* objects refilled by interrupt? */
2602			goto retry;
2603	}
2604	ac->touched = 1;
2605	return ac->entry[--ac->avail];
2606}
2607
2608static inline void
2609cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags)
2610{
2611	might_sleep_if(flags & __GFP_WAIT);
2612#if DEBUG
2613	kmem_flagcheck(cachep, flags);
2614#endif
2615}
2616
2617#if DEBUG
2618static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags,
2619					void *objp, void *caller)
2620{
2621	if (!objp)
2622		return objp;
2623	if (cachep->flags & SLAB_POISON) {
2624#ifdef CONFIG_DEBUG_PAGEALLOC
2625		if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
2626			kernel_map_pages(virt_to_page(objp),
2627					 cachep->buffer_size / PAGE_SIZE, 1);
2628		else
2629			check_poison_obj(cachep, objp);
2630#else
2631		check_poison_obj(cachep, objp);
2632#endif
2633		poison_obj(cachep, objp, POISON_INUSE);
2634	}
2635	if (cachep->flags & SLAB_STORE_USER)
2636		*dbg_userword(cachep, objp) = caller;
2637
2638	if (cachep->flags & SLAB_RED_ZONE) {
2639		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE
2640		    || *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
2641			slab_error(cachep,
2642				   "double free, or memory outside"
2643				   " object was overwritten");
2644			printk(KERN_ERR
2645			       "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
2646			       objp, *dbg_redzone1(cachep, objp),
2647			       *dbg_redzone2(cachep, objp));
2648		}
2649		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
2650		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
2651	}
2652	objp += obj_offset(cachep);
2653	if (cachep->ctor && cachep->flags & SLAB_POISON) {
2654		unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2655
2656		if (!(flags & __GFP_WAIT))
2657			ctor_flags |= SLAB_CTOR_ATOMIC;
2658
2659		cachep->ctor(objp, cachep, ctor_flags);
2660	}
2661	return objp;
2662}
2663#else
2664#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
2665#endif
2666
2667static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2668{
2669	void *objp;
2670	struct array_cache *ac;
2671
2672#ifdef CONFIG_NUMA
2673	if (unlikely(current->mempolicy && !in_interrupt())) {
2674		int nid = slab_node(current->mempolicy);
2675
2676		if (nid != numa_node_id())
2677			return __cache_alloc_node(cachep, flags, nid);
2678	}
2679#endif
2680
2681	check_irq_off();
2682	ac = cpu_cache_get(cachep);
2683	if (likely(ac->avail)) {
2684		STATS_INC_ALLOCHIT(cachep);
2685		ac->touched = 1;
2686		objp = ac->entry[--ac->avail];
2687	} else {
2688		STATS_INC_ALLOCMISS(cachep);
2689		objp = cache_alloc_refill(cachep, flags);
2690	}
2691	return objp;
2692}
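
/*
 * Note: ____cache_alloc() is the fast path.  It pops the most recently
 * freed - and therefore most likely cache-warm - object from the per-cpu
 * array without taking any lock; only on a miss does it fall back to
 * cache_alloc_refill(), which takes the per-node list_lock.
 */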
2693
2694static __always_inline void *
2695__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
2696{
2697	unsigned long save_flags;
2698	void *objp;
2699
2700	cache_alloc_debugcheck_before(cachep, flags);
2701
2702	local_irq_save(save_flags);
2703	objp = ____cache_alloc(cachep, flags);
2704	local_irq_restore(save_flags);
2705	objp = cache_alloc_debugcheck_after(cachep, flags, objp,
2706					    caller);
2707	prefetchw(objp);
2708	return objp;
2709}
2710
2711#ifdef CONFIG_NUMA
2712/*
2713 * An interface to enable slab creation on nodeid
2714 */
2715static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2716{
2717	struct list_head *entry;
2718	struct slab *slabp;
2719	struct kmem_list3 *l3;
2720	void *obj;
2721	int x;
2722
2723	l3 = cachep->nodelists[nodeid];
2724	BUG_ON(!l3);
2725
2726      retry:
2727	check_irq_off();
2728	spin_lock(&l3->list_lock);
2729	entry = l3->slabs_partial.next;
2730	if (entry == &l3->slabs_partial) {
2731		l3->free_touched = 1;
2732		entry = l3->slabs_free.next;
2733		if (entry == &l3->slabs_free)
2734			goto must_grow;
2735	}
2736
2737	slabp = list_entry(entry, struct slab, list);
2738	check_spinlock_acquired_node(cachep, nodeid);
2739	check_slabp(cachep, slabp);
2740
2741	STATS_INC_NODEALLOCS(cachep);
2742	STATS_INC_ACTIVE(cachep);
2743	STATS_SET_HIGH(cachep);
2744
2745	BUG_ON(slabp->inuse == cachep->num);
2746
2747	obj = slab_get_obj(cachep, slabp, nodeid);
2748	check_slabp(cachep, slabp);
2749	l3->free_objects--;
2750	/* move slabp to correct slabp list: */
2751	list_del(&slabp->list);
2752
2753	if (slabp->free == BUFCTL_END) {
2754		list_add(&slabp->list, &l3->slabs_full);
2755	} else {
2756		list_add(&slabp->list, &l3->slabs_partial);
2757	}
2758
2759	spin_unlock(&l3->list_lock);
2760	goto done;
2761
2762      must_grow:
2763	spin_unlock(&l3->list_lock);
2764	x = cache_grow(cachep, flags, nodeid);
2765
2766	if (!x)
2767		return NULL;
2768
2769	goto retry;
2770      done:
2771	return obj;
2772}
2773#endif
2774
2775/*
2776 * Caller needs to acquire the correct kmem_list3's list_lock.
2777 */
2778static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
2779		       int node)
2780{
2781	int i;
2782	struct kmem_list3 *l3;
2783
2784	for (i = 0; i < nr_objects; i++) {
2785		void *objp = objpp[i];
2786		struct slab *slabp;
2787
2788		slabp = virt_to_slab(objp);
2789		l3 = cachep->nodelists[node];
2790		list_del(&slabp->list);
2791		check_spinlock_acquired_node(cachep, node);
2792		check_slabp(cachep, slabp);
2793		slab_put_obj(cachep, slabp, objp, node);
2794		STATS_DEC_ACTIVE(cachep);
2795		l3->free_objects++;
2796		check_slabp(cachep, slabp);
2797
2798		/* fixup slab chains */
2799		if (slabp->inuse == 0) {
2800			if (l3->free_objects > l3->free_limit) {
2801				l3->free_objects -= cachep->num;
2802				slab_destroy(cachep, slabp);
2803			} else {
2804				list_add(&slabp->list, &l3->slabs_free);
2805			}
2806		} else {
2807			/* Unconditionally move a slab to the end of the
2808			 * partial list on free - this gives the other
2809			 * objects in the slab maximum time to be freed, too.
2810			 */
2811			list_add_tail(&slabp->list, &l3->slabs_partial);
2812		}
2813	}
2814}
2815
2816static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
2817{
2818	int batchcount;
2819	struct kmem_list3 *l3;
2820	int node = numa_node_id();
2821
2822	batchcount = ac->batchcount;
2823#if DEBUG
2824	BUG_ON(!batchcount || batchcount > ac->avail);
2825#endif
2826	check_irq_off();
2827	l3 = cachep->nodelists[node];
2828	spin_lock(&l3->list_lock);
2829	if (l3->shared) {
2830		struct array_cache *shared_array = l3->shared;
2831		int max = shared_array->limit - shared_array->avail;
2832		if (max) {
2833			if (batchcount > max)
2834				batchcount = max;
2835			memcpy(&(shared_array->entry[shared_array->avail]),
2836			       ac->entry, sizeof(void *) * batchcount);
2837			shared_array->avail += batchcount;
2838			goto free_done;
2839		}
2840	}
2841
2842	free_block(cachep, ac->entry, batchcount, node);
2843      free_done:
2844#if STATS
2845	{
2846		int i = 0;
2847		struct list_head *p;
2848
2849		p = l3->slabs_free.next;
2850		while (p != &(l3->slabs_free)) {
2851			struct slab *slabp;
2852
2853			slabp = list_entry(p, struct slab, list);
2854			BUG_ON(slabp->inuse);
2855
2856			i++;
2857			p = p->next;
2858		}
2859		STATS_SET_FREEABLE(cachep, i);
2860	}
2861#endif
2862	spin_unlock(&l3->list_lock);
2863	ac->avail -= batchcount;
2864	memmove(ac->entry, &(ac->entry[batchcount]),
2865		sizeof(void *) * ac->avail);
2866}
2867
2868/*
2869 * __cache_free
2870 * Release an obj back to its cache. If the obj has a constructed
2871 * state, it must be in this state _before_ it is released.
2872 *
2873 * Called with interrupts disabled.
2874 */
2875static inline void __cache_free(struct kmem_cache *cachep, void *objp)
2876{
2877	struct array_cache *ac = cpu_cache_get(cachep);
2878
2879	check_irq_off();
2880	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
2881
2882	/* Make sure we are not freeing an object from another
2883	 * node to the array cache on this cpu.
2884	 */
2885#ifdef CONFIG_NUMA
2886	{
2887		struct slab *slabp;
2888		slabp = virt_to_slab(objp);
2889		if (unlikely(slabp->nodeid != numa_node_id())) {
2890			struct array_cache *alien = NULL;
2891			int nodeid = slabp->nodeid;
2892			struct kmem_list3 *l3 =
2893			    cachep->nodelists[numa_node_id()];
2894
2895			STATS_INC_NODEFREES(cachep);
2896			if (l3->alien && l3->alien[nodeid]) {
2897				alien = l3->alien[nodeid];
2898				spin_lock(&alien->lock);
2899				if (unlikely(alien->avail == alien->limit))
2900					__drain_alien_cache(cachep,
2901							    alien, nodeid);
2902				alien->entry[alien->avail++] = objp;
2903				spin_unlock(&alien->lock);
2904			} else {
2905				spin_lock(&(cachep->nodelists[nodeid])->
2906					  list_lock);
2907				free_block(cachep, &objp, 1, nodeid);
2908				spin_unlock(&(cachep->nodelists[nodeid])->
2909					    list_lock);
2910			}
2911			return;
2912		}
2913	}
2914#endif
2915	if (likely(ac->avail < ac->limit)) {
2916		STATS_INC_FREEHIT(cachep);
2917		ac->entry[ac->avail++] = objp;
2918		return;
2919	} else {
2920		STATS_INC_FREEMISS(cachep);
2921		cache_flusharray(cachep, ac);
2922		ac->entry[ac->avail++] = objp;
2923	}
2924}
2925
2926/**
2927 * kmem_cache_alloc - Allocate an object
2928 * @cachep: The cache to allocate from.
2929 * @flags: See kmalloc().
2930 *
2931 * Allocate an object from this cache.  The flags are only relevant
2932 * if the cache has no available objects.
2933 */
2934void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2935{
2936	return __cache_alloc(cachep, flags, __builtin_return_address(0));
2937}
2938EXPORT_SYMBOL(kmem_cache_alloc);
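
/*
 * Illustrative allocation sketch for the hypothetical foo_cachep from the
 * kmem_cache_create() sketch earlier; callers that cannot sleep would pass
 * GFP_ATOMIC instead of GFP_KERNEL.
 *
 *	struct foo *f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *
 *	if (!f)
 *		return -ENOMEM;
 *	...
 *	kmem_cache_free(foo_cachep, f);
 */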
2939
2940/**
2941 * kmem_ptr_validate - check if an untrusted pointer might
2942 *	be a slab entry.
2943 * @cachep: the cache we're checking against
2944 * @ptr: pointer to validate
2945 *
2946 * This verifies that the untrusted pointer looks sane:
2947 * it is _not_ a guarantee that the pointer is actually
2948 * part of the slab cache in question, but it at least
2949 * validates that the pointer can be dereferenced and
2950 * looks half-way sane.
2951 *
2952 * Currently only used for dentry validation.
2953 */
2954int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr)
2955{
2956	unsigned long addr = (unsigned long)ptr;
2957	unsigned long min_addr = PAGE_OFFSET;
2958	unsigned long align_mask = BYTES_PER_WORD - 1;
2959	unsigned long size = cachep->buffer_size;
2960	struct page *page;
2961
2962	if (unlikely(addr < min_addr))
2963		goto out;
2964	if (unlikely(addr > (unsigned long)high_memory - size))
2965		goto out;
2966	if (unlikely(addr & align_mask))
2967		goto out;
2968	if (unlikely(!kern_addr_valid(addr)))
2969		goto out;
2970	if (unlikely(!kern_addr_valid(addr + size - 1)))
2971		goto out;
2972	page = virt_to_page(ptr);
2973	if (unlikely(!PageSlab(page)))
2974		goto out;
2975	if (unlikely(page_get_cache(page) != cachep))
2976		goto out;
2977	return 1;
2978      out:
2979	return 0;
2980}
2981
2982#ifdef CONFIG_NUMA
2983/**
2984 * kmem_cache_alloc_node - Allocate an object on the specified node
2985 * @cachep: The cache to allocate from.
2986 * @flags: See kmalloc().
2987 * @nodeid: node number of the target node.
2988 *
2989 * Identical to kmem_cache_alloc, except that this function is slow
2990 * and can sleep. And it will allocate memory on the given node, which
2991 * can improve the performance for cpu bound structures.
2992 * New and improved: it will now make sure that the object gets
2993 * put on the correct node list so that there is no false sharing.
2994 */
2995void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2996{
2997	unsigned long save_flags;
2998	void *ptr;
2999
3000	cache_alloc_debugcheck_before(cachep, flags);
3001	local_irq_save(save_flags);
3002
3003	if (nodeid == -1 || nodeid == numa_node_id() ||
3004	    !cachep->nodelists[nodeid])
3005		ptr = ____cache_alloc(cachep, flags);
3006	else
3007		ptr = __cache_alloc_node(cachep, flags, nodeid);
3008	local_irq_restore(save_flags);
3009
3010	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr,
3011					   __builtin_return_address(0));
3012
3013	return ptr;
3014}
3015EXPORT_SYMBOL(kmem_cache_alloc_node);
3016
3017void *kmalloc_node(size_t size, gfp_t flags, int node)
3018{
3019	struct kmem_cache *cachep;
3020
3021	cachep = kmem_find_general_cachep(size, flags);
3022	if (unlikely(cachep == NULL))
3023		return NULL;
3024	return kmem_cache_alloc_node(cachep, flags, node);
3025}
3026EXPORT_SYMBOL(kmalloc_node);
3027#endif
3028
3029/**
3030 * kmalloc - allocate memory
3031 * @size: how many bytes of memory are required.
3032 * @flags: the type of memory to allocate.
3033 *
3034 * kmalloc is the normal method of allocating memory
3035 * in the kernel.
3036 *
3037 * The @flags argument may be one of:
3038 *
3039 * %GFP_USER - Allocate memory on behalf of user.  May sleep.
3040 *
3041 * %GFP_KERNEL - Allocate normal kernel ram.  May sleep.
3042 *
3043 * %GFP_ATOMIC - Allocation will not sleep.  Use inside interrupt handlers.
3044 *
3045 * Additionally, the %GFP_DMA flag may be set to indicate the memory
3046 * must be suitable for DMA.  This can mean different things on different
3047 * platforms.  For example, on i386, it means that the memory must come
3048 * from the first 16MB.
3049 */
3050static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3051					  void *caller)
3052{
3053	struct kmem_cache *cachep;
3054
3055	/* If you want to save a few bytes .text space: replace
3056	 * __ with kmem_.
3057	 * Then kmalloc uses the uninlined functions instead of the inline
3058	 * functions.
3059	 */
3060	cachep = __find_general_cachep(size, flags);
3061	if (unlikely(cachep == NULL))
3062		return NULL;
3063	return __cache_alloc(cachep, flags, caller);
3064}
3065
3066#ifndef CONFIG_DEBUG_SLAB
3067
3068void *__kmalloc(size_t size, gfp_t flags)
3069{
3070	return __do_kmalloc(size, flags, NULL);
3071}
3072EXPORT_SYMBOL(__kmalloc);
3073
3074#else
3075
3076void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
3077{
3078	return __do_kmalloc(size, flags, caller);
3079}
3080EXPORT_SYMBOL(__kmalloc_track_caller);
3081
3082#endif
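
/*
 * Illustrative kmalloc() sketch (hypothetical buffer); the accepted flags
 * are documented in the comment above __do_kmalloc():
 *
 *	char *buf = kmalloc(1024, GFP_KERNEL);
 *
 *	if (!buf)
 *		return -ENOMEM;
 *	...
 *	kfree(buf);
 */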
3083
3084#ifdef CONFIG_SMP
3085/**
3086 * __alloc_percpu - allocate one copy of the object for every present
3087 * cpu in the system, zeroing them.
3088 * Objects should be dereferenced using the per_cpu_ptr macro only.
3089 *
3090 * @size: how many bytes of memory are required.
3091 */
3092void *__alloc_percpu(size_t size)
3093{
3094	int i;
3095	struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL);
3096
3097	if (!pdata)
3098		return NULL;
3099
3100	/*
3101	 * Cannot use for_each_online_cpu since a cpu may come online
3102	 * and we have no way of figuring out how to fix the array
3103	 * that we have allocated then....
3104	 */
3105	for_each_cpu(i) {
3106		int node = cpu_to_node(i);
3107
3108		if (node_online(node))
3109			pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node);
3110		else
3111			pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
3112
3113		if (!pdata->ptrs[i])
3114			goto unwind_oom;
3115		memset(pdata->ptrs[i], 0, size);
3116	}
3117
3118	/* Catch derefs w/o wrappers */
3119	return (void *)(~(unsigned long)pdata);
3120
3121      unwind_oom:
3122	while (--i >= 0) {
3123		if (!cpu_possible(i))
3124			continue;
3125		kfree(pdata->ptrs[i]);
3126	}
3127	kfree(pdata);
3128	return NULL;
3129}
3130EXPORT_SYMBOL(__alloc_percpu);
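
/*
 * Illustrative per-cpu sketch (hypothetical counter).  Callers normally go
 * through the alloc_percpu()/per_cpu_ptr()/free_percpu() wrappers from
 * <linux/percpu.h> rather than calling __alloc_percpu() directly:
 *
 *	long *counters = alloc_percpu(long);
 *
 *	if (!counters)
 *		return -ENOMEM;
 *	(*per_cpu_ptr(counters, get_cpu()))++;
 *	put_cpu();
 *	free_percpu(counters);
 */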
3131#endif
3132
3133/**
3134 * kmem_cache_free - Deallocate an object
3135 * @cachep: The cache the allocation was from.
3136 * @objp: The previously allocated object.
3137 *
3138 * Free an object which was previously allocated from this
3139 * cache.
3140 */
3141void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3142{
3143	unsigned long flags;
3144
3145	local_irq_save(flags);
3146	__cache_free(cachep, objp);
3147	local_irq_restore(flags);
3148}
3149EXPORT_SYMBOL(kmem_cache_free);
3150
3151/**
3152 * kfree - free previously allocated memory
3153 * @objp: pointer returned by kmalloc.
3154 *
3155 * If @objp is NULL, no operation is performed.
3156 *
3157 * Don't free memory not originally allocated by kmalloc()
3158 * or you will run into trouble.
3159 */
3160void kfree(const void *objp)
3161{
3162	struct kmem_cache *c;
3163	unsigned long flags;
3164
3165	if (unlikely(!objp))
3166		return;
3167	local_irq_save(flags);
3168	kfree_debugcheck(objp);
3169	c = virt_to_cache(objp);
3170	mutex_debug_check_no_locks_freed(objp, obj_size(c));
3171	__cache_free(c, (void *)objp);
3172	local_irq_restore(flags);
3173}
3174EXPORT_SYMBOL(kfree);
3175
3176#ifdef CONFIG_SMP
3177/**
3178 * free_percpu - free previously allocated percpu memory
3179 * @objp: pointer returned by alloc_percpu.
3180 *
3181 * Don't free memory not originally allocated by alloc_percpu();
3182 * the complemented objp is used to check for that.
3183 */
3184void free_percpu(const void *objp)
3185{
3186	int i;
3187	struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp);
3188
3189	/*
3190	 * We allocate for all cpus, so we cannot use for_each_online_cpu() here.
3191	 */
3192	for_each_cpu(i)
3193	    kfree(p->ptrs[i]);
3194	kfree(p);
3195}
3196EXPORT_SYMBOL(free_percpu);
3197#endif
3198
3199unsigned int kmem_cache_size(struct kmem_cache *cachep)
3200{
3201	return obj_size(cachep);
3202}
3203EXPORT_SYMBOL(kmem_cache_size);
3204
3205const char *kmem_cache_name(struct kmem_cache *cachep)
3206{
3207	return cachep->name;
3208}
3209EXPORT_SYMBOL_GPL(kmem_cache_name);
3210
3211/*
3212 * This initializes kmem_list3 for all nodes.
3213 */
3214static int alloc_kmemlist(struct kmem_cache *cachep)
3215{
3216	int node;
3217	struct kmem_list3 *l3;
3218	int err = 0;
3219
3220	for_each_online_node(node) {
3221		struct array_cache *nc = NULL, *new;
3222		struct array_cache **new_alien = NULL;
3223#ifdef CONFIG_NUMA
3224		if (!(new_alien = alloc_alien_cache(node, cachep->limit)))
3225			goto fail;
3226#endif
3227		if (!(new = alloc_arraycache(node, (cachep->shared *
3228						    cachep->batchcount),
3229					     0xbaadf00d)))
3230			goto fail;
3231		if ((l3 = cachep->nodelists[node])) {
3232
3233			spin_lock_irq(&l3->list_lock);
3234
3235			if ((nc = cachep->nodelists[node]->shared))
3236				free_block(cachep, nc->entry, nc->avail, node);
3237
3238			l3->shared = new;
3239			if (!cachep->nodelists[node]->alien) {
3240				l3->alien = new_alien;
3241				new_alien = NULL;
3242			}
3243			l3->free_limit = (1 + nr_cpus_node(node)) *
3244			    cachep->batchcount + cachep->num;
3245			spin_unlock_irq(&l3->list_lock);
3246			kfree(nc);
3247			free_alien_cache(new_alien);
3248			continue;
3249		}
3250		if (!(l3 = kmalloc_node(sizeof(struct kmem_list3),
3251					GFP_KERNEL, node)))
3252			goto fail;
3253
3254		kmem_list3_init(l3);
3255		l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3256		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3257		l3->shared = new;
3258		l3->alien = new_alien;
3259		l3->free_limit = (1 + nr_cpus_node(node)) *
3260		    cachep->batchcount + cachep->num;
3261		cachep->nodelists[node] = l3;
3262	}
3263	return err;
3264      fail:
3265	err = -ENOMEM;
3266	return err;
3267}
3268
3269struct ccupdate_struct {
3270	struct kmem_cache *cachep;
3271	struct array_cache *new[NR_CPUS];
3272};
3273
3274static void do_ccupdate_local(void *info)
3275{
3276	struct ccupdate_struct *new = (struct ccupdate_struct *)info;
3277	struct array_cache *old;
3278
3279	check_irq_off();
3280	old = cpu_cache_get(new->cachep);
3281
3282	new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3283	new->new[smp_processor_id()] = old;
3284}
3285
3286static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount,
3287			    int shared)
3288{
3289	struct ccupdate_struct new;
3290	int i, err;
3291
3292	memset(&new.new, 0, sizeof(new.new));
3293	for_each_online_cpu(i) {
3294		new.new[i] =
3295		    alloc_arraycache(cpu_to_node(i), limit, batchcount);
3296		if (!new.new[i]) {
3297			for (i--; i >= 0; i--)
3298				kfree(new.new[i]);
3299			return -ENOMEM;
3300		}
3301	}
3302	new.cachep = cachep;
3303
3304	smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
3305
3306	check_irq_on();
3307	spin_lock(&cachep->spinlock);
3308	cachep->batchcount = batchcount;
3309	cachep->limit = limit;
3310	cachep->shared = shared;
3311	spin_unlock(&cachep->spinlock);
3312
3313	for_each_online_cpu(i) {
3314		struct array_cache *ccold = new.new[i];
3315		if (!ccold)
3316			continue;
3317		spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3318		free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
3319		spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3320		kfree(ccold);
3321	}
3322
3323	err = alloc_kmemlist(cachep);
3324	if (err) {
3325		printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
3326		       cachep->name, -err);
3327		BUG();
3328	}
3329	return 0;
3330}
3331
3332static void enable_cpucache(struct kmem_cache *cachep)
3333{
3334	int err;
3335	int limit, shared;
3336
3337	/* The head array serves three purposes:
3338	 * - create a LIFO ordering, i.e. return objects that are cache-warm
3339	 * - reduce the number of spinlock operations.
3340	 * - reduce the number of linked list operations on the slab and
3341	 *   bufctl chains: array operations are cheaper.
3342	 * The numbers are guessed, we should auto-tune as described by
3343	 * Bonwick.
3344	 */
3345	if (cachep->buffer_size > 131072)
3346		limit = 1;
3347	else if (cachep->buffer_size > PAGE_SIZE)
3348		limit = 8;
3349	else if (cachep->buffer_size > 1024)
3350		limit = 24;
3351	else if (cachep->buffer_size > 256)
3352		limit = 54;
3353	else
3354		limit = 120;
3355
3356	/* Cpu bound tasks (e.g. network routing) can exhibit cpu bound
3357	 * allocation behaviour: Most allocs on one cpu, most free operations
3358	 * on another cpu. For these cases, an efficient object passing between
3359	 * cpus is necessary. This is provided by a shared array. The array
3360	 * replaces Bonwick's magazine layer.
3361	 * On uniprocessor, it's functionally equivalent (but less efficient)
3362	 * to a larger limit. Thus disabled by default.
3363	 */
3364	shared = 0;
3365#ifdef CONFIG_SMP
3366	if (cachep->buffer_size <= PAGE_SIZE)
3367		shared = 8;
3368#endif
3369
3370#if DEBUG
3371	/* With debugging enabled, a large batchcount leads to excessively
3372	 * long periods with local interrupts disabled. Limit the
3373	 * batchcount.
3374	 */
3375	if (limit > 32)
3376		limit = 32;
3377#endif
3378	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
3379	if (err)
3380		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
3381		       cachep->name, -err);
3382}
3383
3384static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
3385				int force, int node)
3386{
3387	int tofree;
3388
3389	check_spinlock_acquired_node(cachep, node);
3390	if (ac->touched && !force) {
3391		ac->touched = 0;
3392	} else if (ac->avail) {
3393		tofree = force ? ac->avail : (ac->limit + 4) / 5;
3394		if (tofree > ac->avail) {
3395			tofree = (ac->avail + 1) / 2;
3396		}
3397		free_block(cachep, ac->entry, tofree, node);
3398		ac->avail -= tofree;
3399		memmove(ac->entry, &(ac->entry[tofree]),
3400			sizeof(void *) * ac->avail);
3401	}
3402}
3403
3404/**
3405 * cache_reap - Reclaim memory from caches.
3406 * @unused: unused parameter
3407 *
3408 * Called from workqueue/eventd every few seconds.
3409 * Purpose:
3410 * - clear the per-cpu caches for this CPU.
3411 * - return freeable pages to the main free memory pool.
3412 *
3413 * If we cannot acquire the cache chain mutex then just give up - we'll
3414 * try again on the next iteration.
3415 */
3416static void cache_reap(void *unused)
3417{
3418	struct list_head *walk;
3419	struct kmem_list3 *l3;
3420
3421	if (!mutex_trylock(&cache_chain_mutex)) {
3422		/* Give up. Set up the next iteration. */
3423		schedule_delayed_work(&__get_cpu_var(reap_work),
3424				      REAPTIMEOUT_CPUC);
3425		return;
3426	}
3427
3428	list_for_each(walk, &cache_chain) {
3429		struct kmem_cache *searchp;
3430		struct list_head *p;
3431		int tofree;
3432		struct slab *slabp;
3433
3434		searchp = list_entry(walk, struct kmem_cache, next);
3435
3436		if (searchp->flags & SLAB_NO_REAP)
3437			goto next;
3438
3439		check_irq_on();
3440
3441		l3 = searchp->nodelists[numa_node_id()];
3442		if (l3->alien)
3443			drain_alien_cache(searchp, l3);
3444		spin_lock_irq(&l3->list_lock);
3445
3446		drain_array_locked(searchp, cpu_cache_get(searchp), 0,
3447				   numa_node_id());
3448
3449		if (time_after(l3->next_reap, jiffies))
3450			goto next_unlock;
3451
3452		l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
3453
3454		if (l3->shared)
3455			drain_array_locked(searchp, l3->shared, 0,
3456					   numa_node_id());
3457
3458		if (l3->free_touched) {
3459			l3->free_touched = 0;
3460			goto next_unlock;
3461		}
3462
3463		tofree = (l3->free_limit + 5 * searchp->num - 1) /
3464			 (5 * searchp->num);
3466		do {
3467			p = l3->slabs_free.next;
3468			if (p == &(l3->slabs_free))
3469				break;
3470
3471			slabp = list_entry(p, struct slab, list);
3472			BUG_ON(slabp->inuse);
3473			list_del(&slabp->list);
3474			STATS_INC_REAPED(searchp);
3475
3476			/* Safe to drop the lock. The slab is no longer
3477			 * linked to the cache.
3478			 * searchp cannot disappear, we hold
3479			 * cache_chain_mutex.
3480			 */
3481			l3->free_objects -= searchp->num;
3482			spin_unlock_irq(&l3->list_lock);
3483			slab_destroy(searchp, slabp);
3484			spin_lock_irq(&l3->list_lock);
3485		} while (--tofree > 0);
3486	      next_unlock:
3487		spin_unlock_irq(&l3->list_lock);
3488	      next:
3489		cond_resched();
3490	}
3491	check_irq_on();
3492	mutex_unlock(&cache_chain_mutex);
3493	drain_remote_pages();
3494	/* Set up the next iteration */
3495	schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
3496}
3497
3498#ifdef CONFIG_PROC_FS
3499
3500static void print_slabinfo_header(struct seq_file *m)
3501{
3502	/*
3503	 * Output format version, so at least we can change it
3504	 * without _too_ many complaints.
3505	 */
3506#if STATS
3507	seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
3508#else
3509	seq_puts(m, "slabinfo - version: 2.1\n");
3510#endif
3511	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
3512		 "<objperslab> <pagesperslab>");
3513	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
3514	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
3515#if STATS
3516	seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
3517		 "<error> <maxfreeable> <nodeallocs> <remotefrees>");
3518	seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
3519#endif
3520	seq_putc(m, '\n');
3521}
3522
3523static void *s_start(struct seq_file *m, loff_t *pos)
3524{
3525	loff_t n = *pos;
3526	struct list_head *p;
3527
3528	mutex_lock(&cache_chain_mutex);
3529	if (!n)
3530		print_slabinfo_header(m);
3531	p = cache_chain.next;
3532	while (n--) {
3533		p = p->next;
3534		if (p == &cache_chain)
3535			return NULL;
3536	}
3537	return list_entry(p, struct kmem_cache, next);
3538}
3539
3540static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3541{
3542	struct kmem_cache *cachep = p;
3543	++*pos;
3544	return cachep->next.next == &cache_chain ? NULL
3545	    : list_entry(cachep->next.next, struct kmem_cache, next);
3546}
3547
3548static void s_stop(struct seq_file *m, void *p)
3549{
3550	mutex_unlock(&cache_chain_mutex);
3551}
3552
3553static int s_show(struct seq_file *m, void *p)
3554{
3555	struct kmem_cache *cachep = p;
3556	struct list_head *q;
3557	struct slab *slabp;
3558	unsigned long active_objs;
3559	unsigned long num_objs;
3560	unsigned long active_slabs = 0;
3561	unsigned long num_slabs, free_objects = 0, shared_avail = 0;
3562	const char *name;
3563	char *error = NULL;
3564	int node;
3565	struct kmem_list3 *l3;
3566
3567	spin_lock(&cachep->spinlock);
3568	active_objs = 0;
3569	num_slabs = 0;
3570	for_each_online_node(node) {
3571		l3 = cachep->nodelists[node];
3572		if (!l3)
3573			continue;
3574
3575		check_irq_on();
3576		spin_lock_irq(&l3->list_lock);
3577
3578		list_for_each(q, &l3->slabs_full) {
3579			slabp = list_entry(q, struct slab, list);
3580			if (slabp->inuse != cachep->num && !error)
3581				error = "slabs_full accounting error";
3582			active_objs += cachep->num;
3583			active_slabs++;
3584		}
3585		list_for_each(q, &l3->slabs_partial) {
3586			slabp = list_entry(q, struct slab, list);
3587			if (slabp->inuse == cachep->num && !error)
3588				error = "slabs_partial inuse accounting error";
3589			if (!slabp->inuse && !error)
3590				error = "slabs_partial/inuse accounting error";
3591			active_objs += slabp->inuse;
3592			active_slabs++;
3593		}
3594		list_for_each(q, &l3->slabs_free) {
3595			slabp = list_entry(q, struct slab, list);
3596			if (slabp->inuse && !error)
3597				error = "slabs_free/inuse accounting error";
3598			num_slabs++;
3599		}
3600		free_objects += l3->free_objects;
3601		shared_avail += l3->shared->avail;
3602
3603		spin_unlock_irq(&l3->list_lock);
3604	}
3605	num_slabs += active_slabs;
3606	num_objs = num_slabs * cachep->num;
3607	if (num_objs - active_objs != free_objects && !error)
3608		error = "free_objects accounting error";
3609
3610	name = cachep->name;
3611	if (error)
3612		printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
3613
3614	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
3615		   name, active_objs, num_objs, cachep->buffer_size,
3616		   cachep->num, (1 << cachep->gfporder));
3617	seq_printf(m, " : tunables %4u %4u %4u",
3618		   cachep->limit, cachep->batchcount, cachep->shared);
3619	seq_printf(m, " : slabdata %6lu %6lu %6lu",
3620		   active_slabs, num_slabs, shared_avail);
3621#if STATS
3622	{			/* list3 stats */
3623		unsigned long high = cachep->high_mark;
3624		unsigned long allocs = cachep->num_allocations;
3625		unsigned long grown = cachep->grown;
3626		unsigned long reaped = cachep->reaped;
3627		unsigned long errors = cachep->errors;
3628		unsigned long max_freeable = cachep->max_freeable;
3629		unsigned long node_allocs = cachep->node_allocs;
3630		unsigned long node_frees = cachep->node_frees;
3631
3632		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
3633				"%4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees);
3634	}
3635	/* cpu stats */
3636	{
3637		unsigned long allochit = atomic_read(&cachep->allochit);
3638		unsigned long allocmiss = atomic_read(&cachep->allocmiss);
3639		unsigned long freehit = atomic_read(&cachep->freehit);
3640		unsigned long freemiss = atomic_read(&cachep->freemiss);
3641
3642		seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
3643			   allochit, allocmiss, freehit, freemiss);
3644	}
3645#endif
3646	seq_putc(m, '\n');
3647	spin_unlock(&cachep->spinlock);
3648	return 0;
3649}
3650
3651/*
3652 * slabinfo_op - iterator that generates /proc/slabinfo
3653 *
3654 * Output layout:
3655 * cache-name
3656 * num-active-objs
3657 * total-objs
3658 * object size
3659 * num-active-slabs
3660 * total-slabs
3661 * num-pages-per-slab
3662 * + further values on SMP and with statistics enabled
3663 */
3664
3665struct seq_operations slabinfo_op = {
3666	.start = s_start,
3667	.next = s_next,
3668	.stop = s_stop,
3669	.show = s_show,
3670};
3671
3672#define MAX_SLABINFO_WRITE 128
3673/**
3674 * slabinfo_write - Tuning for the slab allocator
3675 * @file: unused
3676 * @buffer: user buffer
3677 * @count: data length
3678 * @ppos: unused
3679 */
3680ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3681		       size_t count, loff_t *ppos)
3682{
3683	char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
3684	int limit, batchcount, shared, res;
3685	struct list_head *p;
3686
3687	if (count > MAX_SLABINFO_WRITE)
3688		return -EINVAL;
3689	if (copy_from_user(&kbuf, buffer, count))
3690		return -EFAULT;
3691	kbuf[MAX_SLABINFO_WRITE] = '\0';
3692
3693	tmp = strchr(kbuf, ' ');
3694	if (!tmp)
3695		return -EINVAL;
3696	*tmp = '\0';
3697	tmp++;
3698	if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
3699		return -EINVAL;
3700
3701	/* Find the cache in the chain of caches. */
3702	mutex_lock(&cache_chain_mutex);
3703	res = -EINVAL;
3704	list_for_each(p, &cache_chain) {
3705		struct kmem_cache *cachep = list_entry(p, struct kmem_cache,
3706						       next);
3707
3708		if (!strcmp(cachep->name, kbuf)) {
3709			if (limit < 1 ||
3710			    batchcount < 1 ||
3711			    batchcount > limit || shared < 0) {
3712				res = 0;
3713			} else {
3714				res = do_tune_cpucache(cachep, limit,
3715						       batchcount, shared);
3716			}
3717			break;
3718		}
3719	}
3720	mutex_unlock(&cache_chain_mutex);
3721	if (res >= 0)
3722		res = count;
3723	return res;
3724}
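
/*
 * Usage note: the accepted input is a single line of the form
 * "<cache-name> <limit> <batchcount> <shared>", which ends up in
 * do_tune_cpucache() for the named cache, e.g. (hypothetical values):
 *
 *	echo "dentry_cache 128 64 8" > /proc/slabinfo
 */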
3725#endif
3726
3727/**
3728 * ksize - get the actual amount of memory allocated for a given object
3729 * @objp: Pointer to the object
3730 *
3731 * kmalloc may internally round up allocations and return more memory
3732 * than requested. ksize() can be used to determine the actual amount of
3733 * memory allocated. The caller may use this additional memory, even though
3734 * a smaller amount of memory was initially specified with the kmalloc call.
3735 * The caller must guarantee that objp points to a valid object previously
3736 * allocated with either kmalloc() or kmem_cache_alloc(). The object
3737 * must not be freed during the duration of the call.
3738 */
3739unsigned int ksize(const void *objp)
3740{
3741	if (unlikely(objp == NULL))
3742		return 0;
3743
3744	return obj_size(virt_to_cache(objp));
3745}
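
/*
 * Illustrative ksize() sketch (the exact value depends on the kmalloc size
 * classes): a 100-byte request is rounded up to the containing size class,
 * and the caller may use everything ksize() reports.
 *
 *	char *p = kmalloc(100, GFP_KERNEL);
 *	size_t usable = 0;
 *
 *	if (p)
 *		usable = ksize(p);
 */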
3746