slab.c revision 84097518d1ecd2330f9488e4c2d09953a3340e74
1/*
2 * linux/mm/slab.c
3 * Written by Mark Hemment, 1996/97.
4 * (markhe@nextd.demon.co.uk)
5 *
6 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
7 *
8 * Major cleanup, different bufctl logic, per-cpu arrays
9 *	(c) 2000 Manfred Spraul
10 *
11 * Cleanup, make the head arrays unconditional, preparation for NUMA
12 * 	(c) 2002 Manfred Spraul
13 *
14 * An implementation of the Slab Allocator as described in outline in;
15 *	UNIX Internals: The New Frontiers by Uresh Vahalia
16 *	Pub: Prentice Hall	ISBN 0-13-101908-2
17 * or with a little more detail in;
18 *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
19 *	Jeff Bonwick (Sun Microsystems).
20 *	Presented at: USENIX Summer 1994 Technical Conference
21 *
22 * The memory is organized in caches, one cache for each object type.
23 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
24 * Each cache consists of many slabs (they are small (usually one
25 * page long) and always contiguous), and each slab contains multiple
26 * initialized objects.
27 *
28 * This means that your constructor is used only for newly allocated
29 * slabs and you must pass objects with the same initializations to
30 * kmem_cache_free.
31 *
32 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
33 * normal). If you need a special memory type, then you must create a new
34 * cache for that memory type.
35 *
36 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
37 *   full slabs with 0 free objects
38 *   partial slabs
39 *   empty slabs with no allocated objects
40 *
41 * If partial slabs exist, then new allocations come from these slabs,
42 * otherwise they come from empty slabs, or new slabs are allocated.
43 *
44 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
45 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
46 *
47 * Each cache has a short per-cpu head array, most allocs
48 * and frees go into that array, and if that array overflows, then 1/2
49 * of the entries in the array are given back into the global cache.
50 * The head array is strictly LIFO and should improve the cache hit rates.
51 * On SMP, it additionally reduces the spinlock operations.
52 *
53 * The c_cpuarray may not be read with enabled local interrupts -
54 * it's changed with a smp_call_function().
55 *
56 * SMP synchronization:
57 *  constructors and destructors are called without any locking.
58 *  Several members in struct kmem_cache and struct slab never change, they
59 *	are accessed without any locking.
60 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
61 *  	and local interrupts are disabled so slab code is preempt-safe.
62 *  The non-constant members are protected with a per-cache irq spinlock.
63 *
64 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
65 * in 2000 - many ideas in the current implementation are derived from
66 * his patch.
67 *
68 * Further notes from the original documentation:
69 *
70 * 11 April '97.  Started multi-threading - markhe
71 *	The global cache-chain is protected by the mutex 'cache_chain_mutex'.
72 *	The mutex is only needed when accessing/extending the cache-chain, which
73 *	can never happen inside an interrupt (kmem_cache_create(),
74 *	kmem_cache_shrink() and kmem_cache_reap()).
75 *
76 *	At present, each engine can be growing a cache.  This should be blocked.
77 *
78 * 15 March 2005. NUMA slab allocator.
79 *	Shai Fultheim <shai@scalex86.org>.
80 *	Shobhit Dayal <shobhit@calsoftinc.com>
81 *	Alok N Kataria <alokk@calsoftinc.com>
82 *	Christoph Lameter <christoph@lameter.com>
83 *
84 *	Modified the slab allocator to be node aware on NUMA systems.
85 *	Each node has its own list of partial, free and full slabs.
86 *	All object allocations for a node occur from node specific slab lists.
87 */
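/*
 * A minimal usage sketch of the API described above (illustrative only;
 * "struct foo" and "foo_cache" are made-up names, not part of this file):
 *
 *	static struct kmem_cache *foo_cache;
 *
 *	foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
 *				      SLAB_HWCACHE_ALIGN, NULL, NULL);
 *	objp = kmem_cache_alloc(foo_cache, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cache, objp);
 *	kmem_cache_destroy(foo_cache);
 */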
88
89#include	<linux/config.h>
90#include	<linux/slab.h>
91#include	<linux/mm.h>
92#include	<linux/swap.h>
93#include	<linux/cache.h>
94#include	<linux/interrupt.h>
95#include	<linux/init.h>
96#include	<linux/compiler.h>
97#include	<linux/seq_file.h>
98#include	<linux/notifier.h>
99#include	<linux/kallsyms.h>
100#include	<linux/cpu.h>
101#include	<linux/sysctl.h>
102#include	<linux/module.h>
103#include	<linux/rcupdate.h>
104#include	<linux/string.h>
105#include	<linux/nodemask.h>
106#include	<linux/mempolicy.h>
107#include	<linux/mutex.h>
108
109#include	<asm/uaccess.h>
110#include	<asm/cacheflush.h>
111#include	<asm/tlbflush.h>
112#include	<asm/page.h>
113
114/*
115 * DEBUG	- 1 for kmem_cache_create() to honour: SLAB_DEBUG_INITIAL,
116 *		  SLAB_RED_ZONE & SLAB_POISON.
117 *		  0 for faster, smaller code (especially in the critical paths).
118 *
119 * STATS	- 1 to collect stats for /proc/slabinfo.
120 *		  0 for faster, smaller code (especially in the critical paths).
121 *
122 * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
123 */
124
125#ifdef CONFIG_DEBUG_SLAB
126#define	DEBUG		1
127#define	STATS		1
128#define	FORCED_DEBUG	1
129#else
130#define	DEBUG		0
131#define	STATS		0
132#define	FORCED_DEBUG	0
133#endif
134
135/* Shouldn't this be in a header file somewhere? */
136#define	BYTES_PER_WORD		sizeof(void *)
137
138#ifndef cache_line_size
139#define cache_line_size()	L1_CACHE_BYTES
140#endif
141
142#ifndef ARCH_KMALLOC_MINALIGN
143/*
144 * Enforce a minimum alignment for the kmalloc caches.
145 * Usually, the kmalloc caches are cache_line_size() aligned, except when
146 * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
147 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
148 * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that.
149 * Note that this flag disables some debug features.
150 */
151#define ARCH_KMALLOC_MINALIGN 0
152#endif
153
154#ifndef ARCH_SLAB_MINALIGN
155/*
156 * Enforce a minimum alignment for all caches.
157 * Intended for archs that get misalignment faults even for BYTES_PER_WORD
158 * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
159 * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
160 * some debug features.
161 */
162#define ARCH_SLAB_MINALIGN 0
163#endif
164
165#ifndef ARCH_KMALLOC_FLAGS
166#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
167#endif
168
169/* Legal flag mask for kmem_cache_create(). */
170#if DEBUG
171# define CREATE_MASK	(SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
172			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
173			 SLAB_CACHE_DMA | \
174			 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
175			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
176			 SLAB_DESTROY_BY_RCU)
177#else
178# define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
179			 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
180			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
181			 SLAB_DESTROY_BY_RCU)
182#endif
183
184/*
185 * kmem_bufctl_t:
186 *
187 * Bufctls are used for linking objs within a slab, via
188 * linked offsets.
189 *
190 * This implementation relies on "struct page" for locating the cache &
191 * slab an object belongs to.
192 * This allows the bufctl structure to be small (one int), but limits
193 * the number of objects a slab (not a cache) can contain when off-slab
194 * bufctls are used. The limit is the size of the largest general cache
195 * that does not use off-slab slabs.
196 * For 32bit archs with 4 kB pages, this is 56.
197 * This is not serious, as it is only for large objects, when it is unwise
198 * to have too many per slab.
199 * Note: This limit can be raised by introducing a general cache whose size
200 * is less than 512 (PAGE_SIZE<<3), but greater than 256.
201 */
202
203typedef unsigned int kmem_bufctl_t;
204#define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
205#define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
206#define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-2)
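/*
 * How the bufctls form the free list (descriptive note): the kmem_bufctl_t
 * array that follows struct slab holds, for each free object, the index of
 * the next free object; slabp->free is the index of the first free object
 * and BUFCTL_END terminates the chain.  A freshly grown slab with four
 * objects would roughly look like:
 *
 *	slabp->free = 0, bufctl[] = { 1, 2, 3, BUFCTL_END }
 *
 * so allocations peel objects off the front and frees push them back on.
 */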
207
208/* Max number of objs-per-slab for caches which use off-slab slabs.
209 * Needed to avoid a possible looping condition in cache_grow().
210 */
211static unsigned long offslab_limit;
212
213/*
214 * struct slab
215 *
216 * Manages the objs in a slab. Placed either at the beginning of mem allocated
217 * for a slab, or allocated from a general cache.
218 * Slabs are chained into three lists: fully used, partial, fully free slabs.
219 */
220struct slab {
221	struct list_head list;
222	unsigned long colouroff;
223	void *s_mem;		/* including colour offset */
224	unsigned int inuse;	/* num of objs active in slab */
225	kmem_bufctl_t free;
226	unsigned short nodeid;
227};
228
229/*
230 * struct slab_rcu
231 *
232 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
233 * arrange for kmem_freepages to be called via RCU.  This is useful if
234 * we need to approach a kernel structure obliquely, from its address
235 * obtained without the usual locking.  We can lock the structure to
236 * stabilize it and check it's still at the given address, only if we
237 * can be sure that the memory has not been meanwhile reused for some
238 * other kind of object (which our subsystem's lock might corrupt).
239 *
240 * rcu_read_lock before reading the address, then rcu_read_unlock after
241 * taking the spinlock within the structure expected at that address.
242 *
243 * We assume struct slab_rcu can overlay struct slab when destroying.
244 */
245struct slab_rcu {
246	struct rcu_head head;
247	struct kmem_cache *cachep;
248	void *addr;
249};
250
251/*
252 * struct array_cache
253 *
254 * Purpose:
255 * - LIFO ordering, to hand out cache-warm objects from _alloc
256 * - reduce the number of linked list operations
257 * - reduce spinlock operations
258 *
259 * The limit is stored in the per-cpu structure to reduce the data cache
260 * footprint.
261 *
262 */
263struct array_cache {
264	unsigned int avail;
265	unsigned int limit;
266	unsigned int batchcount;
267	unsigned int touched;
268	spinlock_t lock;
269	void *entry[0];	/*
270			 * Must have this definition in here for the proper
271			 * alignment of array_cache. Also simplifies accessing
272			 * the entries.
273			 * [0] is for gcc 2.95. It should really be [].
274			 */
275};
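/*
 * Rough sketch of how the per-cpu array is used (descriptive note):
 * allocation pops the most recently freed object, i.e. entry[--avail],
 * so cache-warm objects are handed out first; a free pushes at
 * entry[avail++].  When avail hits limit on a free, a batch of objects
 * is flushed back to the per-node lists, and when avail reaches 0 on an
 * alloc, the array is refilled in batches of batchcount.
 */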
276
277/*
278 * bootstrap: The caches do not work without cpuarrays anymore, but the
279 * cpuarrays are allocated from the generic caches...
280 */
281#define BOOT_CPUCACHE_ENTRIES	1
282struct arraycache_init {
283	struct array_cache cache;
284	void *entries[BOOT_CPUCACHE_ENTRIES];
285};
286
287/*
288 * The slab lists for all objects.
289 */
290struct kmem_list3 {
291	struct list_head slabs_partial;	/* partial list first, better asm code */
292	struct list_head slabs_full;
293	struct list_head slabs_free;
294	unsigned long free_objects;
295	unsigned long next_reap;
296	int free_touched;
297	unsigned int free_limit;
298	unsigned int colour_next;	/* Per-node cache coloring */
299	spinlock_t list_lock;
300	struct array_cache *shared;	/* shared per node */
301	struct array_cache **alien;	/* on other nodes */
302};
303
304/*
305 * Need this for bootstrapping a per node allocator.
306 */
307#define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1)
308struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
309#define	CACHE_CACHE 0
310#define	SIZE_AC 1
311#define	SIZE_L3 (1 + MAX_NUMNODES)
312
313/*
314 * This function must be completely optimized away if a constant is passed to
315 * it.  Mostly the same as what is in linux/slab.h except it returns an index.
316 */
317static __always_inline int index_of(const size_t size)
318{
319	extern void __bad_size(void);
320
321	if (__builtin_constant_p(size)) {
322		int i = 0;
323
324#define CACHE(x) \
325	if (size <= x) \
326		return i; \
327	else \
328		i++;
329#include <linux/kmalloc_sizes.h>
330#undef CACHE
331		__bad_size();
332	} else
333		__bad_size();
334	return 0;
335}
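/*
 * Example (descriptive note): assuming a kmalloc size table that begins
 * 32, 64, 96, 128, ..., index_of(24) evaluates to 0 and index_of(100) to 3,
 * with the whole chain of comparisons folded away at compile time.  If the
 * argument is not a compile-time constant, the call to the undefined
 * __bad_size() survives and the build fails at link time.
 */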
336
337#define INDEX_AC index_of(sizeof(struct arraycache_init))
338#define INDEX_L3 index_of(sizeof(struct kmem_list3))
339
340static void kmem_list3_init(struct kmem_list3 *parent)
341{
342	INIT_LIST_HEAD(&parent->slabs_full);
343	INIT_LIST_HEAD(&parent->slabs_partial);
344	INIT_LIST_HEAD(&parent->slabs_free);
345	parent->shared = NULL;
346	parent->alien = NULL;
347	parent->colour_next = 0;
348	spin_lock_init(&parent->list_lock);
349	parent->free_objects = 0;
350	parent->free_touched = 0;
351}
352
353#define MAKE_LIST(cachep, listp, slab, nodeid)				\
354	do {								\
355		INIT_LIST_HEAD(listp);					\
356		list_splice(&(cachep->nodelists[nodeid]->slab), listp);	\
357	} while (0)
358
359#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
360	do {								\
361	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
362	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
363	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
364	} while (0)
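/*
 * MAKE_ALL_LISTS is only used while bootstrapping (see init_list()): after a
 * static __initdata kmem_list3 has been memcpy'd into its kmalloc'ed
 * replacement, the copied list_heads still point into the old structure, so
 * each list is re-initialised and the entries are spliced across from the
 * old (still installed) nodelist.
 */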
365
366/*
367 * struct kmem_cache
368 *
369 * manages a cache.
370 */
371
372struct kmem_cache {
373/* 1) per-cpu data, touched during every alloc/free */
374	struct array_cache *array[NR_CPUS];
375/* 2) Cache tunables. Protected by cache_chain_mutex */
376	unsigned int batchcount;
377	unsigned int limit;
378	unsigned int shared;
379
380	unsigned int buffer_size;
381/* 3) touched by every alloc & free from the backend */
382	struct kmem_list3 *nodelists[MAX_NUMNODES];
383
384	unsigned int flags;		/* constant flags */
385	unsigned int num;		/* # of objs per slab */
386
387/* 4) cache_grow/shrink */
388	/* order of pgs per slab (2^n) */
389	unsigned int gfporder;
390
391	/* force GFP flags, e.g. GFP_DMA */
392	gfp_t gfpflags;
393
394	size_t colour;			/* cache colouring range */
395	unsigned int colour_off;	/* colour offset */
396	struct kmem_cache *slabp_cache;
397	unsigned int slab_size;
398	unsigned int dflags;		/* dynamic flags */
399
400	/* constructor func */
401	void (*ctor) (void *, struct kmem_cache *, unsigned long);
402
403	/* de-constructor func */
404	void (*dtor) (void *, struct kmem_cache *, unsigned long);
405
406/* 5) cache creation/removal */
407	const char *name;
408	struct list_head next;
409
410/* 6) statistics */
411#if STATS
412	unsigned long num_active;
413	unsigned long num_allocations;
414	unsigned long high_mark;
415	unsigned long grown;
416	unsigned long reaped;
417	unsigned long errors;
418	unsigned long max_freeable;
419	unsigned long node_allocs;
420	unsigned long node_frees;
421	atomic_t allochit;
422	atomic_t allocmiss;
423	atomic_t freehit;
424	atomic_t freemiss;
425#endif
426#if DEBUG
427	/*
428	 * If debugging is enabled, then the allocator can add additional
429	 * fields and/or padding to every object. buffer_size contains the total
430	 * object size including these internal fields, the following two
431	 * variables contain the offset to the user object and its size.
432	 */
433	int obj_offset;
434	int obj_size;
435#endif
436};
437
438#define CFLGS_OFF_SLAB		(0x80000000UL)
439#define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)
440
441#define BATCHREFILL_LIMIT	16
442/*
443 * Optimization question: fewer reaps means less probability for unnecessary
444 * cpucache drain/refill cycles.
445 *
446 * OTOH the cpuarrays can contain lots of objects,
447 * which could lock up otherwise freeable slabs.
448 */
449#define REAPTIMEOUT_CPUC	(2*HZ)
450#define REAPTIMEOUT_LIST3	(4*HZ)
451
452#if STATS
453#define	STATS_INC_ACTIVE(x)	((x)->num_active++)
454#define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
455#define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
456#define	STATS_INC_GROWN(x)	((x)->grown++)
457#define	STATS_INC_REAPED(x)	((x)->reaped++)
458#define	STATS_SET_HIGH(x)						\
459	do {								\
460		if ((x)->num_active > (x)->high_mark)			\
461			(x)->high_mark = (x)->num_active;		\
462	} while (0)
463#define	STATS_INC_ERR(x)	((x)->errors++)
464#define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
465#define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
466#define	STATS_SET_FREEABLE(x, i)					\
467	do {								\
468		if ((x)->max_freeable < i)				\
469			(x)->max_freeable = i;				\
470	} while (0)
471#define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
472#define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
473#define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
474#define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
475#else
476#define	STATS_INC_ACTIVE(x)	do { } while (0)
477#define	STATS_DEC_ACTIVE(x)	do { } while (0)
478#define	STATS_INC_ALLOCED(x)	do { } while (0)
479#define	STATS_INC_GROWN(x)	do { } while (0)
480#define	STATS_INC_REAPED(x)	do { } while (0)
481#define	STATS_SET_HIGH(x)	do { } while (0)
482#define	STATS_INC_ERR(x)	do { } while (0)
483#define	STATS_INC_NODEALLOCS(x)	do { } while (0)
484#define	STATS_INC_NODEFREES(x)	do { } while (0)
485#define	STATS_SET_FREEABLE(x, i) do { } while (0)
486#define STATS_INC_ALLOCHIT(x)	do { } while (0)
487#define STATS_INC_ALLOCMISS(x)	do { } while (0)
488#define STATS_INC_FREEHIT(x)	do { } while (0)
489#define STATS_INC_FREEMISS(x)	do { } while (0)
490#endif
491
492#if DEBUG
493/*
494 * Magic nums for obj red zoning.
495 * Placed in the first word before and the first word after an obj.
496 */
497#define	RED_INACTIVE	0x5A2CF071UL	/* when obj is inactive */
498#define	RED_ACTIVE	0x170FC2A5UL	/* when obj is active */
499
500/* ...and for poisoning */
501#define	POISON_INUSE	0x5a	/* for use-uninitialised poisoning */
502#define POISON_FREE	0x6b	/* for use-after-free poisoning */
503#define	POISON_END	0xa5	/* end-byte of poisoning */
504
505/*
506 * memory layout of objects:
507 * 0		: objp
508 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
509 * 		the end of an object is aligned with the end of the real
510 * 		allocation. Catches writes behind the end of the allocation.
511 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
512 * 		redzone word.
513 * cachep->obj_offset: The real object.
514 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
515 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
516 *					[BYTES_PER_WORD long]
517 */
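/*
 * Worked example (assuming a 64-bit box, so BYTES_PER_WORD == 8, and a cache
 * created with SLAB_RED_ZONE | SLAB_STORE_USER): for a 100-byte object the
 * buffer looks roughly like
 *
 *	[pad][redzone1: 8][object: 100 + pad][redzone2: 8][last caller: 8]
 *
 * dbg_redzone1/2() and dbg_userword() below compute pointers into exactly
 * these slots.
 */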
518static int obj_offset(struct kmem_cache *cachep)
519{
520	return cachep->obj_offset;
521}
522
523static int obj_size(struct kmem_cache *cachep)
524{
525	return cachep->obj_size;
526}
527
528static unsigned long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
529{
530	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
531	return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD);
532}
533
534static unsigned long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
535{
536	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
537	if (cachep->flags & SLAB_STORE_USER)
538		return (unsigned long *)(objp + cachep->buffer_size -
539					 2 * BYTES_PER_WORD);
540	return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD);
541}
542
543static void **dbg_userword(struct kmem_cache *cachep, void *objp)
544{
545	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
546	return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
547}
548
549#else
550
551#define obj_offset(x)			0
552#define obj_size(cachep)		(cachep->buffer_size)
553#define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long *)NULL;})
554#define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long *)NULL;})
555#define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})
556
557#endif
558
559/*
560 * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp
561 * order.
562 */
563#if defined(CONFIG_LARGE_ALLOCS)
564#define	MAX_OBJ_ORDER	13	/* up to 32Mb */
565#define	MAX_GFP_ORDER	13	/* up to 32Mb */
566#elif defined(CONFIG_MMU)
567#define	MAX_OBJ_ORDER	5	/* 32 pages */
568#define	MAX_GFP_ORDER	5	/* 32 pages */
569#else
570#define	MAX_OBJ_ORDER	8	/* up to 1Mb */
571#define	MAX_GFP_ORDER	8	/* up to 1Mb */
572#endif
573
574/*
575 * Do not go above this order unless 0 objects fit into the slab.
576 */
577#define	BREAK_GFP_ORDER_HI	1
578#define	BREAK_GFP_ORDER_LO	0
579static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
580
581/*
582 * Functions for storing/retrieving the cachep and/or slab from the page
583 * allocator.  These are used to find the slab an obj belongs to.  With kfree(),
584 * these are used to find the cache to which an obj belongs.
585 */
586static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
587{
588	page->lru.next = (struct list_head *)cache;
589}
590
591static inline struct kmem_cache *page_get_cache(struct page *page)
592{
593	if (unlikely(PageCompound(page)))
594		page = (struct page *)page_private(page);
595	return (struct kmem_cache *)page->lru.next;
596}
597
598static inline void page_set_slab(struct page *page, struct slab *slab)
599{
600	page->lru.prev = (struct list_head *)slab;
601}
602
603static inline struct slab *page_get_slab(struct page *page)
604{
605	if (unlikely(PageCompound(page)))
606		page = (struct page *)page_private(page);
607	return (struct slab *)page->lru.prev;
608}
609
610static inline struct kmem_cache *virt_to_cache(const void *obj)
611{
612	struct page *page = virt_to_page(obj);
613	return page_get_cache(page);
614}
615
616static inline struct slab *virt_to_slab(const void *obj)
617{
618	struct page *page = virt_to_page(obj);
619	return page_get_slab(page);
620}
621
622static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
623				 unsigned int idx)
624{
625	return slab->s_mem + cache->buffer_size * idx;
626}
627
628static inline unsigned int obj_to_index(struct kmem_cache *cache,
629					struct slab *slab, void *obj)
630{
631	return (unsigned)(obj - slab->s_mem) / cache->buffer_size;
632}
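/*
 * index_to_obj() and obj_to_index() are inverses of each other: objects sit
 * back to back at buffer_size intervals starting at s_mem, so e.g. with
 * buffer_size == 256, object 3 lives at s_mem + 768 and dividing that offset
 * by 256 recovers the index.
 */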
633
634/*
635 * These are the default caches for kmalloc. Custom caches can have other sizes.
636 */
637struct cache_sizes malloc_sizes[] = {
638#define CACHE(x) { .cs_size = (x) },
639#include <linux/kmalloc_sizes.h>
640	CACHE(ULONG_MAX)
641#undef CACHE
642};
643EXPORT_SYMBOL(malloc_sizes);
644
645/* Must match cache_sizes above. Out of line to keep cache footprint low. */
646struct cache_names {
647	char *name;
648	char *name_dma;
649};
650
651static struct cache_names __initdata cache_names[] = {
652#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
653#include <linux/kmalloc_sizes.h>
654	{NULL,}
655#undef CACHE
656};
657
658static struct arraycache_init initarray_cache __initdata =
659    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
660static struct arraycache_init initarray_generic =
661    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
662
663/* internal cache of cache description objs */
664static struct kmem_cache cache_cache = {
665	.batchcount = 1,
666	.limit = BOOT_CPUCACHE_ENTRIES,
667	.shared = 1,
668	.buffer_size = sizeof(struct kmem_cache),
669	.name = "kmem_cache",
670#if DEBUG
671	.obj_size = sizeof(struct kmem_cache),
672#endif
673};
674
675/* Guard access to the cache-chain. */
676static DEFINE_MUTEX(cache_chain_mutex);
677static struct list_head cache_chain;
678
679/*
680 * vm_enough_memory() looks at this to determine how many slab-allocated pages
681 * are possibly freeable under pressure
682 *
683 * SLAB_RECLAIM_ACCOUNT turns this on per-slab
684 */
685atomic_t slab_reclaim_pages;
686
687/*
688 * chicken and egg problem: delay the per-cpu array allocation
689 * until the general caches are up.
690 */
691static enum {
692	NONE,
693	PARTIAL_AC,
694	PARTIAL_L3,
695	FULL
696} g_cpucache_up;
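/*
 * Rough meaning of the states (descriptive note, see setup_cpu_cache()):
 * NONE       - no kmalloc caches exist yet; everything comes from static
 *              __initdata bootstrap structures.
 * PARTIAL_AC - the cache backing struct arraycache_init is up, so head
 *              arrays can be kmalloc'ed.
 * PARTIAL_L3 - the cache backing struct kmem_list3 is up as well, so
 *              per-node lists can be kmalloc'ed.
 * FULL       - all generic caches are up; enable_cpucache() can be used.
 */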
697
698static DEFINE_PER_CPU(struct work_struct, reap_work);
699
700static void free_block(struct kmem_cache *cachep, void **objpp, int len,
701			int node);
702static void enable_cpucache(struct kmem_cache *cachep);
703static void cache_reap(void *unused);
704static int __node_shrink(struct kmem_cache *cachep, int node);
705
706static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
707{
708	return cachep->array[smp_processor_id()];
709}
710
711static inline struct kmem_cache *__find_general_cachep(size_t size,
712							gfp_t gfpflags)
713{
714	struct cache_sizes *csizep = malloc_sizes;
715
716#if DEBUG
717	/* This happens if someone tries to call
718	 * kmem_cache_create(), or __kmalloc(), before
719	 * the generic caches are initialized.
720	 */
721	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
722#endif
723	while (size > csizep->cs_size)
724		csizep++;
725
726	/*
727	 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
728	 * has cs_{dma,}cachep==NULL. Thus no special case
729	 * for large kmalloc calls is required.
730	 */
731	if (unlikely(gfpflags & GFP_DMA))
732		return csizep->cs_dmacachep;
733	return csizep->cs_cachep;
734}
735
736struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
737{
738	return __find_general_cachep(size, gfpflags);
739}
740EXPORT_SYMBOL(kmem_find_general_cachep);
741
742static size_t slab_mgmt_size(size_t nr_objs, size_t align)
743{
744	return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
745}
746
747/*
748 * Calculate the number of objects and left-over bytes for a given buffer size.
749 */
750static void cache_estimate(unsigned long gfporder, size_t buffer_size,
751			   size_t align, int flags, size_t *left_over,
752			   unsigned int *num)
753{
754	int nr_objs;
755	size_t mgmt_size;
756	size_t slab_size = PAGE_SIZE << gfporder;
757
758	/*
759	 * The slab management structure can be either off the slab or
760	 * on it. For the latter case, the memory allocated for a
761	 * slab is used for:
762	 *
763	 * - The struct slab
764	 * - One kmem_bufctl_t for each object
765	 * - Padding to respect alignment of @align
766	 * - @buffer_size bytes for each object
767	 *
768	 * If the slab management structure is off the slab, then the
769	 * alignment will already be calculated into the size. Because
770	 * the slabs are all pages aligned, the objects will be at the
771	 * correct alignment when allocated.
772	 */
773	if (flags & CFLGS_OFF_SLAB) {
774		mgmt_size = 0;
775		nr_objs = slab_size / buffer_size;
776
777		if (nr_objs > SLAB_LIMIT)
778			nr_objs = SLAB_LIMIT;
779	} else {
780		/*
781		 * Ignore padding for the initial guess. The padding
782		 * is at most @align-1 bytes, and @buffer_size is at
783		 * least @align. In the worst case, this result will
784		 * be one greater than the number of objects that fit
785		 * into the memory allocation when taking the padding
786		 * into account.
787		 */
788		nr_objs = (slab_size - sizeof(struct slab)) /
789			  (buffer_size + sizeof(kmem_bufctl_t));
790
791		/*
792		 * This calculated number will be either the right
793		 * amount, or one greater than what we want.
794		 */
795		if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
796		       > slab_size)
797			nr_objs--;
798
799		if (nr_objs > SLAB_LIMIT)
800			nr_objs = SLAB_LIMIT;
801
802		mgmt_size = slab_mgmt_size(nr_objs, align);
803	}
804	*num = nr_objs;
805	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
806}
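/*
 * Worked example (assuming 4 KiB pages): for an off-slab cache with
 * buffer_size == 1024 at gfporder 0, nr_objs = 4096/1024 = 4, mgmt_size = 0
 * and left_over = 0.  For the on-slab case with the same buffer_size,
 * align == 32, sizeof(struct slab) == 32 and sizeof(kmem_bufctl_t) == 4
 * (hypothetical numbers), the initial guess is (4096-32)/(1024+4) = 3,
 * slab_mgmt_size(3, 32) = 64, and left_over = 4096 - 3*1024 - 64 = 960.
 */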
807
808#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
809
810static void __slab_error(const char *function, struct kmem_cache *cachep,
811			char *msg)
812{
813	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
814	       function, cachep->name, msg);
815	dump_stack();
816}
817
818#ifdef CONFIG_NUMA
819/*
820 * Special reaping functions for NUMA systems called from cache_reap().
821 * These take care of doing round robin flushing of alien caches (containing
822 * objects freed on a node other than the one they were allocated from) and the
823 * flushing of remote pcps by calling drain_node_pages.
824 */
825static DEFINE_PER_CPU(unsigned long, reap_node);
826
827static void init_reap_node(int cpu)
828{
829	int node;
830
831	node = next_node(cpu_to_node(cpu), node_online_map);
832	if (node == MAX_NUMNODES)
833		node = 0;
834
835	__get_cpu_var(reap_node) = node;
836}
837
838static void next_reap_node(void)
839{
840	int node = __get_cpu_var(reap_node);
841
842	/*
843	 * Also drain per cpu pages on remote zones
844	 */
845	if (node != numa_node_id())
846		drain_node_pages(node);
847
848	node = next_node(node, node_online_map);
849	if (unlikely(node >= MAX_NUMNODES))
850		node = first_node(node_online_map);
851	__get_cpu_var(reap_node) = node;
852}
853
854#else
855#define init_reap_node(cpu) do { } while (0)
856#define next_reap_node(void) do { } while (0)
857#endif
858
859/*
860 * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
861 * via the workqueue/eventd.
862 * Add the CPU number into the expiration time to minimize the possibility of
863 * the CPUs getting into lockstep and contending for the global cache chain
864 * lock.
865 */
866static void __devinit start_cpu_timer(int cpu)
867{
868	struct work_struct *reap_work = &per_cpu(reap_work, cpu);
869
870	/*
871	 * When this gets called from do_initcalls via cpucache_init(),
872	 * init_workqueues() has already run, so keventd will be setup
873	 * at that time.
874	 */
875	if (keventd_up() && reap_work->func == NULL) {
876		init_reap_node(cpu);
877		INIT_WORK(reap_work, cache_reap, NULL);
878		schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
879	}
880}
881
882static struct array_cache *alloc_arraycache(int node, int entries,
883					    int batchcount)
884{
885	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
886	struct array_cache *nc = NULL;
887
888	nc = kmalloc_node(memsize, GFP_KERNEL, node);
889	if (nc) {
890		nc->avail = 0;
891		nc->limit = entries;
892		nc->batchcount = batchcount;
893		nc->touched = 0;
894		spin_lock_init(&nc->lock);
895	}
896	return nc;
897}
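/*
 * Note: the array_cache header and its entry[] slots come from one
 * kmalloc_node() allocation (memsize above), so the flexible entry[0]
 * member simply indexes into the tail of that allocation, and the whole
 * thing is allocated on the requested node.
 */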
898
899#ifdef CONFIG_NUMA
900static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
901
902static struct array_cache **alloc_alien_cache(int node, int limit)
903{
904	struct array_cache **ac_ptr;
905	int memsize = sizeof(void *) * MAX_NUMNODES;
906	int i;
907
908	if (limit > 1)
909		limit = 12;
910	ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
911	if (ac_ptr) {
912		for_each_node(i) {
913			if (i == node || !node_online(i)) {
914				ac_ptr[i] = NULL;
915				continue;
916			}
917			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
918			if (!ac_ptr[i]) {
919				for (i--; i >= 0; i--)
920					kfree(ac_ptr[i]);
921				kfree(ac_ptr);
922				return NULL;
923			}
924		}
925	}
926	return ac_ptr;
927}
928
929static void free_alien_cache(struct array_cache **ac_ptr)
930{
931	int i;
932
933	if (!ac_ptr)
934		return;
935	for_each_node(i)
936	    kfree(ac_ptr[i]);
937	kfree(ac_ptr);
938}
939
940static void __drain_alien_cache(struct kmem_cache *cachep,
941				struct array_cache *ac, int node)
942{
943	struct kmem_list3 *rl3 = cachep->nodelists[node];
944
945	if (ac->avail) {
946		spin_lock(&rl3->list_lock);
947		free_block(cachep, ac->entry, ac->avail, node);
948		ac->avail = 0;
949		spin_unlock(&rl3->list_lock);
950	}
951}
952
953/*
954 * Called from cache_reap() to regularly drain alien caches round robin.
955 */
956static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
957{
958	int node = __get_cpu_var(reap_node);
959
960	if (l3->alien) {
961		struct array_cache *ac = l3->alien[node];
962		if (ac && ac->avail) {
963			spin_lock_irq(&ac->lock);
964			__drain_alien_cache(cachep, ac, node);
965			spin_unlock_irq(&ac->lock);
966		}
967	}
968}
969
970static void drain_alien_cache(struct kmem_cache *cachep,
971				struct array_cache **alien)
972{
973	int i = 0;
974	struct array_cache *ac;
975	unsigned long flags;
976
977	for_each_online_node(i) {
978		ac = alien[i];
979		if (ac) {
980			spin_lock_irqsave(&ac->lock, flags);
981			__drain_alien_cache(cachep, ac, i);
982			spin_unlock_irqrestore(&ac->lock, flags);
983		}
984	}
985}
986#else
987
988#define drain_alien_cache(cachep, alien) do { } while (0)
989#define reap_alien(cachep, l3) do { } while (0)
990
991static inline struct array_cache **alloc_alien_cache(int node, int limit)
992{
993	return (struct array_cache **) 0x01020304ul;
994}
995
996static inline void free_alien_cache(struct array_cache **ac_ptr)
997{
998}
999
1000#endif
1001
1002static int __devinit cpuup_callback(struct notifier_block *nfb,
1003				    unsigned long action, void *hcpu)
1004{
1005	long cpu = (long)hcpu;
1006	struct kmem_cache *cachep;
1007	struct kmem_list3 *l3 = NULL;
1008	int node = cpu_to_node(cpu);
1009	int memsize = sizeof(struct kmem_list3);
1010
1011	switch (action) {
1012	case CPU_UP_PREPARE:
1013		mutex_lock(&cache_chain_mutex);
1014		/*
1015		 * We need to do this right in the beginning since
1016		 * the alloc_arraycache() calls are going to use this list.
1017		 * kmalloc_node allows us to add the slab to the right
1018		 * kmem_list3 and not this cpu's kmem_list3.
1019		 */
1020
1021		list_for_each_entry(cachep, &cache_chain, next) {
1022			/*
1023			 * Set up the node's kmem_list3 for this cpu before we
1024			 * can begin anything. Make sure some other cpu on this
1025			 * node has not already allocated it.
1026			 */
1027			if (!cachep->nodelists[node]) {
1028				l3 = kmalloc_node(memsize, GFP_KERNEL, node);
1029				if (!l3)
1030					goto bad;
1031				kmem_list3_init(l3);
1032				l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
1033				    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1034
1035				/*
1036				 * The l3s don't come and go as CPUs come and
1037				 * go.  cache_chain_mutex is sufficient
1038				 * protection here.
1039				 */
1040				cachep->nodelists[node] = l3;
1041			}
1042
1043			spin_lock_irq(&cachep->nodelists[node]->list_lock);
1044			cachep->nodelists[node]->free_limit =
1045				(1 + nr_cpus_node(node)) *
1046				cachep->batchcount + cachep->num;
1047			spin_unlock_irq(&cachep->nodelists[node]->list_lock);
1048		}
1049
1050		/*
1051		 * Now we can go ahead with allocating the shared arrays and
1052		 * array caches
1053		 */
1054		list_for_each_entry(cachep, &cache_chain, next) {
1055			struct array_cache *nc;
1056			struct array_cache *shared;
1057			struct array_cache **alien;
1058
1059			nc = alloc_arraycache(node, cachep->limit,
1060						cachep->batchcount);
1061			if (!nc)
1062				goto bad;
1063			shared = alloc_arraycache(node,
1064					cachep->shared * cachep->batchcount,
1065					0xbaadf00d);
1066			if (!shared)
1067				goto bad;
1068
1069			alien = alloc_alien_cache(node, cachep->limit);
1070			if (!alien)
1071				goto bad;
1072			cachep->array[cpu] = nc;
1073			l3 = cachep->nodelists[node];
1074			BUG_ON(!l3);
1075
1076			spin_lock_irq(&l3->list_lock);
1077			if (!l3->shared) {
1078				/*
1079				 * We are serialised from CPU_DEAD or
1080				 * CPU_UP_CANCELLED by the cpucontrol lock
1081				 */
1082				l3->shared = shared;
1083				shared = NULL;
1084			}
1085#ifdef CONFIG_NUMA
1086			if (!l3->alien) {
1087				l3->alien = alien;
1088				alien = NULL;
1089			}
1090#endif
1091			spin_unlock_irq(&l3->list_lock);
1092			kfree(shared);
1093			free_alien_cache(alien);
1094		}
1095		mutex_unlock(&cache_chain_mutex);
1096		break;
1097	case CPU_ONLINE:
1098		start_cpu_timer(cpu);
1099		break;
1100#ifdef CONFIG_HOTPLUG_CPU
1101	case CPU_DEAD:
1102		/*
1103		 * Even if all the cpus of a node are down, we don't free the
1104		 * kmem_list3 of any cache. This is to avoid a race between
1105		 * cpu_down and a kmalloc allocation from another cpu for
1106		 * memory from the node of the cpu going down.  The list3
1107		 * structure is usually allocated from kmem_cache_create() and
1108		 * gets destroyed at kmem_cache_destroy().
1109		 */
1110		/* fall thru */
1111	case CPU_UP_CANCELED:
1112		mutex_lock(&cache_chain_mutex);
1113		list_for_each_entry(cachep, &cache_chain, next) {
1114			struct array_cache *nc;
1115			struct array_cache *shared;
1116			struct array_cache **alien;
1117			cpumask_t mask;
1118
1119			mask = node_to_cpumask(node);
1120			/* cpu is dead; no one can alloc from it. */
1121			nc = cachep->array[cpu];
1122			cachep->array[cpu] = NULL;
1123			l3 = cachep->nodelists[node];
1124
1125			if (!l3)
1126				goto free_array_cache;
1127
1128			spin_lock_irq(&l3->list_lock);
1129
1130			/* Free limit for this kmem_list3 */
1131			l3->free_limit -= cachep->batchcount;
1132			if (nc)
1133				free_block(cachep, nc->entry, nc->avail, node);
1134
1135			if (!cpus_empty(mask)) {
1136				spin_unlock_irq(&l3->list_lock);
1137				goto free_array_cache;
1138			}
1139
1140			shared = l3->shared;
1141			if (shared) {
1142				free_block(cachep, l3->shared->entry,
1143					   l3->shared->avail, node);
1144				l3->shared = NULL;
1145			}
1146
1147			alien = l3->alien;
1148			l3->alien = NULL;
1149
1150			spin_unlock_irq(&l3->list_lock);
1151
1152			kfree(shared);
1153			if (alien) {
1154				drain_alien_cache(cachep, alien);
1155				free_alien_cache(alien);
1156			}
1157free_array_cache:
1158			kfree(nc);
1159		}
1160		/*
1161		 * In the previous loop, all the objects were freed to
1162		 * the respective cache's slabs; now we can go ahead and
1163		 * shrink each nodelist to its limit.
1164		 */
1165		list_for_each_entry(cachep, &cache_chain, next) {
1166			l3 = cachep->nodelists[node];
1167			if (!l3)
1168				continue;
1169			spin_lock_irq(&l3->list_lock);
1170			/* free slabs belonging to this node */
1171			__node_shrink(cachep, node);
1172			spin_unlock_irq(&l3->list_lock);
1173		}
1174		mutex_unlock(&cache_chain_mutex);
1175		break;
1176#endif
1177	}
1178	return NOTIFY_OK;
1179bad:
1180	mutex_unlock(&cache_chain_mutex);
1181	return NOTIFY_BAD;
1182}
1183
1184static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
1185
1186/*
1187 * swap the static kmem_list3 with kmalloced memory
1188 */
1189static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1190			int nodeid)
1191{
1192	struct kmem_list3 *ptr;
1193
1194	BUG_ON(cachep->nodelists[nodeid] != list);
1195	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
1196	BUG_ON(!ptr);
1197
1198	local_irq_disable();
1199	memcpy(ptr, list, sizeof(struct kmem_list3));
1200	MAKE_ALL_LISTS(cachep, ptr, nodeid);
1201	cachep->nodelists[nodeid] = ptr;
1202	local_irq_enable();
1203}
1204
1205/*
1206 * Initialisation.  Called after the page allocator has been initialised and
1207 * before smp_init().
1208 */
1209void __init kmem_cache_init(void)
1210{
1211	size_t left_over;
1212	struct cache_sizes *sizes;
1213	struct cache_names *names;
1214	int i;
1215	int order;
1216
1217	for (i = 0; i < NUM_INIT_LISTS; i++) {
1218		kmem_list3_init(&initkmem_list3[i]);
1219		if (i < MAX_NUMNODES)
1220			cache_cache.nodelists[i] = NULL;
1221	}
1222
1223	/*
1224	 * Fragmentation resistance on low memory - only use bigger
1225	 * page orders on machines with more than 32MB of memory.
1226	 */
1227	if (num_physpages > (32 << 20) >> PAGE_SHIFT)
1228		slab_break_gfp_order = BREAK_GFP_ORDER_HI;
1229
1230	/* Bootstrap is tricky, because several objects are allocated
1231	 * from caches that do not exist yet:
1232	 * 1) initialize the cache_cache cache: it contains the struct
1233	 *    kmem_cache structures of all caches, except cache_cache itself:
1234	 *    cache_cache is statically allocated.
1235	 *    Initially an __init data area is used for the head array and the
1236	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
1237	 *    array at the end of the bootstrap.
1238	 * 2) Create the first kmalloc cache.
1239	 *    The struct kmem_cache for the new cache is allocated normally.
1240	 *    An __init data area is used for the head array.
1241	 * 3) Create the remaining kmalloc caches, with minimally sized
1242	 *    head arrays.
1243	 * 4) Replace the __init data head arrays for cache_cache and the first
1244	 *    kmalloc cache with kmalloc allocated arrays.
1245	 * 5) Replace the __init data for kmem_list3 for cache_cache and
1246 *    the other caches with kmalloc allocated memory.
1247	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1248	 */
1249
1250	/* 1) create the cache_cache */
1251	INIT_LIST_HEAD(&cache_chain);
1252	list_add(&cache_cache.next, &cache_chain);
1253	cache_cache.colour_off = cache_line_size();
1254	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1255	cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
1256
1257	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1258					cache_line_size());
1259
1260	for (order = 0; order < MAX_ORDER; order++) {
1261		cache_estimate(order, cache_cache.buffer_size,
1262			cache_line_size(), 0, &left_over, &cache_cache.num);
1263		if (cache_cache.num)
1264			break;
1265	}
1266	if (!cache_cache.num)
1267		BUG();
1268	cache_cache.gfporder = order;
1269	cache_cache.colour = left_over / cache_cache.colour_off;
1270	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
1271				      sizeof(struct slab), cache_line_size());
1272
1273	/* 2+3) create the kmalloc caches */
1274	sizes = malloc_sizes;
1275	names = cache_names;
1276
1277	/*
1278	 * Initialize the caches that provide memory for the array cache and the
1279	 * kmem_list3 structures first.  Without this, further allocations will
1280	 * bug.
1281	 */
1282
1283	sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1284					sizes[INDEX_AC].cs_size,
1285					ARCH_KMALLOC_MINALIGN,
1286					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1287					NULL, NULL);
1288
1289	if (INDEX_AC != INDEX_L3) {
1290		sizes[INDEX_L3].cs_cachep =
1291			kmem_cache_create(names[INDEX_L3].name,
1292				sizes[INDEX_L3].cs_size,
1293				ARCH_KMALLOC_MINALIGN,
1294				ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1295				NULL, NULL);
1296	}
1297
1298	while (sizes->cs_size != ULONG_MAX) {
1299		/*
1300		 * For performance, all the general caches are L1 aligned.
1301		 * This should be particularly beneficial on SMP boxes, as it
1302		 * eliminates "false sharing".
1303		 * Note: for systems short on memory, removing the alignment will
1304		 * allow tighter packing of the smaller caches.
1305		 */
1306		if (!sizes->cs_cachep) {
1307			sizes->cs_cachep = kmem_cache_create(names->name,
1308					sizes->cs_size,
1309					ARCH_KMALLOC_MINALIGN,
1310					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1311					NULL, NULL);
1312		}
1313
1314		/* Inc off-slab bufctl limit until the ceiling is hit. */
1315		if (!(OFF_SLAB(sizes->cs_cachep))) {
1316			offslab_limit = sizes->cs_size - sizeof(struct slab);
1317			offslab_limit /= sizeof(kmem_bufctl_t);
1318		}
1319
1320		sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
1321					sizes->cs_size,
1322					ARCH_KMALLOC_MINALIGN,
1323					ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
1324						SLAB_PANIC,
1325					NULL, NULL);
1326		sizes++;
1327		names++;
1328	}
1329	/* 4) Replace the bootstrap head arrays */
1330	{
1331		void *ptr;
1332
1333		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1334
1335		local_irq_disable();
1336		BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
1337		memcpy(ptr, cpu_cache_get(&cache_cache),
1338		       sizeof(struct arraycache_init));
1339		cache_cache.array[smp_processor_id()] = ptr;
1340		local_irq_enable();
1341
1342		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1343
1344		local_irq_disable();
1345		BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
1346		       != &initarray_generic.cache);
1347		memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
1348		       sizeof(struct arraycache_init));
1349		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1350		    ptr;
1351		local_irq_enable();
1352	}
1353	/* 5) Replace the bootstrap kmem_list3's */
1354	{
1355		int node;
1356		/* Replace the static kmem_list3 structures for the boot cpu */
1357		init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
1358			  numa_node_id());
1359
1360		for_each_online_node(node) {
1361			init_list(malloc_sizes[INDEX_AC].cs_cachep,
1362				  &initkmem_list3[SIZE_AC + node], node);
1363
1364			if (INDEX_AC != INDEX_L3) {
1365				init_list(malloc_sizes[INDEX_L3].cs_cachep,
1366					  &initkmem_list3[SIZE_L3 + node],
1367					  node);
1368			}
1369		}
1370	}
1371
1372	/* 6) resize the head arrays to their final sizes */
1373	{
1374		struct kmem_cache *cachep;
1375		mutex_lock(&cache_chain_mutex);
1376		list_for_each_entry(cachep, &cache_chain, next)
1377			enable_cpucache(cachep);
1378		mutex_unlock(&cache_chain_mutex);
1379	}
1380
1381	/* Done! */
1382	g_cpucache_up = FULL;
1383
1384	/*
1385	 * Register a cpu startup notifier callback that initializes
1386	 * cpu_cache_get for all new cpus
1387	 */
1388	register_cpu_notifier(&cpucache_notifier);
1389
1390	/*
1391	 * The reap timers are started later, with a module init call; that part
1392	 * of the kernel is not yet operational.
1393	 */
1394}
1395
1396static int __init cpucache_init(void)
1397{
1398	int cpu;
1399
1400	/*
1401	 * Register the timers that return unneeded pages to the page allocator
1402	 */
1403	for_each_online_cpu(cpu)
1404		start_cpu_timer(cpu);
1405	return 0;
1406}
1407__initcall(cpucache_init);
1408
1409/*
1410 * Interface to system's page allocator. No need to hold the cache-lock.
1411 *
1412 * If we requested dmaable memory, we will get it. Even if we
1413 * did not request dmaable memory, we might get it, but that
1414 * would be relatively rare and ignorable.
1415 */
1416static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1417{
1418	struct page *page;
1419	void *addr;
1420	int i;
1421
1422	flags |= cachep->gfpflags;
1423	page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1424	if (!page)
1425		return NULL;
1426	addr = page_address(page);
1427
1428	i = (1 << cachep->gfporder);
1429	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1430		atomic_add(i, &slab_reclaim_pages);
1431	add_page_state(nr_slab, i);
1432	while (i--) {
1433		__SetPageSlab(page);
1434		page++;
1435	}
1436	return addr;
1437}
1438
1439/*
1440 * Interface to system's page release.
1441 */
1442static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1443{
1444	unsigned long i = (1 << cachep->gfporder);
1445	struct page *page = virt_to_page(addr);
1446	const unsigned long nr_freed = i;
1447
1448	while (i--) {
1449		BUG_ON(!PageSlab(page));
1450		__ClearPageSlab(page);
1451		page++;
1452	}
1453	sub_page_state(nr_slab, nr_freed);
1454	if (current->reclaim_state)
1455		current->reclaim_state->reclaimed_slab += nr_freed;
1456	free_pages((unsigned long)addr, cachep->gfporder);
1457	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1458		atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages);
1459}
1460
1461static void kmem_rcu_free(struct rcu_head *head)
1462{
1463	struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1464	struct kmem_cache *cachep = slab_rcu->cachep;
1465
1466	kmem_freepages(cachep, slab_rcu->addr);
1467	if (OFF_SLAB(cachep))
1468		kmem_cache_free(cachep->slabp_cache, slab_rcu);
1469}
1470
1471#if DEBUG
1472
1473#ifdef CONFIG_DEBUG_PAGEALLOC
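/*
 * store_stackinfo() records, inside the (poisoned) object itself, a marker
 * 0x12345678, the caller address, the cpu number, then as many kernel text
 * addresses as fit from the current stack, terminated by 0x87654321, so
 * that a crude backtrace is available if the object is later found
 * corrupted.
 */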
1474static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1475			    unsigned long caller)
1476{
1477	int size = obj_size(cachep);
1478
1479	addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
1480
1481	if (size < 5 * sizeof(unsigned long))
1482		return;
1483
1484	*addr++ = 0x12345678;
1485	*addr++ = caller;
1486	*addr++ = smp_processor_id();
1487	size -= 3 * sizeof(unsigned long);
1488	{
1489		unsigned long *sptr = &caller;
1490		unsigned long svalue;
1491
1492		while (!kstack_end(sptr)) {
1493			svalue = *sptr++;
1494			if (kernel_text_address(svalue)) {
1495				*addr++ = svalue;
1496				size -= sizeof(unsigned long);
1497				if (size <= sizeof(unsigned long))
1498					break;
1499			}
1500		}
1501
1502	}
1503	*addr++ = 0x87654321;
1504}
1505#endif
1506
1507static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1508{
1509	int size = obj_size(cachep);
1510	addr = &((char *)addr)[obj_offset(cachep)];
1511
1512	memset(addr, val, size);
1513	*(unsigned char *)(addr + size - 1) = POISON_END;
1514}
1515
1516static void dump_line(char *data, int offset, int limit)
1517{
1518	int i;
1519	printk(KERN_ERR "%03x:", offset);
1520	for (i = 0; i < limit; i++)
1521		printk(" %02x", (unsigned char)data[offset + i]);
1522	printk("\n");
1523}
1524#endif
1525
1526#if DEBUG
1527
1528static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1529{
1530	int i, size;
1531	char *realobj;
1532
1533	if (cachep->flags & SLAB_RED_ZONE) {
1534		printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
1535			*dbg_redzone1(cachep, objp),
1536			*dbg_redzone2(cachep, objp));
1537	}
1538
1539	if (cachep->flags & SLAB_STORE_USER) {
1540		printk(KERN_ERR "Last user: [<%p>]",
1541			*dbg_userword(cachep, objp));
1542		print_symbol("(%s)",
1543				(unsigned long)*dbg_userword(cachep, objp));
1544		printk("\n");
1545	}
1546	realobj = (char *)objp + obj_offset(cachep);
1547	size = obj_size(cachep);
1548	for (i = 0; i < size && lines; i += 16, lines--) {
1549		int limit;
1550		limit = 16;
1551		if (i + limit > size)
1552			limit = size - i;
1553		dump_line(realobj, i, limit);
1554	}
1555}
1556
1557static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1558{
1559	char *realobj;
1560	int size, i;
1561	int lines = 0;
1562
1563	realobj = (char *)objp + obj_offset(cachep);
1564	size = obj_size(cachep);
1565
1566	for (i = 0; i < size; i++) {
1567		char exp = POISON_FREE;
1568		if (i == size - 1)
1569			exp = POISON_END;
1570		if (realobj[i] != exp) {
1571			int limit;
1572			/* Mismatch ! */
1573			/* Print header */
1574			if (lines == 0) {
1575				printk(KERN_ERR
1576					"Slab corruption: start=%p, len=%d\n",
1577					realobj, size);
1578				print_objinfo(cachep, objp, 0);
1579			}
1580			/* Hexdump the affected line */
1581			i = (i / 16) * 16;
1582			limit = 16;
1583			if (i + limit > size)
1584				limit = size - i;
1585			dump_line(realobj, i, limit);
1586			i += 16;
1587			lines++;
1588			/* Limit to 5 lines */
1589			if (lines > 5)
1590				break;
1591		}
1592	}
1593	if (lines != 0) {
1594		/* Print some data about the neighboring objects, if they
1595		 * exist:
1596		 */
1597		struct slab *slabp = virt_to_slab(objp);
1598		unsigned int objnr;
1599
1600		objnr = obj_to_index(cachep, slabp, objp);
1601		if (objnr) {
1602			objp = index_to_obj(cachep, slabp, objnr - 1);
1603			realobj = (char *)objp + obj_offset(cachep);
1604			printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1605			       realobj, size);
1606			print_objinfo(cachep, objp, 2);
1607		}
1608		if (objnr + 1 < cachep->num) {
1609			objp = index_to_obj(cachep, slabp, objnr + 1);
1610			realobj = (char *)objp + obj_offset(cachep);
1611			printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1612			       realobj, size);
1613			print_objinfo(cachep, objp, 2);
1614		}
1615	}
1616}
1617#endif
1618
1619#if DEBUG
1620/**
1621 * slab_destroy_objs - check and destroy the objects in a slab
1622 * @cachep: cache pointer being destroyed
1623 * @slabp: slab pointer being destroyed
1624 *
1625 * Call the registered destructor for each object in a slab that is being
1626 * destroyed.
1627 */
1628static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1629{
1630	int i;
1631	for (i = 0; i < cachep->num; i++) {
1632		void *objp = index_to_obj(cachep, slabp, i);
1633
1634		if (cachep->flags & SLAB_POISON) {
1635#ifdef CONFIG_DEBUG_PAGEALLOC
1636			if (cachep->buffer_size % PAGE_SIZE == 0 &&
1637					OFF_SLAB(cachep))
1638				kernel_map_pages(virt_to_page(objp),
1639					cachep->buffer_size / PAGE_SIZE, 1);
1640			else
1641				check_poison_obj(cachep, objp);
1642#else
1643			check_poison_obj(cachep, objp);
1644#endif
1645		}
1646		if (cachep->flags & SLAB_RED_ZONE) {
1647			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1648				slab_error(cachep, "start of a freed object "
1649					   "was overwritten");
1650			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1651				slab_error(cachep, "end of a freed object "
1652					   "was overwritten");
1653		}
1654		if (cachep->dtor && !(cachep->flags & SLAB_POISON))
1655			(cachep->dtor) (objp + obj_offset(cachep), cachep, 0);
1656	}
1657}
1658#else
1659static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1660{
1661	if (cachep->dtor) {
1662		int i;
1663		for (i = 0; i < cachep->num; i++) {
1664			void *objp = index_to_obj(cachep, slabp, i);
1665			(cachep->dtor) (objp, cachep, 0);
1666		}
1667	}
1668}
1669#endif
1670
1671/**
1672 * slab_destroy - destroy and release all objects in a slab
1673 * @cachep: cache pointer being destroyed
1674 * @slabp: slab pointer being destroyed
1675 *
1676 * Destroy all the objs in a slab, and release the mem back to the system.
1677 * Before calling, the slab must have been unlinked from the cache.  The
1678 * cache-lock is not held/needed.
1679 */
1680static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1681{
1682	void *addr = slabp->s_mem - slabp->colouroff;
1683
1684	slab_destroy_objs(cachep, slabp);
1685	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1686		struct slab_rcu *slab_rcu;
1687
1688		slab_rcu = (struct slab_rcu *)slabp;
1689		slab_rcu->cachep = cachep;
1690		slab_rcu->addr = addr;
1691		call_rcu(&slab_rcu->head, kmem_rcu_free);
1692	} else {
1693		kmem_freepages(cachep, addr);
1694		if (OFF_SLAB(cachep))
1695			kmem_cache_free(cachep->slabp_cache, slabp);
1696	}
1697}
1698
1699/*
1700 * For setting up all the kmem_list3s for a cache whose buffer_size is the
1701 * same as the size of kmem_list3.
1702 */
1703static void set_up_list3s(struct kmem_cache *cachep, int index)
1704{
1705	int node;
1706
1707	for_each_online_node(node) {
1708		cachep->nodelists[node] = &initkmem_list3[index + node];
1709		cachep->nodelists[node]->next_reap = jiffies +
1710		    REAPTIMEOUT_LIST3 +
1711		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1712	}
1713}
1714
1715/**
1716 * calculate_slab_order - calculate size (page order) of slabs
1717 * @cachep: pointer to the cache that is being created
1718 * @size: size of objects to be created in this cache.
1719 * @align: required alignment for the objects.
1720 * @flags: slab allocation flags
1721 *
1722 * Also calculates the number of objects per slab.
1723 *
1724 * This could be made much more intelligent.  For now, try to avoid using
1725 * high order pages for slabs.  When the gfp() functions are more friendly
1726 * towards high-order requests, this should be changed.
1727 */
1728static size_t calculate_slab_order(struct kmem_cache *cachep,
1729			size_t size, size_t align, unsigned long flags)
1730{
1731	size_t left_over = 0;
1732	int gfporder;
1733
1734	for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) {
1735		unsigned int num;
1736		size_t remainder;
1737
1738		cache_estimate(gfporder, size, align, flags, &remainder, &num);
1739		if (!num)
1740			continue;
1741
1742		/* More than offslab_limit objects will cause problems */
1743		if ((flags & CFLGS_OFF_SLAB) && num > offslab_limit)
1744			break;
1745
1746		/* Found something acceptable - save it away */
1747		cachep->num = num;
1748		cachep->gfporder = gfporder;
1749		left_over = remainder;
1750
1751		/*
1752		 * A VFS-reclaimable slab tends to have most allocations
1753		 * as GFP_NOFS and we really don't want to have to be allocating
1754		 * higher-order pages when we are unable to shrink dcache.
1755		 */
1756		if (flags & SLAB_RECLAIM_ACCOUNT)
1757			break;
1758
1759		/*
1760		 * Large number of objects is good, but very large slabs are
1761		 * currently bad for the gfp()s.
1762		 */
1763		if (gfporder >= slab_break_gfp_order)
1764			break;
1765
1766		/*
1767		 * Acceptable internal fragmentation?
1768		 */
1769		if (left_over * 8 <= (PAGE_SIZE << gfporder))
1770			break;
1771	}
1772	return left_over;
1773}
1774
1775static void setup_cpu_cache(struct kmem_cache *cachep)
1776{
1777	if (g_cpucache_up == FULL) {
1778		enable_cpucache(cachep);
1779		return;
1780	}
1781	if (g_cpucache_up == NONE) {
1782		/*
1783		 * Note: the first kmem_cache_create must create the cache
1784		 * that's used by kmalloc(24), otherwise the creation of
1785		 * further caches will BUG().
1786		 */
1787		cachep->array[smp_processor_id()] = &initarray_generic.cache;
1788
1789		/*
1790		 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
1791		 * the first cache, then we need to set up all its list3s,
1792		 * otherwise the creation of further caches will BUG().
1793		 */
1794		set_up_list3s(cachep, SIZE_AC);
1795		if (INDEX_AC == INDEX_L3)
1796			g_cpucache_up = PARTIAL_L3;
1797		else
1798			g_cpucache_up = PARTIAL_AC;
1799	} else {
1800		cachep->array[smp_processor_id()] =
1801			kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1802
1803		if (g_cpucache_up == PARTIAL_AC) {
1804			set_up_list3s(cachep, SIZE_L3);
1805			g_cpucache_up = PARTIAL_L3;
1806		} else {
1807			int node;
1808			for_each_online_node(node) {
1809				cachep->nodelists[node] =
1810				    kmalloc_node(sizeof(struct kmem_list3),
1811						GFP_KERNEL, node);
1812				BUG_ON(!cachep->nodelists[node]);
1813				kmem_list3_init(cachep->nodelists[node]);
1814			}
1815		}
1816	}
1817	cachep->nodelists[numa_node_id()]->next_reap =
1818			jiffies + REAPTIMEOUT_LIST3 +
1819			((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1820
1821	cpu_cache_get(cachep)->avail = 0;
1822	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
1823	cpu_cache_get(cachep)->batchcount = 1;
1824	cpu_cache_get(cachep)->touched = 0;
1825	cachep->batchcount = 1;
1826	cachep->limit = BOOT_CPUCACHE_ENTRIES;
1827}
1828
1829/**
1830 * kmem_cache_create - Create a cache.
1831 * @name: A string which is used in /proc/slabinfo to identify this cache.
1832 * @size: The size of objects to be created in this cache.
1833 * @align: The required alignment for the objects.
1834 * @flags: SLAB flags
1835 * @ctor: A constructor for the objects.
1836 * @dtor: A destructor for the objects.
1837 *
1838 * Returns a ptr to the cache on success, NULL on failure.
1839 * Cannot be called within an interrupt, but can be interrupted.
1840 * The @ctor is run when new pages are allocated by the cache
1841 * and the @dtor is run before the pages are handed back.
1842 *
1843 * @name must be valid until the cache is destroyed. This implies that
1844 * the module calling this has to destroy the cache before getting unloaded.
1845 *
1846 * The flags are
1847 *
1848 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
1849 * to catch references to uninitialised memory.
1850 *
1851 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
1852 * for buffer overruns.
1853 *
1854 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
1855 * cacheline.  This can be beneficial if you're counting cycles as closely
1856 * as davem.
1857 */
1858struct kmem_cache *
1859kmem_cache_create (const char *name, size_t size, size_t align,
1860	unsigned long flags,
1861	void (*ctor)(void*, struct kmem_cache *, unsigned long),
1862	void (*dtor)(void*, struct kmem_cache *, unsigned long))
1863{
1864	size_t left_over, slab_size, ralign;
1865	struct kmem_cache *cachep = NULL;
1866	struct list_head *p;
1867
1868	/*
1869	 * Sanity checks... these are all serious usage bugs.
1870	 */
1871	if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
1872	    (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
1873		printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
1874				name);
1875		BUG();
1876	}
1877
1878	/*
1879	 * Prevent CPUs from coming and going.
1880	 * lock_cpu_hotplug() nests outside cache_chain_mutex
1881	 */
1882	lock_cpu_hotplug();
1883
1884	mutex_lock(&cache_chain_mutex);
1885
1886	list_for_each(p, &cache_chain) {
1887		struct kmem_cache *pc = list_entry(p, struct kmem_cache, next);
1888		mm_segment_t old_fs = get_fs();
1889		char tmp;
1890		int res;
1891
1892		/*
1893		 * This happens when the module gets unloaded and doesn't
1894		 * destroy its slab cache and no-one else reuses the vmalloc
1895		 * area of the module.  Print a warning.
1896		 */
1897		set_fs(KERNEL_DS);
1898		res = __get_user(tmp, pc->name);
1899		set_fs(old_fs);
1900		if (res) {
1901			printk("SLAB: cache with size %d has lost its name\n",
1902			       pc->buffer_size);
1903			continue;
1904		}
1905
1906		if (!strcmp(pc->name, name)) {
1907			printk("kmem_cache_create: duplicate cache %s\n", name);
1908			dump_stack();
1909			goto oops;
1910		}
1911	}
1912
1913#if DEBUG
1914	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
1915	if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
1916		/* No constructor, but initial state check requested */
1917		printk(KERN_ERR "%s: No con, but init state check "
1918		       "requested - %s\n", __FUNCTION__, name);
1919		flags &= ~SLAB_DEBUG_INITIAL;
1920	}
1921#if FORCED_DEBUG
1922	/*
1923	 * Enable redzoning and last user accounting, except for caches with
1924	 * large objects, if the increased size would increase the object size
1925	 * above the next power of two: caches with object sizes just above a
1926	 * power of two have a significant amount of internal fragmentation.
1927	 */
1928	if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD))
1929		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
1930	if (!(flags & SLAB_DESTROY_BY_RCU))
1931		flags |= SLAB_POISON;
1932#endif
1933	if (flags & SLAB_DESTROY_BY_RCU)
1934		BUG_ON(flags & SLAB_POISON);
1935#endif
1936	if (flags & SLAB_DESTROY_BY_RCU)
1937		BUG_ON(dtor);
1938
1939	/*
1940	 * Always check the flags; a caller might be expecting debug support which
1941	 * isn't available.
1942	 */
1943	if (flags & ~CREATE_MASK)
1944		BUG();
1945
1946	/*
1947	 * Check that size is in terms of words.  This is needed to avoid
1948	 * unaligned accesses for some archs when redzoning is used, and makes
1949	 * sure any on-slab bufctl's are also correctly aligned.
1950	 */
1951	if (size & (BYTES_PER_WORD - 1)) {
1952		size += (BYTES_PER_WORD - 1);
1953		size &= ~(BYTES_PER_WORD - 1);
1954	}
1955
1956	/* calculate the final buffer alignment: */
1957
1958	/* 1) arch recommendation: can be overridden for debug */
1959	if (flags & SLAB_HWCACHE_ALIGN) {
1960		/*
1961		 * Default alignment: as specified by the arch code.  Except if
1962		 * an object is really small, then squeeze multiple objects into
1963		 * one cacheline.
1964		 */
1965		ralign = cache_line_size();
1966		while (size <= ralign / 2)
1967			ralign /= 2;
1968	} else {
1969		ralign = BYTES_PER_WORD;
1970	}
1971	/* 2) arch mandated alignment: disables debug if necessary */
1972	if (ralign < ARCH_SLAB_MINALIGN) {
1973		ralign = ARCH_SLAB_MINALIGN;
1974		if (ralign > BYTES_PER_WORD)
1975			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
1976	}
1977	/* 3) caller mandated alignment: disables debug if necessary */
1978	if (ralign < align) {
1979		ralign = align;
1980		if (ralign > BYTES_PER_WORD)
1981			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
1982	}
1983	/*
1984	 * 4) Store it. Note that the debug code below can reduce
1985	 *    the alignment to BYTES_PER_WORD.
1986	 */
1987	align = ralign;
1988
1989	/* Get cache's description obj. */
1990	cachep = kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
1991	if (!cachep)
1992		goto oops;
1993	memset(cachep, 0, sizeof(struct kmem_cache));
1994
1995#if DEBUG
1996	cachep->obj_size = size;
1997
1998	if (flags & SLAB_RED_ZONE) {
1999		/* redzoning only works with word aligned caches */
2000		align = BYTES_PER_WORD;
2001
2002		/* add space for red zone words */
2003		cachep->obj_offset += BYTES_PER_WORD;
2004		size += 2 * BYTES_PER_WORD;
2005	}
2006	if (flags & SLAB_STORE_USER) {
2007		/* user store requires word alignment and
2008		 * one word storage behind the end of the real
2009		 * object.
2010		 */
2011		align = BYTES_PER_WORD;
2012		size += BYTES_PER_WORD;
2013	}
2014#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2015	if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
2016	    && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
2017		cachep->obj_offset += PAGE_SIZE - size;
2018		size = PAGE_SIZE;
2019	}
2020#endif
2021#endif
2022
2023	/* Determine if the slab management is 'on' or 'off' slab. */
2024	if (size >= (PAGE_SIZE >> 3))
2025		/*
2026		 * Size is large, assume best to place the slab management obj
2027		 * off-slab (should allow better packing of objs).
2028		 */
2029		flags |= CFLGS_OFF_SLAB;
2030
2031	size = ALIGN(size, align);
2032
2033	left_over = calculate_slab_order(cachep, size, align, flags);
2034
2035	if (!cachep->num) {
2036		printk("kmem_cache_create: couldn't create cache %s.\n", name);
2037		kmem_cache_free(&cache_cache, cachep);
2038		cachep = NULL;
2039		goto oops;
2040	}
2041	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
2042			  + sizeof(struct slab), align);
2043
2044	/*
2045	 * If the slab has been placed off-slab, and we have enough space then
2046	 * move it on-slab. This is at the expense of any extra colouring.
2047	 */
2048	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
2049		flags &= ~CFLGS_OFF_SLAB;
2050		left_over -= slab_size;
2051	}
2052
2053	if (flags & CFLGS_OFF_SLAB) {
2054		/* really off slab. No need for manual alignment */
2055		slab_size =
2056		    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
2057	}
2058
2059	cachep->colour_off = cache_line_size();
2060	/* Offset must be a multiple of the alignment. */
2061	if (cachep->colour_off < align)
2062		cachep->colour_off = align;
2063	cachep->colour = left_over / cachep->colour_off;
2064	cachep->slab_size = slab_size;
2065	cachep->flags = flags;
2066	cachep->gfpflags = 0;
2067	if (flags & SLAB_CACHE_DMA)
2068		cachep->gfpflags |= GFP_DMA;
2069	cachep->buffer_size = size;
2070
2071	if (flags & CFLGS_OFF_SLAB)
2072		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
2073	cachep->ctor = ctor;
2074	cachep->dtor = dtor;
2075	cachep->name = name;
2076
2077
2078	setup_cpu_cache(cachep);
2079
2080	/* cache setup completed, link it into the list */
2081	list_add(&cachep->next, &cache_chain);
2082oops:
2083	if (!cachep && (flags & SLAB_PANIC))
2084		panic("kmem_cache_create(): failed to create slab `%s'\n",
2085		      name);
2086	mutex_unlock(&cache_chain_mutex);
2087	unlock_cpu_hotplug();
2088	return cachep;
2089}
2090EXPORT_SYMBOL(kmem_cache_create);
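
/*
 * Example (illustrative sketch only, not part of this file): a typical
 * caller creates its cache once at init time.  The 'struct foo' type,
 * foo_ctor() and foo_cache below are hypothetical.
 *
 *	struct foo {
 *		int bar;
 *	};
 *
 *	static struct kmem_cache *foo_cache;
 *
 *	static void foo_ctor(void *obj, struct kmem_cache *cachep,
 *			     unsigned long flags)
 *	{
 *		struct foo *f = obj;
 *
 *		f->bar = 0;
 *	}
 *
 *	static int __init foo_init(void)
 *	{
 *		foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
 *					      0, SLAB_HWCACHE_ALIGN,
 *					      foo_ctor, NULL);
 *		if (!foo_cache)
 *			return -ENOMEM;
 *		return 0;
 *	}
 */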
2091
2092#if DEBUG
2093static void check_irq_off(void)
2094{
2095	BUG_ON(!irqs_disabled());
2096}
2097
2098static void check_irq_on(void)
2099{
2100	BUG_ON(irqs_disabled());
2101}
2102
2103static void check_spinlock_acquired(struct kmem_cache *cachep)
2104{
2105#ifdef CONFIG_SMP
2106	check_irq_off();
2107	assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
2108#endif
2109}
2110
2111static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
2112{
2113#ifdef CONFIG_SMP
2114	check_irq_off();
2115	assert_spin_locked(&cachep->nodelists[node]->list_lock);
2116#endif
2117}
2118
2119#else
2120#define check_irq_off()	do { } while(0)
2121#define check_irq_on()	do { } while(0)
2122#define check_spinlock_acquired(x) do { } while(0)
2123#define check_spinlock_acquired_node(x, y) do { } while(0)
2124#endif
2125
2126static void drain_array_locked(struct kmem_cache *cachep,
2127			struct array_cache *ac, int force, int node);
2128
2129static void do_drain(void *arg)
2130{
2131	struct kmem_cache *cachep = arg;
2132	struct array_cache *ac;
2133	int node = numa_node_id();
2134
2135	check_irq_off();
2136	ac = cpu_cache_get(cachep);
2137	spin_lock(&cachep->nodelists[node]->list_lock);
2138	free_block(cachep, ac->entry, ac->avail, node);
2139	spin_unlock(&cachep->nodelists[node]->list_lock);
2140	ac->avail = 0;
2141}
2142
2143static void drain_cpu_caches(struct kmem_cache *cachep)
2144{
2145	struct kmem_list3 *l3;
2146	int node;
2147
2148	on_each_cpu(do_drain, cachep, 1, 1);
2149	check_irq_on();
2150	for_each_online_node(node) {
2151		l3 = cachep->nodelists[node];
2152		if (l3) {
2153			spin_lock_irq(&l3->list_lock);
2154			drain_array_locked(cachep, l3->shared, 1, node);
2155			spin_unlock_irq(&l3->list_lock);
2156			if (l3->alien)
2157				drain_alien_cache(cachep, l3->alien);
2158		}
2159	}
2160}
2161
2162static int __node_shrink(struct kmem_cache *cachep, int node)
2163{
2164	struct slab *slabp;
2165	struct kmem_list3 *l3 = cachep->nodelists[node];
2166	int ret;
2167
2168	for (;;) {
2169		struct list_head *p;
2170
2171		p = l3->slabs_free.prev;
2172		if (p == &l3->slabs_free)
2173			break;
2174
2175		slabp = list_entry(l3->slabs_free.prev, struct slab, list);
2176#if DEBUG
2177		if (slabp->inuse)
2178			BUG();
2179#endif
2180		list_del(&slabp->list);
2181
2182		l3->free_objects -= cachep->num;
2183		spin_unlock_irq(&l3->list_lock);
2184		slab_destroy(cachep, slabp);
2185		spin_lock_irq(&l3->list_lock);
2186	}
2187	ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial);
2188	return ret;
2189}
2190
2191static int __cache_shrink(struct kmem_cache *cachep)
2192{
2193	int ret = 0, i = 0;
2194	struct kmem_list3 *l3;
2195
2196	drain_cpu_caches(cachep);
2197
2198	check_irq_on();
2199	for_each_online_node(i) {
2200		l3 = cachep->nodelists[i];
2201		if (l3) {
2202			spin_lock_irq(&l3->list_lock);
2203			ret += __node_shrink(cachep, i);
2204			spin_unlock_irq(&l3->list_lock);
2205		}
2206	}
2207	return (ret ? 1 : 0);
2208}
2209
2210/**
2211 * kmem_cache_shrink - Shrink a cache.
2212 * @cachep: The cache to shrink.
2213 *
2214 * Releases as many slabs as possible for a cache.
2215 * To help debugging, a zero exit status indicates all slabs were released.
2216 */
2217int kmem_cache_shrink(struct kmem_cache *cachep)
2218{
2219	if (!cachep || in_interrupt())
2220		BUG();
2221
2222	return __cache_shrink(cachep);
2223}
2224EXPORT_SYMBOL(kmem_cache_shrink);
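
/*
 * Example (illustrative sketch only): a non-zero return means some slabs
 * could not be released because objects are still allocated.  foo_cache is
 * the hypothetical cache from the kmem_cache_create() example above.
 *
 *	if (kmem_cache_shrink(foo_cache) != 0)
 *		printk(KERN_INFO "foo_cache still has active objects\n");
 */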
2225
2226/**
2227 * kmem_cache_destroy - delete a cache
2228 * @cachep: the cache to destroy
2229 *
2230 * Remove a struct kmem_cache object from the slab cache.
2231 * Returns 0 on success.
2232 *
2233 * It is expected this function will be called by a module when it is
2234 * unloaded.  This will remove the cache completely, and avoid a duplicate
2235 * cache being allocated each time a module is loaded and unloaded, if the
2236 * module doesn't have persistent in-kernel storage across loads and unloads.
2237 *
2238 * The cache must be empty before calling this function.
2239 *
2240 * The caller must guarantee that no one will allocate memory from the cache
2241 * during the kmem_cache_destroy().
2242 */
2243int kmem_cache_destroy(struct kmem_cache *cachep)
2244{
2245	int i;
2246	struct kmem_list3 *l3;
2247
2248	if (!cachep || in_interrupt())
2249		BUG();
2250
2251	/* Don't let CPUs come and go */
2252	lock_cpu_hotplug();
2253
2254	/* Find the cache in the chain of caches. */
2255	mutex_lock(&cache_chain_mutex);
2256	/*
2257	 * the chain is never empty, cache_cache is never destroyed
2258	 */
2259	list_del(&cachep->next);
2260	mutex_unlock(&cache_chain_mutex);
2261
2262	if (__cache_shrink(cachep)) {
2263		slab_error(cachep, "Can't free all objects");
2264		mutex_lock(&cache_chain_mutex);
2265		list_add(&cachep->next, &cache_chain);
2266		mutex_unlock(&cache_chain_mutex);
2267		unlock_cpu_hotplug();
2268		return 1;
2269	}
2270
2271	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2272		synchronize_rcu();
2273
2274	for_each_online_cpu(i)
2275	    kfree(cachep->array[i]);
2276
2277	/* NUMA: free the list3 structures */
2278	for_each_online_node(i) {
2279		l3 = cachep->nodelists[i];
2280		if (l3) {
2281			kfree(l3->shared);
2282			free_alien_cache(l3->alien);
2283			kfree(l3);
2284		}
2285	}
2286	kmem_cache_free(&cache_cache, cachep);
2287	unlock_cpu_hotplug();
2288	return 0;
2289}
2290EXPORT_SYMBOL(kmem_cache_destroy);
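
/*
 * Example (illustrative sketch only): a module destroys its cache in its
 * exit path, once every object has been freed.  foo_cache is the
 * hypothetical cache from the kmem_cache_create() example above.
 *
 *	static void __exit foo_exit(void)
 *	{
 *		if (kmem_cache_destroy(foo_cache))
 *			printk(KERN_ERR "foo_cache: objects still in use\n");
 *	}
 */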
2291
2292/* Get the memory for a slab management obj. */
2293static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2294				   int colour_off, gfp_t local_flags)
2295{
2296	struct slab *slabp;
2297
2298	if (OFF_SLAB(cachep)) {
2299		/* Slab management obj is off-slab. */
2300		slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags);
2301		if (!slabp)
2302			return NULL;
2303	} else {
2304		slabp = objp + colour_off;
2305		colour_off += cachep->slab_size;
2306	}
2307	slabp->inuse = 0;
2308	slabp->colouroff = colour_off;
2309	slabp->s_mem = objp + colour_off;
2310	return slabp;
2311}
2312
2313static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2314{
2315	return (kmem_bufctl_t *) (slabp + 1);
2316}
2317
2318static void cache_init_objs(struct kmem_cache *cachep,
2319			    struct slab *slabp, unsigned long ctor_flags)
2320{
2321	int i;
2322
2323	for (i = 0; i < cachep->num; i++) {
2324		void *objp = index_to_obj(cachep, slabp, i);
2325#if DEBUG
2326		/* need to poison the objs? */
2327		if (cachep->flags & SLAB_POISON)
2328			poison_obj(cachep, objp, POISON_FREE);
2329		if (cachep->flags & SLAB_STORE_USER)
2330			*dbg_userword(cachep, objp) = NULL;
2331
2332		if (cachep->flags & SLAB_RED_ZONE) {
2333			*dbg_redzone1(cachep, objp) = RED_INACTIVE;
2334			*dbg_redzone2(cachep, objp) = RED_INACTIVE;
2335		}
2336		/*
2337		 * Constructors are not allowed to allocate memory from the same
2338		 * cache which they are a constructor for.  Otherwise, deadlock.
2339		 * They must also be threaded.
2340		 */
2341		if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2342			cachep->ctor(objp + obj_offset(cachep), cachep,
2343				     ctor_flags);
2344
2345		if (cachep->flags & SLAB_RED_ZONE) {
2346			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2347				slab_error(cachep, "constructor overwrote the"
2348					   " end of an object");
2349			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2350				slab_error(cachep, "constructor overwrote the"
2351					   " start of an object");
2352		}
2353		if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
2354			    OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2355			kernel_map_pages(virt_to_page(objp),
2356					 cachep->buffer_size / PAGE_SIZE, 0);
2357#else
2358		if (cachep->ctor)
2359			cachep->ctor(objp, cachep, ctor_flags);
2360#endif
2361		slab_bufctl(slabp)[i] = i + 1;
2362	}
2363	slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2364	slabp->free = 0;
2365}
2366
2367static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2368{
2369	if (flags & SLAB_DMA)
2370		BUG_ON(!(cachep->gfpflags & GFP_DMA));
2371	else
2372		BUG_ON(cachep->gfpflags & GFP_DMA);
2373}
2374
2375static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
2376				int nodeid)
2377{
2378	void *objp = index_to_obj(cachep, slabp, slabp->free);
2379	kmem_bufctl_t next;
2380
2381	slabp->inuse++;
2382	next = slab_bufctl(slabp)[slabp->free];
2383#if DEBUG
2384	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2385	WARN_ON(slabp->nodeid != nodeid);
2386#endif
2387	slabp->free = next;
2388
2389	return objp;
2390}
2391
2392static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2393				void *objp, int nodeid)
2394{
2395	unsigned int objnr = obj_to_index(cachep, slabp, objp);
2396
2397#if DEBUG
2398	/* Verify that the slab belongs to the intended node */
2399	WARN_ON(slabp->nodeid != nodeid);
2400
2401	if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
2402		printk(KERN_ERR "slab: double free detected in cache "
2403				"'%s', objp %p\n", cachep->name, objp);
2404		BUG();
2405	}
2406#endif
2407	slab_bufctl(slabp)[objnr] = slabp->free;
2408	slabp->free = objnr;
2409	slabp->inuse--;
2410}
2411
2412static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp,
2413			void *objp)
2414{
2415	int i;
2416	struct page *page;
2417
2418	/* Nasty!!!!!! I hope this is OK. */
2419	page = virt_to_page(objp);
2420
2421	i = 1;
2422	if (likely(!PageCompound(page)))
2423		i <<= cachep->gfporder;
2424	do {
2425		page_set_cache(page, cachep);
2426		page_set_slab(page, slabp);
2427		page++;
2428	} while (--i);
2429}
2430
2431/*
2432 * Grow (by 1) the number of slabs within a cache.  This is called by
2433 * kmem_cache_alloc() when there are no active objs left in a cache.
2434 */
2435static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2436{
2437	struct slab *slabp;
2438	void *objp;
2439	size_t offset;
2440	gfp_t local_flags;
2441	unsigned long ctor_flags;
2442	struct kmem_list3 *l3;
2443
2444	/*
2445	 * Be lazy and only check for valid flags here, keeping it out of the
2446	 * critical path in kmem_cache_alloc().
2447	 */
2448	if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW))
2449		BUG();
2450	if (flags & SLAB_NO_GROW)
2451		return 0;
2452
2453	ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2454	local_flags = (flags & SLAB_LEVEL_MASK);
2455	if (!(local_flags & __GFP_WAIT))
2456		/*
2457		 * Not allowed to sleep.  Need to tell a constructor about
2458		 * this - it might need to know...
2459		 */
2460		ctor_flags |= SLAB_CTOR_ATOMIC;
2461
2462	/* Take the l3 list lock to change the colour_next on this node */
2463	check_irq_off();
2464	l3 = cachep->nodelists[nodeid];
2465	spin_lock(&l3->list_lock);
2466
2467	/* Get colour for the slab, and calculate the next value. */
2468	offset = l3->colour_next;
2469	l3->colour_next++;
2470	if (l3->colour_next >= cachep->colour)
2471		l3->colour_next = 0;
2472	spin_unlock(&l3->list_lock);
2473
2474	offset *= cachep->colour_off;
2475
2476	if (local_flags & __GFP_WAIT)
2477		local_irq_enable();
2478
2479	/*
2480	 * The test for missing atomic flag is performed here, rather than
2481	 * the more obvious place, simply to reduce the critical path length
2482	 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
2483	 * will eventually be caught here (where it matters).
2484	 */
2485	kmem_flagcheck(cachep, flags);
2486
2487	/*
2488	 * Get mem for the objs.  Attempt to allocate a physical page from
2489	 * 'nodeid'.
2490	 */
2491	objp = kmem_getpages(cachep, flags, nodeid);
2492	if (!objp)
2493		goto failed;
2494
2495	/* Get slab management. */
2496	slabp = alloc_slabmgmt(cachep, objp, offset, local_flags);
2497	if (!slabp)
2498		goto opps1;
2499
2500	slabp->nodeid = nodeid;
2501	set_slab_attr(cachep, slabp, objp);
2502
2503	cache_init_objs(cachep, slabp, ctor_flags);
2504
2505	if (local_flags & __GFP_WAIT)
2506		local_irq_disable();
2507	check_irq_off();
2508	spin_lock(&l3->list_lock);
2509
2510	/* Make slab active. */
2511	list_add_tail(&slabp->list, &(l3->slabs_free));
2512	STATS_INC_GROWN(cachep);
2513	l3->free_objects += cachep->num;
2514	spin_unlock(&l3->list_lock);
2515	return 1;
2516opps1:
2517	kmem_freepages(cachep, objp);
2518failed:
2519	if (local_flags & __GFP_WAIT)
2520		local_irq_disable();
2521	return 0;
2522}
2523
2524#if DEBUG
2525
2526/*
2527 * Perform extra freeing checks:
2528 * - detect bad pointers.
2529 * - POISON/RED_ZONE checking
2530 * - destructor calls, for caches with POISON+dtor
2531 */
2532static void kfree_debugcheck(const void *objp)
2533{
2534	struct page *page;
2535
2536	if (!virt_addr_valid(objp)) {
2537		printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2538		       (unsigned long)objp);
2539		BUG();
2540	}
2541	page = virt_to_page(objp);
2542	if (!PageSlab(page)) {
2543		printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n",
2544		       (unsigned long)objp);
2545		BUG();
2546	}
2547}
2548
2549static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2550				   void *caller)
2551{
2552	struct page *page;
2553	unsigned int objnr;
2554	struct slab *slabp;
2555
2556	objp -= obj_offset(cachep);
2557	kfree_debugcheck(objp);
2558	page = virt_to_page(objp);
2559
2560	if (page_get_cache(page) != cachep) {
2561		printk(KERN_ERR "mismatch in kmem_cache_free: expected "
2562				"cache %p, got %p\n",
2563		       page_get_cache(page), cachep);
2564		printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
2565		printk(KERN_ERR "%p is %s.\n", page_get_cache(page),
2566		       page_get_cache(page)->name);
2567		WARN_ON(1);
2568	}
2569	slabp = page_get_slab(page);
2570
2571	if (cachep->flags & SLAB_RED_ZONE) {
2572		if (*dbg_redzone1(cachep, objp) != RED_ACTIVE ||
2573				*dbg_redzone2(cachep, objp) != RED_ACTIVE) {
2574			slab_error(cachep, "double free, or memory outside"
2575						" object was overwritten");
2576			printk(KERN_ERR "%p: redzone 1:0x%lx, "
2577					"redzone 2:0x%lx.\n",
2578			       objp, *dbg_redzone1(cachep, objp),
2579			       *dbg_redzone2(cachep, objp));
2580		}
2581		*dbg_redzone1(cachep, objp) = RED_INACTIVE;
2582		*dbg_redzone2(cachep, objp) = RED_INACTIVE;
2583	}
2584	if (cachep->flags & SLAB_STORE_USER)
2585		*dbg_userword(cachep, objp) = caller;
2586
2587	objnr = obj_to_index(cachep, slabp, objp);
2588
2589	BUG_ON(objnr >= cachep->num);
2590	BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
2591
2592	if (cachep->flags & SLAB_DEBUG_INITIAL) {
2593		/*
2594		 * Need to call the slab's constructor so the caller can
2595		 * perform a verify of its state (debugging).  Called without
2596		 * the cache-lock held.
2597		 */
2598		cachep->ctor(objp + obj_offset(cachep),
2599			     cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
2600	}
2601	if (cachep->flags & SLAB_POISON && cachep->dtor) {
2602		/* We want to cache-poison the object;
2603		 * call the destruction callback.
2604		 */
2605		cachep->dtor(objp + obj_offset(cachep), cachep, 0);
2606	}
2607	if (cachep->flags & SLAB_POISON) {
2608#ifdef CONFIG_DEBUG_PAGEALLOC
2609		if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2610			store_stackinfo(cachep, objp, (unsigned long)caller);
2611			kernel_map_pages(virt_to_page(objp),
2612					 cachep->buffer_size / PAGE_SIZE, 0);
2613		} else {
2614			poison_obj(cachep, objp, POISON_FREE);
2615		}
2616#else
2617		poison_obj(cachep, objp, POISON_FREE);
2618#endif
2619	}
2620	return objp;
2621}
2622
2623static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
2624{
2625	kmem_bufctl_t i;
2626	int entries = 0;
2627
2628	/* Check slab's freelist to see if this obj is there. */
2629	for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
2630		entries++;
2631		if (entries > cachep->num || i >= cachep->num)
2632			goto bad;
2633	}
2634	if (entries != cachep->num - slabp->inuse) {
2635bad:
2636		printk(KERN_ERR "slab: Internal list corruption detected in "
2637				"cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2638			cachep->name, cachep->num, slabp, slabp->inuse);
2639		for (i = 0;
2640		     i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
2641		     i++) {
2642			if (i % 16 == 0)
2643				printk("\n%03x:", i);
2644			printk(" %02x", ((unsigned char *)slabp)[i]);
2645		}
2646		printk("\n");
2647		BUG();
2648	}
2649}
2650#else
2651#define kfree_debugcheck(x) do { } while(0)
2652#define cache_free_debugcheck(x,objp,z) (objp)
2653#define check_slabp(x,y) do { } while(0)
2654#endif
2655
2656static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2657{
2658	int batchcount;
2659	struct kmem_list3 *l3;
2660	struct array_cache *ac;
2661
2662	check_irq_off();
2663	ac = cpu_cache_get(cachep);
2664retry:
2665	batchcount = ac->batchcount;
2666	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2667		/*
2668		 * If there was little recent activity on this cache, then
2669		 * perform only a partial refill.  Otherwise we could generate
2670		 * refill bouncing.
2671		 */
2672		batchcount = BATCHREFILL_LIMIT;
2673	}
2674	l3 = cachep->nodelists[numa_node_id()];
2675
2676	BUG_ON(ac->avail > 0 || !l3);
2677	spin_lock(&l3->list_lock);
2678
2679	if (l3->shared) {
2680		struct array_cache *shared_array = l3->shared;
2681		if (shared_array->avail) {
2682			if (batchcount > shared_array->avail)
2683				batchcount = shared_array->avail;
2684			shared_array->avail -= batchcount;
2685			ac->avail = batchcount;
2686			memcpy(ac->entry,
2687			       &(shared_array->entry[shared_array->avail]),
2688			       sizeof(void *) * batchcount);
2689			shared_array->touched = 1;
2690			goto alloc_done;
2691		}
2692	}
2693	while (batchcount > 0) {
2694		struct list_head *entry;
2695		struct slab *slabp;
2696		/* Get the slab the allocation is to come from. */
2697		entry = l3->slabs_partial.next;
2698		if (entry == &l3->slabs_partial) {
2699			l3->free_touched = 1;
2700			entry = l3->slabs_free.next;
2701			if (entry == &l3->slabs_free)
2702				goto must_grow;
2703		}
2704
2705		slabp = list_entry(entry, struct slab, list);
2706		check_slabp(cachep, slabp);
2707		check_spinlock_acquired(cachep);
2708		while (slabp->inuse < cachep->num && batchcount--) {
2709			STATS_INC_ALLOCED(cachep);
2710			STATS_INC_ACTIVE(cachep);
2711			STATS_SET_HIGH(cachep);
2712
2713			ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
2714							    numa_node_id());
2715		}
2716		check_slabp(cachep, slabp);
2717
2718		/* move slabp to correct slabp list: */
2719		list_del(&slabp->list);
2720		if (slabp->free == BUFCTL_END)
2721			list_add(&slabp->list, &l3->slabs_full);
2722		else
2723			list_add(&slabp->list, &l3->slabs_partial);
2724	}
2725
2726must_grow:
2727	l3->free_objects -= ac->avail;
2728alloc_done:
2729	spin_unlock(&l3->list_lock);
2730
2731	if (unlikely(!ac->avail)) {
2732		int x;
2733		x = cache_grow(cachep, flags, numa_node_id());
2734
2735		/* cache_grow can reenable interrupts, then ac could change. */
2736		ac = cpu_cache_get(cachep);
2737		if (!x && ac->avail == 0)	/* no objects in sight? abort */
2738			return NULL;
2739
2740		if (!ac->avail)		/* objects refilled by interrupt? */
2741			goto retry;
2742	}
2743	ac->touched = 1;
2744	return ac->entry[--ac->avail];
2745}
2746
2747static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
2748						gfp_t flags)
2749{
2750	might_sleep_if(flags & __GFP_WAIT);
2751#if DEBUG
2752	kmem_flagcheck(cachep, flags);
2753#endif
2754}
2755
2756#if DEBUG
2757static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
2758				gfp_t flags, void *objp, void *caller)
2759{
2760	if (!objp)
2761		return objp;
2762	if (cachep->flags & SLAB_POISON) {
2763#ifdef CONFIG_DEBUG_PAGEALLOC
2764		if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
2765			kernel_map_pages(virt_to_page(objp),
2766					 cachep->buffer_size / PAGE_SIZE, 1);
2767		else
2768			check_poison_obj(cachep, objp);
2769#else
2770		check_poison_obj(cachep, objp);
2771#endif
2772		poison_obj(cachep, objp, POISON_INUSE);
2773	}
2774	if (cachep->flags & SLAB_STORE_USER)
2775		*dbg_userword(cachep, objp) = caller;
2776
2777	if (cachep->flags & SLAB_RED_ZONE) {
2778		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
2779				*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
2780			slab_error(cachep, "double free, or memory outside"
2781						" object was overwritten");
2782			printk(KERN_ERR
2783				"%p: redzone 1:0x%lx, redzone 2:0x%lx\n",
2784				objp, *dbg_redzone1(cachep, objp),
2785				*dbg_redzone2(cachep, objp));
2786		}
2787		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
2788		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
2789	}
2790	objp += obj_offset(cachep);
2791	if (cachep->ctor && cachep->flags & SLAB_POISON) {
2792		unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2793
2794		if (!(flags & __GFP_WAIT))
2795			ctor_flags |= SLAB_CTOR_ATOMIC;
2796
2797		cachep->ctor(objp, cachep, ctor_flags);
2798	}
2799	return objp;
2800}
2801#else
2802#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
2803#endif
2804
2805static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2806{
2807	void *objp;
2808	struct array_cache *ac;
2809
2810#ifdef CONFIG_NUMA
2811	if (unlikely(current->mempolicy && !in_interrupt())) {
2812		int nid = slab_node(current->mempolicy);
2813
2814		if (nid != numa_node_id())
2815			return __cache_alloc_node(cachep, flags, nid);
2816	}
2817#endif
2818
2819	check_irq_off();
2820	ac = cpu_cache_get(cachep);
2821	if (likely(ac->avail)) {
2822		STATS_INC_ALLOCHIT(cachep);
2823		ac->touched = 1;
2824		objp = ac->entry[--ac->avail];
2825	} else {
2826		STATS_INC_ALLOCMISS(cachep);
2827		objp = cache_alloc_refill(cachep, flags);
2828	}
2829	return objp;
2830}
2831
2832static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
2833						gfp_t flags, void *caller)
2834{
2835	unsigned long save_flags;
2836	void *objp;
2837
2838	cache_alloc_debugcheck_before(cachep, flags);
2839
2840	local_irq_save(save_flags);
2841	objp = ____cache_alloc(cachep, flags);
2842	local_irq_restore(save_flags);
2843	objp = cache_alloc_debugcheck_after(cachep, flags, objp,
2844					    caller);
2845	prefetchw(objp);
2846	return objp;
2847}
2848
2849#ifdef CONFIG_NUMA
2850/*
2851 * An interface to enable slab creation on nodeid
2852 */
2853static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
2854				int nodeid)
2855{
2856	struct list_head *entry;
2857	struct slab *slabp;
2858	struct kmem_list3 *l3;
2859	void *obj;
2860	int x;
2861
2862	l3 = cachep->nodelists[nodeid];
2863	BUG_ON(!l3);
2864
2865retry:
2866	check_irq_off();
2867	spin_lock(&l3->list_lock);
2868	entry = l3->slabs_partial.next;
2869	if (entry == &l3->slabs_partial) {
2870		l3->free_touched = 1;
2871		entry = l3->slabs_free.next;
2872		if (entry == &l3->slabs_free)
2873			goto must_grow;
2874	}
2875
2876	slabp = list_entry(entry, struct slab, list);
2877	check_spinlock_acquired_node(cachep, nodeid);
2878	check_slabp(cachep, slabp);
2879
2880	STATS_INC_NODEALLOCS(cachep);
2881	STATS_INC_ACTIVE(cachep);
2882	STATS_SET_HIGH(cachep);
2883
2884	BUG_ON(slabp->inuse == cachep->num);
2885
2886	obj = slab_get_obj(cachep, slabp, nodeid);
2887	check_slabp(cachep, slabp);
2888	l3->free_objects--;
2889	/* move slabp to correct slabp list: */
2890	list_del(&slabp->list);
2891
2892	if (slabp->free == BUFCTL_END)
2893		list_add(&slabp->list, &l3->slabs_full);
2894	else
2895		list_add(&slabp->list, &l3->slabs_partial);
2896
2897	spin_unlock(&l3->list_lock);
2898	goto done;
2899
2900must_grow:
2901	spin_unlock(&l3->list_lock);
2902	x = cache_grow(cachep, flags, nodeid);
2903
2904	if (!x)
2905		return NULL;
2906
2907	goto retry;
2908done:
2909	return obj;
2910}
2911#endif
2912
2913/*
2914 * Caller needs to acquire correct kmem_list's list_lock
2915 */
2916static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
2917		       int node)
2918{
2919	int i;
2920	struct kmem_list3 *l3;
2921
2922	for (i = 0; i < nr_objects; i++) {
2923		void *objp = objpp[i];
2924		struct slab *slabp;
2925
2926		slabp = virt_to_slab(objp);
2927		l3 = cachep->nodelists[node];
2928		list_del(&slabp->list);
2929		check_spinlock_acquired_node(cachep, node);
2930		check_slabp(cachep, slabp);
2931		slab_put_obj(cachep, slabp, objp, node);
2932		STATS_DEC_ACTIVE(cachep);
2933		l3->free_objects++;
2934		check_slabp(cachep, slabp);
2935
2936		/* fixup slab chains */
2937		if (slabp->inuse == 0) {
2938			if (l3->free_objects > l3->free_limit) {
2939				l3->free_objects -= cachep->num;
2940				slab_destroy(cachep, slabp);
2941			} else {
2942				list_add(&slabp->list, &l3->slabs_free);
2943			}
2944		} else {
2945			/* Unconditionally move a slab to the end of the
2946			 * partial list on free so that the remaining
2947			 * objects get the maximum time to be freed, too.
2948			 */
2949			list_add_tail(&slabp->list, &l3->slabs_partial);
2950		}
2951	}
2952}
2953
2954static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
2955{
2956	int batchcount;
2957	struct kmem_list3 *l3;
2958	int node = numa_node_id();
2959
2960	batchcount = ac->batchcount;
2961#if DEBUG
2962	BUG_ON(!batchcount || batchcount > ac->avail);
2963#endif
2964	check_irq_off();
2965	l3 = cachep->nodelists[node];
2966	spin_lock(&l3->list_lock);
2967	if (l3->shared) {
2968		struct array_cache *shared_array = l3->shared;
2969		int max = shared_array->limit - shared_array->avail;
2970		if (max) {
2971			if (batchcount > max)
2972				batchcount = max;
2973			memcpy(&(shared_array->entry[shared_array->avail]),
2974			       ac->entry, sizeof(void *) * batchcount);
2975			shared_array->avail += batchcount;
2976			goto free_done;
2977		}
2978	}
2979
2980	free_block(cachep, ac->entry, batchcount, node);
2981free_done:
2982#if STATS
2983	{
2984		int i = 0;
2985		struct list_head *p;
2986
2987		p = l3->slabs_free.next;
2988		while (p != &(l3->slabs_free)) {
2989			struct slab *slabp;
2990
2991			slabp = list_entry(p, struct slab, list);
2992			BUG_ON(slabp->inuse);
2993
2994			i++;
2995			p = p->next;
2996		}
2997		STATS_SET_FREEABLE(cachep, i);
2998	}
2999#endif
3000	spin_unlock(&l3->list_lock);
3001	ac->avail -= batchcount;
3002	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
3003}
3004
3005/*
3006 * Release an obj back to its cache. If the obj has a constructed state, it must
3007 * be in this state _before_ it is released.  Called with disabled ints.
3008 */
3009static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3010{
3011	struct array_cache *ac = cpu_cache_get(cachep);
3012
3013	check_irq_off();
3014	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3015
3016	/* Make sure we are not freeing an object from another
3017	 * node to the array cache on this cpu.
3018	 */
3019#ifdef CONFIG_NUMA
3020	{
3021		struct slab *slabp;
3022		slabp = virt_to_slab(objp);
3023		if (unlikely(slabp->nodeid != numa_node_id())) {
3024			struct array_cache *alien = NULL;
3025			int nodeid = slabp->nodeid;
3026			struct kmem_list3 *l3;
3027
3028			l3 = cachep->nodelists[numa_node_id()];
3029			STATS_INC_NODEFREES(cachep);
3030			if (l3->alien && l3->alien[nodeid]) {
3031				alien = l3->alien[nodeid];
3032				spin_lock(&alien->lock);
3033				if (unlikely(alien->avail == alien->limit))
3034					__drain_alien_cache(cachep,
3035							    alien, nodeid);
3036				alien->entry[alien->avail++] = objp;
3037				spin_unlock(&alien->lock);
3038			} else {
3039				spin_lock(&(cachep->nodelists[nodeid])->
3040					  list_lock);
3041				free_block(cachep, &objp, 1, nodeid);
3042				spin_unlock(&(cachep->nodelists[nodeid])->
3043					    list_lock);
3044			}
3045			return;
3046		}
3047	}
3048#endif
3049	if (likely(ac->avail < ac->limit)) {
3050		STATS_INC_FREEHIT(cachep);
3051		ac->entry[ac->avail++] = objp;
3052		return;
3053	} else {
3054		STATS_INC_FREEMISS(cachep);
3055		cache_flusharray(cachep, ac);
3056		ac->entry[ac->avail++] = objp;
3057	}
3058}
3059
3060/**
3061 * kmem_cache_alloc - Allocate an object
3062 * @cachep: The cache to allocate from.
3063 * @flags: See kmalloc().
3064 *
3065 * Allocate an object from this cache.  The flags are only relevant
3066 * if the cache has no available objects.
3067 */
3068void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3069{
3070	return __cache_alloc(cachep, flags, __builtin_return_address(0));
3071}
3072EXPORT_SYMBOL(kmem_cache_alloc);
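
/*
 * Example (illustrative sketch only): allocate an object from the
 * hypothetical foo_cache created above and return it with
 * kmem_cache_free() when done.
 *
 *	struct foo *f;
 *
 *	f = kmem_cache_alloc(foo_cache, GFP_KERNEL);
 *	if (!f)
 *		return -ENOMEM;
 *	...
 *	kmem_cache_free(foo_cache, f);
 */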
3073
3074/**
3075 * kmem_ptr_validate - check if an untrusted pointer might
3076 *	be a slab entry.
3077 * @cachep: the cache we're checking against
3078 * @ptr: pointer to validate
3079 *
3080 * This verifies that the untrusted pointer looks sane:
3081 * it is _not_ a guarantee that the pointer is actually
3082 * part of the slab cache in question, but it at least
3083 * validates that the pointer can be dereferenced and
3084 * looks half-way sane.
3085 *
3086 * Currently only used for dentry validation.
3087 */
3088int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr)
3089{
3090	unsigned long addr = (unsigned long)ptr;
3091	unsigned long min_addr = PAGE_OFFSET;
3092	unsigned long align_mask = BYTES_PER_WORD - 1;
3093	unsigned long size = cachep->buffer_size;
3094	struct page *page;
3095
3096	if (unlikely(addr < min_addr))
3097		goto out;
3098	if (unlikely(addr > (unsigned long)high_memory - size))
3099		goto out;
3100	if (unlikely(addr & align_mask))
3101		goto out;
3102	if (unlikely(!kern_addr_valid(addr)))
3103		goto out;
3104	if (unlikely(!kern_addr_valid(addr + size - 1)))
3105		goto out;
3106	page = virt_to_page(ptr);
3107	if (unlikely(!PageSlab(page)))
3108		goto out;
3109	if (unlikely(page_get_cache(page) != cachep))
3110		goto out;
3111	return 1;
3112out:
3113	return 0;
3114}
3115
3116#ifdef CONFIG_NUMA
3117/**
3118 * kmem_cache_alloc_node - Allocate an object on the specified node
3119 * @cachep: The cache to allocate from.
3120 * @flags: See kmalloc().
3121 * @nodeid: node number of the target node.
3122 *
3123 * Identical to kmem_cache_alloc, except that this function is slow
3124 * and can sleep. And it will allocate memory on the given node, which
3125 * can improve the performance for cpu bound structures.
3126 * New and improved: it will now make sure that the object gets
3127 * put on the correct node list so that there is no false sharing.
3128 */
3129void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3130{
3131	unsigned long save_flags;
3132	void *ptr;
3133
3134	cache_alloc_debugcheck_before(cachep, flags);
3135	local_irq_save(save_flags);
3136
3137	if (nodeid == -1 || nodeid == numa_node_id() ||
3138			!cachep->nodelists[nodeid])
3139		ptr = ____cache_alloc(cachep, flags);
3140	else
3141		ptr = __cache_alloc_node(cachep, flags, nodeid);
3142	local_irq_restore(save_flags);
3143
3144	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr,
3145					   __builtin_return_address(0));
3146
3147	return ptr;
3148}
3149EXPORT_SYMBOL(kmem_cache_alloc_node);
3150
3151void *kmalloc_node(size_t size, gfp_t flags, int node)
3152{
3153	struct kmem_cache *cachep;
3154
3155	cachep = kmem_find_general_cachep(size, flags);
3156	if (unlikely(cachep == NULL))
3157		return NULL;
3158	return kmem_cache_alloc_node(cachep, flags, node);
3159}
3160EXPORT_SYMBOL(kmalloc_node);
3161#endif
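
/*
 * Example (illustrative sketch only, CONFIG_NUMA): allocate node-local
 * memory for a per-node control structure.  'struct foo' and 'nid' are
 * hypothetical; passing nodeid == -1 falls back to a normal local
 * allocation.
 *
 *	struct foo *f;
 *
 *	f = kmalloc_node(sizeof(*f), GFP_KERNEL, nid);
 *	if (!f)
 *		return -ENOMEM;
 */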
3162
3163/**
3164 * kmalloc - allocate memory
3165 * @size: how many bytes of memory are required.
3166 * @flags: the type of memory to allocate.
3167 * @caller: function caller for debug tracking of the caller
3168 *
3169 * kmalloc is the normal method of allocating memory
3170 * in the kernel.
3171 *
3172 * The @flags argument may be one of:
3173 *
3174 * %GFP_USER - Allocate memory on behalf of user.  May sleep.
3175 *
3176 * %GFP_KERNEL - Allocate normal kernel ram.  May sleep.
3177 *
3178 * %GFP_ATOMIC - Allocation will not sleep.  Use inside interrupt handlers.
3179 *
3180 * Additionally, the %GFP_DMA flag may be set to indicate the memory
3181 * must be suitable for DMA.  This can mean different things on different
3182 * platforms.  For example, on i386, it means that the memory must come
3183 * from the first 16MB.
3184 */
3185static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3186					  void *caller)
3187{
3188	struct kmem_cache *cachep;
3189
3190	/* If you want to save a few bytes of .text space: replace
3191	 * __ with kmem_.
3192	 * Then kmalloc uses the uninlined functions instead of the inline
3193	 * functions.
3194	 */
3195	cachep = __find_general_cachep(size, flags);
3196	if (unlikely(cachep == NULL))
3197		return NULL;
3198	return __cache_alloc(cachep, flags, caller);
3199}
3200
3201#ifndef CONFIG_DEBUG_SLAB
3202
3203void *__kmalloc(size_t size, gfp_t flags)
3204{
3205	return __do_kmalloc(size, flags, NULL);
3206}
3207EXPORT_SYMBOL(__kmalloc);
3208
3209#else
3210
3211void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
3212{
3213	return __do_kmalloc(size, flags, caller);
3214}
3215EXPORT_SYMBOL(__kmalloc_track_caller);
3216
3217#endif
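
/*
 * Example (illustrative sketch only): the usual kmalloc()/kfree() pair.
 * GFP_ATOMIC would be used instead of GFP_KERNEL from interrupt context.
 *
 *	char *buf;
 *
 *	buf = kmalloc(128, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOMEM;
 *	...
 *	kfree(buf);
 */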
3218
3219#ifdef CONFIG_SMP
3220/**
3221 * __alloc_percpu - allocate one copy of the object for every present
3222 * cpu in the system, zeroing them.
3223 * Objects should be dereferenced using the per_cpu_ptr macro only.
3224 *
3225 * @size: how many bytes of memory are required.
3226 */
3227void *__alloc_percpu(size_t size)
3228{
3229	int i;
3230	struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL);
3231
3232	if (!pdata)
3233		return NULL;
3234
3235	/*
3236	 * Cannot use for_each_online_cpu since a cpu may come online
3237	 * later, and we would have no way of fixing up the array we
3238	 * have already allocated by then.
3239	 */
3240	for_each_cpu(i) {
3241		int node = cpu_to_node(i);
3242
3243		if (node_online(node))
3244			pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node);
3245		else
3246			pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
3247
3248		if (!pdata->ptrs[i])
3249			goto unwind_oom;
3250		memset(pdata->ptrs[i], 0, size);
3251	}
3252
3253	/* Catch derefs w/o wrappers */
3254	return (void *)(~(unsigned long)pdata);
3255
3256unwind_oom:
3257	while (--i >= 0) {
3258		if (!cpu_possible(i))
3259			continue;
3260		kfree(pdata->ptrs[i]);
3261	}
3262	kfree(pdata);
3263	return NULL;
3264}
3265EXPORT_SYMBOL(__alloc_percpu);
3266#endif
3267
3268/**
3269 * kmem_cache_free - Deallocate an object
3270 * @cachep: The cache the allocation was from.
3271 * @objp: The previously allocated object.
3272 *
3273 * Free an object which was previously allocated from this
3274 * cache.
3275 */
3276void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3277{
3278	unsigned long flags;
3279
3280	local_irq_save(flags);
3281	__cache_free(cachep, objp);
3282	local_irq_restore(flags);
3283}
3284EXPORT_SYMBOL(kmem_cache_free);
3285
3286/**
3287 * kfree - free previously allocated memory
3288 * @objp: pointer returned by kmalloc.
3289 *
3290 * If @objp is NULL, no operation is performed.
3291 *
3292 * Don't free memory not originally allocated by kmalloc()
3293 * or you will run into trouble.
3294 */
3295void kfree(const void *objp)
3296{
3297	struct kmem_cache *c;
3298	unsigned long flags;
3299
3300	if (unlikely(!objp))
3301		return;
3302	local_irq_save(flags);
3303	kfree_debugcheck(objp);
3304	c = virt_to_cache(objp);
3305	mutex_debug_check_no_locks_freed(objp, obj_size(c));
3306	__cache_free(c, (void *)objp);
3307	local_irq_restore(flags);
3308}
3309EXPORT_SYMBOL(kfree);
3310
3311#ifdef CONFIG_SMP
3312/**
3313 * free_percpu - free previously allocated percpu memory
3314 * @objp: pointer returned by alloc_percpu.
3315 *
3316 * Don't free memory not originally allocated by alloc_percpu().
3317 * The complemented objp is used to check for that.
3318 */
3319void free_percpu(const void *objp)
3320{
3321	int i;
3322	struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp);
3323
3324	/*
3325	 * We allocate for all cpus, so we cannot use for_each_online_cpu() here.
3326	 */
3327	for_each_cpu(i)
3328	    kfree(p->ptrs[i]);
3329	kfree(p);
3330}
3331EXPORT_SYMBOL(free_percpu);
3332#endif
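
/*
 * Example (illustrative sketch only, CONFIG_SMP): allocate a zeroed
 * per-cpu counter, bump this cpu's copy via per_cpu_ptr() (with
 * preemption disabled) and release everything with free_percpu().
 *
 *	long *counters;
 *
 *	counters = __alloc_percpu(sizeof(long));
 *	if (!counters)
 *		return -ENOMEM;
 *	(*per_cpu_ptr(counters, smp_processor_id()))++;
 *	...
 *	free_percpu(counters);
 */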
3333
3334unsigned int kmem_cache_size(struct kmem_cache *cachep)
3335{
3336	return obj_size(cachep);
3337}
3338EXPORT_SYMBOL(kmem_cache_size);
3339
3340const char *kmem_cache_name(struct kmem_cache *cachep)
3341{
3342	return cachep->name;
3343}
3344EXPORT_SYMBOL_GPL(kmem_cache_name);
3345
3346/*
3347 * This initializes kmem_list3 for all nodes.
3348 */
3349static int alloc_kmemlist(struct kmem_cache *cachep)
3350{
3351	int node;
3352	struct kmem_list3 *l3;
3353	int err = 0;
3354
3355	for_each_online_node(node) {
3356		struct array_cache *nc = NULL, *new;
3357		struct array_cache **new_alien = NULL;
3358#ifdef CONFIG_NUMA
3359		new_alien = alloc_alien_cache(node, cachep->limit);
3360		if (!new_alien)
3361			goto fail;
3362#endif
3363		new = alloc_arraycache(node, cachep->shared*cachep->batchcount,
3364					0xbaadf00d);
3365		if (!new)
3366			goto fail;
3367		l3 = cachep->nodelists[node];
3368		if (l3) {
3369			spin_lock_irq(&l3->list_lock);
3370
3371			nc = cachep->nodelists[node]->shared;
3372			if (nc)
3373				free_block(cachep, nc->entry, nc->avail, node);
3374
3375			l3->shared = new;
3376			if (!cachep->nodelists[node]->alien) {
3377				l3->alien = new_alien;
3378				new_alien = NULL;
3379			}
3380			l3->free_limit = (1 + nr_cpus_node(node)) *
3381					cachep->batchcount + cachep->num;
3382			spin_unlock_irq(&l3->list_lock);
3383			kfree(nc);
3384			free_alien_cache(new_alien);
3385			continue;
3386		}
3387		l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
3388		if (!l3)
3389			goto fail;
3390
3391		kmem_list3_init(l3);
3392		l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3393				((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3394		l3->shared = new;
3395		l3->alien = new_alien;
3396		l3->free_limit = (1 + nr_cpus_node(node)) *
3397					cachep->batchcount + cachep->num;
3398		cachep->nodelists[node] = l3;
3399	}
3400	return err;
3401fail:
3402	err = -ENOMEM;
3403	return err;
3404}
3405
3406struct ccupdate_struct {
3407	struct kmem_cache *cachep;
3408	struct array_cache *new[NR_CPUS];
3409};
3410
3411static void do_ccupdate_local(void *info)
3412{
3413	struct ccupdate_struct *new = info;
3414	struct array_cache *old;
3415
3416	check_irq_off();
3417	old = cpu_cache_get(new->cachep);
3418
3419	new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3420	new->new[smp_processor_id()] = old;
3421}
3422
3423/* Always called with the cache_chain_mutex held */
3424static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3425				int batchcount, int shared)
3426{
3427	struct ccupdate_struct new;
3428	int i, err;
3429
3430	memset(&new.new, 0, sizeof(new.new));
3431	for_each_online_cpu(i) {
3432		new.new[i] = alloc_arraycache(cpu_to_node(i), limit,
3433						batchcount);
3434		if (!new.new[i]) {
3435			for (i--; i >= 0; i--)
3436				kfree(new.new[i]);
3437			return -ENOMEM;
3438		}
3439	}
3440	new.cachep = cachep;
3441
3442	on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1);
3443
3444	check_irq_on();
3445	cachep->batchcount = batchcount;
3446	cachep->limit = limit;
3447	cachep->shared = shared;
3448
3449	for_each_online_cpu(i) {
3450		struct array_cache *ccold = new.new[i];
3451		if (!ccold)
3452			continue;
3453		spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3454		free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
3455		spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3456		kfree(ccold);
3457	}
3458
3459	err = alloc_kmemlist(cachep);
3460	if (err) {
3461		printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
3462		       cachep->name, -err);
3463		BUG();
3464	}
3465	return 0;
3466}
3467
3468/* Called with cache_chain_mutex held always */
3469static void enable_cpucache(struct kmem_cache *cachep)
3470{
3471	int err;
3472	int limit, shared;
3473
3474	/*
3475	 * The head array serves three purposes:
3476	 * - create a LIFO ordering, i.e. return objects that are cache-warm
3477	 * - reduce the number of spinlock operations.
3478	 * - reduce the number of linked list operations on the slab and
3479	 *   bufctl chains: array operations are cheaper.
3480	 * The numbers are guessed, we should auto-tune as described by
3481	 * Bonwick.
3482	 */
3483	if (cachep->buffer_size > 131072)
3484		limit = 1;
3485	else if (cachep->buffer_size > PAGE_SIZE)
3486		limit = 8;
3487	else if (cachep->buffer_size > 1024)
3488		limit = 24;
3489	else if (cachep->buffer_size > 256)
3490		limit = 54;
3491	else
3492		limit = 120;
3493
3494	/*
3495	 * CPU bound tasks (e.g. network routing) can exhibit cpu bound
3496	 * allocation behaviour: Most allocs on one cpu, most free operations
3497	 * on another cpu. For these cases, an efficient object passing between
3498	 * cpus is necessary. This is provided by a shared array. The array
3499	 * replaces Bonwick's magazine layer.
3500	 * On uniprocessor, it's functionally equivalent (but less efficient)
3501	 * to a larger limit. Thus disabled by default.
3502	 */
3503	shared = 0;
3504#ifdef CONFIG_SMP
3505	if (cachep->buffer_size <= PAGE_SIZE)
3506		shared = 8;
3507#endif
3508
3509#if DEBUG
3510	/*
3511	 * With debugging enabled, large batchcounts lead to excessively long
3512	 * periods with local interrupts disabled. Limit the batchcount.
3513	 */
3514	if (limit > 32)
3515		limit = 32;
3516#endif
3517	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
3518	if (err)
3519		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
3520		       cachep->name, -err);
3521}
3522
3523static void drain_array_locked(struct kmem_cache *cachep,
3524				struct array_cache *ac, int force, int node)
3525{
3526	int tofree;
3527
3528	check_spinlock_acquired_node(cachep, node);
3529	if (ac->touched && !force) {
3530		ac->touched = 0;
3531	} else if (ac->avail) {
3532		tofree = force ? ac->avail : (ac->limit + 4) / 5;
3533		if (tofree > ac->avail)
3534			tofree = (ac->avail + 1) / 2;
3535		free_block(cachep, ac->entry, tofree, node);
3536		ac->avail -= tofree;
3537		memmove(ac->entry, &(ac->entry[tofree]),
3538			sizeof(void *) * ac->avail);
3539	}
3540}
3541
3542/**
3543 * cache_reap - Reclaim memory from caches.
3544 * @unused: unused parameter
3545 *
3546 * Called from workqueue/eventd every few seconds.
3547 * Purpose:
3548 * - clear the per-cpu caches for this CPU.
3549 * - return freeable pages to the main free memory pool.
3550 *
3551 * If we cannot acquire the cache chain mutex then just give up - we'll try
3552 * again on the next iteration.
3553 */
3554static void cache_reap(void *unused)
3555{
3556	struct list_head *walk;
3557	struct kmem_list3 *l3;
3558
3559	if (!mutex_trylock(&cache_chain_mutex)) {
3560		/* Give up. Setup the next iteration. */
3561		schedule_delayed_work(&__get_cpu_var(reap_work),
3562				      REAPTIMEOUT_CPUC);
3563		return;
3564	}
3565
3566	list_for_each(walk, &cache_chain) {
3567		struct kmem_cache *searchp;
3568		struct list_head *p;
3569		int tofree;
3570		struct slab *slabp;
3571
3572		searchp = list_entry(walk, struct kmem_cache, next);
3573		check_irq_on();
3574
3575		l3 = searchp->nodelists[numa_node_id()];
3576		reap_alien(searchp, l3);
3577		spin_lock_irq(&l3->list_lock);
3578
3579		drain_array_locked(searchp, cpu_cache_get(searchp), 0,
3580				   numa_node_id());
3581
3582		if (time_after(l3->next_reap, jiffies))
3583			goto next_unlock;
3584
3585		l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
3586
3587		if (l3->shared)
3588			drain_array_locked(searchp, l3->shared, 0,
3589					   numa_node_id());
3590
3591		if (l3->free_touched) {
3592			l3->free_touched = 0;
3593			goto next_unlock;
3594		}
3595
3596		tofree = (l3->free_limit + 5 * searchp->num - 1) /
3597				(5 * searchp->num);
3598		do {
3599			p = l3->slabs_free.next;
3600			if (p == &(l3->slabs_free))
3601				break;
3602
3603			slabp = list_entry(p, struct slab, list);
3604			BUG_ON(slabp->inuse);
3605			list_del(&slabp->list);
3606			STATS_INC_REAPED(searchp);
3607
3608			/*
3609			 * Safe to drop the lock. The slab is no longer linked
3610			 * to the cache. searchp cannot disappear, we hold
3611			 * cache_chain_mutex.
3612			 */
3613			l3->free_objects -= searchp->num;
3614			spin_unlock_irq(&l3->list_lock);
3615			slab_destroy(searchp, slabp);
3616			spin_lock_irq(&l3->list_lock);
3617		} while (--tofree > 0);
3618next_unlock:
3619		spin_unlock_irq(&l3->list_lock);
3620		cond_resched();
3621	}
3622	check_irq_on();
3623	mutex_unlock(&cache_chain_mutex);
3624	next_reap_node();
3625	/* Set up the next iteration */
3626	schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
3627}
3628
3629#ifdef CONFIG_PROC_FS
3630
3631static void print_slabinfo_header(struct seq_file *m)
3632{
3633	/*
3634	 * Output format version, so at least we can change it
3635	 * without _too_ many complaints.
3636	 */
3637#if STATS
3638	seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
3639#else
3640	seq_puts(m, "slabinfo - version: 2.1\n");
3641#endif
3642	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
3643		 "<objperslab> <pagesperslab>");
3644	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
3645	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
3646#if STATS
3647	seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
3648		 "<error> <maxfreeable> <nodeallocs> <remotefrees>");
3649	seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
3650#endif
3651	seq_putc(m, '\n');
3652}
3653
3654static void *s_start(struct seq_file *m, loff_t *pos)
3655{
3656	loff_t n = *pos;
3657	struct list_head *p;
3658
3659	mutex_lock(&cache_chain_mutex);
3660	if (!n)
3661		print_slabinfo_header(m);
3662	p = cache_chain.next;
3663	while (n--) {
3664		p = p->next;
3665		if (p == &cache_chain)
3666			return NULL;
3667	}
3668	return list_entry(p, struct kmem_cache, next);
3669}
3670
3671static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3672{
3673	struct kmem_cache *cachep = p;
3674	++*pos;
3675	return cachep->next.next == &cache_chain ?
3676		NULL : list_entry(cachep->next.next, struct kmem_cache, next);
3677}
3678
3679static void s_stop(struct seq_file *m, void *p)
3680{
3681	mutex_unlock(&cache_chain_mutex);
3682}
3683
3684static int s_show(struct seq_file *m, void *p)
3685{
3686	struct kmem_cache *cachep = p;
3687	struct list_head *q;
3688	struct slab *slabp;
3689	unsigned long active_objs;
3690	unsigned long num_objs;
3691	unsigned long active_slabs = 0;
3692	unsigned long num_slabs, free_objects = 0, shared_avail = 0;
3693	const char *name;
3694	char *error = NULL;
3695	int node;
3696	struct kmem_list3 *l3;
3697
3698	active_objs = 0;
3699	num_slabs = 0;
3700	for_each_online_node(node) {
3701		l3 = cachep->nodelists[node];
3702		if (!l3)
3703			continue;
3704
3705		check_irq_on();
3706		spin_lock_irq(&l3->list_lock);
3707
3708		list_for_each(q, &l3->slabs_full) {
3709			slabp = list_entry(q, struct slab, list);
3710			if (slabp->inuse != cachep->num && !error)
3711				error = "slabs_full accounting error";
3712			active_objs += cachep->num;
3713			active_slabs++;
3714		}
3715		list_for_each(q, &l3->slabs_partial) {
3716			slabp = list_entry(q, struct slab, list);
3717			if (slabp->inuse == cachep->num && !error)
3718				error = "slabs_partial inuse accounting error";
3719			if (!slabp->inuse && !error)
3720				error = "slabs_partial/inuse accounting error";
3721			active_objs += slabp->inuse;
3722			active_slabs++;
3723		}
3724		list_for_each(q, &l3->slabs_free) {
3725			slabp = list_entry(q, struct slab, list);
3726			if (slabp->inuse && !error)
3727				error = "slabs_free/inuse accounting error";
3728			num_slabs++;
3729		}
3730		free_objects += l3->free_objects;
3731		if (l3->shared)
3732			shared_avail += l3->shared->avail;
3733
3734		spin_unlock_irq(&l3->list_lock);
3735	}
3736	num_slabs += active_slabs;
3737	num_objs = num_slabs * cachep->num;
3738	if (num_objs - active_objs != free_objects && !error)
3739		error = "free_objects accounting error";
3740
3741	name = cachep->name;
3742	if (error)
3743		printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
3744
3745	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
3746		   name, active_objs, num_objs, cachep->buffer_size,
3747		   cachep->num, (1 << cachep->gfporder));
3748	seq_printf(m, " : tunables %4u %4u %4u",
3749		   cachep->limit, cachep->batchcount, cachep->shared);
3750	seq_printf(m, " : slabdata %6lu %6lu %6lu",
3751		   active_slabs, num_slabs, shared_avail);
3752#if STATS
3753	{			/* list3 stats */
3754		unsigned long high = cachep->high_mark;
3755		unsigned long allocs = cachep->num_allocations;
3756		unsigned long grown = cachep->grown;
3757		unsigned long reaped = cachep->reaped;
3758		unsigned long errors = cachep->errors;
3759		unsigned long max_freeable = cachep->max_freeable;
3760		unsigned long node_allocs = cachep->node_allocs;
3761		unsigned long node_frees = cachep->node_frees;
3762
3763		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
3764				"%4lu %4lu %4lu %4lu", allocs, high, grown,
3765				reaped, errors, max_freeable, node_allocs,
3766				node_frees);
3767	}
3768	/* cpu stats */
3769	{
3770		unsigned long allochit = atomic_read(&cachep->allochit);
3771		unsigned long allocmiss = atomic_read(&cachep->allocmiss);
3772		unsigned long freehit = atomic_read(&cachep->freehit);
3773		unsigned long freemiss = atomic_read(&cachep->freemiss);
3774
3775		seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
3776			   allochit, allocmiss, freehit, freemiss);
3777	}
3778#endif
3779	seq_putc(m, '\n');
3780	return 0;
3781}
3782
3783/*
3784 * slabinfo_op - iterator that generates /proc/slabinfo
3785 *
3786 * Output layout:
3787 * cache-name
3788 * num-active-objs
3789 * total-objs
3790 * object size
3791 * num-objs-per-slab
3792 * num-pages-per-slab
3793 * : tunables <limit> <batchcount> <sharedfactor>
3794 * : slabdata <num-active-slabs> <total-slabs> <shared-avail>
 * + further values with statistics enabled (STATS)
3795 */
3796
3797struct seq_operations slabinfo_op = {
3798	.start = s_start,
3799	.next = s_next,
3800	.stop = s_stop,
3801	.show = s_show,
3802};
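/*
 * Illustrative sketch (not part of this file): slabinfo_op is consumed
 * through the seq_file interface; a /proc entry can hook it up roughly
 * as below.  The names slabinfo_open and proc_slabinfo_operations are
 * placeholders here - the real hookup lives in fs/proc/.
 *
 *	static int slabinfo_open(struct inode *inode, struct file *file)
 *	{
 *		return seq_open(file, &slabinfo_op);
 *	}
 *
 *	static struct file_operations proc_slabinfo_operations = {
 *		.open		= slabinfo_open,
 *		.read		= seq_read,
 *		.write		= slabinfo_write,
 *		.llseek		= seq_lseek,
 *		.release	= seq_release,
 *	};
 */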
3803
3804#define MAX_SLABINFO_WRITE 128
3805/**
3806 * slabinfo_write - Tuning for the slab allocator
3807 * @file: unused
3808 * @buffer: user buffer
3809 * @count: data length
3810 * @ppos: unused
 *
 * Expects a single line of the form "cache-name limit batchcount shared";
 * valid new values are applied via do_tune_cpucache().
3811 */
3812ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3813		       size_t count, loff_t *ppos)
3814{
3815	char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
3816	int limit, batchcount, shared, res;
3817	struct list_head *p;
3818
3819	if (count > MAX_SLABINFO_WRITE)
3820		return -EINVAL;
3821	if (copy_from_user(kbuf, buffer, count))
3822		return -EFAULT;
3823	kbuf[count] = '\0';	/* terminate right after the copied user data */
3824
3825	tmp = strchr(kbuf, ' ');
3826	if (!tmp)
3827		return -EINVAL;
3828	*tmp = '\0';
3829	tmp++;
3830	if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
3831		return -EINVAL;
3832
3833	/* Find the cache in the chain of caches. */
3834	mutex_lock(&cache_chain_mutex);
3835	res = -EINVAL;
3836	list_for_each(p, &cache_chain) {
3837		struct kmem_cache *cachep;
3838
3839		cachep = list_entry(p, struct kmem_cache, next);
3840		if (!strcmp(cachep->name, kbuf)) {
3841			if (limit < 1 || batchcount < 1 ||
3842					batchcount > limit || shared < 0) {
3843				res = 0;	/* bogus values: ignore, but do not fail the write */
3844			} else {
3845				res = do_tune_cpucache(cachep, limit,
3846						       batchcount, shared);
3847			}
3848			break;
3849		}
3850	}
3851	mutex_unlock(&cache_chain_mutex);
3852	if (res >= 0)
3853		res = count;
3854	return res;
3855}
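/*
 * Illustrative userspace sketch (assumptions: root privileges, and a
 * cache actually named "dentry_cache"): request limit=120, batchcount=60
 * and shared=8 for that cache.  Error handling is omitted for brevity.
 *
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	const char req[] = "dentry_cache 120 60 8\n";
 *	int fd = open("/proc/slabinfo", O_WRONLY);
 *
 *	if (fd >= 0) {
 *		write(fd, req, strlen(req));
 *		close(fd);
 *	}
 */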
3856#endif
3857
3858/**
3859 * ksize - get the actual amount of memory allocated for a given object
3860 * @objp: Pointer to the object
3861 *
3862 * kmalloc may internally round up allocations and return more memory
3863 * than requested. ksize() can be used to determine the actual amount of
3864 * memory allocated. The caller may use this additional memory, even though
3865 * a smaller amount of memory was initially specified with the kmalloc call.
3866 * The caller must guarantee that objp points to a valid object previously
3867 * allocated with either kmalloc() or kmem_cache_alloc(). The object
3868 * must not be freed during the duration of the call.
3869 */
3870unsigned int ksize(const void *objp)
3871{
3872	if (unlikely(objp == NULL))
3873		return 0;
3874
3875	return obj_size(virt_to_cache(objp));
3876}
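/*
 * Illustrative sketch (not part of this file): a caller can use ksize()
 * to take advantage of kmalloc's rounding before paying for a larger
 * allocation.  The names buf and cap are hypothetical, and the value
 * 128 merely assumes the 100-byte request was satisfied from a
 * kmalloc-128 style cache, which is typical but not guaranteed.
 *
 *	char *buf = kmalloc(100, GFP_KERNEL);
 *	unsigned int cap;
 *
 *	if (!buf)
 *		return -ENOMEM;
 *	cap = ksize(buf);		(typically 128 here)
 *	... the caller may safely use all 'cap' bytes of buf,
 *	    not just the 100 it asked for ...
 *	kfree(buf);
 */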
3877