slab.c revision 364fbb29a0105863d76a1f7bbc01783a4af30a75
1/*
2 * linux/mm/slab.c
3 * Written by Mark Hemment, 1996/97.
4 * (markhe@nextd.demon.co.uk)
5 *
6 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
7 *
8 * Major cleanup, different bufctl logic, per-cpu arrays
9 *	(c) 2000 Manfred Spraul
10 *
11 * Cleanup, make the head arrays unconditional, preparation for NUMA
12 * 	(c) 2002 Manfred Spraul
13 *
14 * An implementation of the Slab Allocator as described in outline in;
15 *	UNIX Internals: The New Frontiers by Uresh Vahalia
16 *	Pub: Prentice Hall	ISBN 0-13-101908-2
17 * or with a little more detail in;
18 *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
19 *	Jeff Bonwick (Sun Microsystems).
20 *	Presented at: USENIX Summer 1994 Technical Conference
21 *
22 * The memory is organized in caches, one cache for each object type.
23 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
24 * Each cache consists of many slabs (they are small (usually one
25 * page long) and always contiguous), and each slab contains multiple
26 * initialized objects.
27 *
28 * This means that your constructor is used only for newly allocated
29 * slabs and you must pass objects with the same initializations to
30 * kmem_cache_free.
31 *
32 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
33 * normal). If you need a special memory type, then you must create a new
34 * cache for that memory type.
35 *
36 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
37 *   full slabs with 0 free objects
38 *   partial slabs
39 *   empty slabs with no allocated objects
40 *
41 * If partial slabs exist, then new allocations come from these slabs,
42 * otherwise they come from empty slabs, or new slabs are allocated.
43 *
44 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
45 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
46 *
47 * Each cache has a short per-cpu head array; most allocs
48 * and frees go into that array, and if that array overflows, then 1/2
49 * of the entries in the array are given back into the global cache.
50 * The head array is strictly LIFO and should improve the cache hit rates.
51 * On SMP, it additionally reduces the spinlock operations.
52 *
53 * The c_cpuarray may not be read with local interrupts enabled -
54 * it's changed with a smp_call_function().
55 *
56 * SMP synchronization:
57 *  constructors and destructors are called without any locking.
58 *  Several members in struct kmem_cache and struct slab never change; they
59 *	are accessed without any locking.
60 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
61 *  	and local interrupts are disabled so slab code is preempt-safe.
62 *  The non-constant members are protected with a per-cache irq spinlock.
63 *
64 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
65 * in 2000 - many ideas in the current implementation are derived from
66 * his patch.
67 *
68 * Further notes from the original documentation:
69 *
70 * 11 April '97.  Started multi-threading - markhe
71 *	The global cache-chain is protected by the mutex 'cache_chain_mutex'.
72 *	The mutex is only needed when accessing/extending the cache-chain, which
73 *	can never happen inside an interrupt (kmem_cache_create(),
74 *	kmem_cache_shrink() and kmem_cache_reap()).
75 *
76 *	At present, each engine can be growing a cache.  This should be blocked.
77 *
78 * 15 March 2005. NUMA slab allocator.
79 *	Shai Fultheim <shai@scalex86.org>.
80 *	Shobhit Dayal <shobhit@calsoftinc.com>
81 *	Alok N Kataria <alokk@calsoftinc.com>
82 *	Christoph Lameter <christoph@lameter.com>
83 *
84 *	Modified the slab allocator to be node aware on NUMA systems.
85 *	Each node has its own list of partial, free and full slabs.
86 *	All object allocations for a node occur from node specific slab lists.
87 */
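
/*
 * Illustrative usage sketch: a typical client creates one cache per
 * object type and then allocates and frees objects from it.  The
 * struct and variable names here are only an example.
 *
 *	struct foo {
 *		int bar;
 *	};
 *	static struct kmem_cache *foo_cachep;
 *
 *	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
 *				       SLAB_HWCACHE_ALIGN, NULL, NULL);
 *	struct foo *f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cachep, f);
 *	kmem_cache_destroy(foo_cachep);
 */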
88
89#include	<linux/slab.h>
90#include	<linux/mm.h>
91#include	<linux/poison.h>
92#include	<linux/swap.h>
93#include	<linux/cache.h>
94#include	<linux/interrupt.h>
95#include	<linux/init.h>
96#include	<linux/compiler.h>
97#include	<linux/cpuset.h>
98#include	<linux/seq_file.h>
99#include	<linux/notifier.h>
100#include	<linux/kallsyms.h>
101#include	<linux/cpu.h>
102#include	<linux/sysctl.h>
103#include	<linux/module.h>
104#include	<linux/rcupdate.h>
105#include	<linux/string.h>
106#include	<linux/uaccess.h>
107#include	<linux/nodemask.h>
108#include	<linux/mempolicy.h>
109#include	<linux/mutex.h>
110#include	<linux/fault-inject.h>
111#include	<linux/rtmutex.h>
112#include	<linux/reciprocal_div.h>
113
114#include	<asm/cacheflush.h>
115#include	<asm/tlbflush.h>
116#include	<asm/page.h>
117
118/*
119 * DEBUG	- 1 for kmem_cache_create() to honour: SLAB_DEBUG_INITIAL,
120 *		  SLAB_RED_ZONE & SLAB_POISON.
121 *		  0 for faster, smaller code (especially in the critical paths).
122 *
123 * STATS	- 1 to collect stats for /proc/slabinfo.
124 *		  0 for faster, smaller code (especially in the critical paths).
125 *
126 * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
127 */
128
129#ifdef CONFIG_DEBUG_SLAB
130#define	DEBUG		1
131#define	STATS		1
132#define	FORCED_DEBUG	1
133#else
134#define	DEBUG		0
135#define	STATS		0
136#define	FORCED_DEBUG	0
137#endif
138
139/* Shouldn't this be in a header file somewhere? */
140#define	BYTES_PER_WORD		sizeof(void *)
141
142#ifndef cache_line_size
143#define cache_line_size()	L1_CACHE_BYTES
144#endif
145
146#ifndef ARCH_KMALLOC_MINALIGN
147/*
148 * Enforce a minimum alignment for the kmalloc caches.
149 * Usually, the kmalloc caches are cache_line_size() aligned, except when
150 * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
151 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
152 * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that.
153 * Note that this flag disables some debug features.
154 */
155#define ARCH_KMALLOC_MINALIGN 0
156#endif
157
158#ifndef ARCH_SLAB_MINALIGN
159/*
160 * Enforce a minimum alignment for all caches.
161 * Intended for archs that get misalignment faults even for BYTES_PER_WORD
162 * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
163 * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
164 * some debug features.
165 */
166#define ARCH_SLAB_MINALIGN 0
167#endif
168
169#ifndef ARCH_KMALLOC_FLAGS
170#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
171#endif
172
173/* Legal flag mask for kmem_cache_create(). */
174#if DEBUG
175# define CREATE_MASK	(SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
176			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
177			 SLAB_CACHE_DMA | \
178			 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
179			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
180			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
181#else
182# define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
183			 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
184			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
185			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
186#endif
187
188/*
189 * kmem_bufctl_t:
190 *
191 * Bufctls are used for linking objs within a slab into a list of
192 * free objects, using their indices as linked offsets.
193 *
194 * This implementation relies on "struct page" for locating the cache &
195 * slab an object belongs to.
196 * This allows the bufctl structure to be small (one int), but limits
197 * the number of objects a slab (not a cache) can contain when off-slab
198 * bufctls are used. The limit is the size of the largest general cache
199 * that does not use off-slab slabs.
200 * For 32bit archs with 4 kB pages, this is 56.
201 * This is not serious, as it is only for large objects, when it is unwise
202 * to have too many per slab.
203 * Note: This limit can be raised by introducing a general cache whose size
204 * is less than 512 (PAGE_SIZE>>3), but greater than 256.
205 */
206
207typedef unsigned int kmem_bufctl_t;
208#define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
209#define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
210#define	BUFCTL_ACTIVE	(((kmem_bufctl_t)(~0U))-2)
211#define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-3)
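
/*
 * How the bufctls are used (see slab_bufctl() and the object alloc/free
 * helpers later in this file): the kmem_bufctl_t array that follows
 * struct slab forms a singly linked free list of object indices.
 * slabp->free holds the index of the first free object, each array slot
 * holds the index of the next free object, and BUFCTL_END terminates
 * the list.
 */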
212
213/*
214 * struct slab
215 *
216 * Manages the objs in a slab. Placed either at the beginning of mem allocated
217 * for a slab, or allocated from a general cache.
218 * Slabs are chained into three lists: fully used, partial, fully free slabs.
219 */
220struct slab {
221	struct list_head list;
222	unsigned long colouroff;
223	void *s_mem;		/* including colour offset */
224	unsigned int inuse;	/* num of objs active in slab */
225	kmem_bufctl_t free;
226	unsigned short nodeid;
227};
228
229/*
230 * struct slab_rcu
231 *
232 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
233 * arrange for kmem_freepages to be called via RCU.  This is useful if
234 * we need to approach a kernel structure obliquely, from its address
235 * obtained without the usual locking.  We can lock the structure to
236 * stabilize it and check it's still at the given address, only if we
237 * can be sure that the memory has not been meanwhile reused for some
238 * other kind of object (which our subsystem's lock might corrupt).
239 *
240 * rcu_read_lock before reading the address, then rcu_read_unlock after
241 * taking the spinlock within the structure expected at that address.
242 *
243 * We assume struct slab_rcu can overlay struct slab when destroying.
244 */
245struct slab_rcu {
246	struct rcu_head head;
247	struct kmem_cache *cachep;
248	void *addr;
249};
250
251/*
252 * struct array_cache
253 *
254 * Purpose:
255 * - LIFO ordering, to hand out cache-warm objects from _alloc
256 * - reduce the number of linked list operations
257 * - reduce spinlock operations
258 *
259 * The limit is stored in the per-cpu structure to reduce the data cache
260 * footprint.
261 *
262 */
263struct array_cache {
264	unsigned int avail;
265	unsigned int limit;
266	unsigned int batchcount;
267	unsigned int touched;
268	spinlock_t lock;
269	void *entry[0];	/*
270			 * Must have this definition in here for the proper
271			 * alignment of array_cache. Also simplifies accessing
272			 * the entries.
273			 * [0] is for gcc 2.95. It should really be [].
274			 */
275};
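
/*
 * Sketch of how the entries are used elsewhere in this file (see for
 * example cache_free_alien() below): the array is a LIFO stack of
 * object pointers, so allocation pops the most recently freed, still
 * cache-warm object and freeing pushes one back:
 *
 *	objp = ac->entry[--ac->avail];		alloc fast path
 *	ac->entry[ac->avail++] = objp;		free fast path
 */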
276
277/*
278 * bootstrap: The caches do not work without cpuarrays anymore, but the
279 * cpuarrays are allocated from the generic caches...
280 */
281#define BOOT_CPUCACHE_ENTRIES	1
282struct arraycache_init {
283	struct array_cache cache;
284	void *entries[BOOT_CPUCACHE_ENTRIES];
285};
286
287/*
288 * The slab lists for all objects.
289 */
290struct kmem_list3 {
291	struct list_head slabs_partial;	/* partial list first, better asm code */
292	struct list_head slabs_full;
293	struct list_head slabs_free;
294	unsigned long free_objects;
295	unsigned int free_limit;
296	unsigned int colour_next;	/* Per-node cache coloring */
297	spinlock_t list_lock;
298	struct array_cache *shared;	/* shared per node */
299	struct array_cache **alien;	/* on other nodes */
300	unsigned long next_reap;	/* updated without locking */
301	int free_touched;		/* updated without locking */
302};
303
304/*
305 * Need this for bootstrapping a per node allocator.
306 */
307#define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1)
308struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
309#define	CACHE_CACHE 0
310#define	SIZE_AC 1
311#define	SIZE_L3 (1 + MAX_NUMNODES)
312
313static int drain_freelist(struct kmem_cache *cache,
314			struct kmem_list3 *l3, int tofree);
315static void free_block(struct kmem_cache *cachep, void **objpp, int len,
316			int node);
317static int enable_cpucache(struct kmem_cache *cachep);
318static void cache_reap(struct work_struct *unused);
319
320/*
321 * This function must be completely optimized away if a constant is passed to
322 * it.  Mostly the same as what is in linux/slab.h except it returns an index.
323 */
324static __always_inline int index_of(const size_t size)
325{
326	extern void __bad_size(void);
327
328	if (__builtin_constant_p(size)) {
329		int i = 0;
330
331#define CACHE(x) \
332	if (size <= x) \
333		return i; \
334	else \
335		i++;
336#include "linux/kmalloc_sizes.h"
337#undef CACHE
338		__bad_size();
339	} else
340		__bad_size();
341	return 0;
342}
343
344static int slab_early_init = 1;
345
346#define INDEX_AC index_of(sizeof(struct arraycache_init))
347#define INDEX_L3 index_of(sizeof(struct kmem_list3))
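
/*
 * Compile-time indices into malloc_sizes[] of the smallest general
 * caches able to hold a struct arraycache_init and a struct kmem_list3.
 * kmem_cache_init() creates these two kmalloc caches first so that the
 * rest of the bootstrap can allocate its array caches and kmem_list3s
 * from them.
 */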
348
349static void kmem_list3_init(struct kmem_list3 *parent)
350{
351	INIT_LIST_HEAD(&parent->slabs_full);
352	INIT_LIST_HEAD(&parent->slabs_partial);
353	INIT_LIST_HEAD(&parent->slabs_free);
354	parent->shared = NULL;
355	parent->alien = NULL;
356	parent->colour_next = 0;
357	spin_lock_init(&parent->list_lock);
358	parent->free_objects = 0;
359	parent->free_touched = 0;
360}
361
362#define MAKE_LIST(cachep, listp, slab, nodeid)				\
363	do {								\
364		INIT_LIST_HEAD(listp);					\
365		list_splice(&(cachep->nodelists[nodeid]->slab), listp);	\
366	} while (0)
367
368#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
369	do {								\
370	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
371	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
372	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
373	} while (0)
374
375/*
376 * struct kmem_cache
377 *
378 * manages a cache.
379 */
380
381struct kmem_cache {
382/* 1) per-cpu data, touched during every alloc/free */
383	struct array_cache *array[NR_CPUS];
384/* 2) Cache tunables. Protected by cache_chain_mutex */
385	unsigned int batchcount;
386	unsigned int limit;
387	unsigned int shared;
388
389	unsigned int buffer_size;
390	u32 reciprocal_buffer_size;
391/* 3) touched by every alloc & free from the backend */
392	struct kmem_list3 *nodelists[MAX_NUMNODES];
393
394	unsigned int flags;		/* constant flags */
395	unsigned int num;		/* # of objs per slab */
396
397/* 4) cache_grow/shrink */
398	/* order of pgs per slab (2^n) */
399	unsigned int gfporder;
400
401	/* force GFP flags, e.g. GFP_DMA */
402	gfp_t gfpflags;
403
404	size_t colour;			/* cache colouring range */
405	unsigned int colour_off;	/* colour offset */
406	struct kmem_cache *slabp_cache;
407	unsigned int slab_size;
408	unsigned int dflags;		/* dynamic flags */
409
410	/* constructor func */
411	void (*ctor) (void *, struct kmem_cache *, unsigned long);
412
413	/* destructor func */
414	void (*dtor) (void *, struct kmem_cache *, unsigned long);
415
416/* 5) cache creation/removal */
417	const char *name;
418	struct list_head next;
419
420/* 6) statistics */
421#if STATS
422	unsigned long num_active;
423	unsigned long num_allocations;
424	unsigned long high_mark;
425	unsigned long grown;
426	unsigned long reaped;
427	unsigned long errors;
428	unsigned long max_freeable;
429	unsigned long node_allocs;
430	unsigned long node_frees;
431	unsigned long node_overflow;
432	atomic_t allochit;
433	atomic_t allocmiss;
434	atomic_t freehit;
435	atomic_t freemiss;
436#endif
437#if DEBUG
438	/*
439	 * If debugging is enabled, then the allocator can add additional
440	 * fields and/or padding to every object. buffer_size contains the total
441	 * object size including these internal fields, the following two
442	 * variables contain the offset to the user object and its size.
443	 */
444	int obj_offset;
445	int obj_size;
446#endif
447};
448
449#define CFLGS_OFF_SLAB		(0x80000000UL)
450#define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)
451
452#define BATCHREFILL_LIMIT	16
453/*
454 * Optimization question: fewer reaps mean a lower probability of unnecessary
455 * cpucache drain/refill cycles.
456 *
457 * OTOH the cpuarrays can contain lots of objects,
458 * which could lock up otherwise freeable slabs.
459 */
460#define REAPTIMEOUT_CPUC	(2*HZ)
461#define REAPTIMEOUT_LIST3	(4*HZ)
462
463#if STATS
464#define	STATS_INC_ACTIVE(x)	((x)->num_active++)
465#define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
466#define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
467#define	STATS_INC_GROWN(x)	((x)->grown++)
468#define	STATS_ADD_REAPED(x,y)	((x)->reaped += (y))
469#define	STATS_SET_HIGH(x)						\
470	do {								\
471		if ((x)->num_active > (x)->high_mark)			\
472			(x)->high_mark = (x)->num_active;		\
473	} while (0)
474#define	STATS_INC_ERR(x)	((x)->errors++)
475#define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
476#define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
477#define STATS_INC_ACOVERFLOW(x)   ((x)->node_overflow++)
478#define	STATS_SET_FREEABLE(x, i)					\
479	do {								\
480		if ((x)->max_freeable < i)				\
481			(x)->max_freeable = i;				\
482	} while (0)
483#define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
484#define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
485#define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
486#define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
487#else
488#define	STATS_INC_ACTIVE(x)	do { } while (0)
489#define	STATS_DEC_ACTIVE(x)	do { } while (0)
490#define	STATS_INC_ALLOCED(x)	do { } while (0)
491#define	STATS_INC_GROWN(x)	do { } while (0)
492#define	STATS_ADD_REAPED(x,y)	do { } while (0)
493#define	STATS_SET_HIGH(x)	do { } while (0)
494#define	STATS_INC_ERR(x)	do { } while (0)
495#define	STATS_INC_NODEALLOCS(x)	do { } while (0)
496#define	STATS_INC_NODEFREES(x)	do { } while (0)
497#define STATS_INC_ACOVERFLOW(x)   do { } while (0)
498#define	STATS_SET_FREEABLE(x, i) do { } while (0)
499#define STATS_INC_ALLOCHIT(x)	do { } while (0)
500#define STATS_INC_ALLOCMISS(x)	do { } while (0)
501#define STATS_INC_FREEHIT(x)	do { } while (0)
502#define STATS_INC_FREEMISS(x)	do { } while (0)
503#endif
504
505#if DEBUG
506
507/*
508 * memory layout of objects:
509 * 0		: objp
510 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
511 * 		the end of an object is aligned with the end of the real
512 * 		allocation. Catches writes behind the end of the allocation.
513 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
514 * 		redzone word.
515 * cachep->obj_offset: The real object.
516 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
517 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
518 *					[BYTES_PER_WORD long]
519 */
520static int obj_offset(struct kmem_cache *cachep)
521{
522	return cachep->obj_offset;
523}
524
525static int obj_size(struct kmem_cache *cachep)
526{
527	return cachep->obj_size;
528}
529
530static unsigned long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
531{
532	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
533	return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD);
534}
535
536static unsigned long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
537{
538	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
539	if (cachep->flags & SLAB_STORE_USER)
540		return (unsigned long *)(objp + cachep->buffer_size -
541					 2 * BYTES_PER_WORD);
542	return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD);
543}
544
545static void **dbg_userword(struct kmem_cache *cachep, void *objp)
546{
547	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
548	return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
549}
550
551#else
552
553#define obj_offset(x)			0
554#define obj_size(cachep)		(cachep->buffer_size)
555#define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long *)NULL;})
556#define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long *)NULL;})
557#define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})
558
559#endif
560
561/*
562 * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp
563 * order.
564 */
565#if defined(CONFIG_LARGE_ALLOCS)
566#define	MAX_OBJ_ORDER	13	/* up to 32Mb */
567#define	MAX_GFP_ORDER	13	/* up to 32Mb */
568#elif defined(CONFIG_MMU)
569#define	MAX_OBJ_ORDER	5	/* 32 pages */
570#define	MAX_GFP_ORDER	5	/* 32 pages */
571#else
572#define	MAX_OBJ_ORDER	8	/* up to 1Mb */
573#define	MAX_GFP_ORDER	8	/* up to 1Mb */
574#endif
575
576/*
577 * Do not go above this order unless 0 objects fit into the slab.
578 */
579#define	BREAK_GFP_ORDER_HI	1
580#define	BREAK_GFP_ORDER_LO	0
581static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
582
583/*
584 * Functions for storing/retrieving the cachep and/or slab from the page
585 * allocator.  These are used to find the slab an obj belongs to.  With kfree(),
586 * these are used to find the cache to which an obj belongs.
587 */
588static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
589{
590	page->lru.next = (struct list_head *)cache;
591}
592
593static inline struct kmem_cache *page_get_cache(struct page *page)
594{
595	if (unlikely(PageCompound(page)))
596		page = (struct page *)page_private(page);
597	BUG_ON(!PageSlab(page));
598	return (struct kmem_cache *)page->lru.next;
599}
600
601static inline void page_set_slab(struct page *page, struct slab *slab)
602{
603	page->lru.prev = (struct list_head *)slab;
604}
605
606static inline struct slab *page_get_slab(struct page *page)
607{
608	if (unlikely(PageCompound(page)))
609		page = (struct page *)page_private(page);
610	BUG_ON(!PageSlab(page));
611	return (struct slab *)page->lru.prev;
612}
613
614static inline struct kmem_cache *virt_to_cache(const void *obj)
615{
616	struct page *page = virt_to_page(obj);
617	return page_get_cache(page);
618}
619
620static inline struct slab *virt_to_slab(const void *obj)
621{
622	struct page *page = virt_to_page(obj);
623	return page_get_slab(page);
624}
625
626static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
627				 unsigned int idx)
628{
629	return slab->s_mem + cache->buffer_size * idx;
630}
631
632/*
633 * We want to avoid an expensive divide : (offset / cache->buffer_size)
634 *   Using the fact that buffer_size is a constant for a particular cache,
635 *   we can replace (offset / cache->buffer_size) by
636 *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
637 */
638static inline unsigned int obj_to_index(const struct kmem_cache *cache,
639					const struct slab *slab, void *obj)
640{
641	u32 offset = (obj - slab->s_mem);
642	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
643}
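
/*
 * Worked example with made-up numbers: for buffer_size == 256 and an
 * object at offset 1024 from s_mem, offset / buffer_size == 4, and
 * reciprocal_divide(1024, reciprocal_value(256)) returns the same
 * index 4 without a hardware divide.  reciprocal_buffer_size is
 * precomputed with reciprocal_value() when the cache is set up (see
 * kmem_cache_init()).
 */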
644
645/*
646 * These are the default caches for kmalloc. Custom caches can have other sizes.
647 */
648struct cache_sizes malloc_sizes[] = {
649#define CACHE(x) { .cs_size = (x) },
650#include <linux/kmalloc_sizes.h>
651	CACHE(ULONG_MAX)
652#undef CACHE
653};
654EXPORT_SYMBOL(malloc_sizes);
655
656/* Must match cache_sizes above. Out of line to keep cache footprint low. */
657struct cache_names {
658	char *name;
659	char *name_dma;
660};
661
662static struct cache_names __initdata cache_names[] = {
663#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
664#include <linux/kmalloc_sizes.h>
665	{NULL,}
666#undef CACHE
667};
668
669static struct arraycache_init initarray_cache __initdata =
670    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
671static struct arraycache_init initarray_generic =
672    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
673
674/* internal cache of cache description objs */
675static struct kmem_cache cache_cache = {
676	.batchcount = 1,
677	.limit = BOOT_CPUCACHE_ENTRIES,
678	.shared = 1,
679	.buffer_size = sizeof(struct kmem_cache),
680	.name = "kmem_cache",
681#if DEBUG
682	.obj_size = sizeof(struct kmem_cache),
683#endif
684};
685
686#define BAD_ALIEN_MAGIC 0x01020304ul
687
688#ifdef CONFIG_LOCKDEP
689
690/*
691 * Slab sometimes uses the kmalloc slabs to store the slab headers
692 * for other slabs "off slab".
693 * The locking for this is tricky in that it nests within the locks
694 * of all other slabs in a few places; to deal with this special
695 * locking we put on-slab caches into a separate lock-class.
696 *
697 * We set lock class for alien array caches which are up during init.
698 * The lock annotation will be lost if all cpus of a node go down and
699 * then come back up during hotplug.
700 */
701static struct lock_class_key on_slab_l3_key;
702static struct lock_class_key on_slab_alc_key;
703
704static inline void init_lock_keys(void)
705
706{
707	int q;
708	struct cache_sizes *s = malloc_sizes;
709
710	while (s->cs_size != ULONG_MAX) {
711		for_each_node(q) {
712			struct array_cache **alc;
713			int r;
714			struct kmem_list3 *l3 = s->cs_cachep->nodelists[q];
715			if (!l3 || OFF_SLAB(s->cs_cachep))
716				continue;
717			lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
718			alc = l3->alien;
719			/*
720			 * FIXME: This check for BAD_ALIEN_MAGIC
721			 * should go away when common slab code is taught to
722			 * work even without alien caches.
723			 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
724			 * for alloc_alien_cache,
725			 */
726			if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
727				continue;
728			for_each_node(r) {
729				if (alc[r])
730					lockdep_set_class(&alc[r]->lock,
731					     &on_slab_alc_key);
732			}
733		}
734		s++;
735	}
736}
737#else
738static inline void init_lock_keys(void)
739{
740}
741#endif
742
743/*
744 * 1. Guard access to the cache-chain.
745 * 2. Protect sanity of cpu_online_map against cpu hotplug events
746 */
747static DEFINE_MUTEX(cache_chain_mutex);
748static struct list_head cache_chain;
749
750/*
751 * chicken and egg problem: delay the per-cpu array allocation
752 * until the general caches are up.
753 */
754static enum {
755	NONE,
756	PARTIAL_AC,
757	PARTIAL_L3,
758	FULL
759} g_cpucache_up;
760
761/*
762 * used by boot code to determine if it can use slab based allocator
763 */
764int slab_is_available(void)
765{
766	return g_cpucache_up == FULL;
767}
768
769static DEFINE_PER_CPU(struct delayed_work, reap_work);
770
771static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
772{
773	return cachep->array[smp_processor_id()];
774}
775
776static inline struct kmem_cache *__find_general_cachep(size_t size,
777							gfp_t gfpflags)
778{
779	struct cache_sizes *csizep = malloc_sizes;
780
781#if DEBUG
782	/* This happens if someone tries to call
783	 * kmem_cache_create(), or __kmalloc(), before
784	 * the generic caches are initialized.
785	 */
786	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
787#endif
788	while (size > csizep->cs_size)
789		csizep++;
790
791	/*
792	 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
793	 * has cs_{dma,}cachep==NULL. Thus no special case
794	 * for large kmalloc calls required.
795	 */
796#ifdef CONFIG_ZONE_DMA
797	if (unlikely(gfpflags & GFP_DMA))
798		return csizep->cs_dmacachep;
799#endif
800	return csizep->cs_cachep;
801}
802
803static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
804{
805	return __find_general_cachep(size, gfpflags);
806}
807
808static size_t slab_mgmt_size(size_t nr_objs, size_t align)
809{
810	return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
811}
812
813/*
814 * Calculate the number of objects and left-over bytes for a given buffer size.
815 */
816static void cache_estimate(unsigned long gfporder, size_t buffer_size,
817			   size_t align, int flags, size_t *left_over,
818			   unsigned int *num)
819{
820	int nr_objs;
821	size_t mgmt_size;
822	size_t slab_size = PAGE_SIZE << gfporder;
823
824	/*
825	 * The slab management structure can be either off the slab or
826	 * on it. For the latter case, the memory allocated for a
827	 * slab is used for:
828	 *
829	 * - The struct slab
830	 * - One kmem_bufctl_t for each object
831	 * - Padding to respect alignment of @align
832	 * - @buffer_size bytes for each object
833	 *
834	 * If the slab management structure is off the slab, then the
835	 * alignment will already be calculated into the size. Because
836	 * the slabs are all pages aligned, the objects will be at the
837	 * correct alignment when allocated.
838	 */
839	if (flags & CFLGS_OFF_SLAB) {
840		mgmt_size = 0;
841		nr_objs = slab_size / buffer_size;
842
843		if (nr_objs > SLAB_LIMIT)
844			nr_objs = SLAB_LIMIT;
845	} else {
846		/*
847		 * Ignore padding for the initial guess. The padding
848		 * is at most @align-1 bytes, and @buffer_size is at
849		 * least @align. In the worst case, this result will
850		 * be one greater than the number of objects that fit
851		 * into the memory allocation when taking the padding
852		 * into account.
853		 */
854		nr_objs = (slab_size - sizeof(struct slab)) /
855			  (buffer_size + sizeof(kmem_bufctl_t));
856
857		/*
858		 * This calculated number will be either the right
859		 * amount, or one greater than what we want.
860		 */
861		if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
862		       > slab_size)
863			nr_objs--;
864
865		if (nr_objs > SLAB_LIMIT)
866			nr_objs = SLAB_LIMIT;
867
868		mgmt_size = slab_mgmt_size(nr_objs, align);
869	}
870	*num = nr_objs;
871	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
872}
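
/*
 * Worked example with assumed numbers: for CFLGS_OFF_SLAB, gfporder == 0
 * (4096-byte slab on a 4 kB page arch) and buffer_size == 1024,
 * mgmt_size is 0, so *num == 4 and *left_over == 4096 - 4*1024 == 0.
 * In the on-slab case the estimate additionally reserves room for
 * struct slab plus one kmem_bufctl_t per object and is corrected down
 * by one object if the aligned management structure no longer fits.
 */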
873
874#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
875
876static void __slab_error(const char *function, struct kmem_cache *cachep,
877			char *msg)
878{
879	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
880	       function, cachep->name, msg);
881	dump_stack();
882}
883
884/*
885 * By default on NUMA we use alien caches to stage the freeing of
886 * objects allocated from other nodes. This causes massive memory
887 * inefficiencies when using a fake NUMA setup to split memory into a
888 * large number of small nodes, so it can be disabled on the command
889 * line.
890 */
891
892static int use_alien_caches __read_mostly = 1;
893static int __init noaliencache_setup(char *s)
894{
895	use_alien_caches = 0;
896	return 1;
897}
898__setup("noaliencache", noaliencache_setup);
899
900#ifdef CONFIG_NUMA
901/*
902 * Special reaping functions for NUMA systems called from cache_reap().
903 * These take care of doing round robin flushing of alien caches (containing
904 * objects freed on different nodes from which they were allocated) and the
905 * flushing of remote pcps by calling drain_node_pages.
906 */
907static DEFINE_PER_CPU(unsigned long, reap_node);
908
909static void init_reap_node(int cpu)
910{
911	int node;
912
913	node = next_node(cpu_to_node(cpu), node_online_map);
914	if (node == MAX_NUMNODES)
915		node = first_node(node_online_map);
916
917	per_cpu(reap_node, cpu) = node;
918}
919
920static void next_reap_node(void)
921{
922	int node = __get_cpu_var(reap_node);
923
924	/*
925	 * Also drain per cpu pages on remote zones
926	 */
927	if (node != numa_node_id())
928		drain_node_pages(node);
929
930	node = next_node(node, node_online_map);
931	if (unlikely(node >= MAX_NUMNODES))
932		node = first_node(node_online_map);
933	__get_cpu_var(reap_node) = node;
934}
935
936#else
937#define init_reap_node(cpu) do { } while (0)
938#define next_reap_node(void) do { } while (0)
939#endif
940
941/*
942 * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
943 * via the workqueue/eventd.
944 * Add the CPU number into the expiration time to minimize the possibility of
945 * the CPUs getting into lockstep and contending for the global cache chain
946 * lock.
947 */
948static void __devinit start_cpu_timer(int cpu)
949{
950	struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
951
952	/*
953	 * When this gets called from do_initcalls via cpucache_init(),
954	 * init_workqueues() has already run, so keventd will be setup
955	 * at that time.
956	 */
957	if (keventd_up() && reap_work->work.func == NULL) {
958		init_reap_node(cpu);
959		INIT_DELAYED_WORK(reap_work, cache_reap);
960		schedule_delayed_work_on(cpu, reap_work,
961					__round_jiffies_relative(HZ, cpu));
962	}
963}
964
965static struct array_cache *alloc_arraycache(int node, int entries,
966					    int batchcount)
967{
968	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
969	struct array_cache *nc = NULL;
970
971	nc = kmalloc_node(memsize, GFP_KERNEL, node);
972	if (nc) {
973		nc->avail = 0;
974		nc->limit = entries;
975		nc->batchcount = batchcount;
976		nc->touched = 0;
977		spin_lock_init(&nc->lock);
978	}
979	return nc;
980}
981
982/*
983 * Transfer objects from one arraycache to another.
984 * Locking must be handled by the caller.
985 *
986 * Return the number of entries transferred.
987 */
988static int transfer_objects(struct array_cache *to,
989		struct array_cache *from, unsigned int max)
990{
991	/* Figure out how many entries to transfer */
992	int nr = min(min(from->avail, max), to->limit - to->avail);
993
994	if (!nr)
995		return 0;
996
997	memcpy(to->entry + to->avail, from->entry + from->avail - nr,
998			sizeof(void *) * nr);
999
1000	from->avail -= nr;
1001	to->avail += nr;
1002	to->touched = 1;
1003	return nr;
1004}
1005
1006#ifndef CONFIG_NUMA
1007
1008#define drain_alien_cache(cachep, alien) do { } while (0)
1009#define reap_alien(cachep, l3) do { } while (0)
1010
1011static inline struct array_cache **alloc_alien_cache(int node, int limit)
1012{
1013	return (struct array_cache **)BAD_ALIEN_MAGIC;
1014}
1015
1016static inline void free_alien_cache(struct array_cache **ac_ptr)
1017{
1018}
1019
1020static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1021{
1022	return 0;
1023}
1024
1025static inline void *alternate_node_alloc(struct kmem_cache *cachep,
1026		gfp_t flags)
1027{
1028	return NULL;
1029}
1030
1031static inline void *____cache_alloc_node(struct kmem_cache *cachep,
1032		 gfp_t flags, int nodeid)
1033{
1034	return NULL;
1035}
1036
1037#else	/* CONFIG_NUMA */
1038
1039static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
1040static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
1041
1042static struct array_cache **alloc_alien_cache(int node, int limit)
1043{
1044	struct array_cache **ac_ptr;
1045	int memsize = sizeof(void *) * nr_node_ids;
1046	int i;
1047
1048	if (limit > 1)
1049		limit = 12;
1050	ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
1051	if (ac_ptr) {
1052		for_each_node(i) {
1053			if (i == node || !node_online(i)) {
1054				ac_ptr[i] = NULL;
1055				continue;
1056			}
1057			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
1058			if (!ac_ptr[i]) {
1059				for (i--; i >= 0; i--)
1060					kfree(ac_ptr[i]);
1061				kfree(ac_ptr);
1062				return NULL;
1063			}
1064		}
1065	}
1066	return ac_ptr;
1067}
1068
1069static void free_alien_cache(struct array_cache **ac_ptr)
1070{
1071	int i;
1072
1073	if (!ac_ptr)
1074		return;
1075	for_each_node(i)
1076	    kfree(ac_ptr[i]);
1077	kfree(ac_ptr);
1078}
1079
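/*
 * Flush all objects held in one alien array cache back to their home
 * node: move as many as will fit into the remote node's shared array,
 * and free the rest to the remote slab lists via free_block().
 */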
1080static void __drain_alien_cache(struct kmem_cache *cachep,
1081				struct array_cache *ac, int node)
1082{
1083	struct kmem_list3 *rl3 = cachep->nodelists[node];
1084
1085	if (ac->avail) {
1086		spin_lock(&rl3->list_lock);
1087		/*
1088		 * Stuff objects into the remote node's shared array first.
1089		 * That way we could avoid the overhead of putting the objects
1090		 * into the free lists and getting them back later.
1091		 */
1092		if (rl3->shared)
1093			transfer_objects(rl3->shared, ac, ac->limit);
1094
1095		free_block(cachep, ac->entry, ac->avail, node);
1096		ac->avail = 0;
1097		spin_unlock(&rl3->list_lock);
1098	}
1099}
1100
1101/*
1102 * Called from cache_reap() to regularly drain alien caches round robin.
1103 */
1104static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
1105{
1106	int node = __get_cpu_var(reap_node);
1107
1108	if (l3->alien) {
1109		struct array_cache *ac = l3->alien[node];
1110
1111		if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
1112			__drain_alien_cache(cachep, ac, node);
1113			spin_unlock_irq(&ac->lock);
1114		}
1115	}
1116}
1117
1118static void drain_alien_cache(struct kmem_cache *cachep,
1119				struct array_cache **alien)
1120{
1121	int i = 0;
1122	struct array_cache *ac;
1123	unsigned long flags;
1124
1125	for_each_online_node(i) {
1126		ac = alien[i];
1127		if (ac) {
1128			spin_lock_irqsave(&ac->lock, flags);
1129			__drain_alien_cache(cachep, ac, i);
1130			spin_unlock_irqrestore(&ac->lock, flags);
1131		}
1132	}
1133}
1134
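/*
 * Free an object that may have been allocated on a different node.
 * Returns 0 if the object is local, in which case the caller frees it
 * into its own array cache as usual.  Otherwise the object is parked in
 * this node's alien cache for the object's home node (draining that
 * cache first if it is full), or freed straight to the home node's
 * lists when no alien cache exists, and 1 is returned.
 */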
1135static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1136{
1137	struct slab *slabp = virt_to_slab(objp);
1138	int nodeid = slabp->nodeid;
1139	struct kmem_list3 *l3;
1140	struct array_cache *alien = NULL;
1141	int node;
1142
1143	node = numa_node_id();
1144
1145	/*
1146	 * Make sure we are not freeing an object from another node to the array
1147	 * cache on this cpu.
1148	 */
1149	if (likely(slabp->nodeid == node))
1150		return 0;
1151
1152	l3 = cachep->nodelists[node];
1153	STATS_INC_NODEFREES(cachep);
1154	if (l3->alien && l3->alien[nodeid]) {
1155		alien = l3->alien[nodeid];
1156		spin_lock(&alien->lock);
1157		if (unlikely(alien->avail == alien->limit)) {
1158			STATS_INC_ACOVERFLOW(cachep);
1159			__drain_alien_cache(cachep, alien, nodeid);
1160		}
1161		alien->entry[alien->avail++] = objp;
1162		spin_unlock(&alien->lock);
1163	} else {
1164		spin_lock(&(cachep->nodelists[nodeid])->list_lock);
1165		free_block(cachep, &objp, 1, nodeid);
1166		spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
1167	}
1168	return 1;
1169}
1170#endif
1171
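/*
 * CPU hotplug callback.  CPU_UP_PREPARE makes sure every cache has a
 * kmem_list3 for the new cpu's node and allocates the per-cpu, shared
 * and alien array caches; CPU_DEAD/CPU_UP_CANCELED tear the per-cpu
 * data down again, drain what the dead cpu left behind and shrink the
 * node's free lists.  cache_chain_mutex is taken at CPU_UP_PREPARE and
 * CPU_DOWN_PREPARE and released at CPU_ONLINE, CPU_DOWN_FAILED or the
 * end of the teardown path.
 */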
1172static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1173				    unsigned long action, void *hcpu)
1174{
1175	long cpu = (long)hcpu;
1176	struct kmem_cache *cachep;
1177	struct kmem_list3 *l3 = NULL;
1178	int node = cpu_to_node(cpu);
1179	int memsize = sizeof(struct kmem_list3);
1180
1181	switch (action) {
1182	case CPU_UP_PREPARE:
1183		mutex_lock(&cache_chain_mutex);
1184		/*
1185		 * We need to do this right in the beginning since
1186		 * the alloc_arraycache() calls below are going to use this list.
1187		 * kmalloc_node allows us to add the slab to the right
1188		 * kmem_list3 and not this cpu's kmem_list3.
1189		 */
1190
1191		list_for_each_entry(cachep, &cache_chain, next) {
1192			/*
1193			 * Set up the size64 kmemlist for cpu before we can
1194			 * begin anything. Make sure some other cpu on this
1195			 * node has not already allocated this
1196			 */
1197			if (!cachep->nodelists[node]) {
1198				l3 = kmalloc_node(memsize, GFP_KERNEL, node);
1199				if (!l3)
1200					goto bad;
1201				kmem_list3_init(l3);
1202				l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
1203				    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1204
1205				/*
1206				 * The l3s don't come and go as CPUs come and
1207				 * go.  cache_chain_mutex is sufficient
1208				 * protection here.
1209				 */
1210				cachep->nodelists[node] = l3;
1211			}
1212
1213			spin_lock_irq(&cachep->nodelists[node]->list_lock);
1214			cachep->nodelists[node]->free_limit =
1215				(1 + nr_cpus_node(node)) *
1216				cachep->batchcount + cachep->num;
1217			spin_unlock_irq(&cachep->nodelists[node]->list_lock);
1218		}
1219
1220		/*
1221		 * Now we can go ahead with allocating the shared arrays and
1222		 * array caches
1223		 */
1224		list_for_each_entry(cachep, &cache_chain, next) {
1225			struct array_cache *nc;
1226			struct array_cache *shared;
1227			struct array_cache **alien = NULL;
1228
1229			nc = alloc_arraycache(node, cachep->limit,
1230						cachep->batchcount);
1231			if (!nc)
1232				goto bad;
1233			shared = alloc_arraycache(node,
1234					cachep->shared * cachep->batchcount,
1235					0xbaadf00d);
1236			if (!shared)
1237				goto bad;
1238
1239			if (use_alien_caches) {
1240				alien = alloc_alien_cache(node, cachep->limit);
1241				if (!alien)
1242					goto bad;
1243			}
1244			cachep->array[cpu] = nc;
1245			l3 = cachep->nodelists[node];
1246			BUG_ON(!l3);
1247
1248			spin_lock_irq(&l3->list_lock);
1249			if (!l3->shared) {
1250				/*
1251				 * We are serialised from CPU_DEAD or
1252				 * CPU_UP_CANCELLED by the cpucontrol lock
1253				 */
1254				l3->shared = shared;
1255				shared = NULL;
1256			}
1257#ifdef CONFIG_NUMA
1258			if (!l3->alien) {
1259				l3->alien = alien;
1260				alien = NULL;
1261			}
1262#endif
1263			spin_unlock_irq(&l3->list_lock);
1264			kfree(shared);
1265			free_alien_cache(alien);
1266		}
1267		break;
1268	case CPU_ONLINE:
1269		mutex_unlock(&cache_chain_mutex);
1270		start_cpu_timer(cpu);
1271		break;
1272#ifdef CONFIG_HOTPLUG_CPU
1273	case CPU_DOWN_PREPARE:
1274		mutex_lock(&cache_chain_mutex);
1275		break;
1276	case CPU_DOWN_FAILED:
1277		mutex_unlock(&cache_chain_mutex);
1278		break;
1279	case CPU_DEAD:
1280		/*
1281		 * Even if all the cpus of a node are down, we don't free the
1282		 * kmem_list3 of any cache. This is to avoid a race between
1283		 * cpu_down, and a kmalloc allocation from another cpu for
1284		 * memory from the node of the cpu going down.  The list3
1285		 * structure is usually allocated from kmem_cache_create() and
1286		 * gets destroyed at kmem_cache_destroy().
1287		 */
1288		/* fall thru */
1289#endif
1290	case CPU_UP_CANCELED:
1291		list_for_each_entry(cachep, &cache_chain, next) {
1292			struct array_cache *nc;
1293			struct array_cache *shared;
1294			struct array_cache **alien;
1295			cpumask_t mask;
1296
1297			mask = node_to_cpumask(node);
1298			/* cpu is dead; no one can alloc from it. */
1299			nc = cachep->array[cpu];
1300			cachep->array[cpu] = NULL;
1301			l3 = cachep->nodelists[node];
1302
1303			if (!l3)
1304				goto free_array_cache;
1305
1306			spin_lock_irq(&l3->list_lock);
1307
1308			/* Free limit for this kmem_list3 */
1309			l3->free_limit -= cachep->batchcount;
1310			if (nc)
1311				free_block(cachep, nc->entry, nc->avail, node);
1312
1313			if (!cpus_empty(mask)) {
1314				spin_unlock_irq(&l3->list_lock);
1315				goto free_array_cache;
1316			}
1317
1318			shared = l3->shared;
1319			if (shared) {
1320				free_block(cachep, l3->shared->entry,
1321					   l3->shared->avail, node);
1322				l3->shared = NULL;
1323			}
1324
1325			alien = l3->alien;
1326			l3->alien = NULL;
1327
1328			spin_unlock_irq(&l3->list_lock);
1329
1330			kfree(shared);
1331			if (alien) {
1332				drain_alien_cache(cachep, alien);
1333				free_alien_cache(alien);
1334			}
1335free_array_cache:
1336			kfree(nc);
1337		}
1338		/*
1339		 * In the previous loop, all the objects were freed to
1340		 * the respective cache's slabs; now we can go ahead and
1341		 * shrink each nodelist to its limit.
1342		 */
1343		list_for_each_entry(cachep, &cache_chain, next) {
1344			l3 = cachep->nodelists[node];
1345			if (!l3)
1346				continue;
1347			drain_freelist(cachep, l3, l3->free_objects);
1348		}
1349		mutex_unlock(&cache_chain_mutex);
1350		break;
1351	}
1352	return NOTIFY_OK;
1353bad:
1354	return NOTIFY_BAD;
1355}
1356
1357static struct notifier_block __cpuinitdata cpucache_notifier = {
1358	&cpuup_callback, NULL, 0
1359};
1360
1361/*
1362 * swap the static kmem_list3 with kmalloced memory
1363 */
1364static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1365			int nodeid)
1366{
1367	struct kmem_list3 *ptr;
1368
1369	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
1370	BUG_ON(!ptr);
1371
1372	local_irq_disable();
1373	memcpy(ptr, list, sizeof(struct kmem_list3));
1374	/*
1375	 * Do not assume that spinlocks can be initialized via memcpy:
1376	 */
1377	spin_lock_init(&ptr->list_lock);
1378
1379	MAKE_ALL_LISTS(cachep, ptr, nodeid);
1380	cachep->nodelists[nodeid] = ptr;
1381	local_irq_enable();
1382}
1383
1384/*
1385 * Initialisation.  Called after the page allocator has been initialised and
1386 * before smp_init().
1387 */
1388void __init kmem_cache_init(void)
1389{
1390	size_t left_over;
1391	struct cache_sizes *sizes;
1392	struct cache_names *names;
1393	int i;
1394	int order;
1395	int node;
1396
1397	if (num_possible_nodes() == 1)
1398		use_alien_caches = 0;
1399
1400	for (i = 0; i < NUM_INIT_LISTS; i++) {
1401		kmem_list3_init(&initkmem_list3[i]);
1402		if (i < MAX_NUMNODES)
1403			cache_cache.nodelists[i] = NULL;
1404	}
1405
1406	/*
1407	 * Fragmentation resistance on low memory - only use bigger
1408	 * page orders on machines with more than 32MB of memory.
1409	 */
1410	if (num_physpages > (32 << 20) >> PAGE_SHIFT)
1411		slab_break_gfp_order = BREAK_GFP_ORDER_HI;
1412
1413	/* Bootstrap is tricky, because several objects are allocated
1414	 * from caches that do not exist yet:
1415	 * 1) initialize the cache_cache cache: it contains the struct
1416	 *    kmem_cache structures of all caches, except cache_cache itself:
1417	 *    cache_cache is statically allocated.
1418	 *    Initially an __init data area is used for the head array and the
1419	 *    kmem_list3 structures; these are replaced with kmalloc allocated
1420	 *    memory at the end of the bootstrap.
1421	 * 2) Create the first kmalloc cache.
1422	 *    The struct kmem_cache for the new cache is allocated normally.
1423	 *    An __init data area is used for the head array.
1424	 * 3) Create the remaining kmalloc caches, with minimally sized
1425	 *    head arrays.
1426	 * 4) Replace the __init data head arrays for cache_cache and the first
1427	 *    kmalloc cache with kmalloc allocated arrays.
1428	 * 5) Replace the __init data for kmem_list3 for cache_cache and
1429	 *    the other caches with kmalloc allocated memory.
1430	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1431	 */
1432
1433	node = numa_node_id();
1434
1435	/* 1) create the cache_cache */
1436	INIT_LIST_HEAD(&cache_chain);
1437	list_add(&cache_cache.next, &cache_chain);
1438	cache_cache.colour_off = cache_line_size();
1439	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1440	cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE];
1441
1442	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1443					cache_line_size());
1444	cache_cache.reciprocal_buffer_size =
1445		reciprocal_value(cache_cache.buffer_size);
1446
1447	for (order = 0; order < MAX_ORDER; order++) {
1448		cache_estimate(order, cache_cache.buffer_size,
1449			cache_line_size(), 0, &left_over, &cache_cache.num);
1450		if (cache_cache.num)
1451			break;
1452	}
1453	BUG_ON(!cache_cache.num);
1454	cache_cache.gfporder = order;
1455	cache_cache.colour = left_over / cache_cache.colour_off;
1456	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
1457				      sizeof(struct slab), cache_line_size());
1458
1459	/* 2+3) create the kmalloc caches */
1460	sizes = malloc_sizes;
1461	names = cache_names;
1462
1463	/*
1464	 * Initialize the caches that provide memory for the array cache and the
1465	 * kmem_list3 structures first.  Without this, further allocations will
1466	 * bug.
1467	 */
1468
1469	sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1470					sizes[INDEX_AC].cs_size,
1471					ARCH_KMALLOC_MINALIGN,
1472					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1473					NULL, NULL);
1474
1475	if (INDEX_AC != INDEX_L3) {
1476		sizes[INDEX_L3].cs_cachep =
1477			kmem_cache_create(names[INDEX_L3].name,
1478				sizes[INDEX_L3].cs_size,
1479				ARCH_KMALLOC_MINALIGN,
1480				ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1481				NULL, NULL);
1482	}
1483
1484	slab_early_init = 0;
1485
1486	while (sizes->cs_size != ULONG_MAX) {
1487		/*
1488		 * For performance, all the general caches are L1 aligned.
1489		 * This should be particularly beneficial on SMP boxes, as it
1490		 * eliminates "false sharing".
1491		 * Note for systems short on memory removing the alignment will
1492		 * allow tighter packing of the smaller caches.
1493		 */
1494		if (!sizes->cs_cachep) {
1495			sizes->cs_cachep = kmem_cache_create(names->name,
1496					sizes->cs_size,
1497					ARCH_KMALLOC_MINALIGN,
1498					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1499					NULL, NULL);
1500		}
1501#ifdef CONFIG_ZONE_DMA
1502		sizes->cs_dmacachep = kmem_cache_create(
1503					names->name_dma,
1504					sizes->cs_size,
1505					ARCH_KMALLOC_MINALIGN,
1506					ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
1507						SLAB_PANIC,
1508					NULL, NULL);
1509#endif
1510		sizes++;
1511		names++;
1512	}
1513	/* 4) Replace the bootstrap head arrays */
1514	{
1515		struct array_cache *ptr;
1516
1517		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1518
1519		local_irq_disable();
1520		BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
1521		memcpy(ptr, cpu_cache_get(&cache_cache),
1522		       sizeof(struct arraycache_init));
1523		/*
1524		 * Do not assume that spinlocks can be initialized via memcpy:
1525		 */
1526		spin_lock_init(&ptr->lock);
1527
1528		cache_cache.array[smp_processor_id()] = ptr;
1529		local_irq_enable();
1530
1531		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1532
1533		local_irq_disable();
1534		BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
1535		       != &initarray_generic.cache);
1536		memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
1537		       sizeof(struct arraycache_init));
1538		/*
1539		 * Do not assume that spinlocks can be initialized via memcpy:
1540		 */
1541		spin_lock_init(&ptr->lock);
1542
1543		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1544		    ptr;
1545		local_irq_enable();
1546	}
1547	/* 5) Replace the bootstrap kmem_list3's */
1548	{
1549		int nid;
1550
1551		/* Replace the static kmem_list3 structures for the boot cpu */
1552		init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node);
1553
1554		for_each_online_node(nid) {
1555			init_list(malloc_sizes[INDEX_AC].cs_cachep,
1556				  &initkmem_list3[SIZE_AC + nid], nid);
1557
1558			if (INDEX_AC != INDEX_L3) {
1559				init_list(malloc_sizes[INDEX_L3].cs_cachep,
1560					  &initkmem_list3[SIZE_L3 + nid], nid);
1561			}
1562		}
1563	}
1564
1565	/* 6) resize the head arrays to their final sizes */
1566	{
1567		struct kmem_cache *cachep;
1568		mutex_lock(&cache_chain_mutex);
1569		list_for_each_entry(cachep, &cache_chain, next)
1570			if (enable_cpucache(cachep))
1571				BUG();
1572		mutex_unlock(&cache_chain_mutex);
1573	}
1574
1575	/* Annotate slab for lockdep -- annotate the malloc caches */
1576	init_lock_keys();
1577
1578
1579	/* Done! */
1580	g_cpucache_up = FULL;
1581
1582	/*
1583	 * Register a cpu startup notifier callback that initializes
1584	 * cpu_cache_get for all new cpus
1585	 */
1586	register_cpu_notifier(&cpucache_notifier);
1587
1588	/*
1589	 * The reap timers are started later, with a module init call: That part
1590	 * of the kernel is not yet operational.
1591	 */
1592}
1593
1594static int __init cpucache_init(void)
1595{
1596	int cpu;
1597
1598	/*
1599	 * Register the timers that return unneeded pages to the page allocator
1600	 */
1601	for_each_online_cpu(cpu)
1602		start_cpu_timer(cpu);
1603	return 0;
1604}
1605__initcall(cpucache_init);
1606
1607/*
1608 * Interface to system's page allocator. No need to hold the cache-lock.
1609 *
1610 * If we requested dmaable memory, we will get it. Even if we
1611 * did not request dmaable memory, we might get it, but that
1612 * would be relatively rare and ignorable.
1613 */
1614static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1615{
1616	struct page *page;
1617	int nr_pages;
1618	int i;
1619
1620#ifndef CONFIG_MMU
1621	/*
1622	 * Nommu uses slabs for process anonymous memory allocations, and thus
1623	 * requires __GFP_COMP to properly refcount higher order allocations
1624	 */
1625	flags |= __GFP_COMP;
1626#endif
1627
1628	flags |= cachep->gfpflags;
1629
1630	page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1631	if (!page)
1632		return NULL;
1633
1634	nr_pages = (1 << cachep->gfporder);
1635	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1636		add_zone_page_state(page_zone(page),
1637			NR_SLAB_RECLAIMABLE, nr_pages);
1638	else
1639		add_zone_page_state(page_zone(page),
1640			NR_SLAB_UNRECLAIMABLE, nr_pages);
1641	for (i = 0; i < nr_pages; i++)
1642		__SetPageSlab(page + i);
1643	return page_address(page);
1644}
1645
1646/*
1647 * Interface to system's page release.
1648 */
1649static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1650{
1651	unsigned long i = (1 << cachep->gfporder);
1652	struct page *page = virt_to_page(addr);
1653	const unsigned long nr_freed = i;
1654
1655	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1656		sub_zone_page_state(page_zone(page),
1657				NR_SLAB_RECLAIMABLE, nr_freed);
1658	else
1659		sub_zone_page_state(page_zone(page),
1660				NR_SLAB_UNRECLAIMABLE, nr_freed);
1661	while (i--) {
1662		BUG_ON(!PageSlab(page));
1663		__ClearPageSlab(page);
1664		page++;
1665	}
1666	if (current->reclaim_state)
1667		current->reclaim_state->reclaimed_slab += nr_freed;
1668	free_pages((unsigned long)addr, cachep->gfporder);
1669}
1670
1671static void kmem_rcu_free(struct rcu_head *head)
1672{
1673	struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1674	struct kmem_cache *cachep = slab_rcu->cachep;
1675
1676	kmem_freepages(cachep, slab_rcu->addr);
1677	if (OFF_SLAB(cachep))
1678		kmem_cache_free(cachep->slabp_cache, slab_rcu);
1679}
1680
1681#if DEBUG
1682
1683#ifdef CONFIG_DEBUG_PAGEALLOC
1684static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1685			    unsigned long caller)
1686{
1687	int size = obj_size(cachep);
1688
1689	addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
1690
1691	if (size < 5 * sizeof(unsigned long))
1692		return;
1693
1694	*addr++ = 0x12345678;
1695	*addr++ = caller;
1696	*addr++ = smp_processor_id();
1697	size -= 3 * sizeof(unsigned long);
1698	{
1699		unsigned long *sptr = &caller;
1700		unsigned long svalue;
1701
1702		while (!kstack_end(sptr)) {
1703			svalue = *sptr++;
1704			if (kernel_text_address(svalue)) {
1705				*addr++ = svalue;
1706				size -= sizeof(unsigned long);
1707				if (size <= sizeof(unsigned long))
1708					break;
1709			}
1710		}
1711
1712	}
1713	*addr++ = 0x87654321;
1714}
1715#endif
1716
1717static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1718{
1719	int size = obj_size(cachep);
1720	addr = &((char *)addr)[obj_offset(cachep)];
1721
1722	memset(addr, val, size);
1723	*(unsigned char *)(addr + size - 1) = POISON_END;
1724}
1725
1726static void dump_line(char *data, int offset, int limit)
1727{
1728	int i;
1729	unsigned char error = 0;
1730	int bad_count = 0;
1731
1732	printk(KERN_ERR "%03x:", offset);
1733	for (i = 0; i < limit; i++) {
1734		if (data[offset + i] != POISON_FREE) {
1735			error = data[offset + i];
1736			bad_count++;
1737		}
1738		printk(" %02x", (unsigned char)data[offset + i]);
1739	}
1740	printk("\n");
1741
1742	if (bad_count == 1) {
1743		error ^= POISON_FREE;
1744		if (!(error & (error - 1))) {
1745			printk(KERN_ERR "Single bit error detected. Probably "
1746					"bad RAM.\n");
1747#ifdef CONFIG_X86
1748			printk(KERN_ERR "Run memtest86+ or a similar memory "
1749					"test tool.\n");
1750#else
1751			printk(KERN_ERR "Run a memory test tool.\n");
1752#endif
1753		}
1754	}
1755}
1756#endif
1757
1758#if DEBUG
1759
1760static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1761{
1762	int i, size;
1763	char *realobj;
1764
1765	if (cachep->flags & SLAB_RED_ZONE) {
1766		printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
1767			*dbg_redzone1(cachep, objp),
1768			*dbg_redzone2(cachep, objp));
1769	}
1770
1771	if (cachep->flags & SLAB_STORE_USER) {
1772		printk(KERN_ERR "Last user: [<%p>]",
1773			*dbg_userword(cachep, objp));
1774		print_symbol("(%s)",
1775				(unsigned long)*dbg_userword(cachep, objp));
1776		printk("\n");
1777	}
1778	realobj = (char *)objp + obj_offset(cachep);
1779	size = obj_size(cachep);
1780	for (i = 0; i < size && lines; i += 16, lines--) {
1781		int limit;
1782		limit = 16;
1783		if (i + limit > size)
1784			limit = size - i;
1785		dump_line(realobj, i, limit);
1786	}
1787}
1788
1789static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1790{
1791	char *realobj;
1792	int size, i;
1793	int lines = 0;
1794
1795	realobj = (char *)objp + obj_offset(cachep);
1796	size = obj_size(cachep);
1797
1798	for (i = 0; i < size; i++) {
1799		char exp = POISON_FREE;
1800		if (i == size - 1)
1801			exp = POISON_END;
1802		if (realobj[i] != exp) {
1803			int limit;
1804			/* Mismatch ! */
1805			/* Print header */
1806			if (lines == 0) {
1807				printk(KERN_ERR
1808					"Slab corruption: %s start=%p, len=%d\n",
1809					cachep->name, realobj, size);
1810				print_objinfo(cachep, objp, 0);
1811			}
1812			/* Hexdump the affected line */
1813			i = (i / 16) * 16;
1814			limit = 16;
1815			if (i + limit > size)
1816				limit = size - i;
1817			dump_line(realobj, i, limit);
1818			i += 16;
1819			lines++;
1820			/* Limit to 5 lines */
1821			if (lines > 5)
1822				break;
1823		}
1824	}
1825	if (lines != 0) {
1826		/* Print some data about the neighboring objects, if they
1827		 * exist:
1828		 */
1829		struct slab *slabp = virt_to_slab(objp);
1830		unsigned int objnr;
1831
1832		objnr = obj_to_index(cachep, slabp, objp);
1833		if (objnr) {
1834			objp = index_to_obj(cachep, slabp, objnr - 1);
1835			realobj = (char *)objp + obj_offset(cachep);
1836			printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1837			       realobj, size);
1838			print_objinfo(cachep, objp, 2);
1839		}
1840		if (objnr + 1 < cachep->num) {
1841			objp = index_to_obj(cachep, slabp, objnr + 1);
1842			realobj = (char *)objp + obj_offset(cachep);
1843			printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1844			       realobj, size);
1845			print_objinfo(cachep, objp, 2);
1846		}
1847	}
1848}
1849#endif
1850
1851#if DEBUG
1852/**
1853 * slab_destroy_objs - destroy the objects in a slab
1854 * @cachep: cache the slab belongs to
1855 * @slabp: slab pointer being destroyed
1856 *
1857 * Call the registered destructor for each object in a slab that is being
1858 * destroyed.
1859 */
1860static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1861{
1862	int i;
1863	for (i = 0; i < cachep->num; i++) {
1864		void *objp = index_to_obj(cachep, slabp, i);
1865
1866		if (cachep->flags & SLAB_POISON) {
1867#ifdef CONFIG_DEBUG_PAGEALLOC
1868			if (cachep->buffer_size % PAGE_SIZE == 0 &&
1869					OFF_SLAB(cachep))
1870				kernel_map_pages(virt_to_page(objp),
1871					cachep->buffer_size / PAGE_SIZE, 1);
1872			else
1873				check_poison_obj(cachep, objp);
1874#else
1875			check_poison_obj(cachep, objp);
1876#endif
1877		}
1878		if (cachep->flags & SLAB_RED_ZONE) {
1879			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1880				slab_error(cachep, "start of a freed object "
1881					   "was overwritten");
1882			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1883				slab_error(cachep, "end of a freed object "
1884					   "was overwritten");
1885		}
1886		if (cachep->dtor && !(cachep->flags & SLAB_POISON))
1887			(cachep->dtor) (objp + obj_offset(cachep), cachep, 0);
1888	}
1889}
1890#else
1891static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1892{
1893	if (cachep->dtor) {
1894		int i;
1895		for (i = 0; i < cachep->num; i++) {
1896			void *objp = index_to_obj(cachep, slabp, i);
1897			(cachep->dtor) (objp, cachep, 0);
1898		}
1899	}
1900}
1901#endif
1902
1903/**
1904 * slab_destroy - destroy and release all objects in a slab
1905 * @cachep: cache the slab belongs to
1906 * @slabp: slab pointer being destroyed
1907 *
1908 * Destroy all the objs in a slab, and release the mem back to the system.
1909 * Before calling, the slab must have been unlinked from the cache.  The
1910 * cache-lock is not held/needed.
1911 */
1912static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1913{
1914	void *addr = slabp->s_mem - slabp->colouroff;
1915
1916	slab_destroy_objs(cachep, slabp);
1917	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1918		struct slab_rcu *slab_rcu;
1919
1920		slab_rcu = (struct slab_rcu *)slabp;
1921		slab_rcu->cachep = cachep;
1922		slab_rcu->addr = addr;
1923		call_rcu(&slab_rcu->head, kmem_rcu_free);
1924	} else {
1925		kmem_freepages(cachep, addr);
1926		if (OFF_SLAB(cachep))
1927			kmem_cache_free(cachep->slabp_cache, slabp);
1928	}
1929}
1930
1931/*
1932 * For setting up all the kmem_list3s for a cache whose buffer_size is the
1933 * same as the size of kmem_list3.
1934 */
1935static void set_up_list3s(struct kmem_cache *cachep, int index)
1936{
1937	int node;
1938
1939	for_each_online_node(node) {
1940		cachep->nodelists[node] = &initkmem_list3[index + node];
1941		cachep->nodelists[node]->next_reap = jiffies +
1942		    REAPTIMEOUT_LIST3 +
1943		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1944	}
1945}
1946
1947static void __kmem_cache_destroy(struct kmem_cache *cachep)
1948{
1949	int i;
1950	struct kmem_list3 *l3;
1951
1952	for_each_online_cpu(i)
1953	    kfree(cachep->array[i]);
1954
1955	/* NUMA: free the list3 structures */
1956	for_each_online_node(i) {
1957		l3 = cachep->nodelists[i];
1958		if (l3) {
1959			kfree(l3->shared);
1960			free_alien_cache(l3->alien);
1961			kfree(l3);
1962		}
1963	}
1964	kmem_cache_free(&cache_cache, cachep);
1965}
1966
1967
1968/**
1969 * calculate_slab_order - calculate size (page order) of slabs
1970 * @cachep: pointer to the cache that is being created
1971 * @size: size of objects to be created in this cache.
1972 * @align: required alignment for the objects.
1973 * @flags: slab allocation flags
1974 *
1975 * Also calculates the number of objects per slab.
1976 *
1977 * This could be made much more intelligent.  For now, try to avoid using
1978 * high order pages for slabs.  When the gfp() functions are more friendly
1979 * towards high-order requests, this should be changed.
1980 */
1981static size_t calculate_slab_order(struct kmem_cache *cachep,
1982			size_t size, size_t align, unsigned long flags)
1983{
1984	unsigned long offslab_limit;
1985	size_t left_over = 0;
1986	int gfporder;
1987
1988	for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) {
1989		unsigned int num;
1990		size_t remainder;
1991
1992		cache_estimate(gfporder, size, align, flags, &remainder, &num);
1993		if (!num)
1994			continue;
1995
1996		if (flags & CFLGS_OFF_SLAB) {
1997			/*
1998			 * Max number of objs-per-slab for caches which
1999			 * use off-slab slabs. Needed to avoid a possible
2000			 * looping condition in cache_grow().
2001			 */
2002			offslab_limit = size - sizeof(struct slab);
2003			offslab_limit /= sizeof(kmem_bufctl_t);
2004
2005 			if (num > offslab_limit)
2006				break;
2007		}
2008
2009		/* Found something acceptable - save it away */
2010		cachep->num = num;
2011		cachep->gfporder = gfporder;
2012		left_over = remainder;
2013
2014		/*
2015		 * A VFS-reclaimable slab tends to have most allocations
2016		 * as GFP_NOFS and we really don't want to have to be allocating
2017		 * higher-order pages when we are unable to shrink dcache.
2018		 */
2019		if (flags & SLAB_RECLAIM_ACCOUNT)
2020			break;
2021
2022		/*
2023		 * A large number of objects is good, but very large slabs are
2024		 * currently bad for the gfp()s.
2025		 */
2026		if (gfporder >= slab_break_gfp_order)
2027			break;
2028
2029		/*
2030		 * Acceptable internal fragmentation?
2031		 */
2032		if (left_over * 8 <= (PAGE_SIZE << gfporder))
2033			break;
2034	}
2035	return left_over;
2036}
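
/*
 * Rough worked example, assuming 4KB pages, slab_break_gfp_order == 1 and an
 * aligned object size of 1500 bytes (large enough to use an off-slab
 * management structure, so the whole slab is available for objects): at
 * order 0 two objects fit with 1096 bytes left over, and 1096 * 8 > 4096
 * fails the fragmentation check above, so the loop tries order 1, where five
 * objects fit with 692 bytes left over; 692 * 8 <= 8192 is acceptable, so an
 * order-1 slab with five objects is chosen.
 */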
2037
2038static int setup_cpu_cache(struct kmem_cache *cachep)
2039{
2040	if (g_cpucache_up == FULL)
2041		return enable_cpucache(cachep);
2042
2043	if (g_cpucache_up == NONE) {
2044		/*
2045		 * Note: the first kmem_cache_create must create the cache
2046		 * that's used by kmalloc(24), otherwise the creation of
2047		 * further caches will BUG().
2048		 */
2049		cachep->array[smp_processor_id()] = &initarray_generic.cache;
2050
2051		/*
2052		 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
2053		 * the first cache, then we need to set up all its list3s,
2054		 * otherwise the creation of further caches will BUG().
2055		 */
2056		set_up_list3s(cachep, SIZE_AC);
2057		if (INDEX_AC == INDEX_L3)
2058			g_cpucache_up = PARTIAL_L3;
2059		else
2060			g_cpucache_up = PARTIAL_AC;
2061	} else {
2062		cachep->array[smp_processor_id()] =
2063			kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
2064
2065		if (g_cpucache_up == PARTIAL_AC) {
2066			set_up_list3s(cachep, SIZE_L3);
2067			g_cpucache_up = PARTIAL_L3;
2068		} else {
2069			int node;
2070			for_each_online_node(node) {
2071				cachep->nodelists[node] =
2072				    kmalloc_node(sizeof(struct kmem_list3),
2073						GFP_KERNEL, node);
2074				BUG_ON(!cachep->nodelists[node]);
2075				kmem_list3_init(cachep->nodelists[node]);
2076			}
2077		}
2078	}
2079	cachep->nodelists[numa_node_id()]->next_reap =
2080			jiffies + REAPTIMEOUT_LIST3 +
2081			((unsigned long)cachep) % REAPTIMEOUT_LIST3;
2082
2083	cpu_cache_get(cachep)->avail = 0;
2084	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
2085	cpu_cache_get(cachep)->batchcount = 1;
2086	cpu_cache_get(cachep)->touched = 0;
2087	cachep->batchcount = 1;
2088	cachep->limit = BOOT_CPUCACHE_ENTRIES;
2089	return 0;
2090}
2091
2092/**
2093 * kmem_cache_create - Create a cache.
2094 * @name: A string which is used in /proc/slabinfo to identify this cache.
2095 * @size: The size of objects to be created in this cache.
2096 * @align: The required alignment for the objects.
2097 * @flags: SLAB flags
2098 * @ctor: A constructor for the objects.
2099 * @dtor: A destructor for the objects.
2100 *
2101 * Returns a ptr to the cache on success, NULL on failure.
2102 * Cannot be called within an interrupt, but can be interrupted.
2103 * The @ctor is run when new pages are allocated by the cache
2104 * and the @dtor is run before the pages are handed back.
2105 *
2106 * @name must be valid until the cache is destroyed. This implies that
2107 * the module calling this has to destroy the cache before getting unloaded.
2108 *
2109 * The flags are
2110 *
2111 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
2112 * to catch references to uninitialised memory.
2113 *
2114 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
2115 * for buffer overruns.
2116 *
2117 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
2118 * cacheline.  This can be beneficial if you're counting cycles as closely
2119 * as davem.
2120 */
2121struct kmem_cache *
2122kmem_cache_create (const char *name, size_t size, size_t align,
2123	unsigned long flags,
2124	void (*ctor)(void*, struct kmem_cache *, unsigned long),
2125	void (*dtor)(void*, struct kmem_cache *, unsigned long))
2126{
2127	size_t left_over, slab_size, ralign;
2128	struct kmem_cache *cachep = NULL, *pc;
2129
2130	/*
2131	 * Sanity checks... these are all serious usage bugs.
2132	 */
2133	if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
2134	    (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
2135		printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
2136				name);
2137		BUG();
2138	}
2139
2140	/*
2141	 * We use cache_chain_mutex to ensure a consistent view of
2142	 * cpu_online_map as well.  Please see cpuup_callback
2143	 */
2144	mutex_lock(&cache_chain_mutex);
2145
2146	list_for_each_entry(pc, &cache_chain, next) {
2147		char tmp;
2148		int res;
2149
2150		/*
2151		 * This happens when the module gets unloaded and doesn't
2152		 * destroy its slab cache and no-one else reuses the vmalloc
2153		 * area of the module.  Print a warning.
2154		 */
2155		res = probe_kernel_address(pc->name, tmp);
2156		if (res) {
2157			printk("SLAB: cache with size %d has lost its name\n",
2158			       pc->buffer_size);
2159			continue;
2160		}
2161
2162		if (!strcmp(pc->name, name)) {
2163			printk("kmem_cache_create: duplicate cache %s\n", name);
2164			dump_stack();
2165			goto oops;
2166		}
2167	}
2168
2169#if DEBUG
2170	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
2171	if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
2172		/* No constructor, but initial state check requested */
2173		printk(KERN_ERR "%s: No con, but init state check "
2174		       "requested - %s\n", __FUNCTION__, name);
2175		flags &= ~SLAB_DEBUG_INITIAL;
2176	}
2177#if FORCED_DEBUG
2178	/*
2179	 * Enable redzoning and last user accounting, except for caches with
2180	 * large objects, if the increased size would increase the object size
2181	 * above the next power of two: caches with object sizes just above a
2182	 * power of two have a significant amount of internal fragmentation.
2183	 */
2184	if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD))
2185		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
2186	if (!(flags & SLAB_DESTROY_BY_RCU))
2187		flags |= SLAB_POISON;
2188#endif
2189	if (flags & SLAB_DESTROY_BY_RCU)
2190		BUG_ON(flags & SLAB_POISON);
2191#endif
2192	if (flags & SLAB_DESTROY_BY_RCU)
2193		BUG_ON(dtor);
2194
2195	/*
2196	 * Always check flags: a caller might be expecting debug support which
2197	 * isn't available.
2198	 */
2199	BUG_ON(flags & ~CREATE_MASK);
2200
2201	/*
2202	 * Check that size is in terms of words.  This is needed to avoid
2203	 * unaligned accesses for some archs when redzoning is used, and makes
2204	 * sure any on-slab bufctl's are also correctly aligned.
2205	 */
2206	if (size & (BYTES_PER_WORD - 1)) {
2207		size += (BYTES_PER_WORD - 1);
2208		size &= ~(BYTES_PER_WORD - 1);
2209	}
2210
2211	/* calculate the final buffer alignment: */
2212
2213	/* 1) arch recommendation: can be overridden for debug */
2214	if (flags & SLAB_HWCACHE_ALIGN) {
2215		/*
2216		 * Default alignment: as specified by the arch code.  Except if
2217		 * an object is really small, then squeeze multiple objects into
2218		 * one cacheline.
2219		 */
2220		ralign = cache_line_size();
2221		while (size <= ralign / 2)
2222			ralign /= 2;
2223	} else {
2224		ralign = BYTES_PER_WORD;
2225	}
2226
2227	/*
2228	 * Redzoning and user store require word alignment. Note this will be
2229	 * overridden by architecture or caller mandated alignment if either
2230	 * is greater than BYTES_PER_WORD.
2231	 */
2232	if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER)
2233		ralign = BYTES_PER_WORD;
2234
2235	/* 2) arch mandated alignment */
2236	if (ralign < ARCH_SLAB_MINALIGN) {
2237		ralign = ARCH_SLAB_MINALIGN;
2238	}
2239	/* 3) caller mandated alignment */
2240	if (ralign < align) {
2241		ralign = align;
2242	}
2243	/* disable debug if necessary */
2244	if (ralign > BYTES_PER_WORD)
2245		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2246	/*
2247	 * 4) Store it.
2248	 */
2249	align = ralign;
2250
2251	/* Get cache's description obj. */
2252	cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL);
2253	if (!cachep)
2254		goto oops;
2255
2256#if DEBUG
2257	cachep->obj_size = size;
2258
2259	/*
2260	 * Both debugging options require word-alignment which is calculated
2261	 * into align above.
2262	 */
2263	if (flags & SLAB_RED_ZONE) {
2264		/* add space for red zone words */
2265		cachep->obj_offset += BYTES_PER_WORD;
2266		size += 2 * BYTES_PER_WORD;
2267	}
2268	if (flags & SLAB_STORE_USER) {
2269		/* user store requires one word storage behind the end of
2270		 * the real object.
2271		 */
2272		size += BYTES_PER_WORD;
2273	}
2274#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2275	if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
2276	    && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
2277		cachep->obj_offset += PAGE_SIZE - size;
2278		size = PAGE_SIZE;
2279	}
2280#endif
2281#endif
2282
2283	/*
2284	 * Determine if the slab management is 'on' or 'off' slab.
2285	 * (bootstrapping cannot cope with offslab caches so don't do
2286	 * it too early on.)
2287	 */
2288	if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
2289		/*
2290		 * Size is large, assume best to place the slab management obj
2291		 * off-slab (should allow better packing of objs).
2292		 */
2293		flags |= CFLGS_OFF_SLAB;
2294
2295	size = ALIGN(size, align);
2296
2297	left_over = calculate_slab_order(cachep, size, align, flags);
2298
2299	if (!cachep->num) {
2300		printk("kmem_cache_create: couldn't create cache %s.\n", name);
2301		kmem_cache_free(&cache_cache, cachep);
2302		cachep = NULL;
2303		goto oops;
2304	}
2305	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
2306			  + sizeof(struct slab), align);
2307
2308	/*
2309	 * If the slab has been placed off-slab, and we have enough space then
2310	 * move it on-slab. This is at the expense of any extra colouring.
2311	 */
2312	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
2313		flags &= ~CFLGS_OFF_SLAB;
2314		left_over -= slab_size;
2315	}
2316
2317	if (flags & CFLGS_OFF_SLAB) {
2318		/* really off slab. No need for manual alignment */
2319		slab_size =
2320		    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
2321	}
2322
2323	cachep->colour_off = cache_line_size();
2324	/* Offset must be a multiple of the alignment. */
2325	if (cachep->colour_off < align)
2326		cachep->colour_off = align;
2327	cachep->colour = left_over / cachep->colour_off;
2328	cachep->slab_size = slab_size;
2329	cachep->flags = flags;
2330	cachep->gfpflags = 0;
2331	if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
2332		cachep->gfpflags |= GFP_DMA;
2333	cachep->buffer_size = size;
2334	cachep->reciprocal_buffer_size = reciprocal_value(size);
2335
2336	if (flags & CFLGS_OFF_SLAB) {
2337		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
2338		/*
2339		 * This is a possibility for one of the malloc_sizes caches.
2340		 * But since we go off slab only for object size greater than
2341		 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
2342		 * this should not happen at all.
2343		 * But leave a BUG_ON for some lucky dude.
2344		 */
2345		BUG_ON(!cachep->slabp_cache);
2346	}
2347	cachep->ctor = ctor;
2348	cachep->dtor = dtor;
2349	cachep->name = name;
2350
2351	if (setup_cpu_cache(cachep)) {
2352		__kmem_cache_destroy(cachep);
2353		cachep = NULL;
2354		goto oops;
2355	}
2356
2357	/* cache setup completed, link it into the list */
2358	list_add(&cachep->next, &cache_chain);
2359oops:
2360	if (!cachep && (flags & SLAB_PANIC))
2361		panic("kmem_cache_create(): failed to create slab `%s'\n",
2362		      name);
2363	mutex_unlock(&cache_chain_mutex);
2364	return cachep;
2365}
2366EXPORT_SYMBOL(kmem_cache_create);
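
/*
 * Example: a minimal sketch of a module-owned cache.  The names struct foo,
 * foo_cache and foo_ctor are hypothetical; only the kmem_cache_create() call
 * itself reflects the interface documented above.
 *
 *	struct foo {
 *		int refcnt;
 *	};
 *
 *	static struct kmem_cache *foo_cache;
 *
 *	static void foo_ctor(void *obj, struct kmem_cache *cachep,
 *			     unsigned long flags)
 *	{
 *		struct foo *f = obj;
 *
 *		if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
 *		    SLAB_CTOR_CONSTRUCTOR)
 *			f->refcnt = 0;
 *	}
 *
 *	static int __init foo_init(void)
 *	{
 *		foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
 *					      0, SLAB_HWCACHE_ALIGN,
 *					      foo_ctor, NULL);
 *		return foo_cache ? 0 : -ENOMEM;
 *	}
 */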
2367
2368#if DEBUG
2369static void check_irq_off(void)
2370{
2371	BUG_ON(!irqs_disabled());
2372}
2373
2374static void check_irq_on(void)
2375{
2376	BUG_ON(irqs_disabled());
2377}
2378
2379static void check_spinlock_acquired(struct kmem_cache *cachep)
2380{
2381#ifdef CONFIG_SMP
2382	check_irq_off();
2383	assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
2384#endif
2385}
2386
2387static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
2388{
2389#ifdef CONFIG_SMP
2390	check_irq_off();
2391	assert_spin_locked(&cachep->nodelists[node]->list_lock);
2392#endif
2393}
2394
2395#else
2396#define check_irq_off()	do { } while(0)
2397#define check_irq_on()	do { } while(0)
2398#define check_spinlock_acquired(x) do { } while(0)
2399#define check_spinlock_acquired_node(x, y) do { } while(0)
2400#endif
2401
2402static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
2403			struct array_cache *ac,
2404			int force, int node);
2405
2406static void do_drain(void *arg)
2407{
2408	struct kmem_cache *cachep = arg;
2409	struct array_cache *ac;
2410	int node = numa_node_id();
2411
2412	check_irq_off();
2413	ac = cpu_cache_get(cachep);
2414	spin_lock(&cachep->nodelists[node]->list_lock);
2415	free_block(cachep, ac->entry, ac->avail, node);
2416	spin_unlock(&cachep->nodelists[node]->list_lock);
2417	ac->avail = 0;
2418}
2419
2420static void drain_cpu_caches(struct kmem_cache *cachep)
2421{
2422	struct kmem_list3 *l3;
2423	int node;
2424
2425	on_each_cpu(do_drain, cachep, 1, 1);
2426	check_irq_on();
2427	for_each_online_node(node) {
2428		l3 = cachep->nodelists[node];
2429		if (l3 && l3->alien)
2430			drain_alien_cache(cachep, l3->alien);
2431	}
2432
2433	for_each_online_node(node) {
2434		l3 = cachep->nodelists[node];
2435		if (l3)
2436			drain_array(cachep, l3, l3->shared, 1, node);
2437	}
2438}
2439
2440/*
2441 * Remove slabs from the list of free slabs.
2442 * Specify the number of slabs to drain in tofree.
2443 *
2444 * Returns the actual number of slabs released.
2445 */
2446static int drain_freelist(struct kmem_cache *cache,
2447			struct kmem_list3 *l3, int tofree)
2448{
2449	struct list_head *p;
2450	int nr_freed;
2451	struct slab *slabp;
2452
2453	nr_freed = 0;
2454	while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {
2455
2456		spin_lock_irq(&l3->list_lock);
2457		p = l3->slabs_free.prev;
2458		if (p == &l3->slabs_free) {
2459			spin_unlock_irq(&l3->list_lock);
2460			goto out;
2461		}
2462
2463		slabp = list_entry(p, struct slab, list);
2464#if DEBUG
2465		BUG_ON(slabp->inuse);
2466#endif
2467		list_del(&slabp->list);
2468		/*
2469		 * Safe to drop the lock. The slab is no longer linked
2470		 * to the cache.
2471		 */
2472		l3->free_objects -= cache->num;
2473		spin_unlock_irq(&l3->list_lock);
2474		slab_destroy(cache, slabp);
2475		nr_freed++;
2476	}
2477out:
2478	return nr_freed;
2479}
2480
2481/* Called with cache_chain_mutex held to protect against cpu hotplug */
2482static int __cache_shrink(struct kmem_cache *cachep)
2483{
2484	int ret = 0, i = 0;
2485	struct kmem_list3 *l3;
2486
2487	drain_cpu_caches(cachep);
2488
2489	check_irq_on();
2490	for_each_online_node(i) {
2491		l3 = cachep->nodelists[i];
2492		if (!l3)
2493			continue;
2494
2495		drain_freelist(cachep, l3, l3->free_objects);
2496
2497		ret += !list_empty(&l3->slabs_full) ||
2498			!list_empty(&l3->slabs_partial);
2499	}
2500	return (ret ? 1 : 0);
2501}
2502
2503/**
2504 * kmem_cache_shrink - Shrink a cache.
2505 * @cachep: The cache to shrink.
2506 *
2507 * Releases as many slabs as possible for a cache.
2508 * To help debugging, a zero exit status indicates all slabs were released.
2509 */
2510int kmem_cache_shrink(struct kmem_cache *cachep)
2511{
2512	int ret;
2513	BUG_ON(!cachep || in_interrupt());
2514
2515	mutex_lock(&cache_chain_mutex);
2516	ret = __cache_shrink(cachep);
2517	mutex_unlock(&cache_chain_mutex);
2518	return ret;
2519}
2520EXPORT_SYMBOL(kmem_cache_shrink);
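
/*
 * Example: opportunistically trimming the hypothetical foo_cache sketched
 * after kmem_cache_create() above, e.g. under memory pressure.  A non-zero
 * return only means some slabs are still in use; it is not an error.
 *
 *	kmem_cache_shrink(foo_cache);
 */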
2521
2522/**
2523 * kmem_cache_destroy - delete a cache
2524 * @cachep: the cache to destroy
2525 *
2526 * Remove a &struct kmem_cache object from the slab cache.
2527 *
2528 * It is expected this function will be called by a module when it is
2529 * unloaded.  This will remove the cache completely, and avoid a duplicate
2530 * cache being allocated each time a module is loaded and unloaded, if the
2531 * module doesn't have persistent in-kernel storage across loads and unloads.
2532 *
2533 * The cache must be empty before calling this function.
2534 *
2535 * The caller must guarantee that no one will allocate memory from the cache
2536 * during the kmem_cache_destroy().
2537 */
2538void kmem_cache_destroy(struct kmem_cache *cachep)
2539{
2540	BUG_ON(!cachep || in_interrupt());
2541
2542	/* Find the cache in the chain of caches. */
2543	mutex_lock(&cache_chain_mutex);
2544	/*
2545	 * the chain is never empty, cache_cache is never destroyed
2546	 */
2547	list_del(&cachep->next);
2548	if (__cache_shrink(cachep)) {
2549		slab_error(cachep, "Can't free all objects");
2550		list_add(&cachep->next, &cache_chain);
2551		mutex_unlock(&cache_chain_mutex);
2552		return;
2553	}
2554
2555	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2556		synchronize_rcu();
2557
2558	__kmem_cache_destroy(cachep);
2559	mutex_unlock(&cache_chain_mutex);
2560}
2561EXPORT_SYMBOL(kmem_cache_destroy);
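
/*
 * Example: the matching teardown for a module-owned cache such as the
 * hypothetical foo_cache sketched after kmem_cache_create() above.  Every
 * object must already have been returned with kmem_cache_free().
 *
 *	static void __exit foo_exit(void)
 *	{
 *		kmem_cache_destroy(foo_cache);
 *	}
 */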
2562
2563/*
2564 * Get the memory for a slab management obj.
2565 * For a slab cache when the slab descriptor is off-slab, slab descriptors
2566 * always come from malloc_sizes caches.  The slab descriptor cannot
2567 * come from the same cache which is getting created because,
2568 * when we are searching for an appropriate cache for these
2569 * descriptors in kmem_cache_create, we search through the malloc_sizes array.
2570 * If we are creating a malloc_sizes cache here it would not be visible to
2571 * kmem_find_general_cachep till the initialization is complete.
2572 * Hence slabp_cache cannot be the same as the cache being created.
2573 */
2574static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2575				   int colour_off, gfp_t local_flags,
2576				   int nodeid)
2577{
2578	struct slab *slabp;
2579
2580	if (OFF_SLAB(cachep)) {
2581		/* Slab management obj is off-slab. */
2582		slabp = kmem_cache_alloc_node(cachep->slabp_cache,
2583					      local_flags & ~GFP_THISNODE, nodeid);
2584		if (!slabp)
2585			return NULL;
2586	} else {
2587		slabp = objp + colour_off;
2588		colour_off += cachep->slab_size;
2589	}
2590	slabp->inuse = 0;
2591	slabp->colouroff = colour_off;
2592	slabp->s_mem = objp + colour_off;
2593	slabp->nodeid = nodeid;
2594	return slabp;
2595}
2596
2597static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2598{
2599	return (kmem_bufctl_t *) (slabp + 1);
2600}
2601
2602static void cache_init_objs(struct kmem_cache *cachep,
2603			    struct slab *slabp, unsigned long ctor_flags)
2604{
2605	int i;
2606
2607	for (i = 0; i < cachep->num; i++) {
2608		void *objp = index_to_obj(cachep, slabp, i);
2609#if DEBUG
2610		/* need to poison the objs? */
2611		if (cachep->flags & SLAB_POISON)
2612			poison_obj(cachep, objp, POISON_FREE);
2613		if (cachep->flags & SLAB_STORE_USER)
2614			*dbg_userword(cachep, objp) = NULL;
2615
2616		if (cachep->flags & SLAB_RED_ZONE) {
2617			*dbg_redzone1(cachep, objp) = RED_INACTIVE;
2618			*dbg_redzone2(cachep, objp) = RED_INACTIVE;
2619		}
2620		/*
2621		 * Constructors are not allowed to allocate memory from the same
2622		 * cache which they are a constructor for.  Otherwise, deadlock.
2623		 * They must also be threaded.
2624		 */
2625		if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2626			cachep->ctor(objp + obj_offset(cachep), cachep,
2627				     ctor_flags);
2628
2629		if (cachep->flags & SLAB_RED_ZONE) {
2630			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2631				slab_error(cachep, "constructor overwrote the"
2632					   " end of an object");
2633			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2634				slab_error(cachep, "constructor overwrote the"
2635					   " start of an object");
2636		}
2637		if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
2638			    OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2639			kernel_map_pages(virt_to_page(objp),
2640					 cachep->buffer_size / PAGE_SIZE, 0);
2641#else
2642		if (cachep->ctor)
2643			cachep->ctor(objp, cachep, ctor_flags);
2644#endif
2645		slab_bufctl(slabp)[i] = i + 1;
2646	}
2647	slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2648	slabp->free = 0;
2649}
2650
2651static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2652{
2653	if (CONFIG_ZONE_DMA_FLAG) {
2654		if (flags & GFP_DMA)
2655			BUG_ON(!(cachep->gfpflags & GFP_DMA));
2656		else
2657			BUG_ON(cachep->gfpflags & GFP_DMA);
2658	}
2659}
2660
2661static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
2662				int nodeid)
2663{
2664	void *objp = index_to_obj(cachep, slabp, slabp->free);
2665	kmem_bufctl_t next;
2666
2667	slabp->inuse++;
2668	next = slab_bufctl(slabp)[slabp->free];
2669#if DEBUG
2670	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2671	WARN_ON(slabp->nodeid != nodeid);
2672#endif
2673	slabp->free = next;
2674
2675	return objp;
2676}
2677
2678static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2679				void *objp, int nodeid)
2680{
2681	unsigned int objnr = obj_to_index(cachep, slabp, objp);
2682
2683#if DEBUG
2684	/* Verify that the slab belongs to the intended node */
2685	WARN_ON(slabp->nodeid != nodeid);
2686
2687	if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
2688		printk(KERN_ERR "slab: double free detected in cache "
2689				"'%s', objp %p\n", cachep->name, objp);
2690		BUG();
2691	}
2692#endif
2693	slab_bufctl(slabp)[objnr] = slabp->free;
2694	slabp->free = objnr;
2695	slabp->inuse--;
2696}
2697
2698/*
2699 * Map pages beginning at addr to the given cache and slab. This is required
2700 * for the slab allocator to be able to look up the cache and slab of a
2701 * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
2702 */
2703static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2704			   void *addr)
2705{
2706	int nr_pages;
2707	struct page *page;
2708
2709	page = virt_to_page(addr);
2710
2711	nr_pages = 1;
2712	if (likely(!PageCompound(page)))
2713		nr_pages <<= cache->gfporder;
2714
2715	do {
2716		page_set_cache(page, cache);
2717		page_set_slab(page, slab);
2718		page++;
2719	} while (--nr_pages);
2720}
2721
2722/*
2723 * Grow (by 1) the number of slabs within a cache.  This is called by
2724 * kmem_cache_alloc() when there are no active objs left in a cache.
2725 */
2726static int cache_grow(struct kmem_cache *cachep,
2727		gfp_t flags, int nodeid, void *objp)
2728{
2729	struct slab *slabp;
2730	size_t offset;
2731	gfp_t local_flags;
2732	unsigned long ctor_flags;
2733	struct kmem_list3 *l3;
2734
2735	/*
2736	 * Be lazy and only check for valid flags here,  keeping it out of the
2737	 * critical path in kmem_cache_alloc().
2738	 */
2739	BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW));
2740	if (flags & __GFP_NO_GROW)
2741		return 0;
2742
2743	ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2744	local_flags = (flags & GFP_LEVEL_MASK);
2745	if (!(local_flags & __GFP_WAIT))
2746		/*
2747		 * Not allowed to sleep.  Need to tell a constructor about
2748		 * this - it might need to know...
2749		 */
2750		ctor_flags |= SLAB_CTOR_ATOMIC;
2751
2752	/* Take the l3 list lock to change the colour_next on this node */
2753	check_irq_off();
2754	l3 = cachep->nodelists[nodeid];
2755	spin_lock(&l3->list_lock);
2756
2757	/* Get colour for the slab, and calculate the next value. */
2758	offset = l3->colour_next;
2759	l3->colour_next++;
2760	if (l3->colour_next >= cachep->colour)
2761		l3->colour_next = 0;
2762	spin_unlock(&l3->list_lock);
2763
2764	offset *= cachep->colour_off;
2765
2766	if (local_flags & __GFP_WAIT)
2767		local_irq_enable();
2768
2769	/*
2770	 * The test for missing atomic flag is performed here, rather than
2771	 * the more obvious place, simply to reduce the critical path length
2772	 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
2773	 * will eventually be caught here (where it matters).
2774	 */
2775	kmem_flagcheck(cachep, flags);
2776
2777	/*
2778	 * Get mem for the objs.  Attempt to allocate a physical page from
2779	 * 'nodeid'.
2780	 */
2781	if (!objp)
2782		objp = kmem_getpages(cachep, flags, nodeid);
2783	if (!objp)
2784		goto failed;
2785
2786	/* Get slab management. */
2787	slabp = alloc_slabmgmt(cachep, objp, offset,
2788			local_flags & ~GFP_THISNODE, nodeid);
2789	if (!slabp)
2790		goto opps1;
2791
2792	slabp->nodeid = nodeid;
2793	slab_map_pages(cachep, slabp, objp);
2794
2795	cache_init_objs(cachep, slabp, ctor_flags);
2796
2797	if (local_flags & __GFP_WAIT)
2798		local_irq_disable();
2799	check_irq_off();
2800	spin_lock(&l3->list_lock);
2801
2802	/* Make slab active. */
2803	list_add_tail(&slabp->list, &(l3->slabs_free));
2804	STATS_INC_GROWN(cachep);
2805	l3->free_objects += cachep->num;
2806	spin_unlock(&l3->list_lock);
2807	return 1;
2808opps1:
2809	kmem_freepages(cachep, objp);
2810failed:
2811	if (local_flags & __GFP_WAIT)
2812		local_irq_disable();
2813	return 0;
2814}
2815
2816#if DEBUG
2817
2818/*
2819 * Perform extra freeing checks:
2820 * - detect bad pointers.
2821 * - POISON/RED_ZONE checking
2822 * - destructor calls, for caches with POISON+dtor
2823 */
2824static void kfree_debugcheck(const void *objp)
2825{
2826	if (!virt_addr_valid(objp)) {
2827		printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2828		       (unsigned long)objp);
2829		BUG();
2830	}
2831}
2832
2833static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
2834{
2835	unsigned long redzone1, redzone2;
2836
2837	redzone1 = *dbg_redzone1(cache, obj);
2838	redzone2 = *dbg_redzone2(cache, obj);
2839
2840	/*
2841	 * Redzone is ok.
2842	 */
2843	if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
2844		return;
2845
2846	if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
2847		slab_error(cache, "double free detected");
2848	else
2849		slab_error(cache, "memory outside object was overwritten");
2850
2851	printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n",
2852			obj, redzone1, redzone2);
2853}
2854
2855static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2856				   void *caller)
2857{
2858	struct page *page;
2859	unsigned int objnr;
2860	struct slab *slabp;
2861
2862	objp -= obj_offset(cachep);
2863	kfree_debugcheck(objp);
2864	page = virt_to_page(objp);
2865
2866	slabp = page_get_slab(page);
2867
2868	if (cachep->flags & SLAB_RED_ZONE) {
2869		verify_redzone_free(cachep, objp);
2870		*dbg_redzone1(cachep, objp) = RED_INACTIVE;
2871		*dbg_redzone2(cachep, objp) = RED_INACTIVE;
2872	}
2873	if (cachep->flags & SLAB_STORE_USER)
2874		*dbg_userword(cachep, objp) = caller;
2875
2876	objnr = obj_to_index(cachep, slabp, objp);
2877
2878	BUG_ON(objnr >= cachep->num);
2879	BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
2880
2881	if (cachep->flags & SLAB_DEBUG_INITIAL) {
2882		/*
2883		 * Need to call the slab's constructor so the caller can
2884		 * perform a verify of its state (debugging).  Called without
2885		 * the cache-lock held.
2886		 */
2887		cachep->ctor(objp + obj_offset(cachep),
2888			     cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
2889	}
2890	if (cachep->flags & SLAB_POISON && cachep->dtor) {
2891		/* We want to cache poison the object, so call the
2892		 * destruction callback first.
2893		 */
2894		cachep->dtor(objp + obj_offset(cachep), cachep, 0);
2895	}
2896#ifdef CONFIG_DEBUG_SLAB_LEAK
2897	slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
2898#endif
2899	if (cachep->flags & SLAB_POISON) {
2900#ifdef CONFIG_DEBUG_PAGEALLOC
2901		if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2902			store_stackinfo(cachep, objp, (unsigned long)caller);
2903			kernel_map_pages(virt_to_page(objp),
2904					 cachep->buffer_size / PAGE_SIZE, 0);
2905		} else {
2906			poison_obj(cachep, objp, POISON_FREE);
2907		}
2908#else
2909		poison_obj(cachep, objp, POISON_FREE);
2910#endif
2911	}
2912	return objp;
2913}
2914
2915static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
2916{
2917	kmem_bufctl_t i;
2918	int entries = 0;
2919
2920	/* Check slab's freelist to see if this obj is there. */
2921	for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
2922		entries++;
2923		if (entries > cachep->num || i >= cachep->num)
2924			goto bad;
2925	}
2926	if (entries != cachep->num - slabp->inuse) {
2927bad:
2928		printk(KERN_ERR "slab: Internal list corruption detected in "
2929				"cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2930			cachep->name, cachep->num, slabp, slabp->inuse);
2931		for (i = 0;
2932		     i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
2933		     i++) {
2934			if (i % 16 == 0)
2935				printk("\n%03x:", i);
2936			printk(" %02x", ((unsigned char *)slabp)[i]);
2937		}
2938		printk("\n");
2939		BUG();
2940	}
2941}
2942#else
2943#define kfree_debugcheck(x) do { } while(0)
2944#define cache_free_debugcheck(x,objp,z) (objp)
2945#define check_slabp(x,y) do { } while(0)
2946#endif
2947
2948static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2949{
2950	int batchcount;
2951	struct kmem_list3 *l3;
2952	struct array_cache *ac;
2953	int node;
2954
2955	node = numa_node_id();
2956
2957	check_irq_off();
2958	ac = cpu_cache_get(cachep);
2959retry:
2960	batchcount = ac->batchcount;
2961	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2962		/*
2963		 * If there was little recent activity on this cache, then
2964		 * perform only a partial refill.  Otherwise we could generate
2965		 * refill bouncing.
2966		 */
2967		batchcount = BATCHREFILL_LIMIT;
2968	}
2969	l3 = cachep->nodelists[node];
2970
2971	BUG_ON(ac->avail > 0 || !l3);
2972	spin_lock(&l3->list_lock);
2973
2974	/* See if we can refill from the shared array */
2975	if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
2976		goto alloc_done;
2977
2978	while (batchcount > 0) {
2979		struct list_head *entry;
2980		struct slab *slabp;
2981		/* Get the slab the allocation is to come from. */
2982		entry = l3->slabs_partial.next;
2983		if (entry == &l3->slabs_partial) {
2984			l3->free_touched = 1;
2985			entry = l3->slabs_free.next;
2986			if (entry == &l3->slabs_free)
2987				goto must_grow;
2988		}
2989
2990		slabp = list_entry(entry, struct slab, list);
2991		check_slabp(cachep, slabp);
2992		check_spinlock_acquired(cachep);
2993
2994		/*
2995		 * The slab was either on partial or free list so
2996		 * there must be at least one object available for
2997		 * allocation.
2998		 */
2999		BUG_ON(slabp->inuse < 0 || slabp->inuse >= cachep->num);
3000
3001		while (slabp->inuse < cachep->num && batchcount--) {
3002			STATS_INC_ALLOCED(cachep);
3003			STATS_INC_ACTIVE(cachep);
3004			STATS_SET_HIGH(cachep);
3005
3006			ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
3007							    node);
3008		}
3009		check_slabp(cachep, slabp);
3010
3011		/* move slabp to correct slabp list: */
3012		list_del(&slabp->list);
3013		if (slabp->free == BUFCTL_END)
3014			list_add(&slabp->list, &l3->slabs_full);
3015		else
3016			list_add(&slabp->list, &l3->slabs_partial);
3017	}
3018
3019must_grow:
3020	l3->free_objects -= ac->avail;
3021alloc_done:
3022	spin_unlock(&l3->list_lock);
3023
3024	if (unlikely(!ac->avail)) {
3025		int x;
3026		x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
3027
3028		/* cache_grow can reenable interrupts, then ac could change. */
3029		ac = cpu_cache_get(cachep);
3030		if (!x && ac->avail == 0)	/* no objects in sight? abort */
3031			return NULL;
3032
3033		if (!ac->avail)		/* objects refilled by interrupt? */
3034			goto retry;
3035	}
3036	ac->touched = 1;
3037	return ac->entry[--ac->avail];
3038}
3039
3040static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
3041						gfp_t flags)
3042{
3043	might_sleep_if(flags & __GFP_WAIT);
3044#if DEBUG
3045	kmem_flagcheck(cachep, flags);
3046#endif
3047}
3048
3049#if DEBUG
3050static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3051				gfp_t flags, void *objp, void *caller)
3052{
3053	if (!objp)
3054		return objp;
3055	if (cachep->flags & SLAB_POISON) {
3056#ifdef CONFIG_DEBUG_PAGEALLOC
3057		if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
3058			kernel_map_pages(virt_to_page(objp),
3059					 cachep->buffer_size / PAGE_SIZE, 1);
3060		else
3061			check_poison_obj(cachep, objp);
3062#else
3063		check_poison_obj(cachep, objp);
3064#endif
3065		poison_obj(cachep, objp, POISON_INUSE);
3066	}
3067	if (cachep->flags & SLAB_STORE_USER)
3068		*dbg_userword(cachep, objp) = caller;
3069
3070	if (cachep->flags & SLAB_RED_ZONE) {
3071		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
3072				*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
3073			slab_error(cachep, "double free, or memory outside"
3074						" object was overwritten");
3075			printk(KERN_ERR
3076				"%p: redzone 1:0x%lx, redzone 2:0x%lx\n",
3077				objp, *dbg_redzone1(cachep, objp),
3078				*dbg_redzone2(cachep, objp));
3079		}
3080		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
3081		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
3082	}
3083#ifdef CONFIG_DEBUG_SLAB_LEAK
3084	{
3085		struct slab *slabp;
3086		unsigned objnr;
3087
3088		slabp = page_get_slab(virt_to_page(objp));
3089		objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
3090		slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
3091	}
3092#endif
3093	objp += obj_offset(cachep);
3094	if (cachep->ctor && cachep->flags & SLAB_POISON) {
3095		unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
3096
3097		if (!(flags & __GFP_WAIT))
3098			ctor_flags |= SLAB_CTOR_ATOMIC;
3099
3100		cachep->ctor(objp, cachep, ctor_flags);
3101	}
3102#if ARCH_SLAB_MINALIGN
3103	if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
3104		printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
3105		       objp, ARCH_SLAB_MINALIGN);
3106	}
3107#endif
3108	return objp;
3109}
3110#else
3111#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
3112#endif
3113
3114#ifdef CONFIG_FAILSLAB
3115
3116static struct failslab_attr {
3117
3118	struct fault_attr attr;
3119
3120	u32 ignore_gfp_wait;
3121#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3122	struct dentry *ignore_gfp_wait_file;
3123#endif
3124
3125} failslab = {
3126	.attr = FAULT_ATTR_INITIALIZER,
3127	.ignore_gfp_wait = 1,
3128};
3129
3130static int __init setup_failslab(char *str)
3131{
3132	return setup_fault_attr(&failslab.attr, str);
3133}
3134__setup("failslab=", setup_failslab);
3135
3136static int should_failslab(struct kmem_cache *cachep, gfp_t flags)
3137{
3138	if (cachep == &cache_cache)
3139		return 0;
3140	if (flags & __GFP_NOFAIL)
3141		return 0;
3142	if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT))
3143		return 0;
3144
3145	return should_fail(&failslab.attr, obj_size(cachep));
3146}
3147
3148#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3149
3150static int __init failslab_debugfs(void)
3151{
3152	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
3153	struct dentry *dir;
3154	int err;
3155
3156       	err = init_fault_attr_dentries(&failslab.attr, "failslab");
3157	if (err)
3158		return err;
3159	dir = failslab.attr.dentries.dir;
3160
3161	failslab.ignore_gfp_wait_file =
3162		debugfs_create_bool("ignore-gfp-wait", mode, dir,
3163				      &failslab.ignore_gfp_wait);
3164
3165	if (!failslab.ignore_gfp_wait_file) {
3166		err = -ENOMEM;
3167		debugfs_remove(failslab.ignore_gfp_wait_file);
3168		cleanup_fault_attr_dentries(&failslab.attr);
3169	}
3170
3171	return err;
3172}
3173
3174late_initcall(failslab_debugfs);
3175
3176#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
3177
3178#else /* CONFIG_FAILSLAB */
3179
3180static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags)
3181{
3182	return 0;
3183}
3184
3185#endif /* CONFIG_FAILSLAB */
3186
3187static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3188{
3189	void *objp;
3190	struct array_cache *ac;
3191
3192	check_irq_off();
3193
3194	if (should_failslab(cachep, flags))
3195		return NULL;
3196
3197	ac = cpu_cache_get(cachep);
3198	if (likely(ac->avail)) {
3199		STATS_INC_ALLOCHIT(cachep);
3200		ac->touched = 1;
3201		objp = ac->entry[--ac->avail];
3202	} else {
3203		STATS_INC_ALLOCMISS(cachep);
3204		objp = cache_alloc_refill(cachep, flags);
3205	}
3206	return objp;
3207}
3208
3209#ifdef CONFIG_NUMA
3210/*
3211 * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
3212 *
3213 * If we are in_interrupt, then process context, including cpusets and
3214 * mempolicy, may not apply and should not be used for allocation policy.
3215 */
3216static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3217{
3218	int nid_alloc, nid_here;
3219
3220	if (in_interrupt() || (flags & __GFP_THISNODE))
3221		return NULL;
3222	nid_alloc = nid_here = numa_node_id();
3223	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3224		nid_alloc = cpuset_mem_spread_node();
3225	else if (current->mempolicy)
3226		nid_alloc = slab_node(current->mempolicy);
3227	if (nid_alloc != nid_here)
3228		return ____cache_alloc_node(cachep, flags, nid_alloc);
3229	return NULL;
3230}
3231
3232/*
3233 * Fallback function if there was no memory available and no objects on a
3234 * certain node and fallback is permitted. First we scan all the
3235 * available nodelists for available objects. If that fails then we
3236 * perform an allocation without specifying a node. This allows the page
3237 * allocator to do its reclaim / fallback magic. We then insert the
3238 * slab into the proper nodelist and then allocate from it.
3239 */
3240static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3241{
3242	struct zonelist *zonelist;
3243	gfp_t local_flags;
3244	struct zone **z;
3245	void *obj = NULL;
3246	int nid;
3247
3248	if (flags & __GFP_THISNODE)
3249		return NULL;
3250
3251	zonelist = &NODE_DATA(slab_node(current->mempolicy))
3252			->node_zonelists[gfp_zone(flags)];
3253	local_flags = (flags & GFP_LEVEL_MASK);
3254
3255retry:
3256	/*
3257	 * Look through allowed nodes for objects available
3258	 * from existing per node queues.
3259	 */
3260	for (z = zonelist->zones; *z && !obj; z++) {
3261		nid = zone_to_nid(*z);
3262
3263		if (cpuset_zone_allowed_hardwall(*z, flags) &&
3264			cache->nodelists[nid] &&
3265			cache->nodelists[nid]->free_objects)
3266				obj = ____cache_alloc_node(cache,
3267					flags | GFP_THISNODE, nid);
3268	}
3269
3270	if (!obj && !(flags & __GFP_NO_GROW)) {
3271		/*
3272		 * This allocation will be performed within the constraints
3273		 * of the current cpuset / memory policy requirements.
3274		 * We may trigger various forms of reclaim on the allowed
3275		 * set and go into memory reserves if necessary.
3276		 */
3277		if (local_flags & __GFP_WAIT)
3278			local_irq_enable();
3279		kmem_flagcheck(cache, flags);
3280		obj = kmem_getpages(cache, flags, -1);
3281		if (local_flags & __GFP_WAIT)
3282			local_irq_disable();
3283		if (obj) {
3284			/*
3285			 * Insert into the appropriate per node queues
3286			 */
3287			nid = page_to_nid(virt_to_page(obj));
3288			if (cache_grow(cache, flags, nid, obj)) {
3289				obj = ____cache_alloc_node(cache,
3290					flags | GFP_THISNODE, nid);
3291				if (!obj)
3292					/*
3293					 * Another processor may allocate the
3294					 * objects in the slab since we are
3295					 * not holding any locks.
3296					 */
3297					goto retry;
3298			} else {
3299				/* cache_grow already freed obj */
3300				obj = NULL;
3301			}
3302		}
3303	}
3304	return obj;
3305}
3306
3307/*
3308 * An interface to enable slab creation on nodeid
3309 */
3310static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3311				int nodeid)
3312{
3313	struct list_head *entry;
3314	struct slab *slabp;
3315	struct kmem_list3 *l3;
3316	void *obj;
3317	int x;
3318
3319	l3 = cachep->nodelists[nodeid];
3320	BUG_ON(!l3);
3321
3322retry:
3323	check_irq_off();
3324	spin_lock(&l3->list_lock);
3325	entry = l3->slabs_partial.next;
3326	if (entry == &l3->slabs_partial) {
3327		l3->free_touched = 1;
3328		entry = l3->slabs_free.next;
3329		if (entry == &l3->slabs_free)
3330			goto must_grow;
3331	}
3332
3333	slabp = list_entry(entry, struct slab, list);
3334	check_spinlock_acquired_node(cachep, nodeid);
3335	check_slabp(cachep, slabp);
3336
3337	STATS_INC_NODEALLOCS(cachep);
3338	STATS_INC_ACTIVE(cachep);
3339	STATS_SET_HIGH(cachep);
3340
3341	BUG_ON(slabp->inuse == cachep->num);
3342
3343	obj = slab_get_obj(cachep, slabp, nodeid);
3344	check_slabp(cachep, slabp);
3345	l3->free_objects--;
3346	/* move slabp to correct slabp list: */
3347	list_del(&slabp->list);
3348
3349	if (slabp->free == BUFCTL_END)
3350		list_add(&slabp->list, &l3->slabs_full);
3351	else
3352		list_add(&slabp->list, &l3->slabs_partial);
3353
3354	spin_unlock(&l3->list_lock);
3355	goto done;
3356
3357must_grow:
3358	spin_unlock(&l3->list_lock);
3359	x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
3360	if (x)
3361		goto retry;
3362
3363	return fallback_alloc(cachep, flags);
3364
3365done:
3366	return obj;
3367}
3368
3369/**
3370 * kmem_cache_alloc_node - Allocate an object on the specified node
3371 * @cachep: The cache to allocate from.
3372 * @flags: See kmalloc().
3373 * @nodeid: node number of the target node.
3374 * @caller: return address of caller, used for debug information
3375 *
3376 * Identical to kmem_cache_alloc but it will allocate memory on the given
3377 * node, which can improve the performance for cpu bound structures.
3378 *
3379 * Fallback to other node is possible if __GFP_THISNODE is not set.
3380 */
3381static __always_inline void *
3382__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3383		   void *caller)
3384{
3385	unsigned long save_flags;
3386	void *ptr;
3387
3388	cache_alloc_debugcheck_before(cachep, flags);
3389	local_irq_save(save_flags);
3390
3391	if (unlikely(nodeid == -1))
3392		nodeid = numa_node_id();
3393
3394	if (unlikely(!cachep->nodelists[nodeid])) {
3395		/* Node not bootstrapped yet */
3396		ptr = fallback_alloc(cachep, flags);
3397		goto out;
3398	}
3399
3400	if (nodeid == numa_node_id()) {
3401		/*
3402		 * Use the locally cached objects if possible.
3403		 * However ____cache_alloc does not allow fallback
3404		 * to other nodes. It may fail while we still have
3405		 * objects on other nodes available.
3406		 */
3407		ptr = ____cache_alloc(cachep, flags);
3408		if (ptr)
3409			goto out;
3410	}
3411	/* ____cache_alloc_node can fall back to other nodes */
3412	ptr = ____cache_alloc_node(cachep, flags, nodeid);
3413  out:
3414	local_irq_restore(save_flags);
3415	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3416
3417	return ptr;
3418}
3419
3420static __always_inline void *
3421__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
3422{
3423	void *objp;
3424
3425	if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
3426		objp = alternate_node_alloc(cache, flags);
3427		if (objp)
3428			goto out;
3429	}
3430	objp = ____cache_alloc(cache, flags);
3431
3432	/*
3433	 * We may just have run out of memory on the local node.
3434	 * ____cache_alloc_node() knows how to locate memory on other nodes
3435	 */
3436 	if (!objp)
3437 		objp = ____cache_alloc_node(cache, flags, numa_node_id());
3438
3439  out:
3440	return objp;
3441}
3442#else
3443
3444static __always_inline void *
3445__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3446{
3447	return ____cache_alloc(cachep, flags);
3448}
3449
3450#endif /* CONFIG_NUMA */
3451
3452static __always_inline void *
3453__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3454{
3455	unsigned long save_flags;
3456	void *objp;
3457
3458	cache_alloc_debugcheck_before(cachep, flags);
3459	local_irq_save(save_flags);
3460	objp = __do_cache_alloc(cachep, flags);
3461	local_irq_restore(save_flags);
3462	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3463	prefetchw(objp);
3464
3465	return objp;
3466}
3467
3468/*
3469 * Caller needs to acquire the correct kmem_list3's list_lock
3470 */
3471static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3472		       int node)
3473{
3474	int i;
3475	struct kmem_list3 *l3;
3476
3477	for (i = 0; i < nr_objects; i++) {
3478		void *objp = objpp[i];
3479		struct slab *slabp;
3480
3481		slabp = virt_to_slab(objp);
3482		l3 = cachep->nodelists[node];
3483		list_del(&slabp->list);
3484		check_spinlock_acquired_node(cachep, node);
3485		check_slabp(cachep, slabp);
3486		slab_put_obj(cachep, slabp, objp, node);
3487		STATS_DEC_ACTIVE(cachep);
3488		l3->free_objects++;
3489		check_slabp(cachep, slabp);
3490
3491		/* fixup slab chains */
3492		if (slabp->inuse == 0) {
3493			if (l3->free_objects > l3->free_limit) {
3494				l3->free_objects -= cachep->num;
3495				/* No need to drop any previously held
3496				 * lock here; even if we have an off-slab slab
3497				 * descriptor it is guaranteed to come from
3498				 * a different cache, refer to comments before
3499				 * alloc_slabmgmt.
3500				 */
3501				slab_destroy(cachep, slabp);
3502			} else {
3503				list_add(&slabp->list, &l3->slabs_free);
3504			}
3505		} else {
3506			/* Unconditionally move a slab to the end of the
3507			 * partial list on free - this gives the other
3508			 * objects in the slab maximum time to be freed, too.
3509			 */
3510			list_add_tail(&slabp->list, &l3->slabs_partial);
3511		}
3512	}
3513}
3514
3515static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3516{
3517	int batchcount;
3518	struct kmem_list3 *l3;
3519	int node = numa_node_id();
3520
3521	batchcount = ac->batchcount;
3522#if DEBUG
3523	BUG_ON(!batchcount || batchcount > ac->avail);
3524#endif
3525	check_irq_off();
3526	l3 = cachep->nodelists[node];
3527	spin_lock(&l3->list_lock);
3528	if (l3->shared) {
3529		struct array_cache *shared_array = l3->shared;
3530		int max = shared_array->limit - shared_array->avail;
3531		if (max) {
3532			if (batchcount > max)
3533				batchcount = max;
3534			memcpy(&(shared_array->entry[shared_array->avail]),
3535			       ac->entry, sizeof(void *) * batchcount);
3536			shared_array->avail += batchcount;
3537			goto free_done;
3538		}
3539	}
3540
3541	free_block(cachep, ac->entry, batchcount, node);
3542free_done:
3543#if STATS
3544	{
3545		int i = 0;
3546		struct list_head *p;
3547
3548		p = l3->slabs_free.next;
3549		while (p != &(l3->slabs_free)) {
3550			struct slab *slabp;
3551
3552			slabp = list_entry(p, struct slab, list);
3553			BUG_ON(slabp->inuse);
3554
3555			i++;
3556			p = p->next;
3557		}
3558		STATS_SET_FREEABLE(cachep, i);
3559	}
3560#endif
3561	spin_unlock(&l3->list_lock);
3562	ac->avail -= batchcount;
3563	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
3564}
3565
3566/*
3567 * Release an obj back to its cache. If the obj has a constructed state, it must
3568 * be in this state _before_ it is released.  Called with interrupts disabled.
3569 */
3570static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3571{
3572	struct array_cache *ac = cpu_cache_get(cachep);
3573
3574	check_irq_off();
3575	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3576
3577	if (use_alien_caches && cache_free_alien(cachep, objp))
3578		return;
3579
3580	if (likely(ac->avail < ac->limit)) {
3581		STATS_INC_FREEHIT(cachep);
3582		ac->entry[ac->avail++] = objp;
3583		return;
3584	} else {
3585		STATS_INC_FREEMISS(cachep);
3586		cache_flusharray(cachep, ac);
3587		ac->entry[ac->avail++] = objp;
3588	}
3589}
3590
3591/**
3592 * kmem_cache_alloc - Allocate an object
3593 * @cachep: The cache to allocate from.
3594 * @flags: See kmalloc().
3595 *
3596 * Allocate an object from this cache.  The flags are only relevant
3597 * if the cache has no available objects.
3598 */
3599void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3600{
3601	return __cache_alloc(cachep, flags, __builtin_return_address(0));
3602}
3603EXPORT_SYMBOL(kmem_cache_alloc);
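
/*
 * Example: a typical allocation/free pair, using the hypothetical foo_cache
 * from the kmem_cache_create() sketch earlier in this file.  GFP_KERNEL may
 * sleep and is only valid in process context; use GFP_ATOMIC otherwise.
 *
 *	struct foo *f = kmem_cache_alloc(foo_cache, GFP_KERNEL);
 *	if (!f)
 *		return -ENOMEM;
 *	...
 *	kmem_cache_free(foo_cache, f);
 */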
3604
3605/**
3606 * kmem_cache_zalloc - Allocate an object. The memory is set to zero.
3607 * @cache: The cache to allocate from.
3608 * @flags: See kmalloc().
3609 *
3610 * Allocate an object from this cache and set the allocated memory to zero.
3611 * The flags are only relevant if the cache has no available objects.
3612 */
3613void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags)
3614{
3615	void *ret = __cache_alloc(cache, flags, __builtin_return_address(0));
3616	if (ret)
3617		memset(ret, 0, obj_size(cache));
3618	return ret;
3619}
3620EXPORT_SYMBOL(kmem_cache_zalloc);
3621
3622/**
3623 * kmem_ptr_validate - check if an untrusted pointer might
3624 *	be a slab entry.
3625 * @cachep: the cache we're checking against
3626 * @ptr: pointer to validate
3627 *
3628 * This verifies that the untrusted pointer looks sane:
3629 * it is _not_ a guarantee that the pointer is actually
3630 * part of the slab cache in question, but it at least
3631 * validates that the pointer can be dereferenced and
3632 * looks half-way sane.
3633 *
3634 * Currently only used for dentry validation.
3635 */
3636int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
3637{
3638	unsigned long addr = (unsigned long)ptr;
3639	unsigned long min_addr = PAGE_OFFSET;
3640	unsigned long align_mask = BYTES_PER_WORD - 1;
3641	unsigned long size = cachep->buffer_size;
3642	struct page *page;
3643
3644	if (unlikely(addr < min_addr))
3645		goto out;
3646	if (unlikely(addr > (unsigned long)high_memory - size))
3647		goto out;
3648	if (unlikely(addr & align_mask))
3649		goto out;
3650	if (unlikely(!kern_addr_valid(addr)))
3651		goto out;
3652	if (unlikely(!kern_addr_valid(addr + size - 1)))
3653		goto out;
3654	page = virt_to_page(ptr);
3655	if (unlikely(!PageSlab(page)))
3656		goto out;
3657	if (unlikely(page_get_cache(page) != cachep))
3658		goto out;
3659	return 1;
3660out:
3661	return 0;
3662}
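
/*
 * Example: a sketch of the dentry-style use mentioned above; cache and ptr
 * are hypothetical.  A non-zero return is not a guarantee of validity, only
 * that the pointer looks sane enough to dereference.
 *
 *	if (!kmem_ptr_validate(cache, ptr))
 *		return NULL;
 */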
3663
3664#ifdef CONFIG_NUMA
3665void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3666{
3667	return __cache_alloc_node(cachep, flags, nodeid,
3668			__builtin_return_address(0));
3669}
3670EXPORT_SYMBOL(kmem_cache_alloc_node);
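
/*
 * Example: a sketch of allocating a per-node control structure on the node
 * that will use it.  foo_cache and foo_nodes[] are hypothetical.
 *
 *	int node;
 *
 *	for_each_online_node(node) {
 *		foo_nodes[node] = kmem_cache_alloc_node(foo_cache,
 *							GFP_KERNEL, node);
 *		if (!foo_nodes[node])
 *			goto fail;
 *	}
 */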
3671
3672static __always_inline void *
3673__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
3674{
3675	struct kmem_cache *cachep;
3676
3677	cachep = kmem_find_general_cachep(size, flags);
3678	if (unlikely(cachep == NULL))
3679		return NULL;
3680	return kmem_cache_alloc_node(cachep, flags, node);
3681}
3682
3683#ifdef CONFIG_DEBUG_SLAB
3684void *__kmalloc_node(size_t size, gfp_t flags, int node)
3685{
3686	return __do_kmalloc_node(size, flags, node,
3687			__builtin_return_address(0));
3688}
3689EXPORT_SYMBOL(__kmalloc_node);
3690
3691void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
3692		int node, void *caller)
3693{
3694	return __do_kmalloc_node(size, flags, node, caller);
3695}
3696EXPORT_SYMBOL(__kmalloc_node_track_caller);
3697#else
3698void *__kmalloc_node(size_t size, gfp_t flags, int node)
3699{
3700	return __do_kmalloc_node(size, flags, node, NULL);
3701}
3702EXPORT_SYMBOL(__kmalloc_node);
3703#endif /* CONFIG_DEBUG_SLAB */
3704#endif /* CONFIG_NUMA */
3705
3706/**
3707 * __do_kmalloc - allocate memory
3708 * @size: how many bytes of memory are required.
3709 * @flags: the type of memory to allocate (see kmalloc).
3710 * @caller: function caller for debug tracking of the caller
3711 */
3712static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3713					  void *caller)
3714{
3715	struct kmem_cache *cachep;
3716
3717	/* If you want to save a few bytes .text space: replace
3718	 * __ with kmem_.
3719	 * Then kmalloc uses the uninlined functions instead of the inline
3720	 * functions.
3721	 */
3722	cachep = __find_general_cachep(size, flags);
3723	if (unlikely(cachep == NULL))
3724		return NULL;
3725	return __cache_alloc(cachep, flags, caller);
3726}
3727
3728
3729#ifdef CONFIG_DEBUG_SLAB
3730void *__kmalloc(size_t size, gfp_t flags)
3731{
3732	return __do_kmalloc(size, flags, __builtin_return_address(0));
3733}
3734EXPORT_SYMBOL(__kmalloc);
3735
3736void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
3737{
3738	return __do_kmalloc(size, flags, caller);
3739}
3740EXPORT_SYMBOL(__kmalloc_track_caller);
3741
3742#else
3743void *__kmalloc(size_t size, gfp_t flags)
3744{
3745	return __do_kmalloc(size, flags, NULL);
3746}
3747EXPORT_SYMBOL(__kmalloc);
3748#endif
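
/*
 * Example (illustrative sketch; my_zalloc() is hypothetical): a small
 * wrapper that wants CONFIG_DEBUG_SLAB allocations attributed to its own
 * caller uses kmalloc_track_caller(), exactly as krealloc() below does,
 *
 *	void *my_zalloc(size_t size, gfp_t flags)
 *	{
 *		void *p = kmalloc_track_caller(size, flags);
 *
 *		if (p)
 *			memset(p, 0, size);
 *		return p;
 *	}
 *
 * so the recorded caller is whoever called my_zalloc() rather than the
 * wrapper itself.  Without CONFIG_DEBUG_SLAB the caller bookkeeping
 * disappears and this is effectively a plain __kmalloc().
 */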
3749
3750/**
3751 * krealloc - reallocate memory. The contents will remain unchanged.
3752 *
3753 * @p: object to reallocate memory for.
3754 * @new_size: how many bytes of memory are required.
3755 * @flags: the type of memory to allocate.
3756 *
3757 * The contents of the object pointed to are preserved up to the
3758 * lesser of the new and old sizes.  If @p is %NULL, krealloc()
3759 * behaves exactly like kmalloc().  If @new_size is 0 and @p is not a
3760 * %NULL pointer, the object pointed to is freed.
3761 */
3762void *krealloc(const void *p, size_t new_size, gfp_t flags)
3763{
3764	struct kmem_cache *cache, *new_cache;
3765	void *ret;
3766
3767	if (unlikely(!p))
3768		return kmalloc_track_caller(new_size, flags);
3769
3770	if (unlikely(!new_size)) {
3771		kfree(p);
3772		return NULL;
3773	}
3774
3775	cache = virt_to_cache(p);
3776	new_cache = __find_general_cachep(new_size, flags);
3777
3778	/*
3779	 * If new size fits in the current cache, bail out.
3780	 */
3781	if (likely(cache == new_cache))
3782		return (void *)p;
3783
3784	/*
3785	 * We are on the slow-path here so do not use __cache_alloc
3786	 * because it bloats kernel text.
3787	 */
3788	ret = kmalloc_track_caller(new_size, flags);
3789	if (ret) {
3790		memcpy(ret, p, min(new_size, ksize(p)));
3791		kfree(p);
3792	}
3793	return ret;
3794}
3795EXPORT_SYMBOL(krealloc);
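
/*
 * Example (illustrative sketch; "buf", "tmp" and "new_len" are
 * hypothetical): grow a kmalloc()ed buffer without leaking the old one on
 * failure.  krealloc() leaves the original allocation untouched when it
 * cannot get memory, so the result must go into a temporary first,
 *
 *	tmp = krealloc(buf, new_len, GFP_KERNEL);
 *	if (!tmp)
 *		goto err;	(buf is still valid and still owned by us)
 *	buf = tmp;
 *
 * A new_len of 0 frees buf and returns NULL; a NULL buf makes this behave
 * like kmalloc(new_len, GFP_KERNEL).
 */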
3796
3797/**
3798 * kmem_cache_free - Deallocate an object
3799 * @cachep: The cache the allocation was from.
3800 * @objp: The previously allocated object.
3801 *
3802 * Free an object which was previously allocated from this
3803 * cache.
3804 */
3805void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3806{
3807	unsigned long flags;
3808
3809	BUG_ON(virt_to_cache(objp) != cachep);
3810
3811	local_irq_save(flags);
3812	debug_check_no_locks_freed(objp, obj_size(cachep));
3813	__cache_free(cachep, objp);
3814	local_irq_restore(flags);
3815}
3816EXPORT_SYMBOL(kmem_cache_free);
3817
3818/**
3819 * kfree - free previously allocated memory
3820 * @objp: pointer returned by kmalloc.
3821 *
3822 * If @objp is NULL, no operation is performed.
3823 *
3824 * Don't free memory not originally allocated by kmalloc()
3825 * or you will run into trouble.
3826 */
3827void kfree(const void *objp)
3828{
3829	struct kmem_cache *c;
3830	unsigned long flags;
3831
3832	if (unlikely(!objp))
3833		return;
3834	local_irq_save(flags);
3835	kfree_debugcheck(objp);
3836	c = virt_to_cache(objp);
3837	debug_check_no_locks_freed(objp, obj_size(c));
3838	__cache_free(c, (void *)objp);
3839	local_irq_restore(flags);
3840}
3841EXPORT_SYMBOL(kfree);
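
/*
 * Example (illustrative sketch; "a" and "b" are hypothetical pointers
 * initialised to NULL): because kfree(NULL) is a no-op, unwind paths can
 * free possibly-unset pointers unconditionally,
 *
 *	err:
 *		kfree(b);
 *		kfree(a);
 *		return -ENOMEM;
 *
 * The pointers must come from kmalloc()/krealloc(); anything else resolves
 * to the wrong cache via virt_to_cache() and corrupts allocator state.
 */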
3842
3843unsigned int kmem_cache_size(struct kmem_cache *cachep)
3844{
3845	return obj_size(cachep);
3846}
3847EXPORT_SYMBOL(kmem_cache_size);
3848
3849const char *kmem_cache_name(struct kmem_cache *cachep)
3850{
3851	return cachep->name;
3852}
3853EXPORT_SYMBOL_GPL(kmem_cache_name);
3854
3855/*
3856 * This initializes kmem_list3 or resizes various caches for all nodes.
3857 */
3858static int alloc_kmemlist(struct kmem_cache *cachep)
3859{
3860	int node;
3861	struct kmem_list3 *l3;
3862	struct array_cache *new_shared;
3863	struct array_cache **new_alien = NULL;
3864
3865	for_each_online_node(node) {
3866
3867		if (use_alien_caches) {
3868			new_alien = alloc_alien_cache(node, cachep->limit);
3869			if (!new_alien)
3870				goto fail;
3871		}
3872
3873		new_shared = alloc_arraycache(node,
3874				cachep->shared*cachep->batchcount,
3875					0xbaadf00d);
3876		if (!new_shared) {
3877			free_alien_cache(new_alien);
3878			goto fail;
3879		}
3880
3881		l3 = cachep->nodelists[node];
3882		if (l3) {
3883			struct array_cache *shared = l3->shared;
3884
3885			spin_lock_irq(&l3->list_lock);
3886
3887			if (shared)
3888				free_block(cachep, shared->entry,
3889						shared->avail, node);
3890
3891			l3->shared = new_shared;
3892			if (!l3->alien) {
3893				l3->alien = new_alien;
3894				new_alien = NULL;
3895			}
3896			l3->free_limit = (1 + nr_cpus_node(node)) *
3897					cachep->batchcount + cachep->num;
3898			spin_unlock_irq(&l3->list_lock);
3899			kfree(shared);
3900			free_alien_cache(new_alien);
3901			continue;
3902		}
3903		l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
3904		if (!l3) {
3905			free_alien_cache(new_alien);
3906			kfree(new_shared);
3907			goto fail;
3908		}
3909
3910		kmem_list3_init(l3);
3911		l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3912				((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3913		l3->shared = new_shared;
3914		l3->alien = new_alien;
3915		l3->free_limit = (1 + nr_cpus_node(node)) *
3916					cachep->batchcount + cachep->num;
3917		cachep->nodelists[node] = l3;
3918	}
3919	return 0;
3920
3921fail:
3922	if (!cachep->next.next) {
3923		/* Cache is not active yet. Roll back what we did */
3924		node--;
3925		while (node >= 0) {
3926			if (cachep->nodelists[node]) {
3927				l3 = cachep->nodelists[node];
3928
3929				kfree(l3->shared);
3930				free_alien_cache(l3->alien);
3931				kfree(l3);
3932				cachep->nodelists[node] = NULL;
3933			}
3934			node--;
3935		}
3936	}
3937	return -ENOMEM;
3938}
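
/*
 * Worked example for the free_limit set above (numbers are illustrative):
 * on a node with 4 online cpus, a cache with batchcount 16 and 30 objects
 * per slab gets
 *
 *	free_limit = (1 + 4) * 16 + 30 = 110
 *
 * i.e. one batch per cpu plus one spare batch, padded by a slab's worth of
 * objects, kept free per node before completely free slabs are handed back
 * to the page allocator.
 */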
3939
3940struct ccupdate_struct {
3941	struct kmem_cache *cachep;
3942	struct array_cache *new[NR_CPUS];
3943};
3944
3945static void do_ccupdate_local(void *info)
3946{
3947	struct ccupdate_struct *new = info;
3948	struct array_cache *old;
3949
3950	check_irq_off();
3951	old = cpu_cache_get(new->cachep);
3952
3953	new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3954	new->new[smp_processor_id()] = old;
3955}
3956
3957/* Always called with the cache_chain_mutex held */
3958static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3959				int batchcount, int shared)
3960{
3961	struct ccupdate_struct *new;
3962	int i;
3963
3964	new = kzalloc(sizeof(*new), GFP_KERNEL);
3965	if (!new)
3966		return -ENOMEM;
3967
3968	for_each_online_cpu(i) {
3969		new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
3970						batchcount);
3971		if (!new->new[i]) {
3972			for (i--; i >= 0; i--)
3973				kfree(new->new[i]);
3974			kfree(new);
3975			return -ENOMEM;
3976		}
3977	}
3978	new->cachep = cachep;
3979
3980	on_each_cpu(do_ccupdate_local, (void *)new, 1, 1);
3981
3982	check_irq_on();
3983	cachep->batchcount = batchcount;
3984	cachep->limit = limit;
3985	cachep->shared = shared;
3986
3987	for_each_online_cpu(i) {
3988		struct array_cache *ccold = new->new[i];
3989		if (!ccold)
3990			continue;
3991		spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3992		free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
3993		spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3994		kfree(ccold);
3995	}
3996	kfree(new);
3997	return alloc_kmemlist(cachep);
3998}
3999
4000/* Called with cache_chain_mutex held always */
4001static int enable_cpucache(struct kmem_cache *cachep)
4002{
4003	int err;
4004	int limit, shared;
4005
4006	/*
4007	 * The head array serves three purposes:
4008	 * - create a LIFO ordering, i.e. return objects that are cache-warm
4009	 * - reduce the number of spinlock operations.
4010	 * - reduce the number of linked list operations on the slab and
4011	 *   bufctl chains: array operations are cheaper.
4012	 * The numbers are guessed, we should auto-tune as described by
4013	 * Bonwick.
4014	 */
4015	if (cachep->buffer_size > 131072)
4016		limit = 1;
4017	else if (cachep->buffer_size > PAGE_SIZE)
4018		limit = 8;
4019	else if (cachep->buffer_size > 1024)
4020		limit = 24;
4021	else if (cachep->buffer_size > 256)
4022		limit = 54;
4023	else
4024		limit = 120;
4025
4026	/*
4027	 * CPU bound tasks (e.g. network routing) can exhibit skewed
4028	 * allocation behaviour: most allocs on one cpu, most free operations
4029	 * on another cpu. For these cases, an efficient object passing between
4030	 * cpus is necessary. This is provided by a shared array. The array
4031	 * replaces Bonwick's magazine layer.
4032	 * On uniprocessor, it's functionally equivalent (but less efficient)
4033	 * to a larger limit. Thus disabled by default.
4034	 */
4035	shared = 0;
4036	if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1)
4037		shared = 8;
4038
4039#if DEBUG
4040	/*
4041	 * With debugging enabled, a large batchcount leads to excessively long
4042	 * periods with local interrupts disabled. Limit the batchcount.
4043	 */
4044	if (limit > 32)
4045		limit = 32;
4046#endif
4047	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
4048	if (err)
4049		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
4050		       cachep->name, -err);
4051	return err;
4052}
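
/*
 * Worked example for the heuristic above (sizes are illustrative): a cache
 * of 512 byte objects falls into the "> 256" bucket, so
 *
 *	limit      = 54
 *	batchcount = (54 + 1) / 2 = 27
 *	shared     = 8 on SMP (buffer_size <= PAGE_SIZE), else 0
 *
 * meaning each cpu keeps up to 54 cache-warm objects and refills or flushes
 * them 27 at a time; do_tune_cpucache() installs exactly these values.
 */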
4053
4054/*
4055 * Drain an array if it contains any elements, taking the l3 lock only if
4056 * necessary. Note that the l3 list_lock also protects the array_cache
4057 * if drain_array() is used on the shared array.
4058 */
4059void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
4060			 struct array_cache *ac, int force, int node)
4061{
4062	int tofree;
4063
4064	if (!ac || !ac->avail)
4065		return;
4066	if (ac->touched && !force) {
4067		ac->touched = 0;
4068	} else {
4069		spin_lock_irq(&l3->list_lock);
4070		if (ac->avail) {
4071			tofree = force ? ac->avail : (ac->limit + 4) / 5;
4072			if (tofree > ac->avail)
4073				tofree = (ac->avail + 1) / 2;
4074			free_block(cachep, ac->entry, tofree, node);
4075			ac->avail -= tofree;
4076			memmove(ac->entry, &(ac->entry[tofree]),
4077				sizeof(void *) * ac->avail);
4078		}
4079		spin_unlock_irq(&l3->list_lock);
4080	}
4081}
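
/*
 * Worked example for the partial drain above (numbers are illustrative):
 * with force == 0, an untouched array_cache of limit 120 holding 40
 * objects frees
 *
 *	tofree = (120 + 4) / 5 = 24
 *
 * objects and memmove()s the remaining 16 entries to the front.  Only if
 * that fifth exceeded ac->avail would it be clipped to (ac->avail + 1) / 2.
 */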
4082
4083/**
4084 * cache_reap - Reclaim memory from caches.
4085 * @w: work descriptor
4086 *
4087 * Called from workqueue/eventd every few seconds.
4088 * Purpose:
4089 * - clear the per-cpu caches for this CPU.
4090 * - return freeable pages to the main free memory pool.
4091 *
4092 * If we cannot acquire the cache chain mutex then just give up - we'll try
4093 * again on the next iteration.
4094 */
4095static void cache_reap(struct work_struct *w)
4096{
4097	struct kmem_cache *searchp;
4098	struct kmem_list3 *l3;
4099	int node = numa_node_id();
4100	struct delayed_work *work =
4101		container_of(w, struct delayed_work, work);
4102
4103	if (!mutex_trylock(&cache_chain_mutex))
4104		/* Give up. Setup the next iteration. */
4105		goto out;
4106
4107	list_for_each_entry(searchp, &cache_chain, next) {
4108		check_irq_on();
4109
4110		/*
4111		 * We only take the l3 lock if absolutely necessary and we
4112		 * have established with reasonable certainty that
4113		 * we can do some work if the lock was obtained.
4114		 */
4115		l3 = searchp->nodelists[node];
4116
4117		reap_alien(searchp, l3);
4118
4119		drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
4120
4121		/*
4122		 * These are racy checks but it does not matter
4123		 * if we skip one check or scan twice.
4124		 */
4125		if (time_after(l3->next_reap, jiffies))
4126			goto next;
4127
4128		l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
4129
4130		drain_array(searchp, l3, l3->shared, 0, node);
4131
4132		if (l3->free_touched)
4133			l3->free_touched = 0;
4134		else {
4135			int freed;
4136
4137			freed = drain_freelist(searchp, l3, (l3->free_limit +
4138				5 * searchp->num - 1) / (5 * searchp->num));
4139			STATS_ADD_REAPED(searchp, freed);
4140		}
4141next:
4142		cond_resched();
4143	}
4144	check_irq_on();
4145	mutex_unlock(&cache_chain_mutex);
4146	next_reap_node();
4147	refresh_cpu_vm_stats(smp_processor_id());
4148out:
4149	/* Set up the next iteration */
4150	schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
4151}
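
/*
 * Worked example for the drain_freelist() target above (numbers are
 * illustrative): with free_limit 110 and 30 objects per slab the call asks
 * for
 *
 *	(110 + 5 * 30 - 1) / (5 * 30) = 259 / 150 = 1
 *
 * slab(s), i.e. roughly free_limit / (5 * num) rounded up - about a fifth
 * of the node's free limit per reap pass, so idle memory trickles back to
 * the page allocator instead of being dumped all at once.
 */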
4152
4153#ifdef CONFIG_PROC_FS
4154
4155static void print_slabinfo_header(struct seq_file *m)
4156{
4157	/*
4158	 * Output format version, so at least we can change it
4159	 * without _too_ many complaints.
4160	 */
4161#if STATS
4162	seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
4163#else
4164	seq_puts(m, "slabinfo - version: 2.1\n");
4165#endif
4166	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
4167		 "<objperslab> <pagesperslab>");
4168	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
4169	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
4170#if STATS
4171	seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
4172		 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
4173	seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
4174#endif
4175	seq_putc(m, '\n');
4176}
4177
4178static void *s_start(struct seq_file *m, loff_t *pos)
4179{
4180	loff_t n = *pos;
4181	struct list_head *p;
4182
4183	mutex_lock(&cache_chain_mutex);
4184	if (!n)
4185		print_slabinfo_header(m);
4186	p = cache_chain.next;
4187	while (n--) {
4188		p = p->next;
4189		if (p == &cache_chain)
4190			return NULL;
4191	}
4192	return list_entry(p, struct kmem_cache, next);
4193}
4194
4195static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4196{
4197	struct kmem_cache *cachep = p;
4198	++*pos;
4199	return cachep->next.next == &cache_chain ?
4200		NULL : list_entry(cachep->next.next, struct kmem_cache, next);
4201}
4202
4203static void s_stop(struct seq_file *m, void *p)
4204{
4205	mutex_unlock(&cache_chain_mutex);
4206}
4207
4208static int s_show(struct seq_file *m, void *p)
4209{
4210	struct kmem_cache *cachep = p;
4211	struct slab *slabp;
4212	unsigned long active_objs;
4213	unsigned long num_objs;
4214	unsigned long active_slabs = 0;
4215	unsigned long num_slabs, free_objects = 0, shared_avail = 0;
4216	const char *name;
4217	char *error = NULL;
4218	int node;
4219	struct kmem_list3 *l3;
4220
4221	active_objs = 0;
4222	num_slabs = 0;
4223	for_each_online_node(node) {
4224		l3 = cachep->nodelists[node];
4225		if (!l3)
4226			continue;
4227
4228		check_irq_on();
4229		spin_lock_irq(&l3->list_lock);
4230
4231		list_for_each_entry(slabp, &l3->slabs_full, list) {
4232			if (slabp->inuse != cachep->num && !error)
4233				error = "slabs_full accounting error";
4234			active_objs += cachep->num;
4235			active_slabs++;
4236		}
4237		list_for_each_entry(slabp, &l3->slabs_partial, list) {
4238			if (slabp->inuse == cachep->num && !error)
4239				error = "slabs_partial inuse accounting error";
4240			if (!slabp->inuse && !error)
4241				error = "slabs_partial/inuse accounting error";
4242			active_objs += slabp->inuse;
4243			active_slabs++;
4244		}
4245		list_for_each_entry(slabp, &l3->slabs_free, list) {
4246			if (slabp->inuse && !error)
4247				error = "slabs_free/inuse accounting error";
4248			num_slabs++;
4249		}
4250		free_objects += l3->free_objects;
4251		if (l3->shared)
4252			shared_avail += l3->shared->avail;
4253
4254		spin_unlock_irq(&l3->list_lock);
4255	}
4256	num_slabs += active_slabs;
4257	num_objs = num_slabs * cachep->num;
4258	if (num_objs - active_objs != free_objects && !error)
4259		error = "free_objects accounting error";
4260
4261	name = cachep->name;
4262	if (error)
4263		printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4264
4265	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
4266		   name, active_objs, num_objs, cachep->buffer_size,
4267		   cachep->num, (1 << cachep->gfporder));
4268	seq_printf(m, " : tunables %4u %4u %4u",
4269		   cachep->limit, cachep->batchcount, cachep->shared);
4270	seq_printf(m, " : slabdata %6lu %6lu %6lu",
4271		   active_slabs, num_slabs, shared_avail);
4272#if STATS
4273	{			/* list3 stats */
4274		unsigned long high = cachep->high_mark;
4275		unsigned long allocs = cachep->num_allocations;
4276		unsigned long grown = cachep->grown;
4277		unsigned long reaped = cachep->reaped;
4278		unsigned long errors = cachep->errors;
4279		unsigned long max_freeable = cachep->max_freeable;
4280		unsigned long node_allocs = cachep->node_allocs;
4281		unsigned long node_frees = cachep->node_frees;
4282		unsigned long overflows = cachep->node_overflow;
4283
4284		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
4285				"%4lu %4lu %4lu %4lu %4lu", allocs, high, grown,
4286				reaped, errors, max_freeable, node_allocs,
4287				node_frees, overflows);
4288	}
4289	/* cpu stats */
4290	{
4291		unsigned long allochit = atomic_read(&cachep->allochit);
4292		unsigned long allocmiss = atomic_read(&cachep->allocmiss);
4293		unsigned long freehit = atomic_read(&cachep->freehit);
4294		unsigned long freemiss = atomic_read(&cachep->freemiss);
4295
4296		seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
4297			   allochit, allocmiss, freehit, freemiss);
4298	}
4299#endif
4300	seq_putc(m, '\n');
4301	return 0;
4302}
4303
4304/*
4305 * slabinfo_op - iterator that generates /proc/slabinfo
4306 *
4307 * Output layout:
4308 * cache-name
4309 * num-active-objs
4310 * total-objs
4311 * object size
4312 * num-active-slabs
4313 * total-slabs
4314 * num-pages-per-slab
4315 * + further values on SMP and with statistics enabled
4316 */
4317
4318const struct seq_operations slabinfo_op = {
4319	.start = s_start,
4320	.next = s_next,
4321	.stop = s_stop,
4322	.show = s_show,
4323};
4324
4325#define MAX_SLABINFO_WRITE 128
4326/**
4327 * slabinfo_write - Tuning for the slab allocator
4328 * @file: unused
4329 * @buffer: user buffer
4330 * @count: data length
4331 * @ppos: unused
4332 */
4333ssize_t slabinfo_write(struct file *file, const char __user * buffer,
4334		       size_t count, loff_t *ppos)
4335{
4336	char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
4337	int limit, batchcount, shared, res;
4338	struct kmem_cache *cachep;
4339
4340	if (count > MAX_SLABINFO_WRITE)
4341		return -EINVAL;
4342	if (copy_from_user(&kbuf, buffer, count))
4343		return -EFAULT;
4344	kbuf[MAX_SLABINFO_WRITE] = '\0';
4345
4346	tmp = strchr(kbuf, ' ');
4347	if (!tmp)
4348		return -EINVAL;
4349	*tmp = '\0';
4350	tmp++;
4351	if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
4352		return -EINVAL;
4353
4354	/* Find the cache in the chain of caches. */
4355	mutex_lock(&cache_chain_mutex);
4356	res = -EINVAL;
4357	list_for_each_entry(cachep, &cache_chain, next) {
4358		if (!strcmp(cachep->name, kbuf)) {
4359			if (limit < 1 || batchcount < 1 ||
4360					batchcount > limit || shared < 0) {
4361				res = 0;
4362			} else {
4363				res = do_tune_cpucache(cachep, limit,
4364						       batchcount, shared);
4365			}
4366			break;
4367		}
4368	}
4369	mutex_unlock(&cache_chain_mutex);
4370	if (res >= 0)
4371		res = count;
4372	return res;
4373}
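
/*
 * Example (illustrative; "my_cache" stands for a name from the first column
 * of /proc/slabinfo): the parser above expects one line of the form
 * "name limit batchcount shared", so writing
 *
 *	my_cache 128 64 8
 *
 * to /proc/slabinfo asks do_tune_cpucache() to give that cache per-cpu
 * arrays of 128 entries refilled 64 at a time and a shared factor of 8.
 * Out-of-range values (batchcount > limit, shared < 0, ...) make the write
 * "succeed" without tuning anything (res = 0 above).
 */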
4374
4375#ifdef CONFIG_DEBUG_SLAB_LEAK
4376
4377static void *leaks_start(struct seq_file *m, loff_t *pos)
4378{
4379	loff_t n = *pos;
4380	struct list_head *p;
4381
4382	mutex_lock(&cache_chain_mutex);
4383	p = cache_chain.next;
4384	while (n--) {
4385		p = p->next;
4386		if (p == &cache_chain)
4387			return NULL;
4388	}
4389	return list_entry(p, struct kmem_cache, next);
4390}
4391
4392static inline int add_caller(unsigned long *n, unsigned long v)
4393{
4394	unsigned long *p;
4395	int l;
4396	if (!v)
4397		return 1;
4398	l = n[1];
4399	p = n + 2;
4400	while (l) {
4401		int i = l/2;
4402		unsigned long *q = p + 2 * i;
4403		if (*q == v) {
4404			q[1]++;
4405			return 1;
4406		}
4407		if (*q > v) {
4408			l = i;
4409		} else {
4410			p = q + 2;
4411			l -= i + 1;
4412		}
4413	}
4414	if (++n[1] == n[0])
4415		return 0;
4416	memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
4417	p[0] = v;
4418	p[1] = 1;
4419	return 1;
4420}
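
/*
 * Illustrative note on the array add_caller() maintains (the contents shown
 * are made up): n[0] is the capacity, n[1] the number of distinct callers,
 * and n[2..] holds (address, count) pairs kept sorted by address for the
 * binary search above, e.g.
 *
 *	n = { 256, 2, 0xc01234a0, 17, 0xc0987650, 3, ... }
 *
 * records 17 live objects allocated from the first call site and 3 from the
 * second; leaks_show() below prints exactly these pairs via show_symbol().
 */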
4421
4422static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
4423{
4424	void *p;
4425	int i;
4426	if (n[0] == n[1])
4427		return;
4428	for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
4429		if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
4430			continue;
4431		if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
4432			return;
4433	}
4434}
4435
4436static void show_symbol(struct seq_file *m, unsigned long address)
4437{
4438#ifdef CONFIG_KALLSYMS
4439	char *modname;
4440	const char *name;
4441	unsigned long offset, size;
4442	char namebuf[KSYM_NAME_LEN+1];
4443
4444	name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
4445
4446	if (name) {
4447		seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
4448		if (modname)
4449			seq_printf(m, " [%s]", modname);
4450		return;
4451	}
4452#endif
4453	seq_printf(m, "%p", (void *)address);
4454}
4455
4456static int leaks_show(struct seq_file *m, void *p)
4457{
4458	struct kmem_cache *cachep = p;
4459	struct slab *slabp;
4460	struct kmem_list3 *l3;
4461	const char *name;
4462	unsigned long *n = m->private;
4463	int node;
4464	int i;
4465
4466	if (!(cachep->flags & SLAB_STORE_USER))
4467		return 0;
4468	if (!(cachep->flags & SLAB_RED_ZONE))
4469		return 0;
4470
4471	/* OK, we can do it */
4472
4473	n[1] = 0;
4474
4475	for_each_online_node(node) {
4476		l3 = cachep->nodelists[node];
4477		if (!l3)
4478			continue;
4479
4480		check_irq_on();
4481		spin_lock_irq(&l3->list_lock);
4482
4483		list_for_each_entry(slabp, &l3->slabs_full, list)
4484			handle_slab(n, cachep, slabp);
4485		list_for_each_entry(slabp, &l3->slabs_partial, list)
4486			handle_slab(n, cachep, slabp);
4487		spin_unlock_irq(&l3->list_lock);
4488	}
4489	name = cachep->name;
4490	if (n[0] == n[1]) {
4491		/* Increase the buffer size */
4492		mutex_unlock(&cache_chain_mutex);
4493		m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
4494		if (!m->private) {
4495			/* Too bad, we are really out */
4496			m->private = n;
4497			mutex_lock(&cache_chain_mutex);
4498			return -ENOMEM;
4499		}
4500		*(unsigned long *)m->private = n[0] * 2;
4501		kfree(n);
4502		mutex_lock(&cache_chain_mutex);
4503		/* Now make sure this entry will be retried */
4504		m->count = m->size;
4505		return 0;
4506	}
4507	for (i = 0; i < n[1]; i++) {
4508		seq_printf(m, "%s: %lu ", name, n[2*i+3]);
4509		show_symbol(m, n[2*i+2]);
4510		seq_putc(m, '\n');
4511	}
4512
4513	return 0;
4514}
4515
4516const struct seq_operations slabstats_op = {
4517	.start = leaks_start,
4518	.next = s_next,
4519	.stop = s_stop,
4520	.show = leaks_show,
4521};
4522#endif
4523#endif
4524
4525/**
4526 * ksize - get the actual amount of memory allocated for a given object
4527 * @objp: Pointer to the object
4528 *
4529 * kmalloc may internally round up allocations and return more memory
4530 * than requested. ksize() can be used to determine the actual amount of
4531 * memory allocated. The caller may use this additional memory, even though
4532 * a smaller amount of memory was initially specified with the kmalloc call.
4533 * The caller must guarantee that objp points to a valid object previously
4534 * allocated with either kmalloc() or kmem_cache_alloc(). The object
4535 * must not be freed during the duration of the call.
4536 */
4537size_t ksize(const void *objp)
4538{
4539	if (unlikely(objp == NULL))
4540		return 0;
4541
4542	return obj_size(virt_to_cache(objp));
4543}
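
/*
 * Example (illustrative sketch; "buf", "len" and "capacity" are
 * hypothetical): since kmalloc() rounds requests up to a general cache
 * size, a caller can discover and use the slack instead of reallocating
 * early,
 *
 *	buf = kmalloc(len, GFP_KERNEL);
 *	if (buf)
 *		capacity = ksize(buf);	(capacity >= len)
 *
 * The answer is simply obj_size() of the cache the object sits in, so it is
 * only meaningful for live objects from kmalloc() or kmem_cache_alloc().
 */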
4544