slab.c revision dcce284a259373f9e5570f2e33f79eca84fcf565
1/*
2 * linux/mm/slab.c
3 * Written by Mark Hemment, 1996/97.
4 * (markhe@nextd.demon.co.uk)
5 *
6 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
7 *
8 * Major cleanup, different bufctl logic, per-cpu arrays
9 *	(c) 2000 Manfred Spraul
10 *
11 * Cleanup, make the head arrays unconditional, preparation for NUMA
12 * 	(c) 2002 Manfred Spraul
13 *
14 * An implementation of the Slab Allocator as described in outline in;
15 *	UNIX Internals: The New Frontiers by Uresh Vahalia
16 *	Pub: Prentice Hall	ISBN 0-13-101908-2
17 * or with a little more detail in;
18 *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
19 *	Jeff Bonwick (Sun Microsystems).
20 *	Presented at: USENIX Summer 1994 Technical Conference
21 *
22 * The memory is organized in caches, one cache for each object type.
23 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
24 * Each cache consists of many slabs (they are small (usually one
25 * page long) and always contiguous), and each slab contains multiple
26 * initialized objects.
27 *
28 * This means that your constructor is used only for newly allocated
29 * slabs and you must pass objects with the same initializations to
30 * kmem_cache_free.
31 *
32 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
33 * normal). If you need a special memory type, then you must create a new
34 * cache for that memory type.
35 *
36 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
37 *   full slabs with 0 free objects
38 *   partial slabs
39 *   empty slabs with no allocated objects
40 *
41 * If partial slabs exist, then new allocations come from these slabs;
42 * otherwise they come from empty slabs, or new slabs are allocated.
43 *
44 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
45 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
46 *
47 * Each cache has a short per-cpu head array, most allocs
48 * and frees go into that array, and if that array overflows, then 1/2
49 * of the entries in the array are given back into the global cache.
50 * The head array is strictly LIFO and should improve the cache hit rates.
51 * On SMP, it additionally reduces the spinlock operations.
52 *
53 * The c_cpuarray may not be read with local interrupts enabled -
54 * it's changed with a smp_call_function().
55 *
56 * SMP synchronization:
57 *  constructors and destructors are called without any locking.
58 *  Several members in struct kmem_cache and struct slab never change; they
59 *	are accessed without any locking.
60 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
61 *  	and local interrupts are disabled so slab code is preempt-safe.
62 *  The non-constant members are protected with a per-cache irq spinlock.
63 *
64 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
65 * in 2000 - many ideas in the current implementation are derived from
66 * his patch.
67 *
68 * Further notes from the original documentation:
69 *
70 * 11 April '97.  Started multi-threading - markhe
71 *	The global cache-chain is protected by the mutex 'cache_chain_mutex'.
72 *	The mutex is only needed when accessing/extending the cache-chain, which
73 *	can never happen inside an interrupt (kmem_cache_create(),
74 *	kmem_cache_shrink() and kmem_cache_reap()).
75 *
76 *	At present, each engine can be growing a cache.  This should be blocked.
77 *
78 * 15 March 2005. NUMA slab allocator.
79 *	Shai Fultheim <shai@scalex86.org>.
80 *	Shobhit Dayal <shobhit@calsoftinc.com>
81 *	Alok N Kataria <alokk@calsoftinc.com>
82 *	Christoph Lameter <christoph@lameter.com>
83 *
84 *	Modified the slab allocator to be node aware on NUMA systems.
85 *	Each node has its own list of partial, free and full slabs.
86 *	All object allocations for a node occur from node specific slab lists.
87 */
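/*
 * Illustrative usage sketch (not part of this file's code; struct foo and
 * foo_ctor are hypothetical names): a typical client creates a cache for
 * its object type once, then allocates and frees objects from it, always
 * returning them in the constructed state as described above.
 *
 *	static struct kmem_cache *foo_cachep;
 *
 *	static void foo_ctor(void *obj)
 *	{
 *		struct foo *f = obj;
 *		spin_lock_init(&f->lock);
 *	}
 *
 *	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
 *				       SLAB_HWCACHE_ALIGN, foo_ctor);
 *	f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cachep, f);
 */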
88
89#include	<linux/slab.h>
90#include	<linux/mm.h>
91#include	<linux/poison.h>
92#include	<linux/swap.h>
93#include	<linux/cache.h>
94#include	<linux/interrupt.h>
95#include	<linux/init.h>
96#include	<linux/compiler.h>
97#include	<linux/cpuset.h>
98#include	<linux/proc_fs.h>
99#include	<linux/seq_file.h>
100#include	<linux/notifier.h>
101#include	<linux/kallsyms.h>
102#include	<linux/cpu.h>
103#include	<linux/sysctl.h>
104#include	<linux/module.h>
105#include	<linux/kmemtrace.h>
106#include	<linux/rcupdate.h>
107#include	<linux/string.h>
108#include	<linux/uaccess.h>
109#include	<linux/nodemask.h>
110#include	<linux/kmemleak.h>
111#include	<linux/mempolicy.h>
112#include	<linux/mutex.h>
113#include	<linux/fault-inject.h>
114#include	<linux/rtmutex.h>
115#include	<linux/reciprocal_div.h>
116#include	<linux/debugobjects.h>
117#include	<linux/kmemcheck.h>
118
119#include	<asm/cacheflush.h>
120#include	<asm/tlbflush.h>
121#include	<asm/page.h>
122
123/*
124 * DEBUG	- 1 for kmem_cache_create() to honour SLAB_RED_ZONE & SLAB_POISON.
125 *		  0 for faster, smaller code (especially in the critical paths).
126 *
127 * STATS	- 1 to collect stats for /proc/slabinfo.
128 *		  0 for faster, smaller code (especially in the critical paths).
129 *
130 * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
131 */
132
133#ifdef CONFIG_DEBUG_SLAB
134#define	DEBUG		1
135#define	STATS		1
136#define	FORCED_DEBUG	1
137#else
138#define	DEBUG		0
139#define	STATS		0
140#define	FORCED_DEBUG	0
141#endif
142
143/* Shouldn't this be in a header file somewhere? */
144#define	BYTES_PER_WORD		sizeof(void *)
145#define	REDZONE_ALIGN		max(BYTES_PER_WORD, __alignof__(unsigned long long))
146
147#ifndef ARCH_KMALLOC_MINALIGN
148/*
149 * Enforce a minimum alignment for the kmalloc caches.
150 * Usually, the kmalloc caches are cache_line_size() aligned, except when
151 * DEBUG and FORCED_DEBUG are enabled; then they are BYTES_PER_WORD aligned.
152 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
153 * alignment larger than the alignment of a 64-bit integer.
154 * ARCH_KMALLOC_MINALIGN allows that.
155 * Note that increasing this value may disable some debug features.
156 */
157#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
158#endif
159
160#ifndef ARCH_SLAB_MINALIGN
161/*
162 * Enforce a minimum alignment for all caches.
163 * Intended for archs that get misalignment faults even for BYTES_PER_WORD
164 * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
165 * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
166 * some debug features.
167 */
168#define ARCH_SLAB_MINALIGN 0
169#endif
170
171#ifndef ARCH_KMALLOC_FLAGS
172#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
173#endif
174
175/* Legal flag mask for kmem_cache_create(). */
176#if DEBUG
177# define CREATE_MASK	(SLAB_RED_ZONE | \
178			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
179			 SLAB_CACHE_DMA | \
180			 SLAB_STORE_USER | \
181			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
182			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
183			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
184#else
185# define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
186			 SLAB_CACHE_DMA | \
187			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
188			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
189			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
190#endif
191
192/*
193 * kmem_bufctl_t:
194 *
195 * Bufctls are used for linking objs within a slab via
196 * linked offsets.
197 *
198 * This implementation relies on "struct page" for locating the cache &
199 * slab an object belongs to.
200 * This allows the bufctl structure to be small (one int), but limits
201 * the number of objects a slab (not a cache) can contain when off-slab
202 * bufctls are used. The limit is the size of the largest general cache
203 * that does not use off-slab slabs.
204 * For 32bit archs with 4 kB pages, this is 56.
205 * This is not serious, as it is only for large objects, when it is unwise
206 * to have too many per slab.
207 * Note: This limit can be raised by introducing a general cache whose size
208 * is less than 512 (PAGE_SIZE>>3), but greater than 256.
209 */
210
211typedef unsigned int kmem_bufctl_t;
212#define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
213#define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
214#define	BUFCTL_ACTIVE	(((kmem_bufctl_t)(~0U))-2)
215#define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-3)
216
217/*
218 * struct slab
219 *
220 * Manages the objs in a slab. Placed either at the beginning of mem allocated
221 * for a slab, or allocated from a general cache.
222 * Slabs are chained into three lists: fully used, partial, fully free slabs.
223 */
224struct slab {
225	struct list_head list;
226	unsigned long colouroff;
227	void *s_mem;		/* including colour offset */
228	unsigned int inuse;	/* num of objs active in slab */
229	kmem_bufctl_t free;
230	unsigned short nodeid;
231};
232
233/*
234 * struct slab_rcu
235 *
236 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
237 * arrange for kmem_freepages to be called via RCU.  This is useful if
238 * we need to approach a kernel structure obliquely, from its address
239 * obtained without the usual locking.  We can lock the structure to
240 * stabilize it and check it's still at the given address, only if we
241 * can be sure that the memory has not been meanwhile reused for some
242 * other kind of object (which our subsystem's lock might corrupt).
243 *
244 * rcu_read_lock before reading the address, then rcu_read_unlock after
245 * taking the spinlock within the structure expected at that address.
246 *
247 * We assume struct slab_rcu can overlay struct slab when destroying.
248 */
249struct slab_rcu {
250	struct rcu_head head;
251	struct kmem_cache *cachep;
252	void *addr;
253};
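/*
 * Illustrative sketch of the lookup pattern described above (the object
 * type, its fields and lookup_without_locks() are hypothetical):
 *
 *	rcu_read_lock();
 *	obj = lookup_without_locks(key);
 *	if (obj) {
 *		spin_lock(&obj->lock);
 *		if (obj->key != key) {
 *			spin_unlock(&obj->lock);
 *			obj = NULL;
 *		}
 *	}
 *	rcu_read_unlock();
 *
 * Because the pages of a SLAB_DESTROY_BY_RCU cache are only returned to
 * the page allocator after a grace period, dereferencing the possibly
 * stale address under rcu_read_lock() stays safe.
 */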
254
255/*
256 * struct array_cache
257 *
258 * Purpose:
259 * - LIFO ordering, to hand out cache-warm objects from _alloc
260 * - reduce the number of linked list operations
261 * - reduce spinlock operations
262 *
263 * The limit is stored in the per-cpu structure to reduce the data cache
264 * footprint.
265 *
266 */
267struct array_cache {
268	unsigned int avail;
269	unsigned int limit;
270	unsigned int batchcount;
271	unsigned int touched;
272	spinlock_t lock;
273	void *entry[];	/*
274			 * Must have this definition in here for the proper
275			 * alignment of array_cache. Also simplifies accessing
276			 * the entries.
277			 */
278};
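/*
 * Illustrative note: elsewhere in this file a free pushes with
 *	ac->entry[ac->avail++] = objp;
 * and an allocation pops with
 *	objp = ac->entry[--ac->avail];
 * which gives the LIFO, cache-warm behaviour described above.
 */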
279
280/*
281 * bootstrap: The caches do not work without cpuarrays anymore, but the
282 * cpuarrays are allocated from the generic caches...
283 */
284#define BOOT_CPUCACHE_ENTRIES	1
285struct arraycache_init {
286	struct array_cache cache;
287	void *entries[BOOT_CPUCACHE_ENTRIES];
288};
289
290/*
291 * The slab lists for all objects.
292 */
293struct kmem_list3 {
294	struct list_head slabs_partial;	/* partial list first, better asm code */
295	struct list_head slabs_full;
296	struct list_head slabs_free;
297	unsigned long free_objects;
298	unsigned int free_limit;
299	unsigned int colour_next;	/* Per-node cache coloring */
300	spinlock_t list_lock;
301	struct array_cache *shared;	/* shared per node */
302	struct array_cache **alien;	/* on other nodes */
303	unsigned long next_reap;	/* updated without locking */
304	int free_touched;		/* updated without locking */
305};
306
307/*
308 * Need this for bootstrapping a per node allocator.
309 */
310#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
311struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
312#define	CACHE_CACHE 0
313#define	SIZE_AC MAX_NUMNODES
314#define	SIZE_L3 (2 * MAX_NUMNODES)
315
316static int drain_freelist(struct kmem_cache *cache,
317			struct kmem_list3 *l3, int tofree);
318static void free_block(struct kmem_cache *cachep, void **objpp, int len,
319			int node);
320static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
321static void cache_reap(struct work_struct *unused);
322
323/*
324 * This function must be completely optimized away if a constant is passed to
325 * it.  Mostly the same as what is in linux/slab.h except it returns an index.
326 */
327static __always_inline int index_of(const size_t size)
328{
329	extern void __bad_size(void);
330
331	if (__builtin_constant_p(size)) {
332		int i = 0;
333
334#define CACHE(x) \
335	if (size <= x) \
336		return i; \
337	else \
338		i++;
339#include <linux/kmalloc_sizes.h>
340#undef CACHE
341		__bad_size();
342	} else
343		__bad_size();
344	return 0;
345}
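/*
 * Example (assuming a configuration whose first kmalloc sizes are
 * 32, 64, 128, 192, ...): index_of(24) folds to 0 (the 32-byte cache)
 * and index_of(100) folds to 2 (the 128-byte cache) at compile time.
 * A non-constant size leaves the undefined __bad_size() reference in
 * place and fails at link time.
 */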
346
347static int slab_early_init = 1;
348
349#define INDEX_AC index_of(sizeof(struct arraycache_init))
350#define INDEX_L3 index_of(sizeof(struct kmem_list3))
351
352static void kmem_list3_init(struct kmem_list3 *parent)
353{
354	INIT_LIST_HEAD(&parent->slabs_full);
355	INIT_LIST_HEAD(&parent->slabs_partial);
356	INIT_LIST_HEAD(&parent->slabs_free);
357	parent->shared = NULL;
358	parent->alien = NULL;
359	parent->colour_next = 0;
360	spin_lock_init(&parent->list_lock);
361	parent->free_objects = 0;
362	parent->free_touched = 0;
363}
364
365#define MAKE_LIST(cachep, listp, slab, nodeid)				\
366	do {								\
367		INIT_LIST_HEAD(listp);					\
368		list_splice(&(cachep->nodelists[nodeid]->slab), listp);	\
369	} while (0)
370
371#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
372	do {								\
373	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
374	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
375	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
376	} while (0)
377
378#define CFLGS_OFF_SLAB		(0x80000000UL)
379#define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)
380
381#define BATCHREFILL_LIMIT	16
382/*
383 * Optimization question: fewer reaps mean a lower probability of unnecessary
384 * cpucache drain/refill cycles.
385 *
386 * OTOH the cpuarrays can contain lots of objects,
387 * which could lock up otherwise freeable slabs.
388 */
389#define REAPTIMEOUT_CPUC	(2*HZ)
390#define REAPTIMEOUT_LIST3	(4*HZ)
391
392#if STATS
393#define	STATS_INC_ACTIVE(x)	((x)->num_active++)
394#define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
395#define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
396#define	STATS_INC_GROWN(x)	((x)->grown++)
397#define	STATS_ADD_REAPED(x,y)	((x)->reaped += (y))
398#define	STATS_SET_HIGH(x)						\
399	do {								\
400		if ((x)->num_active > (x)->high_mark)			\
401			(x)->high_mark = (x)->num_active;		\
402	} while (0)
403#define	STATS_INC_ERR(x)	((x)->errors++)
404#define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
405#define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
406#define STATS_INC_ACOVERFLOW(x)   ((x)->node_overflow++)
407#define	STATS_SET_FREEABLE(x, i)					\
408	do {								\
409		if ((x)->max_freeable < i)				\
410			(x)->max_freeable = i;				\
411	} while (0)
412#define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
413#define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
414#define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
415#define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
416#else
417#define	STATS_INC_ACTIVE(x)	do { } while (0)
418#define	STATS_DEC_ACTIVE(x)	do { } while (0)
419#define	STATS_INC_ALLOCED(x)	do { } while (0)
420#define	STATS_INC_GROWN(x)	do { } while (0)
421#define	STATS_ADD_REAPED(x,y)	do { } while (0)
422#define	STATS_SET_HIGH(x)	do { } while (0)
423#define	STATS_INC_ERR(x)	do { } while (0)
424#define	STATS_INC_NODEALLOCS(x)	do { } while (0)
425#define	STATS_INC_NODEFREES(x)	do { } while (0)
426#define STATS_INC_ACOVERFLOW(x)   do { } while (0)
427#define	STATS_SET_FREEABLE(x, i) do { } while (0)
428#define STATS_INC_ALLOCHIT(x)	do { } while (0)
429#define STATS_INC_ALLOCMISS(x)	do { } while (0)
430#define STATS_INC_FREEHIT(x)	do { } while (0)
431#define STATS_INC_FREEMISS(x)	do { } while (0)
432#endif
433
434#if DEBUG
435
436/*
437 * memory layout of objects:
438 * 0		: objp
439 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
440 * 		the end of an object is aligned with the end of the real
441 * 		allocation. Catches writes behind the end of the allocation.
442 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
443 * 		redzone word.
444 * cachep->obj_offset: The real object.
445 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
446 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
447 *					[BYTES_PER_WORD long]
448 */
449static int obj_offset(struct kmem_cache *cachep)
450{
451	return cachep->obj_offset;
452}
453
454static int obj_size(struct kmem_cache *cachep)
455{
456	return cachep->obj_size;
457}
458
459static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
460{
461	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
462	return (unsigned long long*) (objp + obj_offset(cachep) -
463				      sizeof(unsigned long long));
464}
465
466static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
467{
468	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
469	if (cachep->flags & SLAB_STORE_USER)
470		return (unsigned long long *)(objp + cachep->buffer_size -
471					      sizeof(unsigned long long) -
472					      REDZONE_ALIGN);
473	return (unsigned long long *) (objp + cachep->buffer_size -
474				       sizeof(unsigned long long));
475}
476
477static void **dbg_userword(struct kmem_cache *cachep, void *objp)
478{
479	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
480	return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
481}
482
483#else
484
485#define obj_offset(x)			0
486#define obj_size(cachep)		(cachep->buffer_size)
487#define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
488#define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
489#define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})
490
491#endif
492
493#ifdef CONFIG_KMEMTRACE
494size_t slab_buffer_size(struct kmem_cache *cachep)
495{
496	return cachep->buffer_size;
497}
498EXPORT_SYMBOL(slab_buffer_size);
499#endif
500
501/*
502 * Do not go above this order unless 0 objects fit into the slab.
503 */
504#define	BREAK_GFP_ORDER_HI	1
505#define	BREAK_GFP_ORDER_LO	0
506static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
507
508/*
509 * Functions for storing/retrieving the cachep and/or slab from the page
510 * allocator.  These are used to find the slab an obj belongs to.  With kfree(),
511 * these are used to find the cache to which an obj belongs.
512 */
513static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
514{
515	page->lru.next = (struct list_head *)cache;
516}
517
518static inline struct kmem_cache *page_get_cache(struct page *page)
519{
520	page = compound_head(page);
521	BUG_ON(!PageSlab(page));
522	return (struct kmem_cache *)page->lru.next;
523}
524
525static inline void page_set_slab(struct page *page, struct slab *slab)
526{
527	page->lru.prev = (struct list_head *)slab;
528}
529
530static inline struct slab *page_get_slab(struct page *page)
531{
532	BUG_ON(!PageSlab(page));
533	return (struct slab *)page->lru.prev;
534}
535
536static inline struct kmem_cache *virt_to_cache(const void *obj)
537{
538	struct page *page = virt_to_head_page(obj);
539	return page_get_cache(page);
540}
541
542static inline struct slab *virt_to_slab(const void *obj)
543{
544	struct page *page = virt_to_head_page(obj);
545	return page_get_slab(page);
546}
547
548static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
549				 unsigned int idx)
550{
551	return slab->s_mem + cache->buffer_size * idx;
552}
553
554/*
555 * We want to avoid an expensive divide : (offset / cache->buffer_size)
556 *   Using the fact that buffer_size is a constant for a particular cache,
557 *   we can replace (offset / cache->buffer_size) by
558 *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
559 */
560static inline unsigned int obj_to_index(const struct kmem_cache *cache,
561					const struct slab *slab, void *obj)
562{
563	u32 offset = (obj - slab->s_mem);
564	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
565}
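/*
 * Example (illustrative): with buffer_size == 256, an object at offset
 * 1280 from s_mem is index 5.  reciprocal_divide(1280,
 * reciprocal_value(256)) yields the same 5 using a multiply and a shift
 * instead of a hardware divide.
 */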
566
567/*
568 * These are the default caches for kmalloc. Custom caches can have other sizes.
569 */
570struct cache_sizes malloc_sizes[] = {
571#define CACHE(x) { .cs_size = (x) },
572#include <linux/kmalloc_sizes.h>
573	CACHE(ULONG_MAX)
574#undef CACHE
575};
576EXPORT_SYMBOL(malloc_sizes);
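/*
 * Illustrative expansion (e.g. 4K pages, 64-byte cache lines): the table
 * above becomes { .cs_size = 32 }, { .cs_size = 64 }, { .cs_size = 128 },
 * { .cs_size = 192 }, ... terminated by the ULONG_MAX sentinel entry.
 */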
577
578/* Must match cache_sizes above. Out of line to keep cache footprint low. */
579struct cache_names {
580	char *name;
581	char *name_dma;
582};
583
584static struct cache_names __initdata cache_names[] = {
585#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
586#include <linux/kmalloc_sizes.h>
587	{NULL,}
588#undef CACHE
589};
590
591static struct arraycache_init initarray_cache __initdata =
592    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
593static struct arraycache_init initarray_generic =
594    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
595
596/* internal cache of cache description objs */
597static struct kmem_cache cache_cache = {
598	.batchcount = 1,
599	.limit = BOOT_CPUCACHE_ENTRIES,
600	.shared = 1,
601	.buffer_size = sizeof(struct kmem_cache),
602	.name = "kmem_cache",
603};
604
605#define BAD_ALIEN_MAGIC 0x01020304ul
606
607#ifdef CONFIG_LOCKDEP
608
609/*
610 * Slab sometimes uses the kmalloc slabs to store the slab headers
611 * for other slabs "off slab".
612 * The locking for this is tricky in that it nests within the locks
613 * of all other slabs in a few places; to deal with this special
614 * locking we put on-slab caches into a separate lock-class.
615 *
616 * We set the lock class for alien array caches which are up during init.
617 * The lock annotation will be lost if all cpus of a node go down and
618 * then come back up during hotplug.
619 */
620static struct lock_class_key on_slab_l3_key;
621static struct lock_class_key on_slab_alc_key;
622
623static inline void init_lock_keys(void)
624
625{
626	int q;
627	struct cache_sizes *s = malloc_sizes;
628
629	while (s->cs_size != ULONG_MAX) {
630		for_each_node(q) {
631			struct array_cache **alc;
632			int r;
633			struct kmem_list3 *l3 = s->cs_cachep->nodelists[q];
634			if (!l3 || OFF_SLAB(s->cs_cachep))
635				continue;
636			lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
637			alc = l3->alien;
638			/*
639			 * FIXME: This check for BAD_ALIEN_MAGIC
640			 * should go away when common slab code is taught to
641			 * work even without alien caches.
642			 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
643			 * for alloc_alien_cache,
644			 */
645			if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
646				continue;
647			for_each_node(r) {
648				if (alc[r])
649					lockdep_set_class(&alc[r]->lock,
650					     &on_slab_alc_key);
651			}
652		}
653		s++;
654	}
655}
656#else
657static inline void init_lock_keys(void)
658{
659}
660#endif
661
662/*
663 * Guard access to the cache-chain.
664 */
665static DEFINE_MUTEX(cache_chain_mutex);
666static struct list_head cache_chain;
667
668/*
669 * chicken and egg problem: delay the per-cpu array allocation
670 * until the general caches are up.
671 */
672static enum {
673	NONE,
674	PARTIAL_AC,
675	PARTIAL_L3,
676	EARLY,
677	FULL
678} g_cpucache_up;
679
680/*
681 * used by boot code to determine if it can use slab based allocator
682 */
683int slab_is_available(void)
684{
685	return g_cpucache_up >= EARLY;
686}
687
688static DEFINE_PER_CPU(struct delayed_work, reap_work);
689
690static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
691{
692	return cachep->array[smp_processor_id()];
693}
694
695static inline struct kmem_cache *__find_general_cachep(size_t size,
696							gfp_t gfpflags)
697{
698	struct cache_sizes *csizep = malloc_sizes;
699
700#if DEBUG
701	/* This happens if someone tries to call
702	 * kmem_cache_create(), or __kmalloc(), before
703	 * the generic caches are initialized.
704	 */
705	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
706#endif
707	if (!size)
708		return ZERO_SIZE_PTR;
709
710	while (size > csizep->cs_size)
711		csizep++;
712
713	/*
714	 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
715	 * for large kmalloc calls is required.
716	 * for large kmalloc calls required.
717	 */
718#ifdef CONFIG_ZONE_DMA
719	if (unlikely(gfpflags & GFP_DMA))
720		return csizep->cs_dmacachep;
721#endif
722	return csizep->cs_cachep;
723}
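/*
 * Example (same illustrative 32/64/128/192... size table as assumed
 * above): __find_general_cachep(100, GFP_KERNEL) walks the table and
 * returns the 128-byte cache; with GFP_DMA set (and CONFIG_ZONE_DMA)
 * it would return the corresponding size-128(DMA) cache instead.
 */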
724
725static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
726{
727	return __find_general_cachep(size, gfpflags);
728}
729
730static size_t slab_mgmt_size(size_t nr_objs, size_t align)
731{
732	return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
733}
734
735/*
736 * Calculate the number of objects and left-over bytes for a given buffer size.
737 */
738static void cache_estimate(unsigned long gfporder, size_t buffer_size,
739			   size_t align, int flags, size_t *left_over,
740			   unsigned int *num)
741{
742	int nr_objs;
743	size_t mgmt_size;
744	size_t slab_size = PAGE_SIZE << gfporder;
745
746	/*
747	 * The slab management structure can be either off the slab or
748	 * on it. For the latter case, the memory allocated for a
749	 * slab is used for:
750	 *
751	 * - The struct slab
752	 * - One kmem_bufctl_t for each object
753	 * - Padding to respect alignment of @align
754	 * - @buffer_size bytes for each object
755	 *
756	 * If the slab management structure is off the slab, then the
757	 * alignment will already be calculated into the size. Because
758	 * the slabs are all pages aligned, the objects will be at the
759	 * correct alignment when allocated.
760	 */
761	if (flags & CFLGS_OFF_SLAB) {
762		mgmt_size = 0;
763		nr_objs = slab_size / buffer_size;
764
765		if (nr_objs > SLAB_LIMIT)
766			nr_objs = SLAB_LIMIT;
767	} else {
768		/*
769		 * Ignore padding for the initial guess. The padding
770		 * is at most @align-1 bytes, and @buffer_size is at
771		 * least @align. In the worst case, this result will
772		 * be one greater than the number of objects that fit
773		 * into the memory allocation when taking the padding
774		 * into account.
775		 */
776		nr_objs = (slab_size - sizeof(struct slab)) /
777			  (buffer_size + sizeof(kmem_bufctl_t));
778
779		/*
780		 * This calculated number will be either the right
781		 * amount, or one greater than what we want.
782		 */
783		if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
784		       > slab_size)
785			nr_objs--;
786
787		if (nr_objs > SLAB_LIMIT)
788			nr_objs = SLAB_LIMIT;
789
790		mgmt_size = slab_mgmt_size(nr_objs, align);
791	}
792	*num = nr_objs;
793	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
794}
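/*
 * Worked example (illustrative figures): for a 4096-byte slab
 * (gfporder 0), buffer_size 256, align 64 and on-slab management,
 * assuming sizeof(struct slab) == 48 and sizeof(kmem_bufctl_t) == 4,
 * the initial guess is (4096 - 48) / (256 + 4) = 15 objects.  The
 * management area is ALIGN(48 + 15*4, 64) = 128 bytes and
 * 128 + 15*256 = 3968 <= 4096, so *num = 15 and *left_over = 128.
 */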
795
796#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
797
798static void __slab_error(const char *function, struct kmem_cache *cachep,
799			char *msg)
800{
801	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
802	       function, cachep->name, msg);
803	dump_stack();
804}
805
806/*
807 * By default on NUMA we use alien caches to stage the freeing of
808 * objects allocated from other nodes. This causes massive memory
809 * inefficiencies when using a fake NUMA setup to split memory into a
810 * large number of small nodes, so it can be disabled on the command
811 * line.
812 */
813
814static int use_alien_caches __read_mostly = 1;
815static int __init noaliencache_setup(char *s)
816{
817	use_alien_caches = 0;
818	return 1;
819}
820__setup("noaliencache", noaliencache_setup);
821
822#ifdef CONFIG_NUMA
823/*
824 * Special reaping functions for NUMA systems called from cache_reap().
825 * These take care of doing round robin flushing of alien caches (containing
826 * objects freed on a node other than the one they were allocated on) and the
827 * flushing of remote pcps by calling drain_node_pages.
828 */
829static DEFINE_PER_CPU(unsigned long, reap_node);
830
831static void init_reap_node(int cpu)
832{
833	int node;
834
835	node = next_node(cpu_to_node(cpu), node_online_map);
836	if (node == MAX_NUMNODES)
837		node = first_node(node_online_map);
838
839	per_cpu(reap_node, cpu) = node;
840}
841
842static void next_reap_node(void)
843{
844	int node = __get_cpu_var(reap_node);
845
846	node = next_node(node, node_online_map);
847	if (unlikely(node >= MAX_NUMNODES))
848		node = first_node(node_online_map);
849	__get_cpu_var(reap_node) = node;
850}
851
852#else
853#define init_reap_node(cpu) do { } while (0)
854#define next_reap_node(void) do { } while (0)
855#endif
856
857/*
858 * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
859 * via the workqueue/eventd.
860 * Add the CPU number into the expiration time to minimize the possibility of
861 * the CPUs getting into lockstep and contending for the global cache chain
862 * lock.
863 */
864static void __cpuinit start_cpu_timer(int cpu)
865{
866	struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
867
868	/*
869	 * When this gets called from do_initcalls via cpucache_init(),
870	 * init_workqueues() has already run, so keventd will be setup
871	 * at that time.
872	 */
873	if (keventd_up() && reap_work->work.func == NULL) {
874		init_reap_node(cpu);
875		INIT_DELAYED_WORK(reap_work, cache_reap);
876		schedule_delayed_work_on(cpu, reap_work,
877					__round_jiffies_relative(HZ, cpu));
878	}
879}
880
881static struct array_cache *alloc_arraycache(int node, int entries,
882					    int batchcount, gfp_t gfp)
883{
884	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
885	struct array_cache *nc = NULL;
886
887	nc = kmalloc_node(memsize, gfp, node);
888	/*
889	 * The array_cache structures contain pointers to free objects.
890	 * However, when such objects are allocated or transferred to another
891	 * cache, the pointers are not cleared and they could be counted as
892	 * valid references during a kmemleak scan. Therefore, kmemleak must
893	 * not scan such objects.
894	 */
895	kmemleak_no_scan(nc);
896	if (nc) {
897		nc->avail = 0;
898		nc->limit = entries;
899		nc->batchcount = batchcount;
900		nc->touched = 0;
901		spin_lock_init(&nc->lock);
902	}
903	return nc;
904}
905
906/*
907 * Transfer objects from one arraycache to another.
908 * Locking must be handled by the caller.
909 *
910 * Return the number of entries transferred.
911 */
912static int transfer_objects(struct array_cache *to,
913		struct array_cache *from, unsigned int max)
914{
915	/* Figure out how many entries to transfer */
916	int nr = min(min(from->avail, max), to->limit - to->avail);
917
918	if (!nr)
919		return 0;
920
921	memcpy(to->entry + to->avail, from->entry + from->avail - nr,
922			sizeof(void *) * nr);
923
924	from->avail -= nr;
925	to->avail += nr;
926	to->touched = 1;
927	return nr;
928}
929
930#ifndef CONFIG_NUMA
931
932#define drain_alien_cache(cachep, alien) do { } while (0)
933#define reap_alien(cachep, l3) do { } while (0)
934
935static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
936{
937	return (struct array_cache **)BAD_ALIEN_MAGIC;
938}
939
940static inline void free_alien_cache(struct array_cache **ac_ptr)
941{
942}
943
944static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
945{
946	return 0;
947}
948
949static inline void *alternate_node_alloc(struct kmem_cache *cachep,
950		gfp_t flags)
951{
952	return NULL;
953}
954
955static inline void *____cache_alloc_node(struct kmem_cache *cachep,
956		 gfp_t flags, int nodeid)
957{
958	return NULL;
959}
960
961#else	/* CONFIG_NUMA */
962
963static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
964static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
965
966static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
967{
968	struct array_cache **ac_ptr;
969	int memsize = sizeof(void *) * nr_node_ids;
970	int i;
971
972	if (limit > 1)
973		limit = 12;
974	ac_ptr = kmalloc_node(memsize, gfp, node);
975	if (ac_ptr) {
976		for_each_node(i) {
977			if (i == node || !node_online(i)) {
978				ac_ptr[i] = NULL;
979				continue;
980			}
981			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
982			if (!ac_ptr[i]) {
983				for (i--; i >= 0; i--)
984					kfree(ac_ptr[i]);
985				kfree(ac_ptr);
986				return NULL;
987			}
988		}
989	}
990	return ac_ptr;
991}
992
993static void free_alien_cache(struct array_cache **ac_ptr)
994{
995	int i;
996
997	if (!ac_ptr)
998		return;
999	for_each_node(i)
1000	    kfree(ac_ptr[i]);
1001	kfree(ac_ptr);
1002}
1003
1004static void __drain_alien_cache(struct kmem_cache *cachep,
1005				struct array_cache *ac, int node)
1006{
1007	struct kmem_list3 *rl3 = cachep->nodelists[node];
1008
1009	if (ac->avail) {
1010		spin_lock(&rl3->list_lock);
1011		/*
1012		 * Stuff objects into the remote node's shared array first.
1013		 * That way we could avoid the overhead of putting the objects
1014		 * into the free lists and getting them back later.
1015		 */
1016		if (rl3->shared)
1017			transfer_objects(rl3->shared, ac, ac->limit);
1018
1019		free_block(cachep, ac->entry, ac->avail, node);
1020		ac->avail = 0;
1021		spin_unlock(&rl3->list_lock);
1022	}
1023}
1024
1025/*
1026 * Called from cache_reap() to regularly drain alien caches round robin.
1027 */
1028static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
1029{
1030	int node = __get_cpu_var(reap_node);
1031
1032	if (l3->alien) {
1033		struct array_cache *ac = l3->alien[node];
1034
1035		if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
1036			__drain_alien_cache(cachep, ac, node);
1037			spin_unlock_irq(&ac->lock);
1038		}
1039	}
1040}
1041
1042static void drain_alien_cache(struct kmem_cache *cachep,
1043				struct array_cache **alien)
1044{
1045	int i = 0;
1046	struct array_cache *ac;
1047	unsigned long flags;
1048
1049	for_each_online_node(i) {
1050		ac = alien[i];
1051		if (ac) {
1052			spin_lock_irqsave(&ac->lock, flags);
1053			__drain_alien_cache(cachep, ac, i);
1054			spin_unlock_irqrestore(&ac->lock, flags);
1055		}
1056	}
1057}
1058
1059static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1060{
1061	struct slab *slabp = virt_to_slab(objp);
1062	int nodeid = slabp->nodeid;
1063	struct kmem_list3 *l3;
1064	struct array_cache *alien = NULL;
1065	int node;
1066
1067	node = numa_node_id();
1068
1069	/*
1070	 * Make sure we are not freeing an object from another node to the array
1071	 * cache on this cpu.
1072	 */
1073	if (likely(slabp->nodeid == node))
1074		return 0;
1075
1076	l3 = cachep->nodelists[node];
1077	STATS_INC_NODEFREES(cachep);
1078	if (l3->alien && l3->alien[nodeid]) {
1079		alien = l3->alien[nodeid];
1080		spin_lock(&alien->lock);
1081		if (unlikely(alien->avail == alien->limit)) {
1082			STATS_INC_ACOVERFLOW(cachep);
1083			__drain_alien_cache(cachep, alien, nodeid);
1084		}
1085		alien->entry[alien->avail++] = objp;
1086		spin_unlock(&alien->lock);
1087	} else {
1088		spin_lock(&(cachep->nodelists[nodeid])->list_lock);
1089		free_block(cachep, &objp, 1, nodeid);
1090		spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
1091	}
1092	return 1;
1093}
1094#endif
1095
1096static void __cpuinit cpuup_canceled(long cpu)
1097{
1098	struct kmem_cache *cachep;
1099	struct kmem_list3 *l3 = NULL;
1100	int node = cpu_to_node(cpu);
1101	const struct cpumask *mask = cpumask_of_node(node);
1102
1103	list_for_each_entry(cachep, &cache_chain, next) {
1104		struct array_cache *nc;
1105		struct array_cache *shared;
1106		struct array_cache **alien;
1107
1108		/* cpu is dead; no one can alloc from it. */
1109		nc = cachep->array[cpu];
1110		cachep->array[cpu] = NULL;
1111		l3 = cachep->nodelists[node];
1112
1113		if (!l3)
1114			goto free_array_cache;
1115
1116		spin_lock_irq(&l3->list_lock);
1117
1118		/* Free limit for this kmem_list3 */
1119		l3->free_limit -= cachep->batchcount;
1120		if (nc)
1121			free_block(cachep, nc->entry, nc->avail, node);
1122
1123		if (!cpus_empty(*mask)) {
1124			spin_unlock_irq(&l3->list_lock);
1125			goto free_array_cache;
1126		}
1127
1128		shared = l3->shared;
1129		if (shared) {
1130			free_block(cachep, shared->entry,
1131				   shared->avail, node);
1132			l3->shared = NULL;
1133		}
1134
1135		alien = l3->alien;
1136		l3->alien = NULL;
1137
1138		spin_unlock_irq(&l3->list_lock);
1139
1140		kfree(shared);
1141		if (alien) {
1142			drain_alien_cache(cachep, alien);
1143			free_alien_cache(alien);
1144		}
1145free_array_cache:
1146		kfree(nc);
1147	}
1148	/*
1149	 * In the previous loop, all the objects were freed to
1150	 * the respective cache's slabs; now we can go ahead and
1151	 * shrink each nodelist to its limit.
1152	 */
1153	list_for_each_entry(cachep, &cache_chain, next) {
1154		l3 = cachep->nodelists[node];
1155		if (!l3)
1156			continue;
1157		drain_freelist(cachep, l3, l3->free_objects);
1158	}
1159}
1160
1161static int __cpuinit cpuup_prepare(long cpu)
1162{
1163	struct kmem_cache *cachep;
1164	struct kmem_list3 *l3 = NULL;
1165	int node = cpu_to_node(cpu);
1166	const int memsize = sizeof(struct kmem_list3);
1167
1168	/*
1169	 * We need to do this right in the beginning since
1170	 * the alloc_arraycache calls are going to use this list.
1171	 * kmalloc_node allows us to add the slab to the right
1172	 * kmem_list3 and not this cpu's kmem_list3.
1173	 */
1174
1175	list_for_each_entry(cachep, &cache_chain, next) {
1176		/*
1177		 * Set up the kmem_list3 for this node before we can
1178		 * begin anything. Make sure some other cpu on this
1179		 * node has not already allocated it.
1180		 */
1181		if (!cachep->nodelists[node]) {
1182			l3 = kmalloc_node(memsize, GFP_KERNEL, node);
1183			if (!l3)
1184				goto bad;
1185			kmem_list3_init(l3);
1186			l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
1187			    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1188
1189			/*
1190			 * The l3s don't come and go as CPUs come and
1191			 * go.  cache_chain_mutex is sufficient
1192			 * protection here.
1193			 */
1194			cachep->nodelists[node] = l3;
1195		}
1196
1197		spin_lock_irq(&cachep->nodelists[node]->list_lock);
1198		cachep->nodelists[node]->free_limit =
1199			(1 + nr_cpus_node(node)) *
1200			cachep->batchcount + cachep->num;
1201		spin_unlock_irq(&cachep->nodelists[node]->list_lock);
1202	}
1203
1204	/*
1205	 * Now we can go ahead with allocating the shared arrays and
1206	 * array caches
1207	 */
1208	list_for_each_entry(cachep, &cache_chain, next) {
1209		struct array_cache *nc;
1210		struct array_cache *shared = NULL;
1211		struct array_cache **alien = NULL;
1212
1213		nc = alloc_arraycache(node, cachep->limit,
1214					cachep->batchcount, GFP_KERNEL);
1215		if (!nc)
1216			goto bad;
1217		if (cachep->shared) {
1218			shared = alloc_arraycache(node,
1219				cachep->shared * cachep->batchcount,
1220				0xbaadf00d, GFP_KERNEL);
1221			if (!shared) {
1222				kfree(nc);
1223				goto bad;
1224			}
1225		}
1226		if (use_alien_caches) {
1227			alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
1228			if (!alien) {
1229				kfree(shared);
1230				kfree(nc);
1231				goto bad;
1232			}
1233		}
1234		cachep->array[cpu] = nc;
1235		l3 = cachep->nodelists[node];
1236		BUG_ON(!l3);
1237
1238		spin_lock_irq(&l3->list_lock);
1239		if (!l3->shared) {
1240			/*
1241			 * We are serialised from CPU_DEAD or
1242			 * CPU_UP_CANCELLED by the cpucontrol lock
1243			 */
1244			l3->shared = shared;
1245			shared = NULL;
1246		}
1247#ifdef CONFIG_NUMA
1248		if (!l3->alien) {
1249			l3->alien = alien;
1250			alien = NULL;
1251		}
1252#endif
1253		spin_unlock_irq(&l3->list_lock);
1254		kfree(shared);
1255		free_alien_cache(alien);
1256	}
1257	return 0;
1258bad:
1259	cpuup_canceled(cpu);
1260	return -ENOMEM;
1261}
1262
1263static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1264				    unsigned long action, void *hcpu)
1265{
1266	long cpu = (long)hcpu;
1267	int err = 0;
1268
1269	switch (action) {
1270	case CPU_UP_PREPARE:
1271	case CPU_UP_PREPARE_FROZEN:
1272		mutex_lock(&cache_chain_mutex);
1273		err = cpuup_prepare(cpu);
1274		mutex_unlock(&cache_chain_mutex);
1275		break;
1276	case CPU_ONLINE:
1277	case CPU_ONLINE_FROZEN:
1278		start_cpu_timer(cpu);
1279		break;
1280#ifdef CONFIG_HOTPLUG_CPU
1281  	case CPU_DOWN_PREPARE:
1282  	case CPU_DOWN_PREPARE_FROZEN:
1283		/*
1284		 * Shutdown cache reaper. Note that the cache_chain_mutex is
1285		 * held so that if cache_reap() is invoked it cannot do
1286		 * anything expensive but will only modify reap_work
1287		 * and reschedule the timer.
1288		*/
1289		cancel_rearming_delayed_work(&per_cpu(reap_work, cpu));
1290		/* Now the cache_reaper is guaranteed to be not running. */
1291		per_cpu(reap_work, cpu).work.func = NULL;
1292  		break;
1293  	case CPU_DOWN_FAILED:
1294  	case CPU_DOWN_FAILED_FROZEN:
1295		start_cpu_timer(cpu);
1296  		break;
1297	case CPU_DEAD:
1298	case CPU_DEAD_FROZEN:
1299		/*
1300		 * Even if all the cpus of a node are down, we don't free the
1301		 * kmem_list3 of any cache. This is to avoid a race between
1302		 * cpu_down and a kmalloc allocation from another cpu for
1303		 * memory from the node of the cpu going down.  The list3
1304		 * structure is usually allocated from kmem_cache_create() and
1305		 * gets destroyed at kmem_cache_destroy().
1306		 */
1307		/* fall through */
1308#endif
1309	case CPU_UP_CANCELED:
1310	case CPU_UP_CANCELED_FROZEN:
1311		mutex_lock(&cache_chain_mutex);
1312		cpuup_canceled(cpu);
1313		mutex_unlock(&cache_chain_mutex);
1314		break;
1315	}
1316	return err ? NOTIFY_BAD : NOTIFY_OK;
1317}
1318
1319static struct notifier_block __cpuinitdata cpucache_notifier = {
1320	&cpuup_callback, NULL, 0
1321};
1322
1323/*
1324 * swap the static kmem_list3 with kmalloced memory
1325 */
1326static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1327			int nodeid)
1328{
1329	struct kmem_list3 *ptr;
1330
1331	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid);
1332	BUG_ON(!ptr);
1333
1334	memcpy(ptr, list, sizeof(struct kmem_list3));
1335	/*
1336	 * Do not assume that spinlocks can be initialized via memcpy:
1337	 */
1338	spin_lock_init(&ptr->list_lock);
1339
1340	MAKE_ALL_LISTS(cachep, ptr, nodeid);
1341	cachep->nodelists[nodeid] = ptr;
1342}
1343
1344/*
1345 * For setting up all the kmem_list3s for a cache whose buffer_size is the same
1346 * as the size of kmem_list3.
1347 */
1348static void __init set_up_list3s(struct kmem_cache *cachep, int index)
1349{
1350	int node;
1351
1352	for_each_online_node(node) {
1353		cachep->nodelists[node] = &initkmem_list3[index + node];
1354		cachep->nodelists[node]->next_reap = jiffies +
1355		    REAPTIMEOUT_LIST3 +
1356		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1357	}
1358}
1359
1360/*
1361 * Initialisation.  Called after the page allocator has been initialised and
1362 * before smp_init().
1363 */
1364void __init kmem_cache_init(void)
1365{
1366	size_t left_over;
1367	struct cache_sizes *sizes;
1368	struct cache_names *names;
1369	int i;
1370	int order;
1371	int node;
1372
1373	if (num_possible_nodes() == 1)
1374		use_alien_caches = 0;
1375
1376	for (i = 0; i < NUM_INIT_LISTS; i++) {
1377		kmem_list3_init(&initkmem_list3[i]);
1378		if (i < MAX_NUMNODES)
1379			cache_cache.nodelists[i] = NULL;
1380	}
1381	set_up_list3s(&cache_cache, CACHE_CACHE);
1382
1383	/*
1384	 * Fragmentation resistance on low memory - only use bigger
1385	 * page orders on machines with more than 32MB of memory.
1386	 */
1387	if (num_physpages > (32 << 20) >> PAGE_SHIFT)
1388		slab_break_gfp_order = BREAK_GFP_ORDER_HI;
1389
1390	/* Bootstrap is tricky, because several objects are allocated
1391	 * from caches that do not exist yet:
1392	 * 1) initialize the cache_cache cache: it contains the struct
1393	 *    kmem_cache structures of all caches, except cache_cache itself:
1394	 *    cache_cache is statically allocated.
1395	 *    Initially an __init data area is used for the head array and the
1396	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
1397	 *    array at the end of the bootstrap.
1398	 * 2) Create the first kmalloc cache.
1399	 *    The struct kmem_cache for the new cache is allocated normally.
1400	 *    An __init data area is used for the head array.
1401	 * 3) Create the remaining kmalloc caches, with minimally sized
1402	 *    head arrays.
1403	 * 4) Replace the __init data head arrays for cache_cache and the first
1404	 *    kmalloc cache with kmalloc allocated arrays.
1405	 * 5) Replace the __init data for kmem_list3 for cache_cache and
1406	 *    the other caches with kmalloc allocated memory.
1407	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1408	 */
1409
1410	node = numa_node_id();
1411
1412	/* 1) create the cache_cache */
1413	INIT_LIST_HEAD(&cache_chain);
1414	list_add(&cache_cache.next, &cache_chain);
1415	cache_cache.colour_off = cache_line_size();
1416	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1417	cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
1418
1419	/*
1420	 * struct kmem_cache size depends on nr_node_ids, which
1421	 * can be less than MAX_NUMNODES.
1422	 */
1423	cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
1424				 nr_node_ids * sizeof(struct kmem_list3 *);
1425#if DEBUG
1426	cache_cache.obj_size = cache_cache.buffer_size;
1427#endif
1428	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1429					cache_line_size());
1430	cache_cache.reciprocal_buffer_size =
1431		reciprocal_value(cache_cache.buffer_size);
1432
1433	for (order = 0; order < MAX_ORDER; order++) {
1434		cache_estimate(order, cache_cache.buffer_size,
1435			cache_line_size(), 0, &left_over, &cache_cache.num);
1436		if (cache_cache.num)
1437			break;
1438	}
1439	BUG_ON(!cache_cache.num);
1440	cache_cache.gfporder = order;
1441	cache_cache.colour = left_over / cache_cache.colour_off;
1442	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
1443				      sizeof(struct slab), cache_line_size());
1444
1445	/* 2+3) create the kmalloc caches */
1446	sizes = malloc_sizes;
1447	names = cache_names;
1448
1449	/*
1450	 * Initialize the caches that provide memory for the array cache and the
1451	 * kmem_list3 structures first.  Without this, further allocations will
1452	 * BUG().
1453	 */
1454
1455	sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1456					sizes[INDEX_AC].cs_size,
1457					ARCH_KMALLOC_MINALIGN,
1458					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1459					NULL);
1460
1461	if (INDEX_AC != INDEX_L3) {
1462		sizes[INDEX_L3].cs_cachep =
1463			kmem_cache_create(names[INDEX_L3].name,
1464				sizes[INDEX_L3].cs_size,
1465				ARCH_KMALLOC_MINALIGN,
1466				ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1467				NULL);
1468	}
1469
1470	slab_early_init = 0;
1471
1472	while (sizes->cs_size != ULONG_MAX) {
1473		/*
1474		 * For performance, all the general caches are L1 aligned.
1475		 * This should be particularly beneficial on SMP boxes, as it
1476		 * eliminates "false sharing".
1477		 * Note that for systems short on memory, removing the alignment will
1478		 * allow tighter packing of the smaller caches.
1479		 */
1480		if (!sizes->cs_cachep) {
1481			sizes->cs_cachep = kmem_cache_create(names->name,
1482					sizes->cs_size,
1483					ARCH_KMALLOC_MINALIGN,
1484					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1485					NULL);
1486		}
1487#ifdef CONFIG_ZONE_DMA
1488		sizes->cs_dmacachep = kmem_cache_create(
1489					names->name_dma,
1490					sizes->cs_size,
1491					ARCH_KMALLOC_MINALIGN,
1492					ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
1493						SLAB_PANIC,
1494					NULL);
1495#endif
1496		sizes++;
1497		names++;
1498	}
1499	/* 4) Replace the bootstrap head arrays */
1500	{
1501		struct array_cache *ptr;
1502
1503		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1504
1505		BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
1506		memcpy(ptr, cpu_cache_get(&cache_cache),
1507		       sizeof(struct arraycache_init));
1508		/*
1509		 * Do not assume that spinlocks can be initialized via memcpy:
1510		 */
1511		spin_lock_init(&ptr->lock);
1512
1513		cache_cache.array[smp_processor_id()] = ptr;
1514
1515		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1516
1517		BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
1518		       != &initarray_generic.cache);
1519		memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
1520		       sizeof(struct arraycache_init));
1521		/*
1522		 * Do not assume that spinlocks can be initialized via memcpy:
1523		 */
1524		spin_lock_init(&ptr->lock);
1525
1526		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1527		    ptr;
1528	}
1529	/* 5) Replace the bootstrap kmem_list3's */
1530	{
1531		int nid;
1532
1533		for_each_online_node(nid) {
1534			init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
1535
1536			init_list(malloc_sizes[INDEX_AC].cs_cachep,
1537				  &initkmem_list3[SIZE_AC + nid], nid);
1538
1539			if (INDEX_AC != INDEX_L3) {
1540				init_list(malloc_sizes[INDEX_L3].cs_cachep,
1541					  &initkmem_list3[SIZE_L3 + nid], nid);
1542			}
1543		}
1544	}
1545
1546	g_cpucache_up = EARLY;
1547
1548	/* Annotate slab for lockdep -- annotate the malloc caches */
1549	init_lock_keys();
1550}
1551
1552void __init kmem_cache_init_late(void)
1553{
1554	struct kmem_cache *cachep;
1555
1556	/* 6) resize the head arrays to their final sizes */
1557	mutex_lock(&cache_chain_mutex);
1558	list_for_each_entry(cachep, &cache_chain, next)
1559		if (enable_cpucache(cachep, GFP_NOWAIT))
1560			BUG();
1561	mutex_unlock(&cache_chain_mutex);
1562
1563	/* Done! */
1564	g_cpucache_up = FULL;
1565
1566	/*
1567	 * Register a cpu startup notifier callback that initializes
1568	 * cpu_cache_get for all new cpus
1569	 */
1570	register_cpu_notifier(&cpucache_notifier);
1571
1572	/*
1573	 * The reap timers are started later, with a module init call: that part
1574	 * of the kernel is not yet operational.
1575	 */
1576}
1577
1578static int __init cpucache_init(void)
1579{
1580	int cpu;
1581
1582	/*
1583	 * Register the timers that return unneeded pages to the page allocator
1584	 */
1585	for_each_online_cpu(cpu)
1586		start_cpu_timer(cpu);
1587	return 0;
1588}
1589__initcall(cpucache_init);
1590
1591/*
1592 * Interface to system's page allocator. No need to hold the cache-lock.
1593 *
1594 * If we requested dmaable memory, we will get it. Even if we
1595 * did not request dmaable memory, we might get it, but that
1596 * would be relatively rare and ignorable.
1597 */
1598static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1599{
1600	struct page *page;
1601	int nr_pages;
1602	int i;
1603
1604#ifndef CONFIG_MMU
1605	/*
1606	 * Nommu uses slabs for process anonymous memory allocations, and thus
1607	 * requires __GFP_COMP to properly refcount higher order allocations.
1608	 */
1609	flags |= __GFP_COMP;
1610#endif
1611
1612	flags |= cachep->gfpflags;
1613	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1614		flags |= __GFP_RECLAIMABLE;
1615
1616	page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
1617	if (!page)
1618		return NULL;
1619
1620	nr_pages = (1 << cachep->gfporder);
1621	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1622		add_zone_page_state(page_zone(page),
1623			NR_SLAB_RECLAIMABLE, nr_pages);
1624	else
1625		add_zone_page_state(page_zone(page),
1626			NR_SLAB_UNRECLAIMABLE, nr_pages);
1627	for (i = 0; i < nr_pages; i++)
1628		__SetPageSlab(page + i);
1629
1630	if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1631		kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
1632
1633		if (cachep->ctor)
1634			kmemcheck_mark_uninitialized_pages(page, nr_pages);
1635		else
1636			kmemcheck_mark_unallocated_pages(page, nr_pages);
1637	}
1638
1639	return page_address(page);
1640}
1641
1642/*
1643 * Interface to system's page release.
1644 */
1645static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1646{
1647	unsigned long i = (1 << cachep->gfporder);
1648	struct page *page = virt_to_page(addr);
1649	const unsigned long nr_freed = i;
1650
1651	kmemcheck_free_shadow(page, cachep->gfporder);
1652
1653	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1654		sub_zone_page_state(page_zone(page),
1655				NR_SLAB_RECLAIMABLE, nr_freed);
1656	else
1657		sub_zone_page_state(page_zone(page),
1658				NR_SLAB_UNRECLAIMABLE, nr_freed);
1659	while (i--) {
1660		BUG_ON(!PageSlab(page));
1661		__ClearPageSlab(page);
1662		page++;
1663	}
1664	if (current->reclaim_state)
1665		current->reclaim_state->reclaimed_slab += nr_freed;
1666	free_pages((unsigned long)addr, cachep->gfporder);
1667}
1668
1669static void kmem_rcu_free(struct rcu_head *head)
1670{
1671	struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1672	struct kmem_cache *cachep = slab_rcu->cachep;
1673
1674	kmem_freepages(cachep, slab_rcu->addr);
1675	if (OFF_SLAB(cachep))
1676		kmem_cache_free(cachep->slabp_cache, slab_rcu);
1677}
1678
1679#if DEBUG
1680
1681#ifdef CONFIG_DEBUG_PAGEALLOC
1682static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1683			    unsigned long caller)
1684{
1685	int size = obj_size(cachep);
1686
1687	addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
1688
1689	if (size < 5 * sizeof(unsigned long))
1690		return;
1691
1692	*addr++ = 0x12345678;
1693	*addr++ = caller;
1694	*addr++ = smp_processor_id();
1695	size -= 3 * sizeof(unsigned long);
1696	{
1697		unsigned long *sptr = &caller;
1698		unsigned long svalue;
1699
1700		while (!kstack_end(sptr)) {
1701			svalue = *sptr++;
1702			if (kernel_text_address(svalue)) {
1703				*addr++ = svalue;
1704				size -= sizeof(unsigned long);
1705				if (size <= sizeof(unsigned long))
1706					break;
1707			}
1708		}
1709
1710	}
1711	*addr++ = 0x87654321;
1712}
1713#endif
1714
1715static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1716{
1717	int size = obj_size(cachep);
1718	addr = &((char *)addr)[obj_offset(cachep)];
1719
1720	memset(addr, val, size);
1721	*(unsigned char *)(addr + size - 1) = POISON_END;
1722}
1723
1724static void dump_line(char *data, int offset, int limit)
1725{
1726	int i;
1727	unsigned char error = 0;
1728	int bad_count = 0;
1729
1730	printk(KERN_ERR "%03x:", offset);
1731	for (i = 0; i < limit; i++) {
1732		if (data[offset + i] != POISON_FREE) {
1733			error = data[offset + i];
1734			bad_count++;
1735		}
1736		printk(" %02x", (unsigned char)data[offset + i]);
1737	}
1738	printk("\n");
1739
1740	if (bad_count == 1) {
1741		error ^= POISON_FREE;
1742		if (!(error & (error - 1))) {
1743			printk(KERN_ERR "Single bit error detected. Probably "
1744					"bad RAM.\n");
1745#ifdef CONFIG_X86
1746			printk(KERN_ERR "Run memtest86+ or a similar memory "
1747					"test tool.\n");
1748#else
1749			printk(KERN_ERR "Run a memory test tool.\n");
1750#endif
1751		}
1752	}
1753}
1754#endif
1755
1756#if DEBUG
1757
1758static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1759{
1760	int i, size;
1761	char *realobj;
1762
1763	if (cachep->flags & SLAB_RED_ZONE) {
1764		printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n",
1765			*dbg_redzone1(cachep, objp),
1766			*dbg_redzone2(cachep, objp));
1767	}
1768
1769	if (cachep->flags & SLAB_STORE_USER) {
1770		printk(KERN_ERR "Last user: [<%p>]",
1771			*dbg_userword(cachep, objp));
1772		print_symbol("(%s)",
1773				(unsigned long)*dbg_userword(cachep, objp));
1774		printk("\n");
1775	}
1776	realobj = (char *)objp + obj_offset(cachep);
1777	size = obj_size(cachep);
1778	for (i = 0; i < size && lines; i += 16, lines--) {
1779		int limit;
1780		limit = 16;
1781		if (i + limit > size)
1782			limit = size - i;
1783		dump_line(realobj, i, limit);
1784	}
1785}
1786
1787static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1788{
1789	char *realobj;
1790	int size, i;
1791	int lines = 0;
1792
1793	realobj = (char *)objp + obj_offset(cachep);
1794	size = obj_size(cachep);
1795
1796	for (i = 0; i < size; i++) {
1797		char exp = POISON_FREE;
1798		if (i == size - 1)
1799			exp = POISON_END;
1800		if (realobj[i] != exp) {
1801			int limit;
1802			/* Mismatch ! */
1803			/* Print header */
1804			if (lines == 0) {
1805				printk(KERN_ERR
1806					"Slab corruption: %s start=%p, len=%d\n",
1807					cachep->name, realobj, size);
1808				print_objinfo(cachep, objp, 0);
1809			}
1810			/* Hexdump the affected line */
1811			i = (i / 16) * 16;
1812			limit = 16;
1813			if (i + limit > size)
1814				limit = size - i;
1815			dump_line(realobj, i, limit);
1816			i += 16;
1817			lines++;
1818			/* Limit to 5 lines */
1819			if (lines > 5)
1820				break;
1821		}
1822	}
1823	if (lines != 0) {
1824		/* Print some data about the neighboring objects, if they
1825		 * exist:
1826		 */
1827		struct slab *slabp = virt_to_slab(objp);
1828		unsigned int objnr;
1829
1830		objnr = obj_to_index(cachep, slabp, objp);
1831		if (objnr) {
1832			objp = index_to_obj(cachep, slabp, objnr - 1);
1833			realobj = (char *)objp + obj_offset(cachep);
1834			printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1835			       realobj, size);
1836			print_objinfo(cachep, objp, 2);
1837		}
1838		if (objnr + 1 < cachep->num) {
1839			objp = index_to_obj(cachep, slabp, objnr + 1);
1840			realobj = (char *)objp + obj_offset(cachep);
1841			printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1842			       realobj, size);
1843			print_objinfo(cachep, objp, 2);
1844		}
1845	}
1846}
1847#endif
1848
1849#if DEBUG
1850static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
1851{
1852	int i;
1853	for (i = 0; i < cachep->num; i++) {
1854		void *objp = index_to_obj(cachep, slabp, i);
1855
1856		if (cachep->flags & SLAB_POISON) {
1857#ifdef CONFIG_DEBUG_PAGEALLOC
1858			if (cachep->buffer_size % PAGE_SIZE == 0 &&
1859					OFF_SLAB(cachep))
1860				kernel_map_pages(virt_to_page(objp),
1861					cachep->buffer_size / PAGE_SIZE, 1);
1862			else
1863				check_poison_obj(cachep, objp);
1864#else
1865			check_poison_obj(cachep, objp);
1866#endif
1867		}
1868		if (cachep->flags & SLAB_RED_ZONE) {
1869			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1870				slab_error(cachep, "start of a freed object "
1871					   "was overwritten");
1872			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1873				slab_error(cachep, "end of a freed object "
1874					   "was overwritten");
1875		}
1876	}
1877}
1878#else
1879static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
1880{
1881}
1882#endif
1883
1884/**
1885 * slab_destroy - destroy and release all objects in a slab
1886 * @cachep: the cache the slab belongs to

1887 * @slabp: slab pointer being destroyed
1888 *
1889 * Destroy all the objs in a slab, and release the mem back to the system.
1890 * Before calling, the slab must have been unlinked from the cache.  The
1891 * cache-lock is not held/needed.
1892 */
1893static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1894{
1895	void *addr = slabp->s_mem - slabp->colouroff;
1896
1897	slab_destroy_debugcheck(cachep, slabp);
1898	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1899		struct slab_rcu *slab_rcu;
1900
1901		slab_rcu = (struct slab_rcu *)slabp;
1902		slab_rcu->cachep = cachep;
1903		slab_rcu->addr = addr;
1904		call_rcu(&slab_rcu->head, kmem_rcu_free);
1905	} else {
1906		kmem_freepages(cachep, addr);
1907		if (OFF_SLAB(cachep))
1908			kmem_cache_free(cachep->slabp_cache, slabp);
1909	}
1910}
1911
1912static void __kmem_cache_destroy(struct kmem_cache *cachep)
1913{
1914	int i;
1915	struct kmem_list3 *l3;
1916
1917	for_each_online_cpu(i)
1918	    kfree(cachep->array[i]);
1919
1920	/* NUMA: free the list3 structures */
1921	for_each_online_node(i) {
1922		l3 = cachep->nodelists[i];
1923		if (l3) {
1924			kfree(l3->shared);
1925			free_alien_cache(l3->alien);
1926			kfree(l3);
1927		}
1928	}
1929	kmem_cache_free(&cache_cache, cachep);
1930}
1931
1932
1933/**
1934 * calculate_slab_order - calculate size (page order) of slabs
1935 * @cachep: pointer to the cache that is being created
1936 * @size: size of objects to be created in this cache.
1937 * @align: required alignment for the objects.
1938 * @flags: slab allocation flags
1939 *
1940 * Also calculates the number of objects per slab.
1941 *
1942 * This could be made much more intelligent.  For now, try to avoid using
1943 * high order pages for slabs.  When the gfp() functions are more friendly
1944 * towards high-order requests, this should be changed.
1945 */
1946static size_t calculate_slab_order(struct kmem_cache *cachep,
1947			size_t size, size_t align, unsigned long flags)
1948{
1949	unsigned long offslab_limit;
1950	size_t left_over = 0;
1951	int gfporder;
1952
1953	for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
1954		unsigned int num;
1955		size_t remainder;
1956
1957		cache_estimate(gfporder, size, align, flags, &remainder, &num);
1958		if (!num)
1959			continue;
1960
1961		if (flags & CFLGS_OFF_SLAB) {
1962			/*
1963			 * Max number of objs-per-slab for caches which
1964			 * use off-slab slabs. Needed to avoid a possible
1965			 * looping condition in cache_grow().
1966			 */
1967			offslab_limit = size - sizeof(struct slab);
1968			offslab_limit /= sizeof(kmem_bufctl_t);
1969
1970 			if (num > offslab_limit)
1971				break;
1972		}
1973
1974		/* Found something acceptable - save it away */
1975		cachep->num = num;
1976		cachep->gfporder = gfporder;
1977		left_over = remainder;
1978
1979		/*
1980		 * A VFS-reclaimable slab tends to have most allocations
1981		 * as GFP_NOFS and we really don't want to have to be allocating
1982		 * higher-order pages when we are unable to shrink dcache.
1983		 */
1984		if (flags & SLAB_RECLAIM_ACCOUNT)
1985			break;
1986
1987		/*
1988		 * Large number of objects is good, but very large slabs are
1989		 * currently bad for the gfp()s.
1990		 */
1991		if (gfporder >= slab_break_gfp_order)
1992			break;
1993
1994		/*
1995		 * Acceptable internal fragmentation?
1996		 */
1997		if (left_over * 8 <= (PAGE_SIZE << gfporder))
1998			break;
1999	}
2000	return left_over;
2001}
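
/*
 * Rough example of the loop above (illustrative numbers, ignoring the
 * colour and alignment details handled by cache_estimate()): with 4096-byte
 * pages and 1024-byte objects whose slab management lives off-slab,
 * gfporder 0 already gives num = 4 and remainder = 0.  The fragmentation
 * test "left_over * 8 <= PAGE_SIZE << gfporder" becomes 0 <= 4096, so the
 * search stops at order 0 instead of growing the slab any further.
 */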
2002
2003static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2004{
2005	if (g_cpucache_up == FULL)
2006		return enable_cpucache(cachep, gfp);
2007
2008	if (g_cpucache_up == NONE) {
2009		/*
2010		 * Note: the first kmem_cache_create must create the cache
2011		 * that's used by kmalloc(24), otherwise the creation of
2012		 * further caches will BUG().
2013		 */
2014		cachep->array[smp_processor_id()] = &initarray_generic.cache;
2015
2016		/*
2017		 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
2018		 * the first cache, then we need to set up all its list3s,
2019		 * otherwise the creation of further caches will BUG().
2020		 */
2021		set_up_list3s(cachep, SIZE_AC);
2022		if (INDEX_AC == INDEX_L3)
2023			g_cpucache_up = PARTIAL_L3;
2024		else
2025			g_cpucache_up = PARTIAL_AC;
2026	} else {
2027		cachep->array[smp_processor_id()] =
2028			kmalloc(sizeof(struct arraycache_init), gfp);
2029
2030		if (g_cpucache_up == PARTIAL_AC) {
2031			set_up_list3s(cachep, SIZE_L3);
2032			g_cpucache_up = PARTIAL_L3;
2033		} else {
2034			int node;
2035			for_each_online_node(node) {
2036				cachep->nodelists[node] =
2037				    kmalloc_node(sizeof(struct kmem_list3),
2038						gfp, node);
2039				BUG_ON(!cachep->nodelists[node]);
2040				kmem_list3_init(cachep->nodelists[node]);
2041			}
2042		}
2043	}
2044	cachep->nodelists[numa_node_id()]->next_reap =
2045			jiffies + REAPTIMEOUT_LIST3 +
2046			((unsigned long)cachep) % REAPTIMEOUT_LIST3;
2047
2048	cpu_cache_get(cachep)->avail = 0;
2049	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
2050	cpu_cache_get(cachep)->batchcount = 1;
2051	cpu_cache_get(cachep)->touched = 0;
2052	cachep->batchcount = 1;
2053	cachep->limit = BOOT_CPUCACHE_ENTRIES;
2054	return 0;
2055}
2056
2057/**
2058 * kmem_cache_create - Create a cache.
2059 * @name: A string which is used in /proc/slabinfo to identify this cache.
2060 * @size: The size of objects to be created in this cache.
2061 * @align: The required alignment for the objects.
2062 * @flags: SLAB flags
2063 * @ctor: A constructor for the objects.
2064 *
2065 * Returns a ptr to the cache on success, NULL on failure.
2066 * Cannot be called within a int, but can be interrupted.
2067 * The @ctor is run when new pages are allocated by the cache.
2068 *
2069 * @name must be valid until the cache is destroyed. This implies that
2070 * the module calling this has to destroy the cache before getting unloaded.
2071 * Note that kmem_cache_name() is not guaranteed to return the same pointer,
2072 * therefore applications must manage it themselves.
2073 *
2074 * The flags are
2075 *
2076 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
2077 * to catch references to uninitialised memory.
2078 *
2079 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
2080 * for buffer overruns.
2081 *
2082 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
2083 * cacheline.  This can be beneficial if you're counting cycles as closely
2084 * as davem.
2085 */
2086struct kmem_cache *
2087kmem_cache_create (const char *name, size_t size, size_t align,
2088	unsigned long flags, void (*ctor)(void *))
2089{
2090	size_t left_over, slab_size, ralign;
2091	struct kmem_cache *cachep = NULL, *pc;
2092	gfp_t gfp;
2093
2094	/*
2095	 * Sanity checks... these are all serious usage bugs.
2096	 */
2097	if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
2098	    size > KMALLOC_MAX_SIZE) {
2099		printk(KERN_ERR "%s: Early error in slab %s\n", __func__,
2100				name);
2101		BUG();
2102	}
2103
2104	/*
2105	 * We use cache_chain_mutex to ensure a consistent view of
2106	 * cpu_online_mask as well.  Please see cpuup_callback().
2107	 */
2108	if (slab_is_available()) {
2109		get_online_cpus();
2110		mutex_lock(&cache_chain_mutex);
2111	}
2112
2113	list_for_each_entry(pc, &cache_chain, next) {
2114		char tmp;
2115		int res;
2116
2117		/*
2118		 * This happens when the module gets unloaded and doesn't
2119		 * destroy its slab cache and no-one else reuses the vmalloc
2120		 * area of the module.  Print a warning.
2121		 */
2122		res = probe_kernel_address(pc->name, tmp);
2123		if (res) {
2124			printk(KERN_ERR
2125			       "SLAB: cache with size %d has lost its name\n",
2126			       pc->buffer_size);
2127			continue;
2128		}
2129
2130		if (!strcmp(pc->name, name)) {
2131			printk(KERN_ERR
2132			       "kmem_cache_create: duplicate cache %s\n", name);
2133			dump_stack();
2134			goto oops;
2135		}
2136	}
2137
2138#if DEBUG
2139	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
2140#if FORCED_DEBUG
2141	/*
2142	 * Enable redzoning and last user accounting, except for caches with
2143	 * large objects, if the increased size would increase the object size
2144	 * above the next power of two: caches with object sizes just above a
2145	 * power of two have a significant amount of internal fragmentation.
2146	 */
2147	if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
2148						2 * sizeof(unsigned long long)))
2149		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
2150	if (!(flags & SLAB_DESTROY_BY_RCU))
2151		flags |= SLAB_POISON;
2152#endif
2153	if (flags & SLAB_DESTROY_BY_RCU)
2154		BUG_ON(flags & SLAB_POISON);
2155#endif
2156	/*
2157	 * Always checks flags, a caller might be expecting debug support which
2158	 * Always check the flags, since a caller might be expecting debug support which
2159	 */
2160	BUG_ON(flags & ~CREATE_MASK);
2161
2162	/*
2163	 * Check that size is in terms of words.  This is needed to avoid
2164	 * unaligned accesses for some archs when redzoning is used, and makes
2165	 * sure any on-slab bufctl's are also correctly aligned.
2166	 */
2167	if (size & (BYTES_PER_WORD - 1)) {
2168		size += (BYTES_PER_WORD - 1);
2169		size &= ~(BYTES_PER_WORD - 1);
2170	}
2171
2172	/* calculate the final buffer alignment: */
2173
2174	/* 1) arch recommendation: can be overridden for debug */
2175	if (flags & SLAB_HWCACHE_ALIGN) {
2176		/*
2177		 * Default alignment: as specified by the arch code.  Except if
2178		 * an object is really small, then squeeze multiple objects into
2179		 * one cacheline.
2180		 */
2181		ralign = cache_line_size();
2182		while (size <= ralign / 2)
2183			ralign /= 2;
2184	} else {
2185		ralign = BYTES_PER_WORD;
2186	}
2187
2188	/*
2189	 * Redzoning and user store require word alignment or possibly larger.
2190	 * Note this will be overridden by architecture or caller mandated
2191	 * alignment if either is greater than BYTES_PER_WORD.
2192	 */
2193	if (flags & SLAB_STORE_USER)
2194		ralign = BYTES_PER_WORD;
2195
2196	if (flags & SLAB_RED_ZONE) {
2197		ralign = REDZONE_ALIGN;
2198		/* If redzoning, ensure that the second redzone is suitably
2199		 * aligned, by adjusting the object size accordingly. */
2200		size += REDZONE_ALIGN - 1;
2201		size &= ~(REDZONE_ALIGN - 1);
2202	}
2203
2204	/* 2) arch mandated alignment */
2205	if (ralign < ARCH_SLAB_MINALIGN) {
2206		ralign = ARCH_SLAB_MINALIGN;
2207	}
2208	/* 3) caller mandated alignment */
2209	if (ralign < align) {
2210		ralign = align;
2211	}
2212	/* disable debug if necessary */
2213	if (ralign > __alignof__(unsigned long long))
2214		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2215	/*
2216	 * 4) Store it.
2217	 */
2218	align = ralign;
2219
2220	if (slab_is_available())
2221		gfp = GFP_KERNEL;
2222	else
2223		gfp = GFP_NOWAIT;
2224
2225	/* Get cache's description obj. */
2226	cachep = kmem_cache_zalloc(&cache_cache, gfp);
2227	if (!cachep)
2228		goto oops;
2229
2230#if DEBUG
2231	cachep->obj_size = size;
2232
2233	/*
2234	 * Both debugging options require word-alignment which is calculated
2235	 * into align above.
2236	 */
2237	if (flags & SLAB_RED_ZONE) {
2238		/* add space for red zone words */
2239		cachep->obj_offset += sizeof(unsigned long long);
2240		size += 2 * sizeof(unsigned long long);
2241	}
2242	if (flags & SLAB_STORE_USER) {
2243		/* user store requires one word storage behind the end of
2244		 * the real object. But if the second red zone needs to be
2245		 * aligned to 64 bits, we must allow that much space.
2246		 */
2247		if (flags & SLAB_RED_ZONE)
2248			size += REDZONE_ALIGN;
2249		else
2250			size += BYTES_PER_WORD;
2251	}
2252#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2253	if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
2254	    && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
2255		cachep->obj_offset += PAGE_SIZE - size;
2256		size = PAGE_SIZE;
2257	}
2258#endif
2259#endif
2260
2261	/*
2262	 * Determine if the slab management is 'on' or 'off' slab.
2263	 * (bootstrapping cannot cope with offslab caches so don't do
2264	 * it too early on.)
2265	 */
2266	if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
2267		/*
2268		 * Size is large, assume best to place the slab management obj
2269		 * off-slab (should allow better packing of objs).
2270		 */
2271		flags |= CFLGS_OFF_SLAB;
2272
2273	size = ALIGN(size, align);
2274
2275	left_over = calculate_slab_order(cachep, size, align, flags);
2276
2277	if (!cachep->num) {
2278		printk(KERN_ERR
2279		       "kmem_cache_create: couldn't create cache %s.\n", name);
2280		kmem_cache_free(&cache_cache, cachep);
2281		cachep = NULL;
2282		goto oops;
2283	}
2284	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
2285			  + sizeof(struct slab), align);
2286
2287	/*
2288	 * If the slab has been placed off-slab, and we have enough space then
2289	 * move it on-slab. This is at the expense of any extra colouring.
2290	 */
2291	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
2292		flags &= ~CFLGS_OFF_SLAB;
2293		left_over -= slab_size;
2294	}
2295
2296	if (flags & CFLGS_OFF_SLAB) {
2297		/* really off slab. No need for manual alignment */
2298		slab_size =
2299		    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
2300
2301#ifdef CONFIG_PAGE_POISONING
2302		/* If we're going to use the generic kernel_map_pages()
2303		 * poisoning, then it's going to smash the contents of
2304		 * the redzone and userword anyhow, so switch them off.
2305		 */
2306		if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
2307			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2308#endif
2309	}
2310
2311	cachep->colour_off = cache_line_size();
2312	/* Offset must be a multiple of the alignment. */
2313	if (cachep->colour_off < align)
2314		cachep->colour_off = align;
2315	cachep->colour = left_over / cachep->colour_off;
2316	cachep->slab_size = slab_size;
2317	cachep->flags = flags;
2318	cachep->gfpflags = 0;
2319	if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
2320		cachep->gfpflags |= GFP_DMA;
2321	cachep->buffer_size = size;
2322	cachep->reciprocal_buffer_size = reciprocal_value(size);
2323
2324	if (flags & CFLGS_OFF_SLAB) {
2325		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
2326		/*
2327		 * This is a possibility for one of the malloc_sizes caches.
2328		 * But since we go off slab only for object size greater than
2329		 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
2330		 * this should not happen at all.
2331		 * But leave a BUG_ON for some lucky dude.
2332		 */
2333		BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
2334	}
2335	cachep->ctor = ctor;
2336	cachep->name = name;
2337
2338	if (setup_cpu_cache(cachep, gfp)) {
2339		__kmem_cache_destroy(cachep);
2340		cachep = NULL;
2341		goto oops;
2342	}
2343
2344	/* cache setup completed, link it into the list */
2345	list_add(&cachep->next, &cache_chain);
2346oops:
2347	if (!cachep && (flags & SLAB_PANIC))
2348		panic("kmem_cache_create(): failed to create slab `%s'\n",
2349		      name);
2350	if (slab_is_available()) {
2351		mutex_unlock(&cache_chain_mutex);
2352		put_online_cpus();
2353	}
2354	return cachep;
2355}
2356EXPORT_SYMBOL(kmem_cache_create);
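
/*
 * Typical usage sketch (hypothetical "struct foo"/foo_* names, abbreviated
 * error handling): a module creates its cache once at init time and keeps
 * the returned pointer around for later allocations.
 *
 *	struct foo {
 *		spinlock_t lock;
 *		struct list_head list;
 *	};
 *
 *	static struct kmem_cache *foo_cachep;
 *
 *	static void foo_ctor(void *obj)
 *	{
 *		struct foo *f = obj;
 *
 *		spin_lock_init(&f->lock);
 *		INIT_LIST_HEAD(&f->list);
 *	}
 *
 *	static int __init foo_init(void)
 *	{
 *		foo_cachep = kmem_cache_create("foo_cache",
 *					       sizeof(struct foo), 0,
 *					       SLAB_HWCACHE_ALIGN, foo_ctor);
 *		if (!foo_cachep)
 *			return -ENOMEM;
 *		return 0;
 *	}
 */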
2357
2358#if DEBUG
2359static void check_irq_off(void)
2360{
2361	BUG_ON(!irqs_disabled());
2362}
2363
2364static void check_irq_on(void)
2365{
2366	BUG_ON(irqs_disabled());
2367}
2368
2369static void check_spinlock_acquired(struct kmem_cache *cachep)
2370{
2371#ifdef CONFIG_SMP
2372	check_irq_off();
2373	assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
2374#endif
2375}
2376
2377static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
2378{
2379#ifdef CONFIG_SMP
2380	check_irq_off();
2381	assert_spin_locked(&cachep->nodelists[node]->list_lock);
2382#endif
2383}
2384
2385#else
2386#define check_irq_off()	do { } while(0)
2387#define check_irq_on()	do { } while(0)
2388#define check_spinlock_acquired(x) do { } while(0)
2389#define check_spinlock_acquired_node(x, y) do { } while(0)
2390#endif
2391
2392static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
2393			struct array_cache *ac,
2394			int force, int node);
2395
2396static void do_drain(void *arg)
2397{
2398	struct kmem_cache *cachep = arg;
2399	struct array_cache *ac;
2400	int node = numa_node_id();
2401
2402	check_irq_off();
2403	ac = cpu_cache_get(cachep);
2404	spin_lock(&cachep->nodelists[node]->list_lock);
2405	free_block(cachep, ac->entry, ac->avail, node);
2406	spin_unlock(&cachep->nodelists[node]->list_lock);
2407	ac->avail = 0;
2408}
2409
2410static void drain_cpu_caches(struct kmem_cache *cachep)
2411{
2412	struct kmem_list3 *l3;
2413	int node;
2414
2415	on_each_cpu(do_drain, cachep, 1);
2416	check_irq_on();
2417	for_each_online_node(node) {
2418		l3 = cachep->nodelists[node];
2419		if (l3 && l3->alien)
2420			drain_alien_cache(cachep, l3->alien);
2421	}
2422
2423	for_each_online_node(node) {
2424		l3 = cachep->nodelists[node];
2425		if (l3)
2426			drain_array(cachep, l3, l3->shared, 1, node);
2427	}
2428}
2429
2430/*
2431 * Remove slabs from the list of free slabs.
2432 * Specify the number of slabs to drain in tofree.
2433 *
2434 * Returns the actual number of slabs released.
2435 */
2436static int drain_freelist(struct kmem_cache *cache,
2437			struct kmem_list3 *l3, int tofree)
2438{
2439	struct list_head *p;
2440	int nr_freed;
2441	struct slab *slabp;
2442
2443	nr_freed = 0;
2444	while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {
2445
2446		spin_lock_irq(&l3->list_lock);
2447		p = l3->slabs_free.prev;
2448		if (p == &l3->slabs_free) {
2449			spin_unlock_irq(&l3->list_lock);
2450			goto out;
2451		}
2452
2453		slabp = list_entry(p, struct slab, list);
2454#if DEBUG
2455		BUG_ON(slabp->inuse);
2456#endif
2457		list_del(&slabp->list);
2458		/*
2459		 * Safe to drop the lock. The slab is no longer linked
2460		 * to the cache.
2461		 */
2462		l3->free_objects -= cache->num;
2463		spin_unlock_irq(&l3->list_lock);
2464		slab_destroy(cache, slabp);
2465		nr_freed++;
2466	}
2467out:
2468	return nr_freed;
2469}
2470
2471/* Called with cache_chain_mutex held to protect against cpu hotplug */
2472static int __cache_shrink(struct kmem_cache *cachep)
2473{
2474	int ret = 0, i = 0;
2475	struct kmem_list3 *l3;
2476
2477	drain_cpu_caches(cachep);
2478
2479	check_irq_on();
2480	for_each_online_node(i) {
2481		l3 = cachep->nodelists[i];
2482		if (!l3)
2483			continue;
2484
2485		drain_freelist(cachep, l3, l3->free_objects);
2486
2487		ret += !list_empty(&l3->slabs_full) ||
2488			!list_empty(&l3->slabs_partial);
2489	}
2490	return (ret ? 1 : 0);
2491}
2492
2493/**
2494 * kmem_cache_shrink - Shrink a cache.
2495 * @cachep: The cache to shrink.
2496 *
2497 * Releases as many slabs as possible for a cache.
2498 * To help debugging, a zero exit status indicates all slabs were released.
2499 */
2500int kmem_cache_shrink(struct kmem_cache *cachep)
2501{
2502	int ret;
2503	BUG_ON(!cachep || in_interrupt());
2504
2505	get_online_cpus();
2506	mutex_lock(&cache_chain_mutex);
2507	ret = __cache_shrink(cachep);
2508	mutex_unlock(&cache_chain_mutex);
2509	put_online_cpus();
2510	return ret;
2511}
2512EXPORT_SYMBOL(kmem_cache_shrink);
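
/*
 * Usage sketch (hypothetical foo_cachep): a caller reacting to memory
 * pressure can try to hand unused slab pages back to the page allocator:
 *
 *	int busy = kmem_cache_shrink(foo_cachep);
 *
 * busy == 0 means every slab was released; a non-zero value means some
 * slabs still hold live objects and could not be freed.
 */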
2513
2514/**
2515 * kmem_cache_destroy - delete a cache
2516 * @cachep: the cache to destroy
2517 *
2518 * Remove a &struct kmem_cache object from the slab cache.
2519 *
2520 * It is expected this function will be called by a module when it is
2521 * unloaded.  This will remove the cache completely, and avoid a duplicate
2522 * cache being allocated each time a module is loaded and unloaded, if the
2523 * module doesn't have persistent in-kernel storage across loads and unloads.
2524 *
2525 * The cache must be empty before calling this function.
2526 *
2527 * The caller must guarantee that no one will allocate memory from the cache
2528 * during the kmem_cache_destroy().
2529 */
2530void kmem_cache_destroy(struct kmem_cache *cachep)
2531{
2532	BUG_ON(!cachep || in_interrupt());
2533
2534	/* Find the cache in the chain of caches. */
2535	get_online_cpus();
2536	mutex_lock(&cache_chain_mutex);
2537	/*
2538	 * the chain is never empty, cache_cache is never destroyed
2539	 */
2540	list_del(&cachep->next);
2541	if (__cache_shrink(cachep)) {
2542		slab_error(cachep, "Can't free all objects");
2543		list_add(&cachep->next, &cache_chain);
2544		mutex_unlock(&cache_chain_mutex);
2545		put_online_cpus();
2546		return;
2547	}
2548
2549	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2550		synchronize_rcu();
2551
2552	__kmem_cache_destroy(cachep);
2553	mutex_unlock(&cache_chain_mutex);
2554	put_online_cpus();
2555}
2556EXPORT_SYMBOL(kmem_cache_destroy);
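
/*
 * Usage sketch (continuing the hypothetical foo_cachep example): the cache
 * is torn down from the module exit path, after every object has been
 * returned with kmem_cache_free().
 *
 *	static void __exit foo_exit(void)
 *	{
 *		kmem_cache_destroy(foo_cachep);
 *	}
 *
 * Destroying a cache that still has outstanding objects only produces the
 * "Can't free all objects" error above and leaves the cache in place.
 */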
2557
2558/*
2559 * Get the memory for a slab management obj.
2560 * For a slab cache, when the slab descriptor is off-slab, the slab
2561 * descriptors always come from the malloc_sizes caches.  The slab
2562 * descriptor cannot come from the cache that is being created because,
2563 * when we search for an appropriate cache for these descriptors in
2564 * kmem_cache_create, we search through the malloc_sizes array.
2565 * If we were creating a malloc_sizes cache here it would not be visible to
2566 * kmem_find_general_cachep until the initialization is complete.
2567 * Hence slabp_cache cannot be the same as the cache being created.
2568 */
2569static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2570				   int colour_off, gfp_t local_flags,
2571				   int nodeid)
2572{
2573	struct slab *slabp;
2574
2575	if (OFF_SLAB(cachep)) {
2576		/* Slab management obj is off-slab. */
2577		slabp = kmem_cache_alloc_node(cachep->slabp_cache,
2578					      local_flags, nodeid);
2579		if (!slabp)
2580			return NULL;
2581		/*
2582		 * If the first object in the slab is leaked (it's allocated
2583		 * but no one has a reference to it), we want to make sure
2584		 * kmemleak does not treat the ->s_mem pointer as a reference
2585		 * to the object. Otherwise we will not report the leak.
2586		 */
2587		kmemleak_scan_area(slabp, offsetof(struct slab, list),
2588				   sizeof(struct list_head), local_flags);
2589	} else {
2590		slabp = objp + colour_off;
2591		colour_off += cachep->slab_size;
2592	}
2593	slabp->inuse = 0;
2594	slabp->colouroff = colour_off;
2595	slabp->s_mem = objp + colour_off;
2596	slabp->nodeid = nodeid;
2597	slabp->free = 0;
2598	return slabp;
2599}
2600
2601static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2602{
2603	return (kmem_bufctl_t *) (slabp + 1);
2604}
2605
2606static void cache_init_objs(struct kmem_cache *cachep,
2607			    struct slab *slabp)
2608{
2609	int i;
2610
2611	for (i = 0; i < cachep->num; i++) {
2612		void *objp = index_to_obj(cachep, slabp, i);
2613#if DEBUG
2614		/* need to poison the objs? */
2615		if (cachep->flags & SLAB_POISON)
2616			poison_obj(cachep, objp, POISON_FREE);
2617		if (cachep->flags & SLAB_STORE_USER)
2618			*dbg_userword(cachep, objp) = NULL;
2619
2620		if (cachep->flags & SLAB_RED_ZONE) {
2621			*dbg_redzone1(cachep, objp) = RED_INACTIVE;
2622			*dbg_redzone2(cachep, objp) = RED_INACTIVE;
2623		}
2624		/*
2625		 * Constructors are not allowed to allocate memory from the same
2626		 * cache which they are a constructor for.  Otherwise, deadlock.
2627		 * They must also be threaded.
2628		 */
2629		if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2630			cachep->ctor(objp + obj_offset(cachep));
2631
2632		if (cachep->flags & SLAB_RED_ZONE) {
2633			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2634				slab_error(cachep, "constructor overwrote the"
2635					   " end of an object");
2636			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2637				slab_error(cachep, "constructor overwrote the"
2638					   " start of an object");
2639		}
2640		if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
2641			    OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2642			kernel_map_pages(virt_to_page(objp),
2643					 cachep->buffer_size / PAGE_SIZE, 0);
2644#else
2645		if (cachep->ctor)
2646			cachep->ctor(objp);
2647#endif
2648		slab_bufctl(slabp)[i] = i + 1;
2649	}
2650	slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2651}
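
/*
 * Illustrative freelist layout (hypothetical 4-object slab): after
 * alloc_slabmgmt() and cache_init_objs() the bufctl array encodes a singly
 * linked list of free object indices,
 *
 *	slab_bufctl(slabp)[0..3] == { 1, 2, 3, BUFCTL_END }
 *	slabp->free == 0
 *
 * so slab_get_obj() below hands out objects 0, 1, 2, 3 in that order until
 * the list terminator is reached and the slab moves to the full list.
 */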
2652
2653static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2654{
2655	if (CONFIG_ZONE_DMA_FLAG) {
2656		if (flags & GFP_DMA)
2657			BUG_ON(!(cachep->gfpflags & GFP_DMA));
2658		else
2659			BUG_ON(cachep->gfpflags & GFP_DMA);
2660	}
2661}
2662
2663static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
2664				int nodeid)
2665{
2666	void *objp = index_to_obj(cachep, slabp, slabp->free);
2667	kmem_bufctl_t next;
2668
2669	slabp->inuse++;
2670	next = slab_bufctl(slabp)[slabp->free];
2671#if DEBUG
2672	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2673	WARN_ON(slabp->nodeid != nodeid);
2674#endif
2675	slabp->free = next;
2676
2677	return objp;
2678}
2679
2680static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2681				void *objp, int nodeid)
2682{
2683	unsigned int objnr = obj_to_index(cachep, slabp, objp);
2684
2685#if DEBUG
2686	/* Verify that the slab belongs to the intended node */
2687	WARN_ON(slabp->nodeid != nodeid);
2688
2689	if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
2690		printk(KERN_ERR "slab: double free detected in cache "
2691				"'%s', objp %p\n", cachep->name, objp);
2692		BUG();
2693	}
2694#endif
2695	slab_bufctl(slabp)[objnr] = slabp->free;
2696	slabp->free = objnr;
2697	slabp->inuse--;
2698}
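
/*
 * Continuing the 4-object illustration: with slabp->free == 1 (object 0 is
 * in use), freeing object 0 via slab_put_obj() pushes its index back onto
 * the head of the list,
 *
 *	slab_bufctl(slabp)[0] = 1;	the old slabp->free
 *	slabp->free = 0;
 *
 * so the bufctl array behaves as a LIFO stack of free slots and the most
 * recently freed object is the next one handed out by slab_get_obj().
 */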
2699
2700/*
2701 * Map pages beginning at addr to the given cache and slab. This is required
2702 * for the slab allocator to be able to look up the cache and slab of a
2703 * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
2704 */
2705static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2706			   void *addr)
2707{
2708	int nr_pages;
2709	struct page *page;
2710
2711	page = virt_to_page(addr);
2712
2713	nr_pages = 1;
2714	if (likely(!PageCompound(page)))
2715		nr_pages <<= cache->gfporder;
2716
2717	do {
2718		page_set_cache(page, cache);
2719		page_set_slab(page, slab);
2720		page++;
2721	} while (--nr_pages);
2722}
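
/*
 * Sketch of the reverse mapping this enables (roughly what virt_to_cache()
 * and virt_to_slab() do for kfree() and friends; "obj" is any address
 * inside the slab):
 *
 *	struct page *page = virt_to_head_page(obj);
 *	struct kmem_cache *cachep = page_get_cache(page);
 *	struct slab *slabp = page_get_slab(page);
 *
 * The lookups work because slab_map_pages() stamped the slab's page(s)
 * with both pointers in the loop above.
 */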
2723
2724/*
2725 * Grow (by 1) the number of slabs within a cache.  This is called by
2726 * kmem_cache_alloc() when there are no active objs left in a cache.
2727 */
2728static int cache_grow(struct kmem_cache *cachep,
2729		gfp_t flags, int nodeid, void *objp)
2730{
2731	struct slab *slabp;
2732	size_t offset;
2733	gfp_t local_flags;
2734	struct kmem_list3 *l3;
2735
2736	/*
2737	 * Be lazy and only check for valid flags here,  keeping it out of the
2738	 * critical path in kmem_cache_alloc().
2739	 */
2740	BUG_ON(flags & GFP_SLAB_BUG_MASK);
2741	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
2742
2743	/* Take the l3 list lock to change the colour_next on this node */
2744	check_irq_off();
2745	l3 = cachep->nodelists[nodeid];
2746	spin_lock(&l3->list_lock);
2747
2748	/* Get colour for the slab, and calculate the next value. */
2749	offset = l3->colour_next;
2750	l3->colour_next++;
2751	if (l3->colour_next >= cachep->colour)
2752		l3->colour_next = 0;
2753	spin_unlock(&l3->list_lock);
2754
2755	offset *= cachep->colour_off;
2756
2757	if (local_flags & __GFP_WAIT)
2758		local_irq_enable();
2759
2760	/*
2761	 * The test for missing atomic flag is performed here, rather than
2762	 * the more obvious place, simply to reduce the critical path length
2763	 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
2764	 * will eventually be caught here (where it matters).
2765	 */
2766	kmem_flagcheck(cachep, flags);
2767
2768	/*
2769	 * Get mem for the objs.  Attempt to allocate a physical page from
2770	 * 'nodeid'.
2771	 */
2772	if (!objp)
2773		objp = kmem_getpages(cachep, local_flags, nodeid);
2774	if (!objp)
2775		goto failed;
2776
2777	/* Get slab management. */
2778	slabp = alloc_slabmgmt(cachep, objp, offset,
2779			local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
2780	if (!slabp)
2781		goto opps1;
2782
2783	slab_map_pages(cachep, slabp, objp);
2784
2785	cache_init_objs(cachep, slabp);
2786
2787	if (local_flags & __GFP_WAIT)
2788		local_irq_disable();
2789	check_irq_off();
2790	spin_lock(&l3->list_lock);
2791
2792	/* Make slab active. */
2793	list_add_tail(&slabp->list, &(l3->slabs_free));
2794	STATS_INC_GROWN(cachep);
2795	l3->free_objects += cachep->num;
2796	spin_unlock(&l3->list_lock);
2797	return 1;
2798opps1:
2799	kmem_freepages(cachep, objp);
2800failed:
2801	if (local_flags & __GFP_WAIT)
2802		local_irq_disable();
2803	return 0;
2804}
2805
2806#if DEBUG
2807
2808/*
2809 * Perform extra freeing checks:
2810 * - detect bad pointers.
2811 * - POISON/RED_ZONE checking
2812 */
2813static void kfree_debugcheck(const void *objp)
2814{
2815	if (!virt_addr_valid(objp)) {
2816		printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2817		       (unsigned long)objp);
2818		BUG();
2819	}
2820}
2821
2822static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
2823{
2824	unsigned long long redzone1, redzone2;
2825
2826	redzone1 = *dbg_redzone1(cache, obj);
2827	redzone2 = *dbg_redzone2(cache, obj);
2828
2829	/*
2830	 * Redzone is ok.
2831	 */
2832	if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
2833		return;
2834
2835	if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
2836		slab_error(cache, "double free detected");
2837	else
2838		slab_error(cache, "memory outside object was overwritten");
2839
2840	printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n",
2841			obj, redzone1, redzone2);
2842}
2843
2844static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2845				   void *caller)
2846{
2847	struct page *page;
2848	unsigned int objnr;
2849	struct slab *slabp;
2850
2851	BUG_ON(virt_to_cache(objp) != cachep);
2852
2853	objp -= obj_offset(cachep);
2854	kfree_debugcheck(objp);
2855	page = virt_to_head_page(objp);
2856
2857	slabp = page_get_slab(page);
2858
2859	if (cachep->flags & SLAB_RED_ZONE) {
2860		verify_redzone_free(cachep, objp);
2861		*dbg_redzone1(cachep, objp) = RED_INACTIVE;
2862		*dbg_redzone2(cachep, objp) = RED_INACTIVE;
2863	}
2864	if (cachep->flags & SLAB_STORE_USER)
2865		*dbg_userword(cachep, objp) = caller;
2866
2867	objnr = obj_to_index(cachep, slabp, objp);
2868
2869	BUG_ON(objnr >= cachep->num);
2870	BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
2871
2872#ifdef CONFIG_DEBUG_SLAB_LEAK
2873	slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
2874#endif
2875	if (cachep->flags & SLAB_POISON) {
2876#ifdef CONFIG_DEBUG_PAGEALLOC
2877		if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2878			store_stackinfo(cachep, objp, (unsigned long)caller);
2879			kernel_map_pages(virt_to_page(objp),
2880					 cachep->buffer_size / PAGE_SIZE, 0);
2881		} else {
2882			poison_obj(cachep, objp, POISON_FREE);
2883		}
2884#else
2885		poison_obj(cachep, objp, POISON_FREE);
2886#endif
2887	}
2888	return objp;
2889}
2890
2891static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
2892{
2893	kmem_bufctl_t i;
2894	int entries = 0;
2895
2896	/* Check slab's freelist to see if this obj is there. */
2897	for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
2898		entries++;
2899		if (entries > cachep->num || i >= cachep->num)
2900			goto bad;
2901	}
2902	if (entries != cachep->num - slabp->inuse) {
2903bad:
2904		printk(KERN_ERR "slab: Internal list corruption detected in "
2905				"cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2906			cachep->name, cachep->num, slabp, slabp->inuse);
2907		for (i = 0;
2908		     i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
2909		     i++) {
2910			if (i % 16 == 0)
2911				printk("\n%03x:", i);
2912			printk(" %02x", ((unsigned char *)slabp)[i]);
2913		}
2914		printk("\n");
2915		BUG();
2916	}
2917}
2918#else
2919#define kfree_debugcheck(x) do { } while(0)
2920#define cache_free_debugcheck(x,objp,z) (objp)
2921#define check_slabp(x,y) do { } while(0)
2922#endif
2923
2924static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2925{
2926	int batchcount;
2927	struct kmem_list3 *l3;
2928	struct array_cache *ac;
2929	int node;
2930
2931retry:
2932	check_irq_off();
2933	node = numa_node_id();
2934	ac = cpu_cache_get(cachep);
2935	batchcount = ac->batchcount;
2936	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2937		/*
2938		 * If there was little recent activity on this cache, then
2939		 * perform only a partial refill.  Otherwise we could generate
2940		 * refill bouncing.
2941		 */
2942		batchcount = BATCHREFILL_LIMIT;
2943	}
2944	l3 = cachep->nodelists[node];
2945
2946	BUG_ON(ac->avail > 0 || !l3);
2947	spin_lock(&l3->list_lock);
2948
2949	/* See if we can refill from the shared array */
2950	if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
2951		goto alloc_done;
2952
2953	while (batchcount > 0) {
2954		struct list_head *entry;
2955		struct slab *slabp;
2956		/* Get the slab the allocation is to come from. */
2957		entry = l3->slabs_partial.next;
2958		if (entry == &l3->slabs_partial) {
2959			l3->free_touched = 1;
2960			entry = l3->slabs_free.next;
2961			if (entry == &l3->slabs_free)
2962				goto must_grow;
2963		}
2964
2965		slabp = list_entry(entry, struct slab, list);
2966		check_slabp(cachep, slabp);
2967		check_spinlock_acquired(cachep);
2968
2969		/*
2970		 * The slab was either on partial or free list so
2971		 * there must be at least one object available for
2972		 * allocation.
2973		 */
2974		BUG_ON(slabp->inuse >= cachep->num);
2975
2976		while (slabp->inuse < cachep->num && batchcount--) {
2977			STATS_INC_ALLOCED(cachep);
2978			STATS_INC_ACTIVE(cachep);
2979			STATS_SET_HIGH(cachep);
2980
2981			ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
2982							    node);
2983		}
2984		check_slabp(cachep, slabp);
2985
2986		/* move slabp to correct slabp list: */
2987		list_del(&slabp->list);
2988		if (slabp->free == BUFCTL_END)
2989			list_add(&slabp->list, &l3->slabs_full);
2990		else
2991			list_add(&slabp->list, &l3->slabs_partial);
2992	}
2993
2994must_grow:
2995	l3->free_objects -= ac->avail;
2996alloc_done:
2997	spin_unlock(&l3->list_lock);
2998
2999	if (unlikely(!ac->avail)) {
3000		int x;
3001		x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
3002
3003		/* cache_grow can reenable interrupts, then ac could change. */
3004		ac = cpu_cache_get(cachep);
3005		if (!x && ac->avail == 0)	/* no objects in sight? abort */
3006			return NULL;
3007
3008		if (!ac->avail)		/* objects refilled by interrupt? */
3009			goto retry;
3010	}
3011	ac->touched = 1;
3012	return ac->entry[--ac->avail];
3013}
3014
3015static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
3016						gfp_t flags)
3017{
3018	might_sleep_if(flags & __GFP_WAIT);
3019#if DEBUG
3020	kmem_flagcheck(cachep, flags);
3021#endif
3022}
3023
3024#if DEBUG
3025static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3026				gfp_t flags, void *objp, void *caller)
3027{
3028	if (!objp)
3029		return objp;
3030	if (cachep->flags & SLAB_POISON) {
3031#ifdef CONFIG_DEBUG_PAGEALLOC
3032		if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
3033			kernel_map_pages(virt_to_page(objp),
3034					 cachep->buffer_size / PAGE_SIZE, 1);
3035		else
3036			check_poison_obj(cachep, objp);
3037#else
3038		check_poison_obj(cachep, objp);
3039#endif
3040		poison_obj(cachep, objp, POISON_INUSE);
3041	}
3042	if (cachep->flags & SLAB_STORE_USER)
3043		*dbg_userword(cachep, objp) = caller;
3044
3045	if (cachep->flags & SLAB_RED_ZONE) {
3046		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
3047				*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
3048			slab_error(cachep, "double free, or memory outside"
3049						" object was overwritten");
3050			printk(KERN_ERR
3051				"%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
3052				objp, *dbg_redzone1(cachep, objp),
3053				*dbg_redzone2(cachep, objp));
3054		}
3055		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
3056		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
3057	}
3058#ifdef CONFIG_DEBUG_SLAB_LEAK
3059	{
3060		struct slab *slabp;
3061		unsigned objnr;
3062
3063		slabp = page_get_slab(virt_to_head_page(objp));
3064		objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
3065		slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
3066	}
3067#endif
3068	objp += obj_offset(cachep);
3069	if (cachep->ctor && cachep->flags & SLAB_POISON)
3070		cachep->ctor(objp);
3071#if ARCH_SLAB_MINALIGN
3072	if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
3073		printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
3074		       objp, ARCH_SLAB_MINALIGN);
3075	}
3076#endif
3077	return objp;
3078}
3079#else
3080#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
3081#endif
3082
3083static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
3084{
3085	if (cachep == &cache_cache)
3086		return false;
3087
3088	return should_failslab(obj_size(cachep), flags);
3089}
3090
3091static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3092{
3093	void *objp;
3094	struct array_cache *ac;
3095
3096	check_irq_off();
3097
3098	ac = cpu_cache_get(cachep);
3099	if (likely(ac->avail)) {
3100		STATS_INC_ALLOCHIT(cachep);
3101		ac->touched = 1;
3102		objp = ac->entry[--ac->avail];
3103	} else {
3104		STATS_INC_ALLOCMISS(cachep);
3105		objp = cache_alloc_refill(cachep, flags);
3106	}
3107	/*
3108	 * To avoid a false negative, if an object that is in one of the
3109	 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
3110	 * treat the array pointers as a reference to the object.
3111	 */
3112	kmemleak_erase(&ac->entry[ac->avail]);
3113	return objp;
3114}
3115
3116#ifdef CONFIG_NUMA
3117/*
3118 * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
3119 *
3120 * If we are in_interrupt, then process context, including cpusets and
3121 * mempolicy, may not apply and should not be used for allocation policy.
3122 */
3123static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3124{
3125	int nid_alloc, nid_here;
3126
3127	if (in_interrupt() || (flags & __GFP_THISNODE))
3128		return NULL;
3129	nid_alloc = nid_here = numa_node_id();
3130	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3131		nid_alloc = cpuset_mem_spread_node();
3132	else if (current->mempolicy)
3133		nid_alloc = slab_node(current->mempolicy);
3134	if (nid_alloc != nid_here)
3135		return ____cache_alloc_node(cachep, flags, nid_alloc);
3136	return NULL;
3137}
3138
3139/*
3140 * Fallback function if there was no memory available and no objects on a
3141 * certain node and fallback is permitted. First we scan all the
3142 * available nodelists for available objects. If that fails then we
3143 * perform an allocation without specifying a node. This allows the page
3144 * allocator to do its reclaim / fallback magic. We then insert the
3145 * slab into the proper nodelist and then allocate from it.
3146 */
3147static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3148{
3149	struct zonelist *zonelist;
3150	gfp_t local_flags;
3151	struct zoneref *z;
3152	struct zone *zone;
3153	enum zone_type high_zoneidx = gfp_zone(flags);
3154	void *obj = NULL;
3155	int nid;
3156
3157	if (flags & __GFP_THISNODE)
3158		return NULL;
3159
3160	zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3161	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3162
3163retry:
3164	/*
3165	 * Look through allowed nodes for objects available
3166	 * from existing per node queues.
3167	 */
3168	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
3169		nid = zone_to_nid(zone);
3170
3171		if (cpuset_zone_allowed_hardwall(zone, flags) &&
3172			cache->nodelists[nid] &&
3173			cache->nodelists[nid]->free_objects) {
3174				obj = ____cache_alloc_node(cache,
3175					flags | GFP_THISNODE, nid);
3176				if (obj)
3177					break;
3178		}
3179	}
3180
3181	if (!obj) {
3182		/*
3183		 * This allocation will be performed within the constraints
3184		 * of the current cpuset / memory policy requirements.
3185		 * We may trigger various forms of reclaim on the allowed
3186		 * set and go into memory reserves if necessary.
3187		 */
3188		if (local_flags & __GFP_WAIT)
3189			local_irq_enable();
3190		kmem_flagcheck(cache, flags);
3191		obj = kmem_getpages(cache, local_flags, numa_node_id());
3192		if (local_flags & __GFP_WAIT)
3193			local_irq_disable();
3194		if (obj) {
3195			/*
3196			 * Insert into the appropriate per node queues
3197			 */
3198			nid = page_to_nid(virt_to_page(obj));
3199			if (cache_grow(cache, flags, nid, obj)) {
3200				obj = ____cache_alloc_node(cache,
3201					flags | GFP_THISNODE, nid);
3202				if (!obj)
3203					/*
3204					 * Another processor may allocate the
3205					 * objects in the slab since we are
3206					 * not holding any locks.
3207					 */
3208					goto retry;
3209			} else {
3210				/* cache_grow already freed obj */
3211				obj = NULL;
3212			}
3213		}
3214	}
3215	return obj;
3216}
3217
3218/*
3219 * An interface to enable slab creation on nodeid
3220 */
3221static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3222				int nodeid)
3223{
3224	struct list_head *entry;
3225	struct slab *slabp;
3226	struct kmem_list3 *l3;
3227	void *obj;
3228	int x;
3229
3230	l3 = cachep->nodelists[nodeid];
3231	BUG_ON(!l3);
3232
3233retry:
3234	check_irq_off();
3235	spin_lock(&l3->list_lock);
3236	entry = l3->slabs_partial.next;
3237	if (entry == &l3->slabs_partial) {
3238		l3->free_touched = 1;
3239		entry = l3->slabs_free.next;
3240		if (entry == &l3->slabs_free)
3241			goto must_grow;
3242	}
3243
3244	slabp = list_entry(entry, struct slab, list);
3245	check_spinlock_acquired_node(cachep, nodeid);
3246	check_slabp(cachep, slabp);
3247
3248	STATS_INC_NODEALLOCS(cachep);
3249	STATS_INC_ACTIVE(cachep);
3250	STATS_SET_HIGH(cachep);
3251
3252	BUG_ON(slabp->inuse == cachep->num);
3253
3254	obj = slab_get_obj(cachep, slabp, nodeid);
3255	check_slabp(cachep, slabp);
3256	l3->free_objects--;
3257	/* move slabp to correct slabp list: */
3258	list_del(&slabp->list);
3259
3260	if (slabp->free == BUFCTL_END)
3261		list_add(&slabp->list, &l3->slabs_full);
3262	else
3263		list_add(&slabp->list, &l3->slabs_partial);
3264
3265	spin_unlock(&l3->list_lock);
3266	goto done;
3267
3268must_grow:
3269	spin_unlock(&l3->list_lock);
3270	x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
3271	if (x)
3272		goto retry;
3273
3274	return fallback_alloc(cachep, flags);
3275
3276done:
3277	return obj;
3278}
3279
3280/**
3281 * kmem_cache_alloc_node - Allocate an object on the specified node
3282 * @cachep: The cache to allocate from.
3283 * @flags: See kmalloc().
3284 * @nodeid: node number of the target node.
3285 * @caller: return address of caller, used for debug information
3286 *
3287 * Identical to kmem_cache_alloc but it will allocate memory on the given
3288 * node, which can improve the performance for cpu bound structures.
3289 *
3290 * Fallback to other node is possible if __GFP_THISNODE is not set.
3291 */
3292static __always_inline void *
3293__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3294		   void *caller)
3295{
3296	unsigned long save_flags;
3297	void *ptr;
3298
3299	flags &= gfp_allowed_mask;
3300
3301	lockdep_trace_alloc(flags);
3302
3303	if (slab_should_failslab(cachep, flags))
3304		return NULL;
3305
3306	cache_alloc_debugcheck_before(cachep, flags);
3307	local_irq_save(save_flags);
3308
3309	if (unlikely(nodeid == -1))
3310		nodeid = numa_node_id();
3311
3312	if (unlikely(!cachep->nodelists[nodeid])) {
3313		/* Node not bootstrapped yet */
3314		ptr = fallback_alloc(cachep, flags);
3315		goto out;
3316	}
3317
3318	if (nodeid == numa_node_id()) {
3319		/*
3320		 * Use the locally cached objects if possible.
3321		 * However ____cache_alloc does not allow fallback
3322		 * to other nodes. It may fail while we still have
3323		 * objects on other nodes available.
3324		 */
3325		ptr = ____cache_alloc(cachep, flags);
3326		if (ptr)
3327			goto out;
3328	}
3329	/* ____cache_alloc_node can fall back to other nodes */
3330	ptr = ____cache_alloc_node(cachep, flags, nodeid);
3331  out:
3332	local_irq_restore(save_flags);
3333	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3334	kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags,
3335				 flags);
3336
3337	if (likely(ptr))
3338		kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep));
3339
3340	if (unlikely((flags & __GFP_ZERO) && ptr))
3341		memset(ptr, 0, obj_size(cachep));
3342
3343	return ptr;
3344}
3345
3346static __always_inline void *
3347__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
3348{
3349	void *objp;
3350
3351	if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
3352		objp = alternate_node_alloc(cache, flags);
3353		if (objp)
3354			goto out;
3355	}
3356	objp = ____cache_alloc(cache, flags);
3357
3358	/*
3359	 * We may just have run out of memory on the local node.
3360	 * ____cache_alloc_node() knows how to locate memory on other nodes
3361	 */
3362 	if (!objp)
3363 		objp = ____cache_alloc_node(cache, flags, numa_node_id());
3364
3365  out:
3366	return objp;
3367}
3368#else
3369
3370static __always_inline void *
3371__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3372{
3373	return ____cache_alloc(cachep, flags);
3374}
3375
3376#endif /* CONFIG_NUMA */
3377
3378static __always_inline void *
3379__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3380{
3381	unsigned long save_flags;
3382	void *objp;
3383
3384	flags &= gfp_allowed_mask;
3385
3386	lockdep_trace_alloc(flags);
3387
3388	if (slab_should_failslab(cachep, flags))
3389		return NULL;
3390
3391	cache_alloc_debugcheck_before(cachep, flags);
3392	local_irq_save(save_flags);
3393	objp = __do_cache_alloc(cachep, flags);
3394	local_irq_restore(save_flags);
3395	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3396	kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags,
3397				 flags);
3398	prefetchw(objp);
3399
3400	if (likely(objp))
3401		kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep));
3402
3403	if (unlikely((flags & __GFP_ZERO) && objp))
3404		memset(objp, 0, obj_size(cachep));
3405
3406	return objp;
3407}
3408
3409/*
3410 * Caller needs to acquire the correct kmem_list3's list_lock
3411 */
3412static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3413		       int node)
3414{
3415	int i;
3416	struct kmem_list3 *l3;
3417
3418	for (i = 0; i < nr_objects; i++) {
3419		void *objp = objpp[i];
3420		struct slab *slabp;
3421
3422		slabp = virt_to_slab(objp);
3423		l3 = cachep->nodelists[node];
3424		list_del(&slabp->list);
3425		check_spinlock_acquired_node(cachep, node);
3426		check_slabp(cachep, slabp);
3427		slab_put_obj(cachep, slabp, objp, node);
3428		STATS_DEC_ACTIVE(cachep);
3429		l3->free_objects++;
3430		check_slabp(cachep, slabp);
3431
3432		/* fixup slab chains */
3433		if (slabp->inuse == 0) {
3434			if (l3->free_objects > l3->free_limit) {
3435				l3->free_objects -= cachep->num;
3436				/* No need to drop any previously held
3437				 * lock here, even if we have a off-slab slab
3438				 * descriptor it is guaranteed to come from
3439				 * a different cache, refer to comments before
3440				 * alloc_slabmgmt.
3441				 */
3442				slab_destroy(cachep, slabp);
3443			} else {
3444				list_add(&slabp->list, &l3->slabs_free);
3445			}
3446		} else {
3447			/* Unconditionally move a slab to the end of the
3448			 * partial list on free - maximum time for the
3449			 * other objects to be freed, too.
3450			 */
3451			list_add_tail(&slabp->list, &l3->slabs_partial);
3452		}
3453	}
3454}
3455
3456static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3457{
3458	int batchcount;
3459	struct kmem_list3 *l3;
3460	int node = numa_node_id();
3461
3462	batchcount = ac->batchcount;
3463#if DEBUG
3464	BUG_ON(!batchcount || batchcount > ac->avail);
3465#endif
3466	check_irq_off();
3467	l3 = cachep->nodelists[node];
3468	spin_lock(&l3->list_lock);
3469	if (l3->shared) {
3470		struct array_cache *shared_array = l3->shared;
3471		int max = shared_array->limit - shared_array->avail;
3472		if (max) {
3473			if (batchcount > max)
3474				batchcount = max;
3475			memcpy(&(shared_array->entry[shared_array->avail]),
3476			       ac->entry, sizeof(void *) * batchcount);
3477			shared_array->avail += batchcount;
3478			goto free_done;
3479		}
3480	}
3481
3482	free_block(cachep, ac->entry, batchcount, node);
3483free_done:
3484#if STATS
3485	{
3486		int i = 0;
3487		struct list_head *p;
3488
3489		p = l3->slabs_free.next;
3490		while (p != &(l3->slabs_free)) {
3491			struct slab *slabp;
3492
3493			slabp = list_entry(p, struct slab, list);
3494			BUG_ON(slabp->inuse);
3495
3496			i++;
3497			p = p->next;
3498		}
3499		STATS_SET_FREEABLE(cachep, i);
3500	}
3501#endif
3502	spin_unlock(&l3->list_lock);
3503	ac->avail -= batchcount;
3504	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
3505}
3506
3507/*
3508 * Release an obj back to its cache. If the obj has a constructed state, it must
3509 * be in this state _before_ it is released.  Called with disabled ints.
3510 */
3511static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3512{
3513	struct array_cache *ac = cpu_cache_get(cachep);
3514
3515	check_irq_off();
3516	kmemleak_free_recursive(objp, cachep->flags);
3517	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3518
3519	kmemcheck_slab_free(cachep, objp, obj_size(cachep));
3520
3521	/*
3522	 * Skip calling cache_free_alien() when the platform is not numa.
3523	 * This will avoid cache misses that happen while accessing slabp (which
3524	 * is a per-page memory reference) to get nodeid. Instead use a global
3525	 * variable to skip the call, which is most likely to be present in
3526	 * the cache.
3527	 */
3528	if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
3529		return;
3530
3531	if (likely(ac->avail < ac->limit)) {
3532		STATS_INC_FREEHIT(cachep);
3533		ac->entry[ac->avail++] = objp;
3534		return;
3535	} else {
3536		STATS_INC_FREEMISS(cachep);
3537		cache_flusharray(cachep, ac);
3538		ac->entry[ac->avail++] = objp;
3539	}
3540}
3541
3542/**
3543 * kmem_cache_alloc - Allocate an object
3544 * @cachep: The cache to allocate from.
3545 * @flags: See kmalloc().
3546 *
3547 * Allocate an object from this cache.  The flags are only relevant
3548 * if the cache has no available objects.
3549 */
3550void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3551{
3552	void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
3553
3554	trace_kmem_cache_alloc(_RET_IP_, ret,
3555			       obj_size(cachep), cachep->buffer_size, flags);
3556
3557	return ret;
3558}
3559EXPORT_SYMBOL(kmem_cache_alloc);
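
/*
 * Usage sketch (hypothetical foo_cachep from the kmem_cache_create()
 * example): allocation and release of a single object in process context.
 *
 *	struct foo *f;
 *
 *	f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *	if (!f)
 *		return -ENOMEM;
 *	...
 *	kmem_cache_free(foo_cachep, f);
 *
 * Interrupt or atomic contexts must pass GFP_ATOMIC instead, and passing
 * __GFP_ZERO returns an already zeroed object (see __cache_alloc() above).
 */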
3560
3561#ifdef CONFIG_KMEMTRACE
3562void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags)
3563{
3564	return __cache_alloc(cachep, flags, __builtin_return_address(0));
3565}
3566EXPORT_SYMBOL(kmem_cache_alloc_notrace);
3567#endif
3568
3569/**
3570 * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
3571 * @cachep: the cache we're checking against
3572 * @ptr: pointer to validate
3573 *
3574 * This verifies that the untrusted pointer looks sane;
3575 * it is _not_ a guarantee that the pointer is actually
3576 * part of the slab cache in question, but it at least
3577 * validates that the pointer can be dereferenced and
3578 * looks half-way sane.
3579 *
3580 * Currently only used for dentry validation.
3581 */
3582int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
3583{
3584	unsigned long addr = (unsigned long)ptr;
3585	unsigned long min_addr = PAGE_OFFSET;
3586	unsigned long align_mask = BYTES_PER_WORD - 1;
3587	unsigned long size = cachep->buffer_size;
3588	struct page *page;
3589
3590	if (unlikely(addr < min_addr))
3591		goto out;
3592	if (unlikely(addr > (unsigned long)high_memory - size))
3593		goto out;
3594	if (unlikely(addr & align_mask))
3595		goto out;
3596	if (unlikely(!kern_addr_valid(addr)))
3597		goto out;
3598	if (unlikely(!kern_addr_valid(addr + size - 1)))
3599		goto out;
3600	page = virt_to_page(ptr);
3601	if (unlikely(!PageSlab(page)))
3602		goto out;
3603	if (unlikely(page_get_cache(page) != cachep))
3604		goto out;
3605	return 1;
3606out:
3607	return 0;
3608}
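
/*
 * Usage sketch (hypothetical foo_cachep and ptr): a subsystem holding an
 * untrusted pointer can reject obviously bogus values before touching them,
 * in the spirit of the dentry validation mentioned above.
 *
 *	if (!kmem_ptr_validate(foo_cachep, ptr))
 *		return NULL;
 *
 * A zero return means ptr cannot be used as a foo_cachep object; a non-zero
 * return is only a plausibility check, not proof that ptr is a live object
 * of this cache.
 */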
3609
3610#ifdef CONFIG_NUMA
3611void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3612{
3613	void *ret = __cache_alloc_node(cachep, flags, nodeid,
3614				       __builtin_return_address(0));
3615
3616	trace_kmem_cache_alloc_node(_RET_IP_, ret,
3617				    obj_size(cachep), cachep->buffer_size,
3618				    flags, nodeid);
3619
3620	return ret;
3621}
3622EXPORT_SYMBOL(kmem_cache_alloc_node);
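
/*
 * Usage sketch (hypothetical foo_cachep, illustrative node choice): data
 * that is mostly touched by one CPU benefits from being allocated on that
 * CPU's home node.
 *
 *	int nid = cpu_to_node(cpu);
 *	struct foo *f = kmem_cache_alloc_node(foo_cachep, GFP_KERNEL, nid);
 *
 * Passing nodeid == -1 simply means "the current node", and unless
 * __GFP_THISNODE is set the allocator may still fall back to other nodes
 * when the requested one is out of objects.
 */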
3623
3624#ifdef CONFIG_KMEMTRACE
3625void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep,
3626				    gfp_t flags,
3627				    int nodeid)
3628{
3629	return __cache_alloc_node(cachep, flags, nodeid,
3630				  __builtin_return_address(0));
3631}
3632EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
3633#endif
3634
3635static __always_inline void *
3636__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
3637{
3638	struct kmem_cache *cachep;
3639	void *ret;
3640
3641	cachep = kmem_find_general_cachep(size, flags);
3642	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3643		return cachep;
3644	ret = kmem_cache_alloc_node_notrace(cachep, flags, node);
3645
3646	trace_kmalloc_node((unsigned long) caller, ret,
3647			   size, cachep->buffer_size, flags, node);
3648
3649	return ret;
3650}
3651
3652#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE)
3653void *__kmalloc_node(size_t size, gfp_t flags, int node)
3654{
3655	return __do_kmalloc_node(size, flags, node,
3656			__builtin_return_address(0));
3657}
3658EXPORT_SYMBOL(__kmalloc_node);
3659
3660void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
3661		int node, unsigned long caller)
3662{
3663	return __do_kmalloc_node(size, flags, node, (void *)caller);
3664}
3665EXPORT_SYMBOL(__kmalloc_node_track_caller);
3666#else
3667void *__kmalloc_node(size_t size, gfp_t flags, int node)
3668{
3669	return __do_kmalloc_node(size, flags, node, NULL);
3670}
3671EXPORT_SYMBOL(__kmalloc_node);
3672#endif /* CONFIG_DEBUG_SLAB */
3673#endif /* CONFIG_NUMA */
3674
3675/**
3676 * __do_kmalloc - allocate memory
3677 * @size: how many bytes of memory are required.
3678 * @flags: the type of memory to allocate (see kmalloc).
3679 * @caller: function caller for debug tracking of the caller
3680 */
3681static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3682					  void *caller)
3683{
3684	struct kmem_cache *cachep;
3685	void *ret;
3686
3687	/* If you want to save a few bytes of .text space: replace
3688	 * __ with kmem_.
3689	 * Then kmalloc uses the uninlined functions instead of the inline
3690	 * functions.
3691	 */
3692	cachep = __find_general_cachep(size, flags);
3693	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3694		return cachep;
3695	ret = __cache_alloc(cachep, flags, caller);
3696
3697	trace_kmalloc((unsigned long) caller, ret,
3698		      size, cachep->buffer_size, flags);
3699
3700	return ret;
3701}
3702
3703
3704#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE)
3705void *__kmalloc(size_t size, gfp_t flags)
3706{
3707	return __do_kmalloc(size, flags, __builtin_return_address(0));
3708}
3709EXPORT_SYMBOL(__kmalloc);
3710
3711void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
3712{
3713	return __do_kmalloc(size, flags, (void *)caller);
3714}
3715EXPORT_SYMBOL(__kmalloc_track_caller);
3716
3717#else
3718void *__kmalloc(size_t size, gfp_t flags)
3719{
3720	return __do_kmalloc(size, flags, NULL);
3721}
3722EXPORT_SYMBOL(__kmalloc);
3723#endif
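
/*
 * Illustrative sketch (not part of this file): a typical caller of the
 * kmalloc() family, which for non-constant sizes typically resolves to
 * __kmalloc() and hence __do_kmalloc() above.  The structure and function
 * names below are hypothetical; kzalloc() and kfree() are the real APIs.
 *
 *	struct foo_buf {
 *		size_t len;
 *		u8 data[64];
 *	};
 *
 *	static struct foo_buf *foo_buf_create(gfp_t gfp)
 *	{
 *		struct foo_buf *buf = kzalloc(sizeof(*buf), gfp);
 *
 *		if (!buf)
 *			return NULL;	// allocation failed
 *		buf->len = sizeof(buf->data);
 *		return buf;
 *	}
 *
 *	static void foo_buf_destroy(struct foo_buf *buf)
 *	{
 *		kfree(buf);		// kfree(NULL) is a no-op
 *	}
 */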
3724
3725/**
3726 * kmem_cache_free - Deallocate an object
3727 * @cachep: The cache the allocation was from.
3728 * @objp: The previously allocated object.
3729 *
3730 * Free an object which was previously allocated from this
3731 * cache.
3732 */
3733void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3734{
3735	unsigned long flags;
3736
3737	local_irq_save(flags);
3738	debug_check_no_locks_freed(objp, obj_size(cachep));
3739	if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3740		debug_check_no_obj_freed(objp, obj_size(cachep));
3741	__cache_free(cachep, objp);
3742	local_irq_restore(flags);
3743
3744	trace_kmem_cache_free(_RET_IP_, objp);
3745}
3746EXPORT_SYMBOL(kmem_cache_free);
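
/*
 * Illustrative sketch (not part of this file): the usual pairing of
 * kmem_cache_alloc() and kmem_cache_free() on a dedicated cache.  The
 * cache, structure and field names are hypothetical.
 *
 *	static struct kmem_cache *foo_cachep;
 *
 *	// once, e.g. at module init time:
 *	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
 *				       0, SLAB_HWCACHE_ALIGN, NULL);
 *
 *	// fast path:
 *	struct foo *f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *	if (f) {
 *		f->in_use = 1;			// use the object...
 *		kmem_cache_free(foo_cachep, f);	// ...and return it to the
 *						// cache it came from
 *	}
 *
 *	// at exit time, once no objects remain allocated:
 *	kmem_cache_destroy(foo_cachep);
 */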
3747
3748/**
3749 * kfree - free previously allocated memory
3750 * @objp: pointer returned by kmalloc.
3751 *
3752 * If @objp is NULL, no operation is performed.
3753 *
3754 * Don't free memory not originally allocated by kmalloc()
3755 * or you will run into trouble.
3756 */
3757void kfree(const void *objp)
3758{
3759	struct kmem_cache *c;
3760	unsigned long flags;
3761
3762	trace_kfree(_RET_IP_, objp);
3763
3764	if (unlikely(ZERO_OR_NULL_PTR(objp)))
3765		return;
3766	local_irq_save(flags);
3767	kfree_debugcheck(objp);
3768	c = virt_to_cache(objp);
3769	debug_check_no_locks_freed(objp, obj_size(c));
3770	debug_check_no_obj_freed(objp, obj_size(c));
3771	__cache_free(c, (void *)objp);
3772	local_irq_restore(flags);
3773}
3774EXPORT_SYMBOL(kfree);
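
/*
 * Because kfree() tolerates NULL and ZERO_SIZE_PTR (see the check above),
 * error paths do not need to test each pointer before freeing it.
 * Illustrative sketch with hypothetical names and sizes:
 *
 *	static int foo_setup(struct foo *f, gfp_t gfp)
 *	{
 *		f->rx_buf = kmalloc(FOO_RX_SIZE, gfp);
 *		f->tx_buf = kmalloc(FOO_TX_SIZE, gfp);
 *		if (!f->rx_buf || !f->tx_buf) {
 *			kfree(f->rx_buf);	// safe even if the alloc failed
 *			kfree(f->tx_buf);
 *			return -ENOMEM;
 *		}
 *		return 0;
 *	}
 */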
3775
3776unsigned int kmem_cache_size(struct kmem_cache *cachep)
3777{
3778	return obj_size(cachep);
3779}
3780EXPORT_SYMBOL(kmem_cache_size);
3781
3782const char *kmem_cache_name(struct kmem_cache *cachep)
3783{
3784	return cachep->name;
3785}
3786EXPORT_SYMBOL_GPL(kmem_cache_name);
3787
3788/*
 * Allocate, or resize, the per-node kmem_list3 structures (including their
 * shared and alien array caches) for every online node.
3790 */
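
/*
 * Worked example for the per-node free_limit computed below,
 *	free_limit = (1 + nr_cpus_node(node)) * batchcount + num:
 * a node with 4 cpus, a batchcount of 60 and 20 objects per slab gets
 * 5 * 60 + 20 == 320, which is roughly the number of free objects that
 * node may accumulate before completely free slabs are given back.
 */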
3791static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
3792{
3793	int node;
3794	struct kmem_list3 *l3;
3795	struct array_cache *new_shared;
3796	struct array_cache **new_alien = NULL;
3797
3798	for_each_online_node(node) {
3799
		if (use_alien_caches) {
			new_alien = alloc_alien_cache(node, cachep->limit, gfp);
			if (!new_alien)
				goto fail;
		}
3805
3806		new_shared = NULL;
3807		if (cachep->shared) {
3808			new_shared = alloc_arraycache(node,
3809				cachep->shared*cachep->batchcount,
3810					0xbaadf00d, gfp);
3811			if (!new_shared) {
3812				free_alien_cache(new_alien);
3813				goto fail;
3814			}
3815		}
3816
3817		l3 = cachep->nodelists[node];
3818		if (l3) {
3819			struct array_cache *shared = l3->shared;
3820
3821			spin_lock_irq(&l3->list_lock);
3822
3823			if (shared)
3824				free_block(cachep, shared->entry,
3825						shared->avail, node);
3826
3827			l3->shared = new_shared;
3828			if (!l3->alien) {
3829				l3->alien = new_alien;
3830				new_alien = NULL;
3831			}
3832			l3->free_limit = (1 + nr_cpus_node(node)) *
3833					cachep->batchcount + cachep->num;
3834			spin_unlock_irq(&l3->list_lock);
3835			kfree(shared);
3836			free_alien_cache(new_alien);
3837			continue;
3838		}
3839		l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node);
3840		if (!l3) {
3841			free_alien_cache(new_alien);
3842			kfree(new_shared);
3843			goto fail;
3844		}
3845
3846		kmem_list3_init(l3);
3847		l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3848				((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3849		l3->shared = new_shared;
3850		l3->alien = new_alien;
3851		l3->free_limit = (1 + nr_cpus_node(node)) *
3852					cachep->batchcount + cachep->num;
3853		cachep->nodelists[node] = l3;
3854	}
3855	return 0;
3856
3857fail:
3858	if (!cachep->next.next) {
3859		/* Cache is not active yet. Roll back what we did */
3860		node--;
3861		while (node >= 0) {
3862			if (cachep->nodelists[node]) {
3863				l3 = cachep->nodelists[node];
3864
3865				kfree(l3->shared);
3866				free_alien_cache(l3->alien);
3867				kfree(l3);
3868				cachep->nodelists[node] = NULL;
3869			}
3870			node--;
3871		}
3872	}
3873	return -ENOMEM;
3874}
3875
3876struct ccupdate_struct {
3877	struct kmem_cache *cachep;
3878	struct array_cache *new[NR_CPUS];
3879};
3880
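/*
 * Runs on each cpu (via on_each_cpu() below) with interrupts disabled:
 * install the freshly allocated array_cache for this cpu and hand the old
 * one back through new->new[cpu] so that do_tune_cpucache() can drain and
 * free it afterwards.
 */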
3881static void do_ccupdate_local(void *info)
3882{
3883	struct ccupdate_struct *new = info;
3884	struct array_cache *old;
3885
3886	check_irq_off();
3887	old = cpu_cache_get(new->cachep);
3888
3889	new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3890	new->new[smp_processor_id()] = old;
3891}
3892
3893/* Always called with the cache_chain_mutex held */
3894static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3895				int batchcount, int shared, gfp_t gfp)
3896{
3897	struct ccupdate_struct *new;
3898	int i;
3899
3900	new = kzalloc(sizeof(*new), gfp);
3901	if (!new)
3902		return -ENOMEM;
3903
3904	for_each_online_cpu(i) {
3905		new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
3906						batchcount, gfp);
3907		if (!new->new[i]) {
3908			for (i--; i >= 0; i--)
3909				kfree(new->new[i]);
3910			kfree(new);
3911			return -ENOMEM;
3912		}
3913	}
3914	new->cachep = cachep;
3915
3916	on_each_cpu(do_ccupdate_local, (void *)new, 1);
3917
3918	check_irq_on();
3919	cachep->batchcount = batchcount;
3920	cachep->limit = limit;
3921	cachep->shared = shared;
3922
3923	for_each_online_cpu(i) {
3924		struct array_cache *ccold = new->new[i];
3925		if (!ccold)
3926			continue;
3927		spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3928		free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
3929		spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3930		kfree(ccold);
3931	}
3932	kfree(new);
3933	return alloc_kmemlist(cachep, gfp);
3934}
3935
/* Always called with the cache_chain_mutex held */
3937static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
3938{
3939	int err;
3940	int limit, shared;
3941
3942	/*
3943	 * The head array serves three purposes:
3944	 * - create a LIFO ordering, i.e. return objects that are cache-warm
3945	 * - reduce the number of spinlock operations.
3946	 * - reduce the number of linked list operations on the slab and
3947	 *   bufctl chains: array operations are cheaper.
	 * The numbers are guessed; we should auto-tune them as described by
	 * Bonwick.
3950	 */
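	/*
	 * For example (assuming PAGE_SIZE == 4096): a 192 byte object gets
	 * limit 120, a 1024 byte object limit 54, a 2 KiB object limit 24
	 * and an 8 KiB object limit 8.  The batchcount passed to
	 * do_tune_cpucache() below is (limit + 1) / 2, i.e. 60, 27, 12 and 4
	 * respectively.
	 */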
3951	if (cachep->buffer_size > 131072)
3952		limit = 1;
3953	else if (cachep->buffer_size > PAGE_SIZE)
3954		limit = 8;
3955	else if (cachep->buffer_size > 1024)
3956		limit = 24;
3957	else if (cachep->buffer_size > 256)
3958		limit = 54;
3959	else
3960		limit = 120;
3961
3962	/*
	 * CPU-bound tasks (e.g. network routing) can show a skewed allocation
	 * pattern: most allocs happen on one cpu while most free operations
	 * happen on another cpu. For these cases, an efficient way of passing
	 * objects between cpus is necessary. This is provided by a shared
	 * array, which replaces Bonwick's magazine layer.
	 * On uniprocessor, it's functionally equivalent (but less efficient)
	 * to a larger limit, so it is disabled by default there.
3970	 */
3971	shared = 0;
3972	if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1)
3973		shared = 8;
3974
3975#if DEBUG
3976	/*
	 * With debugging enabled, large batchcounts lead to excessively long
	 * periods with local interrupts disabled. Limit the batchcount.
3979	 */
3980	if (limit > 32)
3981		limit = 32;
3982#endif
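	/*
	 * E.g. for a 192 byte cache on a 2-way SMP machine (PAGE_SIZE == 4096
	 * assumed) this resolves to do_tune_cpucache(cachep, 120, 60, 8, gfp);
	 * with DEBUG enabled the limit is capped to 32 and the batchcount
	 * becomes 16.
	 */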
3983	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
3984	if (err)
3985		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
3986		       cachep->name, -err);
3987	return err;
3988}
3989
3990/*
 * Drain an array if it contains any elements, taking the l3 lock only if
 * necessary. Note that the l3 list_lock also protects the array_cache
3993 * if drain_array() is used on the shared array.
3994 */
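/*
 * Worked example: with ac->limit == 120, a non-forced drain tries to free
 * (120 + 4) / 5 == 24 objects; if fewer than 24 are available it frees
 * (ac->avail + 1) / 2 instead.  A forced drain releases every available
 * object.
 */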
3995void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
3996			 struct array_cache *ac, int force, int node)
3997{
3998	int tofree;
3999
4000	if (!ac || !ac->avail)
4001		return;
4002	if (ac->touched && !force) {
4003		ac->touched = 0;
4004	} else {
4005		spin_lock_irq(&l3->list_lock);
4006		if (ac->avail) {
4007			tofree = force ? ac->avail : (ac->limit + 4) / 5;
4008			if (tofree > ac->avail)
4009				tofree = (ac->avail + 1) / 2;
4010			free_block(cachep, ac->entry, tofree, node);
4011			ac->avail -= tofree;
4012			memmove(ac->entry, &(ac->entry[tofree]),
4013				sizeof(void *) * ac->avail);
4014		}
4015		spin_unlock_irq(&l3->list_lock);
4016	}
4017}
4018
4019/**
4020 * cache_reap - Reclaim memory from caches.
4021 * @w: work descriptor
4022 *
4023 * Called from workqueue/eventd every few seconds.
4024 * Purpose:
4025 * - clear the per-cpu caches for this CPU.
4026 * - return freeable pages to the main free memory pool.
4027 *
4028 * If we cannot acquire the cache chain mutex then just give up - we'll try
4029 * again on the next iteration.
4030 */
4031static void cache_reap(struct work_struct *w)
4032{
4033	struct kmem_cache *searchp;
4034	struct kmem_list3 *l3;
4035	int node = numa_node_id();
4036	struct delayed_work *work = to_delayed_work(w);
4037
4038	if (!mutex_trylock(&cache_chain_mutex))
		/* Give up. Set up the next iteration. */
4040		goto out;
4041
4042	list_for_each_entry(searchp, &cache_chain, next) {
4043		check_irq_on();
4044
4045		/*
4046		 * We only take the l3 lock if absolutely necessary and we
4047		 * have established with reasonable certainty that
4048		 * we can do some work if the lock was obtained.
4049		 */
4050		l3 = searchp->nodelists[node];
4051
4052		reap_alien(searchp, l3);
4053
4054		drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
4055
4056		/*
4057		 * These are racy checks but it does not matter
4058		 * if we skip one check or scan twice.
4059		 */
4060		if (time_after(l3->next_reap, jiffies))
4061			goto next;
4062
4063		l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
4064
4065		drain_array(searchp, l3, l3->shared, 0, node);
4066
4067		if (l3->free_touched)
4068			l3->free_touched = 0;
4069		else {
4070			int freed;
4071
4072			freed = drain_freelist(searchp, l3, (l3->free_limit +
4073				5 * searchp->num - 1) / (5 * searchp->num));
4074			STATS_ADD_REAPED(searchp, freed);
4075		}
4076next:
4077		cond_resched();
4078	}
4079	check_irq_on();
4080	mutex_unlock(&cache_chain_mutex);
4081	next_reap_node();
4082out:
4083	/* Set up the next iteration */
4084	schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
4085}
4086
4087#ifdef CONFIG_SLABINFO
4088
4089static void print_slabinfo_header(struct seq_file *m)
4090{
4091	/*
4092	 * Output format version, so at least we can change it
4093	 * without _too_ many complaints.
4094	 */
4095#if STATS
4096	seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
4097#else
4098	seq_puts(m, "slabinfo - version: 2.1\n");
4099#endif
4100	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
4101		 "<objperslab> <pagesperslab>");
4102	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
4103	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
4104#if STATS
4105	seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
4106		 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
4107	seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
4108#endif
4109	seq_putc(m, '\n');
4110}
4111
4112static void *s_start(struct seq_file *m, loff_t *pos)
4113{
4114	loff_t n = *pos;
4115
4116	mutex_lock(&cache_chain_mutex);
4117	if (!n)
4118		print_slabinfo_header(m);
4119
4120	return seq_list_start(&cache_chain, *pos);
4121}
4122
4123static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4124{
4125	return seq_list_next(p, &cache_chain, pos);
4126}
4127
4128static void s_stop(struct seq_file *m, void *p)
4129{
4130	mutex_unlock(&cache_chain_mutex);
4131}
4132
4133static int s_show(struct seq_file *m, void *p)
4134{
4135	struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
4136	struct slab *slabp;
4137	unsigned long active_objs;
4138	unsigned long num_objs;
4139	unsigned long active_slabs = 0;
4140	unsigned long num_slabs, free_objects = 0, shared_avail = 0;
4141	const char *name;
4142	char *error = NULL;
4143	int node;
4144	struct kmem_list3 *l3;
4145
4146	active_objs = 0;
4147	num_slabs = 0;
4148	for_each_online_node(node) {
4149		l3 = cachep->nodelists[node];
4150		if (!l3)
4151			continue;
4152
4153		check_irq_on();
4154		spin_lock_irq(&l3->list_lock);
4155
4156		list_for_each_entry(slabp, &l3->slabs_full, list) {
4157			if (slabp->inuse != cachep->num && !error)
4158				error = "slabs_full accounting error";
4159			active_objs += cachep->num;
4160			active_slabs++;
4161		}
4162		list_for_each_entry(slabp, &l3->slabs_partial, list) {
4163			if (slabp->inuse == cachep->num && !error)
4164				error = "slabs_partial inuse accounting error";
4165			if (!slabp->inuse && !error)
4166				error = "slabs_partial/inuse accounting error";
4167			active_objs += slabp->inuse;
4168			active_slabs++;
4169		}
4170		list_for_each_entry(slabp, &l3->slabs_free, list) {
4171			if (slabp->inuse && !error)
4172				error = "slabs_free/inuse accounting error";
4173			num_slabs++;
4174		}
4175		free_objects += l3->free_objects;
4176		if (l3->shared)
4177			shared_avail += l3->shared->avail;
4178
4179		spin_unlock_irq(&l3->list_lock);
4180	}
4181	num_slabs += active_slabs;
4182	num_objs = num_slabs * cachep->num;
4183	if (num_objs - active_objs != free_objects && !error)
4184		error = "free_objects accounting error";
4185
4186	name = cachep->name;
4187	if (error)
4188		printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4189
4190	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
4191		   name, active_objs, num_objs, cachep->buffer_size,
4192		   cachep->num, (1 << cachep->gfporder));
4193	seq_printf(m, " : tunables %4u %4u %4u",
4194		   cachep->limit, cachep->batchcount, cachep->shared);
4195	seq_printf(m, " : slabdata %6lu %6lu %6lu",
4196		   active_slabs, num_slabs, shared_avail);
4197#if STATS
4198	{			/* list3 stats */
4199		unsigned long high = cachep->high_mark;
4200		unsigned long allocs = cachep->num_allocations;
4201		unsigned long grown = cachep->grown;
4202		unsigned long reaped = cachep->reaped;
4203		unsigned long errors = cachep->errors;
4204		unsigned long max_freeable = cachep->max_freeable;
4205		unsigned long node_allocs = cachep->node_allocs;
4206		unsigned long node_frees = cachep->node_frees;
4207		unsigned long overflows = cachep->node_overflow;
4208
		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
			   "%4lu %4lu %4lu %4lu %4lu",
			   allocs, high, grown, reaped, errors, max_freeable,
			   node_allocs, node_frees, overflows);
4213	}
4214	/* cpu stats */
4215	{
4216		unsigned long allochit = atomic_read(&cachep->allochit);
4217		unsigned long allocmiss = atomic_read(&cachep->allocmiss);
4218		unsigned long freehit = atomic_read(&cachep->freehit);
4219		unsigned long freemiss = atomic_read(&cachep->freemiss);
4220
4221		seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
4222			   allochit, allocmiss, freehit, freemiss);
4223	}
4224#endif
4225	seq_putc(m, '\n');
4226	return 0;
4227}
4228
4229/*
4230 * slabinfo_op - iterator that generates /proc/slabinfo
4231 *
4232 * Output layout:
4233 * cache-name
4234 * num-active-objs
4235 * total-objs
4236 * object size
 * num-objs-per-slab
 * num-pages-per-slab
 * : tunables <limit> <batchcount> <sharedfactor>
 * : slabdata <active-slabs> <num-slabs> <shared-avail>
 * + further values with statistics enabled
4241 */
4242
4243static const struct seq_operations slabinfo_op = {
4244	.start = s_start,
4245	.next = s_next,
4246	.stop = s_stop,
4247	.show = s_show,
4248};
4249
4250#define MAX_SLABINFO_WRITE 128
4251/**
4252 * slabinfo_write - Tuning for the slab allocator
4253 * @file: unused
4254 * @buffer: user buffer
4255 * @count: data length
4256 * @ppos: unused
4257 */
4258ssize_t slabinfo_write(struct file *file, const char __user * buffer,
4259		       size_t count, loff_t *ppos)
4260{
4261	char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
4262	int limit, batchcount, shared, res;
4263	struct kmem_cache *cachep;
4264
4265	if (count > MAX_SLABINFO_WRITE)
4266		return -EINVAL;
4267	if (copy_from_user(&kbuf, buffer, count))
4268		return -EFAULT;
4269	kbuf[MAX_SLABINFO_WRITE] = '\0';
4270
4271	tmp = strchr(kbuf, ' ');
4272	if (!tmp)
4273		return -EINVAL;
4274	*tmp = '\0';
4275	tmp++;
4276	if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
4277		return -EINVAL;
4278
4279	/* Find the cache in the chain of caches. */
4280	mutex_lock(&cache_chain_mutex);
4281	res = -EINVAL;
4282	list_for_each_entry(cachep, &cache_chain, next) {
4283		if (!strcmp(cachep->name, kbuf)) {
4284			if (limit < 1 || batchcount < 1 ||
4285					batchcount > limit || shared < 0) {
4286				res = 0;
4287			} else {
4288				res = do_tune_cpucache(cachep, limit,
4289						       batchcount, shared,
4290						       GFP_KERNEL);
4291			}
4292			break;
4293		}
4294	}
4295	mutex_unlock(&cache_chain_mutex);
4296	if (res >= 0)
4297		res = count;
4298	return res;
4299}
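
/*
 * Illustrative usage (the cache name and values are examples only):
 *
 *	echo "my_cache 256 128 8" > /proc/slabinfo
 *
 * asks for limit=256, batchcount=128 and shared=8 on the cache called
 * "my_cache".  Values that fail the sanity checks above (limit >= 1,
 * 1 <= batchcount <= limit, shared >= 0) are silently ignored.
 */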
4300
4301static int slabinfo_open(struct inode *inode, struct file *file)
4302{
4303	return seq_open(file, &slabinfo_op);
4304}
4305
4306static const struct file_operations proc_slabinfo_operations = {
4307	.open		= slabinfo_open,
4308	.read		= seq_read,
4309	.write		= slabinfo_write,
4310	.llseek		= seq_lseek,
4311	.release	= seq_release,
4312};
4313
4314#ifdef CONFIG_DEBUG_SLAB_LEAK
4315
4316static void *leaks_start(struct seq_file *m, loff_t *pos)
4317{
4318	mutex_lock(&cache_chain_mutex);
4319	return seq_list_start(&cache_chain, *pos);
4320}
4321
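/*
 * The buffer passed around by the leak-tracking seq_file code is an array
 * of unsigned longs: n[0] holds the capacity in entries, n[1] the number of
 * entries currently used, and from n[2] onwards come (caller address, count)
 * pairs kept sorted by address so that add_caller() can binary-search them.
 */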
4322static inline int add_caller(unsigned long *n, unsigned long v)
4323{
4324	unsigned long *p;
4325	int l;
4326	if (!v)
4327		return 1;
4328	l = n[1];
4329	p = n + 2;
4330	while (l) {
4331		int i = l/2;
4332		unsigned long *q = p + 2 * i;
4333		if (*q == v) {
4334			q[1]++;
4335			return 1;
4336		}
4337		if (*q > v) {
4338			l = i;
4339		} else {
4340			p = q + 2;
4341			l -= i + 1;
4342		}
4343	}
4344	if (++n[1] == n[0])
4345		return 0;
4346	memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
4347	p[0] = v;
4348	p[1] = 1;
4349	return 1;
4350}
4351
4352static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
4353{
4354	void *p;
4355	int i;
4356	if (n[0] == n[1])
4357		return;
4358	for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
4359		if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
4360			continue;
4361		if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
4362			return;
4363	}
4364}
4365
4366static void show_symbol(struct seq_file *m, unsigned long address)
4367{
4368#ifdef CONFIG_KALLSYMS
4369	unsigned long offset, size;
4370	char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN];
4371
4372	if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) {
4373		seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
4374		if (modname[0])
4375			seq_printf(m, " [%s]", modname);
4376		return;
4377	}
4378#endif
4379	seq_printf(m, "%p", (void *)address);
4380}
4381
4382static int leaks_show(struct seq_file *m, void *p)
4383{
4384	struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
4385	struct slab *slabp;
4386	struct kmem_list3 *l3;
4387	const char *name;
4388	unsigned long *n = m->private;
4389	int node;
4390	int i;
4391
4392	if (!(cachep->flags & SLAB_STORE_USER))
4393		return 0;
4394	if (!(cachep->flags & SLAB_RED_ZONE))
4395		return 0;
4396
4397	/* OK, we can do it */
4398
4399	n[1] = 0;
4400
4401	for_each_online_node(node) {
4402		l3 = cachep->nodelists[node];
4403		if (!l3)
4404			continue;
4405
4406		check_irq_on();
4407		spin_lock_irq(&l3->list_lock);
4408
4409		list_for_each_entry(slabp, &l3->slabs_full, list)
4410			handle_slab(n, cachep, slabp);
4411		list_for_each_entry(slabp, &l3->slabs_partial, list)
4412			handle_slab(n, cachep, slabp);
4413		spin_unlock_irq(&l3->list_lock);
4414	}
4415	name = cachep->name;
4416	if (n[0] == n[1]) {
4417		/* Increase the buffer size */
4418		mutex_unlock(&cache_chain_mutex);
4419		m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
4420		if (!m->private) {
			/* Too bad, we are really out of memory */
4422			m->private = n;
4423			mutex_lock(&cache_chain_mutex);
4424			return -ENOMEM;
4425		}
4426		*(unsigned long *)m->private = n[0] * 2;
4427		kfree(n);
4428		mutex_lock(&cache_chain_mutex);
4429		/* Now make sure this entry will be retried */
4430		m->count = m->size;
4431		return 0;
4432	}
4433	for (i = 0; i < n[1]; i++) {
4434		seq_printf(m, "%s: %lu ", name, n[2*i+3]);
4435		show_symbol(m, n[2*i+2]);
4436		seq_putc(m, '\n');
4437	}
4438
4439	return 0;
4440}
4441
4442static const struct seq_operations slabstats_op = {
4443	.start = leaks_start,
4444	.next = s_next,
4445	.stop = s_stop,
4446	.show = leaks_show,
4447};
4448
4449static int slabstats_open(struct inode *inode, struct file *file)
4450{
4451	unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
4452	int ret = -ENOMEM;
4453	if (n) {
4454		ret = seq_open(file, &slabstats_op);
4455		if (!ret) {
4456			struct seq_file *m = file->private_data;
4457			*n = PAGE_SIZE / (2 * sizeof(unsigned long));
4458			m->private = n;
4459			n = NULL;
4460		}
4461		kfree(n);
4462	}
4463	return ret;
4464}
4465
4466static const struct file_operations proc_slabstats_operations = {
4467	.open		= slabstats_open,
4468	.read		= seq_read,
4469	.llseek		= seq_lseek,
4470	.release	= seq_release_private,
4471};
4472#endif
4473
4474static int __init slab_proc_init(void)
4475{
	proc_create("slabinfo", S_IWUSR | S_IRUGO, NULL,
		    &proc_slabinfo_operations);
4477#ifdef CONFIG_DEBUG_SLAB_LEAK
4478	proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
4479#endif
4480	return 0;
4481}
4482module_init(slab_proc_init);
4483#endif
4484
4485/**
4486 * ksize - get the actual amount of memory allocated for a given object
4487 * @objp: Pointer to the object
4488 *
4489 * kmalloc may internally round up allocations and return more memory
4490 * than requested. ksize() can be used to determine the actual amount of
4491 * memory allocated. The caller may use this additional memory, even though
4492 * a smaller amount of memory was initially specified with the kmalloc call.
4493 * The caller must guarantee that objp points to a valid object previously
4494 * allocated with either kmalloc() or kmem_cache_alloc(). The object
4495 * must not be freed during the duration of the call.
4496 */
4497size_t ksize(const void *objp)
4498{
4499	BUG_ON(!objp);
4500	if (unlikely(objp == ZERO_SIZE_PTR))
4501		return 0;
4502
4503	return obj_size(virt_to_cache(objp));
4504}
4505EXPORT_SYMBOL(ksize);
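
/*
 * Illustrative sketch (hypothetical helper): since kmalloc() rounds the
 * request up to the size of the general cache it picks, a caller may use
 * the slack that ksize() reports without reallocating.
 *
 *	static void *foo_alloc_atleast(size_t len, size_t *avail, gfp_t gfp)
 *	{
 *		void *p = kmalloc(len, gfp);
 *
 *		if (p)
 *			*avail = ksize(p);	// >= len, possibly larger
 *		return p;
 *	}
 */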
4506