slab.c revision 1ab335d8f85792e3b107ff8237d53cf64db714df
1/*
2 * linux/mm/slab.c
3 * Written by Mark Hemment, 1996/97.
4 * (markhe@nextd.demon.co.uk)
5 *
6 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
7 *
8 * Major cleanup, different bufctl logic, per-cpu arrays
9 *	(c) 2000 Manfred Spraul
10 *
11 * Cleanup, make the head arrays unconditional, preparation for NUMA
12 * 	(c) 2002 Manfred Spraul
13 *
14 * An implementation of the Slab Allocator as described in outline in;
15 *	UNIX Internals: The New Frontiers by Uresh Vahalia
16 *	Pub: Prentice Hall	ISBN 0-13-101908-2
17 * or with a little more detail in;
18 *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
19 *	Jeff Bonwick (Sun Microsystems).
20 *	Presented at: USENIX Summer 1994 Technical Conference
21 *
22 * The memory is organized in caches, one cache for each object type.
23 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
24 * Each cache consists of many slabs (each small, usually one page
25 * long, and always contiguous), and each slab contains multiple
26 * initialized objects.
27 *
28 * This means that your constructor is used only for newly allocated
29 * slabs and you must pass objects with the same initializations to
30 * kmem_cache_free.
31 *
32 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
33 * normal). If you need a special memory type, then you must create a
34 * new cache for that memory type.
35 *
36 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
37 *   full slabs with 0 free objects
38 *   partial slabs
39 *   empty slabs with no allocated objects
40 *
41 * If partial slabs exist, new allocations come from them; otherwise
42 * they come from empty slabs, or new slabs are allocated.
43 *
44 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
45 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
46 *
47 * Each cache has a short per-cpu head array; most allocs
48 * and frees go into that array, and if that array overflows, then 1/2
49 * of the entries in the array are given back into the global cache.
50 * The head array is strictly LIFO and should improve the cache hit rates.
51 * On SMP, it additionally reduces the spinlock operations.
52 *
53 * The c_cpuarray may not be read with local interrupts enabled -
54 * it's changed with a smp_call_function().
55 *
56 * SMP synchronization:
57 *  constructors and destructors are called without any locking.
58 *  Several members in struct kmem_cache and struct slab never change; they
59 *	are accessed without any locking.
60 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
61 *  	and local interrupts are disabled so slab code is preempt-safe.
62 *  The non-constant members are protected with a per-cache irq spinlock.
63 *
64 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
65 * in 2000 - many ideas in the current implementation are derived from
66 * his patch.
67 *
68 * Further notes from the original documentation:
69 *
70 * 11 April '97.  Started multi-threading - markhe
71 *	The global cache-chain is protected by the mutex 'cache_chain_mutex'.
72 *	The mutex is only needed when accessing/extending the cache-chain, which
73 *	can never happen inside an interrupt (kmem_cache_create(),
74 *	kmem_cache_shrink() and kmem_cache_reap()).
75 *
76 *	At present, each engine can be growing a cache.  This should be blocked.
77 *
78 * 15 March 2005. NUMA slab allocator.
79 *	Shai Fultheim <shai@scalex86.org>.
80 *	Shobhit Dayal <shobhit@calsoftinc.com>
81 *	Alok N Kataria <alokk@calsoftinc.com>
82 *	Christoph Lameter <christoph@lameter.com>
83 *
84 *	Modified the slab allocator to be node aware on NUMA systems.
85 *	Each node has its own list of partial, free and full slabs.
86 *	All object allocations for a node occur from node specific slab lists.
87 */
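
/*
 * A minimal usage sketch of the cache API described above ("foo" and
 * struct foo are made-up names; error handling omitted):
 *
 *	struct kmem_cache *foo_cache;
 *	struct foo *p;
 *
 *	foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
 *				      SLAB_HWCACHE_ALIGN, NULL);
 *	p = kmem_cache_alloc(foo_cache, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cache, p);
 *	kmem_cache_destroy(foo_cache);
 */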
88
89#include	<linux/slab.h>
90#include	<linux/mm.h>
91#include	<linux/poison.h>
92#include	<linux/swap.h>
93#include	<linux/cache.h>
94#include	<linux/interrupt.h>
95#include	<linux/init.h>
96#include	<linux/compiler.h>
97#include	<linux/cpuset.h>
98#include	<linux/proc_fs.h>
99#include	<linux/seq_file.h>
100#include	<linux/notifier.h>
101#include	<linux/kallsyms.h>
102#include	<linux/cpu.h>
103#include	<linux/sysctl.h>
104#include	<linux/module.h>
105#include	<linux/rcupdate.h>
106#include	<linux/string.h>
107#include	<linux/uaccess.h>
108#include	<linux/nodemask.h>
109#include	<linux/kmemleak.h>
110#include	<linux/mempolicy.h>
111#include	<linux/mutex.h>
112#include	<linux/fault-inject.h>
113#include	<linux/rtmutex.h>
114#include	<linux/reciprocal_div.h>
115#include	<linux/debugobjects.h>
116#include	<linux/kmemcheck.h>
117#include	<linux/memory.h>
118
119#include	<asm/cacheflush.h>
120#include	<asm/tlbflush.h>
121#include	<asm/page.h>
122
123/*
124 * DEBUG	- 1 for kmem_cache_create() to honour SLAB_RED_ZONE & SLAB_POISON.
125 *		  0 for faster, smaller code (especially in the critical paths).
126 *
127 * STATS	- 1 to collect stats for /proc/slabinfo.
128 *		  0 for faster, smaller code (especially in the critical paths).
129 *
130 * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
131 */
132
133#ifdef CONFIG_DEBUG_SLAB
134#define	DEBUG		1
135#define	STATS		1
136#define	FORCED_DEBUG	1
137#else
138#define	DEBUG		0
139#define	STATS		0
140#define	FORCED_DEBUG	0
141#endif
142
143/* Shouldn't this be in a header file somewhere? */
144#define	BYTES_PER_WORD		sizeof(void *)
145#define	REDZONE_ALIGN		max(BYTES_PER_WORD, __alignof__(unsigned long long))
146
147#ifndef ARCH_KMALLOC_FLAGS
148#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
149#endif
150
151/* Legal flag mask for kmem_cache_create(). */
152#if DEBUG
153# define CREATE_MASK	(SLAB_RED_ZONE | \
154			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
155			 SLAB_CACHE_DMA | \
156			 SLAB_STORE_USER | \
157			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
158			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
159			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
160#else
161# define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
162			 SLAB_CACHE_DMA | \
163			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
164			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
165			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
166#endif
167
168/*
169 * kmem_bufctl_t:
170 *
171 * Bufctls are used for linking objs within a slab, via
172 * linked offsets.
173 *
174 * This implementation relies on "struct page" for locating the cache &
175 * slab an object belongs to.
176 * This allows the bufctl structure to be small (one int), but limits
177 * the number of objects a slab (not a cache) can contain when off-slab
178 * bufctls are used. The limit is the size of the largest general cache
179 * that does not use off-slab slabs.
180 * For 32-bit archs with 4 kB pages, this is 56.
181 * This is not serious, as it is only for large objects, when it is unwise
182 * to have too many per slab.
183 * Note: This limit can be raised by introducing a general cache whose size
184 * is less than 512 (PAGE_SIZE<<3), but greater than 256.
185 */
186
187typedef unsigned int kmem_bufctl_t;
188#define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
189#define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
190#define	BUFCTL_ACTIVE	(((kmem_bufctl_t)(~0U))-2)
191#define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-3)
192
193/*
194 * struct slab
195 *
196 * Manages the objs in a slab. Placed either at the beginning of mem allocated
197 * for a slab, or allocated from a general cache.
198 * Slabs are chained into three lists: fully used, partial, fully free slabs.
199 */
200struct slab {
201	struct list_head list;
202	unsigned long colouroff;
203	void *s_mem;		/* including colour offset */
204	unsigned int inuse;	/* num of objs active in slab */
205	kmem_bufctl_t free;
206	unsigned short nodeid;
207};
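
/*
 * For an on-slab slab the memory looks roughly like this (a sketch that
 * ignores the colour offset and alignment padding):
 *
 *	| struct slab | kmem_bufctl_t[num] | obj 0 | obj 1 | ... | obj num-1 |
 *
 * For each free object i, the bufctl array entry i holds the index of the
 * next free object, so slabp->free plus that array form a singly linked
 * free list terminated by BUFCTL_END.  Allocation pops the head, roughly:
 *
 *	objp = index_to_obj(cachep, slabp, slabp->free);
 *	slabp->free = slab_bufctl(slabp)[slabp->free];
 */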
208
209/*
210 * struct slab_rcu
211 *
212 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
213 * arrange for kmem_freepages to be called via RCU.  This is useful if
214 * we need to approach a kernel structure obliquely, from its address
215 * obtained without the usual locking.  We can lock the structure to
216 * stabilize it and check it's still at the given address, only if we
217 * can be sure that the memory has not been meanwhile reused for some
218 * other kind of object (which our subsystem's lock might corrupt).
219 *
220 * rcu_read_lock before reading the address, then rcu_read_unlock after
221 * taking the spinlock within the structure expected at that address.
222 *
223 * We assume struct slab_rcu can overlay struct slab when destroying.
224 */
225struct slab_rcu {
226	struct rcu_head head;
227	struct kmem_cache *cachep;
228	void *addr;
229};
230
231/*
232 * struct array_cache
233 *
234 * Purpose:
235 * - LIFO ordering, to hand out cache-warm objects from _alloc
236 * - reduce the number of linked list operations
237 * - reduce spinlock operations
238 *
239 * The limit is stored in the per-cpu structure to reduce the data cache
240 * footprint.
241 *
242 */
243struct array_cache {
244	unsigned int avail;
245	unsigned int limit;
246	unsigned int batchcount;
247	unsigned int touched;
248	spinlock_t lock;
249	void *entry[];	/*
250			 * Must have this definition in here for the proper
251			 * alignment of array_cache. Also simplifies accessing
252			 * the entries.
253			 */
254};
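
/*
 * Illustrative helpers (a sketch, not used below) showing the LIFO
 * discipline on entry[]: frees push at entry[avail], allocations pop the
 * most recently freed - and therefore cache-warm - object.
 */
static inline void example_ac_push(struct array_cache *ac, void *objp)
{
	ac->entry[ac->avail++] = objp;
}

static inline void *example_ac_pop(struct array_cache *ac)
{
	return ac->entry[--ac->avail];
}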
255
256/*
257 * bootstrap: The caches do not work without cpuarrays anymore, but the
258 * cpuarrays are allocated from the generic caches...
259 */
260#define BOOT_CPUCACHE_ENTRIES	1
261struct arraycache_init {
262	struct array_cache cache;
263	void *entries[BOOT_CPUCACHE_ENTRIES];
264};
265
266/*
267 * The slab lists for all objects.
268 */
269struct kmem_list3 {
270	struct list_head slabs_partial;	/* partial list first, better asm code */
271	struct list_head slabs_full;
272	struct list_head slabs_free;
273	unsigned long free_objects;
274	unsigned int free_limit;
275	unsigned int colour_next;	/* Per-node cache coloring */
276	spinlock_t list_lock;
277	struct array_cache *shared;	/* shared per node */
278	struct array_cache **alien;	/* on other nodes */
279	unsigned long next_reap;	/* updated without locking */
280	int free_touched;		/* updated without locking */
281};
282
283/*
284 * Need this for bootstrapping a per node allocator.
285 */
286#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
287struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
288#define	CACHE_CACHE 0
289#define	SIZE_AC MAX_NUMNODES
290#define	SIZE_L3 (2 * MAX_NUMNODES)
291
292static int drain_freelist(struct kmem_cache *cache,
293			struct kmem_list3 *l3, int tofree);
294static void free_block(struct kmem_cache *cachep, void **objpp, int len,
295			int node);
296static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
297static void cache_reap(struct work_struct *unused);
298
299/*
300 * This function must be completely optimized away if a constant is passed to
301 * it.  Mostly the same as what is in linux/slab.h except it returns an index.
302 */
303static __always_inline int index_of(const size_t size)
304{
305	extern void __bad_size(void);
306
307	if (__builtin_constant_p(size)) {
308		int i = 0;
309
310#define CACHE(x) \
311	if (size <= x) \
312		return i; \
313	else \
314		i++;
315#include <linux/kmalloc_sizes.h>
316#undef CACHE
317		__bad_size();
318	} else
319		__bad_size();
320	return 0;
321}
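
/*
 * Example: with a kmalloc_sizes.h table that begins 32, 64, 96, 128, ...
 * (the exact entries depend on PAGE_SIZE and L1_CACHE_BYTES),
 * index_of(100) folds at compile time to 3, i.e. the 128 byte general
 * cache.  A non-constant or oversized argument leaves a call to the
 * undefined __bad_size() behind and therefore fails at link time.
 */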
322
323static int slab_early_init = 1;
324
325#define INDEX_AC index_of(sizeof(struct arraycache_init))
326#define INDEX_L3 index_of(sizeof(struct kmem_list3))
327
328static void kmem_list3_init(struct kmem_list3 *parent)
329{
330	INIT_LIST_HEAD(&parent->slabs_full);
331	INIT_LIST_HEAD(&parent->slabs_partial);
332	INIT_LIST_HEAD(&parent->slabs_free);
333	parent->shared = NULL;
334	parent->alien = NULL;
335	parent->colour_next = 0;
336	spin_lock_init(&parent->list_lock);
337	parent->free_objects = 0;
338	parent->free_touched = 0;
339}
340
341#define MAKE_LIST(cachep, listp, slab, nodeid)				\
342	do {								\
343		INIT_LIST_HEAD(listp);					\
344		list_splice(&(cachep->nodelists[nodeid]->slab), listp);	\
345	} while (0)
346
347#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
348	do {								\
349	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
350	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
351	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
352	} while (0)
353
354#define CFLGS_OFF_SLAB		(0x80000000UL)
355#define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)
356
357#define BATCHREFILL_LIMIT	16
358/*
359 * Optimization question: fewer reaps means a lower probability of unnecessary
360 * cpucache drain/refill cycles.
361 *
362 * OTOH the cpuarrays can contain lots of objects,
363 * which could lock up otherwise freeable slabs.
364 */
365#define REAPTIMEOUT_CPUC	(2*HZ)
366#define REAPTIMEOUT_LIST3	(4*HZ)
367
368#if STATS
369#define	STATS_INC_ACTIVE(x)	((x)->num_active++)
370#define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
371#define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
372#define	STATS_INC_GROWN(x)	((x)->grown++)
373#define	STATS_ADD_REAPED(x,y)	((x)->reaped += (y))
374#define	STATS_SET_HIGH(x)						\
375	do {								\
376		if ((x)->num_active > (x)->high_mark)			\
377			(x)->high_mark = (x)->num_active;		\
378	} while (0)
379#define	STATS_INC_ERR(x)	((x)->errors++)
380#define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
381#define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
382#define STATS_INC_ACOVERFLOW(x)   ((x)->node_overflow++)
383#define	STATS_SET_FREEABLE(x, i)					\
384	do {								\
385		if ((x)->max_freeable < i)				\
386			(x)->max_freeable = i;				\
387	} while (0)
388#define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
389#define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
390#define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
391#define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
392#else
393#define	STATS_INC_ACTIVE(x)	do { } while (0)
394#define	STATS_DEC_ACTIVE(x)	do { } while (0)
395#define	STATS_INC_ALLOCED(x)	do { } while (0)
396#define	STATS_INC_GROWN(x)	do { } while (0)
397#define	STATS_ADD_REAPED(x,y)	do { } while (0)
398#define	STATS_SET_HIGH(x)	do { } while (0)
399#define	STATS_INC_ERR(x)	do { } while (0)
400#define	STATS_INC_NODEALLOCS(x)	do { } while (0)
401#define	STATS_INC_NODEFREES(x)	do { } while (0)
402#define STATS_INC_ACOVERFLOW(x)   do { } while (0)
403#define	STATS_SET_FREEABLE(x, i) do { } while (0)
404#define STATS_INC_ALLOCHIT(x)	do { } while (0)
405#define STATS_INC_ALLOCMISS(x)	do { } while (0)
406#define STATS_INC_FREEHIT(x)	do { } while (0)
407#define STATS_INC_FREEMISS(x)	do { } while (0)
408#endif
409
410#if DEBUG
411
412/*
413 * memory layout of objects:
414 * 0		: objp
415 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
416 * 		the end of an object is aligned with the end of the real
417 * 		allocation. Catches writes behind the end of the allocation.
418 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
419 * 		redzone word.
420 * cachep->obj_offset: The real object.
421 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
422 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
423 *					[BYTES_PER_WORD long]
424 */
425static int obj_offset(struct kmem_cache *cachep)
426{
427	return cachep->obj_offset;
428}
429
430static int obj_size(struct kmem_cache *cachep)
431{
432	return cachep->obj_size;
433}
434
435static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
436{
437	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
438	return (unsigned long long*) (objp + obj_offset(cachep) -
439				      sizeof(unsigned long long));
440}
441
442static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
443{
444	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
445	if (cachep->flags & SLAB_STORE_USER)
446		return (unsigned long long *)(objp + cachep->buffer_size -
447					      sizeof(unsigned long long) -
448					      REDZONE_ALIGN);
449	return (unsigned long long *) (objp + cachep->buffer_size -
450				       sizeof(unsigned long long));
451}
452
453static void **dbg_userword(struct kmem_cache *cachep, void *objp)
454{
455	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
456	return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
457}
458
459#else
460
461#define obj_offset(x)			0
462#define obj_size(cachep)		(cachep->buffer_size)
463#define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
464#define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
465#define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})
466
467#endif
468
469#ifdef CONFIG_TRACING
470size_t slab_buffer_size(struct kmem_cache *cachep)
471{
472	return cachep->buffer_size;
473}
474EXPORT_SYMBOL(slab_buffer_size);
475#endif
476
477/*
478 * Do not go above this order unless 0 objects fit into the slab.
479 */
480#define	BREAK_GFP_ORDER_HI	1
481#define	BREAK_GFP_ORDER_LO	0
482static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
483
484/*
485 * Functions for storing/retrieving the cachep and/or slab from the page
486 * allocator.  These are used to find the slab an obj belongs to.  With kfree(),
487 * these are used to find the cache to which an obj belongs.
488 */
489static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
490{
491	page->lru.next = (struct list_head *)cache;
492}
493
494static inline struct kmem_cache *page_get_cache(struct page *page)
495{
496	page = compound_head(page);
497	BUG_ON(!PageSlab(page));
498	return (struct kmem_cache *)page->lru.next;
499}
500
501static inline void page_set_slab(struct page *page, struct slab *slab)
502{
503	page->lru.prev = (struct list_head *)slab;
504}
505
506static inline struct slab *page_get_slab(struct page *page)
507{
508	BUG_ON(!PageSlab(page));
509	return (struct slab *)page->lru.prev;
510}
511
512static inline struct kmem_cache *virt_to_cache(const void *obj)
513{
514	struct page *page = virt_to_head_page(obj);
515	return page_get_cache(page);
516}
517
518static inline struct slab *virt_to_slab(const void *obj)
519{
520	struct page *page = virt_to_head_page(obj);
521	return page_get_slab(page);
522}
523
524static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
525				 unsigned int idx)
526{
527	return slab->s_mem + cache->buffer_size * idx;
528}
529
530/*
531 * We want to avoid an expensive divide: (offset / cache->buffer_size)
532 *   Using the fact that buffer_size is a constant for a particular cache,
533 *   we can replace (offset / cache->buffer_size) by
534 *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
535 */
536static inline unsigned int obj_to_index(const struct kmem_cache *cache,
537					const struct slab *slab, void *obj)
538{
539	u32 offset = (obj - slab->s_mem);
540	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
541}
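
/*
 * A sketch of the equivalent (but slower) computation, for clarity only;
 * nothing below uses it:
 */
static inline unsigned int obj_to_index_by_division(
		const struct kmem_cache *cache, const struct slab *slab,
		void *obj)
{
	return (unsigned int)((obj - slab->s_mem) / cache->buffer_size);
}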
542
543/*
544 * These are the default caches for kmalloc. Custom caches can have other sizes.
545 */
546struct cache_sizes malloc_sizes[] = {
547#define CACHE(x) { .cs_size = (x) },
548#include <linux/kmalloc_sizes.h>
549	CACHE(ULONG_MAX)
550#undef CACHE
551};
552EXPORT_SYMBOL(malloc_sizes);
553
554/* Must match cache_sizes above. Out of line to keep cache footprint low. */
555struct cache_names {
556	char *name;
557	char *name_dma;
558};
559
560static struct cache_names __initdata cache_names[] = {
561#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
562#include <linux/kmalloc_sizes.h>
563	{NULL,}
564#undef CACHE
565};
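
/*
 * Both tables above are generated from the same CACHE() list, so for a
 * table beginning with 32 and 64 they expand to entries pairing
 * { .cs_size = 32 } with { "size-32", "size-32(DMA)" },
 * { .cs_size = 64 } with { "size-64", "size-64(DMA)" }, and so on,
 * keeping the size table and the cache names in sync by construction.
 */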
566
567static struct arraycache_init initarray_cache __initdata =
568    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
569static struct arraycache_init initarray_generic =
570    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
571
572/* internal cache of cache description objs */
573static struct kmem_cache cache_cache = {
574	.batchcount = 1,
575	.limit = BOOT_CPUCACHE_ENTRIES,
576	.shared = 1,
577	.buffer_size = sizeof(struct kmem_cache),
578	.name = "kmem_cache",
579};
580
581#define BAD_ALIEN_MAGIC 0x01020304ul
582
583/*
584 * chicken and egg problem: delay the per-cpu array allocation
585 * until the general caches are up.
586 */
587static enum {
588	NONE,
589	PARTIAL_AC,
590	PARTIAL_L3,
591	EARLY,
592	FULL
593} g_cpucache_up;
594
595/*
596 * used by boot code to determine if it can use slab based allocator
597 */
598int slab_is_available(void)
599{
600	return g_cpucache_up >= EARLY;
601}
602
603#ifdef CONFIG_LOCKDEP
604
605/*
606 * Slab sometimes uses the kmalloc slabs to store the slab headers
607 * for other slabs "off slab".
608 * The locking for this is tricky in that it nests within the locks
609 * of all other slabs in a few places; to deal with this special
610 * locking we put on-slab caches into a separate lock-class.
611 *
612 * We set lock class for alien array caches which are up during init.
613 * The lock annotation will be lost if all cpus of a node go down and
614 * then come back up during hotplug.
615 */
616static struct lock_class_key on_slab_l3_key;
617static struct lock_class_key on_slab_alc_key;
618
619static void init_node_lock_keys(int q)
620{
621	struct cache_sizes *s = malloc_sizes;
622
623	if (g_cpucache_up != FULL)
624		return;
625
626	for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
627		struct array_cache **alc;
628		struct kmem_list3 *l3;
629		int r;
630
631		l3 = s->cs_cachep->nodelists[q];
632		if (!l3 || OFF_SLAB(s->cs_cachep))
633			continue;
634		lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
635		alc = l3->alien;
636		/*
637		 * FIXME: This check for BAD_ALIEN_MAGIC
638		 * should go away when common slab code is taught to
639		 * work even without alien caches.
640		 * Currently, non-NUMA code returns BAD_ALIEN_MAGIC
641		 * from alloc_alien_cache().
642		 */
643		if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
644			continue;
645		for_each_node(r) {
646			if (alc[r])
647				lockdep_set_class(&alc[r]->lock,
648					&on_slab_alc_key);
649		}
650	}
651}
652
653static inline void init_lock_keys(void)
654{
655	int node;
656
657	for_each_node(node)
658		init_node_lock_keys(node);
659}
660#else
661static void init_node_lock_keys(int q)
662{
663}
664
665static inline void init_lock_keys(void)
666{
667}
668#endif
669
670/*
671 * Guard access to the cache-chain.
672 */
673static DEFINE_MUTEX(cache_chain_mutex);
674static struct list_head cache_chain;
675
676static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
677
678static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
679{
680	return cachep->array[smp_processor_id()];
681}
682
683static inline struct kmem_cache *__find_general_cachep(size_t size,
684							gfp_t gfpflags)
685{
686	struct cache_sizes *csizep = malloc_sizes;
687
688#if DEBUG
689	/* This happens if someone tries to call
690	 * kmem_cache_create(), or __kmalloc(), before
691	 * the generic caches are initialized.
692	 */
693	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
694#endif
695	if (!size)
696		return ZERO_SIZE_PTR;
697
698	while (size > csizep->cs_size)
699		csizep++;
700
701	/*
702	 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
703	 * has cs_{dma,}cachep==NULL. Thus no special case
704	 * for large kmalloc calls is required.
705	 */
706#ifdef CONFIG_ZONE_DMA
707	if (unlikely(gfpflags & GFP_DMA))
708		return csizep->cs_dmacachep;
709#endif
710	return csizep->cs_cachep;
711}
712
713static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
714{
715	return __find_general_cachep(size, gfpflags);
716}
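
/*
 * Example: assuming the size table begins 32, 64, 96, 128, ...,
 * __find_general_cachep(100, GFP_KERNEL) walks to the first entry with
 * cs_size >= 100 and returns the size-128 cache; with GFP_DMA (and
 * CONFIG_ZONE_DMA) it would return the size-128(DMA) cache instead.
 * A size of 0 short-circuits to ZERO_SIZE_PTR.
 */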
717
718static size_t slab_mgmt_size(size_t nr_objs, size_t align)
719{
720	return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
721}
722
723/*
724 * Calculate the number of objects and left-over bytes for a given buffer size.
725 */
726static void cache_estimate(unsigned long gfporder, size_t buffer_size,
727			   size_t align, int flags, size_t *left_over,
728			   unsigned int *num)
729{
730	int nr_objs;
731	size_t mgmt_size;
732	size_t slab_size = PAGE_SIZE << gfporder;
733
734	/*
735	 * The slab management structure can be either off the slab or
736	 * on it. For the latter case, the memory allocated for a
737	 * slab is used for:
738	 *
739	 * - The struct slab
740	 * - One kmem_bufctl_t for each object
741	 * - Padding to respect alignment of @align
742	 * - @buffer_size bytes for each object
743	 *
744	 * If the slab management structure is off the slab, then the
745	 * alignment will already be calculated into the size. Because
746	 * the slabs are all pages aligned, the objects will be at the
747	 * correct alignment when allocated.
748	 */
749	if (flags & CFLGS_OFF_SLAB) {
750		mgmt_size = 0;
751		nr_objs = slab_size / buffer_size;
752
753		if (nr_objs > SLAB_LIMIT)
754			nr_objs = SLAB_LIMIT;
755	} else {
756		/*
757		 * Ignore padding for the initial guess. The padding
758		 * is at most @align-1 bytes, and @buffer_size is at
759		 * least @align. In the worst case, this result will
760		 * be one greater than the number of objects that fit
761		 * into the memory allocation when taking the padding
762		 * into account.
763		 */
764		nr_objs = (slab_size - sizeof(struct slab)) /
765			  (buffer_size + sizeof(kmem_bufctl_t));
766
767		/*
768		 * This calculated number will be either the right
769		 * amount, or one greater than what we want.
770		 */
771		if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
772		       > slab_size)
773			nr_objs--;
774
775		if (nr_objs > SLAB_LIMIT)
776			nr_objs = SLAB_LIMIT;
777
778		mgmt_size = slab_mgmt_size(nr_objs, align);
779	}
780	*num = nr_objs;
781	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
782}
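
/*
 * Worked example for the on-slab case, assuming a 64-bit configuration
 * with sizeof(struct slab) == 48 and sizeof(kmem_bufctl_t) == 4, for a
 * 4096 byte order-0 slab, buffer_size == 256 and align == 64:
 *
 *	nr_objs   = (4096 - 48) / (256 + 4)	= 15
 *	mgmt_size = ALIGN(48 + 15 * 4, 64)	= 128
 *	left_over = 4096 - 15 * 256 - 128	= 128
 *
 * The 128 left-over bytes are what cache colouring later spreads out.
 */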
783
784#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
785
786static void __slab_error(const char *function, struct kmem_cache *cachep,
787			char *msg)
788{
789	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
790	       function, cachep->name, msg);
791	dump_stack();
792}
793
794/*
795 * By default on NUMA we use alien caches to stage the freeing of
796 * objects allocated from other nodes. This causes massive memory
797 * inefficiencies when using a fake NUMA setup to split memory into a
798 * large number of small nodes, so it can be disabled on the command
799 * line.
800 */
801
802static int use_alien_caches __read_mostly = 1;
803static int __init noaliencache_setup(char *s)
804{
805	use_alien_caches = 0;
806	return 1;
807}
808__setup("noaliencache", noaliencache_setup);
809
810#ifdef CONFIG_NUMA
811/*
812 * Special reaping functions for NUMA systems called from cache_reap().
813 * These take care of doing round robin flushing of alien caches (containing
814 * objects freed on different nodes from which they were allocated) and the
815 * flushing of remote pcps by calling drain_node_pages.
816 */
817static DEFINE_PER_CPU(unsigned long, slab_reap_node);
818
819static void init_reap_node(int cpu)
820{
821	int node;
822
823	node = next_node(cpu_to_mem(cpu), node_online_map);
824	if (node == MAX_NUMNODES)
825		node = first_node(node_online_map);
826
827	per_cpu(slab_reap_node, cpu) = node;
828}
829
830static void next_reap_node(void)
831{
832	int node = __get_cpu_var(slab_reap_node);
833
834	node = next_node(node, node_online_map);
835	if (unlikely(node >= MAX_NUMNODES))
836		node = first_node(node_online_map);
837	__get_cpu_var(slab_reap_node) = node;
838}
839
840#else
841#define init_reap_node(cpu) do { } while (0)
842#define next_reap_node(void) do { } while (0)
843#endif
844
845/*
846 * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
847 * via the workqueue/eventd.
848 * Add the CPU number into the expiration time to minimize the possibility of
849 * the CPUs getting into lockstep and contending for the global cache chain
850 * lock.
851 */
852static void __cpuinit start_cpu_timer(int cpu)
853{
854	struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
855
856	/*
857	 * When this gets called from do_initcalls via cpucache_init(),
858	 * init_workqueues() has already run, so keventd will be set up
859	 * at that time.
860	 */
861	if (keventd_up() && reap_work->work.func == NULL) {
862		init_reap_node(cpu);
863		INIT_DELAYED_WORK_DEFERRABLE(reap_work, cache_reap);
864		schedule_delayed_work_on(cpu, reap_work,
865					__round_jiffies_relative(HZ, cpu));
866	}
867}
868
869static struct array_cache *alloc_arraycache(int node, int entries,
870					    int batchcount, gfp_t gfp)
871{
872	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
873	struct array_cache *nc = NULL;
874
875	nc = kmalloc_node(memsize, gfp, node);
876	/*
877	 * The array_cache structures contain pointers to free objects.
878	 * However, when such objects are allocated or transferred to another
879	 * cache, the pointers are not cleared and they could be counted as
880	 * valid references during a kmemleak scan. Therefore, kmemleak must
881	 * not scan such objects.
882	 */
883	kmemleak_no_scan(nc);
884	if (nc) {
885		nc->avail = 0;
886		nc->limit = entries;
887		nc->batchcount = batchcount;
888		nc->touched = 0;
889		spin_lock_init(&nc->lock);
890	}
891	return nc;
892}
893
894/*
895 * Transfer objects from one arraycache to another.
896 * Locking must be handled by the caller.
897 *
898 * Return the number of entries transferred.
899 */
900static int transfer_objects(struct array_cache *to,
901		struct array_cache *from, unsigned int max)
902{
903	/* Figure out how many entries to transfer */
904	int nr = min(min(from->avail, max), to->limit - to->avail);
905
906	if (!nr)
907		return 0;
908
909	memcpy(to->entry + to->avail, from->entry + from->avail -nr,
910			sizeof(void *) *nr);
911
912	from->avail -= nr;
913	to->avail += nr;
914	return nr;
915}
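
/*
 * Example: with from->avail == 10, max == 16 and room for four entries in
 * 'to' (to->limit - to->avail == 4), nr is 4 and the memcpy() moves the
 * four most recently freed pointers, from->entry[6..9], onto the top of
 * to->entry[]; the older, colder entries stay behind in 'from'.
 */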
916
917#ifndef CONFIG_NUMA
918
919#define drain_alien_cache(cachep, alien) do { } while (0)
920#define reap_alien(cachep, l3) do { } while (0)
921
922static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
923{
924	return (struct array_cache **)BAD_ALIEN_MAGIC;
925}
926
927static inline void free_alien_cache(struct array_cache **ac_ptr)
928{
929}
930
931static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
932{
933	return 0;
934}
935
936static inline void *alternate_node_alloc(struct kmem_cache *cachep,
937		gfp_t flags)
938{
939	return NULL;
940}
941
942static inline void *____cache_alloc_node(struct kmem_cache *cachep,
943		 gfp_t flags, int nodeid)
944{
945	return NULL;
946}
947
948#else	/* CONFIG_NUMA */
949
950static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
951static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
952
953static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
954{
955	struct array_cache **ac_ptr;
956	int memsize = sizeof(void *) * nr_node_ids;
957	int i;
958
959	if (limit > 1)
960		limit = 12;
961	ac_ptr = kzalloc_node(memsize, gfp, node);
962	if (ac_ptr) {
963		for_each_node(i) {
964			if (i == node || !node_online(i))
965				continue;
966			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
967			if (!ac_ptr[i]) {
968				for (i--; i >= 0; i--)
969					kfree(ac_ptr[i]);
970				kfree(ac_ptr);
971				return NULL;
972			}
973		}
974	}
975	return ac_ptr;
976}
977
978static void free_alien_cache(struct array_cache **ac_ptr)
979{
980	int i;
981
982	if (!ac_ptr)
983		return;
984	for_each_node(i)
985	    kfree(ac_ptr[i]);
986	kfree(ac_ptr);
987}
988
989static void __drain_alien_cache(struct kmem_cache *cachep,
990				struct array_cache *ac, int node)
991{
992	struct kmem_list3 *rl3 = cachep->nodelists[node];
993
994	if (ac->avail) {
995		spin_lock(&rl3->list_lock);
996		/*
997		 * Stuff objects into the remote node's shared array first.
998		 * That way we can avoid the overhead of putting the objects
999		 * into the free lists and getting them back later.
1000		 */
1001		if (rl3->shared)
1002			transfer_objects(rl3->shared, ac, ac->limit);
1003
1004		free_block(cachep, ac->entry, ac->avail, node);
1005		ac->avail = 0;
1006		spin_unlock(&rl3->list_lock);
1007	}
1008}
1009
1010/*
1011 * Called from cache_reap() to regularly drain alien caches round robin.
1012 */
1013static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
1014{
1015	int node = __get_cpu_var(slab_reap_node);
1016
1017	if (l3->alien) {
1018		struct array_cache *ac = l3->alien[node];
1019
1020		if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
1021			__drain_alien_cache(cachep, ac, node);
1022			spin_unlock_irq(&ac->lock);
1023		}
1024	}
1025}
1026
1027static void drain_alien_cache(struct kmem_cache *cachep,
1028				struct array_cache **alien)
1029{
1030	int i = 0;
1031	struct array_cache *ac;
1032	unsigned long flags;
1033
1034	for_each_online_node(i) {
1035		ac = alien[i];
1036		if (ac) {
1037			spin_lock_irqsave(&ac->lock, flags);
1038			__drain_alien_cache(cachep, ac, i);
1039			spin_unlock_irqrestore(&ac->lock, flags);
1040		}
1041	}
1042}
1043
1044static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1045{
1046	struct slab *slabp = virt_to_slab(objp);
1047	int nodeid = slabp->nodeid;
1048	struct kmem_list3 *l3;
1049	struct array_cache *alien = NULL;
1050	int node;
1051
1052	node = numa_mem_id();
1053
1054	/*
1055	 * Make sure we are not freeing an object from another node to the array
1056	 * cache on this cpu.
1057	 */
1058	if (likely(slabp->nodeid == node))
1059		return 0;
1060
1061	l3 = cachep->nodelists[node];
1062	STATS_INC_NODEFREES(cachep);
1063	if (l3->alien && l3->alien[nodeid]) {
1064		alien = l3->alien[nodeid];
1065		spin_lock(&alien->lock);
1066		if (unlikely(alien->avail == alien->limit)) {
1067			STATS_INC_ACOVERFLOW(cachep);
1068			__drain_alien_cache(cachep, alien, nodeid);
1069		}
1070		alien->entry[alien->avail++] = objp;
1071		spin_unlock(&alien->lock);
1072	} else {
1073		spin_lock(&(cachep->nodelists[nodeid])->list_lock);
1074		free_block(cachep, &objp, 1, nodeid);
1075		spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
1076	}
1077	return 1;
1078}
1079#endif
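
/*
 * Putting the NUMA pieces above together: when an object that belongs to
 * node A is freed by a cpu on node B, cache_free_alien() normally parks
 * it in B's l3->alien[A] array instead of taking node A's list lock right
 * away; the array is flushed back to node A when it overflows or when
 * cache_reap() calls reap_alien().  The later free paths check the return
 * value, roughly:
 *
 *	if (cache_free_alien(cachep, objp))
 *		return;		(the object was handled for its home node)
 */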
1080
1081/*
1082 * Allocates and initializes nodelists for a node on each slab cache, used for
1083 * either memory or cpu hotplug.  If memory is being hot-added, the kmem_list3
1084 * will be allocated off-node since memory is not yet online for the new node.
1085 * When hotplugging memory or a cpu, existing nodelists are not replaced if
1086 * already in use.
1087 *
1088 * Must hold cache_chain_mutex.
1089 */
1090static int init_cache_nodelists_node(int node)
1091{
1092	struct kmem_cache *cachep;
1093	struct kmem_list3 *l3;
1094	const int memsize = sizeof(struct kmem_list3);
1095
1096	list_for_each_entry(cachep, &cache_chain, next) {
1097		/*
1098		 * Set up the kmem_list3 for this node before we can
1099		 * begin anything. Make sure some other cpu on this
1100		 * node has not already allocated it.
1101		 */
1102		if (!cachep->nodelists[node]) {
1103			l3 = kmalloc_node(memsize, GFP_KERNEL, node);
1104			if (!l3)
1105				return -ENOMEM;
1106			kmem_list3_init(l3);
1107			l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
1108			    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1109
1110			/*
1111			 * The l3s don't come and go as CPUs come and
1112			 * go.  cache_chain_mutex is sufficient
1113			 * protection here.
1114			 */
1115			cachep->nodelists[node] = l3;
1116		}
1117
1118		spin_lock_irq(&cachep->nodelists[node]->list_lock);
1119		cachep->nodelists[node]->free_limit =
1120			(1 + nr_cpus_node(node)) *
1121			cachep->batchcount + cachep->num;
1122		spin_unlock_irq(&cachep->nodelists[node]->list_lock);
1123	}
1124	return 0;
1125}
1126
1127static void __cpuinit cpuup_canceled(long cpu)
1128{
1129	struct kmem_cache *cachep;
1130	struct kmem_list3 *l3 = NULL;
1131	int node = cpu_to_mem(cpu);
1132	const struct cpumask *mask = cpumask_of_node(node);
1133
1134	list_for_each_entry(cachep, &cache_chain, next) {
1135		struct array_cache *nc;
1136		struct array_cache *shared;
1137		struct array_cache **alien;
1138
1139		/* cpu is dead; no one can alloc from it. */
1140		nc = cachep->array[cpu];
1141		cachep->array[cpu] = NULL;
1142		l3 = cachep->nodelists[node];
1143
1144		if (!l3)
1145			goto free_array_cache;
1146
1147		spin_lock_irq(&l3->list_lock);
1148
1149		/* Free limit for this kmem_list3 */
1150		l3->free_limit -= cachep->batchcount;
1151		if (nc)
1152			free_block(cachep, nc->entry, nc->avail, node);
1153
1154		if (!cpumask_empty(mask)) {
1155			spin_unlock_irq(&l3->list_lock);
1156			goto free_array_cache;
1157		}
1158
1159		shared = l3->shared;
1160		if (shared) {
1161			free_block(cachep, shared->entry,
1162				   shared->avail, node);
1163			l3->shared = NULL;
1164		}
1165
1166		alien = l3->alien;
1167		l3->alien = NULL;
1168
1169		spin_unlock_irq(&l3->list_lock);
1170
1171		kfree(shared);
1172		if (alien) {
1173			drain_alien_cache(cachep, alien);
1174			free_alien_cache(alien);
1175		}
1176free_array_cache:
1177		kfree(nc);
1178	}
1179	/*
1180	 * In the previous loop, all the objects were freed to
1181	 * the respective cache's slabs; now we can go ahead and
1182	 * shrink each nodelist to its limit.
1183	 */
1184	list_for_each_entry(cachep, &cache_chain, next) {
1185		l3 = cachep->nodelists[node];
1186		if (!l3)
1187			continue;
1188		drain_freelist(cachep, l3, l3->free_objects);
1189	}
1190}
1191
1192static int __cpuinit cpuup_prepare(long cpu)
1193{
1194	struct kmem_cache *cachep;
1195	struct kmem_list3 *l3 = NULL;
1196	int node = cpu_to_mem(cpu);
1197	int err;
1198
1199	/*
1200	 * We need to do this right in the beginning since
1201	 * the alloc_arraycache() calls are going to use this list.
1202	 * kmalloc_node allows us to add the slab to the right
1203	 * kmem_list3 and not this cpu's kmem_list3.
1204	 */
1205	err = init_cache_nodelists_node(node);
1206	if (err < 0)
1207		goto bad;
1208
1209	/*
1210	 * Now we can go ahead with allocating the shared arrays and
1211	 * array caches
1212	 */
1213	list_for_each_entry(cachep, &cache_chain, next) {
1214		struct array_cache *nc;
1215		struct array_cache *shared = NULL;
1216		struct array_cache **alien = NULL;
1217
1218		nc = alloc_arraycache(node, cachep->limit,
1219					cachep->batchcount, GFP_KERNEL);
1220		if (!nc)
1221			goto bad;
1222		if (cachep->shared) {
1223			shared = alloc_arraycache(node,
1224				cachep->shared * cachep->batchcount,
1225				0xbaadf00d, GFP_KERNEL);
1226			if (!shared) {
1227				kfree(nc);
1228				goto bad;
1229			}
1230		}
1231		if (use_alien_caches) {
1232			alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
1233			if (!alien) {
1234				kfree(shared);
1235				kfree(nc);
1236				goto bad;
1237			}
1238		}
1239		cachep->array[cpu] = nc;
1240		l3 = cachep->nodelists[node];
1241		BUG_ON(!l3);
1242
1243		spin_lock_irq(&l3->list_lock);
1244		if (!l3->shared) {
1245			/*
1246			 * We are serialised from CPU_DEAD or
1247			 * CPU_UP_CANCELLED by the cpucontrol lock
1248			 */
1249			l3->shared = shared;
1250			shared = NULL;
1251		}
1252#ifdef CONFIG_NUMA
1253		if (!l3->alien) {
1254			l3->alien = alien;
1255			alien = NULL;
1256		}
1257#endif
1258		spin_unlock_irq(&l3->list_lock);
1259		kfree(shared);
1260		free_alien_cache(alien);
1261	}
1262	init_node_lock_keys(node);
1263
1264	return 0;
1265bad:
1266	cpuup_canceled(cpu);
1267	return -ENOMEM;
1268}
1269
1270static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1271				    unsigned long action, void *hcpu)
1272{
1273	long cpu = (long)hcpu;
1274	int err = 0;
1275
1276	switch (action) {
1277	case CPU_UP_PREPARE:
1278	case CPU_UP_PREPARE_FROZEN:
1279		mutex_lock(&cache_chain_mutex);
1280		err = cpuup_prepare(cpu);
1281		mutex_unlock(&cache_chain_mutex);
1282		break;
1283	case CPU_ONLINE:
1284	case CPU_ONLINE_FROZEN:
1285		start_cpu_timer(cpu);
1286		break;
1287#ifdef CONFIG_HOTPLUG_CPU
1288  	case CPU_DOWN_PREPARE:
1289  	case CPU_DOWN_PREPARE_FROZEN:
1290		/*
1291		 * Shut down the cache reaper. Note that the cache_chain_mutex is
1292		 * held so that if cache_reap() is invoked it cannot do
1293		 * anything expensive but will only modify reap_work
1294		 * and reschedule the timer.
1295		*/
1296		cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu));
1297		/* Now the cache_reaper is guaranteed to be not running. */
1298		per_cpu(slab_reap_work, cpu).work.func = NULL;
1299  		break;
1300  	case CPU_DOWN_FAILED:
1301  	case CPU_DOWN_FAILED_FROZEN:
1302		start_cpu_timer(cpu);
1303  		break;
1304	case CPU_DEAD:
1305	case CPU_DEAD_FROZEN:
1306		/*
1307		 * Even if all the cpus of a node are down, we don't free the
1308		 * kmem_list3 of any cache. This is to avoid a race between
1309		 * cpu_down, and a kmalloc allocation from another cpu for
1310		 * memory from the node of the cpu going down.  The list3
1311		 * structure is usually allocated from kmem_cache_create() and
1312		 * gets destroyed at kmem_cache_destroy().
1313		 */
1314		/* fall through */
1315#endif
1316	case CPU_UP_CANCELED:
1317	case CPU_UP_CANCELED_FROZEN:
1318		mutex_lock(&cache_chain_mutex);
1319		cpuup_canceled(cpu);
1320		mutex_unlock(&cache_chain_mutex);
1321		break;
1322	}
1323	return notifier_from_errno(err);
1324}
1325
1326static struct notifier_block __cpuinitdata cpucache_notifier = {
1327	&cpuup_callback, NULL, 0
1328};
1329
1330#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
1331/*
1332 * Drains freelist for a node on each slab cache, used for memory hot-remove.
1333 * Returns -EBUSY if not all objects can be drained, so that the node is not
1334 * removed.
1335 *
1336 * Must hold cache_chain_mutex.
1337 */
1338static int __meminit drain_cache_nodelists_node(int node)
1339{
1340	struct kmem_cache *cachep;
1341	int ret = 0;
1342
1343	list_for_each_entry(cachep, &cache_chain, next) {
1344		struct kmem_list3 *l3;
1345
1346		l3 = cachep->nodelists[node];
1347		if (!l3)
1348			continue;
1349
1350		drain_freelist(cachep, l3, l3->free_objects);
1351
1352		if (!list_empty(&l3->slabs_full) ||
1353		    !list_empty(&l3->slabs_partial)) {
1354			ret = -EBUSY;
1355			break;
1356		}
1357	}
1358	return ret;
1359}
1360
1361static int __meminit slab_memory_callback(struct notifier_block *self,
1362					unsigned long action, void *arg)
1363{
1364	struct memory_notify *mnb = arg;
1365	int ret = 0;
1366	int nid;
1367
1368	nid = mnb->status_change_nid;
1369	if (nid < 0)
1370		goto out;
1371
1372	switch (action) {
1373	case MEM_GOING_ONLINE:
1374		mutex_lock(&cache_chain_mutex);
1375		ret = init_cache_nodelists_node(nid);
1376		mutex_unlock(&cache_chain_mutex);
1377		break;
1378	case MEM_GOING_OFFLINE:
1379		mutex_lock(&cache_chain_mutex);
1380		ret = drain_cache_nodelists_node(nid);
1381		mutex_unlock(&cache_chain_mutex);
1382		break;
1383	case MEM_ONLINE:
1384	case MEM_OFFLINE:
1385	case MEM_CANCEL_ONLINE:
1386	case MEM_CANCEL_OFFLINE:
1387		break;
1388	}
1389out:
1390	return ret ? notifier_from_errno(ret) : NOTIFY_OK;
1391}
1392#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
1393
1394/*
1395 * swap the static kmem_list3 with kmalloced memory
1396 */
1397static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1398				int nodeid)
1399{
1400	struct kmem_list3 *ptr;
1401
1402	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid);
1403	BUG_ON(!ptr);
1404
1405	memcpy(ptr, list, sizeof(struct kmem_list3));
1406	/*
1407	 * Do not assume that spinlocks can be initialized via memcpy:
1408	 */
1409	spin_lock_init(&ptr->list_lock);
1410
1411	MAKE_ALL_LISTS(cachep, ptr, nodeid);
1412	cachep->nodelists[nodeid] = ptr;
1413}
1414
1415/*
1416 * For setting up all the kmem_list3s for a cache whose buffer_size is the
1417 * same as the size of kmem_list3.
1418 */
1419static void __init set_up_list3s(struct kmem_cache *cachep, int index)
1420{
1421	int node;
1422
1423	for_each_online_node(node) {
1424		cachep->nodelists[node] = &initkmem_list3[index + node];
1425		cachep->nodelists[node]->next_reap = jiffies +
1426		    REAPTIMEOUT_LIST3 +
1427		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1428	}
1429}
1430
1431/*
1432 * Initialisation.  Called after the page allocator has been initialised and
1433 * before smp_init().
1434 */
1435void __init kmem_cache_init(void)
1436{
1437	size_t left_over;
1438	struct cache_sizes *sizes;
1439	struct cache_names *names;
1440	int i;
1441	int order;
1442	int node;
1443
1444	if (num_possible_nodes() == 1)
1445		use_alien_caches = 0;
1446
1447	for (i = 0; i < NUM_INIT_LISTS; i++) {
1448		kmem_list3_init(&initkmem_list3[i]);
1449		if (i < MAX_NUMNODES)
1450			cache_cache.nodelists[i] = NULL;
1451	}
1452	set_up_list3s(&cache_cache, CACHE_CACHE);
1453
1454	/*
1455	 * Fragmentation resistance on low memory - only use bigger
1456	 * page orders on machines with more than 32MB of memory.
1457	 */
1458	if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
1459		slab_break_gfp_order = BREAK_GFP_ORDER_HI;
1460
1461	/* Bootstrap is tricky, because several objects are allocated
1462	 * from caches that do not exist yet:
1463	 * 1) initialize the cache_cache cache: it contains the struct
1464	 *    kmem_cache structures of all caches, except cache_cache itself:
1465	 *    cache_cache is statically allocated.
1466	 *    Initially an __init data area is used for the head array and the
1467	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
1468	 *    array at the end of the bootstrap.
1469	 * 2) Create the first kmalloc cache.
1470	 *    The struct kmem_cache for the new cache is allocated normally.
1471	 *    An __init data area is used for the head array.
1472	 * 3) Create the remaining kmalloc caches, with minimally sized
1473	 *    head arrays.
1474	 * 4) Replace the __init data head arrays for cache_cache and the first
1475	 *    kmalloc cache with kmalloc allocated arrays.
1476	 * 5) Replace the __init data for kmem_list3 for cache_cache and
1477	 *    the other caches with kmalloc allocated memory.
1478	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1479	 */
1480
1481	node = numa_mem_id();
1482
1483	/* 1) create the cache_cache */
1484	INIT_LIST_HEAD(&cache_chain);
1485	list_add(&cache_cache.next, &cache_chain);
1486	cache_cache.colour_off = cache_line_size();
1487	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1488	cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
1489
1490	/*
1491	 * struct kmem_cache size depends on nr_node_ids, which
1492	 * can be less than MAX_NUMNODES.
1493	 */
1494	cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
1495				 nr_node_ids * sizeof(struct kmem_list3 *);
1496#if DEBUG
1497	cache_cache.obj_size = cache_cache.buffer_size;
1498#endif
1499	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1500					cache_line_size());
1501	cache_cache.reciprocal_buffer_size =
1502		reciprocal_value(cache_cache.buffer_size);
1503
1504	for (order = 0; order < MAX_ORDER; order++) {
1505		cache_estimate(order, cache_cache.buffer_size,
1506			cache_line_size(), 0, &left_over, &cache_cache.num);
1507		if (cache_cache.num)
1508			break;
1509	}
1510	BUG_ON(!cache_cache.num);
1511	cache_cache.gfporder = order;
1512	cache_cache.colour = left_over / cache_cache.colour_off;
1513	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
1514				      sizeof(struct slab), cache_line_size());
1515
1516	/* 2+3) create the kmalloc caches */
1517	sizes = malloc_sizes;
1518	names = cache_names;
1519
1520	/*
1521	 * Initialize the caches that provide memory for the array cache and the
1522	 * kmem_list3 structures first.  Without this, further allocations will
1523	 * bug.
1524	 */
1525
1526	sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1527					sizes[INDEX_AC].cs_size,
1528					ARCH_KMALLOC_MINALIGN,
1529					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1530					NULL);
1531
1532	if (INDEX_AC != INDEX_L3) {
1533		sizes[INDEX_L3].cs_cachep =
1534			kmem_cache_create(names[INDEX_L3].name,
1535				sizes[INDEX_L3].cs_size,
1536				ARCH_KMALLOC_MINALIGN,
1537				ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1538				NULL);
1539	}
1540
1541	slab_early_init = 0;
1542
1543	while (sizes->cs_size != ULONG_MAX) {
1544		/*
1545		 * For performance, all the general caches are L1 aligned.
1546		 * This should be particularly beneficial on SMP boxes, as it
1547		 * eliminates "false sharing".
1548		 * Note that for systems short on memory, removing the alignment will
1549		 * allow tighter packing of the smaller caches.
1550		 */
1551		if (!sizes->cs_cachep) {
1552			sizes->cs_cachep = kmem_cache_create(names->name,
1553					sizes->cs_size,
1554					ARCH_KMALLOC_MINALIGN,
1555					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1556					NULL);
1557		}
1558#ifdef CONFIG_ZONE_DMA
1559		sizes->cs_dmacachep = kmem_cache_create(
1560					names->name_dma,
1561					sizes->cs_size,
1562					ARCH_KMALLOC_MINALIGN,
1563					ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
1564						SLAB_PANIC,
1565					NULL);
1566#endif
1567		sizes++;
1568		names++;
1569	}
1570	/* 4) Replace the bootstrap head arrays */
1571	{
1572		struct array_cache *ptr;
1573
1574		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1575
1576		BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
1577		memcpy(ptr, cpu_cache_get(&cache_cache),
1578		       sizeof(struct arraycache_init));
1579		/*
1580		 * Do not assume that spinlocks can be initialized via memcpy:
1581		 */
1582		spin_lock_init(&ptr->lock);
1583
1584		cache_cache.array[smp_processor_id()] = ptr;
1585
1586		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1587
1588		BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
1589		       != &initarray_generic.cache);
1590		memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
1591		       sizeof(struct arraycache_init));
1592		/*
1593		 * Do not assume that spinlocks can be initialized via memcpy:
1594		 */
1595		spin_lock_init(&ptr->lock);
1596
1597		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1598		    ptr;
1599	}
1600	/* 5) Replace the bootstrap kmem_list3's */
1601	{
1602		int nid;
1603
1604		for_each_online_node(nid) {
1605			init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
1606
1607			init_list(malloc_sizes[INDEX_AC].cs_cachep,
1608				  &initkmem_list3[SIZE_AC + nid], nid);
1609
1610			if (INDEX_AC != INDEX_L3) {
1611				init_list(malloc_sizes[INDEX_L3].cs_cachep,
1612					  &initkmem_list3[SIZE_L3 + nid], nid);
1613			}
1614		}
1615	}
1616
1617	g_cpucache_up = EARLY;
1618}
1619
1620void __init kmem_cache_init_late(void)
1621{
1622	struct kmem_cache *cachep;
1623
1624	/* 6) resize the head arrays to their final sizes */
1625	mutex_lock(&cache_chain_mutex);
1626	list_for_each_entry(cachep, &cache_chain, next)
1627		if (enable_cpucache(cachep, GFP_NOWAIT))
1628			BUG();
1629	mutex_unlock(&cache_chain_mutex);
1630
1631	/* Done! */
1632	g_cpucache_up = FULL;
1633
1634	/* Annotate slab for lockdep -- annotate the malloc caches */
1635	init_lock_keys();
1636
1637	/*
1638	 * Register a cpu startup notifier callback that initializes
1639	 * cpu_cache_get for all new cpus
1640	 */
1641	register_cpu_notifier(&cpucache_notifier);
1642
1643#ifdef CONFIG_NUMA
1644	/*
1645	 * Register a memory hotplug callback that initializes and frees
1646	 * nodelists.
1647	 */
1648	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
1649#endif
1650
1651	/*
1652	 * The reap timers are started later, with a module init call; that part
1653	 * of the kernel is not yet operational.
1654	 */
1655}
1656
1657static int __init cpucache_init(void)
1658{
1659	int cpu;
1660
1661	/*
1662	 * Register the timers that return unneeded pages to the page allocator
1663	 */
1664	for_each_online_cpu(cpu)
1665		start_cpu_timer(cpu);
1666	return 0;
1667}
1668__initcall(cpucache_init);
1669
1670/*
1671 * Interface to system's page allocator. No need to hold the cache-lock.
1672 *
1673 * If we requested dmaable memory, we will get it. Even if we
1674 * did not request dmaable memory, we might get it, but that
1675 * would be relatively rare and ignorable.
1676 */
1677static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1678{
1679	struct page *page;
1680	int nr_pages;
1681	int i;
1682
1683#ifndef CONFIG_MMU
1684	/*
1685	 * Nommu uses slabs for process anonymous memory allocations, and thus
1686	 * requires __GFP_COMP to properly refcount higher order allocations
1687	 */
1688	flags |= __GFP_COMP;
1689#endif
1690
1691	flags |= cachep->gfpflags;
1692	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1693		flags |= __GFP_RECLAIMABLE;
1694
1695	page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
1696	if (!page)
1697		return NULL;
1698
1699	nr_pages = (1 << cachep->gfporder);
1700	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1701		add_zone_page_state(page_zone(page),
1702			NR_SLAB_RECLAIMABLE, nr_pages);
1703	else
1704		add_zone_page_state(page_zone(page),
1705			NR_SLAB_UNRECLAIMABLE, nr_pages);
1706	for (i = 0; i < nr_pages; i++)
1707		__SetPageSlab(page + i);
1708
1709	if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1710		kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
1711
1712		if (cachep->ctor)
1713			kmemcheck_mark_uninitialized_pages(page, nr_pages);
1714		else
1715			kmemcheck_mark_unallocated_pages(page, nr_pages);
1716	}
1717
1718	return page_address(page);
1719}
1720
1721/*
1722 * Interface to system's page release.
1723 */
1724static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1725{
1726	unsigned long i = (1 << cachep->gfporder);
1727	struct page *page = virt_to_page(addr);
1728	const unsigned long nr_freed = i;
1729
1730	kmemcheck_free_shadow(page, cachep->gfporder);
1731
1732	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1733		sub_zone_page_state(page_zone(page),
1734				NR_SLAB_RECLAIMABLE, nr_freed);
1735	else
1736		sub_zone_page_state(page_zone(page),
1737				NR_SLAB_UNRECLAIMABLE, nr_freed);
1738	while (i--) {
1739		BUG_ON(!PageSlab(page));
1740		__ClearPageSlab(page);
1741		page++;
1742	}
1743	if (current->reclaim_state)
1744		current->reclaim_state->reclaimed_slab += nr_freed;
1745	free_pages((unsigned long)addr, cachep->gfporder);
1746}
1747
1748static void kmem_rcu_free(struct rcu_head *head)
1749{
1750	struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1751	struct kmem_cache *cachep = slab_rcu->cachep;
1752
1753	kmem_freepages(cachep, slab_rcu->addr);
1754	if (OFF_SLAB(cachep))
1755		kmem_cache_free(cachep->slabp_cache, slab_rcu);
1756}
1757
1758#if DEBUG
1759
1760#ifdef CONFIG_DEBUG_PAGEALLOC
1761static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1762			    unsigned long caller)
1763{
1764	int size = obj_size(cachep);
1765
1766	addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
1767
1768	if (size < 5 * sizeof(unsigned long))
1769		return;
1770
1771	*addr++ = 0x12345678;
1772	*addr++ = caller;
1773	*addr++ = smp_processor_id();
1774	size -= 3 * sizeof(unsigned long);
1775	{
1776		unsigned long *sptr = &caller;
1777		unsigned long svalue;
1778
1779		while (!kstack_end(sptr)) {
1780			svalue = *sptr++;
1781			if (kernel_text_address(svalue)) {
1782				*addr++ = svalue;
1783				size -= sizeof(unsigned long);
1784				if (size <= sizeof(unsigned long))
1785					break;
1786			}
1787		}
1788
1789	}
1790	*addr++ = 0x87654321;
1791}
1792#endif
1793
1794static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1795{
1796	int size = obj_size(cachep);
1797	addr = &((char *)addr)[obj_offset(cachep)];
1798
1799	memset(addr, val, size);
1800	*(unsigned char *)(addr + size - 1) = POISON_END;
1801}
1802
1803static void dump_line(char *data, int offset, int limit)
1804{
1805	int i;
1806	unsigned char error = 0;
1807	int bad_count = 0;
1808
1809	printk(KERN_ERR "%03x:", offset);
1810	for (i = 0; i < limit; i++) {
1811		if (data[offset + i] != POISON_FREE) {
1812			error = data[offset + i];
1813			bad_count++;
1814		}
1815		printk(" %02x", (unsigned char)data[offset + i]);
1816	}
1817	printk("\n");
1818
1819	if (bad_count == 1) {
1820		error ^= POISON_FREE;
1821		if (!(error & (error - 1))) {
1822			printk(KERN_ERR "Single bit error detected. Probably "
1823					"bad RAM.\n");
1824#ifdef CONFIG_X86
1825			printk(KERN_ERR "Run memtest86+ or a similar memory "
1826					"test tool.\n");
1827#else
1828			printk(KERN_ERR "Run a memory test tool.\n");
1829#endif
1830		}
1831	}
1832}
1833#endif
1834
1835#if DEBUG
1836
1837static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1838{
1839	int i, size;
1840	char *realobj;
1841
1842	if (cachep->flags & SLAB_RED_ZONE) {
1843		printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n",
1844			*dbg_redzone1(cachep, objp),
1845			*dbg_redzone2(cachep, objp));
1846	}
1847
1848	if (cachep->flags & SLAB_STORE_USER) {
1849		printk(KERN_ERR "Last user: [<%p>]",
1850			*dbg_userword(cachep, objp));
1851		print_symbol("(%s)",
1852				(unsigned long)*dbg_userword(cachep, objp));
1853		printk("\n");
1854	}
1855	realobj = (char *)objp + obj_offset(cachep);
1856	size = obj_size(cachep);
1857	for (i = 0; i < size && lines; i += 16, lines--) {
1858		int limit;
1859		limit = 16;
1860		if (i + limit > size)
1861			limit = size - i;
1862		dump_line(realobj, i, limit);
1863	}
1864}
1865
1866static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1867{
1868	char *realobj;
1869	int size, i;
1870	int lines = 0;
1871
1872	realobj = (char *)objp + obj_offset(cachep);
1873	size = obj_size(cachep);
1874
1875	for (i = 0; i < size; i++) {
1876		char exp = POISON_FREE;
1877		if (i == size - 1)
1878			exp = POISON_END;
1879		if (realobj[i] != exp) {
1880			int limit;
1881			/* Mismatch ! */
1882			/* Print header */
1883			if (lines == 0) {
1884				printk(KERN_ERR
1885					"Slab corruption: %s start=%p, len=%d\n",
1886					cachep->name, realobj, size);
1887				print_objinfo(cachep, objp, 0);
1888			}
1889			/* Hexdump the affected line */
1890			i = (i / 16) * 16;
1891			limit = 16;
1892			if (i + limit > size)
1893				limit = size - i;
1894			dump_line(realobj, i, limit);
1895			i += 16;
1896			lines++;
1897			/* Limit to 5 lines */
1898			if (lines > 5)
1899				break;
1900		}
1901	}
1902	if (lines != 0) {
1903		/* Print some data about the neighboring objects, if they
1904		 * exist:
1905		 */
1906		struct slab *slabp = virt_to_slab(objp);
1907		unsigned int objnr;
1908
1909		objnr = obj_to_index(cachep, slabp, objp);
1910		if (objnr) {
1911			objp = index_to_obj(cachep, slabp, objnr - 1);
1912			realobj = (char *)objp + obj_offset(cachep);
1913			printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1914			       realobj, size);
1915			print_objinfo(cachep, objp, 2);
1916		}
1917		if (objnr + 1 < cachep->num) {
1918			objp = index_to_obj(cachep, slabp, objnr + 1);
1919			realobj = (char *)objp + obj_offset(cachep);
1920			printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1921			       realobj, size);
1922			print_objinfo(cachep, objp, 2);
1923		}
1924	}
1925}
1926#endif
1927
1928#if DEBUG
1929static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
1930{
1931	int i;
1932	for (i = 0; i < cachep->num; i++) {
1933		void *objp = index_to_obj(cachep, slabp, i);
1934
1935		if (cachep->flags & SLAB_POISON) {
1936#ifdef CONFIG_DEBUG_PAGEALLOC
1937			if (cachep->buffer_size % PAGE_SIZE == 0 &&
1938					OFF_SLAB(cachep))
1939				kernel_map_pages(virt_to_page(objp),
1940					cachep->buffer_size / PAGE_SIZE, 1);
1941			else
1942				check_poison_obj(cachep, objp);
1943#else
1944			check_poison_obj(cachep, objp);
1945#endif
1946		}
1947		if (cachep->flags & SLAB_RED_ZONE) {
1948			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1949				slab_error(cachep, "start of a freed object "
1950					   "was overwritten");
1951			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1952				slab_error(cachep, "end of a freed object "
1953					   "was overwritten");
1954		}
1955	}
1956}
1957#else
1958static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
1959{
1960}
1961#endif
1962
1963/**
1964 * slab_destroy - destroy and release all objects in a slab
1965 * @cachep: cache pointer being destroyed
1966 * @slabp: slab pointer being destroyed
1967 *
1968 * Destroy all the objs in a slab, and release the mem back to the system.
1969 * Before calling the slab must have been unlinked from the cache.  The
1970 * cache-lock is not held/needed.
1971 */
1972static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1973{
1974	void *addr = slabp->s_mem - slabp->colouroff;
1975
1976	slab_destroy_debugcheck(cachep, slabp);
1977	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1978		struct slab_rcu *slab_rcu;
1979
1980		slab_rcu = (struct slab_rcu *)slabp;
1981		slab_rcu->cachep = cachep;
1982		slab_rcu->addr = addr;
1983		call_rcu(&slab_rcu->head, kmem_rcu_free);
1984	} else {
1985		kmem_freepages(cachep, addr);
1986		if (OFF_SLAB(cachep))
1987			kmem_cache_free(cachep->slabp_cache, slabp);
1988	}
1989}
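/*
 * Illustrative sketch (not part of the original file): SLAB_DESTROY_BY_RCU
 * only defers freeing of the underlying pages via call_rcu() above; the
 * objects themselves may be recycled for new allocations immediately.  A
 * typical lockless lookup therefore revalidates the object while still
 * under rcu_read_lock(); the helpers below are hypothetical:
 *
 *	rcu_read_lock();
 *	obj = lookup_in_hash(key);
 *	if (obj && !try_get_ref(obj))
 *		obj = NULL;
 *	if (obj && obj->key != key) {
 *		put_ref(obj);
 *		obj = NULL;		(the slot was reused for another key)
 *	}
 *	rcu_read_unlock();
 */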
1990
1991static void __kmem_cache_destroy(struct kmem_cache *cachep)
1992{
1993	int i;
1994	struct kmem_list3 *l3;
1995
1996	for_each_online_cpu(i)
1997	    kfree(cachep->array[i]);
1998
1999	/* NUMA: free the list3 structures */
2000	for_each_online_node(i) {
2001		l3 = cachep->nodelists[i];
2002		if (l3) {
2003			kfree(l3->shared);
2004			free_alien_cache(l3->alien);
2005			kfree(l3);
2006		}
2007	}
2008	kmem_cache_free(&cache_cache, cachep);
2009}
2010
2011
2012/**
2013 * calculate_slab_order - calculate size (page order) of slabs
2014 * @cachep: pointer to the cache that is being created
2015 * @size: size of objects to be created in this cache.
2016 * @align: required alignment for the objects.
2017 * @flags: slab allocation flags
2018 *
2019 * Also calculates the number of objects per slab.
2020 *
2021 * This could be made much more intelligent.  For now, try to avoid using
2022 * high order pages for slabs.  When the gfp() functions are more friendly
2023 * towards high-order requests, this should be changed.
2024 */
2025static size_t calculate_slab_order(struct kmem_cache *cachep,
2026			size_t size, size_t align, unsigned long flags)
2027{
2028	unsigned long offslab_limit;
2029	size_t left_over = 0;
2030	int gfporder;
2031
2032	for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
2033		unsigned int num;
2034		size_t remainder;
2035
2036		cache_estimate(gfporder, size, align, flags, &remainder, &num);
2037		if (!num)
2038			continue;
2039
2040		if (flags & CFLGS_OFF_SLAB) {
2041			/*
2042			 * Max number of objs-per-slab for caches which
2043			 * use off-slab slabs. Needed to avoid a possible
2044			 * looping condition in cache_grow().
2045			 */
2046			offslab_limit = size - sizeof(struct slab);
2047			offslab_limit /= sizeof(kmem_bufctl_t);
2048
2049			if (num > offslab_limit)
2050				break;
2051		}
2052
2053		/* Found something acceptable - save it away */
2054		cachep->num = num;
2055		cachep->gfporder = gfporder;
2056		left_over = remainder;
2057
2058		/*
2059		 * A VFS-reclaimable slab tends to have most allocations
2060		 * as GFP_NOFS and we really don't want to have to be allocating
2061		 * higher-order pages when we are unable to shrink dcache.
2062		 */
2063		if (flags & SLAB_RECLAIM_ACCOUNT)
2064			break;
2065
2066		/*
2067		 * Large number of objects is good, but very large slabs are
2068		 * currently bad for the gfp()s.
2069		 */
2070		if (gfporder >= slab_break_gfp_order)
2071			break;
2072
2073		/*
2074		 * Acceptable internal fragmentation?
2075		 */
2076		if (left_over * 8 <= (PAGE_SIZE << gfporder))
2077			break;
2078	}
2079	return left_over;
2080}
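/*
 * Worked example (added for clarity, not from the original source): the
 * "left_over * 8 <= (PAGE_SIZE << gfporder)" test above accepts an order as
 * soon as the unused tail is at most 1/8th of the slab, i.e. with 4096-byte
 * pages an order-0 slab may waste up to 512 bytes before a higher order is
 * tried.  The leftover space is not lost entirely: kmem_cache_create()
 * later turns it into the cache colour range (left_over / colour_off).
 */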
2081
2082static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2083{
2084	if (g_cpucache_up == FULL)
2085		return enable_cpucache(cachep, gfp);
2086
2087	if (g_cpucache_up == NONE) {
2088		/*
2089		 * Note: the first kmem_cache_create must create the cache
2090		 * that's used by kmalloc(24), otherwise the creation of
2091		 * further caches will BUG().
2092		 */
2093		cachep->array[smp_processor_id()] = &initarray_generic.cache;
2094
2095		/*
2096		 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
2097		 * the first cache, then we need to set up all its list3s,
2098		 * otherwise the creation of further caches will BUG().
2099		 */
2100		set_up_list3s(cachep, SIZE_AC);
2101		if (INDEX_AC == INDEX_L3)
2102			g_cpucache_up = PARTIAL_L3;
2103		else
2104			g_cpucache_up = PARTIAL_AC;
2105	} else {
2106		cachep->array[smp_processor_id()] =
2107			kmalloc(sizeof(struct arraycache_init), gfp);
2108
2109		if (g_cpucache_up == PARTIAL_AC) {
2110			set_up_list3s(cachep, SIZE_L3);
2111			g_cpucache_up = PARTIAL_L3;
2112		} else {
2113			int node;
2114			for_each_online_node(node) {
2115				cachep->nodelists[node] =
2116				    kmalloc_node(sizeof(struct kmem_list3),
2117						gfp, node);
2118				BUG_ON(!cachep->nodelists[node]);
2119				kmem_list3_init(cachep->nodelists[node]);
2120			}
2121		}
2122	}
2123	cachep->nodelists[numa_mem_id()]->next_reap =
2124			jiffies + REAPTIMEOUT_LIST3 +
2125			((unsigned long)cachep) % REAPTIMEOUT_LIST3;
2126
2127	cpu_cache_get(cachep)->avail = 0;
2128	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
2129	cpu_cache_get(cachep)->batchcount = 1;
2130	cpu_cache_get(cachep)->touched = 0;
2131	cachep->batchcount = 1;
2132	cachep->limit = BOOT_CPUCACHE_ENTRIES;
2133	return 0;
2134}
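/*
 * Summary note (added for clarity, not from the original source): the
 * g_cpucache_up states tested above roughly track the bootstrap order -
 * NONE (only cache_cache exists, so the static initarray is used),
 * PARTIAL_AC (the kmalloc cache backing struct arraycache_init works),
 * PARTIAL_L3 (the kmalloc cache backing struct kmem_list3 works as well),
 * and FULL, at which point enable_cpucache() sizes the per-cpu arrays
 * normally.
 */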
2135
2136/**
2137 * kmem_cache_create - Create a cache.
2138 * @name: A string which is used in /proc/slabinfo to identify this cache.
2139 * @size: The size of objects to be created in this cache.
2140 * @align: The required alignment for the objects.
2141 * @flags: SLAB flags
2142 * @ctor: A constructor for the objects.
2143 *
2144 * Returns a ptr to the cache on success, NULL on failure.
2145 * Cannot be called within an interrupt, but can be interrupted.
2146 * The @ctor is run when new pages are allocated by the cache.
2147 *
2148 * @name must be valid until the cache is destroyed. This implies that
2149 * the module calling this has to destroy the cache before getting unloaded.
2150 * Note that kmem_cache_name() is not guaranteed to return the same pointer,
2151 * therefore applications must manage it themselves.
2152 *
2153 * The flags are
2154 *
2155 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
2156 * to catch references to uninitialised memory.
2157 *
2158 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
2159 * for buffer overruns.
2160 *
2161 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
2162 * cacheline.  This can be beneficial if you're counting cycles as closely
2163 * as davem.
2164 */
2165struct kmem_cache *
2166kmem_cache_create (const char *name, size_t size, size_t align,
2167	unsigned long flags, void (*ctor)(void *))
2168{
2169	size_t left_over, slab_size, ralign;
2170	struct kmem_cache *cachep = NULL, *pc;
2171	gfp_t gfp;
2172
2173	/*
2174	 * Sanity checks... these are all serious usage bugs.
2175	 */
2176	if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
2177	    size > KMALLOC_MAX_SIZE) {
2178		printk(KERN_ERR "%s: Early error in slab %s\n", __func__,
2179				name);
2180		BUG();
2181	}
2182
2183	/*
2184	 * We use cache_chain_mutex to ensure a consistent view of
2185	 * cpu_online_mask as well.  Please see cpuup_callback
2186	 */
2187	if (slab_is_available()) {
2188		get_online_cpus();
2189		mutex_lock(&cache_chain_mutex);
2190	}
2191
2192	list_for_each_entry(pc, &cache_chain, next) {
2193		char tmp;
2194		int res;
2195
2196		/*
2197		 * This happens when the module gets unloaded and doesn't
2198		 * destroy its slab cache and no-one else reuses the vmalloc
2199		 * area of the module.  Print a warning.
2200		 */
2201		res = probe_kernel_address(pc->name, tmp);
2202		if (res) {
2203			printk(KERN_ERR
2204			       "SLAB: cache with size %d has lost its name\n",
2205			       pc->buffer_size);
2206			continue;
2207		}
2208
2209		if (!strcmp(pc->name, name)) {
2210			printk(KERN_ERR
2211			       "kmem_cache_create: duplicate cache %s\n", name);
2212			dump_stack();
2213			goto oops;
2214		}
2215	}
2216
2217#if DEBUG
2218	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
2219#if FORCED_DEBUG
2220	/*
2221	 * Enable redzoning and last user accounting, except for caches with
2222	 * large objects, if the increased size would increase the object size
2223	 * above the next power of two: caches with object sizes just above a
2224	 * power of two have a significant amount of internal fragmentation.
2225	 */
2226	if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
2227						2 * sizeof(unsigned long long)))
2228		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
2229	if (!(flags & SLAB_DESTROY_BY_RCU))
2230		flags |= SLAB_POISON;
2231#endif
2232	if (flags & SLAB_DESTROY_BY_RCU)
2233		BUG_ON(flags & SLAB_POISON);
2234#endif
2235	/*
2236	 * Always check flags; a caller might be expecting debug support which
2237	 * isn't available.
2238	 */
2239	BUG_ON(flags & ~CREATE_MASK);
2240
2241	/*
2242	 * Check that size is in terms of words.  This is needed to avoid
2243	 * unaligned accesses for some archs when redzoning is used, and makes
2244	 * sure any on-slab bufctl's are also correctly aligned.
2245	 */
2246	if (size & (BYTES_PER_WORD - 1)) {
2247		size += (BYTES_PER_WORD - 1);
2248		size &= ~(BYTES_PER_WORD - 1);
2249	}
2250
2251	/* calculate the final buffer alignment: */
2252
2253	/* 1) arch recommendation: can be overridden for debug */
2254	if (flags & SLAB_HWCACHE_ALIGN) {
2255		/*
2256		 * Default alignment: as specified by the arch code.  Except if
2257		 * an object is really small, then squeeze multiple objects into
2258		 * one cacheline.
2259		 */
2260		ralign = cache_line_size();
2261		while (size <= ralign / 2)
2262			ralign /= 2;
2263	} else {
2264		ralign = BYTES_PER_WORD;
2265	}
2266
2267	/*
2268	 * Redzoning and user store require word alignment or possibly larger.
2269	 * Note this will be overridden by architecture or caller mandated
2270	 * alignment if either is greater than BYTES_PER_WORD.
2271	 */
2272	if (flags & SLAB_STORE_USER)
2273		ralign = BYTES_PER_WORD;
2274
2275	if (flags & SLAB_RED_ZONE) {
2276		ralign = REDZONE_ALIGN;
2277		/* If redzoning, ensure that the second redzone is suitably
2278		 * aligned, by adjusting the object size accordingly. */
2279		size += REDZONE_ALIGN - 1;
2280		size &= ~(REDZONE_ALIGN - 1);
2281	}
2282
2283	/* 2) arch mandated alignment */
2284	if (ralign < ARCH_SLAB_MINALIGN) {
2285		ralign = ARCH_SLAB_MINALIGN;
2286	}
2287	/* 3) caller mandated alignment */
2288	if (ralign < align) {
2289		ralign = align;
2290	}
2291	/* disable debug if not aligning with REDZONE_ALIGN */
2292	if (ralign & (__alignof__(unsigned long long) - 1))
2293		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2294	/*
2295	 * 4) Store it.
2296	 */
2297	align = ralign;
2298
2299	if (slab_is_available())
2300		gfp = GFP_KERNEL;
2301	else
2302		gfp = GFP_NOWAIT;
2303
2304	/* Get cache's description obj. */
2305	cachep = kmem_cache_zalloc(&cache_cache, gfp);
2306	if (!cachep)
2307		goto oops;
2308
2309#if DEBUG
2310	cachep->obj_size = size;
2311
2312	/*
2313	 * Both debugging options require word-alignment which is calculated
2314	 * into align above.
2315	 */
2316	if (flags & SLAB_RED_ZONE) {
2317		/* add space for red zone words */
2318		cachep->obj_offset += align;
2319		size += align + sizeof(unsigned long long);
2320	}
2321	if (flags & SLAB_STORE_USER) {
2322		/* user store requires one word storage behind the end of
2323		 * the real object. But if the second red zone needs to be
2324		 * aligned to 64 bits, we must allow that much space.
2325		 */
2326		if (flags & SLAB_RED_ZONE)
2327			size += REDZONE_ALIGN;
2328		else
2329			size += BYTES_PER_WORD;
2330	}
2331#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2332	if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
2333	    && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) {
2334		cachep->obj_offset += PAGE_SIZE - ALIGN(size, align);
2335		size = PAGE_SIZE;
2336	}
2337#endif
2338#endif
2339
2340	/*
2341	 * Determine if the slab management is 'on' or 'off' slab.
2342	 * (bootstrapping cannot cope with offslab caches so don't do
2343	 * it too early on. Always use on-slab management when
2344	 * SLAB_NOLEAKTRACE is set, to avoid recursive calls into kmemleak.)
2345	 */
2346	if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
2347	    !(flags & SLAB_NOLEAKTRACE))
2348		/*
2349		 * Size is large, assume best to place the slab management obj
2350		 * off-slab (should allow better packing of objs).
2351		 */
2352		flags |= CFLGS_OFF_SLAB;
2353
2354	size = ALIGN(size, align);
2355
2356	left_over = calculate_slab_order(cachep, size, align, flags);
2357
2358	if (!cachep->num) {
2359		printk(KERN_ERR
2360		       "kmem_cache_create: couldn't create cache %s.\n", name);
2361		kmem_cache_free(&cache_cache, cachep);
2362		cachep = NULL;
2363		goto oops;
2364	}
2365	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
2366			  + sizeof(struct slab), align);
2367
2368	/*
2369	 * If the slab has been placed off-slab, and we have enough space then
2370	 * move it on-slab. This is at the expense of any extra colouring.
2371	 */
2372	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
2373		flags &= ~CFLGS_OFF_SLAB;
2374		left_over -= slab_size;
2375	}
2376
2377	if (flags & CFLGS_OFF_SLAB) {
2378		/* really off slab. No need for manual alignment */
2379		slab_size =
2380		    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
2381
2382#ifdef CONFIG_PAGE_POISONING
2383		/* If we're going to use the generic kernel_map_pages()
2384		 * poisoning, then it's going to smash the contents of
2385		 * the redzone and userword anyhow, so switch them off.
2386		 */
2387		if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
2388			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2389#endif
2390	}
2391
2392	cachep->colour_off = cache_line_size();
2393	/* Offset must be a multiple of the alignment. */
2394	if (cachep->colour_off < align)
2395		cachep->colour_off = align;
2396	cachep->colour = left_over / cachep->colour_off;
2397	cachep->slab_size = slab_size;
2398	cachep->flags = flags;
2399	cachep->gfpflags = 0;
2400	if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
2401		cachep->gfpflags |= GFP_DMA;
2402	cachep->buffer_size = size;
2403	cachep->reciprocal_buffer_size = reciprocal_value(size);
2404
2405	if (flags & CFLGS_OFF_SLAB) {
2406		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
2407		/*
2408		 * This is a possibility for one of the malloc_sizes caches.
2409		 * But since we go off slab only for object sizes of at least
2410		 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
2411		 * this should not happen at all.
2412		 * But leave a BUG_ON for some lucky dude.
2413		 */
2414		BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
2415	}
2416	cachep->ctor = ctor;
2417	cachep->name = name;
2418
2419	if (setup_cpu_cache(cachep, gfp)) {
2420		__kmem_cache_destroy(cachep);
2421		cachep = NULL;
2422		goto oops;
2423	}
2424
2425	/* cache setup completed, link it into the list */
2426	list_add(&cachep->next, &cache_chain);
2427oops:
2428	if (!cachep && (flags & SLAB_PANIC))
2429		panic("kmem_cache_create(): failed to create slab `%s'\n",
2430		      name);
2431	if (slab_is_available()) {
2432		mutex_unlock(&cache_chain_mutex);
2433		put_online_cpus();
2434	}
2435	return cachep;
2436}
2437EXPORT_SYMBOL(kmem_cache_create);
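/*
 * Example usage (illustrative sketch, not part of the original file; the
 * "struct foo" names are hypothetical):
 *
 *	static struct kmem_cache *foo_cachep;
 *
 *	static void foo_ctor(void *obj)
 *	{
 *		struct foo *f = obj;
 *
 *		spin_lock_init(&f->lock);
 *		INIT_LIST_HEAD(&f->list);
 *	}
 *
 *	foo_cachep = kmem_cache_create("foo", sizeof(struct foo), 0,
 *				       SLAB_HWCACHE_ALIGN, foo_ctor);
 *	if (!foo_cachep)
 *		return -ENOMEM;
 *
 * Passing SLAB_PANIC instead makes the failure path above panic rather than
 * return NULL.  As noted in the comment above, @ctor only runs when a new
 * page is added to the cache, so objects should be returned to
 * kmem_cache_free() in their constructed state.
 */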
2438
2439#if DEBUG
2440static void check_irq_off(void)
2441{
2442	BUG_ON(!irqs_disabled());
2443}
2444
2445static void check_irq_on(void)
2446{
2447	BUG_ON(irqs_disabled());
2448}
2449
2450static void check_spinlock_acquired(struct kmem_cache *cachep)
2451{
2452#ifdef CONFIG_SMP
2453	check_irq_off();
2454	assert_spin_locked(&cachep->nodelists[numa_mem_id()]->list_lock);
2455#endif
2456}
2457
2458static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
2459{
2460#ifdef CONFIG_SMP
2461	check_irq_off();
2462	assert_spin_locked(&cachep->nodelists[node]->list_lock);
2463#endif
2464}
2465
2466#else
2467#define check_irq_off()	do { } while(0)
2468#define check_irq_on()	do { } while(0)
2469#define check_spinlock_acquired(x) do { } while(0)
2470#define check_spinlock_acquired_node(x, y) do { } while(0)
2471#endif
2472
2473static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
2474			struct array_cache *ac,
2475			int force, int node);
2476
2477static void do_drain(void *arg)
2478{
2479	struct kmem_cache *cachep = arg;
2480	struct array_cache *ac;
2481	int node = numa_mem_id();
2482
2483	check_irq_off();
2484	ac = cpu_cache_get(cachep);
2485	spin_lock(&cachep->nodelists[node]->list_lock);
2486	free_block(cachep, ac->entry, ac->avail, node);
2487	spin_unlock(&cachep->nodelists[node]->list_lock);
2488	ac->avail = 0;
2489}
2490
2491static void drain_cpu_caches(struct kmem_cache *cachep)
2492{
2493	struct kmem_list3 *l3;
2494	int node;
2495
2496	on_each_cpu(do_drain, cachep, 1);
2497	check_irq_on();
2498	for_each_online_node(node) {
2499		l3 = cachep->nodelists[node];
2500		if (l3 && l3->alien)
2501			drain_alien_cache(cachep, l3->alien);
2502	}
2503
2504	for_each_online_node(node) {
2505		l3 = cachep->nodelists[node];
2506		if (l3)
2507			drain_array(cachep, l3, l3->shared, 1, node);
2508	}
2509}
2510
2511/*
2512 * Remove slabs from the list of free slabs.
2513 * Specify the number of slabs to drain in tofree.
2514 *
2515 * Returns the actual number of slabs released.
2516 */
2517static int drain_freelist(struct kmem_cache *cache,
2518			struct kmem_list3 *l3, int tofree)
2519{
2520	struct list_head *p;
2521	int nr_freed;
2522	struct slab *slabp;
2523
2524	nr_freed = 0;
2525	while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {
2526
2527		spin_lock_irq(&l3->list_lock);
2528		p = l3->slabs_free.prev;
2529		if (p == &l3->slabs_free) {
2530			spin_unlock_irq(&l3->list_lock);
2531			goto out;
2532		}
2533
2534		slabp = list_entry(p, struct slab, list);
2535#if DEBUG
2536		BUG_ON(slabp->inuse);
2537#endif
2538		list_del(&slabp->list);
2539		/*
2540		 * Safe to drop the lock. The slab is no longer linked
2541		 * to the cache.
2542		 */
2543		l3->free_objects -= cache->num;
2544		spin_unlock_irq(&l3->list_lock);
2545		slab_destroy(cache, slabp);
2546		nr_freed++;
2547	}
2548out:
2549	return nr_freed;
2550}
2551
2552/* Called with cache_chain_mutex held to protect against cpu hotplug */
2553static int __cache_shrink(struct kmem_cache *cachep)
2554{
2555	int ret = 0, i = 0;
2556	struct kmem_list3 *l3;
2557
2558	drain_cpu_caches(cachep);
2559
2560	check_irq_on();
2561	for_each_online_node(i) {
2562		l3 = cachep->nodelists[i];
2563		if (!l3)
2564			continue;
2565
2566		drain_freelist(cachep, l3, l3->free_objects);
2567
2568		ret += !list_empty(&l3->slabs_full) ||
2569			!list_empty(&l3->slabs_partial);
2570	}
2571	return (ret ? 1 : 0);
2572}
2573
2574/**
2575 * kmem_cache_shrink - Shrink a cache.
2576 * @cachep: The cache to shrink.
2577 *
2578 * Releases as many slabs as possible for a cache.
2579 * To help debugging, a zero exit status indicates all slabs were released.
2580 */
2581int kmem_cache_shrink(struct kmem_cache *cachep)
2582{
2583	int ret;
2584	BUG_ON(!cachep || in_interrupt());
2585
2586	get_online_cpus();
2587	mutex_lock(&cache_chain_mutex);
2588	ret = __cache_shrink(cachep);
2589	mutex_unlock(&cache_chain_mutex);
2590	put_online_cpus();
2591	return ret;
2592}
2593EXPORT_SYMBOL(kmem_cache_shrink);
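/*
 * Usage note (illustrative, not from the original source): after releasing
 * a large batch of objects a caller can hand the now-empty slabs back to
 * the page allocator explicitly, e.g.
 *
 *	if (kmem_cache_shrink(foo_cachep))
 *		pr_debug("foo cache still holds active objects\n");
 *
 * where foo_cachep is the hypothetical cache from the example above.
 */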
2594
2595/**
2596 * kmem_cache_destroy - delete a cache
2597 * @cachep: the cache to destroy
2598 *
2599 * Remove a &struct kmem_cache object from the slab cache.
2600 *
2601 * It is expected this function will be called by a module when it is
2602 * unloaded.  This will remove the cache completely, and avoid a duplicate
2603 * cache being allocated each time a module is loaded and unloaded, if the
2604 * module doesn't have persistent in-kernel storage across loads and unloads.
2605 *
2606 * The cache must be empty before calling this function.
2607 *
2608 * The caller must guarantee that no one will allocate memory from the cache
2609 * during the kmem_cache_destroy().
2610 */
2611void kmem_cache_destroy(struct kmem_cache *cachep)
2612{
2613	BUG_ON(!cachep || in_interrupt());
2614
2615	/* Find the cache in the chain of caches. */
2616	get_online_cpus();
2617	mutex_lock(&cache_chain_mutex);
2618	/*
2619	 * the chain is never empty, cache_cache is never destroyed
2620	 */
2621	list_del(&cachep->next);
2622	if (__cache_shrink(cachep)) {
2623		slab_error(cachep, "Can't free all objects");
2624		list_add(&cachep->next, &cache_chain);
2625		mutex_unlock(&cache_chain_mutex);
2626		put_online_cpus();
2627		return;
2628	}
2629
2630	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2631		rcu_barrier();
2632
2633	__kmem_cache_destroy(cachep);
2634	mutex_unlock(&cache_chain_mutex);
2635	put_online_cpus();
2636}
2637EXPORT_SYMBOL(kmem_cache_destroy);
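/*
 * Example teardown (illustrative sketch, not part of the original file;
 * continues the hypothetical foo_cachep example above):
 *
 *	static void __exit foo_exit(void)
 *	{
 *		kmem_cache_destroy(foo_cachep);
 *	}
 *
 * All objects must already have been returned with kmem_cache_free();
 * otherwise __cache_shrink() above fails, the cache is re-linked into the
 * chain and a "Can't free all objects" error is reported.
 */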
2638
2639/*
2640 * Get the memory for a slab management obj.
2641 * For a slab cache when the slab descriptor is off-slab, slab descriptors
2642 * always come from malloc_sizes caches.  The slab descriptor cannot
2643 * come from the same cache which is getting created because,
2644 * when we are searching for an appropriate cache for these
2645 * descriptors in kmem_cache_create, we search through the malloc_sizes array.
2646 * If we are creating a malloc_sizes cache here it would not be visible to
2647 * kmem_find_general_cachep till the initialization is complete.
2648 * Hence we cannot have slabp_cache same as the original cache.
2649 */
2650static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2651				   int colour_off, gfp_t local_flags,
2652				   int nodeid)
2653{
2654	struct slab *slabp;
2655
2656	if (OFF_SLAB(cachep)) {
2657		/* Slab management obj is off-slab. */
2658		slabp = kmem_cache_alloc_node(cachep->slabp_cache,
2659					      local_flags, nodeid);
2660		if (!slabp)
2661			return NULL;
2662		/*
2663		 * If the first object in the slab is leaked (it's allocated
2664		 * but no one has a reference to it), we want to make sure
2665		 * kmemleak does not treat the ->s_mem pointer as a reference
2666		 * to the object. Otherwise we will not report the leak.
2667		 */
2668		kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
2669				   local_flags);
2670	} else {
2671		slabp = objp + colour_off;
2672		colour_off += cachep->slab_size;
2673	}
2674	slabp->inuse = 0;
2675	slabp->colouroff = colour_off;
2676	slabp->s_mem = objp + colour_off;
2677	slabp->nodeid = nodeid;
2678	slabp->free = 0;
2679	return slabp;
2680}
2681
2682static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2683{
2684	return (kmem_bufctl_t *) (slabp + 1);
2685}
2686
2687static void cache_init_objs(struct kmem_cache *cachep,
2688			    struct slab *slabp)
2689{
2690	int i;
2691
2692	for (i = 0; i < cachep->num; i++) {
2693		void *objp = index_to_obj(cachep, slabp, i);
2694#if DEBUG
2695		/* need to poison the objs? */
2696		if (cachep->flags & SLAB_POISON)
2697			poison_obj(cachep, objp, POISON_FREE);
2698		if (cachep->flags & SLAB_STORE_USER)
2699			*dbg_userword(cachep, objp) = NULL;
2700
2701		if (cachep->flags & SLAB_RED_ZONE) {
2702			*dbg_redzone1(cachep, objp) = RED_INACTIVE;
2703			*dbg_redzone2(cachep, objp) = RED_INACTIVE;
2704		}
2705		/*
2706		 * Constructors are not allowed to allocate memory from the same
2707		 * cache which they are a constructor for.  Otherwise, deadlock.
2708		 * They must also be threaded.
2709		 */
2710		if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2711			cachep->ctor(objp + obj_offset(cachep));
2712
2713		if (cachep->flags & SLAB_RED_ZONE) {
2714			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2715				slab_error(cachep, "constructor overwrote the"
2716					   " end of an object");
2717			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2718				slab_error(cachep, "constructor overwrote the"
2719					   " start of an object");
2720		}
2721		if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
2722			    OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2723			kernel_map_pages(virt_to_page(objp),
2724					 cachep->buffer_size / PAGE_SIZE, 0);
2725#else
2726		if (cachep->ctor)
2727			cachep->ctor(objp);
2728#endif
2729		slab_bufctl(slabp)[i] = i + 1;
2730	}
2731	slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2732}
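/*
 * Illustrative note (added for clarity, not from the original source):
 * slab_bufctl() is the array of kmem_bufctl_t indices placed directly
 * behind struct slab; it encodes the free list by object index.  Right
 * after cache_init_objs() a 4-object slab looks like:
 *
 *	slabp->free = 0
 *	bufctl[]    = { 1, 2, 3, BUFCTL_END }
 *
 * slab_get_obj() below pops the head (free = bufctl[free]) and
 * slab_put_obj() pushes a freed index back onto the head.
 */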
2733
2734static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2735{
2736	if (CONFIG_ZONE_DMA_FLAG) {
2737		if (flags & GFP_DMA)
2738			BUG_ON(!(cachep->gfpflags & GFP_DMA));
2739		else
2740			BUG_ON(cachep->gfpflags & GFP_DMA);
2741	}
2742}
2743
2744static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
2745				int nodeid)
2746{
2747	void *objp = index_to_obj(cachep, slabp, slabp->free);
2748	kmem_bufctl_t next;
2749
2750	slabp->inuse++;
2751	next = slab_bufctl(slabp)[slabp->free];
2752#if DEBUG
2753	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2754	WARN_ON(slabp->nodeid != nodeid);
2755#endif
2756	slabp->free = next;
2757
2758	return objp;
2759}
2760
2761static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2762				void *objp, int nodeid)
2763{
2764	unsigned int objnr = obj_to_index(cachep, slabp, objp);
2765
2766#if DEBUG
2767	/* Verify that the slab belongs to the intended node */
2768	WARN_ON(slabp->nodeid != nodeid);
2769
2770	if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
2771		printk(KERN_ERR "slab: double free detected in cache "
2772				"'%s', objp %p\n", cachep->name, objp);
2773		BUG();
2774	}
2775#endif
2776	slab_bufctl(slabp)[objnr] = slabp->free;
2777	slabp->free = objnr;
2778	slabp->inuse--;
2779}
2780
2781/*
2782 * Map pages beginning at addr to the given cache and slab. This is required
2783 * for the slab allocator to be able to lookup the cache and slab of a
2784 * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
2785 */
2786static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2787			   void *addr)
2788{
2789	int nr_pages;
2790	struct page *page;
2791
2792	page = virt_to_page(addr);
2793
2794	nr_pages = 1;
2795	if (likely(!PageCompound(page)))
2796		nr_pages <<= cache->gfporder;
2797
2798	do {
2799		page_set_cache(page, cache);
2800		page_set_slab(page, slab);
2801		page++;
2802	} while (--nr_pages);
2803}
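/*
 * Illustrative note (added for clarity, not from the original source): with
 * this mapping in place, kfree() and kmem_cache_free() can go from an
 * object pointer back to its cache and slab without any search:
 *
 *	page  = virt_to_head_page(objp);
 *	cache = page_get_cache(page);
 *	slabp = page_get_slab(page);
 *
 * which is exactly what the virt_to_cache()/virt_to_slab() helpers do.
 */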
2804
2805/*
2806 * Grow (by 1) the number of slabs within a cache.  This is called by
2807 * kmem_cache_alloc() when there are no active objs left in a cache.
2808 */
2809static int cache_grow(struct kmem_cache *cachep,
2810		gfp_t flags, int nodeid, void *objp)
2811{
2812	struct slab *slabp;
2813	size_t offset;
2814	gfp_t local_flags;
2815	struct kmem_list3 *l3;
2816
2817	/*
2818	 * Be lazy and only check for valid flags here,  keeping it out of the
2819	 * critical path in kmem_cache_alloc().
2820	 */
2821	BUG_ON(flags & GFP_SLAB_BUG_MASK);
2822	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
2823
2824	/* Take the l3 list lock to change the colour_next on this node */
2825	check_irq_off();
2826	l3 = cachep->nodelists[nodeid];
2827	spin_lock(&l3->list_lock);
2828
2829	/* Get colour for the slab, and calculate the next value. */
2830	offset = l3->colour_next;
2831	l3->colour_next++;
2832	if (l3->colour_next >= cachep->colour)
2833		l3->colour_next = 0;
2834	spin_unlock(&l3->list_lock);
2835
2836	offset *= cachep->colour_off;
2837
2838	if (local_flags & __GFP_WAIT)
2839		local_irq_enable();
2840
2841	/*
2842	 * The test for missing atomic flag is performed here, rather than
2843	 * the more obvious place, simply to reduce the critical path length
2844	 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
2845	 * will eventually be caught here (where it matters).
2846	 */
2847	kmem_flagcheck(cachep, flags);
2848
2849	/*
2850	 * Get mem for the objs.  Attempt to allocate a physical page from
2851	 * 'nodeid'.
2852	 */
2853	if (!objp)
2854		objp = kmem_getpages(cachep, local_flags, nodeid);
2855	if (!objp)
2856		goto failed;
2857
2858	/* Get slab management. */
2859	slabp = alloc_slabmgmt(cachep, objp, offset,
2860			local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
2861	if (!slabp)
2862		goto opps1;
2863
2864	slab_map_pages(cachep, slabp, objp);
2865
2866	cache_init_objs(cachep, slabp);
2867
2868	if (local_flags & __GFP_WAIT)
2869		local_irq_disable();
2870	check_irq_off();
2871	spin_lock(&l3->list_lock);
2872
2873	/* Make slab active. */
2874	list_add_tail(&slabp->list, &(l3->slabs_free));
2875	STATS_INC_GROWN(cachep);
2876	l3->free_objects += cachep->num;
2877	spin_unlock(&l3->list_lock);
2878	return 1;
2879opps1:
2880	kmem_freepages(cachep, objp);
2881failed:
2882	if (local_flags & __GFP_WAIT)
2883		local_irq_disable();
2884	return 0;
2885}
2886
2887#if DEBUG
2888
2889/*
2890 * Perform extra freeing checks:
2891 * - detect bad pointers.
2892 * - POISON/RED_ZONE checking
2893 */
2894static void kfree_debugcheck(const void *objp)
2895{
2896	if (!virt_addr_valid(objp)) {
2897		printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2898		       (unsigned long)objp);
2899		BUG();
2900	}
2901}
2902
2903static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
2904{
2905	unsigned long long redzone1, redzone2;
2906
2907	redzone1 = *dbg_redzone1(cache, obj);
2908	redzone2 = *dbg_redzone2(cache, obj);
2909
2910	/*
2911	 * Redzone is ok.
2912	 */
2913	if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
2914		return;
2915
2916	if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
2917		slab_error(cache, "double free detected");
2918	else
2919		slab_error(cache, "memory outside object was overwritten");
2920
2921	printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n",
2922			obj, redzone1, redzone2);
2923}
2924
2925static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2926				   void *caller)
2927{
2928	struct page *page;
2929	unsigned int objnr;
2930	struct slab *slabp;
2931
2932	BUG_ON(virt_to_cache(objp) != cachep);
2933
2934	objp -= obj_offset(cachep);
2935	kfree_debugcheck(objp);
2936	page = virt_to_head_page(objp);
2937
2938	slabp = page_get_slab(page);
2939
2940	if (cachep->flags & SLAB_RED_ZONE) {
2941		verify_redzone_free(cachep, objp);
2942		*dbg_redzone1(cachep, objp) = RED_INACTIVE;
2943		*dbg_redzone2(cachep, objp) = RED_INACTIVE;
2944	}
2945	if (cachep->flags & SLAB_STORE_USER)
2946		*dbg_userword(cachep, objp) = caller;
2947
2948	objnr = obj_to_index(cachep, slabp, objp);
2949
2950	BUG_ON(objnr >= cachep->num);
2951	BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
2952
2953#ifdef CONFIG_DEBUG_SLAB_LEAK
2954	slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
2955#endif
2956	if (cachep->flags & SLAB_POISON) {
2957#ifdef CONFIG_DEBUG_PAGEALLOC
2958		if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2959			store_stackinfo(cachep, objp, (unsigned long)caller);
2960			kernel_map_pages(virt_to_page(objp),
2961					 cachep->buffer_size / PAGE_SIZE, 0);
2962		} else {
2963			poison_obj(cachep, objp, POISON_FREE);
2964		}
2965#else
2966		poison_obj(cachep, objp, POISON_FREE);
2967#endif
2968	}
2969	return objp;
2970}
2971
2972static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
2973{
2974	kmem_bufctl_t i;
2975	int entries = 0;
2976
2977	/* Check slab's freelist to see if this obj is there. */
2978	for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
2979		entries++;
2980		if (entries > cachep->num || i >= cachep->num)
2981			goto bad;
2982	}
2983	if (entries != cachep->num - slabp->inuse) {
2984bad:
2985		printk(KERN_ERR "slab: Internal list corruption detected in "
2986				"cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2987			cachep->name, cachep->num, slabp, slabp->inuse);
2988		for (i = 0;
2989		     i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
2990		     i++) {
2991			if (i % 16 == 0)
2992				printk("\n%03x:", i);
2993			printk(" %02x", ((unsigned char *)slabp)[i]);
2994		}
2995		printk("\n");
2996		BUG();
2997	}
2998}
2999#else
3000#define kfree_debugcheck(x) do { } while(0)
3001#define cache_free_debugcheck(x,objp,z) (objp)
3002#define check_slabp(x,y) do { } while(0)
3003#endif
3004
3005static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
3006{
3007	int batchcount;
3008	struct kmem_list3 *l3;
3009	struct array_cache *ac;
3010	int node;
3011
3012retry:
3013	check_irq_off();
3014	node = numa_mem_id();
3015	ac = cpu_cache_get(cachep);
3016	batchcount = ac->batchcount;
3017	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
3018		/*
3019		 * If there was little recent activity on this cache, then
3020		 * perform only a partial refill.  Otherwise we could generate
3021		 * refill bouncing.
3022		 */
3023		batchcount = BATCHREFILL_LIMIT;
3024	}
3025	l3 = cachep->nodelists[node];
3026
3027	BUG_ON(ac->avail > 0 || !l3);
3028	spin_lock(&l3->list_lock);
3029
3030	/* See if we can refill from the shared array */
3031	if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
3032		l3->shared->touched = 1;
3033		goto alloc_done;
3034	}
3035
3036	while (batchcount > 0) {
3037		struct list_head *entry;
3038		struct slab *slabp;
3039		/* Get the slab the allocation is to come from. */
3040		entry = l3->slabs_partial.next;
3041		if (entry == &l3->slabs_partial) {
3042			l3->free_touched = 1;
3043			entry = l3->slabs_free.next;
3044			if (entry == &l3->slabs_free)
3045				goto must_grow;
3046		}
3047
3048		slabp = list_entry(entry, struct slab, list);
3049		check_slabp(cachep, slabp);
3050		check_spinlock_acquired(cachep);
3051
3052		/*
3053		 * The slab was either on partial or free list so
3054		 * there must be at least one object available for
3055		 * allocation.
3056		 */
3057		BUG_ON(slabp->inuse >= cachep->num);
3058
3059		while (slabp->inuse < cachep->num && batchcount--) {
3060			STATS_INC_ALLOCED(cachep);
3061			STATS_INC_ACTIVE(cachep);
3062			STATS_SET_HIGH(cachep);
3063
3064			ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
3065							    node);
3066		}
3067		check_slabp(cachep, slabp);
3068
3069		/* move slabp to correct slabp list: */
3070		list_del(&slabp->list);
3071		if (slabp->free == BUFCTL_END)
3072			list_add(&slabp->list, &l3->slabs_full);
3073		else
3074			list_add(&slabp->list, &l3->slabs_partial);
3075	}
3076
3077must_grow:
3078	l3->free_objects -= ac->avail;
3079alloc_done:
3080	spin_unlock(&l3->list_lock);
3081
3082	if (unlikely(!ac->avail)) {
3083		int x;
3084		x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
3085
3086		/* cache_grow can reenable interrupts, then ac could change. */
3087		ac = cpu_cache_get(cachep);
3088		if (!x && ac->avail == 0)	/* no objects in sight? abort */
3089			return NULL;
3090
3091		if (!ac->avail)		/* objects refilled by interrupt? */
3092			goto retry;
3093	}
3094	ac->touched = 1;
3095	return ac->entry[--ac->avail];
3096}
3097
3098static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
3099						gfp_t flags)
3100{
3101	might_sleep_if(flags & __GFP_WAIT);
3102#if DEBUG
3103	kmem_flagcheck(cachep, flags);
3104#endif
3105}
3106
3107#if DEBUG
3108static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3109				gfp_t flags, void *objp, void *caller)
3110{
3111	if (!objp)
3112		return objp;
3113	if (cachep->flags & SLAB_POISON) {
3114#ifdef CONFIG_DEBUG_PAGEALLOC
3115		if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
3116			kernel_map_pages(virt_to_page(objp),
3117					 cachep->buffer_size / PAGE_SIZE, 1);
3118		else
3119			check_poison_obj(cachep, objp);
3120#else
3121		check_poison_obj(cachep, objp);
3122#endif
3123		poison_obj(cachep, objp, POISON_INUSE);
3124	}
3125	if (cachep->flags & SLAB_STORE_USER)
3126		*dbg_userword(cachep, objp) = caller;
3127
3128	if (cachep->flags & SLAB_RED_ZONE) {
3129		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
3130				*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
3131			slab_error(cachep, "double free, or memory outside"
3132						" object was overwritten");
3133			printk(KERN_ERR
3134				"%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
3135				objp, *dbg_redzone1(cachep, objp),
3136				*dbg_redzone2(cachep, objp));
3137		}
3138		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
3139		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
3140	}
3141#ifdef CONFIG_DEBUG_SLAB_LEAK
3142	{
3143		struct slab *slabp;
3144		unsigned objnr;
3145
3146		slabp = page_get_slab(virt_to_head_page(objp));
3147		objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
3148		slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
3149	}
3150#endif
3151	objp += obj_offset(cachep);
3152	if (cachep->ctor && cachep->flags & SLAB_POISON)
3153		cachep->ctor(objp);
3154#if ARCH_SLAB_MINALIGN
3155	if ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1)) {
3156		printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
3157		       objp, ARCH_SLAB_MINALIGN);
3158	}
3159#endif
3160	return objp;
3161}
3162#else
3163#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
3164#endif
3165
3166static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
3167{
3168	if (cachep == &cache_cache)
3169		return false;
3170
3171	return should_failslab(obj_size(cachep), flags, cachep->flags);
3172}
3173
3174static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3175{
3176	void *objp;
3177	struct array_cache *ac;
3178
3179	check_irq_off();
3180
3181	ac = cpu_cache_get(cachep);
3182	if (likely(ac->avail)) {
3183		STATS_INC_ALLOCHIT(cachep);
3184		ac->touched = 1;
3185		objp = ac->entry[--ac->avail];
3186	} else {
3187		STATS_INC_ALLOCMISS(cachep);
3188		objp = cache_alloc_refill(cachep, flags);
3189		/*
3190		 * the 'ac' may be updated by cache_alloc_refill(),
3191		 * and kmemleak_erase() requires its correct value.
3192		 */
3193		ac = cpu_cache_get(cachep);
3194	}
3195	/*
3196	 * To avoid a false negative, if an object that is in one of the
3197	 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
3198	 * treat the array pointers as a reference to the object.
3199	 */
3200	if (objp)
3201		kmemleak_erase(&ac->entry[ac->avail]);
3202	return objp;
3203}
3204
3205#ifdef CONFIG_NUMA
3206/*
3207 * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
3208 *
3209 * If we are in_interrupt, then process context, including cpusets and
3210 * mempolicy, may not apply and should not be used for allocation policy.
3211 */
3212static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3213{
3214	int nid_alloc, nid_here;
3215
3216	if (in_interrupt() || (flags & __GFP_THISNODE))
3217		return NULL;
3218	nid_alloc = nid_here = numa_mem_id();
3219	get_mems_allowed();
3220	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3221		nid_alloc = cpuset_slab_spread_node();
3222	else if (current->mempolicy)
3223		nid_alloc = slab_node(current->mempolicy);
3224	put_mems_allowed();
3225	if (nid_alloc != nid_here)
3226		return ____cache_alloc_node(cachep, flags, nid_alloc);
3227	return NULL;
3228}
3229
3230/*
3231 * Fallback function if there was no memory available and no objects on a
3232 * certain node and fall back is permitted. First we scan all the
3233 * available nodelists for available objects. If that fails then we
3234 * perform an allocation without specifying a node. This allows the page
3235 * allocator to do its reclaim / fallback magic. We then insert the
3236 * slab into the proper nodelist and then allocate from it.
3237 */
3238static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3239{
3240	struct zonelist *zonelist;
3241	gfp_t local_flags;
3242	struct zoneref *z;
3243	struct zone *zone;
3244	enum zone_type high_zoneidx = gfp_zone(flags);
3245	void *obj = NULL;
3246	int nid;
3247
3248	if (flags & __GFP_THISNODE)
3249		return NULL;
3250
3251	get_mems_allowed();
3252	zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3253	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3254
3255retry:
3256	/*
3257	 * Look through allowed nodes for objects available
3258	 * from existing per node queues.
3259	 */
3260	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
3261		nid = zone_to_nid(zone);
3262
3263		if (cpuset_zone_allowed_hardwall(zone, flags) &&
3264			cache->nodelists[nid] &&
3265			cache->nodelists[nid]->free_objects) {
3266				obj = ____cache_alloc_node(cache,
3267					flags | GFP_THISNODE, nid);
3268				if (obj)
3269					break;
3270		}
3271	}
3272
3273	if (!obj) {
3274		/*
3275		 * This allocation will be performed within the constraints
3276		 * of the current cpuset / memory policy requirements.
3277		 * We may trigger various forms of reclaim on the allowed
3278		 * set and go into memory reserves if necessary.
3279		 */
3280		if (local_flags & __GFP_WAIT)
3281			local_irq_enable();
3282		kmem_flagcheck(cache, flags);
3283		obj = kmem_getpages(cache, local_flags, numa_mem_id());
3284		if (local_flags & __GFP_WAIT)
3285			local_irq_disable();
3286		if (obj) {
3287			/*
3288			 * Insert into the appropriate per node queues
3289			 */
3290			nid = page_to_nid(virt_to_page(obj));
3291			if (cache_grow(cache, flags, nid, obj)) {
3292				obj = ____cache_alloc_node(cache,
3293					flags | GFP_THISNODE, nid);
3294				if (!obj)
3295					/*
3296					 * Another processor may allocate the
3297					 * objects in the slab since we are
3298					 * not holding any locks.
3299					 */
3300					goto retry;
3301			} else {
3302				/* cache_grow already freed obj */
3303				obj = NULL;
3304			}
3305		}
3306	}
3307	put_mems_allowed();
3308	return obj;
3309}
3310
3311/*
3312 * An interface to enable slab creation on nodeid
3313 */
3314static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3315				int nodeid)
3316{
3317	struct list_head *entry;
3318	struct slab *slabp;
3319	struct kmem_list3 *l3;
3320	void *obj;
3321	int x;
3322
3323	l3 = cachep->nodelists[nodeid];
3324	BUG_ON(!l3);
3325
3326retry:
3327	check_irq_off();
3328	spin_lock(&l3->list_lock);
3329	entry = l3->slabs_partial.next;
3330	if (entry == &l3->slabs_partial) {
3331		l3->free_touched = 1;
3332		entry = l3->slabs_free.next;
3333		if (entry == &l3->slabs_free)
3334			goto must_grow;
3335	}
3336
3337	slabp = list_entry(entry, struct slab, list);
3338	check_spinlock_acquired_node(cachep, nodeid);
3339	check_slabp(cachep, slabp);
3340
3341	STATS_INC_NODEALLOCS(cachep);
3342	STATS_INC_ACTIVE(cachep);
3343	STATS_SET_HIGH(cachep);
3344
3345	BUG_ON(slabp->inuse == cachep->num);
3346
3347	obj = slab_get_obj(cachep, slabp, nodeid);
3348	check_slabp(cachep, slabp);
3349	l3->free_objects--;
3350	/* move slabp to correct slabp list: */
3351	list_del(&slabp->list);
3352
3353	if (slabp->free == BUFCTL_END)
3354		list_add(&slabp->list, &l3->slabs_full);
3355	else
3356		list_add(&slabp->list, &l3->slabs_partial);
3357
3358	spin_unlock(&l3->list_lock);
3359	goto done;
3360
3361must_grow:
3362	spin_unlock(&l3->list_lock);
3363	x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
3364	if (x)
3365		goto retry;
3366
3367	return fallback_alloc(cachep, flags);
3368
3369done:
3370	return obj;
3371}
3372
3373/**
3374 * kmem_cache_alloc_node - Allocate an object on the specified node
3375 * @cachep: The cache to allocate from.
3376 * @flags: See kmalloc().
3377 * @nodeid: node number of the target node.
3378 * @caller: return address of caller, used for debug information
3379 *
3380 * Identical to kmem_cache_alloc but it will allocate memory on the given
3381 * node, which can improve the performance for cpu bound structures.
3382 *
3383 * Fallback to other node is possible if __GFP_THISNODE is not set.
3384 */
3385static __always_inline void *
3386__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3387		   void *caller)
3388{
3389	unsigned long save_flags;
3390	void *ptr;
3391	int slab_node = numa_mem_id();
3392
3393	flags &= gfp_allowed_mask;
3394
3395	lockdep_trace_alloc(flags);
3396
3397	if (slab_should_failslab(cachep, flags))
3398		return NULL;
3399
3400	cache_alloc_debugcheck_before(cachep, flags);
3401	local_irq_save(save_flags);
3402
3403	if (nodeid == -1)
3404		nodeid = slab_node;
3405
3406	if (unlikely(!cachep->nodelists[nodeid])) {
3407		/* Node not bootstrapped yet */
3408		ptr = fallback_alloc(cachep, flags);
3409		goto out;
3410	}
3411
3412	if (nodeid == slab_node) {
3413		/*
3414		 * Use the locally cached objects if possible.
3415		 * However ____cache_alloc does not allow fallback
3416		 * to other nodes. It may fail while we still have
3417		 * objects on other nodes available.
3418		 */
3419		ptr = ____cache_alloc(cachep, flags);
3420		if (ptr)
3421			goto out;
3422	}
3423	/* ___cache_alloc_node can fall back to other nodes */
3424	ptr = ____cache_alloc_node(cachep, flags, nodeid);
3425  out:
3426	local_irq_restore(save_flags);
3427	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3428	kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags,
3429				 flags);
3430
3431	if (likely(ptr))
3432		kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep));
3433
3434	if (unlikely((flags & __GFP_ZERO) && ptr))
3435		memset(ptr, 0, obj_size(cachep));
3436
3437	return ptr;
3438}
3439
3440static __always_inline void *
3441__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
3442{
3443	void *objp;
3444
3445	if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
3446		objp = alternate_node_alloc(cache, flags);
3447		if (objp)
3448			goto out;
3449	}
3450	objp = ____cache_alloc(cache, flags);
3451
3452	/*
3453	 * We may just have run out of memory on the local node.
3454	 * ____cache_alloc_node() knows how to locate memory on other nodes
3455	 */
3456	if (!objp)
3457		objp = ____cache_alloc_node(cache, flags, numa_mem_id());
3458
3459  out:
3460	return objp;
3461}
3462#else
3463
3464static __always_inline void *
3465__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3466{
3467	return ____cache_alloc(cachep, flags);
3468}
3469
3470#endif /* CONFIG_NUMA */
3471
3472static __always_inline void *
3473__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3474{
3475	unsigned long save_flags;
3476	void *objp;
3477
3478	flags &= gfp_allowed_mask;
3479
3480	lockdep_trace_alloc(flags);
3481
3482	if (slab_should_failslab(cachep, flags))
3483		return NULL;
3484
3485	cache_alloc_debugcheck_before(cachep, flags);
3486	local_irq_save(save_flags);
3487	objp = __do_cache_alloc(cachep, flags);
3488	local_irq_restore(save_flags);
3489	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3490	kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags,
3491				 flags);
3492	prefetchw(objp);
3493
3494	if (likely(objp))
3495		kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep));
3496
3497	if (unlikely((flags & __GFP_ZERO) && objp))
3498		memset(objp, 0, obj_size(cachep));
3499
3500	return objp;
3501}
3502
3503/*
3504 * Caller needs to acquire correct kmem_list's list_lock
3505 */
3506static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3507		       int node)
3508{
3509	int i;
3510	struct kmem_list3 *l3;
3511
3512	for (i = 0; i < nr_objects; i++) {
3513		void *objp = objpp[i];
3514		struct slab *slabp;
3515
3516		slabp = virt_to_slab(objp);
3517		l3 = cachep->nodelists[node];
3518		list_del(&slabp->list);
3519		check_spinlock_acquired_node(cachep, node);
3520		check_slabp(cachep, slabp);
3521		slab_put_obj(cachep, slabp, objp, node);
3522		STATS_DEC_ACTIVE(cachep);
3523		l3->free_objects++;
3524		check_slabp(cachep, slabp);
3525
3526		/* fixup slab chains */
3527		if (slabp->inuse == 0) {
3528			if (l3->free_objects > l3->free_limit) {
3529				l3->free_objects -= cachep->num;
3530				/* No need to drop any previously held
3531				 * lock here; even if we have an off-slab slab
3532				 * descriptor, it is guaranteed to come from
3533				 * a different cache (refer to the comments before
3534				 * alloc_slabmgmt).
3535				 */
3536				slab_destroy(cachep, slabp);
3537			} else {
3538				list_add(&slabp->list, &l3->slabs_free);
3539			}
3540		} else {
3541			/* Unconditionally move a slab to the end of the
3542			 * partial list on free - maximum time for the
3543			 * other objects to be freed, too.
3544			 */
3545			list_add_tail(&slabp->list, &l3->slabs_partial);
3546		}
3547	}
3548}
3549
3550static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3551{
3552	int batchcount;
3553	struct kmem_list3 *l3;
3554	int node = numa_mem_id();
3555
3556	batchcount = ac->batchcount;
3557#if DEBUG
3558	BUG_ON(!batchcount || batchcount > ac->avail);
3559#endif
3560	check_irq_off();
3561	l3 = cachep->nodelists[node];
3562	spin_lock(&l3->list_lock);
3563	if (l3->shared) {
3564		struct array_cache *shared_array = l3->shared;
3565		int max = shared_array->limit - shared_array->avail;
3566		if (max) {
3567			if (batchcount > max)
3568				batchcount = max;
3569			memcpy(&(shared_array->entry[shared_array->avail]),
3570			       ac->entry, sizeof(void *) * batchcount);
3571			shared_array->avail += batchcount;
3572			goto free_done;
3573		}
3574	}
3575
3576	free_block(cachep, ac->entry, batchcount, node);
3577free_done:
3578#if STATS
3579	{
3580		int i = 0;
3581		struct list_head *p;
3582
3583		p = l3->slabs_free.next;
3584		while (p != &(l3->slabs_free)) {
3585			struct slab *slabp;
3586
3587			slabp = list_entry(p, struct slab, list);
3588			BUG_ON(slabp->inuse);
3589
3590			i++;
3591			p = p->next;
3592		}
3593		STATS_SET_FREEABLE(cachep, i);
3594	}
3595#endif
3596	spin_unlock(&l3->list_lock);
3597	ac->avail -= batchcount;
3598	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
3599}
3600
3601/*
3602 * Release an obj back to its cache. If the obj has a constructed state, it must
3603 * be in this state _before_ it is released.  Called with disabled ints.
3604 */
3605static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3606{
3607	struct array_cache *ac = cpu_cache_get(cachep);
3608
3609	check_irq_off();
3610	kmemleak_free_recursive(objp, cachep->flags);
3611	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3612
3613	kmemcheck_slab_free(cachep, objp, obj_size(cachep));
3614
3615	/*
3616	 * Skip calling cache_free_alien() when the platform is not numa.
3617	 * This avoids the cache misses that happen while accessing slabp (a
3618	 * per-page memory reference) to get the nodeid. Instead use a global
3619	 * variable to skip the call, which is most likely to be present in
3620	 * the cache.
3621	 */
3622	if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
3623		return;
3624
3625	if (likely(ac->avail < ac->limit)) {
3626		STATS_INC_FREEHIT(cachep);
3627		ac->entry[ac->avail++] = objp;
3628		return;
3629	} else {
3630		STATS_INC_FREEMISS(cachep);
3631		cache_flusharray(cachep, ac);
3632		ac->entry[ac->avail++] = objp;
3633	}
3634}
3635
3636/**
3637 * kmem_cache_alloc - Allocate an object
3638 * @cachep: The cache to allocate from.
3639 * @flags: See kmalloc().
3640 *
3641 * Allocate an object from this cache.  The flags are only relevant
3642 * if the cache has no available objects.
3643 */
3644void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3645{
3646	void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
3647
3648	trace_kmem_cache_alloc(_RET_IP_, ret,
3649			       obj_size(cachep), cachep->buffer_size, flags);
3650
3651	return ret;
3652}
3653EXPORT_SYMBOL(kmem_cache_alloc);
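/*
 * Example usage (illustrative sketch, not part of the original file;
 * continues the hypothetical foo_cachep example above):
 *
 *	struct foo *f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *
 *	if (!f)
 *		return -ENOMEM;
 *	...
 *	kmem_cache_free(foo_cachep, f);
 *
 * GFP_ATOMIC is used from contexts that must not sleep; __GFP_ZERO (or the
 * kmem_cache_zalloc() wrapper) returns zeroed memory, handled at the end of
 * __cache_alloc() above.
 */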
3654
3655#ifdef CONFIG_TRACING
3656void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags)
3657{
3658	return __cache_alloc(cachep, flags, __builtin_return_address(0));
3659}
3660EXPORT_SYMBOL(kmem_cache_alloc_notrace);
3661#endif
3662
3663/**
3664 * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
3665 * @cachep: the cache we're checking against
3666 * @ptr: pointer to validate
3667 *
3668 * This verifies that the untrusted pointer looks sane;
3669 * it is _not_ a guarantee that the pointer is actually
3670 * part of the slab cache in question, but it at least
3671 * validates that the pointer can be dereferenced and
3672 * looks half-way sane.
3673 *
3674 * Currently only used for dentry validation.
3675 */
3676int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
3677{
3678	unsigned long size = cachep->buffer_size;
3679	struct page *page;
3680
3681	if (unlikely(!kern_ptr_validate(ptr, size)))
3682		goto out;
3683	page = virt_to_page(ptr);
3684	if (unlikely(!PageSlab(page)))
3685		goto out;
3686	if (unlikely(page_get_cache(page) != cachep))
3687		goto out;
3688	return 1;
3689out:
3690	return 0;
3691}
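/*
 * Usage note (illustrative sketch based on the dentry use mentioned above,
 * not from the original source): a caller holding an unverified pointer can
 * use this as a cheap plausibility filter before dereferencing it, e.g.
 *
 *	if (!kmem_ptr_validate(dentry_cache, dentry))
 *		goto out;
 *
 * A nonzero return is only a sanity check, not proof that the object is
 * currently allocated from that cache.
 */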
3692
3693#ifdef CONFIG_NUMA
3694void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3695{
3696	void *ret = __cache_alloc_node(cachep, flags, nodeid,
3697				       __builtin_return_address(0));
3698
3699	trace_kmem_cache_alloc_node(_RET_IP_, ret,
3700				    obj_size(cachep), cachep->buffer_size,
3701				    flags, nodeid);
3702
3703	return ret;
3704}
3705EXPORT_SYMBOL(kmem_cache_alloc_node);
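/*
 * Example usage (illustrative sketch, not part of the original file; the
 * per-node pointer array is hypothetical):
 *
 *	for_each_online_node(node) {
 *		foo_per_node[node] = kmem_cache_alloc_node(foo_cachep,
 *							   GFP_KERNEL, node);
 *		if (!foo_per_node[node])
 *			goto fail;
 *	}
 *
 * Passing nodeid == -1 (see __cache_alloc_node() above) simply uses the
 * local node, and other nodes may still be used as a fallback unless
 * __GFP_THISNODE is set.
 */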
3706
3707#ifdef CONFIG_TRACING
3708void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep,
3709				    gfp_t flags,
3710				    int nodeid)
3711{
3712	return __cache_alloc_node(cachep, flags, nodeid,
3713				  __builtin_return_address(0));
3714}
3715EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
3716#endif
3717
3718static __always_inline void *
3719__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
3720{
3721	struct kmem_cache *cachep;
3722	void *ret;
3723
3724	cachep = kmem_find_general_cachep(size, flags);
3725	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3726		return cachep;
3727	ret = kmem_cache_alloc_node_notrace(cachep, flags, node);
3728
3729	trace_kmalloc_node((unsigned long) caller, ret,
3730			   size, cachep->buffer_size, flags, node);
3731
3732	return ret;
3733}
3734
3735#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3736void *__kmalloc_node(size_t size, gfp_t flags, int node)
3737{
3738	return __do_kmalloc_node(size, flags, node,
3739			__builtin_return_address(0));
3740}
3741EXPORT_SYMBOL(__kmalloc_node);
3742
3743void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
3744		int node, unsigned long caller)
3745{
3746	return __do_kmalloc_node(size, flags, node, (void *)caller);
3747}
3748EXPORT_SYMBOL(__kmalloc_node_track_caller);
3749#else
3750void *__kmalloc_node(size_t size, gfp_t flags, int node)
3751{
3752	return __do_kmalloc_node(size, flags, node, NULL);
3753}
3754EXPORT_SYMBOL(__kmalloc_node);
3755#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
3756#endif /* CONFIG_NUMA */
3757
3758/**
3759 * __do_kmalloc - allocate memory
3760 * @size: how many bytes of memory are required.
3761 * @flags: the type of memory to allocate (see kmalloc).
3762 * @caller: function caller for debug tracking of the caller
3763 */
3764static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3765					  void *caller)
3766{
3767	struct kmem_cache *cachep;
3768	void *ret;
3769
3770	/* To save a few bytes of .text space, replace the
3771	 * __ prefix with kmem_:
3772	 * kmalloc then uses the uninlined functions instead of
3773	 * the inline functions.
3774	 */
3775	cachep = __find_general_cachep(size, flags);
3776	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3777		return cachep;
3778	ret = __cache_alloc(cachep, flags, caller);
3779
3780	trace_kmalloc((unsigned long) caller, ret,
3781		      size, cachep->buffer_size, flags);
3782
3783	return ret;
3784}
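/*
 * The @caller argument lets wrappers attribute allocations to their own
 * caller.  A sketch (my_strdup is a hypothetical helper; real users go
 * through the kmalloc_track_caller() wrapper in <linux/slab.h> rather
 * than calling __kmalloc_track_caller() directly):
 *
 *	char *my_strdup(const char *s, gfp_t gfp)
 *	{
 *		size_t len = strlen(s) + 1;
 *		char *p = kmalloc_track_caller(len, gfp);
 *
 *		if (p)
 *			memcpy(p, s, len);
 *		return p;
 *	}
 *
 * With slab debugging enabled, allocations are then accounted to
 * my_strdup()'s caller instead of to my_strdup() itself.
 */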
3785
3786
3787#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3788void *__kmalloc(size_t size, gfp_t flags)
3789{
3790	return __do_kmalloc(size, flags, __builtin_return_address(0));
3791}
3792EXPORT_SYMBOL(__kmalloc);
3793
3794void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
3795{
3796	return __do_kmalloc(size, flags, (void *)caller);
3797}
3798EXPORT_SYMBOL(__kmalloc_track_caller);
3799
3800#else
3801void *__kmalloc(size_t size, gfp_t flags)
3802{
3803	return __do_kmalloc(size, flags, NULL);
3804}
3805EXPORT_SYMBOL(__kmalloc);
3806#endif
3807
3808/**
3809 * kmem_cache_free - Deallocate an object
3810 * @cachep: The cache the allocation was from.
3811 * @objp: The previously allocated object.
3812 *
3813 * Free an object which was previously allocated from this
3814 * cache.
3815 */
3816void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3817{
3818	unsigned long flags;
3819
3820	local_irq_save(flags);
3821	debug_check_no_locks_freed(objp, obj_size(cachep));
3822	if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3823		debug_check_no_obj_freed(objp, obj_size(cachep));
3824	__cache_free(cachep, objp);
3825	local_irq_restore(flags);
3826
3827	trace_kmem_cache_free(_RET_IP_, objp);
3828}
3829EXPORT_SYMBOL(kmem_cache_free);
3830
3831/**
3832 * kfree - free previously allocated memory
3833 * @objp: pointer returned by kmalloc.
3834 *
3835 * If @objp is NULL, no operation is performed.
3836 *
3837 * Don't free memory not originally allocated by kmalloc()
3838 * or you will run into trouble.
3839 */
3840void kfree(const void *objp)
3841{
3842	struct kmem_cache *c;
3843	unsigned long flags;
3844
3845	trace_kfree(_RET_IP_, objp);
3846
3847	if (unlikely(ZERO_OR_NULL_PTR(objp)))
3848		return;
3849	local_irq_save(flags);
3850	kfree_debugcheck(objp);
3851	c = virt_to_cache(objp);
3852	debug_check_no_locks_freed(objp, obj_size(c));
3853	debug_check_no_obj_freed(objp, obj_size(c));
3854	__cache_free(c, (void *)objp);
3855	local_irq_restore(flags);
3856}
3857EXPORT_SYMBOL(kfree);
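/*
 * Because kfree(NULL) and kfree(ZERO_SIZE_PTR) are no-ops, error paths do
 * not need to track which allocations actually succeeded.  Sketch (names
 * are hypothetical):
 *
 *	a = kmalloc(sizeof(*a), GFP_KERNEL);
 *	b = kmalloc(sizeof(*b), GFP_KERNEL);
 *	if (!a || !b)
 *		goto err;
 *	...
 * err:
 *	kfree(b);
 *	kfree(a);
 */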
3858
3859unsigned int kmem_cache_size(struct kmem_cache *cachep)
3860{
3861	return obj_size(cachep);
3862}
3863EXPORT_SYMBOL(kmem_cache_size);
3864
3865const char *kmem_cache_name(struct kmem_cache *cachep)
3866{
3867	return cachep->name;
3868}
3869EXPORT_SYMBOL_GPL(kmem_cache_name);
3870
3871/*
3872 * This initializes kmem_list3 or resizes various caches for all nodes.
3873 */
3874static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
3875{
3876	int node;
3877	struct kmem_list3 *l3;
3878	struct array_cache *new_shared;
3879	struct array_cache **new_alien = NULL;
3880
3881	for_each_online_node(node) {
3882
3883		if (use_alien_caches) {
3884			new_alien = alloc_alien_cache(node, cachep->limit, gfp);
3885			if (!new_alien)
3886				goto fail;
3887		}
3888
3889		new_shared = NULL;
3890		if (cachep->shared) {
3891			new_shared = alloc_arraycache(node,
3892				cachep->shared*cachep->batchcount,
3893					0xbaadf00d, gfp);
3894			if (!new_shared) {
3895				free_alien_cache(new_alien);
3896				goto fail;
3897			}
3898		}
3899
3900		l3 = cachep->nodelists[node];
3901		if (l3) {
3902			struct array_cache *shared = l3->shared;
3903
3904			spin_lock_irq(&l3->list_lock);
3905
3906			if (shared)
3907				free_block(cachep, shared->entry,
3908						shared->avail, node);
3909
3910			l3->shared = new_shared;
3911			if (!l3->alien) {
3912				l3->alien = new_alien;
3913				new_alien = NULL;
3914			}
3915			l3->free_limit = (1 + nr_cpus_node(node)) *
3916					cachep->batchcount + cachep->num;
3917			spin_unlock_irq(&l3->list_lock);
3918			kfree(shared);
3919			free_alien_cache(new_alien);
3920			continue;
3921		}
3922		l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node);
3923		if (!l3) {
3924			free_alien_cache(new_alien);
3925			kfree(new_shared);
3926			goto fail;
3927		}
3928
3929		kmem_list3_init(l3);
3930		l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3931				((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3932		l3->shared = new_shared;
3933		l3->alien = new_alien;
3934		l3->free_limit = (1 + nr_cpus_node(node)) *
3935					cachep->batchcount + cachep->num;
3936		cachep->nodelists[node] = l3;
3937	}
3938	return 0;
3939
3940fail:
3941	if (!cachep->next.next) {
3942		/* Cache is not active yet. Roll back what we did */
3943		node--;
3944		while (node >= 0) {
3945			if (cachep->nodelists[node]) {
3946				l3 = cachep->nodelists[node];
3947
3948				kfree(l3->shared);
3949				free_alien_cache(l3->alien);
3950				kfree(l3);
3951				cachep->nodelists[node] = NULL;
3952			}
3953			node--;
3954		}
3955	}
3956	return -ENOMEM;
3957}
3958
3959struct ccupdate_struct {
3960	struct kmem_cache *cachep;
3961	struct array_cache *new[NR_CPUS];
3962};
3963
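/*
 * Swap the calling cpu's array_cache pointer with the freshly allocated
 * one from the ccupdate_struct.  Runs on every cpu via on_each_cpu() with
 * local interrupts disabled; the old pointer is stored back into
 * new->new[] so that do_tune_cpucache() can drain and free it afterwards.
 */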
3964static void do_ccupdate_local(void *info)
3965{
3966	struct ccupdate_struct *new = info;
3967	struct array_cache *old;
3968
3969	check_irq_off();
3970	old = cpu_cache_get(new->cachep);
3971
3972	new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3973	new->new[smp_processor_id()] = old;
3974}
3975
3976/* Always called with the cache_chain_mutex held */
3977static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3978				int batchcount, int shared, gfp_t gfp)
3979{
3980	struct ccupdate_struct *new;
3981	int i;
3982
3983	new = kzalloc(sizeof(*new), gfp);
3984	if (!new)
3985		return -ENOMEM;
3986
3987	for_each_online_cpu(i) {
3988		new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
3989						batchcount, gfp);
3990		if (!new->new[i]) {
3991			for (i--; i >= 0; i--)
3992				kfree(new->new[i]);
3993			kfree(new);
3994			return -ENOMEM;
3995		}
3996	}
3997	new->cachep = cachep;
3998
3999	on_each_cpu(do_ccupdate_local, (void *)new, 1);
4000
4001	check_irq_on();
4002	cachep->batchcount = batchcount;
4003	cachep->limit = limit;
4004	cachep->shared = shared;
4005
4006	for_each_online_cpu(i) {
4007		struct array_cache *ccold = new->new[i];
4008		if (!ccold)
4009			continue;
4010		spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
4011		free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i));
4012		spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
4013		kfree(ccold);
4014	}
4015	kfree(new);
4016	return alloc_kmemlist(cachep, gfp);
4017}
4018
4019/* Always called with the cache_chain_mutex held */
4020static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4021{
4022	int err;
4023	int limit, shared;
4024
4025	/*
4026	 * The head array serves three purposes:
4027	 * - create a LIFO ordering, i.e. return objects that are cache-warm
4028	 * - reduce the number of spinlock operations.
4029	 * - reduce the number of linked list operations on the slab and
4030	 *   bufctl chains: array operations are cheaper.
4031	 * The numbers below are guesses; we should auto-tune them as
4032	 * described by Bonwick.
4033	 */
4034	if (cachep->buffer_size > 131072)
4035		limit = 1;
4036	else if (cachep->buffer_size > PAGE_SIZE)
4037		limit = 8;
4038	else if (cachep->buffer_size > 1024)
4039		limit = 24;
4040	else if (cachep->buffer_size > 256)
4041		limit = 54;
4042	else
4043		limit = 120;
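	/*
	 * Worked example for the heuristic above: a cache of 512-byte
	 * objects gets limit = 54 and, via the do_tune_cpucache() call
	 * below, batchcount = (54 + 1) / 2 = 27; a cache of objects
	 * larger than 128k gets limit = 1, so the head array is
	 * effectively bypassed for huge objects.
	 */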
4044
4045	/*
4046	 * CPU bound tasks (e.g. network routing) can show strongly asymmetric
4047	 * allocation behaviour: most allocs on one cpu, most free operations
4048	 * on another cpu. For these cases, efficient object passing between
4049	 * cpus is necessary. This is provided by a shared array. The array
4050	 * replaces Bonwick's magazine layer.
4051	 * On uniprocessor, it's functionally equivalent (but less efficient)
4052	 * to a larger limit, so it is disabled by default there.
4053	 */
4054	shared = 0;
4055	if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1)
4056		shared = 8;
4057
4058#if DEBUG
4059	/*
4060	 * With debugging enabled, a large batchcount leads to excessively
4061	 * long periods with local interrupts disabled. Limit the batchcount.
4062	 */
4063	if (limit > 32)
4064		limit = 32;
4065#endif
4066	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
4067	if (err)
4068		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
4069		       cachep->name, -err);
4070	return err;
4071}
4072
4073/*
4074 * Drain an array if it contains any elements, taking the l3 lock only
4075 * if necessary. Note that the l3 list_lock also protects the array_cache
4076 * when drain_array() is used on the shared array.
4077 */
4078void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
4079			 struct array_cache *ac, int force, int node)
4080{
4081	int tofree;
4082
4083	if (!ac || !ac->avail)
4084		return;
4085	if (ac->touched && !force) {
4086		ac->touched = 0;
4087	} else {
4088		spin_lock_irq(&l3->list_lock);
4089		if (ac->avail) {
4090			tofree = force ? ac->avail : (ac->limit + 4) / 5;
4091			if (tofree > ac->avail)
4092				tofree = (ac->avail + 1) / 2;
4093			free_block(cachep, ac->entry, tofree, node);
4094			ac->avail -= tofree;
4095			memmove(ac->entry, &(ac->entry[tofree]),
4096				sizeof(void *) * ac->avail);
4097		}
4098		spin_unlock_irq(&l3->list_lock);
4099	}
4100}
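/*
 * Worked example for drain_array(): with limit 120, avail 100 and the
 * array not recently touched, tofree = (120 + 4) / 5 = 24 objects are
 * returned to the node lists and the remaining 76 entries are shifted to
 * the front of the array.  If only 10 objects were available, tofree
 * would be capped at (10 + 1) / 2 = 5.
 */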
4101
4102/**
4103 * cache_reap - Reclaim memory from caches.
4104 * @w: work descriptor
4105 *
4106 * Called from workqueue/eventd every few seconds.
4107 * Purpose:
4108 * - clear the per-cpu caches for this CPU.
4109 * - return freeable pages to the main free memory pool.
4110 *
4111 * If we cannot acquire the cache chain mutex then just give up - we'll try
4112 * again on the next iteration.
4113 */
4114static void cache_reap(struct work_struct *w)
4115{
4116	struct kmem_cache *searchp;
4117	struct kmem_list3 *l3;
4118	int node = numa_mem_id();
4119	struct delayed_work *work = to_delayed_work(w);
4120
4121	if (!mutex_trylock(&cache_chain_mutex))
4122		/* Give up. Set up the next iteration. */
4123		goto out;
4124
4125	list_for_each_entry(searchp, &cache_chain, next) {
4126		check_irq_on();
4127
4128		/*
4129		 * We only take the l3 lock if absolutely necessary and we
4130		 * have established with reasonable certainty that
4131		 * we can do some work once the lock is obtained.
4132		 */
4133		l3 = searchp->nodelists[node];
4134
4135		reap_alien(searchp, l3);
4136
4137		drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
4138
4139		/*
4140		 * These are racy checks but it does not matter
4141		 * if we skip one check or scan twice.
4142		 */
4143		if (time_after(l3->next_reap, jiffies))
4144			goto next;
4145
4146		l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
4147
4148		drain_array(searchp, l3, l3->shared, 0, node);
4149
4150		if (l3->free_touched)
4151			l3->free_touched = 0;
4152		else {
4153			int freed;
4154
4155			freed = drain_freelist(searchp, l3, (l3->free_limit +
4156				5 * searchp->num - 1) / (5 * searchp->num));
4157			STATS_ADD_REAPED(searchp, freed);
4158		}
4159next:
4160		cond_resched();
4161	}
4162	check_irq_on();
4163	mutex_unlock(&cache_chain_mutex);
4164	next_reap_node();
4165out:
4166	/* Set up the next iteration */
4167	schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
4168}
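/*
 * Worked example for the drain_freelist() target in cache_reap(): suppose
 * a node has 4 online cpus, the cache uses batchcount 60 and packs
 * num = 30 objects per slab.  Then free_limit = (1 + 4) * 60 + 30 = 330,
 * and each reap pass asks drain_freelist() for
 * (330 + 5 * 30 - 1) / (5 * 30) = 3 free slabs, i.e. one fifth of the
 * free limit rounded up to whole slabs.
 */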
4169
4170#ifdef CONFIG_SLABINFO
4171
4172static void print_slabinfo_header(struct seq_file *m)
4173{
4174	/*
4175	 * Output format version, so at least we can change it
4176	 * without _too_ many complaints.
4177	 */
4178#if STATS
4179	seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
4180#else
4181	seq_puts(m, "slabinfo - version: 2.1\n");
4182#endif
4183	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
4184		 "<objperslab> <pagesperslab>");
4185	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
4186	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
4187#if STATS
4188	seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
4189		 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
4190	seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
4191#endif
4192	seq_putc(m, '\n');
4193}
4194
4195static void *s_start(struct seq_file *m, loff_t *pos)
4196{
4197	loff_t n = *pos;
4198
4199	mutex_lock(&cache_chain_mutex);
4200	if (!n)
4201		print_slabinfo_header(m);
4202
4203	return seq_list_start(&cache_chain, *pos);
4204}
4205
4206static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4207{
4208	return seq_list_next(p, &cache_chain, pos);
4209}
4210
4211static void s_stop(struct seq_file *m, void *p)
4212{
4213	mutex_unlock(&cache_chain_mutex);
4214}
4215
4216static int s_show(struct seq_file *m, void *p)
4217{
4218	struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
4219	struct slab *slabp;
4220	unsigned long active_objs;
4221	unsigned long num_objs;
4222	unsigned long active_slabs = 0;
4223	unsigned long num_slabs, free_objects = 0, shared_avail = 0;
4224	const char *name;
4225	char *error = NULL;
4226	int node;
4227	struct kmem_list3 *l3;
4228
4229	active_objs = 0;
4230	num_slabs = 0;
4231	for_each_online_node(node) {
4232		l3 = cachep->nodelists[node];
4233		if (!l3)
4234			continue;
4235
4236		check_irq_on();
4237		spin_lock_irq(&l3->list_lock);
4238
4239		list_for_each_entry(slabp, &l3->slabs_full, list) {
4240			if (slabp->inuse != cachep->num && !error)
4241				error = "slabs_full accounting error";
4242			active_objs += cachep->num;
4243			active_slabs++;
4244		}
4245		list_for_each_entry(slabp, &l3->slabs_partial, list) {
4246			if (slabp->inuse == cachep->num && !error)
4247				error = "slabs_partial inuse accounting error";
4248			if (!slabp->inuse && !error)
4249				error = "slabs_partial/inuse accounting error";
4250			active_objs += slabp->inuse;
4251			active_slabs++;
4252		}
4253		list_for_each_entry(slabp, &l3->slabs_free, list) {
4254			if (slabp->inuse && !error)
4255				error = "slabs_free/inuse accounting error";
4256			num_slabs++;
4257		}
4258		free_objects += l3->free_objects;
4259		if (l3->shared)
4260			shared_avail += l3->shared->avail;
4261
4262		spin_unlock_irq(&l3->list_lock);
4263	}
4264	num_slabs += active_slabs;
4265	num_objs = num_slabs * cachep->num;
4266	if (num_objs - active_objs != free_objects && !error)
4267		error = "free_objects accounting error";
4268
4269	name = cachep->name;
4270	if (error)
4271		printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4272
4273	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
4274		   name, active_objs, num_objs, cachep->buffer_size,
4275		   cachep->num, (1 << cachep->gfporder));
4276	seq_printf(m, " : tunables %4u %4u %4u",
4277		   cachep->limit, cachep->batchcount, cachep->shared);
4278	seq_printf(m, " : slabdata %6lu %6lu %6lu",
4279		   active_slabs, num_slabs, shared_avail);
4280#if STATS
4281	{			/* list3 stats */
4282		unsigned long high = cachep->high_mark;
4283		unsigned long allocs = cachep->num_allocations;
4284		unsigned long grown = cachep->grown;
4285		unsigned long reaped = cachep->reaped;
4286		unsigned long errors = cachep->errors;
4287		unsigned long max_freeable = cachep->max_freeable;
4288		unsigned long node_allocs = cachep->node_allocs;
4289		unsigned long node_frees = cachep->node_frees;
4290		unsigned long overflows = cachep->node_overflow;
4291
4292		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
4293			   "%4lu %4lu %4lu %4lu %4lu",
4294			   allocs, high, grown,
4295			   reaped, errors, max_freeable, node_allocs,
4296			   node_frees, overflows);
4297	}
4298	/* cpu stats */
4299	{
4300		unsigned long allochit = atomic_read(&cachep->allochit);
4301		unsigned long allocmiss = atomic_read(&cachep->allocmiss);
4302		unsigned long freehit = atomic_read(&cachep->freehit);
4303		unsigned long freemiss = atomic_read(&cachep->freemiss);
4304
4305		seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
4306			   allochit, allocmiss, freehit, freemiss);
4307	}
4308#endif
4309	seq_putc(m, '\n');
4310	return 0;
4311}
4312
4313/*
4314 * slabinfo_op - iterator that generates /proc/slabinfo
4315 *
4316 * Output layout:
4317 * cache-name
4318 * num-active-objs
4319 * total-objs
4320 * object size
4321 * num-objs-per-slab
4322 * num-pages-per-slab
4323 * + tunables and slabdata (limit/batchcount/shared, slab counts)
4324 * + further statistics when STATS is enabled
4325 */
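/*
 * A hypothetical line of output (all numbers invented for illustration):
 *
 *   my_cache  350  380  192  20  1 : tunables 120 60 8 : slabdata 18 19 0
 *
 * i.e. 350 of 380 objects are in use, each 192 bytes and packed 20 to a
 * one-page slab; 18 of the 19 slabs hold at least one live object.
 */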
4326
4327static const struct seq_operations slabinfo_op = {
4328	.start = s_start,
4329	.next = s_next,
4330	.stop = s_stop,
4331	.show = s_show,
4332};
4333
4334#define MAX_SLABINFO_WRITE 128
4335/**
4336 * slabinfo_write - Tuning for the slab allocator
4337 * @file: unused
4338 * @buffer: user buffer
4339 * @count: data length
4340 * @ppos: unused
4341 */
4342ssize_t slabinfo_write(struct file *file, const char __user * buffer,
4343		       size_t count, loff_t *ppos)
4344{
4345	char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
4346	int limit, batchcount, shared, res;
4347	struct kmem_cache *cachep;
4348
4349	if (count > MAX_SLABINFO_WRITE)
4350		return -EINVAL;
4351	if (copy_from_user(&kbuf, buffer, count))
4352		return -EFAULT;
4353	kbuf[MAX_SLABINFO_WRITE] = '\0';
4354
4355	tmp = strchr(kbuf, ' ');
4356	if (!tmp)
4357		return -EINVAL;
4358	*tmp = '\0';
4359	tmp++;
4360	if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
4361		return -EINVAL;
4362
4363	/* Find the cache in the chain of caches. */
4364	mutex_lock(&cache_chain_mutex);
4365	res = -EINVAL;
4366	list_for_each_entry(cachep, &cache_chain, next) {
4367		if (!strcmp(cachep->name, kbuf)) {
4368			if (limit < 1 || batchcount < 1 ||
4369					batchcount > limit || shared < 0) {
4370				res = 0;
4371			} else {
4372				res = do_tune_cpucache(cachep, limit,
4373						       batchcount, shared,
4374						       GFP_KERNEL);
4375			}
4376			break;
4377		}
4378	}
4379	mutex_unlock(&cache_chain_mutex);
4380	if (res >= 0)
4381		res = count;
4382	return res;
4383}
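/*
 * Illustrative tuning write (cache name and values are examples only):
 *
 *	echo "my_cache 120 60 8" > /proc/slabinfo
 *
 * sets limit = 120, batchcount = 60 and shared = 8 for the cache named
 * my_cache.  Values that fail the sanity checks above (limit < 1,
 * batchcount < 1, batchcount > limit or shared < 0) are silently ignored.
 */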
4384
4385static int slabinfo_open(struct inode *inode, struct file *file)
4386{
4387	return seq_open(file, &slabinfo_op);
4388}
4389
4390static const struct file_operations proc_slabinfo_operations = {
4391	.open		= slabinfo_open,
4392	.read		= seq_read,
4393	.write		= slabinfo_write,
4394	.llseek		= seq_lseek,
4395	.release	= seq_release,
4396};
4397
4398#ifdef CONFIG_DEBUG_SLAB_LEAK
4399
4400static void *leaks_start(struct seq_file *m, loff_t *pos)
4401{
4402	mutex_lock(&cache_chain_mutex);
4403	return seq_list_start(&cache_chain, *pos);
4404}
4405
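/*
 * The leak table passed in @n is a flat array of unsigned longs: n[0] is
 * the table capacity, n[1] the number of (caller, count) pairs in use,
 * and the pairs start at n[2], kept sorted by caller address.  A zero @v
 * is ignored.  add_caller() binary-searches for @v, bumps its count when
 * found, otherwise shifts the tail up and inserts a new pair with count 1.
 * It returns 0 only when a new pair would no longer fit.
 */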
4406static inline int add_caller(unsigned long *n, unsigned long v)
4407{
4408	unsigned long *p;
4409	int l;
4410	if (!v)
4411		return 1;
4412	l = n[1];
4413	p = n + 2;
4414	while (l) {
4415		int i = l/2;
4416		unsigned long *q = p + 2 * i;
4417		if (*q == v) {
4418			q[1]++;
4419			return 1;
4420		}
4421		if (*q > v) {
4422			l = i;
4423		} else {
4424			p = q + 2;
4425			l -= i + 1;
4426		}
4427	}
4428	if (++n[1] == n[0])
4429		return 0;
4430	memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
4431	p[0] = v;
4432	p[1] = 1;
4433	return 1;
4434}
4435
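/*
 * Walk every object in slab @s and, for each object that is currently
 * allocated (bufctl == BUFCTL_ACTIVE), record the allocation caller
 * stored in the object's debug word.  Stops early once the caller table
 * is full.
 */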
4436static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
4437{
4438	void *p;
4439	int i;
4440	if (n[0] == n[1])
4441		return;
4442	for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
4443		if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
4444			continue;
4445		if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
4446			return;
4447	}
4448}
4449
4450static void show_symbol(struct seq_file *m, unsigned long address)
4451{
4452#ifdef CONFIG_KALLSYMS
4453	unsigned long offset, size;
4454	char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN];
4455
4456	if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) {
4457		seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
4458		if (modname[0])
4459			seq_printf(m, " [%s]", modname);
4460		return;
4461	}
4462#endif
4463	seq_printf(m, "%p", (void *)address);
4464}
4465
4466static int leaks_show(struct seq_file *m, void *p)
4467{
4468	struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
4469	struct slab *slabp;
4470	struct kmem_list3 *l3;
4471	const char *name;
4472	unsigned long *n = m->private;
4473	int node;
4474	int i;
4475
4476	if (!(cachep->flags & SLAB_STORE_USER))
4477		return 0;
4478	if (!(cachep->flags & SLAB_RED_ZONE))
4479		return 0;
4480
4481	/* OK, we can do it */
4482
4483	n[1] = 0;
4484
4485	for_each_online_node(node) {
4486		l3 = cachep->nodelists[node];
4487		if (!l3)
4488			continue;
4489
4490		check_irq_on();
4491		spin_lock_irq(&l3->list_lock);
4492
4493		list_for_each_entry(slabp, &l3->slabs_full, list)
4494			handle_slab(n, cachep, slabp);
4495		list_for_each_entry(slabp, &l3->slabs_partial, list)
4496			handle_slab(n, cachep, slabp);
4497		spin_unlock_irq(&l3->list_lock);
4498	}
4499	name = cachep->name;
4500	if (n[0] == n[1]) {
4501		/* Increase the buffer size */
4502		mutex_unlock(&cache_chain_mutex);
4503		m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
4504		if (!m->private) {
4505			/* Too bad, we are really out */
4506			m->private = n;
4507			mutex_lock(&cache_chain_mutex);
4508			return -ENOMEM;
4509		}
4510		*(unsigned long *)m->private = n[0] * 2;
4511		kfree(n);
4512		mutex_lock(&cache_chain_mutex);
4513		/* Now make sure this entry will be retried */
4514		m->count = m->size;
4515		return 0;
4516	}
4517	for (i = 0; i < n[1]; i++) {
4518		seq_printf(m, "%s: %lu ", name, n[2*i+3]);
4519		show_symbol(m, n[2*i+2]);
4520		seq_putc(m, '\n');
4521	}
4522
4523	return 0;
4524}
4525
4526static const struct seq_operations slabstats_op = {
4527	.start = leaks_start,
4528	.next = s_next,
4529	.stop = s_stop,
4530	.show = leaks_show,
4531};
4532
4533static int slabstats_open(struct inode *inode, struct file *file)
4534{
4535	unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
4536	int ret = -ENOMEM;
4537	if (n) {
4538		ret = seq_open(file, &slabstats_op);
4539		if (!ret) {
4540			struct seq_file *m = file->private_data;
4541			*n = PAGE_SIZE / (2 * sizeof(unsigned long));
4542			m->private = n;
4543			n = NULL;
4544		}
4545		kfree(n);
4546	}
4547	return ret;
4548}
4549
4550static const struct file_operations proc_slabstats_operations = {
4551	.open		= slabstats_open,
4552	.read		= seq_read,
4553	.llseek		= seq_lseek,
4554	.release	= seq_release_private,
4555};
4556#endif
4557
4558static int __init slab_proc_init(void)
4559{
4560	proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
4561#ifdef CONFIG_DEBUG_SLAB_LEAK
4562	proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
4563#endif
4564	return 0;
4565}
4566module_init(slab_proc_init);
4567#endif
4568
4569/**
4570 * ksize - get the actual amount of memory allocated for a given object
4571 * @objp: Pointer to the object
4572 *
4573 * kmalloc may internally round up allocations and return more memory
4574 * than requested. ksize() can be used to determine the actual amount of
4575 * memory allocated. The caller may use this additional memory, even though
4576 * a smaller amount of memory was initially specified with the kmalloc call.
4577 * The caller must guarantee that objp points to a valid object previously
4578 * allocated with either kmalloc() or kmem_cache_alloc(). The object
4579 * must not be freed during the duration of the call.
4580 */
4581size_t ksize(const void *objp)
4582{
4583	BUG_ON(!objp);
4584	if (unlikely(objp == ZERO_SIZE_PTR))
4585		return 0;
4586
4587	return obj_size(virt_to_cache(objp));
4588}
4589EXPORT_SYMBOL(ksize);
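/*
 * Illustrative use of ksize() (the resulting size is an assumption about
 * a typical configuration, not a guarantee):
 *
 *	char *buf = kmalloc(40, GFP_KERNEL);
 *
 *	if (buf)
 *		pr_debug("usable size %zu\n", ksize(buf));
 *
 * A 40-byte request is typically served from a larger general cache
 * (e.g. the 64-byte one), and ksize() reports that full usable size.
 */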
4590