slab.c revision b7f869a2847dfe6f9b0835ca1b24e73bed926d7d
1/*
2 * linux/mm/slab.c
3 * Written by Mark Hemment, 1996/97.
4 * (markhe@nextd.demon.co.uk)
5 *
6 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
7 *
8 * Major cleanup, different bufctl logic, per-cpu arrays
9 *	(c) 2000 Manfred Spraul
10 *
11 * Cleanup, make the head arrays unconditional, preparation for NUMA
12 * 	(c) 2002 Manfred Spraul
13 *
14 * An implementation of the Slab Allocator as described in outline in;
15 *	UNIX Internals: The New Frontiers by Uresh Vahalia
16 *	Pub: Prentice Hall	ISBN 0-13-101908-2
17 * or with a little more detail in;
18 *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
19 *	Jeff Bonwick (Sun Microsystems).
20 *	Presented at: USENIX Summer 1994 Technical Conference
21 *
22 * The memory is organized in caches, one cache for each object type.
23 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
24 * Each cache consists of many slabs (which are small, usually one
25 * page long, and always contiguous), and each slab contains multiple
26 * initialized objects.
27 *
28 * This means that your constructor is used only for newly allocated
29 * slabs and you must pass objects with the same initializations to
30 * kmem_cache_free.
31 *
32 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
33 * normal). If you need a special memory type, then you must create a
34 * new cache for that memory type.
35 *
36 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
37 *   full slabs with 0 free objects
38 *   partial slabs
39 *   empty slabs with no allocated objects
40 *
41 * If partial slabs exist, then new allocations come from these slabs;
42 * otherwise they come from empty slabs, or new slabs are allocated.
43 *
44 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
45 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
46 *
47 * Each cache has a short per-cpu head array; most allocs
48 * and frees go into that array, and if that array overflows, then 1/2
49 * of the entries in the array are given back into the global cache.
50 * The head array is strictly LIFO and should improve the cache hit rates.
51 * On SMP, it additionally reduces the spinlock operations.
52 *
53 * The c_cpuarray may not be read with local interrupts enabled -
54 * it's changed with a smp_call_function().
55 *
56 * SMP synchronization:
57 *  constructors and destructors are called without any locking.
58 *  Several members in struct kmem_cache and struct slab never change; they
59 *	are accessed without any locking.
60 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
61 *  	and local interrupts are disabled so slab code is preempt-safe.
62 *  The non-constant members are protected with a per-cache irq spinlock.
63 *
64 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
65 * in 2000 - many ideas in the current implementation are derived from
66 * his patch.
67 *
68 * Further notes from the original documentation:
69 *
70 * 11 April '97.  Started multi-threading - markhe
71 *	The global cache-chain is protected by the mutex 'cache_chain_mutex'.
72 *	The mutex is only needed when accessing/extending the cache-chain, which
73 *	can never happen inside an interrupt (kmem_cache_create(),
74 *	kmem_cache_shrink() and kmem_cache_reap()).
75 *
76 *	At present, each engine can be growing a cache.  This should be blocked.
77 *
78 * 15 March 2005. NUMA slab allocator.
79 *	Shai Fultheim <shai@scalex86.org>.
80 *	Shobhit Dayal <shobhit@calsoftinc.com>
81 *	Alok N Kataria <alokk@calsoftinc.com>
82 *	Christoph Lameter <christoph@lameter.com>
83 *
84 *	Modified the slab allocator to be node aware on NUMA systems.
85 *	Each node has its own list of partial, free and full slabs.
86 *	All object allocations for a node occur from node specific slab lists.
87 */
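
/*
 * Illustrative example (not part of this file): typical use of the cache
 * API described above, for a hypothetical "struct foo".  As noted above,
 * the constructor runs only when a new slab is populated, objects must be
 * handed back to kmem_cache_free() in their constructed state, and no
 * allocations may run concurrently with kmem_cache_destroy():
 *
 *	static struct kmem_cache *foo_cachep;
 *
 *	static void foo_ctor(void *obj, struct kmem_cache *cachep,
 *			     unsigned long flags)
 *	{
 *		memset(obj, 0, sizeof(struct foo));
 *	}
 *
 *	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
 *				       SLAB_HWCACHE_ALIGN, foo_ctor, NULL);
 *	foo = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cachep, foo);
 *	...
 *	kmem_cache_destroy(foo_cachep);
 */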
88
89#include	<linux/slab.h>
90#include	<linux/mm.h>
91#include	<linux/poison.h>
92#include	<linux/swap.h>
93#include	<linux/cache.h>
94#include	<linux/interrupt.h>
95#include	<linux/init.h>
96#include	<linux/compiler.h>
97#include	<linux/cpuset.h>
98#include	<linux/seq_file.h>
99#include	<linux/notifier.h>
100#include	<linux/kallsyms.h>
101#include	<linux/cpu.h>
102#include	<linux/sysctl.h>
103#include	<linux/module.h>
104#include	<linux/rcupdate.h>
105#include	<linux/string.h>
106#include	<linux/uaccess.h>
107#include	<linux/nodemask.h>
108#include	<linux/mempolicy.h>
109#include	<linux/mutex.h>
110#include	<linux/fault-inject.h>
111#include	<linux/rtmutex.h>
112#include	<linux/reciprocal_div.h>
113
114#include	<asm/cacheflush.h>
115#include	<asm/tlbflush.h>
116#include	<asm/page.h>
117
118/*
119 * DEBUG	- 1 for kmem_cache_create() to honour: SLAB_DEBUG_INITIAL,
120 *		  SLAB_RED_ZONE & SLAB_POISON.
121 *		  0 for faster, smaller code (especially in the critical paths).
122 *
123 * STATS	- 1 to collect stats for /proc/slabinfo.
124 *		  0 for faster, smaller code (especially in the critical paths).
125 *
126 * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
127 */
128
129#ifdef CONFIG_DEBUG_SLAB
130#define	DEBUG		1
131#define	STATS		1
132#define	FORCED_DEBUG	1
133#else
134#define	DEBUG		0
135#define	STATS		0
136#define	FORCED_DEBUG	0
137#endif
138
139/* Shouldn't this be in a header file somewhere? */
140#define	BYTES_PER_WORD		sizeof(void *)
141
142#ifndef cache_line_size
143#define cache_line_size()	L1_CACHE_BYTES
144#endif
145
146#ifndef ARCH_KMALLOC_MINALIGN
147/*
148 * Enforce a minimum alignment for the kmalloc caches.
149 * Usually, the kmalloc caches are cache_line_size() aligned, except when
150 * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
151 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
152 * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that.
153 * Note that this flag disables some debug features.
154 */
155#define ARCH_KMALLOC_MINALIGN 0
156#endif
157
158#ifndef ARCH_SLAB_MINALIGN
159/*
160 * Enforce a minimum alignment for all caches.
161 * Intended for archs that get misalignment faults even for BYTES_PER_WORD
162 * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
163 * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
164 * some debug features.
165 */
166#define ARCH_SLAB_MINALIGN 0
167#endif
168
169#ifndef ARCH_KMALLOC_FLAGS
170#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
171#endif
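
/*
 * Illustrative example (hypothetical values, not taken from any real arch):
 * an architecture that DMAs straight into kmalloc'd buffers and needs a
 * 16 byte guarantee could provide, typically from an architecture header:
 *
 *	#define ARCH_KMALLOC_MINALIGN	16
 *	#define ARCH_SLAB_MINALIGN	8
 *
 * With the defaults of 0 above, the alignment is decided purely by
 * kmem_cache_create() and the debug configuration.
 */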
172
173/* Legal flag mask for kmem_cache_create(). */
174#if DEBUG
175# define CREATE_MASK	(SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
176			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
177			 SLAB_CACHE_DMA | \
178			 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
179			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
180			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
181#else
182# define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
183			 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
184			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
185			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
186#endif
187
188/*
189 * kmem_bufctl_t:
190 *
191 * Bufctls are used for linking free objs within a slab: an array of
192 * per-object indices chains the free objects together.
193 *
194 * This implementation relies on "struct page" for locating the cache &
195 * slab an object belongs to.
196 * This allows the bufctl structure to be small (one int), but limits
197 * the number of objects a slab (not a cache) can contain when off-slab
198 * bufctls are used. The limit is the size of the largest general cache
199 * that does not use off-slab slabs.
200 * For 32bit archs with 4 kB pages, this is 56.
201 * This is not serious, as it is only for large objects, when it is unwise
202 * to have too many per slab.
203 * Note: This limit can be raised by introducing a general cache whose size
204 * is less than 512 (PAGE_SIZE<<3), but greater than 256.
205 */
206
207typedef unsigned int kmem_bufctl_t;
208#define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
209#define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
210#define	BUFCTL_ACTIVE	(((kmem_bufctl_t)(~0U))-2)
211#define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-3)
212
213/*
214 * struct slab
215 *
216 * Manages the objs in a slab. Placed either at the beginning of the memory
217 * allocated for a slab, or allocated from a general cache.
218 * Slabs are chained into three lists: fully used, partial and fully free slabs.
219 */
220struct slab {
221	struct list_head list;
222	unsigned long colouroff;
223	void *s_mem;		/* including colour offset */
224	unsigned int inuse;	/* num of objs active in slab */
225	kmem_bufctl_t free;
226	unsigned short nodeid;
227};
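
/*
 * Sketch (illustrative only) of how the kmem_bufctl_t array chains the free
 * objects of a slab: the array lives right behind struct slab (see
 * slab_bufctl() further down in this file), slabp->free holds the index of
 * the first free object, and each entry holds the index of the next free
 * object.  Popping one free object conceptually looks like:
 *
 *	kmem_bufctl_t next;
 *
 *	objp = index_to_obj(cachep, slabp, slabp->free);
 *	next = slab_bufctl(slabp)[slabp->free];
 *	slabp->free = next;
 *
 * with slabp->free becoming BUFCTL_END once no free objects remain; this is
 * what slab_get_obj() later in this file implements.
 */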
228
229/*
230 * struct slab_rcu
231 *
232 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
233 * arrange for kmem_freepages to be called via RCU.  This is useful if
234 * we need to approach a kernel structure obliquely, from its address
235 * obtained without the usual locking.  We can lock the structure to
236 * stabilize it and check it's still at the given address, only if we
237 * can be sure that the memory has not been meanwhile reused for some
238 * other kind of object (which our subsystem's lock might corrupt).
239 *
240 * rcu_read_lock before reading the address, then rcu_read_unlock after
241 * taking the spinlock within the structure expected at that address.
242 *
243 * We assume struct slab_rcu can overlay struct slab when destroying.
244 */
245struct slab_rcu {
246	struct rcu_head head;
247	struct kmem_cache *cachep;
248	void *addr;
249};
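
/*
 * Illustrative sketch (hypothetical lookup code, not from this file) of the
 * pattern that SLAB_DESTROY_BY_RCU enables, as described above.  The RCU
 * grace period only keeps the underlying pages from going back to the page
 * allocator; the object itself may be reused for another object of the same
 * cache, hence the re-check under the object's lock:
 *
 *	rcu_read_lock();
 *	obj = lockless_lookup(key);
 *	if (obj) {
 *		spin_lock(&obj->lock);
 *		if (obj->key == key)
 *			do_something(obj);
 *		spin_unlock(&obj->lock);
 *	}
 *	rcu_read_unlock();
 */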
250
251/*
252 * struct array_cache
253 *
254 * Purpose:
255 * - LIFO ordering, to hand out cache-warm objects from _alloc
256 * - reduce the number of linked list operations
257 * - reduce spinlock operations
258 *
259 * The limit is stored in the per-cpu structure to reduce the data cache
260 * footprint.
261 *
262 */
263struct array_cache {
264	unsigned int avail;
265	unsigned int limit;
266	unsigned int batchcount;
267	unsigned int touched;
268	spinlock_t lock;
269	void *entry[0];	/*
270			 * Must have this definition in here for the proper
271			 * alignment of array_cache. Also simplifies accessing
272			 * the entries.
273			 * [0] is for gcc 2.95. It should really be [].
274			 */
275};
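
/*
 * Sketch (illustrative only) of how the fast paths later in this file
 * (____cache_alloc() and __cache_free()) use this structure as a LIFO
 * stack, with local interrupts disabled instead of taking a lock:
 *
 *	alloc:	objp = ac->entry[--ac->avail];
 *	free:	ac->entry[ac->avail++] = objp;
 *
 * The lock member matters for arrays that other cpus can touch (the alien
 * caches, for instance); the per-cpu array itself is only ever accessed by
 * its own cpu.
 */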
276
277/*
278 * bootstrap: The caches do not work without cpuarrays anymore, but the
279 * cpuarrays are allocated from the generic caches...
280 */
281#define BOOT_CPUCACHE_ENTRIES	1
282struct arraycache_init {
283	struct array_cache cache;
284	void *entries[BOOT_CPUCACHE_ENTRIES];
285};
286
287/*
288 * The slab lists for all objects.
289 */
290struct kmem_list3 {
291	struct list_head slabs_partial;	/* partial list first, better asm code */
292	struct list_head slabs_full;
293	struct list_head slabs_free;
294	unsigned long free_objects;
295	unsigned int free_limit;
296	unsigned int colour_next;	/* Per-node cache coloring */
297	spinlock_t list_lock;
298	struct array_cache *shared;	/* shared per node */
299	struct array_cache **alien;	/* on other nodes */
300	unsigned long next_reap;	/* updated without locking */
301	int free_touched;		/* updated without locking */
302};
303
304/*
305 * Need this for bootstrapping a per node allocator.
306 */
307#define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1)
308struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
309#define	CACHE_CACHE 0
310#define	SIZE_AC 1
311#define	SIZE_L3 (1 + MAX_NUMNODES)
312
313static int drain_freelist(struct kmem_cache *cache,
314			struct kmem_list3 *l3, int tofree);
315static void free_block(struct kmem_cache *cachep, void **objpp, int len,
316			int node);
317static int enable_cpucache(struct kmem_cache *cachep);
318static void cache_reap(struct work_struct *unused);
319
320/*
321 * This function must be completely optimized away if a constant is passed to
322 * it.  Mostly the same as what is in linux/slab.h except it returns an index.
323 */
324static __always_inline int index_of(const size_t size)
325{
326	extern void __bad_size(void);
327
328	if (__builtin_constant_p(size)) {
329		int i = 0;
330
331#define CACHE(x) \
332	if (size <=x) \
333		return i; \
334	else \
335		i++;
336#include "linux/kmalloc_sizes.h"
337#undef CACHE
338		__bad_size();
339	} else
340		__bad_size();
341	return 0;
342}
343
344static int slab_early_init = 1;
345
346#define INDEX_AC index_of(sizeof(struct arraycache_init))
347#define INDEX_L3 index_of(sizeof(struct kmem_list3))
348
349static void kmem_list3_init(struct kmem_list3 *parent)
350{
351	INIT_LIST_HEAD(&parent->slabs_full);
352	INIT_LIST_HEAD(&parent->slabs_partial);
353	INIT_LIST_HEAD(&parent->slabs_free);
354	parent->shared = NULL;
355	parent->alien = NULL;
356	parent->colour_next = 0;
357	spin_lock_init(&parent->list_lock);
358	parent->free_objects = 0;
359	parent->free_touched = 0;
360}
361
362#define MAKE_LIST(cachep, listp, slab, nodeid)				\
363	do {								\
364		INIT_LIST_HEAD(listp);					\
365		list_splice(&(cachep->nodelists[nodeid]->slab), listp);	\
366	} while (0)
367
368#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
369	do {								\
370	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
371	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
372	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
373	} while (0)
374
375/*
376 * struct kmem_cache
377 *
378 * manages a cache.
379 */
380
381struct kmem_cache {
382/* 1) per-cpu data, touched during every alloc/free */
383	struct array_cache *array[NR_CPUS];
384/* 2) Cache tunables. Protected by cache_chain_mutex */
385	unsigned int batchcount;
386	unsigned int limit;
387	unsigned int shared;
388
389	unsigned int buffer_size;
390	u32 reciprocal_buffer_size;
391/* 3) touched by every alloc & free from the backend */
392	struct kmem_list3 *nodelists[MAX_NUMNODES];
393
394	unsigned int flags;		/* constant flags */
395	unsigned int num;		/* # of objs per slab */
396
397/* 4) cache_grow/shrink */
398	/* order of pgs per slab (2^n) */
399	unsigned int gfporder;
400
401	/* force GFP flags, e.g. GFP_DMA */
402	gfp_t gfpflags;
403
404	size_t colour;			/* cache colouring range */
405	unsigned int colour_off;	/* colour offset */
406	struct kmem_cache *slabp_cache;
407	unsigned int slab_size;
408	unsigned int dflags;		/* dynamic flags */
409
410	/* constructor func */
411	void (*ctor) (void *, struct kmem_cache *, unsigned long);
412
413	/* destructor func */
414	void (*dtor) (void *, struct kmem_cache *, unsigned long);
415
416/* 5) cache creation/removal */
417	const char *name;
418	struct list_head next;
419
420/* 6) statistics */
421#if STATS
422	unsigned long num_active;
423	unsigned long num_allocations;
424	unsigned long high_mark;
425	unsigned long grown;
426	unsigned long reaped;
427	unsigned long errors;
428	unsigned long max_freeable;
429	unsigned long node_allocs;
430	unsigned long node_frees;
431	unsigned long node_overflow;
432	atomic_t allochit;
433	atomic_t allocmiss;
434	atomic_t freehit;
435	atomic_t freemiss;
436#endif
437#if DEBUG
438	/*
439	 * If debugging is enabled, then the allocator can add additional
440	 * fields and/or padding to every object. buffer_size contains the total
441	 * object size including these internal fields, the following two
442	 * variables contain the offset to the user object and its size.
443	 */
444	int obj_offset;
445	int obj_size;
446#endif
447};
448
449#define CFLGS_OFF_SLAB		(0x80000000UL)
450#define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)
451
452#define BATCHREFILL_LIMIT	16
453/*
454 * Optimization question: fewer reaps means a lower probability of unnecessary
455 * cpucache drain/refill cycles.
456 *
457 * OTOH the cpuarrays can contain lots of objects,
458 * which could lock up otherwise freeable slabs.
459 */
460#define REAPTIMEOUT_CPUC	(2*HZ)
461#define REAPTIMEOUT_LIST3	(4*HZ)
462
463#if STATS
464#define	STATS_INC_ACTIVE(x)	((x)->num_active++)
465#define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
466#define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
467#define	STATS_INC_GROWN(x)	((x)->grown++)
468#define	STATS_ADD_REAPED(x,y)	((x)->reaped += (y))
469#define	STATS_SET_HIGH(x)						\
470	do {								\
471		if ((x)->num_active > (x)->high_mark)			\
472			(x)->high_mark = (x)->num_active;		\
473	} while (0)
474#define	STATS_INC_ERR(x)	((x)->errors++)
475#define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
476#define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
477#define STATS_INC_ACOVERFLOW(x)   ((x)->node_overflow++)
478#define	STATS_SET_FREEABLE(x, i)					\
479	do {								\
480		if ((x)->max_freeable < i)				\
481			(x)->max_freeable = i;				\
482	} while (0)
483#define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
484#define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
485#define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
486#define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
487#else
488#define	STATS_INC_ACTIVE(x)	do { } while (0)
489#define	STATS_DEC_ACTIVE(x)	do { } while (0)
490#define	STATS_INC_ALLOCED(x)	do { } while (0)
491#define	STATS_INC_GROWN(x)	do { } while (0)
492#define	STATS_ADD_REAPED(x,y)	do { } while (0)
493#define	STATS_SET_HIGH(x)	do { } while (0)
494#define	STATS_INC_ERR(x)	do { } while (0)
495#define	STATS_INC_NODEALLOCS(x)	do { } while (0)
496#define	STATS_INC_NODEFREES(x)	do { } while (0)
497#define STATS_INC_ACOVERFLOW(x)   do { } while (0)
498#define	STATS_SET_FREEABLE(x, i) do { } while (0)
499#define STATS_INC_ALLOCHIT(x)	do { } while (0)
500#define STATS_INC_ALLOCMISS(x)	do { } while (0)
501#define STATS_INC_FREEHIT(x)	do { } while (0)
502#define STATS_INC_FREEMISS(x)	do { } while (0)
503#endif
504
505#if DEBUG
506
507/*
508 * memory layout of objects:
509 * 0		: objp
510 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
511 * 		the end of an object is aligned with the end of the real
512 * 		allocation. Catches writes behind the end of the allocation.
513 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
514 * 		redzone word.
515 * cachep->obj_offset: The real object.
516 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
517 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
518 *					[BYTES_PER_WORD long]
519 */
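
/*
 * Worked example (illustrative, assuming BYTES_PER_WORD == 8 and a cache
 * whose user-visible object size is 40 bytes, created with
 * SLAB_RED_ZONE | SLAB_STORE_USER):
 *
 *	obj_offset   = 8	bytes 0..7 hold the first redzone word
 *	bytes 8..47		the real object (obj_size == 40)
 *	bytes 48..55		the second redzone word
 *	bytes 56..63		the last caller address
 *	buffer_size  = 64
 *
 * so dbg_redzone1() == objp + 0, dbg_redzone2() == objp + 48 and
 * dbg_userword() == objp + 56.  The exact numbers depend on the alignment
 * decisions made in kmem_cache_create(), so treat this only as a sketch of
 * the layout described above.
 */
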
520static int obj_offset(struct kmem_cache *cachep)
521{
522	return cachep->obj_offset;
523}
524
525static int obj_size(struct kmem_cache *cachep)
526{
527	return cachep->obj_size;
528}
529
530static unsigned long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
531{
532	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
533	return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD);
534}
535
536static unsigned long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
537{
538	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
539	if (cachep->flags & SLAB_STORE_USER)
540		return (unsigned long *)(objp + cachep->buffer_size -
541					 2 * BYTES_PER_WORD);
542	return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD);
543}
544
545static void **dbg_userword(struct kmem_cache *cachep, void *objp)
546{
547	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
548	return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
549}
550
551#else
552
553#define obj_offset(x)			0
554#define obj_size(cachep)		(cachep->buffer_size)
555#define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long *)NULL;})
556#define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long *)NULL;})
557#define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})
558
559#endif
560
561/*
562 * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp
563 * order.
564 */
565#if defined(CONFIG_LARGE_ALLOCS)
566#define	MAX_OBJ_ORDER	13	/* up to 32Mb */
567#define	MAX_GFP_ORDER	13	/* up to 32Mb */
568#elif defined(CONFIG_MMU)
569#define	MAX_OBJ_ORDER	5	/* 32 pages */
570#define	MAX_GFP_ORDER	5	/* 32 pages */
571#else
572#define	MAX_OBJ_ORDER	8	/* up to 1Mb */
573#define	MAX_GFP_ORDER	8	/* up to 1Mb */
574#endif
575
576/*
577 * Do not go above this order unless 0 objects fit into the slab.
578 */
579#define	BREAK_GFP_ORDER_HI	1
580#define	BREAK_GFP_ORDER_LO	0
581static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
582
583/*
584 * Functions for storing/retrieving the cachep and/or slab in struct page.
585 * These are used to find the slab an obj belongs to.  With kfree(), they
586 * are used to find the cache to which an obj belongs.
587 */
588static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
589{
590	page->lru.next = (struct list_head *)cache;
591}
592
593static inline struct kmem_cache *page_get_cache(struct page *page)
594{
595	if (unlikely(PageCompound(page)))
596		page = (struct page *)page_private(page);
597	BUG_ON(!PageSlab(page));
598	return (struct kmem_cache *)page->lru.next;
599}
600
601static inline void page_set_slab(struct page *page, struct slab *slab)
602{
603	page->lru.prev = (struct list_head *)slab;
604}
605
606static inline struct slab *page_get_slab(struct page *page)
607{
608	if (unlikely(PageCompound(page)))
609		page = (struct page *)page_private(page);
610	BUG_ON(!PageSlab(page));
611	return (struct slab *)page->lru.prev;
612}
613
614static inline struct kmem_cache *virt_to_cache(const void *obj)
615{
616	struct page *page = virt_to_page(obj);
617	return page_get_cache(page);
618}
619
620static inline struct slab *virt_to_slab(const void *obj)
621{
622	struct page *page = virt_to_page(obj);
623	return page_get_slab(page);
624}
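
/*
 * Example (illustrative only) of the lookup path these helpers provide for
 * kfree()/kmem_cache_free() style code:
 *
 *	struct page *page = virt_to_page(objp);
 *	struct kmem_cache *cachep = page_get_cache(page);
 *	struct slab *slabp = page_get_slab(page);
 *
 * i.e. the cache and slab pointers are parked in the otherwise unused
 * page->lru list_head while the page is owned by the slab allocator.
 */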
625
626static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
627				 unsigned int idx)
628{
629	return slab->s_mem + cache->buffer_size * idx;
630}
631
632/*
633 * We want to avoid an expensive divide : (offset / cache->buffer_size)
634 *   Using the fact that buffer_size is a constant for a particular cache,
635 *   we can replace (offset / cache->buffer_size) by
636 *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
637 */
638static inline unsigned int obj_to_index(const struct kmem_cache *cache,
639					const struct slab *slab, void *obj)
640{
641	u32 offset = (obj - slab->s_mem);
642	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
643}
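
/*
 * Worked example (illustrative): for a cache with buffer_size == 256, an
 * object starting 1024 bytes past slab->s_mem has index 1024 / 256 == 4.
 * reciprocal_divide(1024, cache->reciprocal_buffer_size) yields the same
 * result with a multiply and a shift, because reciprocal_buffer_size was
 * precomputed via reciprocal_value(256) when the cache was set up.
 */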
644
645/*
646 * These are the default caches for kmalloc. Custom caches can have other sizes.
647 */
648struct cache_sizes malloc_sizes[] = {
649#define CACHE(x) { .cs_size = (x) },
650#include <linux/kmalloc_sizes.h>
651	CACHE(ULONG_MAX)
652#undef CACHE
653};
654EXPORT_SYMBOL(malloc_sizes);
655
656/* Must match cache_sizes above. Out of line to keep cache footprint low. */
657struct cache_names {
658	char *name;
659	char *name_dma;
660};
661
662static struct cache_names __initdata cache_names[] = {
663#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
664#include <linux/kmalloc_sizes.h>
665	{NULL,}
666#undef CACHE
667};
668
669static struct arraycache_init initarray_cache __initdata =
670    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
671static struct arraycache_init initarray_generic =
672    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
673
674/* internal cache of cache description objs */
675static struct kmem_cache cache_cache = {
676	.batchcount = 1,
677	.limit = BOOT_CPUCACHE_ENTRIES,
678	.shared = 1,
679	.buffer_size = sizeof(struct kmem_cache),
680	.name = "kmem_cache",
681#if DEBUG
682	.obj_size = sizeof(struct kmem_cache),
683#endif
684};
685
686#define BAD_ALIEN_MAGIC 0x01020304ul
687
688#ifdef CONFIG_LOCKDEP
689
690/*
691 * Slab sometimes uses the kmalloc slabs to store the slab headers
692 * for other slabs "off slab".
693 * The locking for this is tricky in that it nests within the locks
694 * of all other slabs in a few places; to deal with this special
695 * locking we put on-slab caches into a separate lock-class.
696 *
697 * We set lock class for alien array caches which are up during init.
698 * The lock annotation will be lost if all cpus of a node go down and
699 * then come back up during hotplug.
700 */
701static struct lock_class_key on_slab_l3_key;
702static struct lock_class_key on_slab_alc_key;
703
704static inline void init_lock_keys(void)
705
706{
707	int q;
708	struct cache_sizes *s = malloc_sizes;
709
710	while (s->cs_size != ULONG_MAX) {
711		for_each_node(q) {
712			struct array_cache **alc;
713			int r;
714			struct kmem_list3 *l3 = s->cs_cachep->nodelists[q];
715			if (!l3 || OFF_SLAB(s->cs_cachep))
716				continue;
717			lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
718			alc = l3->alien;
719			/*
720			 * FIXME: This check for BAD_ALIEN_MAGIC
721			 * should go away when common slab code is taught to
722			 * work even without alien caches.
723			 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
724			 * for alloc_alien_cache.
725			 */
726			if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
727				continue;
728			for_each_node(r) {
729				if (alc[r])
730					lockdep_set_class(&alc[r]->lock,
731					     &on_slab_alc_key);
732			}
733		}
734		s++;
735	}
736}
737#else
738static inline void init_lock_keys(void)
739{
740}
741#endif
742
743/*
744 * 1. Guard access to the cache-chain.
745 * 2. Protect sanity of cpu_online_map against cpu hotplug events
746 */
747static DEFINE_MUTEX(cache_chain_mutex);
748static struct list_head cache_chain;
749
750/*
751 * chicken and egg problem: delay the per-cpu array allocation
752 * until the general caches are up.
753 */
754static enum {
755	NONE,
756	PARTIAL_AC,
757	PARTIAL_L3,
758	FULL
759} g_cpucache_up;
760
761/*
762 * used by boot code to determine if it can use slab based allocator
763 */
764int slab_is_available(void)
765{
766	return g_cpucache_up == FULL;
767}
768
769static DEFINE_PER_CPU(struct delayed_work, reap_work);
770
771static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
772{
773	return cachep->array[smp_processor_id()];
774}
775
776static inline struct kmem_cache *__find_general_cachep(size_t size,
777							gfp_t gfpflags)
778{
779	struct cache_sizes *csizep = malloc_sizes;
780
781#if DEBUG
782	/* This happens if someone tries to call
783	 * kmem_cache_create(), or __kmalloc(), before
784	 * the generic caches are initialized.
785	 */
786	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
787#endif
788	while (size > csizep->cs_size)
789		csizep++;
790
791	/*
792	 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
793	 * has cs_{dma,}cachep==NULL. Thus no special case
794	 * for large kmalloc calls required.
795	 */
796	if (unlikely(gfpflags & GFP_DMA))
797		return csizep->cs_dmacachep;
798	return csizep->cs_cachep;
799}
800
801static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
802{
803	return __find_general_cachep(size, gfpflags);
804}
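
/*
 * Example (illustrative, assuming the default kmalloc_sizes.h table, which
 * includes 96 and 128 byte caches): __find_general_cachep(100, GFP_KERNEL)
 * walks malloc_sizes[] until cs_size >= 100 and returns the "size-128"
 * cache; with GFP_DMA in the flags it would return "size-128(DMA)" instead.
 */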
805
806static size_t slab_mgmt_size(size_t nr_objs, size_t align)
807{
808	return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
809}
810
811/*
812 * Calculate the number of objects and left-over bytes for a given buffer size.
813 */
814static void cache_estimate(unsigned long gfporder, size_t buffer_size,
815			   size_t align, int flags, size_t *left_over,
816			   unsigned int *num)
817{
818	int nr_objs;
819	size_t mgmt_size;
820	size_t slab_size = PAGE_SIZE << gfporder;
821
822	/*
823	 * The slab management structure can be either off the slab or
824	 * on it. For the latter case, the memory allocated for a
825	 * slab is used for:
826	 *
827	 * - The struct slab
828	 * - One kmem_bufctl_t for each object
829	 * - Padding to respect alignment of @align
830	 * - @buffer_size bytes for each object
831	 *
832	 * If the slab management structure is off the slab, then the
833	 * alignment will already be calculated into the size. Because
834	 * the slabs are all pages aligned, the objects will be at the
835	 * correct alignment when allocated.
836	 */
837	if (flags & CFLGS_OFF_SLAB) {
838		mgmt_size = 0;
839		nr_objs = slab_size / buffer_size;
840
841		if (nr_objs > SLAB_LIMIT)
842			nr_objs = SLAB_LIMIT;
843	} else {
844		/*
845		 * Ignore padding for the initial guess. The padding
846		 * is at most @align-1 bytes, and @buffer_size is at
847		 * least @align. In the worst case, this result will
848		 * be one greater than the number of objects that fit
849		 * into the memory allocation when taking the padding
850		 * into account.
851		 */
852		nr_objs = (slab_size - sizeof(struct slab)) /
853			  (buffer_size + sizeof(kmem_bufctl_t));
854
855		/*
856		 * This calculated number will be either the right
857		 * amount, or one greater than what we want.
858		 */
859		if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
860		       > slab_size)
861			nr_objs--;
862
863		if (nr_objs > SLAB_LIMIT)
864			nr_objs = SLAB_LIMIT;
865
866		mgmt_size = slab_mgmt_size(nr_objs, align);
867	}
868	*num = nr_objs;
869	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
870}
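
/*
 * Worked example (illustrative; assumes, for the arithmetic only,
 * sizeof(struct slab) == 48, sizeof(kmem_bufctl_t) == 4 and align == 64):
 * gfporder 0 gives a 4096 byte slab, and with buffer_size == 256 and
 * on-slab management:
 *
 *	initial guess:	(4096 - 48) / (256 + 4) = 15 objects
 *	mgmt_size:	ALIGN(48 + 15 * 4, 64) = 128
 *	check:		128 + 15 * 256 = 3968 <= 4096, so 15 objects fit
 *	left_over:	4096 - 15 * 256 - 128 = 128 bytes (used for colouring)
 */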
871
872#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
873
874static void __slab_error(const char *function, struct kmem_cache *cachep,
875			char *msg)
876{
877	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
878	       function, cachep->name, msg);
879	dump_stack();
880}
881
882/*
883 * By default on NUMA we use alien caches to stage the freeing of
884 * objects allocated from other nodes. This causes massive memory
885 * inefficiencies when using a fake NUMA setup to split memory into a
886 * large number of small nodes, so it can be disabled on the command
887 * line.
888 */
889
890static int use_alien_caches __read_mostly = 1;
891static int __init noaliencache_setup(char *s)
892{
893	use_alien_caches = 0;
894	return 1;
895}
896__setup("noaliencache", noaliencache_setup);
897
898#ifdef CONFIG_NUMA
899/*
900 * Special reaping functions for NUMA systems called from cache_reap().
901 * These take care of doing round robin flushing of alien caches (containing
902 * objects freed on different nodes from which they were allocated) and the
903 * flushing of remote pcps by calling drain_node_pages.
904 */
905static DEFINE_PER_CPU(unsigned long, reap_node);
906
907static void init_reap_node(int cpu)
908{
909	int node;
910
911	node = next_node(cpu_to_node(cpu), node_online_map);
912	if (node == MAX_NUMNODES)
913		node = first_node(node_online_map);
914
915	per_cpu(reap_node, cpu) = node;
916}
917
918static void next_reap_node(void)
919{
920	int node = __get_cpu_var(reap_node);
921
922	/*
923	 * Also drain per cpu pages on remote zones
924	 */
925	if (node != numa_node_id())
926		drain_node_pages(node);
927
928	node = next_node(node, node_online_map);
929	if (unlikely(node >= MAX_NUMNODES))
930		node = first_node(node_online_map);
931	__get_cpu_var(reap_node) = node;
932}
933
934#else
935#define init_reap_node(cpu) do { } while (0)
936#define next_reap_node(void) do { } while (0)
937#endif
938
939/*
940 * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
941 * via the workqueue/eventd.
942 * Add the CPU number into the expiration time to minimize the possibility of
943 * the CPUs getting into lockstep and contending for the global cache chain
944 * lock.
945 */
946static void __devinit start_cpu_timer(int cpu)
947{
948	struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
949
950	/*
951	 * When this gets called from do_initcalls via cpucache_init(),
952	 * init_workqueues() has already run, so keventd will be setup
953	 * at that time.
954	 */
955	if (keventd_up() && reap_work->work.func == NULL) {
956		init_reap_node(cpu);
957		INIT_DELAYED_WORK(reap_work, cache_reap);
958		schedule_delayed_work_on(cpu, reap_work,
959					__round_jiffies_relative(HZ, cpu));
960	}
961}
962
963static struct array_cache *alloc_arraycache(int node, int entries,
964					    int batchcount)
965{
966	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
967	struct array_cache *nc = NULL;
968
969	nc = kmalloc_node(memsize, GFP_KERNEL, node);
970	if (nc) {
971		nc->avail = 0;
972		nc->limit = entries;
973		nc->batchcount = batchcount;
974		nc->touched = 0;
975		spin_lock_init(&nc->lock);
976	}
977	return nc;
978}
979
980/*
981 * Transfer objects in one arraycache to another.
982 * Locking must be handled by the caller.
983 *
984 * Return the number of entries transferred.
985 */
986static int transfer_objects(struct array_cache *to,
987		struct array_cache *from, unsigned int max)
988{
989	/* Figure out how many entries to transfer */
990	int nr = min(min(from->avail, max), to->limit - to->avail);
991
992	if (!nr)
993		return 0;
994
995	memcpy(to->entry + to->avail, from->entry + from->avail -nr,
996			sizeof(void *) *nr);
997
998	from->avail -= nr;
999	to->avail += nr;
1000	to->touched = 1;
1001	return nr;
1002}
1003
1004#ifndef CONFIG_NUMA
1005
1006#define drain_alien_cache(cachep, alien) do { } while (0)
1007#define reap_alien(cachep, l3) do { } while (0)
1008
1009static inline struct array_cache **alloc_alien_cache(int node, int limit)
1010{
1011	return (struct array_cache **)BAD_ALIEN_MAGIC;
1012}
1013
1014static inline void free_alien_cache(struct array_cache **ac_ptr)
1015{
1016}
1017
1018static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1019{
1020	return 0;
1021}
1022
1023static inline void *alternate_node_alloc(struct kmem_cache *cachep,
1024		gfp_t flags)
1025{
1026	return NULL;
1027}
1028
1029static inline void *____cache_alloc_node(struct kmem_cache *cachep,
1030		 gfp_t flags, int nodeid)
1031{
1032	return NULL;
1033}
1034
1035#else	/* CONFIG_NUMA */
1036
1037static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
1038static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
1039
1040static struct array_cache **alloc_alien_cache(int node, int limit)
1041{
1042	struct array_cache **ac_ptr;
1043	int memsize = sizeof(void *) * MAX_NUMNODES;
1044	int i;
1045
1046	if (limit > 1)
1047		limit = 12;
1048	ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
1049	if (ac_ptr) {
1050		for_each_node(i) {
1051			if (i == node || !node_online(i)) {
1052				ac_ptr[i] = NULL;
1053				continue;
1054			}
1055			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
1056			if (!ac_ptr[i]) {
1057				for (i--; i >= 0; i--)
1058					kfree(ac_ptr[i]);
1059				kfree(ac_ptr);
1060				return NULL;
1061			}
1062		}
1063	}
1064	return ac_ptr;
1065}
1066
1067static void free_alien_cache(struct array_cache **ac_ptr)
1068{
1069	int i;
1070
1071	if (!ac_ptr)
1072		return;
1073	for_each_node(i)
1074	    kfree(ac_ptr[i]);
1075	kfree(ac_ptr);
1076}
1077
1078static void __drain_alien_cache(struct kmem_cache *cachep,
1079				struct array_cache *ac, int node)
1080{
1081	struct kmem_list3 *rl3 = cachep->nodelists[node];
1082
1083	if (ac->avail) {
1084		spin_lock(&rl3->list_lock);
1085		/*
1086		 * Stuff objects into the remote node's shared array first.
1087		 * That way we could avoid the overhead of putting the objects
1088		 * into the free lists and getting them back later.
1089		 */
1090		if (rl3->shared)
1091			transfer_objects(rl3->shared, ac, ac->limit);
1092
1093		free_block(cachep, ac->entry, ac->avail, node);
1094		ac->avail = 0;
1095		spin_unlock(&rl3->list_lock);
1096	}
1097}
1098
1099/*
1100 * Called from cache_reap() to regularly drain alien caches round robin.
1101 */
1102static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
1103{
1104	int node = __get_cpu_var(reap_node);
1105
1106	if (l3->alien) {
1107		struct array_cache *ac = l3->alien[node];
1108
1109		if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
1110			__drain_alien_cache(cachep, ac, node);
1111			spin_unlock_irq(&ac->lock);
1112		}
1113	}
1114}
1115
1116static void drain_alien_cache(struct kmem_cache *cachep,
1117				struct array_cache **alien)
1118{
1119	int i = 0;
1120	struct array_cache *ac;
1121	unsigned long flags;
1122
1123	for_each_online_node(i) {
1124		ac = alien[i];
1125		if (ac) {
1126			spin_lock_irqsave(&ac->lock, flags);
1127			__drain_alien_cache(cachep, ac, i);
1128			spin_unlock_irqrestore(&ac->lock, flags);
1129		}
1130	}
1131}
1132
1133static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1134{
1135	struct slab *slabp = virt_to_slab(objp);
1136	int nodeid = slabp->nodeid;
1137	struct kmem_list3 *l3;
1138	struct array_cache *alien = NULL;
1139	int node;
1140
1141	node = numa_node_id();
1142
1143	/*
1144	 * Make sure we are not freeing an object from another node to the array
1145	 * cache on this cpu.
1146	 */
1147	if (likely(slabp->nodeid == node) || unlikely(!use_alien_caches))
1148		return 0;
1149
1150	l3 = cachep->nodelists[node];
1151	STATS_INC_NODEFREES(cachep);
1152	if (l3->alien && l3->alien[nodeid]) {
1153		alien = l3->alien[nodeid];
1154		spin_lock(&alien->lock);
1155		if (unlikely(alien->avail == alien->limit)) {
1156			STATS_INC_ACOVERFLOW(cachep);
1157			__drain_alien_cache(cachep, alien, nodeid);
1158		}
1159		alien->entry[alien->avail++] = objp;
1160		spin_unlock(&alien->lock);
1161	} else {
1162		spin_lock(&(cachep->nodelists[nodeid])->list_lock);
1163		free_block(cachep, &objp, 1, nodeid);
1164		spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
1165	}
1166	return 1;
1167}
1168#endif
1169
1170static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1171				    unsigned long action, void *hcpu)
1172{
1173	long cpu = (long)hcpu;
1174	struct kmem_cache *cachep;
1175	struct kmem_list3 *l3 = NULL;
1176	int node = cpu_to_node(cpu);
1177	int memsize = sizeof(struct kmem_list3);
1178
1179	switch (action) {
1180	case CPU_UP_PREPARE:
1181		mutex_lock(&cache_chain_mutex);
1182		/*
1183		 * We need to do this right in the beginning since
1184		 * alloc_arraycache's are going to use this list.
1185		 * calls to alloc_arraycache() are going to use this list.
1186		 * kmalloc_node allows us to add the slab to the right
1187		 * kmem_list3 and not this cpu's kmem_list3.
1188
1189		list_for_each_entry(cachep, &cache_chain, next) {
1190			/*
1191			 * Set up the kmem_list3 for this cpu's node before we
1192			 * can begin anything. Make sure some other cpu on this
1193			 * node has not already allocated this.
1194			 */
1195			if (!cachep->nodelists[node]) {
1196				l3 = kmalloc_node(memsize, GFP_KERNEL, node);
1197				if (!l3)
1198					goto bad;
1199				kmem_list3_init(l3);
1200				l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
1201				    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1202
1203				/*
1204				 * The l3s don't come and go as CPUs come and
1205				 * go.  cache_chain_mutex is sufficient
1206				 * protection here.
1207				 */
1208				cachep->nodelists[node] = l3;
1209			}
1210
1211			spin_lock_irq(&cachep->nodelists[node]->list_lock);
1212			cachep->nodelists[node]->free_limit =
1213				(1 + nr_cpus_node(node)) *
1214				cachep->batchcount + cachep->num;
1215			spin_unlock_irq(&cachep->nodelists[node]->list_lock);
1216		}
1217
1218		/*
1219		 * Now we can go ahead with allocating the shared arrays and
1220		 * array caches.
1221		 */
1222		list_for_each_entry(cachep, &cache_chain, next) {
1223			struct array_cache *nc;
1224			struct array_cache *shared;
1225			struct array_cache **alien = NULL;
1226
1227			nc = alloc_arraycache(node, cachep->limit,
1228						cachep->batchcount);
1229			if (!nc)
1230				goto bad;
1231			shared = alloc_arraycache(node,
1232					cachep->shared * cachep->batchcount,
1233					0xbaadf00d);
1234			if (!shared)
1235				goto bad;
1236
1237			if (use_alien_caches) {
1238                                alien = alloc_alien_cache(node, cachep->limit);
1239                                if (!alien)
1240                                        goto bad;
1241                        }
1242			cachep->array[cpu] = nc;
1243			l3 = cachep->nodelists[node];
1244			BUG_ON(!l3);
1245
1246			spin_lock_irq(&l3->list_lock);
1247			if (!l3->shared) {
1248				/*
1249				 * We are serialised from CPU_DEAD or
1250				 * CPU_UP_CANCELLED by the cpucontrol lock
1251				 */
1252				l3->shared = shared;
1253				shared = NULL;
1254			}
1255#ifdef CONFIG_NUMA
1256			if (!l3->alien) {
1257				l3->alien = alien;
1258				alien = NULL;
1259			}
1260#endif
1261			spin_unlock_irq(&l3->list_lock);
1262			kfree(shared);
1263			free_alien_cache(alien);
1264		}
1265		break;
1266	case CPU_ONLINE:
1267		mutex_unlock(&cache_chain_mutex);
1268		start_cpu_timer(cpu);
1269		break;
1270#ifdef CONFIG_HOTPLUG_CPU
1271	case CPU_DOWN_PREPARE:
1272		mutex_lock(&cache_chain_mutex);
1273		break;
1274	case CPU_DOWN_FAILED:
1275		mutex_unlock(&cache_chain_mutex);
1276		break;
1277	case CPU_DEAD:
1278		/*
1279		 * Even if all the cpus of a node are down, we don't free the
1280		 * kmem_list3 of any cache. This is to avoid a race between
1281		 * cpu_down and a kmalloc allocation from another cpu for
1282		 * memory from the node of the cpu going down.  The list3
1283		 * structure is usually allocated from kmem_cache_create() and
1284		 * gets destroyed at kmem_cache_destroy().
1285		 */
1286		/* fall thru */
1287#endif
1288	case CPU_UP_CANCELED:
1289		list_for_each_entry(cachep, &cache_chain, next) {
1290			struct array_cache *nc;
1291			struct array_cache *shared;
1292			struct array_cache **alien;
1293			cpumask_t mask;
1294
1295			mask = node_to_cpumask(node);
1296			/* cpu is dead; no one can alloc from it. */
1297			nc = cachep->array[cpu];
1298			cachep->array[cpu] = NULL;
1299			l3 = cachep->nodelists[node];
1300
1301			if (!l3)
1302				goto free_array_cache;
1303
1304			spin_lock_irq(&l3->list_lock);
1305
1306			/* Free limit for this kmem_list3 */
1307			l3->free_limit -= cachep->batchcount;
1308			if (nc)
1309				free_block(cachep, nc->entry, nc->avail, node);
1310
1311			if (!cpus_empty(mask)) {
1312				spin_unlock_irq(&l3->list_lock);
1313				goto free_array_cache;
1314			}
1315
1316			shared = l3->shared;
1317			if (shared) {
1318				free_block(cachep, l3->shared->entry,
1319					   l3->shared->avail, node);
1320				l3->shared = NULL;
1321			}
1322
1323			alien = l3->alien;
1324			l3->alien = NULL;
1325
1326			spin_unlock_irq(&l3->list_lock);
1327
1328			kfree(shared);
1329			if (alien) {
1330				drain_alien_cache(cachep, alien);
1331				free_alien_cache(alien);
1332			}
1333free_array_cache:
1334			kfree(nc);
1335		}
1336		/*
1337		 * In the previous loop, all the objects were freed to
1338		 * the respective cache's slabs; now we can go ahead and
1339		 * shrink each nodelist to its limit.
1340		 */
1341		list_for_each_entry(cachep, &cache_chain, next) {
1342			l3 = cachep->nodelists[node];
1343			if (!l3)
1344				continue;
1345			drain_freelist(cachep, l3, l3->free_objects);
1346		}
1347		mutex_unlock(&cache_chain_mutex);
1348		break;
1349	}
1350	return NOTIFY_OK;
1351bad:
1352	return NOTIFY_BAD;
1353}
1354
1355static struct notifier_block __cpuinitdata cpucache_notifier = {
1356	&cpuup_callback, NULL, 0
1357};
1358
1359/*
1360 * swap the static kmem_list3 with kmalloced memory
1361 */
1362static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1363			int nodeid)
1364{
1365	struct kmem_list3 *ptr;
1366
1367	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
1368	BUG_ON(!ptr);
1369
1370	local_irq_disable();
1371	memcpy(ptr, list, sizeof(struct kmem_list3));
1372	/*
1373	 * Do not assume that spinlocks can be initialized via memcpy:
1374	 */
1375	spin_lock_init(&ptr->list_lock);
1376
1377	MAKE_ALL_LISTS(cachep, ptr, nodeid);
1378	cachep->nodelists[nodeid] = ptr;
1379	local_irq_enable();
1380}
1381
1382/*
1383 * Initialisation.  Called after the page allocator has been initialised and
1384 * before smp_init().
1385 */
1386void __init kmem_cache_init(void)
1387{
1388	size_t left_over;
1389	struct cache_sizes *sizes;
1390	struct cache_names *names;
1391	int i;
1392	int order;
1393	int node;
1394
1395	for (i = 0; i < NUM_INIT_LISTS; i++) {
1396		kmem_list3_init(&initkmem_list3[i]);
1397		if (i < MAX_NUMNODES)
1398			cache_cache.nodelists[i] = NULL;
1399	}
1400
1401	/*
1402	 * Fragmentation resistance on low memory - only use bigger
1403	 * page orders on machines with more than 32MB of memory.
1404	 */
1405	if (num_physpages > (32 << 20) >> PAGE_SHIFT)
1406		slab_break_gfp_order = BREAK_GFP_ORDER_HI;
1407
1408	/* Bootstrap is tricky, because several objects are allocated
1409	 * from caches that do not exist yet:
1410	 * 1) initialize the cache_cache cache: it contains the struct
1411	 *    kmem_cache structures of all caches, except cache_cache itself:
1412	 *    cache_cache is statically allocated.
1413	 *    Initially an __init data area is used for the head array and the
1414	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
1415	 *    array at the end of the bootstrap.
1416	 * 2) Create the first kmalloc cache.
1417	 *    The struct kmem_cache for the new cache is allocated normally.
1418	 *    An __init data area is used for the head array.
1419	 * 3) Create the remaining kmalloc caches, with minimally sized
1420	 *    head arrays.
1421	 * 4) Replace the __init data head arrays for cache_cache and the first
1422	 *    kmalloc cache with kmalloc allocated arrays.
1423	 * 5) Replace the __init data for kmem_list3 for cache_cache and
1424	 *    the other caches with kmalloc allocated memory.
1425	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1426	 */
1427
1428	node = numa_node_id();
1429
1430	/* 1) create the cache_cache */
1431	INIT_LIST_HEAD(&cache_chain);
1432	list_add(&cache_cache.next, &cache_chain);
1433	cache_cache.colour_off = cache_line_size();
1434	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1435	cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE];
1436
1437	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1438					cache_line_size());
1439	cache_cache.reciprocal_buffer_size =
1440		reciprocal_value(cache_cache.buffer_size);
1441
1442	for (order = 0; order < MAX_ORDER; order++) {
1443		cache_estimate(order, cache_cache.buffer_size,
1444			cache_line_size(), 0, &left_over, &cache_cache.num);
1445		if (cache_cache.num)
1446			break;
1447	}
1448	BUG_ON(!cache_cache.num);
1449	cache_cache.gfporder = order;
1450	cache_cache.colour = left_over / cache_cache.colour_off;
1451	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
1452				      sizeof(struct slab), cache_line_size());
1453
1454	/* 2+3) create the kmalloc caches */
1455	sizes = malloc_sizes;
1456	names = cache_names;
1457
1458	/*
1459	 * Initialize the caches that provide memory for the array cache and the
1460	 * kmem_list3 structures first.  Without this, further allocations will
1461	 * bug.
1462	 */
1463
1464	sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1465					sizes[INDEX_AC].cs_size,
1466					ARCH_KMALLOC_MINALIGN,
1467					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1468					NULL, NULL);
1469
1470	if (INDEX_AC != INDEX_L3) {
1471		sizes[INDEX_L3].cs_cachep =
1472			kmem_cache_create(names[INDEX_L3].name,
1473				sizes[INDEX_L3].cs_size,
1474				ARCH_KMALLOC_MINALIGN,
1475				ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1476				NULL, NULL);
1477	}
1478
1479	slab_early_init = 0;
1480
1481	while (sizes->cs_size != ULONG_MAX) {
1482		/*
1483		 * For performance, all the general caches are L1 aligned.
1484		 * This should be particularly beneficial on SMP boxes, as it
1485		 * eliminates "false sharing".
1486		 * Note: for systems short on memory, removing the alignment will
1487		 * allow tighter packing of the smaller caches.
1488		 */
1489		if (!sizes->cs_cachep) {
1490			sizes->cs_cachep = kmem_cache_create(names->name,
1491					sizes->cs_size,
1492					ARCH_KMALLOC_MINALIGN,
1493					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1494					NULL, NULL);
1495		}
1496
1497		sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
1498					sizes->cs_size,
1499					ARCH_KMALLOC_MINALIGN,
1500					ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
1501						SLAB_PANIC,
1502					NULL, NULL);
1503		sizes++;
1504		names++;
1505	}
1506	/* 4) Replace the bootstrap head arrays */
1507	{
1508		struct array_cache *ptr;
1509
1510		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1511
1512		local_irq_disable();
1513		BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
1514		memcpy(ptr, cpu_cache_get(&cache_cache),
1515		       sizeof(struct arraycache_init));
1516		/*
1517		 * Do not assume that spinlocks can be initialized via memcpy:
1518		 */
1519		spin_lock_init(&ptr->lock);
1520
1521		cache_cache.array[smp_processor_id()] = ptr;
1522		local_irq_enable();
1523
1524		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1525
1526		local_irq_disable();
1527		BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
1528		       != &initarray_generic.cache);
1529		memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
1530		       sizeof(struct arraycache_init));
1531		/*
1532		 * Do not assume that spinlocks can be initialized via memcpy:
1533		 */
1534		spin_lock_init(&ptr->lock);
1535
1536		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1537		    ptr;
1538		local_irq_enable();
1539	}
1540	/* 5) Replace the bootstrap kmem_list3's */
1541	{
1542		int nid;
1543
1544		/* Replace the static kmem_list3 structures for the boot cpu */
1545		init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node);
1546
1547		for_each_online_node(nid) {
1548			init_list(malloc_sizes[INDEX_AC].cs_cachep,
1549				  &initkmem_list3[SIZE_AC + nid], nid);
1550
1551			if (INDEX_AC != INDEX_L3) {
1552				init_list(malloc_sizes[INDEX_L3].cs_cachep,
1553					  &initkmem_list3[SIZE_L3 + nid], nid);
1554			}
1555		}
1556	}
1557
1558	/* 6) resize the head arrays to their final sizes */
1559	{
1560		struct kmem_cache *cachep;
1561		mutex_lock(&cache_chain_mutex);
1562		list_for_each_entry(cachep, &cache_chain, next)
1563			if (enable_cpucache(cachep))
1564				BUG();
1565		mutex_unlock(&cache_chain_mutex);
1566	}
1567
1568	/* Annotate slab for lockdep -- annotate the malloc caches */
1569	init_lock_keys();
1570
1571
1572	/* Done! */
1573	g_cpucache_up = FULL;
1574
1575	/*
1576	 * Register a cpu startup notifier callback that initializes
1577	 * cpu_cache_get for all new cpus
1578	 */
1579	register_cpu_notifier(&cpucache_notifier);
1580
1581	/*
1582	 * The reap timers are started later, with a module init call: That part
1583	 * of the kernel is not yet operational.
1584	 */
1585}
1586
1587static int __init cpucache_init(void)
1588{
1589	int cpu;
1590
1591	/*
1592	 * Register the timers that return unneeded pages to the page allocator
1593	 */
1594	for_each_online_cpu(cpu)
1595		start_cpu_timer(cpu);
1596	return 0;
1597}
1598__initcall(cpucache_init);
1599
1600/*
1601 * Interface to system's page allocator. No need to hold the cache-lock.
1602 *
1603 * If we requested dmaable memory, we will get it. Even if we
1604 * did not request dmaable memory, we might get it, but that
1605 * would be relatively rare and ignorable.
1606 */
1607static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1608{
1609	struct page *page;
1610	int nr_pages;
1611	int i;
1612
1613#ifndef CONFIG_MMU
1614	/*
1615	 * Nommu uses slabs for process anonymous memory allocations, and thus
1616	 * requires __GFP_COMP to properly refcount higher order allocations.
1617	 */
1618	flags |= __GFP_COMP;
1619#endif
1620
1621	flags |= cachep->gfpflags;
1622
1623	page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1624	if (!page)
1625		return NULL;
1626
1627	nr_pages = (1 << cachep->gfporder);
1628	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1629		add_zone_page_state(page_zone(page),
1630			NR_SLAB_RECLAIMABLE, nr_pages);
1631	else
1632		add_zone_page_state(page_zone(page),
1633			NR_SLAB_UNRECLAIMABLE, nr_pages);
1634	for (i = 0; i < nr_pages; i++)
1635		__SetPageSlab(page + i);
1636	return page_address(page);
1637}
1638
1639/*
1640 * Interface to system's page release.
1641 */
1642static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1643{
1644	unsigned long i = (1 << cachep->gfporder);
1645	struct page *page = virt_to_page(addr);
1646	const unsigned long nr_freed = i;
1647
1648	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1649		sub_zone_page_state(page_zone(page),
1650				NR_SLAB_RECLAIMABLE, nr_freed);
1651	else
1652		sub_zone_page_state(page_zone(page),
1653				NR_SLAB_UNRECLAIMABLE, nr_freed);
1654	while (i--) {
1655		BUG_ON(!PageSlab(page));
1656		__ClearPageSlab(page);
1657		page++;
1658	}
1659	if (current->reclaim_state)
1660		current->reclaim_state->reclaimed_slab += nr_freed;
1661	free_pages((unsigned long)addr, cachep->gfporder);
1662}
1663
1664static void kmem_rcu_free(struct rcu_head *head)
1665{
1666	struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1667	struct kmem_cache *cachep = slab_rcu->cachep;
1668
1669	kmem_freepages(cachep, slab_rcu->addr);
1670	if (OFF_SLAB(cachep))
1671		kmem_cache_free(cachep->slabp_cache, slab_rcu);
1672}
1673
1674#if DEBUG
1675
1676#ifdef CONFIG_DEBUG_PAGEALLOC
1677static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1678			    unsigned long caller)
1679{
1680	int size = obj_size(cachep);
1681
1682	addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
1683
1684	if (size < 5 * sizeof(unsigned long))
1685		return;
1686
1687	*addr++ = 0x12345678;
1688	*addr++ = caller;
1689	*addr++ = smp_processor_id();
1690	size -= 3 * sizeof(unsigned long);
1691	{
1692		unsigned long *sptr = &caller;
1693		unsigned long svalue;
1694
1695		while (!kstack_end(sptr)) {
1696			svalue = *sptr++;
1697			if (kernel_text_address(svalue)) {
1698				*addr++ = svalue;
1699				size -= sizeof(unsigned long);
1700				if (size <= sizeof(unsigned long))
1701					break;
1702			}
1703		}
1704
1705	}
1706	*addr++ = 0x87654321;
1707}
1708#endif
1709
1710static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1711{
1712	int size = obj_size(cachep);
1713	addr = &((char *)addr)[obj_offset(cachep)];
1714
1715	memset(addr, val, size);
1716	*(unsigned char *)(addr + size - 1) = POISON_END;
1717}
1718
1719static void dump_line(char *data, int offset, int limit)
1720{
1721	int i;
1722	unsigned char error = 0;
1723	int bad_count = 0;
1724
1725	printk(KERN_ERR "%03x:", offset);
1726	for (i = 0; i < limit; i++) {
1727		if (data[offset + i] != POISON_FREE) {
1728			error = data[offset + i];
1729			bad_count++;
1730		}
1731		printk(" %02x", (unsigned char)data[offset + i]);
1732	}
1733	printk("\n");
1734
1735	if (bad_count == 1) {
1736		error ^= POISON_FREE;
1737		if (!(error & (error - 1))) {
1738			printk(KERN_ERR "Single bit error detected. Probably "
1739					"bad RAM.\n");
1740#ifdef CONFIG_X86
1741			printk(KERN_ERR "Run memtest86+ or a similar memory "
1742					"test tool.\n");
1743#else
1744			printk(KERN_ERR "Run a memory test tool.\n");
1745#endif
1746		}
1747	}
1748}
1749#endif
1750
1751#if DEBUG
1752
1753static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1754{
1755	int i, size;
1756	char *realobj;
1757
1758	if (cachep->flags & SLAB_RED_ZONE) {
1759		printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
1760			*dbg_redzone1(cachep, objp),
1761			*dbg_redzone2(cachep, objp));
1762	}
1763
1764	if (cachep->flags & SLAB_STORE_USER) {
1765		printk(KERN_ERR "Last user: [<%p>]",
1766			*dbg_userword(cachep, objp));
1767		print_symbol("(%s)",
1768				(unsigned long)*dbg_userword(cachep, objp));
1769		printk("\n");
1770	}
1771	realobj = (char *)objp + obj_offset(cachep);
1772	size = obj_size(cachep);
1773	for (i = 0; i < size && lines; i += 16, lines--) {
1774		int limit;
1775		limit = 16;
1776		if (i + limit > size)
1777			limit = size - i;
1778		dump_line(realobj, i, limit);
1779	}
1780}
1781
1782static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1783{
1784	char *realobj;
1785	int size, i;
1786	int lines = 0;
1787
1788	realobj = (char *)objp + obj_offset(cachep);
1789	size = obj_size(cachep);
1790
1791	for (i = 0; i < size; i++) {
1792		char exp = POISON_FREE;
1793		if (i == size - 1)
1794			exp = POISON_END;
1795		if (realobj[i] != exp) {
1796			int limit;
1797			/* Mismatch ! */
1798			/* Print header */
1799			if (lines == 0) {
1800				printk(KERN_ERR
1801					"Slab corruption: start=%p, len=%d\n",
1802					realobj, size);
1803				print_objinfo(cachep, objp, 0);
1804			}
1805			/* Hexdump the affected line */
1806			i = (i / 16) * 16;
1807			limit = 16;
1808			if (i + limit > size)
1809				limit = size - i;
1810			dump_line(realobj, i, limit);
1811			i += 16;
1812			lines++;
1813			/* Limit to 5 lines */
1814			if (lines > 5)
1815				break;
1816		}
1817	}
1818	if (lines != 0) {
1819		/* Print some data about the neighboring objects, if they
1820		 * exist:
1821		 */
1822		struct slab *slabp = virt_to_slab(objp);
1823		unsigned int objnr;
1824
1825		objnr = obj_to_index(cachep, slabp, objp);
1826		if (objnr) {
1827			objp = index_to_obj(cachep, slabp, objnr - 1);
1828			realobj = (char *)objp + obj_offset(cachep);
1829			printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1830			       realobj, size);
1831			print_objinfo(cachep, objp, 2);
1832		}
1833		if (objnr + 1 < cachep->num) {
1834			objp = index_to_obj(cachep, slabp, objnr + 1);
1835			realobj = (char *)objp + obj_offset(cachep);
1836			printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1837			       realobj, size);
1838			print_objinfo(cachep, objp, 2);
1839		}
1840	}
1841}
1842#endif
1843
1844#if DEBUG
1845/**
1846 * slab_destroy_objs - run the destructors and debug checks on a slab's objects
1847 * @cachep: cache pointer being destroyed
1848 * @slabp: slab pointer being destroyed
1849 *
1850 * Call the registered destructor for each object in a slab that is being
1851 * destroyed.
1852 */
1853static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1854{
1855	int i;
1856	for (i = 0; i < cachep->num; i++) {
1857		void *objp = index_to_obj(cachep, slabp, i);
1858
1859		if (cachep->flags & SLAB_POISON) {
1860#ifdef CONFIG_DEBUG_PAGEALLOC
1861			if (cachep->buffer_size % PAGE_SIZE == 0 &&
1862					OFF_SLAB(cachep))
1863				kernel_map_pages(virt_to_page(objp),
1864					cachep->buffer_size / PAGE_SIZE, 1);
1865			else
1866				check_poison_obj(cachep, objp);
1867#else
1868			check_poison_obj(cachep, objp);
1869#endif
1870		}
1871		if (cachep->flags & SLAB_RED_ZONE) {
1872			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1873				slab_error(cachep, "start of a freed object "
1874					   "was overwritten");
1875			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1876				slab_error(cachep, "end of a freed object "
1877					   "was overwritten");
1878		}
1879		if (cachep->dtor && !(cachep->flags & SLAB_POISON))
1880			(cachep->dtor) (objp + obj_offset(cachep), cachep, 0);
1881	}
1882}
1883#else
1884static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1885{
1886	if (cachep->dtor) {
1887		int i;
1888		for (i = 0; i < cachep->num; i++) {
1889			void *objp = index_to_obj(cachep, slabp, i);
1890			(cachep->dtor) (objp, cachep, 0);
1891		}
1892	}
1893}
1894#endif
1895
1896/**
1897 * slab_destroy - destroy and release all objects in a slab
1898 * @cachep: cache pointer being destroyed
1899 * @slabp: slab pointer being destroyed
1900 *
1901 * Destroy all the objs in a slab, and release the mem back to the system.
1902 * Before calling the slab must have been unlinked from the cache.  The
1903 * cache-lock is not held/needed.
1904 */
1905static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1906{
1907	void *addr = slabp->s_mem - slabp->colouroff;
1908
1909	slab_destroy_objs(cachep, slabp);
1910	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1911		struct slab_rcu *slab_rcu;
1912
1913		slab_rcu = (struct slab_rcu *)slabp;
1914		slab_rcu->cachep = cachep;
1915		slab_rcu->addr = addr;
1916		call_rcu(&slab_rcu->head, kmem_rcu_free);
1917	} else {
1918		kmem_freepages(cachep, addr);
1919		if (OFF_SLAB(cachep))
1920			kmem_cache_free(cachep->slabp_cache, slabp);
1921	}
1922}
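
/*
 * Note on SLAB_DESTROY_BY_RCU (a hedged sketch, not taken from this file's
 * documentation): because the pages backing such a slab are only handed to
 * kmem_rcu_free() via call_rcu() above, they are not returned to the page
 * allocator until an RCU grace period has elapsed.  A lock-free reader may
 * therefore safely dereference an object it found under rcu_read_lock(),
 * but the object itself can have been freed and reused for a different
 * object of the same cache, so the reader must re-validate identity,
 * roughly (lookup() and the key field are hypothetical):
 *
 *	rcu_read_lock();
 *	obj = lookup(key);
 *	if (obj && obj->key == key)
 *		use(obj);
 *	rcu_read_unlock();
 */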
1923
1924/*
1925 * For setting up all the kmem_list3s for a cache whose buffer_size is the
1926 * same as the size of kmem_list3.
1927 */
1928static void set_up_list3s(struct kmem_cache *cachep, int index)
1929{
1930	int node;
1931
1932	for_each_online_node(node) {
1933		cachep->nodelists[node] = &initkmem_list3[index + node];
1934		cachep->nodelists[node]->next_reap = jiffies +
1935		    REAPTIMEOUT_LIST3 +
1936		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1937	}
1938}
1939
1940static void __kmem_cache_destroy(struct kmem_cache *cachep)
1941{
1942	int i;
1943	struct kmem_list3 *l3;
1944
1945	for_each_online_cpu(i)
1946	    kfree(cachep->array[i]);
1947
1948	/* NUMA: free the list3 structures */
1949	for_each_online_node(i) {
1950		l3 = cachep->nodelists[i];
1951		if (l3) {
1952			kfree(l3->shared);
1953			free_alien_cache(l3->alien);
1954			kfree(l3);
1955		}
1956	}
1957	kmem_cache_free(&cache_cache, cachep);
1958}
1959
1960
1961/**
1962 * calculate_slab_order - calculate size (page order) of slabs
1963 * @cachep: pointer to the cache that is being created
1964 * @size: size of objects to be created in this cache.
1965 * @align: required alignment for the objects.
1966 * @flags: slab allocation flags
1967 *
1968 * Also calculates the number of objects per slab.
1969 *
1970 * This could be made much more intelligent.  For now, try to avoid using
1971 * high order pages for slabs.  When the gfp() functions are more friendly
1972 * towards high-order requests, this should be changed.
1973 */
1974static size_t calculate_slab_order(struct kmem_cache *cachep,
1975			size_t size, size_t align, unsigned long flags)
1976{
1977	unsigned long offslab_limit;
1978	size_t left_over = 0;
1979	int gfporder;
1980
1981	for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) {
1982		unsigned int num;
1983		size_t remainder;
1984
1985		cache_estimate(gfporder, size, align, flags, &remainder, &num);
1986		if (!num)
1987			continue;
1988
1989		if (flags & CFLGS_OFF_SLAB) {
1990			/*
1991			 * Max number of objs-per-slab for caches which
1992			 * use off-slab slabs. Needed to avoid a possible
1993			 * looping condition in cache_grow().
1994			 */
1995			offslab_limit = size - sizeof(struct slab);
1996			offslab_limit /= sizeof(kmem_bufctl_t);
1997
1998			if (num > offslab_limit)
1999				break;
2000		}
2001
2002		/* Found something acceptable - save it away */
2003		cachep->num = num;
2004		cachep->gfporder = gfporder;
2005		left_over = remainder;
2006
2007		/*
2008		 * A VFS-reclaimable slab tends to have most allocations
2009		 * as GFP_NOFS and we really don't want to have to be allocating
2010		 * higher-order pages when we are unable to shrink dcache.
2011		 */
2012		if (flags & SLAB_RECLAIM_ACCOUNT)
2013			break;
2014
2015		/*
2016		 * Large number of objects is good, but very large slabs are
2017		 * currently bad for the gfp()s.
2018		 */
2019		if (gfporder >= slab_break_gfp_order)
2020			break;
2021
2022		/*
2023		 * Acceptable internal fragmentation?
2024		 */
2025		if (left_over * 8 <= (PAGE_SIZE << gfporder))
2026			break;
2027	}
2028	return left_over;
2029}
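
/*
 * Worked example of the fragmentation check above (hypothetical numbers,
 * assuming PAGE_SIZE == 4096): at gfporder 0 the slab is 4096 bytes, so the
 * order is accepted once left_over * 8 <= 4096, i.e. at most 512 wasted
 * bytes (1/8 of the slab).  If cache_estimate() reported, say, a remainder
 * of 300 bytes, 300 * 8 = 2400 <= 4096 and the loop stops at order 0; a
 * remainder of 700 bytes (5600 > 4096) would make the loop try order 1,
 * where the acceptable waste doubles to 1024 bytes.
 */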
2030
2031static int setup_cpu_cache(struct kmem_cache *cachep)
2032{
2033	if (g_cpucache_up == FULL)
2034		return enable_cpucache(cachep);
2035
2036	if (g_cpucache_up == NONE) {
2037		/*
2038		 * Note: the first kmem_cache_create must create the cache
2039		 * that's used by kmalloc(24), otherwise the creation of
2040		 * further caches will BUG().
2041		 */
2042		cachep->array[smp_processor_id()] = &initarray_generic.cache;
2043
2044		/*
2045		 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
2046		 * the first cache, then we need to set up all its list3s,
2047		 * otherwise the creation of further caches will BUG().
2048		 */
2049		set_up_list3s(cachep, SIZE_AC);
2050		if (INDEX_AC == INDEX_L3)
2051			g_cpucache_up = PARTIAL_L3;
2052		else
2053			g_cpucache_up = PARTIAL_AC;
2054	} else {
2055		cachep->array[smp_processor_id()] =
2056			kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
2057
2058		if (g_cpucache_up == PARTIAL_AC) {
2059			set_up_list3s(cachep, SIZE_L3);
2060			g_cpucache_up = PARTIAL_L3;
2061		} else {
2062			int node;
2063			for_each_online_node(node) {
2064				cachep->nodelists[node] =
2065				    kmalloc_node(sizeof(struct kmem_list3),
2066						GFP_KERNEL, node);
2067				BUG_ON(!cachep->nodelists[node]);
2068				kmem_list3_init(cachep->nodelists[node]);
2069			}
2070		}
2071	}
2072	cachep->nodelists[numa_node_id()]->next_reap =
2073			jiffies + REAPTIMEOUT_LIST3 +
2074			((unsigned long)cachep) % REAPTIMEOUT_LIST3;
2075
2076	cpu_cache_get(cachep)->avail = 0;
2077	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
2078	cpu_cache_get(cachep)->batchcount = 1;
2079	cpu_cache_get(cachep)->touched = 0;
2080	cachep->batchcount = 1;
2081	cachep->limit = BOOT_CPUCACHE_ENTRIES;
2082	return 0;
2083}
2084
2085/**
2086 * kmem_cache_create - Create a cache.
2087 * @name: A string which is used in /proc/slabinfo to identify this cache.
2088 * @size: The size of objects to be created in this cache.
2089 * @align: The required alignment for the objects.
2090 * @flags: SLAB flags
2091 * @ctor: A constructor for the objects.
2092 * @dtor: A destructor for the objects.
2093 *
2094 * Returns a ptr to the cache on success, NULL on failure.
2095 * Cannot be called within an interrupt, but can be interrupted.
2096 * The @ctor is run when new pages are allocated by the cache
2097 * and the @dtor is run before the pages are handed back.
2098 *
2099 * @name must be valid until the cache is destroyed. This implies that
2100 * the module calling this has to destroy the cache before getting unloaded.
2101 *
2102 * The flags are
2103 *
2104 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
2105 * to catch references to uninitialised memory.
2106 *
2107 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
2108 * for buffer overruns.
2109 *
2110 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
2111 * cacheline.  This can be beneficial if you're counting cycles as closely
2112 * as davem.
2113 */
2114struct kmem_cache *
2115kmem_cache_create (const char *name, size_t size, size_t align,
2116	unsigned long flags,
2117	void (*ctor)(void*, struct kmem_cache *, unsigned long),
2118	void (*dtor)(void*, struct kmem_cache *, unsigned long))
2119{
2120	size_t left_over, slab_size, ralign;
2121	struct kmem_cache *cachep = NULL, *pc;
2122
2123	/*
2124	 * Sanity checks... these are all serious usage bugs.
2125	 */
2126	if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
2127	    (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
2128		printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
2129				name);
2130		BUG();
2131	}
2132
2133	/*
2134	 * We use cache_chain_mutex to ensure a consistent view of
2135	 * cpu_online_map as well.  Please see cpuup_callback
2136	 */
2137	mutex_lock(&cache_chain_mutex);
2138
2139	list_for_each_entry(pc, &cache_chain, next) {
2140		char tmp;
2141		int res;
2142
2143		/*
2144		 * This happens when the module gets unloaded and doesn't
2145		 * destroy its slab cache and no-one else reuses the vmalloc
2146		 * area of the module.  Print a warning.
2147		 */
2148		res = probe_kernel_address(pc->name, tmp);
2149		if (res) {
2150			printk("SLAB: cache with size %d has lost its name\n",
2151			       pc->buffer_size);
2152			continue;
2153		}
2154
2155		if (!strcmp(pc->name, name)) {
2156			printk("kmem_cache_create: duplicate cache %s\n", name);
2157			dump_stack();
2158			goto oops;
2159		}
2160	}
2161
2162#if DEBUG
2163	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
2164	if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
2165		/* No constructor, but initial state check requested */
2166		printk(KERN_ERR "%s: No con, but init state check "
2167		       "requested - %s\n", __FUNCTION__, name);
2168		flags &= ~SLAB_DEBUG_INITIAL;
2169	}
2170#if FORCED_DEBUG
2171	/*
2172	 * Enable redzoning and last user accounting, except for caches with
2173	 * large objects, if the increased size would increase the object size
2174	 * above the next power of two: caches with object sizes just above a
2175	 * power of two have a significant amount of internal fragmentation.
2176	 */
2177	if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD))
2178		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
2179	if (!(flags & SLAB_DESTROY_BY_RCU))
2180		flags |= SLAB_POISON;
2181#endif
2182	if (flags & SLAB_DESTROY_BY_RCU)
2183		BUG_ON(flags & SLAB_POISON);
2184#endif
2185	if (flags & SLAB_DESTROY_BY_RCU)
2186		BUG_ON(dtor);
2187
2188	/*
2189	 * Always check flags; a caller might be expecting debug support which
2190	 * isn't available.
2191	 */
2192	BUG_ON(flags & ~CREATE_MASK);
2193
2194	/*
2195	 * Check that size is in terms of words.  This is needed to avoid
2196	 * unaligned accesses for some archs when redzoning is used, and makes
2197	 * sure any on-slab bufctl's are also correctly aligned.
2198	 */
2199	if (size & (BYTES_PER_WORD - 1)) {
2200		size += (BYTES_PER_WORD - 1);
2201		size &= ~(BYTES_PER_WORD - 1);
2202	}
2203
2204	/* calculate the final buffer alignment: */
2205
2206	/* 1) arch recommendation: can be overridden for debug */
2207	if (flags & SLAB_HWCACHE_ALIGN) {
2208		/*
2209		 * Default alignment: as specified by the arch code.  Except if
2210		 * an object is really small, then squeeze multiple objects into
2211		 * one cacheline.
2212		 */
2213		ralign = cache_line_size();
2214		while (size <= ralign / 2)
2215			ralign /= 2;
2216	} else {
2217		ralign = BYTES_PER_WORD;
2218	}
2219
2220	/*
2221	 * Redzoning and user store require word alignment. Note this will be
2222	 * overridden by architecture or caller mandated alignment if either
2223	 * is greater than BYTES_PER_WORD.
2224	 */
2225	if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER)
2226		ralign = BYTES_PER_WORD;
2227
2228	/* 2) arch mandated alignment */
2229	if (ralign < ARCH_SLAB_MINALIGN) {
2230		ralign = ARCH_SLAB_MINALIGN;
2231	}
2232	/* 3) caller mandated alignment */
2233	if (ralign < align) {
2234		ralign = align;
2235	}
2236	/* disable debug if necessary */
2237	if (ralign > BYTES_PER_WORD)
2238		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2239	/*
2240	 * 4) Store it.
2241	 */
2242	align = ralign;
2243
2244	/* Get cache's description obj. */
2245	cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL);
2246	if (!cachep)
2247		goto oops;
2248
2249#if DEBUG
2250	cachep->obj_size = size;
2251
2252	/*
2253	 * Both debugging options require word-alignment which is calculated
2254	 * into align above.
2255	 */
2256	if (flags & SLAB_RED_ZONE) {
2257		/* add space for red zone words */
2258		cachep->obj_offset += BYTES_PER_WORD;
2259		size += 2 * BYTES_PER_WORD;
2260	}
2261	if (flags & SLAB_STORE_USER) {
2262		/* user store requires one word storage behind the end of
2263		 * the real object.
2264		 */
2265		size += BYTES_PER_WORD;
2266	}
2267#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2268	if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
2269	    && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
2270		cachep->obj_offset += PAGE_SIZE - size;
2271		size = PAGE_SIZE;
2272	}
2273#endif
2274#endif
2275
2276	/*
2277	 * Determine if the slab management is 'on' or 'off' slab.
2278	 * (bootstrapping cannot cope with offslab caches so don't do
2279	 * it too early on.)
2280	 */
2281	if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
2282		/*
2283		 * Size is large, assume best to place the slab management obj
2284		 * off-slab (should allow better packing of objs).
2285		 */
2286		flags |= CFLGS_OFF_SLAB;
2287
2288	size = ALIGN(size, align);
2289
2290	left_over = calculate_slab_order(cachep, size, align, flags);
2291
2292	if (!cachep->num) {
2293		printk("kmem_cache_create: couldn't create cache %s.\n", name);
2294		kmem_cache_free(&cache_cache, cachep);
2295		cachep = NULL;
2296		goto oops;
2297	}
2298	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
2299			  + sizeof(struct slab), align);
2300
2301	/*
2302	 * If the slab has been placed off-slab, and we have enough space then
2303	 * move it on-slab. This is at the expense of any extra colouring.
2304	 */
2305	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
2306		flags &= ~CFLGS_OFF_SLAB;
2307		left_over -= slab_size;
2308	}
2309
2310	if (flags & CFLGS_OFF_SLAB) {
2311		/* really off slab. No need for manual alignment */
2312		slab_size =
2313		    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
2314	}
2315
2316	cachep->colour_off = cache_line_size();
2317	/* Offset must be a multiple of the alignment. */
2318	if (cachep->colour_off < align)
2319		cachep->colour_off = align;
2320	cachep->colour = left_over / cachep->colour_off;
2321	cachep->slab_size = slab_size;
2322	cachep->flags = flags;
2323	cachep->gfpflags = 0;
2324	if (flags & SLAB_CACHE_DMA)
2325		cachep->gfpflags |= GFP_DMA;
2326	cachep->buffer_size = size;
2327	cachep->reciprocal_buffer_size = reciprocal_value(size);
2328
2329	if (flags & CFLGS_OFF_SLAB) {
2330		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
2331		/*
2332		 * This is a possibility for one of the malloc_sizes caches.
2333		 * But since we go off slab only for object size greater than
2334		 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
2335		 * this should not happen at all.
2336		 * But leave a BUG_ON for some lucky dude.
2337		 */
2338		BUG_ON(!cachep->slabp_cache);
2339	}
2340	cachep->ctor = ctor;
2341	cachep->dtor = dtor;
2342	cachep->name = name;
2343
2344	if (setup_cpu_cache(cachep)) {
2345		__kmem_cache_destroy(cachep);
2346		cachep = NULL;
2347		goto oops;
2348	}
2349
2350	/* cache setup completed, link it into the list */
2351	list_add(&cachep->next, &cache_chain);
2352oops:
2353	if (!cachep && (flags & SLAB_PANIC))
2354		panic("kmem_cache_create(): failed to create slab `%s'\n",
2355		      name);
2356	mutex_unlock(&cache_chain_mutex);
2357	return cachep;
2358}
2359EXPORT_SYMBOL(kmem_cache_create);
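
/*
 * Illustrative (not compiled) usage sketch for the API above.  The struct,
 * cache pointer and function names below are hypothetical and error handling
 * is reduced to a minimum; this is a sketch of the typical create / alloc /
 * free / destroy life cycle, not code used by this file.
 */
#if 0
struct foo {				/* hypothetical object type */
	struct list_head list;
	unsigned long key;
};

static struct kmem_cache *foo_cachep;

static void foo_ctor(void *obj, struct kmem_cache *cachep, unsigned long flags)
{
	struct foo *f = obj;

	/* Only initialize on real construction, cf. cache_init_objs(). */
	if (flags & SLAB_CTOR_CONSTRUCTOR) {
		INIT_LIST_HEAD(&f->list);
		f->key = 0;
	}
}

static int __init foo_init(void)
{
	/* The name must stay valid until kmem_cache_destroy(). */
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
				       SLAB_HWCACHE_ALIGN, foo_ctor, NULL);
	if (!foo_cachep)
		return -ENOMEM;
	return 0;
}

static void foo_use(void)
{
	struct foo *f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);

	if (!f)
		return;
	/* ... use f, returning it in the constructed state ... */
	kmem_cache_free(foo_cachep, f);
}

static void __exit foo_exit(void)
{
	/* All objects must have been freed by now. */
	kmem_cache_destroy(foo_cachep);
}
#endif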
2360
2361#if DEBUG
2362static void check_irq_off(void)
2363{
2364	BUG_ON(!irqs_disabled());
2365}
2366
2367static void check_irq_on(void)
2368{
2369	BUG_ON(irqs_disabled());
2370}
2371
2372static void check_spinlock_acquired(struct kmem_cache *cachep)
2373{
2374#ifdef CONFIG_SMP
2375	check_irq_off();
2376	assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
2377#endif
2378}
2379
2380static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
2381{
2382#ifdef CONFIG_SMP
2383	check_irq_off();
2384	assert_spin_locked(&cachep->nodelists[node]->list_lock);
2385#endif
2386}
2387
2388#else
2389#define check_irq_off()	do { } while(0)
2390#define check_irq_on()	do { } while(0)
2391#define check_spinlock_acquired(x) do { } while(0)
2392#define check_spinlock_acquired_node(x, y) do { } while(0)
2393#endif
2394
2395static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
2396			struct array_cache *ac,
2397			int force, int node);
2398
2399static void do_drain(void *arg)
2400{
2401	struct kmem_cache *cachep = arg;
2402	struct array_cache *ac;
2403	int node = numa_node_id();
2404
2405	check_irq_off();
2406	ac = cpu_cache_get(cachep);
2407	spin_lock(&cachep->nodelists[node]->list_lock);
2408	free_block(cachep, ac->entry, ac->avail, node);
2409	spin_unlock(&cachep->nodelists[node]->list_lock);
2410	ac->avail = 0;
2411}
2412
2413static void drain_cpu_caches(struct kmem_cache *cachep)
2414{
2415	struct kmem_list3 *l3;
2416	int node;
2417
2418	on_each_cpu(do_drain, cachep, 1, 1);
2419	check_irq_on();
2420	for_each_online_node(node) {
2421		l3 = cachep->nodelists[node];
2422		if (l3 && l3->alien)
2423			drain_alien_cache(cachep, l3->alien);
2424	}
2425
2426	for_each_online_node(node) {
2427		l3 = cachep->nodelists[node];
2428		if (l3)
2429			drain_array(cachep, l3, l3->shared, 1, node);
2430	}
2431}
2432
2433/*
2434 * Remove slabs from the list of free slabs.
2435 * Specify the number of slabs to drain in tofree.
2436 *
2437 * Returns the actual number of slabs released.
2438 */
2439static int drain_freelist(struct kmem_cache *cache,
2440			struct kmem_list3 *l3, int tofree)
2441{
2442	struct list_head *p;
2443	int nr_freed;
2444	struct slab *slabp;
2445
2446	nr_freed = 0;
2447	while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {
2448
2449		spin_lock_irq(&l3->list_lock);
2450		p = l3->slabs_free.prev;
2451		if (p == &l3->slabs_free) {
2452			spin_unlock_irq(&l3->list_lock);
2453			goto out;
2454		}
2455
2456		slabp = list_entry(p, struct slab, list);
2457#if DEBUG
2458		BUG_ON(slabp->inuse);
2459#endif
2460		list_del(&slabp->list);
2461		/*
2462		 * Safe to drop the lock. The slab is no longer linked
2463		 * to the cache.
2464		 */
2465		l3->free_objects -= cache->num;
2466		spin_unlock_irq(&l3->list_lock);
2467		slab_destroy(cache, slabp);
2468		nr_freed++;
2469	}
2470out:
2471	return nr_freed;
2472}
2473
2474/* Called with cache_chain_mutex held to protect against cpu hotplug */
2475static int __cache_shrink(struct kmem_cache *cachep)
2476{
2477	int ret = 0, i = 0;
2478	struct kmem_list3 *l3;
2479
2480	drain_cpu_caches(cachep);
2481
2482	check_irq_on();
2483	for_each_online_node(i) {
2484		l3 = cachep->nodelists[i];
2485		if (!l3)
2486			continue;
2487
2488		drain_freelist(cachep, l3, l3->free_objects);
2489
2490		ret += !list_empty(&l3->slabs_full) ||
2491			!list_empty(&l3->slabs_partial);
2492	}
2493	return (ret ? 1 : 0);
2494}
2495
2496/**
2497 * kmem_cache_shrink - Shrink a cache.
2498 * @cachep: The cache to shrink.
2499 *
2500 * Releases as many slabs as possible for a cache.
2501 * To help debugging, a zero exit status indicates all slabs were released.
2502 */
2503int kmem_cache_shrink(struct kmem_cache *cachep)
2504{
2505	int ret;
2506	BUG_ON(!cachep || in_interrupt());
2507
2508	mutex_lock(&cache_chain_mutex);
2509	ret = __cache_shrink(cachep);
2510	mutex_unlock(&cache_chain_mutex);
2511	return ret;
2512}
2513EXPORT_SYMBOL(kmem_cache_shrink);
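
/*
 * Hedged usage sketch (not compiled): the return value convention documented
 * above can be checked like this; the function name is hypothetical.
 */
#if 0
static void trim_cache_example(struct kmem_cache *cachep)
{
	/* A zero return means every slab could be released. */
	if (kmem_cache_shrink(cachep))
		printk(KERN_DEBUG "%s: some objects are still allocated\n",
		       kmem_cache_name(cachep));
}
#endif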
2514
2515/**
2516 * kmem_cache_destroy - delete a cache
2517 * @cachep: the cache to destroy
2518 *
2519 * Remove a struct kmem_cache object from the slab cache.
2520 *
2521 * It is expected this function will be called by a module when it is
2522 * unloaded.  This will remove the cache completely, and avoid a duplicate
2523 * cache being allocated each time a module is loaded and unloaded, if the
2524 * module doesn't have persistent in-kernel storage across loads and unloads.
2525 *
2526 * The cache must be empty before calling this function.
2527 *
2528 * The caller must guarantee that no one will allocate memory from the cache
2529 * during the kmem_cache_destroy().
2530 */
2531void kmem_cache_destroy(struct kmem_cache *cachep)
2532{
2533	BUG_ON(!cachep || in_interrupt());
2534
2535	/* Find the cache in the chain of caches. */
2536	mutex_lock(&cache_chain_mutex);
2537	/*
2538	 * the chain is never empty, cache_cache is never destroyed
2539	 */
2540	list_del(&cachep->next);
2541	if (__cache_shrink(cachep)) {
2542		slab_error(cachep, "Can't free all objects");
2543		list_add(&cachep->next, &cache_chain);
2544		mutex_unlock(&cache_chain_mutex);
2545		return;
2546	}
2547
2548	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2549		synchronize_rcu();
2550
2551	__kmem_cache_destroy(cachep);
2552	mutex_unlock(&cache_chain_mutex);
2553}
2554EXPORT_SYMBOL(kmem_cache_destroy);
2555
2556/*
2557 * Get the memory for a slab management obj.
2558 * For a slab cache when the slab descriptor is off-slab, slab descriptors
2559 * always come from malloc_sizes caches.  The slab descriptor cannot
2560 * come from the same cache which is getting created because,
2561 * when we are searching for an appropriate cache for these
2562 * descriptors in kmem_cache_create, we search through the malloc_sizes array.
2563 * If we are creating a malloc_sizes cache here it would not be visible to
2564 * kmem_find_general_cachep till the initialization is complete.
2565 * Hence we cannot have slabp_cache same as the original cache.
2566 */
2567static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2568				   int colour_off, gfp_t local_flags,
2569				   int nodeid)
2570{
2571	struct slab *slabp;
2572
2573	if (OFF_SLAB(cachep)) {
2574		/* Slab management obj is off-slab. */
2575		slabp = kmem_cache_alloc_node(cachep->slabp_cache,
2576					      local_flags & ~GFP_THISNODE, nodeid);
2577		if (!slabp)
2578			return NULL;
2579	} else {
2580		slabp = objp + colour_off;
2581		colour_off += cachep->slab_size;
2582	}
2583	slabp->inuse = 0;
2584	slabp->colouroff = colour_off;
2585	slabp->s_mem = objp + colour_off;
2586	slabp->nodeid = nodeid;
2587	return slabp;
2588}
2589
2590static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2591{
2592	return (kmem_bufctl_t *) (slabp + 1);
2593}
2594
2595static void cache_init_objs(struct kmem_cache *cachep,
2596			    struct slab *slabp, unsigned long ctor_flags)
2597{
2598	int i;
2599
2600	for (i = 0; i < cachep->num; i++) {
2601		void *objp = index_to_obj(cachep, slabp, i);
2602#if DEBUG
2603		/* need to poison the objs? */
2604		if (cachep->flags & SLAB_POISON)
2605			poison_obj(cachep, objp, POISON_FREE);
2606		if (cachep->flags & SLAB_STORE_USER)
2607			*dbg_userword(cachep, objp) = NULL;
2608
2609		if (cachep->flags & SLAB_RED_ZONE) {
2610			*dbg_redzone1(cachep, objp) = RED_INACTIVE;
2611			*dbg_redzone2(cachep, objp) = RED_INACTIVE;
2612		}
2613		/*
2614		 * Constructors are not allowed to allocate memory from the same
2615		 * cache which they are a constructor for.  Otherwise, deadlock.
2616		 * They must also be thread-safe (constructors run without locking).
2617		 */
2618		if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2619			cachep->ctor(objp + obj_offset(cachep), cachep,
2620				     ctor_flags);
2621
2622		if (cachep->flags & SLAB_RED_ZONE) {
2623			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2624				slab_error(cachep, "constructor overwrote the"
2625					   " end of an object");
2626			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2627				slab_error(cachep, "constructor overwrote the"
2628					   " start of an object");
2629		}
2630		if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
2631			    OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2632			kernel_map_pages(virt_to_page(objp),
2633					 cachep->buffer_size / PAGE_SIZE, 0);
2634#else
2635		if (cachep->ctor)
2636			cachep->ctor(objp, cachep, ctor_flags);
2637#endif
2638		slab_bufctl(slabp)[i] = i + 1;
2639	}
2640	slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2641	slabp->free = 0;
2642}
2643
2644static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2645{
2646	if (flags & GFP_DMA)
2647		BUG_ON(!(cachep->gfpflags & GFP_DMA));
2648	else
2649		BUG_ON(cachep->gfpflags & GFP_DMA);
2650}
2651
2652static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
2653				int nodeid)
2654{
2655	void *objp = index_to_obj(cachep, slabp, slabp->free);
2656	kmem_bufctl_t next;
2657
2658	slabp->inuse++;
2659	next = slab_bufctl(slabp)[slabp->free];
2660#if DEBUG
2661	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2662	WARN_ON(slabp->nodeid != nodeid);
2663#endif
2664	slabp->free = next;
2665
2666	return objp;
2667}
2668
2669static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2670				void *objp, int nodeid)
2671{
2672	unsigned int objnr = obj_to_index(cachep, slabp, objp);
2673
2674#if DEBUG
2675	/* Verify that the slab belongs to the intended node */
2676	WARN_ON(slabp->nodeid != nodeid);
2677
2678	if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
2679		printk(KERN_ERR "slab: double free detected in cache "
2680				"'%s', objp %p\n", cachep->name, objp);
2681		BUG();
2682	}
2683#endif
2684	slab_bufctl(slabp)[objnr] = slabp->free;
2685	slabp->free = objnr;
2686	slabp->inuse--;
2687}
2688
2689/*
2690 * Map pages beginning at addr to the given cache and slab. This is required
2691 * for the slab allocator to be able to lookup the cache and slab of a
2692 * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
2693 */
2694static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2695			   void *addr)
2696{
2697	int nr_pages;
2698	struct page *page;
2699
2700	page = virt_to_page(addr);
2701
2702	nr_pages = 1;
2703	if (likely(!PageCompound(page)))
2704		nr_pages <<= cache->gfporder;
2705
2706	do {
2707		page_set_cache(page, cache);
2708		page_set_slab(page, slab);
2709		page++;
2710	} while (--nr_pages);
2711}
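
/*
 * The mapping established above is what makes the pointer-only interfaces
 * work: kfree() and the debug checks recover the owning cache and slab
 * purely from the object's virtual address, roughly:
 *
 *	page  = virt_to_page(objp);
 *	cache = page_get_cache(page);
 *	slabp = page_get_slab(page);
 *
 * which is how virt_to_cache()/virt_to_slab(), used by kfree() and
 * free_block() later in this file, are implemented.
 */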
2712
2713/*
2714 * Grow (by 1) the number of slabs within a cache.  This is called by
2715 * kmem_cache_alloc() when there are no active objs left in a cache.
2716 */
2717static int cache_grow(struct kmem_cache *cachep,
2718		gfp_t flags, int nodeid, void *objp)
2719{
2720	struct slab *slabp;
2721	size_t offset;
2722	gfp_t local_flags;
2723	unsigned long ctor_flags;
2724	struct kmem_list3 *l3;
2725
2726	/*
2727	 * Be lazy and only check for valid flags here,  keeping it out of the
2728	 * critical path in kmem_cache_alloc().
2729	 */
2730	BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW));
2731	if (flags & __GFP_NO_GROW)
2732		return 0;
2733
2734	ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2735	local_flags = (flags & GFP_LEVEL_MASK);
2736	if (!(local_flags & __GFP_WAIT))
2737		/*
2738		 * Not allowed to sleep.  Need to tell a constructor about
2739		 * this - it might need to know...
2740		 */
2741		ctor_flags |= SLAB_CTOR_ATOMIC;
2742
2743	/* Take the l3 list lock to change the colour_next on this node */
2744	check_irq_off();
2745	l3 = cachep->nodelists[nodeid];
2746	spin_lock(&l3->list_lock);
2747
2748	/* Get the colour for the slab, and calculate the next value. */
2749	offset = l3->colour_next;
2750	l3->colour_next++;
2751	if (l3->colour_next >= cachep->colour)
2752		l3->colour_next = 0;
2753	spin_unlock(&l3->list_lock);
2754
2755	offset *= cachep->colour_off;
2756
2757	if (local_flags & __GFP_WAIT)
2758		local_irq_enable();
2759
2760	/*
2761	 * The test for missing atomic flag is performed here, rather than
2762	 * the more obvious place, simply to reduce the critical path length
2763	 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
2764	 * will eventually be caught here (where it matters).
2765	 */
2766	kmem_flagcheck(cachep, flags);
2767
2768	/*
2769	 * Get mem for the objs.  Attempt to allocate a physical page from
2770	 * 'nodeid'.
2771	 */
2772	if (!objp)
2773		objp = kmem_getpages(cachep, flags, nodeid);
2774	if (!objp)
2775		goto failed;
2776
2777	/* Get slab management. */
2778	slabp = alloc_slabmgmt(cachep, objp, offset,
2779			local_flags & ~GFP_THISNODE, nodeid);
2780	if (!slabp)
2781		goto opps1;
2782
2783	slabp->nodeid = nodeid;
2784	slab_map_pages(cachep, slabp, objp);
2785
2786	cache_init_objs(cachep, slabp, ctor_flags);
2787
2788	if (local_flags & __GFP_WAIT)
2789		local_irq_disable();
2790	check_irq_off();
2791	spin_lock(&l3->list_lock);
2792
2793	/* Make slab active. */
2794	list_add_tail(&slabp->list, &(l3->slabs_free));
2795	STATS_INC_GROWN(cachep);
2796	l3->free_objects += cachep->num;
2797	spin_unlock(&l3->list_lock);
2798	return 1;
2799opps1:
2800	kmem_freepages(cachep, objp);
2801failed:
2802	if (local_flags & __GFP_WAIT)
2803		local_irq_disable();
2804	return 0;
2805}
2806
2807#if DEBUG
2808
2809/*
2810 * Perform extra freeing checks:
2811 * - detect bad pointers.
2812 * - POISON/RED_ZONE checking
2813 * - destructor calls, for caches with POISON+dtor
2814 */
2815static void kfree_debugcheck(const void *objp)
2816{
2817	struct page *page;
2818
2819	if (!virt_addr_valid(objp)) {
2820		printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2821		       (unsigned long)objp);
2822		BUG();
2823	}
2824	page = virt_to_page(objp);
2825	if (!PageSlab(page)) {
2826		printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n",
2827		       (unsigned long)objp);
2828		BUG();
2829	}
2830}
2831
2832static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
2833{
2834	unsigned long redzone1, redzone2;
2835
2836	redzone1 = *dbg_redzone1(cache, obj);
2837	redzone2 = *dbg_redzone2(cache, obj);
2838
2839	/*
2840	 * Redzone is ok.
2841	 */
2842	if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
2843		return;
2844
2845	if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
2846		slab_error(cache, "double free detected");
2847	else
2848		slab_error(cache, "memory outside object was overwritten");
2849
2850	printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n",
2851			obj, redzone1, redzone2);
2852}
2853
2854static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2855				   void *caller)
2856{
2857	struct page *page;
2858	unsigned int objnr;
2859	struct slab *slabp;
2860
2861	objp -= obj_offset(cachep);
2862	kfree_debugcheck(objp);
2863	page = virt_to_page(objp);
2864
2865	slabp = page_get_slab(page);
2866
2867	if (cachep->flags & SLAB_RED_ZONE) {
2868		verify_redzone_free(cachep, objp);
2869		*dbg_redzone1(cachep, objp) = RED_INACTIVE;
2870		*dbg_redzone2(cachep, objp) = RED_INACTIVE;
2871	}
2872	if (cachep->flags & SLAB_STORE_USER)
2873		*dbg_userword(cachep, objp) = caller;
2874
2875	objnr = obj_to_index(cachep, slabp, objp);
2876
2877	BUG_ON(objnr >= cachep->num);
2878	BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
2879
2880	if (cachep->flags & SLAB_DEBUG_INITIAL) {
2881		/*
2882		 * Need to call the slab's constructor so the caller can
2883		 * perform a verify of its state (debugging).  Called without
2884		 * the cache-lock held.
2885		 */
2886		cachep->ctor(objp + obj_offset(cachep),
2887			     cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
2888	}
2889	if (cachep->flags & SLAB_POISON && cachep->dtor) {
2890		/* we want to cache poison the object,
2891		 * call the destruction callback
2892		 */
2893		cachep->dtor(objp + obj_offset(cachep), cachep, 0);
2894	}
2895#ifdef CONFIG_DEBUG_SLAB_LEAK
2896	slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
2897#endif
2898	if (cachep->flags & SLAB_POISON) {
2899#ifdef CONFIG_DEBUG_PAGEALLOC
2900		if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2901			store_stackinfo(cachep, objp, (unsigned long)caller);
2902			kernel_map_pages(virt_to_page(objp),
2903					 cachep->buffer_size / PAGE_SIZE, 0);
2904		} else {
2905			poison_obj(cachep, objp, POISON_FREE);
2906		}
2907#else
2908		poison_obj(cachep, objp, POISON_FREE);
2909#endif
2910	}
2911	return objp;
2912}
2913
2914static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
2915{
2916	kmem_bufctl_t i;
2917	int entries = 0;
2918
2919	/* Check slab's freelist to see if this obj is there. */
2920	for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
2921		entries++;
2922		if (entries > cachep->num || i >= cachep->num)
2923			goto bad;
2924	}
2925	if (entries != cachep->num - slabp->inuse) {
2926bad:
2927		printk(KERN_ERR "slab: Internal list corruption detected in "
2928				"cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2929			cachep->name, cachep->num, slabp, slabp->inuse);
2930		for (i = 0;
2931		     i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
2932		     i++) {
2933			if (i % 16 == 0)
2934				printk("\n%03x:", i);
2935			printk(" %02x", ((unsigned char *)slabp)[i]);
2936		}
2937		printk("\n");
2938		BUG();
2939	}
2940}
2941#else
2942#define kfree_debugcheck(x) do { } while(0)
2943#define cache_free_debugcheck(x,objp,z) (objp)
2944#define check_slabp(x,y) do { } while(0)
2945#endif
2946
2947static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2948{
2949	int batchcount;
2950	struct kmem_list3 *l3;
2951	struct array_cache *ac;
2952	int node;
2953
2954	node = numa_node_id();
2955
2956	check_irq_off();
2957	ac = cpu_cache_get(cachep);
2958retry:
2959	batchcount = ac->batchcount;
2960	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2961		/*
2962		 * If there was little recent activity on this cache, then
2963		 * perform only a partial refill.  Otherwise we could generate
2964		 * refill bouncing.
2965		 */
2966		batchcount = BATCHREFILL_LIMIT;
2967	}
2968	l3 = cachep->nodelists[node];
2969
2970	BUG_ON(ac->avail > 0 || !l3);
2971	spin_lock(&l3->list_lock);
2972
2973	/* See if we can refill from the shared array */
2974	if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
2975		goto alloc_done;
2976
2977	while (batchcount > 0) {
2978		struct list_head *entry;
2979		struct slab *slabp;
2980		/* Get the slab the allocation is to come from. */
2981		entry = l3->slabs_partial.next;
2982		if (entry == &l3->slabs_partial) {
2983			l3->free_touched = 1;
2984			entry = l3->slabs_free.next;
2985			if (entry == &l3->slabs_free)
2986				goto must_grow;
2987		}
2988
2989		slabp = list_entry(entry, struct slab, list);
2990		check_slabp(cachep, slabp);
2991		check_spinlock_acquired(cachep);
2992		while (slabp->inuse < cachep->num && batchcount--) {
2993			STATS_INC_ALLOCED(cachep);
2994			STATS_INC_ACTIVE(cachep);
2995			STATS_SET_HIGH(cachep);
2996
2997			ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
2998							    node);
2999		}
3000		check_slabp(cachep, slabp);
3001
3002		/* move slabp to correct slabp list: */
3003		list_del(&slabp->list);
3004		if (slabp->free == BUFCTL_END)
3005			list_add(&slabp->list, &l3->slabs_full);
3006		else
3007			list_add(&slabp->list, &l3->slabs_partial);
3008	}
3009
3010must_grow:
3011	l3->free_objects -= ac->avail;
3012alloc_done:
3013	spin_unlock(&l3->list_lock);
3014
3015	if (unlikely(!ac->avail)) {
3016		int x;
3017		x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
3018
3019		/* cache_grow can reenable interrupts, then ac could change. */
3020		ac = cpu_cache_get(cachep);
3021		if (!x && ac->avail == 0)	/* no objects in sight? abort */
3022			return NULL;
3023
3024		if (!ac->avail)		/* objects refilled by interrupt? */
3025			goto retry;
3026	}
3027	ac->touched = 1;
3028	return ac->entry[--ac->avail];
3029}
3030
3031static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
3032						gfp_t flags)
3033{
3034	might_sleep_if(flags & __GFP_WAIT);
3035#if DEBUG
3036	kmem_flagcheck(cachep, flags);
3037#endif
3038}
3039
3040#if DEBUG
3041static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3042				gfp_t flags, void *objp, void *caller)
3043{
3044	if (!objp)
3045		return objp;
3046	if (cachep->flags & SLAB_POISON) {
3047#ifdef CONFIG_DEBUG_PAGEALLOC
3048		if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
3049			kernel_map_pages(virt_to_page(objp),
3050					 cachep->buffer_size / PAGE_SIZE, 1);
3051		else
3052			check_poison_obj(cachep, objp);
3053#else
3054		check_poison_obj(cachep, objp);
3055#endif
3056		poison_obj(cachep, objp, POISON_INUSE);
3057	}
3058	if (cachep->flags & SLAB_STORE_USER)
3059		*dbg_userword(cachep, objp) = caller;
3060
3061	if (cachep->flags & SLAB_RED_ZONE) {
3062		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
3063				*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
3064			slab_error(cachep, "double free, or memory outside"
3065						" object was overwritten");
3066			printk(KERN_ERR
3067				"%p: redzone 1:0x%lx, redzone 2:0x%lx\n",
3068				objp, *dbg_redzone1(cachep, objp),
3069				*dbg_redzone2(cachep, objp));
3070		}
3071		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
3072		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
3073	}
3074#ifdef CONFIG_DEBUG_SLAB_LEAK
3075	{
3076		struct slab *slabp;
3077		unsigned objnr;
3078
3079		slabp = page_get_slab(virt_to_page(objp));
3080		objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
3081		slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
3082	}
3083#endif
3084	objp += obj_offset(cachep);
3085	if (cachep->ctor && cachep->flags & SLAB_POISON) {
3086		unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
3087
3088		if (!(flags & __GFP_WAIT))
3089			ctor_flags |= SLAB_CTOR_ATOMIC;
3090
3091		cachep->ctor(objp, cachep, ctor_flags);
3092	}
3093#if ARCH_SLAB_MINALIGN
3094	if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
3095		printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
3096		       objp, ARCH_SLAB_MINALIGN);
3097	}
3098#endif
3099	return objp;
3100}
3101#else
3102#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
3103#endif
3104
3105#ifdef CONFIG_FAILSLAB
3106
3107static struct failslab_attr {
3108
3109	struct fault_attr attr;
3110
3111	u32 ignore_gfp_wait;
3112#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3113	struct dentry *ignore_gfp_wait_file;
3114#endif
3115
3116} failslab = {
3117	.attr = FAULT_ATTR_INITIALIZER,
3118	.ignore_gfp_wait = 1,
3119};
3120
3121static int __init setup_failslab(char *str)
3122{
3123	return setup_fault_attr(&failslab.attr, str);
3124}
3125__setup("failslab=", setup_failslab);
3126
3127static int should_failslab(struct kmem_cache *cachep, gfp_t flags)
3128{
3129	if (cachep == &cache_cache)
3130		return 0;
3131	if (flags & __GFP_NOFAIL)
3132		return 0;
3133	if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT))
3134		return 0;
3135
3136	return should_fail(&failslab.attr, obj_size(cachep));
3137}
3138
3139#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3140
3141static int __init failslab_debugfs(void)
3142{
3143	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
3144	struct dentry *dir;
3145	int err;
3146
3147	err = init_fault_attr_dentries(&failslab.attr, "failslab");
3148	if (err)
3149		return err;
3150	dir = failslab.attr.dentries.dir;
3151
3152	failslab.ignore_gfp_wait_file =
3153		debugfs_create_bool("ignore-gfp-wait", mode, dir,
3154				      &failslab.ignore_gfp_wait);
3155
3156	if (!failslab.ignore_gfp_wait_file) {
3157		err = -ENOMEM;
3158		debugfs_remove(failslab.ignore_gfp_wait_file);
3159		cleanup_fault_attr_dentries(&failslab.attr);
3160	}
3161
3162	return err;
3163}
3164
3165late_initcall(failslab_debugfs);
3166
3167#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
3168
3169#else /* CONFIG_FAILSLAB */
3170
3171static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags)
3172{
3173	return 0;
3174}
3175
3176#endif /* CONFIG_FAILSLAB */
3177
3178static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3179{
3180	void *objp;
3181	struct array_cache *ac;
3182
3183	check_irq_off();
3184
3185	if (should_failslab(cachep, flags))
3186		return NULL;
3187
3188	ac = cpu_cache_get(cachep);
3189	if (likely(ac->avail)) {
3190		STATS_INC_ALLOCHIT(cachep);
3191		ac->touched = 1;
3192		objp = ac->entry[--ac->avail];
3193	} else {
3194		STATS_INC_ALLOCMISS(cachep);
3195		objp = cache_alloc_refill(cachep, flags);
3196	}
3197	return objp;
3198}
3199
3200static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
3201						gfp_t flags, void *caller)
3202{
3203	unsigned long save_flags;
3204	void *objp = NULL;
3205
3206	cache_alloc_debugcheck_before(cachep, flags);
3207
3208	local_irq_save(save_flags);
3209
3210	if (unlikely(NUMA_BUILD &&
3211			current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
3212		objp = alternate_node_alloc(cachep, flags);
3213
3214	if (!objp)
3215		objp = ____cache_alloc(cachep, flags);
3216	/*
3217	 * We may just have run out of memory on the local node.
3218	 * ____cache_alloc_node() knows how to locate memory on other nodes
3219	 */
3220	if (NUMA_BUILD && !objp)
3221		objp = ____cache_alloc_node(cachep, flags, numa_node_id());
3222	local_irq_restore(save_flags);
3223	objp = cache_alloc_debugcheck_after(cachep, flags, objp,
3224					    caller);
3225	prefetchw(objp);
3226	return objp;
3227}
3228
3229#ifdef CONFIG_NUMA
3230/*
3231 * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
3232 *
3233 * If we are in_interrupt, then process context, including cpusets and
3234 * mempolicy, may not apply and should not be used for allocation policy.
3235 */
3236static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3237{
3238	int nid_alloc, nid_here;
3239
3240	if (in_interrupt() || (flags & __GFP_THISNODE))
3241		return NULL;
3242	nid_alloc = nid_here = numa_node_id();
3243	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3244		nid_alloc = cpuset_mem_spread_node();
3245	else if (current->mempolicy)
3246		nid_alloc = slab_node(current->mempolicy);
3247	if (nid_alloc != nid_here)
3248		return ____cache_alloc_node(cachep, flags, nid_alloc);
3249	return NULL;
3250}
3251
3252/*
3253 * Fallback function if there was no memory available and no objects on a
3254 * certain node and falling back is permitted. First we scan all the
3255 * available nodelists for available objects. If that fails then we
3256 * perform an allocation without specifying a node. This allows the page
3257 * allocator to do its reclaim / fallback magic. We then insert the
3258 * slab into the proper nodelist and then allocate from it.
3259 */
3260void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3261{
3262	struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy))
3263					->node_zonelists[gfp_zone(flags)];
3264	struct zone **z;
3265	void *obj = NULL;
3266	int nid;
3267	gfp_t local_flags = (flags & GFP_LEVEL_MASK);
3268
3269retry:
3270	/*
3271	 * Look through allowed nodes for objects available
3272	 * from existing per node queues.
3273	 */
3274	for (z = zonelist->zones; *z && !obj; z++) {
3275		nid = zone_to_nid(*z);
3276
3277		if (cpuset_zone_allowed_hardwall(*z, flags) &&
3278			cache->nodelists[nid] &&
3279			cache->nodelists[nid]->free_objects)
3280				obj = ____cache_alloc_node(cache,
3281					flags | GFP_THISNODE, nid);
3282	}
3283
3284	if (!obj) {
3285		/*
3286		 * This allocation will be performed within the constraints
3287		 * of the current cpuset / memory policy requirements.
3288		 * We may trigger various forms of reclaim on the allowed
3289		 * set and go into memory reserves if necessary.
3290		 */
3291		if (local_flags & __GFP_WAIT)
3292			local_irq_enable();
3293		kmem_flagcheck(cache, flags);
3294		obj = kmem_getpages(cache, flags, -1);
3295		if (local_flags & __GFP_WAIT)
3296			local_irq_disable();
3297		if (obj) {
3298			/*
3299			 * Insert into the appropriate per node queues
3300			 */
3301			nid = page_to_nid(virt_to_page(obj));
3302			if (cache_grow(cache, flags, nid, obj)) {
3303				obj = ____cache_alloc_node(cache,
3304					flags | GFP_THISNODE, nid);
3305				if (!obj)
3306					/*
3307					 * Another processor may allocate the
3308					 * objects in the slab since we are
3309					 * not holding any locks.
3310					 */
3311					goto retry;
3312			} else {
3313				kmem_freepages(cache, obj);
3314				obj = NULL;
3315			}
3316		}
3317	}
3318	return obj;
3319}
3320
3321/*
3322 * An interface to enable slab creation on nodeid
3323 */
3324static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3325				int nodeid)
3326{
3327	struct list_head *entry;
3328	struct slab *slabp;
3329	struct kmem_list3 *l3;
3330	void *obj;
3331	int x;
3332
3333	l3 = cachep->nodelists[nodeid];
3334	BUG_ON(!l3);
3335
3336retry:
3337	check_irq_off();
3338	spin_lock(&l3->list_lock);
3339	entry = l3->slabs_partial.next;
3340	if (entry == &l3->slabs_partial) {
3341		l3->free_touched = 1;
3342		entry = l3->slabs_free.next;
3343		if (entry == &l3->slabs_free)
3344			goto must_grow;
3345	}
3346
3347	slabp = list_entry(entry, struct slab, list);
3348	check_spinlock_acquired_node(cachep, nodeid);
3349	check_slabp(cachep, slabp);
3350
3351	STATS_INC_NODEALLOCS(cachep);
3352	STATS_INC_ACTIVE(cachep);
3353	STATS_SET_HIGH(cachep);
3354
3355	BUG_ON(slabp->inuse == cachep->num);
3356
3357	obj = slab_get_obj(cachep, slabp, nodeid);
3358	check_slabp(cachep, slabp);
3359	l3->free_objects--;
3360	/* move slabp to correct slabp list: */
3361	list_del(&slabp->list);
3362
3363	if (slabp->free == BUFCTL_END)
3364		list_add(&slabp->list, &l3->slabs_full);
3365	else
3366		list_add(&slabp->list, &l3->slabs_partial);
3367
3368	spin_unlock(&l3->list_lock);
3369	goto done;
3370
3371must_grow:
3372	spin_unlock(&l3->list_lock);
3373	x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
3374	if (x)
3375		goto retry;
3376
3377	if (!(flags & __GFP_THISNODE))
3378		/* Unable to grow the cache. Fall back to other nodes. */
3379		return fallback_alloc(cachep, flags);
3380
3381	return NULL;
3382
3383done:
3384	return obj;
3385}
3386#endif
3387
3388/*
3389 * Caller needs to acquire correct kmem_list's list_lock
3390 */
3391static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3392		       int node)
3393{
3394	int i;
3395	struct kmem_list3 *l3;
3396
3397	for (i = 0; i < nr_objects; i++) {
3398		void *objp = objpp[i];
3399		struct slab *slabp;
3400
3401		slabp = virt_to_slab(objp);
3402		l3 = cachep->nodelists[node];
3403		list_del(&slabp->list);
3404		check_spinlock_acquired_node(cachep, node);
3405		check_slabp(cachep, slabp);
3406		slab_put_obj(cachep, slabp, objp, node);
3407		STATS_DEC_ACTIVE(cachep);
3408		l3->free_objects++;
3409		check_slabp(cachep, slabp);
3410
3411		/* fixup slab chains */
3412		if (slabp->inuse == 0) {
3413			if (l3->free_objects > l3->free_limit) {
3414				l3->free_objects -= cachep->num;
3415				/* No need to drop any previously held
3416				 * lock here; even if we have an off-slab slab
3417				 * descriptor, it is guaranteed to come from
3418				 * a different cache.  Refer to the comments
3419				 * before alloc_slabmgmt().
3420				 */
3421				slab_destroy(cachep, slabp);
3422			} else {
3423				list_add(&slabp->list, &l3->slabs_free);
3424			}
3425		} else {
3426			/* Unconditionally move a slab to the end of the
3427			 * partial list on free - maximum time for the
3428			 * other objects to be freed, too.
3429			 */
3430			list_add_tail(&slabp->list, &l3->slabs_partial);
3431		}
3432	}
3433}
3434
3435static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3436{
3437	int batchcount;
3438	struct kmem_list3 *l3;
3439	int node = numa_node_id();
3440
3441	batchcount = ac->batchcount;
3442#if DEBUG
3443	BUG_ON(!batchcount || batchcount > ac->avail);
3444#endif
3445	check_irq_off();
3446	l3 = cachep->nodelists[node];
3447	spin_lock(&l3->list_lock);
3448	if (l3->shared) {
3449		struct array_cache *shared_array = l3->shared;
3450		int max = shared_array->limit - shared_array->avail;
3451		if (max) {
3452			if (batchcount > max)
3453				batchcount = max;
3454			memcpy(&(shared_array->entry[shared_array->avail]),
3455			       ac->entry, sizeof(void *) * batchcount);
3456			shared_array->avail += batchcount;
3457			goto free_done;
3458		}
3459	}
3460
3461	free_block(cachep, ac->entry, batchcount, node);
3462free_done:
3463#if STATS
3464	{
3465		int i = 0;
3466		struct list_head *p;
3467
3468		p = l3->slabs_free.next;
3469		while (p != &(l3->slabs_free)) {
3470			struct slab *slabp;
3471
3472			slabp = list_entry(p, struct slab, list);
3473			BUG_ON(slabp->inuse);
3474
3475			i++;
3476			p = p->next;
3477		}
3478		STATS_SET_FREEABLE(cachep, i);
3479	}
3480#endif
3481	spin_unlock(&l3->list_lock);
3482	ac->avail -= batchcount;
3483	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
3484}
3485
3486/*
3487 * Release an obj back to its cache. If the obj has a constructed state, it must
3488 * be in this state _before_ it is released.  Called with disabled ints.
3489 */
3490static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3491{
3492	struct array_cache *ac = cpu_cache_get(cachep);
3493
3494	check_irq_off();
3495	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3496
3497	if (cache_free_alien(cachep, objp))
3498		return;
3499
3500	if (likely(ac->avail < ac->limit)) {
3501		STATS_INC_FREEHIT(cachep);
3502		ac->entry[ac->avail++] = objp;
3503		return;
3504	} else {
3505		STATS_INC_FREEMISS(cachep);
3506		cache_flusharray(cachep, ac);
3507		ac->entry[ac->avail++] = objp;
3508	}
3509}
3510
3511/**
3512 * kmem_cache_alloc - Allocate an object
3513 * @cachep: The cache to allocate from.
3514 * @flags: See kmalloc().
3515 *
3516 * Allocate an object from this cache.  The flags are only relevant
3517 * if the cache has no available objects.
3518 */
3519void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3520{
3521	return __cache_alloc(cachep, flags, __builtin_return_address(0));
3522}
3523EXPORT_SYMBOL(kmem_cache_alloc);
3524
3525/**
3526 * kmem_cache_zalloc - Allocate an object. The memory is set to zero.
3527 * @cache: The cache to allocate from.
3528 * @flags: See kmalloc().
3529 *
3530 * Allocate an object from this cache and set the allocated memory to zero.
3531 * The flags are only relevant if the cache has no available objects.
3532 */
3533void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags)
3534{
3535	void *ret = __cache_alloc(cache, flags, __builtin_return_address(0));
3536	if (ret)
3537		memset(ret, 0, obj_size(cache));
3538	return ret;
3539}
3540EXPORT_SYMBOL(kmem_cache_zalloc);
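
/*
 * Hedged usage sketch for the two allocators above (hypothetical function
 * name, not compiled): the gfp flags only matter on the slow path, but they
 * still decide whether the allocation may sleep when a new slab must be
 * grown.
 */
#if 0
static void alloc_examples(struct kmem_cache *cachep)
{
	void *a = kmem_cache_alloc(cachep, GFP_KERNEL);	/* may sleep */
	void *b = kmem_cache_zalloc(cachep, GFP_ATOMIC);/* zeroed, atomic */

	if (a)
		kmem_cache_free(cachep, a);
	if (b)
		kmem_cache_free(cachep, b);
}
#endif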
3541
3542/**
3543 * kmem_ptr_validate - check if an untrusted pointer might
3544 *	be a slab entry.
3545 * @cachep: the cache we're checking against
3546 * @ptr: pointer to validate
3547 *
3548 * This verifies that the untrusted pointer looks sane:
3549 * it is _not_ a guarantee that the pointer is actually
3550 * part of the slab cache in question, but it at least
3551 * validates that the pointer can be dereferenced and
3552 * looks half-way sane.
3553 *
3554 * Currently only used for dentry validation.
3555 */
3556int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
3557{
3558	unsigned long addr = (unsigned long)ptr;
3559	unsigned long min_addr = PAGE_OFFSET;
3560	unsigned long align_mask = BYTES_PER_WORD - 1;
3561	unsigned long size = cachep->buffer_size;
3562	struct page *page;
3563
3564	if (unlikely(addr < min_addr))
3565		goto out;
3566	if (unlikely(addr > (unsigned long)high_memory - size))
3567		goto out;
3568	if (unlikely(addr & align_mask))
3569		goto out;
3570	if (unlikely(!kern_addr_valid(addr)))
3571		goto out;
3572	if (unlikely(!kern_addr_valid(addr + size - 1)))
3573		goto out;
3574	page = virt_to_page(ptr);
3575	if (unlikely(!PageSlab(page)))
3576		goto out;
3577	if (unlikely(page_get_cache(page) != cachep))
3578		goto out;
3579	return 1;
3580out:
3581	return 0;
3582}
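
/*
 * Hedged usage sketch (hypothetical function name, not compiled): as the
 * comment above notes, this is only a sanity filter, so a positive result
 * must still be followed by whatever locking or validation the subsystem
 * normally requires.
 */
#if 0
static void *validate_untrusted(struct kmem_cache *cachep, const void *ptr)
{
	if (!kmem_ptr_validate(cachep, ptr))
		return NULL;		/* clearly not an object of cachep */
	return (void *)ptr;		/* still only "looks half-way sane" */
}
#endif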
3583
3584#ifdef CONFIG_NUMA
3585/**
3586 * kmem_cache_alloc_node - Allocate an object on the specified node
3587 * @cachep: The cache to allocate from.
3588 * @flags: See kmalloc().
3589 * @nodeid: node number of the target node.
3590 *
3591 * Identical to kmem_cache_alloc but it will allocate memory on the given
3592 * node, which can improve the performance for cpu bound structures.
3593 *
3594 * Fallback to other node is possible if __GFP_THISNODE is not set.
3595 */
3596static __always_inline void *
3597__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3598		int nodeid, void *caller)
3599{
3600	unsigned long save_flags;
3601	void *ptr = NULL;
3602
3603	cache_alloc_debugcheck_before(cachep, flags);
3604	local_irq_save(save_flags);
3605
3606	if (unlikely(nodeid == -1))
3607		nodeid = numa_node_id();
3608
3609	if (likely(cachep->nodelists[nodeid])) {
3610		if (nodeid == numa_node_id()) {
3611			/*
3612			 * Use the locally cached objects if possible.
3613			 * However ____cache_alloc does not allow fallback
3614			 * to other nodes. It may fail while we still have
3615			 * objects on other nodes available.
3616			 */
3617			ptr = ____cache_alloc(cachep, flags);
3618		}
3619		if (!ptr) {
3620			/* ___cache_alloc_node can fall back to other nodes */
3621			ptr = ____cache_alloc_node(cachep, flags, nodeid);
3622		}
3623	} else {
3624		/* Node not bootstrapped yet */
3625		if (!(flags & __GFP_THISNODE))
3626			ptr = fallback_alloc(cachep, flags);
3627	}
3628
3629	local_irq_restore(save_flags);
3630	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3631
3632	return ptr;
3633}
3634
3635void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3636{
3637	return __cache_alloc_node(cachep, flags, nodeid,
3638			__builtin_return_address(0));
3639}
3640EXPORT_SYMBOL(kmem_cache_alloc_node);
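
/*
 * Hedged usage sketch (hypothetical array and function name, not compiled):
 * placing one control structure on every online node so that the CPUs of
 * each node work on node-local memory.  Fallback to another node remains
 * possible here because __GFP_THISNODE is not passed.
 */
#if 0
static void *per_node_data[MAX_NUMNODES];

static int __init alloc_per_node(struct kmem_cache *cachep)
{
	int node;

	for_each_online_node(node) {
		per_node_data[node] = kmem_cache_alloc_node(cachep,
							    GFP_KERNEL, node);
		if (!per_node_data[node])
			return -ENOMEM;
	}
	return 0;
}
#endif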
3641
3642static __always_inline void *
3643__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
3644{
3645	struct kmem_cache *cachep;
3646
3647	cachep = kmem_find_general_cachep(size, flags);
3648	if (unlikely(cachep == NULL))
3649		return NULL;
3650	return kmem_cache_alloc_node(cachep, flags, node);
3651}
3652
3653#ifdef CONFIG_DEBUG_SLAB
3654void *__kmalloc_node(size_t size, gfp_t flags, int node)
3655{
3656	return __do_kmalloc_node(size, flags, node,
3657			__builtin_return_address(0));
3658}
3659EXPORT_SYMBOL(__kmalloc_node);
3660
3661void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
3662		int node, void *caller)
3663{
3664	return __do_kmalloc_node(size, flags, node, caller);
3665}
3666EXPORT_SYMBOL(__kmalloc_node_track_caller);
3667#else
3668void *__kmalloc_node(size_t size, gfp_t flags, int node)
3669{
3670	return __do_kmalloc_node(size, flags, node, NULL);
3671}
3672EXPORT_SYMBOL(__kmalloc_node);
3673#endif /* CONFIG_DEBUG_SLAB */
3674#endif /* CONFIG_NUMA */
3675
3676/**
3677 * __do_kmalloc - allocate memory
3678 * @size: how many bytes of memory are required.
3679 * @flags: the type of memory to allocate (see kmalloc).
3680 * @caller: function caller for debug tracking of the caller
3681 */
3682static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3683					  void *caller)
3684{
3685	struct kmem_cache *cachep;
3686
3687	/* If you want to save a few bytes of .text space: replace
3688	 * __ with kmem_.
3689	 * Then kmalloc uses the uninlined functions instead of the inline
3690	 * functions.
3691	 */
3692	cachep = __find_general_cachep(size, flags);
3693	if (unlikely(cachep == NULL))
3694		return NULL;
3695	return __cache_alloc(cachep, flags, caller);
3696}
3697
3698
3699#ifdef CONFIG_DEBUG_SLAB
3700void *__kmalloc(size_t size, gfp_t flags)
3701{
3702	return __do_kmalloc(size, flags, __builtin_return_address(0));
3703}
3704EXPORT_SYMBOL(__kmalloc);
3705
3706void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
3707{
3708	return __do_kmalloc(size, flags, caller);
3709}
3710EXPORT_SYMBOL(__kmalloc_track_caller);
3711
3712#else
3713void *__kmalloc(size_t size, gfp_t flags)
3714{
3715	return __do_kmalloc(size, flags, NULL);
3716}
3717EXPORT_SYMBOL(__kmalloc);
3718#endif
3719
3720/**
3721 * kmem_cache_free - Deallocate an object
3722 * @cachep: The cache the allocation was from.
3723 * @objp: The previously allocated object.
3724 *
3725 * Free an object which was previously allocated from this
3726 * cache.
3727 */
3728void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3729{
3730	unsigned long flags;
3731
3732	BUG_ON(virt_to_cache(objp) != cachep);
3733
3734	local_irq_save(flags);
3735	__cache_free(cachep, objp);
3736	local_irq_restore(flags);
3737}
3738EXPORT_SYMBOL(kmem_cache_free);
3739
3740/**
3741 * kfree - free previously allocated memory
3742 * @objp: pointer returned by kmalloc.
3743 *
3744 * If @objp is NULL, no operation is performed.
3745 *
3746 * Don't free memory not originally allocated by kmalloc()
3747 * or you will run into trouble.
3748 */
3749void kfree(const void *objp)
3750{
3751	struct kmem_cache *c;
3752	unsigned long flags;
3753
3754	if (unlikely(!objp))
3755		return;
3756	local_irq_save(flags);
3757	kfree_debugcheck(objp);
3758	c = virt_to_cache(objp);
3759	debug_check_no_locks_freed(objp, obj_size(c));
3760	__cache_free(c, (void *)objp);
3761	local_irq_restore(flags);
3762}
3763EXPORT_SYMBOL(kfree);
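/*
 * Illustrative sketch: since kfree(NULL) is a no-op, error paths may free
 * unconditionally.  The field and label names are hypothetical.
 *
 *	ctx->buf = kmalloc(len, GFP_KERNEL);
 *	if (!ctx->buf)
 *		goto err;
 *	...
 * err:
 *	kfree(ctx->buf);	safe even if the allocation failed
 *	return -ENOMEM;
 */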
3764
3765unsigned int kmem_cache_size(struct kmem_cache *cachep)
3766{
3767	return obj_size(cachep);
3768}
3769EXPORT_SYMBOL(kmem_cache_size);
3770
3771const char *kmem_cache_name(struct kmem_cache *cachep)
3772{
3773	return cachep->name;
3774}
3775EXPORT_SYMBOL_GPL(kmem_cache_name);
3776
3777/*
3778 * This initializes kmem_list3 or resizes various caches for all nodes.
3779 */
3780static int alloc_kmemlist(struct kmem_cache *cachep)
3781{
3782	int node;
3783	struct kmem_list3 *l3;
3784	struct array_cache *new_shared;
3785	struct array_cache **new_alien = NULL;
3786
3787	for_each_online_node(node) {
3788
3789		if (use_alien_caches) {
3790			new_alien = alloc_alien_cache(node, cachep->limit);
3791			if (!new_alien)
3792				goto fail;
3793		}
3794
3795		new_shared = alloc_arraycache(node,
3796				cachep->shared*cachep->batchcount,
3797					0xbaadf00d);
3798		if (!new_shared) {
3799			free_alien_cache(new_alien);
3800			goto fail;
3801		}
3802
3803		l3 = cachep->nodelists[node];
3804		if (l3) {
3805			struct array_cache *shared = l3->shared;
3806
3807			spin_lock_irq(&l3->list_lock);
3808
3809			if (shared)
3810				free_block(cachep, shared->entry,
3811						shared->avail, node);
3812
3813			l3->shared = new_shared;
3814			if (!l3->alien) {
3815				l3->alien = new_alien;
3816				new_alien = NULL;
3817			}
3818			l3->free_limit = (1 + nr_cpus_node(node)) *
3819					cachep->batchcount + cachep->num;
3820			spin_unlock_irq(&l3->list_lock);
3821			kfree(shared);
3822			free_alien_cache(new_alien);
3823			continue;
3824		}
3825		l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
3826		if (!l3) {
3827			free_alien_cache(new_alien);
3828			kfree(new_shared);
3829			goto fail;
3830		}
3831
3832		kmem_list3_init(l3);
3833		l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3834				((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3835		l3->shared = new_shared;
3836		l3->alien = new_alien;
3837		l3->free_limit = (1 + nr_cpus_node(node)) *
3838					cachep->batchcount + cachep->num;
3839		cachep->nodelists[node] = l3;
3840	}
3841	return 0;
3842
3843fail:
3844	if (!cachep->next.next) {
3845		/* Cache is not active yet. Roll back what we did */
3846		node--;
3847		while (node >= 0) {
3848			if (cachep->nodelists[node]) {
3849				l3 = cachep->nodelists[node];
3850
3851				kfree(l3->shared);
3852				free_alien_cache(l3->alien);
3853				kfree(l3);
3854				cachep->nodelists[node] = NULL;
3855			}
3856			node--;
3857		}
3858	}
3859	return -ENOMEM;
3860}
3861
3862struct ccupdate_struct {
3863	struct kmem_cache *cachep;
3864	struct array_cache *new[NR_CPUS];
3865};
3866
3867static void do_ccupdate_local(void *info)
3868{
3869	struct ccupdate_struct *new = info;
3870	struct array_cache *old;
3871
3872	check_irq_off();
3873	old = cpu_cache_get(new->cachep);
3874
3875	new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3876	new->new[smp_processor_id()] = old;
3877}
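/*
 * do_ccupdate_local() runs on every cpu via on_each_cpu() below: it
 * installs that cpu's newly allocated array_cache and hands the old one
 * back through new->new[], so that do_tune_cpucache() can drain the old
 * entries into the node lists and kfree() the old arrays afterwards.
 */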
3878
3879/* Always called with the cache_chain_mutex held */
3880static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3881				int batchcount, int shared)
3882{
3883	struct ccupdate_struct *new;
3884	int i;
3885
3886	new = kzalloc(sizeof(*new), GFP_KERNEL);
3887	if (!new)
3888		return -ENOMEM;
3889
3890	for_each_online_cpu(i) {
3891		new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
3892						batchcount);
3893		if (!new->new[i]) {
3894			for (i--; i >= 0; i--)
3895				kfree(new->new[i]);
3896			kfree(new);
3897			return -ENOMEM;
3898		}
3899	}
3900	new->cachep = cachep;
3901
3902	on_each_cpu(do_ccupdate_local, (void *)new, 1, 1);
3903
3904	check_irq_on();
3905	cachep->batchcount = batchcount;
3906	cachep->limit = limit;
3907	cachep->shared = shared;
3908
3909	for_each_online_cpu(i) {
3910		struct array_cache *ccold = new->new[i];
3911		if (!ccold)
3912			continue;
3913		spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3914		free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
3915		spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3916		kfree(ccold);
3917	}
3918	kfree(new);
3919	return alloc_kmemlist(cachep);
3920}
3921
3922/* Always called with the cache_chain_mutex held */
3923static int enable_cpucache(struct kmem_cache *cachep)
3924{
3925	int err;
3926	int limit, shared;
3927
3928	/*
3929	 * The head array serves three purposes:
3930	 * - create a LIFO ordering, i.e. return objects that are cache-warm
3931	 * - reduce the number of spinlock operations.
3932	 * - reduce the number of linked list operations on the slab and
3933	 *   bufctl chains: array operations are cheaper.
3934	 * The numbers are guessed; we should auto-tune as described by
3935	 * Bonwick.
3936	 */
3937	if (cachep->buffer_size > 131072)
3938		limit = 1;
3939	else if (cachep->buffer_size > PAGE_SIZE)
3940		limit = 8;
3941	else if (cachep->buffer_size > 1024)
3942		limit = 24;
3943	else if (cachep->buffer_size > 256)
3944		limit = 54;
3945	else
3946		limit = 120;
3947
3948	/*
3949	 * CPU bound tasks (e.g. network routing) can exhibit skewed
3950	 * allocation behaviour: most allocs on one cpu, most free operations
3951	 * on another cpu. For these cases, efficient object passing between
3952	 * cpus is necessary. This is provided by a shared array. The array
3953	 * replaces Bonwick's magazine layer.
3954	 * On uniprocessor, it's functionally equivalent (but less efficient)
3955	 * to a larger limit. Thus disabled by default.
3956	 */
3957	shared = 0;
3958#ifdef CONFIG_SMP
3959	if (cachep->buffer_size <= PAGE_SIZE)
3960		shared = 8;
3961#endif
3962
3963#if DEBUG
3964	/*
3965	 * With debugging enabled, large batchcounts lead to excessively long
3966	 * periods with local interrupts disabled. Limit the batchcount.
3967	 */
3968	if (limit > 32)
3969		limit = 32;
3970#endif
3971	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
3972	if (err)
3973		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
3974		       cachep->name, -err);
3975	return err;
3976}
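/*
 * Worked example of the heuristics above: a cache with 512-byte objects
 * (and no debug padding) gets limit = 54 and batchcount = (54 + 1) / 2
 * = 27; on SMP it is also given a shared array, sized in alloc_kmemlist()
 * as shared * batchcount = 8 * 27 = 216 entries per node, since the
 * object size does not exceed PAGE_SIZE.
 */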
3977
3978/*
3979 * Drain an array if it contains any elements, taking the l3 lock only if
3980 * necessary. Note that the l3 listlock also protects the array_cache
3981 * if drain_array() is used on the shared array.
3982 */
3983void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
3984			 struct array_cache *ac, int force, int node)
3985{
3986	int tofree;
3987
3988	if (!ac || !ac->avail)
3989		return;
3990	if (ac->touched && !force) {
3991		ac->touched = 0;
3992	} else {
3993		spin_lock_irq(&l3->list_lock);
3994		if (ac->avail) {
3995			tofree = force ? ac->avail : (ac->limit + 4) / 5;
3996			if (tofree > ac->avail)
3997				tofree = (ac->avail + 1) / 2;
3998			free_block(cachep, ac->entry, tofree, node);
3999			ac->avail -= tofree;
4000			memmove(ac->entry, &(ac->entry[tofree]),
4001				sizeof(void *) * ac->avail);
4002		}
4003		spin_unlock_irq(&l3->list_lock);
4004	}
4005}
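/*
 * Worked example for the drain above: with the default per-cpu limit of
 * 120 for small objects, a non-forced call frees (120 + 4) / 5 = 24
 * entries, reduced to roughly half of ac->avail when fewer objects than
 * that are currently cached.
 */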
4006
4007/**
4008 * cache_reap - Reclaim memory from caches.
4009 * @unused: unused parameter
4010 *
4011 * Called from workqueue/eventd every few seconds.
4012 * Purpose:
4013 * - clear the per-cpu caches for this CPU.
4014 * - return freeable pages to the main free memory pool.
4015 *
4016 * If we cannot acquire the cache chain mutex then just give up - we'll try
4017 * again on the next iteration.
4018 */
4019static void cache_reap(struct work_struct *unused)
4020{
4021	struct kmem_cache *searchp;
4022	struct kmem_list3 *l3;
4023	int node = numa_node_id();
4024
4025	if (!mutex_trylock(&cache_chain_mutex)) {
4026		/* Give up. Set up the next iteration. */
4027		schedule_delayed_work(&__get_cpu_var(reap_work),
4028				      round_jiffies_relative(REAPTIMEOUT_CPUC));
4029		return;
4030	}
4031
4032	list_for_each_entry(searchp, &cache_chain, next) {
4033		check_irq_on();
4034
4035		/*
4036		 * We only take the l3 lock if absolutely necessary and we
4037		 * have established with reasonable certainty that
4038		 * we can do some work if we obtain the lock.
4039		 */
4040		l3 = searchp->nodelists[node];
4041
4042		reap_alien(searchp, l3);
4043
4044		drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
4045
4046		/*
4047		 * These are racy checks but it does not matter
4048		 * if we skip one check or scan twice.
4049		 */
4050		if (time_after(l3->next_reap, jiffies))
4051			goto next;
4052
4053		l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
4054
4055		drain_array(searchp, l3, l3->shared, 0, node);
4056
4057		if (l3->free_touched)
4058			l3->free_touched = 0;
4059		else {
4060			int freed;
4061
4062			freed = drain_freelist(searchp, l3, (l3->free_limit +
4063				5 * searchp->num - 1) / (5 * searchp->num));
4064			STATS_ADD_REAPED(searchp, freed);
4065		}
4066next:
4067		cond_resched();
4068	}
4069	check_irq_on();
4070	mutex_unlock(&cache_chain_mutex);
4071	next_reap_node();
4072	refresh_cpu_vm_stats(smp_processor_id());
4073	/* Set up the next iteration */
4074	schedule_delayed_work(&__get_cpu_var(reap_work),
4075		round_jiffies_relative(REAPTIMEOUT_CPUC));
4076}
4077
4078#ifdef CONFIG_PROC_FS
4079
4080static void print_slabinfo_header(struct seq_file *m)
4081{
4082	/*
4083	 * Output format version, so at least we can change it
4084	 * without _too_ many complaints.
4085	 */
4086#if STATS
4087	seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
4088#else
4089	seq_puts(m, "slabinfo - version: 2.1\n");
4090#endif
4091	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
4092		 "<objperslab> <pagesperslab>");
4093	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
4094	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
4095#if STATS
4096	seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
4097		 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
4098	seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
4099#endif
4100	seq_putc(m, '\n');
4101}
4102
4103static void *s_start(struct seq_file *m, loff_t *pos)
4104{
4105	loff_t n = *pos;
4106	struct list_head *p;
4107
4108	mutex_lock(&cache_chain_mutex);
4109	if (!n)
4110		print_slabinfo_header(m);
4111	p = cache_chain.next;
4112	while (n--) {
4113		p = p->next;
4114		if (p == &cache_chain)
4115			return NULL;
4116	}
4117	return list_entry(p, struct kmem_cache, next);
4118}
4119
4120static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4121{
4122	struct kmem_cache *cachep = p;
4123	++*pos;
4124	return cachep->next.next == &cache_chain ?
4125		NULL : list_entry(cachep->next.next, struct kmem_cache, next);
4126}
4127
4128static void s_stop(struct seq_file *m, void *p)
4129{
4130	mutex_unlock(&cache_chain_mutex);
4131}
4132
4133static int s_show(struct seq_file *m, void *p)
4134{
4135	struct kmem_cache *cachep = p;
4136	struct slab *slabp;
4137	unsigned long active_objs;
4138	unsigned long num_objs;
4139	unsigned long active_slabs = 0;
4140	unsigned long num_slabs, free_objects = 0, shared_avail = 0;
4141	const char *name;
4142	char *error = NULL;
4143	int node;
4144	struct kmem_list3 *l3;
4145
4146	active_objs = 0;
4147	num_slabs = 0;
4148	for_each_online_node(node) {
4149		l3 = cachep->nodelists[node];
4150		if (!l3)
4151			continue;
4152
4153		check_irq_on();
4154		spin_lock_irq(&l3->list_lock);
4155
4156		list_for_each_entry(slabp, &l3->slabs_full, list) {
4157			if (slabp->inuse != cachep->num && !error)
4158				error = "slabs_full accounting error";
4159			active_objs += cachep->num;
4160			active_slabs++;
4161		}
4162		list_for_each_entry(slabp, &l3->slabs_partial, list) {
4163			if (slabp->inuse == cachep->num && !error)
4164				error = "slabs_partial inuse accounting error";
4165			if (!slabp->inuse && !error)
4166				error = "slabs_partial/inuse accounting error";
4167			active_objs += slabp->inuse;
4168			active_slabs++;
4169		}
4170		list_for_each_entry(slabp, &l3->slabs_free, list) {
4171			if (slabp->inuse && !error)
4172				error = "slabs_free/inuse accounting error";
4173			num_slabs++;
4174		}
4175		free_objects += l3->free_objects;
4176		if (l3->shared)
4177			shared_avail += l3->shared->avail;
4178
4179		spin_unlock_irq(&l3->list_lock);
4180	}
4181	num_slabs += active_slabs;
4182	num_objs = num_slabs * cachep->num;
4183	if (num_objs - active_objs != free_objects && !error)
4184		error = "free_objects accounting error";
4185
4186	name = cachep->name;
4187	if (error)
4188		printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4189
4190	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
4191		   name, active_objs, num_objs, cachep->buffer_size,
4192		   cachep->num, (1 << cachep->gfporder));
4193	seq_printf(m, " : tunables %4u %4u %4u",
4194		   cachep->limit, cachep->batchcount, cachep->shared);
4195	seq_printf(m, " : slabdata %6lu %6lu %6lu",
4196		   active_slabs, num_slabs, shared_avail);
4197#if STATS
4198	{			/* list3 stats */
4199		unsigned long high = cachep->high_mark;
4200		unsigned long allocs = cachep->num_allocations;
4201		unsigned long grown = cachep->grown;
4202		unsigned long reaped = cachep->reaped;
4203		unsigned long errors = cachep->errors;
4204		unsigned long max_freeable = cachep->max_freeable;
4205		unsigned long node_allocs = cachep->node_allocs;
4206		unsigned long node_frees = cachep->node_frees;
4207		unsigned long overflows = cachep->node_overflow;
4208
4209		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
4210				"%4lu %4lu %4lu %4lu %4lu", allocs, high, grown,
4211				reaped, errors, max_freeable, node_allocs,
4212				node_frees, overflows);
4213	}
4214	/* cpu stats */
4215	{
4216		unsigned long allochit = atomic_read(&cachep->allochit);
4217		unsigned long allocmiss = atomic_read(&cachep->allocmiss);
4218		unsigned long freehit = atomic_read(&cachep->freehit);
4219		unsigned long freemiss = atomic_read(&cachep->freemiss);
4220
4221		seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
4222			   allochit, allocmiss, freehit, freemiss);
4223	}
4224#endif
4225	seq_putc(m, '\n');
4226	return 0;
4227}
4228
4229/*
4230 * slabinfo_op - iterator that generates /proc/slabinfo
4231 *
4232 * Output layout:
4233 * cache-name
4234 * num-active-objs
4235 * total-objs
4236 * object size
4237 * objs-per-slab
4238 * pages-per-slab
4239 * tunables and slabdata (active-slabs, total-slabs, shared-avail)
4240 * + further values on SMP and with statistics enabled
4241 */
4242
4243const struct seq_operations slabinfo_op = {
4244	.start = s_start,
4245	.next = s_next,
4246	.stop = s_stop,
4247	.show = s_show,
4248};
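/*
 * Illustrative /proc/slabinfo line as emitted by s_show() above; the
 * cache name and all figures are hypothetical:
 *
 *	my_cache           20000  20880    132   29    1 : tunables  120   60    8 : slabdata    700    720      0
 */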
4249
4250#define MAX_SLABINFO_WRITE 128
4251/**
4252 * slabinfo_write - Tuning for the slab allocator
4253 * @file: unused
4254 * @buffer: user buffer
4255 * @count: data length
4256 * @ppos: unused
4257 */
4258ssize_t slabinfo_write(struct file *file, const char __user * buffer,
4259		       size_t count, loff_t *ppos)
4260{
4261	char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
4262	int limit, batchcount, shared, res;
4263	struct kmem_cache *cachep;
4264
4265	if (count > MAX_SLABINFO_WRITE)
4266		return -EINVAL;
4267	if (copy_from_user(&kbuf, buffer, count))
4268		return -EFAULT;
4269	kbuf[MAX_SLABINFO_WRITE] = '\0';
4270
4271	tmp = strchr(kbuf, ' ');
4272	if (!tmp)
4273		return -EINVAL;
4274	*tmp = '\0';
4275	tmp++;
4276	if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
4277		return -EINVAL;
4278
4279	/* Find the cache in the chain of caches. */
4280	mutex_lock(&cache_chain_mutex);
4281	res = -EINVAL;
4282	list_for_each_entry(cachep, &cache_chain, next) {
4283		if (!strcmp(cachep->name, kbuf)) {
4284			if (limit < 1 || batchcount < 1 ||
4285					batchcount > limit || shared < 0) {
4286				res = 0;
4287			} else {
4288				res = do_tune_cpucache(cachep, limit,
4289						       batchcount, shared);
4290			}
4291			break;
4292		}
4293	}
4294	mutex_unlock(&cache_chain_mutex);
4295	if (res >= 0)
4296		res = count;
4297	return res;
4298}
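/*
 * Illustrative sketch of the tuning interface parsed above: writing
 * "cache-name limit batchcount shared" to /proc/slabinfo retunes one
 * cache, e.g. from a shell (cache name and values are hypothetical):
 *
 *	echo "my_cache 120 60 8" > /proc/slabinfo
 *
 * If batchcount exceeds limit, or any value is out of range, the write
 * still returns success but the tunables are left unchanged (res = 0
 * above).
 */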
4299
4300#ifdef CONFIG_DEBUG_SLAB_LEAK
4301
4302static void *leaks_start(struct seq_file *m, loff_t *pos)
4303{
4304	loff_t n = *pos;
4305	struct list_head *p;
4306
4307	mutex_lock(&cache_chain_mutex);
4308	p = cache_chain.next;
4309	while (n--) {
4310		p = p->next;
4311		if (p == &cache_chain)
4312			return NULL;
4313	}
4314	return list_entry(p, struct kmem_cache, next);
4315}
4316
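/*
 * The leak report is accumulated in the buffer passed in via m->private:
 * n[0] holds the capacity in entries, n[1] the number of entries in use,
 * and n[2..] is a sorted array of (caller address, count) pairs that
 * add_caller() below maintains with a binary search.
 */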
4317static inline int add_caller(unsigned long *n, unsigned long v)
4318{
4319	unsigned long *p;
4320	int l;
4321	if (!v)
4322		return 1;
4323	l = n[1];
4324	p = n + 2;
4325	while (l) {
4326		int i = l/2;
4327		unsigned long *q = p + 2 * i;
4328		if (*q == v) {
4329			q[1]++;
4330			return 1;
4331		}
4332		if (*q > v) {
4333			l = i;
4334		} else {
4335			p = q + 2;
4336			l -= i + 1;
4337		}
4338	}
4339	if (++n[1] == n[0])
4340		return 0;
4341	memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
4342	p[0] = v;
4343	p[1] = 1;
4344	return 1;
4345}
4346
4347static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
4348{
4349	void *p;
4350	int i;
4351	if (n[0] == n[1])
4352		return;
4353	for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
4354		if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
4355			continue;
4356		if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
4357			return;
4358	}
4359}
4360
4361static void show_symbol(struct seq_file *m, unsigned long address)
4362{
4363#ifdef CONFIG_KALLSYMS
4364	char *modname;
4365	const char *name;
4366	unsigned long offset, size;
4367	char namebuf[KSYM_NAME_LEN+1];
4368
4369	name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
4370
4371	if (name) {
4372		seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
4373		if (modname)
4374			seq_printf(m, " [%s]", modname);
4375		return;
4376	}
4377#endif
4378	seq_printf(m, "%p", (void *)address);
4379}
4380
4381static int leaks_show(struct seq_file *m, void *p)
4382{
4383	struct kmem_cache *cachep = p;
4384	struct slab *slabp;
4385	struct kmem_list3 *l3;
4386	const char *name;
4387	unsigned long *n = m->private;
4388	int node;
4389	int i;
4390
4391	if (!(cachep->flags & SLAB_STORE_USER))
4392		return 0;
4393	if (!(cachep->flags & SLAB_RED_ZONE))
4394		return 0;
4395
4396	/* OK, we can do it */
4397
4398	n[1] = 0;
4399
4400	for_each_online_node(node) {
4401		l3 = cachep->nodelists[node];
4402		if (!l3)
4403			continue;
4404
4405		check_irq_on();
4406		spin_lock_irq(&l3->list_lock);
4407
4408		list_for_each_entry(slabp, &l3->slabs_full, list)
4409			handle_slab(n, cachep, slabp);
4410		list_for_each_entry(slabp, &l3->slabs_partial, list)
4411			handle_slab(n, cachep, slabp);
4412		spin_unlock_irq(&l3->list_lock);
4413	}
4414	name = cachep->name;
4415	if (n[0] == n[1]) {
4416		/* Increase the buffer size */
4417		mutex_unlock(&cache_chain_mutex);
4418		m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
4419		if (!m->private) {
4420			/* Too bad, we are really out */
4421			m->private = n;
4422			mutex_lock(&cache_chain_mutex);
4423			return -ENOMEM;
4424		}
4425		*(unsigned long *)m->private = n[0] * 2;
4426		kfree(n);
4427		mutex_lock(&cache_chain_mutex);
4428		/* Now make sure this entry will be retried */
4429		m->count = m->size;
4430		return 0;
4431	}
4432	for (i = 0; i < n[1]; i++) {
4433		seq_printf(m, "%s: %lu ", name, n[2*i+3]);
4434		show_symbol(m, n[2*i+2]);
4435		seq_putc(m, '\n');
4436	}
4437
4438	return 0;
4439}
4440
4441const struct seq_operations slabstats_op = {
4442	.start = leaks_start,
4443	.next = s_next,
4444	.stop = s_stop,
4445	.show = leaks_show,
4446};
4447#endif
4448#endif
4449
4450/**
4451 * ksize - get the actual amount of memory allocated for a given object
4452 * @objp: Pointer to the object
4453 *
4454 * kmalloc may internally round up allocations and return more memory
4455 * than requested. ksize() can be used to determine the actual amount of
4456 * memory allocated. The caller may use this additional memory, even though
4457 * a smaller amount of memory was initially specified with the kmalloc call.
4458 * The caller must guarantee that objp points to a valid object previously
4459 * allocated with either kmalloc() or kmem_cache_alloc(). The object
4460 * must not be freed for the duration of the call.
4461 */
4462unsigned int ksize(const void *objp)
4463{
4464	if (unlikely(objp == NULL))
4465		return 0;
4466
4467	return obj_size(virt_to_cache(objp));
4468}
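/*
 * Illustrative sketch: ksize() reports the usable size of the underlying
 * cache object, which may exceed the size originally requested.  The
 * variable names and sizes are hypothetical.
 *
 *	size_t avail;
 *	char *buf;
 *
 *	buf = kmalloc(100, GFP_KERNEL);
 *	if (buf)
 *		avail = ksize(buf);	at least 100 bytes usable
 */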
4469