slub.c revision 27d9e4e94862c89d171cf70911b4f11ad69fb54e
1/*
2 * SLUB: A slab allocator that limits cache line use instead of queuing
3 * objects in per cpu and per node lists.
4 *
5 * The allocator synchronizes using per slab locks and only
6 * uses a centralized lock to manage a pool of partial slabs.
7 *
8 * (C) 2007 SGI, Christoph Lameter <clameter@sgi.com>
9 */
10
11#include <linux/mm.h>
12#include <linux/module.h>
13#include <linux/bit_spinlock.h>
14#include <linux/interrupt.h>
15#include <linux/bitops.h>
16#include <linux/slab.h>
17#include <linux/seq_file.h>
18#include <linux/cpu.h>
19#include <linux/cpuset.h>
20#include <linux/mempolicy.h>
21#include <linux/ctype.h>
22#include <linux/kallsyms.h>
23#include <linux/memory.h>
24
25/*
26 * Lock order:
27 *   1. slab_lock(page)
28 *   2. slab->list_lock
29 *
30 *   The slab_lock protects operations on the object of a particular
31 *   slab and its metadata in the page struct. If the slab lock
32 *   has been taken then no allocations nor frees can be performed
33 *   on the objects in the slab nor can the slab be added or removed
34 *   from the partial or full lists since this would mean modifying
35 *   the page_struct of the slab.
36 *
37 *   The list_lock protects the partial and full list on each node and
38 *   the partial slab counter. If taken then no new slabs may be added or
39 *   removed from the lists nor make the number of partial slabs be modified.
40 *   (Note that the total number of slabs is an atomic value that may be
41 *   modified without taking the list lock).
42 *
43 *   The list_lock is a centralized lock and thus we avoid taking it as
44 *   much as possible. As long as SLUB does not have to handle partial
45 *   slabs, operations can continue without any centralized lock. F.e.
46 *   allocating a long series of objects that fill up slabs does not require
47 *   the list lock.
48 *
49 *   The lock order is sometimes inverted when we are trying to get a slab
50 *   off a list. We take the list_lock and then look for a page on the list
51 *   to use. While we do that objects in the slabs may be freed. We can
52 *   only operate on the slab if we have also taken the slab_lock. So we use
53 *   a slab_trylock() on the slab. If trylock was successful then no frees
54 *   can occur anymore and we can use the slab for allocations etc. If the
55 *   slab_trylock() does not succeed then frees are in progress in the slab and
56 *   we must stay away from it for a while since we may cause a bouncing
57 *   cacheline if we try to acquire the lock. So go onto the next slab.
58 *   If all pages are busy then we may allocate a new slab instead of reusing
59 *   a partial slab. A new slab has noone operating on it and thus there is
60 *   no danger of cacheline contention.
61 *
62 *   Interrupts are disabled during allocation and deallocation in order to
63 *   make the slab allocator safe to use in the context of an irq. In addition
64 *   interrupts are disabled to ensure that the processor does not change
65 *   while handling per_cpu slabs, due to kernel preemption.
66 *
67 * SLUB assigns one slab for allocation to each processor.
68 * Allocations only occur from these slabs called cpu slabs.
69 *
70 * Slabs with free elements are kept on a partial list and during regular
71 * operations no list for full slabs is used. If an object in a full slab is
72 * freed then the slab will show up again on the partial lists.
73 * We track full slabs for debugging purposes though because otherwise we
74 * cannot scan all objects.
75 *
76 * Slabs are freed when they become empty. Teardown and setup is
77 * minimal so we rely on the page allocators per cpu caches for
78 * fast frees and allocs.
79 *
80 * Overloading of page flags that are otherwise used for LRU management.
81 *
82 * PageActive 		The slab is frozen and exempt from list processing.
83 * 			This means that the slab is dedicated to a purpose
84 * 			such as satisfying allocations for a specific
85 * 			processor. Objects may be freed in the slab while
86 * 			it is frozen but slab_free will then skip the usual
87 * 			list operations. It is up to the processor holding
88 * 			the slab to integrate the slab into the slab lists
89 * 			when the slab is no longer needed.
90 *
91 * 			One use of this flag is to mark slabs that are
92 * 			used for allocations. Then such a slab becomes a cpu
93 * 			slab. The cpu slab may be equipped with an additional
94 * 			freelist that allows lockless access to
95 * 			free objects in addition to the regular freelist
96 * 			that requires the slab lock.
97 *
98 * PageError		Slab requires special handling due to debug
99 * 			options set. This moves	slab handling out of
100 * 			the fast path and disables lockless freelists.
101 */
102
103#define FROZEN (1 << PG_active)
104
105#ifdef CONFIG_SLUB_DEBUG
106#define SLABDEBUG (1 << PG_error)
107#else
108#define SLABDEBUG 0
109#endif
110
111static inline int SlabFrozen(struct page *page)
112{
113	return page->flags & FROZEN;
114}
115
116static inline void SetSlabFrozen(struct page *page)
117{
118	page->flags |= FROZEN;
119}
120
121static inline void ClearSlabFrozen(struct page *page)
122{
123	page->flags &= ~FROZEN;
124}
125
126static inline int SlabDebug(struct page *page)
127{
128	return page->flags & SLABDEBUG;
129}
130
131static inline void SetSlabDebug(struct page *page)
132{
133	page->flags |= SLABDEBUG;
134}
135
136static inline void ClearSlabDebug(struct page *page)
137{
138	page->flags &= ~SLABDEBUG;
139}
140
141/*
142 * Issues still to be resolved:
143 *
144 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
145 *
146 * - Variable sizing of the per node arrays
147 */
148
149/* Enable to test recovery from slab corruption on boot */
150#undef SLUB_RESILIENCY_TEST
151
152#if PAGE_SHIFT <= 12
153
154/*
155 * Small page size. Make sure that we do not fragment memory
156 */
157#define DEFAULT_MAX_ORDER 1
158#define DEFAULT_MIN_OBJECTS 4
159
160#else
161
162/*
163 * Large page machines are customarily able to handle larger
164 * page orders.
165 */
166#define DEFAULT_MAX_ORDER 2
167#define DEFAULT_MIN_OBJECTS 8
168
169#endif
170
171/*
172 * Mininum number of partial slabs. These will be left on the partial
173 * lists even if they are empty. kmem_cache_shrink may reclaim them.
174 */
175#define MIN_PARTIAL 5
176
177/*
178 * Maximum number of desirable partial slabs.
179 * The existence of more partial slabs makes kmem_cache_shrink
180 * sort the partial list by the number of objects in the.
181 */
182#define MAX_PARTIAL 10
183
184#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
185				SLAB_POISON | SLAB_STORE_USER)
186
187/*
188 * Set of flags that will prevent slab merging
189 */
190#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
191		SLAB_TRACE | SLAB_DESTROY_BY_RCU)
192
193#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
194		SLAB_CACHE_DMA)
195
196#ifndef ARCH_KMALLOC_MINALIGN
197#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
198#endif
199
200#ifndef ARCH_SLAB_MINALIGN
201#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
202#endif
203
204/* Internal SLUB flags */
205#define __OBJECT_POISON		0x80000000 /* Poison object */
206#define __SYSFS_ADD_DEFERRED	0x40000000 /* Not yet visible via sysfs */
207#define __KMALLOC_CACHE		0x20000000 /* objects freed using kfree */
208#define __PAGE_ALLOC_FALLBACK	0x10000000 /* Allow fallback to page alloc */
209
210/* Not all arches define cache_line_size */
211#ifndef cache_line_size
212#define cache_line_size()	L1_CACHE_BYTES
213#endif
214
215static int kmem_size = sizeof(struct kmem_cache);
216
217#ifdef CONFIG_SMP
218static struct notifier_block slab_notifier;
219#endif
220
221static enum {
222	DOWN,		/* No slab functionality available */
223	PARTIAL,	/* kmem_cache_open() works but kmalloc does not */
224	UP,		/* Everything works but does not show up in sysfs */
225	SYSFS		/* Sysfs up */
226} slab_state = DOWN;
227
228/* A list of all slab caches on the system */
229static DECLARE_RWSEM(slub_lock);
230static LIST_HEAD(slab_caches);
231
232/*
233 * Tracking user of a slab.
234 */
235struct track {
236	void *addr;		/* Called from address */
237	int cpu;		/* Was running on cpu */
238	int pid;		/* Pid context */
239	unsigned long when;	/* When did the operation occur */
240};
241
242enum track_item { TRACK_ALLOC, TRACK_FREE };
243
244#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
245static int sysfs_slab_add(struct kmem_cache *);
246static int sysfs_slab_alias(struct kmem_cache *, const char *);
247static void sysfs_slab_remove(struct kmem_cache *);
248
249#else
250static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
251static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
252							{ return 0; }
253static inline void sysfs_slab_remove(struct kmem_cache *s)
254{
255	kfree(s);
256}
257
258#endif
259
260static inline void stat(struct kmem_cache_cpu *c, enum stat_item si)
261{
262#ifdef CONFIG_SLUB_STATS
263	c->stat[si]++;
264#endif
265}
266
267/********************************************************************
268 * 			Core slab cache functions
269 *******************************************************************/
270
271int slab_is_available(void)
272{
273	return slab_state >= UP;
274}
275
276static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
277{
278#ifdef CONFIG_NUMA
279	return s->node[node];
280#else
281	return &s->local_node;
282#endif
283}
284
285static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
286{
287#ifdef CONFIG_SMP
288	return s->cpu_slab[cpu];
289#else
290	return &s->cpu_slab;
291#endif
292}
293
294static inline int check_valid_pointer(struct kmem_cache *s,
295				struct page *page, const void *object)
296{
297	void *base;
298
299	if (!object)
300		return 1;
301
302	base = page_address(page);
303	if (object < base || object >= base + s->objects * s->size ||
304		(object - base) % s->size) {
305		return 0;
306	}
307
308	return 1;
309}
310
311/*
312 * Slow version of get and set free pointer.
313 *
314 * This version requires touching the cache lines of kmem_cache which
315 * we avoid to do in the fast alloc free paths. There we obtain the offset
316 * from the page struct.
317 */
318static inline void *get_freepointer(struct kmem_cache *s, void *object)
319{
320	return *(void **)(object + s->offset);
321}
322
323static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
324{
325	*(void **)(object + s->offset) = fp;
326}
327
328/* Loop over all objects in a slab */
329#define for_each_object(__p, __s, __addr) \
330	for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\
331			__p += (__s)->size)
332
333/* Scan freelist */
334#define for_each_free_object(__p, __s, __free) \
335	for (__p = (__free); __p; __p = get_freepointer((__s), __p))
336
337/* Determine object index from a given position */
338static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
339{
340	return (p - addr) / s->size;
341}
342
343#ifdef CONFIG_SLUB_DEBUG
344/*
345 * Debug settings:
346 */
347#ifdef CONFIG_SLUB_DEBUG_ON
348static int slub_debug = DEBUG_DEFAULT_FLAGS;
349#else
350static int slub_debug;
351#endif
352
353static char *slub_debug_slabs;
354
355/*
356 * Object debugging
357 */
358static void print_section(char *text, u8 *addr, unsigned int length)
359{
360	int i, offset;
361	int newline = 1;
362	char ascii[17];
363
364	ascii[16] = 0;
365
366	for (i = 0; i < length; i++) {
367		if (newline) {
368			printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
369			newline = 0;
370		}
371		printk(KERN_CONT " %02x", addr[i]);
372		offset = i % 16;
373		ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
374		if (offset == 15) {
375			printk(KERN_CONT " %s\n", ascii);
376			newline = 1;
377		}
378	}
379	if (!newline) {
380		i %= 16;
381		while (i < 16) {
382			printk(KERN_CONT "   ");
383			ascii[i] = ' ';
384			i++;
385		}
386		printk(KERN_CONT " %s\n", ascii);
387	}
388}
389
390static struct track *get_track(struct kmem_cache *s, void *object,
391	enum track_item alloc)
392{
393	struct track *p;
394
395	if (s->offset)
396		p = object + s->offset + sizeof(void *);
397	else
398		p = object + s->inuse;
399
400	return p + alloc;
401}
402
403static void set_track(struct kmem_cache *s, void *object,
404				enum track_item alloc, void *addr)
405{
406	struct track *p;
407
408	if (s->offset)
409		p = object + s->offset + sizeof(void *);
410	else
411		p = object + s->inuse;
412
413	p += alloc;
414	if (addr) {
415		p->addr = addr;
416		p->cpu = smp_processor_id();
417		p->pid = current ? current->pid : -1;
418		p->when = jiffies;
419	} else
420		memset(p, 0, sizeof(struct track));
421}
422
423static void init_tracking(struct kmem_cache *s, void *object)
424{
425	if (!(s->flags & SLAB_STORE_USER))
426		return;
427
428	set_track(s, object, TRACK_FREE, NULL);
429	set_track(s, object, TRACK_ALLOC, NULL);
430}
431
432static void print_track(const char *s, struct track *t)
433{
434	if (!t->addr)
435		return;
436
437	printk(KERN_ERR "INFO: %s in ", s);
438	__print_symbol("%s", (unsigned long)t->addr);
439	printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
440}
441
442static void print_tracking(struct kmem_cache *s, void *object)
443{
444	if (!(s->flags & SLAB_STORE_USER))
445		return;
446
447	print_track("Allocated", get_track(s, object, TRACK_ALLOC));
448	print_track("Freed", get_track(s, object, TRACK_FREE));
449}
450
451static void print_page_info(struct page *page)
452{
453	printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n",
454		page, page->inuse, page->freelist, page->flags);
455
456}
457
458static void slab_bug(struct kmem_cache *s, char *fmt, ...)
459{
460	va_list args;
461	char buf[100];
462
463	va_start(args, fmt);
464	vsnprintf(buf, sizeof(buf), fmt, args);
465	va_end(args);
466	printk(KERN_ERR "========================================"
467			"=====================================\n");
468	printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
469	printk(KERN_ERR "----------------------------------------"
470			"-------------------------------------\n\n");
471}
472
473static void slab_fix(struct kmem_cache *s, char *fmt, ...)
474{
475	va_list args;
476	char buf[100];
477
478	va_start(args, fmt);
479	vsnprintf(buf, sizeof(buf), fmt, args);
480	va_end(args);
481	printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
482}
483
484static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
485{
486	unsigned int off;	/* Offset of last byte */
487	u8 *addr = page_address(page);
488
489	print_tracking(s, p);
490
491	print_page_info(page);
492
493	printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
494			p, p - addr, get_freepointer(s, p));
495
496	if (p > addr + 16)
497		print_section("Bytes b4", p - 16, 16);
498
499	print_section("Object", p, min(s->objsize, 128));
500
501	if (s->flags & SLAB_RED_ZONE)
502		print_section("Redzone", p + s->objsize,
503			s->inuse - s->objsize);
504
505	if (s->offset)
506		off = s->offset + sizeof(void *);
507	else
508		off = s->inuse;
509
510	if (s->flags & SLAB_STORE_USER)
511		off += 2 * sizeof(struct track);
512
513	if (off != s->size)
514		/* Beginning of the filler is the free pointer */
515		print_section("Padding", p + off, s->size - off);
516
517	dump_stack();
518}
519
520static void object_err(struct kmem_cache *s, struct page *page,
521			u8 *object, char *reason)
522{
523	slab_bug(s, reason);
524	print_trailer(s, page, object);
525}
526
527static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
528{
529	va_list args;
530	char buf[100];
531
532	va_start(args, fmt);
533	vsnprintf(buf, sizeof(buf), fmt, args);
534	va_end(args);
535	slab_bug(s, fmt);
536	print_page_info(page);
537	dump_stack();
538}
539
540static void init_object(struct kmem_cache *s, void *object, int active)
541{
542	u8 *p = object;
543
544	if (s->flags & __OBJECT_POISON) {
545		memset(p, POISON_FREE, s->objsize - 1);
546		p[s->objsize - 1] = POISON_END;
547	}
548
549	if (s->flags & SLAB_RED_ZONE)
550		memset(p + s->objsize,
551			active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
552			s->inuse - s->objsize);
553}
554
555static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
556{
557	while (bytes) {
558		if (*start != (u8)value)
559			return start;
560		start++;
561		bytes--;
562	}
563	return NULL;
564}
565
566static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
567						void *from, void *to)
568{
569	slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
570	memset(from, data, to - from);
571}
572
573static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
574			u8 *object, char *what,
575			u8 *start, unsigned int value, unsigned int bytes)
576{
577	u8 *fault;
578	u8 *end;
579
580	fault = check_bytes(start, value, bytes);
581	if (!fault)
582		return 1;
583
584	end = start + bytes;
585	while (end > fault && end[-1] == value)
586		end--;
587
588	slab_bug(s, "%s overwritten", what);
589	printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
590					fault, end - 1, fault[0], value);
591	print_trailer(s, page, object);
592
593	restore_bytes(s, what, value, fault, end);
594	return 0;
595}
596
597/*
598 * Object layout:
599 *
600 * object address
601 * 	Bytes of the object to be managed.
602 * 	If the freepointer may overlay the object then the free
603 * 	pointer is the first word of the object.
604 *
605 * 	Poisoning uses 0x6b (POISON_FREE) and the last byte is
606 * 	0xa5 (POISON_END)
607 *
608 * object + s->objsize
609 * 	Padding to reach word boundary. This is also used for Redzoning.
610 * 	Padding is extended by another word if Redzoning is enabled and
611 * 	objsize == inuse.
612 *
613 * 	We fill with 0xbb (RED_INACTIVE) for inactive objects and with
614 * 	0xcc (RED_ACTIVE) for objects in use.
615 *
616 * object + s->inuse
617 * 	Meta data starts here.
618 *
619 * 	A. Free pointer (if we cannot overwrite object on free)
620 * 	B. Tracking data for SLAB_STORE_USER
621 * 	C. Padding to reach required alignment boundary or at mininum
622 * 		one word if debuggin is on to be able to detect writes
623 * 		before the word boundary.
624 *
625 *	Padding is done using 0x5a (POISON_INUSE)
626 *
627 * object + s->size
628 * 	Nothing is used beyond s->size.
629 *
630 * If slabcaches are merged then the objsize and inuse boundaries are mostly
631 * ignored. And therefore no slab options that rely on these boundaries
632 * may be used with merged slabcaches.
633 */
634
635static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
636{
637	unsigned long off = s->inuse;	/* The end of info */
638
639	if (s->offset)
640		/* Freepointer is placed after the object. */
641		off += sizeof(void *);
642
643	if (s->flags & SLAB_STORE_USER)
644		/* We also have user information there */
645		off += 2 * sizeof(struct track);
646
647	if (s->size == off)
648		return 1;
649
650	return check_bytes_and_report(s, page, p, "Object padding",
651				p + off, POISON_INUSE, s->size - off);
652}
653
654static int slab_pad_check(struct kmem_cache *s, struct page *page)
655{
656	u8 *start;
657	u8 *fault;
658	u8 *end;
659	int length;
660	int remainder;
661
662	if (!(s->flags & SLAB_POISON))
663		return 1;
664
665	start = page_address(page);
666	end = start + (PAGE_SIZE << s->order);
667	length = s->objects * s->size;
668	remainder = end - (start + length);
669	if (!remainder)
670		return 1;
671
672	fault = check_bytes(start + length, POISON_INUSE, remainder);
673	if (!fault)
674		return 1;
675	while (end > fault && end[-1] == POISON_INUSE)
676		end--;
677
678	slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
679	print_section("Padding", start, length);
680
681	restore_bytes(s, "slab padding", POISON_INUSE, start, end);
682	return 0;
683}
684
685static int check_object(struct kmem_cache *s, struct page *page,
686					void *object, int active)
687{
688	u8 *p = object;
689	u8 *endobject = object + s->objsize;
690
691	if (s->flags & SLAB_RED_ZONE) {
692		unsigned int red =
693			active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
694
695		if (!check_bytes_and_report(s, page, object, "Redzone",
696			endobject, red, s->inuse - s->objsize))
697			return 0;
698	} else {
699		if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
700			check_bytes_and_report(s, page, p, "Alignment padding",
701				endobject, POISON_INUSE, s->inuse - s->objsize);
702		}
703	}
704
705	if (s->flags & SLAB_POISON) {
706		if (!active && (s->flags & __OBJECT_POISON) &&
707			(!check_bytes_and_report(s, page, p, "Poison", p,
708					POISON_FREE, s->objsize - 1) ||
709			 !check_bytes_and_report(s, page, p, "Poison",
710				p + s->objsize - 1, POISON_END, 1)))
711			return 0;
712		/*
713		 * check_pad_bytes cleans up on its own.
714		 */
715		check_pad_bytes(s, page, p);
716	}
717
718	if (!s->offset && active)
719		/*
720		 * Object and freepointer overlap. Cannot check
721		 * freepointer while object is allocated.
722		 */
723		return 1;
724
725	/* Check free pointer validity */
726	if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
727		object_err(s, page, p, "Freepointer corrupt");
728		/*
729		 * No choice but to zap it and thus loose the remainder
730		 * of the free objects in this slab. May cause
731		 * another error because the object count is now wrong.
732		 */
733		set_freepointer(s, p, NULL);
734		return 0;
735	}
736	return 1;
737}
738
739static int check_slab(struct kmem_cache *s, struct page *page)
740{
741	VM_BUG_ON(!irqs_disabled());
742
743	if (!PageSlab(page)) {
744		slab_err(s, page, "Not a valid slab page");
745		return 0;
746	}
747	if (page->inuse > s->objects) {
748		slab_err(s, page, "inuse %u > max %u",
749			s->name, page->inuse, s->objects);
750		return 0;
751	}
752	/* Slab_pad_check fixes things up after itself */
753	slab_pad_check(s, page);
754	return 1;
755}
756
757/*
758 * Determine if a certain object on a page is on the freelist. Must hold the
759 * slab lock to guarantee that the chains are in a consistent state.
760 */
761static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
762{
763	int nr = 0;
764	void *fp = page->freelist;
765	void *object = NULL;
766
767	while (fp && nr <= s->objects) {
768		if (fp == search)
769			return 1;
770		if (!check_valid_pointer(s, page, fp)) {
771			if (object) {
772				object_err(s, page, object,
773					"Freechain corrupt");
774				set_freepointer(s, object, NULL);
775				break;
776			} else {
777				slab_err(s, page, "Freepointer corrupt");
778				page->freelist = NULL;
779				page->inuse = s->objects;
780				slab_fix(s, "Freelist cleared");
781				return 0;
782			}
783			break;
784		}
785		object = fp;
786		fp = get_freepointer(s, object);
787		nr++;
788	}
789
790	if (page->inuse != s->objects - nr) {
791		slab_err(s, page, "Wrong object count. Counter is %d but "
792			"counted were %d", page->inuse, s->objects - nr);
793		page->inuse = s->objects - nr;
794		slab_fix(s, "Object count adjusted.");
795	}
796	return search == NULL;
797}
798
799static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc)
800{
801	if (s->flags & SLAB_TRACE) {
802		printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
803			s->name,
804			alloc ? "alloc" : "free",
805			object, page->inuse,
806			page->freelist);
807
808		if (!alloc)
809			print_section("Object", (void *)object, s->objsize);
810
811		dump_stack();
812	}
813}
814
815/*
816 * Tracking of fully allocated slabs for debugging purposes.
817 */
818static void add_full(struct kmem_cache_node *n, struct page *page)
819{
820	spin_lock(&n->list_lock);
821	list_add(&page->lru, &n->full);
822	spin_unlock(&n->list_lock);
823}
824
825static void remove_full(struct kmem_cache *s, struct page *page)
826{
827	struct kmem_cache_node *n;
828
829	if (!(s->flags & SLAB_STORE_USER))
830		return;
831
832	n = get_node(s, page_to_nid(page));
833
834	spin_lock(&n->list_lock);
835	list_del(&page->lru);
836	spin_unlock(&n->list_lock);
837}
838
839static void setup_object_debug(struct kmem_cache *s, struct page *page,
840								void *object)
841{
842	if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
843		return;
844
845	init_object(s, object, 0);
846	init_tracking(s, object);
847}
848
849static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
850						void *object, void *addr)
851{
852	if (!check_slab(s, page))
853		goto bad;
854
855	if (!on_freelist(s, page, object)) {
856		object_err(s, page, object, "Object already allocated");
857		goto bad;
858	}
859
860	if (!check_valid_pointer(s, page, object)) {
861		object_err(s, page, object, "Freelist Pointer check fails");
862		goto bad;
863	}
864
865	if (!check_object(s, page, object, 0))
866		goto bad;
867
868	/* Success perform special debug activities for allocs */
869	if (s->flags & SLAB_STORE_USER)
870		set_track(s, object, TRACK_ALLOC, addr);
871	trace(s, page, object, 1);
872	init_object(s, object, 1);
873	return 1;
874
875bad:
876	if (PageSlab(page)) {
877		/*
878		 * If this is a slab page then lets do the best we can
879		 * to avoid issues in the future. Marking all objects
880		 * as used avoids touching the remaining objects.
881		 */
882		slab_fix(s, "Marking all objects used");
883		page->inuse = s->objects;
884		page->freelist = NULL;
885	}
886	return 0;
887}
888
889static int free_debug_processing(struct kmem_cache *s, struct page *page,
890						void *object, void *addr)
891{
892	if (!check_slab(s, page))
893		goto fail;
894
895	if (!check_valid_pointer(s, page, object)) {
896		slab_err(s, page, "Invalid object pointer 0x%p", object);
897		goto fail;
898	}
899
900	if (on_freelist(s, page, object)) {
901		object_err(s, page, object, "Object already free");
902		goto fail;
903	}
904
905	if (!check_object(s, page, object, 1))
906		return 0;
907
908	if (unlikely(s != page->slab)) {
909		if (!PageSlab(page)) {
910			slab_err(s, page, "Attempt to free object(0x%p) "
911				"outside of slab", object);
912		} else if (!page->slab) {
913			printk(KERN_ERR
914				"SLUB <none>: no slab for object 0x%p.\n",
915						object);
916			dump_stack();
917		} else
918			object_err(s, page, object,
919					"page slab pointer corrupt.");
920		goto fail;
921	}
922
923	/* Special debug activities for freeing objects */
924	if (!SlabFrozen(page) && !page->freelist)
925		remove_full(s, page);
926	if (s->flags & SLAB_STORE_USER)
927		set_track(s, object, TRACK_FREE, addr);
928	trace(s, page, object, 0);
929	init_object(s, object, 0);
930	return 1;
931
932fail:
933	slab_fix(s, "Object at 0x%p not freed", object);
934	return 0;
935}
936
937static int __init setup_slub_debug(char *str)
938{
939	slub_debug = DEBUG_DEFAULT_FLAGS;
940	if (*str++ != '=' || !*str)
941		/*
942		 * No options specified. Switch on full debugging.
943		 */
944		goto out;
945
946	if (*str == ',')
947		/*
948		 * No options but restriction on slabs. This means full
949		 * debugging for slabs matching a pattern.
950		 */
951		goto check_slabs;
952
953	slub_debug = 0;
954	if (*str == '-')
955		/*
956		 * Switch off all debugging measures.
957		 */
958		goto out;
959
960	/*
961	 * Determine which debug features should be switched on
962	 */
963	for (; *str && *str != ','; str++) {
964		switch (tolower(*str)) {
965		case 'f':
966			slub_debug |= SLAB_DEBUG_FREE;
967			break;
968		case 'z':
969			slub_debug |= SLAB_RED_ZONE;
970			break;
971		case 'p':
972			slub_debug |= SLAB_POISON;
973			break;
974		case 'u':
975			slub_debug |= SLAB_STORE_USER;
976			break;
977		case 't':
978			slub_debug |= SLAB_TRACE;
979			break;
980		default:
981			printk(KERN_ERR "slub_debug option '%c' "
982				"unknown. skipped\n", *str);
983		}
984	}
985
986check_slabs:
987	if (*str == ',')
988		slub_debug_slabs = str + 1;
989out:
990	return 1;
991}
992
993__setup("slub_debug", setup_slub_debug);
994
995static unsigned long kmem_cache_flags(unsigned long objsize,
996	unsigned long flags, const char *name,
997	void (*ctor)(struct kmem_cache *, void *))
998{
999	/*
1000	 * Enable debugging if selected on the kernel commandline.
1001	 */
1002	if (slub_debug && (!slub_debug_slabs ||
1003	    strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)) == 0))
1004			flags |= slub_debug;
1005
1006	return flags;
1007}
1008#else
1009static inline void setup_object_debug(struct kmem_cache *s,
1010			struct page *page, void *object) {}
1011
1012static inline int alloc_debug_processing(struct kmem_cache *s,
1013	struct page *page, void *object, void *addr) { return 0; }
1014
1015static inline int free_debug_processing(struct kmem_cache *s,
1016	struct page *page, void *object, void *addr) { return 0; }
1017
1018static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1019			{ return 1; }
1020static inline int check_object(struct kmem_cache *s, struct page *page,
1021			void *object, int active) { return 1; }
1022static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
1023static inline unsigned long kmem_cache_flags(unsigned long objsize,
1024	unsigned long flags, const char *name,
1025	void (*ctor)(struct kmem_cache *, void *))
1026{
1027	return flags;
1028}
1029#define slub_debug 0
1030#endif
1031/*
1032 * Slab allocation and freeing
1033 */
1034static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1035{
1036	struct page *page;
1037	int pages = 1 << s->order;
1038
1039	flags |= s->allocflags;
1040
1041	if (node == -1)
1042		page = alloc_pages(flags, s->order);
1043	else
1044		page = alloc_pages_node(node, flags, s->order);
1045
1046	if (!page)
1047		return NULL;
1048
1049	mod_zone_page_state(page_zone(page),
1050		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
1051		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1052		pages);
1053
1054	return page;
1055}
1056
1057static void setup_object(struct kmem_cache *s, struct page *page,
1058				void *object)
1059{
1060	setup_object_debug(s, page, object);
1061	if (unlikely(s->ctor))
1062		s->ctor(s, object);
1063}
1064
1065static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1066{
1067	struct page *page;
1068	struct kmem_cache_node *n;
1069	void *start;
1070	void *last;
1071	void *p;
1072
1073	BUG_ON(flags & GFP_SLAB_BUG_MASK);
1074
1075	page = allocate_slab(s,
1076		flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1077	if (!page)
1078		goto out;
1079
1080	n = get_node(s, page_to_nid(page));
1081	if (n)
1082		atomic_long_inc(&n->nr_slabs);
1083	page->slab = s;
1084	page->flags |= 1 << PG_slab;
1085	if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
1086			SLAB_STORE_USER | SLAB_TRACE))
1087		SetSlabDebug(page);
1088
1089	start = page_address(page);
1090
1091	if (unlikely(s->flags & SLAB_POISON))
1092		memset(start, POISON_INUSE, PAGE_SIZE << s->order);
1093
1094	last = start;
1095	for_each_object(p, s, start) {
1096		setup_object(s, page, last);
1097		set_freepointer(s, last, p);
1098		last = p;
1099	}
1100	setup_object(s, page, last);
1101	set_freepointer(s, last, NULL);
1102
1103	page->freelist = start;
1104	page->inuse = 0;
1105out:
1106	return page;
1107}
1108
1109static void __free_slab(struct kmem_cache *s, struct page *page)
1110{
1111	int pages = 1 << s->order;
1112
1113	if (unlikely(SlabDebug(page))) {
1114		void *p;
1115
1116		slab_pad_check(s, page);
1117		for_each_object(p, s, page_address(page))
1118			check_object(s, page, p, 0);
1119		ClearSlabDebug(page);
1120	}
1121
1122	mod_zone_page_state(page_zone(page),
1123		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
1124		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1125		-pages);
1126
1127	__free_pages(page, s->order);
1128}
1129
1130static void rcu_free_slab(struct rcu_head *h)
1131{
1132	struct page *page;
1133
1134	page = container_of((struct list_head *)h, struct page, lru);
1135	__free_slab(page->slab, page);
1136}
1137
1138static void free_slab(struct kmem_cache *s, struct page *page)
1139{
1140	if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
1141		/*
1142		 * RCU free overloads the RCU head over the LRU
1143		 */
1144		struct rcu_head *head = (void *)&page->lru;
1145
1146		call_rcu(head, rcu_free_slab);
1147	} else
1148		__free_slab(s, page);
1149}
1150
1151static void discard_slab(struct kmem_cache *s, struct page *page)
1152{
1153	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1154
1155	atomic_long_dec(&n->nr_slabs);
1156	reset_page_mapcount(page);
1157	__ClearPageSlab(page);
1158	free_slab(s, page);
1159}
1160
1161/*
1162 * Per slab locking using the pagelock
1163 */
1164static __always_inline void slab_lock(struct page *page)
1165{
1166	bit_spin_lock(PG_locked, &page->flags);
1167}
1168
1169static __always_inline void slab_unlock(struct page *page)
1170{
1171	__bit_spin_unlock(PG_locked, &page->flags);
1172}
1173
1174static __always_inline int slab_trylock(struct page *page)
1175{
1176	int rc = 1;
1177
1178	rc = bit_spin_trylock(PG_locked, &page->flags);
1179	return rc;
1180}
1181
1182/*
1183 * Management of partially allocated slabs
1184 */
1185static void add_partial(struct kmem_cache_node *n,
1186				struct page *page, int tail)
1187{
1188	spin_lock(&n->list_lock);
1189	n->nr_partial++;
1190	if (tail)
1191		list_add_tail(&page->lru, &n->partial);
1192	else
1193		list_add(&page->lru, &n->partial);
1194	spin_unlock(&n->list_lock);
1195}
1196
1197static void remove_partial(struct kmem_cache *s,
1198						struct page *page)
1199{
1200	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1201
1202	spin_lock(&n->list_lock);
1203	list_del(&page->lru);
1204	n->nr_partial--;
1205	spin_unlock(&n->list_lock);
1206}
1207
1208/*
1209 * Lock slab and remove from the partial list.
1210 *
1211 * Must hold list_lock.
1212 */
1213static inline int lock_and_freeze_slab(struct kmem_cache_node *n, struct page *page)
1214{
1215	if (slab_trylock(page)) {
1216		list_del(&page->lru);
1217		n->nr_partial--;
1218		SetSlabFrozen(page);
1219		return 1;
1220	}
1221	return 0;
1222}
1223
1224/*
1225 * Try to allocate a partial slab from a specific node.
1226 */
1227static struct page *get_partial_node(struct kmem_cache_node *n)
1228{
1229	struct page *page;
1230
1231	/*
1232	 * Racy check. If we mistakenly see no partial slabs then we
1233	 * just allocate an empty slab. If we mistakenly try to get a
1234	 * partial slab and there is none available then get_partials()
1235	 * will return NULL.
1236	 */
1237	if (!n || !n->nr_partial)
1238		return NULL;
1239
1240	spin_lock(&n->list_lock);
1241	list_for_each_entry(page, &n->partial, lru)
1242		if (lock_and_freeze_slab(n, page))
1243			goto out;
1244	page = NULL;
1245out:
1246	spin_unlock(&n->list_lock);
1247	return page;
1248}
1249
1250/*
1251 * Get a page from somewhere. Search in increasing NUMA distances.
1252 */
1253static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1254{
1255#ifdef CONFIG_NUMA
1256	struct zonelist *zonelist;
1257	struct zone **z;
1258	struct page *page;
1259
1260	/*
1261	 * The defrag ratio allows a configuration of the tradeoffs between
1262	 * inter node defragmentation and node local allocations. A lower
1263	 * defrag_ratio increases the tendency to do local allocations
1264	 * instead of attempting to obtain partial slabs from other nodes.
1265	 *
1266	 * If the defrag_ratio is set to 0 then kmalloc() always
1267	 * returns node local objects. If the ratio is higher then kmalloc()
1268	 * may return off node objects because partial slabs are obtained
1269	 * from other nodes and filled up.
1270	 *
1271	 * If /sys/slab/xx/defrag_ratio is set to 100 (which makes
1272	 * defrag_ratio = 1000) then every (well almost) allocation will
1273	 * first attempt to defrag slab caches on other nodes. This means
1274	 * scanning over all nodes to look for partial slabs which may be
1275	 * expensive if we do it every time we are trying to find a slab
1276	 * with available objects.
1277	 */
1278	if (!s->remote_node_defrag_ratio ||
1279			get_cycles() % 1024 > s->remote_node_defrag_ratio)
1280		return NULL;
1281
1282	zonelist = &NODE_DATA(
1283		slab_node(current->mempolicy))->node_zonelists[gfp_zone(flags)];
1284	for (z = zonelist->zones; *z; z++) {
1285		struct kmem_cache_node *n;
1286
1287		n = get_node(s, zone_to_nid(*z));
1288
1289		if (n && cpuset_zone_allowed_hardwall(*z, flags) &&
1290				n->nr_partial > MIN_PARTIAL) {
1291			page = get_partial_node(n);
1292			if (page)
1293				return page;
1294		}
1295	}
1296#endif
1297	return NULL;
1298}
1299
1300/*
1301 * Get a partial page, lock it and return it.
1302 */
1303static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1304{
1305	struct page *page;
1306	int searchnode = (node == -1) ? numa_node_id() : node;
1307
1308	page = get_partial_node(get_node(s, searchnode));
1309	if (page || (flags & __GFP_THISNODE))
1310		return page;
1311
1312	return get_any_partial(s, flags);
1313}
1314
1315/*
1316 * Move a page back to the lists.
1317 *
1318 * Must be called with the slab lock held.
1319 *
1320 * On exit the slab lock will have been dropped.
1321 */
1322static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1323{
1324	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1325	struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id());
1326
1327	ClearSlabFrozen(page);
1328	if (page->inuse) {
1329
1330		if (page->freelist) {
1331			add_partial(n, page, tail);
1332			stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1333		} else {
1334			stat(c, DEACTIVATE_FULL);
1335			if (SlabDebug(page) && (s->flags & SLAB_STORE_USER))
1336				add_full(n, page);
1337		}
1338		slab_unlock(page);
1339	} else {
1340		stat(c, DEACTIVATE_EMPTY);
1341		if (n->nr_partial < MIN_PARTIAL) {
1342			/*
1343			 * Adding an empty slab to the partial slabs in order
1344			 * to avoid page allocator overhead. This slab needs
1345			 * to come after the other slabs with objects in
1346			 * order to fill them up. That way the size of the
1347			 * partial list stays small. kmem_cache_shrink can
1348			 * reclaim empty slabs from the partial list.
1349			 */
1350			add_partial(n, page, 1);
1351			slab_unlock(page);
1352		} else {
1353			slab_unlock(page);
1354			stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB);
1355			discard_slab(s, page);
1356		}
1357	}
1358}
1359
1360/*
1361 * Remove the cpu slab
1362 */
1363static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1364{
1365	struct page *page = c->page;
1366	int tail = 1;
1367
1368	if (c->freelist)
1369		stat(c, DEACTIVATE_REMOTE_FREES);
1370	/*
1371	 * Merge cpu freelist into freelist. Typically we get here
1372	 * because both freelists are empty. So this is unlikely
1373	 * to occur.
1374	 */
1375	while (unlikely(c->freelist)) {
1376		void **object;
1377
1378		tail = 0;	/* Hot objects. Put the slab first */
1379
1380		/* Retrieve object from cpu_freelist */
1381		object = c->freelist;
1382		c->freelist = c->freelist[c->offset];
1383
1384		/* And put onto the regular freelist */
1385		object[c->offset] = page->freelist;
1386		page->freelist = object;
1387		page->inuse--;
1388	}
1389	c->page = NULL;
1390	unfreeze_slab(s, page, tail);
1391}
1392
1393static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1394{
1395	stat(c, CPUSLAB_FLUSH);
1396	slab_lock(c->page);
1397	deactivate_slab(s, c);
1398}
1399
1400/*
1401 * Flush cpu slab.
1402 * Called from IPI handler with interrupts disabled.
1403 */
1404static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
1405{
1406	struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
1407
1408	if (likely(c && c->page))
1409		flush_slab(s, c);
1410}
1411
1412static void flush_cpu_slab(void *d)
1413{
1414	struct kmem_cache *s = d;
1415
1416	__flush_cpu_slab(s, smp_processor_id());
1417}
1418
1419static void flush_all(struct kmem_cache *s)
1420{
1421#ifdef CONFIG_SMP
1422	on_each_cpu(flush_cpu_slab, s, 1, 1);
1423#else
1424	unsigned long flags;
1425
1426	local_irq_save(flags);
1427	flush_cpu_slab(s);
1428	local_irq_restore(flags);
1429#endif
1430}
1431
1432/*
1433 * Check if the objects in a per cpu structure fit numa
1434 * locality expectations.
1435 */
1436static inline int node_match(struct kmem_cache_cpu *c, int node)
1437{
1438#ifdef CONFIG_NUMA
1439	if (node != -1 && c->node != node)
1440		return 0;
1441#endif
1442	return 1;
1443}
1444
1445/*
1446 * Slow path. The lockless freelist is empty or we need to perform
1447 * debugging duties.
1448 *
1449 * Interrupts are disabled.
1450 *
1451 * Processing is still very fast if new objects have been freed to the
1452 * regular freelist. In that case we simply take over the regular freelist
1453 * as the lockless freelist and zap the regular freelist.
1454 *
1455 * If that is not working then we fall back to the partial lists. We take the
1456 * first element of the freelist as the object to allocate now and move the
1457 * rest of the freelist to the lockless freelist.
1458 *
1459 * And if we were unable to get a new slab from the partial slab lists then
1460 * we need to allocate a new slab. This is slowest path since we may sleep.
1461 */
1462static void *__slab_alloc(struct kmem_cache *s,
1463		gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c)
1464{
1465	void **object;
1466	struct page *new;
1467
1468	if (!c->page)
1469		goto new_slab;
1470
1471	slab_lock(c->page);
1472	if (unlikely(!node_match(c, node)))
1473		goto another_slab;
1474	stat(c, ALLOC_REFILL);
1475load_freelist:
1476	object = c->page->freelist;
1477	if (unlikely(!object))
1478		goto another_slab;
1479	if (unlikely(SlabDebug(c->page)))
1480		goto debug;
1481
1482	object = c->page->freelist;
1483	c->freelist = object[c->offset];
1484	c->page->inuse = s->objects;
1485	c->page->freelist = NULL;
1486	c->node = page_to_nid(c->page);
1487unlock_out:
1488	slab_unlock(c->page);
1489	stat(c, ALLOC_SLOWPATH);
1490	return object;
1491
1492another_slab:
1493	deactivate_slab(s, c);
1494
1495new_slab:
1496	new = get_partial(s, gfpflags, node);
1497	if (new) {
1498		c->page = new;
1499		stat(c, ALLOC_FROM_PARTIAL);
1500		goto load_freelist;
1501	}
1502
1503	if (gfpflags & __GFP_WAIT)
1504		local_irq_enable();
1505
1506	new = new_slab(s, gfpflags, node);
1507
1508	if (gfpflags & __GFP_WAIT)
1509		local_irq_disable();
1510
1511	if (new) {
1512		c = get_cpu_slab(s, smp_processor_id());
1513		stat(c, ALLOC_SLAB);
1514		if (c->page)
1515			flush_slab(s, c);
1516		slab_lock(new);
1517		SetSlabFrozen(new);
1518		c->page = new;
1519		goto load_freelist;
1520	}
1521
1522	/*
1523	 * No memory available.
1524	 *
1525	 * If the slab uses higher order allocs but the object is
1526	 * smaller than a page size then we can fallback in emergencies
1527	 * to the page allocator via kmalloc_large. The page allocator may
1528	 * have failed to obtain a higher order page and we can try to
1529	 * allocate a single page if the object fits into a single page.
1530	 * That is only possible if certain conditions are met that are being
1531	 * checked when a slab is created.
1532	 */
1533	if (!(gfpflags & __GFP_NORETRY) && (s->flags & __PAGE_ALLOC_FALLBACK))
1534		return kmalloc_large(s->objsize, gfpflags);
1535
1536	return NULL;
1537debug:
1538	object = c->page->freelist;
1539	if (!alloc_debug_processing(s, c->page, object, addr))
1540		goto another_slab;
1541
1542	c->page->inuse++;
1543	c->page->freelist = object[c->offset];
1544	c->node = -1;
1545	goto unlock_out;
1546}
1547
1548/*
1549 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
1550 * have the fastpath folded into their functions. So no function call
1551 * overhead for requests that can be satisfied on the fastpath.
1552 *
1553 * The fastpath works by first checking if the lockless freelist can be used.
1554 * If not then __slab_alloc is called for slow processing.
1555 *
1556 * Otherwise we can simply pick the next object from the lockless free list.
1557 */
1558static __always_inline void *slab_alloc(struct kmem_cache *s,
1559		gfp_t gfpflags, int node, void *addr)
1560{
1561	void **object;
1562	struct kmem_cache_cpu *c;
1563	unsigned long flags;
1564
1565	local_irq_save(flags);
1566	c = get_cpu_slab(s, smp_processor_id());
1567	if (unlikely(!c->freelist || !node_match(c, node)))
1568
1569		object = __slab_alloc(s, gfpflags, node, addr, c);
1570
1571	else {
1572		object = c->freelist;
1573		c->freelist = object[c->offset];
1574		stat(c, ALLOC_FASTPATH);
1575	}
1576	local_irq_restore(flags);
1577
1578	if (unlikely((gfpflags & __GFP_ZERO) && object))
1579		memset(object, 0, c->objsize);
1580
1581	return object;
1582}
1583
1584void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1585{
1586	return slab_alloc(s, gfpflags, -1, __builtin_return_address(0));
1587}
1588EXPORT_SYMBOL(kmem_cache_alloc);
1589
1590#ifdef CONFIG_NUMA
1591void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1592{
1593	return slab_alloc(s, gfpflags, node, __builtin_return_address(0));
1594}
1595EXPORT_SYMBOL(kmem_cache_alloc_node);
1596#endif
1597
1598/*
1599 * Slow patch handling. This may still be called frequently since objects
1600 * have a longer lifetime than the cpu slabs in most processing loads.
1601 *
1602 * So we still attempt to reduce cache line usage. Just take the slab
1603 * lock and free the item. If there is no additional partial page
1604 * handling required then we can return immediately.
1605 */
1606static void __slab_free(struct kmem_cache *s, struct page *page,
1607				void *x, void *addr, unsigned int offset)
1608{
1609	void *prior;
1610	void **object = (void *)x;
1611	struct kmem_cache_cpu *c;
1612
1613	c = get_cpu_slab(s, raw_smp_processor_id());
1614	stat(c, FREE_SLOWPATH);
1615	slab_lock(page);
1616
1617	if (unlikely(SlabDebug(page)))
1618		goto debug;
1619checks_ok:
1620	prior = object[offset] = page->freelist;
1621	page->freelist = object;
1622	page->inuse--;
1623
1624	if (unlikely(SlabFrozen(page))) {
1625		stat(c, FREE_FROZEN);
1626		goto out_unlock;
1627	}
1628
1629	if (unlikely(!page->inuse))
1630		goto slab_empty;
1631
1632	/*
1633	 * Objects left in the slab. If it
1634	 * was not on the partial list before
1635	 * then add it.
1636	 */
1637	if (unlikely(!prior)) {
1638		add_partial(get_node(s, page_to_nid(page)), page, 1);
1639		stat(c, FREE_ADD_PARTIAL);
1640	}
1641
1642out_unlock:
1643	slab_unlock(page);
1644	return;
1645
1646slab_empty:
1647	if (prior) {
1648		/*
1649		 * Slab still on the partial list.
1650		 */
1651		remove_partial(s, page);
1652		stat(c, FREE_REMOVE_PARTIAL);
1653	}
1654	slab_unlock(page);
1655	stat(c, FREE_SLAB);
1656	discard_slab(s, page);
1657	return;
1658
1659debug:
1660	if (!free_debug_processing(s, page, x, addr))
1661		goto out_unlock;
1662	goto checks_ok;
1663}
1664
1665/*
1666 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
1667 * can perform fastpath freeing without additional function calls.
1668 *
1669 * The fastpath is only possible if we are freeing to the current cpu slab
1670 * of this processor. This typically the case if we have just allocated
1671 * the item before.
1672 *
1673 * If fastpath is not possible then fall back to __slab_free where we deal
1674 * with all sorts of special processing.
1675 */
1676static __always_inline void slab_free(struct kmem_cache *s,
1677			struct page *page, void *x, void *addr)
1678{
1679	void **object = (void *)x;
1680	struct kmem_cache_cpu *c;
1681	unsigned long flags;
1682
1683	local_irq_save(flags);
1684	c = get_cpu_slab(s, smp_processor_id());
1685	debug_check_no_locks_freed(object, c->objsize);
1686	if (likely(page == c->page && c->node >= 0)) {
1687		object[c->offset] = c->freelist;
1688		c->freelist = object;
1689		stat(c, FREE_FASTPATH);
1690	} else
1691		__slab_free(s, page, x, addr, c->offset);
1692
1693	local_irq_restore(flags);
1694}
1695
1696void kmem_cache_free(struct kmem_cache *s, void *x)
1697{
1698	struct page *page;
1699
1700	page = virt_to_head_page(x);
1701
1702	slab_free(s, page, x, __builtin_return_address(0));
1703}
1704EXPORT_SYMBOL(kmem_cache_free);
1705
1706/* Figure out on which slab object the object resides */
1707static struct page *get_object_page(const void *x)
1708{
1709	struct page *page = virt_to_head_page(x);
1710
1711	if (!PageSlab(page))
1712		return NULL;
1713
1714	return page;
1715}
1716
1717/*
1718 * Object placement in a slab is made very easy because we always start at
1719 * offset 0. If we tune the size of the object to the alignment then we can
1720 * get the required alignment by putting one properly sized object after
1721 * another.
1722 *
1723 * Notice that the allocation order determines the sizes of the per cpu
1724 * caches. Each processor has always one slab available for allocations.
1725 * Increasing the allocation order reduces the number of times that slabs
1726 * must be moved on and off the partial lists and is therefore a factor in
1727 * locking overhead.
1728 */
1729
1730/*
1731 * Mininum / Maximum order of slab pages. This influences locking overhead
1732 * and slab fragmentation. A higher order reduces the number of partial slabs
1733 * and increases the number of allocations possible without having to
1734 * take the list_lock.
1735 */
1736static int slub_min_order;
1737static int slub_max_order = DEFAULT_MAX_ORDER;
1738static int slub_min_objects = DEFAULT_MIN_OBJECTS;
1739
1740/*
1741 * Merge control. If this is set then no merging of slab caches will occur.
1742 * (Could be removed. This was introduced to pacify the merge skeptics.)
1743 */
1744static int slub_nomerge;
1745
1746/*
1747 * Calculate the order of allocation given an slab object size.
1748 *
1749 * The order of allocation has significant impact on performance and other
1750 * system components. Generally order 0 allocations should be preferred since
1751 * order 0 does not cause fragmentation in the page allocator. Larger objects
1752 * be problematic to put into order 0 slabs because there may be too much
1753 * unused space left. We go to a higher order if more than 1/8th of the slab
1754 * would be wasted.
1755 *
1756 * In order to reach satisfactory performance we must ensure that a minimum
1757 * number of objects is in one slab. Otherwise we may generate too much
1758 * activity on the partial lists which requires taking the list_lock. This is
1759 * less a concern for large slabs though which are rarely used.
1760 *
1761 * slub_max_order specifies the order where we begin to stop considering the
1762 * number of objects in a slab as critical. If we reach slub_max_order then
1763 * we try to keep the page order as low as possible. So we accept more waste
1764 * of space in favor of a small page order.
1765 *
1766 * Higher order allocations also allow the placement of more objects in a
1767 * slab and thereby reduce object handling overhead. If the user has
1768 * requested a higher mininum order then we start with that one instead of
1769 * the smallest order which will fit the object.
1770 */
1771static inline int slab_order(int size, int min_objects,
1772				int max_order, int fract_leftover)
1773{
1774	int order;
1775	int rem;
1776	int min_order = slub_min_order;
1777
1778	for (order = max(min_order,
1779				fls(min_objects * size - 1) - PAGE_SHIFT);
1780			order <= max_order; order++) {
1781
1782		unsigned long slab_size = PAGE_SIZE << order;
1783
1784		if (slab_size < min_objects * size)
1785			continue;
1786
1787		rem = slab_size % size;
1788
1789		if (rem <= slab_size / fract_leftover)
1790			break;
1791
1792	}
1793
1794	return order;
1795}
1796
1797static inline int calculate_order(int size)
1798{
1799	int order;
1800	int min_objects;
1801	int fraction;
1802
1803	/*
1804	 * Attempt to find best configuration for a slab. This
1805	 * works by first attempting to generate a layout with
1806	 * the best configuration and backing off gradually.
1807	 *
1808	 * First we reduce the acceptable waste in a slab. Then
1809	 * we reduce the minimum objects required in a slab.
1810	 */
1811	min_objects = slub_min_objects;
1812	while (min_objects > 1) {
1813		fraction = 8;
1814		while (fraction >= 4) {
1815			order = slab_order(size, min_objects,
1816						slub_max_order, fraction);
1817			if (order <= slub_max_order)
1818				return order;
1819			fraction /= 2;
1820		}
1821		min_objects /= 2;
1822	}
1823
1824	/*
1825	 * We were unable to place multiple objects in a slab. Now
1826	 * lets see if we can place a single object there.
1827	 */
1828	order = slab_order(size, 1, slub_max_order, 1);
1829	if (order <= slub_max_order)
1830		return order;
1831
1832	/*
1833	 * Doh this slab cannot be placed using slub_max_order.
1834	 */
1835	order = slab_order(size, 1, MAX_ORDER, 1);
1836	if (order <= MAX_ORDER)
1837		return order;
1838	return -ENOSYS;
1839}
1840
1841/*
1842 * Figure out what the alignment of the objects will be.
1843 */
1844static unsigned long calculate_alignment(unsigned long flags,
1845		unsigned long align, unsigned long size)
1846{
1847	/*
1848	 * If the user wants hardware cache aligned objects then
1849	 * follow that suggestion if the object is sufficiently
1850	 * large.
1851	 *
1852	 * The hardware cache alignment cannot override the
1853	 * specified alignment though. If that is greater
1854	 * then use it.
1855	 */
1856	if ((flags & SLAB_HWCACHE_ALIGN) &&
1857			size > cache_line_size() / 2)
1858		return max_t(unsigned long, align, cache_line_size());
1859
1860	if (align < ARCH_SLAB_MINALIGN)
1861		return ARCH_SLAB_MINALIGN;
1862
1863	return ALIGN(align, sizeof(void *));
1864}
1865
1866static void init_kmem_cache_cpu(struct kmem_cache *s,
1867			struct kmem_cache_cpu *c)
1868{
1869	c->page = NULL;
1870	c->freelist = NULL;
1871	c->node = 0;
1872	c->offset = s->offset / sizeof(void *);
1873	c->objsize = s->objsize;
1874}
1875
1876static void init_kmem_cache_node(struct kmem_cache_node *n)
1877{
1878	n->nr_partial = 0;
1879	atomic_long_set(&n->nr_slabs, 0);
1880	spin_lock_init(&n->list_lock);
1881	INIT_LIST_HEAD(&n->partial);
1882#ifdef CONFIG_SLUB_DEBUG
1883	INIT_LIST_HEAD(&n->full);
1884#endif
1885}
1886
1887#ifdef CONFIG_SMP
1888/*
1889 * Per cpu array for per cpu structures.
1890 *
1891 * The per cpu array places all kmem_cache_cpu structures from one processor
1892 * close together meaning that it becomes possible that multiple per cpu
1893 * structures are contained in one cacheline. This may be particularly
1894 * beneficial for the kmalloc caches.
1895 *
1896 * A desktop system typically has around 60-80 slabs. With 100 here we are
1897 * likely able to get per cpu structures for all caches from the array defined
1898 * here. We must be able to cover all kmalloc caches during bootstrap.
1899 *
1900 * If the per cpu array is exhausted then fall back to kmalloc
1901 * of individual cachelines. No sharing is possible then.
1902 */
1903#define NR_KMEM_CACHE_CPU 100
1904
1905static DEFINE_PER_CPU(struct kmem_cache_cpu,
1906				kmem_cache_cpu)[NR_KMEM_CACHE_CPU];
1907
1908static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
1909static cpumask_t kmem_cach_cpu_free_init_once = CPU_MASK_NONE;
1910
1911static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
1912							int cpu, gfp_t flags)
1913{
1914	struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu);
1915
1916	if (c)
1917		per_cpu(kmem_cache_cpu_free, cpu) =
1918				(void *)c->freelist;
1919	else {
1920		/* Table overflow: So allocate ourselves */
1921		c = kmalloc_node(
1922			ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()),
1923			flags, cpu_to_node(cpu));
1924		if (!c)
1925			return NULL;
1926	}
1927
1928	init_kmem_cache_cpu(s, c);
1929	return c;
1930}
1931
1932static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu)
1933{
1934	if (c < per_cpu(kmem_cache_cpu, cpu) ||
1935			c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
1936		kfree(c);
1937		return;
1938	}
1939	c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu);
1940	per_cpu(kmem_cache_cpu_free, cpu) = c;
1941}
1942
1943static void free_kmem_cache_cpus(struct kmem_cache *s)
1944{
1945	int cpu;
1946
1947	for_each_online_cpu(cpu) {
1948		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
1949
1950		if (c) {
1951			s->cpu_slab[cpu] = NULL;
1952			free_kmem_cache_cpu(c, cpu);
1953		}
1954	}
1955}
1956
1957static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
1958{
1959	int cpu;
1960
1961	for_each_online_cpu(cpu) {
1962		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
1963
1964		if (c)
1965			continue;
1966
1967		c = alloc_kmem_cache_cpu(s, cpu, flags);
1968		if (!c) {
1969			free_kmem_cache_cpus(s);
1970			return 0;
1971		}
1972		s->cpu_slab[cpu] = c;
1973	}
1974	return 1;
1975}
1976
1977/*
1978 * Initialize the per cpu array.
1979 */
1980static void init_alloc_cpu_cpu(int cpu)
1981{
1982	int i;
1983
1984	if (cpu_isset(cpu, kmem_cach_cpu_free_init_once))
1985		return;
1986
1987	for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--)
1988		free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu);
1989
1990	cpu_set(cpu, kmem_cach_cpu_free_init_once);
1991}
1992
1993static void __init init_alloc_cpu(void)
1994{
1995	int cpu;
1996
1997	for_each_online_cpu(cpu)
1998		init_alloc_cpu_cpu(cpu);
1999  }
2000
2001#else
2002static inline void free_kmem_cache_cpus(struct kmem_cache *s) {}
2003static inline void init_alloc_cpu(void) {}
2004
2005static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2006{
2007	init_kmem_cache_cpu(s, &s->cpu_slab);
2008	return 1;
2009}
2010#endif
2011
2012#ifdef CONFIG_NUMA
2013/*
2014 * No kmalloc_node yet so do it by hand. We know that this is the first
2015 * slab on the node for this slabcache. There are no concurrent accesses
2016 * possible.
2017 *
2018 * Note that this function only works on the kmalloc_node_cache
2019 * when allocating for the kmalloc_node_cache. This is used for bootstrapping
2020 * memory on a fresh node that has no slab structures yet.
2021 */
2022static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
2023							   int node)
2024{
2025	struct page *page;
2026	struct kmem_cache_node *n;
2027	unsigned long flags;
2028
2029	BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
2030
2031	page = new_slab(kmalloc_caches, gfpflags, node);
2032
2033	BUG_ON(!page);
2034	if (page_to_nid(page) != node) {
2035		printk(KERN_ERR "SLUB: Unable to allocate memory from "
2036				"node %d\n", node);
2037		printk(KERN_ERR "SLUB: Allocating a useless per node structure "
2038				"in order to be able to continue\n");
2039	}
2040
2041	n = page->freelist;
2042	BUG_ON(!n);
2043	page->freelist = get_freepointer(kmalloc_caches, n);
2044	page->inuse++;
2045	kmalloc_caches->node[node] = n;
2046#ifdef CONFIG_SLUB_DEBUG
2047	init_object(kmalloc_caches, n, 1);
2048	init_tracking(kmalloc_caches, n);
2049#endif
2050	init_kmem_cache_node(n);
2051	atomic_long_inc(&n->nr_slabs);
2052	/*
2053	 * lockdep requires consistent irq usage for each lock
2054	 * so even though there cannot be a race this early in
2055	 * the boot sequence, we still disable irqs.
2056	 */
2057	local_irq_save(flags);
2058	add_partial(n, page, 0);
2059	local_irq_restore(flags);
2060	return n;
2061}
2062
2063static void free_kmem_cache_nodes(struct kmem_cache *s)
2064{
2065	int node;
2066
2067	for_each_node_state(node, N_NORMAL_MEMORY) {
2068		struct kmem_cache_node *n = s->node[node];
2069		if (n && n != &s->local_node)
2070			kmem_cache_free(kmalloc_caches, n);
2071		s->node[node] = NULL;
2072	}
2073}
2074
2075static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2076{
2077	int node;
2078	int local_node;
2079
2080	if (slab_state >= UP)
2081		local_node = page_to_nid(virt_to_page(s));
2082	else
2083		local_node = 0;
2084
2085	for_each_node_state(node, N_NORMAL_MEMORY) {
2086		struct kmem_cache_node *n;
2087
2088		if (local_node == node)
2089			n = &s->local_node;
2090		else {
2091			if (slab_state == DOWN) {
2092				n = early_kmem_cache_node_alloc(gfpflags,
2093								node);
2094				continue;
2095			}
2096			n = kmem_cache_alloc_node(kmalloc_caches,
2097							gfpflags, node);
2098
2099			if (!n) {
2100				free_kmem_cache_nodes(s);
2101				return 0;
2102			}
2103
2104		}
2105		s->node[node] = n;
2106		init_kmem_cache_node(n);
2107	}
2108	return 1;
2109}
2110#else
2111static void free_kmem_cache_nodes(struct kmem_cache *s)
2112{
2113}
2114
2115static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2116{
2117	init_kmem_cache_node(&s->local_node);
2118	return 1;
2119}
2120#endif
2121
2122/*
2123 * calculate_sizes() determines the order and the distribution of data within
2124 * a slab object.
2125 */
2126static int calculate_sizes(struct kmem_cache *s)
2127{
2128	unsigned long flags = s->flags;
2129	unsigned long size = s->objsize;
2130	unsigned long align = s->align;
2131
2132	/*
2133	 * Determine if we can poison the object itself. If the user of
2134	 * the slab may touch the object after free or before allocation
2135	 * then we should never poison the object itself.
2136	 */
2137	if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
2138			!s->ctor)
2139		s->flags |= __OBJECT_POISON;
2140	else
2141		s->flags &= ~__OBJECT_POISON;
2142
2143	/*
2144	 * Round up object size to the next word boundary. We can only
2145	 * place the free pointer at word boundaries and this determines
2146	 * the possible location of the free pointer.
2147	 */
2148	size = ALIGN(size, sizeof(void *));
2149
2150#ifdef CONFIG_SLUB_DEBUG
2151	/*
2152	 * If we are Redzoning then check if there is some space between the
2153	 * end of the object and the free pointer. If not then add an
2154	 * additional word to have some bytes to store Redzone information.
2155	 */
2156	if ((flags & SLAB_RED_ZONE) && size == s->objsize)
2157		size += sizeof(void *);
2158#endif
2159
2160	/*
2161	 * With that we have determined the number of bytes in actual use
2162	 * by the object. This is the potential offset to the free pointer.
2163	 */
2164	s->inuse = size;
2165
2166	if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
2167		s->ctor)) {
2168		/*
2169		 * Relocate free pointer after the object if it is not
2170		 * permitted to overwrite the first word of the object on
2171		 * kmem_cache_free.
2172		 *
2173		 * This is the case if we do RCU, have a constructor or
2174		 * destructor or are poisoning the objects.
2175		 */
2176		s->offset = size;
2177		size += sizeof(void *);
2178	}
2179
2180#ifdef CONFIG_SLUB_DEBUG
2181	if (flags & SLAB_STORE_USER)
2182		/*
2183		 * Need to store information about allocs and frees after
2184		 * the object.
2185		 */
2186		size += 2 * sizeof(struct track);
2187
2188	if (flags & SLAB_RED_ZONE)
2189		/*
2190		 * Add some empty padding so that we can catch
2191		 * overwrites from earlier objects rather than let
2192		 * tracking information or the free pointer be
2193		 * corrupted if an user writes before the start
2194		 * of the object.
2195		 */
2196		size += sizeof(void *);
2197#endif
2198
2199	/*
2200	 * Determine the alignment based on various parameters that the
2201	 * user specified and the dynamic determination of cache line size
2202	 * on bootup.
2203	 */
2204	align = calculate_alignment(flags, align, s->objsize);
2205
2206	/*
2207	 * SLUB stores one object immediately after another beginning from
2208	 * offset 0. In order to align the objects we have to simply size
2209	 * each object to conform to the alignment.
2210	 */
2211	size = ALIGN(size, align);
2212	s->size = size;
2213
2214	if ((flags & __KMALLOC_CACHE) &&
2215			PAGE_SIZE / size < slub_min_objects) {
2216		/*
2217		 * Kmalloc cache that would not have enough objects in
2218		 * an order 0 page. Kmalloc slabs can fallback to
2219		 * page allocator order 0 allocs so take a reasonably large
2220		 * order that will allows us a good number of objects.
2221		 */
2222		s->order = max(slub_max_order, PAGE_ALLOC_COSTLY_ORDER);
2223		s->flags |= __PAGE_ALLOC_FALLBACK;
2224		s->allocflags |= __GFP_NOWARN;
2225	} else
2226		s->order = calculate_order(size);
2227
2228	if (s->order < 0)
2229		return 0;
2230
2231	s->allocflags = 0;
2232	if (s->order)
2233		s->allocflags |= __GFP_COMP;
2234
2235	if (s->flags & SLAB_CACHE_DMA)
2236		s->allocflags |= SLUB_DMA;
2237
2238	if (s->flags & SLAB_RECLAIM_ACCOUNT)
2239		s->allocflags |= __GFP_RECLAIMABLE;
2240
2241	/*
2242	 * Determine the number of objects per slab
2243	 */
2244	s->objects = (PAGE_SIZE << s->order) / size;
2245
2246	return !!s->objects;
2247
2248}
2249
2250static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2251		const char *name, size_t size,
2252		size_t align, unsigned long flags,
2253		void (*ctor)(struct kmem_cache *, void *))
2254{
2255	memset(s, 0, kmem_size);
2256	s->name = name;
2257	s->ctor = ctor;
2258	s->objsize = size;
2259	s->align = align;
2260	s->flags = kmem_cache_flags(size, flags, name, ctor);
2261
2262	if (!calculate_sizes(s))
2263		goto error;
2264
2265	s->refcount = 1;
2266#ifdef CONFIG_NUMA
2267	s->remote_node_defrag_ratio = 100;
2268#endif
2269	if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
2270		goto error;
2271
2272	if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA))
2273		return 1;
2274	free_kmem_cache_nodes(s);
2275error:
2276	if (flags & SLAB_PANIC)
2277		panic("Cannot create slab %s size=%lu realsize=%u "
2278			"order=%u offset=%u flags=%lx\n",
2279			s->name, (unsigned long)size, s->size, s->order,
2280			s->offset, flags);
2281	return 0;
2282}
2283
2284/*
2285 * Check if a given pointer is valid
2286 */
2287int kmem_ptr_validate(struct kmem_cache *s, const void *object)
2288{
2289	struct page *page;
2290
2291	page = get_object_page(object);
2292
2293	if (!page || s != page->slab)
2294		/* No slab or wrong slab */
2295		return 0;
2296
2297	if (!check_valid_pointer(s, page, object))
2298		return 0;
2299
2300	/*
2301	 * We could also check if the object is on the slabs freelist.
2302	 * But this would be too expensive and it seems that the main
2303	 * purpose of kmem_ptr_valid is to check if the object belongs
2304	 * to a certain slab.
2305	 */
2306	return 1;
2307}
2308EXPORT_SYMBOL(kmem_ptr_validate);
2309
2310/*
2311 * Determine the size of a slab object
2312 */
2313unsigned int kmem_cache_size(struct kmem_cache *s)
2314{
2315	return s->objsize;
2316}
2317EXPORT_SYMBOL(kmem_cache_size);
2318
2319const char *kmem_cache_name(struct kmem_cache *s)
2320{
2321	return s->name;
2322}
2323EXPORT_SYMBOL(kmem_cache_name);
2324
2325/*
2326 * Attempt to free all slabs on a node. Return the number of slabs we
2327 * were unable to free.
2328 */
2329static int free_list(struct kmem_cache *s, struct kmem_cache_node *n,
2330			struct list_head *list)
2331{
2332	int slabs_inuse = 0;
2333	unsigned long flags;
2334	struct page *page, *h;
2335
2336	spin_lock_irqsave(&n->list_lock, flags);
2337	list_for_each_entry_safe(page, h, list, lru)
2338		if (!page->inuse) {
2339			list_del(&page->lru);
2340			discard_slab(s, page);
2341		} else
2342			slabs_inuse++;
2343	spin_unlock_irqrestore(&n->list_lock, flags);
2344	return slabs_inuse;
2345}
2346
2347/*
2348 * Release all resources used by a slab cache.
2349 */
2350static inline int kmem_cache_close(struct kmem_cache *s)
2351{
2352	int node;
2353
2354	flush_all(s);
2355
2356	/* Attempt to free all objects */
2357	free_kmem_cache_cpus(s);
2358	for_each_node_state(node, N_NORMAL_MEMORY) {
2359		struct kmem_cache_node *n = get_node(s, node);
2360
2361		n->nr_partial -= free_list(s, n, &n->partial);
2362		if (atomic_long_read(&n->nr_slabs))
2363			return 1;
2364	}
2365	free_kmem_cache_nodes(s);
2366	return 0;
2367}
2368
2369/*
2370 * Close a cache and release the kmem_cache structure
2371 * (must be used for caches created using kmem_cache_create)
2372 */
2373void kmem_cache_destroy(struct kmem_cache *s)
2374{
2375	down_write(&slub_lock);
2376	s->refcount--;
2377	if (!s->refcount) {
2378		list_del(&s->list);
2379		up_write(&slub_lock);
2380		if (kmem_cache_close(s))
2381			WARN_ON(1);
2382		sysfs_slab_remove(s);
2383	} else
2384		up_write(&slub_lock);
2385}
2386EXPORT_SYMBOL(kmem_cache_destroy);
2387
2388/********************************************************************
2389 *		Kmalloc subsystem
2390 *******************************************************************/
2391
2392struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned;
2393EXPORT_SYMBOL(kmalloc_caches);
2394
2395#ifdef CONFIG_ZONE_DMA
2396static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1];
2397#endif
2398
2399static int __init setup_slub_min_order(char *str)
2400{
2401	get_option(&str, &slub_min_order);
2402
2403	return 1;
2404}
2405
2406__setup("slub_min_order=", setup_slub_min_order);
2407
2408static int __init setup_slub_max_order(char *str)
2409{
2410	get_option(&str, &slub_max_order);
2411
2412	return 1;
2413}
2414
2415__setup("slub_max_order=", setup_slub_max_order);
2416
2417static int __init setup_slub_min_objects(char *str)
2418{
2419	get_option(&str, &slub_min_objects);
2420
2421	return 1;
2422}
2423
2424__setup("slub_min_objects=", setup_slub_min_objects);
2425
2426static int __init setup_slub_nomerge(char *str)
2427{
2428	slub_nomerge = 1;
2429	return 1;
2430}
2431
2432__setup("slub_nomerge", setup_slub_nomerge);
2433
2434static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
2435		const char *name, int size, gfp_t gfp_flags)
2436{
2437	unsigned int flags = 0;
2438
2439	if (gfp_flags & SLUB_DMA)
2440		flags = SLAB_CACHE_DMA;
2441
2442	down_write(&slub_lock);
2443	if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
2444			flags | __KMALLOC_CACHE, NULL))
2445		goto panic;
2446
2447	list_add(&s->list, &slab_caches);
2448	up_write(&slub_lock);
2449	if (sysfs_slab_add(s))
2450		goto panic;
2451	return s;
2452
2453panic:
2454	panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
2455}
2456
2457#ifdef CONFIG_ZONE_DMA
2458
2459static void sysfs_add_func(struct work_struct *w)
2460{
2461	struct kmem_cache *s;
2462
2463	down_write(&slub_lock);
2464	list_for_each_entry(s, &slab_caches, list) {
2465		if (s->flags & __SYSFS_ADD_DEFERRED) {
2466			s->flags &= ~__SYSFS_ADD_DEFERRED;
2467			sysfs_slab_add(s);
2468		}
2469	}
2470	up_write(&slub_lock);
2471}
2472
2473static DECLARE_WORK(sysfs_add_work, sysfs_add_func);
2474
2475static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2476{
2477	struct kmem_cache *s;
2478	char *text;
2479	size_t realsize;
2480
2481	s = kmalloc_caches_dma[index];
2482	if (s)
2483		return s;
2484
2485	/* Dynamically create dma cache */
2486	if (flags & __GFP_WAIT)
2487		down_write(&slub_lock);
2488	else {
2489		if (!down_write_trylock(&slub_lock))
2490			goto out;
2491	}
2492
2493	if (kmalloc_caches_dma[index])
2494		goto unlock_out;
2495
2496	realsize = kmalloc_caches[index].objsize;
2497	text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
2498			 (unsigned int)realsize);
2499	s = kmalloc(kmem_size, flags & ~SLUB_DMA);
2500
2501	if (!s || !text || !kmem_cache_open(s, flags, text,
2502			realsize, ARCH_KMALLOC_MINALIGN,
2503			SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) {
2504		kfree(s);
2505		kfree(text);
2506		goto unlock_out;
2507	}
2508
2509	list_add(&s->list, &slab_caches);
2510	kmalloc_caches_dma[index] = s;
2511
2512	schedule_work(&sysfs_add_work);
2513
2514unlock_out:
2515	up_write(&slub_lock);
2516out:
2517	return kmalloc_caches_dma[index];
2518}
2519#endif
2520
2521/*
2522 * Conversion table for small slabs sizes / 8 to the index in the
2523 * kmalloc array. This is necessary for slabs < 192 since we have non power
2524 * of two cache sizes there. The size of larger slabs can be determined using
2525 * fls.
2526 */
2527static s8 size_index[24] = {
2528	3,	/* 8 */
2529	4,	/* 16 */
2530	5,	/* 24 */
2531	5,	/* 32 */
2532	6,	/* 40 */
2533	6,	/* 48 */
2534	6,	/* 56 */
2535	6,	/* 64 */
2536	1,	/* 72 */
2537	1,	/* 80 */
2538	1,	/* 88 */
2539	1,	/* 96 */
2540	7,	/* 104 */
2541	7,	/* 112 */
2542	7,	/* 120 */
2543	7,	/* 128 */
2544	2,	/* 136 */
2545	2,	/* 144 */
2546	2,	/* 152 */
2547	2,	/* 160 */
2548	2,	/* 168 */
2549	2,	/* 176 */
2550	2,	/* 184 */
2551	2	/* 192 */
2552};
2553
2554static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2555{
2556	int index;
2557
2558	if (size <= 192) {
2559		if (!size)
2560			return ZERO_SIZE_PTR;
2561
2562		index = size_index[(size - 1) / 8];
2563	} else
2564		index = fls(size - 1);
2565
2566#ifdef CONFIG_ZONE_DMA
2567	if (unlikely((flags & SLUB_DMA)))
2568		return dma_kmalloc_cache(index, flags);
2569
2570#endif
2571	return &kmalloc_caches[index];
2572}
2573
2574void *__kmalloc(size_t size, gfp_t flags)
2575{
2576	struct kmem_cache *s;
2577
2578	if (unlikely(size > PAGE_SIZE))
2579		return kmalloc_large(size, flags);
2580
2581	s = get_slab(size, flags);
2582
2583	if (unlikely(ZERO_OR_NULL_PTR(s)))
2584		return s;
2585
2586	return slab_alloc(s, flags, -1, __builtin_return_address(0));
2587}
2588EXPORT_SYMBOL(__kmalloc);
2589
2590#ifdef CONFIG_NUMA
2591void *__kmalloc_node(size_t size, gfp_t flags, int node)
2592{
2593	struct kmem_cache *s;
2594
2595	if (unlikely(size > PAGE_SIZE))
2596		return kmalloc_large(size, flags);
2597
2598	s = get_slab(size, flags);
2599
2600	if (unlikely(ZERO_OR_NULL_PTR(s)))
2601		return s;
2602
2603	return slab_alloc(s, flags, node, __builtin_return_address(0));
2604}
2605EXPORT_SYMBOL(__kmalloc_node);
2606#endif
2607
2608size_t ksize(const void *object)
2609{
2610	struct page *page;
2611	struct kmem_cache *s;
2612
2613	BUG_ON(!object);
2614	if (unlikely(object == ZERO_SIZE_PTR))
2615		return 0;
2616
2617	page = virt_to_head_page(object);
2618	BUG_ON(!page);
2619
2620	if (unlikely(!PageSlab(page)))
2621		return PAGE_SIZE << compound_order(page);
2622
2623	s = page->slab;
2624	BUG_ON(!s);
2625
2626	/*
2627	 * Debugging requires use of the padding between object
2628	 * and whatever may come after it.
2629	 */
2630	if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
2631		return s->objsize;
2632
2633	/*
2634	 * If we have the need to store the freelist pointer
2635	 * back there or track user information then we can
2636	 * only use the space before that information.
2637	 */
2638	if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
2639		return s->inuse;
2640
2641	/*
2642	 * Else we can use all the padding etc for the allocation
2643	 */
2644	return s->size;
2645}
2646EXPORT_SYMBOL(ksize);
2647
2648void kfree(const void *x)
2649{
2650	struct page *page;
2651	void *object = (void *)x;
2652
2653	if (unlikely(ZERO_OR_NULL_PTR(x)))
2654		return;
2655
2656	page = virt_to_head_page(x);
2657	if (unlikely(!PageSlab(page))) {
2658		put_page(page);
2659		return;
2660	}
2661	slab_free(page->slab, page, object, __builtin_return_address(0));
2662}
2663EXPORT_SYMBOL(kfree);
2664
2665static unsigned long count_partial(struct kmem_cache_node *n)
2666{
2667	unsigned long flags;
2668	unsigned long x = 0;
2669	struct page *page;
2670
2671	spin_lock_irqsave(&n->list_lock, flags);
2672	list_for_each_entry(page, &n->partial, lru)
2673		x += page->inuse;
2674	spin_unlock_irqrestore(&n->list_lock, flags);
2675	return x;
2676}
2677
2678/*
2679 * kmem_cache_shrink removes empty slabs from the partial lists and sorts
2680 * the remaining slabs by the number of items in use. The slabs with the
2681 * most items in use come first. New allocations will then fill those up
2682 * and thus they can be removed from the partial lists.
2683 *
2684 * The slabs with the least items are placed last. This results in them
2685 * being allocated from last increasing the chance that the last objects
2686 * are freed in them.
2687 */
2688int kmem_cache_shrink(struct kmem_cache *s)
2689{
2690	int node;
2691	int i;
2692	struct kmem_cache_node *n;
2693	struct page *page;
2694	struct page *t;
2695	struct list_head *slabs_by_inuse =
2696		kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL);
2697	unsigned long flags;
2698
2699	if (!slabs_by_inuse)
2700		return -ENOMEM;
2701
2702	flush_all(s);
2703	for_each_node_state(node, N_NORMAL_MEMORY) {
2704		n = get_node(s, node);
2705
2706		if (!n->nr_partial)
2707			continue;
2708
2709		for (i = 0; i < s->objects; i++)
2710			INIT_LIST_HEAD(slabs_by_inuse + i);
2711
2712		spin_lock_irqsave(&n->list_lock, flags);
2713
2714		/*
2715		 * Build lists indexed by the items in use in each slab.
2716		 *
2717		 * Note that concurrent frees may occur while we hold the
2718		 * list_lock. page->inuse here is the upper limit.
2719		 */
2720		list_for_each_entry_safe(page, t, &n->partial, lru) {
2721			if (!page->inuse && slab_trylock(page)) {
2722				/*
2723				 * Must hold slab lock here because slab_free
2724				 * may have freed the last object and be
2725				 * waiting to release the slab.
2726				 */
2727				list_del(&page->lru);
2728				n->nr_partial--;
2729				slab_unlock(page);
2730				discard_slab(s, page);
2731			} else {
2732				list_move(&page->lru,
2733				slabs_by_inuse + page->inuse);
2734			}
2735		}
2736
2737		/*
2738		 * Rebuild the partial list with the slabs filled up most
2739		 * first and the least used slabs at the end.
2740		 */
2741		for (i = s->objects - 1; i >= 0; i--)
2742			list_splice(slabs_by_inuse + i, n->partial.prev);
2743
2744		spin_unlock_irqrestore(&n->list_lock, flags);
2745	}
2746
2747	kfree(slabs_by_inuse);
2748	return 0;
2749}
2750EXPORT_SYMBOL(kmem_cache_shrink);
2751
2752#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
2753static int slab_mem_going_offline_callback(void *arg)
2754{
2755	struct kmem_cache *s;
2756
2757	down_read(&slub_lock);
2758	list_for_each_entry(s, &slab_caches, list)
2759		kmem_cache_shrink(s);
2760	up_read(&slub_lock);
2761
2762	return 0;
2763}
2764
2765static void slab_mem_offline_callback(void *arg)
2766{
2767	struct kmem_cache_node *n;
2768	struct kmem_cache *s;
2769	struct memory_notify *marg = arg;
2770	int offline_node;
2771
2772	offline_node = marg->status_change_nid;
2773
2774	/*
2775	 * If the node still has available memory. we need kmem_cache_node
2776	 * for it yet.
2777	 */
2778	if (offline_node < 0)
2779		return;
2780
2781	down_read(&slub_lock);
2782	list_for_each_entry(s, &slab_caches, list) {
2783		n = get_node(s, offline_node);
2784		if (n) {
2785			/*
2786			 * if n->nr_slabs > 0, slabs still exist on the node
2787			 * that is going down. We were unable to free them,
2788			 * and offline_pages() function shoudn't call this
2789			 * callback. So, we must fail.
2790			 */
2791			BUG_ON(atomic_long_read(&n->nr_slabs));
2792
2793			s->node[offline_node] = NULL;
2794			kmem_cache_free(kmalloc_caches, n);
2795		}
2796	}
2797	up_read(&slub_lock);
2798}
2799
2800static int slab_mem_going_online_callback(void *arg)
2801{
2802	struct kmem_cache_node *n;
2803	struct kmem_cache *s;
2804	struct memory_notify *marg = arg;
2805	int nid = marg->status_change_nid;
2806	int ret = 0;
2807
2808	/*
2809	 * If the node's memory is already available, then kmem_cache_node is
2810	 * already created. Nothing to do.
2811	 */
2812	if (nid < 0)
2813		return 0;
2814
2815	/*
2816	 * We are bringing a node online. No memory is availabe yet. We must
2817	 * allocate a kmem_cache_node structure in order to bring the node
2818	 * online.
2819	 */
2820	down_read(&slub_lock);
2821	list_for_each_entry(s, &slab_caches, list) {
2822		/*
2823		 * XXX: kmem_cache_alloc_node will fallback to other nodes
2824		 *      since memory is not yet available from the node that
2825		 *      is brought up.
2826		 */
2827		n = kmem_cache_alloc(kmalloc_caches, GFP_KERNEL);
2828		if (!n) {
2829			ret = -ENOMEM;
2830			goto out;
2831		}
2832		init_kmem_cache_node(n);
2833		s->node[nid] = n;
2834	}
2835out:
2836	up_read(&slub_lock);
2837	return ret;
2838}
2839
2840static int slab_memory_callback(struct notifier_block *self,
2841				unsigned long action, void *arg)
2842{
2843	int ret = 0;
2844
2845	switch (action) {
2846	case MEM_GOING_ONLINE:
2847		ret = slab_mem_going_online_callback(arg);
2848		break;
2849	case MEM_GOING_OFFLINE:
2850		ret = slab_mem_going_offline_callback(arg);
2851		break;
2852	case MEM_OFFLINE:
2853	case MEM_CANCEL_ONLINE:
2854		slab_mem_offline_callback(arg);
2855		break;
2856	case MEM_ONLINE:
2857	case MEM_CANCEL_OFFLINE:
2858		break;
2859	}
2860
2861	ret = notifier_from_errno(ret);
2862	return ret;
2863}
2864
2865#endif /* CONFIG_MEMORY_HOTPLUG */
2866
2867/********************************************************************
2868 *			Basic setup of slabs
2869 *******************************************************************/
2870
2871void __init kmem_cache_init(void)
2872{
2873	int i;
2874	int caches = 0;
2875
2876	init_alloc_cpu();
2877
2878#ifdef CONFIG_NUMA
2879	/*
2880	 * Must first have the slab cache available for the allocations of the
2881	 * struct kmem_cache_node's. There is special bootstrap code in
2882	 * kmem_cache_open for slab_state == DOWN.
2883	 */
2884	create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
2885		sizeof(struct kmem_cache_node), GFP_KERNEL);
2886	kmalloc_caches[0].refcount = -1;
2887	caches++;
2888
2889	hotplug_memory_notifier(slab_memory_callback, 1);
2890#endif
2891
2892	/* Able to allocate the per node structures */
2893	slab_state = PARTIAL;
2894
2895	/* Caches that are not of the two-to-the-power-of size */
2896	if (KMALLOC_MIN_SIZE <= 64) {
2897		create_kmalloc_cache(&kmalloc_caches[1],
2898				"kmalloc-96", 96, GFP_KERNEL);
2899		caches++;
2900	}
2901	if (KMALLOC_MIN_SIZE <= 128) {
2902		create_kmalloc_cache(&kmalloc_caches[2],
2903				"kmalloc-192", 192, GFP_KERNEL);
2904		caches++;
2905	}
2906
2907	for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) {
2908		create_kmalloc_cache(&kmalloc_caches[i],
2909			"kmalloc", 1 << i, GFP_KERNEL);
2910		caches++;
2911	}
2912
2913
2914	/*
2915	 * Patch up the size_index table if we have strange large alignment
2916	 * requirements for the kmalloc array. This is only the case for
2917	 * mips it seems. The standard arches will not generate any code here.
2918	 *
2919	 * Largest permitted alignment is 256 bytes due to the way we
2920	 * handle the index determination for the smaller caches.
2921	 *
2922	 * Make sure that nothing crazy happens if someone starts tinkering
2923	 * around with ARCH_KMALLOC_MINALIGN
2924	 */
2925	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
2926		(KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
2927
2928	for (i = 8; i < KMALLOC_MIN_SIZE; i += 8)
2929		size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW;
2930
2931	slab_state = UP;
2932
2933	/* Provide the correct kmalloc names now that the caches are up */
2934	for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++)
2935		kmalloc_caches[i]. name =
2936			kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
2937
2938#ifdef CONFIG_SMP
2939	register_cpu_notifier(&slab_notifier);
2940	kmem_size = offsetof(struct kmem_cache, cpu_slab) +
2941				nr_cpu_ids * sizeof(struct kmem_cache_cpu *);
2942#else
2943	kmem_size = sizeof(struct kmem_cache);
2944#endif
2945
2946
2947	printk(KERN_INFO
2948		"SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
2949		" CPUs=%d, Nodes=%d\n",
2950		caches, cache_line_size(),
2951		slub_min_order, slub_max_order, slub_min_objects,
2952		nr_cpu_ids, nr_node_ids);
2953}
2954
2955/*
2956 * Find a mergeable slab cache
2957 */
2958static int slab_unmergeable(struct kmem_cache *s)
2959{
2960	if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
2961		return 1;
2962
2963	if ((s->flags & __PAGE_ALLOC_FALLBACK))
2964		return 1;
2965
2966	if (s->ctor)
2967		return 1;
2968
2969	/*
2970	 * We may have set a slab to be unmergeable during bootstrap.
2971	 */
2972	if (s->refcount < 0)
2973		return 1;
2974
2975	return 0;
2976}
2977
2978static struct kmem_cache *find_mergeable(size_t size,
2979		size_t align, unsigned long flags, const char *name,
2980		void (*ctor)(struct kmem_cache *, void *))
2981{
2982	struct kmem_cache *s;
2983
2984	if (slub_nomerge || (flags & SLUB_NEVER_MERGE))
2985		return NULL;
2986
2987	if (ctor)
2988		return NULL;
2989
2990	size = ALIGN(size, sizeof(void *));
2991	align = calculate_alignment(flags, align, size);
2992	size = ALIGN(size, align);
2993	flags = kmem_cache_flags(size, flags, name, NULL);
2994
2995	list_for_each_entry(s, &slab_caches, list) {
2996		if (slab_unmergeable(s))
2997			continue;
2998
2999		if (size > s->size)
3000			continue;
3001
3002		if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
3003				continue;
3004		/*
3005		 * Check if alignment is compatible.
3006		 * Courtesy of Adrian Drzewiecki
3007		 */
3008		if ((s->size & ~(align - 1)) != s->size)
3009			continue;
3010
3011		if (s->size - size >= sizeof(void *))
3012			continue;
3013
3014		return s;
3015	}
3016	return NULL;
3017}
3018
3019struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3020		size_t align, unsigned long flags,
3021		void (*ctor)(struct kmem_cache *, void *))
3022{
3023	struct kmem_cache *s;
3024
3025	down_write(&slub_lock);
3026	s = find_mergeable(size, align, flags, name, ctor);
3027	if (s) {
3028		int cpu;
3029
3030		s->refcount++;
3031		/*
3032		 * Adjust the object sizes so that we clear
3033		 * the complete object on kzalloc.
3034		 */
3035		s->objsize = max(s->objsize, (int)size);
3036
3037		/*
3038		 * And then we need to update the object size in the
3039		 * per cpu structures
3040		 */
3041		for_each_online_cpu(cpu)
3042			get_cpu_slab(s, cpu)->objsize = s->objsize;
3043		s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3044		up_write(&slub_lock);
3045		if (sysfs_slab_alias(s, name))
3046			goto err;
3047		return s;
3048	}
3049	s = kmalloc(kmem_size, GFP_KERNEL);
3050	if (s) {
3051		if (kmem_cache_open(s, GFP_KERNEL, name,
3052				size, align, flags, ctor)) {
3053			list_add(&s->list, &slab_caches);
3054			up_write(&slub_lock);
3055			if (sysfs_slab_add(s))
3056				goto err;
3057			return s;
3058		}
3059		kfree(s);
3060	}
3061	up_write(&slub_lock);
3062
3063err:
3064	if (flags & SLAB_PANIC)
3065		panic("Cannot create slabcache %s\n", name);
3066	else
3067		s = NULL;
3068	return s;
3069}
3070EXPORT_SYMBOL(kmem_cache_create);
3071
3072#ifdef CONFIG_SMP
3073/*
3074 * Use the cpu notifier to insure that the cpu slabs are flushed when
3075 * necessary.
3076 */
3077static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
3078		unsigned long action, void *hcpu)
3079{
3080	long cpu = (long)hcpu;
3081	struct kmem_cache *s;
3082	unsigned long flags;
3083
3084	switch (action) {
3085	case CPU_UP_PREPARE:
3086	case CPU_UP_PREPARE_FROZEN:
3087		init_alloc_cpu_cpu(cpu);
3088		down_read(&slub_lock);
3089		list_for_each_entry(s, &slab_caches, list)
3090			s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu,
3091							GFP_KERNEL);
3092		up_read(&slub_lock);
3093		break;
3094
3095	case CPU_UP_CANCELED:
3096	case CPU_UP_CANCELED_FROZEN:
3097	case CPU_DEAD:
3098	case CPU_DEAD_FROZEN:
3099		down_read(&slub_lock);
3100		list_for_each_entry(s, &slab_caches, list) {
3101			struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3102
3103			local_irq_save(flags);
3104			__flush_cpu_slab(s, cpu);
3105			local_irq_restore(flags);
3106			free_kmem_cache_cpu(c, cpu);
3107			s->cpu_slab[cpu] = NULL;
3108		}
3109		up_read(&slub_lock);
3110		break;
3111	default:
3112		break;
3113	}
3114	return NOTIFY_OK;
3115}
3116
3117static struct notifier_block __cpuinitdata slab_notifier = {
3118	.notifier_call = slab_cpuup_callback
3119};
3120
3121#endif
3122
3123void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
3124{
3125	struct kmem_cache *s;
3126
3127	if (unlikely(size > PAGE_SIZE))
3128		return kmalloc_large(size, gfpflags);
3129
3130	s = get_slab(size, gfpflags);
3131
3132	if (unlikely(ZERO_OR_NULL_PTR(s)))
3133		return s;
3134
3135	return slab_alloc(s, gfpflags, -1, caller);
3136}
3137
3138void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3139					int node, void *caller)
3140{
3141	struct kmem_cache *s;
3142
3143	if (unlikely(size > PAGE_SIZE))
3144		return kmalloc_large(size, gfpflags);
3145
3146	s = get_slab(size, gfpflags);
3147
3148	if (unlikely(ZERO_OR_NULL_PTR(s)))
3149		return s;
3150
3151	return slab_alloc(s, gfpflags, node, caller);
3152}
3153
3154#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
3155static int validate_slab(struct kmem_cache *s, struct page *page,
3156						unsigned long *map)
3157{
3158	void *p;
3159	void *addr = page_address(page);
3160
3161	if (!check_slab(s, page) ||
3162			!on_freelist(s, page, NULL))
3163		return 0;
3164
3165	/* Now we know that a valid freelist exists */
3166	bitmap_zero(map, s->objects);
3167
3168	for_each_free_object(p, s, page->freelist) {
3169		set_bit(slab_index(p, s, addr), map);
3170		if (!check_object(s, page, p, 0))
3171			return 0;
3172	}
3173
3174	for_each_object(p, s, addr)
3175		if (!test_bit(slab_index(p, s, addr), map))
3176			if (!check_object(s, page, p, 1))
3177				return 0;
3178	return 1;
3179}
3180
3181static void validate_slab_slab(struct kmem_cache *s, struct page *page,
3182						unsigned long *map)
3183{
3184	if (slab_trylock(page)) {
3185		validate_slab(s, page, map);
3186		slab_unlock(page);
3187	} else
3188		printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
3189			s->name, page);
3190
3191	if (s->flags & DEBUG_DEFAULT_FLAGS) {
3192		if (!SlabDebug(page))
3193			printk(KERN_ERR "SLUB %s: SlabDebug not set "
3194				"on slab 0x%p\n", s->name, page);
3195	} else {
3196		if (SlabDebug(page))
3197			printk(KERN_ERR "SLUB %s: SlabDebug set on "
3198				"slab 0x%p\n", s->name, page);
3199	}
3200}
3201
3202static int validate_slab_node(struct kmem_cache *s,
3203		struct kmem_cache_node *n, unsigned long *map)
3204{
3205	unsigned long count = 0;
3206	struct page *page;
3207	unsigned long flags;
3208
3209	spin_lock_irqsave(&n->list_lock, flags);
3210
3211	list_for_each_entry(page, &n->partial, lru) {
3212		validate_slab_slab(s, page, map);
3213		count++;
3214	}
3215	if (count != n->nr_partial)
3216		printk(KERN_ERR "SLUB %s: %ld partial slabs counted but "
3217			"counter=%ld\n", s->name, count, n->nr_partial);
3218
3219	if (!(s->flags & SLAB_STORE_USER))
3220		goto out;
3221
3222	list_for_each_entry(page, &n->full, lru) {
3223		validate_slab_slab(s, page, map);
3224		count++;
3225	}
3226	if (count != atomic_long_read(&n->nr_slabs))
3227		printk(KERN_ERR "SLUB: %s %ld slabs counted but "
3228			"counter=%ld\n", s->name, count,
3229			atomic_long_read(&n->nr_slabs));
3230
3231out:
3232	spin_unlock_irqrestore(&n->list_lock, flags);
3233	return count;
3234}
3235
3236static long validate_slab_cache(struct kmem_cache *s)
3237{
3238	int node;
3239	unsigned long count = 0;
3240	unsigned long *map = kmalloc(BITS_TO_LONGS(s->objects) *
3241				sizeof(unsigned long), GFP_KERNEL);
3242
3243	if (!map)
3244		return -ENOMEM;
3245
3246	flush_all(s);
3247	for_each_node_state(node, N_NORMAL_MEMORY) {
3248		struct kmem_cache_node *n = get_node(s, node);
3249
3250		count += validate_slab_node(s, n, map);
3251	}
3252	kfree(map);
3253	return count;
3254}
3255
3256#ifdef SLUB_RESILIENCY_TEST
3257static void resiliency_test(void)
3258{
3259	u8 *p;
3260
3261	printk(KERN_ERR "SLUB resiliency testing\n");
3262	printk(KERN_ERR "-----------------------\n");
3263	printk(KERN_ERR "A. Corruption after allocation\n");
3264
3265	p = kzalloc(16, GFP_KERNEL);
3266	p[16] = 0x12;
3267	printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
3268			" 0x12->0x%p\n\n", p + 16);
3269
3270	validate_slab_cache(kmalloc_caches + 4);
3271
3272	/* Hmmm... The next two are dangerous */
3273	p = kzalloc(32, GFP_KERNEL);
3274	p[32 + sizeof(void *)] = 0x34;
3275	printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
3276			" 0x34 -> -0x%p\n", p);
3277	printk(KERN_ERR
3278		"If allocated object is overwritten then not detectable\n\n");
3279
3280	validate_slab_cache(kmalloc_caches + 5);
3281	p = kzalloc(64, GFP_KERNEL);
3282	p += 64 + (get_cycles() & 0xff) * sizeof(void *);
3283	*p = 0x56;
3284	printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
3285									p);
3286	printk(KERN_ERR
3287		"If allocated object is overwritten then not detectable\n\n");
3288	validate_slab_cache(kmalloc_caches + 6);
3289
3290	printk(KERN_ERR "\nB. Corruption after free\n");
3291	p = kzalloc(128, GFP_KERNEL);
3292	kfree(p);
3293	*p = 0x78;
3294	printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
3295	validate_slab_cache(kmalloc_caches + 7);
3296
3297	p = kzalloc(256, GFP_KERNEL);
3298	kfree(p);
3299	p[50] = 0x9a;
3300	printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
3301			p);
3302	validate_slab_cache(kmalloc_caches + 8);
3303
3304	p = kzalloc(512, GFP_KERNEL);
3305	kfree(p);
3306	p[512] = 0xab;
3307	printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
3308	validate_slab_cache(kmalloc_caches + 9);
3309}
3310#else
3311static void resiliency_test(void) {};
3312#endif
3313
3314/*
3315 * Generate lists of code addresses where slabcache objects are allocated
3316 * and freed.
3317 */
3318
3319struct location {
3320	unsigned long count;
3321	void *addr;
3322	long long sum_time;
3323	long min_time;
3324	long max_time;
3325	long min_pid;
3326	long max_pid;
3327	cpumask_t cpus;
3328	nodemask_t nodes;
3329};
3330
3331struct loc_track {
3332	unsigned long max;
3333	unsigned long count;
3334	struct location *loc;
3335};
3336
3337static void free_loc_track(struct loc_track *t)
3338{
3339	if (t->max)
3340		free_pages((unsigned long)t->loc,
3341			get_order(sizeof(struct location) * t->max));
3342}
3343
3344static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
3345{
3346	struct location *l;
3347	int order;
3348
3349	order = get_order(sizeof(struct location) * max);
3350
3351	l = (void *)__get_free_pages(flags, order);
3352	if (!l)
3353		return 0;
3354
3355	if (t->count) {
3356		memcpy(l, t->loc, sizeof(struct location) * t->count);
3357		free_loc_track(t);
3358	}
3359	t->max = max;
3360	t->loc = l;
3361	return 1;
3362}
3363
3364static int add_location(struct loc_track *t, struct kmem_cache *s,
3365				const struct track *track)
3366{
3367	long start, end, pos;
3368	struct location *l;
3369	void *caddr;
3370	unsigned long age = jiffies - track->when;
3371
3372	start = -1;
3373	end = t->count;
3374
3375	for ( ; ; ) {
3376		pos = start + (end - start + 1) / 2;
3377
3378		/*
3379		 * There is nothing at "end". If we end up there
3380		 * we need to add something to before end.
3381		 */
3382		if (pos == end)
3383			break;
3384
3385		caddr = t->loc[pos].addr;
3386		if (track->addr == caddr) {
3387
3388			l = &t->loc[pos];
3389			l->count++;
3390			if (track->when) {
3391				l->sum_time += age;
3392				if (age < l->min_time)
3393					l->min_time = age;
3394				if (age > l->max_time)
3395					l->max_time = age;
3396
3397				if (track->pid < l->min_pid)
3398					l->min_pid = track->pid;
3399				if (track->pid > l->max_pid)
3400					l->max_pid = track->pid;
3401
3402				cpu_set(track->cpu, l->cpus);
3403			}
3404			node_set(page_to_nid(virt_to_page(track)), l->nodes);
3405			return 1;
3406		}
3407
3408		if (track->addr < caddr)
3409			end = pos;
3410		else
3411			start = pos;
3412	}
3413
3414	/*
3415	 * Not found. Insert new tracking element.
3416	 */
3417	if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
3418		return 0;
3419
3420	l = t->loc + pos;
3421	if (pos < t->count)
3422		memmove(l + 1, l,
3423			(t->count - pos) * sizeof(struct location));
3424	t->count++;
3425	l->count = 1;
3426	l->addr = track->addr;
3427	l->sum_time = age;
3428	l->min_time = age;
3429	l->max_time = age;
3430	l->min_pid = track->pid;
3431	l->max_pid = track->pid;
3432	cpus_clear(l->cpus);
3433	cpu_set(track->cpu, l->cpus);
3434	nodes_clear(l->nodes);
3435	node_set(page_to_nid(virt_to_page(track)), l->nodes);
3436	return 1;
3437}
3438
3439static void process_slab(struct loc_track *t, struct kmem_cache *s,
3440		struct page *page, enum track_item alloc)
3441{
3442	void *addr = page_address(page);
3443	DECLARE_BITMAP(map, s->objects);
3444	void *p;
3445
3446	bitmap_zero(map, s->objects);
3447	for_each_free_object(p, s, page->freelist)
3448		set_bit(slab_index(p, s, addr), map);
3449
3450	for_each_object(p, s, addr)
3451		if (!test_bit(slab_index(p, s, addr), map))
3452			add_location(t, s, get_track(s, p, alloc));
3453}
3454
3455static int list_locations(struct kmem_cache *s, char *buf,
3456					enum track_item alloc)
3457{
3458	int len = 0;
3459	unsigned long i;
3460	struct loc_track t = { 0, 0, NULL };
3461	int node;
3462
3463	if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
3464			GFP_TEMPORARY))
3465		return sprintf(buf, "Out of memory\n");
3466
3467	/* Push back cpu slabs */
3468	flush_all(s);
3469
3470	for_each_node_state(node, N_NORMAL_MEMORY) {
3471		struct kmem_cache_node *n = get_node(s, node);
3472		unsigned long flags;
3473		struct page *page;
3474
3475		if (!atomic_long_read(&n->nr_slabs))
3476			continue;
3477
3478		spin_lock_irqsave(&n->list_lock, flags);
3479		list_for_each_entry(page, &n->partial, lru)
3480			process_slab(&t, s, page, alloc);
3481		list_for_each_entry(page, &n->full, lru)
3482			process_slab(&t, s, page, alloc);
3483		spin_unlock_irqrestore(&n->list_lock, flags);
3484	}
3485
3486	for (i = 0; i < t.count; i++) {
3487		struct location *l = &t.loc[i];
3488
3489		if (len > PAGE_SIZE - 100)
3490			break;
3491		len += sprintf(buf + len, "%7ld ", l->count);
3492
3493		if (l->addr)
3494			len += sprint_symbol(buf + len, (unsigned long)l->addr);
3495		else
3496			len += sprintf(buf + len, "<not-available>");
3497
3498		if (l->sum_time != l->min_time) {
3499			unsigned long remainder;
3500
3501			len += sprintf(buf + len, " age=%ld/%ld/%ld",
3502			l->min_time,
3503			div_long_long_rem(l->sum_time, l->count, &remainder),
3504			l->max_time);
3505		} else
3506			len += sprintf(buf + len, " age=%ld",
3507				l->min_time);
3508
3509		if (l->min_pid != l->max_pid)
3510			len += sprintf(buf + len, " pid=%ld-%ld",
3511				l->min_pid, l->max_pid);
3512		else
3513			len += sprintf(buf + len, " pid=%ld",
3514				l->min_pid);
3515
3516		if (num_online_cpus() > 1 && !cpus_empty(l->cpus) &&
3517				len < PAGE_SIZE - 60) {
3518			len += sprintf(buf + len, " cpus=");
3519			len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50,
3520					l->cpus);
3521		}
3522
3523		if (num_online_nodes() > 1 && !nodes_empty(l->nodes) &&
3524				len < PAGE_SIZE - 60) {
3525			len += sprintf(buf + len, " nodes=");
3526			len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50,
3527					l->nodes);
3528		}
3529
3530		len += sprintf(buf + len, "\n");
3531	}
3532
3533	free_loc_track(&t);
3534	if (!t.count)
3535		len += sprintf(buf, "No data\n");
3536	return len;
3537}
3538
3539enum slab_stat_type {
3540	SL_FULL,
3541	SL_PARTIAL,
3542	SL_CPU,
3543	SL_OBJECTS
3544};
3545
3546#define SO_FULL		(1 << SL_FULL)
3547#define SO_PARTIAL	(1 << SL_PARTIAL)
3548#define SO_CPU		(1 << SL_CPU)
3549#define SO_OBJECTS	(1 << SL_OBJECTS)
3550
3551static unsigned long show_slab_objects(struct kmem_cache *s,
3552			char *buf, unsigned long flags)
3553{
3554	unsigned long total = 0;
3555	int cpu;
3556	int node;
3557	int x;
3558	unsigned long *nodes;
3559	unsigned long *per_cpu;
3560
3561	nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
3562	per_cpu = nodes + nr_node_ids;
3563
3564	for_each_possible_cpu(cpu) {
3565		struct page *page;
3566		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3567
3568		if (!c)
3569			continue;
3570
3571		page = c->page;
3572		node = c->node;
3573		if (node < 0)
3574			continue;
3575		if (page) {
3576			if (flags & SO_CPU) {
3577				if (flags & SO_OBJECTS)
3578					x = page->inuse;
3579				else
3580					x = 1;
3581				total += x;
3582				nodes[node] += x;
3583			}
3584			per_cpu[node]++;
3585		}
3586	}
3587
3588	for_each_node_state(node, N_NORMAL_MEMORY) {
3589		struct kmem_cache_node *n = get_node(s, node);
3590
3591		if (flags & SO_PARTIAL) {
3592			if (flags & SO_OBJECTS)
3593				x = count_partial(n);
3594			else
3595				x = n->nr_partial;
3596			total += x;
3597			nodes[node] += x;
3598		}
3599
3600		if (flags & SO_FULL) {
3601			int full_slabs = atomic_long_read(&n->nr_slabs)
3602					- per_cpu[node]
3603					- n->nr_partial;
3604
3605			if (flags & SO_OBJECTS)
3606				x = full_slabs * s->objects;
3607			else
3608				x = full_slabs;
3609			total += x;
3610			nodes[node] += x;
3611		}
3612	}
3613
3614	x = sprintf(buf, "%lu", total);
3615#ifdef CONFIG_NUMA
3616	for_each_node_state(node, N_NORMAL_MEMORY)
3617		if (nodes[node])
3618			x += sprintf(buf + x, " N%d=%lu",
3619					node, nodes[node]);
3620#endif
3621	kfree(nodes);
3622	return x + sprintf(buf + x, "\n");
3623}
3624
3625static int any_slab_objects(struct kmem_cache *s)
3626{
3627	int node;
3628	int cpu;
3629
3630	for_each_possible_cpu(cpu) {
3631		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3632
3633		if (c && c->page)
3634			return 1;
3635	}
3636
3637	for_each_online_node(node) {
3638		struct kmem_cache_node *n = get_node(s, node);
3639
3640		if (!n)
3641			continue;
3642
3643		if (n->nr_partial || atomic_long_read(&n->nr_slabs))
3644			return 1;
3645	}
3646	return 0;
3647}
3648
3649#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
3650#define to_slab(n) container_of(n, struct kmem_cache, kobj);
3651
3652struct slab_attribute {
3653	struct attribute attr;
3654	ssize_t (*show)(struct kmem_cache *s, char *buf);
3655	ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
3656};
3657
3658#define SLAB_ATTR_RO(_name) \
3659	static struct slab_attribute _name##_attr = __ATTR_RO(_name)
3660
3661#define SLAB_ATTR(_name) \
3662	static struct slab_attribute _name##_attr =  \
3663	__ATTR(_name, 0644, _name##_show, _name##_store)
3664
3665static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
3666{
3667	return sprintf(buf, "%d\n", s->size);
3668}
3669SLAB_ATTR_RO(slab_size);
3670
3671static ssize_t align_show(struct kmem_cache *s, char *buf)
3672{
3673	return sprintf(buf, "%d\n", s->align);
3674}
3675SLAB_ATTR_RO(align);
3676
3677static ssize_t object_size_show(struct kmem_cache *s, char *buf)
3678{
3679	return sprintf(buf, "%d\n", s->objsize);
3680}
3681SLAB_ATTR_RO(object_size);
3682
3683static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
3684{
3685	return sprintf(buf, "%d\n", s->objects);
3686}
3687SLAB_ATTR_RO(objs_per_slab);
3688
3689static ssize_t order_show(struct kmem_cache *s, char *buf)
3690{
3691	return sprintf(buf, "%d\n", s->order);
3692}
3693SLAB_ATTR_RO(order);
3694
3695static ssize_t ctor_show(struct kmem_cache *s, char *buf)
3696{
3697	if (s->ctor) {
3698		int n = sprint_symbol(buf, (unsigned long)s->ctor);
3699
3700		return n + sprintf(buf + n, "\n");
3701	}
3702	return 0;
3703}
3704SLAB_ATTR_RO(ctor);
3705
3706static ssize_t aliases_show(struct kmem_cache *s, char *buf)
3707{
3708	return sprintf(buf, "%d\n", s->refcount - 1);
3709}
3710SLAB_ATTR_RO(aliases);
3711
3712static ssize_t slabs_show(struct kmem_cache *s, char *buf)
3713{
3714	return show_slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU);
3715}
3716SLAB_ATTR_RO(slabs);
3717
3718static ssize_t partial_show(struct kmem_cache *s, char *buf)
3719{
3720	return show_slab_objects(s, buf, SO_PARTIAL);
3721}
3722SLAB_ATTR_RO(partial);
3723
3724static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
3725{
3726	return show_slab_objects(s, buf, SO_CPU);
3727}
3728SLAB_ATTR_RO(cpu_slabs);
3729
3730static ssize_t objects_show(struct kmem_cache *s, char *buf)
3731{
3732	return show_slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS);
3733}
3734SLAB_ATTR_RO(objects);
3735
3736static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
3737{
3738	return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
3739}
3740
3741static ssize_t sanity_checks_store(struct kmem_cache *s,
3742				const char *buf, size_t length)
3743{
3744	s->flags &= ~SLAB_DEBUG_FREE;
3745	if (buf[0] == '1')
3746		s->flags |= SLAB_DEBUG_FREE;
3747	return length;
3748}
3749SLAB_ATTR(sanity_checks);
3750
3751static ssize_t trace_show(struct kmem_cache *s, char *buf)
3752{
3753	return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
3754}
3755
3756static ssize_t trace_store(struct kmem_cache *s, const char *buf,
3757							size_t length)
3758{
3759	s->flags &= ~SLAB_TRACE;
3760	if (buf[0] == '1')
3761		s->flags |= SLAB_TRACE;
3762	return length;
3763}
3764SLAB_ATTR(trace);
3765
3766static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
3767{
3768	return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
3769}
3770
3771static ssize_t reclaim_account_store(struct kmem_cache *s,
3772				const char *buf, size_t length)
3773{
3774	s->flags &= ~SLAB_RECLAIM_ACCOUNT;
3775	if (buf[0] == '1')
3776		s->flags |= SLAB_RECLAIM_ACCOUNT;
3777	return length;
3778}
3779SLAB_ATTR(reclaim_account);
3780
3781static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
3782{
3783	return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
3784}
3785SLAB_ATTR_RO(hwcache_align);
3786
3787#ifdef CONFIG_ZONE_DMA
3788static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
3789{
3790	return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
3791}
3792SLAB_ATTR_RO(cache_dma);
3793#endif
3794
3795static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
3796{
3797	return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
3798}
3799SLAB_ATTR_RO(destroy_by_rcu);
3800
3801static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
3802{
3803	return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
3804}
3805
3806static ssize_t red_zone_store(struct kmem_cache *s,
3807				const char *buf, size_t length)
3808{
3809	if (any_slab_objects(s))
3810		return -EBUSY;
3811
3812	s->flags &= ~SLAB_RED_ZONE;
3813	if (buf[0] == '1')
3814		s->flags |= SLAB_RED_ZONE;
3815	calculate_sizes(s);
3816	return length;
3817}
3818SLAB_ATTR(red_zone);
3819
3820static ssize_t poison_show(struct kmem_cache *s, char *buf)
3821{
3822	return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
3823}
3824
3825static ssize_t poison_store(struct kmem_cache *s,
3826				const char *buf, size_t length)
3827{
3828	if (any_slab_objects(s))
3829		return -EBUSY;
3830
3831	s->flags &= ~SLAB_POISON;
3832	if (buf[0] == '1')
3833		s->flags |= SLAB_POISON;
3834	calculate_sizes(s);
3835	return length;
3836}
3837SLAB_ATTR(poison);
3838
3839static ssize_t store_user_show(struct kmem_cache *s, char *buf)
3840{
3841	return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
3842}
3843
3844static ssize_t store_user_store(struct kmem_cache *s,
3845				const char *buf, size_t length)
3846{
3847	if (any_slab_objects(s))
3848		return -EBUSY;
3849
3850	s->flags &= ~SLAB_STORE_USER;
3851	if (buf[0] == '1')
3852		s->flags |= SLAB_STORE_USER;
3853	calculate_sizes(s);
3854	return length;
3855}
3856SLAB_ATTR(store_user);
3857
3858static ssize_t validate_show(struct kmem_cache *s, char *buf)
3859{
3860	return 0;
3861}
3862
3863static ssize_t validate_store(struct kmem_cache *s,
3864			const char *buf, size_t length)
3865{
3866	int ret = -EINVAL;
3867
3868	if (buf[0] == '1') {
3869		ret = validate_slab_cache(s);
3870		if (ret >= 0)
3871			ret = length;
3872	}
3873	return ret;
3874}
3875SLAB_ATTR(validate);
3876
3877static ssize_t shrink_show(struct kmem_cache *s, char *buf)
3878{
3879	return 0;
3880}
3881
3882static ssize_t shrink_store(struct kmem_cache *s,
3883			const char *buf, size_t length)
3884{
3885	if (buf[0] == '1') {
3886		int rc = kmem_cache_shrink(s);
3887
3888		if (rc)
3889			return rc;
3890	} else
3891		return -EINVAL;
3892	return length;
3893}
3894SLAB_ATTR(shrink);
3895
3896static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
3897{
3898	if (!(s->flags & SLAB_STORE_USER))
3899		return -ENOSYS;
3900	return list_locations(s, buf, TRACK_ALLOC);
3901}
3902SLAB_ATTR_RO(alloc_calls);
3903
3904static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
3905{
3906	if (!(s->flags & SLAB_STORE_USER))
3907		return -ENOSYS;
3908	return list_locations(s, buf, TRACK_FREE);
3909}
3910SLAB_ATTR_RO(free_calls);
3911
3912#ifdef CONFIG_NUMA
3913static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
3914{
3915	return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10);
3916}
3917
3918static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
3919				const char *buf, size_t length)
3920{
3921	int n = simple_strtoul(buf, NULL, 10);
3922
3923	if (n < 100)
3924		s->remote_node_defrag_ratio = n * 10;
3925	return length;
3926}
3927SLAB_ATTR(remote_node_defrag_ratio);
3928#endif
3929
3930#ifdef CONFIG_SLUB_STATS
3931
3932static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
3933{
3934	unsigned long sum  = 0;
3935	int cpu;
3936	int len;
3937	int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
3938
3939	if (!data)
3940		return -ENOMEM;
3941
3942	for_each_online_cpu(cpu) {
3943		unsigned x = get_cpu_slab(s, cpu)->stat[si];
3944
3945		data[cpu] = x;
3946		sum += x;
3947	}
3948
3949	len = sprintf(buf, "%lu", sum);
3950
3951	for_each_online_cpu(cpu) {
3952		if (data[cpu] && len < PAGE_SIZE - 20)
3953			len += sprintf(buf + len, " c%d=%u", cpu, data[cpu]);
3954	}
3955	kfree(data);
3956	return len + sprintf(buf + len, "\n");
3957}
3958
3959#define STAT_ATTR(si, text) 					\
3960static ssize_t text##_show(struct kmem_cache *s, char *buf)	\
3961{								\
3962	return show_stat(s, buf, si);				\
3963}								\
3964SLAB_ATTR_RO(text);						\
3965
3966STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
3967STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
3968STAT_ATTR(FREE_FASTPATH, free_fastpath);
3969STAT_ATTR(FREE_SLOWPATH, free_slowpath);
3970STAT_ATTR(FREE_FROZEN, free_frozen);
3971STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
3972STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
3973STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
3974STAT_ATTR(ALLOC_SLAB, alloc_slab);
3975STAT_ATTR(ALLOC_REFILL, alloc_refill);
3976STAT_ATTR(FREE_SLAB, free_slab);
3977STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
3978STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
3979STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
3980STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
3981STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
3982STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
3983
3984#endif
3985
3986static struct attribute *slab_attrs[] = {
3987	&slab_size_attr.attr,
3988	&object_size_attr.attr,
3989	&objs_per_slab_attr.attr,
3990	&order_attr.attr,
3991	&objects_attr.attr,
3992	&slabs_attr.attr,
3993	&partial_attr.attr,
3994	&cpu_slabs_attr.attr,
3995	&ctor_attr.attr,
3996	&aliases_attr.attr,
3997	&align_attr.attr,
3998	&sanity_checks_attr.attr,
3999	&trace_attr.attr,
4000	&hwcache_align_attr.attr,
4001	&reclaim_account_attr.attr,
4002	&destroy_by_rcu_attr.attr,
4003	&red_zone_attr.attr,
4004	&poison_attr.attr,
4005	&store_user_attr.attr,
4006	&validate_attr.attr,
4007	&shrink_attr.attr,
4008	&alloc_calls_attr.attr,
4009	&free_calls_attr.attr,
4010#ifdef CONFIG_ZONE_DMA
4011	&cache_dma_attr.attr,
4012#endif
4013#ifdef CONFIG_NUMA
4014	&remote_node_defrag_ratio_attr.attr,
4015#endif
4016#ifdef CONFIG_SLUB_STATS
4017	&alloc_fastpath_attr.attr,
4018	&alloc_slowpath_attr.attr,
4019	&free_fastpath_attr.attr,
4020	&free_slowpath_attr.attr,
4021	&free_frozen_attr.attr,
4022	&free_add_partial_attr.attr,
4023	&free_remove_partial_attr.attr,
4024	&alloc_from_partial_attr.attr,
4025	&alloc_slab_attr.attr,
4026	&alloc_refill_attr.attr,
4027	&free_slab_attr.attr,
4028	&cpuslab_flush_attr.attr,
4029	&deactivate_full_attr.attr,
4030	&deactivate_empty_attr.attr,
4031	&deactivate_to_head_attr.attr,
4032	&deactivate_to_tail_attr.attr,
4033	&deactivate_remote_frees_attr.attr,
4034#endif
4035	NULL
4036};
4037
4038static struct attribute_group slab_attr_group = {
4039	.attrs = slab_attrs,
4040};
4041
4042static ssize_t slab_attr_show(struct kobject *kobj,
4043				struct attribute *attr,
4044				char *buf)
4045{
4046	struct slab_attribute *attribute;
4047	struct kmem_cache *s;
4048	int err;
4049
4050	attribute = to_slab_attr(attr);
4051	s = to_slab(kobj);
4052
4053	if (!attribute->show)
4054		return -EIO;
4055
4056	err = attribute->show(s, buf);
4057
4058	return err;
4059}
4060
4061static ssize_t slab_attr_store(struct kobject *kobj,
4062				struct attribute *attr,
4063				const char *buf, size_t len)
4064{
4065	struct slab_attribute *attribute;
4066	struct kmem_cache *s;
4067	int err;
4068
4069	attribute = to_slab_attr(attr);
4070	s = to_slab(kobj);
4071
4072	if (!attribute->store)
4073		return -EIO;
4074
4075	err = attribute->store(s, buf, len);
4076
4077	return err;
4078}
4079
4080static void kmem_cache_release(struct kobject *kobj)
4081{
4082	struct kmem_cache *s = to_slab(kobj);
4083
4084	kfree(s);
4085}
4086
4087static struct sysfs_ops slab_sysfs_ops = {
4088	.show = slab_attr_show,
4089	.store = slab_attr_store,
4090};
4091
4092static struct kobj_type slab_ktype = {
4093	.sysfs_ops = &slab_sysfs_ops,
4094	.release = kmem_cache_release
4095};
4096
4097static int uevent_filter(struct kset *kset, struct kobject *kobj)
4098{
4099	struct kobj_type *ktype = get_ktype(kobj);
4100
4101	if (ktype == &slab_ktype)
4102		return 1;
4103	return 0;
4104}
4105
4106static struct kset_uevent_ops slab_uevent_ops = {
4107	.filter = uevent_filter,
4108};
4109
4110static struct kset *slab_kset;
4111
4112#define ID_STR_LENGTH 64
4113
4114/* Create a unique string id for a slab cache:
4115 * format
4116 * :[flags-]size:[memory address of kmemcache]
4117 */
4118static char *create_unique_id(struct kmem_cache *s)
4119{
4120	char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
4121	char *p = name;
4122
4123	BUG_ON(!name);
4124
4125	*p++ = ':';
4126	/*
4127	 * First flags affecting slabcache operations. We will only
4128	 * get here for aliasable slabs so we do not need to support
4129	 * too many flags. The flags here must cover all flags that
4130	 * are matched during merging to guarantee that the id is
4131	 * unique.
4132	 */
4133	if (s->flags & SLAB_CACHE_DMA)
4134		*p++ = 'd';
4135	if (s->flags & SLAB_RECLAIM_ACCOUNT)
4136		*p++ = 'a';
4137	if (s->flags & SLAB_DEBUG_FREE)
4138		*p++ = 'F';
4139	if (p != name + 1)
4140		*p++ = '-';
4141	p += sprintf(p, "%07d", s->size);
4142	BUG_ON(p > name + ID_STR_LENGTH - 1);
4143	return name;
4144}
4145
4146static int sysfs_slab_add(struct kmem_cache *s)
4147{
4148	int err;
4149	const char *name;
4150	int unmergeable;
4151
4152	if (slab_state < SYSFS)
4153		/* Defer until later */
4154		return 0;
4155
4156	unmergeable = slab_unmergeable(s);
4157	if (unmergeable) {
4158		/*
4159		 * Slabcache can never be merged so we can use the name proper.
4160		 * This is typically the case for debug situations. In that
4161		 * case we can catch duplicate names easily.
4162		 */
4163		sysfs_remove_link(&slab_kset->kobj, s->name);
4164		name = s->name;
4165	} else {
4166		/*
4167		 * Create a unique name for the slab as a target
4168		 * for the symlinks.
4169		 */
4170		name = create_unique_id(s);
4171	}
4172
4173	s->kobj.kset = slab_kset;
4174	err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name);
4175	if (err) {
4176		kobject_put(&s->kobj);
4177		return err;
4178	}
4179
4180	err = sysfs_create_group(&s->kobj, &slab_attr_group);
4181	if (err)
4182		return err;
4183	kobject_uevent(&s->kobj, KOBJ_ADD);
4184	if (!unmergeable) {
4185		/* Setup first alias */
4186		sysfs_slab_alias(s, s->name);
4187		kfree(name);
4188	}
4189	return 0;
4190}
4191
4192static void sysfs_slab_remove(struct kmem_cache *s)
4193{
4194	kobject_uevent(&s->kobj, KOBJ_REMOVE);
4195	kobject_del(&s->kobj);
4196	kobject_put(&s->kobj);
4197}
4198
4199/*
4200 * Need to buffer aliases during bootup until sysfs becomes
4201 * available lest we loose that information.
4202 */
4203struct saved_alias {
4204	struct kmem_cache *s;
4205	const char *name;
4206	struct saved_alias *next;
4207};
4208
4209static struct saved_alias *alias_list;
4210
4211static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
4212{
4213	struct saved_alias *al;
4214
4215	if (slab_state == SYSFS) {
4216		/*
4217		 * If we have a leftover link then remove it.
4218		 */
4219		sysfs_remove_link(&slab_kset->kobj, name);
4220		return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
4221	}
4222
4223	al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
4224	if (!al)
4225		return -ENOMEM;
4226
4227	al->s = s;
4228	al->name = name;
4229	al->next = alias_list;
4230	alias_list = al;
4231	return 0;
4232}
4233
4234static int __init slab_sysfs_init(void)
4235{
4236	struct kmem_cache *s;
4237	int err;
4238
4239	slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
4240	if (!slab_kset) {
4241		printk(KERN_ERR "Cannot register slab subsystem.\n");
4242		return -ENOSYS;
4243	}
4244
4245	slab_state = SYSFS;
4246
4247	list_for_each_entry(s, &slab_caches, list) {
4248		err = sysfs_slab_add(s);
4249		if (err)
4250			printk(KERN_ERR "SLUB: Unable to add boot slab %s"
4251						" to sysfs\n", s->name);
4252	}
4253
4254	while (alias_list) {
4255		struct saved_alias *al = alias_list;
4256
4257		alias_list = alias_list->next;
4258		err = sysfs_slab_alias(al->s, al->name);
4259		if (err)
4260			printk(KERN_ERR "SLUB: Unable to add boot slab alias"
4261					" %s to sysfs\n", s->name);
4262		kfree(al);
4263	}
4264
4265	resiliency_test();
4266	return 0;
4267}
4268
4269__initcall(slab_sysfs_init);
4270#endif
4271
4272/*
4273 * The /proc/slabinfo ABI
4274 */
4275#ifdef CONFIG_SLABINFO
4276
4277ssize_t slabinfo_write(struct file *file, const char __user * buffer,
4278                       size_t count, loff_t *ppos)
4279{
4280	return -EINVAL;
4281}
4282
4283
4284static void print_slabinfo_header(struct seq_file *m)
4285{
4286	seq_puts(m, "slabinfo - version: 2.1\n");
4287	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
4288		 "<objperslab> <pagesperslab>");
4289	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
4290	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
4291	seq_putc(m, '\n');
4292}
4293
4294static void *s_start(struct seq_file *m, loff_t *pos)
4295{
4296	loff_t n = *pos;
4297
4298	down_read(&slub_lock);
4299	if (!n)
4300		print_slabinfo_header(m);
4301
4302	return seq_list_start(&slab_caches, *pos);
4303}
4304
4305static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4306{
4307	return seq_list_next(p, &slab_caches, pos);
4308}
4309
4310static void s_stop(struct seq_file *m, void *p)
4311{
4312	up_read(&slub_lock);
4313}
4314
4315static int s_show(struct seq_file *m, void *p)
4316{
4317	unsigned long nr_partials = 0;
4318	unsigned long nr_slabs = 0;
4319	unsigned long nr_inuse = 0;
4320	unsigned long nr_objs;
4321	struct kmem_cache *s;
4322	int node;
4323
4324	s = list_entry(p, struct kmem_cache, list);
4325
4326	for_each_online_node(node) {
4327		struct kmem_cache_node *n = get_node(s, node);
4328
4329		if (!n)
4330			continue;
4331
4332		nr_partials += n->nr_partial;
4333		nr_slabs += atomic_long_read(&n->nr_slabs);
4334		nr_inuse += count_partial(n);
4335	}
4336
4337	nr_objs = nr_slabs * s->objects;
4338	nr_inuse += (nr_slabs - nr_partials) * s->objects;
4339
4340	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse,
4341		   nr_objs, s->size, s->objects, (1 << s->order));
4342	seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0);
4343	seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs,
4344		   0UL);
4345	seq_putc(m, '\n');
4346	return 0;
4347}
4348
4349const struct seq_operations slabinfo_op = {
4350	.start = s_start,
4351	.next = s_next,
4352	.stop = s_stop,
4353	.show = s_show,
4354};
4355
4356#endif /* CONFIG_SLABINFO */
4357