slub.c revision 4a92379bdfb48680a5e6775dd53a586df7b6b0b1
1/*
2 * SLUB: A slab allocator that limits cache line use instead of queuing
3 * objects in per cpu and per node lists.
4 *
5 * The allocator synchronizes using per slab locks and only
6 * uses a centralized lock to manage a pool of partial slabs.
7 *
8 * (C) 2007 SGI, Christoph Lameter
9 */
10
11#include <linux/mm.h>
12#include <linux/swap.h> /* struct reclaim_state */
13#include <linux/module.h>
14#include <linux/bit_spinlock.h>
15#include <linux/interrupt.h>
16#include <linux/bitops.h>
17#include <linux/slab.h>
18#include <linux/proc_fs.h>
19#include <linux/seq_file.h>
20#include <linux/kmemcheck.h>
21#include <linux/cpu.h>
22#include <linux/cpuset.h>
23#include <linux/mempolicy.h>
24#include <linux/ctype.h>
25#include <linux/debugobjects.h>
26#include <linux/kallsyms.h>
27#include <linux/memory.h>
28#include <linux/math64.h>
29#include <linux/fault-inject.h>
30
31#include <trace/events/kmem.h>
32
33/*
34 * Lock order:
35 *   1. slab_lock(page)
36 *   2. node->list_lock
37 *
38 *   The slab_lock protects operations on the objects of a particular
39 *   slab and its metadata in the page struct. While the slab lock
40 *   is held no allocations or frees can be performed on the objects
41 *   in the slab, nor can the slab be added to or removed from the
42 *   partial or full lists, since either would mean modifying the
43 *   page struct of the slab.
44 *
45 *   The list_lock protects the partial and full lists on each node and
46 *   the partial slab counter. While it is held no slabs may be added to or
47 *   removed from the lists, nor may the number of partial slabs change.
48 *   (Note that the total number of slabs is an atomic value that may be
49 *   modified without taking the list lock).
50 *
51 *   The list_lock is a centralized lock and thus we avoid taking it as
52 *   much as possible. As long as SLUB does not have to handle partial
53 *   slabs, operations can continue without any centralized lock. F.e.
54 *   allocating a long series of objects that fill up slabs does not require
55 *   the list lock.
56 *
57 *   The lock order is sometimes inverted when we are trying to get a slab
58 *   off a list. We take the list_lock and then look for a page on the list
59 *   to use. While we do that objects in the slabs may be freed. We can
60 *   only operate on the slab if we have also taken the slab_lock. So we use
61 *   a slab_trylock() on the slab. If trylock was successful then no frees
62 *   can occur anymore and we can use the slab for allocations etc. If the
63 *   slab_trylock() does not succeed then frees are in progress in the slab and
64 *   we must stay away from it for a while since we may cause a bouncing
65 *   cacheline if we try to acquire the lock. So go onto the next slab.
66 *   If all pages are busy then we may allocate a new slab instead of reusing
67 *   a partial slab. A new slab has no one operating on it and thus there is
68 *   no danger of cacheline contention.
69 *
70 *   Interrupts are disabled during allocation and deallocation in order to
71 *   make the slab allocator safe to use in the context of an irq. In addition
72 *   interrupts are disabled to ensure that the processor does not change
73 *   while handling per_cpu slabs, due to kernel preemption.
74 *
75 * SLUB assigns one slab for allocation to each processor.
76 * Allocations only occur from these slabs called cpu slabs.
77 *
78 * Slabs with free elements are kept on a partial list and during regular
79 * operations no list for full slabs is used. If an object in a full slab is
80 * freed then the slab will show up again on the partial lists.
81 * We track full slabs for debugging purposes though because otherwise we
82 * cannot scan all objects.
83 *
84 * Slabs are freed when they become empty. Teardown and setup is
85 * minimal so we rely on the page allocator's per cpu caches for
86 * fast frees and allocs.
87 *
88 * Overloading of page flags that are otherwise used for LRU management.
89 *
90 * PageActive 		The slab is frozen and exempt from list processing.
91 * 			This means that the slab is dedicated to a purpose
92 * 			such as satisfying allocations for a specific
93 * 			processor. Objects may be freed in the slab while
94 * 			it is frozen but slab_free will then skip the usual
95 * 			list operations. It is up to the processor holding
96 * 			the slab to integrate the slab into the slab lists
97 * 			when the slab is no longer needed.
98 *
99 * 			One use of this flag is to mark slabs that are
100 * 			used for allocations. Then such a slab becomes a cpu
101 * 			slab. The cpu slab may be equipped with an additional
102 * 			freelist that allows lockless access to
103 * 			free objects in addition to the regular freelist
104 * 			that requires the slab lock.
105 *
106 * PageError		Slab requires special handling due to debug
107 * 			options set. This moves	slab handling out of
108 * 			the fast path and disables lockless freelists.
109 */
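
/*
 * Illustrative sketch (not part of SLUB itself): how a typical client
 * exercises the paths described above. The "foo" cache and struct foo
 * are made up for the example; only the kmem_cache_* calls are real.
 *
 *	static struct kmem_cache *foo_cache;
 *
 *	foo_cache = kmem_cache_create("foo", sizeof(struct foo),
 *					0, SLAB_HWCACHE_ALIGN, NULL);
 *	if (!foo_cache)
 *		return -ENOMEM;
 *
 *	struct foo *f = kmem_cache_alloc(foo_cache, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cache, f);
 *	kmem_cache_destroy(foo_cache);
 *
 * Allocations normally come from the lockless per cpu freelist; only
 * when the cpu slab is exhausted does SLUB fall back to the partial
 * lists and, eventually, the page allocator.
 */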
110
111#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
112		SLAB_TRACE | SLAB_DEBUG_FREE)
113
114static inline int kmem_cache_debug(struct kmem_cache *s)
115{
116#ifdef CONFIG_SLUB_DEBUG
117	return unlikely(s->flags & SLAB_DEBUG_FLAGS);
118#else
119	return 0;
120#endif
121}
122
123/*
124 * Issues still to be resolved:
125 *
126 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
127 *
128 * - Variable sizing of the per node arrays
129 */
130
131/* Enable to test recovery from slab corruption on boot */
132#undef SLUB_RESILIENCY_TEST
133
134/*
135 * Minimum number of partial slabs. These will be left on the partial
136 * lists even if they are empty. kmem_cache_shrink may reclaim them.
137 */
138#define MIN_PARTIAL 5
139
140/*
141 * Maximum number of desirable partial slabs.
142 * The existence of more partial slabs makes kmem_cache_shrink
143 * sort the partial list by the number of objects in them.
144 */
145#define MAX_PARTIAL 10
146
147#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
148				SLAB_POISON | SLAB_STORE_USER)
149
150/*
151 * Debugging flags that require metadata to be stored in the slab.  These get
152 * disabled when slub_debug=O is used and enabling them would increase the
153 * cache's minimum order because of the metadata.
154 */
155#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
156
157/*
158 * Set of flags that will prevent slab merging
159 */
160#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
161		SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
162		SLAB_FAILSLAB)
163
164#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
165		SLAB_CACHE_DMA | SLAB_NOTRACK)
166
167#define OO_SHIFT	16
168#define OO_MASK		((1 << OO_SHIFT) - 1)
169#define MAX_OBJS_PER_PAGE	65535 /* since page.objects is u16 */
170
171/* Internal SLUB flags */
172#define __OBJECT_POISON		0x80000000UL /* Poison object */
173
174static int kmem_size = sizeof(struct kmem_cache);
175
176#ifdef CONFIG_SMP
177static struct notifier_block slab_notifier;
178#endif
179
180static enum {
181	DOWN,		/* No slab functionality available */
182	PARTIAL,	/* Kmem_cache_node works */
183	UP,		/* Everything works but does not show up in sysfs */
184	SYSFS		/* Sysfs up */
185} slab_state = DOWN;
186
187/* A list of all slab caches on the system */
188static DECLARE_RWSEM(slub_lock);
189static LIST_HEAD(slab_caches);
190
191/*
192 * Tracking user of a slab.
193 */
194struct track {
195	unsigned long addr;	/* Called from address */
196	int cpu;		/* Was running on cpu */
197	int pid;		/* Pid context */
198	unsigned long when;	/* When did the operation occur */
199};
200
201enum track_item { TRACK_ALLOC, TRACK_FREE };
202
203#ifdef CONFIG_SYSFS
204static int sysfs_slab_add(struct kmem_cache *);
205static int sysfs_slab_alias(struct kmem_cache *, const char *);
206static void sysfs_slab_remove(struct kmem_cache *);
207
208#else
209static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
210static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
211							{ return 0; }
212static inline void sysfs_slab_remove(struct kmem_cache *s)
213{
214	kfree(s->name);
215	kfree(s);
216}
217
218#endif
219
220static inline void stat(struct kmem_cache *s, enum stat_item si)
221{
222#ifdef CONFIG_SLUB_STATS
223	__this_cpu_inc(s->cpu_slab->stat[si]);
224#endif
225}
226
227/********************************************************************
228 * 			Core slab cache functions
229 *******************************************************************/
230
231int slab_is_available(void)
232{
233	return slab_state >= UP;
234}
235
236static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
237{
238	return s->node[node];
239}
240
241/* Verify that a pointer has an address that is valid within a slab page */
242static inline int check_valid_pointer(struct kmem_cache *s,
243				struct page *page, const void *object)
244{
245	void *base;
246
247	if (!object)
248		return 1;
249
250	base = page_address(page);
251	if (object < base || object >= base + page->objects * s->size ||
252		(object - base) % s->size) {
253		return 0;
254	}
255
256	return 1;
257}
258
259static inline void *get_freepointer(struct kmem_cache *s, void *object)
260{
261	return *(void **)(object + s->offset);
262}
263
264static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
265{
266	*(void **)(object + s->offset) = fp;
267}
268
269/* Loop over all objects in a slab */
270#define for_each_object(__p, __s, __addr, __objects) \
271	for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
272			__p += (__s)->size)
273
274/* Scan freelist */
275#define for_each_free_object(__p, __s, __free) \
276	for (__p = (__free); __p; __p = get_freepointer((__s), __p))
277
278/* Determine object index from a given position */
279static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
280{
281	return (p - addr) / s->size;
282}
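
/*
 * Illustrative sketch: objects are laid out back to back from the page
 * address and free objects are chained through the pointer stored at
 * s->offset inside each object. A debug walk over a slab page could
 * therefore look like this (the caller is assumed to hold the slab
 * lock so that the freelist does not change underneath it):
 *
 *	void *p;
 *	void *addr = page_address(page);
 *
 *	for_each_object(p, s, addr, page->objects)
 *		pr_debug("object %d at %p\n", slab_index(p, s, addr), p);
 *
 *	for_each_free_object(p, s, page->freelist)
 *		pr_debug("free object at %p\n", p);
 */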
283
284static inline struct kmem_cache_order_objects oo_make(int order,
285						unsigned long size)
286{
287	struct kmem_cache_order_objects x = {
288		(order << OO_SHIFT) + (PAGE_SIZE << order) / size
289	};
290
291	return x;
292}
293
294static inline int oo_order(struct kmem_cache_order_objects x)
295{
296	return x.x >> OO_SHIFT;
297}
298
299static inline int oo_objects(struct kmem_cache_order_objects x)
300{
301	return x.x & OO_MASK;
302}
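
/*
 * Worked example (assuming PAGE_SIZE == 4096): an order-1 slab holding
 * 256 byte objects is encoded as oo_make(1, 256), i.e.
 * x.x == (1 << OO_SHIFT) + 8192 / 256 == (1 << 16) + 32.
 * oo_order() then yields 1 and oo_objects() yields 32, so order and
 * object count travel together in a single word of struct kmem_cache.
 */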
303
304#ifdef CONFIG_SLUB_DEBUG
305/*
306 * Debug settings:
307 */
308#ifdef CONFIG_SLUB_DEBUG_ON
309static int slub_debug = DEBUG_DEFAULT_FLAGS;
310#else
311static int slub_debug;
312#endif
313
314static char *slub_debug_slabs;
315static int disable_higher_order_debug;
316
317/*
318 * Object debugging
319 */
320static void print_section(char *text, u8 *addr, unsigned int length)
321{
322	int i, offset;
323	int newline = 1;
324	char ascii[17];
325
326	ascii[16] = 0;
327
328	for (i = 0; i < length; i++) {
329		if (newline) {
330			printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
331			newline = 0;
332		}
333		printk(KERN_CONT " %02x", addr[i]);
334		offset = i % 16;
335		ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
336		if (offset == 15) {
337			printk(KERN_CONT " %s\n", ascii);
338			newline = 1;
339		}
340	}
341	if (!newline) {
342		i %= 16;
343		while (i < 16) {
344			printk(KERN_CONT "   ");
345			ascii[i] = ' ';
346			i++;
347		}
348		printk(KERN_CONT " %s\n", ascii);
349	}
350}
351
352static struct track *get_track(struct kmem_cache *s, void *object,
353	enum track_item alloc)
354{
355	struct track *p;
356
357	if (s->offset)
358		p = object + s->offset + sizeof(void *);
359	else
360		p = object + s->inuse;
361
362	return p + alloc;
363}
364
365static void set_track(struct kmem_cache *s, void *object,
366			enum track_item alloc, unsigned long addr)
367{
368	struct track *p = get_track(s, object, alloc);
369
370	if (addr) {
371		p->addr = addr;
372		p->cpu = smp_processor_id();
373		p->pid = current->pid;
374		p->when = jiffies;
375	} else
376		memset(p, 0, sizeof(struct track));
377}
378
379static void init_tracking(struct kmem_cache *s, void *object)
380{
381	if (!(s->flags & SLAB_STORE_USER))
382		return;
383
384	set_track(s, object, TRACK_FREE, 0UL);
385	set_track(s, object, TRACK_ALLOC, 0UL);
386}
387
388static void print_track(const char *s, struct track *t)
389{
390	if (!t->addr)
391		return;
392
393	printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
394		s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
395}
396
397static void print_tracking(struct kmem_cache *s, void *object)
398{
399	if (!(s->flags & SLAB_STORE_USER))
400		return;
401
402	print_track("Allocated", get_track(s, object, TRACK_ALLOC));
403	print_track("Freed", get_track(s, object, TRACK_FREE));
404}
405
406static void print_page_info(struct page *page)
407{
408	printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
409		page, page->objects, page->inuse, page->freelist, page->flags);
410
411}
412
413static void slab_bug(struct kmem_cache *s, char *fmt, ...)
414{
415	va_list args;
416	char buf[100];
417
418	va_start(args, fmt);
419	vsnprintf(buf, sizeof(buf), fmt, args);
420	va_end(args);
421	printk(KERN_ERR "========================================"
422			"=====================================\n");
423	printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
424	printk(KERN_ERR "----------------------------------------"
425			"-------------------------------------\n\n");
426}
427
428static void slab_fix(struct kmem_cache *s, char *fmt, ...)
429{
430	va_list args;
431	char buf[100];
432
433	va_start(args, fmt);
434	vsnprintf(buf, sizeof(buf), fmt, args);
435	va_end(args);
436	printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
437}
438
439static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
440{
441	unsigned int off;	/* Offset of last byte */
442	u8 *addr = page_address(page);
443
444	print_tracking(s, p);
445
446	print_page_info(page);
447
448	printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
449			p, p - addr, get_freepointer(s, p));
450
451	if (p > addr + 16)
452		print_section("Bytes b4", p - 16, 16);
453
454	print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE));
455
456	if (s->flags & SLAB_RED_ZONE)
457		print_section("Redzone", p + s->objsize,
458			s->inuse - s->objsize);
459
460	if (s->offset)
461		off = s->offset + sizeof(void *);
462	else
463		off = s->inuse;
464
465	if (s->flags & SLAB_STORE_USER)
466		off += 2 * sizeof(struct track);
467
468	if (off != s->size)
469		/* Beginning of the filler is the free pointer */
470		print_section("Padding", p + off, s->size - off);
471
472	dump_stack();
473}
474
475static void object_err(struct kmem_cache *s, struct page *page,
476			u8 *object, char *reason)
477{
478	slab_bug(s, "%s", reason);
479	print_trailer(s, page, object);
480}
481
482static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
483{
484	va_list args;
485	char buf[100];
486
487	va_start(args, fmt);
488	vsnprintf(buf, sizeof(buf), fmt, args);
489	va_end(args);
490	slab_bug(s, "%s", buf);
491	print_page_info(page);
492	dump_stack();
493}
494
495static void init_object(struct kmem_cache *s, void *object, u8 val)
496{
497	u8 *p = object;
498
499	if (s->flags & __OBJECT_POISON) {
500		memset(p, POISON_FREE, s->objsize - 1);
501		p[s->objsize - 1] = POISON_END;
502	}
503
504	if (s->flags & SLAB_RED_ZONE)
505		memset(p + s->objsize, val, s->inuse - s->objsize);
506}
507
508static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
509{
510	while (bytes) {
511		if (*start != (u8)value)
512			return start;
513		start++;
514		bytes--;
515	}
516	return NULL;
517}
518
519static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
520						void *from, void *to)
521{
522	slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
523	memset(from, data, to - from);
524}
525
526static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
527			u8 *object, char *what,
528			u8 *start, unsigned int value, unsigned int bytes)
529{
530	u8 *fault;
531	u8 *end;
532
533	fault = check_bytes(start, value, bytes);
534	if (!fault)
535		return 1;
536
537	end = start + bytes;
538	while (end > fault && end[-1] == value)
539		end--;
540
541	slab_bug(s, "%s overwritten", what);
542	printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
543					fault, end - 1, fault[0], value);
544	print_trailer(s, page, object);
545
546	restore_bytes(s, what, value, fault, end);
547	return 0;
548}
549
550/*
551 * Object layout:
552 *
553 * object address
554 * 	Bytes of the object to be managed.
555 * 	If the freepointer may overlay the object then the free
556 * 	pointer is the first word of the object.
557 *
558 * 	Poisoning uses 0x6b (POISON_FREE) and the last byte is
559 * 	0xa5 (POISON_END)
560 *
561 * object + s->objsize
562 * 	Padding to reach word boundary. This is also used for Redzoning.
563 * 	Padding is extended by another word if Redzoning is enabled and
564 * 	objsize == inuse.
565 *
566 * 	We fill with 0xbb (RED_INACTIVE) for inactive objects and with
567 * 	0xcc (RED_ACTIVE) for objects in use.
568 *
569 * object + s->inuse
570 * 	Meta data starts here.
571 *
572 * 	A. Free pointer (if we cannot overwrite object on free)
573 * 	B. Tracking data for SLAB_STORE_USER
574 * 	C. Padding to reach required alignment boundary or at minimum
575 * 		one word if debugging is on to be able to detect writes
576 * 		before the word boundary.
577 *
578 *	Padding is done using 0x5a (POISON_INUSE)
579 *
580 * object + s->size
581 * 	Nothing is used beyond s->size.
582 *
583 * If slabcaches are merged then the objsize and inuse boundaries are mostly
584 * ignored. And therefore no slab options that rely on these boundaries
585 * may be used with merged slabcaches.
586 */
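
/*
 * Worked example (hypothetical cache, 64 bit pointers assumed): with
 * objsize == 24 and SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER the
 * layout described above becomes
 *
 *	 0..23	object payload (POISON_FREE/POISON_END while free)
 *	24..31	red zone word				(s->inuse == 32)
 *	32..39	free pointer				(s->offset == 32)
 *	40..87	two struct track records (alloc, free)
 *	88..	padding up to s->size, filled with POISON_INUSE
 *
 * The exact numbers depend on the architecture and flags; the table is
 * only meant to make the boundaries above concrete.
 */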
587
588static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
589{
590	unsigned long off = s->inuse;	/* The end of info */
591
592	if (s->offset)
593		/* Freepointer is placed after the object. */
594		off += sizeof(void *);
595
596	if (s->flags & SLAB_STORE_USER)
597		/* We also have user information there */
598		off += 2 * sizeof(struct track);
599
600	if (s->size == off)
601		return 1;
602
603	return check_bytes_and_report(s, page, p, "Object padding",
604				p + off, POISON_INUSE, s->size - off);
605}
606
607/* Check the pad bytes at the end of a slab page */
608static int slab_pad_check(struct kmem_cache *s, struct page *page)
609{
610	u8 *start;
611	u8 *fault;
612	u8 *end;
613	int length;
614	int remainder;
615
616	if (!(s->flags & SLAB_POISON))
617		return 1;
618
619	start = page_address(page);
620	length = (PAGE_SIZE << compound_order(page));
621	end = start + length;
622	remainder = length % s->size;
623	if (!remainder)
624		return 1;
625
626	fault = check_bytes(end - remainder, POISON_INUSE, remainder);
627	if (!fault)
628		return 1;
629	while (end > fault && end[-1] == POISON_INUSE)
630		end--;
631
632	slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
633	print_section("Padding", end - remainder, remainder);
634
635	restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
636	return 0;
637}
638
639static int check_object(struct kmem_cache *s, struct page *page,
640					void *object, u8 val)
641{
642	u8 *p = object;
643	u8 *endobject = object + s->objsize;
644
645	if (s->flags & SLAB_RED_ZONE) {
646		if (!check_bytes_and_report(s, page, object, "Redzone",
647			endobject, val, s->inuse - s->objsize))
648			return 0;
649	} else {
650		if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
651			check_bytes_and_report(s, page, p, "Alignment padding",
652				endobject, POISON_INUSE, s->inuse - s->objsize);
653		}
654	}
655
656	if (s->flags & SLAB_POISON) {
657		if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
658			(!check_bytes_and_report(s, page, p, "Poison", p,
659					POISON_FREE, s->objsize - 1) ||
660			 !check_bytes_and_report(s, page, p, "Poison",
661				p + s->objsize - 1, POISON_END, 1)))
662			return 0;
663		/*
664		 * check_pad_bytes cleans up on its own.
665		 */
666		check_pad_bytes(s, page, p);
667	}
668
669	if (!s->offset && val == SLUB_RED_ACTIVE)
670		/*
671		 * Object and freepointer overlap. Cannot check
672		 * freepointer while object is allocated.
673		 */
674		return 1;
675
676	/* Check free pointer validity */
677	if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
678		object_err(s, page, p, "Freepointer corrupt");
679		/*
680		 * No choice but to zap it and thus lose the remainder
681		 * of the free objects in this slab. May cause
682		 * another error because the object count is now wrong.
683		 */
684		set_freepointer(s, p, NULL);
685		return 0;
686	}
687	return 1;
688}
689
690static int check_slab(struct kmem_cache *s, struct page *page)
691{
692	int maxobj;
693
694	VM_BUG_ON(!irqs_disabled());
695
696	if (!PageSlab(page)) {
697		slab_err(s, page, "Not a valid slab page");
698		return 0;
699	}
700
701	maxobj = (PAGE_SIZE << compound_order(page)) / s->size;
702	if (page->objects > maxobj) {
703		slab_err(s, page, "objects %u > max %u",
704			page->objects, maxobj);
705		return 0;
706	}
707	if (page->inuse > page->objects) {
708		slab_err(s, page, "inuse %u > max %u",
709			page->inuse, page->objects);
710		return 0;
711	}
712	/* Slab_pad_check fixes things up after itself */
713	slab_pad_check(s, page);
714	return 1;
715}
716
717/*
718 * Determine if a certain object on a page is on the freelist. Must hold the
719 * slab lock to guarantee that the chains are in a consistent state.
720 */
721static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
722{
723	int nr = 0;
724	void *fp = page->freelist;
725	void *object = NULL;
726	unsigned long max_objects;
727
728	while (fp && nr <= page->objects) {
729		if (fp == search)
730			return 1;
731		if (!check_valid_pointer(s, page, fp)) {
732			if (object) {
733				object_err(s, page, object,
734					"Freechain corrupt");
735				set_freepointer(s, object, NULL);
736				break;
737			} else {
738				slab_err(s, page, "Freepointer corrupt");
739				page->freelist = NULL;
740				page->inuse = page->objects;
741				slab_fix(s, "Freelist cleared");
742				return 0;
743			}
744			break;
745		}
746		object = fp;
747		fp = get_freepointer(s, object);
748		nr++;
749	}
750
751	max_objects = (PAGE_SIZE << compound_order(page)) / s->size;
752	if (max_objects > MAX_OBJS_PER_PAGE)
753		max_objects = MAX_OBJS_PER_PAGE;
754
755	if (page->objects != max_objects) {
756		slab_err(s, page, "Wrong number of objects. Found %d but "
757			"should be %d", page->objects, max_objects);
758		page->objects = max_objects;
759		slab_fix(s, "Number of objects adjusted.");
760	}
761	if (page->inuse != page->objects - nr) {
762		slab_err(s, page, "Wrong object count. Counter is %d but "
763			"counted were %d", page->inuse, page->objects - nr);
764		page->inuse = page->objects - nr;
765		slab_fix(s, "Object count adjusted.");
766	}
767	return search == NULL;
768}
769
770static void trace(struct kmem_cache *s, struct page *page, void *object,
771								int alloc)
772{
773	if (s->flags & SLAB_TRACE) {
774		printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
775			s->name,
776			alloc ? "alloc" : "free",
777			object, page->inuse,
778			page->freelist);
779
780		if (!alloc)
781			print_section("Object", (void *)object, s->objsize);
782
783		dump_stack();
784	}
785}
786
787/*
788 * Hooks for other subsystems that check memory allocations. In a typical
789 * production configuration these hooks should all produce no code at all.
790 */
791static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
792{
793	flags &= gfp_allowed_mask;
794	lockdep_trace_alloc(flags);
795	might_sleep_if(flags & __GFP_WAIT);
796
797	return should_failslab(s->objsize, flags, s->flags);
798}
799
800static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
801{
802	flags &= gfp_allowed_mask;
803	kmemcheck_slab_alloc(s, flags, object, s->objsize);
804	kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags);
805}
806
807static inline void slab_free_hook(struct kmem_cache *s, void *x)
808{
809	kmemleak_free_recursive(x, s->flags);
810}
811
812static inline void slab_free_hook_irq(struct kmem_cache *s, void *object)
813{
814	kmemcheck_slab_free(s, object, s->objsize);
815	debug_check_no_locks_freed(object, s->objsize);
816	if (!(s->flags & SLAB_DEBUG_OBJECTS))
817		debug_check_no_obj_freed(object, s->objsize);
818}
819
820/*
821 * Tracking of fully allocated slabs for debugging purposes.
822 */
823static void add_full(struct kmem_cache_node *n, struct page *page)
824{
825	spin_lock(&n->list_lock);
826	list_add(&page->lru, &n->full);
827	spin_unlock(&n->list_lock);
828}
829
830static void remove_full(struct kmem_cache *s, struct page *page)
831{
832	struct kmem_cache_node *n;
833
834	if (!(s->flags & SLAB_STORE_USER))
835		return;
836
837	n = get_node(s, page_to_nid(page));
838
839	spin_lock(&n->list_lock);
840	list_del(&page->lru);
841	spin_unlock(&n->list_lock);
842}
843
844/* Tracking of the number of slabs for debugging purposes */
845static inline unsigned long slabs_node(struct kmem_cache *s, int node)
846{
847	struct kmem_cache_node *n = get_node(s, node);
848
849	return atomic_long_read(&n->nr_slabs);
850}
851
852static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
853{
854	return atomic_long_read(&n->nr_slabs);
855}
856
857static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
858{
859	struct kmem_cache_node *n = get_node(s, node);
860
861	/*
862	 * May be called early in order to allocate a slab for the
863	 * kmem_cache_node structure. Solve the chicken-egg
864	 * dilemma by deferring the increment of the count during
865	 * bootstrap (see early_kmem_cache_node_alloc).
866	 */
867	if (n) {
868		atomic_long_inc(&n->nr_slabs);
869		atomic_long_add(objects, &n->total_objects);
870	}
871}
872static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
873{
874	struct kmem_cache_node *n = get_node(s, node);
875
876	atomic_long_dec(&n->nr_slabs);
877	atomic_long_sub(objects, &n->total_objects);
878}
879
880/* Object debug checks for alloc/free paths */
881static void setup_object_debug(struct kmem_cache *s, struct page *page,
882								void *object)
883{
884	if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
885		return;
886
887	init_object(s, object, SLUB_RED_INACTIVE);
888	init_tracking(s, object);
889}
890
891static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *page,
892					void *object, unsigned long addr)
893{
894	if (!check_slab(s, page))
895		goto bad;
896
897	if (!on_freelist(s, page, object)) {
898		object_err(s, page, object, "Object already allocated");
899		goto bad;
900	}
901
902	if (!check_valid_pointer(s, page, object)) {
903		object_err(s, page, object, "Freelist Pointer check fails");
904		goto bad;
905	}
906
907	if (!check_object(s, page, object, SLUB_RED_INACTIVE))
908		goto bad;
909
910	/* Success. Perform special debug activities for allocs */
911	if (s->flags & SLAB_STORE_USER)
912		set_track(s, object, TRACK_ALLOC, addr);
913	trace(s, page, object, 1);
914	init_object(s, object, SLUB_RED_ACTIVE);
915	return 1;
916
917bad:
918	if (PageSlab(page)) {
919		/*
920		 * If this is a slab page then let's do the best we can
921		 * to avoid issues in the future. Marking all objects
922		 * as used avoids touching the remaining objects.
923		 */
924		slab_fix(s, "Marking all objects used");
925		page->inuse = page->objects;
926		page->freelist = NULL;
927	}
928	return 0;
929}
930
931static noinline int free_debug_processing(struct kmem_cache *s,
932		 struct page *page, void *object, unsigned long addr)
933{
934	if (!check_slab(s, page))
935		goto fail;
936
937	if (!check_valid_pointer(s, page, object)) {
938		slab_err(s, page, "Invalid object pointer 0x%p", object);
939		goto fail;
940	}
941
942	if (on_freelist(s, page, object)) {
943		object_err(s, page, object, "Object already free");
944		goto fail;
945	}
946
947	if (!check_object(s, page, object, SLUB_RED_ACTIVE))
948		return 0;
949
950	if (unlikely(s != page->slab)) {
951		if (!PageSlab(page)) {
952			slab_err(s, page, "Attempt to free object(0x%p) "
953				"outside of slab", object);
954		} else if (!page->slab) {
955			printk(KERN_ERR
956				"SLUB <none>: no slab for object 0x%p.\n",
957						object);
958			dump_stack();
959		} else
960			object_err(s, page, object,
961					"page slab pointer corrupt.");
962		goto fail;
963	}
964
965	/* Special debug activities for freeing objects */
966	if (!PageSlubFrozen(page) && !page->freelist)
967		remove_full(s, page);
968	if (s->flags & SLAB_STORE_USER)
969		set_track(s, object, TRACK_FREE, addr);
970	trace(s, page, object, 0);
971	init_object(s, object, SLUB_RED_INACTIVE);
972	return 1;
973
974fail:
975	slab_fix(s, "Object at 0x%p not freed", object);
976	return 0;
977}
978
979static int __init setup_slub_debug(char *str)
980{
981	slub_debug = DEBUG_DEFAULT_FLAGS;
982	if (*str++ != '=' || !*str)
983		/*
984		 * No options specified. Switch on full debugging.
985		 */
986		goto out;
987
988	if (*str == ',')
989		/*
990		 * No options but restriction on slabs. This means full
991		 * debugging for slabs matching a pattern.
992		 */
993		goto check_slabs;
994
995	if (tolower(*str) == 'o') {
996		/*
997		 * Avoid enabling debugging on caches if their minimum order
998		 * would increase as a result.
999		 */
1000		disable_higher_order_debug = 1;
1001		goto out;
1002	}
1003
1004	slub_debug = 0;
1005	if (*str == '-')
1006		/*
1007		 * Switch off all debugging measures.
1008		 */
1009		goto out;
1010
1011	/*
1012	 * Determine which debug features should be switched on
1013	 */
1014	for (; *str && *str != ','; str++) {
1015		switch (tolower(*str)) {
1016		case 'f':
1017			slub_debug |= SLAB_DEBUG_FREE;
1018			break;
1019		case 'z':
1020			slub_debug |= SLAB_RED_ZONE;
1021			break;
1022		case 'p':
1023			slub_debug |= SLAB_POISON;
1024			break;
1025		case 'u':
1026			slub_debug |= SLAB_STORE_USER;
1027			break;
1028		case 't':
1029			slub_debug |= SLAB_TRACE;
1030			break;
1031		case 'a':
1032			slub_debug |= SLAB_FAILSLAB;
1033			break;
1034		default:
1035			printk(KERN_ERR "slub_debug option '%c' "
1036				"unknown. skipped\n", *str);
1037		}
1038	}
1039
1040check_slabs:
1041	if (*str == ',')
1042		slub_debug_slabs = str + 1;
1043out:
1044	return 1;
1045}
1046
1047__setup("slub_debug", setup_slub_debug);
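
/*
 * Example command lines accepted by the parser above (flag letters map
 * to the switch statement in setup_slub_debug()):
 *
 *	slub_debug		all DEBUG_DEFAULT_FLAGS on all caches
 *	slub_debug=FZP		sanity checks, red zoning and poisoning
 *	slub_debug=,dentry	full debugging, but only for caches whose
 *				name starts with "dentry"
 *	slub_debug=U,kmalloc-	user tracking for the kmalloc caches
 *	slub_debug=O		default debugging, except where it would
 *				raise a cache's minimum order
 *	slub_debug=-		switch all debugging off
 *
 * Cache name matching is a prefix match (see kmem_cache_flags() below).
 */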
1048
1049static unsigned long kmem_cache_flags(unsigned long objsize,
1050	unsigned long flags, const char *name,
1051	void (*ctor)(void *))
1052{
1053	/*
1054	 * Enable debugging if selected on the kernel commandline.
1055	 */
1056	if (slub_debug && (!slub_debug_slabs ||
1057		!strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))
1058		flags |= slub_debug;
1059
1060	return flags;
1061}
1062#else
1063static inline void setup_object_debug(struct kmem_cache *s,
1064			struct page *page, void *object) {}
1065
1066static inline int alloc_debug_processing(struct kmem_cache *s,
1067	struct page *page, void *object, unsigned long addr) { return 0; }
1068
1069static inline int free_debug_processing(struct kmem_cache *s,
1070	struct page *page, void *object, unsigned long addr) { return 0; }
1071
1072static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1073			{ return 1; }
1074static inline int check_object(struct kmem_cache *s, struct page *page,
1075			void *object, u8 val) { return 1; }
1076static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
1077static inline unsigned long kmem_cache_flags(unsigned long objsize,
1078	unsigned long flags, const char *name,
1079	void (*ctor)(void *))
1080{
1081	return flags;
1082}
1083#define slub_debug 0
1084
1085#define disable_higher_order_debug 0
1086
1087static inline unsigned long slabs_node(struct kmem_cache *s, int node)
1088							{ return 0; }
1089static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1090							{ return 0; }
1091static inline void inc_slabs_node(struct kmem_cache *s, int node,
1092							int objects) {}
1093static inline void dec_slabs_node(struct kmem_cache *s, int node,
1094							int objects) {}
1095
1096static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
1097							{ return 0; }
1098
1099static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
1100		void *object) {}
1101
1102static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
1103
1104static inline void slab_free_hook_irq(struct kmem_cache *s,
1105		void *object) {}
1106
1107#endif /* CONFIG_SLUB_DEBUG */
1108
1109/*
1110 * Slab allocation and freeing
1111 */
1112static inline struct page *alloc_slab_page(gfp_t flags, int node,
1113					struct kmem_cache_order_objects oo)
1114{
1115	int order = oo_order(oo);
1116
1117	flags |= __GFP_NOTRACK;
1118
1119	if (node == NUMA_NO_NODE)
1120		return alloc_pages(flags, order);
1121	else
1122		return alloc_pages_exact_node(node, flags, order);
1123}
1124
1125static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1126{
1127	struct page *page;
1128	struct kmem_cache_order_objects oo = s->oo;
1129	gfp_t alloc_gfp;
1130
1131	flags |= s->allocflags;
1132
1133	/*
1134	 * Let the initial higher-order allocation fail under memory pressure
1135	 * so we fall back to the minimum order allocation.
1136	 */
1137	alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
1138
1139	page = alloc_slab_page(alloc_gfp, node, oo);
1140	if (unlikely(!page)) {
1141		oo = s->min;
1142		/*
1143		 * Allocation may have failed due to fragmentation.
1144		 * Try a lower order alloc if possible
1145		 */
1146		page = alloc_slab_page(flags, node, oo);
1147		if (!page)
1148			return NULL;
1149
1150		stat(s, ORDER_FALLBACK);
1151	}
1152
1153	if (kmemcheck_enabled
1154		&& !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1155		int pages = 1 << oo_order(oo);
1156
1157		kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
1158
1159		/*
1160		 * Objects from caches that have a constructor don't get
1161		 * cleared when they're allocated, so we need to do it here.
1162		 */
1163		if (s->ctor)
1164			kmemcheck_mark_uninitialized_pages(page, pages);
1165		else
1166			kmemcheck_mark_unallocated_pages(page, pages);
1167	}
1168
1169	page->objects = oo_objects(oo);
1170	mod_zone_page_state(page_zone(page),
1171		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
1172		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1173		1 << oo_order(oo));
1174
1175	return page;
1176}
1177
1178static void setup_object(struct kmem_cache *s, struct page *page,
1179				void *object)
1180{
1181	setup_object_debug(s, page, object);
1182	if (unlikely(s->ctor))
1183		s->ctor(object);
1184}
1185
1186static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1187{
1188	struct page *page;
1189	void *start;
1190	void *last;
1191	void *p;
1192
1193	BUG_ON(flags & GFP_SLAB_BUG_MASK);
1194
1195	page = allocate_slab(s,
1196		flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1197	if (!page)
1198		goto out;
1199
1200	inc_slabs_node(s, page_to_nid(page), page->objects);
1201	page->slab = s;
1202	page->flags |= 1 << PG_slab;
1203
1204	start = page_address(page);
1205
1206	if (unlikely(s->flags & SLAB_POISON))
1207		memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page));
1208
1209	last = start;
1210	for_each_object(p, s, start, page->objects) {
1211		setup_object(s, page, last);
1212		set_freepointer(s, last, p);
1213		last = p;
1214	}
1215	setup_object(s, page, last);
1216	set_freepointer(s, last, NULL);
1217
1218	page->freelist = start;
1219	page->inuse = 0;
1220out:
1221	return page;
1222}
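
/*
 * Sketch of the state new_slab() leaves behind (four objects shown):
 *
 *	page->freelist -> start -> start+size -> start+2*size -> start+3*size -> NULL
 *	page->inuse == 0
 *
 * i.e. every object sits on the slab freelist, linked in address order
 * through the free pointer at s->offset, and nothing is allocated yet.
 */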
1223
1224static void __free_slab(struct kmem_cache *s, struct page *page)
1225{
1226	int order = compound_order(page);
1227	int pages = 1 << order;
1228
1229	if (kmem_cache_debug(s)) {
1230		void *p;
1231
1232		slab_pad_check(s, page);
1233		for_each_object(p, s, page_address(page),
1234						page->objects)
1235			check_object(s, page, p, SLUB_RED_INACTIVE);
1236	}
1237
1238	kmemcheck_free_shadow(page, compound_order(page));
1239
1240	mod_zone_page_state(page_zone(page),
1241		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
1242		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1243		-pages);
1244
1245	__ClearPageSlab(page);
1246	reset_page_mapcount(page);
1247	if (current->reclaim_state)
1248		current->reclaim_state->reclaimed_slab += pages;
1249	__free_pages(page, order);
1250}
1251
1252static void rcu_free_slab(struct rcu_head *h)
1253{
1254	struct page *page;
1255
1256	page = container_of((struct list_head *)h, struct page, lru);
1257	__free_slab(page->slab, page);
1258}
1259
1260static void free_slab(struct kmem_cache *s, struct page *page)
1261{
1262	if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
1263		/*
1264		 * RCU free overloads the RCU head over the LRU
1265		 */
1266		struct rcu_head *head = (void *)&page->lru;
1267
1268		call_rcu(head, rcu_free_slab);
1269	} else
1270		__free_slab(s, page);
1271}
1272
1273static void discard_slab(struct kmem_cache *s, struct page *page)
1274{
1275	dec_slabs_node(s, page_to_nid(page), page->objects);
1276	free_slab(s, page);
1277}
1278
1279/*
1280 * Per slab locking using the pagelock
1281 */
1282static __always_inline void slab_lock(struct page *page)
1283{
1284	bit_spin_lock(PG_locked, &page->flags);
1285}
1286
1287static __always_inline void slab_unlock(struct page *page)
1288{
1289	__bit_spin_unlock(PG_locked, &page->flags);
1290}
1291
1292static __always_inline int slab_trylock(struct page *page)
1293{
1296	return bit_spin_trylock(PG_locked, &page->flags);
1298}
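
/*
 * Sketch of the inverted lock order described at the top of this file:
 * while holding a node's list_lock a slab may only be trylocked, never
 * waited for, e.g.
 *
 *	spin_lock(&n->list_lock);
 *	list_for_each_entry(page, &n->partial, lru)
 *		if (slab_trylock(page)) {
 *			... use the slab ...
 *			break;
 *		}
 *	spin_unlock(&n->list_lock);
 *
 * which is the pattern get_partial_node() below follows.
 */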
1299
1300/*
1301 * Management of partially allocated slabs
1302 */
1303static void add_partial(struct kmem_cache_node *n,
1304				struct page *page, int tail)
1305{
1306	spin_lock(&n->list_lock);
1307	n->nr_partial++;
1308	if (tail)
1309		list_add_tail(&page->lru, &n->partial);
1310	else
1311		list_add(&page->lru, &n->partial);
1312	spin_unlock(&n->list_lock);
1313}
1314
1315static inline void __remove_partial(struct kmem_cache_node *n,
1316					struct page *page)
1317{
1318	list_del(&page->lru);
1319	n->nr_partial--;
1320}
1321
1322static void remove_partial(struct kmem_cache *s, struct page *page)
1323{
1324	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1325
1326	spin_lock(&n->list_lock);
1327	__remove_partial(n, page);
1328	spin_unlock(&n->list_lock);
1329}
1330
1331/*
1332 * Lock slab and remove from the partial list.
1333 *
1334 * Must hold list_lock.
1335 */
1336static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
1337							struct page *page)
1338{
1339	if (slab_trylock(page)) {
1340		__remove_partial(n, page);
1341		__SetPageSlubFrozen(page);
1342		return 1;
1343	}
1344	return 0;
1345}
1346
1347/*
1348 * Try to allocate a partial slab from a specific node.
1349 */
1350static struct page *get_partial_node(struct kmem_cache_node *n)
1351{
1352	struct page *page;
1353
1354	/*
1355	 * Racy check. If we mistakenly see no partial slabs then we
1356	 * just allocate an empty slab. If we mistakenly try to get a
1357	 * partial slab and there is none available then get_partial_node()
1358	 * will return NULL.
1359	 */
1360	if (!n || !n->nr_partial)
1361		return NULL;
1362
1363	spin_lock(&n->list_lock);
1364	list_for_each_entry(page, &n->partial, lru)
1365		if (lock_and_freeze_slab(n, page))
1366			goto out;
1367	page = NULL;
1368out:
1369	spin_unlock(&n->list_lock);
1370	return page;
1371}
1372
1373/*
1374 * Get a page from somewhere. Search in increasing NUMA distances.
1375 */
1376static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1377{
1378#ifdef CONFIG_NUMA
1379	struct zonelist *zonelist;
1380	struct zoneref *z;
1381	struct zone *zone;
1382	enum zone_type high_zoneidx = gfp_zone(flags);
1383	struct page *page;
1384
1385	/*
1386	 * The defrag ratio allows a configuration of the tradeoffs between
1387	 * inter node defragmentation and node local allocations. A lower
1388	 * defrag_ratio increases the tendency to do local allocations
1389	 * instead of attempting to obtain partial slabs from other nodes.
1390	 *
1391	 * If the defrag_ratio is set to 0 then kmalloc() always
1392	 * returns node local objects. If the ratio is higher then kmalloc()
1393	 * may return off node objects because partial slabs are obtained
1394	 * from other nodes and filled up.
1395	 *
1396	 * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes
1397	 * defrag_ratio = 1000) then every (well almost) allocation will
1398	 * first attempt to defrag slab caches on other nodes. This means
1399	 * scanning over all nodes to look for partial slabs which may be
1400	 * expensive if we do it every time we are trying to find a slab
1401	 * with available objects.
1402	 */
1403	if (!s->remote_node_defrag_ratio ||
1404			get_cycles() % 1024 > s->remote_node_defrag_ratio)
1405		return NULL;
1406
1407	get_mems_allowed();
1408	zonelist = node_zonelist(slab_node(current->mempolicy), flags);
1409	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1410		struct kmem_cache_node *n;
1411
1412		n = get_node(s, zone_to_nid(zone));
1413
1414		if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1415				n->nr_partial > s->min_partial) {
1416			page = get_partial_node(n);
1417			if (page) {
1418				put_mems_allowed();
1419				return page;
1420			}
1421		}
1422	}
1423	put_mems_allowed();
1424#endif
1425	return NULL;
1426}
1427
1428/*
1429 * Get a partial page, lock it and return it.
1430 */
1431static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1432{
1433	struct page *page;
1434	int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
1435
1436	page = get_partial_node(get_node(s, searchnode));
1437	if (page || node != NUMA_NO_NODE)
1438		return page;
1439
1440	return get_any_partial(s, flags);
1441}
1442
1443/*
1444 * Move a page back to the lists.
1445 *
1446 * Must be called with the slab lock held.
1447 *
1448 * On exit the slab lock will have been dropped.
1449 */
1450static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1451	__releases(bitlock)
1452{
1453	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1454
1455	__ClearPageSlubFrozen(page);
1456	if (page->inuse) {
1457
1458		if (page->freelist) {
1459			add_partial(n, page, tail);
1460			stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1461		} else {
1462			stat(s, DEACTIVATE_FULL);
1463			if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER))
1464				add_full(n, page);
1465		}
1466		slab_unlock(page);
1467	} else {
1468		stat(s, DEACTIVATE_EMPTY);
1469		if (n->nr_partial < s->min_partial) {
1470			/*
1471			 * Adding an empty slab to the partial slabs in order
1472			 * to avoid page allocator overhead. This slab needs
1473			 * to come after the other slabs with objects in them
1474			 * so that the others get filled first. That way the
1475			 * size of the partial list stays small.
1476			 *
1477			 * kmem_cache_shrink can reclaim any empty slabs from
1478			 * the partial list.
1479			 */
1480			add_partial(n, page, 1);
1481			slab_unlock(page);
1482		} else {
1483			slab_unlock(page);
1484			stat(s, FREE_SLAB);
1485			discard_slab(s, page);
1486		}
1487	}
1488}
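
/*
 * Summary of where unfreeze_slab() puts a slab (sketch):
 *
 *	in use, free objects left	-> node partial list
 *	in use, no free objects		-> off all lists (full list only
 *					   with SLAB_STORE_USER debugging)
 *	empty, below min_partial	-> tail of the node partial list
 *	empty, enough partials		-> discarded to the page allocator
 */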
1489
1490/*
1491 * Remove the cpu slab
1492 */
1493static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1494	__releases(bitlock)
1495{
1496	struct page *page = c->page;
1497	int tail = 1;
1498
1499	if (page->freelist)
1500		stat(s, DEACTIVATE_REMOTE_FREES);
1501	/*
1502	 * Merge cpu freelist into slab freelist. Typically we get here
1503	 * because both freelists are empty. So this is unlikely
1504	 * to occur.
1505	 */
1506	while (unlikely(c->freelist)) {
1507		void **object;
1508
1509		tail = 0;	/* Hot objects. Put the slab first */
1510
1511		/* Retrieve object from cpu_freelist */
1512		object = c->freelist;
1513		c->freelist = get_freepointer(s, c->freelist);
1514
1515		/* And put onto the regular freelist */
1516		set_freepointer(s, object, page->freelist);
1517		page->freelist = object;
1518		page->inuse--;
1519	}
1520	c->page = NULL;
1521	unfreeze_slab(s, page, tail);
1522}
1523
1524static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1525{
1526	stat(s, CPUSLAB_FLUSH);
1527	slab_lock(c->page);
1528	deactivate_slab(s, c);
1529}
1530
1531/*
1532 * Flush cpu slab.
1533 *
1534 * Called from IPI handler with interrupts disabled.
1535 */
1536static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
1537{
1538	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
1539
1540	if (likely(c && c->page))
1541		flush_slab(s, c);
1542}
1543
1544static void flush_cpu_slab(void *d)
1545{
1546	struct kmem_cache *s = d;
1547
1548	__flush_cpu_slab(s, smp_processor_id());
1549}
1550
1551static void flush_all(struct kmem_cache *s)
1552{
1553	on_each_cpu(flush_cpu_slab, s, 1);
1554}
1555
1556/*
1557 * Check if the objects in a per cpu structure fit numa
1558 * locality expectations.
1559 */
1560static inline int node_match(struct kmem_cache_cpu *c, int node)
1561{
1562#ifdef CONFIG_NUMA
1563	if (node != NUMA_NO_NODE && c->node != node)
1564		return 0;
1565#endif
1566	return 1;
1567}
1568
1569static int count_free(struct page *page)
1570{
1571	return page->objects - page->inuse;
1572}
1573
1574static unsigned long count_partial(struct kmem_cache_node *n,
1575					int (*get_count)(struct page *))
1576{
1577	unsigned long flags;
1578	unsigned long x = 0;
1579	struct page *page;
1580
1581	spin_lock_irqsave(&n->list_lock, flags);
1582	list_for_each_entry(page, &n->partial, lru)
1583		x += get_count(page);
1584	spin_unlock_irqrestore(&n->list_lock, flags);
1585	return x;
1586}
1587
1588static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
1589{
1590#ifdef CONFIG_SLUB_DEBUG
1591	return atomic_long_read(&n->total_objects);
1592#else
1593	return 0;
1594#endif
1595}
1596
1597static noinline void
1598slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
1599{
1600	int node;
1601
1602	printk(KERN_WARNING
1603		"SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
1604		nid, gfpflags);
1605	printk(KERN_WARNING "  cache: %s, object size: %d, buffer size: %d, "
1606		"default order: %d, min order: %d\n", s->name, s->objsize,
1607		s->size, oo_order(s->oo), oo_order(s->min));
1608
1609	if (oo_order(s->min) > get_order(s->objsize))
1610		printk(KERN_WARNING "  %s debugging increased min order, use "
1611		       "slub_debug=O to disable.\n", s->name);
1612
1613	for_each_online_node(node) {
1614		struct kmem_cache_node *n = get_node(s, node);
1615		unsigned long nr_slabs;
1616		unsigned long nr_objs;
1617		unsigned long nr_free;
1618
1619		if (!n)
1620			continue;
1621
1622		nr_free  = count_partial(n, count_free);
1623		nr_slabs = node_nr_slabs(n);
1624		nr_objs  = node_nr_objs(n);
1625
1626		printk(KERN_WARNING
1627			"  node %d: slabs: %ld, objs: %ld, free: %ld\n",
1628			node, nr_slabs, nr_objs, nr_free);
1629	}
1630}
1631
1632/*
1633 * Slow path. The lockless freelist is empty or we need to perform
1634 * debugging duties.
1635 *
1636 * Interrupts are disabled.
1637 *
1638 * Processing is still very fast if new objects have been freed to the
1639 * regular freelist. In that case we simply take over the regular freelist
1640 * as the lockless freelist and zap the regular freelist.
1641 *
1642 * If that is not working then we fall back to the partial lists. We take the
1643 * first element of the freelist as the object to allocate now and move the
1644 * rest of the freelist to the lockless freelist.
1645 *
1646 * And if we were unable to get a new slab from the partial slab lists then
1647 * we need to allocate a new slab. This is the slowest path since it involves
1648 * a call to the page allocator and the setup of a new slab.
1649 */
1650static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1651			  unsigned long addr, struct kmem_cache_cpu *c)
1652{
1653	void **object;
1654	struct page *new;
1655
1656	/* We handle __GFP_ZERO in the caller */
1657	gfpflags &= ~__GFP_ZERO;
1658
1659	if (!c->page)
1660		goto new_slab;
1661
1662	slab_lock(c->page);
1663	if (unlikely(!node_match(c, node)))
1664		goto another_slab;
1665
1666	stat(s, ALLOC_REFILL);
1667
1668load_freelist:
1669	object = c->page->freelist;
1670	if (unlikely(!object))
1671		goto another_slab;
1672	if (kmem_cache_debug(s))
1673		goto debug;
1674
1675	c->freelist = get_freepointer(s, object);
1676	c->page->inuse = c->page->objects;
1677	c->page->freelist = NULL;
1678	c->node = page_to_nid(c->page);
1679unlock_out:
1680	slab_unlock(c->page);
1681	stat(s, ALLOC_SLOWPATH);
1682	return object;
1683
1684another_slab:
1685	deactivate_slab(s, c);
1686
1687new_slab:
1688	new = get_partial(s, gfpflags, node);
1689	if (new) {
1690		c->page = new;
1691		stat(s, ALLOC_FROM_PARTIAL);
1692		goto load_freelist;
1693	}
1694
1695	gfpflags &= gfp_allowed_mask;
1696	if (gfpflags & __GFP_WAIT)
1697		local_irq_enable();
1698
1699	new = new_slab(s, gfpflags, node);
1700
1701	if (gfpflags & __GFP_WAIT)
1702		local_irq_disable();
1703
1704	if (new) {
1705		c = __this_cpu_ptr(s->cpu_slab);
1706		stat(s, ALLOC_SLAB);
1707		if (c->page)
1708			flush_slab(s, c);
1709		slab_lock(new);
1710		__SetPageSlubFrozen(new);
1711		c->page = new;
1712		goto load_freelist;
1713	}
1714	if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
1715		slab_out_of_memory(s, gfpflags, node);
1716	return NULL;
1717debug:
1718	if (!alloc_debug_processing(s, c->page, object, addr))
1719		goto another_slab;
1720
1721	c->page->inuse++;
1722	c->page->freelist = get_freepointer(s, object);
1723	c->node = NUMA_NO_NODE;
1724	goto unlock_out;
1725}
1726
1727/*
1728 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
1729 * have the fastpath folded into their functions. So no function call
1730 * overhead for requests that can be satisfied on the fastpath.
1731 *
1732 * The fastpath works by first checking if the lockless freelist can be used.
1733 * If not then __slab_alloc is called for slow processing.
1734 *
1735 * Otherwise we can simply pick the next object from the lockless free list.
1736 */
1737static __always_inline void *slab_alloc(struct kmem_cache *s,
1738		gfp_t gfpflags, int node, unsigned long addr)
1739{
1740	void **object;
1741	struct kmem_cache_cpu *c;
1742	unsigned long flags;
1743
1744	if (slab_pre_alloc_hook(s, gfpflags))
1745		return NULL;
1746
1747	local_irq_save(flags);
1748	c = __this_cpu_ptr(s->cpu_slab);
1749	object = c->freelist;
1750	if (unlikely(!object || !node_match(c, node)))
1752		object = __slab_alloc(s, gfpflags, node, addr, c);
1754	else {
1755		c->freelist = get_freepointer(s, object);
1756		stat(s, ALLOC_FASTPATH);
1757	}
1758	local_irq_restore(flags);
1759
1760	if (unlikely(gfpflags & __GFP_ZERO) && object)
1761		memset(object, 0, s->objsize);
1762
1763	slab_post_alloc_hook(s, gfpflags, object);
1764
1765	return object;
1766}
1767
1768void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1769{
1770	void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
1771
1772	trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags);
1773
1774	return ret;
1775}
1776EXPORT_SYMBOL(kmem_cache_alloc);
1777
1778#ifdef CONFIG_TRACING
1779void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
1780{
1781	void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
1782	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
1783	return ret;
1784}
1785EXPORT_SYMBOL(kmem_cache_alloc_trace);
1786
1787void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
1788{
1789	void *ret = kmalloc_order(size, flags, order);
1790	trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
1791	return ret;
1792}
1793EXPORT_SYMBOL(kmalloc_order_trace);
1794#endif
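
/*
 * The trace_* calls above feed the "kmem" tracepoint group. With
 * CONFIG_TRACING enabled allocations can typically be observed through
 * tracefs, e.g. (paths may vary with the configuration):
 *
 *	# echo 1 > /sys/kernel/debug/tracing/events/kmem/kmem_cache_alloc/enable
 *	# cat /sys/kernel/debug/tracing/trace_pipe
 */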
1795
1796#ifdef CONFIG_NUMA
1797void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1798{
1799	void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
1800
1801	trace_kmem_cache_alloc_node(_RET_IP_, ret,
1802				    s->objsize, s->size, gfpflags, node);
1803
1804	return ret;
1805}
1806EXPORT_SYMBOL(kmem_cache_alloc_node);
1807
1808#ifdef CONFIG_TRACING
1809void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
1810				    gfp_t gfpflags,
1811				    int node, size_t size)
1812{
1813	void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
1814
1815	trace_kmalloc_node(_RET_IP_, ret,
1816			   size, s->size, gfpflags, node);
1817	return ret;
1818}
1819EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
1820#endif
1821#endif
1822
1823/*
1824 * Slow path handling. This may still be called frequently since objects
1825 * have a longer lifetime than the cpu slabs in most processing loads.
1826 *
1827 * So we still attempt to reduce cache line usage. Just take the slab
1828 * lock and free the item. If there is no additional partial page
1829 * handling required then we can return immediately.
1830 */
1831static void __slab_free(struct kmem_cache *s, struct page *page,
1832			void *x, unsigned long addr)
1833{
1834	void *prior;
1835	void **object = (void *)x;
1836
1837	stat(s, FREE_SLOWPATH);
1838	slab_lock(page);
1839
1840	if (kmem_cache_debug(s))
1841		goto debug;
1842
1843checks_ok:
1844	prior = page->freelist;
1845	set_freepointer(s, object, prior);
1846	page->freelist = object;
1847	page->inuse--;
1848
1849	if (unlikely(PageSlubFrozen(page))) {
1850		stat(s, FREE_FROZEN);
1851		goto out_unlock;
1852	}
1853
1854	if (unlikely(!page->inuse))
1855		goto slab_empty;
1856
1857	/*
1858	 * Objects left in the slab. If it was not on the partial list before
1859	 * then add it.
1860	 */
1861	if (unlikely(!prior)) {
1862		add_partial(get_node(s, page_to_nid(page)), page, 1);
1863		stat(s, FREE_ADD_PARTIAL);
1864	}
1865
1866out_unlock:
1867	slab_unlock(page);
1868	return;
1869
1870slab_empty:
1871	if (prior) {
1872		/*
1873		 * Slab still on the partial list.
1874		 */
1875		remove_partial(s, page);
1876		stat(s, FREE_REMOVE_PARTIAL);
1877	}
1878	slab_unlock(page);
1879	stat(s, FREE_SLAB);
1880	discard_slab(s, page);
1881	return;
1882
1883debug:
1884	if (!free_debug_processing(s, page, x, addr))
1885		goto out_unlock;
1886	goto checks_ok;
1887}
1888
1889/*
1890 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
1891 * can perform fastpath freeing without additional function calls.
1892 *
1893 * The fastpath is only possible if we are freeing to the current cpu slab
1894 * of this processor. This is typically the case if we have just allocated
1895 * the item before.
1896 *
1897 * If fastpath is not possible then fall back to __slab_free where we deal
1898 * with all sorts of special processing.
1899 */
1900static __always_inline void slab_free(struct kmem_cache *s,
1901			struct page *page, void *x, unsigned long addr)
1902{
1903	void **object = (void *)x;
1904	struct kmem_cache_cpu *c;
1905	unsigned long flags;
1906
1907	slab_free_hook(s, x);
1908
1909	local_irq_save(flags);
1910	c = __this_cpu_ptr(s->cpu_slab);
1911
1912	slab_free_hook_irq(s, x);
1913
1914	if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
1915		set_freepointer(s, object, c->freelist);
1916		c->freelist = object;
1917		stat(s, FREE_FASTPATH);
1918	} else
1919		__slab_free(s, page, x, addr);
1920
1921	local_irq_restore(flags);
1922}
1923
1924void kmem_cache_free(struct kmem_cache *s, void *x)
1925{
1926	struct page *page;
1927
1928	page = virt_to_head_page(x);
1929
1930	slab_free(s, page, x, _RET_IP_);
1931
1932	trace_kmem_cache_free(_RET_IP_, x);
1933}
1934EXPORT_SYMBOL(kmem_cache_free);
1935
1936/* Figure out on which slab page the object resides */
1937static struct page *get_object_page(const void *x)
1938{
1939	struct page *page = virt_to_head_page(x);
1940
1941	if (!PageSlab(page))
1942		return NULL;
1943
1944	return page;
1945}
1946
1947/*
1948 * Object placement in a slab is made very easy because we always start at
1949 * offset 0. If we tune the size of the object to the alignment then we can
1950 * get the required alignment by putting one properly sized object after
1951 * another.
1952 *
1953 * Notice that the allocation order determines the sizes of the per cpu
1954 * caches. Each processor always has one slab available for allocations.
1955 * Increasing the allocation order reduces the number of times that slabs
1956 * must be moved on and off the partial lists and is therefore a factor in
1957 * locking overhead.
1958 */
1959
1960/*
1961 * Minimum / Maximum order of slab pages. This influences locking overhead
1962 * and slab fragmentation. A higher order reduces the number of partial slabs
1963 * and increases the number of allocations possible without having to
1964 * take the list_lock.
1965 */
1966static int slub_min_order;
1967static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
1968static int slub_min_objects;
1969
1970/*
1971 * Merge control. If this is set then no merging of slab caches will occur.
1972 * (Could be removed. This was introduced to pacify the merge skeptics.)
1973 */
1974static int slub_nomerge;
1975
1976/*
1977 * Calculate the order of allocation given a slab object size.
1978 *
1979 * The order of allocation has significant impact on performance and other
1980 * system components. Generally order 0 allocations should be preferred since
1981 * order 0 does not cause fragmentation in the page allocator. Larger objects
1982 * be problematic to put into order 0 slabs because there may be too much
1983 * can be problematic to put into order 0 slabs because there may be too much
1984 * would be wasted.
1985 *
1986 * In order to reach satisfactory performance we must ensure that a minimum
1987 * number of objects is in one slab. Otherwise we may generate too much
1988 * activity on the partial lists which requires taking the list_lock. This is
1989 * less of a concern for large slabs though, which are rarely used.
1990 *
1991 * slub_max_order specifies the order at which we stop considering the
1992 * number of objects in a slab as critical. If we reach slub_max_order then
1993 * we try to keep the page order as low as possible. So we accept more waste
1994 * of space in favor of a small page order.
1995 *
1996 * Higher order allocations also allow the placement of more objects in a
1997 * slab and thereby reduce object handling overhead. If the user has
1998 * requested a higher minimum order then we start with that one instead of
1999 * the smallest order which will fit the object.
2000 */
2001static inline int slab_order(int size, int min_objects,
2002				int max_order, int fract_leftover)
2003{
2004	int order;
2005	int rem;
2006	int min_order = slub_min_order;
2007
2008	if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE)
2009		return get_order(size * MAX_OBJS_PER_PAGE) - 1;
2010
2011	for (order = max(min_order,
2012				fls(min_objects * size - 1) - PAGE_SHIFT);
2013			order <= max_order; order++) {
2014
2015		unsigned long slab_size = PAGE_SIZE << order;
2016
2017		if (slab_size < min_objects * size)
2018			continue;
2019
2020		rem = slab_size % size;
2021
2022		if (rem <= slab_size / fract_leftover)
2023			break;
2024
2025	}
2026
2027	return order;
2028}
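
/*
 * Editorial illustration, not part of the original source: a worked trace
 * of slab_order(), assuming PAGE_SIZE = 4096 (PAGE_SHIFT = 12) and
 * slub_min_order = 0.
 *
 *   slab_order(size = 700, min_objects = 8, max_order = 3, fract_leftover = 16)
 *     starting order = max(0, fls(8 * 700 - 1) - 12) = max(0, 13 - 12) = 1
 *     order 1: slab_size = 8192 >= 8 * 700 = 5600
 *              rem = 8192 % 700 = 492 <= 8192 / 16 = 512, so order 1 is taken
 */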
2029
2030static inline int calculate_order(int size)
2031{
2032	int order;
2033	int min_objects;
2034	int fraction;
2035	int max_objects;
2036
2037	/*
2038	 * Attempt to find best configuration for a slab. This
2039	 * works by first attempting to generate a layout with
2040	 * the best configuration and backing off gradually.
2041	 *
2042	 * First we reduce the acceptable waste in a slab. Then
2043	 * we reduce the minimum objects required in a slab.
2044	 */
2045	min_objects = slub_min_objects;
2046	if (!min_objects)
2047		min_objects = 4 * (fls(nr_cpu_ids) + 1);
2048	max_objects = (PAGE_SIZE << slub_max_order)/size;
2049	min_objects = min(min_objects, max_objects);
2050
2051	while (min_objects > 1) {
2052		fraction = 16;
2053		while (fraction >= 4) {
2054			order = slab_order(size, min_objects,
2055						slub_max_order, fraction);
2056			if (order <= slub_max_order)
2057				return order;
2058			fraction /= 2;
2059		}
2060		min_objects--;
2061	}
2062
2063	/*
2064	 * We were unable to place multiple objects in a slab. Now
2065	 * let's see if we can place a single object there.
2066	 */
2067	order = slab_order(size, 1, slub_max_order, 1);
2068	if (order <= slub_max_order)
2069		return order;
2070
2071	/*
2072	 * Doh this slab cannot be placed using slub_max_order.
2073	 */
2074	order = slab_order(size, 1, MAX_ORDER, 1);
2075	if (order < MAX_ORDER)
2076		return order;
2077	return -ENOSYS;
2078}
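
/*
 * Editorial sketch, not part of the original source, of the back-off in
 * calculate_order(), assuming 4K pages and 8 possible CPUs:
 *
 *   min_objects starts at 4 * (fls(8) + 1) = 20, clamped to the number of
 *   objects that fit into a slab of slub_max_order. slab_order() is then
 *   tried with an acceptable waste of 1/16, 1/8 and 1/4 of the slab; only
 *   if none of those fit within slub_max_order is min_objects decremented,
 *   and as a last resort a single object slab up to MAX_ORDER is used.
 */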
2079
2080/*
2081 * Figure out what the alignment of the objects will be.
2082 */
2083static unsigned long calculate_alignment(unsigned long flags,
2084		unsigned long align, unsigned long size)
2085{
2086	/*
2087	 * If the user wants hardware cache aligned objects then follow that
2088	 * suggestion if the object is sufficiently large.
2089	 *
2090	 * The hardware cache alignment cannot override the specified
2091	 * alignment though. If that is greater then use it.
2092	 */
2093	if (flags & SLAB_HWCACHE_ALIGN) {
2094		unsigned long ralign = cache_line_size();
2095		while (size <= ralign / 2)
2096			ralign /= 2;
2097		align = max(align, ralign);
2098	}
2099
2100	if (align < ARCH_SLAB_MINALIGN)
2101		align = ARCH_SLAB_MINALIGN;
2102
2103	return ALIGN(align, sizeof(void *));
2104}
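
/*
 * Editorial example, not part of the original source, assuming a 64 byte
 * cache line and ARCH_SLAB_MINALIGN <= 8 on a 64 bit build:
 *
 *   calculate_alignment(SLAB_HWCACHE_ALIGN, align = 8, size = 20)
 *     ralign = 64; 20 <= 32 so ralign = 32; 20 <= 16 is false
 *     align = max(8, 32) = 32, return ALIGN(32, sizeof(void *)) = 32
 *
 * i.e. small objects are not padded out to a full cache line.
 */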
2105
2106static void
2107init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2108{
2109	n->nr_partial = 0;
2110	spin_lock_init(&n->list_lock);
2111	INIT_LIST_HEAD(&n->partial);
2112#ifdef CONFIG_SLUB_DEBUG
2113	atomic_long_set(&n->nr_slabs, 0);
2114	atomic_long_set(&n->total_objects, 0);
2115	INIT_LIST_HEAD(&n->full);
2116#endif
2117}
2118
2119static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
2120{
2121	BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
2122			SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
2123
2124	s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
2125
2126	return s->cpu_slab != NULL;
2127}
2128
2129static struct kmem_cache *kmem_cache_node;
2130
2131/*
2132 * No kmalloc_node yet so do it by hand. We know that this is the first
2133 * slab on the node for this slabcache. There are no concurrent accesses
2134 * possible.
2135 *
2136 * Note that this function only works on the kmem_cache_node cache
2137 * when allocating for the kmem_cache_node cache. This is used for bootstrapping
2138 * memory on a fresh node that has no slab structures yet.
2139 */
2140static void early_kmem_cache_node_alloc(int node)
2141{
2142	struct page *page;
2143	struct kmem_cache_node *n;
2144	unsigned long flags;
2145
2146	BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
2147
2148	page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
2149
2150	BUG_ON(!page);
2151	if (page_to_nid(page) != node) {
2152		printk(KERN_ERR "SLUB: Unable to allocate memory from "
2153				"node %d\n", node);
2154		printk(KERN_ERR "SLUB: Allocating a useless per node structure "
2155				"in order to be able to continue\n");
2156	}
2157
2158	n = page->freelist;
2159	BUG_ON(!n);
2160	page->freelist = get_freepointer(kmem_cache_node, n);
2161	page->inuse++;
2162	kmem_cache_node->node[node] = n;
2163#ifdef CONFIG_SLUB_DEBUG
2164	init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
2165	init_tracking(kmem_cache_node, n);
2166#endif
2167	init_kmem_cache_node(n, kmem_cache_node);
2168	inc_slabs_node(kmem_cache_node, node, page->objects);
2169
2170	/*
2171	 * lockdep requires consistent irq usage for each lock
2172	 * so even though there cannot be a race this early in
2173	 * the boot sequence, we still disable irqs.
2174	 */
2175	local_irq_save(flags);
2176	add_partial(n, page, 0);
2177	local_irq_restore(flags);
2178}
2179
2180static void free_kmem_cache_nodes(struct kmem_cache *s)
2181{
2182	int node;
2183
2184	for_each_node_state(node, N_NORMAL_MEMORY) {
2185		struct kmem_cache_node *n = s->node[node];
2186
2187		if (n)
2188			kmem_cache_free(kmem_cache_node, n);
2189
2190		s->node[node] = NULL;
2191	}
2192}
2193
2194static int init_kmem_cache_nodes(struct kmem_cache *s)
2195{
2196	int node;
2197
2198	for_each_node_state(node, N_NORMAL_MEMORY) {
2199		struct kmem_cache_node *n;
2200
2201		if (slab_state == DOWN) {
2202			early_kmem_cache_node_alloc(node);
2203			continue;
2204		}
2205		n = kmem_cache_alloc_node(kmem_cache_node,
2206						GFP_KERNEL, node);
2207
2208		if (!n) {
2209			free_kmem_cache_nodes(s);
2210			return 0;
2211		}
2212
2213		s->node[node] = n;
2214		init_kmem_cache_node(n, s);
2215	}
2216	return 1;
2217}
2218
2219static void set_min_partial(struct kmem_cache *s, unsigned long min)
2220{
2221	if (min < MIN_PARTIAL)
2222		min = MIN_PARTIAL;
2223	else if (min > MAX_PARTIAL)
2224		min = MAX_PARTIAL;
2225	s->min_partial = min;
2226}
2227
2228/*
2229 * calculate_sizes() determines the order and the distribution of data within
2230 * a slab object.
2231 */
2232static int calculate_sizes(struct kmem_cache *s, int forced_order)
2233{
2234	unsigned long flags = s->flags;
2235	unsigned long size = s->objsize;
2236	unsigned long align = s->align;
2237	int order;
2238
2239	/*
2240	 * Round up object size to the next word boundary. We can only
2241	 * place the free pointer at word boundaries and this determines
2242	 * the possible location of the free pointer.
2243	 */
2244	size = ALIGN(size, sizeof(void *));
2245
2246#ifdef CONFIG_SLUB_DEBUG
2247	/*
2248	 * Determine if we can poison the object itself. If the user of
2249	 * the slab may touch the object after free or before allocation
2250	 * then we should never poison the object itself.
2251	 */
2252	if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
2253			!s->ctor)
2254		s->flags |= __OBJECT_POISON;
2255	else
2256		s->flags &= ~__OBJECT_POISON;
2257
2258
2259	/*
2260	 * If we are Redzoning then check if there is some space between the
2261	 * end of the object and the free pointer. If not then add an
2262	 * additional word to have some bytes to store Redzone information.
2263	 */
2264	if ((flags & SLAB_RED_ZONE) && size == s->objsize)
2265		size += sizeof(void *);
2266#endif
2267
2268	/*
2269	 * With that we have determined the number of bytes in actual use
2270	 * by the object. This is the potential offset to the free pointer.
2271	 */
2272	s->inuse = size;
2273
2274	if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
2275		s->ctor)) {
2276		/*
2277		 * Relocate free pointer after the object if it is not
2278		 * permitted to overwrite the first word of the object on
2279		 * kmem_cache_free.
2280		 *
2281		 * This is the case if we do RCU, have a constructor or
2282		 * destructor or are poisoning the objects.
2283		 */
2284		s->offset = size;
2285		size += sizeof(void *);
2286	}
2287
2288#ifdef CONFIG_SLUB_DEBUG
2289	if (flags & SLAB_STORE_USER)
2290		/*
2291		 * Need to store information about allocs and frees after
2292		 * the object.
2293		 */
2294		size += 2 * sizeof(struct track);
2295
2296	if (flags & SLAB_RED_ZONE)
2297		/*
2298		 * Add some empty padding so that we can catch
2299		 * overwrites from earlier objects rather than let
2300		 * tracking information or the free pointer be
2301		 * corrupted if a user writes before the start
2302		 * of the object.
2303		 */
2304		size += sizeof(void *);
2305#endif
2306
2307	/*
2308	 * Determine the alignment based on various parameters that the
2309	 * user specified and the dynamic determination of cache line size
2310	 * on bootup.
2311	 */
2312	align = calculate_alignment(flags, align, s->objsize);
2313	s->align = align;
2314
2315	/*
2316	 * SLUB stores one object immediately after another beginning from
2317	 * offset 0. In order to align the objects we have to simply size
2318	 * each object to conform to the alignment.
2319	 */
2320	size = ALIGN(size, align);
2321	s->size = size;
2322	if (forced_order >= 0)
2323		order = forced_order;
2324	else
2325		order = calculate_order(size);
2326
2327	if (order < 0)
2328		return 0;
2329
2330	s->allocflags = 0;
2331	if (order)
2332		s->allocflags |= __GFP_COMP;
2333
2334	if (s->flags & SLAB_CACHE_DMA)
2335		s->allocflags |= SLUB_DMA;
2336
2337	if (s->flags & SLAB_RECLAIM_ACCOUNT)
2338		s->allocflags |= __GFP_RECLAIMABLE;
2339
2340	/*
2341	 * Determine the number of objects per slab
2342	 */
2343	s->oo = oo_make(order, size);
2344	s->min = oo_make(get_order(size), size);
2345	if (oo_objects(s->oo) > oo_objects(s->max))
2346		s->max = s->oo;
2347
2348	return !!oo_objects(s->oo);
2349
2350}
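
/*
 * Editorial sketch, not part of the original source, of the layout that
 * calculate_sizes() produces for a debug-enabled cache (poisoning, red
 * zoning and user tracking on) with a word aligned objsize, no ctor and
 * no SLAB_DESTROY_BY_RCU:
 *
 *   offset 0      object (objsize bytes, poisoned while free)
 *   objsize       red zone word                   <- s->inuse ends here
 *   s->offset     free pointer (relocated because the object is poisoned)
 *   then          two struct track (alloc/free)   with SLAB_STORE_USER
 *   then          one padding word                with SLAB_RED_ZONE
 *
 * and the total is rounded up to s->align to give s->size.
 */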
2351
2352static int kmem_cache_open(struct kmem_cache *s,
2353		const char *name, size_t size,
2354		size_t align, unsigned long flags,
2355		void (*ctor)(void *))
2356{
2357	memset(s, 0, kmem_size);
2358	s->name = name;
2359	s->ctor = ctor;
2360	s->objsize = size;
2361	s->align = align;
2362	s->flags = kmem_cache_flags(size, flags, name, ctor);
2363
2364	if (!calculate_sizes(s, -1))
2365		goto error;
2366	if (disable_higher_order_debug) {
2367		/*
2368		 * Disable debugging flags that store metadata if the min slab
2369		 * order increased.
2370		 */
2371		if (get_order(s->size) > get_order(s->objsize)) {
2372			s->flags &= ~DEBUG_METADATA_FLAGS;
2373			s->offset = 0;
2374			if (!calculate_sizes(s, -1))
2375				goto error;
2376		}
2377	}
2378
2379	/*
2380	 * The larger the object size is, the more pages we want on the partial
2381	 * list to avoid pounding the page allocator excessively.
2382	 */
2383	set_min_partial(s, ilog2(s->size));
2384	s->refcount = 1;
2385#ifdef CONFIG_NUMA
2386	s->remote_node_defrag_ratio = 1000;
2387#endif
2388	if (!init_kmem_cache_nodes(s))
2389		goto error;
2390
2391	if (alloc_kmem_cache_cpus(s))
2392		return 1;
2393
2394	free_kmem_cache_nodes(s);
2395error:
2396	if (flags & SLAB_PANIC)
2397		panic("Cannot create slab %s size=%lu realsize=%u "
2398			"order=%u offset=%u flags=%lx\n",
2399			s->name, (unsigned long)size, s->size, oo_order(s->oo),
2400			s->offset, flags);
2401	return 0;
2402}
2403
2404/*
2405 * Check if a given pointer is valid
2406 */
2407int kmem_ptr_validate(struct kmem_cache *s, const void *object)
2408{
2409	struct page *page;
2410
2411	if (!kern_ptr_validate(object, s->size))
2412		return 0;
2413
2414	page = get_object_page(object);
2415
2416	if (!page || s != page->slab)
2417		/* No slab or wrong slab */
2418		return 0;
2419
2420	if (!check_valid_pointer(s, page, object))
2421		return 0;
2422
2423	/*
2424	 * We could also check if the object is on the slab's freelist.
2425	 * But this would be too expensive and it seems that the main
2426	 * purpose of kmem_ptr_validate() is to check if the object belongs
2427	 * to a certain slab.
2428	 */
2429	return 1;
2430}
2431EXPORT_SYMBOL(kmem_ptr_validate);
2432
2433/*
2434 * Determine the size of a slab object
2435 */
2436unsigned int kmem_cache_size(struct kmem_cache *s)
2437{
2438	return s->objsize;
2439}
2440EXPORT_SYMBOL(kmem_cache_size);
2441
2442const char *kmem_cache_name(struct kmem_cache *s)
2443{
2444	return s->name;
2445}
2446EXPORT_SYMBOL(kmem_cache_name);
2447
2448static void list_slab_objects(struct kmem_cache *s, struct page *page,
2449							const char *text)
2450{
2451#ifdef CONFIG_SLUB_DEBUG
2452	void *addr = page_address(page);
2453	void *p;
2454	unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
2455				     sizeof(long), GFP_ATOMIC);
2456	if (!map)
2457		return;
2458	slab_err(s, page, "%s", text);
2459	slab_lock(page);
2460	for_each_free_object(p, s, page->freelist)
2461		set_bit(slab_index(p, s, addr), map);
2462
2463	for_each_object(p, s, addr, page->objects) {
2464
2465		if (!test_bit(slab_index(p, s, addr), map)) {
2466			printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n",
2467							p, p - addr);
2468			print_tracking(s, p);
2469		}
2470	}
2471	slab_unlock(page);
2472	kfree(map);
2473#endif
2474}
2475
2476/*
2477 * Attempt to free all partial slabs on a node.
2478 */
2479static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
2480{
2481	unsigned long flags;
2482	struct page *page, *h;
2483
2484	spin_lock_irqsave(&n->list_lock, flags);
2485	list_for_each_entry_safe(page, h, &n->partial, lru) {
2486		if (!page->inuse) {
2487			__remove_partial(n, page);
2488			discard_slab(s, page);
2489		} else {
2490			list_slab_objects(s, page,
2491				"Objects remaining on kmem_cache_close()");
2492		}
2493	}
2494	spin_unlock_irqrestore(&n->list_lock, flags);
2495}
2496
2497/*
2498 * Release all resources used by a slab cache.
2499 */
2500static inline int kmem_cache_close(struct kmem_cache *s)
2501{
2502	int node;
2503
2504	flush_all(s);
2505	free_percpu(s->cpu_slab);
2506	/* Attempt to free all objects */
2507	for_each_node_state(node, N_NORMAL_MEMORY) {
2508		struct kmem_cache_node *n = get_node(s, node);
2509
2510		free_partial(s, n);
2511		if (n->nr_partial || slabs_node(s, node))
2512			return 1;
2513	}
2514	free_kmem_cache_nodes(s);
2515	return 0;
2516}
2517
2518/*
2519 * Close a cache and release the kmem_cache structure
2520 * (must be used for caches created using kmem_cache_create)
2521 */
2522void kmem_cache_destroy(struct kmem_cache *s)
2523{
2524	down_write(&slub_lock);
2525	s->refcount--;
2526	if (!s->refcount) {
2527		list_del(&s->list);
2528		if (kmem_cache_close(s)) {
2529			printk(KERN_ERR "SLUB %s: %s called for cache that "
2530				"still has objects.\n", s->name, __func__);
2531			dump_stack();
2532		}
2533		if (s->flags & SLAB_DESTROY_BY_RCU)
2534			rcu_barrier();
2535		sysfs_slab_remove(s);
2536	}
2537	up_write(&slub_lock);
2538}
2539EXPORT_SYMBOL(kmem_cache_destroy);
2540
2541/********************************************************************
2542 *		Kmalloc subsystem
2543 *******************************************************************/
2544
2545struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT];
2546EXPORT_SYMBOL(kmalloc_caches);
2547
2548static struct kmem_cache *kmem_cache;
2549
2550#ifdef CONFIG_ZONE_DMA
2551static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT];
2552#endif
2553
2554static int __init setup_slub_min_order(char *str)
2555{
2556	get_option(&str, &slub_min_order);
2557
2558	return 1;
2559}
2560
2561__setup("slub_min_order=", setup_slub_min_order);
2562
2563static int __init setup_slub_max_order(char *str)
2564{
2565	get_option(&str, &slub_max_order);
2566	slub_max_order = min(slub_max_order, MAX_ORDER - 1);
2567
2568	return 1;
2569}
2570
2571__setup("slub_max_order=", setup_slub_max_order);
2572
2573static int __init setup_slub_min_objects(char *str)
2574{
2575	get_option(&str, &slub_min_objects);
2576
2577	return 1;
2578}
2579
2580__setup("slub_min_objects=", setup_slub_min_objects);
2581
2582static int __init setup_slub_nomerge(char *str)
2583{
2584	slub_nomerge = 1;
2585	return 1;
2586}
2587
2588__setup("slub_nomerge", setup_slub_nomerge);
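
/*
 * Editorial usage note, not part of the original source: the four knobs
 * above are kernel command line parameters, e.g.
 *
 *	slub_min_order=1 slub_max_order=3 slub_min_objects=16 slub_nomerge
 *
 * raises the slab page size, caps it at order 3, asks for at least 16
 * objects per slab and disables cache merging for the whole boot.
 */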
2589
2590static struct kmem_cache *__init create_kmalloc_cache(const char *name,
2591						int size, unsigned int flags)
2592{
2593	struct kmem_cache *s;
2594
2595	s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
2596
2597	/*
2598	 * This function is called with IRQs disabled during early-boot on
2599	 * single CPU so there's no need to take slub_lock here.
2600	 */
2601	if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
2602								flags, NULL))
2603		goto panic;
2604
2605	list_add(&s->list, &slab_caches);
2606	return s;
2607
2608panic:
2609	panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
2610	return NULL;
2611}
2612
2613/*
2614 * Conversion table for small slab sizes / 8 to the index in the
2615 * kmalloc array. This is necessary for slabs < 192 since we have non power
2616 * of two cache sizes there. The size of larger slabs can be determined using
2617 * fls.
2618 */
2619static s8 size_index[24] = {
2620	3,	/* 8 */
2621	4,	/* 16 */
2622	5,	/* 24 */
2623	5,	/* 32 */
2624	6,	/* 40 */
2625	6,	/* 48 */
2626	6,	/* 56 */
2627	6,	/* 64 */
2628	1,	/* 72 */
2629	1,	/* 80 */
2630	1,	/* 88 */
2631	1,	/* 96 */
2632	7,	/* 104 */
2633	7,	/* 112 */
2634	7,	/* 120 */
2635	7,	/* 128 */
2636	2,	/* 136 */
2637	2,	/* 144 */
2638	2,	/* 152 */
2639	2,	/* 160 */
2640	2,	/* 168 */
2641	2,	/* 176 */
2642	2,	/* 184 */
2643	2	/* 192 */
2644};
2645
2646static inline int size_index_elem(size_t bytes)
2647{
2648	return (bytes - 1) / 8;
2649}
2650
2651static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2652{
2653	int index;
2654
2655	if (size <= 192) {
2656		if (!size)
2657			return ZERO_SIZE_PTR;
2658
2659		index = size_index[size_index_elem(size)];
2660	} else
2661		index = fls(size - 1);
2662
2663#ifdef CONFIG_ZONE_DMA
2664	if (unlikely((flags & SLUB_DMA)))
2665		return kmalloc_dma_caches[index];
2666
2667#endif
2668	return kmalloc_caches[index];
2669}
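
/*
 * Editorial example, not part of the original source, of the size to
 * cache mapping above, assuming KMALLOC_MIN_SIZE = 8 so that size_index
 * is unpatched:
 *
 *   kmalloc(100): size_index_elem(100) = 99 / 8 = 12, size_index[12] = 7,
 *                 so kmalloc_caches[7] (the 128 byte cache) is used
 *   kmalloc(300): 300 > 192, fls(299) = 9, so kmalloc_caches[9]
 *                 (the 512 byte cache) is used
 */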
2670
2671void *__kmalloc(size_t size, gfp_t flags)
2672{
2673	struct kmem_cache *s;
2674	void *ret;
2675
2676	if (unlikely(size > SLUB_MAX_SIZE))
2677		return kmalloc_large(size, flags);
2678
2679	s = get_slab(size, flags);
2680
2681	if (unlikely(ZERO_OR_NULL_PTR(s)))
2682		return s;
2683
2684	ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_);
2685
2686	trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
2687
2688	return ret;
2689}
2690EXPORT_SYMBOL(__kmalloc);
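
/*
 * Editorial note, not part of the original source: requests larger than
 * SLUB_MAX_SIZE never reach a kmem_cache at all. kmalloc_large() takes
 * compound pages straight from the page allocator, which is why kfree()
 * further down has a !PageSlab() branch that ends in put_page().
 */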
2691
2692#ifdef CONFIG_NUMA
2693static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
2694{
2695	struct page *page;
2696	void *ptr = NULL;
2697
2698	flags |= __GFP_COMP | __GFP_NOTRACK;
2699	page = alloc_pages_node(node, flags, get_order(size));
2700	if (page)
2701		ptr = page_address(page);
2702
2703	kmemleak_alloc(ptr, size, 1, flags);
2704	return ptr;
2705}
2706
2707void *__kmalloc_node(size_t size, gfp_t flags, int node)
2708{
2709	struct kmem_cache *s;
2710	void *ret;
2711
2712	if (unlikely(size > SLUB_MAX_SIZE)) {
2713		ret = kmalloc_large_node(size, flags, node);
2714
2715		trace_kmalloc_node(_RET_IP_, ret,
2716				   size, PAGE_SIZE << get_order(size),
2717				   flags, node);
2718
2719		return ret;
2720	}
2721
2722	s = get_slab(size, flags);
2723
2724	if (unlikely(ZERO_OR_NULL_PTR(s)))
2725		return s;
2726
2727	ret = slab_alloc(s, flags, node, _RET_IP_);
2728
2729	trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
2730
2731	return ret;
2732}
2733EXPORT_SYMBOL(__kmalloc_node);
2734#endif
2735
2736size_t ksize(const void *object)
2737{
2738	struct page *page;
2739	struct kmem_cache *s;
2740
2741	if (unlikely(object == ZERO_SIZE_PTR))
2742		return 0;
2743
2744	page = virt_to_head_page(object);
2745
2746	if (unlikely(!PageSlab(page))) {
2747		WARN_ON(!PageCompound(page));
2748		return PAGE_SIZE << compound_order(page);
2749	}
2750	s = page->slab;
2751
2752#ifdef CONFIG_SLUB_DEBUG
2753	/*
2754	 * Debugging requires use of the padding between object
2755	 * and whatever may come after it.
2756	 */
2757	if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
2758		return s->objsize;
2759
2760#endif
2761	/*
2762	 * If we have the need to store the freelist pointer
2763	 * back there or track user information then we can
2764	 * only use the space before that information.
2765	 */
2766	if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
2767		return s->inuse;
2768	/*
2769	 * Else we can use all the padding etc for the allocation
2770	 */
2771	return s->size;
2772}
2773EXPORT_SYMBOL(ksize);
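
/*
 * Editorial example, not part of the original source: with no debug flags
 * set, ksize(kmalloc(100, GFP_KERNEL)) reports 128, the usable size of a
 * kmalloc-128 object, rather than the 100 bytes requested, so callers may
 * legitimately use the whole returned size.
 */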
2774
2775void kfree(const void *x)
2776{
2777	struct page *page;
2778	void *object = (void *)x;
2779
2780	trace_kfree(_RET_IP_, x);
2781
2782	if (unlikely(ZERO_OR_NULL_PTR(x)))
2783		return;
2784
2785	page = virt_to_head_page(x);
2786	if (unlikely(!PageSlab(page))) {
2787		BUG_ON(!PageCompound(page));
2788		kmemleak_free(x);
2789		put_page(page);
2790		return;
2791	}
2792	slab_free(page->slab, page, object, _RET_IP_);
2793}
2794EXPORT_SYMBOL(kfree);
2795
2796/*
2797 * kmem_cache_shrink removes empty slabs from the partial lists and sorts
2798 * the remaining slabs by the number of items in use. The slabs with the
2799 * most items in use come first. New allocations will then fill those up
2800 * and thus they can be removed from the partial lists.
2801 *
2802 * The slabs with the least items are placed last. This results in them
2803 * being allocated from last, increasing the chance that the objects
2804 * remaining in them are freed.
2805 */
2806int kmem_cache_shrink(struct kmem_cache *s)
2807{
2808	int node;
2809	int i;
2810	struct kmem_cache_node *n;
2811	struct page *page;
2812	struct page *t;
2813	int objects = oo_objects(s->max);
2814	struct list_head *slabs_by_inuse =
2815		kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
2816	unsigned long flags;
2817
2818	if (!slabs_by_inuse)
2819		return -ENOMEM;
2820
2821	flush_all(s);
2822	for_each_node_state(node, N_NORMAL_MEMORY) {
2823		n = get_node(s, node);
2824
2825		if (!n->nr_partial)
2826			continue;
2827
2828		for (i = 0; i < objects; i++)
2829			INIT_LIST_HEAD(slabs_by_inuse + i);
2830
2831		spin_lock_irqsave(&n->list_lock, flags);
2832
2833		/*
2834		 * Build lists indexed by the items in use in each slab.
2835		 *
2836		 * Note that concurrent frees may occur while we hold the
2837		 * list_lock. page->inuse here is the upper limit.
2838		 */
2839		list_for_each_entry_safe(page, t, &n->partial, lru) {
2840			if (!page->inuse && slab_trylock(page)) {
2841				/*
2842				 * Must hold slab lock here because slab_free
2843				 * may have freed the last object and be
2844				 * waiting to release the slab.
2845				 */
2846				__remove_partial(n, page);
2847				slab_unlock(page);
2848				discard_slab(s, page);
2849			} else {
2850				list_move(&page->lru,
2851					slabs_by_inuse + page->inuse);
2852			}
2853		}
2854
2855		/*
2856		 * Rebuild the partial list with the slabs filled up most
2857		 * first and the least used slabs at the end.
2858		 */
2859		for (i = objects - 1; i >= 0; i--)
2860			list_splice(slabs_by_inuse + i, n->partial.prev);
2861
2862		spin_unlock_irqrestore(&n->list_lock, flags);
2863	}
2864
2865	kfree(slabs_by_inuse);
2866	return 0;
2867}
2868EXPORT_SYMBOL(kmem_cache_shrink);
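
/*
 * Editorial illustration, not part of the original source: with 4 objects
 * per slab, partial slabs with inuse counts {3, 1, 0, 2} are bucketed so
 * that the empty slab is discarded (assuming its slab lock is free) and
 * the partial list is rebuilt as 3, 2, 1 - fullest first. New allocations
 * therefore top up the fullest slabs while the emptiest keep their chance
 * of draining completely.
 */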
2869
2870#if defined(CONFIG_MEMORY_HOTPLUG)
2871static int slab_mem_going_offline_callback(void *arg)
2872{
2873	struct kmem_cache *s;
2874
2875	down_read(&slub_lock);
2876	list_for_each_entry(s, &slab_caches, list)
2877		kmem_cache_shrink(s);
2878	up_read(&slub_lock);
2879
2880	return 0;
2881}
2882
2883static void slab_mem_offline_callback(void *arg)
2884{
2885	struct kmem_cache_node *n;
2886	struct kmem_cache *s;
2887	struct memory_notify *marg = arg;
2888	int offline_node;
2889
2890	offline_node = marg->status_change_nid;
2891
2892	/*
2893	 * If the node still has available memory then we still need the
2894	 * kmem_cache_node structure for it. Nothing to do here.
2895	 */
2896	if (offline_node < 0)
2897		return;
2898
2899	down_read(&slub_lock);
2900	list_for_each_entry(s, &slab_caches, list) {
2901		n = get_node(s, offline_node);
2902		if (n) {
2903			/*
2904			 * if n->nr_slabs > 0, slabs still exist on the node
2905			 * that is going down. We were unable to free them,
2906			 * and offline_pages() function shouldn't call this
2907			 * callback. So, we must fail.
2908			 */
2909			BUG_ON(slabs_node(s, offline_node));
2910
2911			s->node[offline_node] = NULL;
2912			kmem_cache_free(kmem_cache_node, n);
2913		}
2914	}
2915	up_read(&slub_lock);
2916}
2917
2918static int slab_mem_going_online_callback(void *arg)
2919{
2920	struct kmem_cache_node *n;
2921	struct kmem_cache *s;
2922	struct memory_notify *marg = arg;
2923	int nid = marg->status_change_nid;
2924	int ret = 0;
2925
2926	/*
2927	 * If the node's memory is already available, then kmem_cache_node is
2928	 * already created. Nothing to do.
2929	 */
2930	if (nid < 0)
2931		return 0;
2932
2933	/*
2934	 * We are bringing a node online. No memory is available yet. We must
2935	 * allocate a kmem_cache_node structure in order to bring the node
2936	 * online.
2937	 */
2938	down_read(&slub_lock);
2939	list_for_each_entry(s, &slab_caches, list) {
2940		/*
2941		 * XXX: kmem_cache_alloc_node will fall back to other nodes
2942		 *      since memory is not yet available from the node that
2943		 *      is brought up.
2944		 */
2945		n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
2946		if (!n) {
2947			ret = -ENOMEM;
2948			goto out;
2949		}
2950		init_kmem_cache_node(n, s);
2951		s->node[nid] = n;
2952	}
2953out:
2954	up_read(&slub_lock);
2955	return ret;
2956}
2957
2958static int slab_memory_callback(struct notifier_block *self,
2959				unsigned long action, void *arg)
2960{
2961	int ret = 0;
2962
2963	switch (action) {
2964	case MEM_GOING_ONLINE:
2965		ret = slab_mem_going_online_callback(arg);
2966		break;
2967	case MEM_GOING_OFFLINE:
2968		ret = slab_mem_going_offline_callback(arg);
2969		break;
2970	case MEM_OFFLINE:
2971	case MEM_CANCEL_ONLINE:
2972		slab_mem_offline_callback(arg);
2973		break;
2974	case MEM_ONLINE:
2975	case MEM_CANCEL_OFFLINE:
2976		break;
2977	}
2978	if (ret)
2979		ret = notifier_from_errno(ret);
2980	else
2981		ret = NOTIFY_OK;
2982	return ret;
2983}
2984
2985#endif /* CONFIG_MEMORY_HOTPLUG */
2986
2987/********************************************************************
2988 *			Basic setup of slabs
2989 *******************************************************************/
2990
2991/*
2992 * Used for early kmem_cache structures that were allocated using
2993 * the page allocator
2994 */
2995
2996static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
2997{
2998	int node;
2999
3000	list_add(&s->list, &slab_caches);
3001	s->refcount = -1;
3002
3003	for_each_node_state(node, N_NORMAL_MEMORY) {
3004		struct kmem_cache_node *n = get_node(s, node);
3005		struct page *p;
3006
3007		if (n) {
3008			list_for_each_entry(p, &n->partial, lru)
3009				p->slab = s;
3010
3011#ifdef CONFIG_SLUB_DEBUG
3012			list_for_each_entry(p, &n->full, lru)
3013				p->slab = s;
3014#endif
3015		}
3016	}
3017}
3018
3019void __init kmem_cache_init(void)
3020{
3021	int i;
3022	int caches = 0;
3023	struct kmem_cache *temp_kmem_cache;
3024	int order;
3025	struct kmem_cache *temp_kmem_cache_node;
3026	unsigned long kmalloc_size;
3027
3028	kmem_size = offsetof(struct kmem_cache, node) +
3029				nr_node_ids * sizeof(struct kmem_cache_node *);
3030
3031	/* Allocate two kmem_caches from the page allocator */
3032	kmalloc_size = ALIGN(kmem_size, cache_line_size());
3033	order = get_order(2 * kmalloc_size);
3034	kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order);
3035
3036	/*
3037	 * Must first have the slab cache available for the allocations of the
3038	 * struct kmem_cache_node's. There is special bootstrap code in
3039	 * kmem_cache_open for slab_state == DOWN.
3040	 */
3041	kmem_cache_node = (void *)kmem_cache + kmalloc_size;
3042
3043	kmem_cache_open(kmem_cache_node, "kmem_cache_node",
3044		sizeof(struct kmem_cache_node),
3045		0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
3046
3047	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
3048
3049	/* Able to allocate the per node structures */
3050	slab_state = PARTIAL;
3051
3052	temp_kmem_cache = kmem_cache;
3053	kmem_cache_open(kmem_cache, "kmem_cache", kmem_size,
3054		0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
3055	kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
3056	memcpy(kmem_cache, temp_kmem_cache, kmem_size);
3057
3058	/*
3059	 * Allocate kmem_cache_node properly from the kmem_cache slab.
3060	 * kmem_cache_node is separately allocated so no need to
3061	 * update any list pointers.
3062	 */
3063	temp_kmem_cache_node = kmem_cache_node;
3064
3065	kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
3066	memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size);
3067
3068	kmem_cache_bootstrap_fixup(kmem_cache_node);
3069
3070	caches++;
3071	kmem_cache_bootstrap_fixup(kmem_cache);
3072	caches++;
3073	/* Free temporary boot structure */
3074	free_pages((unsigned long)temp_kmem_cache, order);
3075
3076	/* Now we can use the kmem_cache to allocate kmalloc slabs */
3077
3078	/*
3079	 * Patch up the size_index table if we have strange large alignment
3080	 * requirements for the kmalloc array. This is only the case for
3081	 * MIPS it seems. The standard arches will not generate any code here.
3082	 *
3083	 * Largest permitted alignment is 256 bytes due to the way we
3084	 * handle the index determination for the smaller caches.
3085	 *
3086	 * Make sure that nothing crazy happens if someone starts tinkering
3087	 * around with ARCH_KMALLOC_MINALIGN
3088	 */
3089	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
3090		(KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
3091
3092	for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
3093		int elem = size_index_elem(i);
3094		if (elem >= ARRAY_SIZE(size_index))
3095			break;
3096		size_index[elem] = KMALLOC_SHIFT_LOW;
3097	}
3098
3099	if (KMALLOC_MIN_SIZE == 64) {
3100		/*
3101		 * The 96 byte size cache is not used if the alignment
3102		 * is 64 byte.
3103		 */
3104		for (i = 64 + 8; i <= 96; i += 8)
3105			size_index[size_index_elem(i)] = 7;
3106	} else if (KMALLOC_MIN_SIZE == 128) {
3107		/*
3108		 * The 192 byte sized cache is not used if the alignment
3109		 * is 128 byte. Redirect kmalloc to use the 256 byte cache
3110		 * instead.
3111		 */
3112		for (i = 128 + 8; i <= 192; i += 8)
3113			size_index[size_index_elem(i)] = 8;
3114	}
3115
3116	/* Caches that are not of the two-to-the-power-of size */
3117	if (KMALLOC_MIN_SIZE <= 32) {
3118		kmalloc_caches[1] = create_kmalloc_cache("kmalloc-96", 96, 0);
3119		caches++;
3120	}
3121
3122	if (KMALLOC_MIN_SIZE <= 64) {
3123		kmalloc_caches[2] = create_kmalloc_cache("kmalloc-192", 192, 0);
3124		caches++;
3125	}
3126
3127	for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
3128		kmalloc_caches[i] = create_kmalloc_cache("kmalloc", 1 << i, 0);
3129		caches++;
3130	}
3131
3132	slab_state = UP;
3133
3134	/* Provide the correct kmalloc names now that the caches are up */
3135	if (KMALLOC_MIN_SIZE <= 32) {
3136		kmalloc_caches[1]->name = kstrdup(kmalloc_caches[1]->name, GFP_NOWAIT);
3137		BUG_ON(!kmalloc_caches[1]->name);
3138	}
3139
3140	if (KMALLOC_MIN_SIZE <= 64) {
3141		kmalloc_caches[2]->name = kstrdup(kmalloc_caches[2]->name, GFP_NOWAIT);
3142		BUG_ON(!kmalloc_caches[2]->name);
3143	}
3144
3145	for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
3146		char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
3147
3148		BUG_ON(!s);
3149		kmalloc_caches[i]->name = s;
3150	}
3151
3152#ifdef CONFIG_SMP
3153	register_cpu_notifier(&slab_notifier);
3154#endif
3155
3156#ifdef CONFIG_ZONE_DMA
3157	for (i = 0; i < SLUB_PAGE_SHIFT; i++) {
3158		struct kmem_cache *s = kmalloc_caches[i];
3159
3160		if (s && s->size) {
3161			char *name = kasprintf(GFP_NOWAIT,
3162				 "dma-kmalloc-%d", s->objsize);
3163
3164			BUG_ON(!name);
3165			kmalloc_dma_caches[i] = create_kmalloc_cache(name,
3166				s->objsize, SLAB_CACHE_DMA);
3167		}
3168	}
3169#endif
3170	printk(KERN_INFO
3171		"SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
3172		" CPUs=%d, Nodes=%d\n",
3173		caches, cache_line_size(),
3174		slub_min_order, slub_max_order, slub_min_objects,
3175		nr_cpu_ids, nr_node_ids);
3176}
3177
3178void __init kmem_cache_init_late(void)
3179{
3180}
3181
3182/*
3183 * Find a mergeable slab cache
3184 */
3185static int slab_unmergeable(struct kmem_cache *s)
3186{
3187	if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
3188		return 1;
3189
3190	if (s->ctor)
3191		return 1;
3192
3193	/*
3194	 * We may have set a slab to be unmergeable during bootstrap.
3195	 */
3196	if (s->refcount < 0)
3197		return 1;
3198
3199	return 0;
3200}
3201
3202static struct kmem_cache *find_mergeable(size_t size,
3203		size_t align, unsigned long flags, const char *name,
3204		void (*ctor)(void *))
3205{
3206	struct kmem_cache *s;
3207
3208	if (slub_nomerge || (flags & SLUB_NEVER_MERGE))
3209		return NULL;
3210
3211	if (ctor)
3212		return NULL;
3213
3214	size = ALIGN(size, sizeof(void *));
3215	align = calculate_alignment(flags, align, size);
3216	size = ALIGN(size, align);
3217	flags = kmem_cache_flags(size, flags, name, NULL);
3218
3219	list_for_each_entry(s, &slab_caches, list) {
3220		if (slab_unmergeable(s))
3221			continue;
3222
3223		if (size > s->size)
3224			continue;
3225
3226		if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
3227				continue;
3228		/*
3229		 * Check if alignment is compatible.
3230		 * Courtesy of Adrian Drzewiecki
3231		 */
3232		if ((s->size & ~(align - 1)) != s->size)
3233			continue;
3234
3235		if (s->size - size >= sizeof(void *))
3236			continue;
3237
3238		return s;
3239	}
3240	return NULL;
3241}
3242
3243struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3244		size_t align, unsigned long flags, void (*ctor)(void *))
3245{
3246	struct kmem_cache *s;
3247	char *n;
3248
3249	if (WARN_ON(!name))
3250		return NULL;
3251
3252	down_write(&slub_lock);
3253	s = find_mergeable(size, align, flags, name, ctor);
3254	if (s) {
3255		s->refcount++;
3256		/*
3257		 * Adjust the object sizes so that we clear
3258		 * the complete object on kzalloc.
3259		 */
3260		s->objsize = max(s->objsize, (int)size);
3261		s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3262
3263		if (sysfs_slab_alias(s, name)) {
3264			s->refcount--;
3265			goto err;
3266		}
3267		up_write(&slub_lock);
3268		return s;
3269	}
3270
3271	n = kstrdup(name, GFP_KERNEL);
3272	if (!n)
3273		goto err;
3274
3275	s = kmalloc(kmem_size, GFP_KERNEL);
3276	if (s) {
3277		if (kmem_cache_open(s, n,
3278				size, align, flags, ctor)) {
3279			list_add(&s->list, &slab_caches);
3280			if (sysfs_slab_add(s)) {
3281				list_del(&s->list);
3282				kfree(n);
3283				kfree(s);
3284				goto err;
3285			}
3286			up_write(&slub_lock);
3287			return s;
3288		}
3289		kfree(n);
3290		kfree(s);
3291	}
3292	up_write(&slub_lock);
3293
3294err:
3295	if (flags & SLAB_PANIC)
3296		panic("Cannot create slabcache %s\n", name);
3297	else
3298		s = NULL;
3299	return s;
3300}
3301EXPORT_SYMBOL(kmem_cache_create);
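
/*
 * Editorial usage sketch, not part of the original source; "foo" and
 * struct foo are made-up names:
 *
 *	static struct kmem_cache *foo_cache;
 *
 *	foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
 *					SLAB_HWCACHE_ALIGN, NULL);
 *	p = kmem_cache_alloc(foo_cache, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cache, p);
 *	kmem_cache_destroy(foo_cache);
 *
 * Note that such a cache may silently be merged with an existing cache of
 * compatible size and flags unless slub_nomerge is given.
 */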
3302
3303#ifdef CONFIG_SMP
3304/*
3305 * Use the cpu notifier to ensure that the cpu slabs are flushed when
3306 * necessary.
3307 */
3308static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
3309		unsigned long action, void *hcpu)
3310{
3311	long cpu = (long)hcpu;
3312	struct kmem_cache *s;
3313	unsigned long flags;
3314
3315	switch (action) {
3316	case CPU_UP_CANCELED:
3317	case CPU_UP_CANCELED_FROZEN:
3318	case CPU_DEAD:
3319	case CPU_DEAD_FROZEN:
3320		down_read(&slub_lock);
3321		list_for_each_entry(s, &slab_caches, list) {
3322			local_irq_save(flags);
3323			__flush_cpu_slab(s, cpu);
3324			local_irq_restore(flags);
3325		}
3326		up_read(&slub_lock);
3327		break;
3328	default:
3329		break;
3330	}
3331	return NOTIFY_OK;
3332}
3333
3334static struct notifier_block __cpuinitdata slab_notifier = {
3335	.notifier_call = slab_cpuup_callback
3336};
3337
3338#endif
3339
3340void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
3341{
3342	struct kmem_cache *s;
3343	void *ret;
3344
3345	if (unlikely(size > SLUB_MAX_SIZE))
3346		return kmalloc_large(size, gfpflags);
3347
3348	s = get_slab(size, gfpflags);
3349
3350	if (unlikely(ZERO_OR_NULL_PTR(s)))
3351		return s;
3352
3353	ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller);
3354
3355	/* Honor the call site pointer we received. */
3356	trace_kmalloc(caller, ret, size, s->size, gfpflags);
3357
3358	return ret;
3359}
3360
3361#ifdef CONFIG_NUMA
3362void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3363					int node, unsigned long caller)
3364{
3365	struct kmem_cache *s;
3366	void *ret;
3367
3368	if (unlikely(size > SLUB_MAX_SIZE)) {
3369		ret = kmalloc_large_node(size, gfpflags, node);
3370
3371		trace_kmalloc_node(caller, ret,
3372				   size, PAGE_SIZE << get_order(size),
3373				   gfpflags, node);
3374
3375		return ret;
3376	}
3377
3378	s = get_slab(size, gfpflags);
3379
3380	if (unlikely(ZERO_OR_NULL_PTR(s)))
3381		return s;
3382
3383	ret = slab_alloc(s, gfpflags, node, caller);
3384
3385	/* Honor the call site pointer we received. */
3386	trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
3387
3388	return ret;
3389}
3390#endif
3391
3392#ifdef CONFIG_SYSFS
3393static int count_inuse(struct page *page)
3394{
3395	return page->inuse;
3396}
3397
3398static int count_total(struct page *page)
3399{
3400	return page->objects;
3401}
3402#endif
3403
3404#ifdef CONFIG_SLUB_DEBUG
3405static int validate_slab(struct kmem_cache *s, struct page *page,
3406						unsigned long *map)
3407{
3408	void *p;
3409	void *addr = page_address(page);
3410
3411	if (!check_slab(s, page) ||
3412			!on_freelist(s, page, NULL))
3413		return 0;
3414
3415	/* Now we know that a valid freelist exists */
3416	bitmap_zero(map, page->objects);
3417
3418	for_each_free_object(p, s, page->freelist) {
3419		set_bit(slab_index(p, s, addr), map);
3420		if (!check_object(s, page, p, 0))
3421			return 0;
3422	}
3423
3424	for_each_object(p, s, addr, page->objects)
3425		if (!test_bit(slab_index(p, s, addr), map))
3426			if (!check_object(s, page, p, 1))
3427				return 0;
3428	return 1;
3429}
3430
3431static void validate_slab_slab(struct kmem_cache *s, struct page *page,
3432						unsigned long *map)
3433{
3434	if (slab_trylock(page)) {
3435		validate_slab(s, page, map);
3436		slab_unlock(page);
3437	} else
3438		printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
3439			s->name, page);
3440}
3441
3442static int validate_slab_node(struct kmem_cache *s,
3443		struct kmem_cache_node *n, unsigned long *map)
3444{
3445	unsigned long count = 0;
3446	struct page *page;
3447	unsigned long flags;
3448
3449	spin_lock_irqsave(&n->list_lock, flags);
3450
3451	list_for_each_entry(page, &n->partial, lru) {
3452		validate_slab_slab(s, page, map);
3453		count++;
3454	}
3455	if (count != n->nr_partial)
3456		printk(KERN_ERR "SLUB %s: %ld partial slabs counted but "
3457			"counter=%ld\n", s->name, count, n->nr_partial);
3458
3459	if (!(s->flags & SLAB_STORE_USER))
3460		goto out;
3461
3462	list_for_each_entry(page, &n->full, lru) {
3463		validate_slab_slab(s, page, map);
3464		count++;
3465	}
3466	if (count != atomic_long_read(&n->nr_slabs))
3467		printk(KERN_ERR "SLUB: %s %ld slabs counted but "
3468			"counter=%ld\n", s->name, count,
3469			atomic_long_read(&n->nr_slabs));
3470
3471out:
3472	spin_unlock_irqrestore(&n->list_lock, flags);
3473	return count;
3474}
3475
3476static long validate_slab_cache(struct kmem_cache *s)
3477{
3478	int node;
3479	unsigned long count = 0;
3480	unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
3481				sizeof(unsigned long), GFP_KERNEL);
3482
3483	if (!map)
3484		return -ENOMEM;
3485
3486	flush_all(s);
3487	for_each_node_state(node, N_NORMAL_MEMORY) {
3488		struct kmem_cache_node *n = get_node(s, node);
3489
3490		count += validate_slab_node(s, n, map);
3491	}
3492	kfree(map);
3493	return count;
3494}
3495/*
3496 * Generate lists of code addresses where slabcache objects are allocated
3497 * and freed.
3498 */
3499
3500struct location {
3501	unsigned long count;
3502	unsigned long addr;
3503	long long sum_time;
3504	long min_time;
3505	long max_time;
3506	long min_pid;
3507	long max_pid;
3508	DECLARE_BITMAP(cpus, NR_CPUS);
3509	nodemask_t nodes;
3510};
3511
3512struct loc_track {
3513	unsigned long max;
3514	unsigned long count;
3515	struct location *loc;
3516};
3517
3518static void free_loc_track(struct loc_track *t)
3519{
3520	if (t->max)
3521		free_pages((unsigned long)t->loc,
3522			get_order(sizeof(struct location) * t->max));
3523}
3524
3525static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
3526{
3527	struct location *l;
3528	int order;
3529
3530	order = get_order(sizeof(struct location) * max);
3531
3532	l = (void *)__get_free_pages(flags, order);
3533	if (!l)
3534		return 0;
3535
3536	if (t->count) {
3537		memcpy(l, t->loc, sizeof(struct location) * t->count);
3538		free_loc_track(t);
3539	}
3540	t->max = max;
3541	t->loc = l;
3542	return 1;
3543}
3544
3545static int add_location(struct loc_track *t, struct kmem_cache *s,
3546				const struct track *track)
3547{
3548	long start, end, pos;
3549	struct location *l;
3550	unsigned long caddr;
3551	unsigned long age = jiffies - track->when;
3552
3553	start = -1;
3554	end = t->count;
3555
3556	for ( ; ; ) {
3557		pos = start + (end - start + 1) / 2;
3558
3559		/*
3560		 * There is nothing at "end". If we end up there
3561		 * we need to add something to before end.
3562		 * we need to insert the new entry before end.
3563		if (pos == end)
3564			break;
3565
3566		caddr = t->loc[pos].addr;
3567		if (track->addr == caddr) {
3568
3569			l = &t->loc[pos];
3570			l->count++;
3571			if (track->when) {
3572				l->sum_time += age;
3573				if (age < l->min_time)
3574					l->min_time = age;
3575				if (age > l->max_time)
3576					l->max_time = age;
3577
3578				if (track->pid < l->min_pid)
3579					l->min_pid = track->pid;
3580				if (track->pid > l->max_pid)
3581					l->max_pid = track->pid;
3582
3583				cpumask_set_cpu(track->cpu,
3584						to_cpumask(l->cpus));
3585			}
3586			node_set(page_to_nid(virt_to_page(track)), l->nodes);
3587			return 1;
3588		}
3589
3590		if (track->addr < caddr)
3591			end = pos;
3592		else
3593			start = pos;
3594	}
3595
3596	/*
3597	 * Not found. Insert new tracking element.
3598	 */
3599	if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
3600		return 0;
3601
3602	l = t->loc + pos;
3603	if (pos < t->count)
3604		memmove(l + 1, l,
3605			(t->count - pos) * sizeof(struct location));
3606	t->count++;
3607	l->count = 1;
3608	l->addr = track->addr;
3609	l->sum_time = age;
3610	l->min_time = age;
3611	l->max_time = age;
3612	l->min_pid = track->pid;
3613	l->max_pid = track->pid;
3614	cpumask_clear(to_cpumask(l->cpus));
3615	cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
3616	nodes_clear(l->nodes);
3617	node_set(page_to_nid(virt_to_page(track)), l->nodes);
3618	return 1;
3619}
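
/*
 * Editorial illustration, not part of the original source: t->loc is kept
 * sorted by call site address. With existing entries {0x100, 0x200, 0x300},
 * a track->addr of 0x200 is found by the bisection above and only its
 * counters are updated, while 0x250 terminates with pos == 2 and is
 * inserted there by the memmove().
 */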
3620
3621static void process_slab(struct loc_track *t, struct kmem_cache *s,
3622		struct page *page, enum track_item alloc,
3623		unsigned long *map)
3624{
3625	void *addr = page_address(page);
3626	void *p;
3627
3628	bitmap_zero(map, page->objects);
3629	for_each_free_object(p, s, page->freelist)
3630		set_bit(slab_index(p, s, addr), map);
3631
3632	for_each_object(p, s, addr, page->objects)
3633		if (!test_bit(slab_index(p, s, addr), map))
3634			add_location(t, s, get_track(s, p, alloc));
3635}
3636
3637static int list_locations(struct kmem_cache *s, char *buf,
3638					enum track_item alloc)
3639{
3640	int len = 0;
3641	unsigned long i;
3642	struct loc_track t = { 0, 0, NULL };
3643	int node;
3644	unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
3645				     sizeof(unsigned long), GFP_KERNEL);
3646
3647	if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
3648				     GFP_TEMPORARY)) {
3649		kfree(map);
3650		return sprintf(buf, "Out of memory\n");
3651	}
3652	/* Push back cpu slabs */
3653	flush_all(s);
3654
3655	for_each_node_state(node, N_NORMAL_MEMORY) {
3656		struct kmem_cache_node *n = get_node(s, node);
3657		unsigned long flags;
3658		struct page *page;
3659
3660		if (!atomic_long_read(&n->nr_slabs))
3661			continue;
3662
3663		spin_lock_irqsave(&n->list_lock, flags);
3664		list_for_each_entry(page, &n->partial, lru)
3665			process_slab(&t, s, page, alloc, map);
3666		list_for_each_entry(page, &n->full, lru)
3667			process_slab(&t, s, page, alloc, map);
3668		spin_unlock_irqrestore(&n->list_lock, flags);
3669	}
3670
3671	for (i = 0; i < t.count; i++) {
3672		struct location *l = &t.loc[i];
3673
3674		if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
3675			break;
3676		len += sprintf(buf + len, "%7ld ", l->count);
3677
3678		if (l->addr)
3679			len += sprint_symbol(buf + len, (unsigned long)l->addr);
3680		else
3681			len += sprintf(buf + len, "<not-available>");
3682
3683		if (l->sum_time != l->min_time) {
3684			len += sprintf(buf + len, " age=%ld/%ld/%ld",
3685				l->min_time,
3686				(long)div_u64(l->sum_time, l->count),
3687				l->max_time);
3688		} else
3689			len += sprintf(buf + len, " age=%ld",
3690				l->min_time);
3691
3692		if (l->min_pid != l->max_pid)
3693			len += sprintf(buf + len, " pid=%ld-%ld",
3694				l->min_pid, l->max_pid);
3695		else
3696			len += sprintf(buf + len, " pid=%ld",
3697				l->min_pid);
3698
3699		if (num_online_cpus() > 1 &&
3700				!cpumask_empty(to_cpumask(l->cpus)) &&
3701				len < PAGE_SIZE - 60) {
3702			len += sprintf(buf + len, " cpus=");
3703			len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50,
3704						 to_cpumask(l->cpus));
3705		}
3706
3707		if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
3708				len < PAGE_SIZE - 60) {
3709			len += sprintf(buf + len, " nodes=");
3710			len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50,
3711					l->nodes);
3712		}
3713
3714		len += sprintf(buf + len, "\n");
3715	}
3716
3717	free_loc_track(&t);
3718	kfree(map);
3719	if (!t.count)
3720		len += sprintf(buf, "No data\n");
3721	return len;
3722}
3723#endif
3724
3725#ifdef SLUB_RESILIENCY_TEST
3726static void resiliency_test(void)
3727{
3728	u8 *p;
3729
3730	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || SLUB_PAGE_SHIFT < 10);
3731
3732	printk(KERN_ERR "SLUB resiliency testing\n");
3733	printk(KERN_ERR "-----------------------\n");
3734	printk(KERN_ERR "A. Corruption after allocation\n");
3735
3736	p = kzalloc(16, GFP_KERNEL);
3737	p[16] = 0x12;
3738	printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
3739			" 0x12->0x%p\n\n", p + 16);
3740
3741	validate_slab_cache(kmalloc_caches[4]);
3742
3743	/* Hmmm... The next two are dangerous */
3744	p = kzalloc(32, GFP_KERNEL);
3745	p[32 + sizeof(void *)] = 0x34;
3746	printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
3747			" 0x34 -> 0x%p\n", p);
3748	printk(KERN_ERR
3749		"If allocated object is overwritten then not detectable\n\n");
3750
3751	validate_slab_cache(kmalloc_caches[5]);
3752	p = kzalloc(64, GFP_KERNEL);
3753	p += 64 + (get_cycles() & 0xff) * sizeof(void *);
3754	*p = 0x56;
3755	printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
3756									p);
3757	printk(KERN_ERR
3758		"If allocated object is overwritten then not detectable\n\n");
3759	validate_slab_cache(kmalloc_caches[6]);
3760
3761	printk(KERN_ERR "\nB. Corruption after free\n");
3762	p = kzalloc(128, GFP_KERNEL);
3763	kfree(p);
3764	*p = 0x78;
3765	printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
3766	validate_slab_cache(kmalloc_caches[7]);
3767
3768	p = kzalloc(256, GFP_KERNEL);
3769	kfree(p);
3770	p[50] = 0x9a;
3771	printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
3772			p);
3773	validate_slab_cache(kmalloc_caches[8]);
3774
3775	p = kzalloc(512, GFP_KERNEL);
3776	kfree(p);
3777	p[512] = 0xab;
3778	printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
3779	validate_slab_cache(kmalloc_caches[9]);
3780}
3781#else
3782#ifdef CONFIG_SYSFS
3783static void resiliency_test(void) {};
3784#endif
3785#endif
3786
3787#ifdef CONFIG_SYSFS
3788enum slab_stat_type {
3789	SL_ALL,			/* All slabs */
3790	SL_PARTIAL,		/* Only partially allocated slabs */
3791	SL_CPU,			/* Only slabs used for cpu caches */
3792	SL_OBJECTS,		/* Determine allocated objects not slabs */
3793	SL_TOTAL		/* Determine object capacity not slabs */
3794};
3795
3796#define SO_ALL		(1 << SL_ALL)
3797#define SO_PARTIAL	(1 << SL_PARTIAL)
3798#define SO_CPU		(1 << SL_CPU)
3799#define SO_OBJECTS	(1 << SL_OBJECTS)
3800#define SO_TOTAL	(1 << SL_TOTAL)
3801
3802static ssize_t show_slab_objects(struct kmem_cache *s,
3803			    char *buf, unsigned long flags)
3804{
3805	unsigned long total = 0;
3806	int node;
3807	int x;
3808	unsigned long *nodes;
3809	unsigned long *per_cpu;
3810
3811	nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
3812	if (!nodes)
3813		return -ENOMEM;
3814	per_cpu = nodes + nr_node_ids;
3815
3816	if (flags & SO_CPU) {
3817		int cpu;
3818
3819		for_each_possible_cpu(cpu) {
3820			struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
3821
3822			if (!c || c->node < 0)
3823				continue;
3824
3825			if (c->page) {
3826				if (flags & SO_TOTAL)
3827					x = c->page->objects;
3828				else if (flags & SO_OBJECTS)
3829					x = c->page->inuse;
3830				else
3831					x = 1;
3832
3833				total += x;
3834				nodes[c->node] += x;
3835			}
3836			per_cpu[c->node]++;
3837		}
3838	}
3839
3840	down_read(&slub_lock);
3841#ifdef CONFIG_SLUB_DEBUG
3842	if (flags & SO_ALL) {
3843		for_each_node_state(node, N_NORMAL_MEMORY) {
3844			struct kmem_cache_node *n = get_node(s, node);
3845
3846			if (flags & SO_TOTAL)
3847				x = atomic_long_read(&n->total_objects);
3848			else if (flags & SO_OBJECTS)
3849				x = atomic_long_read(&n->total_objects) -
3850					count_partial(n, count_free);
3851			else
3852				x = atomic_long_read(&n->nr_slabs);
3853
3854			total += x;
3855			nodes[node] += x;
3856		}
3857
3858	} else
3859#endif
3860	if (flags & SO_PARTIAL) {
3861		for_each_node_state(node, N_NORMAL_MEMORY) {
3862			struct kmem_cache_node *n = get_node(s, node);
3863
3864			if (flags & SO_TOTAL)
3865				x = count_partial(n, count_total);
3866			else if (flags & SO_OBJECTS)
3867				x = count_partial(n, count_inuse);
3868			else
3869				x = n->nr_partial;
3870			total += x;
3871			nodes[node] += x;
3872		}
3873	}
3874	x = sprintf(buf, "%lu", total);
3875#ifdef CONFIG_NUMA
3876	for_each_node_state(node, N_NORMAL_MEMORY)
3877		if (nodes[node])
3878			x += sprintf(buf + x, " N%d=%lu",
3879					node, nodes[node]);
3880#endif
3881	kfree(nodes);
3882	return x + sprintf(buf + x, "\n");
3883}
3884
3885#ifdef CONFIG_SLUB_DEBUG
3886static int any_slab_objects(struct kmem_cache *s)
3887{
3888	int node;
3889
3890	for_each_online_node(node) {
3891		struct kmem_cache_node *n = get_node(s, node);
3892
3893		if (!n)
3894			continue;
3895
3896		if (atomic_long_read(&n->total_objects))
3897			return 1;
3898	}
3899	return 0;
3900}
3901#endif
3902
3903#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
3904#define to_slab(n) container_of(n, struct kmem_cache, kobj)
3905
3906struct slab_attribute {
3907	struct attribute attr;
3908	ssize_t (*show)(struct kmem_cache *s, char *buf);
3909	ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
3910};
3911
3912#define SLAB_ATTR_RO(_name) \
3913	static struct slab_attribute _name##_attr = __ATTR_RO(_name)
3914
3915#define SLAB_ATTR(_name) \
3916	static struct slab_attribute _name##_attr =  \
3917	__ATTR(_name, 0644, _name##_show, _name##_store)
3918
3919static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
3920{
3921	return sprintf(buf, "%d\n", s->size);
3922}
3923SLAB_ATTR_RO(slab_size);
3924
3925static ssize_t align_show(struct kmem_cache *s, char *buf)
3926{
3927	return sprintf(buf, "%d\n", s->align);
3928}
3929SLAB_ATTR_RO(align);
3930
3931static ssize_t object_size_show(struct kmem_cache *s, char *buf)
3932{
3933	return sprintf(buf, "%d\n", s->objsize);
3934}
3935SLAB_ATTR_RO(object_size);
3936
3937static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
3938{
3939	return sprintf(buf, "%d\n", oo_objects(s->oo));
3940}
3941SLAB_ATTR_RO(objs_per_slab);
3942
3943static ssize_t order_store(struct kmem_cache *s,
3944				const char *buf, size_t length)
3945{
3946	unsigned long order;
3947	int err;
3948
3949	err = strict_strtoul(buf, 10, &order);
3950	if (err)
3951		return err;
3952
3953	if (order > slub_max_order || order < slub_min_order)
3954		return -EINVAL;
3955
3956	calculate_sizes(s, order);
3957	return length;
3958}
3959
3960static ssize_t order_show(struct kmem_cache *s, char *buf)
3961{
3962	return sprintf(buf, "%d\n", oo_order(s->oo));
3963}
3964SLAB_ATTR(order);
3965
3966static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
3967{
3968	return sprintf(buf, "%lu\n", s->min_partial);
3969}
3970
3971static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
3972				 size_t length)
3973{
3974	unsigned long min;
3975	int err;
3976
3977	err = strict_strtoul(buf, 10, &min);
3978	if (err)
3979		return err;
3980
3981	set_min_partial(s, min);
3982	return length;
3983}
3984SLAB_ATTR(min_partial);
3985
3986static ssize_t ctor_show(struct kmem_cache *s, char *buf)
3987{
3988	if (s->ctor) {
3989		int n = sprint_symbol(buf, (unsigned long)s->ctor);
3990
3991		return n + sprintf(buf + n, "\n");
3992	}
3993	return 0;
3994}
3995SLAB_ATTR_RO(ctor);
3996
3997static ssize_t aliases_show(struct kmem_cache *s, char *buf)
3998{
3999	return sprintf(buf, "%d\n", s->refcount - 1);
4000}
4001SLAB_ATTR_RO(aliases);
4002
4003static ssize_t partial_show(struct kmem_cache *s, char *buf)
4004{
4005	return show_slab_objects(s, buf, SO_PARTIAL);
4006}
4007SLAB_ATTR_RO(partial);
4008
4009static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
4010{
4011	return show_slab_objects(s, buf, SO_CPU);
4012}
4013SLAB_ATTR_RO(cpu_slabs);
4014
4015static ssize_t objects_show(struct kmem_cache *s, char *buf)
4016{
4017	return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
4018}
4019SLAB_ATTR_RO(objects);
4020
4021static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
4022{
4023	return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
4024}
4025SLAB_ATTR_RO(objects_partial);
4026
4027static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
4028{
4029	return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
4030}
4031
4032static ssize_t reclaim_account_store(struct kmem_cache *s,
4033				const char *buf, size_t length)
4034{
4035	s->flags &= ~SLAB_RECLAIM_ACCOUNT;
4036	if (buf[0] == '1')
4037		s->flags |= SLAB_RECLAIM_ACCOUNT;
4038	return length;
4039}
4040SLAB_ATTR(reclaim_account);
4041
4042static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
4043{
4044	return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
4045}
4046SLAB_ATTR_RO(hwcache_align);
4047
4048#ifdef CONFIG_ZONE_DMA
4049static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
4050{
4051	return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
4052}
4053SLAB_ATTR_RO(cache_dma);
4054#endif
4055
4056static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
4057{
4058	return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
4059}
4060SLAB_ATTR_RO(destroy_by_rcu);
4061
4062#ifdef CONFIG_SLUB_DEBUG
4063static ssize_t slabs_show(struct kmem_cache *s, char *buf)
4064{
4065	return show_slab_objects(s, buf, SO_ALL);
4066}
4067SLAB_ATTR_RO(slabs);
4068
4069static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
4070{
4071	return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
4072}
4073SLAB_ATTR_RO(total_objects);
4074
4075static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
4076{
4077	return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
4078}
4079
4080static ssize_t sanity_checks_store(struct kmem_cache *s,
4081				const char *buf, size_t length)
4082{
4083	s->flags &= ~SLAB_DEBUG_FREE;
4084	if (buf[0] == '1')
4085		s->flags |= SLAB_DEBUG_FREE;
4086	return length;
4087}
4088SLAB_ATTR(sanity_checks);
4089
4090static ssize_t trace_show(struct kmem_cache *s, char *buf)
4091{
4092	return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
4093}
4094
4095static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4096							size_t length)
4097{
4098	s->flags &= ~SLAB_TRACE;
4099	if (buf[0] == '1')
4100		s->flags |= SLAB_TRACE;
4101	return length;
4102}
4103SLAB_ATTR(trace);
4104
4105static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
4106{
4107	return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
4108}
4109
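/*
 * Red zoning, poisoning and user tracking all change the object layout.
 * The store callbacks below therefore refuse with -EBUSY while the cache
 * still holds objects and recompute the cache geometry with
 * calculate_sizes() after updating the flag.
 */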
4110static ssize_t red_zone_store(struct kmem_cache *s,
4111				const char *buf, size_t length)
4112{
4113	if (any_slab_objects(s))
4114		return -EBUSY;
4115
4116	s->flags &= ~SLAB_RED_ZONE;
4117	if (buf[0] == '1')
4118		s->flags |= SLAB_RED_ZONE;
4119	calculate_sizes(s, -1);
4120	return length;
4121}
4122SLAB_ATTR(red_zone);
4123
4124static ssize_t poison_show(struct kmem_cache *s, char *buf)
4125{
4126	return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
4127}
4128
4129static ssize_t poison_store(struct kmem_cache *s,
4130				const char *buf, size_t length)
4131{
4132	if (any_slab_objects(s))
4133		return -EBUSY;
4134
4135	s->flags &= ~SLAB_POISON;
4136	if (buf[0] == '1')
4137		s->flags |= SLAB_POISON;
4138	calculate_sizes(s, -1);
4139	return length;
4140}
4141SLAB_ATTR(poison);
4142
4143static ssize_t store_user_show(struct kmem_cache *s, char *buf)
4144{
4145	return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
4146}
4147
4148static ssize_t store_user_store(struct kmem_cache *s,
4149				const char *buf, size_t length)
4150{
4151	if (any_slab_objects(s))
4152		return -EBUSY;
4153
4154	s->flags &= ~SLAB_STORE_USER;
4155	if (buf[0] == '1')
4156		s->flags |= SLAB_STORE_USER;
4157	calculate_sizes(s, -1);
4158	return length;
4159}
4160SLAB_ATTR(store_user);
4161
4162static ssize_t validate_show(struct kmem_cache *s, char *buf)
4163{
4164	return 0;
4165}
4166
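/* Writing '1' runs validate_slab_cache() over the whole cache. */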
4167static ssize_t validate_store(struct kmem_cache *s,
4168			const char *buf, size_t length)
4169{
4170	int ret = -EINVAL;
4171
4172	if (buf[0] == '1') {
4173		ret = validate_slab_cache(s);
4174		if (ret >= 0)
4175			ret = length;
4176	}
4177	return ret;
4178}
4179SLAB_ATTR(validate);
4180
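/*
 * alloc_calls and free_calls list the recorded allocation and free call
 * sites; both require user tracking (SLAB_STORE_USER) and return
 * -ENOSYS otherwise.
 */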
4181static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
4182{
4183	if (!(s->flags & SLAB_STORE_USER))
4184		return -ENOSYS;
4185	return list_locations(s, buf, TRACK_ALLOC);
4186}
4187SLAB_ATTR_RO(alloc_calls);
4188
4189static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
4190{
4191	if (!(s->flags & SLAB_STORE_USER))
4192		return -ENOSYS;
4193	return list_locations(s, buf, TRACK_FREE);
4194}
4195SLAB_ATTR_RO(free_calls);
4196#endif /* CONFIG_SLUB_DEBUG */
4197
4198#ifdef CONFIG_FAILSLAB
4199static ssize_t failslab_show(struct kmem_cache *s, char *buf)
4200{
4201	return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
4202}
4203
4204static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
4205							size_t length)
4206{
4207	s->flags &= ~SLAB_FAILSLAB;
4208	if (buf[0] == '1')
4209		s->flags |= SLAB_FAILSLAB;
4210	return length;
4211}
4212SLAB_ATTR(failslab);
4213#endif
4214
4215static ssize_t shrink_show(struct kmem_cache *s, char *buf)
4216{
4217	return 0;
4218}
4219
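/* Writing '1' invokes kmem_cache_shrink(); any other value is rejected. */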
4220static ssize_t shrink_store(struct kmem_cache *s,
4221			const char *buf, size_t length)
4222{
4223	if (buf[0] == '1') {
4224		int rc = kmem_cache_shrink(s);
4225
4226		if (rc)
4227			return rc;
4228	} else
4229		return -EINVAL;
4230	return length;
4231}
4232SLAB_ATTR(shrink);
4233
4234#ifdef CONFIG_NUMA
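/*
 * The remote node defrag ratio is exposed as a percentage (0-100) but
 * stored scaled by ten; the show and store callbacks below convert
 * between the two representations. Values above 100 are silently
 * ignored by the store callback.
 */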
4235static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
4236{
4237	return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10);
4238}
4239
4240static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
4241				const char *buf, size_t length)
4242{
4243	unsigned long ratio;
4244	int err;
4245
4246	err = strict_strtoul(buf, 10, &ratio);
4247	if (err)
4248		return err;
4249
4250	if (ratio <= 100)
4251		s->remote_node_defrag_ratio = ratio * 10;
4252
4253	return length;
4254}
4255SLAB_ATTR(remote_node_defrag_ratio);
4256#endif
4257
4258#ifdef CONFIG_SLUB_STATS
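/*
 * show_stat() prints the sum of a per cpu statistics counter and, on
 * SMP, appends " C<cpu>=<count>" for every online cpu with a non-zero
 * count, e.g. "4096 C0=4000 C1=96".
 */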
4259static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
4260{
4261	unsigned long sum  = 0;
4262	int cpu;
4263	int len;
4264	int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
4265
4266	if (!data)
4267		return -ENOMEM;
4268
4269	for_each_online_cpu(cpu) {
4270		unsigned int x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
4271
4272		data[cpu] = x;
4273		sum += x;
4274	}
4275
4276	len = sprintf(buf, "%lu", sum);
4277
4278#ifdef CONFIG_SMP
4279	for_each_online_cpu(cpu) {
4280		if (data[cpu] && len < PAGE_SIZE - 20)
4281			len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]);
4282	}
4283#endif
4284	kfree(data);
4285	return len + sprintf(buf + len, "\n");
4286}
4287
4288static void clear_stat(struct kmem_cache *s, enum stat_item si)
4289{
4290	int cpu;
4291
4292	for_each_online_cpu(cpu)
4293		per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
4294}
4295
4296#define STAT_ATTR(si, text) 					\
4297static ssize_t text##_show(struct kmem_cache *s, char *buf)	\
4298{								\
4299	return show_stat(s, buf, si);				\
4300}								\
4301static ssize_t text##_store(struct kmem_cache *s,		\
4302				const char *buf, size_t length)	\
4303{								\
4304	if (buf[0] != '0')					\
4305		return -EINVAL;					\
4306	clear_stat(s, si);					\
4307	return length;						\
4308}								\
4309SLAB_ATTR(text);
4310
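/*
 * Each STAT_ATTR() invocation below creates a sysfs file whose show
 * callback reports one stat_item and whose store callback only accepts
 * '0', which clears the per cpu counters for that item.
 */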
4311STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
4312STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
4313STAT_ATTR(FREE_FASTPATH, free_fastpath);
4314STAT_ATTR(FREE_SLOWPATH, free_slowpath);
4315STAT_ATTR(FREE_FROZEN, free_frozen);
4316STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
4317STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
4318STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
4319STAT_ATTR(ALLOC_SLAB, alloc_slab);
4320STAT_ATTR(ALLOC_REFILL, alloc_refill);
4321STAT_ATTR(FREE_SLAB, free_slab);
4322STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
4323STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
4324STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
4325STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
4326STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
4327STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
4328STAT_ATTR(ORDER_FALLBACK, order_fallback);
4329#endif
4330
4331static struct attribute *slab_attrs[] = {
4332	&slab_size_attr.attr,
4333	&object_size_attr.attr,
4334	&objs_per_slab_attr.attr,
4335	&order_attr.attr,
4336	&min_partial_attr.attr,
4337	&objects_attr.attr,
4338	&objects_partial_attr.attr,
4339	&partial_attr.attr,
4340	&cpu_slabs_attr.attr,
4341	&ctor_attr.attr,
4342	&aliases_attr.attr,
4343	&align_attr.attr,
4344	&hwcache_align_attr.attr,
4345	&reclaim_account_attr.attr,
4346	&destroy_by_rcu_attr.attr,
4347	&shrink_attr.attr,
4348#ifdef CONFIG_SLUB_DEBUG
4349	&total_objects_attr.attr,
4350	&slabs_attr.attr,
4351	&sanity_checks_attr.attr,
4352	&trace_attr.attr,
4353	&red_zone_attr.attr,
4354	&poison_attr.attr,
4355	&store_user_attr.attr,
4356	&validate_attr.attr,
4357	&alloc_calls_attr.attr,
4358	&free_calls_attr.attr,
4359#endif
4360#ifdef CONFIG_ZONE_DMA
4361	&cache_dma_attr.attr,
4362#endif
4363#ifdef CONFIG_NUMA
4364	&remote_node_defrag_ratio_attr.attr,
4365#endif
4366#ifdef CONFIG_SLUB_STATS
4367	&alloc_fastpath_attr.attr,
4368	&alloc_slowpath_attr.attr,
4369	&free_fastpath_attr.attr,
4370	&free_slowpath_attr.attr,
4371	&free_frozen_attr.attr,
4372	&free_add_partial_attr.attr,
4373	&free_remove_partial_attr.attr,
4374	&alloc_from_partial_attr.attr,
4375	&alloc_slab_attr.attr,
4376	&alloc_refill_attr.attr,
4377	&free_slab_attr.attr,
4378	&cpuslab_flush_attr.attr,
4379	&deactivate_full_attr.attr,
4380	&deactivate_empty_attr.attr,
4381	&deactivate_to_head_attr.attr,
4382	&deactivate_to_tail_attr.attr,
4383	&deactivate_remote_frees_attr.attr,
4384	&order_fallback_attr.attr,
4385#endif
4386#ifdef CONFIG_FAILSLAB
4387	&failslab_attr.attr,
4388#endif
4389
4390	NULL
4391};
4392
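/*
 * All attributes above are registered for each cache as a single sysfs
 * group, so each of them appears as a file under /sys/kernel/slab/<cache>/.
 */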
4393static struct attribute_group slab_attr_group = {
4394	.attrs = slab_attrs,
4395};
4396
4397static ssize_t slab_attr_show(struct kobject *kobj,
4398				struct attribute *attr,
4399				char *buf)
4400{
4401	struct slab_attribute *attribute;
4402	struct kmem_cache *s;
4403	int err;
4404
4405	attribute = to_slab_attr(attr);
4406	s = to_slab(kobj);
4407
4408	if (!attribute->show)
4409		return -EIO;
4410
4411	err = attribute->show(s, buf);
4412
4413	return err;
4414}
4415
4416static ssize_t slab_attr_store(struct kobject *kobj,
4417				struct attribute *attr,
4418				const char *buf, size_t len)
4419{
4420	struct slab_attribute *attribute;
4421	struct kmem_cache *s;
4422	int err;
4423
4424	attribute = to_slab_attr(attr);
4425	s = to_slab(kobj);
4426
4427	if (!attribute->store)
4428		return -EIO;
4429
4430	err = attribute->store(s, buf, len);
4431
4432	return err;
4433}
4434
4435static void kmem_cache_release(struct kobject *kobj)
4436{
4437	struct kmem_cache *s = to_slab(kobj);
4438
4439	kfree(s->name);
4440	kfree(s);
4441}
4442
4443static const struct sysfs_ops slab_sysfs_ops = {
4444	.show = slab_attr_show,
4445	.store = slab_attr_store,
4446};
4447
4448static struct kobj_type slab_ktype = {
4449	.sysfs_ops = &slab_sysfs_ops,
4450	.release = kmem_cache_release
4451};
4452
4453static int uevent_filter(struct kset *kset, struct kobject *kobj)
4454{
4455	struct kobj_type *ktype = get_ktype(kobj);
4456
4457	if (ktype == &slab_ktype)
4458		return 1;
4459	return 0;
4460}
4461
4462static const struct kset_uevent_ops slab_uevent_ops = {
4463	.filter = uevent_filter,
4464};
4465
4466static struct kset *slab_kset;
4467
4468#define ID_STR_LENGTH 64
4469
4470/* Create a unique string id for a slab cache:
4471 *
4472 * Format	:[flags-]size
4473 */
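/*
 * For example, a SLAB_CACHE_DMA cache without SLAB_NOTRACK and with
 * s->size == 192 ends up with the id ":dt-0000192".
 */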
4474static char *create_unique_id(struct kmem_cache *s)
4475{
4476	char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
4477	char *p = name;
4478
4479	BUG_ON(!name);
4480
4481	*p++ = ':';
4482	/*
4483	 * First come the flags that affect slabcache operations. We will only
4484	 * get here for aliasable slabs, so we do not need to support
4485	 * too many flags. The flags here must cover all flags that
4486	 * are matched during merging to guarantee that the id is
4487	 * unique.
4488	 */
4489	if (s->flags & SLAB_CACHE_DMA)
4490		*p++ = 'd';
4491	if (s->flags & SLAB_RECLAIM_ACCOUNT)
4492		*p++ = 'a';
4493	if (s->flags & SLAB_DEBUG_FREE)
4494		*p++ = 'F';
4495	if (!(s->flags & SLAB_NOTRACK))
4496		*p++ = 't';
4497	if (p != name + 1)
4498		*p++ = '-';
4499	p += sprintf(p, "%07d", s->size);
4500	BUG_ON(p > name + ID_STR_LENGTH - 1);
4501	return name;
4502}
4503
4504static int sysfs_slab_add(struct kmem_cache *s)
4505{
4506	int err;
4507	const char *name;
4508	int unmergeable;
4509
4510	if (slab_state < SYSFS)
4511		/* Defer until later */
4512		return 0;
4513
4514	unmergeable = slab_unmergeable(s);
4515	if (unmergeable) {
4516		/*
4517		 * This slabcache can never be merged, so we can use its proper
4518		 * name. This is typically the case for debug configurations,
4519		 * where it also lets us catch duplicate names easily.
4520		 */
4521		sysfs_remove_link(&slab_kset->kobj, s->name);
4522		name = s->name;
4523	} else {
4524		/*
4525		 * Create a unique name for the slab as a target
4526		 * for the symlinks.
4527		 */
4528		name = create_unique_id(s);
4529	}
4530
4531	s->kobj.kset = slab_kset;
4532	err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
4533	if (err) {
4534		kobject_put(&s->kobj);
4535		return err;
4536	}
4537
4538	err = sysfs_create_group(&s->kobj, &slab_attr_group);
4539	if (err) {
4540		kobject_del(&s->kobj);
4541		kobject_put(&s->kobj);
4542		return err;
4543	}
4544	kobject_uevent(&s->kobj, KOBJ_ADD);
4545	if (!unmergeable) {
4546		/* Setup first alias */
4547		sysfs_slab_alias(s, s->name);
4548		kfree(name);
4549	}
4550	return 0;
4551}
4552
4553static void sysfs_slab_remove(struct kmem_cache *s)
4554{
4555	if (slab_state < SYSFS)
4556		/*
4557		 * Sysfs has not been set up yet, so there is no need to remove the
4558		 * cache from sysfs.
4559		 */
4560		return;
4561
4562	kobject_uevent(&s->kobj, KOBJ_REMOVE);
4563	kobject_del(&s->kobj);
4564	kobject_put(&s->kobj);
4565}
4566
4567/*
4568 * Need to buffer aliases during bootup until sysfs becomes
4569 * available lest we lose that information.
4570 */
4571struct saved_alias {
4572	struct kmem_cache *s;
4573	const char *name;
4574	struct saved_alias *next;
4575};
4576
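/*
 * Aliases requested before sysfs is up are queued on this list and
 * drained by slab_sysfs_init().
 */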
4577static struct saved_alias *alias_list;
4578
4579static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
4580{
4581	struct saved_alias *al;
4582
4583	if (slab_state == SYSFS) {
4584		/*
4585		 * If we have a leftover link then remove it.
4586		 */
4587		sysfs_remove_link(&slab_kset->kobj, name);
4588		return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
4589	}
4590
4591	al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
4592	if (!al)
4593		return -ENOMEM;
4594
4595	al->s = s;
4596	al->name = name;
4597	al->next = alias_list;
4598	alias_list = al;
4599	return 0;
4600}
4601
4602static int __init slab_sysfs_init(void)
4603{
4604	struct kmem_cache *s;
4605	int err;
4606
4607	down_write(&slub_lock);
4608
4609	slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
4610	if (!slab_kset) {
4611		up_write(&slub_lock);
4612		printk(KERN_ERR "Cannot register slab subsystem.\n");
4613		return -ENOSYS;
4614	}
4615
4616	slab_state = SYSFS;
4617
4618	list_for_each_entry(s, &slab_caches, list) {
4619		err = sysfs_slab_add(s);
4620		if (err)
4621			printk(KERN_ERR "SLUB: Unable to add boot slab %s"
4622						" to sysfs\n", s->name);
4623	}
4624
4625	while (alias_list) {
4626		struct saved_alias *al = alias_list;
4627
4628		alias_list = alias_list->next;
4629		err = sysfs_slab_alias(al->s, al->name);
4630		if (err)
4631			printk(KERN_ERR "SLUB: Unable to add boot slab alias"
4632					" %s to sysfs\n", al->name);
4633		kfree(al);
4634	}
4635
4636	up_write(&slub_lock);
4637	resiliency_test();
4638	return 0;
4639}
4640
4641__initcall(slab_sysfs_init);
4642#endif /* CONFIG_SYSFS */
4643
4644/*
4645 * The /proc/slabinfo ABI
4646 */
4647#ifdef CONFIG_SLABINFO
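/*
 * SLUB does not implement the SLAB tunables or a shared array, so
 * s_show() below reports the tunables and the <sharedavail> column as
 * zero and counts every slab as active.
 */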
4648static void print_slabinfo_header(struct seq_file *m)
4649{
4650	seq_puts(m, "slabinfo - version: 2.1\n");
4651	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
4652		 "<objperslab> <pagesperslab>");
4653	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
4654	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
4655	seq_putc(m, '\n');
4656}
4657
4658static void *s_start(struct seq_file *m, loff_t *pos)
4659{
4660	loff_t n = *pos;
4661
4662	down_read(&slub_lock);
4663	if (!n)
4664		print_slabinfo_header(m);
4665
4666	return seq_list_start(&slab_caches, *pos);
4667}
4668
4669static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4670{
4671	return seq_list_next(p, &slab_caches, pos);
4672}
4673
4674static void s_stop(struct seq_file *m, void *p)
4675{
4676	up_read(&slub_lock);
4677}
4678
4679static int s_show(struct seq_file *m, void *p)
4680{
4681	unsigned long nr_partials = 0;
4682	unsigned long nr_slabs = 0;
4683	unsigned long nr_inuse = 0;
4684	unsigned long nr_objs = 0;
4685	unsigned long nr_free = 0;
4686	struct kmem_cache *s;
4687	int node;
4688
4689	s = list_entry(p, struct kmem_cache, list);
4690
4691	for_each_online_node(node) {
4692		struct kmem_cache_node *n = get_node(s, node);
4693
4694		if (!n)
4695			continue;
4696
4697		nr_partials += n->nr_partial;
4698		nr_slabs += atomic_long_read(&n->nr_slabs);
4699		nr_objs += atomic_long_read(&n->total_objects);
4700		nr_free += count_partial(n, count_free);
4701	}
4702
4703	nr_inuse = nr_objs - nr_free;
4704
4705	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse,
4706		   nr_objs, s->size, oo_objects(s->oo),
4707		   (1 << oo_order(s->oo)));
4708	seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0);
4709	seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs,
4710		   0UL);
4711	seq_putc(m, '\n');
4712	return 0;
4713}
4714
4715static const struct seq_operations slabinfo_op = {
4716	.start = s_start,
4717	.next = s_next,
4718	.stop = s_stop,
4719	.show = s_show,
4720};
4721
4722static int slabinfo_open(struct inode *inode, struct file *file)
4723{
4724	return seq_open(file, &slabinfo_op);
4725}
4726
4727static const struct file_operations proc_slabinfo_operations = {
4728	.open		= slabinfo_open,
4729	.read		= seq_read,
4730	.llseek		= seq_lseek,
4731	.release	= seq_release,
4732};
4733
4734static int __init slab_proc_init(void)
4735{
4736	proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations);
4737	return 0;
4738}
4739module_init(slab_proc_init);
4740#endif /* CONFIG_SLABINFO */
4741