slub.c revision ba84c73c7ae21fc891a3c2576fa3be42752fce53
1/*
2 * SLUB: A slab allocator that limits cache line use instead of queuing
3 * objects in per cpu and per node lists.
4 *
5 * The allocator synchronizes using per slab locks and only
6 * uses a centralized lock to manage a pool of partial slabs.
7 *
8 * (C) 2007 SGI, Christoph Lameter <clameter@sgi.com>
9 */
10
11#include <linux/mm.h>
12#include <linux/module.h>
13#include <linux/bit_spinlock.h>
14#include <linux/interrupt.h>
15#include <linux/bitops.h>
16#include <linux/slab.h>
17#include <linux/seq_file.h>
18#include <linux/cpu.h>
19#include <linux/cpuset.h>
20#include <linux/mempolicy.h>
21#include <linux/ctype.h>
22#include <linux/kallsyms.h>
23#include <linux/memory.h>
24
25/*
26 * Lock order:
27 *   1. slab_lock(page)
28 *   2. node->list_lock
29 *
30 *   The slab_lock protects operations on the object of a particular
31 *   slab and its metadata in the page struct. If the slab lock
32 *   has been taken then no allocations nor frees can be performed
33 *   on the objects in the slab nor can the slab be added or removed
34 *   from the partial or full lists since this would mean modifying
35 *   the page struct of the slab.
36 *
37 *   The list_lock protects the partial and full list on each node and
38 *   the partial slab counter. If taken then no new slabs may be added or
39 *   removed from the lists nor can the number of partial slabs be modified.
40 *   (Note that the total number of slabs is an atomic value that may be
41 *   modified without taking the list lock).
42 *
43 *   The list_lock is a centralized lock and thus we avoid taking it as
44 *   much as possible. As long as SLUB does not have to handle partial
45 *   slabs, operations can continue without any centralized lock. F.e.
46 *   allocating a long series of objects that fill up slabs does not require
47 *   the list lock.
48 *
49 *   The lock order is sometimes inverted when we are trying to get a slab
50 *   off a list. We take the list_lock and then look for a page on the list
51 *   to use. While we do that objects in the slabs may be freed. We can
52 *   only operate on the slab if we have also taken the slab_lock. So we use
53 *   a slab_trylock() on the slab. If trylock was successful then no frees
54 *   can occur anymore and we can use the slab for allocations etc. If the
55 *   slab_trylock() does not succeed then frees are in progress in the slab and
56 *   we must stay away from it for a while since we may cause a bouncing
57 *   cacheline if we try to acquire the lock. So go onto the next slab.
58 *   If all pages are busy then we may allocate a new slab instead of reusing
59 *   a partial slab. A new slab has no one operating on it and thus there is
60 *   no danger of cacheline contention.
61 *
62 *   Interrupts are disabled during allocation and deallocation in order to
63 *   make the slab allocator safe to use in the context of an irq. In addition
64 *   interrupts are disabled to ensure that the processor does not change
65 *   while handling per_cpu slabs, due to kernel preemption.
66 *
67 * SLUB assigns one slab for allocation to each processor.
68 * Allocations only occur from these slabs called cpu slabs.
69 *
70 * Slabs with free elements are kept on a partial list and during regular
71 * operations no list for full slabs is used. If an object in a full slab is
72 * freed then the slab will show up again on the partial lists.
73 * We track full slabs for debugging purposes though because otherwise we
74 * cannot scan all objects.
75 *
76 * Slabs are freed when they become empty. Teardown and setup is
77 * minimal so we rely on the page allocators per cpu caches for
78 * fast frees and allocs.
79 *
80 * Overloading of page flags that are otherwise used for LRU management.
81 *
82 * PageActive 		The slab is frozen and exempt from list processing.
83 * 			This means that the slab is dedicated to a purpose
84 * 			such as satisfying allocations for a specific
85 * 			processor. Objects may be freed in the slab while
86 * 			it is frozen but slab_free will then skip the usual
87 * 			list operations. It is up to the processor holding
88 * 			the slab to integrate the slab into the slab lists
89 * 			when the slab is no longer needed.
90 *
91 * 			One use of this flag is to mark slabs that are
92 * 			used for allocations. Then such a slab becomes a cpu
93 * 			slab. The cpu slab may be equipped with an additional
94 * 			freelist that allows lockless access to
95 * 			free objects in addition to the regular freelist
96 * 			that requires the slab lock.
97 *
98 * PageError		Slab requires special handling due to debug
99 * 			options set. This moves	slab handling out of
100 * 			the fast path and disables lockless freelists.
101 */
102
103#define FROZEN (1 << PG_active)
104
105#ifdef CONFIG_SLUB_DEBUG
106#define SLABDEBUG (1 << PG_error)
107#else
108#define SLABDEBUG 0
109#endif
110
111static inline int SlabFrozen(struct page *page)
112{
113	return page->flags & FROZEN;
114}
115
116static inline void SetSlabFrozen(struct page *page)
117{
118	page->flags |= FROZEN;
119}
120
121static inline void ClearSlabFrozen(struct page *page)
122{
123	page->flags &= ~FROZEN;
124}
125
126static inline int SlabDebug(struct page *page)
127{
128	return page->flags & SLABDEBUG;
129}
130
131static inline void SetSlabDebug(struct page *page)
132{
133	page->flags |= SLABDEBUG;
134}
135
136static inline void ClearSlabDebug(struct page *page)
137{
138	page->flags &= ~SLABDEBUG;
139}
140
141/*
142 * Issues still to be resolved:
143 *
144 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
145 *
146 * - Variable sizing of the per node arrays
147 */
148
149/* Enable to test recovery from slab corruption on boot */
150#undef SLUB_RESILIENCY_TEST
151
152#if PAGE_SHIFT <= 12
153
154/*
155 * Small page size. Make sure that we do not fragment memory
156 */
157#define DEFAULT_MAX_ORDER 1
158#define DEFAULT_MIN_OBJECTS 4
159
160#else
161
162/*
163 * Large page machines are customarily able to handle larger
164 * page orders.
165 */
166#define DEFAULT_MAX_ORDER 2
167#define DEFAULT_MIN_OBJECTS 8
168
169#endif
170
171/*
172 * Minimum number of partial slabs. These will be left on the partial
173 * lists even if they are empty. kmem_cache_shrink may reclaim them.
174 */
175#define MIN_PARTIAL 5
176
177/*
178 * Maximum number of desirable partial slabs.
179 * The existence of more partial slabs makes kmem_cache_shrink
180 * sort the partial list by the number of objects in them.
181 */
182#define MAX_PARTIAL 10
183
184#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
185				SLAB_POISON | SLAB_STORE_USER)
186
187/*
188 * Set of flags that will prevent slab merging
189 */
190#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
191		SLAB_TRACE | SLAB_DESTROY_BY_RCU)
192
193#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
194		SLAB_CACHE_DMA)
195
196#ifndef ARCH_KMALLOC_MINALIGN
197#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
198#endif
199
200#ifndef ARCH_SLAB_MINALIGN
201#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
202#endif
203
204/* Internal SLUB flags */
205#define __OBJECT_POISON		0x80000000 /* Poison object */
206#define __SYSFS_ADD_DEFERRED	0x40000000 /* Not yet visible via sysfs */
207
208/* Not all arches define cache_line_size */
209#ifndef cache_line_size
210#define cache_line_size()	L1_CACHE_BYTES
211#endif
212
213static int kmem_size = sizeof(struct kmem_cache);
214
215#ifdef CONFIG_SMP
216static struct notifier_block slab_notifier;
217#endif
218
219static enum {
220	DOWN,		/* No slab functionality available */
221	PARTIAL,	/* kmem_cache_open() works but kmalloc does not */
222	UP,		/* Everything works but does not show up in sysfs */
223	SYSFS		/* Sysfs up */
224} slab_state = DOWN;
225
226/* A list of all slab caches on the system */
227static DECLARE_RWSEM(slub_lock);
228static LIST_HEAD(slab_caches);
229
230/*
231 * Tracking user of a slab.
232 */
233struct track {
234	void *addr;		/* Called from address */
235	int cpu;		/* Was running on cpu */
236	int pid;		/* Pid context */
237	unsigned long when;	/* When did the operation occur */
238};
239
240enum track_item { TRACK_ALLOC, TRACK_FREE };
241
242#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
243static int sysfs_slab_add(struct kmem_cache *);
244static int sysfs_slab_alias(struct kmem_cache *, const char *);
245static void sysfs_slab_remove(struct kmem_cache *);
246#else
247static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
248static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
249							{ return 0; }
250static inline void sysfs_slab_remove(struct kmem_cache *s)
251{
252	kfree(s);
253}
254#endif
255
256/********************************************************************
257 * 			Core slab cache functions
258 *******************************************************************/
259
260int slab_is_available(void)
261{
262	return slab_state >= UP;
263}
264
265static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
266{
267#ifdef CONFIG_NUMA
268	return s->node[node];
269#else
270	return &s->local_node;
271#endif
272}
273
274static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
275{
276#ifdef CONFIG_SMP
277	return s->cpu_slab[cpu];
278#else
279	return &s->cpu_slab;
280#endif
281}
282
283static inline int check_valid_pointer(struct kmem_cache *s,
284				struct page *page, const void *object)
285{
286	void *base;
287
288	if (!object)
289		return 1;
290
291	base = page_address(page);
292	if (object < base || object >= base + s->objects * s->size ||
293		(object - base) % s->size) {
294		return 0;
295	}
296
297	return 1;
298}
299
300/*
301 * Slow version of get and set free pointer.
302 *
303 * This version requires touching the cache lines of kmem_cache which
304 * we try to avoid touching in the fast alloc/free paths. There we obtain
305 * the offset from the kmem_cache_cpu structure instead.
306 */
307static inline void *get_freepointer(struct kmem_cache *s, void *object)
308{
309	return *(void **)(object + s->offset);
310}
311
312static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
313{
314	*(void **)(object + s->offset) = fp;
315}
316
317/* Loop over all objects in a slab */
318#define for_each_object(__p, __s, __addr) \
319	for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\
320			__p += (__s)->size)
321
322/* Scan freelist */
323#define for_each_free_object(__p, __s, __free) \
324	for (__p = (__free); __p; __p = get_freepointer((__s), __p))
325
326/* Determine object index from a given position */
327static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
328{
329	return (p - addr) / s->size;
330}
331
332#ifdef CONFIG_SLUB_DEBUG
333/*
334 * Debug settings:
335 */
336#ifdef CONFIG_SLUB_DEBUG_ON
337static int slub_debug = DEBUG_DEFAULT_FLAGS;
338#else
339static int slub_debug;
340#endif
341
342static char *slub_debug_slabs;
343
344/*
345 * Object debugging
346 */
347static void print_section(char *text, u8 *addr, unsigned int length)
348{
349	int i, offset;
350	int newline = 1;
351	char ascii[17];
352
353	ascii[16] = 0;
354
355	for (i = 0; i < length; i++) {
356		if (newline) {
357			printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
358			newline = 0;
359		}
360		printk(KERN_CONT " %02x", addr[i]);
361		offset = i % 16;
362		ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
363		if (offset == 15) {
364			printk(KERN_CONT " %s\n", ascii);
365			newline = 1;
366		}
367	}
368	if (!newline) {
369		i %= 16;
370		while (i < 16) {
371			printk(KERN_CONT "   ");
372			ascii[i] = ' ';
373			i++;
374		}
375		printk(KERN_CONT " %s\n", ascii);
376	}
377}
378
379static struct track *get_track(struct kmem_cache *s, void *object,
380	enum track_item alloc)
381{
382	struct track *p;
383
384	if (s->offset)
385		p = object + s->offset + sizeof(void *);
386	else
387		p = object + s->inuse;
388
389	return p + alloc;
390}
391
392static void set_track(struct kmem_cache *s, void *object,
393				enum track_item alloc, void *addr)
394{
395	struct track *p;
396
397	if (s->offset)
398		p = object + s->offset + sizeof(void *);
399	else
400		p = object + s->inuse;
401
402	p += alloc;
403	if (addr) {
404		p->addr = addr;
405		p->cpu = smp_processor_id();
406		p->pid = current ? current->pid : -1;
407		p->when = jiffies;
408	} else
409		memset(p, 0, sizeof(struct track));
410}
411
412static void init_tracking(struct kmem_cache *s, void *object)
413{
414	if (!(s->flags & SLAB_STORE_USER))
415		return;
416
417	set_track(s, object, TRACK_FREE, NULL);
418	set_track(s, object, TRACK_ALLOC, NULL);
419}
420
421static void print_track(const char *s, struct track *t)
422{
423	if (!t->addr)
424		return;
425
426	printk(KERN_ERR "INFO: %s in ", s);
427	__print_symbol("%s", (unsigned long)t->addr);
428	printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
429}
430
431static void print_tracking(struct kmem_cache *s, void *object)
432{
433	if (!(s->flags & SLAB_STORE_USER))
434		return;
435
436	print_track("Allocated", get_track(s, object, TRACK_ALLOC));
437	print_track("Freed", get_track(s, object, TRACK_FREE));
438}
439
440static void print_page_info(struct page *page)
441{
442	printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n",
443		page, page->inuse, page->freelist, page->flags);
444
445}
446
447static void slab_bug(struct kmem_cache *s, char *fmt, ...)
448{
449	va_list args;
450	char buf[100];
451
452	va_start(args, fmt);
453	vsnprintf(buf, sizeof(buf), fmt, args);
454	va_end(args);
455	printk(KERN_ERR "========================================"
456			"=====================================\n");
457	printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
458	printk(KERN_ERR "----------------------------------------"
459			"-------------------------------------\n\n");
460}
461
462static void slab_fix(struct kmem_cache *s, char *fmt, ...)
463{
464	va_list args;
465	char buf[100];
466
467	va_start(args, fmt);
468	vsnprintf(buf, sizeof(buf), fmt, args);
469	va_end(args);
470	printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
471}
472
473static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
474{
475	unsigned int off;	/* Offset of last byte */
476	u8 *addr = page_address(page);
477
478	print_tracking(s, p);
479
480	print_page_info(page);
481
482	printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
483			p, p - addr, get_freepointer(s, p));
484
485	if (p > addr + 16)
486		print_section("Bytes b4", p - 16, 16);
487
488	print_section("Object", p, min(s->objsize, 128));
489
490	if (s->flags & SLAB_RED_ZONE)
491		print_section("Redzone", p + s->objsize,
492			s->inuse - s->objsize);
493
494	if (s->offset)
495		off = s->offset + sizeof(void *);
496	else
497		off = s->inuse;
498
499	if (s->flags & SLAB_STORE_USER)
500		off += 2 * sizeof(struct track);
501
502	if (off != s->size)
503		/* Beginning of the filler is the free pointer */
504		print_section("Padding", p + off, s->size - off);
505
506	dump_stack();
507}
508
509static void object_err(struct kmem_cache *s, struct page *page,
510			u8 *object, char *reason)
511{
512	slab_bug(s, reason);
513	print_trailer(s, page, object);
514}
515
516static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
517{
518	va_list args;
519	char buf[100];
520
521	va_start(args, fmt);
522	vsnprintf(buf, sizeof(buf), fmt, args);
523	va_end(args);
524	slab_bug(s, fmt);
525	print_page_info(page);
526	dump_stack();
527}
528
529static void init_object(struct kmem_cache *s, void *object, int active)
530{
531	u8 *p = object;
532
533	if (s->flags & __OBJECT_POISON) {
534		memset(p, POISON_FREE, s->objsize - 1);
535		p[s->objsize - 1] = POISON_END;
536	}
537
538	if (s->flags & SLAB_RED_ZONE)
539		memset(p + s->objsize,
540			active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
541			s->inuse - s->objsize);
542}
543
544static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
545{
546	while (bytes) {
547		if (*start != (u8)value)
548			return start;
549		start++;
550		bytes--;
551	}
552	return NULL;
553}
554
555static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
556						void *from, void *to)
557{
558	slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
559	memset(from, data, to - from);
560}
561
562static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
563			u8 *object, char *what,
564			u8 *start, unsigned int value, unsigned int bytes)
565{
566	u8 *fault;
567	u8 *end;
568
569	fault = check_bytes(start, value, bytes);
570	if (!fault)
571		return 1;
572
573	end = start + bytes;
574	while (end > fault && end[-1] == value)
575		end--;
576
577	slab_bug(s, "%s overwritten", what);
578	printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
579					fault, end - 1, fault[0], value);
580	print_trailer(s, page, object);
581
582	restore_bytes(s, what, value, fault, end);
583	return 0;
584}
585
586/*
587 * Object layout:
588 *
589 * object address
590 * 	Bytes of the object to be managed.
591 * 	If the freepointer may overlay the object then the free
592 * 	pointer is the first word of the object.
593 *
594 * 	Poisoning uses 0x6b (POISON_FREE) and the last byte is
595 * 	0xa5 (POISON_END)
596 *
597 * object + s->objsize
598 * 	Padding to reach word boundary. This is also used for Redzoning.
599 * 	Padding is extended by another word if Redzoning is enabled and
600 * 	objsize == inuse.
601 *
602 * 	We fill with 0xbb (RED_INACTIVE) for inactive objects and with
603 * 	0xcc (RED_ACTIVE) for objects in use.
604 *
605 * object + s->inuse
606 * 	Meta data starts here.
607 *
608 * 	A. Free pointer (if we cannot overwrite object on free)
609 * 	B. Tracking data for SLAB_STORE_USER
610 * 	C. Padding to reach required alignment boundary or at minimum
611 * 		one word if debugging is on to be able to detect writes
612 * 		before the word boundary.
613 *
614 *	Padding is done using 0x5a (POISON_INUSE)
615 *
616 * object + s->size
617 * 	Nothing is used beyond s->size.
618 *
619 * If slabcaches are merged then the objsize and inuse boundaries are mostly
620 * ignored. And therefore no slab options that rely on these boundaries
621 * may be used with merged slabcaches.
622 */
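
/*
 * A possible layout, assuming a 64 bit kernel, no constructor, and a cache
 * created with SLAB_RED_ZONE, SLAB_POISON and SLAB_STORE_USER and an object
 * size of 24 bytes (offsets are byte offsets from the object address):
 *
 *	 0 .. 23	object payload (0x6b poison, last byte 0xa5, when free)
 *	24 .. 31	red zone (0xbb inactive / 0xcc active), s->inuse == 32
 *	32 .. 39	free pointer (s->offset == 32)
 *	40 .. 87	two struct track records (TRACK_ALLOC, TRACK_FREE)
 *	88 .. 		0x5a padding up to s->size, if any space remains
 */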
623
624static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
625{
626	unsigned long off = s->inuse;	/* The end of info */
627
628	if (s->offset)
629		/* Freepointer is placed after the object. */
630		off += sizeof(void *);
631
632	if (s->flags & SLAB_STORE_USER)
633		/* We also have user information there */
634		off += 2 * sizeof(struct track);
635
636	if (s->size == off)
637		return 1;
638
639	return check_bytes_and_report(s, page, p, "Object padding",
640				p + off, POISON_INUSE, s->size - off);
641}
642
643static int slab_pad_check(struct kmem_cache *s, struct page *page)
644{
645	u8 *start;
646	u8 *fault;
647	u8 *end;
648	int length;
649	int remainder;
650
651	if (!(s->flags & SLAB_POISON))
652		return 1;
653
654	start = page_address(page);
655	end = start + (PAGE_SIZE << s->order);
656	length = s->objects * s->size;
657	remainder = end - (start + length);
658	if (!remainder)
659		return 1;
660
661	fault = check_bytes(start + length, POISON_INUSE, remainder);
662	if (!fault)
663		return 1;
664	while (end > fault && end[-1] == POISON_INUSE)
665		end--;
666
667	slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
668	print_section("Padding", start, length);
669
670	restore_bytes(s, "slab padding", POISON_INUSE, start, end);
671	return 0;
672}
673
674static int check_object(struct kmem_cache *s, struct page *page,
675					void *object, int active)
676{
677	u8 *p = object;
678	u8 *endobject = object + s->objsize;
679
680	if (s->flags & SLAB_RED_ZONE) {
681		unsigned int red =
682			active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
683
684		if (!check_bytes_and_report(s, page, object, "Redzone",
685			endobject, red, s->inuse - s->objsize))
686			return 0;
687	} else {
688		if ((s->flags & SLAB_POISON) && s->objsize < s->inuse)
689			check_bytes_and_report(s, page, p, "Alignment padding", endobject,
690				POISON_INUSE, s->inuse - s->objsize);
691	}
692
693	if (s->flags & SLAB_POISON) {
694		if (!active && (s->flags & __OBJECT_POISON) &&
695			(!check_bytes_and_report(s, page, p, "Poison", p,
696					POISON_FREE, s->objsize - 1) ||
697			 !check_bytes_and_report(s, page, p, "Poison",
698				p + s->objsize - 1, POISON_END, 1)))
699			return 0;
700		/*
701		 * check_pad_bytes cleans up on its own.
702		 */
703		check_pad_bytes(s, page, p);
704	}
705
706	if (!s->offset && active)
707		/*
708		 * Object and freepointer overlap. Cannot check
709		 * freepointer while object is allocated.
710		 */
711		return 1;
712
713	/* Check free pointer validity */
714	if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
715		object_err(s, page, p, "Freepointer corrupt");
716		/*
717		 * No choice but to zap it and thus lose the remainder
718		 * of the free objects in this slab. May cause
719		 * another error because the object count is now wrong.
720		 */
721		set_freepointer(s, p, NULL);
722		return 0;
723	}
724	return 1;
725}
726
727static int check_slab(struct kmem_cache *s, struct page *page)
728{
729	VM_BUG_ON(!irqs_disabled());
730
731	if (!PageSlab(page)) {
732		slab_err(s, page, "Not a valid slab page");
733		return 0;
734	}
735	if (page->inuse > s->objects) {
736		slab_err(s, page, "inuse %u > max %u",
737			page->inuse, s->objects);
738		return 0;
739	}
740	/* Slab_pad_check fixes things up after itself */
741	slab_pad_check(s, page);
742	return 1;
743}
744
745/*
746 * Determine if a certain object on a page is on the freelist. Must hold the
747 * slab lock to guarantee that the chains are in a consistent state.
748 */
749static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
750{
751	int nr = 0;
752	void *fp = page->freelist;
753	void *object = NULL;
754
755	while (fp && nr <= s->objects) {
756		if (fp == search)
757			return 1;
758		if (!check_valid_pointer(s, page, fp)) {
759			if (object) {
760				object_err(s, page, object,
761					"Freechain corrupt");
762				set_freepointer(s, object, NULL);
763				break;
764			} else {
765				slab_err(s, page, "Freepointer corrupt");
766				page->freelist = NULL;
767				page->inuse = s->objects;
768				slab_fix(s, "Freelist cleared");
769				return 0;
770			}
771			break;
772		}
773		object = fp;
774		fp = get_freepointer(s, object);
775		nr++;
776	}
777
778	if (page->inuse != s->objects - nr) {
779		slab_err(s, page, "Wrong object count. Counter is %d but "
780			"counted were %d", page->inuse, s->objects - nr);
781		page->inuse = s->objects - nr;
782		slab_fix(s, "Object count adjusted.");
783	}
784	return search == NULL;
785}
786
787static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc)
788{
789	if (s->flags & SLAB_TRACE) {
790		printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
791			s->name,
792			alloc ? "alloc" : "free",
793			object, page->inuse,
794			page->freelist);
795
796		if (!alloc)
797			print_section("Object", (void *)object, s->objsize);
798
799		dump_stack();
800	}
801}
802
803/*
804 * Tracking of fully allocated slabs for debugging purposes.
805 */
806static void add_full(struct kmem_cache_node *n, struct page *page)
807{
808	spin_lock(&n->list_lock);
809	list_add(&page->lru, &n->full);
810	spin_unlock(&n->list_lock);
811}
812
813static void remove_full(struct kmem_cache *s, struct page *page)
814{
815	struct kmem_cache_node *n;
816
817	if (!(s->flags & SLAB_STORE_USER))
818		return;
819
820	n = get_node(s, page_to_nid(page));
821
822	spin_lock(&n->list_lock);
823	list_del(&page->lru);
824	spin_unlock(&n->list_lock);
825}
826
827static void setup_object_debug(struct kmem_cache *s, struct page *page,
828								void *object)
829{
830	if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
831		return;
832
833	init_object(s, object, 0);
834	init_tracking(s, object);
835}
836
837static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
838						void *object, void *addr)
839{
840	if (!check_slab(s, page))
841		goto bad;
842
843	if (object && !on_freelist(s, page, object)) {
844		object_err(s, page, object, "Object already allocated");
845		goto bad;
846	}
847
848	if (!check_valid_pointer(s, page, object)) {
849		object_err(s, page, object, "Freelist Pointer check fails");
850		goto bad;
851	}
852
853	if (object && !check_object(s, page, object, 0))
854		goto bad;
855
856	/* Success. Perform special debug activities for allocs */
857	if (s->flags & SLAB_STORE_USER)
858		set_track(s, object, TRACK_ALLOC, addr);
859	trace(s, page, object, 1);
860	init_object(s, object, 1);
861	return 1;
862
863bad:
864	if (PageSlab(page)) {
865		/*
866		 * If this is a slab page then lets do the best we can
867		 * to avoid issues in the future. Marking all objects
868		 * as used avoids touching the remaining objects.
869		 */
870		slab_fix(s, "Marking all objects used");
871		page->inuse = s->objects;
872		page->freelist = NULL;
873	}
874	return 0;
875}
876
877static int free_debug_processing(struct kmem_cache *s, struct page *page,
878						void *object, void *addr)
879{
880	if (!check_slab(s, page))
881		goto fail;
882
883	if (!check_valid_pointer(s, page, object)) {
884		slab_err(s, page, "Invalid object pointer 0x%p", object);
885		goto fail;
886	}
887
888	if (on_freelist(s, page, object)) {
889		object_err(s, page, object, "Object already free");
890		goto fail;
891	}
892
893	if (!check_object(s, page, object, 1))
894		return 0;
895
896	if (unlikely(s != page->slab)) {
897		if (!PageSlab(page))
898			slab_err(s, page, "Attempt to free object(0x%p) "
899				"outside of slab", object);
900		else
901		if (!page->slab) {
902			printk(KERN_ERR
903				"SLUB <none>: no slab for object 0x%p.\n",
904						object);
905			dump_stack();
906		} else
907			object_err(s, page, object,
908					"page slab pointer corrupt.");
909		goto fail;
910	}
911
912	/* Special debug activities for freeing objects */
913	if (!SlabFrozen(page) && !page->freelist)
914		remove_full(s, page);
915	if (s->flags & SLAB_STORE_USER)
916		set_track(s, object, TRACK_FREE, addr);
917	trace(s, page, object, 0);
918	init_object(s, object, 0);
919	return 1;
920
921fail:
922	slab_fix(s, "Object at 0x%p not freed", object);
923	return 0;
924}
925
926static int __init setup_slub_debug(char *str)
927{
928	slub_debug = DEBUG_DEFAULT_FLAGS;
929	if (*str++ != '=' || !*str)
930		/*
931		 * No options specified. Switch on full debugging.
932		 */
933		goto out;
934
935	if (*str == ',')
936		/*
937		 * No options but restriction on slabs. This means full
938		 * debugging for slabs matching a pattern.
939		 */
940		goto check_slabs;
941
942	slub_debug = 0;
943	if (*str == '-')
944		/*
945		 * Switch off all debugging measures.
946		 */
947		goto out;
948
949	/*
950	 * Determine which debug features should be switched on
951	 */
952	for (; *str && *str != ','; str++) {
953		switch (tolower(*str)) {
954		case 'f':
955			slub_debug |= SLAB_DEBUG_FREE;
956			break;
957		case 'z':
958			slub_debug |= SLAB_RED_ZONE;
959			break;
960		case 'p':
961			slub_debug |= SLAB_POISON;
962			break;
963		case 'u':
964			slub_debug |= SLAB_STORE_USER;
965			break;
966		case 't':
967			slub_debug |= SLAB_TRACE;
968			break;
969		default:
970			printk(KERN_ERR "slub_debug option '%c' "
971				"unknown. skipped\n", *str);
972		}
973	}
974
975check_slabs:
976	if (*str == ',')
977		slub_debug_slabs = str + 1;
978out:
979	return 1;
980}
981
982__setup("slub_debug", setup_slub_debug);
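
/*
 * For reference, the syntax accepted above allows e.g.:
 *
 *	slub_debug			enable all debug options on all slabs
 *	slub_debug=-			switch all debugging off
 *	slub_debug=FZ			sanity checks and red zoning only
 *	slub_debug=,dentry		full debugging, but only for caches
 *					whose name starts with "dentry"
 *	slub_debug=P,kmalloc-		poisoning for the kmalloc caches
 */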
983
984static unsigned long kmem_cache_flags(unsigned long objsize,
985	unsigned long flags, const char *name,
986	void (*ctor)(struct kmem_cache *, void *))
987{
988	/*
989	 * The page->offset field is only 16 bit wide. This is an offset
990	 * in units of words from the beginning of an object. If the slab
991	 * size is bigger then we cannot move the free pointer behind the
992	 * object anymore.
993	 *
994	 * On 32 bit platforms the limit is 256k. On 64bit platforms
995	 * the limit is 512k.
996	 *
997	 * Debugging or ctor may create a need to move the free
998	 * pointer. Fail if this happens.
999	 */
1000	if (objsize >= 65535 * sizeof(void *)) {
1001		BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON |
1002				SLAB_STORE_USER | SLAB_DESTROY_BY_RCU));
1003		BUG_ON(ctor);
1004	} else {
1005		/*
1006		 * Enable debugging if selected on the kernel commandline.
1007		 */
1008		if (slub_debug && (!slub_debug_slabs ||
1009		    strncmp(slub_debug_slabs, name,
1010		    	strlen(slub_debug_slabs)) == 0))
1011				flags |= slub_debug;
1012	}
1013
1014	return flags;
1015}
1016#else
1017static inline void setup_object_debug(struct kmem_cache *s,
1018			struct page *page, void *object) {}
1019
1020static inline int alloc_debug_processing(struct kmem_cache *s,
1021	struct page *page, void *object, void *addr) { return 0; }
1022
1023static inline int free_debug_processing(struct kmem_cache *s,
1024	struct page *page, void *object, void *addr) { return 0; }
1025
1026static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1027			{ return 1; }
1028static inline int check_object(struct kmem_cache *s, struct page *page,
1029			void *object, int active) { return 1; }
1030static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
1031static inline unsigned long kmem_cache_flags(unsigned long objsize,
1032	unsigned long flags, const char *name,
1033	void (*ctor)(struct kmem_cache *, void *))
1034{
1035	return flags;
1036}
1037#define slub_debug 0
1038#endif
1039/*
1040 * Slab allocation and freeing
1041 */
1042static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1043{
1044	struct page *page;
1045	int pages = 1 << s->order;
1046
1047	if (s->order)
1048		flags |= __GFP_COMP;
1049
1050	if (s->flags & SLAB_CACHE_DMA)
1051		flags |= SLUB_DMA;
1052
1053	if (s->flags & SLAB_RECLAIM_ACCOUNT)
1054		flags |= __GFP_RECLAIMABLE;
1055
1056	if (node == -1)
1057		page = alloc_pages(flags, s->order);
1058	else
1059		page = alloc_pages_node(node, flags, s->order);
1060
1061	if (!page)
1062		return NULL;
1063
1064	mod_zone_page_state(page_zone(page),
1065		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
1066		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1067		pages);
1068
1069	return page;
1070}
1071
1072static void setup_object(struct kmem_cache *s, struct page *page,
1073				void *object)
1074{
1075	setup_object_debug(s, page, object);
1076	if (unlikely(s->ctor))
1077		s->ctor(s, object);
1078}
1079
1080static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1081{
1082	struct page *page;
1083	struct kmem_cache_node *n;
1084	void *start;
1085	void *last;
1086	void *p;
1087
1088	BUG_ON(flags & GFP_SLAB_BUG_MASK);
1089
1090	page = allocate_slab(s,
1091		flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1092	if (!page)
1093		goto out;
1094
1095	n = get_node(s, page_to_nid(page));
1096	if (n)
1097		atomic_long_inc(&n->nr_slabs);
1098	page->slab = s;
1099	page->flags |= 1 << PG_slab;
1100	if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
1101			SLAB_STORE_USER | SLAB_TRACE))
1102		SetSlabDebug(page);
1103
1104	start = page_address(page);
1105
1106	if (unlikely(s->flags & SLAB_POISON))
1107		memset(start, POISON_INUSE, PAGE_SIZE << s->order);
1108
1109	last = start;
1110	for_each_object(p, s, start) {
1111		setup_object(s, page, last);
1112		set_freepointer(s, last, p);
1113		last = p;
1114	}
1115	setup_object(s, page, last);
1116	set_freepointer(s, last, NULL);
1117
1118	page->freelist = start;
1119	page->inuse = 0;
1120out:
1121	return page;
1122}
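
/*
 * The loop above links every object to its successor, so for a freshly
 * allocated slab with, say, s->size == 256 and s->objects == 16 the
 * freelist simply walks the page in address order:
 *
 *	page->freelist == start
 *	start -> start + 256 -> ... -> start + 15 * 256 -> NULL
 *
 * with page->inuse == 0 until the first allocation.
 */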
1123
1124static void __free_slab(struct kmem_cache *s, struct page *page)
1125{
1126	int pages = 1 << s->order;
1127
1128	if (unlikely(SlabDebug(page))) {
1129		void *p;
1130
1131		slab_pad_check(s, page);
1132		for_each_object(p, s, page_address(page))
1133			check_object(s, page, p, 0);
1134		ClearSlabDebug(page);
1135	}
1136
1137	mod_zone_page_state(page_zone(page),
1138		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
1139		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1140		-pages);
1141
1142	__free_pages(page, s->order);
1143}
1144
1145static void rcu_free_slab(struct rcu_head *h)
1146{
1147	struct page *page;
1148
1149	page = container_of((struct list_head *)h, struct page, lru);
1150	__free_slab(page->slab, page);
1151}
1152
1153static void free_slab(struct kmem_cache *s, struct page *page)
1154{
1155	if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
1156		/*
1157		 * RCU free overloads the RCU head over the LRU
1158		 */
1159		struct rcu_head *head = (void *)&page->lru;
1160
1161		call_rcu(head, rcu_free_slab);
1162	} else
1163		__free_slab(s, page);
1164}
1165
1166static void discard_slab(struct kmem_cache *s, struct page *page)
1167{
1168	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1169
1170	atomic_long_dec(&n->nr_slabs);
1171	reset_page_mapcount(page);
1172	__ClearPageSlab(page);
1173	free_slab(s, page);
1174}
1175
1176/*
1177 * Per slab locking using the pagelock
1178 */
1179static __always_inline void slab_lock(struct page *page)
1180{
1181	bit_spin_lock(PG_locked, &page->flags);
1182}
1183
1184static __always_inline void slab_unlock(struct page *page)
1185{
1186	bit_spin_unlock(PG_locked, &page->flags);
1187}
1188
1189static __always_inline int slab_trylock(struct page *page)
1190{
1191	int rc = 1;
1192
1193	rc = bit_spin_trylock(PG_locked, &page->flags);
1194	return rc;
1195}
1196
1197/*
1198 * Management of partially allocated slabs
1199 */
1200static void add_partial(struct kmem_cache_node *n,
1201				struct page *page, int tail)
1202{
1203	spin_lock(&n->list_lock);
1204	n->nr_partial++;
1205	if (tail)
1206		list_add_tail(&page->lru, &n->partial);
1207	else
1208		list_add(&page->lru, &n->partial);
1209	spin_unlock(&n->list_lock);
1210}
1211
1212static void remove_partial(struct kmem_cache *s,
1213						struct page *page)
1214{
1215	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1216
1217	spin_lock(&n->list_lock);
1218	list_del(&page->lru);
1219	n->nr_partial--;
1220	spin_unlock(&n->list_lock);
1221}
1222
1223/*
1224 * Lock slab and remove from the partial list.
1225 *
1226 * Must hold list_lock.
1227 */
1228static inline int lock_and_freeze_slab(struct kmem_cache_node *n, struct page *page)
1229{
1230	if (slab_trylock(page)) {
1231		list_del(&page->lru);
1232		n->nr_partial--;
1233		SetSlabFrozen(page);
1234		return 1;
1235	}
1236	return 0;
1237}
1238
1239/*
1240 * Try to allocate a partial slab from a specific node.
1241 */
1242static struct page *get_partial_node(struct kmem_cache_node *n)
1243{
1244	struct page *page;
1245
1246	/*
1247	 * Racy check. If we mistakenly see no partial slabs then we
1248	 * just allocate an empty slab. If we mistakenly try to get a
1249	 * partial slab and there is none available then get_partial_node()
1250	 * will return NULL.
1251	 */
1252	if (!n || !n->nr_partial)
1253		return NULL;
1254
1255	spin_lock(&n->list_lock);
1256	list_for_each_entry(page, &n->partial, lru)
1257		if (lock_and_freeze_slab(n, page))
1258			goto out;
1259	page = NULL;
1260out:
1261	spin_unlock(&n->list_lock);
1262	return page;
1263}
1264
1265/*
1266 * Get a page from somewhere. Search in increasing NUMA distances.
1267 */
1268static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1269{
1270#ifdef CONFIG_NUMA
1271	struct zonelist *zonelist;
1272	struct zone **z;
1273	struct page *page;
1274
1275	/*
1276	 * The defrag ratio allows a configuration of the tradeoffs between
1277	 * inter node defragmentation and node local allocations. A lower
1278	 * defrag_ratio increases the tendency to do local allocations
1279	 * instead of attempting to obtain partial slabs from other nodes.
1280	 *
1281	 * If the defrag_ratio is set to 0 then kmalloc() always
1282	 * returns node local objects. If the ratio is higher then kmalloc()
1283	 * may return off node objects because partial slabs are obtained
1284	 * from other nodes and filled up.
1285	 *
1286	 * If /sys/slab/xx/defrag_ratio is set to 100 (which makes
1287	 * defrag_ratio = 1000) then every (well almost) allocation will
1288	 * first attempt to defrag slab caches on other nodes. This means
1289	 * scanning over all nodes to look for partial slabs which may be
1290	 * expensive if we do it every time we are trying to find a slab
1291	 * with available objects.
1292	 */
1293	if (!s->remote_node_defrag_ratio ||
1294			get_cycles() % 1024 > s->remote_node_defrag_ratio)
1295		return NULL;
1296
1297	zonelist = &NODE_DATA(slab_node(current->mempolicy))
1298					->node_zonelists[gfp_zone(flags)];
1299	for (z = zonelist->zones; *z; z++) {
1300		struct kmem_cache_node *n;
1301
1302		n = get_node(s, zone_to_nid(*z));
1303
1304		if (n && cpuset_zone_allowed_hardwall(*z, flags) &&
1305				n->nr_partial > MIN_PARTIAL) {
1306			page = get_partial_node(n);
1307			if (page)
1308				return page;
1309		}
1310	}
1311#endif
1312	return NULL;
1313}
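
/*
 * To put numbers on the check above: as the comment notes, a sysfs setting
 * of 100 is stored as 1000, so the remote scan is skipped only when
 * get_cycles() % 1024 exceeds 1000, i.e. on roughly 2% of the calls.
 * A setting of 10 (stored as 100) skips it about 90% of the time.
 */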
1314
1315/*
1316 * Get a partial page, lock it and return it.
1317 */
1318static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1319{
1320	struct page *page;
1321	int searchnode = (node == -1) ? numa_node_id() : node;
1322
1323	page = get_partial_node(get_node(s, searchnode));
1324	if (page || (flags & __GFP_THISNODE))
1325		return page;
1326
1327	return get_any_partial(s, flags);
1328}
1329
1330/*
1331 * Move a page back to the lists.
1332 *
1333 * Must be called with the slab lock held.
1334 *
1335 * On exit the slab lock will have been dropped.
1336 */
1337static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1338{
1339	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1340
1341	ClearSlabFrozen(page);
1342	if (page->inuse) {
1343
1344		if (page->freelist)
1345			add_partial(n, page, tail);
1346		else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER))
1347			add_full(n, page);
1348		slab_unlock(page);
1349
1350	} else {
1351		if (n->nr_partial < MIN_PARTIAL) {
1352			/*
1353			 * Adding an empty slab to the partial slabs in order
1354			 * to avoid page allocator overhead. This slab needs
1355			 * to come after the other slabs with objects in
1356			 * order to fill them up. That way the size of the
1357			 * partial list stays small. kmem_cache_shrink can
1358			 * reclaim empty slabs from the partial list.
1359			 */
1360			add_partial(n, page, 1);
1361			slab_unlock(page);
1362		} else {
1363			slab_unlock(page);
1364			discard_slab(s, page);
1365		}
1366	}
1367}
1368
1369/*
1370 * Remove the cpu slab
1371 */
1372static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1373{
1374	struct page *page = c->page;
1375	int tail = 1;
1376	/*
1377	 * Merge cpu freelist into freelist. Typically we get here
1378	 * because both freelists are empty. So this is unlikely
1379	 * to occur.
1380	 */
1381	while (unlikely(c->freelist)) {
1382		void **object;
1383
1384		tail = 0;	/* Hot objects. Put the slab first */
1385
1386		/* Retrieve object from cpu_freelist */
1387		object = c->freelist;
1388		c->freelist = c->freelist[c->offset];
1389
1390		/* And put onto the regular freelist */
1391		object[c->offset] = page->freelist;
1392		page->freelist = object;
1393		page->inuse--;
1394	}
1395	c->page = NULL;
1396	unfreeze_slab(s, page, tail);
1397}
1398
1399static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1400{
1401	slab_lock(c->page);
1402	deactivate_slab(s, c);
1403}
1404
1405/*
1406 * Flush cpu slab.
1407 * Called from IPI handler with interrupts disabled.
1408 */
1409static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
1410{
1411	struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
1412
1413	if (likely(c && c->page))
1414		flush_slab(s, c);
1415}
1416
1417static void flush_cpu_slab(void *d)
1418{
1419	struct kmem_cache *s = d;
1420
1421	__flush_cpu_slab(s, smp_processor_id());
1422}
1423
1424static void flush_all(struct kmem_cache *s)
1425{
1426#ifdef CONFIG_SMP
1427	on_each_cpu(flush_cpu_slab, s, 1, 1);
1428#else
1429	unsigned long flags;
1430
1431	local_irq_save(flags);
1432	flush_cpu_slab(s);
1433	local_irq_restore(flags);
1434#endif
1435}
1436
1437/*
1438 * Check if the objects in a per cpu structure fit numa
1439 * locality expectations.
1440 */
1441static inline int node_match(struct kmem_cache_cpu *c, int node)
1442{
1443#ifdef CONFIG_NUMA
1444	if (node != -1 && c->node != node)
1445		return 0;
1446#endif
1447	return 1;
1448}
1449
1450/*
1451 * Slow path. The lockless freelist is empty or we need to perform
1452 * debugging duties.
1453 *
1454 * Interrupts are disabled.
1455 *
1456 * Processing is still very fast if new objects have been freed to the
1457 * regular freelist. In that case we simply take over the regular freelist
1458 * as the lockless freelist and zap the regular freelist.
1459 *
1460 * If that is not working then we fall back to the partial lists. We take the
1461 * first element of the freelist as the object to allocate now and move the
1462 * rest of the freelist to the lockless freelist.
1463 *
1464 * And if we were unable to get a new slab from the partial slab lists then
1465 * we need to allocate a new slab. This is slowest path since we may sleep.
1466 */
1467static void *__slab_alloc(struct kmem_cache *s,
1468		gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c)
1469{
1470	void **object;
1471	struct page *new;
1472
1473	if (!c->page)
1474		goto new_slab;
1475
1476	slab_lock(c->page);
1477	if (unlikely(!node_match(c, node)))
1478		goto another_slab;
1479load_freelist:
1480	object = c->page->freelist;
1481	if (unlikely(!object))
1482		goto another_slab;
1483	if (unlikely(SlabDebug(c->page)))
1484		goto debug;
1485
1487	c->freelist = object[c->offset];
1488	c->page->inuse = s->objects;
1489	c->page->freelist = NULL;
1490	c->node = page_to_nid(c->page);
1491	slab_unlock(c->page);
1492	return object;
1493
1494another_slab:
1495	deactivate_slab(s, c);
1496
1497new_slab:
1498	new = get_partial(s, gfpflags, node);
1499	if (new) {
1500		c->page = new;
1501		goto load_freelist;
1502	}
1503
1504	if (gfpflags & __GFP_WAIT)
1505		local_irq_enable();
1506
1507	new = new_slab(s, gfpflags, node);
1508
1509	if (gfpflags & __GFP_WAIT)
1510		local_irq_disable();
1511
1512	if (new) {
1513		c = get_cpu_slab(s, smp_processor_id());
1514		if (c->page)
1515			flush_slab(s, c);
1516		slab_lock(new);
1517		SetSlabFrozen(new);
1518		c->page = new;
1519		goto load_freelist;
1520	}
1521	return NULL;
1522debug:
1523	object = c->page->freelist;
1524	if (!alloc_debug_processing(s, c->page, object, addr))
1525		goto another_slab;
1526
1527	c->page->inuse++;
1528	c->page->freelist = object[c->offset];
1529	c->node = -1;
1530	slab_unlock(c->page);
1531	return object;
1532}
1533
1534/*
1535 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
1536 * have the fastpath folded into their functions. So no function call
1537 * overhead for requests that can be satisfied on the fastpath.
1538 *
1539 * The fastpath works by first checking if the lockless freelist can be used.
1540 * If not then __slab_alloc is called for slow processing.
1541 *
1542 * Otherwise we can simply pick the next object from the lockless free list.
1543 */
1544static __always_inline void *slab_alloc(struct kmem_cache *s,
1545		gfp_t gfpflags, int node, void *addr)
1546{
1547	void **object;
1548	unsigned long flags;
1549	struct kmem_cache_cpu *c;
1550
1551	local_irq_save(flags);
1552	c = get_cpu_slab(s, smp_processor_id());
1553	if (unlikely(!c->freelist || !node_match(c, node)))
1555		object = __slab_alloc(s, gfpflags, node, addr, c);
1557	else {
1558		object = c->freelist;
1559		c->freelist = object[c->offset];
1560	}
1561	local_irq_restore(flags);
1562
1563	if (unlikely((gfpflags & __GFP_ZERO) && object))
1564		memset(object, 0, c->objsize);
1565
1566	return object;
1567}
1568
1569void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1570{
1571	return slab_alloc(s, gfpflags, -1, __builtin_return_address(0));
1572}
1573EXPORT_SYMBOL(kmem_cache_alloc);
1574
1575#ifdef CONFIG_NUMA
1576void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1577{
1578	return slab_alloc(s, gfpflags, node, __builtin_return_address(0));
1579}
1580EXPORT_SYMBOL(kmem_cache_alloc_node);
1581#endif
1582
1583/*
1584 * Slow path handling. This may still be called frequently since objects
1585 * have a longer lifetime than the cpu slabs in most processing loads.
1586 *
1587 * So we still attempt to reduce cache line usage. Just take the slab
1588 * lock and free the item. If there is no additional partial page
1589 * handling required then we can return immediately.
1590 */
1591static void __slab_free(struct kmem_cache *s, struct page *page,
1592				void *x, void *addr, unsigned int offset)
1593{
1594	void *prior;
1595	void **object = (void *)x;
1596
1597	slab_lock(page);
1598
1599	if (unlikely(SlabDebug(page)))
1600		goto debug;
1601checks_ok:
1602	prior = object[offset] = page->freelist;
1603	page->freelist = object;
1604	page->inuse--;
1605
1606	if (unlikely(SlabFrozen(page)))
1607		goto out_unlock;
1608
1609	if (unlikely(!page->inuse))
1610		goto slab_empty;
1611
1612	/*
1613	 * Objects left in the slab. If it
1614	 * was not on the partial list before
1615	 * then add it.
1616	 */
1617	if (unlikely(!prior))
1618		add_partial(get_node(s, page_to_nid(page)), page, 1);
1619
1620out_unlock:
1621	slab_unlock(page);
1622	return;
1623
1624slab_empty:
1625	if (prior)
1626		/*
1627		 * Slab still on the partial list.
1628		 */
1629		remove_partial(s, page);
1630
1631	slab_unlock(page);
1632	discard_slab(s, page);
1633	return;
1634
1635debug:
1636	if (!free_debug_processing(s, page, x, addr))
1637		goto out_unlock;
1638	goto checks_ok;
1639}
1640
1641/*
1642 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
1643 * can perform fastpath freeing without additional function calls.
1644 *
1645 * The fastpath is only possible if we are freeing to the current cpu slab
1646 * of this processor. This is typically the case if we have just allocated
1647 * the item before.
1648 *
1649 * If fastpath is not possible then fall back to __slab_free where we deal
1650 * with all sorts of special processing.
1651 */
1652static __always_inline void slab_free(struct kmem_cache *s,
1653			struct page *page, void *x, void *addr)
1654{
1655	void **object = (void *)x;
1656	unsigned long flags;
1657	struct kmem_cache_cpu *c;
1658
1659	local_irq_save(flags);
1660	debug_check_no_locks_freed(object, s->objsize);
1661	c = get_cpu_slab(s, smp_processor_id());
1662	if (likely(page == c->page && c->node >= 0)) {
1663		object[c->offset] = c->freelist;
1664		c->freelist = object;
1665	} else
1666		__slab_free(s, page, x, addr, c->offset);
1667
1668	local_irq_restore(flags);
1669}
1670
1671void kmem_cache_free(struct kmem_cache *s, void *x)
1672{
1673	struct page *page;
1674
1675	page = virt_to_head_page(x);
1676
1677	slab_free(s, page, x, __builtin_return_address(0));
1678}
1679EXPORT_SYMBOL(kmem_cache_free);
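
/*
 * Typical use of the interfaces above from other kernel code, as a rough
 * sketch (error handling omitted, the "foo" names are placeholders):
 *
 *	struct kmem_cache *foo_cache;
 *
 *	foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
 *					SLAB_HWCACHE_ALIGN, NULL);
 *
 *	struct foo *f = kmem_cache_alloc(foo_cache, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cache, f);
 *	kmem_cache_destroy(foo_cache);
 */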
1680
1681/* Figure out on which slab object the object resides */
1682static struct page *get_object_page(const void *x)
1683{
1684	struct page *page = virt_to_head_page(x);
1685
1686	if (!PageSlab(page))
1687		return NULL;
1688
1689	return page;
1690}
1691
1692/*
1693 * Object placement in a slab is made very easy because we always start at
1694 * offset 0. If we tune the size of the object to the alignment then we can
1695 * get the required alignment by putting one properly sized object after
1696 * another.
1697 *
1698 * Notice that the allocation order determines the sizes of the per cpu
1699 * caches. Each processor has always one slab available for allocations.
1700 * Increasing the allocation order reduces the number of times that slabs
1701 * must be moved on and off the partial lists and is therefore a factor in
1702 * locking overhead.
1703 */
1704
1705/*
1706 * Minimum / Maximum order of slab pages. This influences locking overhead
1707 * and slab fragmentation. A higher order reduces the number of partial slabs
1708 * and increases the number of allocations possible without having to
1709 * take the list_lock.
1710 */
1711static int slub_min_order;
1712static int slub_max_order = DEFAULT_MAX_ORDER;
1713static int slub_min_objects = DEFAULT_MIN_OBJECTS;
1714
1715/*
1716 * Merge control. If this is set then no merging of slab caches will occur.
1717 * (Could be removed. This was introduced to pacify the merge skeptics.)
1718 */
1719static int slub_nomerge;
1720
1721/*
1722 * Calculate the order of allocation given an slab object size.
1723 *
1724 * The order of allocation has significant impact on performance and other
1725 * system components. Generally order 0 allocations should be preferred since
1726 * order 0 does not cause fragmentation in the page allocator. Larger objects
1727 * can be problematic to put into order 0 slabs because there may be too much
1728 * unused space left. We go to a higher order if more than 1/8th of the slab
1729 * would be wasted.
1730 *
1731 * In order to reach satisfactory performance we must ensure that a minimum
1732 * number of objects is in one slab. Otherwise we may generate too much
1733 * activity on the partial lists which requires taking the list_lock. This is
1734 * less a concern for large slabs though which are rarely used.
1735 *
1736 * slub_max_order specifies the order where we begin to stop considering the
1737 * number of objects in a slab as critical. If we reach slub_max_order then
1738 * we try to keep the page order as low as possible. So we accept more waste
1739 * of space in favor of a small page order.
1740 *
1741 * Higher order allocations also allow the placement of more objects in a
1742 * slab and thereby reduce object handling overhead. If the user has
1743 * requested a higher minimum order then we start with that one instead of
1744 * the smallest order which will fit the object.
1745 */
1746static inline int slab_order(int size, int min_objects,
1747				int max_order, int fract_leftover)
1748{
1749	int order;
1750	int rem;
1751	int min_order = slub_min_order;
1752
1753	for (order = max(min_order,
1754				fls(min_objects * size - 1) - PAGE_SHIFT);
1755			order <= max_order; order++) {
1756
1757		unsigned long slab_size = PAGE_SIZE << order;
1758
1759		if (slab_size < min_objects * size)
1760			continue;
1761
1762		rem = slab_size % size;
1763
1764		if (rem <= slab_size / fract_leftover)
1765			break;
1766
1767	}
1768
1769	return order;
1770}
1771
1772static inline int calculate_order(int size)
1773{
1774	int order;
1775	int min_objects;
1776	int fraction;
1777
1778	/*
1779	 * Attempt to find best configuration for a slab. This
1780	 * works by first attempting to generate a layout with
1781	 * the best configuration and backing off gradually.
1782	 *
1783	 * First we reduce the acceptable waste in a slab. Then
1784	 * we reduce the minimum objects required in a slab.
1785	 */
1786	min_objects = slub_min_objects;
1787	while (min_objects > 1) {
1788		fraction = 8;
1789		while (fraction >= 4) {
1790			order = slab_order(size, min_objects,
1791						slub_max_order, fraction);
1792			if (order <= slub_max_order)
1793				return order;
1794			fraction /= 2;
1795		}
1796		min_objects /= 2;
1797	}
1798
1799	/*
1800	 * We were unable to place multiple objects in a slab. Now
1801	 * lets see if we can place a single object there.
1802	 */
1803	order = slab_order(size, 1, slub_max_order, 1);
1804	if (order <= slub_max_order)
1805		return order;
1806
1807	/*
1808	 * Doh this slab cannot be placed using slub_max_order.
1809	 */
1810	order = slab_order(size, 1, MAX_ORDER, 1);
1811	if (order <= MAX_ORDER)
1812		return order;
1813	return -ENOSYS;
1814}
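
/*
 * Two worked examples, assuming 4K pages and the defaults above
 * (slub_min_order == 0, slub_max_order == 1, slub_min_objects == 4):
 *
 * - size == 192: an order 0 slab holds 21 objects and wastes 64 bytes,
 *   well under 1/8 of 4096, so calculate_order() returns 0.
 *
 * - size == 704: order 0 would waste 576 bytes (more than 4096 / 8), so
 *   we move on to order 1, which holds 11 objects and wastes only 448
 *   bytes; calculate_order() returns 1.
 */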
1815
1816/*
1817 * Figure out what the alignment of the objects will be.
1818 */
1819static unsigned long calculate_alignment(unsigned long flags,
1820		unsigned long align, unsigned long size)
1821{
1822	/*
1823	 * If the user wants hardware cache aligned objects then
1824	 * follow that suggestion if the object is sufficiently
1825	 * large.
1826	 *
1827	 * The hardware cache alignment cannot override the
1828	 * specified alignment though. If that is greater
1829	 * then use it.
1830	 */
1831	if ((flags & SLAB_HWCACHE_ALIGN) &&
1832			size > cache_line_size() / 2)
1833		return max_t(unsigned long, align, cache_line_size());
1834
1835	if (align < ARCH_SLAB_MINALIGN)
1836		return ARCH_SLAB_MINALIGN;
1837
1838	return ALIGN(align, sizeof(void *));
1839}
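
/*
 * For instance, assuming 64 byte cache lines and an 8 byte
 * ARCH_SLAB_MINALIGN: a 100 byte object created with SLAB_HWCACHE_ALIGN
 * ends up cache line aligned (64), while a 24 byte object with the same
 * flag stays at the minimum alignment of 8 since it is not larger than
 * half a cache line.
 */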
1840
1841static void init_kmem_cache_cpu(struct kmem_cache *s,
1842			struct kmem_cache_cpu *c)
1843{
1844	c->page = NULL;
1845	c->freelist = NULL;
1846	c->node = 0;
1847	c->offset = s->offset / sizeof(void *);
1848	c->objsize = s->objsize;
1849}
1850
1851static void init_kmem_cache_node(struct kmem_cache_node *n)
1852{
1853	n->nr_partial = 0;
1854	atomic_long_set(&n->nr_slabs, 0);
1855	spin_lock_init(&n->list_lock);
1856	INIT_LIST_HEAD(&n->partial);
1857#ifdef CONFIG_SLUB_DEBUG
1858	INIT_LIST_HEAD(&n->full);
1859#endif
1860}
1861
1862#ifdef CONFIG_SMP
1863/*
1864 * Per cpu array for per cpu structures.
1865 *
1866 * The per cpu array places all kmem_cache_cpu structures from one processor
1867 * close together meaning that it becomes possible that multiple per cpu
1868 * structures are contained in one cacheline. This may be particularly
1869 * beneficial for the kmalloc caches.
1870 *
1871 * A desktop system typically has around 60-80 slabs. With 100 here we are
1872 * likely able to get per cpu structures for all caches from the array defined
1873 * here. We must be able to cover all kmalloc caches during bootstrap.
1874 *
1875 * If the per cpu array is exhausted then fall back to kmalloc
1876 * of individual cachelines. No sharing is possible then.
1877 */
1878#define NR_KMEM_CACHE_CPU 100
1879
1880static DEFINE_PER_CPU(struct kmem_cache_cpu,
1881				kmem_cache_cpu)[NR_KMEM_CACHE_CPU];
1882
1883static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
1884static cpumask_t kmem_cach_cpu_free_init_once = CPU_MASK_NONE;
1885
1886static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
1887							int cpu, gfp_t flags)
1888{
1889	struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu);
1890
1891	if (c)
1892		per_cpu(kmem_cache_cpu_free, cpu) =
1893				(void *)c->freelist;
1894	else {
1895		/* Table overflow: So allocate ourselves */
1896		c = kmalloc_node(
1897			ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()),
1898			flags, cpu_to_node(cpu));
1899		if (!c)
1900			return NULL;
1901	}
1902
1903	init_kmem_cache_cpu(s, c);
1904	return c;
1905}
1906
1907static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu)
1908{
1909	if (c < per_cpu(kmem_cache_cpu, cpu) ||
1910			c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
1911		kfree(c);
1912		return;
1913	}
1914	c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu);
1915	per_cpu(kmem_cache_cpu_free, cpu) = c;
1916}
1917
1918static void free_kmem_cache_cpus(struct kmem_cache *s)
1919{
1920	int cpu;
1921
1922	for_each_online_cpu(cpu) {
1923		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
1924
1925		if (c) {
1926			s->cpu_slab[cpu] = NULL;
1927			free_kmem_cache_cpu(c, cpu);
1928		}
1929	}
1930}
1931
1932static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
1933{
1934	int cpu;
1935
1936	for_each_online_cpu(cpu) {
1937		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
1938
1939		if (c)
1940			continue;
1941
1942		c = alloc_kmem_cache_cpu(s, cpu, flags);
1943		if (!c) {
1944			free_kmem_cache_cpus(s);
1945			return 0;
1946		}
1947		s->cpu_slab[cpu] = c;
1948	}
1949	return 1;
1950}
1951
1952/*
1953 * Initialize the per cpu array.
1954 */
1955static void init_alloc_cpu_cpu(int cpu)
1956{
1957	int i;
1958
1959	if (cpu_isset(cpu, kmem_cach_cpu_free_init_once))
1960		return;
1961
1962	for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--)
1963		free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu);
1964
1965	cpu_set(cpu, kmem_cach_cpu_free_init_once);
1966}
1967
1968static void __init init_alloc_cpu(void)
1969{
1970	int cpu;
1971
1972	for_each_online_cpu(cpu)
1973		init_alloc_cpu_cpu(cpu);
1974}
1975
1976#else
1977static inline void free_kmem_cache_cpus(struct kmem_cache *s) {}
1978static inline void init_alloc_cpu(void) {}
1979
1980static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
1981{
1982	init_kmem_cache_cpu(s, &s->cpu_slab);
1983	return 1;
1984}
1985#endif
1986
1987#ifdef CONFIG_NUMA
1988/*
1989 * No kmalloc_node yet so do it by hand. We know that this is the first
1990 * slab on the node for this slabcache. There are no concurrent accesses
1991 * possible.
1992 *
1993 * Note that this function only works on the kmalloc_node_cache
1994 * when allocating for the kmalloc_node_cache. This is used for bootstrapping
1995 * memory on a fresh node that has no slab structures yet.
1996 */
1997static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
1998							   int node)
1999{
2000	struct page *page;
2001	struct kmem_cache_node *n;
2002	unsigned long flags;
2003
2004	BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
2005
2006	page = new_slab(kmalloc_caches, gfpflags, node);
2007
2008	BUG_ON(!page);
2009	if (page_to_nid(page) != node) {
2010		printk(KERN_ERR "SLUB: Unable to allocate memory from "
2011				"node %d\n", node);
2012		printk(KERN_ERR "SLUB: Allocating a useless per node structure "
2013				"in order to be able to continue\n");
2014	}
2015
2016	n = page->freelist;
2017	BUG_ON(!n);
2018	page->freelist = get_freepointer(kmalloc_caches, n);
2019	page->inuse++;
2020	kmalloc_caches->node[node] = n;
2021#ifdef CONFIG_SLUB_DEBUG
2022	init_object(kmalloc_caches, n, 1);
2023	init_tracking(kmalloc_caches, n);
2024#endif
2025	init_kmem_cache_node(n);
2026	atomic_long_inc(&n->nr_slabs);
2027	/*
2028	 * lockdep requires consistent irq usage for each lock
2029	 * so even though there cannot be a race this early in
2030	 * the boot sequence, we still disable irqs.
2031	 */
2032	local_irq_save(flags);
2033	add_partial(n, page, 0);
2034	local_irq_restore(flags);
2035	return n;
2036}
2037
2038static void free_kmem_cache_nodes(struct kmem_cache *s)
2039{
2040	int node;
2041
2042	for_each_node_state(node, N_NORMAL_MEMORY) {
2043		struct kmem_cache_node *n = s->node[node];
2044		if (n && n != &s->local_node)
2045			kmem_cache_free(kmalloc_caches, n);
2046		s->node[node] = NULL;
2047	}
2048}
2049
2050static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2051{
2052	int node;
2053	int local_node;
2054
2055	if (slab_state >= UP)
2056		local_node = page_to_nid(virt_to_page(s));
2057	else
2058		local_node = 0;
2059
2060	for_each_node_state(node, N_NORMAL_MEMORY) {
2061		struct kmem_cache_node *n;
2062
2063		if (local_node == node)
2064			n = &s->local_node;
2065		else {
2066			if (slab_state == DOWN) {
2067				n = early_kmem_cache_node_alloc(gfpflags,
2068								node);
2069				continue;
2070			}
2071			n = kmem_cache_alloc_node(kmalloc_caches,
2072							gfpflags, node);
2073
2074			if (!n) {
2075				free_kmem_cache_nodes(s);
2076				return 0;
2077			}
2078
2079		}
2080		s->node[node] = n;
2081		init_kmem_cache_node(n);
2082	}
2083	return 1;
2084}
2085#else
2086static void free_kmem_cache_nodes(struct kmem_cache *s)
2087{
2088}
2089
2090static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2091{
2092	init_kmem_cache_node(&s->local_node);
2093	return 1;
2094}
2095#endif
2096
2097/*
2098 * calculate_sizes() determines the order and the distribution of data within
2099 * a slab object.
2100 */
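/*
 * A rough, purely illustrative example (the exact numbers depend on the
 * debug options and the architecture): for an already word aligned
 * objsize of 64 with SLAB_RED_ZONE, SLAB_POISON and SLAB_STORE_USER set
 * and no constructor, the code below lays an object out as
 *
 *	[64 byte object][red zone word][free pointer]
 *	[2 x struct track][red zone padding word]
 *
 * and the total is finally rounded up to the calculated alignment.
 */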
2101static int calculate_sizes(struct kmem_cache *s)
2102{
2103	unsigned long flags = s->flags;
2104	unsigned long size = s->objsize;
2105	unsigned long align = s->align;
2106
2107	/*
2108	 * Determine if we can poison the object itself. If the user of
2109	 * the slab may touch the object after free or before allocation
2110	 * then we should never poison the object itself.
2111	 */
2112	if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
2113			!s->ctor)
2114		s->flags |= __OBJECT_POISON;
2115	else
2116		s->flags &= ~__OBJECT_POISON;
2117
2118	/*
2119	 * Round up object size to the next word boundary. We can only
2120	 * place the free pointer at word boundaries and this determines
2121	 * the possible location of the free pointer.
2122	 */
2123	size = ALIGN(size, sizeof(void *));
2124
2125#ifdef CONFIG_SLUB_DEBUG
2126	/*
2127	 * If we are Redzoning then check if there is some space between the
2128	 * end of the object and the free pointer. If not then add an
2129	 * additional word to have some bytes to store Redzone information.
2130	 */
2131	if ((flags & SLAB_RED_ZONE) && size == s->objsize)
2132		size += sizeof(void *);
2133#endif
2134
2135	/*
2136	 * With that we have determined the number of bytes in actual use
2137	 * by the object. This is the potential offset to the free pointer.
2138	 */
2139	s->inuse = size;
2140
2141	if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
2142		s->ctor)) {
2143		/*
2144		 * Relocate free pointer after the object if it is not
2145		 * permitted to overwrite the first word of the object on
2146		 * kmem_cache_free.
2147		 *
2148		 * This is the case if we do RCU, have a constructor or
2149		 * are poisoning the objects.
2150		 */
2151		s->offset = size;
2152		size += sizeof(void *);
2153	}
2154
2155#ifdef CONFIG_SLUB_DEBUG
2156	if (flags & SLAB_STORE_USER)
2157		/*
2158		 * Need to store information about allocs and frees after
2159		 * the object.
2160		 */
2161		size += 2 * sizeof(struct track);
2162
2163	if (flags & SLAB_RED_ZONE)
2164		/*
2165		 * Add some empty padding so that we can catch
2166		 * overwrites from earlier objects rather than let
2167		 * tracking information or the free pointer be
2168		 * corrupted if a user writes before the start
2169		 * of the object.
2170		 */
2171		size += sizeof(void *);
2172#endif
2173
2174	/*
2175	 * Determine the alignment based on various parameters that the
2176	 * user specified and the dynamic determination of cache line size
2177	 * on bootup.
2178	 */
2179	align = calculate_alignment(flags, align, s->objsize);
2180
2181	/*
2182	 * SLUB stores one object immediately after another beginning from
2183	 * offset 0. In order to align the objects we have to simply size
2184	 * each object to conform to the alignment.
2185	 */
2186	size = ALIGN(size, align);
2187	s->size = size;
2188
2189	s->order = calculate_order(size);
2190	if (s->order < 0)
2191		return 0;
2192
2193	/*
2194	 * Determine the number of objects per slab
2195	 */
2196	s->objects = (PAGE_SIZE << s->order) / size;
2197
2198	return !!s->objects;
2199
2200}
2201
2202static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2203		const char *name, size_t size,
2204		size_t align, unsigned long flags,
2205		void (*ctor)(struct kmem_cache *, void *))
2206{
2207	memset(s, 0, kmem_size);
2208	s->name = name;
2209	s->ctor = ctor;
2210	s->objsize = size;
2211	s->align = align;
2212	s->flags = kmem_cache_flags(size, flags, name, ctor);
2213
2214	if (!calculate_sizes(s))
2215		goto error;
2216
2217	s->refcount = 1;
2218#ifdef CONFIG_NUMA
2219	s->remote_node_defrag_ratio = 100;
2220#endif
2221	if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
2222		goto error;
2223
2224	if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA))
2225		return 1;
2226	free_kmem_cache_nodes(s);
2227error:
2228	if (flags & SLAB_PANIC)
2229		panic("Cannot create slab %s size=%lu realsize=%u "
2230			"order=%u offset=%u flags=%lx\n",
2231			s->name, (unsigned long)size, s->size, s->order,
2232			s->offset, flags);
2233	return 0;
2234}
2235
2236/*
2237 * Check if a given pointer is valid
2238 */
2239int kmem_ptr_validate(struct kmem_cache *s, const void *object)
2240{
2241	struct page *page;
2242
2243	page = get_object_page(object);
2244
2245	if (!page || s != page->slab)
2246		/* No slab or wrong slab */
2247		return 0;
2248
2249	if (!check_valid_pointer(s, page, object))
2250		return 0;
2251
2252	/*
2253	 * We could also check if the object is on the slab's freelist.
2254	 * But this would be too expensive and it seems that the main
2255	 * purpose of kmem_ptr_validate() is to check if the object belongs
2256	 * to a certain slab.
2257	 */
2258	return 1;
2259}
2260EXPORT_SYMBOL(kmem_ptr_validate);
2261
2262/*
2263 * Determine the size of a slab object
2264 */
2265unsigned int kmem_cache_size(struct kmem_cache *s)
2266{
2267	return s->objsize;
2268}
2269EXPORT_SYMBOL(kmem_cache_size);
2270
2271const char *kmem_cache_name(struct kmem_cache *s)
2272{
2273	return s->name;
2274}
2275EXPORT_SYMBOL(kmem_cache_name);
2276
2277/*
2278 * Attempt to free all slabs on a node. Return the number of slabs we
2279 * were unable to free.
2280 */
2281static int free_list(struct kmem_cache *s, struct kmem_cache_node *n,
2282			struct list_head *list)
2283{
2284	int slabs_inuse = 0;
2285	unsigned long flags;
2286	struct page *page, *h;
2287
2288	spin_lock_irqsave(&n->list_lock, flags);
2289	list_for_each_entry_safe(page, h, list, lru)
2290		if (!page->inuse) {
2291			list_del(&page->lru);
2292			discard_slab(s, page);
2293		} else
2294			slabs_inuse++;
2295	spin_unlock_irqrestore(&n->list_lock, flags);
2296	return slabs_inuse;
2297}
2298
2299/*
2300 * Release all resources used by a slab cache.
2301 */
2302static inline int kmem_cache_close(struct kmem_cache *s)
2303{
2304	int node;
2305
2306	flush_all(s);
2307
2308	/* Attempt to free all objects */
2309	free_kmem_cache_cpus(s);
2310	for_each_node_state(node, N_NORMAL_MEMORY) {
2311		struct kmem_cache_node *n = get_node(s, node);
2312
2313		n->nr_partial -= free_list(s, n, &n->partial);
2314		if (atomic_long_read(&n->nr_slabs))
2315			return 1;
2316	}
2317	free_kmem_cache_nodes(s);
2318	return 0;
2319}
2320
2321/*
2322 * Close a cache and release the kmem_cache structure
2323 * (must be used for caches created using kmem_cache_create)
2324 */
2325void kmem_cache_destroy(struct kmem_cache *s)
2326{
2327	down_write(&slub_lock);
2328	s->refcount--;
2329	if (!s->refcount) {
2330		list_del(&s->list);
2331		up_write(&slub_lock);
2332		if (kmem_cache_close(s))
2333			WARN_ON(1);
2334		sysfs_slab_remove(s);
2335	} else
2336		up_write(&slub_lock);
2337}
2338EXPORT_SYMBOL(kmem_cache_destroy);
2339
2340/********************************************************************
2341 *		Kmalloc subsystem
2342 *******************************************************************/
2343
2344struct kmem_cache kmalloc_caches[PAGE_SHIFT] __cacheline_aligned;
2345EXPORT_SYMBOL(kmalloc_caches);
2346
2347#ifdef CONFIG_ZONE_DMA
2348static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT];
2349#endif
2350
2351static int __init setup_slub_min_order(char *str)
2352{
2353	get_option(&str, &slub_min_order);
2354
2355	return 1;
2356}
2357
2358__setup("slub_min_order=", setup_slub_min_order);
2359
2360static int __init setup_slub_max_order(char *str)
2361{
2362	get_option(&str, &slub_max_order);
2363
2364	return 1;
2365}
2366
2367__setup("slub_max_order=", setup_slub_max_order);
2368
2369static int __init setup_slub_min_objects(char *str)
2370{
2371	get_option(&str, &slub_min_objects);
2372
2373	return 1;
2374}
2375
2376__setup("slub_min_objects=", setup_slub_min_objects);
2377
2378static int __init setup_slub_nomerge(char *str)
2379{
2380	slub_nomerge = 1;
2381	return 1;
2382}
2383
2384__setup("slub_nomerge", setup_slub_nomerge);
2385
2386static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
2387		const char *name, int size, gfp_t gfp_flags)
2388{
2389	unsigned int flags = 0;
2390
2391	if (gfp_flags & SLUB_DMA)
2392		flags = SLAB_CACHE_DMA;
2393
2394	down_write(&slub_lock);
2395	if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
2396			flags, NULL))
2397		goto panic;
2398
2399	list_add(&s->list, &slab_caches);
2400	up_write(&slub_lock);
2401	if (sysfs_slab_add(s))
2402		goto panic;
2403	return s;
2404
2405panic:
2406	panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
2407}
2408
2409#ifdef CONFIG_ZONE_DMA
2410
2411static void sysfs_add_func(struct work_struct *w)
2412{
2413	struct kmem_cache *s;
2414
2415	down_write(&slub_lock);
2416	list_for_each_entry(s, &slab_caches, list) {
2417		if (s->flags & __SYSFS_ADD_DEFERRED) {
2418			s->flags &= ~__SYSFS_ADD_DEFERRED;
2419			sysfs_slab_add(s);
2420		}
2421	}
2422	up_write(&slub_lock);
2423}
2424
2425static DECLARE_WORK(sysfs_add_work, sysfs_add_func);
2426
2427static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2428{
2429	struct kmem_cache *s;
2430	char *text;
2431	size_t realsize;
2432
2433	s = kmalloc_caches_dma[index];
2434	if (s)
2435		return s;
2436
2437	/* Dynamically create dma cache */
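	/*
	 * Callers that cannot sleep must not block on slub_lock. If the
	 * trylock fails we simply return whatever is currently in the table
	 * (possibly NULL) and let the allocation fail.
	 */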
2438	if (flags & __GFP_WAIT)
2439		down_write(&slub_lock);
2440	else {
2441		if (!down_write_trylock(&slub_lock))
2442			goto out;
2443	}
2444
2445	if (kmalloc_caches_dma[index])
2446		goto unlock_out;
2447
2448	realsize = kmalloc_caches[index].objsize;
2449	text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", (unsigned int)realsize);
2450	s = kmalloc(kmem_size, flags & ~SLUB_DMA);
2451
2452	if (!s || !text || !kmem_cache_open(s, flags, text,
2453			realsize, ARCH_KMALLOC_MINALIGN,
2454			SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) {
2455		kfree(s);
2456		kfree(text);
2457		goto unlock_out;
2458	}
2459
2460	list_add(&s->list, &slab_caches);
2461	kmalloc_caches_dma[index] = s;
2462
2463	schedule_work(&sysfs_add_work);
2464
2465unlock_out:
2466	up_write(&slub_lock);
2467out:
2468	return kmalloc_caches_dma[index];
2469}
2470#endif
2471
2472/*
2473 * Conversion table for small slab sizes / 8 to the index in the
2474 * kmalloc array. This is necessary for slabs < 192 since we have non power
2475 * of two cache sizes there. The size of larger slabs can be determined using
2476 * fls.
2477 */
2478static s8 size_index[24] = {
2479	3,	/* 8 */
2480	4,	/* 16 */
2481	5,	/* 24 */
2482	5,	/* 32 */
2483	6,	/* 40 */
2484	6,	/* 48 */
2485	6,	/* 56 */
2486	6,	/* 64 */
2487	1,	/* 72 */
2488	1,	/* 80 */
2489	1,	/* 88 */
2490	1,	/* 96 */
2491	7,	/* 104 */
2492	7,	/* 112 */
2493	7,	/* 120 */
2494	7,	/* 128 */
2495	2,	/* 136 */
2496	2,	/* 144 */
2497	2,	/* 152 */
2498	2,	/* 160 */
2499	2,	/* 168 */
2500	2,	/* 176 */
2501	2,	/* 184 */
2502	2	/* 192 */
2503};
2504
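/*
 * For example, a request for 100 bytes yields size_index[(100 - 1) / 8] ==
 * size_index[12] == 7, i.e. the kmalloc-128 cache, while a request for
 * 1000 bytes uses fls(999) == 10, i.e. the kmalloc-1024 cache.
 */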
2505static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2506{
2507	int index;
2508
2509	if (size <= 192) {
2510		if (!size)
2511			return ZERO_SIZE_PTR;
2512
2513		index = size_index[(size - 1) / 8];
2514	} else
2515		index = fls(size - 1);
2516
2517#ifdef CONFIG_ZONE_DMA
2518	if (unlikely((flags & SLUB_DMA)))
2519		return dma_kmalloc_cache(index, flags);
2520
2521#endif
2522	return &kmalloc_caches[index];
2523}
2524
2525void *__kmalloc(size_t size, gfp_t flags)
2526{
2527	struct kmem_cache *s;
2528
2529	if (unlikely(size > PAGE_SIZE / 2))
2530		return (void *)__get_free_pages(flags | __GFP_COMP,
2531							get_order(size));
2532
2533	s = get_slab(size, flags);
2534
2535	if (unlikely(ZERO_OR_NULL_PTR(s)))
2536		return s;
2537
2538	return slab_alloc(s, flags, -1, __builtin_return_address(0));
2539}
2540EXPORT_SYMBOL(__kmalloc);
2541
2542#ifdef CONFIG_NUMA
2543void *__kmalloc_node(size_t size, gfp_t flags, int node)
2544{
2545	struct kmem_cache *s;
2546
2547	if (unlikely(size > PAGE_SIZE / 2))
2548		return (void *)__get_free_pages(flags | __GFP_COMP,
2549							get_order(size));
2550
2551	s = get_slab(size, flags);
2552
2553	if (unlikely(ZERO_OR_NULL_PTR(s)))
2554		return s;
2555
2556	return slab_alloc(s, flags, node, __builtin_return_address(0));
2557}
2558EXPORT_SYMBOL(__kmalloc_node);
2559#endif
2560
2561size_t ksize(const void *object)
2562{
2563	struct page *page;
2564	struct kmem_cache *s;
2565
2566	BUG_ON(!object);
2567	if (unlikely(object == ZERO_SIZE_PTR))
2568		return 0;
2569
2570	page = virt_to_head_page(object);
2571	BUG_ON(!page);
2572
2573	if (unlikely(!PageSlab(page)))
2574		return PAGE_SIZE << compound_order(page);
2575
2576	s = page->slab;
2577	BUG_ON(!s);
2578
2579	/*
2580	 * Debugging requires use of the padding between object
2581	 * and whatever may come after it.
2582	 */
2583	if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
2584		return s->objsize;
2585
2586	/*
2587	 * If we have the need to store the freelist pointer
2588	 * back there or track user information then we can
2589	 * only use the space before that information.
2590	 */
2591	if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
2592		return s->inuse;
2593
2594	/*
2595	 * Else we can use all the padding etc for the allocation
2596	 */
2597	return s->size;
2598}
2599EXPORT_SYMBOL(ksize);
2600
2601void kfree(const void *x)
2602{
2603	struct page *page;
2604
2605	if (unlikely(ZERO_OR_NULL_PTR(x)))
2606		return;
2607
2608	page = virt_to_head_page(x);
2609	if (unlikely(!PageSlab(page))) {
2610		put_page(page);
2611		return;
2612	}
2613	slab_free(page->slab, page, (void *)x, __builtin_return_address(0));
2614}
2615EXPORT_SYMBOL(kfree);
2616
2617static unsigned long count_partial(struct kmem_cache_node *n)
2618{
2619	unsigned long flags;
2620	unsigned long x = 0;
2621	struct page *page;
2622
2623	spin_lock_irqsave(&n->list_lock, flags);
2624	list_for_each_entry(page, &n->partial, lru)
2625		x += page->inuse;
2626	spin_unlock_irqrestore(&n->list_lock, flags);
2627	return x;
2628}
2629
2630/*
2631 * kmem_cache_shrink removes empty slabs from the partial lists and sorts
2632 * the remaining slabs by the number of items in use. The slabs with the
2633 * most items in use come first. New allocations will then fill those up
2634 * and thus they can be removed from the partial lists.
2635 *
2636 * The slabs with the least items are placed last. This results in them
2637 * being allocated from last, increasing the chance that the last objects
2638 * are freed in them.
2639 */
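/*
 * In short: empty slabs are discarded and the remaining partial slabs are
 * bucketed by page->inuse into slabs_by_inuse[] and then spliced back in
 * descending order of use.
 */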
2640int kmem_cache_shrink(struct kmem_cache *s)
2641{
2642	int node;
2643	int i;
2644	struct kmem_cache_node *n;
2645	struct page *page;
2646	struct page *t;
2647	struct list_head *slabs_by_inuse =
2648		kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL);
2649	unsigned long flags;
2650
2651	if (!slabs_by_inuse)
2652		return -ENOMEM;
2653
2654	flush_all(s);
2655	for_each_node_state(node, N_NORMAL_MEMORY) {
2656		n = get_node(s, node);
2657
2658		if (!n->nr_partial)
2659			continue;
2660
2661		for (i = 0; i < s->objects; i++)
2662			INIT_LIST_HEAD(slabs_by_inuse + i);
2663
2664		spin_lock_irqsave(&n->list_lock, flags);
2665
2666		/*
2667		 * Build lists indexed by the items in use in each slab.
2668		 *
2669		 * Note that concurrent frees may occur while we hold the
2670		 * list_lock. page->inuse here is the upper limit.
2671		 */
2672		list_for_each_entry_safe(page, t, &n->partial, lru) {
2673			if (!page->inuse && slab_trylock(page)) {
2674				/*
2675				 * Must hold slab lock here because slab_free
2676				 * may have freed the last object and be
2677				 * waiting to release the slab.
2678				 */
2679				list_del(&page->lru);
2680				n->nr_partial--;
2681				slab_unlock(page);
2682				discard_slab(s, page);
2683			} else {
2684				list_move(&page->lru,
2685				slabs_by_inuse + page->inuse);
2686			}
2687		}
2688
2689		/*
2690		 * Rebuild the partial list with the slabs filled up most
2691		 * first and the least used slabs at the end.
2692		 */
2693		for (i = s->objects - 1; i >= 0; i--)
2694			list_splice(slabs_by_inuse + i, n->partial.prev);
2695
2696		spin_unlock_irqrestore(&n->list_lock, flags);
2697	}
2698
2699	kfree(slabs_by_inuse);
2700	return 0;
2701}
2702EXPORT_SYMBOL(kmem_cache_shrink);
2703
2704#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
2705static int slab_mem_going_offline_callback(void *arg)
2706{
2707	struct kmem_cache *s;
2708
2709	down_read(&slub_lock);
2710	list_for_each_entry(s, &slab_caches, list)
2711		kmem_cache_shrink(s);
2712	up_read(&slub_lock);
2713
2714	return 0;
2715}
2716
2717static void slab_mem_offline_callback(void *arg)
2718{
2719	struct kmem_cache_node *n;
2720	struct kmem_cache *s;
2721	struct memory_notify *marg = arg;
2722	int offline_node;
2723
2724	offline_node = marg->status_change_nid;
2725
2726	/*
2727	 * If the node still has available memory then we still need its
2728	 * kmem_cache_node structure. Nothing to do here.
2729	 */
2730	if (offline_node < 0)
2731		return;
2732
2733	down_read(&slub_lock);
2734	list_for_each_entry(s, &slab_caches, list) {
2735		n = get_node(s, offline_node);
2736		if (n) {
2737			/*
2738			 * if n->nr_slabs > 0, slabs still exist on the node
2739			 * that is going down. We were unable to free them,
2740			 * and offline_pages() shouldn't have called this
2741			 * callback. So, we must fail.
2742			 */
2743			BUG_ON(atomic_long_read(&n->nr_slabs));
2744
2745			s->node[offline_node] = NULL;
2746			kmem_cache_free(kmalloc_caches, n);
2747		}
2748	}
2749	up_read(&slub_lock);
2750}
2751
2752static int slab_mem_going_online_callback(void *arg)
2753{
2754	struct kmem_cache_node *n;
2755	struct kmem_cache *s;
2756	struct memory_notify *marg = arg;
2757	int nid = marg->status_change_nid;
2758	int ret = 0;
2759
2760	/*
2761	 * If the node's memory is already available, then kmem_cache_node is
2762	 * already created. Nothing to do.
2763	 */
2764	if (nid < 0)
2765		return 0;
2766
2767	/*
2768	 * We are bringing a node online. No memory is available yet. We must
2769	 * allocate a kmem_cache_node structure in order to bring the node
2770	 * online.
2771	 */
2772	down_read(&slub_lock);
2773	list_for_each_entry(s, &slab_caches, list) {
2774		/*
2775		 * XXX: kmem_cache_alloc_node will fall back to other nodes
2776		 *      since memory is not yet available from the node that
2777		 *      is brought up.
2778		 */
2779		n = kmem_cache_alloc(kmalloc_caches, GFP_KERNEL);
2780		if (!n) {
2781			ret = -ENOMEM;
2782			goto out;
2783		}
2784		init_kmem_cache_node(n);
2785		s->node[nid] = n;
2786	}
2787out:
2788	up_read(&slub_lock);
2789	return ret;
2790}
2791
2792static int slab_memory_callback(struct notifier_block *self,
2793				unsigned long action, void *arg)
2794{
2795	int ret = 0;
2796
2797	switch (action) {
2798	case MEM_GOING_ONLINE:
2799		ret = slab_mem_going_online_callback(arg);
2800		break;
2801	case MEM_GOING_OFFLINE:
2802		ret = slab_mem_going_offline_callback(arg);
2803		break;
2804	case MEM_OFFLINE:
2805	case MEM_CANCEL_ONLINE:
2806		slab_mem_offline_callback(arg);
2807		break;
2808	case MEM_ONLINE:
2809	case MEM_CANCEL_OFFLINE:
2810		break;
2811	}
2812
2813	ret = notifier_from_errno(ret);
2814	return ret;
2815}
2816
2817#endif /* CONFIG_MEMORY_HOTPLUG */
2818
2819/********************************************************************
2820 *			Basic setup of slabs
2821 *******************************************************************/
2822
2823void __init kmem_cache_init(void)
2824{
2825	int i;
2826	int caches = 0;
2827
2828	init_alloc_cpu();
2829
2830#ifdef CONFIG_NUMA
2831	/*
2832	 * Must first have the slab cache available for the allocations of the
2833	 * struct kmem_cache_node's. There is special bootstrap code in
2834	 * kmem_cache_open for slab_state == DOWN.
2835	 */
2836	create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
2837		sizeof(struct kmem_cache_node), GFP_KERNEL);
2838	kmalloc_caches[0].refcount = -1;
2839	caches++;
2840
2841	hotplug_memory_notifier(slab_memory_callback, 1);
2842#endif
2843
2844	/* Able to allocate the per node structures */
2845	slab_state = PARTIAL;
2846
2847	/* Caches that are not of power-of-two size */
2848	if (KMALLOC_MIN_SIZE <= 64) {
2849		create_kmalloc_cache(&kmalloc_caches[1],
2850				"kmalloc-96", 96, GFP_KERNEL);
2851		caches++;
2852	}
2853	if (KMALLOC_MIN_SIZE <= 128) {
2854		create_kmalloc_cache(&kmalloc_caches[2],
2855				"kmalloc-192", 192, GFP_KERNEL);
2856		caches++;
2857	}
2858
2859	for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) {
2860		create_kmalloc_cache(&kmalloc_caches[i],
2861			"kmalloc", 1 << i, GFP_KERNEL);
2862		caches++;
2863	}
2864
2865
2866	/*
2867	 * Patch up the size_index table if we have strange large alignment
2868	 * requirements for the kmalloc array. This is only the case for
2869	 * mips it seems. The standard arches will not generate any code here.
2870	 *
2871	 * Largest permitted alignment is 256 bytes due to the way we
2872	 * handle the index determination for the smaller caches.
2873	 *
2874	 * Make sure that nothing crazy happens if someone starts tinkering
2875	 * around with ARCH_KMALLOC_MINALIGN
2876	 */
2877	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
2878		(KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
2879
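	/*
	 * For example, if KMALLOC_MIN_SIZE were 64, the slots for sizes
	 * 8..56 would all be redirected to the smallest kmalloc cache that
	 * still honours that minimum size.
	 */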
2880	for (i = 8; i < KMALLOC_MIN_SIZE; i += 8)
2881		size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW;
2882
2883	slab_state = UP;
2884
2885	/* Provide the correct kmalloc names now that the caches are up */
2886	for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++)
2887		kmalloc_caches[i].name =
2888			kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
2889
2890#ifdef CONFIG_SMP
2891	register_cpu_notifier(&slab_notifier);
2892	kmem_size = offsetof(struct kmem_cache, cpu_slab) +
2893				nr_cpu_ids * sizeof(struct kmem_cache_cpu *);
2894#else
2895	kmem_size = sizeof(struct kmem_cache);
2896#endif
2897
2898
2899	printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
2900		" CPUs=%d, Nodes=%d\n",
2901		caches, cache_line_size(),
2902		slub_min_order, slub_max_order, slub_min_objects,
2903		nr_cpu_ids, nr_node_ids);
2904}
2905
2906/*
2907 * Find a mergeable slab cache
2908 */
2909static int slab_unmergeable(struct kmem_cache *s)
2910{
2911	if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
2912		return 1;
2913
2914	if (s->ctor)
2915		return 1;
2916
2917	/*
2918	 * We may have set a slab to be unmergeable during bootstrap.
2919	 */
2920	if (s->refcount < 0)
2921		return 1;
2922
2923	return 0;
2924}
2925
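/*
 * A cache is a merge candidate if it is large enough for the requested
 * size, wastes less than one word per object, agrees on every flag in
 * SLUB_MERGE_SAME and does not violate the requested alignment.
 */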
2926static struct kmem_cache *find_mergeable(size_t size,
2927		size_t align, unsigned long flags, const char *name,
2928		void (*ctor)(struct kmem_cache *, void *))
2929{
2930	struct kmem_cache *s;
2931
2932	if (slub_nomerge || (flags & SLUB_NEVER_MERGE))
2933		return NULL;
2934
2935	if (ctor)
2936		return NULL;
2937
2938	size = ALIGN(size, sizeof(void *));
2939	align = calculate_alignment(flags, align, size);
2940	size = ALIGN(size, align);
2941	flags = kmem_cache_flags(size, flags, name, NULL);
2942
2943	list_for_each_entry(s, &slab_caches, list) {
2944		if (slab_unmergeable(s))
2945			continue;
2946
2947		if (size > s->size)
2948			continue;
2949
2950		if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
2951				continue;
2952		/*
2953		 * Check if alignment is compatible.
2954		 * Courtesy of Adrian Drzewiecki
2955		 */
2956		if ((s->size & ~(align - 1)) != s->size)
2957			continue;
2958
2959		if (s->size - size >= sizeof(void *))
2960			continue;
2961
2962		return s;
2963	}
2964	return NULL;
2965}
2966
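/*
 * kmem_cache_create() first tries to piggyback on an existing compatible
 * cache found by find_mergeable(). In that case only the refcount and the
 * object sizes are adjusted and a sysfs alias is created. Otherwise a
 * fresh kmem_cache is allocated and opened.
 */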
2967struct kmem_cache *kmem_cache_create(const char *name, size_t size,
2968		size_t align, unsigned long flags,
2969		void (*ctor)(struct kmem_cache *, void *))
2970{
2971	struct kmem_cache *s;
2972
2973	down_write(&slub_lock);
2974	s = find_mergeable(size, align, flags, name, ctor);
2975	if (s) {
2976		int cpu;
2977
2978		s->refcount++;
2979		/*
2980		 * Adjust the object sizes so that we clear
2981		 * the complete object on kzalloc.
2982		 */
2983		s->objsize = max(s->objsize, (int)size);
2984
2985		/*
2986		 * And then we need to update the object size in the
2987		 * per cpu structures
2988		 */
2989		for_each_online_cpu(cpu)
2990			get_cpu_slab(s, cpu)->objsize = s->objsize;
2991		s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
2992		up_write(&slub_lock);
2993		if (sysfs_slab_alias(s, name))
2994			goto err;
2995		return s;
2996	}
2997	s = kmalloc(kmem_size, GFP_KERNEL);
2998	if (s) {
2999		if (kmem_cache_open(s, GFP_KERNEL, name,
3000				size, align, flags, ctor)) {
3001			list_add(&s->list, &slab_caches);
3002			up_write(&slub_lock);
3003			if (sysfs_slab_add(s))
3004				goto err;
3005			return s;
3006		}
3007		kfree(s);
3008	}
3009	up_write(&slub_lock);
3010
3011err:
3012	if (flags & SLAB_PANIC)
3013		panic("Cannot create slabcache %s\n", name);
3014	else
3015		s = NULL;
3016	return s;
3017}
3018EXPORT_SYMBOL(kmem_cache_create);
3019
3020#ifdef CONFIG_SMP
3021/*
3022 * Use the cpu notifier to insure that the cpu slabs are flushed when
3023 * necessary.
3024 */
3025static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
3026		unsigned long action, void *hcpu)
3027{
3028	long cpu = (long)hcpu;
3029	struct kmem_cache *s;
3030	unsigned long flags;
3031
3032	switch (action) {
3033	case CPU_UP_PREPARE:
3034	case CPU_UP_PREPARE_FROZEN:
3035		init_alloc_cpu_cpu(cpu);
3036		down_read(&slub_lock);
3037		list_for_each_entry(s, &slab_caches, list)
3038			s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu,
3039							GFP_KERNEL);
3040		up_read(&slub_lock);
3041		break;
3042
3043	case CPU_UP_CANCELED:
3044	case CPU_UP_CANCELED_FROZEN:
3045	case CPU_DEAD:
3046	case CPU_DEAD_FROZEN:
3047		down_read(&slub_lock);
3048		list_for_each_entry(s, &slab_caches, list) {
3049			struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3050
3051			local_irq_save(flags);
3052			__flush_cpu_slab(s, cpu);
3053			local_irq_restore(flags);
3054			free_kmem_cache_cpu(c, cpu);
3055			s->cpu_slab[cpu] = NULL;
3056		}
3057		up_read(&slub_lock);
3058		break;
3059	default:
3060		break;
3061	}
3062	return NOTIFY_OK;
3063}
3064
3065static struct notifier_block __cpuinitdata slab_notifier = {
3066	&slab_cpuup_callback, NULL, 0
3067};
3068
3069#endif
3070
3071void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
3072{
3073	struct kmem_cache *s;
3074
3075	if (unlikely(size > PAGE_SIZE / 2))
3076		return (void *)__get_free_pages(gfpflags | __GFP_COMP,
3077							get_order(size));
3078	s = get_slab(size, gfpflags);
3079
3080	if (unlikely(ZERO_OR_NULL_PTR(s)))
3081		return s;
3082
3083	return slab_alloc(s, gfpflags, -1, caller);
3084}
3085
3086void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3087					int node, void *caller)
3088{
3089	struct kmem_cache *s;
3090
3091	if (unlikely(size > PAGE_SIZE / 2))
3092		return (void *)__get_free_pages(gfpflags | __GFP_COMP,
3093							get_order(size));
3094	s = get_slab(size, gfpflags);
3095
3096	if (unlikely(ZERO_OR_NULL_PTR(s)))
3097		return s;
3098
3099	return slab_alloc(s, gfpflags, node, caller);
3100}
3101
3102#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
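/*
 * Validate a single slab: mark every object on the freelist in the bitmap
 * and check that it looks free, then check that each remaining (allocated)
 * object is intact.
 */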
3103static int validate_slab(struct kmem_cache *s, struct page *page,
3104						unsigned long *map)
3105{
3106	void *p;
3107	void *addr = page_address(page);
3108
3109	if (!check_slab(s, page) ||
3110			!on_freelist(s, page, NULL))
3111		return 0;
3112
3113	/* Now we know that a valid freelist exists */
3114	bitmap_zero(map, s->objects);
3115
3116	for_each_free_object(p, s, page->freelist) {
3117		set_bit(slab_index(p, s, addr), map);
3118		if (!check_object(s, page, p, 0))
3119			return 0;
3120	}
3121
3122	for_each_object(p, s, addr)
3123		if (!test_bit(slab_index(p, s, addr), map))
3124			if (!check_object(s, page, p, 1))
3125				return 0;
3126	return 1;
3127}
3128
3129static void validate_slab_slab(struct kmem_cache *s, struct page *page,
3130						unsigned long *map)
3131{
3132	if (slab_trylock(page)) {
3133		validate_slab(s, page, map);
3134		slab_unlock(page);
3135	} else
3136		printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
3137			s->name, page);
3138
3139	if (s->flags & DEBUG_DEFAULT_FLAGS) {
3140		if (!SlabDebug(page))
3141			printk(KERN_ERR "SLUB %s: SlabDebug not set "
3142				"on slab 0x%p\n", s->name, page);
3143	} else {
3144		if (SlabDebug(page))
3145			printk(KERN_ERR "SLUB %s: SlabDebug set on "
3146				"slab 0x%p\n", s->name, page);
3147	}
3148}
3149
3150static int validate_slab_node(struct kmem_cache *s,
3151		struct kmem_cache_node *n, unsigned long *map)
3152{
3153	unsigned long count = 0;
3154	struct page *page;
3155	unsigned long flags;
3156
3157	spin_lock_irqsave(&n->list_lock, flags);
3158
3159	list_for_each_entry(page, &n->partial, lru) {
3160		validate_slab_slab(s, page, map);
3161		count++;
3162	}
3163	if (count != n->nr_partial)
3164		printk(KERN_ERR "SLUB %s: %ld partial slabs counted but "
3165			"counter=%ld\n", s->name, count, n->nr_partial);
3166
3167	if (!(s->flags & SLAB_STORE_USER))
3168		goto out;
3169
3170	list_for_each_entry(page, &n->full, lru) {
3171		validate_slab_slab(s, page, map);
3172		count++;
3173	}
3174	if (count != atomic_long_read(&n->nr_slabs))
3175		printk(KERN_ERR "SLUB %s: %ld slabs counted but "
3176			"counter=%ld\n", s->name, count,
3177			atomic_long_read(&n->nr_slabs));
3178
3179out:
3180	spin_unlock_irqrestore(&n->list_lock, flags);
3181	return count;
3182}
3183
3184static long validate_slab_cache(struct kmem_cache *s)
3185{
3186	int node;
3187	unsigned long count = 0;
3188	unsigned long *map = kmalloc(BITS_TO_LONGS(s->objects) *
3189				sizeof(unsigned long), GFP_KERNEL);
3190
3191	if (!map)
3192		return -ENOMEM;
3193
3194	flush_all(s);
3195	for_each_node_state(node, N_NORMAL_MEMORY) {
3196		struct kmem_cache_node *n = get_node(s, node);
3197
3198		count += validate_slab_node(s, n, map);
3199	}
3200	kfree(map);
3201	return count;
3202}
3203
3204#ifdef SLUB_RESILIENCY_TEST
3205static void resiliency_test(void)
3206{
3207	u8 *p;
3208
3209	printk(KERN_ERR "SLUB resiliency testing\n");
3210	printk(KERN_ERR "-----------------------\n");
3211	printk(KERN_ERR "A. Corruption after allocation\n");
3212
3213	p = kzalloc(16, GFP_KERNEL);
3214	p[16] = 0x12;
3215	printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
3216			" 0x12->0x%p\n\n", p + 16);
3217
3218	validate_slab_cache(kmalloc_caches + 4);
3219
3220	/* Hmmm... The next two are dangerous */
3221	p = kzalloc(32, GFP_KERNEL);
3222	p[32 + sizeof(void *)] = 0x34;
3223	printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
3224			" 0x34 -> 0x%p\n", p);
3225	printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
3226
3227	validate_slab_cache(kmalloc_caches + 5);
3228	p = kzalloc(64, GFP_KERNEL);
3229	p += 64 + (get_cycles() & 0xff) * sizeof(void *);
3230	*p = 0x56;
3231	printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
3232									p);
3233	printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
3234	validate_slab_cache(kmalloc_caches + 6);
3235
3236	printk(KERN_ERR "\nB. Corruption after free\n");
3237	p = kzalloc(128, GFP_KERNEL);
3238	kfree(p);
3239	*p = 0x78;
3240	printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
3241	validate_slab_cache(kmalloc_caches + 7);
3242
3243	p = kzalloc(256, GFP_KERNEL);
3244	kfree(p);
3245	p[50] = 0x9a;
3246	printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
3247	validate_slab_cache(kmalloc_caches + 8);
3248
3249	p = kzalloc(512, GFP_KERNEL);
3250	kfree(p);
3251	p[512] = 0xab;
3252	printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
3253	validate_slab_cache(kmalloc_caches + 9);
3254}
3255#else
3256static void resiliency_test(void) {};
3257#endif
3258
3259/*
3260 * Generate lists of code addresses where slabcache objects are allocated
3261 * and freed.
3262 */
3263
3264struct location {
3265	unsigned long count;
3266	void *addr;
3267	long long sum_time;
3268	long min_time;
3269	long max_time;
3270	long min_pid;
3271	long max_pid;
3272	cpumask_t cpus;
3273	nodemask_t nodes;
3274};
3275
3276struct loc_track {
3277	unsigned long max;
3278	unsigned long count;
3279	struct location *loc;
3280};
3281
3282static void free_loc_track(struct loc_track *t)
3283{
3284	if (t->max)
3285		free_pages((unsigned long)t->loc,
3286			get_order(sizeof(struct location) * t->max));
3287}
3288
3289static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
3290{
3291	struct location *l;
3292	int order;
3293
3294	order = get_order(sizeof(struct location) * max);
3295
3296	l = (void *)__get_free_pages(flags, order);
3297	if (!l)
3298		return 0;
3299
3300	if (t->count) {
3301		memcpy(l, t->loc, sizeof(struct location) * t->count);
3302		free_loc_track(t);
3303	}
3304	t->max = max;
3305	t->loc = l;
3306	return 1;
3307}
3308
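/*
 * Record one alloc/free track in the location list. t->loc is kept sorted
 * by call address so that a binary search can find either an existing
 * entry to update or the position at which a new entry must be inserted.
 */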
3309static int add_location(struct loc_track *t, struct kmem_cache *s,
3310				const struct track *track)
3311{
3312	long start, end, pos;
3313	struct location *l;
3314	void *caddr;
3315	unsigned long age = jiffies - track->when;
3316
3317	start = -1;
3318	end = t->count;
3319
3320	for ( ; ; ) {
3321		pos = start + (end - start + 1) / 2;
3322
3323		/*
3324		 * There is nothing at "end". If we end up there
3325		 * we need to insert the new element before "end".
3326		 */
3327		if (pos == end)
3328			break;
3329
3330		caddr = t->loc[pos].addr;
3331		if (track->addr == caddr) {
3332
3333			l = &t->loc[pos];
3334			l->count++;
3335			if (track->when) {
3336				l->sum_time += age;
3337				if (age < l->min_time)
3338					l->min_time = age;
3339				if (age > l->max_time)
3340					l->max_time = age;
3341
3342				if (track->pid < l->min_pid)
3343					l->min_pid = track->pid;
3344				if (track->pid > l->max_pid)
3345					l->max_pid = track->pid;
3346
3347				cpu_set(track->cpu, l->cpus);
3348			}
3349			node_set(page_to_nid(virt_to_page(track)), l->nodes);
3350			return 1;
3351		}
3352
3353		if (track->addr < caddr)
3354			end = pos;
3355		else
3356			start = pos;
3357	}
3358
3359	/*
3360	 * Not found. Insert new tracking element.
3361	 */
3362	if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
3363		return 0;
3364
3365	l = t->loc + pos;
3366	if (pos < t->count)
3367		memmove(l + 1, l,
3368			(t->count - pos) * sizeof(struct location));
3369	t->count++;
3370	l->count = 1;
3371	l->addr = track->addr;
3372	l->sum_time = age;
3373	l->min_time = age;
3374	l->max_time = age;
3375	l->min_pid = track->pid;
3376	l->max_pid = track->pid;
3377	cpus_clear(l->cpus);
3378	cpu_set(track->cpu, l->cpus);
3379	nodes_clear(l->nodes);
3380	node_set(page_to_nid(virt_to_page(track)), l->nodes);
3381	return 1;
3382}
3383
3384static void process_slab(struct loc_track *t, struct kmem_cache *s,
3385		struct page *page, enum track_item alloc)
3386{
3387	void *addr = page_address(page);
3388	DECLARE_BITMAP(map, s->objects);
3389	void *p;
3390
3391	bitmap_zero(map, s->objects);
3392	for_each_free_object(p, s, page->freelist)
3393		set_bit(slab_index(p, s, addr), map);
3394
3395	for_each_object(p, s, addr)
3396		if (!test_bit(slab_index(p, s, addr), map))
3397			add_location(t, s, get_track(s, p, alloc));
3398}
3399
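/*
 * Produce the sysfs output for alloc_calls/free_calls: one line per call
 * site with its hit count, the resolved symbol and the recorded age, pid,
 * cpu and node information.
 */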
3400static int list_locations(struct kmem_cache *s, char *buf,
3401					enum track_item alloc)
3402{
3403	int len = 0;
3404	unsigned long i;
3405	struct loc_track t = { 0, 0, NULL };
3406	int node;
3407
3408	if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
3409			GFP_TEMPORARY))
3410		return sprintf(buf, "Out of memory\n");
3411
3412	/* Push back cpu slabs */
3413	flush_all(s);
3414
3415	for_each_node_state(node, N_NORMAL_MEMORY) {
3416		struct kmem_cache_node *n = get_node(s, node);
3417		unsigned long flags;
3418		struct page *page;
3419
3420		if (!atomic_long_read(&n->nr_slabs))
3421			continue;
3422
3423		spin_lock_irqsave(&n->list_lock, flags);
3424		list_for_each_entry(page, &n->partial, lru)
3425			process_slab(&t, s, page, alloc);
3426		list_for_each_entry(page, &n->full, lru)
3427			process_slab(&t, s, page, alloc);
3428		spin_unlock_irqrestore(&n->list_lock, flags);
3429	}
3430
3431	for (i = 0; i < t.count; i++) {
3432		struct location *l = &t.loc[i];
3433
3434		if (len > PAGE_SIZE - 100)
3435			break;
3436		len += sprintf(buf + len, "%7ld ", l->count);
3437
3438		if (l->addr)
3439			len += sprint_symbol(buf + len, (unsigned long)l->addr);
3440		else
3441			len += sprintf(buf + len, "<not-available>");
3442
3443		if (l->sum_time != l->min_time) {
3444			unsigned long remainder;
3445
3446			len += sprintf(buf + len, " age=%ld/%ld/%ld",
3447			l->min_time,
3448			div_long_long_rem(l->sum_time, l->count, &remainder),
3449			l->max_time);
3450		} else
3451			len += sprintf(buf + len, " age=%ld",
3452				l->min_time);
3453
3454		if (l->min_pid != l->max_pid)
3455			len += sprintf(buf + len, " pid=%ld-%ld",
3456				l->min_pid, l->max_pid);
3457		else
3458			len += sprintf(buf + len, " pid=%ld",
3459				l->min_pid);
3460
3461		if (num_online_cpus() > 1 && !cpus_empty(l->cpus) &&
3462				len < PAGE_SIZE - 60) {
3463			len += sprintf(buf + len, " cpus=");
3464			len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50,
3465					l->cpus);
3466		}
3467
3468		if (num_online_nodes() > 1 && !nodes_empty(l->nodes) &&
3469				len < PAGE_SIZE - 60) {
3470			len += sprintf(buf + len, " nodes=");
3471			len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50,
3472					l->nodes);
3473		}
3474
3475		len += sprintf(buf + len, "\n");
3476	}
3477
3478	free_loc_track(&t);
3479	if (!t.count)
3480		len += sprintf(buf, "No data\n");
3481	return len;
3482}
3483
3484enum slab_stat_type {
3485	SL_FULL,
3486	SL_PARTIAL,
3487	SL_CPU,
3488	SL_OBJECTS
3489};
3490
3491#define SO_FULL		(1 << SL_FULL)
3492#define SO_PARTIAL	(1 << SL_PARTIAL)
3493#define SO_CPU		(1 << SL_CPU)
3494#define SO_OBJECTS	(1 << SL_OBJECTS)
3495
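/*
 * Sum up slabs or objects (depending on SO_OBJECTS) of the requested kinds
 * for the sysfs files below. Per node totals are collected in nodes[];
 * per_cpu[] counts the cpu slabs on each node so that the number of full
 * slabs can be derived from nr_slabs.
 */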
3496static unsigned long slab_objects(struct kmem_cache *s,
3497			char *buf, unsigned long flags)
3498{
3499	unsigned long total = 0;
3500	int cpu;
3501	int node;
3502	int x;
3503	unsigned long *nodes;
3504	unsigned long *per_cpu;
3505
3506	nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
3507	per_cpu = nodes + nr_node_ids;
3508
3509	for_each_possible_cpu(cpu) {
3510		struct page *page;
3511		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3512
3513		if (!c)
3514			continue;
3515
3516		page = c->page;
3517		node = c->node;
3518		if (node < 0)
3519			continue;
3520		if (page) {
3521			if (flags & SO_CPU) {
3522				if (flags & SO_OBJECTS)
3523					x = page->inuse;
3524				else
3525					x = 1;
3526				total += x;
3527				nodes[node] += x;
3528			}
3529			per_cpu[node]++;
3530		}
3531	}
3532
3533	for_each_node_state(node, N_NORMAL_MEMORY) {
3534		struct kmem_cache_node *n = get_node(s, node);
3535
3536		if (flags & SO_PARTIAL) {
3537			if (flags & SO_OBJECTS)
3538				x = count_partial(n);
3539			else
3540				x = n->nr_partial;
3541			total += x;
3542			nodes[node] += x;
3543		}
3544
3545		if (flags & SO_FULL) {
3546			int full_slabs = atomic_long_read(&n->nr_slabs)
3547					- per_cpu[node]
3548					- n->nr_partial;
3549
3550			if (flags & SO_OBJECTS)
3551				x = full_slabs * s->objects;
3552			else
3553				x = full_slabs;
3554			total += x;
3555			nodes[node] += x;
3556		}
3557	}
3558
3559	x = sprintf(buf, "%lu", total);
3560#ifdef CONFIG_NUMA
3561	for_each_node_state(node, N_NORMAL_MEMORY)
3562		if (nodes[node])
3563			x += sprintf(buf + x, " N%d=%lu",
3564					node, nodes[node]);
3565#endif
3566	kfree(nodes);
3567	return x + sprintf(buf + x, "\n");
3568}
3569
3570static int any_slab_objects(struct kmem_cache *s)
3571{
3572	int node;
3573	int cpu;
3574
3575	for_each_possible_cpu(cpu) {
3576		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3577
3578		if (c && c->page)
3579			return 1;
3580	}
3581
3582	for_each_online_node(node) {
3583		struct kmem_cache_node *n = get_node(s, node);
3584
3585		if (!n)
3586			continue;
3587
3588		if (n->nr_partial || atomic_long_read(&n->nr_slabs))
3589			return 1;
3590	}
3591	return 0;
3592}
3593
3594#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
3595#define to_slab(n) container_of(n, struct kmem_cache, kobj)
3596
3597struct slab_attribute {
3598	struct attribute attr;
3599	ssize_t (*show)(struct kmem_cache *s, char *buf);
3600	ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
3601};
3602
3603#define SLAB_ATTR_RO(_name) \
3604	static struct slab_attribute _name##_attr = __ATTR_RO(_name)
3605
3606#define SLAB_ATTR(_name) \
3607	static struct slab_attribute _name##_attr =  \
3608	__ATTR(_name, 0644, _name##_show, _name##_store)
3609
3610static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
3611{
3612	return sprintf(buf, "%d\n", s->size);
3613}
3614SLAB_ATTR_RO(slab_size);
3615
3616static ssize_t align_show(struct kmem_cache *s, char *buf)
3617{
3618	return sprintf(buf, "%d\n", s->align);
3619}
3620SLAB_ATTR_RO(align);
3621
3622static ssize_t object_size_show(struct kmem_cache *s, char *buf)
3623{
3624	return sprintf(buf, "%d\n", s->objsize);
3625}
3626SLAB_ATTR_RO(object_size);
3627
3628static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
3629{
3630	return sprintf(buf, "%d\n", s->objects);
3631}
3632SLAB_ATTR_RO(objs_per_slab);
3633
3634static ssize_t order_show(struct kmem_cache *s, char *buf)
3635{
3636	return sprintf(buf, "%d\n", s->order);
3637}
3638SLAB_ATTR_RO(order);
3639
3640static ssize_t ctor_show(struct kmem_cache *s, char *buf)
3641{
3642	if (s->ctor) {
3643		int n = sprint_symbol(buf, (unsigned long)s->ctor);
3644
3645		return n + sprintf(buf + n, "\n");
3646	}
3647	return 0;
3648}
3649SLAB_ATTR_RO(ctor);
3650
3651static ssize_t aliases_show(struct kmem_cache *s, char *buf)
3652{
3653	return sprintf(buf, "%d\n", s->refcount - 1);
3654}
3655SLAB_ATTR_RO(aliases);
3656
3657static ssize_t slabs_show(struct kmem_cache *s, char *buf)
3658{
3659	return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU);
3660}
3661SLAB_ATTR_RO(slabs);
3662
3663static ssize_t partial_show(struct kmem_cache *s, char *buf)
3664{
3665	return slab_objects(s, buf, SO_PARTIAL);
3666}
3667SLAB_ATTR_RO(partial);
3668
3669static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
3670{
3671	return slab_objects(s, buf, SO_CPU);
3672}
3673SLAB_ATTR_RO(cpu_slabs);
3674
3675static ssize_t objects_show(struct kmem_cache *s, char *buf)
3676{
3677	return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS);
3678}
3679SLAB_ATTR_RO(objects);
3680
3681static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
3682{
3683	return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
3684}
3685
3686static ssize_t sanity_checks_store(struct kmem_cache *s,
3687				const char *buf, size_t length)
3688{
3689	s->flags &= ~SLAB_DEBUG_FREE;
3690	if (buf[0] == '1')
3691		s->flags |= SLAB_DEBUG_FREE;
3692	return length;
3693}
3694SLAB_ATTR(sanity_checks);
3695
3696static ssize_t trace_show(struct kmem_cache *s, char *buf)
3697{
3698	return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
3699}
3700
3701static ssize_t trace_store(struct kmem_cache *s, const char *buf,
3702							size_t length)
3703{
3704	s->flags &= ~SLAB_TRACE;
3705	if (buf[0] == '1')
3706		s->flags |= SLAB_TRACE;
3707	return length;
3708}
3709SLAB_ATTR(trace);
3710
3711static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
3712{
3713	return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
3714}
3715
3716static ssize_t reclaim_account_store(struct kmem_cache *s,
3717				const char *buf, size_t length)
3718{
3719	s->flags &= ~SLAB_RECLAIM_ACCOUNT;
3720	if (buf[0] == '1')
3721		s->flags |= SLAB_RECLAIM_ACCOUNT;
3722	return length;
3723}
3724SLAB_ATTR(reclaim_account);
3725
3726static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
3727{
3728	return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
3729}
3730SLAB_ATTR_RO(hwcache_align);
3731
3732#ifdef CONFIG_ZONE_DMA
3733static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
3734{
3735	return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
3736}
3737SLAB_ATTR_RO(cache_dma);
3738#endif
3739
3740static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
3741{
3742	return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
3743}
3744SLAB_ATTR_RO(destroy_by_rcu);
3745
3746static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
3747{
3748	return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
3749}
3750
3751static ssize_t red_zone_store(struct kmem_cache *s,
3752				const char *buf, size_t length)
3753{
3754	if (any_slab_objects(s))
3755		return -EBUSY;
3756
3757	s->flags &= ~SLAB_RED_ZONE;
3758	if (buf[0] == '1')
3759		s->flags |= SLAB_RED_ZONE;
3760	calculate_sizes(s);
3761	return length;
3762}
3763SLAB_ATTR(red_zone);
3764
3765static ssize_t poison_show(struct kmem_cache *s, char *buf)
3766{
3767	return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
3768}
3769
3770static ssize_t poison_store(struct kmem_cache *s,
3771				const char *buf, size_t length)
3772{
3773	if (any_slab_objects(s))
3774		return -EBUSY;
3775
3776	s->flags &= ~SLAB_POISON;
3777	if (buf[0] == '1')
3778		s->flags |= SLAB_POISON;
3779	calculate_sizes(s);
3780	return length;
3781}
3782SLAB_ATTR(poison);
3783
3784static ssize_t store_user_show(struct kmem_cache *s, char *buf)
3785{
3786	return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
3787}
3788
3789static ssize_t store_user_store(struct kmem_cache *s,
3790				const char *buf, size_t length)
3791{
3792	if (any_slab_objects(s))
3793		return -EBUSY;
3794
3795	s->flags &= ~SLAB_STORE_USER;
3796	if (buf[0] == '1')
3797		s->flags |= SLAB_STORE_USER;
3798	calculate_sizes(s);
3799	return length;
3800}
3801SLAB_ATTR(store_user);
3802
3803static ssize_t validate_show(struct kmem_cache *s, char *buf)
3804{
3805	return 0;
3806}
3807
3808static ssize_t validate_store(struct kmem_cache *s,
3809			const char *buf, size_t length)
3810{
3811	int ret = -EINVAL;
3812
3813	if (buf[0] == '1') {
3814		ret = validate_slab_cache(s);
3815		if (ret >= 0)
3816			ret = length;
3817	}
3818	return ret;
3819}
3820SLAB_ATTR(validate);
3821
3822static ssize_t shrink_show(struct kmem_cache *s, char *buf)
3823{
3824	return 0;
3825}
3826
3827static ssize_t shrink_store(struct kmem_cache *s,
3828			const char *buf, size_t length)
3829{
3830	if (buf[0] == '1') {
3831		int rc = kmem_cache_shrink(s);
3832
3833		if (rc)
3834			return rc;
3835	} else
3836		return -EINVAL;
3837	return length;
3838}
3839SLAB_ATTR(shrink);
3840
3841static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
3842{
3843	if (!(s->flags & SLAB_STORE_USER))
3844		return -ENOSYS;
3845	return list_locations(s, buf, TRACK_ALLOC);
3846}
3847SLAB_ATTR_RO(alloc_calls);
3848
3849static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
3850{
3851	if (!(s->flags & SLAB_STORE_USER))
3852		return -ENOSYS;
3853	return list_locations(s, buf, TRACK_FREE);
3854}
3855SLAB_ATTR_RO(free_calls);
3856
3857#ifdef CONFIG_NUMA
3858static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
3859{
3860	return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10);
3861}
3862
3863static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
3864				const char *buf, size_t length)
3865{
3866	int n = simple_strtoul(buf, NULL, 10);
3867
3868	if (n < 100)
3869		s->remote_node_defrag_ratio = n * 10;
3870	return length;
3871}
3872SLAB_ATTR(remote_node_defrag_ratio);
3873#endif
3874
3875static struct attribute *slab_attrs[] = {
3876	&slab_size_attr.attr,
3877	&object_size_attr.attr,
3878	&objs_per_slab_attr.attr,
3879	&order_attr.attr,
3880	&objects_attr.attr,
3881	&slabs_attr.attr,
3882	&partial_attr.attr,
3883	&cpu_slabs_attr.attr,
3884	&ctor_attr.attr,
3885	&aliases_attr.attr,
3886	&align_attr.attr,
3887	&sanity_checks_attr.attr,
3888	&trace_attr.attr,
3889	&hwcache_align_attr.attr,
3890	&reclaim_account_attr.attr,
3891	&destroy_by_rcu_attr.attr,
3892	&red_zone_attr.attr,
3893	&poison_attr.attr,
3894	&store_user_attr.attr,
3895	&validate_attr.attr,
3896	&shrink_attr.attr,
3897	&alloc_calls_attr.attr,
3898	&free_calls_attr.attr,
3899#ifdef CONFIG_ZONE_DMA
3900	&cache_dma_attr.attr,
3901#endif
3902#ifdef CONFIG_NUMA
3903	&remote_node_defrag_ratio_attr.attr,
3904#endif
3905	NULL
3906};
3907
3908static struct attribute_group slab_attr_group = {
3909	.attrs = slab_attrs,
3910};
3911
3912static ssize_t slab_attr_show(struct kobject *kobj,
3913				struct attribute *attr,
3914				char *buf)
3915{
3916	struct slab_attribute *attribute;
3917	struct kmem_cache *s;
3918	int err;
3919
3920	attribute = to_slab_attr(attr);
3921	s = to_slab(kobj);
3922
3923	if (!attribute->show)
3924		return -EIO;
3925
3926	err = attribute->show(s, buf);
3927
3928	return err;
3929}
3930
3931static ssize_t slab_attr_store(struct kobject *kobj,
3932				struct attribute *attr,
3933				const char *buf, size_t len)
3934{
3935	struct slab_attribute *attribute;
3936	struct kmem_cache *s;
3937	int err;
3938
3939	attribute = to_slab_attr(attr);
3940	s = to_slab(kobj);
3941
3942	if (!attribute->store)
3943		return -EIO;
3944
3945	err = attribute->store(s, buf, len);
3946
3947	return err;
3948}
3949
3950static void kmem_cache_release(struct kobject *kobj)
3951{
3952	struct kmem_cache *s = to_slab(kobj);
3953
3954	kfree(s);
3955}
3956
3957static struct sysfs_ops slab_sysfs_ops = {
3958	.show = slab_attr_show,
3959	.store = slab_attr_store,
3960};
3961
3962static struct kobj_type slab_ktype = {
3963	.sysfs_ops = &slab_sysfs_ops,
3964	.release = kmem_cache_release
3965};
3966
3967static int uevent_filter(struct kset *kset, struct kobject *kobj)
3968{
3969	struct kobj_type *ktype = get_ktype(kobj);
3970
3971	if (ktype == &slab_ktype)
3972		return 1;
3973	return 0;
3974}
3975
3976static struct kset_uevent_ops slab_uevent_ops = {
3977	.filter = uevent_filter,
3978};
3979
3980static struct kset *slab_kset;
3981
3982#define ID_STR_LENGTH 64
3983
3984/* Create a unique string id for a slab cache:
3985 *
3986 * format	:[flags-]size
3987 */
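/*
 * A purely illustrative example: a DMA cache of size 192 with
 * SLAB_RECLAIM_ACCOUNT set would get the id ":da-0000192".
 */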
3988static char *create_unique_id(struct kmem_cache *s)
3989{
3990	char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
3991	char *p = name;
3992
3993	BUG_ON(!name);
3994
3995	*p++ = ':';
3996	/*
3997	 * First flags affecting slabcache operations. We will only
3998	 * get here for aliasable slabs so we do not need to support
3999	 * too many flags. The flags here must cover all flags that
4000	 * are matched during merging to guarantee that the id is
4001	 * unique.
4002	 */
4003	if (s->flags & SLAB_CACHE_DMA)
4004		*p++ = 'd';
4005	if (s->flags & SLAB_RECLAIM_ACCOUNT)
4006		*p++ = 'a';
4007	if (s->flags & SLAB_DEBUG_FREE)
4008		*p++ = 'F';
4009	if (p != name + 1)
4010		*p++ = '-';
4011	p += sprintf(p, "%07d", s->size);
4012	BUG_ON(p > name + ID_STR_LENGTH - 1);
4013	return name;
4014}
4015
4016static int sysfs_slab_add(struct kmem_cache *s)
4017{
4018	int err;
4019	const char *name;
4020	int unmergeable;
4021
4022	if (slab_state < SYSFS)
4023		/* Defer until later */
4024		return 0;
4025
4026	unmergeable = slab_unmergeable(s);
4027	if (unmergeable) {
4028		/*
4029		 * Slabcache can never be merged so we can use the name proper.
4030		 * This is typically the case for debug situations. In that
4031		 * case we can catch duplicate names easily.
4032		 */
4033		sysfs_remove_link(&slab_kset->kobj, s->name);
4034		name = s->name;
4035	} else {
4036		/*
4037		 * Create a unique name for the slab as a target
4038		 * for the symlinks.
4039		 */
4040		name = create_unique_id(s);
4041	}
4042
4043	s->kobj.kset = slab_kset;
4044	err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name);
4045	if (err) {
4046		kobject_put(&s->kobj);
4047		return err;
4048	}
4049
4050	err = sysfs_create_group(&s->kobj, &slab_attr_group);
4051	if (err)
4052		return err;
4053	kobject_uevent(&s->kobj, KOBJ_ADD);
4054	if (!unmergeable) {
4055		/* Setup first alias */
4056		sysfs_slab_alias(s, s->name);
4057		kfree(name);
4058	}
4059	return 0;
4060}
4061
4062static void sysfs_slab_remove(struct kmem_cache *s)
4063{
4064	kobject_uevent(&s->kobj, KOBJ_REMOVE);
4065	kobject_del(&s->kobj);
4066	kobject_put(&s->kobj);
4067}
4068
4069/*
4070 * Need to buffer aliases during bootup until sysfs becomes
4071 * available lest we lose that information.
4072 */
4073struct saved_alias {
4074	struct kmem_cache *s;
4075	const char *name;
4076	struct saved_alias *next;
4077};
4078
4079static struct saved_alias *alias_list;
4080
4081static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
4082{
4083	struct saved_alias *al;
4084
4085	if (slab_state == SYSFS) {
4086		/*
4087		 * If we have a leftover link then remove it.
4088		 */
4089		sysfs_remove_link(&slab_kset->kobj, name);
4090		return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
4091	}
4092
4093	al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
4094	if (!al)
4095		return -ENOMEM;
4096
4097	al->s = s;
4098	al->name = name;
4099	al->next = alias_list;
4100	alias_list = al;
4101	return 0;
4102}
4103
4104static int __init slab_sysfs_init(void)
4105{
4106	struct kmem_cache *s;
4107	int err;
4108
4109	slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
4110	if (!slab_kset) {
4111		printk(KERN_ERR "Cannot register slab subsystem.\n");
4112		return -ENOSYS;
4113	}
4114
4115	slab_state = SYSFS;
4116
4117	list_for_each_entry(s, &slab_caches, list) {
4118		err = sysfs_slab_add(s);
4119		if (err)
4120			printk(KERN_ERR "SLUB: Unable to add boot slab %s"
4121						" to sysfs\n", s->name);
4122	}
4123
4124	while (alias_list) {
4125		struct saved_alias *al = alias_list;
4126
4127		alias_list = alias_list->next;
4128		err = sysfs_slab_alias(al->s, al->name);
4129		if (err)
4130			printk(KERN_ERR "SLUB: Unable to add boot slab alias"
4131					" %s to sysfs\n", al->name);
4132		kfree(al);
4133	}
4134
4135	resiliency_test();
4136	return 0;
4137}
4138
4139__initcall(slab_sysfs_init);
4140#endif
4141
4142/*
4143 * The /proc/slabinfo ABI
4144 */
4145#ifdef CONFIG_SLABINFO
4146
4147ssize_t slabinfo_write(struct file *file, const char __user * buffer,
4148                       size_t count, loff_t *ppos)
4149{
4150	return -EINVAL;
4151}
4152
4153
4154static void print_slabinfo_header(struct seq_file *m)
4155{
4156	seq_puts(m, "slabinfo - version: 2.1\n");
4157	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
4158		 "<objperslab> <pagesperslab>");
4159	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
4160	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
4161	seq_putc(m, '\n');
4162}
4163
4164static void *s_start(struct seq_file *m, loff_t *pos)
4165{
4166	loff_t n = *pos;
4167
4168	down_read(&slub_lock);
4169	if (!n)
4170		print_slabinfo_header(m);
4171
4172	return seq_list_start(&slab_caches, *pos);
4173}
4174
4175static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4176{
4177	return seq_list_next(p, &slab_caches, pos);
4178}
4179
4180static void s_stop(struct seq_file *m, void *p)
4181{
4182	up_read(&slub_lock);
4183}
4184
4185static int s_show(struct seq_file *m, void *p)
4186{
4187	unsigned long nr_partials = 0;
4188	unsigned long nr_slabs = 0;
4189	unsigned long nr_inuse = 0;
4190	unsigned long nr_objs;
4191	struct kmem_cache *s;
4192	int node;
4193
4194	s = list_entry(p, struct kmem_cache, list);
4195
4196	for_each_online_node(node) {
4197		struct kmem_cache_node *n = get_node(s, node);
4198
4199		if (!n)
4200			continue;
4201
4202		nr_partials += n->nr_partial;
4203		nr_slabs += atomic_long_read(&n->nr_slabs);
4204		nr_inuse += count_partial(n);
4205	}
4206
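	/*
	 * Slabs that are not on a partial list (full and cpu slabs) are
	 * accounted as completely in use; count_partial() above already
	 * added the objects in use on the partial slabs.
	 */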
4207	nr_objs = nr_slabs * s->objects;
4208	nr_inuse += (nr_slabs - nr_partials) * s->objects;
4209
4210	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse,
4211		   nr_objs, s->size, s->objects, (1 << s->order));
4212	seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0);
4213	seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs,
4214		   0UL);
4215	seq_putc(m, '\n');
4216	return 0;
4217}
4218
4219const struct seq_operations slabinfo_op = {
4220	.start = s_start,
4221	.next = s_next,
4222	.stop = s_stop,
4223	.show = s_show,
4224};
4225
4226#endif /* CONFIG_SLABINFO */
4227