hugetlbpage.c revision 021c73354921a315ae2fceb1ad7807d1569a5a74
/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/machdep.h>
#include <asm/cputable.h>

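/*
 * The hugepage-only address space is tracked as two bitmaps in the mm
 * context: one bit per 256MB SID-sized segment below 4GB ("low" areas)
 * and one bit per HTLB_AREA_SHIFT-sized area above 4GB ("high" areas).
 */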
#define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
#define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)

/* Modelled after find_linux_pte() */
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	pte_t *pt;

	BUG_ON(! in_hugepage_area(mm->context, addr));

	addr &= HPAGE_MASK;

	pg = pgd_offset(mm, addr);
	if (!pgd_none(*pg)) {
		pu = pud_offset(pg, addr);
		if (!pud_none(*pu)) {
			pm = pmd_offset(pu, addr);
#ifdef CONFIG_PPC_64K_PAGES
			/* Currently, we use the normal PTE offset within full
			 * size PTE pages, thus our huge PTEs are scattered in
			 * the PTE page and we do waste some. We may change
			 * that in the future, but the current mechanism keeps
			 * things much simpler.
			 */
			if (!pmd_none(*pm)) {
				/* Note: pte_offset_* are all equivalent on
				 * ppc64 as we don't have HIGHMEM
				 */
				pt = pte_offset_kernel(pm, addr);
				return pt;
			}
#else /* CONFIG_PPC_64K_PAGES */
			/* On 4k pages, we put huge PTEs in the PMD page */
			pt = (pte_t *)pm;
			return pt;
#endif /* CONFIG_PPC_64K_PAGES */
		}
	}

	return NULL;
}

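/*
 * Like huge_pte_offset(), but allocate any missing PUD/PMD (and, with
 * CONFIG_PPC_64K_PAGES, PTE) levels on the way down.  Returns NULL if
 * an allocation fails.
 */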
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	pte_t *pt;

	BUG_ON(! in_hugepage_area(mm->context, addr));

	addr &= HPAGE_MASK;

	pg = pgd_offset(mm, addr);
	pu = pud_alloc(mm, pg, addr);

	if (pu) {
		pm = pmd_alloc(mm, pu, addr);
		if (pm) {
#ifdef CONFIG_PPC_64K_PAGES
			/* See comment in huge_pte_offset. Note that if we ever
			 * want to put the page size in the PMD, we would have
			 * to open code our own pte_alloc* function in order
			 * to populate and set the size atomically
			 */
			pt = pte_alloc_map(mm, pm, addr);
#else /* CONFIG_PPC_64K_PAGES */
			pt = (pte_t *)pm;
#endif /* CONFIG_PPC_64K_PAGES */
			return pt;
		}
	}

	return NULL;
}

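/*
 * Install a huge PTE.  Any existing mapping is torn down first so that
 * a stale hash table entry is flushed before the new value is visible.
 */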
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t pte)
{
	if (pte_present(*ptep)) {
		/* We open-code pte_clear because we need to pass the right
		 * argument to hpte_update (huge / !huge)
		 */
		unsigned long old = pte_update(ptep, ~0UL);
		if (old & _PAGE_HASHPTE)
			hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
		flush_tlb_pending();
	}
	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
}

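/*
 * Atomically clear a huge PTE and return its old value, notifying the
 * hash table code if the page had a hardware HPTE.
 */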
pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep)
{
	unsigned long old = pte_update(ptep, ~0UL);

	if (old & _PAGE_HASHPTE)
		hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
	*ptep = __pte(0);

	return __pte(old);
}

/*
 * This function checks for proper alignment of input addr and len parameters,
 * and that the range lies entirely within the low or high hugepage areas.
 */
int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
{
	if (len & ~HPAGE_MASK)
		return -EINVAL;
	if (addr & ~HPAGE_MASK)
		return -EINVAL;
	if (! (within_hugepage_low_range(addr, len)
	       || within_hugepage_high_range(addr, len)) )
		return -EINVAL;
	return 0;
}

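/*
 * Argument block passed to the per-CPU SLB flush handlers below: the mm
 * whose context changed and the bitmap of newly opened areas whose SLB
 * entries must be invalidated.
 */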
struct slb_flush_info {
	struct mm_struct *mm;
	u16 newareas;
};

static void flush_low_segments(void *parm)
{
	struct slb_flush_info *fi = parm;
	unsigned long i;

	BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_LOW_AREAS);

	/* Only need to do anything if this CPU is working in the same
	 * mm as the one which has changed */
	if (current->active_mm != fi->mm)
		return;

	/* update the paca copy of the context struct */
	get_paca()->context = current->active_mm->context;

	asm volatile("isync" : : : "memory");
	for (i = 0; i < NUM_LOW_AREAS; i++) {
		if (! (fi->newareas & (1U << i)))
			continue;
		asm volatile("slbie %0"
			     : : "r" ((i << SID_SHIFT) | SLBIE_C));
	}
	asm volatile("isync" : : : "memory");
}

static void flush_high_segments(void *parm)
{
	struct slb_flush_info *fi = parm;
	unsigned long i, j;

	BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_HIGH_AREAS);

	/* Only need to do anything if this CPU is working in the same
	 * mm as the one which has changed */
	if (current->active_mm != fi->mm)
		return;

	/* update the paca copy of the context struct */
	get_paca()->context = current->active_mm->context;

	asm volatile("isync" : : : "memory");
	for (i = 0; i < NUM_HIGH_AREAS; i++) {
		if (! (fi->newareas & (1U << i)))
			continue;
		for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
			asm volatile("slbie %0"
				     :: "r" (((i << HTLB_AREA_SHIFT)
					      + (j << SID_SHIFT)) | SLBIE_C));
	}
	asm volatile("isync" : : : "memory");
}

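/*
 * Before an area can be converted to hugepage-only use, it must not
 * contain any existing mappings.  These helpers return -EBUSY if a VMA
 * overlaps the candidate area.
 */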
static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area)
{
	unsigned long start = area << SID_SHIFT;
	unsigned long end = (area+1) << SID_SHIFT;
	struct vm_area_struct *vma;

	BUG_ON(area >= NUM_LOW_AREAS);

	/* Check no VMAs are in the region */
	vma = find_vma(mm, start);
	if (vma && (vma->vm_start < end))
		return -EBUSY;

	return 0;
}

static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
{
	unsigned long start = area << HTLB_AREA_SHIFT;
	unsigned long end = (area+1) << HTLB_AREA_SHIFT;
	struct vm_area_struct *vma;

	BUG_ON(area >= NUM_HIGH_AREAS);

	/* Hack, so that each address is controlled by exactly one
	 * of the high or low area bitmaps, the first high area starts
	 * at 4GB, not 0 */
	if (start == 0)
		start = 0x100000000UL;

	/* Check no VMAs are in the region */
	vma = find_vma(mm, start);
	if (vma && (vma->vm_start < end))
		return -EBUSY;

	return 0;
}

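/*
 * Mark the requested low areas as hugepage-only in the mm context and
 * flush any stale SLB entries for them on every CPU running this mm.
 * Fails with -EBUSY if an area already contains normal mappings.
 */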
static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
{
	unsigned long i;
	struct slb_flush_info fi;

	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
	BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);

	newareas &= ~(mm->context.low_htlb_areas);
	if (! newareas)
		return 0; /* The segments we want are already open */

	for (i = 0; i < NUM_LOW_AREAS; i++)
		if ((1 << i) & newareas)
			if (prepare_low_area_for_htlb(mm, i) != 0)
				return -EBUSY;

	mm->context.low_htlb_areas |= newareas;

	/* the context change must make it to memory before the flush,
	 * so that further SLB misses do the right thing. */
	mb();

	fi.mm = mm;
	fi.newareas = newareas;
	on_each_cpu(flush_low_segments, &fi, 0, 1);

	return 0;
}

static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
{
	struct slb_flush_info fi;
	unsigned long i;

	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
	BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8)
		     != NUM_HIGH_AREAS);

	newareas &= ~(mm->context.high_htlb_areas);
	if (! newareas)
		return 0; /* The areas we want are already open */

	for (i = 0; i < NUM_HIGH_AREAS; i++)
		if ((1 << i) & newareas)
			if (prepare_high_area_for_htlb(mm, i) != 0)
				return -EBUSY;

	mm->context.high_htlb_areas |= newareas;

	/* update the paca copy of the context struct */
	get_paca()->context = mm->context;

	/* the context change must make it to memory before the flush,
	 * so that further SLB misses do the right thing. */
	mb();

	fi.mm = mm;
	fi.newareas = newareas;
	on_each_cpu(flush_high_segments, &fi, 0, 1);

	return 0;
}

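/*
 * Called from the generic hugetlb code before a hugepage mapping is
 * created: open whichever low and/or high areas the [addr, addr+len)
 * range touches.
 */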
int prepare_hugepage_range(unsigned long addr, unsigned long len)
{
	int err = 0;

	if ( (addr+len) < addr )
		return -EINVAL;

	if (addr < 0x100000000UL)
		err = open_low_hpage_areas(current->mm,
					  LOW_ESID_MASK(addr, len));
	if ((addr + len) > 0x100000000UL)
		err = open_high_hpage_areas(current->mm,
					    HTLB_AREA_MASK(addr, len));
	if (err) {
		printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
		       " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n",
		       addr, len,
		       LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len));
		return err;
	}

	return 0;
}

struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	pte_t *ptep;
	struct page *page;

	if (! in_hugepage_area(mm->context, address))
		return ERR_PTR(-EINVAL);

	ptep = huge_pte_offset(mm, address);
	page = pte_page(*ptep);
	if (page)
		page += (address % HPAGE_SIZE) / PAGE_SIZE;

	return page;
}

int pmd_huge(pmd_t pmd)
{
	return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	BUG();
	return NULL;
}

/* Because we have an exclusive hugepage region which lies within the
 * normal user address space, we have to take special measures to make
 * non-huge mmap()s evade the hugepage reserved regions. */
unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
				     unsigned long len, unsigned long pgoff,
				     unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long start_addr;

	if (len > TASK_SIZE)
		return -ENOMEM;

	if (addr) {
		addr = PAGE_ALIGN(addr);
		vma = find_vma(mm, addr);
		if (((TASK_SIZE - len) >= addr)
		    && (!vma || (addr+len) <= vma->vm_start)
		    && !is_hugepage_only_range(mm, addr, len))
			return addr;
	}
	if (len > mm->cached_hole_size) {
		start_addr = addr = mm->free_area_cache;
	} else {
		start_addr = addr = TASK_UNMAPPED_BASE;
		mm->cached_hole_size = 0;
	}

full_search:
	vma = find_vma(mm, addr);
	while (TASK_SIZE - len >= addr) {
		BUG_ON(vma && (addr >= vma->vm_end));

		if (touches_hugepage_low_range(mm, addr, len)) {
			addr = ALIGN(addr+1, 1<<SID_SHIFT);
			vma = find_vma(mm, addr);
			continue;
		}
		if (touches_hugepage_high_range(mm, addr, len)) {
			addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
			vma = find_vma(mm, addr);
			continue;
		}
		if (!vma || addr + len <= vma->vm_start) {
			/*
			 * Remember the place where we stopped the search:
			 */
			mm->free_area_cache = addr + len;
			return addr;
		}
		if (addr + mm->cached_hole_size < vma->vm_start)
			mm->cached_hole_size = vma->vm_start - addr;
		addr = vma->vm_end;
		vma = vma->vm_next;
	}

	/* Make sure we didn't miss any holes */
	if (start_addr != TASK_UNMAPPED_BASE) {
		start_addr = addr = TASK_UNMAPPED_BASE;
		mm->cached_hole_size = 0;
		goto full_search;
	}
	return -ENOMEM;
}

/*
 * This mmap-allocator allocates new areas top-down from below the
 * stack's low limit (the base):
 *
 * Because we have an exclusive hugepage region which lies within the
 * normal user address space, we have to take special measures to make
 * non-huge mmap()s evade the hugepage reserved regions.
 */
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
			  const unsigned long len, const unsigned long pgoff,
			  const unsigned long flags)
{
	struct vm_area_struct *vma, *prev_vma;
	struct mm_struct *mm = current->mm;
	unsigned long base = mm->mmap_base, addr = addr0;
	unsigned long largest_hole = mm->cached_hole_size;
	int first_time = 1;

	/* requested length too big for entire address space */
	if (len > TASK_SIZE)
		return -ENOMEM;

	/* don't allow allocations above current base */
	if (mm->free_area_cache > base)
		mm->free_area_cache = base;

	/* requesting a specific address */
	if (addr) {
		addr = PAGE_ALIGN(addr);
		vma = find_vma(mm, addr);
		if (TASK_SIZE - len >= addr &&
				(!vma || addr + len <= vma->vm_start)
				&& !is_hugepage_only_range(mm, addr, len))
			return addr;
	}

	if (len <= largest_hole) {
		largest_hole = 0;
		mm->free_area_cache = base;
	}
try_again:
	/* make sure it can fit in the remaining address space */
	if (mm->free_area_cache < len)
		goto fail;

	/* either no address requested or can't fit in requested address hole */
	addr = (mm->free_area_cache - len) & PAGE_MASK;
	do {
hugepage_recheck:
		if (touches_hugepage_low_range(mm, addr, len)) {
			addr = (addr & ((~0) << SID_SHIFT)) - len;
			goto hugepage_recheck;
		} else if (touches_hugepage_high_range(mm, addr, len)) {
			addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len;
			goto hugepage_recheck;
		}

		/*
		 * Lookup failure means no vma is above this address,
		 * i.e. return with success:
		 */
		if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
			return addr;

		/*
		 * new region fits between prev_vma->vm_end and
		 * vma->vm_start, use it:
		 */
		if (addr+len <= vma->vm_start &&
		    (!prev_vma || (addr >= prev_vma->vm_end))) {
			/* remember the address as a hint for next time */
			mm->cached_hole_size = largest_hole;
			return (mm->free_area_cache = addr);
		} else {
			/* pull free_area_cache down to the first hole */
			if (mm->free_area_cache == vma->vm_end) {
				mm->free_area_cache = vma->vm_start;
				mm->cached_hole_size = largest_hole;
			}
		}

		/* remember the largest hole we saw so far */
		if (addr + largest_hole < vma->vm_start)
			largest_hole = vma->vm_start - addr;

		/* try just below the current vma->vm_start */
		addr = vma->vm_start-len;
	} while (len <= vma->vm_start);

fail:
	/*
	 * if hint left us with no space for the requested
	 * mapping then try again:
	 */
	if (first_time) {
		mm->free_area_cache = base;
		largest_hole = 0;
		first_time = 0;
		goto try_again;
	}
	/*
	 * A failed mmap() very likely causes application failure,
	 * so fall back to the bottom-up function here. This scenario
	 * can happen with large stack limits and large mmap()
	 * allocations.
	 */
	mm->free_area_cache = TASK_UNMAPPED_BASE;
	mm->cached_hole_size = ~0UL;
	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
	/*
	 * Restore the topdown base:
	 */
	mm->free_area_cache = base;
	mm->cached_hole_size = ~0UL;

	return addr;
}

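/*
 * Check whether a hinted [addr, addr+len) range is free of existing
 * mappings; returns 0 if the hint can be used as-is.
 */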
static int htlb_check_hinted_area(unsigned long addr, unsigned long len)
{
	struct vm_area_struct *vma;

	vma = find_vma(current->mm, addr);
	if (!vma || ((addr + len) <= vma->vm_start))
		return 0;

	return -ENOMEM;
}

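/*
 * Scan below 4GB (or, in the _high variant further down, above it) for
 * a free, hugepage-aligned range of the given length that lies entirely
 * within the segments/areas selected by the mask.
 */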
static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
{
	unsigned long addr = 0;
	struct vm_area_struct *vma;

	vma = find_vma(current->mm, addr);
	while (addr + len <= 0x100000000UL) {
		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */

		if (! __within_hugepage_low_range(addr, len, segmask)) {
			addr = ALIGN(addr+1, 1<<SID_SHIFT);
			vma = find_vma(current->mm, addr);
			continue;
		}

		if (!vma || (addr + len) <= vma->vm_start)
			return addr;
		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
		/* Depending on segmask this might not be a confirmed
		 * hugepage region, so the ALIGN could have skipped
		 * some VMAs */
		vma = find_vma(current->mm, addr);
	}

	return -ENOMEM;
}

static unsigned long htlb_get_high_area(unsigned long len, u16 areamask)
{
	unsigned long addr = 0x100000000UL;
	struct vm_area_struct *vma;

	vma = find_vma(current->mm, addr);
	while (addr + len <= TASK_SIZE_USER64) {
		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */

		if (! __within_hugepage_high_range(addr, len, areamask)) {
			addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
			vma = find_vma(current->mm, addr);
			continue;
		}

		if (!vma || (addr + len) <= vma->vm_start)
			return addr;
		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
		/* Depending on areamask this might not be a confirmed
		 * hugepage region, so the ALIGN could have skipped
		 * some VMAs */
		vma = find_vma(current->mm, addr);
	}

	return -ENOMEM;
}

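/*
 * Pick an address for a new hugepage mapping.  Try the caller's hint
 * first, then any already-open areas, and finally open further areas
 * one at a time until the request fits.  32-bit tasks are confined to
 * the low areas, 64-bit tasks to the high areas.
 */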
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	int lastshift;
	u16 areamask, curareas;

	if (HPAGE_SHIFT == 0)
		return -EINVAL;
	if (len & ~HPAGE_MASK)
		return -EINVAL;

	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
		return -EINVAL;

	/* Paranoia, caller should have dealt with this */
	BUG_ON((addr + len) < addr);

	if (test_thread_flag(TIF_32BIT)) {
		/* Paranoia, caller should have dealt with this */
		BUG_ON((addr + len) > 0x100000000UL);

		curareas = current->mm->context.low_htlb_areas;

		/* First see if we can use the hint address */
		if (addr && (htlb_check_hinted_area(addr, len) == 0)) {
			areamask = LOW_ESID_MASK(addr, len);
			if (open_low_hpage_areas(current->mm, areamask) == 0)
				return addr;
		}

		/* Next see if we can map in the existing low areas */
		addr = htlb_get_low_area(len, curareas);
		if (addr != -ENOMEM)
			return addr;

		/* Finally go looking for areas to open */
		lastshift = 0;
		for (areamask = LOW_ESID_MASK(0x100000000UL-len, len);
		     ! lastshift; areamask >>=1) {
			if (areamask & 1)
				lastshift = 1;

			addr = htlb_get_low_area(len, curareas | areamask);
			if ((addr != -ENOMEM)
			    && open_low_hpage_areas(current->mm, areamask) == 0)
				return addr;
		}
	} else {
		curareas = current->mm->context.high_htlb_areas;

		/* First see if we can use the hint address */
		/* We discourage 64-bit processes from doing hugepage
		 * mappings below 4GB (must use MAP_FIXED) */
		if ((addr >= 0x100000000UL)
		    && (htlb_check_hinted_area(addr, len) == 0)) {
			areamask = HTLB_AREA_MASK(addr, len);
			if (open_high_hpage_areas(current->mm, areamask) == 0)
				return addr;
		}

		/* Next see if we can map in the existing high areas */
		addr = htlb_get_high_area(len, curareas);
		if (addr != -ENOMEM)
			return addr;

		/* Finally go looking for areas to open */
		lastshift = 0;
		for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len);
		     ! lastshift; areamask >>=1) {
			if (areamask & 1)
				lastshift = 1;

			addr = htlb_get_high_area(len, curareas | areamask);
			if ((addr != -ENOMEM)
			    && open_high_hpage_areas(current->mm, areamask) == 0)
				return addr;
		}
	}
	printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
	       " enough areas\n");
	return -ENOMEM;
}

/*
 * Called by asm hashtable.S for doing lazy icache flush
 */
static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
						  pte_t pte, int trap)
{
	struct page *page;
	int i;

	if (!pfn_valid(pte_pfn(pte)))
		return rflags;

	page = pte_page(pte);

	/* page is dirty */
	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
		if (trap == 0x400) {
			for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++)
				__flush_dcache_icache(page_address(page+i));
			set_bit(PG_arch_1, &page->flags);
		} else {
			rflags |= HPTE_R_N;
		}
	}
	return rflags;
}

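/*
 * Hugepage counterpart of hash_page(): look up the Linux PTE for a
 * faulting effective address, verify the access is permitted, and
 * insert or update the corresponding entry in the hardware hash table.
 * Returns 0 on success, 1 to pass the fault up to do_page_fault().
 */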
int hash_huge_page(struct mm_struct *mm, unsigned long access,
		   unsigned long ea, unsigned long vsid, int local,
		   unsigned long trap)
{
	pte_t *ptep;
	unsigned long old_pte, new_pte;
	unsigned long va, rflags, pa;
	long slot;
	int err = 1;

	ptep = huge_pte_offset(mm, ea);

	/* Search the Linux page table for a match with va */
	va = (vsid << 28) | (ea & 0x0fffffff);

	/*
	 * If no pte found or not present, send the problem up to
	 * do_page_fault
	 */
	if (unlikely(!ptep || pte_none(*ptep)))
		goto out;

	/*
	 * Check the user's access rights to the page.  If access should be
	 * prevented then send the problem up to do_page_fault.
	 */
	if (unlikely(access & ~pte_val(*ptep)))
		goto out;
	/*
	 * At this point, we have a pte (old_pte) which can be used to build
	 * or update an HPTE. There are 2 cases:
	 *
	 * 1. There is a valid (present) pte with no associated HPTE (this is
	 *	the most common case)
	 * 2. There is a valid (present) pte with an associated HPTE. The
	 *	current values of the pp bits in the HPTE prevent access
	 *	because we are doing software DIRTY bit management and the
	 *	page is currently not DIRTY.
	 */

	do {
		old_pte = pte_val(*ptep);
		if (old_pte & _PAGE_BUSY)
			goto out;
		new_pte = old_pte | _PAGE_BUSY |
			_PAGE_ACCESSED | _PAGE_HASHPTE;
	} while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
					 old_pte, new_pte));

	rflags = 0x2 | (!(new_pte & _PAGE_RW));
	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
		/* No CPU has hugepages but lacks no execute, so we
		 * don't need to worry about that case */
		rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
						       trap);

	/* Check if pte already has an hpte (case 2) */
	if (unlikely(old_pte & _PAGE_HASHPTE)) {
		/* There MIGHT be an HPTE for this pte */
		unsigned long hash, slot;

		hash = hpt_hash(va, HPAGE_SHIFT);
		if (old_pte & _PAGE_F_SECOND)
			hash = ~hash;
		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
		slot += (old_pte & _PAGE_F_GIX) >> 12;

		if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize,
					 local) == -1)
			old_pte &= ~_PAGE_HPTEFLAGS;
	}

	if (likely(!(old_pte & _PAGE_HASHPTE))) {
		unsigned long hash = hpt_hash(va, HPAGE_SHIFT);
		unsigned long hpte_group;

		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;

repeat:
		hpte_group = ((hash & htab_hash_mask) *
			      HPTES_PER_GROUP) & ~0x7UL;

		/* clear HPTE slot information in new PTE */
		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;

		/* Add in WIMG bits */
		/* XXX We should store these in the pte */
		/* --BenH: I think they are ... */
		rflags |= _PAGE_COHERENT;

		/* Insert into the hash table, primary slot */
		slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
					  mmu_huge_psize);

		/* Primary is full, try the secondary */
		if (unlikely(slot == -1)) {
			new_pte |= _PAGE_F_SECOND;
			hpte_group = ((~hash & htab_hash_mask) *
				      HPTES_PER_GROUP) & ~0x7UL;
			slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
						  HPTE_V_SECONDARY,
						  mmu_huge_psize);
			if (slot == -1) {
				if (mftb() & 0x1)
					hpte_group = ((hash & htab_hash_mask) *
						      HPTES_PER_GROUP)&~0x7UL;

				ppc_md.hpte_remove(hpte_group);
				goto repeat;
			}
		}

		if (unlikely(slot == -2))
			panic("hash_huge_page: pte_insert failed\n");

		new_pte |= (slot << 12) & _PAGE_F_GIX;
	}

	/*
	 * No need to use ldarx/stdcx here
	 */
	*ptep = __pte(new_pte & ~_PAGE_BUSY);

	err = 0;

 out:
	return err;
}