arch/s390/mm/pgtable.c at revision 02a8f3abb708919149cb657a5202f4603f0c38e2
1/*
2 *    Copyright IBM Corp. 2007, 2011
3 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
4 */
5
6#include <linux/sched.h>
7#include <linux/kernel.h>
8#include <linux/errno.h>
9#include <linux/gfp.h>
10#include <linux/mm.h>
11#include <linux/swap.h>
12#include <linux/smp.h>
13#include <linux/highmem.h>
14#include <linux/pagemap.h>
15#include <linux/spinlock.h>
16#include <linux/module.h>
17#include <linux/quicklist.h>
18#include <linux/rcupdate.h>
19#include <linux/slab.h>
20#include <linux/swapops.h>
21
22#include <asm/pgtable.h>
23#include <asm/pgalloc.h>
24#include <asm/tlb.h>
25#include <asm/tlbflush.h>
26#include <asm/mmu_context.h>
27
28#ifndef CONFIG_64BIT
29#define ALLOC_ORDER	1
30#define FRAG_MASK	0x0f
31#else
32#define ALLOC_ORDER	2
33#define FRAG_MASK	0x03
34#endif
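/*
 * A region or segment table has 2048 entries, so it spans four 4K pages
 * on 64-bit (8-byte entries, ALLOC_ORDER 2) and two pages on 31-bit
 * (4-byte entries, ALLOC_ORDER 1). FRAG_MASK has one bit per page table
 * fragment that fits into a single 4K page: four 1K fragments on 31-bit
 * (0x0f), two 2K fragments on 64-bit (0x03). See also the 0x7ff index
 * masks used by the gmap table walks below.
 */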
35
36
37unsigned long *crst_table_alloc(struct mm_struct *mm)
38{
39	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
40
41	if (!page)
42		return NULL;
43	return (unsigned long *) page_to_phys(page);
44}
45
46void crst_table_free(struct mm_struct *mm, unsigned long *table)
47{
48	free_pages((unsigned long) table, ALLOC_ORDER);
49}
50
51#ifdef CONFIG_64BIT
52static void __crst_table_upgrade(void *arg)
53{
54	struct mm_struct *mm = arg;
55
56	if (current->active_mm == mm)
57		update_user_asce(mm);
58	__tlb_flush_local();
59}
60
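/*
 * crst_table_upgrade() grows the user address space by stacking a new
 * top level table on top of the current one: a 2 GB space (segment
 * table, 1UL << 31) becomes a 4 TB space (region-3 table, 1UL << 42),
 * a 4 TB space becomes an 8 PB space (region-2 table, 1UL << 53). The
 * old top level table is hooked into entry 0 of the new table, so all
 * existing translations stay valid; afterwards every CPU flushes its
 * local TLB and, if it is running this mm, reloads the user ASCE.
 */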
61int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
62{
63	unsigned long *table, *pgd;
64	unsigned long entry;
65	int flush;
66
67	BUG_ON(limit > (1UL << 53));
68	flush = 0;
69repeat:
70	table = crst_table_alloc(mm);
71	if (!table)
72		return -ENOMEM;
73	spin_lock_bh(&mm->page_table_lock);
74	if (mm->context.asce_limit < limit) {
75		pgd = (unsigned long *) mm->pgd;
76		if (mm->context.asce_limit <= (1UL << 31)) {
77			entry = _REGION3_ENTRY_EMPTY;
78			mm->context.asce_limit = 1UL << 42;
79			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
80						_ASCE_USER_BITS |
81						_ASCE_TYPE_REGION3;
82		} else {
83			entry = _REGION2_ENTRY_EMPTY;
84			mm->context.asce_limit = 1UL << 53;
85			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
86						_ASCE_USER_BITS |
87						_ASCE_TYPE_REGION2;
88		}
89		crst_table_init(table, entry);
90		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
91		mm->pgd = (pgd_t *) table;
92		mm->task_size = mm->context.asce_limit;
93		table = NULL;
94		flush = 1;
95	}
96	spin_unlock_bh(&mm->page_table_lock);
97	if (table)
98		crst_table_free(mm, table);
99	if (mm->context.asce_limit < limit)
100		goto repeat;
101	if (flush)
102		on_each_cpu(__crst_table_upgrade, mm, 0);
103	return 0;
104}
105
106void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
107{
108	pgd_t *pgd;
109
110	if (current->active_mm == mm) {
111		clear_user_asce(mm);
112		__tlb_flush_mm(mm);
113	}
114	while (mm->context.asce_limit > limit) {
115		pgd = mm->pgd;
116		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
117		case _REGION_ENTRY_TYPE_R2:
118			mm->context.asce_limit = 1UL << 42;
119			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
120						_ASCE_USER_BITS |
121						_ASCE_TYPE_REGION3;
122			break;
123		case _REGION_ENTRY_TYPE_R3:
124			mm->context.asce_limit = 1UL << 31;
125			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
126						_ASCE_USER_BITS |
127						_ASCE_TYPE_SEGMENT;
128			break;
129		default:
130			BUG();
131		}
132		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
133		mm->task_size = mm->context.asce_limit;
134		crst_table_free(mm, (unsigned long *) pgd);
135	}
136	if (current->active_mm == mm)
137		update_user_asce(mm);
138}
139#endif
140
141#ifdef CONFIG_PGSTE
142
143/**
144 * gmap_alloc - allocate a guest address space
145 * @mm: pointer to the parent mm_struct
146 *
147 * Returns a guest address space structure, or NULL if out of memory.
148 */
149struct gmap *gmap_alloc(struct mm_struct *mm)
150{
151	struct gmap *gmap;
152	struct page *page;
153	unsigned long *table;
154
155	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
156	if (!gmap)
157		goto out;
158	INIT_LIST_HEAD(&gmap->crst_list);
159	gmap->mm = mm;
160	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
161	if (!page)
162		goto out_free;
163	list_add(&page->lru, &gmap->crst_list);
164	table = (unsigned long *) page_to_phys(page);
165	crst_table_init(table, _REGION1_ENTRY_EMPTY);
166	gmap->table = table;
167	gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH |
168		     _ASCE_USER_BITS | __pa(table);
169	list_add(&gmap->list, &mm->context.gmap_list);
170	return gmap;
171
172out_free:
173	kfree(gmap);
174out:
175	return NULL;
176}
177EXPORT_SYMBOL_GPL(gmap_alloc);
178
179static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table)
180{
181	struct gmap_pgtable *mp;
182	struct gmap_rmap *rmap;
183	struct page *page;
184
185	if (*table & _SEGMENT_ENTRY_INVALID)
186		return 0;
187	page = pfn_to_page(*table >> PAGE_SHIFT);
188	mp = (struct gmap_pgtable *) page->index;
189	list_for_each_entry(rmap, &mp->mapper, list) {
190		if (rmap->entry != table)
191			continue;
192		list_del(&rmap->list);
193		kfree(rmap);
194		break;
195	}
196	*table = mp->vmaddr | _SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_PROTECT;
197	return 1;
198}
199
200static void gmap_flush_tlb(struct gmap *gmap)
201{
202	if (MACHINE_HAS_IDTE)
203		__tlb_flush_idte((unsigned long) gmap->table |
204				 _ASCE_TYPE_REGION1);
205	else
206		__tlb_flush_global();
207}
208
209/**
210 * gmap_free - free a guest address space
211 * @gmap: pointer to the guest address space structure
212 */
213void gmap_free(struct gmap *gmap)
214{
215	struct page *page, *next;
216	unsigned long *table;
217	int i;
218
219
220	/* Flush tlb. */
221	if (MACHINE_HAS_IDTE)
222		__tlb_flush_idte((unsigned long) gmap->table |
223				 _ASCE_TYPE_REGION1);
224	else
225		__tlb_flush_global();
226
227	/* Free all segment & region tables. */
228	down_read(&gmap->mm->mmap_sem);
229	spin_lock(&gmap->mm->page_table_lock);
230	list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
231		table = (unsigned long *) page_to_phys(page);
232		if ((*table & _REGION_ENTRY_TYPE_MASK) == 0)
233			/* Remove gmap rmap structures for segment table. */
234			for (i = 0; i < PTRS_PER_PMD; i++, table++)
235				gmap_unlink_segment(gmap, table);
236		__free_pages(page, ALLOC_ORDER);
237	}
238	spin_unlock(&gmap->mm->page_table_lock);
239	up_read(&gmap->mm->mmap_sem);
240	list_del(&gmap->list);
241	kfree(gmap);
242}
243EXPORT_SYMBOL_GPL(gmap_free);
244
245/**
246 * gmap_enable - switch primary space to the guest address space
247 * @gmap: pointer to the guest address space structure
248 */
249void gmap_enable(struct gmap *gmap)
250{
251	S390_lowcore.gmap = (unsigned long) gmap;
252}
253EXPORT_SYMBOL_GPL(gmap_enable);
254
255/**
256 * gmap_disable - switch back to the standard primary address space
257 * @gmap: pointer to the guest address space structure
258 */
259void gmap_disable(struct gmap *gmap)
260{
261	S390_lowcore.gmap = 0UL;
262}
263EXPORT_SYMBOL_GPL(gmap_disable);
264
265/*
266 * gmap_alloc_table is assumed to be called with mmap_sem held
267 */
268static int gmap_alloc_table(struct gmap *gmap,
269			    unsigned long *table, unsigned long init)
270	__releases(&gmap->mm->page_table_lock)
271	__acquires(&gmap->mm->page_table_lock)
272{
273	struct page *page;
274	unsigned long *new;
275
276	/* since we don't free the gmap table until gmap_free we can unlock */
277	spin_unlock(&gmap->mm->page_table_lock);
278	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
279	spin_lock(&gmap->mm->page_table_lock);
280	if (!page)
281		return -ENOMEM;
282	new = (unsigned long *) page_to_phys(page);
283	crst_table_init(new, init);
284	if (*table & _REGION_ENTRY_INVALID) {
285		list_add(&page->lru, &gmap->crst_list);
286		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
287			(*table & _REGION_ENTRY_TYPE_MASK);
288	} else
289		__free_pages(page, ALLOC_ORDER);
290	return 0;
291}
292
293/**
294 * gmap_unmap_segment - unmap segment from the guest address space
295 * @gmap: pointer to the guest address space structure
296 * @to: address in the guest address space
297 * @len: length of the memory area to unmap
298 *
299 * Returns 0 if the unmap succeeded, -EINVAL if not.
300 */
301int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
302{
303	unsigned long *table;
304	unsigned long off;
305	int flush;
306
307	if ((to | len) & (PMD_SIZE - 1))
308		return -EINVAL;
309	if (len == 0 || to + len < to)
310		return -EINVAL;
311
312	flush = 0;
313	down_read(&gmap->mm->mmap_sem);
314	spin_lock(&gmap->mm->page_table_lock);
315	for (off = 0; off < len; off += PMD_SIZE) {
316		/* Walk the guest addr space page table */
317		table = gmap->table + (((to + off) >> 53) & 0x7ff);
318		if (*table & _REGION_ENTRY_INVALID)
319			goto out;
320		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
321		table = table + (((to + off) >> 42) & 0x7ff);
322		if (*table & _REGION_ENTRY_INVALID)
323			goto out;
324		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
325		table = table + (((to + off) >> 31) & 0x7ff);
326		if (*table & _REGION_ENTRY_INVALID)
327			goto out;
328		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
329		table = table + (((to + off) >> 20) & 0x7ff);
330
331		/* Clear segment table entry in guest address space. */
332		flush |= gmap_unlink_segment(gmap, table);
333		*table = _SEGMENT_ENTRY_INVALID;
334	}
335out:
336	spin_unlock(&gmap->mm->page_table_lock);
337	up_read(&gmap->mm->mmap_sem);
338	if (flush)
339		gmap_flush_tlb(gmap);
340	return 0;
341}
342EXPORT_SYMBOL_GPL(gmap_unmap_segment);
343
344/**
345 * gmap_map_segment - map a segment to the guest address space
346 * @gmap: pointer to the guest address space structure
347 * @from: source address in the parent address space
348 * @to: target address in the guest address space
 * @len: length of the memory area to map
349 *
350 * Returns 0 if the map succeeded, -EINVAL or -ENOMEM if not.
351 */
352int gmap_map_segment(struct gmap *gmap, unsigned long from,
353		     unsigned long to, unsigned long len)
354{
355	unsigned long *table;
356	unsigned long off;
357	int flush;
358
359	if ((from | to | len) & (PMD_SIZE - 1))
360		return -EINVAL;
361	if (len == 0 || from + len > TASK_MAX_SIZE ||
362	    from + len < from || to + len < to)
363		return -EINVAL;
364
365	flush = 0;
366	down_read(&gmap->mm->mmap_sem);
367	spin_lock(&gmap->mm->page_table_lock);
368	for (off = 0; off < len; off += PMD_SIZE) {
369		/* Walk the gmap address space page table */
370		table = gmap->table + (((to + off) >> 53) & 0x7ff);
371		if ((*table & _REGION_ENTRY_INVALID) &&
372		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY))
373			goto out_unmap;
374		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
375		table = table + (((to + off) >> 42) & 0x7ff);
376		if ((*table & _REGION_ENTRY_INVALID) &&
377		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY))
378			goto out_unmap;
379		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
380		table = table + (((to + off) >> 31) & 0x7ff);
381		if ((*table & _REGION_ENTRY_INVALID) &&
382		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY))
383			goto out_unmap;
384		table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN);
385		table = table + (((to + off) >> 20) & 0x7ff);
386
387		/* Store 'from' address in an invalid segment table entry. */
388		flush |= gmap_unlink_segment(gmap, table);
389		*table =  (from + off) | (_SEGMENT_ENTRY_INVALID |
390					  _SEGMENT_ENTRY_PROTECT);
391	}
392	spin_unlock(&gmap->mm->page_table_lock);
393	up_read(&gmap->mm->mmap_sem);
394	if (flush)
395		gmap_flush_tlb(gmap);
396	return 0;
397
398out_unmap:
399	spin_unlock(&gmap->mm->page_table_lock);
400	up_read(&gmap->mm->mmap_sem);
401	gmap_unmap_segment(gmap, to, len);
402	return -ENOMEM;
403}
404EXPORT_SYMBOL_GPL(gmap_map_segment);
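/*
 * Typical use of the gmap interface by a hypervisor, shown as an
 * illustrative sketch only (the identifiers uaddr, gaddr and size are
 * made up, error handling is omitted; addresses and size must be 1 MB
 * segment aligned):
 *
 *	struct gmap *g = gmap_alloc(current->mm);
 *	gmap_map_segment(g, uaddr, gaddr, size);
 *	gmap_enable(g);
 *	...	run the guest, resolving faults with gmap_fault(gaddr, g)
 *	gmap_disable(g);
 *	gmap_unmap_segment(g, gaddr, size);
 *	gmap_free(g);
 */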
405
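/*
 * The guest address space is shadowed by a four level tree rooted at
 * the region-1 table allocated in gmap_alloc(). Every table level has
 * 2048 entries, hence the 0x7ff index masks; the shifts select the
 * region-1 (>> 53), region-2 (>> 42), region-3 (>> 31) and segment
 * (>> 20) table index of a guest address. The same walk is open coded
 * in gmap_unmap_segment(), gmap_map_segment() and gmap_discard().
 */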
406static unsigned long *gmap_table_walk(unsigned long address, struct gmap *gmap)
407{
408	unsigned long *table;
409
410	table = gmap->table + ((address >> 53) & 0x7ff);
411	if (unlikely(*table & _REGION_ENTRY_INVALID))
412		return ERR_PTR(-EFAULT);
413	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
414	table = table + ((address >> 42) & 0x7ff);
415	if (unlikely(*table & _REGION_ENTRY_INVALID))
416		return ERR_PTR(-EFAULT);
417	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
418	table = table + ((address >> 31) & 0x7ff);
419	if (unlikely(*table & _REGION_ENTRY_INVALID))
420		return ERR_PTR(-EFAULT);
421	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
422	table = table + ((address >> 20) & 0x7ff);
423	return table;
424}
425
426/**
427 * __gmap_translate - translate a guest address to a user space address
428 * @address: guest address
429 * @gmap: pointer to guest mapping meta data structure
430 *
431 * Returns the user space address that corresponds to the guest address,
432 * or -EFAULT if no such mapping exists.
433 * This function does not establish potentially missing page table entries.
434 * The mmap_sem of the mm that belongs to the address space must be held
435 * when this function gets called.
436 */
437unsigned long __gmap_translate(unsigned long address, struct gmap *gmap)
438{
439	unsigned long *segment_ptr, vmaddr, segment;
440	struct gmap_pgtable *mp;
441	struct page *page;
442
443	current->thread.gmap_addr = address;
444	segment_ptr = gmap_table_walk(address, gmap);
445	if (IS_ERR(segment_ptr))
446		return PTR_ERR(segment_ptr);
447	/* Convert the gmap address to an mm address. */
448	segment = *segment_ptr;
449	if (!(segment & _SEGMENT_ENTRY_INVALID)) {
450		page = pfn_to_page(segment >> PAGE_SHIFT);
451		mp = (struct gmap_pgtable *) page->index;
452		return mp->vmaddr | (address & ~PMD_MASK);
453	} else if (segment & _SEGMENT_ENTRY_PROTECT) {
454		vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
455		return vmaddr | (address & ~PMD_MASK);
456	}
457	return -EFAULT;
458}
459EXPORT_SYMBOL_GPL(__gmap_translate);
460
461/**
462 * gmap_translate - translate a guest address to a user space address
463 * @address: guest address
464 * @gmap: pointer to guest mapping meta data structure
465 *
466 * Returns the user space address that corresponds to the guest address,
467 * or -EFAULT if no such mapping exists.
468 * This function does not establish potentially missing page table entries.
469 */
470unsigned long gmap_translate(unsigned long address, struct gmap *gmap)
471{
472	unsigned long rc;
473
474	down_read(&gmap->mm->mmap_sem);
475	rc = __gmap_translate(address, gmap);
476	up_read(&gmap->mm->mmap_sem);
477	return rc;
478}
479EXPORT_SYMBOL_GPL(gmap_translate);
480
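/*
 * gmap_connect_pgtable() links a gmap segment table entry to the page
 * table of the parent mm: the parent address stored in the invalid
 * segment entry by gmap_map_segment() is resolved (allocating the
 * pud/pmd/pte levels if necessary), an rmap entry is added to the page
 * table's gmap_pgtable so the link can be severed again later (see
 * gmap_unlink_segment() and gmap_disconnect_pgtable()), and the
 * segment entry is pointed at the page table origin - provided it has
 * not changed in the meantime.
 */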
481static int gmap_connect_pgtable(unsigned long address, unsigned long segment,
482				unsigned long *segment_ptr, struct gmap *gmap)
483{
484	unsigned long vmaddr;
485	struct vm_area_struct *vma;
486	struct gmap_pgtable *mp;
487	struct gmap_rmap *rmap;
488	struct mm_struct *mm;
489	struct page *page;
490	pgd_t *pgd;
491	pud_t *pud;
492	pmd_t *pmd;
493
494	mm = gmap->mm;
495	vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
496	vma = find_vma(mm, vmaddr);
497	if (!vma || vma->vm_start > vmaddr)
498		return -EFAULT;
499	/* Walk the parent mm page table */
500	pgd = pgd_offset(mm, vmaddr);
501	pud = pud_alloc(mm, pgd, vmaddr);
502	if (!pud)
503		return -ENOMEM;
504	pmd = pmd_alloc(mm, pud, vmaddr);
505	if (!pmd)
506		return -ENOMEM;
507	if (!pmd_present(*pmd) &&
508	    __pte_alloc(mm, vma, pmd, vmaddr))
509		return -ENOMEM;
510	/* pmd now points to a valid segment table entry. */
511	rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT);
512	if (!rmap)
513		return -ENOMEM;
514	/* Link gmap segment table entry location to page table. */
515	page = pmd_page(*pmd);
516	mp = (struct gmap_pgtable *) page->index;
517	rmap->gmap = gmap;
518	rmap->entry = segment_ptr;
519	rmap->vmaddr = address & PMD_MASK;
520	spin_lock(&mm->page_table_lock);
521	if (*segment_ptr == segment) {
522		list_add(&rmap->list, &mp->mapper);
523		/* Set gmap segment table entry to page table. */
524		*segment_ptr = pmd_val(*pmd) & PAGE_MASK;
525		rmap = NULL;
526	}
527	spin_unlock(&mm->page_table_lock);
528	kfree(rmap);
529	return 0;
530}
531
532static void gmap_disconnect_pgtable(struct mm_struct *mm, unsigned long *table)
533{
534	struct gmap_rmap *rmap, *next;
535	struct gmap_pgtable *mp;
536	struct page *page;
537	int flush;
538
539	flush = 0;
540	spin_lock(&mm->page_table_lock);
541	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
542	mp = (struct gmap_pgtable *) page->index;
543	list_for_each_entry_safe(rmap, next, &mp->mapper, list) {
544		*rmap->entry = mp->vmaddr | (_SEGMENT_ENTRY_INVALID |
545					     _SEGMENT_ENTRY_PROTECT);
546		list_del(&rmap->list);
547		kfree(rmap);
548		flush = 1;
549	}
550	spin_unlock(&mm->page_table_lock);
551	if (flush)
552		__tlb_flush_global();
553}
554
555/*
556 * this function is assumed to be called with mmap_sem held
557 */
558unsigned long __gmap_fault(unsigned long address, struct gmap *gmap)
559{
560	unsigned long *segment_ptr, segment;
561	struct gmap_pgtable *mp;
562	struct page *page;
563	int rc;
564
565	current->thread.gmap_addr = address;
566	segment_ptr = gmap_table_walk(address, gmap);
567	if (IS_ERR(segment_ptr))
568		return -EFAULT;
569	/* Convert the gmap address to an mm address. */
570	while (1) {
571		segment = *segment_ptr;
572		if (!(segment & _SEGMENT_ENTRY_INVALID)) {
573			/* Page table is present */
574			page = pfn_to_page(segment >> PAGE_SHIFT);
575			mp = (struct gmap_pgtable *) page->index;
576			return mp->vmaddr | (address & ~PMD_MASK);
577		}
578		if (!(segment & _SEGMENT_ENTRY_PROTECT))
579			/* Nothing mapped in the gmap address space. */
580			break;
581		rc = gmap_connect_pgtable(address, segment, segment_ptr, gmap);
582		if (rc)
583			return rc;
584	}
585	return -EFAULT;
586}
587
588unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
589{
590	unsigned long rc;
591
592	down_read(&gmap->mm->mmap_sem);
593	rc = __gmap_fault(address, gmap);
594	up_read(&gmap->mm->mmap_sem);
595
596	return rc;
597}
598EXPORT_SYMBOL_GPL(gmap_fault);
599
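/*
 * The helpers below zap guest pages whose guest page state in the
 * pgste marks them as unused or as logically zero (_PGSTE_GPS_*), as
 * announced by the guest through collaborative memory management. The
 * backing swap or migration entry is released and the mm counters are
 * adjusted the same way the generic unmap path would do it.
 */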
600static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
601{
602	if (!non_swap_entry(entry))
603		dec_mm_counter(mm, MM_SWAPENTS);
604	else if (is_migration_entry(entry)) {
605		struct page *page = migration_entry_to_page(entry);
606
607		if (PageAnon(page))
608			dec_mm_counter(mm, MM_ANONPAGES);
609		else
610			dec_mm_counter(mm, MM_FILEPAGES);
611	}
612	free_swap_and_cache(entry);
613}
614
615/*
616 * The mm->mmap_sem lock must be held
617 */
618static void gmap_zap_unused(struct mm_struct *mm, unsigned long address)
619{
620	unsigned long ptev, pgstev;
621	spinlock_t *ptl;
622	pgste_t pgste;
623	pte_t *ptep, pte;
624
625	ptep = get_locked_pte(mm, address, &ptl);
626	if (unlikely(!ptep))
627		return;
628	pte = *ptep;
629	if (!pte_swap(pte))
630		goto out_pte;
631	/* Zap unused and logically-zero pages */
632	pgste = pgste_get_lock(ptep);
633	pgstev = pgste_val(pgste);
634	ptev = pte_val(pte);
635	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
636	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) {
637		gmap_zap_swap_entry(pte_to_swp_entry(pte), mm);
638		pte_clear(mm, address, ptep);
639	}
640	pgste_set_unlock(ptep, pgste);
641out_pte:
642	pte_unmap_unlock(*ptep, ptl);
643}
644
645/*
646 * this function is assumed to be called with mmap_sem held
647 */
648void __gmap_zap(unsigned long address, struct gmap *gmap)
649{
650	unsigned long *table, *segment_ptr;
651	unsigned long segment, pgstev, ptev;
652	struct gmap_pgtable *mp;
653	struct page *page;
654
655	segment_ptr = gmap_table_walk(address, gmap);
656	if (IS_ERR(segment_ptr))
657		return;
658	segment = *segment_ptr;
659	if (segment & _SEGMENT_ENTRY_INVALID)
660		return;
661	page = pfn_to_page(segment >> PAGE_SHIFT);
662	mp = (struct gmap_pgtable *) page->index;
663	address = mp->vmaddr | (address & ~PMD_MASK);
664	/* Page table is present */
665	table = (unsigned long *)(segment & _SEGMENT_ENTRY_ORIGIN);
666	table = table + ((address >> 12) & 0xff);
667	pgstev = table[PTRS_PER_PTE];
668	ptev = table[0];
669	/* quick check, checked again with locks held */
670	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
671	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID)))
672		gmap_zap_unused(gmap->mm, address);
673}
674EXPORT_SYMBOL_GPL(__gmap_zap);
675
676void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap)
677{
678
679	unsigned long *table, address, size;
680	struct vm_area_struct *vma;
681	struct gmap_pgtable *mp;
682	struct page *page;
683
684	down_read(&gmap->mm->mmap_sem);
685	address = from;
686	while (address < to) {
687		/* Walk the gmap address space page table */
688		table = gmap->table + ((address >> 53) & 0x7ff);
689		if (unlikely(*table & _REGION_ENTRY_INVALID)) {
690			address = (address + PMD_SIZE) & PMD_MASK;
691			continue;
692		}
693		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
694		table = table + ((address >> 42) & 0x7ff);
695		if (unlikely(*table & _REGION_ENTRY_INVALID)) {
696			address = (address + PMD_SIZE) & PMD_MASK;
697			continue;
698		}
699		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
700		table = table + ((address >> 31) & 0x7ff);
701		if (unlikely(*table & _REGION_ENTRY_INVALID)) {
702			address = (address + PMD_SIZE) & PMD_MASK;
703			continue;
704		}
705		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
706		table = table + ((address >> 20) & 0x7ff);
707		if (unlikely(*table & _SEGMENT_ENTRY_INVALID)) {
708			address = (address + PMD_SIZE) & PMD_MASK;
709			continue;
710		}
711		page = pfn_to_page(*table >> PAGE_SHIFT);
712		mp = (struct gmap_pgtable *) page->index;
713		vma = find_vma(gmap->mm, mp->vmaddr);
714		size = min(to - address, PMD_SIZE - (address & ~PMD_MASK));
715		zap_page_range(vma, mp->vmaddr | (address & ~PMD_MASK),
716			       size, NULL);
717		address = (address + PMD_SIZE) & PMD_MASK;
718	}
719	up_read(&gmap->mm->mmap_sem);
720}
721EXPORT_SYMBOL_GPL(gmap_discard);
722
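/*
 * Invalidation notification: a consumer such as KVM registers a
 * gmap_notifier and calls gmap_ipte_notify() on a guest range, which
 * faults in the backing pages and sets PGSTE_IN_BIT in their pgstes.
 * When such a pte is later invalidated, the pte invalidation code
 * (outside this file) calls gmap_do_ipte_notify(), which forwards the
 * guest address to every registered notifier for every gmap that maps
 * the affected page table.
 */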
723static LIST_HEAD(gmap_notifier_list);
724static DEFINE_SPINLOCK(gmap_notifier_lock);
725
726/**
727 * gmap_register_ipte_notifier - register a pte invalidation callback
728 * @nb: pointer to the gmap notifier block
729 */
730void gmap_register_ipte_notifier(struct gmap_notifier *nb)
731{
732	spin_lock(&gmap_notifier_lock);
733	list_add(&nb->list, &gmap_notifier_list);
734	spin_unlock(&gmap_notifier_lock);
735}
736EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);
737
738/**
739 * gmap_unregister_ipte_notifier - remove a pte invalidation callback
740 * @nb: pointer to the gmap notifier block
741 */
742void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
743{
744	spin_lock(&gmap_notifier_lock);
745	list_del_init(&nb->list);
746	spin_unlock(&gmap_notifier_lock);
747}
748EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);
749
750/**
751 * gmap_ipte_notify - mark a range of ptes for invalidation notification
752 * @gmap: pointer to guest mapping meta data structure
753 * @start: virtual address in the guest address space
754 * @len: size of area
755 *
756 * Returns 0 if for each page in the given range a gmap mapping exists and
757 * the invalidation notification could be set. If the gmap mapping is missing
758 * for one or more pages, -EFAULT is returned. If no memory could be allocated,
759 * -ENOMEM is returned. This function establishes missing page table entries.
760 */
761int gmap_ipte_notify(struct gmap *gmap, unsigned long start, unsigned long len)
762{
763	unsigned long addr;
764	spinlock_t *ptl;
765	pte_t *ptep, entry;
766	pgste_t pgste;
767	int rc = 0;
768
769	if ((start & ~PAGE_MASK) || (len & ~PAGE_MASK))
770		return -EINVAL;
771	down_read(&gmap->mm->mmap_sem);
772	while (len) {
773		/* Convert gmap address and connect the page tables */
774		addr = __gmap_fault(start, gmap);
775		if (IS_ERR_VALUE(addr)) {
776			rc = addr;
777			break;
778		}
779		/* Get the page mapped */
780		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
781			rc = -EFAULT;
782			break;
783		}
784		/* Walk the process page table, lock and get pte pointer */
785		ptep = get_locked_pte(gmap->mm, addr, &ptl);
786		if (unlikely(!ptep))
787			continue;
788		/* Set notification bit in the pgste of the pte */
789		entry = *ptep;
790		if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
791			pgste = pgste_get_lock(ptep);
792			pgste_val(pgste) |= PGSTE_IN_BIT;
793			pgste_set_unlock(ptep, pgste);
794			start += PAGE_SIZE;
795			len -= PAGE_SIZE;
796		}
797		spin_unlock(ptl);
798	}
799	up_read(&gmap->mm->mmap_sem);
800	return rc;
801}
802EXPORT_SYMBOL_GPL(gmap_ipte_notify);
803
804/**
805 * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte.
806 * @mm: pointer to the process mm_struct
807 * @pte: pointer to the page table entry
808 *
809 * This function is assumed to be called with the page table lock held
810 * for the pte to notify.
811 */
812void gmap_do_ipte_notify(struct mm_struct *mm, pte_t *pte)
813{
814	unsigned long segment_offset;
815	struct gmap_notifier *nb;
816	struct gmap_pgtable *mp;
817	struct gmap_rmap *rmap;
818	struct page *page;
819
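	/*
	 * Translate the pte's byte offset within its 256 entry page
	 * table into the offset of the mapped page inside the 1 MB
	 * segment: (offset / sizeof(pte_t)) * PAGE_SIZE, written as
	 * offset * (4096 / sizeof(pte_t)).
	 */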
820	segment_offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
821	segment_offset = segment_offset * (4096 / sizeof(pte_t));
822	page = pfn_to_page(__pa(pte) >> PAGE_SHIFT);
823	mp = (struct gmap_pgtable *) page->index;
824	spin_lock(&gmap_notifier_lock);
825	list_for_each_entry(rmap, &mp->mapper, list) {
826		list_for_each_entry(nb, &gmap_notifier_list, list)
827			nb->notifier_call(rmap->gmap,
828					  rmap->vmaddr + segment_offset);
829	}
830	spin_unlock(&gmap_notifier_lock);
831}
832
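/*
 * Page tables that carry page status table extensions (pgstes) occupy
 * a full 4K page: the lower 2K hold the 256 pte entries, the upper 2K
 * the corresponding pgstes (the pgste of pte i lives at table[i + 256],
 * see __gmap_zap() above). Such pages are marked by a page _mapcount
 * of 0, which is what page_table_with_pgste() tests, and keep a struct
 * gmap_pgtable in page->index with the parent address and the list of
 * gmap segment entries that map this page table.
 */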
833static inline int page_table_with_pgste(struct page *page)
834{
835	return atomic_read(&page->_mapcount) == 0;
836}
837
838static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
839						    unsigned long vmaddr)
840{
841	struct page *page;
842	unsigned long *table;
843	struct gmap_pgtable *mp;
844
845	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
846	if (!page)
847		return NULL;
848	mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT);
849	if (!mp) {
850		__free_page(page);
851		return NULL;
852	}
853	if (!pgtable_page_ctor(page)) {
854		kfree(mp);
855		__free_page(page);
856		return NULL;
857	}
858	mp->vmaddr = vmaddr & PMD_MASK;
859	INIT_LIST_HEAD(&mp->mapper);
860	page->index = (unsigned long) mp;
861	atomic_set(&page->_mapcount, 0);
862	table = (unsigned long *) page_to_phys(page);
863	clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
864	clear_table(table + PTRS_PER_PTE, PGSTE_HR_BIT | PGSTE_HC_BIT,
865		    PAGE_SIZE/2);
866	return table;
867}
868
869static inline void page_table_free_pgste(unsigned long *table)
870{
871	struct page *page;
872	struct gmap_pgtable *mp;
873
874	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
875	mp = (struct gmap_pgtable *) page->index;
876	BUG_ON(!list_empty(&mp->mapper));
877	pgtable_page_dtor(page);
878	atomic_set(&page->_mapcount, -1);
879	kfree(mp);
880	__free_page(page);
881}
882
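/*
 * The page_table_reset_* walkers below clear the guest usage state
 * (_PGSTE_GPS_USAGE_MASK) in the pgste of every page in a range.
 * page_table_reset_pgste() is the exported entry point; it walks the
 * pgd/pud/pmd/pte levels with mmap_sem held for reading and takes the
 * pte lock plus the pgste lock for every entry it touches.
 */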
883static inline unsigned long page_table_reset_pte(struct mm_struct *mm,
884			pmd_t *pmd, unsigned long addr, unsigned long end)
885{
886	pte_t *start_pte, *pte;
887	spinlock_t *ptl;
888	pgste_t pgste;
889
890	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
891	pte = start_pte;
892	do {
893		pgste = pgste_get_lock(pte);
894		pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
895		pgste_set_unlock(pte, pgste);
896	} while (pte++, addr += PAGE_SIZE, addr != end);
897	pte_unmap_unlock(start_pte, ptl);
898
899	return addr;
900}
901
902static inline unsigned long page_table_reset_pmd(struct mm_struct *mm,
903			pud_t *pud, unsigned long addr, unsigned long end)
904{
905	unsigned long next;
906	pmd_t *pmd;
907
908	pmd = pmd_offset(pud, addr);
909	do {
910		next = pmd_addr_end(addr, end);
911		if (pmd_none_or_clear_bad(pmd))
912			continue;
913		next = page_table_reset_pte(mm, pmd, addr, next);
914	} while (pmd++, addr = next, addr != end);
915
916	return addr;
917}
918
919static inline unsigned long page_table_reset_pud(struct mm_struct *mm,
920			pgd_t *pgd, unsigned long addr, unsigned long end)
921{
922	unsigned long next;
923	pud_t *pud;
924
925	pud = pud_offset(pgd, addr);
926	do {
927		next = pud_addr_end(addr, end);
928		if (pud_none_or_clear_bad(pud))
929			continue;
930		next = page_table_reset_pmd(mm, pud, addr, next);
931	} while (pud++, addr = next, addr != end);
932
933	return addr;
934}
935
936void page_table_reset_pgste(struct mm_struct *mm,
937			unsigned long start, unsigned long end)
938{
939	unsigned long addr, next;
940	pgd_t *pgd;
941
942	addr = start;
943	down_read(&mm->mmap_sem);
944	pgd = pgd_offset(mm, addr);
945	do {
946		next = pgd_addr_end(addr, end);
947		if (pgd_none_or_clear_bad(pgd))
948			continue;
949		next = page_table_reset_pud(mm, pgd, addr, next);
950	} while (pgd++, addr = next, addr != end);
951	up_read(&mm->mmap_sem);
952}
953EXPORT_SYMBOL(page_table_reset_pgste);
954
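/*
 * set_guest_storage_key() merges the relevant bits of @key into the
 * pgste: the access-control and fetch-protection bits go into the
 * PGSTE ACC/FP field and, for resident pages, into the real storage
 * key; the reference and change bits become the guest R/C bits, while
 * the host R/C state read back from the old storage key is preserved
 * in the pgste so that the host's dirty/young tracking is not lost.
 */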
955int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
956			  unsigned long key, bool nq)
957{
958	spinlock_t *ptl;
959	pgste_t old, new;
960	pte_t *ptep;
961
962	down_read(&mm->mmap_sem);
963	ptep = get_locked_pte(current->mm, addr, &ptl);
964	if (unlikely(!ptep)) {
965		up_read(&mm->mmap_sem);
966		return -EFAULT;
967	}
968
969	new = old = pgste_get_lock(ptep);
970	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
971			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
972	pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
973	pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
974	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
975		unsigned long address, bits, skey;
976
977		address = pte_val(*ptep) & PAGE_MASK;
978		skey = (unsigned long) page_get_storage_key(address);
979		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
980		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
981		/* Set storage key ACC and FP */
982		page_set_storage_key(address, skey, !nq);
983		/* Merge host changed & referenced into pgste  */
984		pgste_val(new) |= bits << 52;
985	}
986	/* changing the guest storage key is considered a change of the page */
987	if ((pgste_val(new) ^ pgste_val(old)) &
988	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
989		pgste_val(new) |= PGSTE_HC_BIT;
990
991	pgste_set_unlock(ptep, new);
992	pte_unmap_unlock(*ptep, ptl);
993	up_read(&mm->mmap_sem);
994	return 0;
995}
996EXPORT_SYMBOL(set_guest_storage_key);
997
998#else /* CONFIG_PGSTE */
999
1000static inline int page_table_with_pgste(struct page *page)
1001{
1002	return 0;
1003}
1004
1005static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
1006						    unsigned long vmaddr)
1007{
1008	return NULL;
1009}
1010
1011static inline void page_table_free_pgste(unsigned long *table)
1012{
1013}
1014
1015static inline void gmap_disconnect_pgtable(struct mm_struct *mm,
1016					   unsigned long *table)
1017{
1018}
1019
1020#endif /* CONFIG_PGSTE */
1021
1022static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
1023{
1024	unsigned int old, new;
1025
1026	do {
1027		old = atomic_read(v);
1028		new = old ^ bits;
1029	} while (atomic_cmpxchg(v, old, new) != old);
1030	return new;
1031}
1032
1033/*
1034 * page table entry allocation/free routines.
1035 */
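/*
 * For page tables without pgstes, page->_mapcount is used as a small
 * bitmap: the low FRAG_MASK bits track which 1K/2K fragments of the 4K
 * page are in use, the same bits shifted left by four mark fragments
 * that have been queued for RCU freeing but are not released yet
 * (page_table_free_rcu() below). A value of -1 means the page is not
 * used as a page table at all, a value of 0 marks a full 4K pgste page
 * table. The "mask | (mask >> 4)" in page_table_alloc() treats
 * pending-free fragments as unavailable for reallocation.
 */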
1036unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr)
1037{
1038	unsigned long *uninitialized_var(table);
1039	struct page *uninitialized_var(page);
1040	unsigned int mask, bit;
1041
1042	if (mm_has_pgste(mm))
1043		return page_table_alloc_pgste(mm, vmaddr);
1044	/* Allocate fragments of a 4K page as 1K/2K page table */
1045	spin_lock_bh(&mm->context.list_lock);
1046	mask = FRAG_MASK;
1047	if (!list_empty(&mm->context.pgtable_list)) {
1048		page = list_first_entry(&mm->context.pgtable_list,
1049					struct page, lru);
1050		table = (unsigned long *) page_to_phys(page);
1051		mask = atomic_read(&page->_mapcount);
1052		mask = mask | (mask >> 4);
1053	}
1054	if ((mask & FRAG_MASK) == FRAG_MASK) {
1055		spin_unlock_bh(&mm->context.list_lock);
1056		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
1057		if (!page)
1058			return NULL;
1059		if (!pgtable_page_ctor(page)) {
1060			__free_page(page);
1061			return NULL;
1062		}
1063		atomic_set(&page->_mapcount, 1);
1064		table = (unsigned long *) page_to_phys(page);
1065		clear_table(table, _PAGE_INVALID, PAGE_SIZE);
1066		spin_lock_bh(&mm->context.list_lock);
1067		list_add(&page->lru, &mm->context.pgtable_list);
1068	} else {
1069		for (bit = 1; mask & bit; bit <<= 1)
1070			table += PTRS_PER_PTE;
1071		mask = atomic_xor_bits(&page->_mapcount, bit);
1072		if ((mask & FRAG_MASK) == FRAG_MASK)
1073			list_del(&page->lru);
1074	}
1075	spin_unlock_bh(&mm->context.list_lock);
1076	return table;
1077}
1078
1079void page_table_free(struct mm_struct *mm, unsigned long *table)
1080{
1081	struct page *page;
1082	unsigned int bit, mask;
1083
1084	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
1085	if (page_table_with_pgste(page)) {
1086		gmap_disconnect_pgtable(mm, table);
1087		return page_table_free_pgste(table);
1088	}
1089	/* Free 1K/2K page table fragment of a 4K page */
1090	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
1091	spin_lock_bh(&mm->context.list_lock);
1092	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
1093		list_del(&page->lru);
1094	mask = atomic_xor_bits(&page->_mapcount, bit);
1095	if (mask & FRAG_MASK)
1096		list_add(&page->lru, &mm->context.pgtable_list);
1097	spin_unlock_bh(&mm->context.list_lock);
1098	if (mask == 0) {
1099		pgtable_page_dtor(page);
1100		atomic_set(&page->_mapcount, -1);
1101		__free_page(page);
1102	}
1103}
1104
1105static void __page_table_free_rcu(void *table, unsigned bit)
1106{
1107	struct page *page;
1108
1109	if (bit == FRAG_MASK)
1110		return page_table_free_pgste(table);
1111	/* Free 1K/2K page table fragment of a 4K page */
1112	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
1113	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
1114		pgtable_page_dtor(page);
1115		atomic_set(&page->_mapcount, -1);
1116		__free_page(page);
1117	}
1118}
1119
1120void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
1121{
1122	struct mm_struct *mm;
1123	struct page *page;
1124	unsigned int bit, mask;
1125
1126	mm = tlb->mm;
1127	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
1128	if (page_table_with_pgste(page)) {
1129		gmap_disconnect_pgtable(mm, table);
1130		table = (unsigned long *) (__pa(table) | FRAG_MASK);
1131		tlb_remove_table(tlb, table);
1132		return;
1133	}
1134	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
1135	spin_lock_bh(&mm->context.list_lock);
1136	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
1137		list_del(&page->lru);
1138	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
1139	if (mask & FRAG_MASK)
1140		list_add_tail(&page->lru, &mm->context.pgtable_list);
1141	spin_unlock_bh(&mm->context.list_lock);
1142	table = (unsigned long *) (__pa(table) | (bit << 4));
1143	tlb_remove_table(tlb, table);
1144}
1145
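/*
 * Page table fragments are at least 1K aligned, so the low bits of the
 * table address can carry a type tag through tlb_remove_table(): crst
 * tables are passed untagged, a pte fragment carries its fragment bit
 * shifted left by four, and a full pgste page table is tagged with
 * FRAG_MASK. __tlb_remove_table() decodes the tag and frees the table
 * accordingly, once the RCU grace period (or the IPI fallback in
 * tlb_remove_table_one()) guarantees that no lockless page table
 * walker can still be looking at it.
 */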
1146static void __tlb_remove_table(void *_table)
1147{
1148	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
1149	void *table = (void *)((unsigned long) _table & ~mask);
1150	unsigned type = (unsigned long) _table & mask;
1151
1152	if (type)
1153		__page_table_free_rcu(table, type);
1154	else
1155		free_pages((unsigned long) table, ALLOC_ORDER);
1156}
1157
1158static void tlb_remove_table_smp_sync(void *arg)
1159{
1160	/* Simply deliver the interrupt */
1161}
1162
1163static void tlb_remove_table_one(void *table)
1164{
1165	/*
1166	 * This isn't an RCU grace period and hence the page-tables cannot be
1167	 * assumed to be actually RCU-freed.
1168	 *
1169	 * It is however sufficient for software page-table walkers that rely
1170	 * on IRQ disabling. See the comment near struct mmu_table_batch.
1171	 */
1172	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
1173	__tlb_remove_table(table);
1174}
1175
1176static void tlb_remove_table_rcu(struct rcu_head *head)
1177{
1178	struct mmu_table_batch *batch;
1179	int i;
1180
1181	batch = container_of(head, struct mmu_table_batch, rcu);
1182
1183	for (i = 0; i < batch->nr; i++)
1184		__tlb_remove_table(batch->tables[i]);
1185
1186	free_page((unsigned long)batch);
1187}
1188
1189void tlb_table_flush(struct mmu_gather *tlb)
1190{
1191	struct mmu_table_batch **batch = &tlb->batch;
1192
1193	if (*batch) {
1194		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
1195		*batch = NULL;
1196	}
1197}
1198
1199void tlb_remove_table(struct mmu_gather *tlb, void *table)
1200{
1201	struct mmu_table_batch **batch = &tlb->batch;
1202
1203	tlb->mm->context.flush_mm = 1;
1204	if (*batch == NULL) {
1205		*batch = (struct mmu_table_batch *)
1206			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
1207		if (*batch == NULL) {
1208			__tlb_flush_mm_lazy(tlb->mm);
1209			tlb_remove_table_one(table);
1210			return;
1211		}
1212		(*batch)->nr = 0;
1213	}
1214	(*batch)->tables[(*batch)->nr++] = table;
1215	if ((*batch)->nr == MAX_TABLE_BATCH)
1216		tlb_flush_mmu(tlb);
1217}
1218
1219#ifdef CONFIG_TRANSPARENT_HUGEPAGE
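/*
 * The pgste extension exists only for 4K page table entries, so before
 * an mm is switched to pgste mode all transparent huge page mappings
 * are split back into normal pages and THP is disabled, both for the
 * existing VMAs and, via mm->def_flags, for all future mappings.
 */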
1220static inline void thp_split_vma(struct vm_area_struct *vma)
1221{
1222	unsigned long addr;
1223
1224	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
1225		follow_page(vma, addr, FOLL_SPLIT);
1226}
1227
1228static inline void thp_split_mm(struct mm_struct *mm)
1229{
1230	struct vm_area_struct *vma;
1231
1232	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
1233		thp_split_vma(vma);
1234		vma->vm_flags &= ~VM_HUGEPAGE;
1235		vma->vm_flags |= VM_NOHUGEPAGE;
1236	}
1237	mm->def_flags |= VM_NOHUGEPAGE;
1238}
1239#else
1240static inline void thp_split_mm(struct mm_struct *mm)
1241{
1242}
1243#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1244
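/*
 * page_table_realloc() and its helpers replace every 2K page table
 * fragment of the mm with a full 4K pgste page table: a new table is
 * allocated and, under mm->page_table_lock, the pmd is rechecked,
 * lazily flushed and cleared, the 256 ptes are copied into the lower
 * half of the new table, the pmd is repopulated, and the old fragment
 * is queued for RCU freeing so that concurrent lockless walkers stay
 * safe. If the pmd changed in the meantime, the new table is discarded
 * and the entry is retried.
 */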
1245static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
1246				struct mm_struct *mm, pud_t *pud,
1247				unsigned long addr, unsigned long end)
1248{
1249	unsigned long next, *table, *new;
1250	struct page *page;
1251	pmd_t *pmd;
1252
1253	pmd = pmd_offset(pud, addr);
1254	do {
1255		next = pmd_addr_end(addr, end);
1256again:
1257		if (pmd_none_or_clear_bad(pmd))
1258			continue;
1259		table = (unsigned long *) pmd_deref(*pmd);
1260		page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
1261		if (page_table_with_pgste(page))
1262			continue;
1263		/* Allocate new page table with pgstes */
1264		new = page_table_alloc_pgste(mm, addr);
1265		if (!new)
1266			return -ENOMEM;
1267
1268		spin_lock(&mm->page_table_lock);
1269		if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
1270			/* Nuke pmd entry pointing to the "short" page table */
1271			pmdp_flush_lazy(mm, addr, pmd);
1272			pmd_clear(pmd);
1273			/* Copy ptes from old table to new table */
1274			memcpy(new, table, PAGE_SIZE/2);
1275			clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
1276			/* Establish new table */
1277			pmd_populate(mm, pmd, (pte_t *) new);
1278			/* Free old table with rcu, there might be a walker! */
1279			page_table_free_rcu(tlb, table);
1280			new = NULL;
1281		}
1282		spin_unlock(&mm->page_table_lock);
1283		if (new) {
1284			page_table_free_pgste(new);
1285			goto again;
1286		}
1287	} while (pmd++, addr = next, addr != end);
1288
1289	return addr;
1290}
1291
1292static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
1293				   struct mm_struct *mm, pgd_t *pgd,
1294				   unsigned long addr, unsigned long end)
1295{
1296	unsigned long next;
1297	pud_t *pud;
1298
1299	pud = pud_offset(pgd, addr);
1300	do {
1301		next = pud_addr_end(addr, end);
1302		if (pud_none_or_clear_bad(pud))
1303			continue;
1304		next = page_table_realloc_pmd(tlb, mm, pud, addr, next);
1305		if (unlikely(IS_ERR_VALUE(next)))
1306			return next;
1307	} while (pud++, addr = next, addr != end);
1308
1309	return addr;
1310}
1311
1312static unsigned long page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
1313					unsigned long addr, unsigned long end)
1314{
1315	unsigned long next;
1316	pgd_t *pgd;
1317
1318	pgd = pgd_offset(mm, addr);
1319	do {
1320		next = pgd_addr_end(addr, end);
1321		if (pgd_none_or_clear_bad(pgd))
1322			continue;
1323		next = page_table_realloc_pud(tlb, mm, pgd, addr, next);
1324		if (unlikely(IS_ERR_VALUE(next)))
1325			return next;
1326	} while (pgd++, addr = next, addr != end);
1327
1328	return 0;
1329}
1330
1331/*
1332 * switch on pgstes for the current userspace process (for kvm)
1333 */
1334int s390_enable_sie(void)
1335{
1336	struct task_struct *tsk = current;
1337	struct mm_struct *mm = tsk->mm;
1338	struct mmu_gather tlb;
1339
1340	/* Do we have pgstes? If yes, we are done. */
1341	if (mm_has_pgste(tsk->mm))
1342		return 0;
1343
1344	down_write(&mm->mmap_sem);
1345	/* split thp mappings and disable thp for future mappings */
1346	thp_split_mm(mm);
1347	/* Reallocate the page tables with pgstes */
1348	tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE);
1349	if (!page_table_realloc(&tlb, mm, 0, TASK_SIZE))
1350		mm->context.has_pgste = 1;
1351	tlb_finish_mmu(&tlb, 0, TASK_SIZE);
1352	up_write(&mm->mmap_sem);
1353	return mm->context.has_pgste ? 0 : -ENOMEM;
1354}
1355EXPORT_SYMBOL_GPL(s390_enable_sie);
1356
1357#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1358int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
1359			   pmd_t *pmdp)
1360{
1361	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1362	/* No need to flush the TLB:
1363	 * on s390 the reference bit lives in the storage key, never in the TLB. */
1364	return pmdp_test_and_clear_young(vma, address, pmdp);
1365}
1366
1367int pmdp_set_access_flags(struct vm_area_struct *vma,
1368			  unsigned long address, pmd_t *pmdp,
1369			  pmd_t entry, int dirty)
1370{
1371	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1372
1373	if (pmd_same(*pmdp, entry))
1374		return 0;
1375	pmdp_invalidate(vma, address, pmdp);
1376	set_pmd_at(vma->vm_mm, address, pmdp, entry);
1377	return 1;
1378}
1379
1380static void pmdp_splitting_flush_sync(void *arg)
1381{
1382	/* Simply deliver the interrupt */
1383}
1384
1385void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
1386			  pmd_t *pmdp)
1387{
1388	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1389	if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
1390			      (unsigned long *) pmdp)) {
1391		/* need to serialize against gup-fast (IRQ disabled) */
1392		smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
1393	}
1394}
1395
1396void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
1397				pgtable_t pgtable)
1398{
1399	struct list_head *lh = (struct list_head *) pgtable;
1400
1401	assert_spin_locked(pmd_lockptr(mm, pmdp));
1402
1403	/* FIFO */
1404	if (!pmd_huge_pte(mm, pmdp))
1405		INIT_LIST_HEAD(lh);
1406	else
1407		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
1408	pmd_huge_pte(mm, pmdp) = pgtable;
1409}
1410
1411pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
1412{
1413	struct list_head *lh;
1414	pgtable_t pgtable;
1415	pte_t *ptep;
1416
1417	assert_spin_locked(pmd_lockptr(mm, pmdp));
1418
1419	/* FIFO */
1420	pgtable = pmd_huge_pte(mm, pmdp);
1421	lh = (struct list_head *) pgtable;
1422	if (list_empty(lh))
1423		pmd_huge_pte(mm, pmdp) = NULL;
1424	else {
1425		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
1426		list_del(lh);
1427	}
1428	ptep = (pte_t *) pgtable;
1429	pte_val(*ptep) = _PAGE_INVALID;
1430	ptep++;
1431	pte_val(*ptep) = _PAGE_INVALID;
1432	return pgtable;
1433}
1434#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1435