pgtable.c revision 934bc131efc3e4be6a52f7dd6c4dbf99635e381a
1/*
2 *    Copyright IBM Corp. 2007, 2011
3 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
4 */
5
6#include <linux/sched.h>
7#include <linux/kernel.h>
8#include <linux/errno.h>
9#include <linux/gfp.h>
10#include <linux/mm.h>
11#include <linux/swap.h>
12#include <linux/smp.h>
13#include <linux/highmem.h>
14#include <linux/pagemap.h>
15#include <linux/spinlock.h>
16#include <linux/module.h>
17#include <linux/quicklist.h>
18#include <linux/rcupdate.h>
19#include <linux/slab.h>
20#include <linux/swapops.h>
21
22#include <asm/pgtable.h>
23#include <asm/pgalloc.h>
24#include <asm/tlb.h>
25#include <asm/tlbflush.h>
26#include <asm/mmu_context.h>
27
28#ifndef CONFIG_64BIT
29#define ALLOC_ORDER	1
30#define FRAG_MASK	0x0f
31#else
32#define ALLOC_ORDER	2
33#define FRAG_MASK	0x03
34#endif
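/*
 * ALLOC_ORDER is the allocation order of a crst (region/segment) table:
 * 2048 entries of 8 bytes (16KB, order 2) on 64 bit, 2048 entries of
 * 4 bytes (8KB, order 1) on 31 bit.  FRAG_MASK has one bit per page
 * table fragment that fits into a 4K page: two 2KB fragments (0x03) on
 * 64 bit, four 1KB fragments (0x0f) on 31 bit.
 */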
35
36
37unsigned long *crst_table_alloc(struct mm_struct *mm)
38{
39	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
40
41	if (!page)
42		return NULL;
43	return (unsigned long *) page_to_phys(page);
44}
45
46void crst_table_free(struct mm_struct *mm, unsigned long *table)
47{
48	free_pages((unsigned long) table, ALLOC_ORDER);
49}
50
51#ifdef CONFIG_64BIT
52static void __crst_table_upgrade(void *arg)
53{
54	struct mm_struct *mm = arg;
55
56	if (current->active_mm == mm)
57		update_user_asce(mm, 1);
58	__tlb_flush_local();
59}
60
61int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
62{
63	unsigned long *table, *pgd;
64	unsigned long entry;
65	int flush;
66
67	BUG_ON(limit > (1UL << 53));
68	flush = 0;
69repeat:
70	table = crst_table_alloc(mm);
71	if (!table)
72		return -ENOMEM;
73	spin_lock_bh(&mm->page_table_lock);
74	if (mm->context.asce_limit < limit) {
75		pgd = (unsigned long *) mm->pgd;
76		if (mm->context.asce_limit <= (1UL << 31)) {
77			entry = _REGION3_ENTRY_EMPTY;
78			mm->context.asce_limit = 1UL << 42;
79			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
80						_ASCE_USER_BITS |
81						_ASCE_TYPE_REGION3;
82		} else {
83			entry = _REGION2_ENTRY_EMPTY;
84			mm->context.asce_limit = 1UL << 53;
85			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
86						_ASCE_USER_BITS |
87						_ASCE_TYPE_REGION2;
88		}
89		crst_table_init(table, entry);
90		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
91		mm->pgd = (pgd_t *) table;
92		mm->task_size = mm->context.asce_limit;
93		table = NULL;
94		flush = 1;
95	}
96	spin_unlock_bh(&mm->page_table_lock);
97	if (table)
98		crst_table_free(mm, table);
99	if (mm->context.asce_limit < limit)
100		goto repeat;
101	if (flush)
102		on_each_cpu(__crst_table_upgrade, mm, 0);
103	return 0;
104}
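
/*
 * Illustrative use of crst_table_upgrade() (hypothetical caller, the
 * variables addr and len are placeholders): grow the address space
 * before creating a mapping above the current asce_limit.
 */
#if 0
	if (addr + len > mm->context.asce_limit &&
	    crst_table_upgrade(mm, addr + len))
		return -ENOMEM;
#endif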
105
106void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
107{
108	pgd_t *pgd;
109
110	if (current->active_mm == mm) {
111		clear_user_asce(mm, 1);
112		__tlb_flush_mm(mm);
113	}
114	while (mm->context.asce_limit > limit) {
115		pgd = mm->pgd;
116		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
117		case _REGION_ENTRY_TYPE_R2:
118			mm->context.asce_limit = 1UL << 42;
119			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
120						_ASCE_USER_BITS |
121						_ASCE_TYPE_REGION3;
122			break;
123		case _REGION_ENTRY_TYPE_R3:
124			mm->context.asce_limit = 1UL << 31;
125			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
126						_ASCE_USER_BITS |
127						_ASCE_TYPE_SEGMENT;
128			break;
129		default:
130			BUG();
131		}
132		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
133		mm->task_size = mm->context.asce_limit;
134		crst_table_free(mm, (unsigned long *) pgd);
135	}
136	if (current->active_mm == mm)
137		update_user_asce(mm, 1);
138}
139#endif
140
141#ifdef CONFIG_PGSTE
142
143/**
144 * gmap_alloc - allocate a guest address space
145 * @mm: pointer to the parent mm_struct
146 *
147 * Returns a guest address space structure.
148 */
149struct gmap *gmap_alloc(struct mm_struct *mm)
150{
151	struct gmap *gmap;
152	struct page *page;
153	unsigned long *table;
154
155	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
156	if (!gmap)
157		goto out;
158	INIT_LIST_HEAD(&gmap->crst_list);
159	gmap->mm = mm;
160	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
161	if (!page)
162		goto out_free;
163	list_add(&page->lru, &gmap->crst_list);
164	table = (unsigned long *) page_to_phys(page);
165	crst_table_init(table, _REGION1_ENTRY_EMPTY);
166	gmap->table = table;
167	gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH |
168		     _ASCE_USER_BITS | __pa(table);
169	list_add(&gmap->list, &mm->context.gmap_list);
170	return gmap;
171
172out_free:
173	kfree(gmap);
174out:
175	return NULL;
176}
177EXPORT_SYMBOL_GPL(gmap_alloc);
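
/*
 * Illustrative gmap life cycle (hypothetical caller; hva and gaddr are
 * placeholder segment aligned addresses, error codes only exemplary):
 */
#if 0
	struct gmap *g = gmap_alloc(mm);

	if (!g)
		return -ENOMEM;
	if (gmap_map_segment(g, hva, gaddr, PMD_SIZE)) {
		gmap_free(g);
		return -EINVAL;
	}
	gmap_enable(g);
	/* ... let the guest run under SIE ... */
	gmap_disable(g);
	gmap_free(g);
#endif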
178
179static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table)
180{
181	struct gmap_pgtable *mp;
182	struct gmap_rmap *rmap;
183	struct page *page;
184
185	if (*table & _SEGMENT_ENTRY_INVALID)
186		return 0;
187	page = pfn_to_page(*table >> PAGE_SHIFT);
188	mp = (struct gmap_pgtable *) page->index;
189	list_for_each_entry(rmap, &mp->mapper, list) {
190		if (rmap->entry != table)
191			continue;
192		list_del(&rmap->list);
193		kfree(rmap);
194		break;
195	}
196	*table = mp->vmaddr | _SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_PROTECT;
197	return 1;
198}
199
200static void gmap_flush_tlb(struct gmap *gmap)
201{
202	if (MACHINE_HAS_IDTE)
203		__tlb_flush_asce(gmap->mm, (unsigned long) gmap->table |
204				 _ASCE_TYPE_REGION1);
205	else
206		__tlb_flush_global();
207}
208
209/**
210 * gmap_free - free a guest address space
211 * @gmap: pointer to the guest address space structure
212 */
213void gmap_free(struct gmap *gmap)
214{
215	struct page *page, *next;
216	unsigned long *table;
217	int i;
218
219
220	/* Flush tlb. */
221	if (MACHINE_HAS_IDTE)
222		__tlb_flush_asce(gmap->mm, (unsigned long) gmap->table |
223				 _ASCE_TYPE_REGION1);
224	else
225		__tlb_flush_global();
226
227	/* Free all segment & region tables. */
228	down_read(&gmap->mm->mmap_sem);
229	spin_lock(&gmap->mm->page_table_lock);
230	list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
231		table = (unsigned long *) page_to_phys(page);
232		if ((*table & _REGION_ENTRY_TYPE_MASK) == 0)
233			/* Remove gmap rmap structures for segment table. */
234			for (i = 0; i < PTRS_PER_PMD; i++, table++)
235				gmap_unlink_segment(gmap, table);
236		__free_pages(page, ALLOC_ORDER);
237	}
238	spin_unlock(&gmap->mm->page_table_lock);
239	up_read(&gmap->mm->mmap_sem);
240	list_del(&gmap->list);
241	kfree(gmap);
242}
243EXPORT_SYMBOL_GPL(gmap_free);
244
245/**
246 * gmap_enable - switch primary space to the guest address space
247 * @gmap: pointer to the guest address space structure
248 */
249void gmap_enable(struct gmap *gmap)
250{
251	S390_lowcore.gmap = (unsigned long) gmap;
252}
253EXPORT_SYMBOL_GPL(gmap_enable);
254
255/**
256 * gmap_disable - switch back to the standard primary address space
257 * @gmap: pointer to the guest address space structure
258 */
259void gmap_disable(struct gmap *gmap)
260{
261	S390_lowcore.gmap = 0UL;
262}
263EXPORT_SYMBOL_GPL(gmap_disable);
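
/*
 * The lowcore gmap pointer is per cpu; a user of gmaps (such as KVM's
 * vcpu load/put paths) is expected to call gmap_enable() on the cpu
 * that enters SIE and gmap_disable() before switching away again.
 */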
264
265/*
266 * gmap_alloc_table is assumed to be called with mmap_sem held
267 */
268static int gmap_alloc_table(struct gmap *gmap,
269			    unsigned long *table, unsigned long init)
270	__releases(&gmap->mm->page_table_lock)
271	__acquires(&gmap->mm->page_table_lock)
272{
273	struct page *page;
274	unsigned long *new;
275
276	/* since we don't free the gmap table until gmap_free we can unlock */
277	spin_unlock(&gmap->mm->page_table_lock);
278	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
279	spin_lock(&gmap->mm->page_table_lock);
280	if (!page)
281		return -ENOMEM;
282	new = (unsigned long *) page_to_phys(page);
283	crst_table_init(new, init);
284	if (*table & _REGION_ENTRY_INVALID) {
285		list_add(&page->lru, &gmap->crst_list);
286		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
287			(*table & _REGION_ENTRY_TYPE_MASK);
288	} else
289		__free_pages(page, ALLOC_ORDER);
290	return 0;
291}
292
293/**
294 * gmap_unmap_segment - unmap segment from the guest address space
295 * @gmap: pointer to the guest address space structure
296 * @addr: address in the guest address space
297 * @len: length of the memory area to unmap
298 *
299 * Returns 0 if the unmap succeeded, -EINVAL if not.
300 */
301int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
302{
303	unsigned long *table;
304	unsigned long off;
305	int flush;
306
307	if ((to | len) & (PMD_SIZE - 1))
308		return -EINVAL;
309	if (len == 0 || to + len < to)
310		return -EINVAL;
311
312	flush = 0;
313	down_read(&gmap->mm->mmap_sem);
314	spin_lock(&gmap->mm->page_table_lock);
315	for (off = 0; off < len; off += PMD_SIZE) {
316		/* Walk the guest addr space page table */
317		table = gmap->table + (((to + off) >> 53) & 0x7ff);
318		if (*table & _REGION_ENTRY_INVALID)
319			goto out;
320		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
321		table = table + (((to + off) >> 42) & 0x7ff);
322		if (*table & _REGION_ENTRY_INVALID)
323			goto out;
324		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
325		table = table + (((to + off) >> 31) & 0x7ff);
326		if (*table & _REGION_ENTRY_INVALID)
327			goto out;
328		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
329		table = table + (((to + off) >> 20) & 0x7ff);
330
331		/* Clear segment table entry in guest address space. */
332		flush |= gmap_unlink_segment(gmap, table);
333		*table = _SEGMENT_ENTRY_INVALID;
334	}
335out:
336	spin_unlock(&gmap->mm->page_table_lock);
337	up_read(&gmap->mm->mmap_sem);
338	if (flush)
339		gmap_flush_tlb(gmap);
340	return 0;
341}
342EXPORT_SYMBOL_GPL(gmap_unmap_segment);
343
344/**
345 * gmap_map_segment - map a segment to the guest address space
346 * @gmap: pointer to the guest address space structure
347 * @from: source address in the parent address space
348 * @to: target address in the guest address space
349 *
350 * Returns 0 if the map succeeded, -EINVAL or -ENOMEM if not.
351 */
352int gmap_map_segment(struct gmap *gmap, unsigned long from,
353		     unsigned long to, unsigned long len)
354{
355	unsigned long *table;
356	unsigned long off;
357	int flush;
358
359	if ((from | to | len) & (PMD_SIZE - 1))
360		return -EINVAL;
361	if (len == 0 || from + len > TASK_MAX_SIZE ||
362	    from + len < from || to + len < to)
363		return -EINVAL;
364
365	flush = 0;
366	down_read(&gmap->mm->mmap_sem);
367	spin_lock(&gmap->mm->page_table_lock);
368	for (off = 0; off < len; off += PMD_SIZE) {
369		/* Walk the gmap address space page table */
370		table = gmap->table + (((to + off) >> 53) & 0x7ff);
371		if ((*table & _REGION_ENTRY_INVALID) &&
372		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY))
373			goto out_unmap;
374		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
375		table = table + (((to + off) >> 42) & 0x7ff);
376		if ((*table & _REGION_ENTRY_INVALID) &&
377		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY))
378			goto out_unmap;
379		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
380		table = table + (((to + off) >> 31) & 0x7ff);
381		if ((*table & _REGION_ENTRY_INVALID) &&
382		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY))
383			goto out_unmap;
384		table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN);
385		table = table + (((to + off) >> 20) & 0x7ff);
386
387		/* Store 'from' address in an invalid segment table entry. */
388		flush |= gmap_unlink_segment(gmap, table);
389		*table =  (from + off) | (_SEGMENT_ENTRY_INVALID |
390					  _SEGMENT_ENTRY_PROTECT);
391	}
392	spin_unlock(&gmap->mm->page_table_lock);
393	up_read(&gmap->mm->mmap_sem);
394	if (flush)
395		gmap_flush_tlb(gmap);
396	return 0;
397
398out_unmap:
399	spin_unlock(&gmap->mm->page_table_lock);
400	up_read(&gmap->mm->mmap_sem);
401	gmap_unmap_segment(gmap, to, len);
402	return -ENOMEM;
403}
404EXPORT_SYMBOL_GPL(gmap_map_segment);
405
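/*
 * The gmap starts at a region-first table.  Each step of the walk below
 * extracts an 11-bit index from the guest address: bits 53-63 (region
 * first), 42-52 (region second), 31-41 (region third) and 20-30
 * (segment index), each masked with 0x7ff.
 */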
406static unsigned long *gmap_table_walk(unsigned long address, struct gmap *gmap)
407{
408	unsigned long *table;
409
410	table = gmap->table + ((address >> 53) & 0x7ff);
411	if (unlikely(*table & _REGION_ENTRY_INVALID))
412		return ERR_PTR(-EFAULT);
413	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
414	table = table + ((address >> 42) & 0x7ff);
415	if (unlikely(*table & _REGION_ENTRY_INVALID))
416		return ERR_PTR(-EFAULT);
417	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
418	table = table + ((address >> 31) & 0x7ff);
419	if (unlikely(*table & _REGION_ENTRY_INVALID))
420		return ERR_PTR(-EFAULT);
421	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
422	table = table + ((address >> 20) & 0x7ff);
423	return table;
424}
425
426/**
427 * __gmap_translate - translate a guest address to a user space address
428 * @address: guest address
429 * @gmap: pointer to guest mapping meta data structure
430 *
431 * Returns user space address which corresponds to the guest address or
432 * -EFAULT if no such mapping exists.
433 * This function does not establish potentially missing page table entries.
434 * The mmap_sem of the mm that belongs to the address space must be held
435 * when this function gets called.
436 */
437unsigned long __gmap_translate(unsigned long address, struct gmap *gmap)
438{
439	unsigned long *segment_ptr, vmaddr, segment;
440	struct gmap_pgtable *mp;
441	struct page *page;
442
443	current->thread.gmap_addr = address;
444	segment_ptr = gmap_table_walk(address, gmap);
445	if (IS_ERR(segment_ptr))
446		return PTR_ERR(segment_ptr);
447	/* Convert the gmap address to an mm address. */
448	segment = *segment_ptr;
449	if (!(segment & _SEGMENT_ENTRY_INVALID)) {
450		page = pfn_to_page(segment >> PAGE_SHIFT);
451		mp = (struct gmap_pgtable *) page->index;
452		return mp->vmaddr | (address & ~PMD_MASK);
453	} else if (segment & _SEGMENT_ENTRY_PROTECT) {
454		vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
455		return vmaddr | (address & ~PMD_MASK);
456	}
457	return -EFAULT;
458}
459EXPORT_SYMBOL_GPL(__gmap_translate);
460
461/**
462 * gmap_translate - translate a guest address to a user space address
463 * @address: guest address
464 * @gmap: pointer to guest mapping meta data structure
465 *
466 * Returns user space address which corresponds to the guest address or
467 * -EFAULT if no such mapping exists.
468 * This function does not establish potentially missing page table entries.
469 */
470unsigned long gmap_translate(unsigned long address, struct gmap *gmap)
471{
472	unsigned long rc;
473
474	down_read(&gmap->mm->mmap_sem);
475	rc = __gmap_translate(address, gmap);
476	up_read(&gmap->mm->mmap_sem);
477	return rc;
478}
479EXPORT_SYMBOL_GPL(gmap_translate);
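
/*
 * Example (hypothetical caller): the translated value is either a user
 * space address or a negative error code cast to unsigned long, so it
 * has to be checked with IS_ERR_VALUE() before use.
 */
#if 0
	unsigned long uaddr = gmap_translate(gaddr, gmap);

	if (IS_ERR_VALUE(uaddr))
		return -EFAULT;		/* no gmap mapping for gaddr */
#endif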
480
481static int gmap_connect_pgtable(unsigned long address, unsigned long segment,
482				unsigned long *segment_ptr, struct gmap *gmap)
483{
484	unsigned long vmaddr;
485	struct vm_area_struct *vma;
486	struct gmap_pgtable *mp;
487	struct gmap_rmap *rmap;
488	struct mm_struct *mm;
489	struct page *page;
490	pgd_t *pgd;
491	pud_t *pud;
492	pmd_t *pmd;
493
494	mm = gmap->mm;
495	vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
496	vma = find_vma(mm, vmaddr);
497	if (!vma || vma->vm_start > vmaddr)
498		return -EFAULT;
499	/* Walk the parent mm page table */
500	pgd = pgd_offset(mm, vmaddr);
501	pud = pud_alloc(mm, pgd, vmaddr);
502	if (!pud)
503		return -ENOMEM;
504	pmd = pmd_alloc(mm, pud, vmaddr);
505	if (!pmd)
506		return -ENOMEM;
507	if (!pmd_present(*pmd) &&
508	    __pte_alloc(mm, vma, pmd, vmaddr))
509		return -ENOMEM;
510	/* large pmds cannot yet be handled */
511	if (pmd_large(*pmd))
512		return -EFAULT;
513	/* pmd now points to a valid segment table entry. */
514	rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT);
515	if (!rmap)
516		return -ENOMEM;
517	/* Link gmap segment table entry location to page table. */
518	page = pmd_page(*pmd);
519	mp = (struct gmap_pgtable *) page->index;
520	rmap->gmap = gmap;
521	rmap->entry = segment_ptr;
522	rmap->vmaddr = address & PMD_MASK;
523	spin_lock(&mm->page_table_lock);
524	if (*segment_ptr == segment) {
525		list_add(&rmap->list, &mp->mapper);
526		/* Set gmap segment table entry to page table. */
527		*segment_ptr = pmd_val(*pmd) & PAGE_MASK;
528		rmap = NULL;
529	}
530	spin_unlock(&mm->page_table_lock);
531	kfree(rmap);
532	return 0;
533}
534
535static void gmap_disconnect_pgtable(struct mm_struct *mm, unsigned long *table)
536{
537	struct gmap_rmap *rmap, *next;
538	struct gmap_pgtable *mp;
539	struct page *page;
540	int flush;
541
542	flush = 0;
543	spin_lock(&mm->page_table_lock);
544	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
545	mp = (struct gmap_pgtable *) page->index;
546	list_for_each_entry_safe(rmap, next, &mp->mapper, list) {
547		*rmap->entry = mp->vmaddr | (_SEGMENT_ENTRY_INVALID |
548					     _SEGMENT_ENTRY_PROTECT);
549		list_del(&rmap->list);
550		kfree(rmap);
551		flush = 1;
552	}
553	spin_unlock(&mm->page_table_lock);
554	if (flush)
555		__tlb_flush_global();
556}
557
558/*
559 * this function is assumed to be called with mmap_sem held
560 */
561unsigned long __gmap_fault(unsigned long address, struct gmap *gmap)
562{
563	unsigned long *segment_ptr, segment;
564	struct gmap_pgtable *mp;
565	struct page *page;
566	int rc;
567
568	current->thread.gmap_addr = address;
569	segment_ptr = gmap_table_walk(address, gmap);
570	if (IS_ERR(segment_ptr))
571		return -EFAULT;
572	/* Convert the gmap address to an mm address. */
573	while (1) {
574		segment = *segment_ptr;
575		if (!(segment & _SEGMENT_ENTRY_INVALID)) {
576			/* Page table is present */
577			page = pfn_to_page(segment >> PAGE_SHIFT);
578			mp = (struct gmap_pgtable *) page->index;
579			return mp->vmaddr | (address & ~PMD_MASK);
580		}
581		if (!(segment & _SEGMENT_ENTRY_PROTECT))
582			/* Nothing mapped in the gmap address space. */
583			break;
584		rc = gmap_connect_pgtable(address, segment, segment_ptr, gmap);
585		if (rc)
586			return rc;
587	}
588	return -EFAULT;
589}
590
591unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
592{
593	unsigned long rc;
594
595	down_read(&gmap->mm->mmap_sem);
596	rc = __gmap_fault(address, gmap);
597	up_read(&gmap->mm->mmap_sem);
598
599	return rc;
600}
601EXPORT_SYMBOL_GPL(gmap_fault);
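
/*
 * Like gmap_translate(), both fault variants return either a user space
 * address or a negative error value cast to unsigned long; on success
 * __gmap_fault() has also connected the process page table with the
 * gmap segment table entry.
 */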
602
603static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
604{
605	if (!non_swap_entry(entry))
606		dec_mm_counter(mm, MM_SWAPENTS);
607	else if (is_migration_entry(entry)) {
608		struct page *page = migration_entry_to_page(entry);
609
610		if (PageAnon(page))
611			dec_mm_counter(mm, MM_ANONPAGES);
612		else
613			dec_mm_counter(mm, MM_FILEPAGES);
614	}
615	free_swap_and_cache(entry);
616}
617
618/*
619 * The mm->mmap_sem lock must be held
620 */
621static void gmap_zap_unused(struct mm_struct *mm, unsigned long address)
622{
623	unsigned long ptev, pgstev;
624	spinlock_t *ptl;
625	pgste_t pgste;
626	pte_t *ptep, pte;
627
628	ptep = get_locked_pte(mm, address, &ptl);
629	if (unlikely(!ptep))
630		return;
631	pte = *ptep;
632	if (!pte_swap(pte))
633		goto out_pte;
634	/* Zap unused and logically-zero pages */
635	pgste = pgste_get_lock(ptep);
636	pgstev = pgste_val(pgste);
637	ptev = pte_val(pte);
638	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
639	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) {
640		gmap_zap_swap_entry(pte_to_swp_entry(pte), mm);
641		pte_clear(mm, address, ptep);
642	}
643	pgste_set_unlock(ptep, pgste);
644out_pte:
645	pte_unmap_unlock(*ptep, ptl);
646}
647
648/*
649 * this function is assumed to be called with mmap_sem held
650 */
651void __gmap_zap(unsigned long address, struct gmap *gmap)
652{
653	unsigned long *table, *segment_ptr;
654	unsigned long segment, pgstev, ptev;
655	struct gmap_pgtable *mp;
656	struct page *page;
657
658	segment_ptr = gmap_table_walk(address, gmap);
659	if (IS_ERR(segment_ptr))
660		return;
661	segment = *segment_ptr;
662	if (segment & _SEGMENT_ENTRY_INVALID)
663		return;
664	page = pfn_to_page(segment >> PAGE_SHIFT);
665	mp = (struct gmap_pgtable *) page->index;
666	address = mp->vmaddr | (address & ~PMD_MASK);
667	/* Page table is present */
668	table = (unsigned long *)(segment & _SEGMENT_ENTRY_ORIGIN);
669	table = table + ((address >> 12) & 0xff);
670	pgstev = table[PTRS_PER_PTE];
671	ptev = table[0];
672	/* quick check, checked again with locks held */
673	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
674	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID)))
675		gmap_zap_unused(gmap->mm, address);
676}
677EXPORT_SYMBOL_GPL(__gmap_zap);
678
679void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap)
680{
681
682	unsigned long *table, address, size;
683	struct vm_area_struct *vma;
684	struct gmap_pgtable *mp;
685	struct page *page;
686
687	down_read(&gmap->mm->mmap_sem);
688	address = from;
689	while (address < to) {
690		/* Walk the gmap address space page table */
691		table = gmap->table + ((address >> 53) & 0x7ff);
692		if (unlikely(*table & _REGION_ENTRY_INVALID)) {
693			address = (address + PMD_SIZE) & PMD_MASK;
694			continue;
695		}
696		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
697		table = table + ((address >> 42) & 0x7ff);
698		if (unlikely(*table & _REGION_ENTRY_INVALID)) {
699			address = (address + PMD_SIZE) & PMD_MASK;
700			continue;
701		}
702		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
703		table = table + ((address >> 31) & 0x7ff);
704		if (unlikely(*table & _REGION_ENTRY_INVALID)) {
705			address = (address + PMD_SIZE) & PMD_MASK;
706			continue;
707		}
708		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
709		table = table + ((address >> 20) & 0x7ff);
710		if (unlikely(*table & _SEGMENT_ENTRY_INVALID)) {
711			address = (address + PMD_SIZE) & PMD_MASK;
712			continue;
713		}
714		page = pfn_to_page(*table >> PAGE_SHIFT);
715		mp = (struct gmap_pgtable *) page->index;
716		vma = find_vma(gmap->mm, mp->vmaddr);
717		size = min(to - address, PMD_SIZE - (address & ~PMD_MASK));
718		zap_page_range(vma, mp->vmaddr | (address & ~PMD_MASK),
719			       size, NULL);
720		address = (address + PMD_SIZE) & PMD_MASK;
721	}
722	up_read(&gmap->mm->mmap_sem);
723}
724EXPORT_SYMBOL_GPL(gmap_discard);
725
726static LIST_HEAD(gmap_notifier_list);
727static DEFINE_SPINLOCK(gmap_notifier_lock);
728
729/**
730 * gmap_register_ipte_notifier - register a pte invalidation callback
731 * @nb: pointer to the gmap notifier block
732 */
733void gmap_register_ipte_notifier(struct gmap_notifier *nb)
734{
735	spin_lock(&gmap_notifier_lock);
736	list_add(&nb->list, &gmap_notifier_list);
737	spin_unlock(&gmap_notifier_lock);
738}
739EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);
740
741/**
742 * gmap_unregister_ipte_notifier - remove a pte invalidation callback
743 * @nb: pointer to the gmap notifier block
744 */
745void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
746{
747	spin_lock(&gmap_notifier_lock);
748	list_del_init(&nb->list);
749	spin_unlock(&gmap_notifier_lock);
750}
751EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);
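
/*
 * Example (hypothetical notifier, names made up for illustration): a
 * consumer provides a gmap_notifier and pairs the register call with
 * gmap_unregister_ipte_notifier() on teardown.
 */
#if 0
static void example_notifier_call(struct gmap *gmap, unsigned long address)
{
	/* react to the invalidation of the pte mapping guest address */
}

static struct gmap_notifier example_notifier = {
	.notifier_call = example_notifier_call,
};

static void example_setup(void)
{
	gmap_register_ipte_notifier(&example_notifier);
}
#endif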
752
753/**
754 * gmap_ipte_notify - mark a range of ptes for invalidation notification
755 * @gmap: pointer to guest mapping meta data structure
756 * @start: virtual address in the guest address space
757 * @len: size of area
758 *
759 * Returns 0 if for each page in the given range a gmap mapping exists and
760 * the invalidation notification could be set. If the gmap mapping is missing
761 * for one or more pages -EFAULT is returned. If no memory could be allocated
762 * -ENOMEM is returned. This function establishes missing page table entries.
763 */
764int gmap_ipte_notify(struct gmap *gmap, unsigned long start, unsigned long len)
765{
766	unsigned long addr;
767	spinlock_t *ptl;
768	pte_t *ptep, entry;
769	pgste_t pgste;
770	int rc = 0;
771
772	if ((start & ~PAGE_MASK) || (len & ~PAGE_MASK))
773		return -EINVAL;
774	down_read(&gmap->mm->mmap_sem);
775	while (len) {
776		/* Convert gmap address and connect the page tables */
777		addr = __gmap_fault(start, gmap);
778		if (IS_ERR_VALUE(addr)) {
779			rc = addr;
780			break;
781		}
782		/* Get the page mapped */
783		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
784			rc = -EFAULT;
785			break;
786		}
787		/* Walk the process page table, lock and get pte pointer */
788		ptep = get_locked_pte(gmap->mm, addr, &ptl);
789		if (unlikely(!ptep))
790			continue;
791		/* Set notification bit in the pgste of the pte */
792		entry = *ptep;
793		if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
794			pgste = pgste_get_lock(ptep);
795			pgste_val(pgste) |= PGSTE_IN_BIT;
796			pgste_set_unlock(ptep, pgste);
797			start += PAGE_SIZE;
798			len -= PAGE_SIZE;
799		}
800		spin_unlock(ptl);
801	}
802	up_read(&gmap->mm->mmap_sem);
803	return rc;
804}
805EXPORT_SYMBOL_GPL(gmap_ipte_notify);
806
807/**
808 * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte.
809 * @mm: pointer to the process mm_struct
810 * @pte: pointer to the page table entry
811 *
812 * This function is assumed to be called with the page table lock held
813 * for the pte to notify.
814 */
815void gmap_do_ipte_notify(struct mm_struct *mm, pte_t *pte)
816{
817	unsigned long segment_offset;
818	struct gmap_notifier *nb;
819	struct gmap_pgtable *mp;
820	struct gmap_rmap *rmap;
821	struct page *page;
822
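	/*
	 * The pte's byte offset within its page table, scaled by
	 * PAGE_SIZE / sizeof(pte_t), gives the offset of the mapped page
	 * within the 1 MB segment covered by that page table.
	 */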
823	segment_offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
824	segment_offset = segment_offset * (4096 / sizeof(pte_t));
825	page = pfn_to_page(__pa(pte) >> PAGE_SHIFT);
826	mp = (struct gmap_pgtable *) page->index;
827	spin_lock(&gmap_notifier_lock);
828	list_for_each_entry(rmap, &mp->mapper, list) {
829		list_for_each_entry(nb, &gmap_notifier_list, list)
830			nb->notifier_call(rmap->gmap,
831					  rmap->vmaddr + segment_offset);
832	}
833	spin_unlock(&gmap_notifier_lock);
834}
835
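/*
 * page->_mapcount is used to tag page table pages: -1 means the page is
 * not (or no longer) used as a page table, 0 marks a full 4K page table
 * with pgstes, and a non-zero fragment mask marks a page split into
 * 2K/1K page table fragments (see page_table_alloc() below).
 */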
836static inline int page_table_with_pgste(struct page *page)
837{
838	return atomic_read(&page->_mapcount) == 0;
839}
840
841static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
842						    unsigned long vmaddr)
843{
844	struct page *page;
845	unsigned long *table;
846	struct gmap_pgtable *mp;
847
848	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
849	if (!page)
850		return NULL;
851	mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT);
852	if (!mp) {
853		__free_page(page);
854		return NULL;
855	}
856	if (!pgtable_page_ctor(page)) {
857		kfree(mp);
858		__free_page(page);
859		return NULL;
860	}
861	mp->vmaddr = vmaddr & PMD_MASK;
862	INIT_LIST_HEAD(&mp->mapper);
863	page->index = (unsigned long) mp;
864	atomic_set(&page->_mapcount, 0);
865	table = (unsigned long *) page_to_phys(page);
866	clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
867	clear_table(table + PTRS_PER_PTE, PGSTE_HR_BIT | PGSTE_HC_BIT,
868		    PAGE_SIZE/2);
869	return table;
870}
871
872static inline void page_table_free_pgste(unsigned long *table)
873{
874	struct page *page;
875	struct gmap_pgtable *mp;
876
877	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
878	mp = (struct gmap_pgtable *) page->index;
879	BUG_ON(!list_empty(&mp->mapper));
880	pgtable_page_dtor(page);
881	atomic_set(&page->_mapcount, -1);
882	kfree(mp);
883	__free_page(page);
884}
885
886static inline unsigned long page_table_reset_pte(struct mm_struct *mm, pmd_t *pmd,
887			unsigned long addr, unsigned long end, bool init_skey)
888{
889	pte_t *start_pte, *pte;
890	spinlock_t *ptl;
891	pgste_t pgste;
892
893	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
894	pte = start_pte;
895	do {
896		pgste = pgste_get_lock(pte);
897		pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
898		if (init_skey) {
899			unsigned long address;
900
901			pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
902					      PGSTE_GR_BIT | PGSTE_GC_BIT);
903
904			/* skip invalid and not writable pages */
905			if (pte_val(*pte) & _PAGE_INVALID ||
906			    !(pte_val(*pte) & _PAGE_WRITE)) {
907				pgste_set_unlock(pte, pgste);
908				continue;
909			}
910
911			address = pte_val(*pte) & PAGE_MASK;
912			page_set_storage_key(address, PAGE_DEFAULT_KEY, 1);
913		}
914		pgste_set_unlock(pte, pgste);
915	} while (pte++, addr += PAGE_SIZE, addr != end);
916	pte_unmap_unlock(start_pte, ptl);
917
918	return addr;
919}
920
921static inline unsigned long page_table_reset_pmd(struct mm_struct *mm, pud_t *pud,
922			unsigned long addr, unsigned long end, bool init_skey)
923{
924	unsigned long next;
925	pmd_t *pmd;
926
927	pmd = pmd_offset(pud, addr);
928	do {
929		next = pmd_addr_end(addr, end);
930		if (pmd_none_or_clear_bad(pmd))
931			continue;
932		next = page_table_reset_pte(mm, pmd, addr, next, init_skey);
933	} while (pmd++, addr = next, addr != end);
934
935	return addr;
936}
937
938static inline unsigned long page_table_reset_pud(struct mm_struct *mm, pgd_t *pgd,
939			unsigned long addr, unsigned long end, bool init_skey)
940{
941	unsigned long next;
942	pud_t *pud;
943
944	pud = pud_offset(pgd, addr);
945	do {
946		next = pud_addr_end(addr, end);
947		if (pud_none_or_clear_bad(pud))
948			continue;
949		next = page_table_reset_pmd(mm, pud, addr, next, init_skey);
950	} while (pud++, addr = next, addr != end);
951
952	return addr;
953}
954
955void page_table_reset_pgste(struct mm_struct *mm, unsigned long start,
956			    unsigned long end, bool init_skey)
957{
958	unsigned long addr, next;
959	pgd_t *pgd;
960
961	addr = start;
962	down_read(&mm->mmap_sem);
963	pgd = pgd_offset(mm, addr);
964	do {
965		next = pgd_addr_end(addr, end);
966		if (pgd_none_or_clear_bad(pgd))
967			continue;
968		next = page_table_reset_pud(mm, pgd, addr, next, init_skey);
969	} while (pgd++, addr = next, addr != end);
970	up_read(&mm->mmap_sem);
971}
972EXPORT_SYMBOL(page_table_reset_pgste);
973
974int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
975			  unsigned long key, bool nq)
976{
977	spinlock_t *ptl;
978	pgste_t old, new;
979	pte_t *ptep;
980
981	down_read(&mm->mmap_sem);
982	ptep = get_locked_pte(mm, addr, &ptl);
983	if (unlikely(!ptep)) {
984		up_read(&mm->mmap_sem);
985		return -EFAULT;
986	}
987
988	new = old = pgste_get_lock(ptep);
989	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
990			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
991	pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
992	pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
993	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
994		unsigned long address, bits, skey;
995
996		address = pte_val(*ptep) & PAGE_MASK;
997		skey = (unsigned long) page_get_storage_key(address);
998		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
999		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
1000		/* Set storage key ACC and FP */
1001		page_set_storage_key(address, skey, !nq);
1002		/* Merge host changed & referenced into pgste  */
1003		pgste_val(new) |= bits << 52;
1004	}
1005	/* changing the guest storage key is considered a change of the page */
1006	if ((pgste_val(new) ^ pgste_val(old)) &
1007	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
1008		pgste_val(new) |= PGSTE_HC_BIT;
1009
1010	pgste_set_unlock(ptep, new);
1011	pte_unmap_unlock(*ptep, ptl);
1012	up_read(&mm->mmap_sem);
1013	return 0;
1014}
1015EXPORT_SYMBOL(set_guest_storage_key);
1016
1017#else /* CONFIG_PGSTE */
1018
1019static inline int page_table_with_pgste(struct page *page)
1020{
1021	return 0;
1022}
1023
1024static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
1025						    unsigned long vmaddr)
1026{
1027	return NULL;
1028}
1029
1030void page_table_reset_pgste(struct mm_struct *mm, unsigned long start,
1031			    unsigned long end, bool init_skey)
1032{
1033}
1034
1035static inline void page_table_free_pgste(unsigned long *table)
1036{
1037}
1038
1039static inline void gmap_disconnect_pgtable(struct mm_struct *mm,
1040					   unsigned long *table)
1041{
1042}
1043
1044#endif /* CONFIG_PGSTE */
1045
1046static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
1047{
1048	unsigned int old, new;
1049
1050	do {
1051		old = atomic_read(v);
1052		new = old ^ bits;
1053	} while (atomic_cmpxchg(v, old, new) != old);
1054	return new;
1055}
1056
1057/*
1058 * page table entry allocation/free routines.
1059 */
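/*
 * Without pgstes a 4K page is split into two 2K (64 bit) or four 1K
 * (31 bit) page table fragments.  The low FRAG_MASK bits of
 * page->_mapcount track which fragments are in use; the same bits
 * shifted left by four mark fragments freed with page_table_free_rcu()
 * that still await the RCU grace period.  Pages with free fragments are
 * kept on mm->context.pgtable_list, protected by mm->context.list_lock.
 */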
1060unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr)
1061{
1062	unsigned long *uninitialized_var(table);
1063	struct page *uninitialized_var(page);
1064	unsigned int mask, bit;
1065
1066	if (mm_has_pgste(mm))
1067		return page_table_alloc_pgste(mm, vmaddr);
1068	/* Allocate fragments of a 4K page as 1K/2K page table */
1069	spin_lock_bh(&mm->context.list_lock);
1070	mask = FRAG_MASK;
1071	if (!list_empty(&mm->context.pgtable_list)) {
1072		page = list_first_entry(&mm->context.pgtable_list,
1073					struct page, lru);
1074		table = (unsigned long *) page_to_phys(page);
1075		mask = atomic_read(&page->_mapcount);
1076		mask = mask | (mask >> 4);
1077	}
1078	if ((mask & FRAG_MASK) == FRAG_MASK) {
1079		spin_unlock_bh(&mm->context.list_lock);
1080		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
1081		if (!page)
1082			return NULL;
1083		if (!pgtable_page_ctor(page)) {
1084			__free_page(page);
1085			return NULL;
1086		}
1087		atomic_set(&page->_mapcount, 1);
1088		table = (unsigned long *) page_to_phys(page);
1089		clear_table(table, _PAGE_INVALID, PAGE_SIZE);
1090		spin_lock_bh(&mm->context.list_lock);
1091		list_add(&page->lru, &mm->context.pgtable_list);
1092	} else {
1093		for (bit = 1; mask & bit; bit <<= 1)
1094			table += PTRS_PER_PTE;
1095		mask = atomic_xor_bits(&page->_mapcount, bit);
1096		if ((mask & FRAG_MASK) == FRAG_MASK)
1097			list_del(&page->lru);
1098	}
1099	spin_unlock_bh(&mm->context.list_lock);
1100	return table;
1101}
1102
1103void page_table_free(struct mm_struct *mm, unsigned long *table)
1104{
1105	struct page *page;
1106	unsigned int bit, mask;
1107
1108	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
1109	if (page_table_with_pgste(page)) {
1110		gmap_disconnect_pgtable(mm, table);
1111		return page_table_free_pgste(table);
1112	}
1113	/* Free 1K/2K page table fragment of a 4K page */
1114	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
1115	spin_lock_bh(&mm->context.list_lock);
1116	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
1117		list_del(&page->lru);
1118	mask = atomic_xor_bits(&page->_mapcount, bit);
1119	if (mask & FRAG_MASK)
1120		list_add(&page->lru, &mm->context.pgtable_list);
1121	spin_unlock_bh(&mm->context.list_lock);
1122	if (mask == 0) {
1123		pgtable_page_dtor(page);
1124		atomic_set(&page->_mapcount, -1);
1125		__free_page(page);
1126	}
1127}
1128
1129static void __page_table_free_rcu(void *table, unsigned bit)
1130{
1131	struct page *page;
1132
1133	if (bit == FRAG_MASK)
1134		return page_table_free_pgste(table);
1135	/* Free 1K/2K page table fragment of a 4K page */
1136	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
1137	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
1138		pgtable_page_dtor(page);
1139		atomic_set(&page->_mapcount, -1);
1140		__free_page(page);
1141	}
1142}
1143
1144void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
1145{
1146	struct mm_struct *mm;
1147	struct page *page;
1148	unsigned int bit, mask;
1149
1150	mm = tlb->mm;
1151	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
1152	if (page_table_with_pgste(page)) {
1153		gmap_disconnect_pgtable(mm, table);
1154		table = (unsigned long *) (__pa(table) | FRAG_MASK);
1155		tlb_remove_table(tlb, table);
1156		return;
1157	}
1158	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
1159	spin_lock_bh(&mm->context.list_lock);
1160	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
1161		list_del(&page->lru);
1162	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
1163	if (mask & FRAG_MASK)
1164		list_add_tail(&page->lru, &mm->context.pgtable_list);
1165	spin_unlock_bh(&mm->context.list_lock);
1166	table = (unsigned long *) (__pa(table) | (bit << 4));
1167	tlb_remove_table(tlb, table);
1168}
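
/*
 * The pseudo pointer handed to tlb_remove_table() carries the free type
 * in its low bits: FRAG_MASK for a full pgste page table, the pending
 * fragment bit shifted into the upper nibble for a 2K/1K fragment, and
 * no bits at all for a crst table, which is simply freed with order
 * ALLOC_ORDER.  __tlb_remove_table() below decodes this again.
 */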
1169
1170static void __tlb_remove_table(void *_table)
1171{
1172	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
1173	void *table = (void *)((unsigned long) _table & ~mask);
1174	unsigned type = (unsigned long) _table & mask;
1175
1176	if (type)
1177		__page_table_free_rcu(table, type);
1178	else
1179		free_pages((unsigned long) table, ALLOC_ORDER);
1180}
1181
1182static void tlb_remove_table_smp_sync(void *arg)
1183{
1184	/* Simply deliver the interrupt */
1185}
1186
1187static void tlb_remove_table_one(void *table)
1188{
1189	/*
1190	 * This isn't an RCU grace period and hence the page-tables cannot be
1191	 * assumed to be actually RCU-freed.
1192	 *
1193	 * It is however sufficient for software page-table walkers that rely
1194	 * on IRQ disabling. See the comment near struct mmu_table_batch.
1195	 */
1196	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
1197	__tlb_remove_table(table);
1198}
1199
1200static void tlb_remove_table_rcu(struct rcu_head *head)
1201{
1202	struct mmu_table_batch *batch;
1203	int i;
1204
1205	batch = container_of(head, struct mmu_table_batch, rcu);
1206
1207	for (i = 0; i < batch->nr; i++)
1208		__tlb_remove_table(batch->tables[i]);
1209
1210	free_page((unsigned long)batch);
1211}
1212
1213void tlb_table_flush(struct mmu_gather *tlb)
1214{
1215	struct mmu_table_batch **batch = &tlb->batch;
1216
1217	if (*batch) {
1218		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
1219		*batch = NULL;
1220	}
1221}
1222
1223void tlb_remove_table(struct mmu_gather *tlb, void *table)
1224{
1225	struct mmu_table_batch **batch = &tlb->batch;
1226
1227	tlb->mm->context.flush_mm = 1;
1228	if (*batch == NULL) {
1229		*batch = (struct mmu_table_batch *)
1230			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
1231		if (*batch == NULL) {
1232			__tlb_flush_mm_lazy(tlb->mm);
1233			tlb_remove_table_one(table);
1234			return;
1235		}
1236		(*batch)->nr = 0;
1237	}
1238	(*batch)->tables[(*batch)->nr++] = table;
1239	if ((*batch)->nr == MAX_TABLE_BATCH)
1240		tlb_flush_mmu(tlb);
1241}
1242
1243#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1244static inline void thp_split_vma(struct vm_area_struct *vma)
1245{
1246	unsigned long addr;
1247
1248	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
1249		follow_page(vma, addr, FOLL_SPLIT);
1250}
1251
1252static inline void thp_split_mm(struct mm_struct *mm)
1253{
1254	struct vm_area_struct *vma;
1255
1256	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
1257		thp_split_vma(vma);
1258		vma->vm_flags &= ~VM_HUGEPAGE;
1259		vma->vm_flags |= VM_NOHUGEPAGE;
1260	}
1261	mm->def_flags |= VM_NOHUGEPAGE;
1262}
1263#else
1264static inline void thp_split_mm(struct mm_struct *mm)
1265{
1266}
1267#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1268
1269static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
1270				struct mm_struct *mm, pud_t *pud,
1271				unsigned long addr, unsigned long end)
1272{
1273	unsigned long next, *table, *new;
1274	struct page *page;
1275	pmd_t *pmd;
1276
1277	pmd = pmd_offset(pud, addr);
1278	do {
1279		next = pmd_addr_end(addr, end);
1280again:
1281		if (pmd_none_or_clear_bad(pmd))
1282			continue;
1283		table = (unsigned long *) pmd_deref(*pmd);
1284		page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
1285		if (page_table_with_pgste(page))
1286			continue;
1287		/* Allocate new page table with pgstes */
1288		new = page_table_alloc_pgste(mm, addr);
1289		if (!new)
1290			return -ENOMEM;
1291
1292		spin_lock(&mm->page_table_lock);
1293		if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
1294			/* Nuke pmd entry pointing to the "short" page table */
1295			pmdp_flush_lazy(mm, addr, pmd);
1296			pmd_clear(pmd);
1297			/* Copy ptes from old table to new table */
1298			memcpy(new, table, PAGE_SIZE/2);
1299			clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
1300			/* Establish new table */
1301			pmd_populate(mm, pmd, (pte_t *) new);
1302			/* Free old table with rcu, there might be a walker! */
1303			page_table_free_rcu(tlb, table);
1304			new = NULL;
1305		}
1306		spin_unlock(&mm->page_table_lock);
1307		if (new) {
1308			page_table_free_pgste(new);
1309			goto again;
1310		}
1311	} while (pmd++, addr = next, addr != end);
1312
1313	return addr;
1314}
1315
1316static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
1317				   struct mm_struct *mm, pgd_t *pgd,
1318				   unsigned long addr, unsigned long end)
1319{
1320	unsigned long next;
1321	pud_t *pud;
1322
1323	pud = pud_offset(pgd, addr);
1324	do {
1325		next = pud_addr_end(addr, end);
1326		if (pud_none_or_clear_bad(pud))
1327			continue;
1328		next = page_table_realloc_pmd(tlb, mm, pud, addr, next);
1329		if (unlikely(IS_ERR_VALUE(next)))
1330			return next;
1331	} while (pud++, addr = next, addr != end);
1332
1333	return addr;
1334}
1335
1336static unsigned long page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
1337					unsigned long addr, unsigned long end)
1338{
1339	unsigned long next;
1340	pgd_t *pgd;
1341
1342	pgd = pgd_offset(mm, addr);
1343	do {
1344		next = pgd_addr_end(addr, end);
1345		if (pgd_none_or_clear_bad(pgd))
1346			continue;
1347		next = page_table_realloc_pud(tlb, mm, pgd, addr, next);
1348		if (unlikely(IS_ERR_VALUE(next)))
1349			return next;
1350	} while (pgd++, addr = next, addr != end);
1351
1352	return 0;
1353}
1354
1355/*
1356 * switch on pgstes for the current userspace process (for kvm)
1357 */
1358int s390_enable_sie(void)
1359{
1360	struct task_struct *tsk = current;
1361	struct mm_struct *mm = tsk->mm;
1362	struct mmu_gather tlb;
1363
1364	/* Do we have pgstes? If yes, we are done */
1365	if (mm_has_pgste(tsk->mm))
1366		return 0;
1367
1368	down_write(&mm->mmap_sem);
1369	/* split thp mappings and disable thp for future mappings */
1370	thp_split_mm(mm);
1371	/* Reallocate the page tables with pgstes */
1372	tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE);
1373	if (!page_table_realloc(&tlb, mm, 0, TASK_SIZE))
1374		mm->context.has_pgste = 1;
1375	tlb_finish_mmu(&tlb, 0, TASK_SIZE);
1376	up_write(&mm->mmap_sem);
1377	return mm->context.has_pgste ? 0 : -ENOMEM;
1378}
1379EXPORT_SYMBOL_GPL(s390_enable_sie);
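
/*
 * Example (hypothetical caller, rc and gmap are placeholders): a
 * hypervisor enables SIE for its mm once before building a guest
 * address space.
 */
#if 0
	rc = s390_enable_sie();
	if (rc)
		return rc;
	gmap = gmap_alloc(current->mm);
#endif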
1380
1381/*
1382 * Enable storage key handling from now on and initialize the storage
1383 * keys with the default key.
1384 */
1385void s390_enable_skey(void)
1386{
1387	/*
1388	 * To avoid races between multiple vcpus that could end up calling
1389	 * page_table_reset_pgste() twice or more, the page_table_lock is
1390	 * taken for serialization.
1391	 */
1392	spin_lock(&current->mm->page_table_lock);
1393	if (mm_use_skey(current->mm)) {
1394		spin_unlock(&current->mm->page_table_lock);
1395		return;
1396	}
1397
1398	current->mm->context.use_skey = 1;
1399	spin_unlock(&current->mm->page_table_lock);
1400	page_table_reset_pgste(current->mm, 0, TASK_SIZE, true);
1401}
1402EXPORT_SYMBOL_GPL(s390_enable_skey);
1403
1404#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1405int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
1406			   pmd_t *pmdp)
1407{
1408	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1409	/* No need to flush the TLB; on s390 the reference bits are in the
1410	 * storage key and never in the TLB. */
1411	return pmdp_test_and_clear_young(vma, address, pmdp);
1412}
1413
1414int pmdp_set_access_flags(struct vm_area_struct *vma,
1415			  unsigned long address, pmd_t *pmdp,
1416			  pmd_t entry, int dirty)
1417{
1418	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1419
1420	if (pmd_same(*pmdp, entry))
1421		return 0;
1422	pmdp_invalidate(vma, address, pmdp);
1423	set_pmd_at(vma->vm_mm, address, pmdp, entry);
1424	return 1;
1425}
1426
1427static void pmdp_splitting_flush_sync(void *arg)
1428{
1429	/* Simply deliver the interrupt */
1430}
1431
1432void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
1433			  pmd_t *pmdp)
1434{
1435	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1436	if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
1437			      (unsigned long *) pmdp)) {
1438		/* need to serialize against gup-fast (IRQ disabled) */
1439		smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
1440	}
1441}
1442
1443void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
1444				pgtable_t pgtable)
1445{
1446	struct list_head *lh = (struct list_head *) pgtable;
1447
1448	assert_spin_locked(pmd_lockptr(mm, pmdp));
1449
1450	/* FIFO */
1451	if (!pmd_huge_pte(mm, pmdp))
1452		INIT_LIST_HEAD(lh);
1453	else
1454		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
1455	pmd_huge_pte(mm, pmdp) = pgtable;
1456}
1457
1458pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
1459{
1460	struct list_head *lh;
1461	pgtable_t pgtable;
1462	pte_t *ptep;
1463
1464	assert_spin_locked(pmd_lockptr(mm, pmdp));
1465
1466	/* FIFO */
1467	pgtable = pmd_huge_pte(mm, pmdp);
1468	lh = (struct list_head *) pgtable;
1469	if (list_empty(lh))
1470		pmd_huge_pte(mm, pmdp) = NULL;
1471	else {
1472		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
1473		list_del(lh);
1474	}
1475	ptep = (pte_t *) pgtable;
1476	pte_val(*ptep) = _PAGE_INVALID;
1477	ptep++;
1478	pte_val(*ptep) = _PAGE_INVALID;
1479	return pgtable;
1480}
1481#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1482