pgtable.c revision 9501d09fa3c4ca18971083dfb0c9aa1afc85f19c
/*
 *    Copyright IBM Corp. 2007, 2011
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif

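/*
 * Allocate the 2^ALLOC_ORDER pages that make up one region or segment
 * (crst) table. Returns the table address or NULL if out of memory.
 */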
unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
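/*
 * crst_table_upgrade - add upper region table levels to an mm
 *
 * Allocates a new top-level table, makes the old top level its first
 * entry and raises context.asce_limit from 2G to 4T or from 4T to 8P,
 * repeating until the requested limit is reached.
 */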
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;

	BUG_ON(limit > (1UL << 53));
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	return 0;
}

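/*
 * crst_table_downgrade - remove upper region table levels from an mm
 *
 * Drops top-level tables until context.asce_limit is no larger than
 * the requested limit, freeing each removed table.
 */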
void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
}
#endif

#ifdef CONFIG_PGSTE

/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 *
 * Returns a guest address space structure or NULL if out of memory.
 */
struct gmap *gmap_alloc(struct mm_struct *mm)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;

	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		goto out_free;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, _REGION1_ENTRY_EMPTY);
	gmap->table = table;
	gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH |
		     _ASCE_USER_BITS | __pa(table);
	list_add(&gmap->list, &mm->context.gmap_list);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);

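/*
 * Remove the rmap that connects a gmap segment table entry to its page
 * table and invalidate the entry again. Returns 1 if a valid entry was
 * cleared and the caller has to flush the TLB, 0 otherwise.
 */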
static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table)
{
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct page *page;

	if (*table & _SEGMENT_ENTRY_INV)
		return 0;
	page = pfn_to_page(*table >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry(rmap, &mp->mapper, list) {
		if (rmap->entry != table)
			continue;
		list_del(&rmap->list);
		kfree(rmap);
		break;
	}
	*table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
	return 1;
}

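/* Flush all TLB entries created through the guest address space. */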
static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;
	unsigned long *table;
	int i;

	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
		table = (unsigned long *) page_to_phys(page);
		if ((*table & _REGION_ENTRY_TYPE_MASK) == 0)
			/* Remove gmap rmap structures for segment table. */
			for (i = 0; i < PTRS_PER_PMD; i++, table++)
				gmap_unlink_segment(gmap, table);
		__free_pages(page, ALLOC_ORDER);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);

/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap,
			       unsigned long *table, unsigned long init)
{
	struct page *page;
	unsigned long *new;

	/* since we don't free the gmap table until gmap_free we can unlock */
	spin_unlock(&gmap->mm->page_table_lock);
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	spin_lock(&gmap->mm->page_table_lock);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	if (*table & _REGION_ENTRY_INV) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
	} else
		__free_pages(page, ALLOC_ORDER);
	return 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the guest addr space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Clear segment table entry in guest address space. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INV;
	}
out:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the map succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len > PGDIR_SIZE ||
	    from + len < from || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the gmap address space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Store 'from' address in an invalid segment table entry. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | (from + off);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;

out_unmap:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);

/*
 * this function is assumed to be called with mmap_sem held
 */
unsigned long __gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long *table, vmaddr, segment;
	struct mm_struct *mm;
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct vm_area_struct *vma;
	struct page *page;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	current->thread.gmap_addr = address;
	mm = gmap->mm;
	/* Walk the gmap address space page table */
	table = gmap->table + ((address >> 53) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 42) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 31) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 20) & 0x7ff);

	/* Convert the gmap address to an mm address. */
	segment = *table;
	if (likely(!(segment & _SEGMENT_ENTRY_INV))) {
		page = pfn_to_page(segment >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		return mp->vmaddr | (address & ~PMD_MASK);
	} else if (segment & _SEGMENT_ENTRY_RO) {
		vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
		vma = find_vma(mm, vmaddr);
		if (!vma || vma->vm_start > vmaddr)
			return -EFAULT;

		/* Walk the parent mm page table */
		pgd = pgd_offset(mm, vmaddr);
		pud = pud_alloc(mm, pgd, vmaddr);
		if (!pud)
			return -ENOMEM;
		pmd = pmd_alloc(mm, pud, vmaddr);
		if (!pmd)
			return -ENOMEM;
		if (!pmd_present(*pmd) &&
		    __pte_alloc(mm, vma, pmd, vmaddr))
			return -ENOMEM;
		/* pmd now points to a valid segment table entry. */
		rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT);
		if (!rmap)
			return -ENOMEM;
		/* Link gmap segment table entry location to page table. */
		page = pmd_page(*pmd);
		mp = (struct gmap_pgtable *) page->index;
		rmap->entry = table;
		spin_lock(&mm->page_table_lock);
		list_add(&rmap->list, &mp->mapper);
		spin_unlock(&mm->page_table_lock);
		/* Set gmap segment table entry to page table. */
		*table = pmd_val(*pmd) & PAGE_MASK;
		return vmaddr | (address & ~PMD_MASK);
	}
	return -EFAULT;
}

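/*
 * gmap_fault - translate a guest address to a parent mm address
 *
 * Same as __gmap_fault, but takes the mmap_sem of the parent mm itself.
 */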
unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_fault(address, gmap);
	up_read(&gmap->mm->mmap_sem);

	return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);

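/*
 * gmap_discard - zap the parent mm pages that back a guest address range
 * @from: first guest address
 * @to: guest address following the last one to discard
 * @gmap: pointer to the guest address space structure
 */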
void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap)
{
	unsigned long *table, address, size;
	struct vm_area_struct *vma;
	struct gmap_pgtable *mp;
	struct page *page;

	down_read(&gmap->mm->mmap_sem);
	address = from;
	while (address < to) {
		/* Walk the gmap address space page table */
		table = gmap->table + ((address >> 53) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 42) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 31) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 20) & 0x7ff);
		if (unlikely(*table & _SEGMENT_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		page = pfn_to_page(*table >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		vma = find_vma(gmap->mm, mp->vmaddr);
		size = min(to - address, PMD_SIZE - (address & ~PMD_MASK));
		zap_page_range(vma, mp->vmaddr | (address & ~PMD_MASK),
			       size, NULL);
		address = (address + PMD_SIZE) & PMD_MASK;
	}
	up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);

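/*
 * Called when a page table is removed from the parent mm: detach all
 * gmap segment table entries that point to it and flush the TLB if
 * any entry was changed.
 */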
void gmap_unmap_notifier(struct mm_struct *mm, unsigned long *table)
{
	struct gmap_rmap *rmap, *next;
	struct gmap_pgtable *mp;
	struct page *page;
	int flush;

	flush = 0;
	spin_lock(&mm->page_table_lock);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry_safe(rmap, next, &mp->mapper, list) {
		*rmap->entry =
			_SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
		list_del(&rmap->list);
		kfree(rmap);
		flush = 1;
	}
	spin_unlock(&mm->page_table_lock);
	if (flush)
		__tlb_flush_global();
}

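/*
 * Allocate a page table with pgstes: the full 4K page holds 256 pte
 * entries followed by 256 page status table entries, plus a
 * gmap_pgtable that records the parent address and the rmap list.
 */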
static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	struct page *page;
	unsigned long *table;
	struct gmap_pgtable *mp;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT);
	if (!mp) {
		__free_page(page);
		return NULL;
	}
	pgtable_page_ctor(page);
	mp->vmaddr = vmaddr & PMD_MASK;
	INIT_LIST_HEAD(&mp->mapper);
	page->index = (unsigned long) mp;
	atomic_set(&page->_mapcount, 3);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	return table;
}

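/* Free a page table with pgstes together with its gmap_pgtable. */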
static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;
	struct gmap_pgtable *mp;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	BUG_ON(!list_empty(&mp->mapper));
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	kfree(mp);
	__free_page(page);
}

#else /* CONFIG_PGSTE */

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	return NULL;
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_unmap_notifier(struct mm_struct *mm,
					  unsigned long *table)
{
}

#endif /* CONFIG_PGSTE */

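/* Atomically toggle the given bits in *v and return the new value. */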
static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr)
{
	unsigned long *uninitialized_var(table);
	struct page *uninitialized_var(page);
	unsigned int mask, bit;

	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm, vmaddr);
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		pgtable_page_ctor(page);
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	if (mm_has_pgste(mm)) {
		gmap_unmap_notifier(mm, table);
		return page_table_free_pgste(table);
	}
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

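/*
 * Really free a page table fragment (or a pgste page table) once the
 * delayed free via tlb_remove_table is due.
 */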
static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

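/*
 * Delay the freeing of a page table until after the TLB flush and the
 * RCU grace period. The fragment bits are encoded in the low bits of
 * the table pointer so that __tlb_remove_table knows what to free.
 */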
void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	if (mm_has_pgste(mm)) {
		gmap_unmap_notifier(mm, table);
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}

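/*
 * Decode the type bits stored in the low bits of the table pointer and
 * free either a page table fragment or a full crst table.
 */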
void __tlb_remove_table(void *_table)
{
	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
	void *table = (void *)((unsigned long) _table & ~mask);
	unsigned type = (unsigned long) _table & mask;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely
	 * on IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		__tlb_flush_mm(tlb->mm);
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

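/*
 * Queue a table for RCU freeing. If no batch page can be allocated,
 * fall back to an IPI broadcast plus an immediate free.
 */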
void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			__tlb_flush_mm(tlb->mm);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}

/*
 * switch on pgstes for the current userspace process (for kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm, *old_mm;

	/* Do we have a switched amode? If not, we cannot do sie */
	if (s390_user_mode == HOME_SPACE_MODE)
		return -EINVAL;

	/* Do we have pgstes? If yes, we are done */
	if (mm_has_pgste(tsk->mm))
		return 0;

	/* Let's check if we are allowed to replace the mm */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		task_unlock(tsk);
		return -EINVAL;
	}
	task_unlock(tsk);

	/* We copy the mm and let dup_mm create the page tables with pgstes */
	tsk->mm->context.alloc_pgste = 1;
	/* make sure that both mms have a correct rss state */
	sync_mm_rss(tsk->mm);
	mm = dup_mm(tsk);
	tsk->mm->context.alloc_pgste = 0;
	if (!mm)
		return -ENOMEM;

	/* Now let's check again if something happened */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		mmput(mm);
		task_unlock(tsk);
		return -EINVAL;
	}

	/* ok, we are alone. No ptrace, no threads, etc. */
	old_mm = tsk->mm;
	tsk->mm = tsk->active_mm = mm;
	preempt_disable();
	update_mm(mm, tsk);
	atomic_inc(&mm->context.attach_count);
	atomic_dec(&old_mm->context.attach_count);
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
	preempt_enable();
	task_unlock(tsk);
	mmput(old_mm);
	return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION)
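/*
 * Check whether a kernel page is currently mapped by probing its
 * address with the lra instruction.
 */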
bool kernel_page_present(struct page *page)
{
	unsigned long addr;
	int cc;

	addr = page_to_phys(page);
	asm volatile(
		"	lra	%1,0(%1)\n"
		"	ipm	%0\n"
		"	srl	%0,28"
		: "=d" (cc), "+a" (addr) : : "cc");
	return cc == 0;
}
#endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void pmdp_splitting_flush_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

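/*
 * Mark a huge pmd as splitting and serialize against gup_fast, which
 * walks page tables with interrupts disabled, by sending an IPI.
 */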
void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
			      (unsigned long *) pmdp)) {
		/* need to serialize against gup-fast (IRQ disabled) */
		smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
	}
}

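/*
 * Deposit a preallocated page table for a huge pmd. The tables are
 * kept on a list headed by mm->pmd_huge_pte, chained through a
 * list_head placed at the start of each table.
 */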
void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(&mm->page_table_lock);

	/* FIFO */
	if (!mm->pmd_huge_pte)
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) mm->pmd_huge_pte);
	mm->pmd_huge_pte = pgtable;
}

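/*
 * Withdraw a previously deposited page table and restore the two pte
 * slots that were used as the list_head.
 */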
pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(&mm->page_table_lock);

	/* FIFO */
	pgtable = mm->pmd_huge_pte;
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		mm->pmd_huge_pte = NULL;
	else {
		mm->pmd_huge_pte = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	pte_val(*ptep) = _PAGE_TYPE_EMPTY;
	ptep++;
	pte_val(*ptep) = _PAGE_TYPE_EMPTY;
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */