pgtable.c revision 2320c5793790fcda80e6dcc088dbda86040235e5
/*
 *    Copyright IBM Corp. 2007, 2011
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif

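/*
 * Allocate a crst (combined region and segment) table: 2^ALLOC_ORDER
 * pages, i.e. 16KB on 64 bit and 8KB on 31 bit.  The table is handed
 * out via its physical address, which works because the kernel address
 * space is identity mapped.
 */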
unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
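/*
 * Grow the user address space limit (mm->context.asce_limit) to at
 * least @limit by allocating new top-level region tables and hooking
 * the previous top level below them, then reload the address space
 * control element via update_mm().
 */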
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;

	BUG_ON(limit > (1UL << 53));
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	update_mm(mm, current);
	return 0;
}

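/*
 * Shrink the user address space limit again by dropping top-level
 * region tables until asce_limit is at or below @limit, then reload
 * the address space control element via update_mm().
 */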
void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (mm->context.asce_limit <= limit)
		return;
	__tlb_flush_mm(mm);
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	update_mm(mm, current);
}
#endif

#ifdef CONFIG_PGSTE

/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;

	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		goto out_free;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, _REGION1_ENTRY_EMPTY);
	gmap->table = table;
	gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH |
		     _ASCE_USER_BITS | __pa(table);
	list_add(&gmap->list, &mm->context.gmap_list);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);

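/*
 * Remove the reverse-map entry that links a gmap segment table entry
 * to a parent page table and invalidate the entry.  Returns 1 if the
 * entry was valid and has been invalidated, i.e. a TLB flush is needed.
 */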
static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table)
{
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct page *page;

	if (*table & _SEGMENT_ENTRY_INV)
		return 0;
	page = pfn_to_page(*table >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry(rmap, &mp->mapper, list) {
		if (rmap->entry != table)
			continue;
		list_del(&rmap->list);
		kfree(rmap);
		break;
	}
	*table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
	return 1;
}

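/* Flush the TLB entries of a guest address space, by IDTE if available. */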
static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;
	unsigned long *table;
	int i;

	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
		table = (unsigned long *) page_to_phys(page);
		if ((*table & _REGION_ENTRY_TYPE_MASK) == 0)
			/* Remove gmap rmap structures for segment table. */
			for (i = 0; i < PTRS_PER_PMD; i++, table++)
				gmap_unlink_segment(gmap, table);
		__free_pages(page, ALLOC_ORDER);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);

/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap,
			       unsigned long *table, unsigned long init)
{
	struct page *page;
	unsigned long *new;

	/* Since we don't free the gmap table until gmap_free we can unlock. */
	spin_unlock(&gmap->mm->page_table_lock);
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	spin_lock(&gmap->mm->page_table_lock);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	if (*table & _REGION_ENTRY_INV) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
	} else
		__free_pages(page, ALLOC_ORDER);
	return 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the guest addr space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Clear segment table entry in guest address space. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INV;
	}
out:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the map succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len > PGDIR_SIZE ||
	    from + len < from || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the gmap address space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Store 'from' address in an invalid segment table entry. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | (from + off);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;

out_unmap:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);

/*
 * __gmap_fault - translate a guest address to a parent mm address,
 * linking the gmap segment table entry to the parent page table if
 * necessary.  This function is assumed to be called with mmap_sem held.
 */
unsigned long __gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long *table, vmaddr, segment;
	struct mm_struct *mm;
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct vm_area_struct *vma;
	struct page *page;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	current->thread.gmap_addr = address;
	mm = gmap->mm;
	/* Walk the gmap address space page table */
	table = gmap->table + ((address >> 53) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 42) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 31) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 20) & 0x7ff);

	/* Convert the gmap address to an mm address. */
	segment = *table;
	if (likely(!(segment & _SEGMENT_ENTRY_INV))) {
		page = pfn_to_page(segment >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		return mp->vmaddr | (address & ~PMD_MASK);
	} else if (segment & _SEGMENT_ENTRY_RO) {
		vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
		vma = find_vma(mm, vmaddr);
		if (!vma || vma->vm_start > vmaddr)
			return -EFAULT;

		/* Walk the parent mm page table */
		pgd = pgd_offset(mm, vmaddr);
		pud = pud_alloc(mm, pgd, vmaddr);
		if (!pud)
			return -ENOMEM;
		pmd = pmd_alloc(mm, pud, vmaddr);
		if (!pmd)
			return -ENOMEM;
		if (!pmd_present(*pmd) &&
		    __pte_alloc(mm, vma, pmd, vmaddr))
			return -ENOMEM;
		/* pmd now points to a valid segment table entry. */
		rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT);
		if (!rmap)
			return -ENOMEM;
		/* Link gmap segment table entry location to page table. */
		page = pmd_page(*pmd);
		mp = (struct gmap_pgtable *) page->index;
		rmap->entry = table;
		spin_lock(&mm->page_table_lock);
		list_add(&rmap->list, &mp->mapper);
		spin_unlock(&mm->page_table_lock);
		/* Set gmap segment table entry to page table. */
		*table = pmd_val(*pmd) & PAGE_MASK;
		return vmaddr | (address & ~PMD_MASK);
	}
	return -EFAULT;
}

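/**
 * gmap_fault - resolve a fault on a guest address
 * @address: address in the guest address space
 * @gmap: pointer to the guest address space structure
 *
 * Takes mmap_sem of the parent mm and calls __gmap_fault.  Returns the
 * parent address space address for @address, or -EFAULT / -ENOMEM.
 */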
unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_fault(address, gmap);
	up_read(&gmap->mm->mmap_sem);

	return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);

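/*
 * Zap the parent mm pages that back the guest address range [from,to).
 * Guest segments that are not mapped are simply skipped.
 */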
void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap)
{
	unsigned long *table, address, size;
	struct vm_area_struct *vma;
	struct gmap_pgtable *mp;
	struct page *page;

	down_read(&gmap->mm->mmap_sem);
	address = from;
	while (address < to) {
		/* Walk the gmap address space page table */
		table = gmap->table + ((address >> 53) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 42) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 31) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 20) & 0x7ff);
		if (unlikely(*table & _SEGMENT_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		page = pfn_to_page(*table >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		vma = find_vma(gmap->mm, mp->vmaddr);
		size = min(to - address, PMD_SIZE - (address & ~PMD_MASK));
		zap_page_range(vma, mp->vmaddr | (address & ~PMD_MASK),
			       size, NULL);
		address = (address + PMD_SIZE) & PMD_MASK;
	}
	up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);

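/*
 * Called when a page table is removed from the parent mm.  All gmap
 * segment table entries that still point to the page table are reset
 * to the invalid protected state, the reverse-map entries are freed,
 * and a global TLB flush is done if anything was unlinked.
 */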
void gmap_unmap_notifier(struct mm_struct *mm, unsigned long *table)
{
	struct gmap_rmap *rmap, *next;
	struct gmap_pgtable *mp;
	struct page *page;
	int flush;

	flush = 0;
	spin_lock(&mm->page_table_lock);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry_safe(rmap, next, &mp->mapper, list) {
		*rmap->entry =
			_SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
		list_del(&rmap->list);
		kfree(rmap);
		flush = 1;
	}
	spin_unlock(&mm->page_table_lock);
	if (flush)
		__tlb_flush_global();
}

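/*
 * Allocate a full 4K page for a pgste page table: the lower half holds
 * the 256 pte entries, the upper half the corresponding page status
 * table extensions (pgstes) used by KVM.  The backing struct page
 * carries a gmap_pgtable in page->index so the table can be found from
 * the gmap reverse mappings.
 */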
static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	struct page *page;
	unsigned long *table;
	struct gmap_pgtable *mp;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT);
	if (!mp) {
		__free_page(page);
		return NULL;
	}
	pgtable_page_ctor(page);
	mp->vmaddr = vmaddr & PMD_MASK;
	INIT_LIST_HEAD(&mp->mapper);
	page->index = (unsigned long) mp;
	atomic_set(&page->_mapcount, 3);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	return table;
}

static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;
	struct gmap_pgtable *mp;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	BUG_ON(!list_empty(&mp->mapper));
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	kfree(mp);
	__free_page(page);
}

#else /* CONFIG_PGSTE */

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	return NULL;
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_unmap_notifier(struct mm_struct *mm,
					  unsigned long *table)
{
}

#endif /* CONFIG_PGSTE */

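/*
 * Toggle the given bits in *v with a cmpxchg loop and return the new
 * value.  Used to manipulate the page table fragment bits kept in
 * page->_mapcount.
 */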
static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
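/*
 * A 4K page can hold two 2K page table fragments (64 bit) or four 1K
 * fragments (31 bit).  The low nibble of page->_mapcount records which
 * fragments are in use, the high nibble marks fragments that await the
 * RCU grace period before reuse (see page_table_free_rcu).  Pages with
 * free fragments are kept on mm->context.pgtable_list.
 */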
unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr)
{
	struct page *page;
	unsigned long *table;
	unsigned int mask, bit;

	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm, vmaddr);
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		pgtable_page_ctor(page);
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

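/*
 * Return a page table fragment to the per-mm list and free the backing
 * page once all of its fragments are unused.  pgste page tables always
 * occupy a full page and are freed directly.
 */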
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	if (mm_has_pgste(mm)) {
		gmap_unmap_notifier(mm, table);
		return page_table_free_pgste(table);
	}
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

#ifdef CONFIG_HAVE_RCU_TABLE_FREE

static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

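/*
 * Delayed variant of page_table_free: the fragment is marked as
 * pending in the high nibble of page->_mapcount and the fragment bit
 * is encoded into the low bits of the table pointer handed to
 * tlb_remove_table, so that __tlb_remove_table can finish the free
 * after the TLB has been flushed.
 */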
void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	if (mm_has_pgste(mm)) {
		gmap_unmap_notifier(mm, table);
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}

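/*
 * Called by the mmu_gather table-free machinery once it is safe to
 * free: a non-zero type in the low pointer bits identifies a page
 * table fragment (or FRAG_MASK for a pgste table), otherwise the
 * pointer is a crst table of ALLOC_ORDER pages.
 */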
void __tlb_remove_table(void *_table)
{
	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
	void *table = (void *)((unsigned long) _table & ~mask);
	unsigned type = (unsigned long) _table & mask;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}

#endif

/*
 * Switch on pgstes (page status table extensions) for the current
 * userspace process.  This is needed to run KVM guests via the SIE
 * instruction.
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm, *old_mm;

	/* Do we have a switched amode? If not, we cannot do sie. */
	if (user_mode == HOME_SPACE_MODE)
		return -EINVAL;

	/* Do we have pgstes? If yes, we are done. */
	if (mm_has_pgste(tsk->mm))
		return 0;

	/* Let's check if we are allowed to replace the mm. */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		task_unlock(tsk);
		return -EINVAL;
	}
	task_unlock(tsk);

	/* We copy the mm and let dup_mm create the page tables with pgstes. */
	tsk->mm->context.alloc_pgste = 1;
	mm = dup_mm(tsk);
	tsk->mm->context.alloc_pgste = 0;
	if (!mm)
		return -ENOMEM;

	/* Now let's check again if something happened. */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		mmput(mm);
		task_unlock(tsk);
		return -EINVAL;
	}

	/* OK, we are alone. No ptrace, no threads, etc. */
	old_mm = tsk->mm;
	tsk->mm = tsk->active_mm = mm;
	preempt_disable();
	update_mm(mm, tsk);
	atomic_inc(&mm->context.attach_count);
	atomic_dec(&old_mm->context.attach_count);
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
	preempt_enable();
	task_unlock(tsk);
	mmput(old_mm);
	return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION)
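/*
 * Check whether a kernel page is currently mapped by probing its
 * address with the LRA (load real address) instruction; condition
 * code 0 means the translation is available.
 */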
bool kernel_page_present(struct page *page)
{
	unsigned long addr;
	int cc;

	addr = page_to_phys(page);
	asm volatile(
		"	lra	%1,0(%1)\n"
		"	ipm	%0\n"
		"	srl	%0,28"
		: "=d" (cc), "+a" (addr) : : "cc");
	return cc == 0;
}
#endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */
