1/*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
22 */
23
24#include <linux/init.h>
25#include <linux/bitmap.h>
26#include <linux/debugfs.h>
27#include <linux/export.h>
28#include <linux/slab.h>
29#include <linux/irq.h>
30#include <linux/interrupt.h>
31#include <linux/spinlock.h>
32#include <linux/pci.h>
33#include <linux/dmar.h>
34#include <linux/dma-mapping.h>
35#include <linux/mempool.h>
36#include <linux/timer.h>
37#include <linux/iova.h>
38#include <linux/iommu.h>
39#include <linux/intel-iommu.h>
40#include <linux/syscore_ops.h>
41#include <linux/tboot.h>
42#include <linux/dmi.h>
43#include <linux/pci-ats.h>
44#include <linux/memblock.h>
45#include <asm/cacheflush.h>
46#include <asm/iommu.h>
47
48#define ROOT_SIZE		VTD_PAGE_SIZE
49#define CONTEXT_SIZE		VTD_PAGE_SIZE
50
51#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
52#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
53#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
54
55#define IOAPIC_RANGE_START	(0xfee00000)
56#define IOAPIC_RANGE_END	(0xfeefffff)
57#define IOVA_START_ADDR		(0x1000)
58
59#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
60
61#define MAX_AGAW_WIDTH 64
62
63#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
64#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
65
66/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
67   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
68#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
69				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
70#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
71
72#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
73#define DMA_32BIT_PFN		IOVA_PFN(DMA_BIT_MASK(32))
74#define DMA_64BIT_PFN		IOVA_PFN(DMA_BIT_MASK(64))
75
76/* page table handling */
77#define LEVEL_STRIDE		(9)
78#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
79
80/*
81 * This bitmap is used to advertise the page sizes our hardware support
82 * to the IOMMU core, which will then use this information to split
83 * physically contiguous memory regions it is mapping into page sizes
84 * that we support.
85 *
86 * Traditionally the IOMMU core just handed us the mappings directly,
87 * after making sure the size is an order of a 4KiB page and that the
88 * mapping has natural alignment.
89 *
90 * To retain this behavior, we currently advertise that we support
91 * all page sizes that are an order of 4KiB.
92 *
93 * If at some point we'd like to utilize the IOMMU core's new behavior,
94 * we could change this to advertise the real page sizes we support.
95 */
96#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
97
98static inline int agaw_to_level(int agaw)
99{
100	return agaw + 2;
101}
102
103static inline int agaw_to_width(int agaw)
104{
105	return 30 + agaw * LEVEL_STRIDE;
106}
107
108static inline int width_to_agaw(int width)
109{
110	return (width - 30) / LEVEL_STRIDE;
111}
112
113static inline unsigned int level_to_offset_bits(int level)
114{
115	return (level - 1) * LEVEL_STRIDE;
116}
117
118static inline int pfn_level_offset(unsigned long pfn, int level)
119{
120	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
121}
122
123static inline unsigned long level_mask(int level)
124{
125	return -1UL << level_to_offset_bits(level);
126}
127
128static inline unsigned long level_size(int level)
129{
130	return 1UL << level_to_offset_bits(level);
131}
132
133static inline unsigned long align_to_level(unsigned long pfn, int level)
134{
135	return (pfn + level_size(level) - 1) & level_mask(level);
136}
137
138static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
139{
140	return  1 << ((lvl - 1) * LEVEL_STRIDE);
141}
142
143/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
144   are never going to work. */
145static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
146{
147	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
148}
149
150static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
151{
152	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
153}
154static inline unsigned long page_to_dma_pfn(struct page *pg)
155{
156	return mm_to_dma_pfn(page_to_pfn(pg));
157}
158static inline unsigned long virt_to_dma_pfn(void *p)
159{
160	return page_to_dma_pfn(virt_to_page(p));
161}
162
163/* global iommu list, set NULL for ignored DMAR units */
164static struct intel_iommu **g_iommus;
165
166static void __init check_tylersburg_isoch(void);
167static int rwbf_quirk;
168
169/*
170 * set to 1 to panic kernel if can't successfully enable VT-d
171 * (used when kernel is launched w/ TXT)
172 */
173static int force_on = 0;
174
175/*
176 * 0: Present
177 * 1-11: Reserved
178 * 12-63: Context Ptr (12 - (haw-1))
179 * 64-127: Reserved
180 */
181struct root_entry {
182	u64	val;
183	u64	rsvd1;
184};
185#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186static inline bool root_present(struct root_entry *root)
187{
188	return (root->val & 1);
189}
190static inline void set_root_present(struct root_entry *root)
191{
192	root->val |= 1;
193}
194static inline void set_root_value(struct root_entry *root, unsigned long value)
195{
196	root->val |= value & VTD_PAGE_MASK;
197}
198
199static inline struct context_entry *
200get_context_addr_from_root(struct root_entry *root)
201{
202	return (struct context_entry *)
203		(root_present(root)?phys_to_virt(
204		root->val & VTD_PAGE_MASK) :
205		NULL);
206}
207
208/*
209 * low 64 bits:
210 * 0: present
211 * 1: fault processing disable
212 * 2-3: translation type
213 * 12-63: address space root
214 * high 64 bits:
215 * 0-2: address width
216 * 3-6: aval
217 * 8-23: domain id
218 */
219struct context_entry {
220	u64 lo;
221	u64 hi;
222};
223
224static inline bool context_present(struct context_entry *context)
225{
226	return (context->lo & 1);
227}
228static inline void context_set_present(struct context_entry *context)
229{
230	context->lo |= 1;
231}
232
233static inline void context_set_fault_enable(struct context_entry *context)
234{
235	context->lo &= (((u64)-1) << 2) | 1;
236}
237
238static inline void context_set_translation_type(struct context_entry *context,
239						unsigned long value)
240{
241	context->lo &= (((u64)-1) << 4) | 3;
242	context->lo |= (value & 3) << 2;
243}
244
245static inline void context_set_address_root(struct context_entry *context,
246					    unsigned long value)
247{
248	context->lo |= value & VTD_PAGE_MASK;
249}
250
251static inline void context_set_address_width(struct context_entry *context,
252					     unsigned long value)
253{
254	context->hi |= value & 7;
255}
256
257static inline void context_set_domain_id(struct context_entry *context,
258					 unsigned long value)
259{
260	context->hi |= (value & ((1 << 16) - 1)) << 8;
261}
262
263static inline void context_clear_entry(struct context_entry *context)
264{
265	context->lo = 0;
266	context->hi = 0;
267}
268
269/*
270 * 0: readable
271 * 1: writable
272 * 2-6: reserved
273 * 7: super page
274 * 8-10: available
275 * 11: snoop behavior
276 * 12-63: Host physcial address
277 */
278struct dma_pte {
279	u64 val;
280};
281
282static inline void dma_clear_pte(struct dma_pte *pte)
283{
284	pte->val = 0;
285}
286
287static inline void dma_set_pte_readable(struct dma_pte *pte)
288{
289	pte->val |= DMA_PTE_READ;
290}
291
292static inline void dma_set_pte_writable(struct dma_pte *pte)
293{
294	pte->val |= DMA_PTE_WRITE;
295}
296
297static inline void dma_set_pte_snp(struct dma_pte *pte)
298{
299	pte->val |= DMA_PTE_SNP;
300}
301
302static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
303{
304	pte->val = (pte->val & ~3) | (prot & 3);
305}
306
307static inline u64 dma_pte_addr(struct dma_pte *pte)
308{
309#ifdef CONFIG_64BIT
310	return pte->val & VTD_PAGE_MASK;
311#else
312	/* Must have a full atomic 64-bit read */
313	return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
314#endif
315}
316
317static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
318{
319	pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
320}
321
322static inline bool dma_pte_present(struct dma_pte *pte)
323{
324	return (pte->val & 3) != 0;
325}
326
327static inline bool dma_pte_superpage(struct dma_pte *pte)
328{
329	return (pte->val & (1 << 7));
330}
331
332static inline int first_pte_in_page(struct dma_pte *pte)
333{
334	return !((unsigned long)pte & ~VTD_PAGE_MASK);
335}
336
337/*
338 * This domain is a statically identity mapping domain.
339 *	1. This domain creats a static 1:1 mapping to all usable memory.
340 * 	2. It maps to each iommu if successful.
341 *	3. Each iommu mapps to this domain if successful.
342 */
343static struct dmar_domain *si_domain;
344static int hw_pass_through = 1;
345
346/* devices under the same p2p bridge are owned in one domain */
347#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
348
349/* domain represents a virtual machine, more than one devices
350 * across iommus may be owned in one domain, e.g. kvm guest.
351 */
352#define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 1)
353
354/* si_domain contains mulitple devices */
355#define DOMAIN_FLAG_STATIC_IDENTITY	(1 << 2)
356
357/* define the limit of IOMMUs supported in each domain */
358#ifdef	CONFIG_X86
359# define	IOMMU_UNITS_SUPPORTED	MAX_IO_APICS
360#else
361# define	IOMMU_UNITS_SUPPORTED	64
362#endif
363
364struct dmar_domain {
365	int	id;			/* domain id */
366	int	nid;			/* node id */
367	DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
368					/* bitmap of iommus this domain uses*/
369
370	struct list_head devices; 	/* all devices' list */
371	struct iova_domain iovad;	/* iova's that belong to this domain */
372
373	struct dma_pte	*pgd;		/* virtual address */
374	int		gaw;		/* max guest address width */
375
376	/* adjusted guest address width, 0 is level 2 30-bit */
377	int		agaw;
378
379	int		flags;		/* flags to find out type of domain */
380
381	int		iommu_coherency;/* indicate coherency of iommu access */
382	int		iommu_snooping; /* indicate snooping control feature*/
383	int		iommu_count;	/* reference count of iommu */
384	int		iommu_superpage;/* Level of superpages supported:
385					   0 == 4KiB (no superpages), 1 == 2MiB,
386					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
387	spinlock_t	iommu_lock;	/* protect iommu set in domain */
388	u64		max_addr;	/* maximum mapped address */
389};
390
391/* PCI domain-device relationship */
392struct device_domain_info {
393	struct list_head link;	/* link to domain siblings */
394	struct list_head global; /* link to global list */
395	int segment;		/* PCI domain */
396	u8 bus;			/* PCI bus number */
397	u8 devfn;		/* PCI devfn number */
398	struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
399	struct intel_iommu *iommu; /* IOMMU used by this device */
400	struct dmar_domain *domain; /* pointer to domain */
401};
402
403static void flush_unmaps_timeout(unsigned long data);
404
405DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
406
407#define HIGH_WATER_MARK 250
408struct deferred_flush_tables {
409	int next;
410	struct iova *iova[HIGH_WATER_MARK];
411	struct dmar_domain *domain[HIGH_WATER_MARK];
412};
413
414static struct deferred_flush_tables *deferred_flush;
415
416/* bitmap for indexing intel_iommus */
417static int g_num_of_iommus;
418
419static DEFINE_SPINLOCK(async_umap_flush_lock);
420static LIST_HEAD(unmaps_to_do);
421
422static int timer_on;
423static long list_size;
424
425static void domain_remove_dev_info(struct dmar_domain *domain);
426
427#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
428int dmar_disabled = 0;
429#else
430int dmar_disabled = 1;
431#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
432
433int intel_iommu_enabled = 0;
434EXPORT_SYMBOL_GPL(intel_iommu_enabled);
435
436static int dmar_map_gfx = 1;
437static int dmar_forcedac;
438static int intel_iommu_strict;
439static int intel_iommu_superpage = 1;
440
441int intel_iommu_gfx_mapped;
442EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
443
444#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
445static DEFINE_SPINLOCK(device_domain_lock);
446static LIST_HEAD(device_domain_list);
447
448static struct iommu_ops intel_iommu_ops;
449
450static int __init intel_iommu_setup(char *str)
451{
452	if (!str)
453		return -EINVAL;
454	while (*str) {
455		if (!strncmp(str, "on", 2)) {
456			dmar_disabled = 0;
457			printk(KERN_INFO "Intel-IOMMU: enabled\n");
458		} else if (!strncmp(str, "off", 3)) {
459			dmar_disabled = 1;
460			printk(KERN_INFO "Intel-IOMMU: disabled\n");
461		} else if (!strncmp(str, "igfx_off", 8)) {
462			dmar_map_gfx = 0;
463			printk(KERN_INFO
464				"Intel-IOMMU: disable GFX device mapping\n");
465		} else if (!strncmp(str, "forcedac", 8)) {
466			printk(KERN_INFO
467				"Intel-IOMMU: Forcing DAC for PCI devices\n");
468			dmar_forcedac = 1;
469		} else if (!strncmp(str, "strict", 6)) {
470			printk(KERN_INFO
471				"Intel-IOMMU: disable batched IOTLB flush\n");
472			intel_iommu_strict = 1;
473		} else if (!strncmp(str, "sp_off", 6)) {
474			printk(KERN_INFO
475				"Intel-IOMMU: disable supported super page\n");
476			intel_iommu_superpage = 0;
477		}
478
479		str += strcspn(str, ",");
480		while (*str == ',')
481			str++;
482	}
483	return 0;
484}
485__setup("intel_iommu=", intel_iommu_setup);
486
487static struct kmem_cache *iommu_domain_cache;
488static struct kmem_cache *iommu_devinfo_cache;
489static struct kmem_cache *iommu_iova_cache;
490
491static inline void *alloc_pgtable_page(int node)
492{
493	struct page *page;
494	void *vaddr = NULL;
495
496	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
497	if (page)
498		vaddr = page_address(page);
499	return vaddr;
500}
501
502static inline void free_pgtable_page(void *vaddr)
503{
504	free_page((unsigned long)vaddr);
505}
506
507static inline void *alloc_domain_mem(void)
508{
509	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
510}
511
512static void free_domain_mem(void *vaddr)
513{
514	kmem_cache_free(iommu_domain_cache, vaddr);
515}
516
517static inline void * alloc_devinfo_mem(void)
518{
519	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
520}
521
522static inline void free_devinfo_mem(void *vaddr)
523{
524	kmem_cache_free(iommu_devinfo_cache, vaddr);
525}
526
527struct iova *alloc_iova_mem(void)
528{
529	return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
530}
531
532void free_iova_mem(struct iova *iova)
533{
534	kmem_cache_free(iommu_iova_cache, iova);
535}
536
537
538static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
539{
540	unsigned long sagaw;
541	int agaw = -1;
542
543	sagaw = cap_sagaw(iommu->cap);
544	for (agaw = width_to_agaw(max_gaw);
545	     agaw >= 0; agaw--) {
546		if (test_bit(agaw, &sagaw))
547			break;
548	}
549
550	return agaw;
551}
552
553/*
554 * Calculate max SAGAW for each iommu.
555 */
556int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
557{
558	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
559}
560
561/*
562 * calculate agaw for each iommu.
563 * "SAGAW" may be different across iommus, use a default agaw, and
564 * get a supported less agaw for iommus that don't support the default agaw.
565 */
566int iommu_calculate_agaw(struct intel_iommu *iommu)
567{
568	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
569}
570
571/* This functionin only returns single iommu in a domain */
572static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
573{
574	int iommu_id;
575
576	/* si_domain and vm domain should not get here. */
577	BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
578	BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
579
580	iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
581	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
582		return NULL;
583
584	return g_iommus[iommu_id];
585}
586
587static void domain_update_iommu_coherency(struct dmar_domain *domain)
588{
589	int i;
590
591	domain->iommu_coherency = 1;
592
593	for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
594		if (!ecap_coherent(g_iommus[i]->ecap)) {
595			domain->iommu_coherency = 0;
596			break;
597		}
598	}
599}
600
601static void domain_update_iommu_snooping(struct dmar_domain *domain)
602{
603	int i;
604
605	domain->iommu_snooping = 1;
606
607	for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
608		if (!ecap_sc_support(g_iommus[i]->ecap)) {
609			domain->iommu_snooping = 0;
610			break;
611		}
612	}
613}
614
615static void domain_update_iommu_superpage(struct dmar_domain *domain)
616{
617	struct dmar_drhd_unit *drhd;
618	struct intel_iommu *iommu = NULL;
619	int mask = 0xf;
620
621	if (!intel_iommu_superpage) {
622		domain->iommu_superpage = 0;
623		return;
624	}
625
626	/* set iommu_superpage to the smallest common denominator */
627	for_each_active_iommu(iommu, drhd) {
628		mask &= cap_super_page_val(iommu->cap);
629		if (!mask) {
630			break;
631		}
632	}
633	domain->iommu_superpage = fls(mask);
634}
635
636/* Some capabilities may be different across iommus */
637static void domain_update_iommu_cap(struct dmar_domain *domain)
638{
639	domain_update_iommu_coherency(domain);
640	domain_update_iommu_snooping(domain);
641	domain_update_iommu_superpage(domain);
642}
643
644static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
645{
646	struct dmar_drhd_unit *drhd = NULL;
647	int i;
648
649	for_each_drhd_unit(drhd) {
650		if (drhd->ignored)
651			continue;
652		if (segment != drhd->segment)
653			continue;
654
655		for (i = 0; i < drhd->devices_cnt; i++) {
656			if (drhd->devices[i] &&
657			    drhd->devices[i]->bus->number == bus &&
658			    drhd->devices[i]->devfn == devfn)
659				return drhd->iommu;
660			if (drhd->devices[i] &&
661			    drhd->devices[i]->subordinate &&
662			    drhd->devices[i]->subordinate->number <= bus &&
663			    drhd->devices[i]->subordinate->subordinate >= bus)
664				return drhd->iommu;
665		}
666
667		if (drhd->include_all)
668			return drhd->iommu;
669	}
670
671	return NULL;
672}
673
674static void domain_flush_cache(struct dmar_domain *domain,
675			       void *addr, int size)
676{
677	if (!domain->iommu_coherency)
678		clflush_cache_range(addr, size);
679}
680
681/* Gets context entry for a given bus and devfn */
682static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
683		u8 bus, u8 devfn)
684{
685	struct root_entry *root;
686	struct context_entry *context;
687	unsigned long phy_addr;
688	unsigned long flags;
689
690	spin_lock_irqsave(&iommu->lock, flags);
691	root = &iommu->root_entry[bus];
692	context = get_context_addr_from_root(root);
693	if (!context) {
694		context = (struct context_entry *)
695				alloc_pgtable_page(iommu->node);
696		if (!context) {
697			spin_unlock_irqrestore(&iommu->lock, flags);
698			return NULL;
699		}
700		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
701		phy_addr = virt_to_phys((void *)context);
702		set_root_value(root, phy_addr);
703		set_root_present(root);
704		__iommu_flush_cache(iommu, root, sizeof(*root));
705	}
706	spin_unlock_irqrestore(&iommu->lock, flags);
707	return &context[devfn];
708}
709
710static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
711{
712	struct root_entry *root;
713	struct context_entry *context;
714	int ret;
715	unsigned long flags;
716
717	spin_lock_irqsave(&iommu->lock, flags);
718	root = &iommu->root_entry[bus];
719	context = get_context_addr_from_root(root);
720	if (!context) {
721		ret = 0;
722		goto out;
723	}
724	ret = context_present(&context[devfn]);
725out:
726	spin_unlock_irqrestore(&iommu->lock, flags);
727	return ret;
728}
729
730static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
731{
732	struct root_entry *root;
733	struct context_entry *context;
734	unsigned long flags;
735
736	spin_lock_irqsave(&iommu->lock, flags);
737	root = &iommu->root_entry[bus];
738	context = get_context_addr_from_root(root);
739	if (context) {
740		context_clear_entry(&context[devfn]);
741		__iommu_flush_cache(iommu, &context[devfn], \
742			sizeof(*context));
743	}
744	spin_unlock_irqrestore(&iommu->lock, flags);
745}
746
747static void free_context_table(struct intel_iommu *iommu)
748{
749	struct root_entry *root;
750	int i;
751	unsigned long flags;
752	struct context_entry *context;
753
754	spin_lock_irqsave(&iommu->lock, flags);
755	if (!iommu->root_entry) {
756		goto out;
757	}
758	for (i = 0; i < ROOT_ENTRY_NR; i++) {
759		root = &iommu->root_entry[i];
760		context = get_context_addr_from_root(root);
761		if (context)
762			free_pgtable_page(context);
763	}
764	free_pgtable_page(iommu->root_entry);
765	iommu->root_entry = NULL;
766out:
767	spin_unlock_irqrestore(&iommu->lock, flags);
768}
769
770static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
771				      unsigned long pfn, int target_level)
772{
773	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
774	struct dma_pte *parent, *pte = NULL;
775	int level = agaw_to_level(domain->agaw);
776	int offset;
777
778	BUG_ON(!domain->pgd);
779	BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
780	parent = domain->pgd;
781
782	while (level > 0) {
783		void *tmp_page;
784
785		offset = pfn_level_offset(pfn, level);
786		pte = &parent[offset];
787		if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
788			break;
789		if (level == target_level)
790			break;
791
792		if (!dma_pte_present(pte)) {
793			uint64_t pteval;
794
795			tmp_page = alloc_pgtable_page(domain->nid);
796
797			if (!tmp_page)
798				return NULL;
799
800			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
801			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
802			if (cmpxchg64(&pte->val, 0ULL, pteval)) {
803				/* Someone else set it while we were thinking; use theirs. */
804				free_pgtable_page(tmp_page);
805			} else {
806				dma_pte_addr(pte);
807				domain_flush_cache(domain, pte, sizeof(*pte));
808			}
809		}
810		parent = phys_to_virt(dma_pte_addr(pte));
811		level--;
812	}
813
814	return pte;
815}
816
817
818/* return address's pte at specific level */
819static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
820					 unsigned long pfn,
821					 int level, int *large_page)
822{
823	struct dma_pte *parent, *pte = NULL;
824	int total = agaw_to_level(domain->agaw);
825	int offset;
826
827	parent = domain->pgd;
828	while (level <= total) {
829		offset = pfn_level_offset(pfn, total);
830		pte = &parent[offset];
831		if (level == total)
832			return pte;
833
834		if (!dma_pte_present(pte)) {
835			*large_page = total;
836			break;
837		}
838
839		if (pte->val & DMA_PTE_LARGE_PAGE) {
840			*large_page = total;
841			return pte;
842		}
843
844		parent = phys_to_virt(dma_pte_addr(pte));
845		total--;
846	}
847	return NULL;
848}
849
850/* clear last level pte, a tlb flush should be followed */
851static int dma_pte_clear_range(struct dmar_domain *domain,
852				unsigned long start_pfn,
853				unsigned long last_pfn)
854{
855	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
856	unsigned int large_page = 1;
857	struct dma_pte *first_pte, *pte;
858	int order;
859
860	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
861	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
862	BUG_ON(start_pfn > last_pfn);
863
864	/* we don't need lock here; nobody else touches the iova range */
865	do {
866		large_page = 1;
867		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
868		if (!pte) {
869			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
870			continue;
871		}
872		do {
873			dma_clear_pte(pte);
874			start_pfn += lvl_to_nr_pages(large_page);
875			pte++;
876		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
877
878		domain_flush_cache(domain, first_pte,
879				   (void *)pte - (void *)first_pte);
880
881	} while (start_pfn && start_pfn <= last_pfn);
882
883	order = (large_page - 1) * 9;
884	return order;
885}
886
887/* free page table pages. last level pte should already be cleared */
888static void dma_pte_free_pagetable(struct dmar_domain *domain,
889				   unsigned long start_pfn,
890				   unsigned long last_pfn)
891{
892	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
893	struct dma_pte *first_pte, *pte;
894	int total = agaw_to_level(domain->agaw);
895	int level;
896	unsigned long tmp;
897	int large_page = 2;
898
899	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
900	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
901	BUG_ON(start_pfn > last_pfn);
902
903	/* We don't need lock here; nobody else touches the iova range */
904	level = 2;
905	while (level <= total) {
906		tmp = align_to_level(start_pfn, level);
907
908		/* If we can't even clear one PTE at this level, we're done */
909		if (tmp + level_size(level) - 1 > last_pfn)
910			return;
911
912		do {
913			large_page = level;
914			first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
915			if (large_page > level)
916				level = large_page + 1;
917			if (!pte) {
918				tmp = align_to_level(tmp + 1, level + 1);
919				continue;
920			}
921			do {
922				if (dma_pte_present(pte)) {
923					free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
924					dma_clear_pte(pte);
925				}
926				pte++;
927				tmp += level_size(level);
928			} while (!first_pte_in_page(pte) &&
929				 tmp + level_size(level) - 1 <= last_pfn);
930
931			domain_flush_cache(domain, first_pte,
932					   (void *)pte - (void *)first_pte);
933
934		} while (tmp && tmp + level_size(level) - 1 <= last_pfn);
935		level++;
936	}
937	/* free pgd */
938	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
939		free_pgtable_page(domain->pgd);
940		domain->pgd = NULL;
941	}
942}
943
944/* iommu handling */
945static int iommu_alloc_root_entry(struct intel_iommu *iommu)
946{
947	struct root_entry *root;
948	unsigned long flags;
949
950	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
951	if (!root)
952		return -ENOMEM;
953
954	__iommu_flush_cache(iommu, root, ROOT_SIZE);
955
956	spin_lock_irqsave(&iommu->lock, flags);
957	iommu->root_entry = root;
958	spin_unlock_irqrestore(&iommu->lock, flags);
959
960	return 0;
961}
962
963static void iommu_set_root_entry(struct intel_iommu *iommu)
964{
965	void *addr;
966	u32 sts;
967	unsigned long flag;
968
969	addr = iommu->root_entry;
970
971	raw_spin_lock_irqsave(&iommu->register_lock, flag);
972	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
973
974	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
975
976	/* Make sure hardware complete it */
977	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
978		      readl, (sts & DMA_GSTS_RTPS), sts);
979
980	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
981}
982
983static void iommu_flush_write_buffer(struct intel_iommu *iommu)
984{
985	u32 val;
986	unsigned long flag;
987
988	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
989		return;
990
991	raw_spin_lock_irqsave(&iommu->register_lock, flag);
992	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
993
994	/* Make sure hardware complete it */
995	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
996		      readl, (!(val & DMA_GSTS_WBFS)), val);
997
998	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
999}
1000
1001/* return value determine if we need a write buffer flush */
1002static void __iommu_flush_context(struct intel_iommu *iommu,
1003				  u16 did, u16 source_id, u8 function_mask,
1004				  u64 type)
1005{
1006	u64 val = 0;
1007	unsigned long flag;
1008
1009	switch (type) {
1010	case DMA_CCMD_GLOBAL_INVL:
1011		val = DMA_CCMD_GLOBAL_INVL;
1012		break;
1013	case DMA_CCMD_DOMAIN_INVL:
1014		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1015		break;
1016	case DMA_CCMD_DEVICE_INVL:
1017		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1018			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1019		break;
1020	default:
1021		BUG();
1022	}
1023	val |= DMA_CCMD_ICC;
1024
1025	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1026	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1027
1028	/* Make sure hardware complete it */
1029	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1030		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1031
1032	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1033}
1034
1035/* return value determine if we need a write buffer flush */
1036static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1037				u64 addr, unsigned int size_order, u64 type)
1038{
1039	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1040	u64 val = 0, val_iva = 0;
1041	unsigned long flag;
1042
1043	switch (type) {
1044	case DMA_TLB_GLOBAL_FLUSH:
1045		/* global flush doesn't need set IVA_REG */
1046		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1047		break;
1048	case DMA_TLB_DSI_FLUSH:
1049		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1050		break;
1051	case DMA_TLB_PSI_FLUSH:
1052		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1053		/* Note: always flush non-leaf currently */
1054		val_iva = size_order | addr;
1055		break;
1056	default:
1057		BUG();
1058	}
1059	/* Note: set drain read/write */
1060#if 0
1061	/*
1062	 * This is probably to be super secure.. Looks like we can
1063	 * ignore it without any impact.
1064	 */
1065	if (cap_read_drain(iommu->cap))
1066		val |= DMA_TLB_READ_DRAIN;
1067#endif
1068	if (cap_write_drain(iommu->cap))
1069		val |= DMA_TLB_WRITE_DRAIN;
1070
1071	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1072	/* Note: Only uses first TLB reg currently */
1073	if (val_iva)
1074		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1075	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1076
1077	/* Make sure hardware complete it */
1078	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1079		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1080
1081	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1082
1083	/* check IOTLB invalidation granularity */
1084	if (DMA_TLB_IAIG(val) == 0)
1085		printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1086	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1087		pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1088			(unsigned long long)DMA_TLB_IIRG(type),
1089			(unsigned long long)DMA_TLB_IAIG(val));
1090}
1091
1092static struct device_domain_info *iommu_support_dev_iotlb(
1093	struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1094{
1095	int found = 0;
1096	unsigned long flags;
1097	struct device_domain_info *info;
1098	struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1099
1100	if (!ecap_dev_iotlb_support(iommu->ecap))
1101		return NULL;
1102
1103	if (!iommu->qi)
1104		return NULL;
1105
1106	spin_lock_irqsave(&device_domain_lock, flags);
1107	list_for_each_entry(info, &domain->devices, link)
1108		if (info->bus == bus && info->devfn == devfn) {
1109			found = 1;
1110			break;
1111		}
1112	spin_unlock_irqrestore(&device_domain_lock, flags);
1113
1114	if (!found || !info->dev)
1115		return NULL;
1116
1117	if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1118		return NULL;
1119
1120	if (!dmar_find_matched_atsr_unit(info->dev))
1121		return NULL;
1122
1123	info->iommu = iommu;
1124
1125	return info;
1126}
1127
1128static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1129{
1130	if (!info)
1131		return;
1132
1133	pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1134}
1135
1136static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1137{
1138	if (!info->dev || !pci_ats_enabled(info->dev))
1139		return;
1140
1141	pci_disable_ats(info->dev);
1142}
1143
1144static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1145				  u64 addr, unsigned mask)
1146{
1147	u16 sid, qdep;
1148	unsigned long flags;
1149	struct device_domain_info *info;
1150
1151	spin_lock_irqsave(&device_domain_lock, flags);
1152	list_for_each_entry(info, &domain->devices, link) {
1153		if (!info->dev || !pci_ats_enabled(info->dev))
1154			continue;
1155
1156		sid = info->bus << 8 | info->devfn;
1157		qdep = pci_ats_queue_depth(info->dev);
1158		qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1159	}
1160	spin_unlock_irqrestore(&device_domain_lock, flags);
1161}
1162
1163static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1164				  unsigned long pfn, unsigned int pages, int map)
1165{
1166	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1167	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1168
1169	BUG_ON(pages == 0);
1170
1171	/*
1172	 * Fallback to domain selective flush if no PSI support or the size is
1173	 * too big.
1174	 * PSI requires page size to be 2 ^ x, and the base address is naturally
1175	 * aligned to the size
1176	 */
1177	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1178		iommu->flush.flush_iotlb(iommu, did, 0, 0,
1179						DMA_TLB_DSI_FLUSH);
1180	else
1181		iommu->flush.flush_iotlb(iommu, did, addr, mask,
1182						DMA_TLB_PSI_FLUSH);
1183
1184	/*
1185	 * In caching mode, changes of pages from non-present to present require
1186	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1187	 */
1188	if (!cap_caching_mode(iommu->cap) || !map)
1189		iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1190}
1191
1192static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1193{
1194	u32 pmen;
1195	unsigned long flags;
1196
1197	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1198	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1199	pmen &= ~DMA_PMEN_EPM;
1200	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1201
1202	/* wait for the protected region status bit to clear */
1203	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1204		readl, !(pmen & DMA_PMEN_PRS), pmen);
1205
1206	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1207}
1208
1209static int iommu_enable_translation(struct intel_iommu *iommu)
1210{
1211	u32 sts;
1212	unsigned long flags;
1213
1214	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1215	iommu->gcmd |= DMA_GCMD_TE;
1216	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1217
1218	/* Make sure hardware complete it */
1219	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1220		      readl, (sts & DMA_GSTS_TES), sts);
1221
1222	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1223	return 0;
1224}
1225
1226static int iommu_disable_translation(struct intel_iommu *iommu)
1227{
1228	u32 sts;
1229	unsigned long flag;
1230
1231	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1232	iommu->gcmd &= ~DMA_GCMD_TE;
1233	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1234
1235	/* Make sure hardware complete it */
1236	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1237		      readl, (!(sts & DMA_GSTS_TES)), sts);
1238
1239	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1240	return 0;
1241}
1242
1243
1244static int iommu_init_domains(struct intel_iommu *iommu)
1245{
1246	unsigned long ndomains;
1247	unsigned long nlongs;
1248
1249	ndomains = cap_ndoms(iommu->cap);
1250	pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1251			ndomains);
1252	nlongs = BITS_TO_LONGS(ndomains);
1253
1254	spin_lock_init(&iommu->lock);
1255
1256	/* TBD: there might be 64K domains,
1257	 * consider other allocation for future chip
1258	 */
1259	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1260	if (!iommu->domain_ids) {
1261		printk(KERN_ERR "Allocating domain id array failed\n");
1262		return -ENOMEM;
1263	}
1264	iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1265			GFP_KERNEL);
1266	if (!iommu->domains) {
1267		printk(KERN_ERR "Allocating domain array failed\n");
1268		return -ENOMEM;
1269	}
1270
1271	/*
1272	 * if Caching mode is set, then invalid translations are tagged
1273	 * with domainid 0. Hence we need to pre-allocate it.
1274	 */
1275	if (cap_caching_mode(iommu->cap))
1276		set_bit(0, iommu->domain_ids);
1277	return 0;
1278}
1279
1280
1281static void domain_exit(struct dmar_domain *domain);
1282static void vm_domain_exit(struct dmar_domain *domain);
1283
1284void free_dmar_iommu(struct intel_iommu *iommu)
1285{
1286	struct dmar_domain *domain;
1287	int i;
1288	unsigned long flags;
1289
1290	if ((iommu->domains) && (iommu->domain_ids)) {
1291		for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1292			domain = iommu->domains[i];
1293			clear_bit(i, iommu->domain_ids);
1294
1295			spin_lock_irqsave(&domain->iommu_lock, flags);
1296			if (--domain->iommu_count == 0) {
1297				if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1298					vm_domain_exit(domain);
1299				else
1300					domain_exit(domain);
1301			}
1302			spin_unlock_irqrestore(&domain->iommu_lock, flags);
1303		}
1304	}
1305
1306	if (iommu->gcmd & DMA_GCMD_TE)
1307		iommu_disable_translation(iommu);
1308
1309	if (iommu->irq) {
1310		irq_set_handler_data(iommu->irq, NULL);
1311		/* This will mask the irq */
1312		free_irq(iommu->irq, iommu);
1313		destroy_irq(iommu->irq);
1314	}
1315
1316	kfree(iommu->domains);
1317	kfree(iommu->domain_ids);
1318
1319	g_iommus[iommu->seq_id] = NULL;
1320
1321	/* if all iommus are freed, free g_iommus */
1322	for (i = 0; i < g_num_of_iommus; i++) {
1323		if (g_iommus[i])
1324			break;
1325	}
1326
1327	if (i == g_num_of_iommus)
1328		kfree(g_iommus);
1329
1330	/* free context mapping */
1331	free_context_table(iommu);
1332}
1333
1334static struct dmar_domain *alloc_domain(void)
1335{
1336	struct dmar_domain *domain;
1337
1338	domain = alloc_domain_mem();
1339	if (!domain)
1340		return NULL;
1341
1342	domain->nid = -1;
1343	memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1344	domain->flags = 0;
1345
1346	return domain;
1347}
1348
1349static int iommu_attach_domain(struct dmar_domain *domain,
1350			       struct intel_iommu *iommu)
1351{
1352	int num;
1353	unsigned long ndomains;
1354	unsigned long flags;
1355
1356	ndomains = cap_ndoms(iommu->cap);
1357
1358	spin_lock_irqsave(&iommu->lock, flags);
1359
1360	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1361	if (num >= ndomains) {
1362		spin_unlock_irqrestore(&iommu->lock, flags);
1363		printk(KERN_ERR "IOMMU: no free domain ids\n");
1364		return -ENOMEM;
1365	}
1366
1367	domain->id = num;
1368	set_bit(num, iommu->domain_ids);
1369	set_bit(iommu->seq_id, domain->iommu_bmp);
1370	iommu->domains[num] = domain;
1371	spin_unlock_irqrestore(&iommu->lock, flags);
1372
1373	return 0;
1374}
1375
1376static void iommu_detach_domain(struct dmar_domain *domain,
1377				struct intel_iommu *iommu)
1378{
1379	unsigned long flags;
1380	int num, ndomains;
1381	int found = 0;
1382
1383	spin_lock_irqsave(&iommu->lock, flags);
1384	ndomains = cap_ndoms(iommu->cap);
1385	for_each_set_bit(num, iommu->domain_ids, ndomains) {
1386		if (iommu->domains[num] == domain) {
1387			found = 1;
1388			break;
1389		}
1390	}
1391
1392	if (found) {
1393		clear_bit(num, iommu->domain_ids);
1394		clear_bit(iommu->seq_id, domain->iommu_bmp);
1395		iommu->domains[num] = NULL;
1396	}
1397	spin_unlock_irqrestore(&iommu->lock, flags);
1398}
1399
1400static struct iova_domain reserved_iova_list;
1401static struct lock_class_key reserved_rbtree_key;
1402
1403static int dmar_init_reserved_ranges(void)
1404{
1405	struct pci_dev *pdev = NULL;
1406	struct iova *iova;
1407	int i;
1408
1409	init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1410
1411	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1412		&reserved_rbtree_key);
1413
1414	/* IOAPIC ranges shouldn't be accessed by DMA */
1415	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1416		IOVA_PFN(IOAPIC_RANGE_END));
1417	if (!iova) {
1418		printk(KERN_ERR "Reserve IOAPIC range failed\n");
1419		return -ENODEV;
1420	}
1421
1422	/* Reserve all PCI MMIO to avoid peer-to-peer access */
1423	for_each_pci_dev(pdev) {
1424		struct resource *r;
1425
1426		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1427			r = &pdev->resource[i];
1428			if (!r->flags || !(r->flags & IORESOURCE_MEM))
1429				continue;
1430			iova = reserve_iova(&reserved_iova_list,
1431					    IOVA_PFN(r->start),
1432					    IOVA_PFN(r->end));
1433			if (!iova) {
1434				printk(KERN_ERR "Reserve iova failed\n");
1435				return -ENODEV;
1436			}
1437		}
1438	}
1439	return 0;
1440}
1441
1442static void domain_reserve_special_ranges(struct dmar_domain *domain)
1443{
1444	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1445}
1446
1447static inline int guestwidth_to_adjustwidth(int gaw)
1448{
1449	int agaw;
1450	int r = (gaw - 12) % 9;
1451
1452	if (r == 0)
1453		agaw = gaw;
1454	else
1455		agaw = gaw + 9 - r;
1456	if (agaw > 64)
1457		agaw = 64;
1458	return agaw;
1459}
1460
1461static int domain_init(struct dmar_domain *domain, int guest_width)
1462{
1463	struct intel_iommu *iommu;
1464	int adjust_width, agaw;
1465	unsigned long sagaw;
1466
1467	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1468	spin_lock_init(&domain->iommu_lock);
1469
1470	domain_reserve_special_ranges(domain);
1471
1472	/* calculate AGAW */
1473	iommu = domain_get_iommu(domain);
1474	if (guest_width > cap_mgaw(iommu->cap))
1475		guest_width = cap_mgaw(iommu->cap);
1476	domain->gaw = guest_width;
1477	adjust_width = guestwidth_to_adjustwidth(guest_width);
1478	agaw = width_to_agaw(adjust_width);
1479	sagaw = cap_sagaw(iommu->cap);
1480	if (!test_bit(agaw, &sagaw)) {
1481		/* hardware doesn't support it, choose a bigger one */
1482		pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1483		agaw = find_next_bit(&sagaw, 5, agaw);
1484		if (agaw >= 5)
1485			return -ENODEV;
1486	}
1487	domain->agaw = agaw;
1488	INIT_LIST_HEAD(&domain->devices);
1489
1490	if (ecap_coherent(iommu->ecap))
1491		domain->iommu_coherency = 1;
1492	else
1493		domain->iommu_coherency = 0;
1494
1495	if (ecap_sc_support(iommu->ecap))
1496		domain->iommu_snooping = 1;
1497	else
1498		domain->iommu_snooping = 0;
1499
1500	domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1501	domain->iommu_count = 1;
1502	domain->nid = iommu->node;
1503
1504	/* always allocate the top pgd */
1505	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1506	if (!domain->pgd)
1507		return -ENOMEM;
1508	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1509	return 0;
1510}
1511
1512static void domain_exit(struct dmar_domain *domain)
1513{
1514	struct dmar_drhd_unit *drhd;
1515	struct intel_iommu *iommu;
1516
1517	/* Domain 0 is reserved, so dont process it */
1518	if (!domain)
1519		return;
1520
1521	/* Flush any lazy unmaps that may reference this domain */
1522	if (!intel_iommu_strict)
1523		flush_unmaps_timeout(0);
1524
1525	domain_remove_dev_info(domain);
1526	/* destroy iovas */
1527	put_iova_domain(&domain->iovad);
1528
1529	/* clear ptes */
1530	dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1531
1532	/* free page tables */
1533	dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1534
1535	for_each_active_iommu(iommu, drhd)
1536		if (test_bit(iommu->seq_id, domain->iommu_bmp))
1537			iommu_detach_domain(domain, iommu);
1538
1539	free_domain_mem(domain);
1540}
1541
1542static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1543				 u8 bus, u8 devfn, int translation)
1544{
1545	struct context_entry *context;
1546	unsigned long flags;
1547	struct intel_iommu *iommu;
1548	struct dma_pte *pgd;
1549	unsigned long num;
1550	unsigned long ndomains;
1551	int id;
1552	int agaw;
1553	struct device_domain_info *info = NULL;
1554
1555	pr_debug("Set context mapping for %02x:%02x.%d\n",
1556		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1557
1558	BUG_ON(!domain->pgd);
1559	BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1560	       translation != CONTEXT_TT_MULTI_LEVEL);
1561
1562	iommu = device_to_iommu(segment, bus, devfn);
1563	if (!iommu)
1564		return -ENODEV;
1565
1566	context = device_to_context_entry(iommu, bus, devfn);
1567	if (!context)
1568		return -ENOMEM;
1569	spin_lock_irqsave(&iommu->lock, flags);
1570	if (context_present(context)) {
1571		spin_unlock_irqrestore(&iommu->lock, flags);
1572		return 0;
1573	}
1574
1575	id = domain->id;
1576	pgd = domain->pgd;
1577
1578	if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1579	    domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1580		int found = 0;
1581
1582		/* find an available domain id for this device in iommu */
1583		ndomains = cap_ndoms(iommu->cap);
1584		for_each_set_bit(num, iommu->domain_ids, ndomains) {
1585			if (iommu->domains[num] == domain) {
1586				id = num;
1587				found = 1;
1588				break;
1589			}
1590		}
1591
1592		if (found == 0) {
1593			num = find_first_zero_bit(iommu->domain_ids, ndomains);
1594			if (num >= ndomains) {
1595				spin_unlock_irqrestore(&iommu->lock, flags);
1596				printk(KERN_ERR "IOMMU: no free domain ids\n");
1597				return -EFAULT;
1598			}
1599
1600			set_bit(num, iommu->domain_ids);
1601			iommu->domains[num] = domain;
1602			id = num;
1603		}
1604
1605		/* Skip top levels of page tables for
1606		 * iommu which has less agaw than default.
1607		 * Unnecessary for PT mode.
1608		 */
1609		if (translation != CONTEXT_TT_PASS_THROUGH) {
1610			for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1611				pgd = phys_to_virt(dma_pte_addr(pgd));
1612				if (!dma_pte_present(pgd)) {
1613					spin_unlock_irqrestore(&iommu->lock, flags);
1614					return -ENOMEM;
1615				}
1616			}
1617		}
1618	}
1619
1620	context_set_domain_id(context, id);
1621
1622	if (translation != CONTEXT_TT_PASS_THROUGH) {
1623		info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1624		translation = info ? CONTEXT_TT_DEV_IOTLB :
1625				     CONTEXT_TT_MULTI_LEVEL;
1626	}
1627	/*
1628	 * In pass through mode, AW must be programmed to indicate the largest
1629	 * AGAW value supported by hardware. And ASR is ignored by hardware.
1630	 */
1631	if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1632		context_set_address_width(context, iommu->msagaw);
1633	else {
1634		context_set_address_root(context, virt_to_phys(pgd));
1635		context_set_address_width(context, iommu->agaw);
1636	}
1637
1638	context_set_translation_type(context, translation);
1639	context_set_fault_enable(context);
1640	context_set_present(context);
1641	domain_flush_cache(domain, context, sizeof(*context));
1642
1643	/*
1644	 * It's a non-present to present mapping. If hardware doesn't cache
1645	 * non-present entry we only need to flush the write-buffer. If the
1646	 * _does_ cache non-present entries, then it does so in the special
1647	 * domain #0, which we have to flush:
1648	 */
1649	if (cap_caching_mode(iommu->cap)) {
1650		iommu->flush.flush_context(iommu, 0,
1651					   (((u16)bus) << 8) | devfn,
1652					   DMA_CCMD_MASK_NOBIT,
1653					   DMA_CCMD_DEVICE_INVL);
1654		iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1655	} else {
1656		iommu_flush_write_buffer(iommu);
1657	}
1658	iommu_enable_dev_iotlb(info);
1659	spin_unlock_irqrestore(&iommu->lock, flags);
1660
1661	spin_lock_irqsave(&domain->iommu_lock, flags);
1662	if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1663		domain->iommu_count++;
1664		if (domain->iommu_count == 1)
1665			domain->nid = iommu->node;
1666		domain_update_iommu_cap(domain);
1667	}
1668	spin_unlock_irqrestore(&domain->iommu_lock, flags);
1669	return 0;
1670}
1671
1672static int
1673domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1674			int translation)
1675{
1676	int ret;
1677	struct pci_dev *tmp, *parent;
1678
1679	ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1680					 pdev->bus->number, pdev->devfn,
1681					 translation);
1682	if (ret)
1683		return ret;
1684
1685	/* dependent device mapping */
1686	tmp = pci_find_upstream_pcie_bridge(pdev);
1687	if (!tmp)
1688		return 0;
1689	/* Secondary interface's bus number and devfn 0 */
1690	parent = pdev->bus->self;
1691	while (parent != tmp) {
1692		ret = domain_context_mapping_one(domain,
1693						 pci_domain_nr(parent->bus),
1694						 parent->bus->number,
1695						 parent->devfn, translation);
1696		if (ret)
1697			return ret;
1698		parent = parent->bus->self;
1699	}
1700	if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1701		return domain_context_mapping_one(domain,
1702					pci_domain_nr(tmp->subordinate),
1703					tmp->subordinate->number, 0,
1704					translation);
1705	else /* this is a legacy PCI bridge */
1706		return domain_context_mapping_one(domain,
1707						  pci_domain_nr(tmp->bus),
1708						  tmp->bus->number,
1709						  tmp->devfn,
1710						  translation);
1711}
1712
1713static int domain_context_mapped(struct pci_dev *pdev)
1714{
1715	int ret;
1716	struct pci_dev *tmp, *parent;
1717	struct intel_iommu *iommu;
1718
1719	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1720				pdev->devfn);
1721	if (!iommu)
1722		return -ENODEV;
1723
1724	ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1725	if (!ret)
1726		return ret;
1727	/* dependent device mapping */
1728	tmp = pci_find_upstream_pcie_bridge(pdev);
1729	if (!tmp)
1730		return ret;
1731	/* Secondary interface's bus number and devfn 0 */
1732	parent = pdev->bus->self;
1733	while (parent != tmp) {
1734		ret = device_context_mapped(iommu, parent->bus->number,
1735					    parent->devfn);
1736		if (!ret)
1737			return ret;
1738		parent = parent->bus->self;
1739	}
1740	if (pci_is_pcie(tmp))
1741		return device_context_mapped(iommu, tmp->subordinate->number,
1742					     0);
1743	else
1744		return device_context_mapped(iommu, tmp->bus->number,
1745					     tmp->devfn);
1746}
1747
1748/* Returns a number of VTD pages, but aligned to MM page size */
1749static inline unsigned long aligned_nrpages(unsigned long host_addr,
1750					    size_t size)
1751{
1752	host_addr &= ~PAGE_MASK;
1753	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1754}
1755
1756/* Return largest possible superpage level for a given mapping */
1757static inline int hardware_largepage_caps(struct dmar_domain *domain,
1758					  unsigned long iov_pfn,
1759					  unsigned long phy_pfn,
1760					  unsigned long pages)
1761{
1762	int support, level = 1;
1763	unsigned long pfnmerge;
1764
1765	support = domain->iommu_superpage;
1766
1767	/* To use a large page, the virtual *and* physical addresses
1768	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1769	   of them will mean we have to use smaller pages. So just
1770	   merge them and check both at once. */
1771	pfnmerge = iov_pfn | phy_pfn;
1772
1773	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1774		pages >>= VTD_STRIDE_SHIFT;
1775		if (!pages)
1776			break;
1777		pfnmerge >>= VTD_STRIDE_SHIFT;
1778		level++;
1779		support--;
1780	}
1781	return level;
1782}
1783
1784static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1785			    struct scatterlist *sg, unsigned long phys_pfn,
1786			    unsigned long nr_pages, int prot)
1787{
1788	struct dma_pte *first_pte = NULL, *pte = NULL;
1789	phys_addr_t uninitialized_var(pteval);
1790	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1791	unsigned long sg_res;
1792	unsigned int largepage_lvl = 0;
1793	unsigned long lvl_pages = 0;
1794
1795	BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1796
1797	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1798		return -EINVAL;
1799
1800	prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1801
1802	if (sg)
1803		sg_res = 0;
1804	else {
1805		sg_res = nr_pages + 1;
1806		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1807	}
1808
1809	while (nr_pages > 0) {
1810		uint64_t tmp;
1811
1812		if (!sg_res) {
1813			sg_res = aligned_nrpages(sg->offset, sg->length);
1814			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1815			sg->dma_length = sg->length;
1816			pteval = page_to_phys(sg_page(sg)) | prot;
1817			phys_pfn = pteval >> VTD_PAGE_SHIFT;
1818		}
1819
1820		if (!pte) {
1821			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1822
1823			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1824			if (!pte)
1825				return -ENOMEM;
1826			/* It is large page*/
1827			if (largepage_lvl > 1)
1828				pteval |= DMA_PTE_LARGE_PAGE;
1829			else
1830				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1831
1832		}
1833		/* We don't need lock here, nobody else
1834		 * touches the iova range
1835		 */
1836		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1837		if (tmp) {
1838			static int dumps = 5;
1839			printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1840			       iov_pfn, tmp, (unsigned long long)pteval);
1841			if (dumps) {
1842				dumps--;
1843				debug_dma_dump_mappings(NULL);
1844			}
1845			WARN_ON(1);
1846		}
1847
1848		lvl_pages = lvl_to_nr_pages(largepage_lvl);
1849
1850		BUG_ON(nr_pages < lvl_pages);
1851		BUG_ON(sg_res < lvl_pages);
1852
1853		nr_pages -= lvl_pages;
1854		iov_pfn += lvl_pages;
1855		phys_pfn += lvl_pages;
1856		pteval += lvl_pages * VTD_PAGE_SIZE;
1857		sg_res -= lvl_pages;
1858
1859		/* If the next PTE would be the first in a new page, then we
1860		   need to flush the cache on the entries we've just written.
1861		   And then we'll need to recalculate 'pte', so clear it and
1862		   let it get set again in the if (!pte) block above.
1863
1864		   If we're done (!nr_pages) we need to flush the cache too.
1865
1866		   Also if we've been setting superpages, we may need to
1867		   recalculate 'pte' and switch back to smaller pages for the
1868		   end of the mapping, if the trailing size is not enough to
1869		   use another superpage (i.e. sg_res < lvl_pages). */
1870		pte++;
1871		if (!nr_pages || first_pte_in_page(pte) ||
1872		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
1873			domain_flush_cache(domain, first_pte,
1874					   (void *)pte - (void *)first_pte);
1875			pte = NULL;
1876		}
1877
1878		if (!sg_res && nr_pages)
1879			sg = sg_next(sg);
1880	}
1881	return 0;
1882}
1883
1884static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1885				    struct scatterlist *sg, unsigned long nr_pages,
1886				    int prot)
1887{
1888	return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1889}
1890
1891static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1892				     unsigned long phys_pfn, unsigned long nr_pages,
1893				     int prot)
1894{
1895	return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1896}
1897
1898static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1899{
1900	if (!iommu)
1901		return;
1902
1903	clear_context_table(iommu, bus, devfn);
1904	iommu->flush.flush_context(iommu, 0, 0, 0,
1905					   DMA_CCMD_GLOBAL_INVL);
1906	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1907}
1908
1909static void domain_remove_dev_info(struct dmar_domain *domain)
1910{
1911	struct device_domain_info *info;
1912	unsigned long flags;
1913	struct intel_iommu *iommu;
1914
1915	spin_lock_irqsave(&device_domain_lock, flags);
1916	while (!list_empty(&domain->devices)) {
1917		info = list_entry(domain->devices.next,
1918			struct device_domain_info, link);
1919		list_del(&info->link);
1920		list_del(&info->global);
1921		if (info->dev)
1922			info->dev->dev.archdata.iommu = NULL;
1923		spin_unlock_irqrestore(&device_domain_lock, flags);
1924
1925		iommu_disable_dev_iotlb(info);
1926		iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1927		iommu_detach_dev(iommu, info->bus, info->devfn);
1928		free_devinfo_mem(info);
1929
1930		spin_lock_irqsave(&device_domain_lock, flags);
1931	}
1932	spin_unlock_irqrestore(&device_domain_lock, flags);
1933}
1934
1935/*
1936 * find_domain
1937 * Note: we use struct pci_dev->dev.archdata.iommu stores the info
1938 */
1939static struct dmar_domain *
1940find_domain(struct pci_dev *pdev)
1941{
1942	struct device_domain_info *info;
1943
1944	/* No lock here, assumes no domain exit in normal case */
1945	info = pdev->dev.archdata.iommu;
1946	if (info)
1947		return info->domain;
1948	return NULL;
1949}
1950
1951/* Return an initialized domain for the device, allocating one if needed */
1952static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1953{
1954	struct dmar_domain *domain, *found = NULL;
1955	struct intel_iommu *iommu;
1956	struct dmar_drhd_unit *drhd;
1957	struct device_domain_info *info, *tmp;
1958	struct pci_dev *dev_tmp;
1959	unsigned long flags;
1960	int bus = 0, devfn = 0;
1961	int segment;
1962	int ret;
1963
1964	domain = find_domain(pdev);
1965	if (domain)
1966		return domain;
1967
1968	segment = pci_domain_nr(pdev->bus);
1969
1970	dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1971	if (dev_tmp) {
1972		if (pci_is_pcie(dev_tmp)) {
1973			bus = dev_tmp->subordinate->number;
1974			devfn = 0;
1975		} else {
1976			bus = dev_tmp->bus->number;
1977			devfn = dev_tmp->devfn;
1978		}
1979		spin_lock_irqsave(&device_domain_lock, flags);
1980		list_for_each_entry(info, &device_domain_list, global) {
1981			if (info->segment == segment &&
1982			    info->bus == bus && info->devfn == devfn) {
1983				found = info->domain;
1984				break;
1985			}
1986		}
1987		spin_unlock_irqrestore(&device_domain_lock, flags);
1988		/* pcie-pci bridge already has a domain, use it */
1989		if (found) {
1990			domain = found;
1991			goto found_domain;
1992		}
1993	}
1994
1995	domain = alloc_domain();
1996	if (!domain)
1997		goto error;
1998
1999	/* Allocate new domain for the device */
2000	drhd = dmar_find_matched_drhd_unit(pdev);
2001	if (!drhd) {
2002		printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2003			pci_name(pdev));
		free_domain_mem(domain);
2004		return NULL;
2005	}
2006	iommu = drhd->iommu;
2007
2008	ret = iommu_attach_domain(domain, iommu);
2009	if (ret) {
2010		free_domain_mem(domain);
2011		goto error;
2012	}
2013
2014	if (domain_init(domain, gaw)) {
2015		domain_exit(domain);
2016		goto error;
2017	}
2018
2019	/* register pcie-to-pci device */
2020	if (dev_tmp) {
2021		info = alloc_devinfo_mem();
2022		if (!info) {
2023			domain_exit(domain);
2024			goto error;
2025		}
2026		info->segment = segment;
2027		info->bus = bus;
2028		info->devfn = devfn;
2029		info->dev = NULL;
2030		info->domain = domain;
2031		/* This domain is shared by devices under p2p bridge */
2032		domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2033
2034		/* pcie-to-pci bridge already has a domain, use it */
2035		found = NULL;
2036		spin_lock_irqsave(&device_domain_lock, flags);
2037		list_for_each_entry(tmp, &device_domain_list, global) {
2038			if (tmp->segment == segment &&
2039			    tmp->bus == bus && tmp->devfn == devfn) {
2040				found = tmp->domain;
2041				break;
2042			}
2043		}
2044		if (found) {
2045			spin_unlock_irqrestore(&device_domain_lock, flags);
2046			free_devinfo_mem(info);
2047			domain_exit(domain);
2048			domain = found;
2049		} else {
2050			list_add(&info->link, &domain->devices);
2051			list_add(&info->global, &device_domain_list);
2052			spin_unlock_irqrestore(&device_domain_lock, flags);
2053		}
2054	}
2055
2056found_domain:
2057	info = alloc_devinfo_mem();
2058	if (!info)
2059		goto error;
2060	info->segment = segment;
2061	info->bus = pdev->bus->number;
2062	info->devfn = pdev->devfn;
2063	info->dev = pdev;
2064	info->domain = domain;
2065	spin_lock_irqsave(&device_domain_lock, flags);
2066	/* somebody else raced with us and already set up the domain */
2067	found = find_domain(pdev);
2068	if (found != NULL) {
2069		spin_unlock_irqrestore(&device_domain_lock, flags);
2070		if (found != domain) {
2071			domain_exit(domain);
2072			domain = found;
2073		}
2074		free_devinfo_mem(info);
2075		return domain;
2076	}
2077	list_add(&info->link, &domain->devices);
2078	list_add(&info->global, &device_domain_list);
2079	pdev->dev.archdata.iommu = info;
2080	spin_unlock_irqrestore(&device_domain_lock, flags);
2081	return domain;
2082error:
2083	/* recheck it here, maybe others set it */
2084	return find_domain(pdev);
2085}
2086
2087static int iommu_identity_mapping;
2088#define IDENTMAP_ALL		1
2089#define IDENTMAP_GFX		2
2090#define IDENTMAP_AZALIA		4
2091
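/*
 * Reserve the IOVA range and install a 1:1 mapping for the physical
 * address range [start, end] in the given domain.
 */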
2092static int iommu_domain_identity_map(struct dmar_domain *domain,
2093				     unsigned long long start,
2094				     unsigned long long end)
2095{
2096	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2097	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2098
2099	if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2100			  dma_to_mm_pfn(last_vpfn))) {
2101		printk(KERN_ERR "IOMMU: reserve iova failed\n");
2102		return -ENOMEM;
2103	}
2104
2105	pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2106		 start, end, domain->id);
2107	/*
2108	 * The RMRR range might overlap a physical memory range that is
2109	 * already mapped, so clear it first
2110	 */
2111	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2112
2113	return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2114				  last_vpfn - first_vpfn + 1,
2115				  DMA_PTE_READ|DMA_PTE_WRITE);
2116}
2117
2118static int iommu_prepare_identity_map(struct pci_dev *pdev,
2119				      unsigned long long start,
2120				      unsigned long long end)
2121{
2122	struct dmar_domain *domain;
2123	int ret;
2124
2125	domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2126	if (!domain)
2127		return -ENOMEM;
2128
2129	/* For _hardware_ passthrough, don't bother. But for software
2130	   passthrough, we do it anyway -- it may indicate a memory
2131	   range which is reserved in E820, and so didn't get set
2132	   up to start with in si_domain */
2133	if (domain == si_domain && hw_pass_through) {
2134		printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2135		       pci_name(pdev), start, end);
2136		return 0;
2137	}
2138
2139	printk(KERN_INFO
2140	       "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2141	       pci_name(pdev), start, end);
2142
2143	if (end < start) {
2144		WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2145			"BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2146			dmi_get_system_info(DMI_BIOS_VENDOR),
2147			dmi_get_system_info(DMI_BIOS_VERSION),
2148		     dmi_get_system_info(DMI_PRODUCT_VERSION));
2149		ret = -EIO;
2150		goto error;
2151	}
2152
2153	if (end >> agaw_to_width(domain->agaw)) {
2154		WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2155		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2156		     agaw_to_width(domain->agaw),
2157		     dmi_get_system_info(DMI_BIOS_VENDOR),
2158		     dmi_get_system_info(DMI_BIOS_VERSION),
2159		     dmi_get_system_info(DMI_PRODUCT_VERSION));
2160		ret = -EIO;
2161		goto error;
2162	}
2163
2164	ret = iommu_domain_identity_map(domain, start, end);
2165	if (ret)
2166		goto error;
2167
2168	/* context entry init */
2169	ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2170	if (ret)
2171		goto error;
2172
2173	return 0;
2174
2175 error:
2176	domain_exit(domain);
2177	return ret;
2178}
2179
2180static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2181	struct pci_dev *pdev)
2182{
2183	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2184		return 0;
2185	return iommu_prepare_identity_map(pdev, rmrr->base_address,
2186		rmrr->end_address);
2187}
2188
2189#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2190static inline void iommu_prepare_isa(void)
2191{
2192	struct pci_dev *pdev;
2193	int ret;
2194
2195	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2196	if (!pdev)
2197		return;
2198
2199	printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2200	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2201
2202	if (ret)
2203		printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2204		       "floppy might not work\n");
2206}
2207#else
2208static inline void iommu_prepare_isa(void)
2209{
2210	return;
2211}
2212#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
2213
2214static int md_domain_init(struct dmar_domain *domain, int guest_width);
2215
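/*
 * si_domain is the static identity-mapping domain shared by all devices
 * using 1:1 translation.  Attach it to every active IOMMU and, unless
 * hardware passthrough is in use, map every usable range of physical
 * memory 1:1.
 */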
2216static int __init si_domain_init(int hw)
2217{
2218	struct dmar_drhd_unit *drhd;
2219	struct intel_iommu *iommu;
2220	int nid, ret = 0;
2221
2222	si_domain = alloc_domain();
2223	if (!si_domain)
2224		return -EFAULT;
2225
2226	pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2227
2228	for_each_active_iommu(iommu, drhd) {
2229		ret = iommu_attach_domain(si_domain, iommu);
2230		if (ret) {
2231			domain_exit(si_domain);
2232			return -EFAULT;
2233		}
2234	}
2235
2236	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2237		domain_exit(si_domain);
2238		return -EFAULT;
2239	}
2240
2241	si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2242
2243	if (hw)
2244		return 0;
2245
2246	for_each_online_node(nid) {
2247		unsigned long start_pfn, end_pfn;
2248		int i;
2249
2250		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2251			ret = iommu_domain_identity_map(si_domain,
2252					PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2253			if (ret)
2254				return ret;
2255		}
2256	}
2257
2258	return 0;
2259}
2260
2261static void domain_remove_one_dev_info(struct dmar_domain *domain,
2262					  struct pci_dev *pdev);
2263static int identity_mapping(struct pci_dev *pdev)
2264{
2265	struct device_domain_info *info;
2266
2267	if (likely(!iommu_identity_mapping))
2268		return 0;
2269
2270	info = pdev->dev.archdata.iommu;
2271	if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2272		return (info->domain == si_domain);
2273
2274	return 0;
2275}
2276
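/*
 * Bind pdev to the given domain: allocate a device_domain_info, link it
 * into the domain and global lists, and set up the context entry with the
 * requested translation type.  All bookkeeping is undone on failure.
 */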
2277static int domain_add_dev_info(struct dmar_domain *domain,
2278			       struct pci_dev *pdev,
2279			       int translation)
2280{
2281	struct device_domain_info *info;
2282	unsigned long flags;
2283	int ret;
2284
2285	info = alloc_devinfo_mem();
2286	if (!info)
2287		return -ENOMEM;
2288
2289	info->segment = pci_domain_nr(pdev->bus);
2290	info->bus = pdev->bus->number;
2291	info->devfn = pdev->devfn;
2292	info->dev = pdev;
2293	info->domain = domain;
2294
2295	spin_lock_irqsave(&device_domain_lock, flags);
2296	list_add(&info->link, &domain->devices);
2297	list_add(&info->global, &device_domain_list);
2298	pdev->dev.archdata.iommu = info;
2299	spin_unlock_irqrestore(&device_domain_lock, flags);
2300
2301	ret = domain_context_mapping(domain, pdev, translation);
2302	if (ret) {
2303		spin_lock_irqsave(&device_domain_lock, flags);
2304		list_del(&info->link);
2305		list_del(&info->global);
2306		pdev->dev.archdata.iommu = NULL;
2307		spin_unlock_irqrestore(&device_domain_lock, flags);
2308		free_devinfo_mem(info);
2309		return ret;
2310	}
2311
2312	return 0;
2313}
2314
2315static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2316{
2317	if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2318		return 1;
2319
2320	if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2321		return 1;
2322
2323	if (!(iommu_identity_mapping & IDENTMAP_ALL))
2324		return 0;
2325
2326	/*
2327	 * We want to start off with all devices in the 1:1 domain, and
2328	 * take them out later if we find they can't access all of memory.
2329	 *
2330	 * However, we can't do this for PCI devices behind bridges,
2331	 * because all PCI devices behind the same bridge will end up
2332	 * with the same source-id on their transactions.
2333	 *
2334	 * Practically speaking, we can't change things around for these
2335	 * devices at run-time, because we can't be sure there'll be no
2336	 * DMA transactions in flight for any of their siblings.
2337	 *
2338	 * So PCI devices (unless they're on the root bus) as well as
2339	 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2340	 * the 1:1 domain, just in _case_ one of their siblings turns out
2341	 * not to be able to map all of memory.
2342	 */
2343	if (!pci_is_pcie(pdev)) {
2344		if (!pci_is_root_bus(pdev->bus))
2345			return 0;
2346		if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2347			return 0;
2348	} else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2349		return 0;
2350
2351	/*
2352	 * At boot time, we don't yet know if devices will be 64-bit capable.
2353	 * Assume that they will -- if they turn out not to be, then we can
2354	 * take them out of the 1:1 domain later.
2355	 */
2356	if (!startup) {
2357		/*
2358		 * If the device's dma_mask is less than the system's memory
2359		 * size then this is not a candidate for identity mapping.
2360		 */
2361		u64 dma_mask = pdev->dma_mask;
2362
2363		if (pdev->dev.coherent_dma_mask &&
2364		    pdev->dev.coherent_dma_mask < dma_mask)
2365			dma_mask = pdev->dev.coherent_dma_mask;
2366
2367		return dma_mask >= dma_get_required_mask(&pdev->dev);
2368	}
2369
2370	return 1;
2371}
2372
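/*
 * At boot, walk all PCI devices and pre-populate the static identity
 * domain with every device that qualifies for 1:1 mapping, using
 * pass-through context entries when the hardware supports them.
 */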
2373static int __init iommu_prepare_static_identity_mapping(int hw)
2374{
2375	struct pci_dev *pdev = NULL;
2376	int ret;
2377
2378	ret = si_domain_init(hw);
2379	if (ret)
2380		return -EFAULT;
2381
2382	for_each_pci_dev(pdev) {
2383		if (iommu_should_identity_map(pdev, 1)) {
2384			ret = domain_add_dev_info(si_domain, pdev,
2385					     hw ? CONTEXT_TT_PASS_THROUGH :
2386						  CONTEXT_TT_MULTI_LEVEL);
2387			if (ret) {
2388				/* device not associated with an iommu */
2389				if (ret == -ENODEV)
2390					continue;
2391				return ret;
2392			}
2393			pr_info("IOMMU: %s identity mapping for device %s\n",
2394				hw ? "hardware" : "software", pci_name(pdev));
2395		}
2396	}
2397
2398	return 0;
2399}
2400
2401static int __init init_dmars(void)
2402{
2403	struct dmar_drhd_unit *drhd;
2404	struct dmar_rmrr_unit *rmrr;
2405	struct pci_dev *pdev;
2406	struct intel_iommu *iommu;
2407	int i, ret;
2408
2409	/*
2410	 * for each drhd
2411	 *    allocate root
2412	 *    initialize and program root entry to not present
2413	 * endfor
2414	 */
2415	for_each_drhd_unit(drhd) {
2416		/*
2417		 * lock not needed as this is only incremented in the
2418		 * single-threaded kernel __init code path; all other
2419		 * accesses are read-only
2420		 */
2421		if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2422			g_num_of_iommus++;
2423			continue;
2424		}
2425		printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2426			  IOMMU_UNITS_SUPPORTED);
2427	}
2428
2429	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2430			GFP_KERNEL);
2431	if (!g_iommus) {
2432		printk(KERN_ERR "Allocating global iommu array failed\n");
2433		ret = -ENOMEM;
2434		goto error;
2435	}
2436
2437	deferred_flush = kzalloc(g_num_of_iommus *
2438		sizeof(struct deferred_flush_tables), GFP_KERNEL);
2439	if (!deferred_flush) {
2440		ret = -ENOMEM;
2441		goto error;
2442	}
2443
2444	for_each_drhd_unit(drhd) {
2445		if (drhd->ignored)
2446			continue;
2447
2448		iommu = drhd->iommu;
2449		g_iommus[iommu->seq_id] = iommu;
2450
2451		ret = iommu_init_domains(iommu);
2452		if (ret)
2453			goto error;
2454
2455		/*
2456		 * TBD:
2457		 * we could share the same root & context tables
2458		 * among all IOMMUs; need to split this out later.
2459		 */
2460		ret = iommu_alloc_root_entry(iommu);
2461		if (ret) {
2462			printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2463			goto error;
2464		}
2465		if (!ecap_pass_through(iommu->ecap))
2466			hw_pass_through = 0;
2467	}
2468
2469	/*
2470	 * Start from a sane IOMMU hardware state.
2471	 */
2472	for_each_drhd_unit(drhd) {
2473		if (drhd->ignored)
2474			continue;
2475
2476		iommu = drhd->iommu;
2477
2478		/*
2479		 * If the queued invalidation is already initialized by us
2480		 * (for example, while enabling interrupt-remapping) then
2481		 * things are already rolling from a sane state.
2482		 */
2483		if (iommu->qi)
2484			continue;
2485
2486		/*
2487		 * Clear any previous faults.
2488		 */
2489		dmar_fault(-1, iommu);
2490		/*
2491		 * Disable queued invalidation if supported and already enabled
2492		 * before OS handover.
2493		 */
2494		dmar_disable_qi(iommu);
2495	}
2496
2497	for_each_drhd_unit(drhd) {
2498		if (drhd->ignored)
2499			continue;
2500
2501		iommu = drhd->iommu;
2502
2503		if (dmar_enable_qi(iommu)) {
2504			/*
2505			 * Queued invalidation could not be enabled; fall back
2506			 * to register-based invalidation
2507			 */
2508			iommu->flush.flush_context = __iommu_flush_context;
2509			iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2510			printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2511			       "invalidation\n",
2512				iommu->seq_id,
2513			       (unsigned long long)drhd->reg_base_addr);
2514		} else {
2515			iommu->flush.flush_context = qi_flush_context;
2516			iommu->flush.flush_iotlb = qi_flush_iotlb;
2517			printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2518			       "invalidation\n",
2519				iommu->seq_id,
2520			       (unsigned long long)drhd->reg_base_addr);
2521		}
2522	}
2523
2524	if (iommu_pass_through)
2525		iommu_identity_mapping |= IDENTMAP_ALL;
2526
2527#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2528	iommu_identity_mapping |= IDENTMAP_GFX;
2529#endif
2530
2531	check_tylersburg_isoch();
2532
2533	/*
2534	 * If pass-through is not set or not enabled, set up context entries
2535	 * for identity mappings of RMRR, graphics and ISA devices, and set up
2536	 * the static identity mapping if iommu_identity_mapping is set.
2537	 */
2538	if (iommu_identity_mapping) {
2539		ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2540		if (ret) {
2541			printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2542			goto error;
2543		}
2544	}
2545	/*
2546	 * For each rmrr
2547	 *   for each dev attached to rmrr
2548	 *   do
2549	 *     locate drhd for dev, alloc domain for dev
2550	 *     allocate free domain
2551	 *     allocate page table entries for rmrr
2552	 *     if context not allocated for bus
2553	 *           allocate and init context
2554	 *           set present in root table for this bus
2555	 *     init context with domain, translation etc
2556	 *    endfor
2557	 * endfor
2558	 */
2559	printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2560	for_each_rmrr_units(rmrr) {
2561		for (i = 0; i < rmrr->devices_cnt; i++) {
2562			pdev = rmrr->devices[i];
2563			/*
2564			 * some BIOSes list non-existent devices in the
2565			 * DMAR table.
2566			 */
2567			if (!pdev)
2568				continue;
2569			ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2570			if (ret)
2571				printk(KERN_ERR
2572				       "IOMMU: mapping reserved region failed\n");
2573		}
2574	}
2575
2576	iommu_prepare_isa();
2577
2578	/*
2579	 * for each drhd
2580	 *   enable fault log
2581	 *   global invalidate context cache
2582	 *   global invalidate iotlb
2583	 *   enable translation
2584	 */
2585	for_each_drhd_unit(drhd) {
2586		if (drhd->ignored) {
2587			/*
2588			 * we always have to disable PMRs or DMA may fail on
2589			 * this device
2590			 */
2591			if (force_on)
2592				iommu_disable_protect_mem_regions(drhd->iommu);
2593			continue;
2594		}
2595		iommu = drhd->iommu;
2596
2597		iommu_flush_write_buffer(iommu);
2598
2599		ret = dmar_set_interrupt(iommu);
2600		if (ret)
2601			goto error;
2602
2603		iommu_set_root_entry(iommu);
2604
2605		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2606		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2607
2608		ret = iommu_enable_translation(iommu);
2609		if (ret)
2610			goto error;
2611
2612		iommu_disable_protect_mem_regions(iommu);
2613	}
2614
2615	return 0;
2616error:
2617	for_each_drhd_unit(drhd) {
2618		if (drhd->ignored)
2619			continue;
2620		iommu = drhd->iommu;
2621		free_iommu(iommu);
2622	}
2623	kfree(g_iommus);
2624	return ret;
2625}
2626
2627/* This takes a number of _MM_ pages, not VTD pages */
2628static struct iova *intel_alloc_iova(struct device *dev,
2629				     struct dmar_domain *domain,
2630				     unsigned long nrpages, uint64_t dma_mask)
2631{
2632	struct pci_dev *pdev = to_pci_dev(dev);
2633	struct iova *iova = NULL;
2634
2635	/* Restrict dma_mask to the width that the iommu can handle */
2636	dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2637
2638	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2639		/*
2640		 * First try to allocate an io virtual address in
2641		 * DMA_BIT_MASK(32) and if that fails then try allocating
2642		 * from higher range
2643		 */
2644		iova = alloc_iova(&domain->iovad, nrpages,
2645				  IOVA_PFN(DMA_BIT_MASK(32)), 1);
2646		if (iova)
2647			return iova;
2648	}
2649	iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2650	if (unlikely(!iova)) {
2651		printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2652		       nrpages, pci_name(pdev));
2653		return NULL;
2654	}
2655
2656	return iova;
2657}
2658
2659static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2660{
2661	struct dmar_domain *domain;
2662	int ret;
2663
2664	domain = get_domain_for_dev(pdev,
2665			DEFAULT_DOMAIN_ADDRESS_WIDTH);
2666	if (!domain) {
2667		printk(KERN_ERR
2668			"Allocating domain for %s failed\n", pci_name(pdev));
2669		return NULL;
2670	}
2671
2672	/* make sure context mapping is ok */
2673	if (unlikely(!domain_context_mapped(pdev))) {
2674		ret = domain_context_mapping(domain, pdev,
2675					     CONTEXT_TT_MULTI_LEVEL);
2676		if (ret) {
2677			printk(KERN_ERR
2678				"Domain context map for %s failed\n",
2679				pci_name(pdev));
2680			return NULL;
2681		}
2682	}
2683
2684	return domain;
2685}
2686
2687static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2688{
2689	struct device_domain_info *info;
2690
2691	/* No lock here, assumes no domain exit in normal case */
2692	info = dev->dev.archdata.iommu;
2693	if (likely(info))
2694		return info->domain;
2695
2696	return __get_valid_domain_for_dev(dev);
2697}
2698
2699static int iommu_dummy(struct pci_dev *pdev)
2700{
2701	return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2702}
2703
2704/* Check if the pdev needs to go through non-identity map and unmap process.*/
2705static int iommu_no_mapping(struct device *dev)
2706{
2707	struct pci_dev *pdev;
2708	int found;
2709
2710	if (unlikely(dev->bus != &pci_bus_type))
2711		return 1;
2712
2713	pdev = to_pci_dev(dev);
2714	if (iommu_dummy(pdev))
2715		return 1;
2716
2717	if (!iommu_identity_mapping)
2718		return 0;
2719
2720	found = identity_mapping(pdev);
2721	if (found) {
2722		if (iommu_should_identity_map(pdev, 0))
2723			return 1;
2724		else {
2725			/*
2726			 * A device limited to 32-bit DMA is removed from
2727			 * si_domain and falls back to non-identity mapping.
2728			 */
2729			domain_remove_one_dev_info(si_domain, pdev);
2730			printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2731			       pci_name(pdev));
2732			return 0;
2733		}
2734	} else {
2735		/*
2736		 * If a 64-bit DMA capable device has been detached from a
2737		 * VM, put it back into si_domain for identity mapping.
2738		 */
2739		if (iommu_should_identity_map(pdev, 0)) {
2740			int ret;
2741			ret = domain_add_dev_info(si_domain, pdev,
2742						  hw_pass_through ?
2743						  CONTEXT_TT_PASS_THROUGH :
2744						  CONTEXT_TT_MULTI_LEVEL);
2745			if (!ret) {
2746				printk(KERN_INFO "64bit %s uses identity mapping\n",
2747				       pci_name(pdev));
2748				return 1;
2749			}
2750		}
2751	}
2752
2753	return 0;
2754}
2755
2756static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2757				     size_t size, int dir, u64 dma_mask)
2758{
2759	struct pci_dev *pdev = to_pci_dev(hwdev);
2760	struct dmar_domain *domain;
2761	phys_addr_t start_paddr;
2762	struct iova *iova;
2763	int prot = 0;
2764	int ret;
2765	struct intel_iommu *iommu;
2766	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2767
2768	BUG_ON(dir == DMA_NONE);
2769
2770	if (iommu_no_mapping(hwdev))
2771		return paddr;
2772
2773	domain = get_valid_domain_for_dev(pdev);
2774	if (!domain)
2775		return 0;
2776
2777	iommu = domain_get_iommu(domain);
2778	size = aligned_nrpages(paddr, size);
2779
2780	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2781	if (!iova)
2782		goto error;
2783
2784	/*
2785	 * Check if DMAR supports zero-length reads on write only
2786	 * mappings..
2787	 */
2788	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2789			!cap_zlr(iommu->cap))
2790		prot |= DMA_PTE_READ;
2791	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2792		prot |= DMA_PTE_WRITE;
2793	/*
2794	 * paddr to (paddr + size) might cover only part of a page, so map the
2795	 * whole page.  Note: if two parts of one page are mapped separately,
2796	 * we might end up with two guest addresses mapping to the same host
2797	 * paddr, but this is not a big problem
2798	 */
2799	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2800				 mm_to_dma_pfn(paddr_pfn), size, prot);
2801	if (ret)
2802		goto error;
2803
2804	/* it's a non-present to present mapping. Only flush IOTLB if in caching mode */
2805	if (cap_caching_mode(iommu->cap))
2806		iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2807	else
2808		iommu_flush_write_buffer(iommu);
2809
2810	start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2811	start_paddr += paddr & ~PAGE_MASK;
2812	return start_paddr;
2813
2814error:
2815	if (iova)
2816		__free_iova(&domain->iovad, iova);
2817	printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2818		pci_name(pdev), size, (unsigned long long)paddr, dir);
2819	return 0;
2820}
2821
2822static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2823				 unsigned long offset, size_t size,
2824				 enum dma_data_direction dir,
2825				 struct dma_attrs *attrs)
2826{
2827	return __intel_map_single(dev, page_to_phys(page) + offset, size,
2828				  dir, to_pci_dev(dev)->dma_mask);
2829}
2830
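/*
 * Deferred (lazy) unmap handling: freed IOVAs are batched per IOMMU in
 * deferred_flush[] and released together after a single IOTLB flush,
 * either from the unmap_timer or once HIGH_WATER_MARK entries have
 * accumulated.  flush_unmaps() is called with async_umap_flush_lock held.
 */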
2831static void flush_unmaps(void)
2832{
2833	int i, j;
2834
2835	timer_on = 0;
2836
2837	/* just flush them all */
2838	for (i = 0; i < g_num_of_iommus; i++) {
2839		struct intel_iommu *iommu = g_iommus[i];
2840		if (!iommu)
2841			continue;
2842
2843		if (!deferred_flush[i].next)
2844			continue;
2845
2846		/* In caching mode, global flushes make emulation expensive */
2847		if (!cap_caching_mode(iommu->cap))
2848			iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2849					 DMA_TLB_GLOBAL_FLUSH);
2850		for (j = 0; j < deferred_flush[i].next; j++) {
2851			unsigned long mask;
2852			struct iova *iova = deferred_flush[i].iova[j];
2853			struct dmar_domain *domain = deferred_flush[i].domain[j];
2854
2855			/* On real hardware multiple invalidations are expensive */
2856			if (cap_caching_mode(iommu->cap))
2857				iommu_flush_iotlb_psi(iommu, domain->id,
2858				iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2859			else {
2860				mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2861				iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2862						(uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2863			}
2864			__free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2865		}
2866		deferred_flush[i].next = 0;
2867	}
2868
2869	list_size = 0;
2870}
2871
2872static void flush_unmaps_timeout(unsigned long data)
2873{
2874	unsigned long flags;
2875
2876	spin_lock_irqsave(&async_umap_flush_lock, flags);
2877	flush_unmaps();
2878	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2879}
2880
2881static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2882{
2883	unsigned long flags;
2884	int next, iommu_id;
2885	struct intel_iommu *iommu;
2886
2887	spin_lock_irqsave(&async_umap_flush_lock, flags);
2888	if (list_size == HIGH_WATER_MARK)
2889		flush_unmaps();
2890
2891	iommu = domain_get_iommu(dom);
2892	iommu_id = iommu->seq_id;
2893
2894	next = deferred_flush[iommu_id].next;
2895	deferred_flush[iommu_id].domain[next] = dom;
2896	deferred_flush[iommu_id].iova[next] = iova;
2897	deferred_flush[iommu_id].next++;
2898
2899	if (!timer_on) {
2900		mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2901		timer_on = 1;
2902	}
2903	list_size++;
2904	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2905}
2906
2907static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2908			     size_t size, enum dma_data_direction dir,
2909			     struct dma_attrs *attrs)
2910{
2911	struct pci_dev *pdev = to_pci_dev(dev);
2912	struct dmar_domain *domain;
2913	unsigned long start_pfn, last_pfn;
2914	struct iova *iova;
2915	struct intel_iommu *iommu;
2916
2917	if (iommu_no_mapping(dev))
2918		return;
2919
2920	domain = find_domain(pdev);
2921	BUG_ON(!domain);
2922
2923	iommu = domain_get_iommu(domain);
2924
2925	iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2926	if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2927		      (unsigned long long)dev_addr))
2928		return;
2929
2930	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2931	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2932
2933	pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2934		 pci_name(pdev), start_pfn, last_pfn);
2935
2936	/*  clear the whole page */
2937	dma_pte_clear_range(domain, start_pfn, last_pfn);
2938
2939	/* free page tables */
2940	dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2941
2942	if (intel_iommu_strict) {
2943		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2944				      last_pfn - start_pfn + 1, 0);
2945		/* free iova */
2946		__free_iova(&domain->iovad, iova);
2947	} else {
2948		add_unmap(domain, iova);
2949		/*
2950		 * queue up the release of the unmap to save roughly 1/6th of
2951		 * the CPU time used up by the iotlb flush operation...
2952		 */
2953	}
2954}
2955
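/*
 * Coherent allocations are backed by ordinary pages and then mapped via
 * __intel_map_single().  Devices that bypass translation fall back to
 * GFP_DMA/GFP_DMA32 according to their coherent_dma_mask.
 */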
2956static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2957				  dma_addr_t *dma_handle, gfp_t flags,
2958				  struct dma_attrs *attrs)
2959{
2960	void *vaddr;
2961	int order;
2962
2963	size = PAGE_ALIGN(size);
2964	order = get_order(size);
2965
2966	if (!iommu_no_mapping(hwdev))
2967		flags &= ~(GFP_DMA | GFP_DMA32);
2968	else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2969		if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2970			flags |= GFP_DMA;
2971		else
2972			flags |= GFP_DMA32;
2973	}
2974
2975	vaddr = (void *)__get_free_pages(flags, order);
2976	if (!vaddr)
2977		return NULL;
2978	memset(vaddr, 0, size);
2979
2980	*dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2981					 DMA_BIDIRECTIONAL,
2982					 hwdev->coherent_dma_mask);
2983	if (*dma_handle)
2984		return vaddr;
2985	free_pages((unsigned long)vaddr, order);
2986	return NULL;
2987}
2988
2989static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2990				dma_addr_t dma_handle, struct dma_attrs *attrs)
2991{
2992	int order;
2993
2994	size = PAGE_ALIGN(size);
2995	order = get_order(size);
2996
2997	intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2998	free_pages((unsigned long)vaddr, order);
2999}
3000
3001static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3002			   int nelems, enum dma_data_direction dir,
3003			   struct dma_attrs *attrs)
3004{
3005	struct pci_dev *pdev = to_pci_dev(hwdev);
3006	struct dmar_domain *domain;
3007	unsigned long start_pfn, last_pfn;
3008	struct iova *iova;
3009	struct intel_iommu *iommu;
3010
3011	if (iommu_no_mapping(hwdev))
3012		return;
3013
3014	domain = find_domain(pdev);
3015	BUG_ON(!domain);
3016
3017	iommu = domain_get_iommu(domain);
3018
3019	iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3020	if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3021		      (unsigned long long)sglist[0].dma_address))
3022		return;
3023
3024	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3025	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3026
3027	/*  clear the whole page */
3028	dma_pte_clear_range(domain, start_pfn, last_pfn);
3029
3030	/* free page tables */
3031	dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3032
3033	if (intel_iommu_strict) {
3034		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3035				      last_pfn - start_pfn + 1, 0);
3036		/* free iova */
3037		__free_iova(&domain->iovad, iova);
3038	} else {
3039		add_unmap(domain, iova);
3040		/*
3041		 * queue up the release of the unmap to save roughly 1/6th of
3042		 * the CPU time used up by the iotlb flush operation...
3043		 */
3044	}
3045}
3046
3047static int intel_nontranslate_map_sg(struct device *hddev,
3048	struct scatterlist *sglist, int nelems, int dir)
3049{
3050	int i;
3051	struct scatterlist *sg;
3052
3053	for_each_sg(sglist, sg, nelems, i) {
3054		BUG_ON(!sg_page(sg));
3055		sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3056		sg->dma_length = sg->length;
3057	}
3058	return nelems;
3059}
3060
3061static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3062			enum dma_data_direction dir, struct dma_attrs *attrs)
3063{
3064	int i;
3065	struct pci_dev *pdev = to_pci_dev(hwdev);
3066	struct dmar_domain *domain;
3067	size_t size = 0;
3068	int prot = 0;
3069	struct iova *iova = NULL;
3070	int ret;
3071	struct scatterlist *sg;
3072	unsigned long start_vpfn;
3073	struct intel_iommu *iommu;
3074
3075	BUG_ON(dir == DMA_NONE);
3076	if (iommu_no_mapping(hwdev))
3077		return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3078
3079	domain = get_valid_domain_for_dev(pdev);
3080	if (!domain)
3081		return 0;
3082
3083	iommu = domain_get_iommu(domain);
3084
3085	for_each_sg(sglist, sg, nelems, i)
3086		size += aligned_nrpages(sg->offset, sg->length);
3087
3088	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3089				pdev->dma_mask);
3090	if (!iova) {
3091		sglist->dma_length = 0;
3092		return 0;
3093	}
3094
3095	/*
3096	 * Check if DMAR supports zero-length reads on write only
3097	 * mappings..
3098	 */
3099	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3100			!cap_zlr(iommu->cap))
3101		prot |= DMA_PTE_READ;
3102	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3103		prot |= DMA_PTE_WRITE;
3104
3105	start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3106
3107	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3108	if (unlikely(ret)) {
3109		/*  clear the page */
3110		dma_pte_clear_range(domain, start_vpfn,
3111				    start_vpfn + size - 1);
3112		/* free page tables */
3113		dma_pte_free_pagetable(domain, start_vpfn,
3114				       start_vpfn + size - 1);
3115		/* free iova */
3116		__free_iova(&domain->iovad, iova);
3117		return 0;
3118	}
3119
3120	/* it's a non-present to present mapping. Only flush IOTLB if in caching mode */
3121	if (cap_caching_mode(iommu->cap))
3122		iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3123	else
3124		iommu_flush_write_buffer(iommu);
3125
3126	return nelems;
3127}
3128
3129static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3130{
3131	return !dma_addr;
3132}
3133
3134struct dma_map_ops intel_dma_ops = {
3135	.alloc = intel_alloc_coherent,
3136	.free = intel_free_coherent,
3137	.map_sg = intel_map_sg,
3138	.unmap_sg = intel_unmap_sg,
3139	.map_page = intel_map_page,
3140	.unmap_page = intel_unmap_page,
3141	.mapping_error = intel_mapping_error,
3142};
3143
3144static inline int iommu_domain_cache_init(void)
3145{
3146	int ret = 0;
3147
3148	iommu_domain_cache = kmem_cache_create("iommu_domain",
3149					 sizeof(struct dmar_domain),
3150					 0,
3151					 SLAB_HWCACHE_ALIGN,
3153					 NULL);
3154	if (!iommu_domain_cache) {
3155		printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3156		ret = -ENOMEM;
3157	}
3158
3159	return ret;
3160}
3161
3162static inline int iommu_devinfo_cache_init(void)
3163{
3164	int ret = 0;
3165
3166	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3167					 sizeof(struct device_domain_info),
3168					 0,
3169					 SLAB_HWCACHE_ALIGN,
3170					 NULL);
3171	if (!iommu_devinfo_cache) {
3172		printk(KERN_ERR "Couldn't create devinfo cache\n");
3173		ret = -ENOMEM;
3174	}
3175
3176	return ret;
3177}
3178
3179static inline int iommu_iova_cache_init(void)
3180{
3181	int ret = 0;
3182
3183	iommu_iova_cache = kmem_cache_create("iommu_iova",
3184					 sizeof(struct iova),
3185					 0,
3186					 SLAB_HWCACHE_ALIGN,
3187					 NULL);
3188	if (!iommu_iova_cache) {
3189		printk(KERN_ERR "Couldn't create iova cache\n");
3190		ret = -ENOMEM;
3191	}
3192
3193	return ret;
3194}
3195
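/*
 * Slab caches for the driver's frequently allocated objects (iova,
 * dmar_domain and device_domain_info).  Created once at init time and
 * torn down again by iommu_exit_mempool().
 */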
3196static int __init iommu_init_mempool(void)
3197{
3198	int ret;
3199	ret = iommu_iova_cache_init();
3200	if (ret)
3201		return ret;
3202
3203	ret = iommu_domain_cache_init();
3204	if (ret)
3205		goto domain_error;
3206
3207	ret = iommu_devinfo_cache_init();
3208	if (!ret)
3209		return ret;
3210
3211	kmem_cache_destroy(iommu_domain_cache);
3212domain_error:
3213	kmem_cache_destroy(iommu_iova_cache);
3214
3215	return -ENOMEM;
3216}
3217
3218static void __init iommu_exit_mempool(void)
3219{
3220	kmem_cache_destroy(iommu_devinfo_cache);
3221	kmem_cache_destroy(iommu_domain_cache);
3222	kmem_cache_destroy(iommu_iova_cache);
3224}
3225
3226static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3227{
3228	struct dmar_drhd_unit *drhd;
3229	u32 vtbar;
3230	int rc;
3231
3232	/* We know that this device on this chipset has its own IOMMU.
3233	 * If we find it under a different IOMMU, then the BIOS is lying
3234	 * to us. Hope that the IOMMU for this device is actually
3235	 * disabled, and it needs no translation...
3236	 */
3237	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3238	if (rc) {
3239		/* "can't" happen */
3240		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3241		return;
3242	}
3243	vtbar &= 0xffff0000;
3244
3245	/* we know that this iommu should be at offset 0xa000 from vtbar */
3246	drhd = dmar_find_matched_drhd_unit(pdev);
3247	if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3248			    TAINT_FIRMWARE_WORKAROUND,
3249			    "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3250		pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3251}
3252DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3253
3254static void __init init_no_remapping_devices(void)
3255{
3256	struct dmar_drhd_unit *drhd;
3257
3258	for_each_drhd_unit(drhd) {
3259		if (!drhd->include_all) {
3260			int i;
3261			for (i = 0; i < drhd->devices_cnt; i++)
3262				if (drhd->devices[i] != NULL)
3263					break;
3264			/* ignore DMAR unit if no pci devices exist */
3265			if (i == drhd->devices_cnt)
3266				drhd->ignored = 1;
3267		}
3268	}
3269
3270	for_each_drhd_unit(drhd) {
3271		int i;
3272		if (drhd->ignored || drhd->include_all)
3273			continue;
3274
3275		for (i = 0; i < drhd->devices_cnt; i++)
3276			if (drhd->devices[i] &&
3277			    !IS_GFX_DEVICE(drhd->devices[i]))
3278				break;
3279
3280		if (i < drhd->devices_cnt)
3281			continue;
3282
3283		/* This IOMMU has *only* gfx devices. Either bypass it or
3284		   set the gfx_mapped flag, as appropriate */
3285		if (dmar_map_gfx) {
3286			intel_iommu_gfx_mapped = 1;
3287		} else {
3288			drhd->ignored = 1;
3289			for (i = 0; i < drhd->devices_cnt; i++) {
3290				if (!drhd->devices[i])
3291					continue;
3292				drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3293			}
3294		}
3295	}
3296}
3297
3298#ifdef CONFIG_SUSPEND
3299static int init_iommu_hw(void)
3300{
3301	struct dmar_drhd_unit *drhd;
3302	struct intel_iommu *iommu = NULL;
3303
3304	for_each_active_iommu(iommu, drhd)
3305		if (iommu->qi)
3306			dmar_reenable_qi(iommu);
3307
3308	for_each_iommu(iommu, drhd) {
3309		if (drhd->ignored) {
3310			/*
3311			 * we always have to disable PMRs or DMA may fail on
3312			 * this device
3313			 */
3314			if (force_on)
3315				iommu_disable_protect_mem_regions(iommu);
3316			continue;
3317		}
3318
3319		iommu_flush_write_buffer(iommu);
3320
3321		iommu_set_root_entry(iommu);
3322
3323		iommu->flush.flush_context(iommu, 0, 0, 0,
3324					   DMA_CCMD_GLOBAL_INVL);
3325		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3326					 DMA_TLB_GLOBAL_FLUSH);
3327		if (iommu_enable_translation(iommu))
3328			return 1;
3329		iommu_disable_protect_mem_regions(iommu);
3330	}
3331
3332	return 0;
3333}
3334
3335static void iommu_flush_all(void)
3336{
3337	struct dmar_drhd_unit *drhd;
3338	struct intel_iommu *iommu;
3339
3340	for_each_active_iommu(iommu, drhd) {
3341		iommu->flush.flush_context(iommu, 0, 0, 0,
3342					   DMA_CCMD_GLOBAL_INVL);
3343		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3344					 DMA_TLB_GLOBAL_FLUSH);
3345	}
3346}
3347
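/*
 * The fault-event registers (FECTL/FEDATA/FEADDR/FEUADDR) are the only
 * per-IOMMU register state saved here across suspend; root and context
 * tables live in memory and translation is re-enabled by init_iommu_hw()
 * on resume.
 */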
3348static int iommu_suspend(void)
3349{
3350	struct dmar_drhd_unit *drhd;
3351	struct intel_iommu *iommu = NULL;
3352	unsigned long flag;
3353
3354	for_each_active_iommu(iommu, drhd) {
3355		iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3356						 GFP_ATOMIC);
3357		if (!iommu->iommu_state)
3358			goto nomem;
3359	}
3360
3361	iommu_flush_all();
3362
3363	for_each_active_iommu(iommu, drhd) {
3364		iommu_disable_translation(iommu);
3365
3366		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3367
3368		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3369			readl(iommu->reg + DMAR_FECTL_REG);
3370		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3371			readl(iommu->reg + DMAR_FEDATA_REG);
3372		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3373			readl(iommu->reg + DMAR_FEADDR_REG);
3374		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3375			readl(iommu->reg + DMAR_FEUADDR_REG);
3376
3377		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3378	}
3379	return 0;
3380
3381nomem:
3382	for_each_active_iommu(iommu, drhd)
3383		kfree(iommu->iommu_state);
3384
3385	return -ENOMEM;
3386}
3387
3388static void iommu_resume(void)
3389{
3390	struct dmar_drhd_unit *drhd;
3391	struct intel_iommu *iommu = NULL;
3392	unsigned long flag;
3393
3394	if (init_iommu_hw()) {
3395		if (force_on)
3396			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3397		else
3398			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3399		return;
3400	}
3401
3402	for_each_active_iommu(iommu, drhd) {
3403
3404		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3405
3406		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3407			iommu->reg + DMAR_FECTL_REG);
3408		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3409			iommu->reg + DMAR_FEDATA_REG);
3410		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3411			iommu->reg + DMAR_FEADDR_REG);
3412		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3413			iommu->reg + DMAR_FEUADDR_REG);
3414
3415		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3416	}
3417
3418	for_each_active_iommu(iommu, drhd)
3419		kfree(iommu->iommu_state);
3420}
3421
3422static struct syscore_ops iommu_syscore_ops = {
3423	.resume		= iommu_resume,
3424	.suspend	= iommu_suspend,
3425};
3426
3427static void __init init_iommu_pm_ops(void)
3428{
3429	register_syscore_ops(&iommu_syscore_ops);
3430}
3431
3432#else
3433static inline void init_iommu_pm_ops(void) {}
3434#endif	/* CONFIG_SUSPEND */
3435
3436LIST_HEAD(dmar_rmrr_units);
3437
3438static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3439{
3440	list_add(&rmrr->list, &dmar_rmrr_units);
3441}
3442
3444int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3445{
3446	struct acpi_dmar_reserved_memory *rmrr;
3447	struct dmar_rmrr_unit *rmrru;
3448
3449	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3450	if (!rmrru)
3451		return -ENOMEM;
3452
3453	rmrru->hdr = header;
3454	rmrr = (struct acpi_dmar_reserved_memory *)header;
3455	rmrru->base_address = rmrr->base_address;
3456	rmrru->end_address = rmrr->end_address;
3457
3458	dmar_register_rmrr_unit(rmrru);
3459	return 0;
3460}
3461
3462static int __init
3463rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3464{
3465	struct acpi_dmar_reserved_memory *rmrr;
3466	int ret;
3467
3468	rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3469	ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3470		((void *)rmrr) + rmrr->header.length,
3471		&rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3472
3473	if (ret || (rmrru->devices_cnt == 0)) {
3474		list_del(&rmrru->list);
3475		kfree(rmrru);
3476	}
3477	return ret;
3478}
3479
3480static LIST_HEAD(dmar_atsr_units);
3481
3482int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3483{
3484	struct acpi_dmar_atsr *atsr;
3485	struct dmar_atsr_unit *atsru;
3486
3487	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3488	atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3489	if (!atsru)
3490		return -ENOMEM;
3491
3492	atsru->hdr = hdr;
3493	atsru->include_all = atsr->flags & 0x1;
3494
3495	list_add(&atsru->list, &dmar_atsr_units);
3496
3497	return 0;
3498}
3499
3500static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3501{
3502	int rc;
3503	struct acpi_dmar_atsr *atsr;
3504
3505	if (atsru->include_all)
3506		return 0;
3507
3508	atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3509	rc = dmar_parse_dev_scope((void *)(atsr + 1),
3510				(void *)atsr + atsr->header.length,
3511				&atsru->devices_cnt, &atsru->devices,
3512				atsr->segment);
3513	if (rc || !atsru->devices_cnt) {
3514		list_del(&atsru->list);
3515		kfree(atsru);
3516	}
3517
3518	return rc;
3519}
3520
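/*
 * Check whether @dev sits below a Root Port that is listed in an ATSR
 * unit for its PCI segment, or whether that ATSR unit has the "include
 * all" flag set; the result indicates whether ATS is reported for this
 * device's hierarchy.
 */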
3521int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3522{
3523	int i;
3524	struct pci_bus *bus;
3525	struct acpi_dmar_atsr *atsr;
3526	struct dmar_atsr_unit *atsru;
3527
3528	dev = pci_physfn(dev);
3529
3530	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3531		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3532		if (atsr->segment == pci_domain_nr(dev->bus))
3533			goto found;
3534	}
3535
3536	return 0;
3537
3538found:
3539	for (bus = dev->bus; bus; bus = bus->parent) {
3540		struct pci_dev *bridge = bus->self;
3541
3542		if (!bridge || !pci_is_pcie(bridge) ||
3543		    bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
3544			return 0;
3545
3546		if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
3547			for (i = 0; i < atsru->devices_cnt; i++)
3548				if (atsru->devices[i] == bridge)
3549					return 1;
3550			break;
3551		}
3552	}
3553
3554	if (atsru->include_all)
3555		return 1;
3556
3557	return 0;
3558}
3559
3560int __init dmar_parse_rmrr_atsr_dev(void)
3561{
3562	struct dmar_rmrr_unit *rmrr, *rmrr_n;
3563	struct dmar_atsr_unit *atsr, *atsr_n;
3564	int ret = 0;
3565
3566	list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3567		ret = rmrr_parse_dev(rmrr);
3568		if (ret)
3569			return ret;
3570	}
3571
3572	list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3573		ret = atsr_parse_dev(atsr);
3574		if (ret)
3575			return ret;
3576	}
3577
3578	return ret;
3579}
3580
3581/*
3582 * Here we only respond to a device being unbound from its driver.
3583 *
3584 * A newly added device is not attached to its DMAR domain here yet;
3585 * that happens when the device is first mapped to an iova.
3586 */
3587static int device_notifier(struct notifier_block *nb,
3588				  unsigned long action, void *data)
3589{
3590	struct device *dev = data;
3591	struct pci_dev *pdev = to_pci_dev(dev);
3592	struct dmar_domain *domain;
3593
3594	if (iommu_no_mapping(dev))
3595		return 0;
3596
3597	domain = find_domain(pdev);
3598	if (!domain)
3599		return 0;
3600
3601	if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3602		domain_remove_one_dev_info(domain, pdev);
3603
3604		if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3605		    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3606		    list_empty(&domain->devices))
3607			domain_exit(domain);
3608	}
3609
3610	return 0;
3611}
3612
3613static struct notifier_block device_nb = {
3614	.notifier_call = device_notifier,
3615};
3616
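/*
 * Main entry point: parse the DMAR table, build domains and context
 * entries via init_dmars(), install intel_dma_ops as the DMA API backend
 * and register the IOMMU API ops plus the bus notifier.
 */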
3617int __init intel_iommu_init(void)
3618{
3619	int ret = 0;
3620
3621	/* VT-d is required for a TXT/tboot launch, so enforce that */
3622	force_on = tboot_force_iommu();
3623
3624	if (dmar_table_init()) {
3625		if (force_on)
3626			panic("tboot: Failed to initialize DMAR table\n");
3627		return -ENODEV;
3628	}
3629
3630	if (dmar_dev_scope_init() < 0) {
3631		if (force_on)
3632			panic("tboot: Failed to initialize DMAR device scope\n");
3633		return -ENODEV;
3634	}
3635
3636	if (no_iommu || dmar_disabled)
3637		return -ENODEV;
3638
3639	if (iommu_init_mempool()) {
3640		if (force_on)
3641			panic("tboot: Failed to initialize iommu memory\n");
3642		return -ENODEV;
3643	}
3644
3645	if (list_empty(&dmar_rmrr_units))
3646		printk(KERN_INFO "DMAR: No RMRR found\n");
3647
3648	if (list_empty(&dmar_atsr_units))
3649		printk(KERN_INFO "DMAR: No ATSR found\n");
3650
3651	if (dmar_init_reserved_ranges()) {
3652		if (force_on)
3653			panic("tboot: Failed to reserve iommu ranges\n");
3654		return -ENODEV;
3655	}
3656
3657	init_no_remapping_devices();
3658
3659	ret = init_dmars();
3660	if (ret) {
3661		if (force_on)
3662			panic("tboot: Failed to initialize DMARs\n");
3663		printk(KERN_ERR "IOMMU: dmar init failed\n");
3664		put_iova_domain(&reserved_iova_list);
3665		iommu_exit_mempool();
3666		return ret;
3667	}
3668	printk(KERN_INFO
3669	"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3670
3671	init_timer(&unmap_timer);
3672#ifdef CONFIG_SWIOTLB
3673	swiotlb = 0;
3674#endif
3675	dma_ops = &intel_dma_ops;
3676
3677	init_iommu_pm_ops();
3678
3679	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3680
3681	bus_register_notifier(&pci_bus_type, &device_nb);
3682
3683	intel_iommu_enabled = 1;
3684
3685	return 0;
3686}
3687
3688static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3689					   struct pci_dev *pdev)
3690{
3691	struct pci_dev *tmp, *parent;
3692
3693	if (!iommu || !pdev)
3694		return;
3695
3696	/* dependent device detach */
3697	tmp = pci_find_upstream_pcie_bridge(pdev);
3698	/* Secondary interface's bus number and devfn 0 */
3699	if (tmp) {
3700		parent = pdev->bus->self;
3701		while (parent != tmp) {
3702			iommu_detach_dev(iommu, parent->bus->number,
3703					 parent->devfn);
3704			parent = parent->bus->self;
3705		}
3706		if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3707			iommu_detach_dev(iommu,
3708				tmp->subordinate->number, 0);
3709		else /* this is a legacy PCI bridge */
3710			iommu_detach_dev(iommu, tmp->bus->number,
3711					 tmp->devfn);
3712	}
3713}
3714
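/*
 * Remove a single device from @domain.  If it was the last device on its
 * IOMMU, also clear that IOMMU from the domain's bitmap and, for ordinary
 * (non-VM, non-identity) domains, release the domain id.
 */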
3715static void domain_remove_one_dev_info(struct dmar_domain *domain,
3716					  struct pci_dev *pdev)
3717{
3718	struct device_domain_info *info;
3719	struct intel_iommu *iommu;
3720	unsigned long flags;
3721	int found = 0;
3722	struct list_head *entry, *tmp;
3723
3724	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3725				pdev->devfn);
3726	if (!iommu)
3727		return;
3728
3729	spin_lock_irqsave(&device_domain_lock, flags);
3730	list_for_each_safe(entry, tmp, &domain->devices) {
3731		info = list_entry(entry, struct device_domain_info, link);
3732		if (info->segment == pci_domain_nr(pdev->bus) &&
3733		    info->bus == pdev->bus->number &&
3734		    info->devfn == pdev->devfn) {
3735			list_del(&info->link);
3736			list_del(&info->global);
3737			if (info->dev)
3738				info->dev->dev.archdata.iommu = NULL;
3739			spin_unlock_irqrestore(&device_domain_lock, flags);
3740
3741			iommu_disable_dev_iotlb(info);
3742			iommu_detach_dev(iommu, info->bus, info->devfn);
3743			iommu_detach_dependent_devices(iommu, pdev);
3744			free_devinfo_mem(info);
3745
3746			spin_lock_irqsave(&device_domain_lock, flags);
3747
3748			if (found)
3749				break;
3750			else
3751				continue;
3752		}
3753
3754		/* if there are no other devices under the same iommu
3755		 * owned by this domain, clear this iommu in iommu_bmp and
3756		 * update the iommu count and coherency
3757		 */
3758		if (iommu == device_to_iommu(info->segment, info->bus,
3759					    info->devfn))
3760			found = 1;
3761	}
3762
3763	spin_unlock_irqrestore(&device_domain_lock, flags);
3764
3765	if (found == 0) {
3766		unsigned long tmp_flags;
3767		spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3768		clear_bit(iommu->seq_id, domain->iommu_bmp);
3769		domain->iommu_count--;
3770		domain_update_iommu_cap(domain);
3771		spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3772
3773		if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3774		    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3775			spin_lock_irqsave(&iommu->lock, tmp_flags);
3776			clear_bit(domain->id, iommu->domain_ids);
3777			iommu->domains[domain->id] = NULL;
3778			spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3779		}
3780	}
3781}
3782
3783static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3784{
3785	struct device_domain_info *info;
3786	struct intel_iommu *iommu;
3787	unsigned long flags1, flags2;
3788
3789	spin_lock_irqsave(&device_domain_lock, flags1);
3790	while (!list_empty(&domain->devices)) {
3791		info = list_entry(domain->devices.next,
3792			struct device_domain_info, link);
3793		list_del(&info->link);
3794		list_del(&info->global);
3795		if (info->dev)
3796			info->dev->dev.archdata.iommu = NULL;
3797
3798		spin_unlock_irqrestore(&device_domain_lock, flags1);
3799
3800		iommu_disable_dev_iotlb(info);
3801		iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3802		iommu_detach_dev(iommu, info->bus, info->devfn);
3803		iommu_detach_dependent_devices(iommu, info->dev);
3804
3805		/* clear this iommu in iommu_bmp, update iommu count
3806		 * and capabilities
3807		 */
3808		spin_lock_irqsave(&domain->iommu_lock, flags2);
3809		if (test_and_clear_bit(iommu->seq_id,
3810				       domain->iommu_bmp)) {
3811			domain->iommu_count--;
3812			domain_update_iommu_cap(domain);
3813		}
3814		spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3815
3816		free_devinfo_mem(info);
3817		spin_lock_irqsave(&device_domain_lock, flags1);
3818	}
3819	spin_unlock_irqrestore(&device_domain_lock, flags1);
3820}
3821
3822/* domain id for virtual machine, it won't be set in context */
3823static unsigned long vm_domid;
3824
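/*
 * Domains created through the IOMMU API ("virtual machine" domains) are
 * not bound to a specific DRHD unit at allocation time; they are flagged
 * with DOMAIN_FLAG_VIRTUAL_MACHINE and torn down via vm_domain_exit().
 */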
3825static struct dmar_domain *iommu_alloc_vm_domain(void)
3826{
3827	struct dmar_domain *domain;
3828
3829	domain = alloc_domain_mem();
3830	if (!domain)
3831		return NULL;
3832
3833	domain->id = vm_domid++;
3834	domain->nid = -1;
3835	memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3836	domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3837
3838	return domain;
3839}
3840
3841static int md_domain_init(struct dmar_domain *domain, int guest_width)
3842{
3843	int adjust_width;
3844
3845	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3846	spin_lock_init(&domain->iommu_lock);
3847
3848	domain_reserve_special_ranges(domain);
3849
3850	/* calculate AGAW */
3851	domain->gaw = guest_width;
3852	adjust_width = guestwidth_to_adjustwidth(guest_width);
3853	domain->agaw = width_to_agaw(adjust_width);
3854
3855	INIT_LIST_HEAD(&domain->devices);
3856
3857	domain->iommu_count = 0;
3858	domain->iommu_coherency = 0;
3859	domain->iommu_snooping = 0;
3860	domain->iommu_superpage = 0;
3861	domain->max_addr = 0;
3862	domain->nid = -1;
3863
3864	/* always allocate the top pgd */
3865	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3866	if (!domain->pgd)
3867		return -ENOMEM;
3868	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3869	return 0;
3870}
3871
3872static void iommu_free_vm_domain(struct dmar_domain *domain)
3873{
3874	unsigned long flags;
3875	struct dmar_drhd_unit *drhd;
3876	struct intel_iommu *iommu;
3877	unsigned long i;
3878	unsigned long ndomains;
3879
3880	for_each_drhd_unit(drhd) {
3881		if (drhd->ignored)
3882			continue;
3883		iommu = drhd->iommu;
3884
3885		ndomains = cap_ndoms(iommu->cap);
3886		for_each_set_bit(i, iommu->domain_ids, ndomains) {
3887			if (iommu->domains[i] == domain) {
3888				spin_lock_irqsave(&iommu->lock, flags);
3889				clear_bit(i, iommu->domain_ids);
3890				iommu->domains[i] = NULL;
3891				spin_unlock_irqrestore(&iommu->lock, flags);
3892				break;
3893			}
3894		}
3895	}
3896}
3897
3898static void vm_domain_exit(struct dmar_domain *domain)
3899{
3900	/* Domain 0 is reserved, so don't process it */
3901	if (!domain)
3902		return;
3903
3904	vm_domain_remove_all_dev_info(domain);
3905	/* destroy iovas */
3906	put_iova_domain(&domain->iovad);
3907
3908	/* clear ptes */
3909	dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3910
3911	/* free page tables */
3912	dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3913
3914	iommu_free_vm_domain(domain);
3915	free_domain_mem(domain);
3916}
3917
3918static int intel_iommu_domain_init(struct iommu_domain *domain)
3919{
3920	struct dmar_domain *dmar_domain;
3921
3922	dmar_domain = iommu_alloc_vm_domain();
3923	if (!dmar_domain) {
3924		printk(KERN_ERR
3925			"intel_iommu_domain_init: dmar_domain == NULL\n");
3926		return -ENOMEM;
3927	}
3928	if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3929		printk(KERN_ERR
3930			"intel_iommu_domain_init() failed\n");
3931		vm_domain_exit(dmar_domain);
3932		return -ENOMEM;
3933	}
3934	domain_update_iommu_cap(dmar_domain);
3935	domain->priv = dmar_domain;
3936
3937	return 0;
3938}
3939
3940static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3941{
3942	struct dmar_domain *dmar_domain = domain->priv;
3943
3944	domain->priv = NULL;
3945	vm_domain_exit(dmar_domain);
3946}
3947
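/*
 * Attach a device to an IOMMU API domain: tear down any previous binding,
 * check that the hardware address width covers the domain's highest
 * mapped address, trim unused upper page-table levels and finally add the
 * device with multi-level translation.
 */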
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct pci_dev *pdev = to_pci_dev(dev);
	struct intel_iommu *iommu;
	int addr_width;

	/* normally pdev is not mapped */
	if (unlikely(domain_context_mapped(pdev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(pdev);
		if (old_domain) {
			if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
			    dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
				domain_remove_one_dev_info(old_domain, pdev);
			else
				domain_remove_dev_info(old_domain);
		}
	}

	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
				pdev->devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width)) {
		printk(KERN_ERR "%s: iommu width (%d) is not "
		       "sufficient for the mapped address (%llx)\n",
		       __func__, addr_width, dmar_domain->max_addr);
		return -EFAULT;
	}
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = (struct dma_pte *)
				phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
}

static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct pci_dev *pdev = to_pci_dev(dev);

	domain_remove_one_dev_info(dmar_domain, pdev);
}

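/*
 * Map [iova, iova + size) to the host physical range starting at hpa.
 * IOMMU_READ/WRITE/CACHE are translated into DMA PTE bits (snooping only
 * if the domain supports it), the domain's maximum mapped address is
 * grown after checking that it still fits under the current guest address
 * width, and the byte size is converted into a page count that also
 * covers any partial first/last page.  For instance (illustrative values
 * only), a 4KiB request at hpa 0x1080 straddles two 4KiB frames, so
 * aligned_nrpages() yields 2 pages rather than 1.
 */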
static int intel_iommu_map(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t hpa,
			   size_t size, int iommu_prot)
{
	struct dmar_domain *dmar_domain = domain->priv;
	u64 max_addr;
	int prot = 0;
	int ret;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
		if (end < max_addr) {
			printk(KERN_ERR "%s: iommu width (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round up size to next multiple of PAGE_SIZE, if it and
	   the low bits of hpa would take us onto the next page */
	size = aligned_nrpages(hpa, size);
	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				 hpa >> VTD_PAGE_SHIFT, size, prot);
	return ret;
}

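/*
 * Unmap at least [iova, iova + size).  dma_pte_clear_range() returns an
 * order describing how much was actually cleared, so the size handed back
 * to the IOMMU core is PAGE_SIZE << order; max_addr is wound back only
 * when the unmapped range was the highest one mapped.
 */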
static size_t intel_iommu_unmap(struct iommu_domain *domain,
			     unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = domain->priv;
	int order;

	order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
			    (iova + size - 1) >> VTD_PAGE_SHIFT);

	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	return PAGE_SIZE << order;
}

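/*
 * Translate an IOVA back to a host physical address by looking up the
 * leaf PTE for its page; returns the address recorded in the PTE, or 0
 * if no PTE is present.
 */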
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    unsigned long iova)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct dma_pte *pte;
	u64 phys = 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
	if (pte)
		phys = dma_pte_addr(pte);

	return phys;
}

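/*
 * Report per-domain capabilities to the IOMMU core:
 * IOMMU_CAP_CACHE_COHERENCY reflects the domain's iommu_snooping flag,
 * IOMMU_CAP_INTR_REMAP reflects the global intr_remapping_enabled state.
 */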
static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
				      unsigned long cap)
{
	struct dmar_domain *dmar_domain = domain->priv;

	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return dmar_domain->iommu_snooping;
	if (cap == IOMMU_CAP_INTR_REMAP)
		return intr_remapping_enabled;

	return 0;
}

/*
 * Group numbers are arbitrary.  Devices with the same group number
 * indicate that the IOMMU cannot differentiate between them.  To avoid
 * tracking used groups we just use the seg|bus|devfn of the lowest
 * level at which we're able to differentiate devices.
 */
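/*
 * For example, devices behind a PCIe-to-PCI bridge cannot be told apart,
 * so they are all reported as the bridge's secondary bus with devfn 0;
 * and when iommu_group_mf is set, the functions of a multifunction slot
 * collapse onto function 0 as well.
 */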
static int intel_iommu_device_group(struct device *dev, unsigned int *groupid)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct pci_dev *bridge;
	union {
		struct {
			u8 devfn;
			u8 bus;
			u16 segment;
		} pci;
		u32 group;
	} id;

	if (iommu_no_mapping(dev))
		return -ENODEV;

	id.pci.segment = pci_domain_nr(pdev->bus);
	id.pci.bus = pdev->bus->number;
	id.pci.devfn = pdev->devfn;

	if (!device_to_iommu(id.pci.segment, id.pci.bus, id.pci.devfn))
		return -ENODEV;

	bridge = pci_find_upstream_pcie_bridge(pdev);
	if (bridge) {
		if (pci_is_pcie(bridge)) {
			id.pci.bus = bridge->subordinate->number;
			id.pci.devfn = 0;
		} else {
			id.pci.bus = bridge->bus->number;
			id.pci.devfn = bridge->devfn;
		}
	}

	if (!pdev->is_virtfn && iommu_group_mf)
		id.pci.devfn = PCI_DEVFN(PCI_SLOT(id.pci.devfn), 0);

	*groupid = id.group;

	return 0;
}

static struct iommu_ops intel_iommu_ops = {
	.domain_init	= intel_iommu_domain_init,
	.domain_destroy = intel_iommu_domain_destroy,
	.attach_dev	= intel_iommu_attach_device,
	.detach_dev	= intel_iommu_detach_device,
	.map		= intel_iommu_map,
	.unmap		= intel_iommu_unmap,
	.iova_to_phys	= intel_iommu_iova_to_phys,
	.domain_has_cap = intel_iommu_domain_has_cap,
	.device_group	= intel_iommu_device_group,
	.pgsize_bitmap	= INTEL_IOMMU_PGSIZES,
};

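/*
 * These callbacks are not called directly; they back the generic IOMMU
 * API (linux/iommu.h) for devices behind a VT-d unit.  A minimal sketch
 * of how a caller such as KVM device assignment would exercise them
 * (error handling omitted; pdev, iova and phys are placeholder values):
 *
 *	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *	iommu_attach_device(dom, &pdev->dev);
 *	iommu_map(dom, iova, phys, SZ_4K, IOMMU_READ | IOMMU_WRITE);
 *	...
 *	iommu_unmap(dom, iova, SZ_4K);
 *	iommu_detach_device(dom, &pdev->dev);
 *	iommu_domain_free(dom);
 */
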
static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it:
	 */
	printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;

	/* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
	if (dev->revision == 0x07) {
		printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
		dmar_map_gfx = 0;
	}
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);

#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)

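/*
 * Integrated graphics on these chipsets relies on a BIOS-allocated shadow
 * GTT, advertised through the GGC register defined above.  When the
 * VT-enabled allocation is absent the graphics device cannot be safely
 * remapped, so it is excluded from the IOMMU; otherwise batched IOTLB
 * flushing is disabled (intel_iommu_strict), as the messages below note.
 */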
static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);

/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that.  We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
	       vtisochctrl);
}
