/* iommu.c, revision 25985edcedea6396277003854657b5f3cb31a628 */
/*
 * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
 *
 * Rewrite, cleanup:
 *
 * Copyright (C) 2004 Olof Johansson <olof@lixom.net>, IBM Corporation
 * Copyright (C) 2006 Olof Johansson <olof@lixom.net>
 *
 * Dynamic DMA mapping support, pSeries-specific parts, both SMP and LPAR.
 *
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <linux/crash_dump.h>
#include <linux/memory.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/rtas.h>
#include <asm/iommu.h>
#include <asm/pci-bridge.h>
#include <asm/machdep.h>
#include <asm/abs_addr.h>
#include <asm/pSeries_reconfig.h>
#include <asm/firmware.h>
#include <asm/tce.h>
#include <asm/ppc-pci.h>
#include <asm/udbg.h>
#include <asm/mmzone.h>

#include "plpar_wrappers.h"

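/*
 * Build TCEs on a bare-metal (non-LPAR) system: write the entries
 * directly into the TCE table at tbl->it_base, mapping @npages pages
 * starting at @index to the real pages backing @uaddr.
 */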
static int tce_build_pSeries(struct iommu_table *tbl, long index,
			      long npages, unsigned long uaddr,
			      enum dma_data_direction direction,
			      struct dma_attrs *attrs)
{
	u64 proto_tce;
	u64 *tcep;
	u64 rpn;

	proto_tce = TCE_PCI_READ; /* Read allowed */

	if (direction != DMA_TO_DEVICE)
		proto_tce |= TCE_PCI_WRITE;

	tcep = ((u64 *)tbl->it_base) + index;

	while (npages--) {
		/* can't move this out since we might cross MEMBLOCK boundary */
		rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
		*tcep = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;

		uaddr += TCE_PAGE_SIZE;
		tcep++;
	}
	return 0;
}

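/* Clear @npages TCE entries, starting at @index, by zeroing them directly. */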
static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages)
{
	u64 *tcep;

	tcep = ((u64 *)tbl->it_base) + index;

	while (npages--)
		*(tcep++) = 0;
}

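/* Read a TCE entry directly from the table. */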
static unsigned long tce_get_pseries(struct iommu_table *tbl, long index)
{
	u64 *tcep;

	tcep = ((u64 *)tbl->it_base) + index;

	return *tcep;
}

static void tce_free_pSeriesLP(struct iommu_table*, long, long);
static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);

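/*
 * Build TCEs under an LPAR, one H_PUT_TCE hcall per entry.  If the
 * hypervisor runs out of resources, the partially built mapping is
 * torn down again and the error is returned to the caller.
 */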
static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
				long npages, unsigned long uaddr,
				enum dma_data_direction direction,
				struct dma_attrs *attrs)
{
	u64 rc = 0;
	u64 proto_tce, tce;
	u64 rpn;
	int ret = 0;
	long tcenum_start = tcenum, npages_start = npages;

	rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
	proto_tce = TCE_PCI_READ;
	if (direction != DMA_TO_DEVICE)
		proto_tce |= TCE_PCI_WRITE;

	while (npages--) {
		tce = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;
		rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, tce);

		if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
			ret = (int)rc;
			tce_free_pSeriesLP(tbl, tcenum_start,
			                   (npages_start - (npages + 1)));
			break;
		}

		if (rc && printk_ratelimit()) {
			printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
			printk("\tindex   = 0x%llx\n", (u64)tbl->it_index);
			printk("\ttcenum  = 0x%llx\n", (u64)tcenum);
			printk("\ttce val = 0x%llx\n", tce);
			show_stack(current, (unsigned long *)__get_SP());
		}

		tcenum++;
		rpn++;
	}
	return ret;
}

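/* Per-CPU scratch page used to batch TCEs for plpar_tce_put_indirect(). */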
static DEFINE_PER_CPU(u64 *, tce_page);

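/*
 * Build TCEs up to a pageful at a time with H_PUT_TCE_INDIRECT, staging
 * the entries in the per-CPU tce_page.  Falls back to the one-at-a-time
 * path for single pages or when no scratch page can be allocated.
 */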
static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
				     long npages, unsigned long uaddr,
				     enum dma_data_direction direction,
				     struct dma_attrs *attrs)
{
	u64 rc = 0;
	u64 proto_tce;
	u64 *tcep;
	u64 rpn;
	long l, limit;
	long tcenum_start = tcenum, npages_start = npages;
	int ret = 0;

	if (npages == 1) {
		return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
		                           direction, attrs);
	}

	tcep = __get_cpu_var(tce_page);

	/* This is safe to do since interrupts are off when we're called
	 * from iommu_alloc{,_sg}()
	 */
	if (!tcep) {
		tcep = (u64 *)__get_free_page(GFP_ATOMIC);
		/* If allocation fails, fall back to the loop implementation */
		if (!tcep) {
			return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
					    direction, attrs);
		}
		__get_cpu_var(tce_page) = tcep;
	}

	rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
	proto_tce = TCE_PCI_READ;
	if (direction != DMA_TO_DEVICE)
		proto_tce |= TCE_PCI_WRITE;

	/* We can map max one pageful of TCEs at a time */
	do {
		/*
		 * Set up the page with TCE data, looping through and setting
		 * the values.
		 */
		limit = min_t(long, npages, 4096/TCE_ENTRY_SIZE);

		for (l = 0; l < limit; l++) {
			tcep[l] = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;
			rpn++;
		}

		rc = plpar_tce_put_indirect((u64)tbl->it_index,
					    (u64)tcenum << 12,
					    (u64)virt_to_abs(tcep),
					    limit);

		npages -= limit;
		tcenum += limit;
	} while (npages > 0 && !rc);

	if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
		ret = (int)rc;
		tce_freemulti_pSeriesLP(tbl, tcenum_start,
		                        (npages_start - (npages + limit)));
		return ret;
	}

	if (rc && printk_ratelimit()) {
		printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
		printk("\tindex   = 0x%llx\n", (u64)tbl->it_index);
		printk("\tnpages  = 0x%llx\n", (u64)npages);
		printk("\ttce[0] val = 0x%llx\n", tcep[0]);
		show_stack(current, (unsigned long *)__get_SP());
	}
	return ret;
}

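/* Clear TCEs one at a time via H_PUT_TCE. */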
static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
{
	u64 rc;

	while (npages--) {
		rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, 0);

		if (rc && printk_ratelimit()) {
			printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
			printk("\tindex   = 0x%llx\n", (u64)tbl->it_index);
			printk("\ttcenum  = 0x%llx\n", (u64)tcenum);
			show_stack(current, (unsigned long *)__get_SP());
		}

		tcenum++;
	}
}

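/* Clear a whole range of TCEs with a single H_STUFF_TCE hcall. */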
static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
{
	u64 rc;

	rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages);

	if (rc && printk_ratelimit()) {
		printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n");
		printk("\trc      = %lld\n", rc);
		printk("\tindex   = 0x%llx\n", (u64)tbl->it_index);
		printk("\tnpages  = 0x%llx\n", (u64)npages);
		show_stack(current, (unsigned long *)__get_SP());
	}
}

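/* Read back a TCE via the H_GET_TCE hcall. */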
static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum)
{
	u64 rc;
	unsigned long tce_ret;

	rc = plpar_tce_get((u64)tbl->it_index, (u64)tcenum << 12, &tce_ret);

	if (rc && printk_ratelimit()) {
		printk("tce_get_pSeriesLP: plpar_tce_get failed. rc=%lld\n", rc);
		printk("\tindex   = 0x%llx\n", (u64)tbl->it_index);
		printk("\ttcenum  = 0x%llx\n", (u64)tcenum);
		show_stack(current, (unsigned long *)__get_SP());
	}

	return tce_ret;
}

/* this is compatible with cells for the device tree property */
struct dynamic_dma_window_prop {
	__be32	liobn;		/* tce table number */
	__be64	dma_base;	/* address hi,lo */
	__be32	tce_shift;	/* ilog2(tce_page_size) */
	__be32	window_shift;	/* ilog2(tce_window_size) */
};

struct direct_window {
	struct device_node *device;
	const struct dynamic_dma_window_prop *prop;
	struct list_head list;
};

/* Dynamic DMA Window support */
struct ddw_query_response {
	u32 windows_available;
	u32 largest_available_block;
	u32 page_size;
	u32 migration_capable;
};

struct ddw_create_response {
	u32 liobn;
	u32 addr_hi;
	u32 addr_lo;
};

static LIST_HEAD(direct_window_list);
/* prevents races between memory on/offline and window creation */
static DEFINE_SPINLOCK(direct_window_list_lock);
/* protects initializing window twice for same device */
static DEFINE_MUTEX(direct_window_init_mutex);
#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"

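/*
 * Clear the TCEs covering pfns [start_pfn, start_pfn + num_pfn) in the
 * dynamic DMA window described by @arg (a dynamic_dma_window_prop),
 * stuffing up to 512 entries per hcall.
 */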
static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
					unsigned long num_pfn, const void *arg)
{
	const struct dynamic_dma_window_prop *maprange = arg;
	int rc;
	u64 tce_size, num_tce, dma_offset, next;
	u32 tce_shift;
	long limit;

	tce_shift = be32_to_cpu(maprange->tce_shift);
	tce_size = 1ULL << tce_shift;
	next = start_pfn << PAGE_SHIFT;
	num_tce = num_pfn << PAGE_SHIFT;

	/* round back to the beginning of the tce page size */
	num_tce += next & (tce_size - 1);
	next &= ~(tce_size - 1);

	/* convert to number of tces */
	num_tce |= tce_size - 1;
	num_tce >>= tce_shift;

	do {
		/*
		 * Set up the page with TCE data, looping through and setting
		 * the values.
		 */
		limit = min_t(long, num_tce, 512);
		dma_offset = next + be64_to_cpu(maprange->dma_base);

		rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn),
					     dma_offset,
					     0, limit);
		next += limit * tce_size;
		num_tce -= limit;
	} while (num_tce > 0 && !rc);

	return rc;
}

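/*
 * Linearly map pfns [start_pfn, start_pfn + num_pfn) into the dynamic
 * DMA window described by @arg, batching the TCEs through the per-CPU
 * scratch page.  On error the caller is expected to clear the whole range.
 */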
static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
					unsigned long num_pfn, const void *arg)
{
	const struct dynamic_dma_window_prop *maprange = arg;
	u64 *tcep, tce_size, num_tce, dma_offset, next, proto_tce, liobn;
	u32 tce_shift;
	u64 rc = 0;
	long l, limit;

	local_irq_disable();	/* to protect tcep and the page behind it */
	tcep = __get_cpu_var(tce_page);

	if (!tcep) {
		tcep = (u64 *)__get_free_page(GFP_ATOMIC);
		if (!tcep) {
			local_irq_enable();
			return -ENOMEM;
		}
		__get_cpu_var(tce_page) = tcep;
	}

	proto_tce = TCE_PCI_READ | TCE_PCI_WRITE;

	liobn = (u64)be32_to_cpu(maprange->liobn);
	tce_shift = be32_to_cpu(maprange->tce_shift);
	tce_size = 1ULL << tce_shift;
	next = start_pfn << PAGE_SHIFT;
	num_tce = num_pfn << PAGE_SHIFT;

	/* round back to the beginning of the tce page size */
	num_tce += next & (tce_size - 1);
	next &= ~(tce_size - 1);

	/* convert to number of tces */
	num_tce |= tce_size - 1;
	num_tce >>= tce_shift;

	/* We can map max one pageful of TCEs at a time */
	do {
		/*
		 * Set up the page with TCE data, looping through and setting
		 * the values.
		 */
		limit = min_t(long, num_tce, 4096/TCE_ENTRY_SIZE);
		dma_offset = next + be64_to_cpu(maprange->dma_base);

		for (l = 0; l < limit; l++) {
			tcep[l] = proto_tce | next;
			next += tce_size;
		}

		rc = plpar_tce_put_indirect(liobn,
					    dma_offset,
					    (u64)virt_to_abs(tcep),
					    limit);

		num_tce -= limit;
	} while (num_tce > 0 && !rc);

	/* error cleanup: caller will clear whole range */

	local_irq_enable();
	return rc;
}

static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn,
		unsigned long num_pfn, void *arg)
{
	return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);
}


#ifdef CONFIG_PCI
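/*
 * Fill in an iommu_table for a bare-metal PHB from the linux,tce-base
 * and linux,tce-size properties, carving the next slice out of the
 * PHB's DMA window for this node.
 */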
static void iommu_table_setparms(struct pci_controller *phb,
				 struct device_node *dn,
				 struct iommu_table *tbl)
{
	struct device_node *node;
	const unsigned long *basep;
	const u32 *sizep;

	node = phb->dn;

	basep = of_get_property(node, "linux,tce-base", NULL);
	sizep = of_get_property(node, "linux,tce-size", NULL);
	if (basep == NULL || sizep == NULL) {
		printk(KERN_ERR "PCI_DMA: iommu_table_setparms: %s has "
				"missing tce entries !\n", dn->full_name);
		return;
	}

	tbl->it_base = (unsigned long)__va(*basep);

	if (!is_kdump_kernel())
		memset((void *)tbl->it_base, 0, *sizep);

	tbl->it_busno = phb->bus->number;

	/* Units of tce entries */
	tbl->it_offset = phb->dma_window_base_cur >> IOMMU_PAGE_SHIFT;

	/* Test if we are going over 2GB of DMA space */
	if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) {
		udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
		panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
	}

	phb->dma_window_base_cur += phb->dma_window_size;

	/* Set the tce table size - measured in entries */
	tbl->it_size = phb->dma_window_size >> IOMMU_PAGE_SHIFT;

	tbl->it_index = 0;
	tbl->it_blocksize = 16;
	tbl->it_type = TCE_PCI;
}

/*
 * iommu_table_setparms_lpar
 *
 * Function: On pSeries LPAR systems, return TCE table info, given a pci bus.
 */
static void iommu_table_setparms_lpar(struct pci_controller *phb,
				      struct device_node *dn,
				      struct iommu_table *tbl,
				      const void *dma_window)
{
	unsigned long offset, size;

	of_parse_dma_window(dn, dma_window, &tbl->it_index, &offset, &size);

	tbl->it_busno = phb->bus->number;
	tbl->it_base = 0;
	tbl->it_blocksize = 16;
	tbl->it_type = TCE_PCI;
	tbl->it_offset = offset >> IOMMU_PAGE_SHIFT;
	tbl->it_size = size >> IOMMU_PAGE_SHIFT;
}

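/*
 * Bus setup on bare-metal pSeries: size the per-slot DMA windows under
 * a PHB, setting aside the first 256MB of DMA space for ISA/IDE when an
 * ISA bridge hangs off this PHB.
 */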
static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
{
	struct device_node *dn;
	struct iommu_table *tbl;
	struct device_node *isa_dn, *isa_dn_orig;
	struct device_node *tmp;
	struct pci_dn *pci;
	int children;

	dn = pci_bus_to_OF_node(bus);

	pr_debug("pci_dma_bus_setup_pSeries: setting up bus %s\n", dn->full_name);

	if (bus->self) {
		/* This is not a root bus, any setup will be done for the
		 * device-side of the bridge in iommu_dev_setup_pSeries().
		 */
		return;
	}
	pci = PCI_DN(dn);

	/* Check if the ISA bus on the system is under
	 * this PHB.
	 */
	isa_dn = isa_dn_orig = of_find_node_by_type(NULL, "isa");

	while (isa_dn && isa_dn != dn)
		isa_dn = isa_dn->parent;

	if (isa_dn_orig)
		of_node_put(isa_dn_orig);

	/* Count number of direct PCI children of the PHB. */
	for (children = 0, tmp = dn->child; tmp; tmp = tmp->sibling)
		children++;

	pr_debug("Children: %d\n", children);

	/* Calculate amount of DMA window per slot. Each window must be
	 * a power of two (due to pci_alloc_consistent requirements).
	 *
	 * Keep 256MB aside for PHBs with ISA.
	 */

	if (!isa_dn) {
		/* No ISA/IDE - just set window size and return */
		pci->phb->dma_window_size = 0x80000000ul; /* To be divided */

		while (pci->phb->dma_window_size * children > 0x80000000ul)
			pci->phb->dma_window_size >>= 1;
		pr_debug("No ISA/IDE, window size is 0x%llx\n",
			 pci->phb->dma_window_size);
		pci->phb->dma_window_base_cur = 0;

		return;
	}

	/* If we have ISA, then we probably have an IDE
	 * controller too. Allocate a 128MB table but
	 * skip the first 128MB to avoid stepping on ISA
	 * space.
	 */
	pci->phb->dma_window_size = 0x8000000ul;
	pci->phb->dma_window_base_cur = 0x8000000ul;

	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
			   pci->phb->node);

	iommu_table_setparms(pci->phb, dn, tbl);
	pci->iommu_table = iommu_init_table(tbl, pci->phb->node);

	/* Divide the rest (1.75GB) among the children */
	pci->phb->dma_window_size = 0x80000000ul;
	while (pci->phb->dma_window_size * children > 0x70000000ul)
		pci->phb->dma_window_size >>= 1;

	pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size);
}


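/*
 * Bus setup under an LPAR: find the nearest ibm,dma-window property,
 * walking up the device tree, and create the bus's iommu table from it
 * if one does not exist yet.
 */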
static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
{
	struct iommu_table *tbl;
	struct device_node *dn, *pdn;
	struct pci_dn *ppci;
	const void *dma_window = NULL;

	dn = pci_bus_to_OF_node(bus);

	pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %s\n",
		 dn->full_name);

	/* Find nearest ibm,dma-window, walking up the device tree */
	for (pdn = dn; pdn != NULL; pdn = pdn->parent) {
		dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
		if (dma_window != NULL)
			break;
	}

	if (dma_window == NULL) {
		pr_debug("  no ibm,dma-window property !\n");
		return;
	}

	ppci = PCI_DN(pdn);

	pr_debug("  parent is %s, iommu_table: 0x%p\n",
		 pdn->full_name, ppci->iommu_table);

	if (!ppci->iommu_table) {
		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
				   ppci->phb->node);
		iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
		ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node);
		pr_debug("  created table: %p\n", ppci->iommu_table);
	}
}


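/*
 * Per-device setup on bare-metal: direct children of a PHB get their own
 * table; anything deeper inherits the nearest ancestor's table.
 */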
static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
{
	struct device_node *dn;
	struct iommu_table *tbl;

	pr_debug("pci_dma_dev_setup_pSeries: %s\n", pci_name(dev));

	dn = dev->dev.of_node;

	/* If we're the direct child of a root bus, then we need to allocate
	 * an iommu table ourselves. The bus setup code should have setup
	 * the window sizes already.
	 */
	if (!dev->bus->self) {
		struct pci_controller *phb = PCI_DN(dn)->phb;

		pr_debug(" --> first child, no bridge. Allocating iommu table.\n");
		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
				   phb->node);
		iommu_table_setparms(phb, dn, tbl);
		PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node);
		set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table);
		return;
	}

	/* If this device is further down the bus tree, search upwards until
	 * an already allocated iommu table is found and use that.
	 */

	while (dn && PCI_DN(dn) && PCI_DN(dn)->iommu_table == NULL)
		dn = dn->parent;

	if (dn && PCI_DN(dn))
		set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table);
	else
		printk(KERN_WARNING "iommu: Device %s has no iommu table\n",
		       pci_name(dev));
}

static int __read_mostly disable_ddw;

static int __init disable_ddw_setup(char *str)
{
	disable_ddw = 1;
	printk(KERN_INFO "ppc iommu: disabling ddw.\n");

	return 0;
}

early_param("disable_ddw", disable_ddw_setup);

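/*
 * Tear down a dynamic DMA window: clear all of its TCEs, then release
 * the window via the ibm,remove-pe-dma-window RTAS call.
 */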
static void remove_ddw(struct device_node *np)
{
	struct dynamic_dma_window_prop *dwp;
	struct property *win64;
	const u32 *ddr_avail;
	u64 liobn;
	int len, ret;

	ddr_avail = of_get_property(np, "ibm,ddw-applicable", &len);
	win64 = of_find_property(np, DIRECT64_PROPNAME, NULL);
	if (!win64 || !ddr_avail || len < 3 * sizeof(u32))
		return;

	dwp = win64->value;
	liobn = (u64)be32_to_cpu(dwp->liobn);

	/* clear the whole window, note the arg is in kernel pages */
	ret = tce_clearrange_multi_pSeriesLP(0,
		1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp);
	if (ret)
		pr_warning("%s failed to clear tces in window.\n",
			 np->full_name);
	else
		pr_debug("%s successfully cleared tces in window.\n",
			 np->full_name);

	ret = rtas_call(ddr_avail[2], 1, 1, NULL, liobn);
	if (ret)
		pr_warning("%s: failed to remove direct window: rtas returned "
			"%d to ibm,remove-pe-dma-window(%x) %llx\n",
			np->full_name, ret, ddr_avail[2], liobn);
	else
		pr_debug("%s: successfully removed direct window: rtas returned "
			"%d to ibm,remove-pe-dma-window(%x) %llx\n",
			np->full_name, ret, ddr_avail[2], liobn);
}


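/*
 * If a direct window has already been created for this PE, return its
 * DMA base address so the configuration can be duplicated; 0 otherwise.
 */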
static u64 dupe_ddw_if_already_created(struct pci_dev *dev, struct device_node *pdn)
{
	struct device_node *dn;
	struct pci_dn *pcidn;
	struct direct_window *window;
	const struct dynamic_dma_window_prop *direct64;
	u64 dma_addr = 0;

	dn = pci_device_to_OF_node(dev);
	pcidn = PCI_DN(dn);
	spin_lock(&direct_window_list_lock);
	/* check if we already created a window and dupe that config if so */
	list_for_each_entry(window, &direct_window_list, list) {
		if (window->device == pdn) {
			direct64 = window->prop;
			dma_addr = direct64->dma_base;
			break;
		}
	}
	spin_unlock(&direct_window_list_lock);

	return dma_addr;
}

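/*
 * After a kexec, the direct-window property may already be present in
 * the device tree; adopt the existing window instead of creating a new
 * one.  Returns the window's DMA base address, or 0 if none was found.
 */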
static u64 dupe_ddw_if_kexec(struct pci_dev *dev, struct device_node *pdn)
{
	struct device_node *dn;
	struct pci_dn *pcidn;
	int len;
	struct direct_window *window;
	const struct dynamic_dma_window_prop *direct64;
	u64 dma_addr = 0;

	dn = pci_device_to_OF_node(dev);
	pcidn = PCI_DN(dn);
	direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
	if (direct64) {
		window = kzalloc(sizeof(*window), GFP_KERNEL);
		if (!window) {
			remove_ddw(pdn);
		} else {
			window->device = pdn;
			window->prop = direct64;
			spin_lock(&direct_window_list_lock);
			list_add(&window->list, &direct_window_list);
			spin_unlock(&direct_window_list_lock);
			dma_addr = direct64->dma_base;
		}
	}

	return dma_addr;
}

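/* Query the PE's DDW capabilities via the ibm,query-pe-dma-window RTAS call. */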
static int query_ddw(struct pci_dev *dev, const u32 *ddr_avail,
			struct ddw_query_response *query)
{
	struct device_node *dn;
	struct pci_dn *pcidn;
	u32 cfg_addr;
	u64 buid;
	int ret;

	/*
	 * Get the config address and phb buid of the PE window.
	 * Rely on eeh to retrieve this for us.
	 * Retrieve them from the pci device, not the node with the
	 * dma-window property
	 */
	dn = pci_device_to_OF_node(dev);
	pcidn = PCI_DN(dn);
	cfg_addr = pcidn->eeh_config_addr;
	if (pcidn->eeh_pe_config_addr)
		cfg_addr = pcidn->eeh_pe_config_addr;
	buid = pcidn->phb->buid;
	ret = rtas_call(ddr_avail[0], 3, 5, (u32 *)query,
		  cfg_addr, BUID_HI(buid), BUID_LO(buid));
	dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x"
		" returned %d\n", ddr_avail[0], cfg_addr, BUID_HI(buid),
		BUID_LO(buid), ret);
	return ret;
}

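/*
 * Create a dynamic DMA window via the ibm,create-pe-dma-window RTAS
 * call, retrying for as long as RTAS reports a busy status.
 */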
static int create_ddw(struct pci_dev *dev, const u32 *ddr_avail,
			struct ddw_create_response *create, int page_shift,
			int window_shift)
{
	struct device_node *dn;
	struct pci_dn *pcidn;
	u32 cfg_addr;
	u64 buid;
	int ret;

	/*
	 * Get the config address and phb buid of the PE window.
	 * Rely on eeh to retrieve this for us.
	 * Retrieve them from the pci device, not the node with the
	 * dma-window property
	 */
	dn = pci_device_to_OF_node(dev);
	pcidn = PCI_DN(dn);
	cfg_addr = pcidn->eeh_config_addr;
	if (pcidn->eeh_pe_config_addr)
		cfg_addr = pcidn->eeh_pe_config_addr;
	buid = pcidn->phb->buid;

	do {
		/* extra outputs are LIOBN and dma-addr (hi, lo) */
		ret = rtas_call(ddr_avail[1], 5, 4, (u32 *)create, cfg_addr,
				BUID_HI(buid), BUID_LO(buid), page_shift, window_shift);
	} while (rtas_busy_delay(ret));
	dev_info(&dev->dev,
		"ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d "
		"(liobn = 0x%x starting addr = %x %x)\n", ddr_avail[1],
		 cfg_addr, BUID_HI(buid), BUID_LO(buid), page_shift,
		 window_shift, ret, create->liobn, create->addr_hi, create->addr_lo);

	return ret;
}

/*
 * If the PE supports dynamic dma windows, and there is space for a table
 * that can map all pages in a linear offset, then setup such a table,
 * and record the dma-offset in the struct device.
 *
 * dev: the pci device we are checking
 * pdn: the parent pe node with the ibm,dma-window property
 * Future: also check if we can remap the base window for our base page size
 *
 * returns the dma offset for use by dma_set_mask
 */
static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
{
	int len, ret;
	struct ddw_query_response query;
	struct ddw_create_response create;
	int page_shift;
	u64 dma_addr, max_addr;
	struct device_node *dn;
	const u32 *uninitialized_var(ddr_avail);
	struct direct_window *window;
	struct property *uninitialized_var(win64);
	struct dynamic_dma_window_prop *ddwprop;

	mutex_lock(&direct_window_init_mutex);

	dma_addr = dupe_ddw_if_already_created(dev, pdn);
	if (dma_addr != 0)
		goto out_unlock;

	dma_addr = dupe_ddw_if_kexec(dev, pdn);
	if (dma_addr != 0)
		goto out_unlock;

	/*
	 * the ibm,ddw-applicable property holds the tokens for:
	 * ibm,query-pe-dma-window
	 * ibm,create-pe-dma-window
	 * ibm,remove-pe-dma-window
	 * for the given node in that order.
	 * the property is actually in the parent, not the PE
	 */
	ddr_avail = of_get_property(pdn, "ibm,ddw-applicable", &len);
	if (!ddr_avail || len < 3 * sizeof(u32))
		goto out_unlock;

	/*
	 * Query if there is a second window of size to map the
	 * whole partition.  Query returns number of windows, largest
	 * block assigned to PE (partition endpoint), and two bitmasks
	 * of page sizes: supported and supported for migrate-dma.
	 */
	dn = pci_device_to_OF_node(dev);
	ret = query_ddw(dev, ddr_avail, &query);
	if (ret != 0)
		goto out_unlock;

	if (query.windows_available == 0) {
		/*
		 * no additional windows are available for this device.
		 * We might be able to reallocate the existing window,
		 * trading in for a larger page size.
		 */
		dev_dbg(&dev->dev, "no free dynamic windows");
		goto out_unlock;
	}
	if (query.page_size & 4) {
		page_shift = 24; /* 16MB */
	} else if (query.page_size & 2) {
		page_shift = 16; /* 64kB */
	} else if (query.page_size & 1) {
		page_shift = 12; /* 4kB */
	} else {
		dev_dbg(&dev->dev, "no supported direct page size in mask %x",
			  query.page_size);
		goto out_unlock;
	}
	/* verify the window * number of ptes will map the partition */
	/* check largest block * page size > max memory hotplug addr */
	max_addr = memory_hotplug_max();
	if (query.largest_available_block < (max_addr >> page_shift)) {
		dev_dbg(&dev->dev, "can't map partition max 0x%llx with %u "
			  "%llu-sized pages\n", max_addr, query.largest_available_block,
			  1ULL << page_shift);
		goto out_unlock;
	}
	len = order_base_2(max_addr);
	win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
	if (!win64) {
		dev_info(&dev->dev,
			"couldn't allocate property for 64bit dma window\n");
		goto out_unlock;
	}
	win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
	win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
	if (!win64->name || !win64->value) {
		dev_info(&dev->dev,
			"couldn't allocate property name and value\n");
		goto out_free_prop;
	}

	ret = create_ddw(dev, ddr_avail, &create, page_shift, len);
	if (ret != 0)
		goto out_free_prop;

	ddwprop->liobn = cpu_to_be32(create.liobn);
	ddwprop->dma_base = cpu_to_be64(of_read_number(&create.addr_hi, 2));
	ddwprop->tce_shift = cpu_to_be32(page_shift);
	ddwprop->window_shift = cpu_to_be32(len);

	dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %s\n",
		  create.liobn, dn->full_name);

	window = kzalloc(sizeof(*window), GFP_KERNEL);
	if (!window)
		goto out_clear_window;

	ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
			win64->value, tce_setrange_multi_pSeriesLP_walk);
	if (ret) {
		dev_info(&dev->dev, "failed to map direct window for %s: %d\n",
			 dn->full_name, ret);
		goto out_clear_window;
	}

	ret = prom_add_property(pdn, win64);
	if (ret) {
		dev_err(&dev->dev, "unable to add dma window property for %s: %d",
			 pdn->full_name, ret);
		goto out_clear_window;
	}

	window->device = pdn;
	window->prop = ddwprop;
	spin_lock(&direct_window_list_lock);
	list_add(&window->list, &direct_window_list);
	spin_unlock(&direct_window_list_lock);

	dma_addr = of_read_number(&create.addr_hi, 2);
	goto out_unlock;

out_clear_window:
	remove_ddw(pdn);

out_free_prop:
	kfree(win64->name);
	kfree(win64->value);
	kfree(win64);

out_unlock:
	mutex_unlock(&direct_window_init_mutex);
	return dma_addr;
}

static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
{
	struct device_node *pdn, *dn;
	struct iommu_table *tbl;
	const void *dma_window = NULL;
	struct pci_dn *pci;

	pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev));

	/* dev setup for LPAR is a little tricky, since the device tree might
	 * contain the dma-window properties per-device and not necessarily
	 * for the bus. So we need to search upwards in the tree until we
	 * either hit a dma-window property, OR find a parent with a table
	 * already allocated.
	 */
	dn = pci_device_to_OF_node(dev);
	pr_debug("  node is %s\n", dn->full_name);

	for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;
	     pdn = pdn->parent) {
		dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
		if (dma_window)
			break;
	}

	if (!pdn || !PCI_DN(pdn)) {
		printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: "
		       "no DMA window found for pci dev=%s dn=%s\n",
				 pci_name(dev), dn ? dn->full_name : "<null>");
		return;
	}
	pr_debug("  parent is %s\n", pdn->full_name);

	pci = PCI_DN(pdn);
	if (!pci->iommu_table) {
		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
				   pci->phb->node);
		iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
		pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
		pr_debug("  created table: %p\n", pci->iommu_table);
	} else {
		pr_debug("  found DMA window, table: %p\n", pci->iommu_table);
	}

	set_iommu_table_base(&dev->dev, pci->iommu_table);
}

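/*
 * dma_set_mask hook: for a 64-bit mask, try to enable a direct DDW
 * mapping and switch the device to dma_direct_ops; otherwise fall back
 * to 32-bit DMA through the iommu.
 */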
static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask)
{
	bool ddw_enabled = false;
	struct device_node *pdn, *dn;
	struct pci_dev *pdev;
	const void *dma_window = NULL;
	u64 dma_offset;

	if (!dev->dma_mask || !dma_supported(dev, dma_mask))
		return -EIO;

	/* only attempt to use a new window if 64-bit DMA is requested */
	if (!disable_ddw && dma_mask == DMA_BIT_MASK(64)) {
		pdev = to_pci_dev(dev);

		dn = pci_device_to_OF_node(pdev);
		dev_dbg(dev, "node is %s\n", dn->full_name);

		/*
		 * the device tree might contain the dma-window properties
		 * per-device and not necessarily for the bus. So we need to
		 * search upwards in the tree until we either hit a dma-window
		 * property, OR find a parent with a table already allocated.
		 */
		for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;
				pdn = pdn->parent) {
			dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
			if (dma_window)
				break;
		}
		if (pdn && PCI_DN(pdn)) {
			dma_offset = enable_ddw(pdev, pdn);
			if (dma_offset != 0) {
				dev_info(dev, "Using 64-bit direct DMA at offset %llx\n", dma_offset);
				set_dma_offset(dev, dma_offset);
				set_dma_ops(dev, &dma_direct_ops);
				ddw_enabled = true;
			}
		}
	}

	/* fall-through to iommu ops */
	if (!ddw_enabled) {
		dev_info(dev, "Using 32-bit DMA via iommu\n");
		set_dma_ops(dev, &dma_iommu_ops);
	}

	*dev->dma_mask = dma_mask;
	return 0;
}

#else  /* CONFIG_PCI */
#define pci_dma_bus_setup_pSeries	NULL
#define pci_dma_dev_setup_pSeries	NULL
#define pci_dma_bus_setup_pSeriesLP	NULL
#define pci_dma_dev_setup_pSeriesLP	NULL
#define dma_set_mask_pSeriesLP		NULL
#endif /* !CONFIG_PCI */

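/*
 * Memory hotplug notifier: keep every direct window's linear mapping in
 * step with memory coming online or going offline.
 */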
static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
		void *data)
{
	struct direct_window *window;
	struct memory_notify *arg = data;
	int ret = 0;

	switch (action) {
	case MEM_GOING_ONLINE:
		spin_lock(&direct_window_list_lock);
		list_for_each_entry(window, &direct_window_list, list) {
			ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn,
					arg->nr_pages, window->prop);
			/* XXX log error */
		}
		spin_unlock(&direct_window_list_lock);
		break;
	case MEM_CANCEL_ONLINE:
	case MEM_OFFLINE:
		spin_lock(&direct_window_list_lock);
		list_for_each_entry(window, &direct_window_list, list) {
			ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn,
					arg->nr_pages, window->prop);
			/* XXX log error */
		}
		spin_unlock(&direct_window_list_lock);
		break;
	default:
		break;
	}
	if (ret && action != MEM_CANCEL_ONLINE)
		return NOTIFY_BAD;

	return NOTIFY_OK;
}

static struct notifier_block iommu_mem_nb = {
	.notifier_call = iommu_mem_notifier,
};

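/*
 * Device-tree reconfiguration notifier: on node removal, free the
 * node's iommu table and drop its direct-window bookkeeping.
 */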
static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node)
{
	int err = NOTIFY_OK;
	struct device_node *np = node;
	struct pci_dn *pci = PCI_DN(np);
	struct direct_window *window;

	switch (action) {
	case PSERIES_RECONFIG_REMOVE:
		if (pci && pci->iommu_table)
			iommu_free_table(pci->iommu_table, np->full_name);

		spin_lock(&direct_window_list_lock);
		list_for_each_entry(window, &direct_window_list, list) {
			if (window->device == np) {
				list_del(&window->list);
				kfree(window);
				break;
			}
		}
		spin_unlock(&direct_window_list_lock);

		/*
		 * Because the notifier runs after isolation of the
		 * slot, we are guaranteed any DMA window has already
		 * been revoked and the TCEs have been marked invalid,
		 * so we don't need a call to remove_ddw(np). However,
		 * if an additional notifier action is added before the
		 * isolate call, we should update this code for
		 * completeness with such a call.
		 */
		break;
	default:
		err = NOTIFY_DONE;
		break;
	}
	return err;
}

static struct notifier_block iommu_reconfig_nb = {
	.notifier_call = iommu_reconfig_notifier,
};

/* These are called very early. */
void iommu_init_early_pSeries(void)
{
	if (of_chosen && of_get_property(of_chosen, "linux,iommu-off", NULL))
		return;

	if (firmware_has_feature(FW_FEATURE_LPAR)) {
		if (firmware_has_feature(FW_FEATURE_MULTITCE)) {
			ppc_md.tce_build = tce_buildmulti_pSeriesLP;
			ppc_md.tce_free	 = tce_freemulti_pSeriesLP;
		} else {
			ppc_md.tce_build = tce_build_pSeriesLP;
			ppc_md.tce_free	 = tce_free_pSeriesLP;
		}
		ppc_md.tce_get   = tce_get_pSeriesLP;
		ppc_md.pci_dma_bus_setup = pci_dma_bus_setup_pSeriesLP;
		ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_pSeriesLP;
		ppc_md.dma_set_mask = dma_set_mask_pSeriesLP;
	} else {
		ppc_md.tce_build = tce_build_pSeries;
		ppc_md.tce_free  = tce_free_pSeries;
		ppc_md.tce_get   = tce_get_pseries;
		ppc_md.pci_dma_bus_setup = pci_dma_bus_setup_pSeries;
		ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_pSeries;
	}

	pSeries_reconfig_notifier_register(&iommu_reconfig_nb);
	register_memory_notifier(&iommu_mem_nb);

	set_pci_dma_ops(&dma_iommu_ops);
}

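/*
 * "multitce=off" on the kernel command line forces the single-TCE
 * H_PUT_TCE path even when firmware advertises multi-TCE support.
 */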
static int __init disable_multitce(char *str)
{
	if (strcmp(str, "off") == 0 &&
	    firmware_has_feature(FW_FEATURE_LPAR) &&
	    firmware_has_feature(FW_FEATURE_MULTITCE)) {
		printk(KERN_INFO "Disabling MULTITCE firmware feature\n");
		ppc_md.tce_build = tce_build_pSeriesLP;
		ppc_md.tce_free	 = tce_free_pSeriesLP;
		powerpc_firmware_features &= ~FW_FEATURE_MULTITCE;
	}
	return 1;
}

__setup("multitce=", disable_multitce);
1205