balloon.c revision 83be7e52d46a5b3a9955a38a9597bf1de1851ea7
/******************************************************************************
 * Xen balloon driver - enables returning/claiming memory to/from Xen.
 *
 * Copyright (c) 2003, B Dragovic
 * Copyright (c) 2003-2004, M Williamson, K Fraser
 * Copyright (c) 2005 Dan M. Smith, IBM Corporation
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/gfp.h>

#include <asm/page.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/tlb.h>
#include <asm/e820.h>

#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/interface/xen.h>
#include <xen/interface/memory.h>
#include <xen/balloon.h>
#include <xen/features.h>
#include <xen/page.h>

/*
 * balloon_process() state:
 *
 * BP_DONE: done or nothing to do,
 * BP_EAGAIN: error, go to sleep,
 * BP_ECANCELED: error, balloon operation canceled.
 */

enum bp_state {
	BP_DONE,
	BP_EAGAIN,
	BP_ECANCELED
};

static DEFINE_MUTEX(balloon_mutex);

struct balloon_stats balloon_stats;
EXPORT_SYMBOL_GPL(balloon_stats);

/* We increase/decrease in batches which fit in a page */
static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];

#ifdef CONFIG_HIGHMEM
#define inc_totalhigh_pages() (totalhigh_pages++)
#define dec_totalhigh_pages() (totalhigh_pages--)
#else
#define inc_totalhigh_pages() do {} while (0)
#define dec_totalhigh_pages() do {} while (0)
#endif

/* List of ballooned pages, threaded through the mem_map array. */
static LIST_HEAD(ballooned_pages);

/* Main work function, always executed in process context. */
static void balloon_process(struct work_struct *work);
static DECLARE_DELAYED_WORK(balloon_worker, balloon_process);

/*
 * When ballooning out (allocating memory to return to Xen) we don't really
 * want the kernel to try too hard since that can trigger the oom killer.
 */
#define GFP_BALLOON \
	(GFP_HIGHUSER | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC)

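/*
 * Scrub pages before handing them back to Xen so that stale guest data
 * is never exposed to other domains.  Compiled away unless
 * CONFIG_XEN_SCRUB_PAGES is enabled.
 */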
static void scrub_page(struct page *page)
{
#ifdef CONFIG_XEN_SCRUB_PAGES
	clear_highpage(page);
#endif
}

/* balloon_append: add the given page to the balloon. */
static void __balloon_append(struct page *page)
{
	/* Lowmem is re-populated first, so highmem pages go at list tail. */
	if (PageHighMem(page)) {
		list_add_tail(&page->lru, &ballooned_pages);
		balloon_stats.balloon_high++;
		dec_totalhigh_pages();
	} else {
		list_add(&page->lru, &ballooned_pages);
		balloon_stats.balloon_low++;
	}
}

/* As __balloon_append(), but also take the page out of totalram_pages. */
static void balloon_append(struct page *page)
{
	__balloon_append(page);
	totalram_pages--;
}

/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
static struct page *balloon_retrieve(bool prefer_highmem)
{
	struct page *page;

	if (list_empty(&ballooned_pages))
		return NULL;

	if (prefer_highmem)
		page = list_entry(ballooned_pages.prev, struct page, lru);
	else
		page = list_entry(ballooned_pages.next, struct page, lru);
	list_del(&page->lru);

	if (PageHighMem(page)) {
		balloon_stats.balloon_high--;
		inc_totalhigh_pages();
	} else
		balloon_stats.balloon_low--;

	totalram_pages++;

	return page;
}

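/* Helpers for walking the ballooned-pages list without removing entries. */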
static struct page *balloon_first_page(void)
{
	if (list_empty(&ballooned_pages))
		return NULL;
	return list_entry(ballooned_pages.next, struct page, lru);
}

static struct page *balloon_next_page(struct page *page)
{
	struct list_head *next = page->lru.next;

	if (next == &ballooned_pages)
		return NULL;
	return list_entry(next, struct page, lru);
}

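/*
 * Work out whether balloon_process() should run again, backing off
 * exponentially on failure: a successful pass resets the delay and the
 * retry counter, a failed pass doubles the delay (capped at
 * max_schedule_delay) and cancels the operation outright once
 * max_retry_count consecutive failures have accumulated.
 */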
static enum bp_state update_schedule(enum bp_state state)
{
	if (state == BP_DONE) {
		balloon_stats.schedule_delay = 1;
		balloon_stats.retry_count = 1;
		return BP_DONE;
	}

	++balloon_stats.retry_count;

	if (balloon_stats.max_retry_count != RETRY_UNLIMITED &&
			balloon_stats.retry_count > balloon_stats.max_retry_count) {
		balloon_stats.schedule_delay = 1;
		balloon_stats.retry_count = 1;
		return BP_ECANCELED;
	}

	balloon_stats.schedule_delay <<= 1;

	if (balloon_stats.schedule_delay > balloon_stats.max_schedule_delay)
		balloon_stats.schedule_delay = balloon_stats.max_schedule_delay;

	return BP_EAGAIN;
}

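/*
 * Number of pages the balloon still needs to add (positive) or remove
 * (negative) to reach the target.  The target is clamped so that we
 * never try to grow past current_pages plus everything currently held
 * in the balloon.
 */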
static long current_credit(void)
{
	unsigned long target = balloon_stats.target_pages;

	target = min(target,
		     balloon_stats.current_pages +
		     balloon_stats.balloon_low +
		     balloon_stats.balloon_high);

	return target - balloon_stats.current_pages;
}

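/*
 * Give memory back to the guest: batch up to a page worth of ballooned
 * PFNs into frame_list, ask Xen to back them via
 * XENMEM_populate_physmap, then wire each granted frame into the P2M
 * (and, for PV lowmem, back into the linear mapping) before releasing
 * the page to the allocator.  Xen may populate fewer extents than
 * requested, which is why the second loop runs to rc, not nr_pages.
 */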
static enum bp_state increase_reservation(unsigned long nr_pages)
{
	int rc;
	unsigned long  pfn, i;
	struct page   *page;
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid        = DOMID_SELF
	};

	if (nr_pages > ARRAY_SIZE(frame_list))
		nr_pages = ARRAY_SIZE(frame_list);

	page = balloon_first_page();
	for (i = 0; i < nr_pages; i++) {
		if (!page) {
			nr_pages = i;
			break;
		}
		frame_list[i] = page_to_pfn(page);
		page = balloon_next_page(page);
	}

	set_xen_guest_handle(reservation.extent_start, frame_list);
	reservation.nr_extents = nr_pages;
	rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
	if (rc <= 0)
		return BP_EAGAIN;

	for (i = 0; i < rc; i++) {
		page = balloon_retrieve(false);
		BUG_ON(page == NULL);

		pfn = page_to_pfn(page);
		BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
		       phys_to_machine_mapping_valid(pfn));

		set_phys_to_machine(pfn, frame_list[i]);

		/* Link back into the page tables if not highmem. */
		if (xen_pv_domain() && !PageHighMem(page)) {
			int ret;
			ret = HYPERVISOR_update_va_mapping(
				(unsigned long)__va(pfn << PAGE_SHIFT),
				mfn_pte(frame_list[i], PAGE_KERNEL),
				0);
			BUG_ON(ret);
		}

		/* Relinquish the page back to the allocator. */
		ClearPageReserved(page);
		init_page_count(page);
		__free_page(page);
	}

	balloon_stats.current_pages += rc;

	return BP_DONE;
}

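/*
 * Take memory away from the guest: allocate pages and unmap them from
 * the PV linear mapping, flush stale kmaps and TLB entries once for the
 * whole batch, invalidate the corresponding P2M entries, and finally
 * return the frames to Xen in a single XENMEM_decrease_reservation
 * hypercall.
 */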
static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
{
	enum bp_state state = BP_DONE;
	unsigned long  pfn, i;
	struct page   *page;
	int ret;
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid        = DOMID_SELF
	};

	if (nr_pages > ARRAY_SIZE(frame_list))
		nr_pages = ARRAY_SIZE(frame_list);

	for (i = 0; i < nr_pages; i++) {
		page = alloc_page(gfp);
		if (page == NULL) {
			nr_pages = i;
			state = BP_EAGAIN;
			break;
		}

		pfn = page_to_pfn(page);
		frame_list[i] = pfn_to_mfn(pfn);

		scrub_page(page);

		if (xen_pv_domain() && !PageHighMem(page)) {
			ret = HYPERVISOR_update_va_mapping(
				(unsigned long)__va(pfn << PAGE_SHIFT),
				__pte_ma(0), 0);
			BUG_ON(ret);
		}
	}

	/* Ensure that ballooned highmem pages don't have kmaps. */
	kmap_flush_unused();
	flush_tlb_all();

	/* No more mappings: invalidate P2M and add to balloon. */
	for (i = 0; i < nr_pages; i++) {
		pfn = mfn_to_pfn(frame_list[i]);
		__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
		balloon_append(pfn_to_page(pfn));
	}

	set_xen_guest_handle(reservation.extent_start, frame_list);
	reservation.nr_extents   = nr_pages;
	ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
	BUG_ON(ret != nr_pages);

	balloon_stats.current_pages -= nr_pages;

	return state;
}

/*
 * The balloon mutex prevents multiple instances of the worker from
 * running concurrently.  We may still race with updates of the target
 * (balloon_set_new_target() writes it without the mutex), but any
 * discrepancy is corrected on the next pass.
 */
static void balloon_process(struct work_struct *work)
{
	enum bp_state state = BP_DONE;
	long credit;

	mutex_lock(&balloon_mutex);

	do {
		credit = current_credit();

		if (credit > 0)
			state = increase_reservation(credit);

		if (credit < 0)
			state = decrease_reservation(-credit, GFP_BALLOON);

		state = update_schedule(state);

#ifndef CONFIG_PREEMPT
		if (need_resched())
			schedule();
#endif
	} while (credit && state == BP_DONE);

	/* Schedule more work if there is some still to be done. */
	if (state == BP_EAGAIN)
		schedule_delayed_work(&balloon_worker,
				      balloon_stats.schedule_delay * HZ);

	mutex_unlock(&balloon_mutex);
}

/* Set a new balloon target and kick off processing. */
void balloon_set_new_target(unsigned long target)
{
	/* No need for a lock: this is not a read-modify-write update. */
	balloon_stats.target_pages = target;
	schedule_delayed_work(&balloon_worker, 0);
}
EXPORT_SYMBOL_GPL(balloon_set_new_target);

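/*
 * A minimal usage sketch (hypothetical caller, error handling elided):
 * a driver that needs page structures without enlarging the guest's
 * memory footprint, e.g. to map foreign frames into, might do:
 *
 *	struct page *pages[16];
 *
 *	if (alloc_xenballooned_pages(16, pages) == 0) {
 *		... map foreign or grant frames into pages[] ...
 *		free_xenballooned_pages(16, pages);
 *	}
 */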
/**
 * alloc_xenballooned_pages - get pages that have been ballooned out
 * @nr_pages: Number of pages to get
 * @pages: pages returned
 * @return 0 on success, error otherwise
 */
int alloc_xenballooned_pages(int nr_pages, struct page **pages)
{
	int pgno = 0;
	struct page *page;

	mutex_lock(&balloon_mutex);
	while (pgno < nr_pages) {
		page = balloon_retrieve(true);
		if (page) {
			pages[pgno++] = page;
		} else {
			enum bp_state st;
			/*
			 * Balloon out more pages so that the next
			 * balloon_retrieve() succeeds.
			 */
			st = decrease_reservation(nr_pages - pgno, GFP_HIGHUSER);
			if (st != BP_DONE)
				goto out_undo;
		}
	}
	mutex_unlock(&balloon_mutex);
	return 0;
 out_undo:
	while (pgno)
		balloon_append(pages[--pgno]);
	/* Free the memory back to the kernel soon */
	schedule_delayed_work(&balloon_worker, 0);
	mutex_unlock(&balloon_mutex);
	return -ENOMEM;
}
EXPORT_SYMBOL(alloc_xenballooned_pages);

/**
 * free_xenballooned_pages - return pages retrieved with alloc_xenballooned_pages
 * @nr_pages: Number of pages
 * @pages: pages to return
 */
void free_xenballooned_pages(int nr_pages, struct page **pages)
{
	int i;

	mutex_lock(&balloon_mutex);

	for (i = 0; i < nr_pages; i++) {
		if (pages[i])
			balloon_append(pages[i]);
	}

	/* The balloon may be too large now. Shrink it if needed. */
	if (current_credit())
		schedule_delayed_work(&balloon_worker, 0);

	mutex_unlock(&balloon_mutex);
}
EXPORT_SYMBOL(free_xenballooned_pages);

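/*
 * Start-of-day accounting: for PV guests the initial population is the
 * page count Xen handed us at boot (clamped in case mem= shrank
 * max_pfn); PFNs between there and the end of the extra memory region
 * begin life in the balloon.
 */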
static int __init balloon_init(void)
{
	unsigned long pfn, extra_pfn_end;
	struct page *page;

	if (!xen_domain())
		return -ENODEV;

	pr_info("xen/balloon: Initialising balloon driver.\n");

	balloon_stats.current_pages = xen_pv_domain() ?
		min(xen_start_info->nr_pages, max_pfn) : max_pfn;
	balloon_stats.target_pages  = balloon_stats.current_pages;
	balloon_stats.balloon_low   = 0;
	balloon_stats.balloon_high  = 0;

	balloon_stats.schedule_delay = 1;
	balloon_stats.max_schedule_delay = 32;
	balloon_stats.retry_count = 1;
	balloon_stats.max_retry_count = RETRY_UNLIMITED;

	/*
	 * Initialise the balloon with excess memory space.  We need
	 * to make sure we don't add memory which doesn't physically or
	 * logically exist.  The E820 map can be trimmed to be smaller
	 * than the amount of physical memory due to the mem= command
	 * line parameter.  And if this is a 32-bit non-HIGHMEM kernel
	 * on a system with memory which requires highmem to access,
	 * don't try to use it.
	 */
	extra_pfn_end = min(min(max_pfn, e820_end_of_ram_pfn()),
			    (unsigned long)PFN_DOWN(xen_extra_mem_start +
						    xen_extra_mem_size));
	for (pfn = PFN_UP(xen_extra_mem_start);
	     pfn < extra_pfn_end;
	     pfn++) {
		page = pfn_to_page(pfn);
		/*
		 * totalram_pages doesn't include the boot-time balloon
		 * extension, so don't subtract from it.
		 */
		__balloon_append(page);
	}

	return 0;
}

subsys_initcall(balloon_init);

MODULE_LICENSE("GPL");