kvm.c revision 631bc4878220932fe67fc46fc7cf7cccdb1ec597
/*
 * KVM paravirt_ops implementation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 * Copyright IBM Corporation, 2007
 *   Authors: Anthony Liguori <aliguori@us.ibm.com>
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kvm_para.h>
#include <linux/cpu.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/hash.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/kprobes.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
#include <asm/desc.h>

#define MMU_QUEUE_SIZE 1024

static int kvmapf = 1;

static int parse_no_kvmapf(char *arg)
{
	kvmapf = 0;
	return 0;
}

early_param("no-kvmapf", parse_no_kvmapf);

struct kvm_para_state {
	u8 mmu_queue[MMU_QUEUE_SIZE];
	int mmu_queue_len;
};

static DEFINE_PER_CPU(struct kvm_para_state, para_state);
static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);

static struct kvm_para_state *kvm_para_state(void)
{
	return &per_cpu(para_state, raw_smp_processor_id());
}

/*
 * No need for any "IO delay" on KVM
 */
static void kvm_io_delay(void)
{
}

#define KVM_TASK_SLEEP_HASHBITS 8
#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)

struct kvm_task_sleep_node {
	struct hlist_node link;
	wait_queue_head_t wq;
	u32 token;
	int cpu;
};

static struct kvm_task_sleep_head {
	spinlock_t lock;
	struct hlist_head list;
} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];

static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
						  u32 token)
{
	struct hlist_node *p;

	hlist_for_each(p, &b->list) {
		struct kvm_task_sleep_node *n =
			hlist_entry(p, typeof(*n), link);
		if (n->token == token)
			return n;
	}

	return NULL;
}

void kvm_async_pf_task_wait(u32 token)
{
	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
	struct kvm_task_sleep_node n, *e;
	DEFINE_WAIT(wait);

	spin_lock(&b->lock);
	e = _find_apf_task(b, token);
	if (e) {
		/* dummy entry exists -> wake up was delivered ahead of PF */
		hlist_del(&e->link);
		kfree(e);
		spin_unlock(&b->lock);
		return;
	}

	n.token = token;
	n.cpu = smp_processor_id();
	init_waitqueue_head(&n.wq);
	hlist_add_head(&n.link, &b->list);
	spin_unlock(&b->lock);

	for (;;) {
		prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
		if (hlist_unhashed(&n.link))
			break;
		local_irq_enable();
		schedule();
		local_irq_disable();
	}
	finish_wait(&n.wq, &wait);

	return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
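/*
 * Wake-up side of the async PF rendezvous: the host reports "page ready"
 * with the same token it used for "page not present".  If a sleeper with
 * that token is already hashed, it is woken; if the wake-up arrives before
 * the corresponding fault is handled, a dummy node is left behind so that
 * kvm_async_pf_task_wait() can return immediately.
 */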
static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
	hlist_del_init(&n->link);
	if (waitqueue_active(&n->wq))
		wake_up(&n->wq);
}

static void apf_task_wake_all(void)
{
	int i;

	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
		struct hlist_node *p, *next;
		struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
		spin_lock(&b->lock);
		hlist_for_each_safe(p, next, &b->list) {
			struct kvm_task_sleep_node *n =
				hlist_entry(p, typeof(*n), link);
			if (n->cpu == smp_processor_id())
				apf_task_wake_one(n);
		}
		spin_unlock(&b->lock);
	}
}

void kvm_async_pf_task_wake(u32 token)
{
	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
	struct kvm_task_sleep_node *n;

	if (token == ~0) {
		apf_task_wake_all();
		return;
	}

again:
	spin_lock(&b->lock);
	n = _find_apf_task(b, token);
	if (!n) {
		/*
		 * async PF was not yet handled.
		 * Add dummy entry for the token.
		 */
		n = kmalloc(sizeof(*n), GFP_ATOMIC);
		if (!n) {
			/*
			 * Allocation failed! Busy wait while other cpu
			 * handles async PF.
			 */
			spin_unlock(&b->lock);
			cpu_relax();
			goto again;
		}
		n->token = token;
		n->cpu = smp_processor_id();
		init_waitqueue_head(&n->wq);
		hlist_add_head(&n->link, &b->list);
	} else
		apf_task_wake_one(n);
	spin_unlock(&b->lock);
	return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);

u32 kvm_read_and_reset_pf_reason(void)
{
	u32 reason = 0;

	if (__get_cpu_var(apf_reason).enabled) {
		reason = __get_cpu_var(apf_reason).reason;
		__get_cpu_var(apf_reason).reason = 0;
	}

	return reason;
}
EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
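/*
 * Replacement #PF handler: when async PF is enabled, the host writes the
 * fault reason into the per-cpu apf_reason area before injecting the
 * exception.  A reason of zero means an ordinary page fault, which is passed
 * on to do_page_fault(); the two KVM_PV_REASON_* values carry the async PF
 * token in CR2 instead of a fault address.
 */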
dotraplinkage void __kprobes
do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	switch (kvm_read_and_reset_pf_reason()) {
	default:
		do_page_fault(regs, error_code);
		break;
	case KVM_PV_REASON_PAGE_NOT_PRESENT:
		/* page is swapped out by the host. */
		kvm_async_pf_task_wait((u32)read_cr2());
		break;
	case KVM_PV_REASON_PAGE_READY:
		kvm_async_pf_task_wake((u32)read_cr2());
		break;
	}
}

static void kvm_mmu_op(void *buffer, unsigned len)
{
	int r;
	unsigned long a1, a2;

	do {
		a1 = __pa(buffer);
		a2 = 0;   /* on i386 __pa() always returns <4G */
		r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2);
		buffer += r;
		len -= r;
	} while (len);
}

static void mmu_queue_flush(struct kvm_para_state *state)
{
	if (state->mmu_queue_len) {
		kvm_mmu_op(state->mmu_queue, state->mmu_queue_len);
		state->mmu_queue_len = 0;
	}
}
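/*
 * MMU write operations are either issued to the host immediately or, while
 * the CPU is in PARAVIRT_LAZY_MMU mode, appended to the per-cpu mmu_queue
 * and handed over in a single KVM_HC_MMU_OP hypercall when the queue fills
 * up or lazy mode is left (see kvm_leave_lazy_mmu() below).
 */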
static void kvm_deferred_mmu_op(void *buffer, int len)
{
	struct kvm_para_state *state = kvm_para_state();

	if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) {
		kvm_mmu_op(buffer, len);
		return;
	}
	if (state->mmu_queue_len + len > sizeof state->mmu_queue)
		mmu_queue_flush(state);
	memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len);
	state->mmu_queue_len += len;
}

static void kvm_mmu_write(void *dest, u64 val)
{
	__u64 pte_phys;
	struct kvm_mmu_op_write_pte wpte;

#ifdef CONFIG_HIGHPTE
	struct page *page;
	unsigned long dst = (unsigned long) dest;

	page = kmap_atomic_to_page(dest);
	pte_phys = page_to_pfn(page);
	pte_phys <<= PAGE_SHIFT;
	pte_phys += (dst & ~(PAGE_MASK));
#else
	pte_phys = (unsigned long)__pa(dest);
#endif
	wpte.header.op = KVM_MMU_OP_WRITE_PTE;
	wpte.pte_val = val;
	wpte.pte_phys = pte_phys;

	kvm_deferred_mmu_op(&wpte, sizeof wpte);
}

/*
 * We only need to hook operations that are MMU writes.  We hook these so that
 * we can use lazy MMU mode to batch these operations.  We could probably
 * improve the performance of the host code if we used some of the information
 * here to simplify processing of batched writes.
 */
static void kvm_set_pte(pte_t *ptep, pte_t pte)
{
	kvm_mmu_write(ptep, pte_val(pte));
}

static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr,
			   pte_t *ptep, pte_t pte)
{
	kvm_mmu_write(ptep, pte_val(pte));
}

static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
	kvm_mmu_write(pmdp, pmd_val(pmd));
}

#if PAGETABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
{
	kvm_mmu_write(ptep, pte_val(pte));
}

static void kvm_pte_clear(struct mm_struct *mm,
			  unsigned long addr, pte_t *ptep)
{
	kvm_mmu_write(ptep, 0);
}

static void kvm_pmd_clear(pmd_t *pmdp)
{
	kvm_mmu_write(pmdp, 0);
}
#endif

static void kvm_set_pud(pud_t *pudp, pud_t pud)
{
	kvm_mmu_write(pudp, pud_val(pud));
}

#if PAGETABLE_LEVELS == 4
static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
	kvm_mmu_write(pgdp, pgd_val(pgd));
}
#endif
#endif /* PAGETABLE_LEVELS >= 3 */

static void kvm_flush_tlb(void)
{
	struct kvm_mmu_op_flush_tlb ftlb = {
		.header.op = KVM_MMU_OP_FLUSH_TLB,
	};

	kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
}

static void kvm_release_pt(unsigned long pfn)
{
	struct kvm_mmu_op_release_pt rpt = {
		.header.op = KVM_MMU_OP_RELEASE_PT,
		.pt_phys = (u64)pfn << PAGE_SHIFT,
	};

	kvm_mmu_op(&rpt, sizeof rpt);
}

static void kvm_enter_lazy_mmu(void)
{
	paravirt_enter_lazy_mmu();
}

static void kvm_leave_lazy_mmu(void)
{
	struct kvm_para_state *state = kvm_para_state();

	mmu_queue_flush(state);
	paravirt_leave_lazy_mmu();
}

static void __init paravirt_ops_setup(void)
{
	pv_info.name = "KVM";
	pv_info.paravirt_enabled = 1;

	if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
		pv_cpu_ops.io_delay = kvm_io_delay;

	if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
		pv_mmu_ops.set_pte = kvm_set_pte;
		pv_mmu_ops.set_pte_at = kvm_set_pte_at;
		pv_mmu_ops.set_pmd = kvm_set_pmd;
#if PAGETABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
		pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
		pv_mmu_ops.pte_clear = kvm_pte_clear;
		pv_mmu_ops.pmd_clear = kvm_pmd_clear;
#endif
		pv_mmu_ops.set_pud = kvm_set_pud;
#if PAGETABLE_LEVELS == 4
		pv_mmu_ops.set_pgd = kvm_set_pgd;
#endif
#endif
		pv_mmu_ops.flush_tlb_user = kvm_flush_tlb;
		pv_mmu_ops.release_pte = kvm_release_pt;
		pv_mmu_ops.release_pmd = kvm_release_pt;
		pv_mmu_ops.release_pud = kvm_release_pt;

		pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
		pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
	}
#ifdef CONFIG_X86_IO_APIC
	no_timer_check = 1;
#endif
}

void __cpuinit kvm_guest_cpu_init(void)
{
	if (!kvm_para_available())
		return;

	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
		u64 pa = __pa(&__get_cpu_var(apf_reason));

		wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
		__get_cpu_var(apf_reason).enabled = 1;
		printk(KERN_INFO"KVM setup async PF for cpu %d\n",
		       smp_processor_id());
	}
}

static void kvm_pv_disable_apf(void *unused)
{
	if (!__get_cpu_var(apf_reason).enabled)
		return;

	wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
	__get_cpu_var(apf_reason).enabled = 0;

	printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
	       smp_processor_id());
}
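/*
 * Async PF delivery is torn down (MSR_KVM_ASYNC_PF_EN cleared) on every CPU
 * before a reboot and whenever a CPU goes offline, so the host stops writing
 * into the per-cpu apf_reason area once the guest no longer expects
 * notifications there.
 */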
static int kvm_pv_reboot_notify(struct notifier_block *nb,
				unsigned long code, void *unused)
{
	if (code == SYS_RESTART)
		on_each_cpu(kvm_pv_disable_apf, NULL, 1);
	return NOTIFY_DONE;
}

static struct notifier_block kvm_pv_reboot_nb = {
	.notifier_call = kvm_pv_reboot_notify,
};

#ifdef CONFIG_SMP
static void __init kvm_smp_prepare_boot_cpu(void)
{
	WARN_ON(kvm_register_clock("primary cpu clock"));
	kvm_guest_cpu_init();
	native_smp_prepare_boot_cpu();
}

static void kvm_guest_cpu_online(void *dummy)
{
	kvm_guest_cpu_init();
}

static void kvm_guest_cpu_offline(void *dummy)
{
	kvm_pv_disable_apf(NULL);
	apf_task_wake_all();
}

static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
				    unsigned long action, void *hcpu)
{
	int cpu = (unsigned long)hcpu;
	switch (action) {
	case CPU_ONLINE:
	case CPU_DOWN_FAILED:
	case CPU_ONLINE_FROZEN:
		smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
	.notifier_call = kvm_cpu_notify,
};
#endif

static void __init kvm_apf_trap_init(void)
{
	set_intr_gate(14, &async_page_fault);
}

void __init kvm_guest_init(void)
{
	int i;

	if (!kvm_para_available())
		return;

	paravirt_ops_setup();
	register_reboot_notifier(&kvm_pv_reboot_nb);
	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
		spin_lock_init(&async_pf_sleepers[i].lock);
	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
		x86_init.irqs.trap_init = kvm_apf_trap_init;

#ifdef CONFIG_SMP
	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
	register_cpu_notifier(&kvm_cpu_notifier);
#else
	kvm_guest_cpu_init();
#endif
}