From 7b105ca2903b84f023c49965d9a511c5e55256dc Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sun, 15 May 2011 01:00:52 +0900 Subject: KVM: x86 emulator: Stop passing ctxt->ops as arg of emul functions Dereference it in the actual users. This not only cleans up the emulator but also makes it easy to convert the old emulation functions to the new em_xxx() form later. Note: Remove some inline keywords to let the compiler decide inlining. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 77c9d86..51df0b6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4513,7 +4513,7 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip + inc_eip; - ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq); + ret = emulate_int_real(&vcpu->arch.emulate_ctxt, irq); if (ret != X86EMUL_CONTINUE) return EMULATE_FAIL; -- cgit v1.1 From 8b0cedff040b652f3d36b1368778667581b0c140 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sun, 15 May 2011 23:22:04 +0800 Subject: KVM: use __copy_to_user/__clear_user to write guest page Simply use __copy_to_user/__clear_user to write guest page since we have already verified the user address when the memslot is set Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 51df0b6..0b08986 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1388,7 +1388,7 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) return 1; kvm_x86_ops->patch_hypercall(vcpu, instructions); ((unsigned char *)instructions)[3] = 0xc3; /* ret */ - if (copy_to_user((void __user *)addr, instructions, 4)) + if (__copy_to_user((void __user *)addr, instructions, 4)) return 1; kvm->arch.hv_hypercall = data; break; @@ -1415,7 +1415,7 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); if (kvm_is_error_hva(addr)) return 1; - if (clear_user((void __user *)addr, PAGE_SIZE)) + if (__clear_user((void __user *)addr, PAGE_SIZE)) return 1; vcpu->arch.hv_vapic = data; break; -- cgit v1.1 From 24c82e576b7860a4f02a21103e9df39e11e97006 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 May 2011 05:56:07 -0400 Subject: KVM: Sanitize cpuid Instead of blacklisting known-unsupported cpuid leaves, whitelist known- supported leaves. This is more conservative and prevents us from reporting features we don't support. Also whitelist a few more leaves while at it. Signed-off-by: Avi Kivity Acked-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0b08986..da48622 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2283,6 +2283,13 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags = 0; } +static bool supported_xcr0_bit(unsigned bit) +{ + u64 mask = ((u64)1 << bit); + + return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0; +} + #define F(x) bit(X86_FEATURE_##x) static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, @@ -2393,6 +2400,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } + case 9: + break; case 0xb: { int i, level_type; @@ -2414,7 +2423,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; for (i = 1; *nent < maxnent && i < 64; ++i) { - if (entry[i].eax == 0) + if (entry[i].eax == 0 || !supported_xcr0_bit(i)) continue; do_cpuid_1_ent(&entry[i], function, i); entry[i].flags |= @@ -2451,6 +2460,24 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx &= kvm_supported_word6_x86_features; cpuid_mask(&entry->ecx, 6); break; + case 0x80000008: { + unsigned g_phys_as = (entry->eax >> 16) & 0xff; + unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); + unsigned phys_as = entry->eax & 0xff; + + if (!g_phys_as) + g_phys_as = phys_as; + entry->eax = g_phys_as | (virt_as << 8); + entry->ebx = entry->edx = 0; + break; + } + case 0x80000019: + entry->ecx = entry->edx = 0; + break; + case 0x8000001a: + break; + case 0x8000001d: + break; /*Add support for Centaur's CPUID instruction*/ case 0xC0000000: /*Just support up to 0xC0000004 now*/ @@ -2460,10 +2487,16 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->edx &= kvm_supported_word5_x86_features; cpuid_mask(&entry->edx, 5); break; + case 3: /* Processor serial number */ + case 5: /* MONITOR/MWAIT */ + case 6: /* Thermal management */ + case 0xA: /* Architectural Performance Monitoring */ + case 0x80000007: /* Advanced power management */ case 0xC0000002: case 0xC0000003: case 0xC0000004: - /*Now nothing to do, reserved for the future*/ + default: + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } -- cgit v1.1 From d780592b99d7d8a5ff905f6bacca519d4a342c76 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 23 May 2011 10:33:05 +0200 Subject: KVM: Clean up error handling during VCPU creation So far kvm_arch_vcpu_setup is responsible for freeing the vcpu struct if it fails. Move this confusing resonsibility back into the hands of kvm_vm_ioctl_create_vcpu. Only kvm_arch_vcpu_setup of x86 is affected, all other archs cannot fail. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index da48622..aaa3735 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6126,12 +6126,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) if (r == 0) r = kvm_mmu_setup(vcpu); vcpu_put(vcpu); - if (r < 0) - goto free_vcpu; - return 0; -free_vcpu: - kvm_x86_ops->vcpu_free(vcpu); return r; } -- cgit v1.1 From adf52235b4082e67f31bf1fba36f1dce312633d6 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 25 May 2011 11:06:16 +0900 Subject: KVM: x86 emulator: Clean up init_emulate_ctxt() Use a local pointer to the emulate_ctxt for simplicity. Then, arrange the hard-to-read mode selection lines neatly. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index aaa3735..ae2353c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4508,7 +4508,8 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) static void init_emulate_ctxt(struct kvm_vcpu *vcpu) { - struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct decode_cache *c = &ctxt->decode; int cs_db, cs_l; /* @@ -4521,15 +4522,15 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); - vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); - vcpu->arch.emulate_ctxt.mode = - (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : - (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_VM86 : cs_l - ? X86EMUL_MODE_PROT64 : cs_db - ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - vcpu->arch.emulate_ctxt.guest_mode = is_guest_mode(vcpu); + ctxt->eflags = kvm_get_rflags(vcpu); + ctxt->eip = kvm_rip_read(vcpu); + ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : + (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 : + cs_l ? X86EMUL_MODE_PROT64 : + cs_db ? X86EMUL_MODE_PROT32 : + X86EMUL_MODE_PROT16; + ctxt->guest_mode = is_guest_mode(vcpu); + memset(c, 0, sizeof(struct decode_cache)); memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; -- cgit v1.1 From b5c9ff731f3cee5a2f2d7154f48f8006b48eb66d Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 25 May 2011 11:09:38 +0900 Subject: KVM: x86 emulator: Avoid clearing the whole decode_cache During tracing the emulator, we noticed that init_emulate_ctxt() sometimes took a bit longer time than we expected. This patch is for mitigating the problem by some degree. By looking into the function, we soon notice that it clears the whole decode_cache whose size is about 2.5K bytes now. Furthermore, most of the bytes are taken for the two read_cache arrays, which are used only by a few instructions. Considering the fact that we are not assuming the cache arrays have been cleared when we store actual data, we do not need to clear the arrays: 2K bytes elimination. In addition, we can avoid clearing the fetch_cache and regs arrays. This patch changes the initialization not to clear the arrays. On our 64-bit host, init_emulate_ctxt() becomes 0.3 to 0.5us faster with this patch applied. Signed-off-by: Takuya Yoshikawa Cc: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ae2353c..d88de56 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4506,6 +4506,20 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) kvm_queue_exception(vcpu, ctxt->exception.vector); } +static void init_decode_cache(struct decode_cache *c, + const unsigned long *regs) +{ + memset(c, 0, offsetof(struct decode_cache, regs)); + memcpy(c->regs, regs, sizeof(c->regs)); + + c->fetch.start = 0; + c->fetch.end = 0; + c->io_read.pos = 0; + c->io_read.end = 0; + c->mem_read.pos = 0; + c->mem_read.end = 0; +} + static void init_emulate_ctxt(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; @@ -4531,8 +4545,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) X86EMUL_MODE_PROT16; ctxt->guest_mode = is_guest_mode(vcpu); - memset(c, 0, sizeof(struct decode_cache)); - memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + init_decode_cache(c, vcpu->arch.regs); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; } -- cgit v1.1 From 5e1746d6205d1efa3193cc0c67aa2d15e54799bd Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:03:24 +0300 Subject: KVM: nVMX: Allow setting the VMXE bit in CR4 This patch allows the guest to enable the VMXE bit in CR4, which is a prerequisite to running VMXON. Whether to allow setting the VMXE bit now depends on the architecture (svm or vmx), so its checking has moved to kvm_x86_ops->set_cr4(). This function now returns an int: If kvm_x86_ops->set_cr4() returns 1, __kvm_set_cr4() will also return 1, and this will cause kvm_set_cr4() will throw a #GP. Turning on the VMXE bit is allowed only when the nested VMX feature is enabled, and turning it off is forbidden after a vmxon. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d88de56..460932b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -615,11 +615,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) kvm_read_cr3(vcpu))) return 1; - if (cr4 & X86_CR4_VMXE) + if (kvm_x86_ops->set_cr4(vcpu, cr4)) return 1; - kvm_x86_ops->set_cr4(vcpu, cr4); - if ((cr4 ^ old_cr4) & pdptr_bits) kvm_mmu_reset_context(vcpu); -- cgit v1.1 From 064aea774768749c6fd308b37818ea3a9600583d Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:04:56 +0300 Subject: KVM: nVMX: Decoding memory operands of VMX instructions This patch includes a utility function for decoding pointer operands of VMX instructions issued by L1 (a guest hypervisor) Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 460932b..27d12a3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3848,7 +3848,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, exception); } -static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, +int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { @@ -3858,6 +3858,7 @@ static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); } +EXPORT_SYMBOL_GPL(kvm_read_guest_virt); static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, -- cgit v1.1 From 27d6c865211662721e6cf305706e4a3da35f12b4 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:06:59 +0300 Subject: KVM: nVMX: Implement VMCLEAR This patch implements the VMCLEAR instruction. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 27d12a3..c1b5a18 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -347,6 +347,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) vcpu->arch.cr2 = fault->address; kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); } +EXPORT_SYMBOL_GPL(kvm_inject_page_fault); void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { -- cgit v1.1 From 6a4d7550601b5b17df227959bdbec208384f729c Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:08:00 +0300 Subject: KVM: nVMX: Implement VMPTRST This patch implements the VMPTRST instruction. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c1b5a18..de262a0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3869,7 +3869,7 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); } -static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, +int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) @@ -3901,6 +3901,7 @@ static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, out: return r; } +EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, unsigned long addr, -- cgit v1.1 From 9d74191ab1ea857d1cc27e439316eebf8ae46d19 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sun, 29 May 2011 21:53:48 +0900 Subject: KVM: x86 emulator: Use the pointers ctxt and c consistently We should use the local variables ctxt and c when the emulate_ctxt and decode appears many times. At least, we need to be consistent about how we use these in a function. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 59 +++++++++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 30 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index de262a0..39d8b04 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4552,24 +4552,24 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) { - struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct decode_cache *c = &ctxt->decode; int ret; init_emulate_ctxt(vcpu); - vcpu->arch.emulate_ctxt.decode.op_bytes = 2; - vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; - vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip + - inc_eip; - ret = emulate_int_real(&vcpu->arch.emulate_ctxt, irq); + c->op_bytes = 2; + c->ad_bytes = 2; + c->eip = ctxt->eip + inc_eip; + ret = emulate_int_real(ctxt, irq); if (ret != X86EMUL_CONTINUE) return EMULATE_FAIL; - vcpu->arch.emulate_ctxt.eip = c->eip; + ctxt->eip = c->eip; memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); - kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_rip_write(vcpu, ctxt->eip); + kvm_set_rflags(vcpu, ctxt->eflags); if (irq == NMI_VECTOR) vcpu->arch.nmi_pending = false; @@ -4630,21 +4630,22 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, int insn_len) { int r; - struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct decode_cache *c = &ctxt->decode; bool writeback = true; kvm_clear_exception_queue(vcpu); if (!(emulation_type & EMULTYPE_NO_DECODE)) { init_emulate_ctxt(vcpu); - vcpu->arch.emulate_ctxt.interruptibility = 0; - vcpu->arch.emulate_ctxt.have_exception = false; - vcpu->arch.emulate_ctxt.perm_ok = false; + ctxt->interruptibility = 0; + ctxt->have_exception = false; + ctxt->perm_ok = false; - vcpu->arch.emulate_ctxt.only_vendor_specific_insn + ctxt->only_vendor_specific_insn = emulation_type & EMULTYPE_TRAP_UD; - r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); + r = x86_decode_insn(ctxt, insn, insn_len); trace_kvm_emulate_insn_start(vcpu); ++vcpu->stat.insn_emulation; @@ -4660,7 +4661,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, } if (emulation_type & EMULTYPE_SKIP) { - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); + kvm_rip_write(vcpu, c->eip); return EMULATE_DONE; } @@ -4672,7 +4673,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, } restart: - r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); + r = x86_emulate_insn(ctxt); if (r == EMULATION_INTERCEPTED) return EMULATE_DONE; @@ -4684,7 +4685,7 @@ restart: return handle_emulation_failure(vcpu); } - if (vcpu->arch.emulate_ctxt.have_exception) { + if (ctxt->have_exception) { inject_emulated_exception(vcpu); r = EMULATE_DONE; } else if (vcpu->arch.pio.count) { @@ -4703,13 +4704,12 @@ restart: r = EMULATE_DONE; if (writeback) { - toggle_interruptibility(vcpu, - vcpu->arch.emulate_ctxt.interruptibility); - kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + toggle_interruptibility(vcpu, ctxt->interruptibility); + kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + kvm_rip_write(vcpu, ctxt->eip); } else vcpu->arch.emulate_regs_need_sync_to_vcpu = true; @@ -5130,8 +5130,7 @@ int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) kvm_x86_ops->patch_hypercall(vcpu, instruction); - return emulator_write_emulated(&vcpu->arch.emulate_ctxt, - rip, instruction, 3, NULL); + return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); } static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) @@ -5849,21 +5848,21 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, bool has_error_code, u32 error_code) { - struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct decode_cache *c = &ctxt->decode; int ret; init_emulate_ctxt(vcpu); - ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, - tss_selector, reason, has_error_code, - error_code); + ret = emulator_task_switch(ctxt, tss_selector, reason, + has_error_code, error_code); if (ret) return EMULATE_FAIL; memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); - kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_rip_write(vcpu, ctxt->eip); + kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); return EMULATE_DONE; } -- cgit v1.1 From 36dd9bb5ce32bc39e25a5fcc61415f13e3ed5d17 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 1 Jun 2011 15:34:24 +0300 Subject: KVM: x86 emulator: rename decode_cache::eip to _eip The name eip conflicts with a field of the same name in x86_emulate_ctxt, which we plan to fold decode_cache into. The name _eip is unfortunate, but what's really needed is a refactoring here, not a better name. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 39d8b04..7e452fe 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4560,13 +4560,13 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) c->op_bytes = 2; c->ad_bytes = 2; - c->eip = ctxt->eip + inc_eip; + c->_eip = ctxt->eip + inc_eip; ret = emulate_int_real(ctxt, irq); if (ret != X86EMUL_CONTINUE) return EMULATE_FAIL; - ctxt->eip = c->eip; + ctxt->eip = c->_eip; memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); @@ -4661,7 +4661,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, } if (emulation_type & EMULTYPE_SKIP) { - kvm_rip_write(vcpu, c->eip); + kvm_rip_write(vcpu, c->_eip); return EMULATE_DONE; } -- cgit v1.1 From 9dac77fa4011bdb4b541a8db087eac96a602faec Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 1 Jun 2011 15:34:25 +0300 Subject: KVM: x86 emulator: fold decode_cache into x86_emulate_ctxt This saves a lot of pointless casts x86_emulate_ctxt and decode_cache. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 47 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 25 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7e452fe..694538a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4507,24 +4507,24 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) kvm_queue_exception(vcpu, ctxt->exception.vector); } -static void init_decode_cache(struct decode_cache *c, +static void init_decode_cache(struct x86_emulate_ctxt *ctxt, const unsigned long *regs) { - memset(c, 0, offsetof(struct decode_cache, regs)); - memcpy(c->regs, regs, sizeof(c->regs)); + memset(&ctxt->twobyte, 0, + (void *)&ctxt->regs - (void *)&ctxt->twobyte); + memcpy(ctxt->regs, regs, sizeof(ctxt->regs)); - c->fetch.start = 0; - c->fetch.end = 0; - c->io_read.pos = 0; - c->io_read.end = 0; - c->mem_read.pos = 0; - c->mem_read.end = 0; + ctxt->fetch.start = 0; + ctxt->fetch.end = 0; + ctxt->io_read.pos = 0; + ctxt->io_read.end = 0; + ctxt->mem_read.pos = 0; + ctxt->mem_read.end = 0; } static void init_emulate_ctxt(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - struct decode_cache *c = &ctxt->decode; int cs_db, cs_l; /* @@ -4546,28 +4546,27 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) X86EMUL_MODE_PROT16; ctxt->guest_mode = is_guest_mode(vcpu); - init_decode_cache(c, vcpu->arch.regs); + init_decode_cache(ctxt, vcpu->arch.regs); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; } int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - struct decode_cache *c = &ctxt->decode; int ret; init_emulate_ctxt(vcpu); - c->op_bytes = 2; - c->ad_bytes = 2; - c->_eip = ctxt->eip + inc_eip; + ctxt->op_bytes = 2; + ctxt->ad_bytes = 2; + ctxt->_eip = ctxt->eip + inc_eip; ret = emulate_int_real(ctxt, irq); if (ret != X86EMUL_CONTINUE) return EMULATE_FAIL; - ctxt->eip = c->_eip; - memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + ctxt->eip = ctxt->_eip; + memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); @@ -4631,7 +4630,6 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, { int r; struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - struct decode_cache *c = &ctxt->decode; bool writeback = true; kvm_clear_exception_queue(vcpu); @@ -4661,7 +4659,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, } if (emulation_type & EMULTYPE_SKIP) { - kvm_rip_write(vcpu, c->_eip); + kvm_rip_write(vcpu, ctxt->_eip); return EMULATE_DONE; } @@ -4669,7 +4667,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, changes registers values during IO operation */ if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { vcpu->arch.emulate_regs_need_sync_from_vcpu = false; - memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs); } restart: @@ -4707,7 +4705,7 @@ restart: toggle_interruptibility(vcpu, ctxt->interruptibility); kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); - memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; kvm_rip_write(vcpu, ctxt->eip); } else @@ -5718,8 +5716,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) * that usually, but some bad designed PV devices (vmware * backdoor interface) need this to work */ - struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; - memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; } regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); @@ -5849,7 +5847,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, bool has_error_code, u32 error_code) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - struct decode_cache *c = &ctxt->decode; int ret; init_emulate_ctxt(vcpu); @@ -5860,7 +5857,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, if (ret) return EMULATE_FAIL; - memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); -- cgit v1.1 From c68b734fba402b9bfdd49e23b776c42dbeaf1f5b Mon Sep 17 00:00:00 2001 From: "Yang, Wei Y" Date: Fri, 3 Jun 2011 11:13:42 +0800 Subject: KVM: Add SMEP support when setting CR4 This patch adds SMEP handling when setting CR4. Signed-off-by: Yang, Wei Signed-off-by: Shan, Haitao Signed-off-by: Li, Xin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 694538a..ba5cd27 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -580,6 +580,14 @@ static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) return best && (best->ecx & bit(X86_FEATURE_XSAVE)); } +static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_SMEP)); +} + static void update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -599,14 +607,17 @@ static void update_cpuid(struct kvm_vcpu *vcpu) int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; - + unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | + X86_CR4_PAE | X86_CR4_SMEP; if (cr4 & CR4_RESERVED_BITS) return 1; if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) return 1; + if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP)) + return 1; + if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) return 1; -- cgit v1.1 From 611c120f7486a19e7df2225f875a52ef0b599ae8 Mon Sep 17 00:00:00 2001 From: "Yang, Wei Y" Date: Fri, 3 Jun 2011 11:14:03 +0800 Subject: KVM: Mask function7 ebx against host capability word9 This patch masks CPUID leaf 7 ebx against host capability word9. Signed-off-by: Yang, Wei Signed-off-by: Shan, Haitao Signed-off-by: Li, Xin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ba5cd27..ff4623b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2359,6 +2359,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | F(PMM) | F(PMM_EN); + /* cpuid 7.0.ebx */ + const u32 kvm_supported_word9_x86_features = + F(SMEP); + /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); do_cpuid_1_ent(entry, function, index); @@ -2393,7 +2397,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } - /* function 4 and 0xb have additional index. */ + /* function 4 has additional index. */ case 4: { int i, cache_type; @@ -2410,8 +2414,22 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } + case 7: { + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + /* Mask ebx against host capbability word 9 */ + if (index == 0) { + entry->ebx &= kvm_supported_word9_x86_features; + cpuid_mask(&entry->ebx, 9); + } else + entry->ebx = 0; + entry->eax = 0; + entry->ecx = 0; + entry->edx = 0; + break; + } case 9: break; + /* function 0xb has additional index. */ case 0xb: { int i, level_type; -- cgit v1.1 From 02668b061db1b9f7f18872e594ac68e237db0bed Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 10 Jun 2011 11:35:30 +0200 Subject: KVM: fix XSAVE bit scanning (now properly) commit 123108f1c1aafd51d6a5c79cc04d7999dd88a930 tried to fix KVMs XSAVE valid feature scanning, but it was wrong. It was not considering the sparse nature of this bitfield, instead reading values from uninitialized members of the entries array. This patch now separates subleaf indicies from KVM's array indicies and fills the entry before querying it's value. This fixes AVX support in KVM guests. Signed-off-by: Andre Przywara Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ff4623b..84f4607 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2447,16 +2447,17 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, break; } case 0xd: { - int i; + int idx, i; entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - for (i = 1; *nent < maxnent && i < 64; ++i) { - if (entry[i].eax == 0 || !supported_xcr0_bit(i)) + for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) { + do_cpuid_1_ent(&entry[i], function, idx); + if (entry[i].eax == 0 || !supported_xcr0_bit(idx)) continue; - do_cpuid_1_ent(&entry[i], function, i); entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; ++*nent; + ++i; } break; } -- cgit v1.1 From 4a00efdf0c7c93dc6f6b3ce7e2d6bd4cd1ac1651 Mon Sep 17 00:00:00 2001 From: "Yang, Wei Y" Date: Mon, 13 Jun 2011 21:52:33 +0800 Subject: KVM: Enable DRNG feature support for KVM This patch exposes DRNG feature to KVM guests. The RDRAND instruction can provide software with sequences of random numbers generated from white noise. Signed-off-by: Yang, Wei Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 84f4607..6f1e54d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2345,7 +2345,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | - F(F16C); + F(F16C) | F(RDRAND); /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | -- cgit v1.1 From 74dc2b4ffe3af1eac367be0178f88487cb9b240a Mon Sep 17 00:00:00 2001 From: "Yang, Wei" Date: Tue, 14 Jun 2011 20:10:18 +0800 Subject: KVM: Add RDWRGSFS support when setting CR4 This patch adds RDWRGSFS support when setting CR4. Signed-off-by: Yang, Wei Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6f1e54d..423e0cd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -588,6 +588,14 @@ static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_SMEP)); } +static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); +} + static void update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -618,6 +626,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP)) return 1; + if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS)) + return 1; + if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) return 1; -- cgit v1.1 From 176f61da82435eae09cc96f70b530d1ba0746b8b Mon Sep 17 00:00:00 2001 From: "Yang, Wei" Date: Tue, 14 Jun 2011 20:10:19 +0800 Subject: KVM: Expose RDWRGSFS bit to KVM guests This patch exposes RDWRGSFS bit to KVM guests. Signed-off-by: Yang, Wei Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 423e0cd..76c16a5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2372,7 +2372,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ebx */ const u32 kvm_supported_word9_x86_features = - F(SMEP); + F(SMEP) | F(FSGSBASE); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); -- cgit v1.1 From a01c8f9b4e266df1d7166d23216f2060648f862d Mon Sep 17 00:00:00 2001 From: "Yang, Wei" Date: Tue, 14 Jun 2011 15:19:06 +0800 Subject: KVM: Enable ERMS feature support for KVM This patch exposes ERMS feature to KVM guests. The REP MOVSB/STOSB instruction can enhance fast strings attempts to move as much of the data with larger size load/stores as possible. Signed-off-by: Yang, Wei Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 76c16a5..0b803f0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2372,7 +2372,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ebx */ const u32 kvm_supported_word9_x86_features = - F(SMEP) | F(FSGSBASE); + F(SMEP) | F(FSGSBASE) | F(ERMS); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); -- cgit v1.1 From c9aaa8957f203bd6df83b002fb40b98390bed078 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 11 Jul 2011 15:28:14 -0400 Subject: KVM: Steal time implementation To implement steal time, we need the hypervisor to pass the guest information about how much time was spent running other processes outside the VM, while the vcpu had meaningful work to do - halt time does not count. This information is acquired through the run_delay field of delayacct/schedstats infrastructure, that counts time spent in a runqueue but not running. Steal time is a per-cpu information, so the traditional MSR-based infrastructure is used. A new msr, KVM_MSR_STEAL_TIME, holds the memory area address containing information about steal time This patch contains the hypervisor part of the steal time infrasructure, and can be backported independently of the guest portion. [avi, yongjie: export delayacct_on, to avoid build failures in some configs] Signed-off-by: Glauber Costa Tested-by: Eric B Munson CC: Rik van Riel CC: Jeremy Fitzhardinge CC: Peter Zijlstra CC: Anthony Liguori Signed-off-by: Yongjie Ren Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0b803f0..c96cdc0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -808,12 +808,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr); * kvm-specific. Those are put in the beginning of the list. */ -#define KVM_SAVE_MSRS_BEGIN 8 +#define KVM_SAVE_MSRS_BEGIN 9 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, - HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, + HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -1488,6 +1488,35 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) } } +static void accumulate_steal_time(struct kvm_vcpu *vcpu) +{ + u64 delta; + + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) + return; + + delta = current->sched_info.run_delay - vcpu->arch.st.last_steal; + vcpu->arch.st.last_steal = current->sched_info.run_delay; + vcpu->arch.st.accum_steal = delta; +} + +static void record_steal_time(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) + return; + + if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) + return; + + vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal; + vcpu->arch.st.steal.version += 2; + vcpu->arch.st.accum_steal = 0; + + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { @@ -1570,6 +1599,33 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (kvm_pv_enable_async_pf(vcpu, data)) return 1; break; + case MSR_KVM_STEAL_TIME: + + if (unlikely(!sched_info_on())) + return 1; + + if (data & KVM_STEAL_RESERVED_MASK) + return 1; + + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, + data & KVM_STEAL_VALID_BITS)) + return 1; + + vcpu->arch.st.msr_val = data; + + if (!(data & KVM_MSR_ENABLED)) + break; + + vcpu->arch.st.last_steal = current->sched_info.run_delay; + + preempt_disable(); + accumulate_steal_time(vcpu); + preempt_enable(); + + kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); + + break; + case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: @@ -1855,6 +1911,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_KVM_ASYNC_PF_EN: data = vcpu->arch.apf.msr_val; break; + case MSR_KVM_STEAL_TIME: + data = vcpu->arch.st.msr_val; + break; case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: case MSR_IA32_MCG_CAP: @@ -2166,6 +2225,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_migrate_timers(vcpu); vcpu->cpu = cpu; } + + accumulate_steal_time(vcpu); + kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -2487,6 +2549,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_CLOCKSOURCE2) | (1 << KVM_FEATURE_ASYNC_PF) | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); + + if (sched_info_on()) + entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); + entry->ebx = 0; entry->ecx = 0; entry->edx = 0; @@ -5470,6 +5536,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 1; goto out; } + if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) + record_steal_time(vcpu); + } r = kvm_mmu_reload(vcpu); @@ -6206,6 +6275,7 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.apf.msr_val = 0; + vcpu->arch.st.msr_val = 0; kvmclock_reset(vcpu); -- cgit v1.1 From af7cc7d1ee422a612f6785e347a893d44cc892ea Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:22:46 +0800 Subject: KVM: x86: introduce vcpu_mmio_gva_to_gpa to cleanup the code Introduce vcpu_mmio_gva_to_gpa to translate the gva to gpa, we can use it to cleanup the code between read emulation and write emulation Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c96cdc0..a1dbd04 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4010,6 +4010,27 @@ out: } EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); +static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, + gpa_t *gpa, struct x86_exception *exception, + bool write) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + + if (write) + access |= PFERR_WRITE_MASK; + + *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); + + if (*gpa == UNMAPPED_GVA) + return -1; + + /* For APIC access vmexit */ + if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + return 1; + + return 0; +} + static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, unsigned long addr, void *val, @@ -4017,8 +4038,8 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, struct x86_exception *exception) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - gpa_t gpa; - int handled; + gpa_t gpa; + int handled, ret; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -4028,13 +4049,12 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } - gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception); + ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, false); - if (gpa == UNMAPPED_GVA) + if (ret < 0) return X86EMUL_PROPAGATE_FAULT; - /* For APIC access vmexit */ - if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + if (ret) goto mmio; if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception) @@ -4085,16 +4105,16 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct x86_exception *exception, struct kvm_vcpu *vcpu) { - gpa_t gpa; - int handled; + gpa_t gpa; + int handled, ret; - gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); + ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, true); - if (gpa == UNMAPPED_GVA) + if (ret < 0) return X86EMUL_PROPAGATE_FAULT; /* For APIC access vmexit */ - if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + if (ret) goto mmio; if (emulator_write_phys(vcpu, gpa, val, bytes)) -- cgit v1.1 From bebb106a5afa32efdf5332ed4a40bf4d6d06b56e Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:23:20 +0800 Subject: KVM: MMU: cache mmio info on page fault path If the page fault is caused by mmio, we can cache the mmio info, later, we do not need to walk guest page table and quickly know it is a mmio fault while we emulate the mmio instruction Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a1dbd04..028a0f2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4016,6 +4016,14 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + if (vcpu_match_mmio_gva(vcpu, gva) && + check_write_user_access(vcpu, write, access, + vcpu->arch.access)) { + *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | + (gva & (PAGE_SIZE - 1)); + return 1; + } + if (write) access |= PFERR_WRITE_MASK; @@ -4028,6 +4036,9 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) return 1; + if (vcpu_match_mmio_gpa(vcpu, *gpa)) + return 1; + return 0; } -- cgit v1.1 From c37079586f317d7e7f1a70d36f0e5177691c89c2 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:28:04 +0800 Subject: KVM: MMU: remove bypass_guest_pf The idea is from Avi: | Maybe it's time to kill off bypass_guest_pf=1. It's not as effective as | it used to be, since unsync pages always use shadow_trap_nonpresent_pte, | and since we convert between the two nonpresent_ptes during sync and unsync. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 028a0f2..64c42d9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5091,7 +5091,6 @@ int kvm_arch_init(void *opaque) kvm_init_msr_list(); kvm_x86_ops = ops; - kvm_mmu_set_nonpresent_ptes(0ull, 0ull); kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0); -- cgit v1.1 From ce88decffd17bf9f373cc233c961ad2054965667 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:33:44 +0800 Subject: KVM: MMU: mmio page fault support The idea is from Avi: | We could cache the result of a miss in an spte by using a reserved bit, and | checking the page fault error code (or seeing if we get an ept violation or | ept misconfiguration), so if we get repeated mmio on a page, we don't need to | search the slot list/tree. | (https://lkml.org/lkml/2011/2/22/221) When the page fault is caused by mmio, we cache the info in the shadow page table, and also set the reserved bits in the shadow page table, so if the mmio is caused again, we can quickly identify it and emulate it directly Searching mmio gfn in memslots is heavy since we need to walk all memeslots, it can be reduced by this feature, and also avoid walking guest page table for soft mmu. [jan: fix operator precedence issue] Signed-off-by: Xiao Guangrong Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 64c42d9..2c9661f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5062,6 +5062,30 @@ void kvm_after_handle_nmi(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); +static void kvm_set_mmio_spte_mask(void) +{ + u64 mask; + int maxphyaddr = boot_cpu_data.x86_phys_bits; + + /* + * Set the reserved bits and the present bit of an paging-structure + * entry to generate page fault with PFER.RSV = 1. + */ + mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr; + mask |= 1ull; + +#ifdef CONFIG_X86_64 + /* + * If reserved bit is not supported, clear the present bit to disable + * mmio page fault. + */ + if (maxphyaddr == 52) + mask &= ~1ull; +#endif + + kvm_mmu_set_mmio_spte_mask(mask); +} + int kvm_arch_init(void *opaque) { int r; @@ -5088,6 +5112,7 @@ int kvm_arch_init(void *opaque) if (r) goto out; + kvm_set_mmio_spte_mask(); kvm_init_msr_list(); kvm_x86_ops = ops; -- cgit v1.1 From 4f0226482d20f104e943ee9e6f1218b573953f63 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:34:24 +0800 Subject: KVM: MMU: trace mmio page fault Add tracepoints to trace mmio page fault Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2c9661f..84a28ea 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4021,6 +4021,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, vcpu->arch.access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); + trace_vcpu_match_mmio(gva, *gpa, write, false); return 1; } @@ -4036,8 +4037,10 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) return 1; - if (vcpu_match_mmio_gpa(vcpu, *gpa)) + if (vcpu_match_mmio_gpa(vcpu, *gpa)) { + trace_vcpu_match_mmio(gva, *gpa, write, true); return 1; + } return 0; } -- cgit v1.1 From ca7d58f375c650cf36900cb1da1ca2cc99b13393 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 13 Jul 2011 14:31:08 +0800 Subject: KVM: x86: fix broken read emulation spans a page boundary If the range spans a page boundary, the mmio access can be broke, fix it as write emulation. And we already get the guest physical address, so use it to read guest data directly to avoid walking guest page table again Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 84a28ea..1453248 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4045,13 +4045,12 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, return 0; } -static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, - unsigned long addr, - void *val, - unsigned int bytes, - struct x86_exception *exception) +static int emulator_read_emulated_onepage(unsigned long addr, + void *val, + unsigned int bytes, + struct x86_exception *exception, + struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); gpa_t gpa; int handled, ret; @@ -4071,8 +4070,7 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, if (ret) goto mmio; - if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception) - == X86EMUL_CONTINUE) + if (!kvm_read_guest(vcpu->kvm, gpa, val, bytes)) return X86EMUL_CONTINUE; mmio: @@ -4101,6 +4099,31 @@ mmio: return X86EMUL_IO_NEEDED; } +static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, + unsigned long addr, + void *val, + unsigned int bytes, + struct x86_exception *exception) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + + /* Crossing a page boundary? */ + if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { + int rc, now; + + now = -addr & ~PAGE_MASK; + rc = emulator_read_emulated_onepage(addr, val, now, exception, + vcpu); + if (rc != X86EMUL_CONTINUE) + return rc; + addr += now; + val += now; + bytes -= now; + } + return emulator_read_emulated_onepage(addr, val, bytes, exception, + vcpu); +} + int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes) { -- cgit v1.1 From 77d197b2ca37b33b0461ab1e2dbe40cbe4a6fd6a Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 13 Jul 2011 14:31:50 +0800 Subject: KVM: x86: abstract the operation for read/write emulation The operations of read emulation and write emulation are very similar, so we can abstract the operation of them, in larter patch, it is used to cleanup the same code Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1453248..f5c60a8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4136,6 +4136,78 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, return 1; } +struct read_write_emulator_ops { + int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val, + int bytes); + int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa, + void *val, int bytes); + int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa, + int bytes, void *val); + int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa, + void *val, int bytes); + bool write; +}; + +static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) +{ + if (vcpu->mmio_read_completed) { + memcpy(val, vcpu->mmio_data, bytes); + trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, + vcpu->mmio_phys_addr, *(u64 *)val); + vcpu->mmio_read_completed = 0; + return 1; + } + + return 0; +} + +static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, + void *val, int bytes) +{ + return !kvm_read_guest(vcpu->kvm, gpa, val, bytes); +} + +static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, + void *val, int bytes) +{ + return emulator_write_phys(vcpu, gpa, val, bytes); +} + +static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val) +{ + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); + return vcpu_mmio_write(vcpu, gpa, bytes, val); +} + +static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, + void *val, int bytes) +{ + trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); + return X86EMUL_IO_NEEDED; +} + +static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, + void *val, int bytes) +{ + memcpy(vcpu->mmio_data, val, bytes); + memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8); + return X86EMUL_CONTINUE; +} + +static struct read_write_emulator_ops read_emultor = { + .read_write_prepare = read_prepare, + .read_write_emulate = read_emulate, + .read_write_mmio = vcpu_mmio_read, + .read_write_exit_mmio = read_exit_mmio, +}; + +static struct read_write_emulator_ops write_emultor = { + .read_write_emulate = write_emulate, + .read_write_mmio = write_mmio, + .read_write_exit_mmio = write_exit_mmio, + .write = true, +}; + static int emulator_write_emulated_onepage(unsigned long addr, const void *val, unsigned int bytes, -- cgit v1.1 From 22388a3c8ce2a2a004ce764194cce8a2f9b13d66 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 13 Jul 2011 14:32:31 +0800 Subject: KVM: x86: cleanup the code of read/write emulation Using the read/write operation to remove the same code Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 146 +++++++++++++++++------------------------------------ 1 file changed, 45 insertions(+), 101 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f5c60a8..2b76ae3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4045,85 +4045,6 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, return 0; } -static int emulator_read_emulated_onepage(unsigned long addr, - void *val, - unsigned int bytes, - struct x86_exception *exception, - struct kvm_vcpu *vcpu) -{ - gpa_t gpa; - int handled, ret; - - if (vcpu->mmio_read_completed) { - memcpy(val, vcpu->mmio_data, bytes); - trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, - vcpu->mmio_phys_addr, *(u64 *)val); - vcpu->mmio_read_completed = 0; - return X86EMUL_CONTINUE; - } - - ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, false); - - if (ret < 0) - return X86EMUL_PROPAGATE_FAULT; - - if (ret) - goto mmio; - - if (!kvm_read_guest(vcpu->kvm, gpa, val, bytes)) - return X86EMUL_CONTINUE; - -mmio: - /* - * Is this MMIO handled locally? - */ - handled = vcpu_mmio_read(vcpu, gpa, bytes, val); - - if (handled == bytes) - return X86EMUL_CONTINUE; - - gpa += handled; - bytes -= handled; - val += handled; - - trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); - - vcpu->mmio_needed = 1; - vcpu->run->exit_reason = KVM_EXIT_MMIO; - vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->run->mmio.len = min(vcpu->mmio_size, 8); - vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; - vcpu->mmio_index = 0; - - return X86EMUL_IO_NEEDED; -} - -static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, - unsigned long addr, - void *val, - unsigned int bytes, - struct x86_exception *exception) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - - /* Crossing a page boundary? */ - if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { - int rc, now; - - now = -addr & ~PAGE_MASK; - rc = emulator_read_emulated_onepage(addr, val, now, exception, - vcpu); - if (rc != X86EMUL_CONTINUE) - return rc; - addr += now; - val += now; - bytes -= now; - } - return emulator_read_emulated_onepage(addr, val, bytes, exception, - vcpu); -} - int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes) { @@ -4208,16 +4129,21 @@ static struct read_write_emulator_ops write_emultor = { .write = true, }; -static int emulator_write_emulated_onepage(unsigned long addr, - const void *val, - unsigned int bytes, - struct x86_exception *exception, - struct kvm_vcpu *vcpu) +static int emulator_read_write_onepage(unsigned long addr, void *val, + unsigned int bytes, + struct x86_exception *exception, + struct kvm_vcpu *vcpu, + struct read_write_emulator_ops *ops) { gpa_t gpa; int handled, ret; + bool write = ops->write; + + if (ops->read_write_prepare && + ops->read_write_prepare(vcpu, val, bytes)) + return X86EMUL_CONTINUE; - ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, true); + ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); if (ret < 0) return X86EMUL_PROPAGATE_FAULT; @@ -4226,15 +4152,14 @@ static int emulator_write_emulated_onepage(unsigned long addr, if (ret) goto mmio; - if (emulator_write_phys(vcpu, gpa, val, bytes)) + if (ops->read_write_emulate(vcpu, gpa, val, bytes)) return X86EMUL_CONTINUE; mmio: - trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); /* * Is this MMIO handled locally? */ - handled = vcpu_mmio_write(vcpu, gpa, bytes, val); + handled = ops->read_write_mmio(vcpu, gpa, bytes, val); if (handled == bytes) return X86EMUL_CONTINUE; @@ -4243,23 +4168,20 @@ mmio: val += handled; vcpu->mmio_needed = 1; - memcpy(vcpu->mmio_data, val, bytes); vcpu->run->exit_reason = KVM_EXIT_MMIO; vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; vcpu->mmio_size = bytes; vcpu->run->mmio.len = min(vcpu->mmio_size, 8); - vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; - memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8); + vcpu->run->mmio.is_write = vcpu->mmio_is_write = write; vcpu->mmio_index = 0; - return X86EMUL_CONTINUE; + return ops->read_write_exit_mmio(vcpu, gpa, val, bytes); } -int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, - unsigned long addr, - const void *val, - unsigned int bytes, - struct x86_exception *exception) +int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, + void *val, unsigned int bytes, + struct x86_exception *exception, + struct read_write_emulator_ops *ops) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); @@ -4268,16 +4190,38 @@ int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, int rc, now; now = -addr & ~PAGE_MASK; - rc = emulator_write_emulated_onepage(addr, val, now, exception, - vcpu); + rc = emulator_read_write_onepage(addr, val, now, exception, + vcpu, ops); + if (rc != X86EMUL_CONTINUE) return rc; addr += now; val += now; bytes -= now; } - return emulator_write_emulated_onepage(addr, val, bytes, exception, - vcpu); + + return emulator_read_write_onepage(addr, val, bytes, exception, + vcpu, ops); +} + +static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, + unsigned long addr, + void *val, + unsigned int bytes, + struct x86_exception *exception) +{ + return emulator_read_write(ctxt, addr, val, bytes, + exception, &read_emultor); +} + +int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, + unsigned long addr, + const void *val, + unsigned int bytes, + struct x86_exception *exception) +{ + return emulator_read_write(ctxt, addr, (void *)val, bytes, + exception, &write_emultor); } #define CMPXCHG_TYPE(t, ptr, old, new) \ -- cgit v1.1 From 8c3ba334f8588e1d5099f8602cf01897720e0eca Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Mon, 18 Jul 2011 17:17:15 +0300 Subject: KVM: x86: Raise the hard VCPU count limit The patch raises the hard limit of VCPU count to 254. This will allow developers to easily work on scalability and will allow users to test high VCPU setups easily without patching the kernel. To prevent possible issues with current setups, KVM_CAP_NR_VCPUS now returns the recommended VCPU limit (which is still 64) - this should be a safe value for everybody, while a new KVM_CAP_MAX_VCPUS returns the hard limit which is now 254. Cc: Avi Kivity Cc: Ingo Molnar Cc: Marcelo Tosatti Cc: Pekka Enberg Suggested-by: Pekka Enberg Signed-off-by: Sasha Levin Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2b76ae3..41dfebe 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2086,6 +2086,9 @@ int kvm_dev_ioctl_check_extension(long ext) r = !kvm_x86_ops->cpu_has_accelerated_tpr(); break; case KVM_CAP_NR_VCPUS: + r = KVM_SOFT_MAX_VCPUS; + break; + case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; case KVM_CAP_NR_MEMSLOTS: -- cgit v1.1 From 14fa67ee95d4f7313fbf149fe37faccb903857c8 Mon Sep 17 00:00:00 2001 From: Mike Waychison Date: Thu, 21 Jul 2011 15:38:10 -0700 Subject: KVM: x86: get_msr support for HV_X64_MSR_APIC_ASSIST_PAGE "get" support for the HV_X64_MSR_APIC_ASSIST_PAGE msr was missing, even though it is explicitly enumerated as something the vmm should save in msrs_to_save and reported to userland via the KVM_GET_MSR_INDEX_LIST ioctl. Add "get" support for HV_X64_MSR_APIC_ASSIST_PAGE. We simply return the guest visible value of this register, which seems to be correct as a set on the register is validated for us already. Signed-off-by: Mike Waychison Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 41dfebe..e80f0d7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1825,6 +1825,8 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); + case HV_X64_MSR_APIC_ASSIST_PAGE: + return vcpu->arch.hv_vapic; default: pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; -- cgit v1.1 From d1613ad5d0018a009bd4865b0fa5930abb5ed259 Mon Sep 17 00:00:00 2001 From: Mike Waychison Date: Sat, 23 Jul 2011 00:31:45 -0700 Subject: KVM: Really fix HV_X64_MSR_APIC_ASSIST_PAGE Commit 0945d4b228 tried to fix the get_msr path for the HV_X64_MSR_APIC_ASSIST_PAGE msr, but was poorly tested. We should be returning 0 if the read succeeded, and passing the value back to the caller via the pdata out argument, not returning the value directly. Signed-off-by: Mike Waychison Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e80f0d7..6cb353c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1826,7 +1826,8 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); case HV_X64_MSR_APIC_ASSIST_PAGE: - return vcpu->arch.hv_vapic; + data = vcpu->arch.hv_vapic; + break; default: pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; -- cgit v1.1 From 743eeb0b01d2fbf4154bf87bff1ebb6fb18aeb7a Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 27 Jul 2011 16:00:48 +0300 Subject: KVM: Intelligent device lookup on I/O bus Currently the method of dealing with an IO operation on a bus (PIO/MMIO) is to call the read or write callback for each device registered on the bus until we find a device which handles it. Since the number of devices on a bus can be significant due to ioeventfds and coalesced MMIO zones, this leads to a lot of overhead on each IO operation. Instead of registering devices, we now register ranges which points to a device. Lookup is done using an efficient bsearch instead of a linear search. Performance test was conducted by comparing exit count per second with 200 ioeventfds created on one byte and the guest is trying to access a different byte continuously (triggering usermode exits). Before the patch the guest has achieved 259k exits per second, after the patch the guest does 274k exits per second. Cc: Avi Kivity Cc: Marcelo Tosatti Signed-off-by: Sasha Levin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6cb353c..d28dff7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3562,7 +3562,11 @@ long kvm_arch_vm_ioctl(struct file *filp, if (r) { mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, - &vpic->dev); + &vpic->dev_master); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, + &vpic->dev_slave); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, + &vpic->dev_eclr); mutex_unlock(&kvm->slots_lock); kfree(vpic); goto create_irqchip_unlock; -- cgit v1.1 From 1d2887e2d849969f58ce79203f9785ebe065d494 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sat, 30 Jul 2011 18:03:34 +0900 Subject: KVM: x86 emulator: Make x86_decode_insn() return proper macros Return EMULATION_OK/FAILED consistently. Also treat instruction fetch errors, not restricted to X86EMUL_UNHANDLEABLE, as EMULATION_FAILED; although this cannot happen in practice, the current logic will continue the emulation even if the decoder fails to fetch the instruction. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d28dff7..1fe9637 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4837,7 +4837,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, trace_kvm_emulate_insn_start(vcpu); ++vcpu->stat.insn_emulation; - if (r) { + if (r != EMULATION_OK) { if (emulation_type & EMULTYPE_TRAP_UD) return EMULATE_FAIL; if (reexecute_instruction(vcpu, cr2)) -- cgit v1.1 From 742bc67042e34a9fe1fed0b46e4cb1431a72c4bf Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 29 Jul 2011 19:44:21 -0300 Subject: KVM: x86: report valid microcode update ID Windows Server 2008 SP2 checked build with smp > 1 BSOD's during boot due to lack of microcode update: *** Assertion failed: The system BIOS on this machine does not properly support the processor. The system BIOS did not load any microcode update. A BIOS containing the latest microcode update is needed for system reliability. (CurrentUpdateRevision != 0) *** Source File: d:\longhorn\base\hals\update\intelupd\update.c, line 440 Report a non-zero microcode update signature to make it happy. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1fe9637..ea8f9f0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1842,7 +1842,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) switch (msr) { case MSR_IA32_PLATFORM_ID: - case MSR_IA32_UCODE_REV: case MSR_IA32_EBL_CR_POWERON: case MSR_IA32_DEBUGCTLMSR: case MSR_IA32_LASTBRANCHFROMIP: @@ -1863,6 +1862,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_FAM10H_MMIO_CONF_BASE: data = 0; break; + case MSR_IA32_UCODE_REV: + data = 0x100000000ULL; + break; case MSR_MTRRcap: data = 0x500 | KVM_NR_VAR_MTRR; break; -- cgit v1.1 From d5c1785d2f3aabe284d91bc7fc8f0abc58525dc9 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Tue, 2 Aug 2011 15:54:20 +0300 Subject: KVM: L1 TSC handling KVM assumed in several places that reading the TSC MSR returns the value for L1. This is incorrect, because when L2 is running, the correct TSC read exit emulation is to return L2's value. We therefore add a new x86_ops function, read_l1_tsc, to use in places that specifically need to read the L1 TSC, NOT the TSC of the current level of guest. Note that one change, of one line in kvm_arch_vcpu_load, is made redundant by a different patch sent by Zachary Amsden (and not yet applied): kvm_arch_vcpu_load() should not read the guest TSC, and if it didn't, of course we didn't have to change the call of kvm_get_msr() to read_l1_tsc(). [avi: moved callback to kvm_x86_ops tsc block] Signed-off-by: Nadav Har'El Acked-by: Zachary Amsdem Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ea8f9f0..6b37f18 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1098,7 +1098,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); + tsc_timestamp = kvm_x86_ops->read_l1_tsc(v); kernel_ns = get_kernel_ns(); this_tsc_khz = vcpu_tsc_khz(v); if (unlikely(this_tsc_khz == 0)) { @@ -2218,7 +2218,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) s64 tsc_delta; u64 tsc; - kvm_get_msr(vcpu, MSR_IA32_TSC, &tsc); + tsc = kvm_x86_ops->read_l1_tsc(vcpu); tsc_delta = !vcpu->arch.last_guest_tsc ? 0 : tsc - vcpu->arch.last_guest_tsc; @@ -2242,7 +2242,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); - kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); + vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); } static int is_efer_nx(void) @@ -5729,7 +5729,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); - kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); + vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); -- cgit v1.1 From 7460fb4a340033107530df19e7e125bd0969bfb2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 20 Sep 2011 13:43:14 +0300 Subject: KVM: Fix simultaneous NMIs If simultaneous NMIs happen, we're supposed to queue the second and next (collapsing them), but currently we sometimes collapse the second into the first. Fix by using a counter for pending NMIs instead of a bool; since the counter limit depends on whether the processor is currently in an NMI handler, which can only be checked in vcpu context (via the NMI mask), we add a new KVM_REQ_NMI to request recalculation of the counter. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 48 +++++++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 17 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6b37f18..d51e407 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -83,6 +83,7 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); static void update_cr8_intercept(struct kvm_vcpu *vcpu); static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); +static void process_nmi(struct kvm_vcpu *vcpu); struct kvm_x86_ops *kvm_x86_ops; EXPORT_SYMBOL_GPL(kvm_x86_ops); @@ -359,8 +360,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) void kvm_inject_nmi(struct kvm_vcpu *vcpu) { - kvm_make_request(KVM_REQ_EVENT, vcpu); - vcpu->arch.nmi_pending = 1; + atomic_inc(&vcpu->arch.nmi_queued); + kvm_make_request(KVM_REQ_NMI, vcpu); } EXPORT_SYMBOL_GPL(kvm_inject_nmi); @@ -2827,6 +2828,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { + process_nmi(vcpu); events->exception.injected = vcpu->arch.exception.pending && !kvm_exception_is_soft(vcpu->arch.exception.nr); @@ -2844,7 +2846,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI); events->nmi.injected = vcpu->arch.nmi_injected; - events->nmi.pending = vcpu->arch.nmi_pending; + events->nmi.pending = vcpu->arch.nmi_pending != 0; events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); events->nmi.pad = 0; @@ -2864,6 +2866,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SHADOW)) return -EINVAL; + process_nmi(vcpu); vcpu->arch.exception.pending = events->exception.injected; vcpu->arch.exception.nr = events->exception.nr; vcpu->arch.exception.has_error_code = events->exception.has_error_code; @@ -4763,7 +4766,7 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) kvm_set_rflags(vcpu, ctxt->eflags); if (irq == NMI_VECTOR) - vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_pending = 0; else vcpu->arch.interrupt.pending = false; @@ -5572,7 +5575,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) /* try to inject new event if pending */ if (vcpu->arch.nmi_pending) { if (kvm_x86_ops->nmi_allowed(vcpu)) { - vcpu->arch.nmi_pending = false; + --vcpu->arch.nmi_pending; vcpu->arch.nmi_injected = true; kvm_x86_ops->set_nmi(vcpu); } @@ -5604,10 +5607,26 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) } } +static void process_nmi(struct kvm_vcpu *vcpu) +{ + unsigned limit = 2; + + /* + * x86 is limited to one NMI running, and one NMI pending after it. + * If an NMI is already in progress, limit further NMIs to just one. + * Otherwise, allow two (and we'll inject the first one immediately). + */ + if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected) + limit = 1; + + vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0); + vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit); + kvm_make_request(KVM_REQ_EVENT, vcpu); +} + static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; - bool nmi_pending; bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && vcpu->run->request_interrupt_window; @@ -5647,6 +5666,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) record_steal_time(vcpu); + if (kvm_check_request(KVM_REQ_NMI, vcpu)) + process_nmi(vcpu); } @@ -5654,19 +5675,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (unlikely(r)) goto out; - /* - * An NMI can be injected between local nmi_pending read and - * vcpu->arch.nmi_pending read inside inject_pending_event(). - * But in that case, KVM_REQ_EVENT will be set, which makes - * the race described above benign. - */ - nmi_pending = ACCESS_ONCE(vcpu->arch.nmi_pending); - if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { inject_pending_event(vcpu); /* enable NMI/IRQ window open exits if needed */ - if (nmi_pending) + if (vcpu->arch.nmi_pending) kvm_x86_ops->enable_nmi_window(vcpu); else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) kvm_x86_ops->enable_irq_window(vcpu); @@ -6374,7 +6387,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) { - vcpu->arch.nmi_pending = false; + atomic_set(&vcpu->arch.nmi_queued, 0); + vcpu->arch.nmi_pending = 0; vcpu->arch.nmi_injected = false; vcpu->arch.switch_db_regs = 0; @@ -6649,7 +6663,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) !vcpu->arch.apf.halted) || !list_empty_careful(&vcpu->async_pf.done) || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED - || vcpu->arch.nmi_pending || + || atomic_read(&vcpu->arch.nmi_queued) || (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)); } -- cgit v1.1 From a3e06bbe8445f57eb949e6474c5a9b30f24d2057 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Thu, 22 Sep 2011 16:55:52 +0800 Subject: KVM: emulate lapic tsc deadline timer for guest This patch emulate lapic tsc deadline timer for guest: Enumerate tsc deadline timer capability by CPUID; Enable tsc deadline timer mode by lapic MMIO; Start tsc deadline timer by WRMSR; [jan: use do_div()] [avi: fix for !irqchip_in_kernel()] [marcelo: another fix for !irqchip_in_kernel()] Signed-off-by: Liu, Jinsong Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d51e407..cf26909 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -600,6 +600,8 @@ static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) static void update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; + struct kvm_lapic *apic = vcpu->arch.apic; + u32 timer_mode_mask; best = kvm_find_cpuid_entry(vcpu, 1, 0); if (!best) @@ -611,6 +613,16 @@ static void update_cpuid(struct kvm_vcpu *vcpu) if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) best->ecx |= bit(X86_FEATURE_OSXSAVE); } + + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + best->function == 0x1) { + best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER); + timer_mode_mask = 3 << 17; + } else + timer_mode_mask = 1 << 17; + + if (apic) + apic->lapic_timer.timer_mode_mask = timer_mode_mask; } int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -826,6 +838,7 @@ static u32 msrs_to_save[] = { static unsigned num_msrs_to_save; static u32 emulated_msrs[] = { + MSR_IA32_TSCDEADLINE, MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, MSR_IA32_MCG_CTL, @@ -1001,7 +1014,7 @@ static inline int kvm_tsc_changes_freq(void) return ret; } -static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) +u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) { if (vcpu->arch.virtual_tsc_khz) return vcpu->arch.virtual_tsc_khz; @@ -1565,6 +1578,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: return kvm_x2apic_msr_write(vcpu, msr, data); + case MSR_IA32_TSCDEADLINE: + kvm_set_lapic_tscdeadline_msr(vcpu, data); + break; case MSR_IA32_MISC_ENABLE: vcpu->arch.ia32_misc_enable_msr = data; break; @@ -1894,6 +1910,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: return kvm_x2apic_msr_read(vcpu, msr, pdata); break; + case MSR_IA32_TSCDEADLINE: + data = kvm_get_lapic_tscdeadline_msr(vcpu); + break; case MSR_IA32_MISC_ENABLE: data = vcpu->arch.ia32_misc_enable_msr; break; -- cgit v1.1 From a1b60c1cd913c5ccfb38c717ba0bd22622425fa7 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 6 Sep 2011 18:46:34 +0200 Subject: iommu/core: Convert iommu_found to iommu_present With per-bus iommu_ops the iommu_found function needs to work on a bus_type too. This patch adds a bus_type parameter to that function and converts all call-places. The function is also renamed to iommu_present because the function now checks if an iommu is present for a given bus and does not check for a global iommu anymore. Signed-off-by: Joerg Roedel --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 84a28ea..73c6a42 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -2095,7 +2096,7 @@ int kvm_dev_ioctl_check_extension(long ext) r = 0; break; case KVM_CAP_IOMMU: - r = iommu_found(); + r = iommu_present(&pci_bus_type); break; case KVM_CAP_MCE: r = KVM_MAX_MCE_BANKS; -- cgit v1.1 From 4d25a066b69fb749a39d0d4c610689dd765a0b0e Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 21 Dec 2011 12:28:29 +0100 Subject: KVM: Don't automatically expose the TSC deadline timer in cpuid Unlike all of the other cpuid bits, the TSC deadline timer bit is set unconditionally, regardless of what userspace wants. This is broken in several ways: - if userspace doesn't use KVM_CREATE_IRQCHIP, and doesn't emulate the TSC deadline timer feature, a guest that uses the feature will break - live migration to older host kernels that don't support the TSC deadline timer will cause the feature to be pulled from under the guest's feet; breaking it - guests that are broken wrt the feature will fail. Fix by not enabling the feature automatically; instead report it to userspace. Because the feature depends on KVM_CREATE_IRQCHIP, which we cannot guarantee will be called, we expose it via a KVM_CAP_TSC_DEADLINE_TIMER and not KVM_GET_SUPPORTED_CPUID. Fixes the Illumos guest kernel, which uses the TSC deadline timer feature. [avi: add the KVM_CAP + documentation] Reported-by: Alexey Zaytsev Tested-by: Alexey Zaytsev Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c38efd7..4c938da 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -602,7 +602,6 @@ static void update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; struct kvm_lapic *apic = vcpu->arch.apic; - u32 timer_mode_mask; best = kvm_find_cpuid_entry(vcpu, 1, 0); if (!best) @@ -615,15 +614,12 @@ static void update_cpuid(struct kvm_vcpu *vcpu) best->ecx |= bit(X86_FEATURE_OSXSAVE); } - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - best->function == 0x1) { - best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER); - timer_mode_mask = 3 << 17; - } else - timer_mode_mask = 1 << 17; - - if (apic) - apic->lapic_timer.timer_mode_mask = timer_mode_mask; + if (apic) { + if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER)) + apic->lapic_timer.timer_mode_mask = 3 << 17; + else + apic->lapic_timer.timer_mode_mask = 1 << 17; + } } int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -2135,6 +2131,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_TSC_CONTROL: r = kvm_has_tsc_control; break; + case KVM_CAP_TSC_DEADLINE_TIMER: + r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER); + break; default: r = 0; break; -- cgit v1.1 From 90509a557798a023b3f5c46bebae62aa00e5da2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stephan=20B=C3=A4rwolf?= Date: Thu, 12 Jan 2012 16:43:03 +0100 Subject: KVM: x86: extend "struct x86_emulate_ops" with "get_cpuid" commit bdb42f5afebe208eae90406959383856ae2caf2b upstream. In order to be able to proceed checks on CPU-specific properties within the emulator, function "get_cpuid" is introduced. With "get_cpuid" it is possible to virtually call the guests "cpuid"-opcode without changing the VM's context. [mtosatti: cleanup/beautify code] Signed-off-by: Stephan Baerwolf Signed-off-by: Marcelo Tosatti Signed-off-by: Stefan Bader Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/x86.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4c938da..e04cae1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4655,6 +4655,28 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt, return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); } +static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) +{ + struct kvm_cpuid_entry2 *cpuid = NULL; + + if (eax && ecx) + cpuid = kvm_find_cpuid_entry(emul_to_vcpu(ctxt), + *eax, *ecx); + + if (cpuid) { + *eax = cpuid->eax; + *ecx = cpuid->ecx; + if (ebx) + *ebx = cpuid->ebx; + if (edx) + *edx = cpuid->edx; + return true; + } + + return false; +} + static struct x86_emulate_ops emulate_ops = { .read_std = kvm_read_guest_virt_system, .write_std = kvm_write_guest_virt_system, @@ -4685,6 +4707,7 @@ static struct x86_emulate_ops emulate_ops = { .get_fpu = emulator_get_fpu, .put_fpu = emulator_put_fpu, .intercept = emulator_intercept, + .get_cpuid = emulator_get_cpuid, }; static void cache_all_regs(struct kvm_vcpu *vcpu) -- cgit v1.1 From 645b177cbfce6b695bdbe0b4c131de584821840d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 5 Mar 2012 14:23:29 +0200 Subject: KVM: Ensure all vcpus are consistent with in-kernel irqchip settings (cherry picked from commit 3e515705a1f46beb1c942bb8043c16f8ac7b1e9e) If some vcpus are created before KVM_CREATE_IRQCHIP, then irqchip_in_kernel() and vcpu->arch.apic will be inconsistent, leading to potential NULL pointer dereferences. Fix by: - ensuring that no vcpus are installed when KVM_CREATE_IRQCHIP is called - ensuring that a vcpu has an apic if it is installed after KVM_CREATE_IRQCHIP This is somewhat long winded because vcpu->arch.apic is created without kvm->lock held. Based on earlier patch by Michael Ellerman. Signed-off-by: Michael Ellerman Signed-off-by: Avi Kivity Signed-off-by: Greg Kroah-Hartman Signed-off-by: Ben Hutchings --- arch/x86/kvm/x86.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e04cae1..4fc5323 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3579,6 +3579,9 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EEXIST; if (kvm->arch.vpic) goto create_irqchip_unlock; + r = -EINVAL; + if (atomic_read(&kvm->online_vcpus)) + goto create_irqchip_unlock; r = -ENOMEM; vpic = kvm_create_pic(kvm); if (vpic) { @@ -6486,6 +6489,11 @@ void kvm_arch_check_processor_compat(void *rtn) kvm_x86_ops->check_processor_compatibility(rtn); } +bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) +{ + return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); +} + int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { struct page *page; -- cgit v1.1 From 53f02039f794725f843494a36d82f045d78ba697 Mon Sep 17 00:00:00 2001 From: Petr Matousek Date: Tue, 6 Nov 2012 19:24:07 +0100 Subject: KVM: x86: invalid opcode oops on SET_SREGS with OSXSAVE bit set (CVE-2012-4461) commit 6d1068b3a98519247d8ba4ec85cd40ac136dbdf9 upstream. On hosts without the XSAVE support unprivileged local user can trigger oops similar to the one below by setting X86_CR4_OSXSAVE bit in guest cr4 register using KVM_SET_SREGS ioctl and later issuing KVM_RUN ioctl. invalid opcode: 0000 [#2] SMP Modules linked in: tun ip6table_filter ip6_tables ebtable_nat ebtables ... Pid: 24935, comm: zoog_kvm_monito Tainted: G D 3.2.0-3-686-pae EIP: 0060:[] EFLAGS: 00210246 CPU: 0 EIP is at kvm_arch_vcpu_ioctl_run+0x92a/0xd13 [kvm] EAX: 00000001 EBX: 000f387e ECX: 00000000 EDX: 00000000 ESI: 00000000 EDI: 00000000 EBP: ef5a0060 ESP: d7c63e70 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 Process zoog_kvm_monito (pid: 24935, ti=d7c62000 task=ed84a0c0 task.ti=d7c62000) Stack: 00000001 f70a1200 f8b940a9 ef5a0060 00000000 00200202 f8769009 00000000 ef5a0060 000f387e eda5c020 8722f9c8 00015bae 00000000 ed84a0c0 ed84a0c0 c12bf02d 0000ae80 ef7f8740 fffffffb f359b740 ef5a0060 f8b85dc1 0000ae80 Call Trace: [] ? kvm_arch_vcpu_ioctl_set_sregs+0x2fe/0x308 [kvm] ... [] ? syscall_call+0x7/0xb Code: 89 e8 e8 14 ee ff ff ba 00 00 04 00 89 e8 e8 98 48 ff ff 85 c0 74 1e 83 7d 48 00 75 18 8b 85 08 07 00 00 31 c9 8b 95 0c 07 00 00 <0f> 01 d1 c7 45 48 01 00 00 00 c7 45 1c 01 00 00 00 0f ae f0 89 EIP: [] kvm_arch_vcpu_ioctl_run+0x92a/0xd13 [kvm] SS:ESP 0068:d7c63e70 QEMU first retrieves the supported features via KVM_GET_SUPPORTED_CPUID and then sets them later. So guest's X86_FEATURE_XSAVE should be masked out on hosts without X86_FEATURE_XSAVE, making kvm_set_cr4 with X86_CR4_OSXSAVE fail. Userspaces that allow specifying guest cpuid with X86_FEATURE_XSAVE even on hosts that do not support it, might be susceptible to this attack from inside the guest as well. Allow setting X86_CR4_OSXSAVE bit only if host has XSAVE support. Signed-off-by: Petr Matousek Signed-off-by: Marcelo Tosatti [bwh: Backported to 3.2: both functions are in arch/x86/kvm/x86.c] Signed-off-by: Ben Hutchings --- arch/x86/kvm/x86.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4fc5323..f4063fd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -578,6 +578,9 @@ static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; + if (!static_cpu_has(X86_FEATURE_XSAVE)) + return 0; + best = kvm_find_cpuid_entry(vcpu, 1, 0); return best && (best->ecx & bit(X86_FEATURE_XSAVE)); } @@ -6149,6 +6152,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int pending_vec, max_bits, idx; struct desc_ptr dt; + if (!guest_cpuid_has_xsave(vcpu) && (sregs->cr4 & X86_CR4_OSXSAVE)) + return -EINVAL; + dt.size = sregs->idt.limit; dt.address = sregs->idt.base; kvm_x86_ops->set_idt(vcpu, &dt); -- cgit v1.1 From b7c5ee6d49b7cf5a52ae87b955d7ab984cb9c974 Mon Sep 17 00:00:00 2001 From: Andy Honig Date: Mon, 11 Mar 2013 09:34:52 -0700 Subject: KVM: x86: fix for buffer overflow in handling of MSR_KVM_SYSTEM_TIME (CVE-2013-1796) commit c300aa64ddf57d9c5d9c898a64b36877345dd4a9 upstream. If the guest sets the GPA of the time_page so that the request to update the time straddles a page then KVM will write onto an incorrect page. The write is done byusing kmap atomic to get a pointer to the page for the time structure and then performing a memcpy to that page starting at an offset that the guest controls. Well behaved guests always provide a 32-byte aligned address, however a malicious guest could use this to corrupt host kernel memory. Tested: Tested against kvmclock unit test. Signed-off-by: Andrew Honig Signed-off-by: Marcelo Tosatti Signed-off-by: Ben Hutchings --- arch/x86/kvm/x86.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f4063fd..50bac2e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1603,6 +1603,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* ...but clean it before doing the actual write */ vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); + /* Check that the address is 32-byte aligned. */ + if (vcpu->arch.time_offset & + (sizeof(struct pvclock_vcpu_time_info) - 1)) + break; + vcpu->arch.time_page = gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); -- cgit v1.1 From 767d3d43c0a02485a8574c0efe39524f246d698b Mon Sep 17 00:00:00 2001 From: Andy Honig Date: Wed, 20 Feb 2013 14:48:10 -0800 Subject: KVM: x86: Convert MSR_KVM_SYSTEM_TIME to use gfn_to_hva_cache functions (CVE-2013-1797) commit 0b79459b482e85cb7426aa7da683a9f2c97aeae1 upstream. There is a potential use after free issue with the handling of MSR_KVM_SYSTEM_TIME. If the guest specifies a GPA in a movable or removable memory such as frame buffers then KVM might continue to write to that address even after it's removed via KVM_SET_USER_MEMORY_REGION. KVM pins the page in memory so it's unlikely to cause an issue, but if the user space component re-purposes the memory previously used for the guest, then the guest will be able to corrupt that memory. Tested: Tested against kvmclock unit test Signed-off-by: Andrew Honig Signed-off-by: Marcelo Tosatti [bwh: Backported to 3.2: - Adjust context - We do not implement the PVCLOCK_GUEST_STOPPED flag] Signed-off-by: Ben Hutchings --- arch/x86/kvm/x86.c | 39 ++++++++++++++------------------------- 1 file changed, 14 insertions(+), 25 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 50bac2e..2dd2e4e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1105,7 +1105,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) { unsigned long flags; struct kvm_vcpu_arch *vcpu = &v->arch; - void *shared_kaddr; unsigned long this_tsc_khz; s64 kernel_ns, max_kernel_ns; u64 tsc_timestamp; @@ -1141,7 +1140,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) local_irq_restore(flags); - if (!vcpu->time_page) + if (!vcpu->pv_time_enabled) return 0; /* @@ -1199,14 +1198,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) */ vcpu->hv_clock.version += 2; - shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); - - memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); - - kunmap_atomic(shared_kaddr, KM_USER0); - - mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock)); return 0; } @@ -1496,10 +1490,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) static void kvmclock_reset(struct kvm_vcpu *vcpu) { - if (vcpu->arch.time_page) { - kvm_release_page_dirty(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } + vcpu->arch.pv_time_enabled = false; } static void accumulate_steal_time(struct kvm_vcpu *vcpu) @@ -1591,6 +1582,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; case MSR_KVM_SYSTEM_TIME_NEW: case MSR_KVM_SYSTEM_TIME: { + u64 gpa_offset; kvmclock_reset(vcpu); vcpu->arch.time = data; @@ -1600,21 +1592,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (!(data & 1)) break; - /* ...but clean it before doing the actual write */ - vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); + gpa_offset = data & ~(PAGE_MASK | 1); /* Check that the address is 32-byte aligned. */ - if (vcpu->arch.time_offset & - (sizeof(struct pvclock_vcpu_time_info) - 1)) + if (gpa_offset & (sizeof(struct pvclock_vcpu_time_info) - 1)) break; - vcpu->arch.time_page = - gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); - - if (is_error_page(vcpu->arch.time_page)) { - kvm_release_page_clean(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, + &vcpu->arch.pv_time, data & ~1ULL)) + vcpu->arch.pv_time_enabled = false; + else + vcpu->arch.pv_time_enabled = true; break; } case MSR_KVM_ASYNC_PF_EN: @@ -6554,6 +6542,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) goto fail_free_mce_banks; + vcpu->arch.pv_time_enabled = false; kvm_async_pf_hash_reset(vcpu); return 0; -- cgit v1.1 From c471da1e3f5c6e43397dccf47cefd8edc86aa9f0 Mon Sep 17 00:00:00 2001 From: Andrew Honig Date: Fri, 29 Mar 2013 09:35:21 -0700 Subject: KVM: Allow cross page reads and writes from cached translations. commit 8f964525a121f2ff2df948dac908dcc65be21b5b upstream. This patch adds support for kvm_gfn_to_hva_cache_init functions for reads and writes that will cross a page. If the range falls within the same memslot, then this will be a fast operation. If the range is split between two memslots, then the slower kvm_read_guest and kvm_write_guest are used. Tested: Test against kvm_clock unit tests. Signed-off-by: Andrew Honig Signed-off-by: Gleb Natapov [bwh: Backported to 3.2: - Drop change in lapic.c - Keep using __gfn_to_memslot() in kvm_gfn_to_hva_cache_init()] Signed-off-by: Ben Hutchings --- arch/x86/kvm/x86.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2dd2e4e..e82a53a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1480,7 +1480,8 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) return 0; } - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa)) + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, + sizeof(u32))) return 1; vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); @@ -1594,12 +1595,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) gpa_offset = data & ~(PAGE_MASK | 1); - /* Check that the address is 32-byte aligned. */ - if (gpa_offset & (sizeof(struct pvclock_vcpu_time_info) - 1)) - break; - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, - &vcpu->arch.pv_time, data & ~1ULL)) + &vcpu->arch.pv_time, data & ~1ULL, + sizeof(struct pvclock_vcpu_time_info))) vcpu->arch.pv_time_enabled = false; else vcpu->arch.pv_time_enabled = true; @@ -1618,7 +1616,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) return 1; if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, - data & KVM_STEAL_VALID_BITS)) + data & KVM_STEAL_VALID_BITS, + sizeof(struct kvm_steal_time))) return 1; vcpu->arch.st.msr_val = data; -- cgit v1.1 From 7225ebc3c5bba27934da28579931ef46c5012411 Mon Sep 17 00:00:00 2001 From: "Zhanghaoyu (A)" Date: Fri, 14 Jun 2013 07:36:13 +0000 Subject: KVM: x86: remove vcpu's CPL check in host-invoked XCR set commit 764bcbc5a6d7a2f3e75c9f0e4caa984e2926e346 upstream. __kvm_set_xcr function does the CPL check when set xcr. __kvm_set_xcr is called in two flows, one is invoked by guest, call stack shown as below, handle_xsetbv(or xsetbv_interception) kvm_set_xcr __kvm_set_xcr the other one is invoked by host, for example during system reset: kvm_arch_vcpu_ioctl kvm_vcpu_ioctl_x86_set_xcrs __kvm_set_xcr The former does need the CPL check, but the latter does not. Signed-off-by: Zhang Haoyu [Tweaks to commit message. - Paolo] Signed-off-by: Paolo Bonzini Signed-off-by: Ben Hutchings --- arch/x86/kvm/x86.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e82a53a..57867e4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -551,8 +551,6 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) if (index != XCR_XFEATURE_ENABLED_MASK) return 1; xcr0 = xcr; - if (kvm_x86_ops->get_cpl(vcpu) != 0) - return 1; if (!(xcr0 & XSTATE_FP)) return 1; if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) @@ -566,7 +564,8 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { - if (__kvm_set_xcr(vcpu, index, xcr)) { + if (kvm_x86_ops->get_cpl(vcpu) != 0 || + __kvm_set_xcr(vcpu, index, xcr)) { kvm_inject_gp(vcpu, 0); return 1; } -- cgit v1.1 From 6aa82e036079eaf208bd581c201dc61c9200bb2e Mon Sep 17 00:00:00 2001 From: Andy Honig Date: Wed, 20 Nov 2013 10:23:22 -0800 Subject: KVM: x86: Convert vapic synchronization to _cached functions (CVE-2013-6368) commit fda4e2e85589191b123d31cdc21fd33ee70f50fd upstream. In kvm_lapic_sync_from_vapic and kvm_lapic_sync_to_vapic there is the potential to corrupt kernel memory if userspace provides an address that is at the end of a page. This patches concerts those functions to use kvm_write_guest_cached and kvm_read_guest_cached. It also checks the vapic_address specified by userspace during ioctl processing and returns an error to userspace if the address is not a valid GPA. This is generally not guest triggerable, because the required write is done by firmware that runs before the guest. Also, it only affects AMD processors and oldish Intel that do not have the FlexPriority feature (unless you disable FlexPriority, of course; then newer processors are also affected). Fixes: b93463aa59d6 ('KVM: Accelerated apic support') Reported-by: Andrew Honig Signed-off-by: Andrew Honig Signed-off-by: Paolo Bonzini [dannf: backported to Debian's 3.2] Signed-off-by: Ben Hutchings --- arch/x86/kvm/x86.c | 33 +-------------------------------- 1 file changed, 1 insertion(+), 32 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 57867e4..7774cca 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3140,8 +3140,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&va, argp, sizeof va)) goto out; - r = 0; - kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); + r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); break; } case KVM_X86_SETUP_MCE: { @@ -5537,33 +5536,6 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) !kvm_event_needs_reinjection(vcpu); } -static void vapic_enter(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - struct page *page; - - if (!apic || !apic->vapic_addr) - return; - - page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - - vcpu->arch.apic->vapic_page = page; -} - -static void vapic_exit(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - int idx; - - if (!apic || !apic->vapic_addr) - return; - - idx = srcu_read_lock(&vcpu->kvm->srcu); - kvm_release_page_dirty(apic->vapic_page); - mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - srcu_read_unlock(&vcpu->kvm->srcu, idx); -} - static void update_cr8_intercept(struct kvm_vcpu *vcpu) { int max_irr, tpr; @@ -5836,7 +5808,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) } vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - vapic_enter(vcpu); r = 1; while (r > 0) { @@ -5893,8 +5864,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - vapic_exit(vcpu); - return r; } -- cgit v1.1 From 7a9175abf3816d07fc8b87d74f3b4e4f19728f69 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 6 Jan 2014 12:00:02 -0200 Subject: KVM: x86: limit PIT timer frequency commit 9ed96e87c5748de4c2807ef17e81287c7304186c upstream. Limit PIT timer frequency similarly to the limit applied by LAPIC timer. Reviewed-by: Jan Kiszka Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini [bwh: Backported to 3.2: - Adjust context - s/ps->period/pt->period/] Signed-off-by: Ben Hutchings --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7774cca..b9fefaf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -92,6 +92,9 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops); int ignore_msrs = 0; module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); +unsigned int min_timer_period_us = 500; +module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); + bool kvm_has_tsc_control; EXPORT_SYMBOL_GPL(kvm_has_tsc_control); u32 kvm_max_guest_tsc_khz; -- cgit v1.1 From 76715b56c6fcdafae8d47d4fcfe8c940e76f0553 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Tue, 16 Sep 2014 03:24:05 +0300 Subject: KVM: x86: Check non-canonical addresses upon WRMSR commit 854e8bb1aa06c578c2c9145fa6bfe3680ef63b23 upstream. Upon WRMSR, the CPU should inject #GP if a non-canonical value (address) is written to certain MSRs. The behavior is "almost" identical for AMD and Intel (ignoring MSRs that are not implemented in either architecture since they would anyhow #GP). However, IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if non-canonical address is written on Intel but not on AMD (which ignores the top 32-bits). Accordingly, this patch injects a #GP on the MSRs which behave identically on Intel and AMD. To eliminate the differences between the architecutres, the value which is written to IA32_SYSENTER_ESP and IA32_SYSENTER_EIP is turned to canonical value before writing instead of injecting a #GP. Some references from Intel and AMD manuals: According to Intel SDM description of WRMSR instruction #GP is expected on WRMSR "If the source register contains a non-canonical address and ECX specifies one of the following MSRs: IA32_DS_AREA, IA32_FS_BASE, IA32_GS_BASE, IA32_KERNEL_GS_BASE, IA32_LSTAR, IA32_SYSENTER_EIP, IA32_SYSENTER_ESP." According to AMD manual instruction manual: LSTAR/CSTAR (SYSCALL): "The WRMSR instruction loads the target RIP into the LSTAR and CSTAR registers. If an RIP written by WRMSR is not in canonical form, a general-protection exception (#GP) occurs." IA32_GS_BASE and IA32_FS_BASE (WRFSBASE/WRGSBASE): "The address written to the base field must be in canonical form or a #GP fault will occur." IA32_KERNEL_GS_BASE (SWAPGS): "The address stored in the KernelGSbase MSR must be in canonical form." This patch fixes CVE-2014-3610. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini [bwh: Backported to 3.2: - The various set_msr() functions all separate msr_index and data parameters] Signed-off-by: Ben Hutchings --- arch/x86/kvm/x86.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b9fefaf..2d7d0df 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -893,7 +893,6 @@ void kvm_enable_efer_bits(u64 mask) } EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); - /* * Writes msr value into into the appropriate "register". * Returns 0 on success, non-0 otherwise. @@ -901,8 +900,34 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); */ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { + switch (msr_index) { + case MSR_FS_BASE: + case MSR_GS_BASE: + case MSR_KERNEL_GS_BASE: + case MSR_CSTAR: + case MSR_LSTAR: + if (is_noncanonical_address(data)) + return 1; + break; + case MSR_IA32_SYSENTER_EIP: + case MSR_IA32_SYSENTER_ESP: + /* + * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if + * non-canonical address is written on Intel but not on + * AMD (which ignores the top 32-bits, because it does + * not implement 64-bit SYSENTER). + * + * 64-bit code should hence be able to write a non-canonical + * value on AMD. Making the address canonical ensures that + * vmentry does not fail on Intel after writing a non-canonical + * value, and that something deterministic happens if the guest + * invokes 64-bit SYSENTER. + */ + data = get_canonical(data); + } return kvm_x86_ops->set_msr(vcpu, msr_index, data); } +EXPORT_SYMBOL_GPL(kvm_set_msr); /* * Adapt set_msr() to msr_io()'s calling convention -- cgit v1.1 From 1aded21661bda559a407cfb7c69d0e53b72bc671 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Wed, 17 Sep 2014 02:50:50 +0300 Subject: KVM: x86: Don't report guest userspace emulation error to userspace commit a2b9e6c1a35afcc0973acb72e591c714e78885ff upstream. Commit fc3a9157d314 ("KVM: X86: Don't report L2 emulation failures to user-space") disabled the reporting of L2 (nested guest) emulation failures to userspace due to race-condition between a vmexit and the instruction emulator. The same rational applies also to userspace applications that are permitted by the guest OS to access MMIO area or perform PIO. This patch extends the current behavior - of injecting a #UD instead of reporting it to userspace - also for guest userspace code. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini Signed-off-by: Ben Hutchings --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2d7d0df..bb179cc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4846,7 +4846,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu) ++vcpu->stat.insn_emulation_fail; trace_kvm_emulate_insn_failed(vcpu); - if (!is_guest_mode(vcpu)) { + if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; -- cgit v1.1 From 1ddf94afb9252e2845c41c78df59216ed4bd2fe6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 18 Sep 2015 17:33:04 +0200 Subject: KVM: x86: trap AMD MSRs for the TSeg base and mask commit 3afb1121800128aae9f5722e50097fcf1a9d4d88 upstream. These have roughly the same purpose as the SMRR, which we do not need to implement in KVM. However, Linux accesses MSR_K8_TSEG_ADDR at boot, which causes problems when running a Xen dom0 under KVM. Just return 0, meaning that processor protection of SMRAM is not in effect. Reported-by: M A Young Acked-by: Borislav Petkov Signed-off-by: Paolo Bonzini Signed-off-by: Ben Hutchings --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bb179cc..0e3289b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1885,6 +1885,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_LASTINTFROMIP: case MSR_IA32_LASTINTTOIP: case MSR_K8_SYSCFG: + case MSR_K8_TSEG_ADDR: + case MSR_K8_TSEG_MASK: case MSR_K7_HWCR: case MSR_VM_HSAVE_PA: case MSR_P6_PERFCTR0: -- cgit v1.1