/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * nestedsvm.c: Nested Virtualization
 * Copyright (c) 2011, Advanced Micro Devices, Inc
 *
 */

#include
#include
#include
#include
#include
#include <asm/paging.h> /* paging_mode_hap */
#include <asm/event.h>  /* for local_event_delivery_(en|dis)able */
#include <asm/p2m.h>    /* p2m_get_pagetable, p2m_get_nestedp2m */

#include "nestedhvm.h"
#include "svm.h"

#define NSVM_ERROR_VVMCB   1
#define NSVM_ERROR_VMENTRY 2

static void nestedsvm_vcpu_clgi(struct vcpu *v)
{
    /* clear gif flag */
    vcpu_nestedsvm(v).ns_gif = 0;
    local_event_delivery_disable(); /* mask events for PV drivers */
}

static void nestedsvm_vcpu_stgi(struct vcpu *v)
{
    /* enable gif flag */
    vcpu_nestedsvm(v).ns_gif = 1;
    local_event_delivery_enable(); /* unmask events for PV drivers */
}

int nestedsvm_vmcb_map(struct vcpu *v, uint64_t vmcbaddr)
{
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);

    if ( nv->nv_vvmcx != NULL && nv->nv_vvmcxaddr != vmcbaddr )
    {
        ASSERT(vvmcx_valid(v));
        hvm_unmap_guest_frame(nv->nv_vvmcx, 1);
        nv->nv_vvmcx = NULL;
        nv->nv_vvmcxaddr = INVALID_PADDR;
    }

    if ( !nv->nv_vvmcx )
    {
        bool_t writable;
        void *vvmcx = hvm_map_guest_frame_rw(paddr_to_pfn(vmcbaddr), 1,
                                             &writable);

        if ( !vvmcx )
            return 0;
        if ( !writable )
        {
            hvm_unmap_guest_frame(vvmcx, 1);
            return 0;
        }
        nv->nv_vvmcx = vvmcx;
        nv->nv_vvmcxaddr = vmcbaddr;
    }

    return 1;
}

/* Interface methods */
int cf_check nsvm_vcpu_initialise(struct vcpu *v)
{
    void *msrpm;
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
    struct nestedsvm *svm = &vcpu_nestedsvm(v);

    msrpm = alloc_xenheap_pages(get_order_from_bytes(MSRPM_SIZE), 0);
    svm->ns_cached_msrpm = msrpm;
    if ( msrpm == NULL )
        goto err;
    memset(msrpm, 0x0, MSRPM_SIZE);

    msrpm = alloc_xenheap_pages(get_order_from_bytes(MSRPM_SIZE), 0);
    svm->ns_merged_msrpm = msrpm;
    if ( msrpm == NULL )
        goto err;
    memset(msrpm, 0x0, MSRPM_SIZE);

    nv->nv_n2vmcx = alloc_vmcb();
    if ( nv->nv_n2vmcx == NULL )
        goto err;
    nv->nv_n2vmcx_pa = virt_to_maddr(nv->nv_n2vmcx);

    return 0;

 err:
    nsvm_vcpu_destroy(v);
    return -ENOMEM;
}

void cf_check nsvm_vcpu_destroy(struct vcpu *v)
{
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
    struct nestedsvm *svm = &vcpu_nestedsvm(v);

    /*
     * When destroying the vcpu, it may be running on behalf of l2 guest.
     * Therefore we need to switch the VMCB pointer back to the l1 vmcb,
     * in order to avoid double free of l2 vmcb and the possible memory leak
     * of l1 vmcb page.
     */
    if ( nv->nv_n1vmcx )
        v->arch.hvm.svm.vmcb = nv->nv_n1vmcx;

    if ( svm->ns_cached_msrpm )
    {
        free_xenheap_pages(svm->ns_cached_msrpm,
                           get_order_from_bytes(MSRPM_SIZE));
        svm->ns_cached_msrpm = NULL;
    }

    if ( svm->ns_merged_msrpm )
    {
        free_xenheap_pages(svm->ns_merged_msrpm,
                           get_order_from_bytes(MSRPM_SIZE));
        svm->ns_merged_msrpm = NULL;
    }

    hvm_unmap_guest_frame(nv->nv_vvmcx, 1);
    nv->nv_vvmcx = NULL;

    if ( nv->nv_n2vmcx )
    {
        free_vmcb(nv->nv_n2vmcx);
        nv->nv_n2vmcx = NULL;
        nv->nv_n2vmcx_pa = INVALID_PADDR;
    }

    svm->ns_iomap = NULL;
}

int cf_check nsvm_vcpu_reset(struct vcpu *v)
{
    struct nestedsvm *svm = &vcpu_nestedsvm(v);

    svm->ns_msr_hsavepa = INVALID_PADDR;
    svm->ns_ovvmcb_pa = INVALID_PADDR;

    svm->ns_tscratio = DEFAULT_TSC_RATIO;

    svm->ns_cr_intercepts = 0;
    svm->ns_dr_intercepts = 0;
    svm->ns_exception_intercepts = 0;
    svm->ns_general1_intercepts = 0;
    svm->ns_general2_intercepts = 0;
    svm->ns_hap_enabled = 0;
    svm->ns_vmcb_guestcr3 = 0;
    svm->ns_vmcb_hostcr3 = 0;
    svm->ns_guest_asid = 0;
    svm->ns_hostflags.bytes = 0;
    svm->ns_vmexit.exitinfo1 = 0;
    svm->ns_vmexit.exitinfo2 = 0;

    svm->ns_iomap = NULL;

    nestedsvm_vcpu_stgi(v);
    return 0;
}

static uint64_t nestedsvm_fpu_vmentry(uint64_t n1cr0,
                                      struct vmcb_struct *vvmcb,
                                      struct vmcb_struct *n1vmcb,
                                      struct vmcb_struct *n2vmcb)
{
    uint64_t vcr0;

    vcr0 = vvmcb->_cr0;
    if ( !(n1cr0 & X86_CR0_TS) && (n1vmcb->_cr0 & X86_CR0_TS) )
    {
        /*
         * svm_fpu_leave() ran while l1 guest was running.
         * Sync FPU state with l2 guest.
         */
        vcr0 |= X86_CR0_TS;
        n2vmcb->_exception_intercepts |= (1U << X86_EXC_NM);
    }
    else if ( !(vcr0 & X86_CR0_TS) && (n2vmcb->_cr0 & X86_CR0_TS) )
    {
        /*
         * svm_fpu_enter() ran while l1 guest was running.
         * Sync FPU state with l2 guest.
         */
        vcr0 &= ~X86_CR0_TS;
        n2vmcb->_exception_intercepts &= ~(1U << X86_EXC_NM);
    }

    return vcr0;
}

static void nestedsvm_fpu_vmexit(struct vmcb_struct *n1vmcb,
                                 struct vmcb_struct *n2vmcb,
                                 uint64_t n1cr0, uint64_t guest_cr0)
{
    if ( !(guest_cr0 & X86_CR0_TS) && (n2vmcb->_cr0 & X86_CR0_TS) )
    {
        /*
         * svm_fpu_leave() ran while l2 guest was running.
         * Sync FPU state with l1 guest.
         */
        n1vmcb->_cr0 |= X86_CR0_TS;
        n1vmcb->_exception_intercepts |= (1U << X86_EXC_NM);
    }
    else if ( !(n1cr0 & X86_CR0_TS) && (n1vmcb->_cr0 & X86_CR0_TS) )
    {
        /*
         * svm_fpu_enter() ran while l2 guest was running.
         * Sync FPU state with l1 guest.
         */
        n1vmcb->_cr0 &= ~X86_CR0_TS;
        n1vmcb->_exception_intercepts &= ~(1U << X86_EXC_NM);
    }
}

static int nsvm_vcpu_hostsave(struct vcpu *v, unsigned int inst_len)
{
    struct nestedsvm *svm = &vcpu_nestedsvm(v);
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
    struct vmcb_struct *n1vmcb;

    n1vmcb = nv->nv_n1vmcx;
    ASSERT(n1vmcb != NULL);

    n1vmcb->rip += inst_len;

    /*
     * Save shadowed values. This ensures that the l1 guest
     * cannot override them to break out.
     */
    n1vmcb->_efer = v->arch.hvm.guest_efer;
    n1vmcb->_cr0 = v->arch.hvm.guest_cr[0];
    n1vmcb->_cr2 = v->arch.hvm.guest_cr[2];
    n1vmcb->_cr4 = v->arch.hvm.guest_cr[4];

    /* Remember the host interrupt flag */
    svm->ns_hostflags.fields.rflagsif = !!(n1vmcb->rflags & X86_EFLAGS_IF);

    return 0;
}

static int nsvm_vcpu_hostrestore(struct vcpu *v, struct cpu_user_regs *regs)
{
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
    struct nestedsvm *svm = &vcpu_nestedsvm(v);
    struct vmcb_struct *n1vmcb, *n2vmcb;
    int rc;

    n1vmcb = nv->nv_n1vmcx;
    n2vmcb = nv->nv_n2vmcx;
    ASSERT(n1vmcb != NULL);
    ASSERT(n2vmcb != NULL);

    /*
     * nsvm_vmcb_prepare4vmexit() already saved register values
     * handled by VMSAVE/VMLOAD into n1vmcb directly.
     */
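    /*
     * Overview of the emulated #VMEXIT path as implemented in this file
     * (see nestedsvm_vmexit_n2n1() below): nsvm_vmcb_prepare4vmexit() copies
     * the l2 state out of the n2 VMCB into the l1 guest's virtual VMCB, then
     * this function reloads the l1 state that nsvm_vcpu_hostsave() stashed
     * in the n1 VMCB when the VMRUN was emulated.
     */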
    /* switch vmcb to l1 guest's vmcb */
    v->arch.hvm.svm.vmcb = n1vmcb;
    v->arch.hvm.svm.vmcb_pa = nv->nv_n1vmcx_pa;

    /* EFER */
    v->arch.hvm.guest_efer = n1vmcb->_efer;
    rc = hvm_set_efer(n1vmcb->_efer);
    if ( rc == X86EMUL_EXCEPTION )
        hvm_inject_hw_exception(X86_EXC_GP, 0);
    if ( rc != X86EMUL_OKAY )
        gdprintk(XENLOG_ERR, "hvm_set_efer failed, rc: %u\n", rc);

    /* CR4 */
    v->arch.hvm.guest_cr[4] = n1vmcb->_cr4;
    rc = hvm_set_cr4(n1vmcb->_cr4, true);
    if ( rc == X86EMUL_EXCEPTION )
        hvm_inject_hw_exception(X86_EXC_GP, 0);
    if ( rc != X86EMUL_OKAY )
        gdprintk(XENLOG_ERR, "hvm_set_cr4 failed, rc: %u\n", rc);

    /* CR0 */
    nestedsvm_fpu_vmexit(n1vmcb, n2vmcb,
                         svm->ns_cr0, v->arch.hvm.guest_cr[0]);
    v->arch.hvm.guest_cr[0] = n1vmcb->_cr0 | X86_CR0_PE;
    n1vmcb->rflags &= ~X86_EFLAGS_VM;
    rc = hvm_set_cr0(n1vmcb->_cr0 | X86_CR0_PE, true);
    if ( rc == X86EMUL_EXCEPTION )
        hvm_inject_hw_exception(X86_EXC_GP, 0);
    if ( rc != X86EMUL_OKAY )
        gdprintk(XENLOG_ERR, "hvm_set_cr0 failed, rc: %u\n", rc);
    svm->ns_cr0 = v->arch.hvm.guest_cr[0];

    /* CR2 */
    v->arch.hvm.guest_cr[2] = n1vmcb->_cr2;
    hvm_update_guest_cr(v, 2);

    /* CR3 */
    /* Nested paging mode */
    if ( nestedhvm_paging_mode_hap(v) )
    {
        /* host nested paging + guest nested paging. */
        /* hvm_set_cr3() below sets v->arch.hvm.guest_cr[3] for us. */
    }
    else if ( paging_mode_hap(v->domain) )
    {
        /* host nested paging + guest shadow paging. */
        /* hvm_set_cr3() below sets v->arch.hvm.guest_cr[3] for us. */
    }
    else
    {
        /* host shadow paging + guest shadow paging. */

        /* Reset MMU context -- XXX (hostrestore) not yet working */
        if ( !pagetable_is_null(v->arch.guest_table) )
            put_page(pagetable_get_page(v->arch.guest_table));
        v->arch.guest_table = pagetable_null();
        /* hvm_set_cr3() below sets v->arch.hvm.guest_cr[3] for us. */
    }
    rc = hvm_set_cr3(n1vmcb->_cr3, false, true);
    if ( rc == X86EMUL_EXCEPTION )
        hvm_inject_hw_exception(X86_EXC_GP, 0);
    if ( rc != X86EMUL_OKAY )
        gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc);

    regs->rax = n1vmcb->rax;
    regs->rsp = n1vmcb->rsp;
    regs->rip = n1vmcb->rip;
    regs->rflags = n1vmcb->rflags;
    n1vmcb->_dr7 = 0; /* disable all breakpoints */
    n1vmcb->_cpl = 0;

    /*
     * Clear exitintinfo to prevent a fault loop of re-injecting
     * exceptions forever.
     */
    n1vmcb->exit_int_info.raw = 0;

    /* Cleanbits */
    n1vmcb->cleanbits.raw = 0;

    return 0;
}

static int nsvm_vmrun_permissionmap(struct vcpu *v, bool_t viopm)
{
    struct svm_vcpu *arch_svm = &v->arch.hvm.svm;
    struct nestedsvm *svm = &vcpu_nestedsvm(v);
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
    struct vmcb_struct *ns_vmcb = nv->nv_vvmcx;
    struct vmcb_struct *host_vmcb = arch_svm->vmcb;
    unsigned long *ns_msrpm_ptr;
    unsigned int i;
    enum hvm_translation_result ret;
    unsigned long *ns_viomap;
    bool_t ioport_80 = 1, ioport_ed = 1;

    ns_msrpm_ptr = (unsigned long *)svm->ns_cached_msrpm;

    ret = hvm_copy_from_guest_phys(svm->ns_cached_msrpm,
                                   ns_vmcb->_msrpm_base_pa, MSRPM_SIZE);
    if ( ret != HVMTRANS_okay )
    {
        gdprintk(XENLOG_ERR, "hvm_copy_from_guest_phys msrpm %u\n", ret);
        return 1;
    }

    /*
     * Check the l1 guest's io permission map and get a shadow one based on
     * whether the l1 guest intercepts io ports 0x80 and/or 0xED.
     */
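    /*
     * Why only ports 0x80 and 0xED: nestedhvm_vcpu_iomap_get() (common
     * nested-HVM code, not shown here) hands back one of a few pre-built
     * host I/O permission bitmaps which differ only in those two ports, so
     * probing just these two bits of the l1 bitmap is enough to pick the
     * correct shadow bitmap without merging the full bitmap on every VMRUN.
     */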
    svm->ns_oiomap_pa = svm->ns_iomap_pa;
    svm->ns_iomap_pa = ns_vmcb->_iopm_base_pa;

    ns_viomap = hvm_map_guest_frame_ro(svm->ns_iomap_pa >> PAGE_SHIFT, 0);
    if ( ns_viomap )
    {
        ioport_80 = test_bit(0x80, ns_viomap);
        ioport_ed = test_bit(0xed, ns_viomap);
        hvm_unmap_guest_frame(ns_viomap, 0);
    }
    svm->ns_iomap = nestedhvm_vcpu_iomap_get(ioport_80, ioport_ed);

    nv->nv_ioport80 = ioport_80;
    nv->nv_ioportED = ioport_ed;

    /* v->arch.hvm.svm.msrpm has type unsigned long, thus BYTES_PER_LONG. */
    for ( i = 0; i < MSRPM_SIZE / BYTES_PER_LONG; i++ )
        svm->ns_merged_msrpm[i] = arch_svm->msrpm[i] | ns_msrpm_ptr[i];

    host_vmcb->_iopm_base_pa = virt_to_maddr(svm->ns_iomap);
    host_vmcb->_msrpm_base_pa = virt_to_maddr(svm->ns_merged_msrpm);

    return 0;
}

static void nestedsvm_vmcb_set_nestedp2m(struct vcpu *v,
                                         struct vmcb_struct *vvmcb,
                                         struct vmcb_struct *n2vmcb)
{
    struct p2m_domain *p2m;

    ASSERT(v != NULL);
    ASSERT(vvmcb != NULL);
    ASSERT(n2vmcb != NULL);

    /* This will allow nsvm_vcpu_hostcr3() to return correct np2m_base */
    vcpu_nestedsvm(v).ns_vmcb_hostcr3 = vvmcb->_h_cr3;

    p2m = p2m_get_nestedp2m(v);
    n2vmcb->_h_cr3 = pagetable_get_paddr(p2m_get_pagetable(p2m));
}

static int nsvm_vmcb_prepare4vmrun(struct vcpu *v, struct cpu_user_regs *regs)
{
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
    struct nestedsvm *svm = &vcpu_nestedsvm(v);
    struct vmcb_struct *ns_vmcb, *n1vmcb, *n2vmcb;
    vmcbcleanbits_t clean = {};
    int rc;
    uint64_t cr0;

    ns_vmcb = nv->nv_vvmcx;
    n1vmcb = nv->nv_n1vmcx;
    n2vmcb = nv->nv_n2vmcx;
    ASSERT(ns_vmcb != NULL);
    ASSERT(n1vmcb != NULL);
    ASSERT(n2vmcb != NULL);

    /* Check if virtual VMCB cleanbits are valid */
    if ( svm->ns_ovvmcb_pa != INVALID_PADDR &&
         svm->ns_ovvmcb_pa == nv->nv_vvmcxaddr )
        clean = ns_vmcb->cleanbits;

    /* Enable l2 guest intercepts */
    if ( !clean.intercepts )
    {
        svm->ns_cr_intercepts = ns_vmcb->_cr_intercepts;
        svm->ns_dr_intercepts = ns_vmcb->_dr_intercepts;
        svm->ns_exception_intercepts = ns_vmcb->_exception_intercepts;
        svm->ns_general1_intercepts = ns_vmcb->_general1_intercepts;
        svm->ns_general2_intercepts = ns_vmcb->_general2_intercepts;
    }

    /*
     * We could track the cleanbits of the n1vmcb from the last emulated
     * #VMEXIT to this emulated VMRUN to save the merges below. Those
     * cleanbits would be tracked in an integer field in struct nestedsvm.
     * But this effort is not worth doing because:
     * - Only the intercepts bit of the n1vmcb can effectively be used here
     * - The CPU runs more instructions for the tracking than can be
     *   saved here.
     * The overhead comes from (ordered from highest to lowest):
     * - svm_ctxt_switch_to (CPU context switching)
     * - svm_fpu_enter, svm_fpu_leave (lazy FPU switching)
     * - emulated CLGI (clears VINTR intercept)
     * - host clears VINTR intercept
     * Test results show that the overhead is high enough that the
     * tracked intercepts bit of the n1vmcb is practically *always* cleared.
     */
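    /*
     * Merge rule for all of the intercept fields below: bitwise OR, i.e. an
     * intercept requested either by Xen for the l1 guest (n1vmcb) or by the
     * l1 guest for its l2 guest (ns_vmcb) stays active while the l2 guest
     * runs; nestedsvm_check_intercepts() later decides whether a resulting
     * #VMEXIT is handled by Xen or reflected to the l1 guest.
     */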
    n2vmcb->_cr_intercepts =
        n1vmcb->_cr_intercepts | ns_vmcb->_cr_intercepts;
    n2vmcb->_dr_intercepts =
        n1vmcb->_dr_intercepts | ns_vmcb->_dr_intercepts;
    n2vmcb->_exception_intercepts =
        n1vmcb->_exception_intercepts | ns_vmcb->_exception_intercepts;
    n2vmcb->_general1_intercepts =
        n1vmcb->_general1_intercepts | ns_vmcb->_general1_intercepts;
    n2vmcb->_general2_intercepts =
        n1vmcb->_general2_intercepts | ns_vmcb->_general2_intercepts;

    /* Nested Pause Filter */
    if ( ns_vmcb->_general1_intercepts & GENERAL1_INTERCEPT_PAUSE )
        n2vmcb->_pause_filter_count = min(n1vmcb->_pause_filter_count,
                                          ns_vmcb->_pause_filter_count);
    else
        n2vmcb->_pause_filter_count = n1vmcb->_pause_filter_count;

    /* TSC offset */
    n2vmcb->_tsc_offset = n1vmcb->_tsc_offset + ns_vmcb->_tsc_offset;

    /* Nested IO permission bitmaps */
    rc = nsvm_vmrun_permissionmap(v, clean.iopm);
    if ( rc )
        return rc;

    /* ASID - Emulation handled in hvm_asid_handle_vmenter() */

    /* TLB control */
    n2vmcb->tlb_control = ns_vmcb->tlb_control;

    /* Virtual Interrupts */
    if ( !clean.tpr )
    {
        n2vmcb->_vintr = ns_vmcb->_vintr;
        n2vmcb->_vintr.fields.intr_masking = 1;
    }

    /* Interrupt state */
    n2vmcb->int_stat = ns_vmcb->int_stat;

    /* Exit codes */
    n2vmcb->exitcode = ns_vmcb->exitcode;
    n2vmcb->exitinfo1 = ns_vmcb->exitinfo1;
    n2vmcb->exitinfo2 = ns_vmcb->exitinfo2;
    n2vmcb->exit_int_info = ns_vmcb->exit_int_info;

    /* Pending Interrupts */
    n2vmcb->event_inj = ns_vmcb->event_inj;

    n2vmcb->virt_ext.bytes =
        n1vmcb->virt_ext.bytes | ns_vmcb->virt_ext.bytes;

    /* NextRIP - only evaluated on #VMEXIT. */

    /*
     * VMCB Save State Area
     */

    /* Segments */
    if ( !clean.seg )
    {
        n2vmcb->es = ns_vmcb->es;
        n2vmcb->cs = ns_vmcb->cs;
        n2vmcb->ss = ns_vmcb->ss;
        n2vmcb->ds = ns_vmcb->ds;
        /* CPL */
        n2vmcb->_cpl = ns_vmcb->_cpl;
    }

    if ( !clean.dt )
    {
        n2vmcb->gdtr = ns_vmcb->gdtr;
        n2vmcb->idtr = ns_vmcb->idtr;
    }

    /* EFER */
    v->arch.hvm.guest_efer = ns_vmcb->_efer;
    rc = hvm_set_efer(ns_vmcb->_efer);
    if ( rc == X86EMUL_EXCEPTION )
        hvm_inject_hw_exception(X86_EXC_GP, 0);
    if ( rc != X86EMUL_OKAY )
        gdprintk(XENLOG_ERR, "hvm_set_efer failed, rc: %u\n", rc);

    /* CR4 */
    v->arch.hvm.guest_cr[4] = ns_vmcb->_cr4;
    rc = hvm_set_cr4(ns_vmcb->_cr4, true);
    if ( rc == X86EMUL_EXCEPTION )
        hvm_inject_hw_exception(X86_EXC_GP, 0);
    if ( rc != X86EMUL_OKAY )
        gdprintk(XENLOG_ERR, "hvm_set_cr4 failed, rc: %u\n", rc);

    /* CR0 */
    svm->ns_cr0 = v->arch.hvm.guest_cr[0];
    cr0 = nestedsvm_fpu_vmentry(svm->ns_cr0, ns_vmcb, n1vmcb, n2vmcb);
    v->arch.hvm.guest_cr[0] = ns_vmcb->_cr0;
    rc = hvm_set_cr0(cr0, true);
    if ( rc == X86EMUL_EXCEPTION )
        hvm_inject_hw_exception(X86_EXC_GP, 0);
    if ( rc != X86EMUL_OKAY )
        gdprintk(XENLOG_ERR, "hvm_set_cr0 failed, rc: %u\n", rc);

    /* CR2 */
    v->arch.hvm.guest_cr[2] = ns_vmcb->_cr2;
    hvm_update_guest_cr(v, 2);

    /* Nested paging mode */
    if ( nestedhvm_paging_mode_hap(v) )
    {
        /* host nested paging + guest nested paging. */
        n2vmcb->_np_enable = 1;

        nestedsvm_vmcb_set_nestedp2m(v, ns_vmcb, n2vmcb);

        /* hvm_set_cr3() below sets v->arch.hvm.guest_cr[3] for us. */
        rc = hvm_set_cr3(ns_vmcb->_cr3, false, true);
        if ( rc == X86EMUL_EXCEPTION )
            hvm_inject_hw_exception(X86_EXC_GP, 0);
        if ( rc != X86EMUL_OKAY )
            gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc);
    }
    else if ( paging_mode_hap(v->domain) )
    {
        /* host nested paging + guest shadow paging. */
        n2vmcb->_np_enable = 1;
        /* Keep h_cr3 as it is. */
        n2vmcb->_h_cr3 = n1vmcb->_h_cr3;
        /*
         * When l1 guest does shadow paging
         * we assume it intercepts page faults.
         */
        /* hvm_set_cr3() below sets v->arch.hvm.guest_cr[3] for us.
         */
        rc = hvm_set_cr3(ns_vmcb->_cr3, false, true);
        if ( rc == X86EMUL_EXCEPTION )
            hvm_inject_hw_exception(X86_EXC_GP, 0);
        if ( rc != X86EMUL_OKAY )
            gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc);
    }
    else
    {
        /* host shadow paging + guest shadow paging. */
        n2vmcb->_np_enable = 0;
        n2vmcb->_h_cr3 = 0x0;

        /*
         * TODO: Once shadow-shadow paging is in place come back to here
         * and set host_vmcb->_cr3 to the shadowed shadow table.
         */
    }

    /* DRn */
    if ( !clean.dr )
    {
        n2vmcb->_dr7 = ns_vmcb->_dr7;
        n2vmcb->_dr6 = ns_vmcb->_dr6;
    }

    /* RFLAGS */
    n2vmcb->rflags = ns_vmcb->rflags;

    /* RIP */
    n2vmcb->rip = ns_vmcb->rip;

    /* RSP */
    n2vmcb->rsp = ns_vmcb->rsp;

    /* RAX */
    n2vmcb->rax = ns_vmcb->rax;

    /*
     * Keep the host values of the fs, gs, ldtr, tr, kerngsbase,
     * star, lstar, cstar, sfmask, sysenter_cs, sysenter_esp,
     * sysenter_eip. These are handled via VMSAVE/VMLOAD emulation.
     */

    /* PAT */
    if ( !clean.np )
        n2vmcb->_g_pat = ns_vmcb->_g_pat;

    if ( !clean.lbr )
    {
        /* Debug Control MSR */
        n2vmcb->_debugctlmsr = ns_vmcb->_debugctlmsr;

        /* LBR MSRs */
        n2vmcb->_lastbranchfromip = ns_vmcb->_lastbranchfromip;
        n2vmcb->_lastbranchtoip = ns_vmcb->_lastbranchtoip;
        n2vmcb->_lastintfromip = ns_vmcb->_lastintfromip;
        n2vmcb->_lastinttoip = ns_vmcb->_lastinttoip;
    }

    /* Cleanbits */
    n2vmcb->cleanbits.raw = 0;

    rc = svm_vmcb_isvalid(__func__, ns_vmcb, v, true);
    if ( rc )
    {
        gdprintk(XENLOG_ERR, "virtual vmcb invalid\n");
        return NSVM_ERROR_VVMCB;
    }

    rc = svm_vmcb_isvalid(__func__, n2vmcb, v, true);
    if ( rc )
    {
        gdprintk(XENLOG_ERR, "n2vmcb invalid\n");
        return NSVM_ERROR_VMENTRY;
    }

    /* Switch guest registers to l2 guest */
    regs->rax = ns_vmcb->rax;
    regs->rip = ns_vmcb->rip;
    regs->rsp = ns_vmcb->rsp;
    regs->rflags = ns_vmcb->rflags;

    return 0;
}

static int nsvm_vcpu_vmentry(struct vcpu *v, struct cpu_user_regs *regs,
                             unsigned int inst_len)
{
    int ret;
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
    struct nestedsvm *svm = &vcpu_nestedsvm(v);
    struct vmcb_struct *ns_vmcb;

    ns_vmcb = nv->nv_vvmcx;
    ASSERT(ns_vmcb != NULL);
    ASSERT(nv->nv_n2vmcx != NULL);
    ASSERT(nv->nv_n2vmcx_pa != INVALID_PADDR);

    /*
     * Save values for later use. Needed for Nested-on-Nested and
     * Shadow-on-Shadow paging.
     */
    svm->ns_vmcb_guestcr3 = ns_vmcb->_cr3;
    svm->ns_vmcb_hostcr3 = ns_vmcb->_h_cr3;

    /*
     * Convert explicitly to boolean.
     * Deals with l1 guests that use flush-by-asid w/o checking the
     * cpuid bits.
     */
    nv->nv_flushp2m = !!ns_vmcb->tlb_control;
    if ( svm->ns_guest_asid != ns_vmcb->_guest_asid )
    {
        nv->nv_flushp2m = 1;
        hvm_asid_flush_vcpu_asid(&vcpu_nestedhvm(v).nv_n2asid);
        svm->ns_guest_asid = ns_vmcb->_guest_asid;
    }

    /* nested paging for the guest */
    svm->ns_hap_enabled = !!ns_vmcb->_np_enable;

    /* Remember the V_INTR_MASK in hostflags */
    svm->ns_hostflags.fields.vintrmask = !!ns_vmcb->_vintr.fields.intr_masking;

    /* Save l1 guest state (= host state) */
    ret = nsvm_vcpu_hostsave(v, inst_len);
    if ( ret )
    {
        gdprintk(XENLOG_ERR, "hostsave failed, ret = %i\n", ret);
        return ret;
    }

    /* switch vmcb to shadow vmcb */
    v->arch.hvm.svm.vmcb = nv->nv_n2vmcx;
    v->arch.hvm.svm.vmcb_pa = nv->nv_n2vmcx_pa;

    ret = nsvm_vmcb_prepare4vmrun(v, regs);
    if ( ret )
    {
        gdprintk(XENLOG_ERR, "prepare4vmrun failed, ret = %i\n", ret);
        return ret;
    }

    nestedsvm_vcpu_stgi(v);
    return 0;
}

int nsvm_vcpu_vmrun(struct vcpu *v, struct cpu_user_regs *regs)
{
    int ret;
    unsigned int inst_len;
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
    struct nestedsvm *svm = &vcpu_nestedsvm(v);

    inst_len = svm_get_insn_len(v, INSTR_VMRUN);
    if ( inst_len == 0 )
    {
        svm->ns_vmexit.exitcode = VMEXIT_SHUTDOWN;
        return -1;
    }

    nv->nv_vmswitch_in_progress = 1;
    ASSERT(nv->nv_vvmcx != NULL);

    /* save host state */
    ret = nsvm_vcpu_vmentry(v, regs, inst_len);

    /*
     * Switch the vcpu to guest mode. In the error case this ensures
     * the host mode is restored correctly and the l1 guest stays alive.
     */
    nestedhvm_vcpu_enter_guestmode(v);

    switch ( ret )
    {
    case 0:
        break;
    case NSVM_ERROR_VVMCB:
        gdprintk(XENLOG_ERR, "inject VMEXIT(INVALID)\n");
        svm->ns_vmexit.exitcode = VMEXIT_INVALID;
        return -1;
    case NSVM_ERROR_VMENTRY:
    default:
        gdprintk(XENLOG_ERR, "nsvm_vcpu_vmentry failed, injecting #UD\n");
        hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
        /* Must happen after hvm_inject_hw_exception or it doesn't work right. */
        nv->nv_vmswitch_in_progress = 0;
        return 1;
    }

    /* If l1 guest uses shadow paging, update the paging mode.
     */
    if ( !nestedhvm_paging_mode_hap(v) )
        paging_update_paging_modes(v);

    nv->nv_vmswitch_in_progress = 0;
    return 0;
}

static int nsvm_vcpu_vmexit_inject(struct vcpu *v, struct cpu_user_regs *regs,
                                   uint64_t exitcode)
{
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
    struct nestedsvm *svm = &vcpu_nestedsvm(v);
    struct vmcb_struct *ns_vmcb;
    struct vmcb_struct *vmcb = v->arch.hvm.svm.vmcb;

    if ( vmcb->_vintr.fields.vgif_enable )
        ASSERT(vmcb->_vintr.fields.vgif == 0);
    else
        ASSERT(svm->ns_gif == 0);

    ns_vmcb = nv->nv_vvmcx;

    if ( nv->nv_vmexit_pending )
    {
        switch ( exitcode )
        {
        case VMEXIT_INTR:
            if ( unlikely(ns_vmcb->event_inj.v) &&
                 nv->nv_vmentry_pending &&
                 hvm_event_needs_reinjection(ns_vmcb->event_inj.type,
                                             ns_vmcb->event_inj.vector) )
                ns_vmcb->exit_int_info = ns_vmcb->event_inj;
            break;

        case VMEXIT_EXCEPTION_PF:
            ns_vmcb->_cr2 = ns_vmcb->ei.exc.cr2;
            fallthrough;
        case VMEXIT_NPF:
            ns_vmcb->exitinfo2 = svm->ns_vmexit.exitinfo2;
            fallthrough;
        case VMEXIT_EXCEPTION_NP:
        case VMEXIT_EXCEPTION_SS:
        case VMEXIT_EXCEPTION_GP:
        case VMEXIT_EXCEPTION_15:
        case VMEXIT_EXCEPTION_MF:
        case VMEXIT_EXCEPTION_AC:
            ns_vmcb->exitinfo1 = svm->ns_vmexit.exitinfo1;
            break;

        default:
            break;
        }
    }

    ns_vmcb->exitcode = exitcode;
    ns_vmcb->event_inj.raw = 0;
    return 0;
}

int cf_check nsvm_vcpu_vmexit_event(
    struct vcpu *v, const struct x86_event *trap)
{
    ASSERT(vcpu_nestedhvm(v).nv_vvmcx != NULL);
    nestedsvm_vmexit_defer(v, VMEXIT_EXCEPTION_DE + trap->vector,
                           trap->error_code, trap->cr2);
    return NESTEDHVM_VMEXIT_DONE;
}

uint64_t cf_check nsvm_vcpu_hostcr3(struct vcpu *v)
{
    return vcpu_nestedsvm(v).ns_vmcb_hostcr3;
}

static int nsvm_vmcb_guest_intercepts_msr(unsigned long *msr_bitmap,
                                          uint32_t msr, bool_t write)
{
    bool_t enabled;
    unsigned long *msr_bit;

    msr_bit = svm_msrbit(msr_bitmap, msr);

    if ( msr_bit == NULL )
        /* MSR not in the permission map: Let the guest handle it. */
        return NESTEDHVM_VMEXIT_INJECT;

    msr &= 0x1fff;

    if ( write )
        /* write access */
        enabled = test_bit(msr * 2 + 1, msr_bit);
    else
        /* read access */
        enabled = test_bit(msr * 2, msr_bit);

    if ( !enabled )
        return NESTEDHVM_VMEXIT_HOST;

    return NESTEDHVM_VMEXIT_INJECT;
}

static int nsvm_vmcb_guest_intercepts_ioio(paddr_t iopm_pa, uint64_t exitinfo1)
{
    unsigned long gfn = iopm_pa >> PAGE_SHIFT;
    unsigned long *io_bitmap;
    ioio_info_t ioinfo;
    uint16_t port;
    unsigned int size;
    bool_t enabled;

    ioinfo.bytes = exitinfo1;
    port = ioinfo.fields.port;
    size = ioinfo.fields.sz32 ? 4 : ioinfo.fields.sz16 ? 2 : 1;

    switch ( port )
    {
    case 0 ... 8 * PAGE_SIZE - 1: /* first 4KB page */
        break;
    case 8 * PAGE_SIZE ... 2 * 8 * PAGE_SIZE - 1: /* second 4KB page */
        port -= 8 * PAGE_SIZE;
        ++gfn;
        break;
    default:
        BUG();
        break;
    }

    for ( io_bitmap = hvm_map_guest_frame_ro(gfn, 0); ; )
    {
        enabled = io_bitmap && test_bit(port, io_bitmap);
        if ( !enabled || !--size )
            break;
        if ( unlikely(++port == 8 * PAGE_SIZE) )
        {
            hvm_unmap_guest_frame(io_bitmap, 0);
            io_bitmap = hvm_map_guest_frame_ro(++gfn, 0);
            port -= 8 * PAGE_SIZE;
        }
    }

    hvm_unmap_guest_frame(io_bitmap, 0);

    if ( !enabled )
        return NESTEDHVM_VMEXIT_HOST;

    return NESTEDHVM_VMEXIT_INJECT;
}

static bool_t nsvm_vmcb_guest_intercepts_exitcode(struct vcpu *v,
                                                  struct cpu_user_regs *regs,
                                                  uint64_t exitcode)
{
    uint64_t exit_bits;
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
    struct nestedsvm *svm = &vcpu_nestedsvm(v);
    struct vmcb_struct *ns_vmcb = nv->nv_vvmcx;
    enum nestedhvm_vmexits vmexits;

    switch ( exitcode )
    {
    case VMEXIT_CR0_READ ... VMEXIT_CR15_READ:
    case VMEXIT_CR0_WRITE ... VMEXIT_CR15_WRITE:
        exit_bits = 1ULL << (exitcode - VMEXIT_CR0_READ);
        if ( svm->ns_cr_intercepts & exit_bits )
            break;
        return 0;

    case VMEXIT_DR0_READ ... VMEXIT_DR7_READ:
    case VMEXIT_DR0_WRITE ... VMEXIT_DR7_WRITE:
        exit_bits = 1ULL << (exitcode - VMEXIT_DR0_READ);
        if ( svm->ns_dr_intercepts & exit_bits )
            break;
        return 0;

    case VMEXIT_EXCEPTION_DE ... VMEXIT_EXCEPTION_XF:
        exit_bits = 1ULL << (exitcode - VMEXIT_EXCEPTION_DE);
        if ( svm->ns_exception_intercepts & exit_bits )
            break;
        return 0;

    case VMEXIT_INTR ... VMEXIT_SHUTDOWN:
        exit_bits = 1ULL << (exitcode - VMEXIT_INTR);
        if ( svm->ns_general1_intercepts & exit_bits )
            break;
        return 0;

    case VMEXIT_VMRUN ... VMEXIT_XSETBV:
        exit_bits = 1ULL << (exitcode - VMEXIT_VMRUN);
        if ( svm->ns_general2_intercepts & exit_bits )
            break;
        return 0;

    case VMEXIT_NPF:
        if ( nestedhvm_paging_mode_hap(v) )
            break;
        return 0;

    case VMEXIT_INVALID:
        /* Always intercepted */
        break;

    default:
        gdprintk(XENLOG_ERR, "Illegal exitcode %#"PRIx64"\n", exitcode);
        BUG();
        break;
    }

    /* Special cases: Do more detailed checks */
    switch ( exitcode )
    {
    case VMEXIT_MSR:
        ASSERT(regs != NULL);
        if ( !nestedsvm_vmcb_map(v, nv->nv_vvmcxaddr) )
            break;
        ns_vmcb = nv->nv_vvmcx;
        vmexits = nsvm_vmcb_guest_intercepts_msr(svm->ns_cached_msrpm,
                                                 regs->ecx,
                                                 ns_vmcb->exitinfo1 != 0);
        if ( vmexits == NESTEDHVM_VMEXIT_HOST )
            return 0;
        break;

    case VMEXIT_IOIO:
        if ( !nestedsvm_vmcb_map(v, nv->nv_vvmcxaddr) )
            break;
        ns_vmcb = nv->nv_vvmcx;
        vmexits = nsvm_vmcb_guest_intercepts_ioio(ns_vmcb->_iopm_base_pa,
                                                  ns_vmcb->exitinfo1);
        if ( vmexits == NESTEDHVM_VMEXIT_HOST )
            return 0;
        break;
    }

    return 1;
}

bool cf_check nsvm_vmcb_guest_intercepts_event(
    struct vcpu *v, unsigned int vector, int errcode)
{
    return nsvm_vmcb_guest_intercepts_exitcode(v, guest_cpu_user_regs(),
                                               VMEXIT_EXCEPTION_DE + vector);
}

static int nsvm_vmcb_prepare4vmexit(struct vcpu *v, struct cpu_user_regs *regs)
{
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
    struct nestedsvm *svm = &vcpu_nestedsvm(v);
    struct vmcb_struct *ns_vmcb = nv->nv_vvmcx;
    struct vmcb_struct *n2vmcb = nv->nv_n2vmcx;

    svm_vmsave_pa(nv->nv_n1vmcx_pa);

    /*
     * Cache the guest physical address of the virtual vmcb
     * for VMCB Cleanbit emulation.
     */
    svm->ns_ovvmcb_pa = nv->nv_vvmcxaddr;

    /* Intercepts - keep them as they are */

    /* Pausefilter - keep it as is */

    /* Nested IO permission bitmap */
    /*
     * Just keep the iopm_base_pa and msrpm_base_pa values.
     * The guest must not see the virtualized values.
     */

    /* TSC offset */
    /* Keep it. It's maintained by the l1 guest. */

    /* ASID */
    /* ns_vmcb->_guest_asid = n2vmcb->_guest_asid; */

    /* TLB control */
    ns_vmcb->tlb_control = 0;

    /* Virtual Interrupts */
    ns_vmcb->_vintr = n2vmcb->_vintr;
    if ( !svm->ns_hostflags.fields.vintrmask )
        ns_vmcb->_vintr.fields.intr_masking = 0;

    /* Interrupt state */
    ns_vmcb->int_stat = n2vmcb->int_stat;

    /* Exit codes */
    ns_vmcb->exitcode = n2vmcb->exitcode;
    ns_vmcb->exitinfo1 = n2vmcb->exitinfo1;
    ns_vmcb->exitinfo2 = n2vmcb->exitinfo2;
    ns_vmcb->exit_int_info = n2vmcb->exit_int_info;

    /* Interrupts */
    /*
     * If we emulate a VMRUN/#VMEXIT in the same host #VMEXIT cycle we have
     * to make sure that we do not lose injected events. So check eventinj
     * here and copy it to exitintinfo if it is valid.
     * exitintinfo and eventinj can't both be valid because the case below
     * only happens on a VMRUN instruction intercept which has no valid
     * exitintinfo set.
     */
    if ( unlikely(n2vmcb->event_inj.v) &&
         hvm_event_needs_reinjection(n2vmcb->event_inj.type,
                                     n2vmcb->event_inj.vector) )
        ns_vmcb->exit_int_info = n2vmcb->event_inj;
    ns_vmcb->event_inj.raw = 0;

    /* Nested paging mode */
    if ( nestedhvm_paging_mode_hap(v) )
    {
        /* host nested paging + guest nested paging. */
        ns_vmcb->_np_enable = n2vmcb->_np_enable;
        ns_vmcb->_cr3 = n2vmcb->_cr3;
        /*
         * The vmcb->h_cr3 is the shadowed h_cr3. The original
         * unshadowed guest h_cr3 is kept in ns_vmcb->h_cr3,
         * hence we keep the ns_vmcb->h_cr3 value.
         */
    }
    else if ( paging_mode_hap(v->domain) )
    {
        /* host nested paging + guest shadow paging. */
        ns_vmcb->_np_enable = 0;
        /*
         * Throw h_cr3 away. The guest is not allowed to set it,
         * or it could break out (security hole!).
         */
        ns_vmcb->_h_cr3 = 0x0;
        /*
         * Stop intercepting #PF (already done above
         * by restoring cached intercepts).
         */
        ns_vmcb->_cr3 = n2vmcb->_cr3;
    }
    else
    {
        /* host shadow paging + guest shadow paging. */
        ns_vmcb->_np_enable = 0;
        ns_vmcb->_h_cr3 = 0x0;
        /*
         * The vmcb->_cr3 is the shadowed cr3. The original
         * unshadowed guest cr3 is kept in ns_vmcb->_cr3,
         * hence we keep the ns_vmcb->_cr3 value.
         */
    }

    /* LBR virtualization - keep lbr control as is */

    /* NextRIP */
    ns_vmcb->nextrip = n2vmcb->nextrip;

    /* Decode Assist */
    ns_vmcb->guest_ins_len = n2vmcb->guest_ins_len;
    memcpy(ns_vmcb->guest_ins, n2vmcb->guest_ins,
           sizeof(ns_vmcb->guest_ins));

    /*
     * VMCB Save State Area
     */

    /* Segments */
    ns_vmcb->es = n2vmcb->es;
    ns_vmcb->cs = n2vmcb->cs;
    ns_vmcb->ss = n2vmcb->ss;
    ns_vmcb->ds = n2vmcb->ds;
    ns_vmcb->gdtr = n2vmcb->gdtr;
    ns_vmcb->idtr = n2vmcb->idtr;

    /* CPL */
    ns_vmcb->_cpl = n2vmcb->_cpl;

    /* EFER */
    ns_vmcb->_efer = n2vmcb->_efer;

    /* CRn */
    ns_vmcb->_cr4 = n2vmcb->_cr4;
    ns_vmcb->_cr0 = n2vmcb->_cr0;

    /* DRn */
    ns_vmcb->_dr7 = n2vmcb->_dr7;
    ns_vmcb->_dr6 = n2vmcb->_dr6;

    /*
     * Restore registers from regs as those values
     * can be newer than in n2vmcb (e.g. due to an
     * instruction emulation right before).
     */

    /* RFLAGS */
    ns_vmcb->rflags = n2vmcb->rflags = regs->rflags;

    /* RIP */
    ns_vmcb->rip = n2vmcb->rip = regs->rip;

    /* RSP */
    ns_vmcb->rsp = n2vmcb->rsp = regs->rsp;

    /* RAX */
    ns_vmcb->rax = n2vmcb->rax = regs->rax;

    /*
     * Keep the l2 guest values of the fs, gs, ldtr, tr, kerngsbase,
     * star, lstar, cstar, sfmask, sysenter_cs, sysenter_esp,
     * sysenter_eip. These are handled via VMSAVE/VMLOAD emulation.
     */

    /* CR2 */
    ns_vmcb->_cr2 = n2vmcb->_cr2;

    /* PAT */
    ns_vmcb->_g_pat = n2vmcb->_g_pat;

    /* Debug Control MSR */
    ns_vmcb->_debugctlmsr = n2vmcb->_debugctlmsr;

    /* LBR MSRs */
    ns_vmcb->_lastbranchfromip = n2vmcb->_lastbranchfromip;
    ns_vmcb->_lastbranchtoip = n2vmcb->_lastbranchtoip;
    ns_vmcb->_lastintfromip = n2vmcb->_lastintfromip;
    ns_vmcb->_lastinttoip = n2vmcb->_lastinttoip;

    return 0;
}

bool cf_check nsvm_vmcb_hap_enabled(struct vcpu *v)
{
    return vcpu_nestedsvm(v).ns_hap_enabled;
}

/*
 * This function uses L2_gpa to walk the P2M page table in L1. If the
 * walk is successful, the translated value is returned in
 * L1_gpa. The result value tells what to do next.
 */
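/*
 * Context for the walk below: with nested paging on both levels, an l2 guest
 * physical address goes through two translations. First it is translated by
 * the l1 guest's NPT (rooted at the h_cr3 value the l1 guest put into its
 * virtual VMCB, exposed via nsvm_vcpu_hostcr3() above), yielding an l1 guest
 * physical address; that in turn is translated by the host p2m. This helper
 * only performs the first step; the caller (in the common nested-HAP code,
 * not in this file) handles the second.
 */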
int cf_check nsvm_hap_walk_L1_p2m(
    struct vcpu *v, paddr_t L2_gpa, paddr_t *L1_gpa, unsigned int *page_order,
    uint8_t *p2m_acc, struct npfec npfec)
{
    uint32_t pfec;
    unsigned long nested_cr3, gfn;

    nested_cr3 = nhvm_vcpu_p2m_base(v);

    pfec = PFEC_user_mode | PFEC_page_present;
    if ( npfec.write_access )
        pfec |= PFEC_write_access;
    if ( npfec.insn_fetch )
        pfec |= PFEC_insn_fetch;

    /* Walk the guest-supplied NPT table, just as if it were a pagetable */
    gfn = paging_ga_to_gfn_cr3(v, nested_cr3, L2_gpa, &pfec, page_order);
    if ( gfn == gfn_x(INVALID_GFN) )
        return NESTEDHVM_PAGEFAULT_INJECT;

    *L1_gpa = (gfn << PAGE_SHIFT) + (L2_gpa & ~PAGE_MASK);
    return NESTEDHVM_PAGEFAULT_DONE;
}

enum hvm_intblk cf_check nsvm_intr_blocked(struct vcpu *v)
{
    struct nestedsvm *svm = &vcpu_nestedsvm(v);
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);

    ASSERT(nestedhvm_enabled(v->domain));

    if ( !nestedsvm_gif_isset(v) )
        return hvm_intblk_svm_gif;

    if ( nestedhvm_vcpu_in_guestmode(v) )
    {
        struct vmcb_struct *n2vmcb = nv->nv_n2vmcx;

        if ( svm->ns_hostflags.fields.vintrmask &&
             !svm->ns_hostflags.fields.rflagsif )
            return hvm_intblk_rflags_ie;

        /*
         * When the l1 guest passes its devices through to the l2 guest
         * and the l2 guest does an MMIO access then we may want to
         * inject a VMEXIT(#INTR) exitcode into the l1 guest.
         * Delay the injection because this would result in delivering
         * an interrupt *within* the execution of an instruction.
         */
        if ( v->io.req.state != STATE_IOREQ_NONE )
            return hvm_intblk_shadow;

        if ( !nv->nv_vmexit_pending && n2vmcb->exit_int_info.v )
        {
            /*
             * Give the l2 guest a chance to finish the delivery of
             * the last injected interrupt or exception before we
             * emulate a VMEXIT (e.g. VMEXIT(INTR) ).
             */
            return hvm_intblk_shadow;
        }
    }

    if ( nv->nv_vmexit_pending )
        /*
         * hvm_inject_hw_exception() must have run before.
         * Exceptions have higher priority than interrupts.
         */
        return hvm_intblk_rflags_ie;

    return hvm_intblk_none;
}

/* VMEXIT emulation */
void nestedsvm_vmexit_defer(struct vcpu *v,
                            uint64_t exitcode,
                            uint64_t exitinfo1, uint64_t exitinfo2)
{
    struct nestedsvm *svm = &vcpu_nestedsvm(v);
    struct vmcb_struct *vmcb = v->arch.hvm.svm.vmcb;

    if ( vmcb->_vintr.fields.vgif_enable )
        vmcb->_vintr.fields.vgif = 0;
    else
        nestedsvm_vcpu_clgi(v);

    svm->ns_vmexit.exitcode = exitcode;
    svm->ns_vmexit.exitinfo1 = exitinfo1;
    svm->ns_vmexit.exitinfo2 = exitinfo2;
    vcpu_nestedhvm(v).nv_vmexit_pending = 1;
}

enum nestedhvm_vmexits
nestedsvm_check_intercepts(struct vcpu *v, struct cpu_user_regs *regs,
                           uint64_t exitcode)
{
    bool_t is_intercepted;

    ASSERT(vcpu_nestedhvm(v).nv_vmexit_pending == 0);
    is_intercepted = nsvm_vmcb_guest_intercepts_exitcode(v, regs, exitcode);

    switch ( exitcode )
    {
    case VMEXIT_INVALID:
        if ( is_intercepted )
            return NESTEDHVM_VMEXIT_INJECT;
        return NESTEDHVM_VMEXIT_HOST;

    case VMEXIT_INTR:
    case VMEXIT_NMI:
        return NESTEDHVM_VMEXIT_HOST;

    case VMEXIT_EXCEPTION_NM:
        /*
         * Host must handle lazy fpu context switching first.
         * Then inject the VMEXIT if the L1 guest intercepts this.
         */
        return NESTEDHVM_VMEXIT_HOST;

    case VMEXIT_NPF:
        if ( nestedhvm_paging_mode_hap(v) )
        {
            if ( !is_intercepted )
                return NESTEDHVM_VMEXIT_FATALERROR;
            /* host nested paging + guest nested paging */
            return NESTEDHVM_VMEXIT_HOST;
        }
        if ( paging_mode_hap(v->domain) )
        {
            if ( is_intercepted )
                return NESTEDHVM_VMEXIT_FATALERROR;
            /* host nested paging + guest shadow paging */
            return NESTEDHVM_VMEXIT_HOST;
        }

        /* host shadow paging + guest shadow paging */
        /* Can this happen?
         */
        BUG();

    case VMEXIT_EXCEPTION_PF:
        if ( nestedhvm_paging_mode_hap(v) )
        {
            /* host nested paging + guest nested paging */
            if ( !is_intercepted )
                /* l1 guest intercepts #PF unnecessarily */
                return NESTEDHVM_VMEXIT_HOST;
            /* l2 guest intercepts #PF unnecessarily */
            return NESTEDHVM_VMEXIT_INJECT;
        }
        if ( !paging_mode_hap(v->domain) )
            /* host shadow paging + guest shadow paging */
            return NESTEDHVM_VMEXIT_HOST;
        /* host nested paging + guest shadow paging */
        return NESTEDHVM_VMEXIT_INJECT;

    case VMEXIT_VMMCALL:
        /* Always let the guest handle VMMCALL/VMCALL */
        return NESTEDHVM_VMEXIT_INJECT;

    default:
        gprintk(XENLOG_ERR, "Unexpected nested vmexit: reason %#"PRIx64"\n",
                exitcode);
        break;
    }

    if ( is_intercepted )
        return NESTEDHVM_VMEXIT_INJECT;
    return NESTEDHVM_VMEXIT_HOST;
}

enum nestedhvm_vmexits
nestedsvm_vmexit_n2n1(struct vcpu *v, struct cpu_user_regs *regs)
{
    int rc;
    enum nestedhvm_vmexits ret = NESTEDHVM_VMEXIT_DONE;

    ASSERT(vcpu_nestedhvm(v).nv_vmswitch_in_progress);
    ASSERT(nestedhvm_vcpu_in_guestmode(v));

    rc = nsvm_vmcb_prepare4vmexit(v, regs);
    if ( rc )
        ret = NESTEDHVM_VMEXIT_ERROR;

    rc = nsvm_vcpu_hostrestore(v, regs);
    if ( rc )
        ret = NESTEDHVM_VMEXIT_FATALERROR;

    nestedhvm_vcpu_exit_guestmode(v);
    return ret;
}

/*
 * The exitcode is in native SVM/VMX format. The forced exitcode
 * is in generic format.
 */
static enum nestedhvm_vmexits
nestedsvm_vcpu_vmexit(struct vcpu *v, struct cpu_user_regs *regs,
                      uint64_t exitcode)
{
    int rc;
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);

    nv->nv_vmswitch_in_progress = 1;

    ASSERT(nv->nv_vvmcx != NULL);

    /*
     * On special intercepts which the host has to handle,
     * the vcpu is still in guest mode here.
     */
    if ( nestedhvm_vcpu_in_guestmode(v) )
    {
        enum nestedhvm_vmexits ret;

        ret = nestedsvm_vmexit_n2n1(v, regs);
        switch ( ret )
        {
        case NESTEDHVM_VMEXIT_FATALERROR:
            gdprintk(XENLOG_ERR, "VMEXIT: fatal error\n");
            return ret;
        case NESTEDHVM_VMEXIT_HOST:
            BUG();
        case NESTEDHVM_VMEXIT_ERROR:
            exitcode = VMEXIT_INVALID;
            break;
        default:
            ASSERT(!nestedhvm_vcpu_in_guestmode(v));
            break;
        }

        /* host state has been restored */
    }

    ASSERT(!nestedhvm_vcpu_in_guestmode(v));

    /*
     * Prepare for running the l1 guest. Make the actual
     * modifications to the virtual VMCB/VMCS.
     */
    rc = nsvm_vcpu_vmexit_inject(v, regs, exitcode);

    /* If l1 guest uses shadow paging, update the paging mode. */
    if ( !nestedhvm_paging_mode_hap(v) )
        paging_update_paging_modes(v);

    nv->nv_vmswitch_in_progress = 0;

    if ( rc )
        return NESTEDHVM_VMEXIT_FATALERROR;

    return NESTEDHVM_VMEXIT_DONE;
}

/* VCPU switch */
void nsvm_vcpu_switch(void)
{
    struct cpu_user_regs *regs = guest_cpu_user_regs();
    struct vcpu *v = current;
    struct nestedvcpu *nv;
    struct nestedsvm *svm;

    if ( !nestedhvm_enabled(v->domain) )
        return;

    nv = &vcpu_nestedhvm(v);
    svm = &vcpu_nestedsvm(v);
    ASSERT(v->arch.hvm.svm.vmcb != NULL);
    ASSERT(nv->nv_n1vmcx != NULL);
    ASSERT(nv->nv_n2vmcx != NULL);
    ASSERT(nv->nv_n1vmcx_pa != INVALID_PADDR);
    ASSERT(nv->nv_n2vmcx_pa != INVALID_PADDR);

    if ( nv->nv_vmexit_pending )
    {
 vmexit:
        nestedsvm_vcpu_vmexit(v, regs, svm->ns_vmexit.exitcode);
        nv->nv_vmexit_pending = 0;
        nv->nv_vmentry_pending = 0;
        return;
    }

    if ( nv->nv_vmentry_pending )
    {
        int ret;

        ASSERT(!nv->nv_vmexit_pending);
        ret = nsvm_vcpu_vmrun(v, regs);
        if ( ret )
            goto vmexit;

        ASSERT(nestedhvm_vcpu_in_guestmode(v));
        nv->nv_vmentry_pending = 0;
    }

    if ( nestedhvm_vcpu_in_guestmode(v) && nestedhvm_paging_mode_hap(v) )
    {
        /*
         * In case we left the l2 guest due to a physical interrupt (e.g. IPI)
         * that is not for the l1 guest, we continue running the l2 guest
         * but check if the nestedp2m is still valid.
         */
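        /*
         * nv_p2m is the nested p2m currently associated with this vcpu; it
         * is (re)assigned by p2m_get_nestedp2m() and can be torn away again,
         * e.g. when the shared nested p2m pool is flushed or an np2m table
         * is recycled for a different np2m base, in which case it is set
         * back to NULL. Re-establish the association here so the l2 guest
         * resumes with a valid h_cr3.
         */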
        if ( nv->nv_p2m == NULL )
            nestedsvm_vmcb_set_nestedp2m(v, nv->nv_vvmcx, nv->nv_n2vmcx);
    }
}

/* Interrupts, Virtual GIF */
int nestedsvm_vcpu_interrupt(struct vcpu *v, const struct hvm_intack intack)
{
    int ret;
    enum hvm_intblk intr;
    uint64_t exitcode = VMEXIT_INTR;
    uint64_t exitinfo2 = 0;

    ASSERT(nestedhvm_vcpu_in_guestmode(v));

    intr = nhvm_interrupt_blocked(v);
    if ( intr != hvm_intblk_none )
        return NSVM_INTR_MASKED;

    switch ( intack.source )
    {
    case hvm_intsrc_pic:
    case hvm_intsrc_lapic:
    case hvm_intsrc_vector:
        exitcode = VMEXIT_INTR;
        exitinfo2 = intack.vector;
        break;
    case hvm_intsrc_nmi:
        exitcode = VMEXIT_NMI;
        exitinfo2 = intack.vector;
        break;
    case hvm_intsrc_mce:
        exitcode = VMEXIT_EXCEPTION_MC;
        exitinfo2 = intack.vector;
        break;
    case hvm_intsrc_none:
        return NSVM_INTR_NOTHANDLED;
    default:
        BUG();
    }

    ret = nsvm_vmcb_guest_intercepts_exitcode(v,
                                              guest_cpu_user_regs(),
                                              exitcode);
    if ( ret )
    {
        nestedsvm_vmexit_defer(v, exitcode, intack.source, exitinfo2);
        return NSVM_INTR_FORCEVMEXIT;
    }

    return NSVM_INTR_NOTINTERCEPTED;
}

bool_t nestedsvm_gif_isset(struct vcpu *v)
{
    struct nestedsvm *svm = &vcpu_nestedsvm(v);
    struct vmcb_struct *vmcb = v->arch.hvm.svm.vmcb;

    /* get the vmcb gif value if using vgif */
    if ( vmcb->_vintr.fields.vgif_enable )
        return vmcb->_vintr.fields.vgif;
    else
        return svm->ns_gif;
}

void svm_vmexit_do_stgi(struct cpu_user_regs *regs, struct vcpu *v)
{
    unsigned int inst_len;

    /*
     * STGI doesn't require SVME to be set to be used. See AMD APM vol
     * 2 section 15.4 for details.
     */
    if ( !nestedhvm_enabled(v->domain) )
    {
        hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
        return;
    }

    if ( (inst_len = svm_get_insn_len(v, INSTR_STGI)) == 0 )
        return;

    nestedsvm_vcpu_stgi(v);

    __update_guest_eip(regs, inst_len);
}

void svm_vmexit_do_clgi(struct cpu_user_regs *regs, struct vcpu *v)
{
    struct vmcb_struct *vmcb = v->arch.hvm.svm.vmcb;
    unsigned int inst_len;
    uint32_t general1_intercepts = vmcb_get_general1_intercepts(vmcb);
    vintr_t intr;

    if ( !nsvm_efer_svm_enabled(v) )
    {
        hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
        return;
    }

    if ( (inst_len = svm_get_insn_len(v, INSTR_CLGI)) == 0 )
        return;

    nestedsvm_vcpu_clgi(v);

    /* After a CLGI no interrupts should come */
    intr = vmcb_get_vintr(vmcb);
    intr.fields.irq = 0;
    general1_intercepts &= ~GENERAL1_INTERCEPT_VINTR;
    vmcb_set_vintr(vmcb, intr);
    vmcb_set_general1_intercepts(vmcb, general1_intercepts);

    __update_guest_eip(regs, inst_len);
}

/*
 * This runs on EFER change to see if nested features need to either be
 * turned off or on.
 */
void svm_nested_features_on_efer_update(struct vcpu *v)
{
    struct vmcb_struct *vmcb = v->arch.hvm.svm.vmcb;
    struct nestedsvm *svm = &vcpu_nestedsvm(v);
    u32 general2_intercepts;
    vintr_t vintr;

    /*
     * Need state for transferring the nested gif status so only write on
     * the hvm_vcpu EFER.SVME changing.
     */
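    /*
     * Two hardware assists are toggled below when the guest flips EFER.SVME
     * (and the host supports them): virtualized VMLOAD/VMSAVE and virtual
     * GIF. While an assist is active the corresponding instruction
     * intercepts (VMLOAD/VMSAVE, STGI/CLGI) are dropped, since the hardware
     * handles those instructions on the guest's behalf; when SVME is cleared
     * again the intercepts are restored and the GIF state is transferred
     * back into ns_gif.
     */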
    if ( v->arch.hvm.guest_efer & EFER_SVME )
    {
        if ( !vmcb->virt_ext.fields.vloadsave_enable &&
             paging_mode_hap(v->domain) &&
             cpu_has_svm_vloadsave )
        {
            vmcb->virt_ext.fields.vloadsave_enable = 1;
            general2_intercepts  = vmcb_get_general2_intercepts(vmcb);
            general2_intercepts &= ~(GENERAL2_INTERCEPT_VMLOAD |
                                     GENERAL2_INTERCEPT_VMSAVE);
            vmcb_set_general2_intercepts(vmcb, general2_intercepts);
        }

        if ( !vmcb->_vintr.fields.vgif_enable &&
             cpu_has_svm_vgif )
        {
            vintr = vmcb_get_vintr(vmcb);
            vintr.fields.vgif = svm->ns_gif;
            vintr.fields.vgif_enable = 1;
            vmcb_set_vintr(vmcb, vintr);

            general2_intercepts  = vmcb_get_general2_intercepts(vmcb);
            general2_intercepts &= ~(GENERAL2_INTERCEPT_STGI |
                                     GENERAL2_INTERCEPT_CLGI);
            vmcb_set_general2_intercepts(vmcb, general2_intercepts);
        }
    }
    else
    {
        if ( vmcb->virt_ext.fields.vloadsave_enable )
        {
            vmcb->virt_ext.fields.vloadsave_enable = 0;
            general2_intercepts  = vmcb_get_general2_intercepts(vmcb);
            general2_intercepts |= (GENERAL2_INTERCEPT_VMLOAD |
                                    GENERAL2_INTERCEPT_VMSAVE);
            vmcb_set_general2_intercepts(vmcb, general2_intercepts);
        }

        if ( vmcb->_vintr.fields.vgif_enable )
        {
            vintr = vmcb_get_vintr(vmcb);
            svm->ns_gif = vintr.fields.vgif;
            vintr.fields.vgif_enable = 0;
            vmcb_set_vintr(vmcb, vintr);

            general2_intercepts  = vmcb_get_general2_intercepts(vmcb);
            general2_intercepts |= (GENERAL2_INTERCEPT_STGI |
                                    GENERAL2_INTERCEPT_CLGI);
            vmcb_set_general2_intercepts(vmcb, general2_intercepts);
        }
    }
}