/* SPDX-License-Identifier: GPL-2.0-or-later */ /****************************************************************************** * arch/x86/mm/shadow/multi.c * * Simple, mostly-synchronous shadow page tables. * Parts of this code are Copyright (c) 2006 by XenSource Inc. * Parts of this code are Copyright (c) 2006 by Michael A Fetterman * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "private.h" #include "types.h" /* THINGS TO DO LATER: * * TEARDOWN HEURISTICS * Also: have a heuristic for when to destroy a previous paging-mode's * shadows. When a guest is done with its start-of-day 32-bit tables * and reuses the memory we want to drop those shadows. Start with * shadows in a page in two modes as a hint, but beware of clever tricks * like reusing a pagetable for both PAE and 64-bit during boot... * * PAE LINEAR MAPS * Rework shadow_get_l*e() to have the option of using map_domain_page() * instead of linear maps. Add appropriate unmap_l*e calls in the users. * Then we can test the speed difference made by linear maps. If the * map_domain_page() version is OK on PAE, we could maybe allow a lightweight * l3-and-l2h-only shadow mode for PAE PV guests that would allow them * to share l2h pages again. * * PSE disabled / PSE36 * We don't support any modes other than PSE enabled, PSE36 disabled. * Neither of those would be hard to change, but we'd need to be able to * deal with shadows made in one mode and used in another. */ #define FETCH_TYPE_PREFETCH 1 #define FETCH_TYPE_DEMAND 2 #define FETCH_TYPE_WRITE 4 typedef enum { ft_prefetch = FETCH_TYPE_PREFETCH, ft_demand_read = FETCH_TYPE_DEMAND, ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE, } fetch_type_t; extern const char *const fetch_type_names[]; #if SHADOW_DEBUG_PROPAGATE && CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS const char *const fetch_type_names[] = { [ft_prefetch] = "prefetch", [ft_demand_read] = "demand read", [ft_demand_write] = "demand write", }; #endif #if SHADOW_PAGING_LEVELS == 3 # define for_each_shadow_table(v, i) \ for ( (i) = 0; \ (i) < ARRAY_SIZE((v)->arch.paging.shadow.shadow_table); \ ++(i) ) #else # define for_each_shadow_table(v, i) for ( (i) = 0; (i) < 1; ++(i) ) #endif /* Helper to perform a local TLB flush. */ static void sh_flush_local(const struct domain *d) { flush_local(guest_flush_tlb_flags(d)); } #if GUEST_PAGING_LEVELS >= 4 && defined(CONFIG_PV32) #define ASSERT_VALID_L2(t) \ ASSERT((t) == SH_type_l2_shadow || (t) == SH_type_l2h_shadow) #else #define ASSERT_VALID_L2(t) ASSERT((t) == SH_type_l2_shadow) #endif /**************************************************************************/ /* Hash table mapping from guest pagetables to shadows * * normal case: see private.h. * FL1's: maps the *gfn* of the start of a superpage to the mfn of a * shadow L1 which maps its "splinters". 
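 * (A superpage has no guest L1 page to use as a hash key, so its starting
 *  gfn is used instead; the resulting "fl1" shadow holds one 4k entry for
 *  each frame covered by the superpage.)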
*/ static inline mfn_t get_fl1_shadow_status(struct domain *d, gfn_t gfn) /* Look for FL1 shadows in the hash table */ { mfn_t smfn = shadow_hash_lookup(d, gfn_x(gfn), SH_type_fl1_shadow); ASSERT(mfn_eq(smfn, INVALID_MFN) || mfn_to_page(smfn)->u.sh.head); return smfn; } static inline void set_fl1_shadow_status(struct domain *d, gfn_t gfn, mfn_t smfn) /* Put an FL1 shadow into the hash table */ { SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%"PRI_mfn"\n", gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn)); ASSERT(mfn_to_page(smfn)->u.sh.head); shadow_hash_insert(d, gfn_x(gfn), SH_type_fl1_shadow, smfn); } static inline void delete_fl1_shadow_status(struct domain *d, gfn_t gfn, mfn_t smfn) /* Remove a shadow from the hash table */ { SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%"PRI_mfn"\n", gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn)); ASSERT(mfn_to_page(smfn)->u.sh.head); if ( !shadow_hash_delete(d, gfn_x(gfn), SH_type_fl1_shadow, smfn) ) { printk(XENLOG_G_ERR "%pd: %"PRI_gfn":FL1 hash entry not found for %"PRI_mfn"\n", d, gfn_x(gfn), mfn_x(smfn)); domain_crash(d); } } /**************************************************************************/ /* Functions for walking the guest page tables */ static inline bool sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw, uint32_t pfec) { gfn_t root_gfn = _gfn(paging_mode_external(v->domain) ? cr3_pa(v->arch.hvm.guest_cr[3]) >> PAGE_SHIFT : pagetable_get_pfn(v->arch.guest_table)); #if GUEST_PAGING_LEVELS != 3 /* 32 or 64 */ const struct domain *d = v->domain; mfn_t root_mfn = (v->arch.flags & TF_kernel_mode ? pagetable_get_mfn(v->arch.guest_table) : pagetable_get_mfn(v->arch.guest_table_user)); void *root_map = map_domain_page(root_mfn); bool ok = guest_walk_tables(v, p2m_get_hostp2m(d), va, gw, pfec, root_gfn, root_mfn, root_map); unmap_domain_page(root_map); return ok; #elif !defined(CONFIG_HVM) ASSERT_UNREACHABLE(); (void)root_gfn; memset(gw, 0, sizeof(*gw)); return false; #else /* PAE */ return guest_walk_tables(v, p2m_get_hostp2m(v->domain), va, gw, pfec, root_gfn, INVALID_MFN, v->arch.paging.shadow.gl3e); #endif } /* This validation is called with lock held, and after write permission * removal. Then check is atomic and no more inconsistent content can * be observed before lock is released * * Return 1 to indicate success and 0 for inconsistency */ static inline uint32_t shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw, int version) { struct domain *d = v->domain; guest_l1e_t *l1p; guest_l2e_t *l2p; #if GUEST_PAGING_LEVELS >= 4 guest_l3e_t *l3p; guest_l4e_t *l4p; #endif int mismatch = 0; ASSERT(paging_locked_by_me(d)); /* No need for smp_rmb() here; taking the paging lock was enough. */ if ( version == atomic_read(&d->arch.paging.shadow.gtable_dirty_version) ) return 1; /* We may consider caching guest page mapping from last * guest table walk. However considering this check happens * relatively less-frequent, and a bit burden here to * remap guest page is better than caching mapping in each * guest table walk. * * Also when inconsistency occurs, simply return to trigger * another fault instead of re-validate new path to make * logic simple. */ perfc_incr(shadow_check_gwalk); #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... 
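 * Re-map each level the cached walk went through and compare the live
 * guest entries against the ones recorded in the walk_t; any difference
 * means the guest tables changed after the walk, so the caller must
 * retry rather than act on stale data.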
*/ l4p = map_domain_page(gw->l4mfn); mismatch |= (gw->l4e.l4 != l4p[guest_l4_table_offset(va)].l4); unmap_domain_page(l4p); l3p = map_domain_page(gw->l3mfn); mismatch |= (gw->l3e.l3 != l3p[guest_l3_table_offset(va)].l3); unmap_domain_page(l3p); #elif defined(CONFIG_HVM) mismatch |= (gw->l3e.l3 != v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)].l3); #endif #endif l2p = map_domain_page(gw->l2mfn); mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2); unmap_domain_page(l2p); if ( !(guest_can_use_l2_superpages(v) && (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) ) { l1p = map_domain_page(gw->l1mfn); mismatch |= (gw->l1e.l1 != l1p[guest_l1_table_offset(va)].l1); unmap_domain_page(l1p); } return !mismatch; } #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) static int shadow_check_gl1e(struct vcpu *v, walk_t *gw) { guest_l1e_t *l1p, nl1e; if ( !mfn_valid(gw->l1mfn) ) return 0; /* Can't just pull-through because mfn may have changed */ l1p = map_domain_page(gw->l1mfn); nl1e.l1 = l1p[guest_l1_table_offset(gw->va)].l1; unmap_domain_page(l1p); return gw->l1e.l1 != nl1e.l1; } #endif /* Remove write access permissions from a gwalk_t in a batch, and * return OR-ed result for TLB flush hint and need to rewalk the guest * pages. * * Syncing pages will remove write access to that page; but it may * also give write access to other pages in the path. If we resync any * pages, re-walk from the beginning. */ #define GW_RMWR_FLUSHTLB 1 #define GW_RMWR_REWALK 2 static inline uint32_t gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw) { struct domain *d = v->domain; uint32_t rc = 0; #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) if ( mfn_is_out_of_sync(gw->l3mfn) ) { sh_resync(d, gw->l3mfn); rc = GW_RMWR_REWALK; } else #endif /* OOS */ if ( sh_remove_write_access(d, gw->l3mfn, 3, va) ) rc = GW_RMWR_FLUSHTLB; #endif /* GUEST_PAGING_LEVELS >= 4 */ #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) if ( mfn_is_out_of_sync(gw->l2mfn) ) { sh_resync(d, gw->l2mfn); rc |= GW_RMWR_REWALK; } else #endif /* OOS */ if ( sh_remove_write_access(d, gw->l2mfn, 2, va) ) rc |= GW_RMWR_FLUSHTLB; #endif /* GUEST_PAGING_LEVELS >= 3 */ if ( !(guest_can_use_l2_superpages(v) && (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) && !mfn_is_out_of_sync(gw->l1mfn) #endif /* OOS */ && sh_remove_write_access(d, gw->l1mfn, 1, va) ) rc |= GW_RMWR_FLUSHTLB; return rc; } /* Lightweight audit: pass all the shadows associated with this guest walk * through the audit mechanisms */ static void sh_audit_gw(struct vcpu *v, const walk_t *gw) { #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES struct domain *d = v->domain; mfn_t smfn; if ( !(SHADOW_AUDIT_ENABLE) ) return; #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ if ( mfn_valid(gw->l4mfn) && mfn_valid((smfn = get_shadow_status(d, gw->l4mfn, SH_type_l4_shadow))) ) sh_audit_l4_table(d, smfn, INVALID_MFN); if ( mfn_valid(gw->l3mfn) && mfn_valid((smfn = get_shadow_status(d, gw->l3mfn, SH_type_l3_shadow))) ) sh_audit_l3_table(d, smfn, INVALID_MFN); #endif /* PAE or 64... 
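 * The l2, l1 and (for superpages) fl1 audits below apply to every guest
 * paging mode, so they are performed unconditionally.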
*/ if ( mfn_valid(gw->l2mfn) ) { if ( mfn_valid((smfn = get_shadow_status(d, gw->l2mfn, SH_type_l2_shadow))) ) sh_audit_l2_table(d, smfn, INVALID_MFN); #if GUEST_PAGING_LEVELS >= 4 && defined(CONFIG_PV32) if ( mfn_valid((smfn = get_shadow_status(d, gw->l2mfn, SH_type_l2h_shadow))) ) sh_audit_l2_table(d, smfn, INVALID_MFN); #endif } if ( mfn_valid(gw->l1mfn) && mfn_valid((smfn = get_shadow_status(d, gw->l1mfn, SH_type_l1_shadow))) ) sh_audit_l1_table(d, smfn, INVALID_MFN); else if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PRESENT) && (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE) && mfn_valid( (smfn = get_fl1_shadow_status(d, guest_l2e_get_gfn(gw->l2e)))) ) sh_audit_fl1_table(d, smfn, INVALID_MFN); #endif /* SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES */ } /**************************************************************************/ /* Functions to compute the correct index into a shadow page, given an * index into the guest page (as returned by guest_get_index()). * This is trivial when the shadow and guest use the same sized PTEs, but * gets more interesting when those sizes are mismatched (e.g. 32-bit guest, * PAE- or 64-bit shadows). * * These functions also increment the shadow mfn, when necessary. When PTE * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1 * page. In this case, we allocate 2 contiguous pages for the shadow L1, and * use simple pointer arithmetic on a pointer to the guest L1e to figure out * which shadow page we really want. Similarly, when PTE sizes are * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address * space.) */ #if GUEST_PAGING_LEVELS == 2 /* From one page of a multi-page shadow, find the next one */ static inline mfn_t cf_check sh_next_page(mfn_t smfn) { struct page_info *pg = mfn_to_page(smfn), *next; struct page_list_head h = PAGE_LIST_HEAD_INIT(h); ASSERT(pg->u.sh.type == SH_type_l1_32_shadow || pg->u.sh.type == SH_type_fl1_32_shadow || pg->u.sh.type == SH_type_l2_32_shadow); ASSERT(pg->u.sh.type == SH_type_l2_32_shadow || pg->u.sh.head); next = page_list_next(pg, &h); ASSERT(next); ASSERT(next->u.sh.type == pg->u.sh.type); ASSERT(!next->u.sh.head); return page_to_mfn(next); } #else # define sh_next_page NULL #endif #define shadow_set_l2e(d, sl2e, new_sl2e, sl2mfn) \ shadow_set_l2e(d, sl2e, new_sl2e, sl2mfn, SH_type_fl1_shadow, sh_next_page) static inline u32 guest_index(void *ptr) { return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t); } static u32 cf_check shadow_l1_index(mfn_t *smfn, u32 guest_index) { #if (GUEST_PAGING_LEVELS == 2) ASSERT(mfn_to_page(*smfn)->u.sh.head); if ( guest_index >= SHADOW_L1_PAGETABLE_ENTRIES ) *smfn = sh_next_page(*smfn); return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES); #else return guest_index; #endif } static u32 cf_check shadow_l2_index(mfn_t *smfn, u32 guest_index) { #if (GUEST_PAGING_LEVELS == 2) int i; ASSERT(mfn_to_page(*smfn)->u.sh.head); // Because we use 2 shadow l2 entries for each guest entry, the number of // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2 for ( i = 0; i < guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2); i++ ) *smfn = sh_next_page(*smfn); // We multiply by two to get the index of the first of the two entries // used to shadow the specified guest entry. 
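    // e.g. with 512-entry shadow pages (SHADOW_L2_PAGETABLE_ENTRIES == 512),
    // 32-bit guest index 1023 lands on the fourth shadow page, at slot
    // (1023 % 256) * 2 = 510.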
return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2; #else return guest_index; #endif } #if GUEST_PAGING_LEVELS >= 4 static u32 cf_check shadow_l3_index(mfn_t *smfn, u32 guest_index) { return guest_index; } static u32 cf_check shadow_l4_index(mfn_t *smfn, u32 guest_index) { return guest_index; } #endif // GUEST_PAGING_LEVELS >= 4 /**************************************************************************/ /* Function which computes shadow entries from their corresponding guest * entries. This is the "heart" of the shadow code. It operates using * level-1 shadow types, but handles all levels of entry. * Don't call it directly, but use the four wrappers below. */ static always_inline void _sh_propagate(struct vcpu *v, guest_intpte_t guest_intpte, mfn_t target_mfn, void *shadow_entry_ptr, int level, fetch_type_t ft, p2m_type_t p2mt) { guest_l1e_t guest_entry = { guest_intpte }; shadow_l1e_t *sp = shadow_entry_ptr; struct domain *d = v->domain; gfn_t target_gfn = guest_l1e_get_gfn(guest_entry); u32 pass_thru_flags; u32 gflags, sflags; bool mmio_mfn; /* We don't shadow PAE l3s */ ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3); /* Check there's something for the shadows to map to */ if ( (!p2m_is_valid(p2mt) && !p2m_is_grant(p2mt)) || !gfn_valid(d, target_gfn) ) { *sp = shadow_l1e_empty(); goto done; } gflags = guest_l1e_get_flags(guest_entry); if ( unlikely(!(gflags & _PAGE_PRESENT)) ) { #if !(SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* If a guest l1 entry is not present, shadow with the magic * guest-not-present entry. */ if ( level == 1 ) *sp = sh_l1e_gnp(); else #endif /* !OOS */ *sp = shadow_l1e_empty(); goto done; } if ( level == 1 && p2mt == p2m_mmio_dm ) { /* Guest l1e maps emulated MMIO space */ *sp = sh_l1e_mmio(target_gfn, gflags); if ( sh_l1e_is_magic(*sp) ) d->arch.paging.shadow.has_fast_mmio_entries = true; goto done; } // Must have a valid target_mfn unless this is a prefetch or an l1 // pointing at MMIO space. In the case of a prefetch, an invalid // mfn means that we can not usefully shadow anything, and so we // return early. // mmio_mfn = !mfn_valid(target_mfn) || (level == 1 && page_get_owner(mfn_to_page(target_mfn)) == dom_io); if ( mmio_mfn && !(level == 1 && (!shadow_mode_refcounts(d) || p2mt == p2m_mmio_direct)) ) { ASSERT((ft == ft_prefetch)); *sp = shadow_l1e_empty(); goto done; } // Propagate bits from the guest to the shadow. // Some of these may be overwritten, below. // Since we know the guest's PRESENT bit is set, we also set the shadow's // SHADOW_PRESENT bit. // pass_thru_flags = (_PAGE_ACCESSED | _PAGE_USER | _PAGE_RW | _PAGE_PRESENT); if ( guest_nx_enabled(v) ) pass_thru_flags |= _PAGE_NX_BIT; if ( level == 1 && !shadow_mode_refcounts(d) && mmio_mfn ) pass_thru_flags |= PAGE_CACHE_ATTRS; sflags = gflags & pass_thru_flags; /* * For HVM domains with direct access to MMIO areas, set the correct * caching attributes in the shadows to match what was asked for. */ if ( (level == 1) && is_hvm_domain(d) && (mmio_mfn || !is_special_page(mfn_to_page(target_mfn))) ) { int type; ASSERT(!(sflags & PAGE_CACHE_ATTRS)); /* * Compute the PAT index for shadow page entry when IOMMU is enabled. * 1) direct MMIO: compute the PAT index with gMTRR=UC and gPAT. * 2) if enables snoop control, compute the PAT index as WB. * 3) if disables snoop control, compute the PAT index with * gMTRR and gPAT. 
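         * In short: when the IOMMU can enforce coherency ("snoop control"),
         * guest RAM can safely be mapped write-back; otherwise the guest's
         * MTRR/PAT settings must be honoured so that CPU and device views
         * of memory stay consistent.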
*/ if ( !mmio_mfn && (type = hvm_get_mem_pinned_cacheattr(d, target_gfn, 0)) >= 0 ) sflags |= pat_type_2_pte_flags(type); else if ( d->arch.hvm.is_in_uc_mode ) sflags |= pat_type_2_pte_flags(X86_MT_UC); else if ( iomem_access_permitted(d, mfn_x(target_mfn), mfn_x(target_mfn)) ) { if ( p2mt == p2m_mmio_direct ) sflags |= get_pat_flags(v, gflags, gfn_to_paddr(target_gfn), mfn_to_maddr(target_mfn), X86_MT_UC); else if ( is_iommu_enabled(d) && iommu_snoop ) sflags |= pat_type_2_pte_flags(X86_MT_WB); else sflags |= get_pat_flags(v, gflags, gfn_to_paddr(target_gfn), mfn_to_maddr(target_mfn), NO_HARDCODE_MEM_TYPE); } } // Set the A&D bits for higher level shadows. // Higher level entries do not, strictly speaking, have dirty bits, but // since we use shadow linear tables, each of these entries may, at some // point in time, also serve as a shadow L1 entry. // By setting both the A&D bits in each of these, we eliminate the burden // on the hardware to update these bits on initial accesses. // if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) ) sflags |= _PAGE_ACCESSED | _PAGE_DIRTY; // If the A or D bit has not yet been set in the guest, then we must // prevent the corresponding kind of access. // if ( unlikely(!(gflags & _PAGE_ACCESSED)) ) sflags &= ~_PAGE_PRESENT; /* D bits exist in L1es and PSE L2es */ if ( unlikely(((level == 1) || ((level == 2) && (gflags & _PAGE_PSE) && guest_can_use_l2_superpages(v))) && !(gflags & _PAGE_DIRTY)) ) sflags &= ~_PAGE_RW; #ifdef CONFIG_HVM if ( unlikely(level == 1) && is_hvm_domain(d) ) { struct sh_dirty_vram *dirty_vram = d->arch.hvm.dirty_vram; if ( dirty_vram && dirty_vram->last_dirty == -1 && gfn_x(target_gfn) >= dirty_vram->begin_pfn && gfn_x(target_gfn) < dirty_vram->end_pfn ) { if ( ft & FETCH_TYPE_WRITE ) dirty_vram->last_dirty = NOW(); else sflags &= ~_PAGE_RW; } } #endif /* Read-only memory */ if ( p2m_is_readonly(p2mt) ) sflags &= ~_PAGE_RW; else if ( p2mt == p2m_mmio_direct && rangeset_contains_singleton(mmio_ro_ranges, mfn_x(target_mfn)) ) { sflags &= ~(_PAGE_RW | PAGE_CACHE_ATTRS); sflags |= _PAGE_UC; } // protect guest page tables // if ( unlikely((level == 1) && sh_mfn_is_a_page_table(target_mfn) #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC ) /* Unless the page is out of sync and the guest is writing to it. */ && !(mfn_oos_may_write(target_mfn) && (ft == ft_demand_write)) #endif /* OOS */ ) ) sflags &= ~_PAGE_RW; /* * shadow_mode_log_dirty support * * Only allow the guest write access to a page a) on a demand fault, * or b) if the page is already marked as dirty. * * (We handle log-dirty entirely inside the shadow code, without using the * p2m_ram_logdirty p2m type: only HAP uses that.) */ if ( level == 1 && unlikely(shadow_mode_log_dirty(d)) && !mmio_mfn ) { if ( ft & FETCH_TYPE_WRITE ) paging_mark_dirty(d, target_mfn); else if ( (sflags & _PAGE_RW) && !paging_mfn_is_dirty(d, target_mfn) ) sflags &= ~_PAGE_RW; } // PV guests in 64-bit mode use two different page tables for user vs // supervisor permissions, making the guest's _PAGE_USER bit irrelevant. // It is always shadowed as present... 
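    // (64-bit PV guest kernels run in ring 3, so every shadowed mapping must
    //  carry _PAGE_USER for the kernel to reach it; kernel-vs-user separation
    //  comes from switching between guest_table and guest_table_user instead,
    //  cf. sh_walk_guest_tables() above.)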
if ( (GUEST_PAGING_LEVELS == 4) && !is_hvm_domain(d) && !is_pv_32bit_domain(d) ) { sflags |= _PAGE_USER; } *sp = shadow_l1e_from_mfn(target_mfn, sflags); done: SHADOW_DEBUG(PROPAGATE, "%s level %u guest %" SH_PRI_gpte " shadow %" SH_PRI_pte "\n", fetch_type_names[ft], level, guest_entry.l1, sp->l1); } /* These four wrappers give us a little bit of type-safety back around * the use of void-* pointers and intpte types in _sh_propagate(), and * allow the compiler to optimize out some level checks. */ #if GUEST_PAGING_LEVELS >= 4 static void l4e_propagate_from_guest(struct vcpu *v, guest_l4e_t gl4e, mfn_t sl3mfn, shadow_l4e_t *sl4e, fetch_type_t ft) { if ( !mfn_eq(sl3mfn, INVALID_MFN) && (guest_l4e_get_flags(gl4e) & _PAGE_PRESENT) ) ASSERT(!guest_l4e_rsvd_bits(v, gl4e)); _sh_propagate(v, gl4e.l4, sl3mfn, sl4e, 4, ft, p2m_ram_rw); } static void l3e_propagate_from_guest(struct vcpu *v, guest_l3e_t gl3e, mfn_t sl2mfn, shadow_l3e_t *sl3e, fetch_type_t ft) { if ( !mfn_eq(sl2mfn, INVALID_MFN) && (guest_l3e_get_flags(gl3e) & _PAGE_PRESENT) ) ASSERT(!guest_l3e_rsvd_bits(v, gl3e)); _sh_propagate(v, gl3e.l3, sl2mfn, sl3e, 3, ft, p2m_ram_rw); } #endif // GUEST_PAGING_LEVELS >= 4 static void l2e_propagate_from_guest(struct vcpu *v, guest_l2e_t gl2e, mfn_t sl1mfn, shadow_l2e_t *sl2e, fetch_type_t ft) { if ( !mfn_eq(sl1mfn, INVALID_MFN) && (guest_l2e_get_flags(gl2e) & _PAGE_PRESENT) ) ASSERT(!guest_l2e_rsvd_bits(v, gl2e)); _sh_propagate(v, gl2e.l2, sl1mfn, sl2e, 2, ft, p2m_ram_rw); } static void l1e_propagate_from_guest(struct vcpu *v, guest_l1e_t gl1e, mfn_t gmfn, shadow_l1e_t *sl1e, fetch_type_t ft, p2m_type_t p2mt) { if ( !mfn_eq(gmfn, INVALID_MFN) && (guest_l1e_get_flags(gl1e) & _PAGE_PRESENT) ) ASSERT(!guest_l1e_rsvd_bits(v, gl1e)); _sh_propagate(v, gl1e.l1, gmfn, sl1e, 1, ft, p2mt); } /**************************************************************************/ /* Macros to walk pagetables. These take the shadow of a pagetable and * walk every "interesting" entry. That is, they don't touch Xen mappings, * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every * second entry (since pairs of entries are managed together). For multi-page * shadows they walk all pages. * * Arguments are an MFN, the variable to point to each entry, a variable * to indicate that we are done (we will shortcut to the end of the scan * when _done != 0), a variable to indicate that we should avoid Xen mappings, * and the code. * * WARNING: These macros have side-effects. They change the values of both * the pointer and the MFN. 
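 * The guest-entry pointer argument may be NULL when there is no guest
 * table to walk alongside the shadow (e.g. in the destructors below);
 * increment_ptr_to_guest_entry() tolerates that.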
*/ static inline void increment_ptr_to_guest_entry(void *ptr) { if ( ptr ) { guest_l1e_t **entry = ptr; (*entry)++; } } /* All kinds of l1: touch all entries */ #define _FOREACH_PRESENT_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \ do { \ int _i; \ shadow_l1e_t *_sp = map_domain_page((_sl1mfn)); \ ASSERT(mfn_to_page(_sl1mfn)->u.sh.type == SH_type_l1_shadow \ || mfn_to_page(_sl1mfn)->u.sh.type == SH_type_fl1_shadow);\ for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \ { \ (_sl1e) = _sp + _i; \ if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \ {_code} \ if ( _done ) break; \ increment_ptr_to_guest_entry(_gl1p); \ } \ unmap_domain_page(_sp); \ } while (0) /* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */ #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2 #define FOREACH_PRESENT_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \ do { \ int __done = 0; \ _FOREACH_PRESENT_L1E(_sl1mfn, _sl1e, _gl1p, \ ({ (__done = _done); }), _code); \ _sl1mfn = sh_next_page(_sl1mfn); \ if ( !__done ) \ _FOREACH_PRESENT_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code); \ } while (0) #else /* Everything else; l1 shadows are only one page */ #define FOREACH_PRESENT_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \ _FOREACH_PRESENT_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) #endif #if GUEST_PAGING_LEVELS == 2 /* 32-bit l2 on PAE/64: four pages, touch every second entry */ #define FOREACH_PRESENT_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \ do { \ int _i, _j; \ ASSERT(shadow_mode_external(_dom)); \ ASSERT(mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2_32_shadow); \ for ( _j = 0; _j < 4; _j++ ) \ { \ shadow_l2e_t *_sp = map_domain_page(_sl2mfn); \ for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \ { \ (_sl2e) = _sp + _i; \ if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \ {_code} \ if ( _done ) break; \ increment_ptr_to_guest_entry(_gl2p); \ } \ unmap_domain_page(_sp); \ if ( _j < 3 ) _sl2mfn = sh_next_page(_sl2mfn); \ if ( _i < SHADOW_L2_PAGETABLE_ENTRIES ) break; \ } \ } while (0) #elif GUEST_PAGING_LEVELS == 3 /* PAE: touch all entries */ #define FOREACH_PRESENT_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \ do { \ int _i; \ shadow_l2e_t *_sp = map_domain_page((_sl2mfn)); \ ASSERT(shadow_mode_external(_dom)); \ ASSERT(mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2_pae_shadow); \ for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \ { \ (_sl2e) = _sp + _i; \ if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \ {_code} \ if ( _done ) break; \ increment_ptr_to_guest_entry(_gl2p); \ } \ unmap_domain_page(_sp); \ } while (0) #else /* 64-bit l2: touch all entries except for PAE compat guests. 
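 * For 32-bit PV ("compat") guests the l2h shadow carries Xen mappings in
 * its upper slots, so for that shadow type the walk stops at
 * COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT.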
*/ #define FOREACH_PRESENT_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \ do { \ unsigned int _i, _end = SHADOW_L2_PAGETABLE_ENTRIES; \ shadow_l2e_t *_sp = map_domain_page((_sl2mfn)); \ ASSERT_VALID_L2(mfn_to_page(_sl2mfn)->u.sh.type); \ if ( is_pv_32bit_domain(_dom) /* implies !shadow_mode_external */ && \ mfn_to_page(_sl2mfn)->u.sh.type != SH_type_l2_64_shadow ) \ _end = COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(_dom); \ for ( _i = 0; _i < _end; ++_i ) \ { \ (_sl2e) = _sp + _i; \ if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \ { \ _code; \ } \ if ( _done ) \ break; \ increment_ptr_to_guest_entry(_gl2p); \ } \ unmap_domain_page(_sp); \ } while (0) #endif /* different kinds of l2 */ #if GUEST_PAGING_LEVELS == 4 /* 64-bit l3: touch all entries */ #define FOREACH_PRESENT_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \ do { \ int _i; \ shadow_l3e_t *_sp = map_domain_page((_sl3mfn)); \ ASSERT(mfn_to_page(_sl3mfn)->u.sh.type == SH_type_l3_64_shadow);\ for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \ { \ (_sl3e) = _sp + _i; \ if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \ {_code} \ if ( _done ) break; \ increment_ptr_to_guest_entry(_gl3p); \ } \ unmap_domain_page(_sp); \ } while (0) /* 64-bit l4: avoid Xen mappings */ #define FOREACH_PRESENT_L4E(_sl4mfn, _sl4e, _gl4p, _done, _dom, _code) \ do { \ shadow_l4e_t *_sp = map_domain_page((_sl4mfn)); \ int _xen = !shadow_mode_external(_dom); \ int _i; \ ASSERT(mfn_to_page(_sl4mfn)->u.sh.type == SH_type_l4_64_shadow);\ for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \ { \ if ( (!(_xen)) || is_guest_l4_slot(_dom, _i) ) \ { \ (_sl4e) = _sp + _i; \ if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \ {_code} \ if ( _done ) break; \ } \ increment_ptr_to_guest_entry(_gl4p); \ } \ unmap_domain_page(_sp); \ } while (0) #endif /**************************************************************************/ /* Create a shadow of a given guest page. */ static mfn_t cf_check sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type) { struct domain *d = v->domain; mfn_t smfn = shadow_alloc(d, shadow_type, mfn_x(gmfn)); SHADOW_DEBUG(MAKE_SHADOW, "(%"PRI_mfn", %u)=>%"PRI_mfn"\n", mfn_x(gmfn), shadow_type, mfn_x(smfn)); if ( sh_type_has_up_pointer(d, shadow_type) ) /* Lower-level shadow, not yet linked form a higher level */ mfn_to_page(smfn)->up = 0; #if GUEST_PAGING_LEVELS >= 4 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL) if ( shadow_type == SH_type_l4_64_shadow && unlikely(d->arch.paging.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) ) { /* We're shadowing a new l4, but we've been assuming the guest uses * only one l4 per vcpu and context switches using an l4 entry. * Count the number of active l4 shadows. If there are enough * of them, decide that this isn't an old linux guest, and stop * pinning l3es. This is not very quick but it doesn't happen * very often. */ struct page_info *sp, *t; unsigned int l4count = 0; page_list_for_each(sp, &d->arch.paging.shadow.pinned_shadows) { if ( sp->u.sh.type == SH_type_l4_64_shadow ) l4count++; } if ( l4count > 2 * d->max_vcpus ) { /* Unpin all the pinned l3 tables, and don't pin any more. */ page_list_for_each_safe(sp, t, &d->arch.paging.shadow.pinned_shadows) { if ( sp->u.sh.type == SH_type_l3_64_shadow ) sh_unpin(d, page_to_mfn(sp)); } d->arch.paging.shadow.opt_flags &= ~SHOPT_LINUX_L3_TOPLEVEL; } } #endif // Create the Xen mappings... 
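    // (Only for guests that share an address space with Xen, i.e. PV;
    //  external-mode (HVM) guests get no Xen mappings in their shadows.)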
if ( !shadow_mode_external(d) ) { switch (shadow_type) { case SH_type_l4_shadow: { shadow_l4e_t *l4t = map_domain_page(smfn); BUILD_BUG_ON(sizeof(l4_pgentry_t) != sizeof(shadow_l4e_t)); init_xen_l4_slots(l4t, gmfn, d, smfn, (!is_pv_32bit_domain(d) && VM_ASSIST(d, m2p_strict))); unmap_domain_page(l4t); } break; #ifdef CONFIG_PV32 case SH_type_l2h_shadow: BUILD_BUG_ON(sizeof(l2_pgentry_t) != sizeof(shadow_l2e_t)); if ( is_pv_32bit_domain(d) ) { shadow_l2e_t *l2t = map_domain_page(smfn); init_xen_pae_l2_slots(l2t, d); unmap_domain_page(l2t); } break; #endif default: /* Do nothing */ break; } } #endif /* GUEST_PAGING_LEVELS >= 4 */ shadow_promote(d, gmfn, shadow_type); set_shadow_status(d, gmfn, shadow_type, smfn); return smfn; } /* Make a splintered superpage shadow */ static mfn_t make_fl1_shadow(struct domain *d, gfn_t gfn) { mfn_t smfn = shadow_alloc(d, SH_type_fl1_shadow, gfn_x(gfn)); SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" PRI_mfn "\n", gfn_x(gfn), mfn_x(smfn)); set_fl1_shadow_status(d, gfn, smfn); return smfn; } /**************************************************************************/ /* These functions also take a virtual address and return the level-N * shadow table mfn and entry, but they create the shadow pagetables if * they are needed. The "demand" argument is non-zero when handling * a demand fault (so we know what to do about accessed bits &c). * If the necessary tables are not present in the guest, they return NULL. */ /* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has * more levels than the guest, the upper levels are always fixed and do not * reflect any information from the guest, so we do not use these functions * to access them. */ #if GUEST_PAGING_LEVELS >= 4 static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v, walk_t *gw, mfn_t *sl4mfn) { /* There is always a shadow of the top level table. Get it. */ *sl4mfn = pagetable_get_mfn(v->arch.paging.shadow.shadow_table[0]); /* Reading the top level table is always valid. */ return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va); } static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v, walk_t *gw, mfn_t *sl3mfn, fetch_type_t ft, int *resync) { struct domain *d = v->domain; mfn_t sl4mfn; shadow_l4e_t *sl4e; if ( !mfn_valid(gw->l3mfn) ) return NULL; /* No guest page. */ /* Get the l4e */ sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn); ASSERT(sl4e != NULL); if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT ) { *sl3mfn = shadow_l4e_get_mfn(*sl4e); ASSERT(mfn_valid(*sl3mfn)); } else { int r; shadow_l4e_t new_sl4e; /* No l3 shadow installed: find and install it. */ *sl3mfn = get_shadow_status(d, gw->l3mfn, SH_type_l3_shadow); if ( !mfn_valid(*sl3mfn) ) { /* No l3 shadow of this page exists at all: make one. */ *sl3mfn = sh_make_shadow(v, gw->l3mfn, SH_type_l3_shadow); } /* Install the new sl3 table in the sl4e */ l4e_propagate_from_guest(v, gw->l4e, *sl3mfn, &new_sl4e, ft); r = shadow_set_l4e(d, sl4e, new_sl4e, sl4mfn); ASSERT((r & SHADOW_SET_FLUSH) == 0); if ( r & SHADOW_SET_ERROR ) return NULL; #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC ) *resync |= 1; #endif } /* Now follow it down a level. Guaranteed to succeed. */ return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va); } #endif /* GUEST_PAGING_LEVELS >= 4 */ static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v, walk_t *gw, mfn_t *sl2mfn, fetch_type_t ft, int *resync) { #if GUEST_PAGING_LEVELS >= 4 /* 64bit... 
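 * Find (or create) the shadow of the guest l2 via its l3: get the shadow
 * l3e, install the l2 shadow there if it was missing, and return a
 * pointer into the linear map of the shadow l2.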
*/ struct domain *d = v->domain; mfn_t sl3mfn = INVALID_MFN; shadow_l3e_t *sl3e; if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */ /* Get the l3e */ sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft, resync); if ( sl3e == NULL ) return NULL; if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT ) { *sl2mfn = shadow_l3e_get_mfn(*sl3e); ASSERT(mfn_valid(*sl2mfn)); } else { int r; shadow_l3e_t new_sl3e; unsigned int t = SH_type_l2_shadow; #ifdef CONFIG_PV32 /* Tag compat L2 containing hypervisor (m2p) mappings */ if ( is_pv_32bit_domain(d) && guest_l4_table_offset(gw->va) == 0 && guest_l3_table_offset(gw->va) == 3 ) t = SH_type_l2h_shadow; #endif /* No l2 shadow installed: find and install it. */ *sl2mfn = get_shadow_status(d, gw->l2mfn, t); if ( !mfn_valid(*sl2mfn) ) { /* No l2 shadow of this page exists at all: make one. */ *sl2mfn = sh_make_shadow(v, gw->l2mfn, t); } /* Install the new sl2 table in the sl3e */ l3e_propagate_from_guest(v, gw->l3e, *sl2mfn, &new_sl3e, ft); r = shadow_set_l3e(d, sl3e, new_sl3e, sl3mfn); ASSERT((r & SHADOW_SET_FLUSH) == 0); if ( r & SHADOW_SET_ERROR ) return NULL; #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC ) *resync |= 1; #endif } /* Now follow it down a level. Guaranteed to succeed. */ return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va); #elif !defined(CONFIG_HVM) return NULL; #elif GUEST_PAGING_LEVELS == 3 /* PAE... */ /* We never demand-shadow PAE l3es: they are only created in * sh_update_cr3(). Check if the relevant sl3e is present. */ shadow_l3e_t *sl3e = ((shadow_l3e_t *)&v->arch.paging.shadow.l3table) + shadow_l3_linear_offset(gw->va); if ( !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) ) return NULL; *sl2mfn = shadow_l3e_get_mfn(*sl3e); ASSERT(mfn_valid(*sl2mfn)); return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va); #else /* 32bit... */ /* There is always a shadow of the top level table. Get it. */ *sl2mfn = pagetable_get_mfn(v->arch.paging.shadow.shadow_table[0]); /* This next line is important: the guest l2 has a 16k * shadow, we need to return the right mfn of the four. This * call will set it for us as a side-effect. */ (void) shadow_l2_index(sl2mfn, guest_l2_table_offset(gw->va)); /* Reading the top level table is always valid. */ return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va); #endif } static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v, walk_t *gw, mfn_t *sl1mfn, fetch_type_t ft) { struct domain *d = v->domain; mfn_t sl2mfn; int resync = 0; shadow_l2e_t *sl2e; /* Get the l2e */ sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft, &resync); if ( sl2e == NULL ) return NULL; /* Install the sl1 in the l2e if it wasn't there or if we need to * re-do it to fix a PSE dirty bit. */ if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT && likely(ft != ft_demand_write || (shadow_l2e_get_flags(*sl2e) & _PAGE_RW) || !(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) ) { *sl1mfn = shadow_l2e_get_mfn(*sl2e); ASSERT(mfn_valid(*sl1mfn)); } else { shadow_l2e_t new_sl2e; int r, flags = guest_l2e_get_flags(gw->l2e); /* No l1 shadow installed: find and install it. */ if ( !(flags & _PAGE_PRESENT) ) return NULL; /* No guest page. */ if ( guest_can_use_l2_superpages(v) && (flags & _PAGE_PSE) ) { /* Splintering a superpage */ gfn_t l2gfn = guest_l2e_get_gfn(gw->l2e); *sl1mfn = get_fl1_shadow_status(d, l2gfn); if ( !mfn_valid(*sl1mfn) ) { /* No fl1 shadow of this superpage exists at all: make one. 
*/ *sl1mfn = make_fl1_shadow(d, l2gfn); } } else { /* Shadowing an actual guest l1 table */ if ( !mfn_valid(gw->l1mfn) ) return NULL; /* No guest page. */ *sl1mfn = get_shadow_status(d, gw->l1mfn, SH_type_l1_shadow); if ( !mfn_valid(*sl1mfn) ) { /* No l1 shadow of this page exists at all: make one. */ *sl1mfn = sh_make_shadow(v, gw->l1mfn, SH_type_l1_shadow); } } /* Install the new sl1 table in the sl2e */ l2e_propagate_from_guest(v, gw->l2e, *sl1mfn, &new_sl2e, ft); r = shadow_set_l2e(d, sl2e, new_sl2e, sl2mfn); ASSERT((r & SHADOW_SET_FLUSH) == 0); if ( r & SHADOW_SET_ERROR ) return NULL; /* This next line is important: in 32-on-PAE and 32-on-64 modes, * the guest l1 table has an 8k shadow, and we need to return * the right mfn of the pair. This call will set it for us as a * side-effect. (In all other cases, it's a no-op and will be * compiled out.) */ (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va)); } #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC ) /* All pages walked are now pagetables. Safe to resync pages in case level 4 or 3 shadows were set. */ if ( resync ) shadow_resync_all(v); #endif /* Now follow it down a level. Guaranteed to succeed. */ return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va); } /**************************************************************************/ /* Destructors for shadow tables: * Unregister the shadow, decrement refcounts of any entries present in it, * and release the memory. * * N.B. These destructors do not clear the contents of the shadows. * This allows us to delay TLB shootdowns until the page is being reused. * See shadow_alloc() and shadow_free() for how this is handled. */ #if GUEST_PAGING_LEVELS >= 4 void sh_destroy_l4_shadow(struct domain *d, mfn_t smfn) { shadow_l4e_t *sl4e; struct page_info *sp = mfn_to_page(smfn); u32 t = sp->u.sh.type; mfn_t gmfn, sl4mfn; SHADOW_DEBUG(DESTROY_SHADOW, "%"PRI_mfn"\n", mfn_x(smfn)); ASSERT(t == SH_type_l4_shadow); ASSERT(sp->u.sh.head); /* Record that the guest page isn't shadowed any more (in this type) */ gmfn = backpointer(sp); delete_shadow_status(d, gmfn, t, smfn); shadow_demote(d, gmfn, t); /* Decrement refcounts of all the old entries */ sl4mfn = smfn; FOREACH_PRESENT_L4E(sl4mfn, sl4e, NULL, 0, d, { sh_put_ref(d, shadow_l4e_get_mfn(*sl4e), mfn_to_maddr(sl4mfn) | ((unsigned long)sl4e & ~PAGE_MASK)); }); /* Put the memory back in the pool */ shadow_free(d, smfn); } void sh_destroy_l3_shadow(struct domain *d, mfn_t smfn) { shadow_l3e_t *sl3e; struct page_info *sp = mfn_to_page(smfn); u32 t = sp->u.sh.type; mfn_t gmfn, sl3mfn; SHADOW_DEBUG(DESTROY_SHADOW, "%"PRI_mfn"\n", mfn_x(smfn)); ASSERT(t == SH_type_l3_shadow); ASSERT(sp->u.sh.head); /* Record that the guest page isn't shadowed any more (in this type) */ gmfn = backpointer(sp); delete_shadow_status(d, gmfn, t, smfn); shadow_demote(d, gmfn, t); /* Decrement refcounts of all the old entries */ sl3mfn = smfn; FOREACH_PRESENT_L3E(sl3mfn, sl3e, NULL, 0, { sh_put_ref(d, shadow_l3e_get_mfn(*sl3e), mfn_to_maddr(sl3mfn) | ((unsigned long)sl3e & ~PAGE_MASK)); }); /* Put the memory back in the pool */ shadow_free(d, smfn); } #endif /* GUEST_PAGING_LEVELS >= 4 */ void sh_destroy_l2_shadow(struct domain *d, mfn_t smfn) { shadow_l2e_t *sl2e; struct page_info *sp = mfn_to_page(smfn); u32 t = sp->u.sh.type; mfn_t gmfn, sl2mfn; SHADOW_DEBUG(DESTROY_SHADOW, "%"PRI_mfn"\n", mfn_x(smfn)); ASSERT_VALID_L2(t); ASSERT(sp->u.sh.head); /* Record that the guest page isn't shadowed any more (in this type) */ gmfn = backpointer(sp); 
delete_shadow_status(d, gmfn, t, smfn); shadow_demote(d, gmfn, t); /* Decrement refcounts of all the old entries */ sl2mfn = smfn; FOREACH_PRESENT_L2E(sl2mfn, sl2e, NULL, 0, d, { sh_put_ref(d, shadow_l2e_get_mfn(*sl2e), mfn_to_maddr(sl2mfn) | ((unsigned long)sl2e & ~PAGE_MASK)); }); /* Put the memory back in the pool */ shadow_free(d, smfn); } void sh_destroy_l1_shadow(struct domain *d, mfn_t smfn) { shadow_l1e_t *sl1e; struct page_info *sp = mfn_to_page(smfn); u32 t = sp->u.sh.type; SHADOW_DEBUG(DESTROY_SHADOW, "%"PRI_mfn"\n", mfn_x(smfn)); ASSERT(t == SH_type_l1_shadow || t == SH_type_fl1_shadow); ASSERT(sp->u.sh.head); /* Record that the guest page isn't shadowed any more (in this type) */ if ( t == SH_type_fl1_shadow ) { gfn_t gfn = _gfn(sp->v.sh.back); delete_fl1_shadow_status(d, gfn, smfn); } else { mfn_t gmfn = backpointer(sp); delete_shadow_status(d, gmfn, t, smfn); shadow_demote(d, gmfn, t); } if ( shadow_mode_refcounts(d) ) { /* Decrement refcounts of all the old entries */ mfn_t sl1mfn = smfn; FOREACH_PRESENT_L1E(sl1mfn, sl1e, NULL, 0, { if ( !sh_l1e_is_magic(*sl1e) ) { shadow_vram_put_mfn(shadow_l1e_get_mfn(*sl1e), shadow_l1e_get_flags(*sl1e), sl1mfn, sl1e, d); shadow_put_page_from_l1e(*sl1e, d); } }); } /* Put the memory back in the pool */ shadow_free(d, smfn); } /**************************************************************************/ /* Functions to destroy non-Xen mappings in a pagetable hierarchy. * These are called from common code when we are running out of shadow * memory, and unpinning all the top-level shadows hasn't worked. * * With user_only == 1, we leave guest kernel-mode mappings in place too, * unhooking only the user-mode mappings * * This implementation is pretty crude and slow, but we hope that it won't * be called very often. */ #if GUEST_PAGING_LEVELS < 4 void sh_unhook_l2_mappings(struct domain *d, mfn_t sl2mfn, bool user_only) { shadow_l2e_t *sl2e; FOREACH_PRESENT_L2E(sl2mfn, sl2e, NULL, 0, d, { if ( !user_only || (sl2e->l2 & _PAGE_USER) ) shadow_set_l2e(d, sl2e, shadow_l2e_empty(), sl2mfn); }); } #elif GUEST_PAGING_LEVELS == 4 void sh_unhook_l4_mappings(struct domain *d, mfn_t sl4mfn, bool user_only) { shadow_l4e_t *sl4e; FOREACH_PRESENT_L4E(sl4mfn, sl4e, NULL, 0, d, { if ( !user_only || (sl4e->l4 & _PAGE_USER) ) shadow_set_l4e(d, sl4e, shadow_l4e_empty(), sl4mfn); }); } #endif /**************************************************************************/ /* Internal translation functions. * These functions require a pointer to the shadow entry that will be updated. */ /* These functions take a new guest entry, translate it to shadow and write * the shadow entry. * * They return the same bitmaps as the shadow_set_lXe() functions. 
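 * (i.e. an OR of the SHADOW_SET_* flags, e.g. SHADOW_SET_FLUSH or
 *  SHADOW_SET_ERROR, which callers use to decide whether a TLB flush or
 *  error handling is needed.)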
*/ #if GUEST_PAGING_LEVELS >= 4 static int cf_check validate_gl4e( struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se) { shadow_l4e_t new_sl4e; guest_l4e_t new_gl4e = *(guest_l4e_t *)new_ge; shadow_l4e_t *sl4p = se; mfn_t sl3mfn = INVALID_MFN; struct domain *d = v->domain; p2m_type_t p2mt; int result = 0; perfc_incr(shadow_validate_gl4e_calls); if ( (guest_l4e_get_flags(new_gl4e) & _PAGE_PRESENT) && !guest_l4e_rsvd_bits(v, new_gl4e) ) { gfn_t gl3gfn = guest_l4e_get_gfn(new_gl4e); mfn_t gl3mfn = get_gfn_query_unlocked(d, gfn_x(gl3gfn), &p2mt); if ( p2m_is_ram(p2mt) ) sl3mfn = get_shadow_status(d, gl3mfn, SH_type_l3_shadow); else if ( !p2m_is_pod(p2mt) ) result |= SHADOW_SET_ERROR; #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC ) if ( mfn_valid(sl3mfn) ) shadow_resync_all(v); #endif } l4e_propagate_from_guest(v, new_gl4e, sl3mfn, &new_sl4e, ft_prefetch); // check for updates to xen reserved slots if ( !shadow_mode_external(d) ) { int shadow_index = (((unsigned long)sl4p & ~PAGE_MASK) / sizeof(shadow_l4e_t)); int reserved_xen_slot = !is_guest_l4_slot(d, shadow_index); if ( unlikely(reserved_xen_slot) ) { // attempt by the guest to write to a xen reserved slot // SHADOW_PRINTK("out-of-range update " "sl4mfn=%"PRI_mfn" index=%#x val=%" SH_PRI_pte "\n", mfn_x(sl4mfn), shadow_index, new_sl4e.l4); if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT ) { printk(XENLOG_G_ERR "out-of-range l4e update\n"); result |= SHADOW_SET_ERROR; } // do not call shadow_set_l4e... return result; } } result |= shadow_set_l4e(d, sl4p, new_sl4e, sl4mfn); return result; } static int cf_check validate_gl3e( struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se) { struct domain *d = v->domain; shadow_l3e_t new_sl3e; guest_l3e_t new_gl3e = *(guest_l3e_t *)new_ge; shadow_l3e_t *sl3p = se; mfn_t sl2mfn = INVALID_MFN; p2m_type_t p2mt; int result = 0; perfc_incr(shadow_validate_gl3e_calls); if ( (guest_l3e_get_flags(new_gl3e) & _PAGE_PRESENT) && !guest_l3e_rsvd_bits(v, new_gl3e) ) { gfn_t gl2gfn = guest_l3e_get_gfn(new_gl3e); mfn_t gl2mfn = get_gfn_query_unlocked(d, gfn_x(gl2gfn), &p2mt); if ( p2m_is_ram(p2mt) ) sl2mfn = get_shadow_status(d, gl2mfn, SH_type_l2_shadow); else if ( !p2m_is_pod(p2mt) ) result |= SHADOW_SET_ERROR; #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC ) if ( mfn_valid(sl2mfn) ) shadow_resync_all(v); #endif } l3e_propagate_from_guest(v, new_gl3e, sl2mfn, &new_sl3e, ft_prefetch); result |= shadow_set_l3e(d, sl3p, new_sl3e, sl3mfn); return result; } #endif // GUEST_PAGING_LEVELS >= 4 static int cf_check validate_gl2e( struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se) { struct domain *d = v->domain; shadow_l2e_t new_sl2e; guest_l2e_t new_gl2e = *(guest_l2e_t *)new_ge; shadow_l2e_t *sl2p = se; mfn_t sl1mfn = INVALID_MFN; p2m_type_t p2mt; int result = 0; perfc_incr(shadow_validate_gl2e_calls); if ( (guest_l2e_get_flags(new_gl2e) & _PAGE_PRESENT) && !guest_l2e_rsvd_bits(v, new_gl2e) ) { gfn_t gl1gfn = guest_l2e_get_gfn(new_gl2e); if ( guest_can_use_l2_superpages(v) && (guest_l2e_get_flags(new_gl2e) & _PAGE_PSE) ) { // superpage -- need to look up the shadow L1 which holds the // splitters... sl1mfn = get_fl1_shadow_status(d, gl1gfn); #if 0 // XXX - it's possible that we want to do some kind of prefetch // for superpage fl1's here, but this is *not* on the demand path, // so we'll hold off trying that for now... 
// if ( !mfn_valid(sl1mfn) ) sl1mfn = make_fl1_shadow(d, gl1gfn); #endif } else { mfn_t gl1mfn = get_gfn_query_unlocked(d, gfn_x(gl1gfn), &p2mt); if ( p2m_is_ram(p2mt) ) sl1mfn = get_shadow_status(d, gl1mfn, SH_type_l1_shadow); else if ( !p2m_is_pod(p2mt) ) result |= SHADOW_SET_ERROR; } } l2e_propagate_from_guest(v, new_gl2e, sl1mfn, &new_sl2e, ft_prefetch); result |= shadow_set_l2e(d, sl2p, new_sl2e, sl2mfn); return result; } static int cf_check validate_gl1e( struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se) { struct domain *d = v->domain; shadow_l1e_t new_sl1e; guest_l1e_t new_gl1e = *(guest_l1e_t *)new_ge; shadow_l1e_t *sl1p = se; gfn_t gfn; mfn_t gmfn = INVALID_MFN; p2m_type_t p2mt = p2m_invalid; int result = 0; #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) mfn_t gl1mfn; #endif /* OOS */ perfc_incr(shadow_validate_gl1e_calls); if ( (guest_l1e_get_flags(new_gl1e) & _PAGE_PRESENT) && !guest_l1e_rsvd_bits(v, new_gl1e) ) { gfn = guest_l1e_get_gfn(new_gl1e); gmfn = get_gfn_query_unlocked(d, gfn_x(gfn), &p2mt); } l1e_propagate_from_guest(v, new_gl1e, gmfn, &new_sl1e, ft_prefetch, p2mt); result |= shadow_set_l1e(d, sl1p, new_sl1e, p2mt, sl1mfn); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) gl1mfn = backpointer(mfn_to_page(sl1mfn)); if ( mfn_valid(gl1mfn) && mfn_is_out_of_sync(gl1mfn) ) { /* Update the OOS snapshot. */ mfn_t snpmfn = oos_snapshot_lookup(d, gl1mfn); guest_l1e_t *snp; ASSERT(mfn_valid(snpmfn)); snp = map_domain_page(snpmfn); snp[guest_index(new_ge)] = new_gl1e; unmap_domain_page(snp); } #endif /* OOS */ return result; } #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /**************************************************************************/ /* Special validation function for re-syncing out-of-sync shadows. * Walks the *shadow* page, and for every entry that it finds, * revalidates the guest entry that corresponds to it. * N.B. This function is called with the vcpu that unsynced the page, * *not* the one that is causing it to be resynced. */ void sh_resync_l1(struct vcpu *v, mfn_t gl1mfn, mfn_t snpmfn) { struct domain *d = v->domain; mfn_t sl1mfn; shadow_l1e_t *sl1p; guest_l1e_t *gl1p, *gp, *snp; int rc = 0; ASSERT(mfn_valid(snpmfn)); sl1mfn = get_shadow_status(d, gl1mfn, SH_type_l1_shadow); ASSERT(mfn_valid(sl1mfn)); /* Otherwise we would not have been called */ snp = map_domain_page(snpmfn); gp = map_domain_page(gl1mfn); gl1p = gp; FOREACH_PRESENT_L1E(sl1mfn, sl1p, &gl1p, 0, { guest_l1e_t gl1e = *gl1p; if ( snp[guest_index(gl1p)].l1 != gl1e.l1 ) { gfn_t gfn; mfn_t gmfn = INVALID_MFN; p2m_type_t p2mt = p2m_invalid; shadow_l1e_t nsl1e; if ( (guest_l1e_get_flags(gl1e) & _PAGE_PRESENT) && !guest_l1e_rsvd_bits(v, gl1e) ) { gfn = guest_l1e_get_gfn(gl1e); gmfn = get_gfn_query_unlocked(d, gfn_x(gfn), &p2mt); } l1e_propagate_from_guest(v, gl1e, gmfn, &nsl1e, ft_prefetch, p2mt); rc |= shadow_set_l1e(d, sl1p, nsl1e, p2mt, sl1mfn); snp[guest_index(gl1p)] = gl1e; } }); unmap_domain_page(gp); unmap_domain_page(snp); /* Setting shadow L1 entries should never need us to flush the TLB */ ASSERT(!(rc & SHADOW_SET_FLUSH)); } /* Figure out whether it's definitely safe not to sync this l1 table. * That is: if we can tell that it's only used once, and that the * toplevel shadow responsible is not one of ours. * N.B. This function is called with the vcpu that required the resync, * *not* the one that originally unsynced the page, but it is * called in the *mode* of the vcpu that unsynced it. Clear? Good. 
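 * The test below walks the shadow "up" pointers (l1 -> l2 -> l3 -> l4),
 * requiring a single reference at each step, and finally checks that the
 * toplevel shadow reached is not one of this vcpu's current shadow_tables.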
*/ int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn) { struct domain *d = v->domain; struct page_info *sp; mfn_t smfn; unsigned int i; if ( !sh_type_has_up_pointer(d, SH_type_l1_shadow) ) return 0; smfn = get_shadow_status(d, gl1mfn, SH_type_l1_shadow); ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */ /* Up to l2 */ sp = mfn_to_page(smfn); if ( sp->u.sh.count != 1 || !sp->up ) return 0; smfn = maddr_to_mfn(sp->up); ASSERT(mfn_valid(smfn)); #if (SHADOW_PAGING_LEVELS == 4) /* up to l3 */ sp = mfn_to_page(smfn); ASSERT(sh_type_has_up_pointer(d, SH_type_l2_shadow)); if ( sp->u.sh.count != 1 || !sp->up ) return 0; smfn = maddr_to_mfn(sp->up); ASSERT(mfn_valid(smfn)); /* up to l4 */ sp = mfn_to_page(smfn); if ( sp->u.sh.count != 1 || !sh_type_has_up_pointer(d, SH_type_l3_64_shadow) || !sp->up ) return 0; smfn = maddr_to_mfn(sp->up); ASSERT(mfn_valid(smfn)); #endif for_each_shadow_table(v, i) if ( pagetable_get_pfn(v->arch.paging.shadow.shadow_table[i]) == mfn_x(smfn) ) return 0; /* Only in use in one toplevel shadow, and it's not the one we're * running on */ return 1; } #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */ /**************************************************************************/ /* Functions which translate and install the shadows of arbitrary guest * entries that we have just seen the guest write. */ static inline int sh_map_and_validate(struct vcpu *v, mfn_t gmfn, void *new_gp, u32 size, u32 sh_type, u32 (*shadow_index)(mfn_t *smfn, u32 idx), int (*validate_ge)(struct vcpu *v, void *ge, mfn_t smfn, void *se)) /* Generic function for mapping and validating. */ { struct domain *d = v->domain; mfn_t smfn, smfn2, map_mfn; shadow_l1e_t *sl1p; u32 shadow_idx, guest_idx; int result = 0; /* Align address and size to guest entry boundaries */ size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1); new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1)); size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1); ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE); /* Map the shadow page */ smfn = get_shadow_status(d, gmfn, sh_type); ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */ guest_idx = guest_index(new_gp); map_mfn = smfn; shadow_idx = shadow_index(&map_mfn, guest_idx); sl1p = map_domain_page(map_mfn); /* Validate one entry at a time */ while ( size ) { smfn2 = smfn; guest_idx = guest_index(new_gp); shadow_idx = shadow_index(&smfn2, guest_idx); if ( !mfn_eq(smfn2, map_mfn) ) { /* We have moved to another page of the shadow */ map_mfn = smfn2; unmap_domain_page(sl1p); sl1p = map_domain_page(map_mfn); } result |= validate_ge(v, new_gp, map_mfn, &sl1p[shadow_idx]); size -= sizeof(guest_l1e_t); new_gp += sizeof(guest_l1e_t); } unmap_domain_page(sl1p); return result; } int sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn, void *new_gl4p, u32 size) { #if GUEST_PAGING_LEVELS >= 4 return sh_map_and_validate(v, gl4mfn, new_gl4p, size, SH_type_l4_shadow, shadow_l4_index, validate_gl4e); #else // ! GUEST_PAGING_LEVELS >= 4 BUG(); /* Called in wrong paging mode! */ #endif } int sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn, void *new_gl3p, u32 size) { #if GUEST_PAGING_LEVELS >= 4 return sh_map_and_validate(v, gl3mfn, new_gl3p, size, SH_type_l3_shadow, shadow_l3_index, validate_gl3e); #else // ! GUEST_PAGING_LEVELS >= 4 BUG(); /* Called in wrong paging mode! 
*/ #endif } int sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn, void *new_gl2p, u32 size) { return sh_map_and_validate(v, gl2mfn, new_gl2p, size, SH_type_l2_shadow, shadow_l2_index, validate_gl2e); } int sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn, void *new_gl2p, u32 size) { #if GUEST_PAGING_LEVELS >= 4 && defined(CONFIG_PV32) return sh_map_and_validate(v, gl2mfn, new_gl2p, size, SH_type_l2h_shadow, shadow_l2_index, validate_gl2e); #else /* Non-PAE guests don't have different kinds of l2 table */ BUG(); /* Called in wrong paging mode! */ #endif } int sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn, void *new_gl1p, u32 size) { return sh_map_and_validate(v, gl1mfn, new_gl1p, size, SH_type_l1_shadow, shadow_l1_index, validate_gl1e); } /**************************************************************************/ /* Optimization: Prefetch multiple L1 entries. This is called after we have * demand-faulted a shadow l1e in the fault handler, to see if it's * worth fetching some more. */ #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH /* XXX magic number */ #define PREFETCH_DISTANCE 32 static void sh_prefetch(struct vcpu *v, walk_t *gw, shadow_l1e_t *ptr_sl1e, mfn_t sl1mfn) { struct domain *d = v->domain; int i, dist; gfn_t gfn; mfn_t gmfn; guest_l1e_t *gl1p = NULL, gl1e; shadow_l1e_t sl1e; u32 gflags; p2m_type_t p2mt; #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) guest_l1e_t *snpl1p = NULL; #endif /* OOS */ /* Prefetch no further than the end of the _shadow_ l1 MFN */ dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e; /* And no more than a maximum fetches-per-fault */ if ( dist > PREFETCH_DISTANCE ) dist = PREFETCH_DISTANCE; if ( mfn_valid(gw->l1mfn) ) { /* Normal guest page; grab the next guest entry */ gl1p = map_domain_page(gw->l1mfn); gl1p += guest_l1_table_offset(gw->va); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) if ( mfn_is_out_of_sync(gw->l1mfn) ) { mfn_t snpmfn = oos_snapshot_lookup(d, gw->l1mfn); ASSERT(mfn_valid(snpmfn)); snpl1p = map_domain_page(snpmfn); snpl1p += guest_l1_table_offset(gw->va); } #endif /* OOS */ } for ( i = 1; i < dist ; i++ ) { /* No point in prefetching if there's already a shadow */ if ( ptr_sl1e[i].l1 != 0 ) break; if ( mfn_valid(gw->l1mfn) ) { /* Normal guest page; grab the next guest entry */ gl1e = gl1p[i]; /* Not worth continuing if we hit an entry that will need another * fault for A/D-bit propagation anyway */ gflags = guest_l1e_get_flags(gl1e); if ( (gflags & _PAGE_PRESENT) && (!(gflags & _PAGE_ACCESSED) || ((gflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY))) ) break; } else { /* Fragmented superpage, unless we've been called wrongly */ ASSERT(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE); /* Increment the l1e's GFN by the right number of guest pages */ gl1e = guest_l1e_from_gfn( _gfn(gfn_x(guest_l1e_get_gfn(gw->l1e)) + i), guest_l1e_get_flags(gw->l1e)); } /* Look at the gfn that the l1e is pointing at */ if ( (guest_l1e_get_flags(gl1e) & _PAGE_PRESENT) && !guest_l1e_rsvd_bits(v, gl1e) ) { gfn = guest_l1e_get_gfn(gl1e); gmfn = get_gfn_query_unlocked(d, gfn_x(gfn), &p2mt); } else { gmfn = INVALID_MFN; p2mt = p2m_invalid; } /* Propagate the entry. 
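         * (The shadow_set_l1e() return value is not checked here: if a
         *  prefetched entry cannot be installed, the guest simply takes a
         *  demand fault on it later.)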
*/ l1e_propagate_from_guest(v, gl1e, gmfn, &sl1e, ft_prefetch, p2mt); shadow_set_l1e(d, ptr_sl1e + i, sl1e, p2mt, sl1mfn); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) if ( snpl1p != NULL ) snpl1p[i] = gl1e; #endif /* OOS */ } if ( gl1p != NULL ) unmap_domain_page(gl1p); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) if ( snpl1p != NULL ) unmap_domain_page(snpl1p); #endif /* OOS */ } #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */ #if GUEST_PAGING_LEVELS == 4 typedef u64 guest_va_t; typedef u64 guest_pa_t; #elif GUEST_PAGING_LEVELS == 3 typedef u32 guest_va_t; typedef u64 guest_pa_t; #else typedef u32 guest_va_t; typedef u32 guest_pa_t; #endif static inline void trace_shadow_gen(u32 event, guest_va_t va) { if ( tb_init_done ) { event |= (GUEST_PAGING_LEVELS-2)<<8; __trace_var(event, 0/*!tsc*/, sizeof(va), &va); } } static inline void trace_shadow_fixup(guest_l1e_t gl1e, guest_va_t va) { if ( tb_init_done ) { struct __packed { /* for PAE, guest_l1e may be 64 while guest_va may be 32; so put it first for alignment sake. */ guest_l1e_t gl1e; guest_va_t va; u32 flags; } d; u32 event; event = TRC_SHADOW_FIXUP | ((GUEST_PAGING_LEVELS-2)<<8); d.gl1e = gl1e; d.va = va; d.flags = this_cpu(trace_shadow_path_flags); __trace_var(event, 0/*!tsc*/, sizeof(d), &d); } } static inline void trace_not_shadow_fault(guest_l1e_t gl1e, guest_va_t va) { if ( tb_init_done ) { struct __packed { /* for PAE, guest_l1e may be 64 while guest_va may be 32; so put it first for alignment sake. */ guest_l1e_t gl1e; guest_va_t va; u32 flags; } d; u32 event; event = TRC_SHADOW_NOT_SHADOW | ((GUEST_PAGING_LEVELS-2)<<8); d.gl1e = gl1e; d.va = va; d.flags = this_cpu(trace_shadow_path_flags); __trace_var(event, 0/*!tsc*/, sizeof(d), &d); } } static inline void trace_shadow_emulate_other(u32 event, guest_va_t va, gfn_t gfn) { if ( tb_init_done ) { struct __packed { /* for PAE, guest_l1e may be 64 while guest_va may be 32; so put it first for alignment sake. */ #if GUEST_PAGING_LEVELS == 2 u32 gfn; #else u64 gfn; #endif guest_va_t va; } d; event |= ((GUEST_PAGING_LEVELS-2)<<8); d.gfn=gfn_x(gfn); d.va = va; __trace_var(event, 0/*!tsc*/, sizeof(d), &d); } } #ifdef CONFIG_HVM #if GUEST_PAGING_LEVELS == 3 static DEFINE_PER_CPU(guest_va_t,trace_emulate_initial_va); static DEFINE_PER_CPU(int,trace_extra_emulation_count); #endif static DEFINE_PER_CPU(guest_pa_t,trace_emulate_write_val); static void cf_check trace_emulate_write_val( const void *ptr, unsigned long vaddr, const void *src, unsigned int bytes) { #if GUEST_PAGING_LEVELS == 3 if ( vaddr == this_cpu(trace_emulate_initial_va) ) memcpy(&this_cpu(trace_emulate_write_val), src, bytes); else if ( (vaddr & ~(GUEST_PTE_SIZE - 1)) == this_cpu(trace_emulate_initial_va) ) { TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATE_FULL_PT); memcpy(&this_cpu(trace_emulate_write_val), (typeof(ptr))((unsigned long)ptr & ~(GUEST_PTE_SIZE - 1)), GUEST_PTE_SIZE); } #else memcpy(&this_cpu(trace_emulate_write_val), src, bytes); #endif } static inline void trace_shadow_emulate(guest_l1e_t gl1e, unsigned long va) { if ( tb_init_done ) { struct __packed { /* for PAE, guest_l1e may be 64 while guest_va may be 32; so put it first for alignment sake. 
 */
            guest_l1e_t gl1e, write_val;
            guest_va_t va;
            uint32_t flags:29, emulation_count:3;
        } d;
        u32 event;

        event = TRC_SHADOW_EMULATE | ((GUEST_PAGING_LEVELS-2)<<8);

        d.gl1e = gl1e;
        d.write_val.l1 = this_cpu(trace_emulate_write_val);
        d.va = va;
#if GUEST_PAGING_LEVELS == 3
        d.emulation_count = this_cpu(trace_extra_emulation_count);
#endif
        d.flags = this_cpu(trace_shadow_path_flags);

        __trace_var(event, 0/*!tsc*/, sizeof(d), &d);
    }
}
#endif /* CONFIG_HVM */

/**************************************************************************/
/* Entry points into the shadow code */

/* Called from pagefault handler in Xen, and from the HVM trap handlers
 * for pagefaults.  Returns 1 if this fault was an artefact of the
 * shadow code (and the guest should retry) or 0 if it is not (and the
 * fault should be handled elsewhere or passed to the guest). */
static int cf_check sh_page_fault(
    struct vcpu *v, unsigned long va, struct cpu_user_regs *regs)
{
    struct domain *d = v->domain;
    walk_t gw;
    gfn_t gfn = _gfn(0);
    mfn_t gmfn, sl1mfn = _mfn(0);
    shadow_l1e_t sl1e, *ptr_sl1e;
#ifdef CONFIG_HVM
    paddr_t gpa;
    struct sh_emulate_ctxt emul_ctxt;
    const struct x86_emulate_ops *emul_ops;
    int r;
#endif
    p2m_type_t p2mt;
    uint32_t rc, error_code;
    bool walk_ok;
    int version;
    unsigned int cpl;
    const struct npfec access = {
        .read_access = 1,
        .write_access = !!(regs->error_code & PFEC_write_access),
        .gla_valid = 1,
        .kind = npfec_kind_with_gla
    };
    const fetch_type_t ft =
        access.write_access ? ft_demand_write : ft_demand_read;
#if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
    int fast_emul = 0;
#endif

    SHADOW_PRINTK("%pv va=%#lx err=%#x, rip=%lx\n",
                  v, va, regs->error_code, regs->rip);

    perfc_incr(shadow_fault);

#if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
    /* If the faulting frame was successfully emulated on the last shadow
     * fault, it's highly likely that we'll reach the same emulation action
     * for this frame.  So try to emulate early, to avoid lock
     * acquisition. */
    if ( v->arch.paging.last_write_emul_ok &&
         v->arch.paging.shadow.last_emulated_frame == (va >> PAGE_SHIFT) )
    {
        /* Check whether the error code is 3 (write|present); otherwise fall
         * back to the normal path in case some validation is required */
        if ( regs->error_code == (PFEC_write_access | PFEC_page_present) )
        {
            fast_emul = 1;
            gmfn = _mfn(v->arch.paging.shadow.last_emulated_mfn);

#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
            /* Fall back to the slow path if we're trying to emulate
               writes to an out of sync page. */
            if ( mfn_valid(gmfn) && mfn_is_out_of_sync(gmfn) )
            {
                fast_emul = 0;
                v->arch.paging.last_write_emul_ok = 0;
                goto page_fault_slow_path;
            }
#endif /* OOS */

            perfc_incr(shadow_fault_fast_emulate);
            goto early_emulation;
        }
        else
            v->arch.paging.last_write_emul_ok = 0;
    }
#endif

    //
    // XXX: Need to think about eventually mapping superpages directly in the
    //      shadow (when possible), as opposed to splintering them into a
    //      bunch of 4K maps.
    //

#if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
    if ( (regs->error_code & PFEC_reserved_bit) )
    {
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
        /* First, need to check that this isn't an out-of-sync
         * shadow l1e.  If it is, we fall back to the slow path, which
         * will sync it up again.
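         *
         * Roughly, the check below walks the shadow linear map: it reads
         * the sl2e covering va, follows shadow_l2e_get_mfn() to the
         * shadow l1 page, and uses backpointer() to recover the guest l1
         * mfn so that mfn_is_out_of_sync() can be asked about it.  Any
         * failure along the way is treated like "out of sync" and takes
         * the slow path.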
*/ { shadow_l2e_t sl2e; mfn_t gl1mfn; if ( (get_unsafe(sl2e, (sh_linear_l2_table(v) + shadow_l2_linear_offset(va))) != 0) || !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) || !mfn_valid(gl1mfn = backpointer(mfn_to_page( shadow_l2e_get_mfn(sl2e)))) || unlikely(mfn_is_out_of_sync(gl1mfn)) ) { /* Hit the slow path as if there had been no * shadow entry at all, and let it tidy up */ ASSERT(regs->error_code & PFEC_page_present); regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present); goto page_fault_slow_path; } } #endif /* SHOPT_OUT_OF_SYNC */ /* The only reasons for reserved bits to be set in shadow entries * are the two "magic" shadow_l1e entries. */ if ( likely((get_unsafe(sl1e, (sh_linear_l1_table(v) + shadow_l1_linear_offset(va))) == 0) && sh_l1e_is_magic(sl1e)) ) { if ( sh_l1e_is_gnp(sl1e) ) { /* Not-present in a guest PT: pass to the guest as * a not-present fault (by flipping two bits). */ ASSERT(regs->error_code & PFEC_page_present); regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present); sh_reset_early_unshadow(v); perfc_incr(shadow_fault_fast_gnp); SHADOW_PRINTK("fast path not-present\n"); trace_shadow_gen(TRC_SHADOW_FAST_PROPAGATE, va); return 0; } #ifdef CONFIG_HVM /* Magic MMIO marker: extract gfn for MMIO address */ ASSERT(sh_l1e_is_mmio(sl1e)); ASSERT(is_hvm_vcpu(v)); gpa = gfn_to_gaddr(sh_l1e_mmio_get_gfn(sl1e)) | (va & ~PAGE_MASK); perfc_incr(shadow_fault_fast_mmio); SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa); sh_reset_early_unshadow(v); trace_shadow_gen(TRC_SHADOW_FAST_MMIO, va); return handle_mmio_with_translation(va, gpa >> PAGE_SHIFT, access) ? EXCRET_fault_fixed : 0; #else /* When HVM is not enabled, there shouldn't be MMIO marker */ BUG(); #endif } else { /* This should be exceptionally rare: another vcpu has fixed * the tables between the fault and our reading the l1e. * Retry and let the hardware give us the right fault next time. */ perfc_incr(shadow_fault_fast_fail); SHADOW_PRINTK("fast path false alarm!\n"); trace_shadow_gen(TRC_SHADOW_FALSE_FAST_PATH, va); return EXCRET_fault_fixed; } } #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) page_fault_slow_path: #endif #endif /* SHOPT_FAST_FAULT_PATH */ /* Detect if this page fault happened while we were already in Xen * doing a shadow operation. If that happens, the only thing we can * do is let Xen's normal fault handlers try to fix it. In any case, * a diagnostic trace of the fault will be more useful than * a BUG() when we try to take the lock again. */ if ( unlikely(paging_locked_by_me(d)) ) { printk(XENLOG_G_ERR "Recursive shadow fault: lock taken by %s\n", d->arch.paging.lock.locker_function); return 0; } cpl = is_hvm_domain(d) ? hvm_get_cpl(v) : (regs->ss & 3); rewalk: error_code = regs->error_code; /* * When CR4.SMAP is enabled, instructions which have a side effect of * accessing the system data structures (e.g. mov to %ds accessing the * LDT/GDT, or int $n accessing the IDT) are known as implicit supervisor * accesses. * * The distinction between implicit and explicit accesses form part of the * determination of access rights, controlling whether the access is * successful, or raises a #PF. * * Unfortunately, the processor throws away the implicit/explicit * distinction and does not provide it to the pagefault handler * (i.e. here.) in the #PF error code. Therefore, we must try to * reconstruct the lost state so it can be fed back into our pagewalk * through the guest tables. 
* * User mode accesses are easy to reconstruct: * * If we observe a cpl3 data fetch which was a supervisor walk, this * must have been an implicit access to a system table. * * Supervisor mode accesses are not easy: * * In principle, we could decode the instruction under %rip and have the * instruction emulator tell us if there is an implicit access. * However, this is racy with other vcpus updating the pagetable or * rewriting the instruction stream under our feet. * * Therefore, we do nothing. (If anyone has a sensible suggestion for * how to distinguish these cases, xen-devel@ is all ears...) * * As a result, one specific corner case will fail. If a guest OS with * SMAP enabled ends up mapping a system table with user mappings, sets * EFLAGS.AC to allow explicit accesses to user mappings, and implicitly * accesses the user mapping, hardware and the shadow code will disagree * on whether a #PF should be raised. * * Hardware raises #PF because implicit supervisor accesses to user * mappings are strictly disallowed. As we can't reconstruct the correct * input, the pagewalk is performed as if it were an explicit access, * which concludes that the access should have succeeded and the shadow * pagetables need modifying. The shadow pagetables are modified (to the * same value), and we re-enter the guest to re-execute the instruction, * which causes another #PF, and the vcpu livelocks, unable to make * forward progress. * * In practice, this is tolerable. No production OS will deliberately * construct this corner case (as doing so would mean that a system table * is directly accessable to userspace, and the OS is trivially rootable.) * If this corner case comes about accidentally, then a security-relevant * bug has been tickled. */ if ( !(error_code & (PFEC_insn_fetch|PFEC_user_mode)) && cpl == 3 ) error_code |= PFEC_implicit; /* The walk is done in a lock-free style, with some sanity check * postponed after grabbing paging lock later. Those delayed checks * will make sure no inconsistent mapping being translated into * shadow page table. */ version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version); smp_rmb(); walk_ok = sh_walk_guest_tables(v, va, &gw, error_code); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) regs->error_code &= ~PFEC_page_present; if ( gw.pfec & PFEC_page_present ) regs->error_code |= PFEC_page_present; #endif if ( !walk_ok ) { perfc_incr(shadow_fault_bail_real_fault); SHADOW_PRINTK("not a shadow fault\n"); sh_reset_early_unshadow(v); regs->error_code = gw.pfec & PFEC_arch_mask; goto propagate; } /* It's possible that the guest has put pagetables in memory that it has * already used for some special purpose (ioreq pages, or granted pages). * If that happens we'll have killed the guest already but it's still not * safe to propagate entries out of the guest PT so get out now. */ if ( unlikely(d->is_shutting_down && d->shutdown_code == SHUTDOWN_crash) ) { SHADOW_PRINTK("guest is shutting down\n"); goto propagate; } /* What mfn is the guest trying to access? */ gfn = guest_walk_to_gfn(&gw); gmfn = get_gfn(d, gfn, &p2mt); if ( shadow_mode_refcounts(d) && ((!p2m_is_valid(p2mt) && !p2m_is_grant(p2mt)) || (!p2m_is_mmio(p2mt) && !mfn_valid(gmfn))) ) { perfc_incr(shadow_fault_bail_bad_gfn); SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n", gfn_x(gfn), mfn_x(gmfn)); sh_reset_early_unshadow(v); put_gfn(d, gfn_x(gfn)); goto propagate; } #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) /* Remember this successful VA->GFN translation for later. 
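 *
 * The cached item is keyed by the virtual frame (va >> PAGE_SHIFT) and
 * records the gfn together with the access rights implied by the error
 * code (with PFEC_page_present set), so that a later sh_gva_to_gfn()
 * asking for compatible rights can be answered from vtlb_lookup()
 * without re-walking the guest tables.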
*/ vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn), regs->error_code | PFEC_page_present); #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */ paging_lock(d); TRACE_CLEAR_PATH_FLAGS; /* Make sure there is enough free shadow memory to build a chain of * shadow tables. (We never allocate a top-level shadow on this path, * only a 32b l1, pae l1, or 64b l3+2+1. Note that while * SH_type_l1_shadow isn't correct in the latter case, all page * tables are the same size there.) * * Preallocate shadow pages *before* removing writable accesses * otherwhise an OOS L1 might be demoted and promoted again with * writable mappings. */ if ( !shadow_prealloc(d, SH_type_l1_shadow, GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1) ) { paging_unlock(d); put_gfn(d, gfn_x(gfn)); return 0; } rc = gw_remove_write_accesses(v, va, &gw); /* First bit set: Removed write access to a page. */ if ( rc & GW_RMWR_FLUSHTLB ) { /* Write permission removal is also a hint that other gwalks * overlapping with this one may be inconsistent */ perfc_incr(shadow_rm_write_flush_tlb); smp_wmb(); atomic_inc(&d->arch.paging.shadow.gtable_dirty_version); guest_flush_tlb_mask(d, d->dirty_cpumask); } #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Second bit set: Resynced a page. Re-walk needed. */ if ( rc & GW_RMWR_REWALK ) { paging_unlock(d); put_gfn(d, gfn_x(gfn)); goto rewalk; } #endif /* OOS */ if ( !shadow_check_gwalk(v, va, &gw, version) ) { perfc_incr(shadow_inconsistent_gwalk); paging_unlock(d); put_gfn(d, gfn_x(gfn)); goto rewalk; } shadow_audit_tables(v); sh_audit_gw(v, &gw); /* Acquire the shadow. This must happen before we figure out the rights * for the shadow entry, since we might promote a page here. */ ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft); if ( unlikely(ptr_sl1e == NULL) ) { /* Couldn't get the sl1e! Since we know the guest entries * are OK, this can only have been caused by a failed * shadow_set_l*e(), which will have crashed the guest. * Get out of the fault handler immediately. */ /* Windows 7 apparently relies on the hardware to do something * it explicitly hasn't promised to do: load l3 values after * the cr3 is loaded. * In any case, in the PAE case, the ASSERT is not true; it can * happen because of actions the guest is taking. */ #if GUEST_PAGING_LEVELS == 3 v->arch.paging.mode->update_cr3(v, 0, false); #else ASSERT(d->is_shutting_down); #endif paging_unlock(d); put_gfn(d, gfn_x(gfn)); trace_shadow_gen(TRC_SHADOW_DOMF_DYING, va); return 0; } #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Always unsync when writing to L1 page tables. */ if ( sh_mfn_is_a_page_table(gmfn) && ft == ft_demand_write ) sh_unsync(v, gmfn); if ( unlikely(d->is_shutting_down && d->shutdown_code == SHUTDOWN_crash) ) { /* We might end up with a crashed domain here if * sh_remove_shadows() in a previous sh_resync() call has * failed. We cannot safely continue since some page is still * OOS but not in the hash table anymore. */ paging_unlock(d); put_gfn(d, gfn_x(gfn)); return 0; } /* Final check: if someone has synced a page, it's possible that * our l1e is stale. Compare the entries, and rewalk if necessary. 
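 *
 * (Roughly: shadow_check_gl1e() re-reads the guest l1e behind gw.l1e
 * and compares it with the copy taken during the walk; on a mismatch we
 * drop the paging lock, put the gfn and go back to the rewalk label,
 * exactly as for the gwalk version check above.)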
*/ if ( shadow_check_gl1e(v, &gw) ) { perfc_incr(shadow_inconsistent_gwalk); paging_unlock(d); put_gfn(d, gfn_x(gfn)); goto rewalk; } #endif /* OOS */ /* Calculate the shadow entry and write it */ l1e_propagate_from_guest(v, gw.l1e, gmfn, &sl1e, ft, p2mt); shadow_set_l1e(d, ptr_sl1e, sl1e, p2mt, sl1mfn); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) if ( mfn_valid(gw.l1mfn) && mfn_is_out_of_sync(gw.l1mfn) ) { /* Update the OOS snapshot. */ mfn_t snpmfn = oos_snapshot_lookup(d, gw.l1mfn); guest_l1e_t *snp; ASSERT(mfn_valid(snpmfn)); snp = map_domain_page(snpmfn); snp[guest_l1_table_offset(va)] = gw.l1e; unmap_domain_page(snp); } #endif /* OOS */ #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH /* Prefetch some more shadow entries */ sh_prefetch(v, &gw, ptr_sl1e, sl1mfn); #endif /* Need to emulate accesses to page tables */ if ( sh_mfn_is_a_page_table(gmfn) #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Unless they've been allowed to go out of sync with their shadows and we don't need to unshadow it. */ && !(mfn_is_out_of_sync(gmfn) && !(regs->error_code & PFEC_user_mode)) #endif && (ft == ft_demand_write) ) { perfc_incr(shadow_fault_emulate_write); goto emulate; } #ifdef CONFIG_HVM /* Need to hand off device-model MMIO to the device model */ if ( p2mt == p2m_mmio_dm ) { ASSERT(is_hvm_vcpu(v)); sh_audit_gw(v, &gw); gpa = guest_walk_to_gpa(&gw); SHADOW_PRINTK("mmio %#"PRIpaddr"\n", gpa); shadow_audit_tables(v); sh_reset_early_unshadow(v); paging_unlock(d); put_gfn(d, gfn_x(gfn)); perfc_incr(shadow_fault_mmio); trace_shadow_gen(TRC_SHADOW_MMIO, va); return handle_mmio_with_translation(va, gpa >> PAGE_SHIFT, access) ? EXCRET_fault_fixed : 0; } /* Ignore attempts to write to read-only memory. */ if ( p2m_is_readonly(p2mt) && (ft == ft_demand_write) ) goto emulate_readonly; /* skip over the instruction */ /* In HVM guests, we force CR0.WP always to be set, so that the * pagetables are always write-protected. If the guest thinks * CR0.WP is clear, we must emulate faulting supervisor writes to * allow the guest to write through read-only PTEs. Emulate if the * fault was a non-user write to a present page. */ if ( is_hvm_domain(d) && unlikely(!hvm_wp_enabled(v)) && regs->error_code == (PFEC_write_access|PFEC_page_present) && mfn_valid(gmfn) ) { perfc_incr(shadow_fault_emulate_wp); goto emulate; } #endif /* CONFIG_HVM */ perfc_incr(shadow_fault_fixed); d->arch.paging.log_dirty.fault_count++; sh_reset_early_unshadow(v); trace_shadow_fixup(gw.l1e, va); done: __maybe_unused; sh_audit_gw(v, &gw); SHADOW_PRINTK("fixed\n"); shadow_audit_tables(v); paging_unlock(d); put_gfn(d, gfn_x(gfn)); return EXCRET_fault_fixed; emulate: if ( !shadow_mode_refcounts(d) ) goto not_a_shadow_fault; #ifdef CONFIG_HVM /* * We do not emulate user writes. Instead we use them as a hint that the * page is no longer a page table. This behaviour differs from native, but * it seems very unlikely that any OS grants user access to page tables. */ if ( (regs->error_code & PFEC_user_mode) ) { SHADOW_PRINTK("user-mode fault to PT, unshadowing mfn %#lx\n", mfn_x(gmfn)); perfc_incr(shadow_fault_emulate_failed); shadow_remove_all_shadows(d, gmfn); trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_USER, va, gfn); goto done; } /* * Write from userspace to ro-mem needs to jump here to avoid getting * caught by user-mode page-table check above. */ emulate_readonly: /* * Unshadow if we are writing to a toplevel pagetable that is * flagged as a dying process, and that is not currently used. 
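 *
 * ("Currently used" is judged below by scanning every vcpu: for PAE
 * guests we look through each vcpu's shadow_table[] slots and compare
 * their backpointers with gmfn, while for 2- and 4-level guests it is
 * enough to compare gmfn with the vcpu's guest_table.  Only if no vcpu
 * matches do we call sh_remove_shadows() in its fast, can-fail mode.)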
*/ if ( sh_mfn_is_a_page_table(gmfn) && mfn_to_page(gmfn)->pagetable_dying ) { int used = 0; struct vcpu *tmp; for_each_vcpu(d, tmp) { #if GUEST_PAGING_LEVELS == 3 unsigned int i; for_each_shadow_table(tmp, i) { mfn_t smfn = pagetable_get_mfn( tmp->arch.paging.shadow.shadow_table[i]); if ( mfn_x(smfn) ) { used |= (mfn_to_page(smfn)->v.sh.back == mfn_x(gmfn)); if ( used ) break; } } #else /* 32 or 64 */ used = mfn_eq(pagetable_get_mfn(tmp->arch.guest_table), gmfn); #endif if ( used ) break; } if ( !used ) sh_remove_shadows(d, gmfn, 1 /* fast */, 0 /* can fail */); } /* * We don't need to hold the lock for the whole emulation; we will * take it again when we write to the pagetables. */ sh_audit_gw(v, &gw); shadow_audit_tables(v); paging_unlock(d); put_gfn(d, gfn_x(gfn)); this_cpu(trace_emulate_write_val) = 0; #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION early_emulation: #endif /* * If we are in the middle of injecting an exception or interrupt then * we should not emulate: the fault is a side effect of the processor * trying to deliver the exception (e.g. IDT/GDT accesses, pushing the * exception frame onto the stack). Furthermore it is almost * certainly the case the handler stack is currently considered to be * a page table, so we should unshadow the faulting page before * exiting. */ if ( unlikely(hvm_event_pending(v)) ) { #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION if ( fast_emul ) { perfc_incr(shadow_fault_fast_emulate_fail); v->arch.paging.last_write_emul_ok = 0; } #endif shadow_remove_all_shadows(d, gmfn); trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ, va, gfn); return EXCRET_fault_fixed; } SHADOW_PRINTK("emulate: eip=%#lx esp=%#lx\n", regs->rip, regs->rsp); emul_ops = shadow_init_emulation(&emul_ctxt, regs, GUEST_PTE_SIZE); r = x86_emulate(&emul_ctxt.ctxt, emul_ops); if ( r == X86EMUL_EXCEPTION ) { /* * This emulation covers writes to shadow pagetables. We tolerate #PF * (from accesses spanning pages, concurrent paging updated from * vcpus, etc) and #GP[0]/#SS[0] (from segmentation errors). Anything * else is an emulation bug, or a guest playing with the instruction * stream under Xen's feet. */ if ( emul_ctxt.ctxt.event.type == X86_EVENTTYPE_HW_EXCEPTION && ((emul_ctxt.ctxt.event.vector == X86_EXC_PF) || (((emul_ctxt.ctxt.event.vector == X86_EXC_GP) || (emul_ctxt.ctxt.event.vector == X86_EXC_SS)) && emul_ctxt.ctxt.event.error_code == 0)) ) hvm_inject_event(&emul_ctxt.ctxt.event); else { SHADOW_PRINTK( "Unexpected event (type %u, vector %#x) from emulation\n", emul_ctxt.ctxt.event.type, emul_ctxt.ctxt.event.vector); r = X86EMUL_UNHANDLEABLE; } } /* * NB. We do not unshadow on X86EMUL_EXCEPTION. It's not clear that it * would be a good unshadow hint. If we *do* decide to unshadow-on-fault * then it must be 'failable': we cannot require the unshadow to succeed. */ if ( r == X86EMUL_UNHANDLEABLE || r == X86EMUL_UNIMPLEMENTED ) { perfc_incr(shadow_fault_emulate_failed); #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION if ( fast_emul ) { perfc_incr(shadow_fault_fast_emulate_fail); v->arch.paging.last_write_emul_ok = 0; } #endif SHADOW_PRINTK("emulator failure (rc=%d), unshadowing mfn %#lx\n", r, mfn_x(gmfn)); /* If this is actually a page table, then we have a bug, and need * to support more operations in the emulator. More likely, * though, this is a hint that this page should not be shadowed. 
 */
        shadow_remove_all_shadows(d, gmfn);

        trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED,
                                   va, gfn);
        goto emulate_done;
    }

#if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
    /* Record the successful emulation, as a heuristic to accelerate the
     * next fault on the same frame.  But be careful to check that the
     * frame is still a page table: if a write emulation has triggered an
     * unshadow, recovering r/w permission normally requires a re-sync
     * with the guest page table, and recording the frame here would skip
     * that propagation and cause unexpected extra shadow faults. */
    if ( (r == X86EMUL_OKAY) && sh_mfn_is_a_page_table(gmfn) )
    {
        if ( !fast_emul )
        {
            v->arch.paging.shadow.last_emulated_frame = va >> PAGE_SHIFT;
            v->arch.paging.shadow.last_emulated_mfn = mfn_x(gmfn);
            v->arch.paging.last_write_emul_ok = 1;
        }
    }
    else if ( fast_emul )
        v->arch.paging.last_write_emul_ok = 0;
#endif

    if ( emul_ctxt.ctxt.retire.singlestep )
        hvm_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);

#if GUEST_PAGING_LEVELS == 3 /* PAE guest */
    /*
     * If there are no pending actions, emulate up to four extra instructions
     * in the hope of catching the "second half" of a 64-bit pagetable write.
     */
    if ( r == X86EMUL_OKAY && !emul_ctxt.ctxt.retire.raw )
    {
        int i, emulation_count = 0;

        this_cpu(trace_emulate_initial_va) = va;

        for ( i = 0; i < 4; i++ )
        {
            shadow_continue_emulation(&emul_ctxt, regs);
            v->arch.paging.last_write_was_pt = 0;
            r = x86_emulate(&emul_ctxt.ctxt, emul_ops);

            /*
             * Only continue the search for the second half if there are no
             * exceptions or pending actions.  Otherwise, give up and re-enter
             * the guest.
             */
            if ( r == X86EMUL_OKAY && !emul_ctxt.ctxt.retire.raw )
            {
                emulation_count++;
                if ( v->arch.paging.last_write_was_pt )
                {
                    perfc_incr(shadow_em_ex_pt);
                    TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_2ND_PT_WRITTEN);
                    break; /* Don't emulate past the other half of the write */
                }
                else
                    perfc_incr(shadow_em_ex_non_pt);
            }
            else
            {
                perfc_incr(shadow_em_ex_fail);
                TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_LAST_FAILED);

                if ( emul_ctxt.ctxt.retire.singlestep )
                    hvm_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);

                break; /* Don't emulate again if we failed! */
            }
        }
        this_cpu(trace_extra_emulation_count) = emulation_count;
    }
#endif /* PAE guest */

    trace_shadow_emulate(gw.l1e, va);
 emulate_done:
    SHADOW_PRINTK("emulated\n");
    return EXCRET_fault_fixed;
#endif /* CONFIG_HVM */

 not_a_shadow_fault:
    sh_audit_gw(v, &gw);
    SHADOW_PRINTK("not a shadow fault\n");
    shadow_audit_tables(v);
    sh_reset_early_unshadow(v);
    paging_unlock(d);
    put_gfn(d, gfn_x(gfn));

 propagate:
    trace_not_shadow_fault(gw.l1e, va);

    return 0;
}

/*
 * Called when the guest requests an invlpg.  Returns true if the invlpg
 * instruction should be issued on the hardware, or false if it's safe not
 * to do so.
 */
static bool cf_check sh_invlpg(struct vcpu *v, unsigned long linear)
{
    mfn_t sl1mfn;
    shadow_l2e_t sl2e;

    perfc_incr(shadow_invlpg);

#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
    /* No longer safe to use cached gva->gfn translations */
    vtlb_flush(v);
#endif

#if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
    v->arch.paging.last_write_emul_ok = 0;
#endif

    /* First check that we can safely read the shadow l2e.  SMP/PAE linux can
     * run as high as 6% of invlpg calls where we haven't shadowed the l2
     * yet.
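     *
     * In other words, before dereferencing sh_linear_l2_table() we make
     * sure every higher level of the shadow linear map is present (the
     * l4e and l3e for 4-level shadows, or the PAE l3table entry for
     * 3-level ones), and even then the l2e is read with get_unsafe()
     * because we do not hold the paging lock and the shadow could
     * disappear under our feet.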
*/ #if SHADOW_PAGING_LEVELS == 4 { shadow_l3e_t sl3e; if ( !(shadow_l4e_get_flags( sh_linear_l4_table(v)[shadow_l4_linear_offset(linear)]) & _PAGE_PRESENT) ) return false; /* This must still be a copy-from-unsafe because we don't have the * paging lock, and the higher-level shadows might disappear * under our feet. */ if ( get_unsafe(sl3e, (sh_linear_l3_table(v) + shadow_l3_linear_offset(linear))) != 0 ) { perfc_incr(shadow_invlpg_fault); return false; } if ( !(shadow_l3e_get_flags(sl3e) & _PAGE_PRESENT) ) return false; } #elif !defined(CONFIG_HVM) return false; #else /* SHADOW_PAGING_LEVELS == 3 */ if ( !(l3e_get_flags(v->arch.paging.shadow.l3table[shadow_l3_linear_offset(linear)]) & _PAGE_PRESENT) ) // no need to flush anything if there's no SL2... return false; #endif /* This must still be a copy-from-unsafe because we don't have the shadow * lock, and the higher-level shadows might disappear under our feet. */ if ( get_unsafe(sl2e, (sh_linear_l2_table(v) + shadow_l2_linear_offset(linear))) != 0 ) { perfc_incr(shadow_invlpg_fault); return false; } // If there's nothing shadowed for this particular sl2e, then // there is no need to do an invlpg, either... // if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) ) return false; // Check to see if the SL2 is a splintered superpage... // If so, then we'll need to flush the entire TLB (because that's // easier than invalidating all of the individual 4K pages). // sl1mfn = shadow_l2e_get_mfn(sl2e); if ( mfn_to_page(sl1mfn)->u.sh.type == SH_type_fl1_shadow ) { sh_flush_local(v->domain); return false; } #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Check to see if the SL1 is out of sync. */ { struct domain *d = v->domain; mfn_t gl1mfn = backpointer(mfn_to_page(sl1mfn)); struct page_info *pg = mfn_to_page(gl1mfn); if ( mfn_valid(gl1mfn) && page_is_out_of_sync(pg) ) { /* The test above may give false positives, since we don't * hold the paging lock yet. Check again with the lock held. */ paging_lock(d); /* This must still be a copy-from-unsafe because we didn't * have the paging lock last time we checked, and the * higher-level shadows might have disappeared under our * feet. */ if ( get_unsafe(sl2e, (sh_linear_l2_table(v) + shadow_l2_linear_offset(linear))) != 0 ) { perfc_incr(shadow_invlpg_fault); paging_unlock(d); return false; } if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) ) { paging_unlock(d); return false; } sl1mfn = shadow_l2e_get_mfn(sl2e); gl1mfn = backpointer(mfn_to_page(sl1mfn)); pg = mfn_to_page(gl1mfn); if ( likely(sh_mfn_is_a_page_table(gl1mfn) && page_is_out_of_sync(pg) ) ) { shadow_l1e_t *sl1; sl1 = sh_linear_l1_table(v) + shadow_l1_linear_offset(linear); /* Remove the shadow entry that maps this VA */ shadow_set_l1e(d, sl1, shadow_l1e_empty(), p2m_invalid, sl1mfn); } paging_unlock(d); /* Need the invlpg, to pick up the disappeareance of the sl1e */ return true; } } #endif return true; } #ifdef CONFIG_HVM static unsigned long cf_check sh_gva_to_gfn( struct vcpu *v, struct p2m_domain *p2m, unsigned long va, uint32_t *pfec) /* Called to translate a guest virtual address to what the *guest* * pagetables would map it to. 
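 *
 * The pfec argument is both an input (which kind of access to check,
 * expressed as page-fault error code bits) and an output: on a failed
 * walk the walker's own pfec is copied back and INVALID_GFN is
 * returned, so callers can tell why the translation failed.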
*/ { walk_t gw; gfn_t gfn; bool walk_ok; #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) /* Check the vTLB cache first */ unsigned long vtlb_gfn = vtlb_lookup(v, va, *pfec); if ( vtlb_gfn != gfn_x(INVALID_GFN) ) return vtlb_gfn; #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */ if ( !(walk_ok = sh_walk_guest_tables(v, va, &gw, *pfec)) ) { *pfec = gw.pfec; return gfn_x(INVALID_GFN); } gfn = guest_walk_to_gfn(&gw); #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) /* Remember this successful VA->GFN translation for later. */ vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn), *pfec); #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */ return gfn_x(gfn); } #endif /* CONFIG_HVM */ static inline void sh_update_linear_entries(struct vcpu *v) /* Sync up all the linear mappings for this vcpu's pagetables */ { struct domain *d = v->domain; /* * Linear pagetables in HVM guests * ------------------------------- * * For HVM guests, the linear pagetables are installed in the monitor * tables (since we can't put them in the shadow). Shadow linear * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START, * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for * a linear pagetable of the monitor tables themselves. We have * the same issue of having to re-copy PAE l3 entries whevever we use * PAE shadows. * * Because HVM guests run on the same monitor tables regardless of the * shadow tables in use, the linear mapping of the shadow tables has to * be updated every time v->arch.paging.shadow.shadow_table changes. */ /* Don't try to update the monitor table if it doesn't exist */ if ( !shadow_mode_external(d) || pagetable_get_pfn(v->arch.hvm.monitor_table) == 0 ) return; #if !defined(CONFIG_HVM) return; #elif SHADOW_PAGING_LEVELS == 4 /* For HVM, just need to update the l4e that points to the shadow l4. */ /* Use the linear map if we can; otherwise make a new mapping */ if ( v == current ) { __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] = l4e_from_pfn( pagetable_get_pfn(v->arch.paging.shadow.shadow_table[0]), __PAGE_HYPERVISOR_RW); } else { l4_pgentry_t *ml4e; ml4e = map_domain_page(pagetable_get_mfn(v->arch.hvm.monitor_table)); ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] = l4e_from_pfn( pagetable_get_pfn(v->arch.paging.shadow.shadow_table[0]), __PAGE_HYPERVISOR_RW); unmap_domain_page(ml4e); } #elif SHADOW_PAGING_LEVELS == 3 /* * HVM: To give ourselves a linear map of the shadows, we need to * extend a PAE shadow to 4 levels. We do this by having a monitor * l3 in slot 0 of the monitor l4 table, and copying the PAE l3 * entries into it. Then, by having the monitor l4e for shadow * pagetables also point to the monitor l4, we can use it to access * the shadows. */ { /* Install copies of the shadow l3es into the monitor l2 table * that maps SH_LINEAR_PT_VIRT_START. 
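 *
 * Roughly: find the l2 page of the monitor table that backs
 * SH_LINEAR_PT_VIRT_START (via the linear map if v == current,
 * otherwise by mapping the monitor l4 and l3 by hand), then for each
 * of the four PAE slots write either an l2e pointing at the shadow l2
 * referenced by the corresponding sl3e, or an empty l2e if that sl3e
 * is not present.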
*/ shadow_l3e_t *sl3e; l2_pgentry_t *ml2e; int i; /* Use linear mappings if we can; otherwise make new mappings */ if ( v == current ) ml2e = __linear_l2_table + l2_linear_offset(SH_LINEAR_PT_VIRT_START); else { mfn_t l3mfn, l2mfn; l4_pgentry_t *ml4e; l3_pgentry_t *ml3e; int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START); ml4e = map_domain_page(pagetable_get_mfn(v->arch.hvm.monitor_table)); ASSERT(l4e_get_flags(ml4e[linear_slot]) & _PAGE_PRESENT); l3mfn = l4e_get_mfn(ml4e[linear_slot]); ml3e = map_domain_page(l3mfn); unmap_domain_page(ml4e); ASSERT(l3e_get_flags(ml3e[0]) & _PAGE_PRESENT); l2mfn = l3e_get_mfn(ml3e[0]); ml2e = map_domain_page(l2mfn); unmap_domain_page(ml3e); } /* Shadow l3 tables are made up by sh_update_cr3 */ sl3e = v->arch.paging.shadow.l3table; for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ ) { ml2e[i] = (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT) ? l2e_from_mfn(shadow_l3e_get_mfn(sl3e[i]), __PAGE_HYPERVISOR_RW) : l2e_empty(); } if ( v != current ) unmap_domain_page(ml2e); } #else #error this should not happen #endif /* * Having modified the linear pagetable mapping, flush local host TLBs. * This was not needed when vmenter/vmexit always had the side effect of * flushing host TLBs but, with ASIDs, it is possible to finish this CR3 * update, vmenter the guest, vmexit due to a page fault, without an * intervening host TLB flush. Then the page fault code could use the * linear pagetable to read a top-level shadow page table entry. But, * without this change, it would fetch the wrong value due to a stale TLB. */ sh_flush_local(d); } static void cf_check sh_update_cr3(struct vcpu *v, int do_locking, bool noflush) /* Updates vcpu->arch.cr3 after the guest has changed CR3. * Paravirtual guests should set v->arch.guest_table (and guest_table_user, * if appropriate). * HVM guests should also make sure hvm_get_guest_cntl_reg(v, 3) works; * this function will call hvm_update_guest_cr(v, 3) to tell them where the * shadow tables are. * If do_locking != 0, assume we are being called from outside the * shadow code, and must take and release the paging lock; otherwise * that is the caller's responsibility. */ { struct domain *d = v->domain; mfn_t gmfn; #if GUEST_PAGING_LEVELS == 3 const guest_l3e_t *gl3e; unsigned int i, guest_idx; #endif /* Don't do anything on an uninitialised vcpu */ if ( !is_hvm_domain(d) && !v->is_initialised ) { ASSERT(v->arch.cr3 == 0); return; } if ( do_locking ) paging_lock(v->domain); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Need to resync all the shadow entries on a TLB flush. Resync * current vcpus OOS pages before switching to the new shadow * tables so that the VA hint is still valid. 
*/ shadow_resync_current_vcpu(v); #endif ASSERT(paging_locked_by_me(v->domain)); ASSERT(v->arch.paging.mode); //// //// vcpu->arch.guest_table is already set //// #ifndef NDEBUG /* Double-check that the HVM code has sent us a sane guest_table */ if ( is_hvm_domain(d) ) { ASSERT(shadow_mode_external(d)); if ( hvm_paging_enabled(v) ) ASSERT(pagetable_get_pfn(v->arch.guest_table)); else ASSERT(v->arch.guest_table.pfn == d->arch.paging.shadow.unpaged_pagetable.pfn); } #endif SHADOW_PRINTK("%pv guest_table=%"PRI_mfn"\n", v, (unsigned long)pagetable_get_pfn(v->arch.guest_table)); #if GUEST_PAGING_LEVELS == 4 if ( !(v->arch.flags & TF_kernel_mode) ) gmfn = pagetable_get_mfn(v->arch.guest_table_user); else #endif gmfn = pagetable_get_mfn(v->arch.guest_table); #if GUEST_PAGING_LEVELS == 3 /* * On PAE guests we don't use a mapping of the guest's own top-level * table. We cache the current state of that table and shadow that, * until the next CR3 write makes us refresh our cache. */ ASSERT(shadow_mode_external(d)); /* * Find where in the page the l3 table is, but ignore the low 2 bits of * guest_idx -- they are really just cache control. */ guest_idx = guest_index((void *)v->arch.hvm.guest_cr[3]) & ~3; gl3e = ((guest_l3e_t *)map_domain_page(gmfn)) + guest_idx; for ( i = 0; i < 4 ; i++ ) v->arch.paging.shadow.gl3e[i] = gl3e[i]; unmap_domain_page(gl3e); #endif //// //// vcpu->arch.paging.shadow.shadow_table[] //// /* We revoke write access to the new guest toplevel page(s) before we * replace the old shadow pagetable(s), so that we can safely use the * (old) shadow linear maps in the writeable mapping heuristics. */ #if GUEST_PAGING_LEVELS == 4 if ( sh_remove_write_access(d, gmfn, 4, 0) != 0 ) guest_flush_tlb_mask(d, d->dirty_cpumask); sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow, sh_make_shadow); if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) ) { ASSERT(d->is_dying || d->is_shutting_down); return; } if ( !shadow_mode_external(d) && !is_pv_32bit_domain(d) ) { mfn_t smfn = pagetable_get_mfn(v->arch.paging.shadow.shadow_table[0]); if ( !(v->arch.flags & TF_kernel_mode) && VM_ASSIST(d, m2p_strict) ) zap_ro_mpt(smfn); else if ( (v->arch.flags & TF_kernel_mode) && !VM_ASSIST(d, m2p_strict) ) fill_ro_mpt(smfn); } #elif GUEST_PAGING_LEVELS == 3 /* PAE guests have four shadow_table entries, based on the * current values of the guest's four l3es. */ { int flush = 0; gfn_t gl2gfn; mfn_t gl2mfn; p2m_type_t p2mt; gl3e = v->arch.paging.shadow.gl3e; /* First, make all four entries read-only. */ for ( i = 0; i < 4; i++ ) { if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT ) { gl2gfn = guest_l3e_get_gfn(gl3e[i]); gl2mfn = get_gfn_query_unlocked(d, gfn_x(gl2gfn), &p2mt); if ( p2m_is_ram(p2mt) ) flush |= sh_remove_write_access(d, gl2mfn, 2, 0); } } if ( flush ) guest_flush_tlb_mask(d, d->dirty_cpumask); /* Now install the new shadows. 
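 *
 * For each of the four l3 slots: a present gl3e whose l2 page is RAM
 * gets an SH_type_l2_shadow installed via sh_set_toplevel_shadow();
 * anything else (not present, or not RAM) is installed as INVALID_MFN
 * instead, which leaves that shadow_table slot empty.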
*/ for ( i = 0; i < 4; i++ ) { if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT ) { gl2gfn = guest_l3e_get_gfn(gl3e[i]); gl2mfn = get_gfn_query_unlocked(d, gfn_x(gl2gfn), &p2mt); if ( p2m_is_ram(p2mt) ) sh_set_toplevel_shadow(v, i, gl2mfn, SH_type_l2_shadow, sh_make_shadow); else sh_set_toplevel_shadow(v, i, INVALID_MFN, 0, sh_make_shadow); } else sh_set_toplevel_shadow(v, i, INVALID_MFN, 0, sh_make_shadow); } } #elif GUEST_PAGING_LEVELS == 2 if ( sh_remove_write_access(d, gmfn, 2, 0) != 0 ) guest_flush_tlb_mask(d, d->dirty_cpumask); sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow, sh_make_shadow); if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) ) { ASSERT(d->is_dying || d->is_shutting_down); return; } #else #error This should never happen #endif /// /// v->arch.paging.shadow.l3table /// #if SHADOW_PAGING_LEVELS == 3 { mfn_t smfn = pagetable_get_mfn(v->arch.paging.shadow.shadow_table[0]); unsigned int i; for_each_shadow_table(v, i) { #if GUEST_PAGING_LEVELS == 2 /* 2-on-3: make a PAE l3 that points at the four-page l2 */ if ( i != 0 ) smfn = sh_next_page(smfn); #else /* 3-on-3: make a PAE l3 that points at the four l2 pages */ smfn = pagetable_get_mfn(v->arch.paging.shadow.shadow_table[i]); #endif v->arch.paging.shadow.l3table[i] = (mfn_x(smfn) == 0) ? shadow_l3e_empty() : shadow_l3e_from_mfn(smfn, _PAGE_PRESENT); } } #endif /* SHADOW_PAGING_LEVELS == 3 */ /// /// v->arch.cr3 /// if ( shadow_mode_external(d) ) { make_cr3(v, pagetable_get_mfn(v->arch.hvm.monitor_table)); } #if SHADOW_PAGING_LEVELS == 4 else // not shadow_mode_external... { /* We don't support PV except guest == shadow == config levels */ BUILD_BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS); /* Just use the shadow top-level directly */ make_cr3(v, pagetable_get_mfn(v->arch.paging.shadow.shadow_table[0])); } #endif /// /// v->arch.hvm.hw_cr[3] /// if ( shadow_mode_external(d) ) { ASSERT(is_hvm_domain(d)); #if SHADOW_PAGING_LEVELS == 3 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated */ v->arch.hvm.hw_cr[3] = virt_to_maddr(&v->arch.paging.shadow.l3table); #else /* 4-on-4: Just use the shadow top-level directly */ v->arch.hvm.hw_cr[3] = pagetable_get_paddr(v->arch.paging.shadow.shadow_table[0]); #endif hvm_update_guest_cr3(v, noflush); } /* Fix up the linear pagetable mappings */ sh_update_linear_entries(v); #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) /* No longer safe to use cached gva->gfn translations */ vtlb_flush(v); #endif #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION v->arch.paging.last_write_emul_ok = 0; #endif #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Need to resync all the shadow entries on a TLB flush. We only * update the shadows, leaving the pages out of sync. Also, we try * to skip synchronization of shadows not mapped in the new * tables. 
*/ shadow_sync_other_vcpus(v); #endif /* Release the lock, if we took it (otherwise it's the caller's problem) */ if ( do_locking ) paging_unlock(v->domain); } /**************************************************************************/ /* Functions to revoke guest rights */ #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC int sh_rm_write_access_from_sl1p(struct domain *d, mfn_t gmfn, mfn_t smfn, unsigned long off) { #ifdef CONFIG_HVM struct vcpu *curr = current; #endif int r; shadow_l1e_t *sl1p, sl1e; struct page_info *sp; ASSERT(mfn_valid(gmfn)); ASSERT(mfn_valid(smfn)); #ifdef CONFIG_HVM /* Remember if we've been told that this process is being torn down */ if ( curr->domain == d && is_hvm_domain(d) ) curr->arch.paging.shadow.pagetable_dying = mfn_to_page(gmfn)->pagetable_dying; #endif sp = mfn_to_page(smfn); if ( ((sp->count_info & PGC_count_mask) != 0) || (sp->u.sh.type != SH_type_l1_shadow && sp->u.sh.type != SH_type_fl1_shadow) ) goto fail; sl1p = map_domain_page(smfn); sl1p += off; sl1e = *sl1p; if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW)) != (_PAGE_PRESENT|_PAGE_RW)) || !mfn_eq(shadow_l1e_get_mfn(sl1e), gmfn) ) { unmap_domain_page(sl1p); goto fail; } /* Found it! Need to remove its write permissions. */ sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW); r = shadow_set_l1e(d, sl1p, sl1e, p2m_ram_rw, smfn); ASSERT( !(r & SHADOW_SET_ERROR) ); unmap_domain_page(sl1p); perfc_incr(shadow_writeable_h_7); return 1; fail: perfc_incr(shadow_writeable_h_8); return 0; } #endif /* OOS */ #if defined(CONFIG_HVM) && (SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC) static int cf_check sh_guess_wrmap( struct vcpu *v, unsigned long vaddr, mfn_t gmfn) /* Look up this vaddr in the current shadow and see if it's a writeable * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */ { struct domain *d = v->domain; shadow_l1e_t sl1e, *sl1p; shadow_l2e_t *sl2p; shadow_l3e_t *sl3p; #if SHADOW_PAGING_LEVELS >= 4 shadow_l4e_t *sl4p; #endif mfn_t sl1mfn; int r; /* Carefully look in the shadow linear map for the l1e we expect */ #if SHADOW_PAGING_LEVELS >= 4 /* * Non-external guests (i.e. PV) have a SHADOW_LINEAR mapping from the * moment their shadows are created. External guests (i.e. HVM) may not, * but always have a regular linear mapping, which we can use to observe * whether a SHADOW_LINEAR mapping is present. */ if ( paging_mode_external(d) ) { sl4p = __linear_l4_table + l4_linear_offset(SH_LINEAR_PT_VIRT_START); if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) ) return 0; } sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr); if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) ) return 0; sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr); if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) ) return 0; #else /* SHADOW_PAGING_LEVELS == 3 */ sl3p = ((shadow_l3e_t *) v->arch.paging.shadow.l3table) + shadow_l3_linear_offset(vaddr); if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) ) return 0; #endif sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr); if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) ) return 0; sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr); sl1e = *sl1p; if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW)) != (_PAGE_PRESENT|_PAGE_RW)) || !mfn_eq(shadow_l1e_get_mfn(sl1e), gmfn) ) return 0; /* Found it! Need to remove its write permissions. 
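 *
 * The fixup is the same pattern as sh_rm_write_access_from_sl1p()
 * above: clear _PAGE_RW from the copied sl1e and write it back with
 * shadow_set_l1e(); if that reports SHADOW_SET_ERROR (e.g. for a
 * grant-mapped page, per the comment below) the guess simply fails.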
*/ sl1mfn = shadow_l2e_get_mfn(*sl2p); sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW); r = shadow_set_l1e(d, sl1p, sl1e, p2m_ram_rw, sl1mfn); if ( r & SHADOW_SET_ERROR ) { /* Can only currently happen if we found a grant-mapped * page. Just make the guess fail. */ return 0; } TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP_GUESS_FOUND); return 1; } #endif int cf_check sh_rm_write_access_from_l1( struct domain *d, mfn_t sl1mfn, mfn_t readonly_mfn) /* Excises all writeable mappings to readonly_mfn from this l1 shadow table */ { shadow_l1e_t *sl1e; int done = 0; #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC struct vcpu *curr = current; mfn_t base_sl1mfn = sl1mfn; /* Because sl1mfn changes in the foreach */ #endif FOREACH_PRESENT_L1E(sl1mfn, sl1e, NULL, done, { if ( (shadow_l1e_get_flags(*sl1e) & _PAGE_RW) && mfn_eq(shadow_l1e_get_mfn(*sl1e), readonly_mfn) ) { shadow_l1e_t ro_sl1e = shadow_l1e_remove_flags(*sl1e, _PAGE_RW); shadow_set_l1e(d, sl1e, ro_sl1e, p2m_ram_rw, sl1mfn); #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC /* Remember the last shadow that we shot a writeable mapping in */ if ( curr->domain == d ) curr->arch.paging.shadow.last_writeable_pte_smfn = mfn_x(base_sl1mfn); #endif if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info & PGT_count_mask) == 0 ) /* This breaks us cleanly out of the FOREACH macro */ done = 1; } }); return done; } int cf_check sh_rm_mappings_from_l1( struct domain *d, mfn_t sl1mfn, mfn_t target_mfn) /* Excises all mappings to guest frame from this shadow l1 table */ { shadow_l1e_t *sl1e; int done = 0; FOREACH_PRESENT_L1E(sl1mfn, sl1e, NULL, done, { if ( mfn_eq(shadow_l1e_get_mfn(*sl1e), target_mfn) ) { shadow_set_l1e(d, sl1e, shadow_l1e_empty(), p2m_invalid, sl1mfn); if ( sh_check_page_has_no_refs(mfn_to_page(target_mfn)) ) /* This breaks us cleanly out of the FOREACH macro */ done = 1; } }); return done; } /**************************************************************************/ /* Functions to excise all pointers to shadows from higher-level shadows. */ void sh_clear_shadow_entry(struct domain *d, void *ep, mfn_t smfn) /* Blank out a single shadow entry */ { switch ( mfn_to_page(smfn)->u.sh.type ) { case SH_type_l1_shadow: shadow_set_l1e(d, ep, shadow_l1e_empty(), p2m_invalid, smfn); break; case SH_type_l2_shadow: #if GUEST_PAGING_LEVELS >= 4 && defined(CONFIG_PV32) case SH_type_l2h_shadow: #endif shadow_set_l2e(d, ep, shadow_l2e_empty(), smfn); break; #if GUEST_PAGING_LEVELS >= 4 case SH_type_l3_shadow: shadow_set_l3e(d, ep, shadow_l3e_empty(), smfn); break; case SH_type_l4_shadow: shadow_set_l4e(d, ep, shadow_l4e_empty(), smfn); break; #endif default: BUG(); /* Called with the wrong kind of shadow. 
*/ } } int cf_check sh_remove_l1_shadow(struct domain *d, mfn_t sl2mfn, mfn_t sl1mfn) /* Remove all mappings of this l1 shadow from this l2 shadow */ { shadow_l2e_t *sl2e; int done = 0; FOREACH_PRESENT_L2E(sl2mfn, sl2e, NULL, done, d, { if ( mfn_eq(shadow_l2e_get_mfn(*sl2e), sl1mfn) ) { shadow_set_l2e(d, sl2e, shadow_l2e_empty(), sl2mfn); if ( mfn_to_page(sl1mfn)->u.sh.type == 0 ) /* This breaks us cleanly out of the FOREACH macro */ done = 1; } }); return done; } #if GUEST_PAGING_LEVELS >= 4 int cf_check sh_remove_l2_shadow(struct domain *d, mfn_t sl3mfn, mfn_t sl2mfn) /* Remove all mappings of this l2 shadow from this l3 shadow */ { shadow_l3e_t *sl3e; int done = 0; FOREACH_PRESENT_L3E(sl3mfn, sl3e, NULL, done, { if ( mfn_eq(shadow_l3e_get_mfn(*sl3e), sl2mfn) ) { shadow_set_l3e(d, sl3e, shadow_l3e_empty(), sl3mfn); if ( mfn_to_page(sl2mfn)->u.sh.type == 0 ) /* This breaks us cleanly out of the FOREACH macro */ done = 1; } }); return done; } int cf_check sh_remove_l3_shadow(struct domain *d, mfn_t sl4mfn, mfn_t sl3mfn) /* Remove all mappings of this l3 shadow from this l4 shadow */ { shadow_l4e_t *sl4e; int done = 0; FOREACH_PRESENT_L4E(sl4mfn, sl4e, NULL, done, d, { if ( mfn_eq(shadow_l4e_get_mfn(*sl4e), sl3mfn) ) { shadow_set_l4e(d, sl4e, shadow_l4e_empty(), sl4mfn); if ( mfn_to_page(sl3mfn)->u.sh.type == 0 ) /* This breaks us cleanly out of the FOREACH macro */ done = 1; } }); return done; } #endif /* 64bit guest */ #ifdef CONFIG_HVM /**************************************************************************/ /* Function for the guest to inform us that a process is being torn * down. We remember that as a hint to unshadow its pagetables soon, * and in the meantime we unhook its top-level user-mode entries. */ #if GUEST_PAGING_LEVELS == 3 static void cf_check sh_pagetable_dying(paddr_t gpa) { struct vcpu *v = current; struct domain *d = v->domain; unsigned int i; int flush = 0; int fast_path = 0; paddr_t gcr3 = 0; p2m_type_t p2mt; char *gl3pa = NULL; guest_l3e_t *gl3e = NULL; unsigned long l3gfn; mfn_t l3mfn; ASSERT(is_hvm_domain(d)); gcr3 = v->arch.hvm.guest_cr[3]; /* fast path: the pagetable belongs to the current context */ if ( gcr3 == gpa ) fast_path = 1; l3gfn = gpa >> PAGE_SHIFT; l3mfn = get_gfn_query(d, _gfn(l3gfn), &p2mt); if ( !mfn_valid(l3mfn) || !p2m_is_ram(p2mt) ) { printk(XENLOG_DEBUG "sh_pagetable_dying: gpa not valid %"PRIpaddr"\n", gpa); goto out_put_gfn; } paging_lock(d); if ( !fast_path ) { gl3pa = map_domain_page(l3mfn); gl3e = (guest_l3e_t *)(gl3pa + ((unsigned long)gpa & ~PAGE_MASK)); } for_each_shadow_table(v, i) { mfn_t smfn, gmfn; if ( fast_path ) { if ( pagetable_is_null(v->arch.paging.shadow.shadow_table[i]) ) smfn = INVALID_MFN; else smfn = pagetable_get_mfn(v->arch.paging.shadow.shadow_table[i]); } else { /* retrieving the l2s */ gmfn = get_gfn_query_unlocked(d, gfn_x(guest_l3e_get_gfn(gl3e[i])), &p2mt); smfn = unlikely(mfn_eq(gmfn, INVALID_MFN)) ? INVALID_MFN : shadow_hash_lookup(d, mfn_x(gmfn), SH_type_l2_pae_shadow); } if ( !mfn_eq(smfn, INVALID_MFN) ) { gmfn = _mfn(mfn_to_page(smfn)->v.sh.back); mfn_to_page(gmfn)->pagetable_dying = true; shadow_unhook_mappings(d, smfn, 1/* user pages only */); flush = 1; } } if ( flush ) guest_flush_tlb_mask(d, d->dirty_cpumask); /* Remember that we've seen the guest use this interface, so we * can rely on it using it in future, instead of guessing at * when processes are being torn down. 
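 *
 * (Roughly, the two flags set below are that hint: the per-domain
 * pagetable_dying_op records that the guest uses the pagetable-dying
 * interface at all, and the per-vcpu pagetable_dying bit feeds later
 * unshadow decisions in the fault path.)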
*/ d->arch.paging.shadow.pagetable_dying_op = 1; v->arch.paging.shadow.pagetable_dying = 1; if ( !fast_path ) unmap_domain_page(gl3pa); paging_unlock(d); out_put_gfn: put_gfn(d, l3gfn); } #else static void cf_check sh_pagetable_dying(paddr_t gpa) { struct vcpu *v = current; struct domain *d = v->domain; mfn_t smfn, gmfn; p2m_type_t p2mt; ASSERT(is_hvm_domain(d)); gmfn = get_gfn_query(d, _gfn(gpa >> PAGE_SHIFT), &p2mt); paging_lock(d); #if GUEST_PAGING_LEVELS == 2 smfn = shadow_hash_lookup(d, mfn_x(gmfn), SH_type_l2_32_shadow); #else smfn = shadow_hash_lookup(d, mfn_x(gmfn), SH_type_l4_64_shadow); #endif if ( !mfn_eq(smfn, INVALID_MFN) ) { mfn_to_page(gmfn)->pagetable_dying = true; shadow_unhook_mappings(d, smfn, 1/* user pages only */); /* Now flush the TLB: we removed toplevel mappings. */ guest_flush_tlb_mask(d, d->dirty_cpumask); } /* Remember that we've seen the guest use this interface, so we * can rely on it using it in future, instead of guessing at * when processes are being torn down. */ d->arch.paging.shadow.pagetable_dying_op = 1; v->arch.paging.shadow.pagetable_dying = 1; paging_unlock(d); put_gfn(d, gpa >> PAGE_SHIFT); } #endif #endif /* CONFIG_HVM */ /**************************************************************************/ /* Audit tools */ #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES #define AUDIT_FAIL(_level, _fmt, _a...) do { \ printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \ "gl" #_level "mfn = %" PRI_mfn \ " sl" #_level "mfn = %" PRI_mfn \ " &gl" #_level "e = %p &sl" #_level "e = %p" \ " gl" #_level "e = %" SH_PRI_gpte \ " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \ GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \ _level, guest_index(gl ## _level ## e), \ mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \ gl ## _level ## e, sl ## _level ## e, \ gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \ ##_a); \ BUG(); \ done = 1; \ } while (0) #define AUDIT_FAIL_MIN(_level, _fmt, _a...) 
do { \ printk("Shadow %u-on-%u audit failed at level %i\n" \ "gl" #_level "mfn = %" PRI_mfn \ " sl" #_level "mfn = %" PRI_mfn \ " Error: " _fmt "\n", \ GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \ _level, \ mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \ ##_a); \ BUG(); \ done = 1; \ } while (0) static const char *sh_audit_flags(const struct domain *d, int level, int gflags, int sflags) /* Common code for auditing flag bits */ { if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) ) return "shadow is present but guest is not present"; if ( (sflags & _PAGE_GLOBAL) && !is_hvm_domain(d) ) return "global bit set in PV shadow"; if ( level == 2 && (sflags & _PAGE_PSE) ) return "PS bit set in shadow"; #if SHADOW_PAGING_LEVELS == 3 if ( level == 3 ) return NULL; /* All the other bits are blank in PAEl3 */ #endif if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_ACCESSED) ) return "accessed bit not propagated"; if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE))) && ((sflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY)) ) return "dirty bit not propagated"; if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) ) return "user/supervisor bit does not match"; if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) ) return "NX bit does not match"; if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) ) return "shadow grants write access but guest does not"; return NULL; } int cf_check sh_audit_l1_table(struct domain *d, mfn_t sl1mfn, mfn_t x) { guest_l1e_t *gl1e, *gp; shadow_l1e_t *sl1e; mfn_t mfn, gmfn, gl1mfn; gfn_t gfn; p2m_type_t p2mt; const char *s; int done = 0; /* Follow the backpointer */ ASSERT(mfn_to_page(sl1mfn)->u.sh.head); gl1mfn = backpointer(mfn_to_page(sl1mfn)); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Out-of-sync l1 shadows can contain anything: just check the OOS hash */ if ( page_is_out_of_sync(mfn_to_page(gl1mfn)) ) { oos_audit_hash_is_present(d, gl1mfn); return 0; } #endif gl1e = gp = map_domain_page(gl1mfn); FOREACH_PRESENT_L1E(sl1mfn, sl1e, &gl1e, done, { if ( sh_l1e_is_magic(*sl1e) ) { #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) if ( sh_l1e_is_gnp(*sl1e) ) { if ( guest_l1e_get_flags(*gl1e) & _PAGE_PRESENT ) AUDIT_FAIL(1, "shadow is GNP magic but guest is present"); } else { ASSERT(sh_l1e_is_mmio(*sl1e)); gfn = sh_l1e_mmio_get_gfn(*sl1e); if ( gfn_x(gfn) != gfn_x(guest_l1e_get_gfn(*gl1e)) ) AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn " but guest gfn is %" SH_PRI_gfn, gfn_x(gfn), gfn_x(guest_l1e_get_gfn(*gl1e))); } #endif } else { s = sh_audit_flags(d, 1, guest_l1e_get_flags(*gl1e), shadow_l1e_get_flags(*sl1e)); if ( s ) AUDIT_FAIL(1, "%s", s); if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS ) { gfn = guest_l1e_get_gfn(*gl1e); mfn = shadow_l1e_get_mfn(*sl1e); gmfn = get_gfn_query_unlocked(d, gfn_x(gfn), &p2mt); if ( !p2m_is_grant(p2mt) && !mfn_eq(gmfn, mfn) ) AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn " --> %" PRI_mfn " != mfn %" PRI_mfn, gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn)); } } }); unmap_domain_page(gp); return done; } int cf_check sh_audit_fl1_table(struct domain *d, mfn_t sl1mfn, mfn_t x) { guest_l1e_t *gl1e, e; shadow_l1e_t *sl1e; mfn_t gl1mfn = INVALID_MFN; int f; int done = 0; /* fl1 has no useful backpointer: all we can check are flags */ e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */ FOREACH_PRESENT_L1E(sl1mfn, sl1e, NULL, done, { f = shadow_l1e_get_flags(*sl1e); f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2); if ( !(f == 0 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW| _PAGE_ACCESSED) || f == 
(_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED) || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW| _PAGE_ACCESSED|_PAGE_DIRTY) || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY) || sh_l1e_is_magic(*sl1e)) ) AUDIT_FAIL(1, "fl1e has bad flags"); }); return 0; } int cf_check sh_audit_l2_table(struct domain *d, mfn_t sl2mfn, mfn_t x) { guest_l2e_t *gl2e, *gp; shadow_l2e_t *sl2e; mfn_t mfn, gmfn, gl2mfn; gfn_t gfn; p2m_type_t p2mt; const char *s; int done = 0; /* Follow the backpointer */ ASSERT(mfn_to_page(sl2mfn)->u.sh.head); gl2mfn = backpointer(mfn_to_page(sl2mfn)); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Only L1's may be out of sync. */ if ( page_is_out_of_sync(mfn_to_page(gl2mfn)) ) AUDIT_FAIL_MIN(2, "gmfn %lx is out of sync", mfn_x(gl2mfn)); #endif gl2e = gp = map_domain_page(gl2mfn); FOREACH_PRESENT_L2E(sl2mfn, sl2e, &gl2e, done, d, { s = sh_audit_flags(d, 2, guest_l2e_get_flags(*gl2e), shadow_l2e_get_flags(*sl2e)); if ( s ) AUDIT_FAIL(2, "%s", s); if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS ) { gfn = guest_l2e_get_gfn(*gl2e); mfn = shadow_l2e_get_mfn(*sl2e); gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? get_fl1_shadow_status(d, gfn) : get_shadow_status(d, get_gfn_query_unlocked(d, gfn_x(gfn), &p2mt), SH_type_l1_shadow); if ( !mfn_eq(gmfn, mfn) ) AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn " (--> %" PRI_mfn ")" " --> %" PRI_mfn " != mfn %" PRI_mfn, gfn_x(gfn), (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0 : mfn_x(get_gfn_query_unlocked(d, gfn_x(gfn), &p2mt)), mfn_x(gmfn), mfn_x(mfn)); } }); unmap_domain_page(gp); return 0; } #if GUEST_PAGING_LEVELS >= 4 int cf_check sh_audit_l3_table(struct domain *d, mfn_t sl3mfn, mfn_t x) { guest_l3e_t *gl3e, *gp; shadow_l3e_t *sl3e; mfn_t mfn, gmfn, gl3mfn; gfn_t gfn; p2m_type_t p2mt; const char *s; int done = 0; /* Follow the backpointer */ ASSERT(mfn_to_page(sl3mfn)->u.sh.head); gl3mfn = backpointer(mfn_to_page(sl3mfn)); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Only L1's may be out of sync. */ if ( page_is_out_of_sync(mfn_to_page(gl3mfn)) ) AUDIT_FAIL_MIN(3, "gmfn %lx is out of sync", mfn_x(gl3mfn)); #endif gl3e = gp = map_domain_page(gl3mfn); FOREACH_PRESENT_L3E(sl3mfn, sl3e, &gl3e, done, { s = sh_audit_flags(d, 3, guest_l3e_get_flags(*gl3e), shadow_l3e_get_flags(*sl3e)); if ( s ) AUDIT_FAIL(3, "%s", s); if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS ) { unsigned int t = SH_type_l2_shadow; gfn = guest_l3e_get_gfn(*gl3e); mfn = shadow_l3e_get_mfn(*sl3e); #ifdef CONFIG_PV32 if ( guest_index(gl3e) == 3 && is_pv_32bit_domain(d) ) t = SH_type_l2h_shadow; #endif gmfn = get_shadow_status( d, get_gfn_query_unlocked(d, gfn_x(gfn), &p2mt), t); if ( !mfn_eq(gmfn, mfn) ) AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn " --> %" PRI_mfn " != mfn %" PRI_mfn, gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn)); } }); unmap_domain_page(gp); return 0; } int cf_check sh_audit_l4_table(struct domain *d, mfn_t sl4mfn, mfn_t x) { guest_l4e_t *gl4e, *gp; shadow_l4e_t *sl4e; mfn_t mfn, gmfn, gl4mfn; gfn_t gfn; p2m_type_t p2mt; const char *s; int done = 0; /* Follow the backpointer */ ASSERT(mfn_to_page(sl4mfn)->u.sh.head); gl4mfn = backpointer(mfn_to_page(sl4mfn)); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Only L1's may be out of sync. 
*/ if ( page_is_out_of_sync(mfn_to_page(gl4mfn)) ) AUDIT_FAIL_MIN(4, "gmfn %lx is out of sync", mfn_x(gl4mfn)); #endif gl4e = gp = map_domain_page(gl4mfn); FOREACH_PRESENT_L4E(sl4mfn, sl4e, &gl4e, done, d, { s = sh_audit_flags(d, 4, guest_l4e_get_flags(*gl4e), shadow_l4e_get_flags(*sl4e)); if ( s ) AUDIT_FAIL(4, "%s", s); if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS ) { gfn = guest_l4e_get_gfn(*gl4e); mfn = shadow_l4e_get_mfn(*sl4e); gmfn = get_shadow_status(d, get_gfn_query_unlocked( d, gfn_x(gfn), &p2mt), SH_type_l3_shadow); if ( !mfn_eq(gmfn, mfn) ) AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn " --> %" PRI_mfn " != mfn %" PRI_mfn, gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn)); } }); unmap_domain_page(gp); return 0; } #endif /* GUEST_PAGING_LEVELS >= 4 */ #undef AUDIT_FAIL #endif /* Audit code */ /**************************************************************************/ /* Entry points into this mode of the shadow code. * This will all be mangled by the preprocessor to uniquify everything. */ const struct paging_mode sh_paging_mode = { .page_fault = sh_page_fault, .invlpg = sh_invlpg, #ifdef CONFIG_HVM .gva_to_gfn = sh_gva_to_gfn, #endif .update_cr3 = sh_update_cr3, .guest_levels = GUEST_PAGING_LEVELS, #ifdef CONFIG_HVM #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC .shadow.guess_wrmap = sh_guess_wrmap, #endif .shadow.pagetable_dying = sh_pagetable_dying, .shadow.trace_emul_write_val = trace_emulate_write_val, #endif /* CONFIG_HVM */ .shadow.shadow_levels = SHADOW_PAGING_LEVELS, }; /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */