/* SPDX-License-Identifier: GPL-2.0-or-later */
/******************************************************************************
 * arch/x86/pv/ro-page-fault.c
 *
 * Read-only page fault emulation for PV guests
 *
 * Copyright (c) 2002-2005 K A Fraser
 * Copyright (c) 2004 Christian Limpach
 */

#include <xen/rangeset.h>
#include <xen/trace.h>

#include "emulate.h"
#include "mm.h"

static int cf_check pv_emul_is_mem_write(
    const struct x86_emulate_state *state, struct x86_emulate_ctxt *ctxt)
{
    return x86_insn_is_mem_write(state, ctxt) ? X86EMUL_OKAY
                                              : X86EMUL_UNHANDLEABLE;
}

/*********************
 * Writable Pagetables
 */

struct ptwr_emulate_ctxt {
    unsigned long cr2;
    l1_pgentry_t  pte;
};

static int cf_check ptwr_emulated_read(
    enum x86_segment seg, unsigned long offset, void *p_data,
    unsigned int bytes, struct x86_emulate_ctxt *ctxt)
{
    unsigned int rc = bytes;
    unsigned long addr = offset;

    if ( !__addr_ok(addr) ||
         (rc = __copy_from_guest_pv(p_data, (void *)addr, bytes)) )
    {
        x86_emul_pagefault(0, addr + bytes - rc, ctxt); /* Read fault. */
        return X86EMUL_EXCEPTION;
    }

    return X86EMUL_OKAY;
}

static int cf_check ptwr_emulated_insn_fetch(
    unsigned long offset, void *p_data, unsigned int bytes,
    struct x86_emulate_ctxt *ctxt)
{
    unsigned int rc = copy_from_guest_pv(p_data, (void *)offset, bytes);

    if ( rc )
    {
        x86_emul_pagefault(PFEC_insn_fetch, offset + bytes - rc, ctxt);
        return X86EMUL_EXCEPTION;
    }

    return X86EMUL_OKAY;
}

/*
 * p_old being NULL indicates a plain write to occur, while a non-NULL
 * input requests a CMPXCHG-based update.
 */
static int ptwr_emulated_update(unsigned long addr, intpte_t *p_old,
                                intpte_t val, unsigned int bytes,
                                struct x86_emulate_ctxt *ctxt)
{
    mfn_t mfn;
    unsigned long unaligned_addr = addr;
    struct page_info *page;
    l1_pgentry_t pte, ol1e, nl1e, *pl1e;
    intpte_t old = p_old ? *p_old : 0;
    unsigned int offset = 0;
    struct vcpu *v = current;
    struct domain *d = v->domain;
    struct ptwr_emulate_ctxt *ptwr_ctxt = ctxt->data;
    int ret;

    /* Only allow naturally-aligned stores within the original %cr2 page. */
    if ( unlikely(((addr ^ ptwr_ctxt->cr2) & PAGE_MASK) ||
                  (addr & (bytes - 1))) )
    {
        gdprintk(XENLOG_WARNING, "bad access (cr2=%lx, addr=%lx, bytes=%u)\n",
                 ptwr_ctxt->cr2, addr, bytes);
        return X86EMUL_UNHANDLEABLE;
    }

    /* Turn a sub-word access into a full-word access. */
    if ( bytes != sizeof(val) )
    {
        intpte_t full;
        unsigned int rc;

        offset = (addr & (sizeof(full) - 1)) * 8;

        /* Align address; read full word. */
        addr &= ~(sizeof(full) - 1);
        if ( (rc = copy_from_guest_pv(&full, (void __user *)addr,
                                      sizeof(full))) != 0 )
        {
            x86_emul_pagefault(0, /* Read fault. */
                               addr + sizeof(full) - rc, ctxt);
            return X86EMUL_EXCEPTION;
        }
        /* Mask out bits provided by caller. */
        full &= ~((((intpte_t)1 << (bytes * 8)) - 1) << offset);
        /* Shift the caller value and OR in the missing bits. */
        val  &= (((intpte_t)1 << (bytes * 8)) - 1);
        val <<= offset;
        val  |= full;
        /* Also fill in missing parts of the cmpxchg old value. */
        old  &= (((intpte_t)1 << (bytes * 8)) - 1);
        old <<= offset;
        old  |= full;
    }

    pte  = ptwr_ctxt->pte;
    mfn  = l1e_get_mfn(pte);
    page = mfn_to_page(mfn);

    /* We are looking only for read-only mappings of p.t. pages. */
    ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
    ASSERT(mfn_valid(mfn));
    ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table);
    ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0);
    ASSERT(page_get_owner(page) == d);
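
    /*
     * For example, a 2-byte write of 0x0003 at byte offset 4 within a 64-bit
     * PTE has been widened above into a full 8-byte update: the current PTE
     * was read back, bits 32-47 were masked out, and 0x0003 << 32 was OR-ed
     * in, so everything from here on operates on a whole, naturally aligned
     * PTE (the cmpxchg "old" value gets the same treatment).
     */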

    /* Check the new PTE. */
    nl1e = l1e_from_intpte(val);
    if ( !(l1e_get_flags(nl1e) & _PAGE_PRESENT) )
    {
        if ( pv_l1tf_check_l1e(d, nl1e) )
            return X86EMUL_RETRY;
    }
    else
    {
        switch ( ret = get_page_from_l1e(nl1e, d, d) )
        {
        default:
            if ( !is_pv_32bit_domain(d) || (bytes != 4) ||
                 !(unaligned_addr & 4) || p_old ||
                 !(l1e_get_flags(nl1e) & _PAGE_PRESENT) )
            {
                gdprintk(XENLOG_WARNING, "could not get_page_from_l1e()\n");
                return X86EMUL_UNHANDLEABLE;
            }
            /*
             * If this is an upper-half write to a PAE PTE then we assume that
             * the guest has simply got the two writes the wrong way round. We
             * zap the PRESENT bit on the assumption that the bottom half will
             * be written immediately after we return to the guest.
             */
            gdprintk(XENLOG_DEBUG, "ptwr_emulate: fixing up invalid PAE PTE %"
                     PRIpte"\n", l1e_get_intpte(nl1e));
            l1e_remove_flags(nl1e, _PAGE_PRESENT);
            break;

        case 0:
            break;

        case _PAGE_RW ... _PAGE_RW | PAGE_CACHE_ATTRS:
            ASSERT(!(ret & ~(_PAGE_RW | PAGE_CACHE_ATTRS)));
            l1e_flip_flags(nl1e, ret);
            break;
        }
    }

    nl1e = adjust_guest_l1e(nl1e, d);

    /* Checked successfully: do the update (write or cmpxchg). */
    pl1e = map_domain_page(mfn) + (addr & ~PAGE_MASK);
    if ( p_old )
    {
        ol1e = l1e_from_intpte(old);

        old = paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
                                         old, l1e_get_intpte(nl1e), mfn);
        if ( l1e_get_intpte(ol1e) == old )
            ret = X86EMUL_OKAY;
        else
        {
            *p_old = old >> offset;
            ret = X86EMUL_CMPXCHG_FAILED;
        }

        if ( ret != X86EMUL_OKAY )
        {
            unmap_domain_page(pl1e);
            put_page_from_l1e(nl1e, d);
            return ret;
        }
    }
    else
    {
        ol1e = *pl1e;
        if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v, 0) )
            BUG();
    }

    trace_ptwr_emulation(addr, nl1e);

    unmap_domain_page(pl1e);

    /* Finally, drop the old PTE. */
    put_page_from_l1e(ol1e, d);

    return X86EMUL_OKAY;
}

static int cf_check ptwr_emulated_write(
    enum x86_segment seg, unsigned long offset, void *p_data,
    unsigned int bytes, struct x86_emulate_ctxt *ctxt)
{
    intpte_t val = 0;

    if ( (bytes > sizeof(val)) || (bytes & (bytes - 1)) || !bytes )
    {
        gdprintk(XENLOG_WARNING, "bad write size (addr=%lx, bytes=%u)\n",
                 offset, bytes);
        return X86EMUL_UNHANDLEABLE;
    }

    memcpy(&val, p_data, bytes);

    return ptwr_emulated_update(offset, NULL, val, bytes, ctxt);
}

static int cf_check ptwr_emulated_cmpxchg(
    enum x86_segment seg, unsigned long offset, void *p_old, void *p_new,
    unsigned int bytes, bool lock, struct x86_emulate_ctxt *ctxt)
{
    intpte_t old = 0, new = 0;
    int rc;

    if ( (bytes > sizeof(new)) || (bytes & (bytes - 1)) )
    {
        gdprintk(XENLOG_WARNING, "bad cmpxchg size (addr=%lx, bytes=%u)\n",
                 offset, bytes);
        return X86EMUL_UNHANDLEABLE;
    }

    memcpy(&old, p_old, bytes);
    memcpy(&new, p_new, bytes);

    rc = ptwr_emulated_update(offset, &old, new, bytes, ctxt);

    memcpy(p_old, &old, bytes);

    return rc;
}

static const struct x86_emulate_ops ptwr_emulate_ops = {
    .read       = ptwr_emulated_read,
    .insn_fetch = ptwr_emulated_insn_fetch,
    .write      = ptwr_emulated_write,
    .cmpxchg    = ptwr_emulated_cmpxchg,
    .validate   = pv_emul_is_mem_write,
};
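
/*
 * These hooks are handed to x86_emulate() by ptwr_do_page_fault() below: the
 * faulting instruction is re-read from guest memory via the read/insn_fetch
 * callbacks, pv_emul_is_mem_write() rejects anything that is not a memory
 * write, and the write/cmpxchg callbacks funnel the actual pagetable update
 * through ptwr_emulated_update().
 */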

/* Write page fault handler: check if guest is trying to modify a PTE. */
static int ptwr_do_page_fault(struct x86_emulate_ctxt *ctxt,
                              unsigned long addr, l1_pgentry_t pte)
{
    struct ptwr_emulate_ctxt ptwr_ctxt = {
        .cr2 = addr,
        .pte = pte,
    };
    struct page_info *page;
    int rc = X86EMUL_UNHANDLEABLE;

    page = get_page_from_mfn(l1e_get_mfn(pte), current->domain);
    if ( !page )
        return X86EMUL_UNHANDLEABLE;

    if ( page_lock(page) )
    {
        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
        {
            ctxt->data = &ptwr_ctxt;
            rc = x86_emulate(ctxt, &ptwr_emulate_ops);
        }
        page_unlock(page);
    }

    put_page(page);

    return rc;
}

/*****************************************
 * fault handling for read-only MMIO pages
 */

static const struct x86_emulate_ops mmio_ro_emulate_ops = {
    .read       = x86emul_unhandleable_rw,
    .insn_fetch = ptwr_emulated_insn_fetch,
    .write      = mmio_ro_emulated_write,
    .validate   = pv_emul_is_mem_write,
};

static const struct x86_emulate_ops mmcfg_intercept_ops = {
    .read       = x86emul_unhandleable_rw,
    .insn_fetch = ptwr_emulated_insn_fetch,
    .write      = mmcfg_intercept_write,
    .validate   = pv_emul_is_mem_write,
};

/* Check if guest is trying to modify a r/o MMIO page. */
static int mmio_ro_do_page_fault(struct x86_emulate_ctxt *ctxt,
                                 unsigned long addr, l1_pgentry_t pte)
{
    struct mmio_ro_emulate_ctxt mmio_ro_ctxt = { .cr2 = addr };
    mfn_t mfn = l1e_get_mfn(pte);

    if ( mfn_valid(mfn) )
    {
        struct page_info *page = mfn_to_page(mfn);
        const struct domain *owner = page_get_owner_and_reference(page);

        if ( owner )
            put_page(page);
        if ( owner != dom_io )
            return X86EMUL_UNHANDLEABLE;
    }

    ctxt->data = &mmio_ro_ctxt;
    if ( pci_ro_mmcfg_decode(mfn_x(mfn), &mmio_ro_ctxt.seg, &mmio_ro_ctxt.bdf) )
        return x86_emulate(ctxt, &mmcfg_intercept_ops);
    else
        return x86_emulate(ctxt, &mmio_ro_emulate_ops);
}

int pv_ro_page_fault(unsigned long addr, struct cpu_user_regs *regs)
{
    l1_pgentry_t pte;
    const struct domain *currd = current->domain;
    unsigned int addr_size = is_pv_32bit_domain(currd) ? 32 : BITS_PER_LONG;
    struct x86_emulate_ctxt ctxt = {
        .regs      = regs,
        .addr_size = addr_size,
        .sp_size   = addr_size,
        .lma       = addr_size > 32,
    };
    int rc;
    bool mmio_ro;

    /* Not part of the initializer, for old gcc to cope. */
    ctxt.cpu_policy = currd->arch.cpu_policy;

    /* Attempt to read the PTE that maps the VA being accessed. */
    pte = guest_get_eff_kern_l1e(addr);

    /* We are only looking for read-only mappings */
    if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT | _PAGE_RW)) != _PAGE_PRESENT) )
        return 0;

    mmio_ro = is_hardware_domain(currd) &&
              rangeset_contains_singleton(mmio_ro_ranges, l1e_get_pfn(pte));
    if ( mmio_ro )
        rc = mmio_ro_do_page_fault(&ctxt, addr, pte);
    else
        rc = ptwr_do_page_fault(&ctxt, addr, pte);
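
    /*
     * Outcome handling: X86EMUL_OKAY, X86EMUL_RETRY and (after forwarding the
     * pending #PF to the guest) X86EMUL_EXCEPTION all report the fault as
     * fixed up.  Any other return value, e.g. X86EMUL_UNHANDLEABLE, drops out
     * of the switch to the final "return 0", leaving the caller to continue
     * its normal #PF handling.
     */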

    switch ( rc )
    {
    case X86EMUL_EXCEPTION:
        /*
         * This emulation covers writes to:
         *  - L1 pagetables.
         *  - MMCFG space or read-only MFNs.
         * We tolerate #PF (from hitting an adjacent page or a successful
         * concurrent pagetable update).  Anything else is an emulation bug,
         * or a guest playing with the instruction stream under Xen's feet.
         */
        if ( ctxt.event.type == X86_EVENTTYPE_HW_EXCEPTION &&
             ctxt.event.vector == X86_EXC_PF )
            pv_inject_event(&ctxt.event);
        else
            gdprintk(XENLOG_WARNING,
                     "Unexpected event (type %u, vector %#x) from emulation\n",
                     ctxt.event.type, ctxt.event.vector);

        /* Fallthrough */
    case X86EMUL_OKAY:
        if ( ctxt.retire.singlestep )
            pv_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);

        /* Fallthrough */
    case X86EMUL_RETRY:
        if ( mmio_ro )
            perfc_incr(mmio_ro_emulations);
        else
            perfc_incr(ptwr_emulations);
        return EXCRET_fault_fixed;
    }

    return 0;
}

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */