/* SPDX-License-Identifier: GPL-2.0-or-later */
/******************************************************************************
 * arch/x86/mm.c
 *
 * Copyright (c) 2002-2005 K A Fraser
 * Copyright (c) 2004 Christian Limpach
 */

/*
 * A description of the x86 page table API:
 *
 * Domains trap to do_mmu_update with a list of update requests.
 * This is a list of (ptr, val) pairs, where the requested operation
 * is *ptr = val.
 *
 * Reference counting of pages:
 * ----------------------------
 * Each page has two refcounts: tot_count and type_count.
 *
 * TOT_COUNT is the obvious reference count. It counts all uses of a
 * physical page frame by a domain, including uses as a page directory,
 * a page table, or simple mappings via a PTE. This count prevents a
 * domain from releasing a frame back to the free pool when it still holds
 * a reference to it.
 *
 * TYPE_COUNT is more subtle. A frame can be put to one of three
 * mutually-exclusive uses: it might be used as a page directory, or a
 * page table, or it may be mapped writable by the domain [of course, a
 * frame may not be used in any of these three ways!].
 * So, type_count is a count of the number of times a frame is being
 * referred to in its current incarnation. Therefore, a page can only
 * change its type when its type count is zero.
 *
 * Pinning the page type:
 * ----------------------
 * The type of a page can be pinned/unpinned with the commands
 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
 * pinning is not reference counted, so it can't be nested).
 * This is useful to prevent a page's type count falling to zero, at which
 * point safety checks would need to be carried out next time the count
 * is increased again.
 *
 * A further note on writable page mappings:
 * -----------------------------------------
 * For simplicity, the count of writable mappings for a page may not
 * correspond to reality. The 'writable count' is incremented for every
 * PTE which maps the page with the _PAGE_RW flag set. However, for
 * write access to be possible the page directory entry must also have
 * its _PAGE_RW bit set. We do not check this as it complicates the
 * reference counting considerably [consider the case of multiple
 * directory entries referencing a single page table, some with the RW
 * bit set, others not -- it starts getting a bit messy].
 * In normal use, this simplification shouldn't be a problem.
 * However, the logic can be added if required.
 *
 * One more note on read-only page mappings:
 * -----------------------------------------
 * We want domains to be able to map pages for read-only access. The
 * main reason is that page tables and directories should be readable
 * by a domain, but it would not be safe for them to be writable.
 * However, domains have free access to rings 1 & 2 of the Intel
 * privilege model. In terms of page protection, these are considered
 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
 * read-only restrictions are respected in supervisor mode -- if the
 * bit is clear then any mapped page is writable.
 *
 * We get round this by always setting the WP bit and disallowing
 * updates to it. This is very unlikely to cause a problem for guest
 * OS's, which will generally use the WP bit to simplify copy-on-write
 * implementation (in that case, OS wants a fault when it writes to
 * an application-supplied buffer).
 *
 * PV domUs and IOMMUs:
 * --------------------
 * For a guest to be able to DMA into a page, that page must be in the
 * domain's IOMMU.
However, we *must not* allow DMA into 'special' * pages (such as page table pages, descriptor tables, &c); and we * must also ensure that mappings are removed from the IOMMU when the * page is freed. Finally, it is inherently racy to make any changes * based on a page with a non-zero type count. * * To that end, we put the page in the IOMMU only when a page gains * the PGT_writeable type; and we remove the page when it loses the * PGT_writeable type (not when the type count goes to zero). This * effectively protects the IOMMU status update with the type count we * have just acquired. We must also check for PGT_writable type when * doing the final put_page(), and remove it from the iommu if so. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef CONFIG_PV #include "pv/mm.h" #endif /* Override macros from asm/page.h to make them work with mfn_t */ #undef virt_to_mfn #define virt_to_mfn(v) _mfn(__virt_to_mfn(v)) /* Mapping of the fixmap space needed early. */ l1_pgentry_t __section(".bss.page_aligned") __aligned(PAGE_SIZE) l1_fixmap[L1_PAGETABLE_ENTRIES]; l1_pgentry_t __section(".bss.page_aligned") __aligned(PAGE_SIZE) l1_fixmap_x[L1_PAGETABLE_ENTRIES]; /* Frame table size in pages. */ unsigned long max_page; unsigned long total_pages; bool __read_mostly machine_to_phys_mapping_valid; struct rangeset *__read_mostly mmio_ro_ranges; static uint32_t base_disallow_mask; /* Global bit is allowed to be set on L1 PTEs. Intended for user mappings. */ #define L1_DISALLOW_MASK ((base_disallow_mask | _PAGE_GNTTAB) & ~_PAGE_GLOBAL) #define L2_DISALLOW_MASK base_disallow_mask #define l3_disallow_mask(d) (!is_pv_32bit_domain(d) ? \ base_disallow_mask : 0xFFFFF198U) #define L4_DISALLOW_MASK (base_disallow_mask) #define l1_disallow_mask(d) \ ((d != dom_io) && \ (rangeset_is_empty((d)->iomem_caps) && \ rangeset_is_empty((d)->arch.ioport_caps) && \ !has_arch_pdevs(d) && \ is_pv_domain(d)) ? \ L1_DISALLOW_MASK : (L1_DISALLOW_MASK & ~PAGE_CACHE_ATTRS)) static s8 __read_mostly opt_mmio_relax; static int __init cf_check parse_mmio_relax(const char *s) { if ( !*s ) opt_mmio_relax = 1; else opt_mmio_relax = parse_bool(s, NULL); if ( opt_mmio_relax < 0 && strcmp(s, "all") ) { opt_mmio_relax = 0; return -EINVAL; } return 0; } custom_param("mmio-relax", parse_mmio_relax); static void __init init_frametable_chunk(void *start, void *end) { unsigned long s = (unsigned long)start; unsigned long e = (unsigned long)end; unsigned long step; mfn_t mfn; ASSERT(!(s & ((1 << L2_PAGETABLE_SHIFT) - 1))); for ( ; s < e; s += step << PAGE_SHIFT ) { step = 1UL << (cpu_has_page1gb && !(s & ((1UL << L3_PAGETABLE_SHIFT) - 1)) ? L3_PAGETABLE_SHIFT - PAGE_SHIFT : L2_PAGETABLE_SHIFT - PAGE_SHIFT); /* * The hardcoded 4 below is arbitrary - just pick whatever you think * is reasonable to waste as a trade-off for using a large page. 
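 *
 * Worked example (numbers purely illustrative, not taken from the code):
 * with cpu_has_page1gb clear, step starts at 512, i.e. one 2MiB mapping's
 * worth of 4KiB frames.  If only 1MiB of frame table remains to be
 * covered, then s + (512 << PAGE_SHIFT) overshoots e by a full 1MiB,
 * which is more than the 4-page slack tolerated below, so step is shifted
 * down by PAGETABLE_ORDER (to 1) and the tail gets mapped with 4KiB
 * pages.  An overshoot of up to 4 pages, by contrast, is accepted as the
 * price of keeping the large mapping.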
*/ while ( step && s + (step << PAGE_SHIFT) > e + (4 << PAGE_SHIFT) ) step >>= PAGETABLE_ORDER; mfn = alloc_boot_pages(step, step); map_pages_to_xen(s, mfn, step, PAGE_HYPERVISOR); } memset(start, 0, end - start); memset(end, -1, s - e); } void __init init_frametable(void) { unsigned int sidx, eidx, nidx; unsigned int max_idx = DIV_ROUND_UP(max_pdx, PDX_GROUP_COUNT); struct page_info *end_pg, *top_pg; BUILD_BUG_ON(XEN_VIRT_END > FRAMETABLE_VIRT_START); BUILD_BUG_ON(FRAMETABLE_VIRT_START & ((1UL << L2_PAGETABLE_SHIFT) - 1)); for ( sidx = 0; ; sidx = nidx ) { eidx = find_next_zero_bit(pdx_group_valid, max_idx, sidx); nidx = find_next_bit(pdx_group_valid, max_idx, eidx); if ( nidx >= max_idx ) break; init_frametable_chunk(pdx_to_page(sidx * PDX_GROUP_COUNT), pdx_to_page(eidx * PDX_GROUP_COUNT)); } end_pg = pdx_to_page(max_pdx - 1) + 1; top_pg = mem_hotplug ? pdx_to_page(max_idx * PDX_GROUP_COUNT - 1) + 1 : end_pg; init_frametable_chunk(pdx_to_page(sidx * PDX_GROUP_COUNT), top_pg); memset(end_pg, -1, (unsigned long)top_pg - (unsigned long)end_pg); } #ifndef NDEBUG static unsigned int __read_mostly root_pgt_pv_xen_slots = ROOT_PAGETABLE_PV_XEN_SLOTS; static l4_pgentry_t __read_mostly split_l4e; #else #define root_pgt_pv_xen_slots ROOT_PAGETABLE_PV_XEN_SLOTS #endif /* * Originally cloned from share_xen_page_with_guest(), just to avoid setting * PGC_xen_heap on non-heap (typically) MMIO pages. Other pieces got dropped * simply because they're not needed in this context. */ static void __init assign_io_page(struct page_info *page) { set_gpfn_from_mfn(mfn_x(page_to_mfn(page)), INVALID_M2P_ENTRY); /* The incremented type count pins as writable. */ page->u.inuse.type_info = PGT_writable_page | PGT_validated | 1; page_set_owner(page, dom_io); page->count_info |= PGC_allocated | 1; } void __init arch_init_memory(void) { unsigned long i, pfn, rstart_pfn, rend_pfn, iostart_pfn, ioend_pfn; /* * Basic guest-accessible flags: * PRESENT, R/W, USER, A/D, AVAIL[0,1,2], AVAIL_HIGH, NX (if available). */ base_disallow_mask = ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_AVAIL | _PAGE_AVAIL_HIGH | _PAGE_NX); /* * First 1MB of RAM is historically marked as I/O. * Note that apart from IO Xen also uses the low 1MB to store the AP boot * trampoline and boot information metadata. Due to this always special * case the low 1MB. */ BUG_ON(pvh_boot && trampoline_phys != 0x1000); for ( i = 0; i < 0x100; i++ ) assign_io_page(mfn_to_page(_mfn(i))); /* Any areas not specified as RAM by the e820 map are considered I/O. */ for ( i = 0, pfn = 0; pfn < max_page; i++ ) { while ( (i < e820.nr_map) && (e820.map[i].type != E820_RAM) && (e820.map[i].type != E820_UNUSABLE) ) i++; if ( i >= e820.nr_map ) { /* No more RAM regions: mark as I/O right to end of memory map. */ rstart_pfn = rend_pfn = max_page; } else { /* Mark as I/O just up as far as next RAM region. */ rstart_pfn = min_t(unsigned long, max_page, PFN_UP(e820.map[i].addr)); rend_pfn = max_t(unsigned long, rstart_pfn, PFN_DOWN(e820.map[i].addr + e820.map[i].size)); } /* * Make sure any Xen mappings of RAM holes above 1MB are blown away. * In particular this ensures that RAM holes are respected even in * the statically-initialised 1-16MB mapping area. */ iostart_pfn = max_t(unsigned long, pfn, 1UL << (20 - PAGE_SHIFT)); ioend_pfn = min(rstart_pfn, 16UL << (20 - PAGE_SHIFT)); if ( iostart_pfn < ioend_pfn ) destroy_xen_mappings((unsigned long)mfn_to_virt(iostart_pfn), (unsigned long)mfn_to_virt(ioend_pfn)); /* Mark as I/O up to next RAM region. 
*/ for ( ; pfn < rstart_pfn; pfn++ ) { if ( !mfn_valid(_mfn(pfn)) ) continue; assign_io_page(mfn_to_page(_mfn(pfn))); } /* Skip the RAM region. */ pfn = rend_pfn; } subarch_init_memory(); efi_init_memory(); #ifndef NDEBUG if ( highmem_start ) { unsigned long split_va = (unsigned long)__va(highmem_start); if ( split_va < HYPERVISOR_VIRT_END && split_va - 1 == (unsigned long)__va(highmem_start - 1) ) { root_pgt_pv_xen_slots = l4_table_offset(split_va) - ROOT_PAGETABLE_FIRST_XEN_SLOT; ASSERT(root_pgt_pv_xen_slots < ROOT_PAGETABLE_PV_XEN_SLOTS); if ( l4_table_offset(split_va) == l4_table_offset(split_va - 1) ) { mfn_t l3mfn = alloc_xen_pagetable(); if ( !mfn_eq(l3mfn, INVALID_MFN) ) { const l3_pgentry_t *l3idle = map_l3t_from_l4e( idle_pg_table[l4_table_offset(split_va)]); l3_pgentry_t *l3tab = map_domain_page(l3mfn); for ( i = 0; i < l3_table_offset(split_va); ++i ) l3tab[i] = l3idle[i]; for ( ; i < L3_PAGETABLE_ENTRIES; ++i ) l3tab[i] = l3e_empty(); split_l4e = l4e_from_mfn(l3mfn, __PAGE_HYPERVISOR_RW); UNMAP_DOMAIN_PAGE(l3idle); UNMAP_DOMAIN_PAGE(l3tab); } else ++root_pgt_pv_xen_slots; } } } #endif /* Generate a symbol to be used in linker script */ ASM_CONSTANT(FIXADDR_X_SIZE, FIXADDR_X_SIZE); } int page_is_ram_type(unsigned long mfn, unsigned long mem_type) { uint64_t maddr = pfn_to_paddr(mfn); int i; for ( i = 0; i < e820.nr_map; i++ ) { switch ( e820.map[i].type ) { case E820_RAM: if ( mem_type & RAM_TYPE_CONVENTIONAL ) break; continue; case E820_RESERVED: if ( mem_type & RAM_TYPE_RESERVED ) break; continue; case E820_UNUSABLE: if ( mem_type & RAM_TYPE_UNUSABLE ) break; continue; case E820_ACPI: case E820_NVS: if ( mem_type & RAM_TYPE_ACPI ) break; continue; default: /* unknown */ continue; } /* Test the range. */ if ( (e820.map[i].addr <= maddr) && ((e820.map[i].addr + e820.map[i].size) >= (maddr + PAGE_SIZE)) ) return 1; } return 0; } unsigned int page_get_ram_type(mfn_t mfn) { uint64_t last = 0, maddr = mfn_to_maddr(mfn); unsigned int i, type = 0; for ( i = 0; i < e820.nr_map; last = e820.map[i].addr + e820.map[i].size, i++ ) { if ( (maddr + PAGE_SIZE) > last && maddr < e820.map[i].addr ) type |= RAM_TYPE_UNKNOWN; if ( (maddr + PAGE_SIZE) <= e820.map[i].addr || maddr >= (e820.map[i].addr + e820.map[i].size) ) continue; switch ( e820.map[i].type ) { case E820_RAM: type |= RAM_TYPE_CONVENTIONAL; break; case E820_RESERVED: type |= RAM_TYPE_RESERVED; break; case E820_UNUSABLE: type |= RAM_TYPE_UNUSABLE; break; case E820_ACPI: case E820_NVS: type |= RAM_TYPE_ACPI; break; default: type |= RAM_TYPE_UNKNOWN; break; } } return type ?: RAM_TYPE_UNKNOWN; } unsigned long domain_get_maximum_gpfn(struct domain *d) { #ifdef CONFIG_HVM if ( is_hvm_domain(d) ) return p2m_get_hostp2m(d)->max_mapped_pfn; #endif /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */ return (arch_get_max_pfn(d) ?: 1) - 1; } void share_xen_page_with_guest(struct page_info *page, struct domain *d, enum XENSHARE_flags flags) { ASSERT(d != dom_io); /* Should use assign_io_page(). */ if ( page_get_owner(page) == d ) return; set_gpfn_from_mfn(mfn_x(page_to_mfn(page)), INVALID_M2P_ENTRY); spin_lock(&d->page_alloc_lock); /* The incremented type count pins as writable or read-only. */ page->u.inuse.type_info = (flags == SHARE_ro ? PGT_none : PGT_writable_page); page->u.inuse.type_info |= PGT_validated | 1; page_set_owner(page, d); smp_wmb(); /* install valid domain ptr before updating refcnt. */ ASSERT((page->count_info & ~PGC_xen_heap) == 0); /* Only add to the allocation list if the domain isn't dying. 
*/ if ( !d->is_dying ) { page->count_info |= PGC_xen_heap | PGC_allocated | 1; if ( unlikely(d->xenheap_pages++ == 0) ) get_knownalive_domain(d); page_list_add_tail(page, &d->xenpage_list); } spin_unlock(&d->page_alloc_lock); } void make_cr3(struct vcpu *v, mfn_t mfn) { struct domain *d = v->domain; v->arch.cr3 = mfn_x(mfn) << PAGE_SHIFT; if ( is_pv_domain(d) && d->arch.pv.pcid ) v->arch.cr3 |= get_pcid_bits(v, false); } void write_ptbase(struct vcpu *v) { struct cpu_info *cpu_info = get_cpu_info(); unsigned long new_cr4; new_cr4 = (is_pv_vcpu(v) && !is_idle_vcpu(v)) ? pv_make_cr4(v) : mmu_cr4_features; if ( is_pv_vcpu(v) && v->domain->arch.pv.xpti ) { cpu_info->root_pgt_changed = true; cpu_info->pv_cr3 = __pa(this_cpu(root_pgt)); if ( new_cr4 & X86_CR4_PCIDE ) cpu_info->pv_cr3 |= get_pcid_bits(v, true); switch_cr3_cr4(v->arch.cr3, new_cr4); } else { /* Make sure to clear use_pv_cr3 and xen_cr3 before pv_cr3. */ cpu_info->use_pv_cr3 = false; cpu_info->xen_cr3 = 0; /* switch_cr3_cr4() serializes. */ switch_cr3_cr4(v->arch.cr3, new_cr4); cpu_info->pv_cr3 = 0; } } /* * Should be called after CR3 is updated. * * Uses values found in vcpu->arch.(guest_table and guest_table_user), and * for HVM guests, arch.hvm.monitor_table and hvm's guest CR3. * * Update ref counts to shadow tables appropriately. */ void update_cr3(struct vcpu *v) { mfn_t cr3_mfn; if ( paging_mode_enabled(v->domain) ) { paging_update_cr3(v, false); return; } if ( !(v->arch.flags & TF_kernel_mode) ) cr3_mfn = pagetable_get_mfn(v->arch.guest_table_user); else cr3_mfn = pagetable_get_mfn(v->arch.guest_table); make_cr3(v, cr3_mfn); } static inline void set_tlbflush_timestamp(struct page_info *page) { /* * Record TLB information for flush later. We do not stamp page tables * when running in shadow mode: * 1. Pointless, since it's the shadow pt's which must be tracked. * 2. Shadow mode reuses this field for shadowed page tables to store * flags info -- we don't want to conflict with that. */ if ( !(page->count_info & PGC_shadowed_pt) ) page_set_tlbflush_timestamp(page); } const char __section(".bss.page_aligned.const") __aligned(PAGE_SIZE) zero_page[PAGE_SIZE]; static int _get_page_type(struct page_info *page, unsigned long type, bool preemptible); #ifdef CONFIG_PV_LINEAR_PT static bool inc_linear_entries(struct page_info *pg) { typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc; do { /* * The check below checks for the "linear use" count being non-zero * as well as overflow. Signed integer overflow is undefined behavior * according to the C spec. However, as long as linear_pt_count is * smaller in size than 'int', the arithmetic operation of the * increment below won't overflow; rather the result will be truncated * when stored. Ensure that this is always true. */ BUILD_BUG_ON(sizeof(nc) >= sizeof(int)); oc = nc++; if ( nc <= 0 ) return false; nc = cmpxchg(&pg->linear_pt_count, oc, nc); } while ( oc != nc ); return true; } static void dec_linear_entries(struct page_info *pg) { typeof(pg->linear_pt_count) oc; oc = arch_fetch_and_add(&pg->linear_pt_count, -1); ASSERT(oc > 0); } static bool inc_linear_uses(struct page_info *pg) { typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc; do { /* See the respective comment in inc_linear_entries(). 
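 *
 * (Sign convention, inferred from the checks in these two helpers rather
 * than stated explicitly: a positive linear_pt_count records how many
 * linear entries the page table currently contains, while a negative
 * value records how many times the table is itself the target of a
 * linear mapping.  The two states are mutually exclusive, which is what
 * the "<= 0" check above and the ">= 0" check below enforce.)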
*/ BUILD_BUG_ON(sizeof(nc) >= sizeof(int)); oc = nc--; if ( nc >= 0 ) return false; nc = cmpxchg(&pg->linear_pt_count, oc, nc); } while ( oc != nc ); return true; } static void dec_linear_uses(struct page_info *pg) { typeof(pg->linear_pt_count) oc; oc = arch_fetch_and_add(&pg->linear_pt_count, 1); ASSERT(oc < 0); } /* * We allow root tables to map each other (a.k.a. linear page tables). It * needs some special care with reference counts and access permissions: * 1. The mapping entry must be read-only, or the guest may get write access * to its own PTEs. * 2. We must only bump the reference counts for an *already validated* * L2 table, or we can end up in a deadlock in get_page_type() by waiting * on a validation that is required to complete that validation. * 3. We only need to increment the reference counts for the mapped page * frame if it is mapped by a different root table. This is sufficient and * also necessary to allow validation of a root table mapping itself. */ static bool __read_mostly opt_pv_linear_pt = true; boolean_param("pv-linear-pt", opt_pv_linear_pt); #define define_get_linear_pagetable(level) \ static int \ get_##level##_linear_pagetable( \ level##_pgentry_t pde, mfn_t pde_mfn, struct domain *d) \ { \ unsigned long x, y; \ mfn_t mfn; \ \ if ( !opt_pv_linear_pt ) \ { \ gdprintk(XENLOG_WARNING, \ "Attempt to create linear p.t. (feature disabled)\n"); \ return 0; \ } \ \ if ( (level##e_get_flags(pde) & _PAGE_RW) ) \ { \ gdprintk(XENLOG_WARNING, \ "Attempt to create linear p.t. with write perms\n"); \ return 0; \ } \ \ if ( !mfn_eq(mfn = level##e_get_mfn(pde), pde_mfn) ) \ { \ struct page_info *page, *ptpg = mfn_to_page(pde_mfn); \ \ /* Make sure the page table belongs to the correct domain. */ \ if ( unlikely(page_get_owner(ptpg) != d) ) \ return 0; \ \ /* Make sure the mapped frame belongs to the correct domain. */ \ page = get_page_from_mfn(mfn, d); \ if ( unlikely(!page) ) \ return 0; \ \ /* \ * Ensure that the mapped frame is an already-validated page table \ * and is not itself having linear entries, as well as that the \ * containing page table is not iself in use as a linear page table \ * elsewhere. \ * If so, atomically increment the count (checking for overflow). \ */ \ if ( !inc_linear_entries(ptpg) ) \ { \ put_page(page); \ return 0; \ } \ if ( !inc_linear_uses(page) ) \ { \ dec_linear_entries(ptpg); \ put_page(page); \ return 0; \ } \ y = page->u.inuse.type_info; \ do { \ x = y; \ if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || \ unlikely((x & (PGT_type_mask|PGT_validated)) != \ (PGT_##level##_page_table|PGT_validated)) ) \ { \ dec_linear_uses(page); \ dec_linear_entries(ptpg); \ put_page(page); \ return 0; \ } \ } \ while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); \ } \ \ return 1; \ } #else /* CONFIG_PV_LINEAR_PT */ #define define_get_linear_pagetable(level) \ static int \ get_##level##_linear_pagetable( \ level##_pgentry_t pde, mfn_t pde_mfn, struct domain *d) \ { \ return 0; \ } static void dec_linear_uses(struct page_info *pg) { ASSERT(pg->linear_pt_count == 0); } static void dec_linear_entries(struct page_info *pg) { ASSERT(pg->linear_pt_count == 0); } #endif /* CONFIG_PV_LINEAR_PT */ bool is_iomem_page(mfn_t mfn) { struct page_info *page; if ( !mfn_valid(mfn) ) return true; /* Caller must know that it is an iomem page, or a reference is held. */ page = mfn_to_page(mfn); ASSERT((page->count_info & PGC_count_mask) != 0); return (page_get_owner(page) == dom_io); } /* Input ranges are inclusive. 
*/ bool is_memory_hole(mfn_t start, mfn_t end) { unsigned long s = mfn_x(start); unsigned long e = mfn_x(end); unsigned int i; for ( i = 0; i < e820.nr_map; i++ ) { const struct e820entry *entry = &e820.map[i]; if ( !entry->size ) continue; /* Do not allow overlaps with any memory range. */ if ( s <= PFN_DOWN(entry->addr + entry->size - 1) && PFN_DOWN(entry->addr) <= e ) return false; } return true; } #ifndef NDEBUG struct mmio_emul_range_ctxt { const struct domain *d; unsigned long mfn; }; static int cf_check print_mmio_emul_range( unsigned long s, unsigned long e, void *arg) { const struct mmio_emul_range_ctxt *ctxt = arg; if ( ctxt->mfn > e ) return 0; if ( ctxt->mfn >= s ) { static DEFINE_SPINLOCK(last_lock); static const struct domain *last_d; static unsigned long last_s = ~0UL, last_e; bool print = false; spin_lock(&last_lock); if ( last_d != ctxt->d || last_s != s || last_e != e ) { last_d = ctxt->d; last_s = s; last_e = e; print = true; } spin_unlock(&last_lock); if ( print ) printk(XENLOG_G_INFO "d%d: Forcing write emulation on MFNs %lx-%lx\n", ctxt->d->domain_id, s, e); } return 1; } #endif /* * get_page_from_l1e returns: * 0 => success (page not present also counts as such) * <0 => error code * >0 => the page flags to be flipped */ int get_page_from_l1e( l1_pgentry_t l1e, struct domain *l1e_owner, struct domain *pg_owner) { unsigned long mfn = l1e_get_pfn(l1e); struct page_info *page = mfn_to_page(_mfn(mfn)); uint32_t l1f = l1e_get_flags(l1e); struct vcpu *curr = current; struct domain *real_pg_owner; bool write, valid; if ( unlikely(!(l1f & _PAGE_PRESENT)) ) { ASSERT_UNREACHABLE(); return 0; } if ( unlikely(l1f & l1_disallow_mask(l1e_owner)) ) { gdprintk(XENLOG_WARNING, "Bad L1 flags %x\n", l1f & l1_disallow_mask(l1e_owner)); return -EINVAL; } valid = mfn_valid(_mfn(mfn)); if ( !valid || (real_pg_owner = page_get_owner_and_reference(page)) == dom_io ) { int flip = 0; /* Only needed the reference to confirm dom_io ownership. */ if ( valid ) put_page(page); /* DOMID_IO reverts to caller for privilege checks. */ if ( pg_owner == dom_io ) pg_owner = curr->domain; if ( !iomem_access_permitted(pg_owner, mfn, mfn) ) { if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */ { gdprintk(XENLOG_WARNING, "d%d non-privileged attempt to map MMIO space %"PRI_mfn"\n", pg_owner->domain_id, mfn); return -EPERM; } return -EINVAL; } if ( pg_owner != l1e_owner && !iomem_access_permitted(l1e_owner, mfn, mfn) ) { if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */ { gdprintk(XENLOG_WARNING, "d%d attempted to map MMIO space %"PRI_mfn" in d%d to d%d\n", curr->domain->domain_id, mfn, pg_owner->domain_id, l1e_owner->domain_id); return -EPERM; } return -EINVAL; } if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn) ) { /* MMIO pages must not be mapped cachable unless requested so. 
*/ switch ( opt_mmio_relax ) { case 0: break; case 1: if ( !is_hardware_domain(l1e_owner) ) break; /* fallthrough */ case -1: return 0; default: ASSERT_UNREACHABLE(); } } else if ( l1f & _PAGE_RW ) { #ifndef NDEBUG const unsigned long *ro_map; unsigned int seg, bdf; if ( !pci_mmcfg_decode(mfn, &seg, &bdf) || ((ro_map = pci_get_ro_map(seg)) != NULL && test_bit(bdf, ro_map)) ) printk(XENLOG_G_WARNING "d%d: Forcing read-only access to MFN %lx\n", l1e_owner->domain_id, mfn); else rangeset_report_ranges(mmio_ro_ranges, 0, ~0UL, print_mmio_emul_range, &(struct mmio_emul_range_ctxt){ .d = l1e_owner, .mfn = mfn }); #endif flip = _PAGE_RW; } switch ( 0xFF & (XEN_MSR_PAT >> (8 * pte_flags_to_cacheattr(l1f))) ) { case X86_MT_UC: case X86_MT_UCM: case X86_MT_WC: /* not cacheable, allow */ break; default: /* potentially cacheable, force to UC */ flip |= ((l1f & PAGE_CACHE_ATTRS) ^ _PAGE_UC); break; } return flip; } if ( unlikely((real_pg_owner != pg_owner) && (!dom_cow || (real_pg_owner != dom_cow))) ) { /* * Let privileged domains transfer the right to map their target * domain's pages. This is used to allow stub-domain pvfb export to * dom0, until pvfb supports granted mappings. At that time this * minor hack can go away. */ if ( (real_pg_owner == NULL) || (pg_owner == l1e_owner) || xsm_priv_mapping(XSM_TARGET, pg_owner, real_pg_owner) ) { gdprintk(XENLOG_WARNING, "pg_owner d%d l1e_owner d%d, but real_pg_owner d%d\n", pg_owner->domain_id, l1e_owner->domain_id, real_pg_owner ? real_pg_owner->domain_id : -1); goto could_not_pin; } pg_owner = real_pg_owner; } /* * Extra paranoid check for shared memory. Writable mappings * disallowed (unshare first!) */ if ( (l1f & _PAGE_RW) && (real_pg_owner == dom_cow) ) goto could_not_pin; /* * Foreign mappings into guests in shadow external mode don't * contribute to writeable mapping refcounts. (This allows the * qemu-dm helper process in dom0 to map the domain's memory without * messing up the count of "real" writable mappings.) */ write = (l1f & _PAGE_RW) && ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner)); if ( write && !get_page_type(page, PGT_writable_page) ) { gdprintk(XENLOG_WARNING, "Could not get page type PGT_writable_page\n"); goto could_not_pin; } if ( (l1f & PAGE_CACHE_ATTRS) != _PAGE_WB && is_special_page(page) ) { if ( write ) put_page_type(page); put_page(page); gdprintk(XENLOG_WARNING, "Attempt to change cache attributes of Xen heap page\n"); return -EACCES; } /* * Track writeable non-coherent mappings to RAM pages, to trigger a cache * flush later if the target is used as anything but a PGT_writeable page. * We care about all writeable mappings, including foreign mappings. */ if ( !boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) && (l1f & (PAGE_CACHE_ATTRS | _PAGE_RW)) == (_PAGE_WC | _PAGE_RW) ) set_bit(_PGT_non_coherent, &page->u.inuse.type_info); return 0; could_not_pin: gdprintk(XENLOG_WARNING, "Error getting mfn %" PRI_mfn " (pfn %" PRI_pfn ") from L1 entry %" PRIpte " for l1e_owner d%d, pg_owner d%d\n", mfn, get_gpfn_from_mfn(mfn), l1e_get_intpte(l1e), l1e_owner->domain_id, pg_owner->domain_id); if ( real_pg_owner != NULL ) put_page(page); return -EBUSY; } /* * The following flags are used to specify behavior of various get and * put commands. The first is also stored in page->partial_flags to * indicate the state of the page pointed to by * page->pte[page->nr_validated_entries]. See the comment in mm.h for * more information. 
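 *
 * As a rough illustration of how these get combined (paraphrasing the
 * promote/demote and PTE-update paths further down, not adding anything
 * new): a preemptible promotion that wants to keep its reference across
 * a restart passes something like
 *
 *   get_page_and_type_from_mfn(l3e_get_mfn(l3e), PGT_l2_page_table, d,
 *                              partial_flags | PTF_preemptible |
 *                              PTF_retain_ref_on_restart);
 *
 * whereas mod_l[234]_entry() pass PTF_defer on the put side, so that the
 * actual type drop is postponed via current->arch.old_guest_table.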
*/ #define PTF_partial_set (1 << 0) #define PTF_preemptible (1 << 2) #define PTF_defer (1 << 3) #define PTF_retain_ref_on_restart (1 << 4) #ifdef CONFIG_PV static int get_page_and_type_from_mfn( mfn_t mfn, unsigned long type, struct domain *d, unsigned int flags) { struct page_info *page = mfn_to_page(mfn); int rc; bool preemptible = flags & PTF_preemptible, partial_set = flags & PTF_partial_set, retain_ref = flags & PTF_retain_ref_on_restart; if ( likely(!partial_set) && unlikely(!get_page_from_mfn(mfn, d)) ) return -EINVAL; rc = _get_page_type(page, type, preemptible); /* * Retain the refcount if: * - page is fully validated (rc == 0) * - page is not validated (rc < 0) but: * - We came in with a reference (partial_set) * - page is partially validated (rc == -ERESTART), and the * caller has asked the ref to be retained in that case * - page is partially validated but there's been an error * (page == current->arch.old_guest_table) * * The partial_set-on-error clause is worth an explanation. There * are two scenarios where partial_set might be true coming in: * - mfn has been partially promoted / demoted as type `type`; * i.e. has PGT_partial set * - mfn has been partially demoted as L(type+1) (i.e., a linear * page; e.g. we're being called from get_page_from_l2e with * type == PGT_l1_table, but the mfn is PGT_l2_table) * * If there's an error, in the first case, _get_page_type will * either return -ERESTART, in which case we want to retain the * ref (as the caller will consider it retained), or -EINVAL, in * which case old_guest_table will be set; in both cases, we need * to retain the ref. * * In the second case, if there's an error, _get_page_type() can * *only* return -EINVAL, and *never* set old_guest_table. In * that case we also want to retain the reference, to allow the * page to continue to be torn down (i.e., PGT_partial cleared) * safely. * * Also note that we shouldn't be able to leave with the reference * count retained unless we succeeded, or the operation was * preemptible. 
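 *
 * Condensed, the decision made just below is (derived from the code, for
 * orientation only):
 *
 *   rc == 0                                -> keep the ref
 *   rc != 0 && partial_set                 -> keep the ref (caller's)
 *   rc == -ERESTART && retain_ref          -> keep the ref
 *   rc != 0 && page == old_guest_table     -> keep the ref (cleanup's)
 *   anything else                          -> put_page()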
*/ if ( likely(!rc) || partial_set ) /* nothing */; else if ( page == current->arch.old_guest_table || (retain_ref && rc == -ERESTART) ) ASSERT(preemptible); else put_page(page); return rc; } define_get_linear_pagetable(l2); static int get_page_from_l2e( l2_pgentry_t l2e, mfn_t l2mfn, struct domain *d, unsigned int flags) { unsigned long mfn = l2e_get_pfn(l2e); int rc; if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) ) { gdprintk(XENLOG_WARNING, "Bad L2 flags %x\n", l2e_get_flags(l2e) & L2_DISALLOW_MASK); return -EINVAL; } ASSERT(!(flags & PTF_preemptible)); rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, flags); if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, l2mfn, d) ) rc = 0; return rc; } define_get_linear_pagetable(l3); static int get_page_from_l3e( l3_pgentry_t l3e, mfn_t l3mfn, struct domain *d, unsigned int flags) { int rc; if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) ) { gdprintk(XENLOG_WARNING, "Bad L3 flags %x\n", l3e_get_flags(l3e) & l3_disallow_mask(d)); return -EINVAL; } rc = get_page_and_type_from_mfn( l3e_get_mfn(l3e), PGT_l2_page_table, d, flags | PTF_preemptible); if ( unlikely(rc == -EINVAL) && !is_pv_32bit_domain(d) && get_l3_linear_pagetable(l3e, l3mfn, d) ) rc = 0; return rc; } define_get_linear_pagetable(l4); static int get_page_from_l4e( l4_pgentry_t l4e, mfn_t l4mfn, struct domain *d, unsigned int flags) { int rc; if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) ) { gdprintk(XENLOG_WARNING, "Bad L4 flags %x\n", l4e_get_flags(l4e) & L4_DISALLOW_MASK); return -EINVAL; } rc = get_page_and_type_from_mfn( l4e_get_mfn(l4e), PGT_l3_page_table, d, flags | PTF_preemptible); if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, l4mfn, d) ) rc = 0; return rc; } #endif /* CONFIG_PV */ static int _put_page_type(struct page_info *page, unsigned int flags, struct page_info *ptpg); void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner) { unsigned long pfn = l1e_get_pfn(l1e); struct page_info *page; struct domain *pg_owner; if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || is_iomem_page(_mfn(pfn)) ) return; page = mfn_to_page(_mfn(pfn)); pg_owner = page_get_owner(page); /* * Check if this is a mapping that was established via a grant reference. * If it was then we should not be here: we require that such mappings are * explicitly destroyed via the grant-table interface. * * The upshot of this is that the guest can end up with active grants that * it cannot destroy (because it no longer has a PTE to present to the * grant-table interface). This can lead to subtle hard-to-catch bugs, * hence a special grant PTE flag can be enabled to catch the bug early. * * (Note that the undestroyable active grants are not a security hole in * Xen. All active grants can safely be cleaned up when the domain dies.) * * NB: the preprocessor conditional is required in order to prevent clang's * -Wtautological-constant-compare complaining about converting the result * of a << into a bool is always true if it's evaluated directly in the if * condition. */ #if _PAGE_GNTTAB if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) && !l1e_owner->is_shutting_down && !l1e_owner->is_dying ) { gprintk(XENLOG_WARNING, "Attempt to implicitly unmap %pd's grant PTE %" PRIpte "\n", l1e_owner, l1e_get_intpte(l1e)); pv_inject_hw_exception(X86_EXC_GP, 0); } #endif /* * Remember we didn't take a type-count of foreign writable mappings * to paging-external domains. 
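 *
 * The condition below deliberately mirrors the "write" calculation in
 * get_page_from_l1e(): a writable type reference was only acquired when
 *
 *   (l1f & _PAGE_RW) &&
 *   ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner))
 *
 * held, so only in that same case is put_page_and_type() used here
 * rather than a plain put_page().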
*/ if ( (l1e_get_flags(l1e) & _PAGE_RW) && ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner)) ) put_page_and_type(page); else put_page(page); } #ifdef CONFIG_PV static int put_pt_page(struct page_info *pg, struct page_info *ptpg, unsigned int flags) { int rc = 0; if ( flags & PTF_defer ) { ASSERT(!(flags & PTF_partial_set)); current->arch.old_guest_ptpg = ptpg; current->arch.old_guest_table = pg; current->arch.old_guest_table_partial = false; } else { rc = _put_page_type(pg, flags | PTF_preemptible, ptpg); if ( likely(!rc) ) put_page(pg); } return rc; } static int put_data_pages(struct page_info *page, bool writeable, int pt_shift) { unsigned int i, count = 1 << (pt_shift - PAGE_SHIFT); ASSERT(!(mfn_x(page_to_mfn(page)) & (count - 1))); for ( i = 0; i < count ; i++, page++ ) if ( writeable ) put_page_and_type(page); else put_page(page); return 0; } /* * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. * Note also that this automatically deals correctly with linear p.t.'s. */ static int put_page_from_l2e(l2_pgentry_t l2e, mfn_t l2mfn, unsigned int flags) { if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || mfn_eq(l2e_get_mfn(l2e), l2mfn) ) return 1; if ( l2e_get_flags(l2e) & _PAGE_PSE ) return put_data_pages(l2e_get_page(l2e), l2e_get_flags(l2e) & _PAGE_RW, L2_PAGETABLE_SHIFT); return put_pt_page(l2e_get_page(l2e), mfn_to_page(l2mfn), flags); } static int put_page_from_l3e(l3_pgentry_t l3e, mfn_t l3mfn, unsigned int flags) { if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || mfn_eq(l3e_get_mfn(l3e), l3mfn) ) return 1; if ( unlikely(l3e_get_flags(l3e) & _PAGE_PSE) ) return put_data_pages(l3e_get_page(l3e), l3e_get_flags(l3e) & _PAGE_RW, L3_PAGETABLE_SHIFT); return put_pt_page(l3e_get_page(l3e), mfn_to_page(l3mfn), flags); } static int put_page_from_l4e(l4_pgentry_t l4e, mfn_t l4mfn, unsigned int flags) { if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) || mfn_eq(l4e_get_mfn(l4e), l4mfn) ) return 1; return put_pt_page(l4e_get_page(l4e), mfn_to_page(l4mfn), flags); } static int promote_l1_table(struct page_info *page) { struct domain *d = page_get_owner(page); l1_pgentry_t *pl1e; unsigned int i; int ret = 0; pl1e = __map_domain_page(page); for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) { if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) ) { ret = pv_l1tf_check_l1e(d, pl1e[i]) ? -EINTR : 0; if ( ret ) goto out; } else { switch ( ret = get_page_from_l1e(pl1e[i], d, d) ) { default: goto fail; case 0: break; case _PAGE_RW ... _PAGE_RW | PAGE_CACHE_ATTRS: ASSERT(!(ret & ~(_PAGE_RW | PAGE_CACHE_ATTRS))); l1e_flip_flags(pl1e[i], ret); break; } } pl1e[i] = adjust_guest_l1e(pl1e[i], d); } unmap_domain_page(pl1e); return 0; fail: gdprintk(XENLOG_WARNING, "Failure %d in promote_l1_table: slot %#x\n", ret, i); out: while ( i-- > 0 ) put_page_from_l1e(pl1e[i], d); unmap_domain_page(pl1e); return ret; } /* * Note: The checks performed by this function are just to enforce a * legacy restriction necessary on 32-bit hosts. There's not much point in * relaxing (dropping) this though, as 32-bit guests would still need to * conform to the original restrictions in order to be able to run on (old) * 32-bit Xen. */ static bool pae_xen_mappings_check(const struct domain *d, const l3_pgentry_t *pl3e) { /* * 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist, * which our caller has already verified. */ l3_pgentry_t l3e3 = pl3e[3]; const struct page_info *page = l3e_get_page(l3e3); /* * The Xen-private mappings include linear mappings. The L2 thus cannot * be shared by multiple L3 tables. 
The test here is adequate because: * 1. Cannot appear in slots != 3 because get_page_type() checks the * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3 * 2. Cannot appear in another page table's L3: * a. promote_l3_table() calls this function and this check will fail * b. mod_l3_entry() disallows updates to slot 3 in an existing table */ BUG_ON(page->u.inuse.type_info & PGT_pinned); BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2)); if ( (page->u.inuse.type_info & PGT_count_mask) != 1 ) { BUG_ON(!(page->u.inuse.type_info & PGT_count_mask)); gdprintk(XENLOG_WARNING, "PAE L3 3rd slot is shared\n"); return false; } return true; } static int promote_l2_table(struct page_info *page, unsigned long type) { struct domain *d = page_get_owner(page); mfn_t l2mfn = page_to_mfn(page); l2_pgentry_t *pl2e; unsigned int i; int rc = 0; unsigned int partial_flags = page->partial_flags; pl2e = map_domain_page(l2mfn); /* * NB that promote_l2_table will never set partial_pte on an l2; but * demote_l2_table might if a linear_pagetable entry is interrupted * partway through de-validation. In that circumstance, * get_page_from_l2e() will always return -EINVAL; and we must * retain the type ref by doing the normal partial_flags tracking. */ for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++, partial_flags = 0 ) { l2_pgentry_t l2e = pl2e[i]; if ( i > page->nr_validated_ptes && hypercall_preempt_check() ) rc = -EINTR; else if ( !is_guest_l2_slot(d, type, i) ) continue; else if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) { if ( !pv_l1tf_check_l2e(d, l2e) ) continue; rc = -EINTR; } else rc = get_page_from_l2e(l2e, l2mfn, d, partial_flags); /* * It shouldn't be possible for get_page_from_l2e to return * -ERESTART, since we never call this with PTF_preemptible. * (promote_l1_table may return -EINTR on an L1TF-vulnerable * entry.) * * NB that while on a "clean" promotion, we can never get * PGT_partial. It is possible to arrange for an l2e to * contain a partially-devalidated l2; but in that case, both * of the following functions will fail anyway (the first * because the page in question is not an l1; the second * because the page is not fully validated). */ ASSERT(rc != -ERESTART); if ( rc == -EINTR && i ) { page->nr_validated_ptes = i; page->partial_flags = partial_flags;; rc = -ERESTART; } else if ( rc < 0 && rc != -EINTR ) { gdprintk(XENLOG_WARNING, "Failure %d in promote_l2_table: slot %#x\n", rc, i); ASSERT(current->arch.old_guest_table == NULL); if ( i ) { /* * promote_l1_table() doesn't set old_guest_table; it does * its own tear-down immediately on failure. If it * did we'd need to check it and set partial_flags as we * do in alloc_l[34]_table(). * * Note on the use of ASSERT: if it's non-null and * hasn't been cleaned up yet, it should have * PGT_partial set; and so the type will be cleaned up * on domain destruction. Unfortunately, we would * leak the general ref held by old_guest_table; but * leaking a page is less bad than a host crash. 
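 *
 * (Big-picture reminder, inferred from the surrounding code rather than
 * stated here: recording the page in current->arch.old_guest_table,
 * together with old_guest_ptpg and old_guest_table_partial, is what lets
 * the remaining teardown be completed preemptibly when the hypercall is
 * continued, instead of having to finish it synchronously now.)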
*/ ASSERT(current->arch.old_guest_table == NULL); page->nr_validated_ptes = i; page->partial_flags = partial_flags; current->arch.old_guest_ptpg = NULL; current->arch.old_guest_table = page; current->arch.old_guest_table_partial = true; } } if ( rc < 0 ) break; pl2e[i] = adjust_guest_l2e(l2e, d); } if ( !rc && (type & PGT_pae_xen_l2) ) init_xen_pae_l2_slots(pl2e, d); unmap_domain_page(pl2e); return rc; } static int promote_l3_table(struct page_info *page) { struct domain *d = page_get_owner(page); mfn_t l3mfn = page_to_mfn(page); l3_pgentry_t *pl3e; unsigned int i; int rc = 0; unsigned int partial_flags = page->partial_flags; l3_pgentry_t l3e = l3e_empty(); pl3e = map_domain_page(l3mfn); /* * PAE guests allocate full pages, but aren't required to initialize * more than the first four entries; when running in compatibility * mode, however, the full page is visible to the MMU, and hence all * 512 entries must be valid/verified, which is most easily achieved * by clearing them out. */ if ( is_pv_32bit_domain(d) ) memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e)); for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; i++, partial_flags = 0 ) { l3e = pl3e[i]; if ( i > page->nr_validated_ptes && hypercall_preempt_check() ) rc = -EINTR; else if ( i == 3 && is_pv_32bit_domain(d) ) { if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_flags(l3e) & l3_disallow_mask(d)) ) rc = -EINVAL; else rc = get_page_and_type_from_mfn( l3e_get_mfn(l3e), PGT_l2_page_table | PGT_pae_xen_l2, d, partial_flags | PTF_preemptible | PTF_retain_ref_on_restart); if ( !rc ) { if ( pae_xen_mappings_check(d, pl3e) ) { pl3e[i] = adjust_guest_l3e(l3e, d); break; } rc = -EINVAL; } } else if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) { if ( !pv_l1tf_check_l3e(d, l3e) ) continue; rc = -EINTR; } else rc = get_page_from_l3e(l3e, l3mfn, d, partial_flags | PTF_retain_ref_on_restart); if ( rc == -ERESTART ) { page->nr_validated_ptes = i; /* Set 'set', leave 'general ref' set if this entry was set */ page->partial_flags = PTF_partial_set; } else if ( rc == -EINTR && i ) { page->nr_validated_ptes = i; page->partial_flags = partial_flags; rc = -ERESTART; } if ( rc < 0 ) break; pl3e[i] = adjust_guest_l3e(l3e, d); } if ( rc < 0 && rc != -ERESTART && rc != -EINTR ) { gdprintk(XENLOG_WARNING, "Failure %d in promote_l3_table: slot %#x\n", rc, i); if ( i ) { page->nr_validated_ptes = i; page->partial_flags = partial_flags; if ( current->arch.old_guest_table ) { /* * We've experienced a validation failure. If * old_guest_table is set, "transfer" the general * reference count to pl3e[nr_validated_ptes] by * setting PTF_partial_set. * * As a precaution, check that old_guest_table is the * page pointed to by pl3e[nr_validated_ptes]. If * not, it's safer to leak a type ref on production * builds. */ if ( current->arch.old_guest_table == l3e_get_page(l3e) ) { ASSERT(current->arch.old_guest_table_partial); page->partial_flags = PTF_partial_set; } else ASSERT_UNREACHABLE(); } current->arch.old_guest_ptpg = NULL; current->arch.old_guest_table = page; current->arch.old_guest_table_partial = true; } while ( i-- > 0 ) pl3e[i] = unadjust_guest_l3e(pl3e[i], d); } unmap_domain_page(pl3e); return rc; } #endif /* CONFIG_PV */ /* * Fill an L4 with Xen entries. * * This function must write all ROOT_PAGETABLE_PV_XEN_SLOTS, to clobber any * values a guest may have left there from promote_l4_table(). * * l4t, l4mfn, and d are mandatory, but l4mfn doesn't need to be the mfn under * *l4t. 
All other parameters are optional and will either fill or zero the * appropriate slots. Pagetables not shared with guests will gain the * extended directmap. */ void init_xen_l4_slots(l4_pgentry_t *l4t, mfn_t l4mfn, const struct domain *d, mfn_t sl4mfn, bool ro_mpt) { /* * PV vcpus need a shortened directmap. HVM and Idle vcpus get the full * directmap. */ bool short_directmap = !paging_mode_external(d); /* Slot 256: RO M2P (if applicable). */ l4t[l4_table_offset(RO_MPT_VIRT_START)] = ro_mpt ? idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)] : l4e_empty(); /* Slot 257: PCI MMCFG. */ l4t[l4_table_offset(PCI_MCFG_VIRT_START)] = idle_pg_table[l4_table_offset(PCI_MCFG_VIRT_START)]; /* Slot 258: Self linear mappings. */ ASSERT(!mfn_eq(l4mfn, INVALID_MFN)); l4t[l4_table_offset(LINEAR_PT_VIRT_START)] = l4e_from_mfn(l4mfn, __PAGE_HYPERVISOR_RW); /* Slot 259: Shadow linear mappings (if applicable) .*/ l4t[l4_table_offset(SH_LINEAR_PT_VIRT_START)] = mfn_eq(sl4mfn, INVALID_MFN) ? l4e_empty() : l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR_RW); /* Slot 260: Per-domain mappings. */ l4t[l4_table_offset(PERDOMAIN_VIRT_START)] = l4e_from_page(d->arch.perdomain_l3_pg, __PAGE_HYPERVISOR_RW); /* Slot 4: Per-domain mappings mirror. */ BUILD_BUG_ON(IS_ENABLED(CONFIG_PV32) && !l4_table_offset(PERDOMAIN_ALT_VIRT_START)); if ( !is_pv_64bit_domain(d) ) l4t[l4_table_offset(PERDOMAIN_ALT_VIRT_START)] = l4t[l4_table_offset(PERDOMAIN_VIRT_START)]; /* Slot 261-: text/data/bss, RW M2P, vmap, frametable, directmap. */ #ifndef NDEBUG if ( short_directmap && unlikely(root_pgt_pv_xen_slots < ROOT_PAGETABLE_PV_XEN_SLOTS) ) { /* * If using highmem-start=, artificially shorten the directmap to * simulate very large machines. */ l4_pgentry_t *next; memcpy(&l4t[l4_table_offset(XEN_VIRT_START)], &idle_pg_table[l4_table_offset(XEN_VIRT_START)], (ROOT_PAGETABLE_FIRST_XEN_SLOT + root_pgt_pv_xen_slots - l4_table_offset(XEN_VIRT_START)) * sizeof(*l4t)); next = &l4t[ROOT_PAGETABLE_FIRST_XEN_SLOT + root_pgt_pv_xen_slots]; if ( l4e_get_intpte(split_l4e) ) *next++ = split_l4e; memset(next, 0, _p(&l4t[ROOT_PAGETABLE_LAST_XEN_SLOT + 1]) - _p(next)); } else #endif { unsigned int slots = (short_directmap ? 
ROOT_PAGETABLE_PV_XEN_SLOTS : ROOT_PAGETABLE_XEN_SLOTS); memcpy(&l4t[l4_table_offset(XEN_VIRT_START)], &idle_pg_table[l4_table_offset(XEN_VIRT_START)], (ROOT_PAGETABLE_FIRST_XEN_SLOT + slots - l4_table_offset(XEN_VIRT_START)) * sizeof(*l4t)); } } bool fill_ro_mpt(mfn_t mfn) { l4_pgentry_t *l4tab = map_domain_page(mfn); bool ret = false; if ( !l4e_get_intpte(l4tab[l4_table_offset(RO_MPT_VIRT_START)]) ) { l4tab[l4_table_offset(RO_MPT_VIRT_START)] = idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]; ret = true; } unmap_domain_page(l4tab); return ret; } void zap_ro_mpt(mfn_t mfn) { l4_pgentry_t *l4tab = map_domain_page(mfn); l4tab[l4_table_offset(RO_MPT_VIRT_START)] = l4e_empty(); unmap_domain_page(l4tab); } #ifdef CONFIG_PV static int promote_l4_table(struct page_info *page) { struct domain *d = page_get_owner(page); mfn_t l4mfn = page_to_mfn(page); l4_pgentry_t *pl4e = map_domain_page(l4mfn); unsigned int i; int rc = 0; unsigned int partial_flags = page->partial_flags; for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; i++, partial_flags = 0 ) { l4_pgentry_t l4e; if ( !is_guest_l4_slot(d, i) ) continue; l4e = pl4e[i]; if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) { if ( !pv_l1tf_check_l4e(d, l4e) ) continue; rc = -EINTR; } else rc = get_page_from_l4e(l4e, l4mfn, d, partial_flags | PTF_retain_ref_on_restart); if ( rc == -ERESTART ) { page->nr_validated_ptes = i; /* Set 'set', leave 'general ref' set if this entry was set */ page->partial_flags = PTF_partial_set; } else if ( rc < 0 ) { if ( rc != -EINTR ) gdprintk(XENLOG_WARNING, "Failure %d in promote_l4_table: slot %#x\n", rc, i); if ( i ) { page->nr_validated_ptes = i; page->partial_flags = partial_flags; if ( rc == -EINTR ) rc = -ERESTART; else { if ( current->arch.old_guest_table ) { /* * We've experienced a validation failure. If * old_guest_table is set, "transfer" the general * reference count to pl3e[nr_validated_ptes] by * setting PTF_partial_set. * * As a precaution, check that old_guest_table is the * page pointed to by pl4e[nr_validated_ptes]. If * not, it's safer to leak a type ref on production * builds. 
*/ if ( current->arch.old_guest_table == l4e_get_page(l4e) ) { ASSERT(current->arch.old_guest_table_partial); page->partial_flags = PTF_partial_set; } else ASSERT_UNREACHABLE(); } current->arch.old_guest_ptpg = NULL; current->arch.old_guest_table = page; current->arch.old_guest_table_partial = true; } } } if ( rc < 0 ) break; pl4e[i] = adjust_guest_l4e(l4e, d); } if ( !rc ) { init_xen_l4_slots(pl4e, l4mfn, d, INVALID_MFN, VM_ASSIST(d, m2p_strict)); atomic_inc(&d->arch.pv.nr_l4_pages); } unmap_domain_page(pl4e); return rc; } static void demote_l1_table(struct page_info *page) { struct domain *d = page_get_owner(page); l1_pgentry_t *pl1e; unsigned int i; pl1e = __map_domain_page(page); for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) put_page_from_l1e(pl1e[i], d); unmap_domain_page(pl1e); } static int demote_l2_table(struct page_info *page) { struct domain *d = page_get_owner(page); mfn_t l2mfn = page_to_mfn(page); l2_pgentry_t *pl2e; int rc = 0; unsigned int partial_flags = page->partial_flags, i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set); pl2e = map_domain_page(l2mfn); for ( ; ; ) { if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) ) rc = put_page_from_l2e(pl2e[i], l2mfn, partial_flags); if ( rc < 0 ) break; partial_flags = 0; if ( !i-- ) break; if ( hypercall_preempt_check() ) { rc = -EINTR; break; } } unmap_domain_page(pl2e); if ( rc >= 0 ) { page->u.inuse.type_info &= ~PGT_pae_xen_l2; rc = 0; } else if ( rc == -ERESTART ) { page->nr_validated_ptes = i; page->partial_flags = PTF_partial_set; } else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 ) { page->nr_validated_ptes = i + !(partial_flags & PTF_partial_set); page->partial_flags = partial_flags; rc = -ERESTART; } return rc; } static int demote_l3_table(struct page_info *page) { struct domain *d = page_get_owner(page); mfn_t l3mfn = page_to_mfn(page); l3_pgentry_t *pl3e; int rc = 0; unsigned int partial_flags = page->partial_flags, i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set); pl3e = map_domain_page(l3mfn); for ( ; ; ) { rc = put_page_from_l3e(pl3e[i], l3mfn, partial_flags); if ( rc < 0 ) break; partial_flags = 0; if ( rc == 0 ) pl3e[i] = unadjust_guest_l3e(pl3e[i], d); if ( !i-- ) break; if ( hypercall_preempt_check() ) { rc = -EINTR; break; } } unmap_domain_page(pl3e); if ( rc == -ERESTART ) { page->nr_validated_ptes = i; page->partial_flags = PTF_partial_set; } else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 ) { page->nr_validated_ptes = i + !(partial_flags & PTF_partial_set); page->partial_flags = partial_flags; rc = -ERESTART; } return rc > 0 ? 
0 : rc; } static int demote_l4_table(struct page_info *page) { struct domain *d = page_get_owner(page); mfn_t l4mfn = page_to_mfn(page); l4_pgentry_t *pl4e = map_domain_page(l4mfn); int rc = 0; unsigned partial_flags = page->partial_flags, i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set); do { if ( is_guest_l4_slot(d, i) ) rc = put_page_from_l4e(pl4e[i], l4mfn, partial_flags); if ( rc < 0 ) break; partial_flags = 0; } while ( i-- ); if ( rc == -ERESTART ) { page->nr_validated_ptes = i; page->partial_flags = PTF_partial_set; } else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 ) { page->nr_validated_ptes = i + !(partial_flags & PTF_partial_set); page->partial_flags = partial_flags; rc = -ERESTART; } unmap_domain_page(pl4e); if ( rc >= 0 ) { atomic_dec(&d->arch.pv.nr_l4_pages); rc = 0; } return rc; } #endif /* CONFIG_PV */ #ifndef NDEBUG /* * We must never call _put_page_type() while holding a page_lock() for * that page; doing so may cause a deadlock under the right * conditions. * * Furthermore, there is no discipline for the order in which page locks * are grabbed; if there are any paths that grab the locks for two * different pages at once, we risk creating the conditions for a deadlock * to occur. * * These are believed to be safe, because it is believed that: * 1. No hypervisor paths ever lock two pages at once, and * 2. We never call _put_page_type() on a page while holding its page lock. * * Add a check to debug builds to catch any violations of these assumptions. * * NB that if we find valid, safe reasons to hold two page locks at * once, these checks will need to be adjusted. */ static DEFINE_PER_CPU(struct page_info *, current_locked_page); static inline void current_locked_page_set(struct page_info *page) { this_cpu(current_locked_page) = page; } static inline bool current_locked_page_check(struct page_info *page) { return this_cpu(current_locked_page) == page; } /* * We need a separate "not-equal" check so the non-debug stubs can * always return true. */ static inline bool current_locked_page_ne_check(struct page_info *page) { return this_cpu(current_locked_page) != page; } #else #define current_locked_page_set(x) #define current_locked_page_check(x) true #define current_locked_page_ne_check(x) true #endif int page_lock(struct page_info *page) { unsigned long x, nx; ASSERT(current_locked_page_check(NULL)); do { while ( (x = page->u.inuse.type_info) & PGT_locked ) cpu_relax(); nx = x + (1 | PGT_locked); if ( !(x & PGT_validated) || !(x & PGT_count_mask) || !(nx & PGT_count_mask) ) return 0; } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x ); current_locked_page_set(page); return 1; } void page_unlock(struct page_info *page) { unsigned long x, nx, y = page->u.inuse.type_info; ASSERT(current_locked_page_check(page)); do { x = y; ASSERT((x & PGT_count_mask) && (x & PGT_locked)); nx = x - (1 | PGT_locked); /* We must not drop the last reference here. */ ASSERT(nx & PGT_count_mask); } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x ); current_locked_page_set(NULL); } /* * L3 table locks: * * Used for serialization in map_pages_to_xen() and modify_xen_mappings(). * * For Xen PT pages, the page->u.inuse.type_info is unused and it is safe to * reuse the PGT_locked flag. This lock is taken only when we move down to L3 * tables and below, since L4 (and above, for 5-level paging) is still globally * protected by map_pgdir_lock. * * PV MMU update hypercalls call map_pages_to_xen while holding a page's page_lock(). 
* This has two implications: * - We cannot reuse reuse current_locked_page_* for debugging * - To avoid the chance of deadlock, even for different pages, we * must never grab page_lock() after grabbing l3t_lock(). This * includes any page_lock()-based locks, such as * mem_sharing_page_lock(). * * Also note that we grab the map_pgdir_lock while holding the * l3t_lock(), so to avoid deadlock we must avoid grabbing them in * reverse order. */ static void l3t_lock(struct page_info *page) { unsigned long x, nx; do { while ( (x = page->u.inuse.type_info) & PGT_locked ) cpu_relax(); nx = x | PGT_locked; } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x ); } static void l3t_unlock(struct page_info *page) { unsigned long x, nx, y = page->u.inuse.type_info; do { x = y; BUG_ON(!(x & PGT_locked)); nx = x & ~PGT_locked; } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x ); } #ifdef CONFIG_PV /* * PTE flags that a guest may change without re-validating the PTE. * All other bits affect translation, caching, or Xen's safety. */ #define FASTPATH_FLAG_WHITELIST \ (_PAGE_NX_BIT | _PAGE_AVAIL_HIGH | _PAGE_AVAIL | _PAGE_GLOBAL | \ _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER) /* * PDE flags that a guest may change without re-validating the PDE. * All other bits affect translation, caching, or Xen's safety. When guest * created linear page tables aren't allowed, intermediate page tables may * have _PAGE_RW altered without this requiring re-validation. */ #ifndef CONFIG_PV_LINEAR_PT # define FASTPATH_PDE_FLAG_WHITELIST (FASTPATH_FLAG_WHITELIST | _PAGE_RW) #else # define FASTPATH_PDE_FLAG_WHITELIST FASTPATH_FLAG_WHITELIST #endif /* Update the L1 entry at pl1e to new value nl1e. */ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e, mfn_t gl1mfn, unsigned int cmd, struct vcpu *pt_vcpu, struct domain *pg_dom) { bool preserve_ad = (cmd == MMU_PT_UPDATE_PRESERVE_AD); l1_pgentry_t ol1e = l1e_read_atomic(pl1e); struct domain *pt_dom = pt_vcpu->domain; int rc = 0; ASSERT(!paging_mode_refcounts(pt_dom)); if ( l1e_get_flags(nl1e) & _PAGE_PRESENT ) { struct page_info *page = NULL; if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom)) ) { gdprintk(XENLOG_WARNING, "Bad L1 flags %x\n", l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom)); return -EINVAL; } /* Translate foreign guest address. */ if ( cmd != MMU_PT_UPDATE_NO_TRANSLATE && paging_mode_translate(pg_dom) ) { p2m_type_t p2mt; gfn_t gfn = _gfn(l1e_get_pfn(nl1e)); p2m_query_t q = l1e_get_flags(nl1e) & _PAGE_RW ? P2M_ALLOC | P2M_UNSHARE : P2M_ALLOC; page = get_page_from_gfn(pg_dom, gfn_x(gfn), &p2mt, q); if ( p2m_is_paged(p2mt) ) { if ( page ) put_page(page); p2m_mem_paging_populate(pg_dom, gfn); return -ENOENT; } if ( p2mt == p2m_ram_paging_in && !page ) return -ENOENT; /* Did our attempt to unshare fail? */ if ( (q & P2M_UNSHARE) && p2m_is_shared(p2mt) ) { /* We could not have obtained a page ref. */ ASSERT(!page); /* And mem_sharing_notify has already been called. */ return -ENOMEM; } if ( !page ) return -EINVAL; nl1e = l1e_from_page(page, l1e_get_flags(nl1e)); } nl1e = adjust_guest_l1e(nl1e, pt_dom); /* Fast path for sufficiently-similar mappings. */ if ( !l1e_has_changed(ol1e, nl1e, ~FASTPATH_FLAG_WHITELIST) ) { rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu, preserve_ad); if ( page ) put_page(page); return rc ? 0 : -EBUSY; } switch ( rc = get_page_from_l1e(nl1e, pt_dom, pg_dom) ) { default: if ( page ) put_page(page); return rc; case 0: break; case _PAGE_RW ... 
_PAGE_RW | PAGE_CACHE_ATTRS: ASSERT(!(rc & ~(_PAGE_RW | PAGE_CACHE_ATTRS))); l1e_flip_flags(nl1e, rc); rc = 0; break; } if ( page ) put_page(page); if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu, preserve_ad)) ) { ol1e = nl1e; rc = -EBUSY; } } else if ( pv_l1tf_check_l1e(pt_dom, nl1e) ) return -ERESTART; else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu, preserve_ad)) ) { return -EBUSY; } put_page_from_l1e(ol1e, pt_dom); return rc; } /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame mfn. */ static int mod_l2_entry(l2_pgentry_t *pl2e, l2_pgentry_t nl2e, mfn_t mfn, int preserve_ad, struct vcpu *vcpu) { l2_pgentry_t ol2e; struct domain *d = vcpu->domain; struct page_info *l2pg = mfn_to_page(mfn); unsigned long type = l2pg->u.inuse.type_info; int rc = 0; if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) ) { gdprintk(XENLOG_WARNING, "L2 update in Xen-private area, slot %#lx\n", pgentry_ptr_to_slot(pl2e)); return -EPERM; } ol2e = l2e_read_atomic(pl2e); if ( l2e_get_flags(nl2e) & _PAGE_PRESENT ) { if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) ) { gdprintk(XENLOG_WARNING, "Bad L2 flags %x\n", l2e_get_flags(nl2e) & L2_DISALLOW_MASK); return -EINVAL; } nl2e = adjust_guest_l2e(nl2e, d); /* Fast path for sufficiently-similar mappings. */ if ( !l2e_has_changed(ol2e, nl2e, ~FASTPATH_PDE_FLAG_WHITELIST) ) { if ( UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, mfn, vcpu, preserve_ad) ) return 0; return -EBUSY; } if ( unlikely((rc = get_page_from_l2e(nl2e, mfn, d, 0)) < 0) ) return rc; if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, mfn, vcpu, preserve_ad)) ) { ol2e = nl2e; rc = -EBUSY; } } else if ( pv_l1tf_check_l2e(d, nl2e) ) return -ERESTART; else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, mfn, vcpu, preserve_ad)) ) { return -EBUSY; } put_page_from_l2e(ol2e, mfn, PTF_defer); return rc; } /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame mfn. */ static int mod_l3_entry(l3_pgentry_t *pl3e, l3_pgentry_t nl3e, mfn_t mfn, int preserve_ad, struct vcpu *vcpu) { l3_pgentry_t ol3e; struct domain *d = vcpu->domain; int rc = 0; /* * Disallow updates to final L3 slot. It contains Xen mappings, and it * would be a pain to ensure they remain continuously valid throughout. */ if ( pgentry_ptr_to_slot(pl3e) >= 3 && is_pv_32bit_domain(d) ) return -EINVAL; ol3e = l3e_read_atomic(pl3e); if ( l3e_get_flags(nl3e) & _PAGE_PRESENT ) { if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) ) { gdprintk(XENLOG_WARNING, "Bad L3 flags %x\n", l3e_get_flags(nl3e) & l3_disallow_mask(d)); return -EINVAL; } nl3e = adjust_guest_l3e(nl3e, d); /* Fast path for sufficiently-similar mappings. */ if ( !l3e_has_changed(ol3e, nl3e, ~FASTPATH_PDE_FLAG_WHITELIST) ) { rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, mfn, vcpu, preserve_ad); return rc ? 0 : -EFAULT; } rc = get_page_from_l3e(nl3e, mfn, d, 0); if ( unlikely(rc < 0) ) return rc; rc = 0; if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, mfn, vcpu, preserve_ad)) ) { ol3e = nl3e; rc = -EFAULT; } } else if ( pv_l1tf_check_l3e(d, nl3e) ) return -ERESTART; else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, mfn, vcpu, preserve_ad)) ) { return -EFAULT; } put_page_from_l3e(ol3e, mfn, PTF_defer); return rc; } /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame mfn. 
*/ static int mod_l4_entry(l4_pgentry_t *pl4e, l4_pgentry_t nl4e, mfn_t mfn, int preserve_ad, struct vcpu *vcpu) { struct domain *d = vcpu->domain; l4_pgentry_t ol4e; int rc = 0; if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) ) { gdprintk(XENLOG_WARNING, "L4 update in Xen-private area, slot %#lx\n", pgentry_ptr_to_slot(pl4e)); return -EINVAL; } ol4e = l4e_read_atomic(pl4e); if ( l4e_get_flags(nl4e) & _PAGE_PRESENT ) { if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) ) { gdprintk(XENLOG_WARNING, "Bad L4 flags %x\n", l4e_get_flags(nl4e) & L4_DISALLOW_MASK); return -EINVAL; } nl4e = adjust_guest_l4e(nl4e, d); /* Fast path for sufficiently-similar mappings. */ if ( !l4e_has_changed(ol4e, nl4e, ~FASTPATH_PDE_FLAG_WHITELIST) ) { rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, mfn, vcpu, preserve_ad); return rc ? 0 : -EFAULT; } rc = get_page_from_l4e(nl4e, mfn, d, 0); if ( unlikely(rc < 0) ) return rc; rc = 0; if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, mfn, vcpu, preserve_ad)) ) { ol4e = nl4e; rc = -EFAULT; } } else if ( pv_l1tf_check_l4e(d, nl4e) ) return -ERESTART; else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, mfn, vcpu, preserve_ad)) ) { return -EFAULT; } put_page_from_l4e(ol4e, mfn, PTF_defer); return rc; } #endif /* CONFIG_PV */ /* * In the course of a page's use, it may have caused other secondary * mappings to have changed: * - Xen's mappings may have been changed to accomodate the requested * cache attibutes * - A page may have been put into the IOMMU of a PV guest when it * gained a writable mapping. * * Now that the page is being freed, clean up these mappings if * appropriate. NB that at this point the page is still "allocated", * but not "live" (i.e., its refcount is 0), so it's safe to read the * count_info, owner, and type_info without synchronization. */ static int cleanup_page_mappings(struct page_info *page) { int rc = 0; unsigned long mfn = mfn_x(page_to_mfn(page)); /* * If this may be in a PV domain's IOMMU, remove it. * * NB that writable xenheap pages have their type set and cleared by * implementation-specific code, rather than by get_page_type(). As such: * - They aren't expected to have an IOMMU mapping, and * - We don't necessarily expect the type count to be zero when the final * put_page happens. * * Go ahead and attemp to call iommu_unmap() on xenheap pages anyway, just * in case; but only ASSERT() that the type count is zero and remove the * PGT_writable type for non-xenheap pages. */ if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page ) { struct domain *d = page_get_owner(page); if ( d && unlikely(need_iommu_pt_sync(d)) && is_pv_domain(d) ) rc = iommu_legacy_unmap(d, _dfn(mfn), 1u << PAGE_ORDER_4K); if ( likely(!is_special_page(page)) ) { ASSERT((page->u.inuse.type_info & (PGT_type_mask | PGT_count_mask)) == PGT_writable_page); /* * Clear the type to record the fact that all writable mappings * have been removed. But if either operation failed, leave * type_info alone. */ if ( likely(!rc) ) page->u.inuse.type_info &= ~(PGT_type_mask | PGT_count_mask); } } /* * Flush the cache if there were previously non-coherent writeable * mappings of this page. This forces the page to be coherent before it * is freed back to the heap. 
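 *
 * _PGT_non_coherent gets set when the frame gained a writable mapping
 * with non-coherent cache attributes.  It is consumed in two places:
 * here, when the frame is freed, and in _get_page_type(), when the
 * frame is re-typed as anything other than a writable page; in both
 * cases a map/cache_flush()/unmap sequence makes the contents coherent
 * before they are relied upon again.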
*/ if ( __test_and_clear_bit(_PGT_non_coherent, &page->u.inuse.type_info) ) { void *addr = __map_domain_page(page); cache_flush(addr, PAGE_SIZE); unmap_domain_page(addr); } return rc; } void put_page(struct page_info *page) { unsigned long nx, x, y = page->count_info; do { ASSERT((y & PGC_count_mask) != 0); x = y; nx = x - 1; } while ( unlikely((y = cmpxchg(&page->count_info, x, nx)) != x) ); if ( unlikely((nx & PGC_count_mask) == 0) ) { if ( !cleanup_page_mappings(page) ) free_domheap_page(page); else gdprintk(XENLOG_WARNING, "Leaking mfn %" PRI_mfn "\n", mfn_x(page_to_mfn(page))); } } struct domain *page_get_owner_and_reference(struct page_info *page) { unsigned long x, y = page->count_info; struct domain *owner; do { x = y; /* * Count == 0: Page is not allocated, so we cannot take a reference. * Count == -1: Reference count would wrap, which is invalid. * Count == -2: Remaining unused ref is reserved for get_page_light(). */ if ( unlikely(((x + 2) & PGC_count_mask) <= 2) ) return NULL; } while ( (y = cmpxchg(&page->count_info, x, x + 1)) != x ); owner = page_get_owner(page); ASSERT(owner); return owner; } bool get_page(struct page_info *page, const struct domain *domain) { const struct domain *owner = page_get_owner_and_reference(page); if ( likely(owner == domain) ) return true; if ( !paging_mode_refcounts(domain) && !domain->is_dying ) gprintk(XENLOG_INFO, "Error mfn %"PRI_mfn": rd=%pd od=%pd caf=%08lx taf=%"PRtype_info"\n", mfn_x(page_to_mfn(page)), domain, owner, page->count_info - !!owner, page->u.inuse.type_info); if ( owner ) put_page(page); return false; } /* * Special version of get_page() to be used exclusively when * - a page is known to already have a non-zero reference count * - the page does not need its owner to be checked * - it will not be called more than once without dropping the thus * acquired reference again. * Due to get_page() reserving one reference, this call cannot fail. * * Note that some callers rely on this being a full memory barrier. */ static void get_page_light(struct page_info *page) { unsigned long x, nx, y = page->count_info; do { x = y; nx = x + 1; BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */ BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */ y = cmpxchg(&page->count_info, x, nx); } while ( unlikely(y != x) ); } static int validate_page(struct page_info *page, unsigned long type, int preemptible) { #ifdef CONFIG_PV struct domain *owner = page_get_owner(page); int rc; /* A page table is dirtied when its type count becomes non-zero. */ if ( likely(owner != NULL) ) paging_mark_dirty(owner, page_to_mfn(page)); switch ( type & PGT_type_mask ) { case PGT_l1_page_table: rc = promote_l1_table(page); break; case PGT_l2_page_table: ASSERT(preemptible); rc = promote_l2_table(page, type); break; case PGT_l3_page_table: ASSERT(preemptible); rc = promote_l3_table(page); break; case PGT_l4_page_table: ASSERT(preemptible); rc = promote_l4_table(page); break; case PGT_seg_desc_page: rc = validate_segdesc_page(page); break; default: printk("Bad type in validate_page %lx t=%" PRtype_info " c=%lx\n", type, page->u.inuse.type_info, page->count_info); rc = -EINVAL; BUG(); } /* No need for atomic update of type_info here: noone else updates it. 
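 *
 * (The caller holds the only, not-yet-validated type reference, so
 * nothing else can be modifying type_info concurrently.)  The switch
 * below records the outcome of the promotion above:
 *   0         - set PGT_validated; the type reference is now usable.
 *   -EINTR    - the operation must restart from scratch, so drop the
 *               type count and leave the page untyped.
 *   -ERESTART - validation is partially done: keep the reference, set
 *               PGT_partial and take an extra general reference via
 *               get_page_light() so the frame can't go away while the
 *               continuation is pending.
 *   other     - hard failure: log it and reset type_info, unless the
 *               page is already queued as old_guest_table, in which
 *               case it is treated like -ERESTART instead.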
*/ smp_wmb(); switch ( rc ) { case 0: page->u.inuse.type_info |= PGT_validated; break; case -EINTR: ASSERT((page->u.inuse.type_info & (PGT_count_mask|PGT_validated|PGT_partial)) == 1); page->u.inuse.type_info &= ~PGT_count_mask; break; default: ASSERT(rc < 0); gdprintk(XENLOG_WARNING, "Error while validating mfn %" PRI_mfn " (pfn %" PRI_pfn ") for type %" PRtype_info ": caf=%08lx taf=%" PRtype_info "\n", mfn_x(page_to_mfn(page)), get_gpfn_from_mfn(mfn_x(page_to_mfn(page))), type, page->count_info, page->u.inuse.type_info); if ( page != current->arch.old_guest_table ) page->u.inuse.type_info = 0; else { ASSERT((page->u.inuse.type_info & (PGT_count_mask | PGT_validated)) == 1); case -ERESTART: get_page_light(page); page->u.inuse.type_info |= PGT_partial; } break; } return rc; #else ASSERT_UNREACHABLE(); return -EINVAL; #endif } int devalidate_page(struct page_info *page, unsigned long type, int preemptible) { #ifdef CONFIG_PV struct domain *owner = page_get_owner(page); int rc; if ( likely(owner != NULL) && unlikely(paging_mode_enabled(owner)) ) { /* A page table is dirtied when its type count becomes zero. */ paging_mark_dirty(owner, page_to_mfn(page)); ASSERT(shadow_mode_enabled(owner)); ASSERT(!paging_mode_refcounts(owner)); ASSERT(!paging_mode_translate(owner)); shadow_remove_all_shadows(owner, page_to_mfn(page)); } if ( !(type & PGT_partial) ) { page->nr_validated_ptes = 1U << PAGETABLE_ORDER; page->partial_flags = 0; } switch ( type & PGT_type_mask ) { case PGT_l1_page_table: demote_l1_table(page); rc = 0; break; case PGT_l2_page_table: ASSERT(preemptible); rc = demote_l2_table(page); break; case PGT_l3_page_table: ASSERT(preemptible); rc = demote_l3_table(page); break; case PGT_l4_page_table: ASSERT(preemptible); rc = demote_l4_table(page); break; default: gdprintk(XENLOG_WARNING, "type %" PRtype_info " mfn %" PRI_mfn "\n", type, mfn_x(page_to_mfn(page))); rc = -EINVAL; BUG(); } return rc; #else ASSERT_UNREACHABLE(); return -EINVAL; #endif } static int _put_final_page_type(struct page_info *page, unsigned long type, bool preemptible, struct page_info *ptpg) { int rc = devalidate_page(page, type, preemptible); if ( ptpg && PGT_type_equal(type, ptpg->u.inuse.type_info) && (type & PGT_validated) && rc != -EINTR ) { /* Any time we begin de-validation of a page, adjust linear counts */ dec_linear_uses(page); dec_linear_entries(ptpg); } /* No need for atomic update of type_info here: noone else updates it. */ if ( rc == 0 ) { ASSERT(!page->linear_pt_count || page_get_owner(page)->is_dying); set_tlbflush_timestamp(page); smp_wmb(); page->u.inuse.type_info--; } else if ( rc == -EINTR ) { ASSERT((page->u.inuse.type_info & (PGT_count_mask|PGT_validated|PGT_partial)) == 1); smp_wmb(); page->u.inuse.type_info |= PGT_validated; } else { BUG_ON(rc != -ERESTART); /* get_page_light() includes a full barrier. */ get_page_light(page); page->u.inuse.type_info |= PGT_partial; } return rc; } static int _put_page_type(struct page_info *page, unsigned int flags, struct page_info *ptpg) { unsigned long nx, x, y = page->u.inuse.type_info; bool preemptible = flags & PTF_preemptible; ASSERT(current_locked_page_ne_check(page)); for ( ; ; ) { x = y; nx = x - 1; /* * Is this expected to do a full reference drop, or only * cleanup partial validation / devalidation? * * If the former, the caller must hold a "full" type ref; * which means the page must be validated. If the page is * *not* fully validated, continuing would almost certainly * open up a security hole. 
An exception to this is during * domain destruction, where PGT_validated can be dropped * without dropping a type ref. * * If the latter, do nothing unless type PGT_partial is set. * If it is set, the type count must be 1. */ if ( !(flags & PTF_partial_set) ) BUG_ON((x & PGT_partial) || !((x & PGT_validated) || page_get_owner(page)->is_dying)); else if ( !(x & PGT_partial) ) return 0; else BUG_ON((x & PGT_count_mask) != 1); ASSERT((x & PGT_count_mask) != 0); switch ( nx & (PGT_locked | PGT_count_mask) ) { case 0: if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) && likely(nx & (PGT_validated|PGT_partial)) ) { int rc; /* * Page-table pages must be unvalidated when count is zero. The * 'free' is safe because the refcnt is non-zero and validated * bit is clear => other ops will spin or fail. */ nx = x & ~(PGT_validated|PGT_partial); if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ) break; /* We cleared the 'valid bit' so we do the clean up. */ rc = _put_final_page_type(page, x, preemptible, ptpg); if ( x & PGT_partial ) put_page(page); return rc; } if ( !ptpg || !PGT_type_equal(x, ptpg->u.inuse.type_info) ) { /* * set_tlbflush_timestamp() accesses the same union * linear_pt_count lives in. Pages (including page table ones), * however, don't need their flush time stamp set except when * the last reference is being dropped. For page table pages * this happens in _put_final_page_type(). */ set_tlbflush_timestamp(page); } else BUG_ON(!IS_ENABLED(CONFIG_PV_LINEAR_PT)); /* fall through */ default: if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ) break; if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) ) { dec_linear_uses(page); dec_linear_entries(ptpg); } return 0; case PGT_locked: ASSERT_UNREACHABLE(); return -EILSEQ; case PGT_locked | 1: /* * We must not drop the second to last reference when the page is * locked, as page_unlock() doesn't do any cleanup of the type. */ cpu_relax(); y = page->u.inuse.type_info; break; } if ( preemptible && hypercall_preempt_check() ) return -EINTR; } } static int _get_page_type(struct page_info *page, unsigned long type, bool preemptible) { unsigned long nx, x; int rc = 0; ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2))); ASSERT(!in_irq()); for ( unsigned long y = ACCESS_ONCE(page->u.inuse.type_info); ; ) { x = y; nx = x + 1; if ( unlikely((nx & PGT_count_mask) == 0) ) { gdprintk(XENLOG_WARNING, "Type count overflow on mfn %"PRI_mfn"\n", mfn_x(page_to_mfn(page))); return -EINVAL; } if ( unlikely((x & PGT_count_mask) == 0) ) { /* * Typeref 0 -> 1. * * Type changes are permitted when the typeref is 0. If the type * actually changes, the page needs re-validating. */ ASSERT(!(x & PGT_pae_xen_l2)); if ( (x & PGT_type_mask) != type ) { nx &= ~(PGT_type_mask | PGT_validated); nx |= type; } } else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) ) { /* * else, we're trying to take a new reference, of the wrong type. * * This (being able to prohibit use of the wrong type) is what the * typeref system exists for, but skip printing the failure if it * looks like a recursive mapping, as subsequent logic might * ultimately permit the attempt. 
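 *
 * Example of such a recursive mapping: a PV guest writes an L2e whose
 * target is a frame already validated as an L2 page table (a linear
 * page table, e.g. a table mapping itself).  The resulting request is
 * for PGT_l1_page_table on a frame whose current type is
 * PGT_l2_page_table, which the first check below fails quietly with
 * -EINVAL, leaving the CONFIG_PV_LINEAR_PT logic in the caller to
 * decide whether the mapping is acceptable after all.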
*/ if ( ((x & PGT_type_mask) == PGT_l2_page_table) && (type == PGT_l1_page_table) ) return -EINVAL; if ( ((x & PGT_type_mask) == PGT_l3_page_table) && (type == PGT_l2_page_table) ) return -EINVAL; if ( ((x & PGT_type_mask) == PGT_l4_page_table) && (type == PGT_l3_page_table) ) return -EINVAL; gdprintk(XENLOG_WARNING, "Bad type (saw %" PRtype_info " != exp %" PRtype_info ") " "for mfn %" PRI_mfn " (pfn %" PRI_pfn ")\n", x, type, mfn_x(page_to_mfn(page)), get_gpfn_from_mfn(mfn_x(page_to_mfn(page)))); return -EINVAL; } else if ( unlikely(!(x & PGT_validated)) ) { /* * else, the count is non-zero, and we're grabbing the right type; * but the page hasn't been validated yet. * * The page is in one of two states (depending on PGT_partial), * and should have exactly one reference. */ ASSERT((x & (PGT_type_mask | PGT_pae_xen_l2 | PGT_count_mask)) == (type | 1)); if ( !(x & PGT_partial) ) { /* * The page has been left in the "validate locked" state * (i.e. PGT_[type] | 1) which means that a concurrent caller * of _get_page_type() is in the middle of validation. * * Spin waiting for the concurrent user to complete (partial * or fully validated), then restart our attempt to acquire a * type reference. */ do { if ( preemptible && hypercall_preempt_check() ) return -EINTR; cpu_relax(); } while ( (y = ACCESS_ONCE(page->u.inuse.type_info)) == x ); continue; } /* * The page has been left in the "partial" state * (i.e., PGT_[type] | PGT_partial | 1). * * Rather than bumping the type count, we need to try to grab the * validation lock; if we succeed, we need to validate the page, * then drop the general ref associated with the PGT_partial bit. * * We grab the validation lock by setting nx to (PGT_[type] | 1) * (i.e., non-zero type count, neither PGT_validated nor * PGT_partial set). */ nx = x & ~PGT_partial; } if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) ) break; if ( preemptible && hypercall_preempt_check() ) return -EINTR; } /* * One typeref has been taken and is now globally visible. * * The page is either in the "validate locked" state (PGT_[type] | 1) or * fully validated (PGT_[type] | PGT_validated | >0). */ /* If the page is fully validated, we're done. */ if ( likely(nx & PGT_validated) ) return 0; /* * The page is in the "validate locked" state. We have exclusive access, * and any concurrent callers are waiting in the cmpxchg() loop above. * * Exclusive access ends when PGT_validated or PGT_partial get set. */ if ( unlikely((x & PGT_count_mask) == 0) ) { struct domain *d = page_get_owner(page); if ( d && shadow_mode_enabled(d) ) shadow_prepare_page_type_change(d, page); if ( (x & PGT_type_mask) != type && /* Shadow mode: track only writable pages. */ (!shadow_mode_enabled(d) || ((x & PGT_type_mask) == PGT_writable_page)) ) { /* * On type change we check to flush stale TLB entries. It is * vital that no other CPUs are left with writeable mappings * to a frame which is intending to become pgtable/segdesc. */ cpumask_t *mask = this_cpu(scratch_cpumask); BUG_ON(in_irq()); cpumask_copy(mask, d->dirty_cpumask); /* Don't flush if the timestamp is old enough */ tlbflush_filter(mask, page->tlbflush_timestamp); if ( unlikely(!cpumask_empty(mask)) ) { perfc_incr(need_flush_tlb_flush); /* * If page was a page table make sure the flush is * performed using an IPI in order to avoid changing the * type of a page table page under the feet of * spurious_page_fault(). */ flush_mask(mask, (x & PGT_type_mask) && (x & PGT_type_mask) <= PGT_root_page_table ? 
FLUSH_TLB | FLUSH_NO_ASSIST : FLUSH_TLB); } } } if ( unlikely(((x & PGT_type_mask) == PGT_writable_page) != (type == PGT_writable_page)) ) { /* Special pages should not be accessible from devices. */ struct domain *d = page_get_owner(page); if ( d && unlikely(need_iommu_pt_sync(d)) && is_pv_domain(d) ) { mfn_t mfn = page_to_mfn(page); if ( (x & PGT_type_mask) == PGT_writable_page ) rc = iommu_legacy_unmap(d, _dfn(mfn_x(mfn)), 1ul << PAGE_ORDER_4K); else rc = iommu_legacy_map(d, _dfn(mfn_x(mfn)), mfn, 1ul << PAGE_ORDER_4K, IOMMUF_readable | IOMMUF_writable); if ( unlikely(rc) ) { _put_page_type(page, 0, NULL); goto out; } } } /* * Flush the cache if there were previously non-coherent mappings of * this page, and we're trying to use it as anything other than a * writeable page. This forces the page to be coherent before we * validate its contents for safety. */ if ( (nx & PGT_non_coherent) && type != PGT_writable_page ) { void *addr = __map_domain_page(page); cache_flush(addr, PAGE_SIZE); unmap_domain_page(addr); page->u.inuse.type_info &= ~PGT_non_coherent; } /* * No special validation needed for writable or shared pages. Page * tables and GDT/LDT need to have their contents audited. * * per validate_page(), non-atomic updates are fine here. */ if ( type == PGT_writable_page || type == PGT_shared_page ) page->u.inuse.type_info |= PGT_validated; else { if ( !(x & PGT_partial) ) { page->nr_validated_ptes = 0; page->partial_flags = 0; page->linear_pt_count = 0; } rc = validate_page(page, type, preemptible); } out: /* * Did we drop the PGT_partial bit when acquiring the typeref? If so, * drop the general reference that went along with it. * * N.B. validate_page() may have have re-set PGT_partial, not reflected in * nx, but will have taken an extra ref when doing so. */ if ( (x & PGT_partial) && !(nx & PGT_partial) ) put_page(page); return rc; } void put_page_type(struct page_info *page) { int rc = _put_page_type(page, 0, NULL); ASSERT(rc == 0); (void)rc; } int get_page_type(struct page_info *page, unsigned long type) { int rc = _get_page_type(page, type, false); if ( likely(rc == 0) ) return 1; ASSERT(rc != -EINTR && rc != -ERESTART); return 0; } int put_page_type_preemptible(struct page_info *page) { return _put_page_type(page, PTF_preemptible, NULL); } int get_page_type_preemptible(struct page_info *page, unsigned long type) { ASSERT(!current->arch.old_guest_table); return _get_page_type(page, type, true); } int put_old_guest_table(struct vcpu *v) { int rc; if ( !v->arch.old_guest_table ) return 0; rc = _put_page_type(v->arch.old_guest_table, PTF_preemptible | ( v->arch.old_guest_table_partial ? PTF_partial_set : 0 ), v->arch.old_guest_ptpg); if ( rc == -ERESTART || rc == -EINTR ) { v->arch.old_guest_table_partial = (rc == -ERESTART); return -ERESTART; } /* * It shouldn't be possible for _put_page_type() to return * anything else at the moment; but if it does happen in * production, leaking the type ref is probably the best thing to * do. Either way, drop the general ref held by old_guest_table. */ ASSERT(rc == 0); put_page(v->arch.old_guest_table); v->arch.old_guest_table = NULL; v->arch.old_guest_ptpg = NULL; /* * Safest default if someone sets old_guest_table without * explicitly setting old_guest_table_partial. 
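 *
 * "Safest" because of how _put_page_type() treats the two cases: if we
 * claim PGT_partial holds the reference and it turns out not to, the
 * put degenerates into a no-op and at worst leaks a type reference;
 * whereas wrongly claiming a full type reference could drop a
 * reference that is not actually held, or trip the BUG_ON() for pages
 * still in the middle of (de)validation.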
*/ v->arch.old_guest_table_partial = true; return rc; } int vcpu_destroy_pagetables(struct vcpu *v) { unsigned long mfn = pagetable_get_pfn(v->arch.guest_table); struct page_info *page = NULL; int rc = put_old_guest_table(v); bool put_guest_table_user = false; if ( rc ) return rc; v->arch.cr3 = 0; /* * Get the top-level guest page; either the guest_table itself, for * 64-bit, or the top-level l4 entry for 32-bit. Either way, remove * the reference to that page. */ if ( is_pv_32bit_vcpu(v) ) { l4_pgentry_t *l4tab = map_domain_page(_mfn(mfn)); mfn = l4e_get_pfn(*l4tab); l4e_write(l4tab, l4e_empty()); unmap_domain_page(l4tab); } else { v->arch.guest_table = pagetable_null(); put_guest_table_user = true; } /* Free that page if non-zero */ do { if ( mfn ) { page = mfn_to_page(_mfn(mfn)); if ( paging_mode_refcounts(v->domain) ) put_page(page); else rc = put_page_and_type_preemptible(page); mfn = 0; } if ( !rc && put_guest_table_user ) { /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */ mfn = pagetable_get_pfn(v->arch.guest_table_user); v->arch.guest_table_user = pagetable_null(); put_guest_table_user = false; } } while ( mfn ); /* * If a "put" operation was interrupted, finish things off in * put_old_guest_table() when the operation is restarted. */ switch ( rc ) { case -EINTR: case -ERESTART: v->arch.old_guest_ptpg = NULL; v->arch.old_guest_table = page; v->arch.old_guest_table_partial = (rc == -ERESTART); rc = -ERESTART; break; default: /* * Failure to 'put' a page may cause it to leak, but that's * less bad than a crash. */ ASSERT(rc == 0); break; } return rc; } #ifdef CONFIG_PV int new_guest_cr3(mfn_t mfn) { struct vcpu *curr = current; struct domain *d = curr->domain; int rc; mfn_t old_base_mfn; if ( is_pv_32bit_domain(d) ) { mfn_t gt_mfn = pagetable_get_mfn(curr->arch.guest_table); l4_pgentry_t *pl4e = map_domain_page(gt_mfn); rc = mod_l4_entry(pl4e, l4e_from_mfn(mfn, (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)), gt_mfn, 0, curr); unmap_domain_page(pl4e); switch ( rc ) { case 0: break; case -EINTR: case -ERESTART: return -ERESTART; default: gdprintk(XENLOG_WARNING, "Error while installing new compat baseptr %" PRI_mfn "\n", mfn_x(mfn)); return rc; } pv_destroy_ldt(curr); /* Unconditional TLB flush later. */ write_ptbase(curr); return 0; } rc = put_old_guest_table(curr); if ( unlikely(rc) ) return rc; old_base_mfn = pagetable_get_mfn(curr->arch.guest_table); /* * This is particularly important when getting restarted after the * previous attempt got preempted in the put-old-MFN phase. */ if ( mfn_eq(old_base_mfn, mfn) ) { write_ptbase(curr); return 0; } rc = get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, PTF_preemptible); switch ( rc ) { case 0: break; case -EINTR: case -ERESTART: return -ERESTART; default: gdprintk(XENLOG_WARNING, "Error while installing new baseptr %" PRI_mfn "\n", mfn_x(mfn)); return rc; } pv_destroy_ldt(curr); /* Unconditional TLB flush later. 
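 * No explicit flush is needed for the LDT mappings torn down here: the
 * write_ptbase() below reloads %cr3 for the new root, which flushes
 * the stale (non-global) TLB entries as a side effect.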
*/ if ( !VM_ASSIST(d, m2p_strict) ) fill_ro_mpt(mfn); curr->arch.guest_table = pagetable_from_mfn(mfn); update_cr3(curr); write_ptbase(curr); if ( likely(mfn_x(old_base_mfn) != 0) ) { struct page_info *page = mfn_to_page(old_base_mfn); switch ( rc = put_page_and_type_preemptible(page) ) { case -EINTR: case -ERESTART: curr->arch.old_guest_ptpg = NULL; curr->arch.old_guest_table = page; curr->arch.old_guest_table_partial = (rc == -ERESTART); rc = -ERESTART; break; default: BUG_ON(rc); break; } } return rc; } #endif #ifdef CONFIG_PV static int vcpumask_to_pcpumask( struct domain *d, XEN_GUEST_HANDLE_PARAM(const_void) bmap, cpumask_t *pmask) { unsigned int vcpu_id, vcpu_bias, offs; unsigned long vmask; struct vcpu *v; bool is_native = !is_pv_32bit_domain(d); cpumask_clear(pmask); for ( vmask = 0, offs = 0; ; ++offs ) { vcpu_bias = offs * (is_native ? BITS_PER_LONG : 32); if ( vcpu_bias >= d->max_vcpus ) return 0; if ( unlikely(is_native ? copy_from_guest_offset(&vmask, bmap, offs, 1) : copy_from_guest_offset((unsigned int *)&vmask, bmap, offs, 1)) ) { cpumask_clear(pmask); return -EFAULT; } while ( vmask ) { unsigned int cpu; vcpu_id = find_first_set_bit(vmask); vmask &= ~(1UL << vcpu_id); vcpu_id += vcpu_bias; if ( (vcpu_id >= d->max_vcpus) ) return 0; if ( (v = d->vcpu[vcpu_id]) == NULL ) continue; cpu = read_atomic(&v->dirty_cpu); if ( is_vcpu_dirty_cpu(cpu) ) __cpumask_set_cpu(cpu, pmask); } } } long do_mmuext_op( XEN_GUEST_HANDLE_PARAM(mmuext_op_t) uops, unsigned int count, XEN_GUEST_HANDLE_PARAM(uint) pdone, unsigned int foreigndom) { struct mmuext_op op; unsigned long type; unsigned int i, done = 0; struct vcpu *curr = current; struct domain *currd = curr->domain; struct domain *pg_owner; int rc = put_old_guest_table(curr); if ( unlikely(rc) ) { if ( likely(rc == -ERESTART) ) rc = hypercall_create_continuation( __HYPERVISOR_mmuext_op, "hihi", uops, count, pdone, foreigndom); return rc; } if ( unlikely(count == MMU_UPDATE_PREEMPTED) && likely(guest_handle_is_null(uops)) ) { /* * See the curr->arch.old_guest_table related * hypercall_create_continuation() below. */ return (int)foreigndom; } if ( unlikely(count & MMU_UPDATE_PREEMPTED) ) { count &= ~MMU_UPDATE_PREEMPTED; if ( unlikely(!guest_handle_is_null(pdone)) ) (void)copy_from_guest(&done, pdone, 1); } else perfc_incr(calls_to_mmuext_op); if ( unlikely(!guest_handle_okay(uops, count)) ) return -EFAULT; if ( (pg_owner = get_pg_owner(foreigndom)) == NULL ) return -ESRCH; if ( !is_pv_domain(pg_owner) ) { put_pg_owner(pg_owner); return -EINVAL; } rc = xsm_mmuext_op(XSM_TARGET, currd, pg_owner); if ( rc ) { put_pg_owner(pg_owner); return rc; } for ( i = 0; i < count; i++ ) { if ( curr->arch.old_guest_table || (i && hypercall_preempt_check()) ) { rc = -ERESTART; break; } if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) ) { rc = -EFAULT; break; } if ( is_hvm_domain(currd) ) { switch ( op.cmd ) { case MMUEXT_PIN_L1_TABLE: case MMUEXT_PIN_L2_TABLE: case MMUEXT_PIN_L3_TABLE: case MMUEXT_PIN_L4_TABLE: case MMUEXT_UNPIN_TABLE: break; default: rc = -EOPNOTSUPP; goto done; } } rc = 0; switch ( op.cmd ) { struct page_info *page; p2m_type_t p2mt; case MMUEXT_PIN_L1_TABLE: type = PGT_l1_page_table; goto pin_page; case MMUEXT_PIN_L2_TABLE: type = PGT_l2_page_table; goto pin_page; case MMUEXT_PIN_L3_TABLE: type = PGT_l3_page_table; goto pin_page; case MMUEXT_PIN_L4_TABLE: if ( is_pv_32bit_domain(pg_owner) ) break; type = PGT_l4_page_table; pin_page: /* Ignore pinning of invalid paging levels. 
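 *
 * MMUEXT_PIN_L1_TABLE .. MMUEXT_PIN_L4_TABLE are consecutive values,
 * so the subtraction below yields the requested level minus one;
 * levels beyond what this build supports are silently ignored.
 *
 * For reference, a PV guest pins a freshly written page table with
 * something like the following (illustrative only):
 *
 *   struct mmuext_op op = {
 *       .cmd      = MMUEXT_PIN_L2_TABLE,
 *       .arg1.mfn = pt_mfn,
 *   };
 *   HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);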
*/ if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) ) break; if ( paging_mode_refcounts(pg_owner) ) break; page = get_page_from_gfn(pg_owner, op.arg1.mfn, NULL, P2M_ALLOC); if ( unlikely(!page) ) { rc = -EINVAL; break; } rc = get_page_type_preemptible(page, type); if ( unlikely(rc) ) { if ( rc == -EINTR ) rc = -ERESTART; else if ( rc != -ERESTART ) gdprintk(XENLOG_WARNING, "Error %d while pinning mfn %" PRI_mfn "\n", rc, mfn_x(page_to_mfn(page))); if ( page != curr->arch.old_guest_table ) put_page(page); break; } rc = xsm_memory_pin_page(XSM_HOOK, currd, pg_owner, page); if ( !rc && unlikely(test_and_set_bit(_PGT_pinned, &page->u.inuse.type_info)) ) { gdprintk(XENLOG_WARNING, "mfn %" PRI_mfn " already pinned\n", mfn_x(page_to_mfn(page))); rc = -EINVAL; } if ( unlikely(rc) ) goto pin_drop; /* A page is dirtied when its pin status is set. */ paging_mark_dirty(pg_owner, page_to_mfn(page)); /* We can race domain destruction (domain_relinquish_resources). */ if ( unlikely(pg_owner != currd) ) { bool drop_ref; spin_lock(&pg_owner->page_alloc_lock); drop_ref = (pg_owner->is_dying && test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)); spin_unlock(&pg_owner->page_alloc_lock); if ( drop_ref ) { pin_drop: if ( type == PGT_l1_page_table ) put_page_and_type(page); else { curr->arch.old_guest_ptpg = NULL; curr->arch.old_guest_table = page; curr->arch.old_guest_table_partial = false; } } } break; case MMUEXT_UNPIN_TABLE: if ( paging_mode_refcounts(pg_owner) ) break; page = get_page_from_gfn(pg_owner, op.arg1.mfn, NULL, P2M_ALLOC); if ( unlikely(!page) ) { gdprintk(XENLOG_WARNING, "mfn %" PRI_mfn " bad, or bad owner d%d\n", op.arg1.mfn, pg_owner->domain_id); rc = -EINVAL; break; } if ( !test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) ) { put_page(page); gdprintk(XENLOG_WARNING, "mfn %" PRI_mfn " not pinned\n", op.arg1.mfn); rc = -EINVAL; break; } switch ( rc = put_page_and_type_preemptible(page) ) { case -EINTR: case -ERESTART: curr->arch.old_guest_ptpg = NULL; curr->arch.old_guest_table = page; /* * EINTR means we still hold the type ref; ERESTART * means PGT_partial holds the type ref */ curr->arch.old_guest_table_partial = (rc == -ERESTART); rc = 0; break; default: BUG_ON(rc); break; } put_page(page); /* A page is dirtied when its pin status is cleared. */ paging_mark_dirty(pg_owner, page_to_mfn(page)); break; case MMUEXT_NEW_BASEPTR: if ( unlikely(currd != pg_owner) ) rc = -EPERM; else if ( unlikely(paging_mode_translate(currd)) ) rc = -EINVAL; else rc = new_guest_cr3(_mfn(op.arg1.mfn)); break; case MMUEXT_NEW_USER_BASEPTR: { unsigned long old_mfn; if ( unlikely(currd != pg_owner) ) rc = -EPERM; else if ( unlikely(paging_mode_translate(currd)) ) rc = -EINVAL; if ( unlikely(rc) ) break; old_mfn = pagetable_get_pfn(curr->arch.guest_table_user); /* * This is particularly important when getting restarted after the * previous attempt got preempted in the put-old-MFN phase. 
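 *
 * That is: a previous attempt may already have installed op.arg1.mfn
 * as guest_table_user and then been preempted while dropping the old
 * table (which put_old_guest_table() has since finished).  The
 * restarted operation then finds old_mfn equal to the requested MFN;
 * nothing is left to do, and re-running the get/put sequence on the
 * very same table would only repeat work and risk being preempted
 * again.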
*/ if ( old_mfn == op.arg1.mfn ) break; if ( op.arg1.mfn != 0 ) { rc = get_page_and_type_from_mfn( _mfn(op.arg1.mfn), PGT_root_page_table, currd, PTF_preemptible); if ( unlikely(rc) ) { if ( rc == -EINTR ) rc = -ERESTART; else if ( rc != -ERESTART ) gdprintk(XENLOG_WARNING, "Error %d installing new mfn %" PRI_mfn "\n", rc, op.arg1.mfn); break; } if ( VM_ASSIST(currd, m2p_strict) ) zap_ro_mpt(_mfn(op.arg1.mfn)); } curr->arch.guest_table_user = pagetable_from_pfn(op.arg1.mfn); if ( old_mfn != 0 ) { page = mfn_to_page(_mfn(old_mfn)); switch ( rc = put_page_and_type_preemptible(page) ) { case -EINTR: case -ERESTART: curr->arch.old_guest_ptpg = NULL; curr->arch.old_guest_table = page; /* * EINTR means we still hold the type ref; * ERESTART means PGT_partial holds the ref */ curr->arch.old_guest_table_partial = (rc == -ERESTART); rc = -ERESTART; break; default: BUG_ON(rc); break; } } break; } case MMUEXT_TLB_FLUSH_LOCAL: if ( likely(currd == pg_owner) ) flush_tlb_local(); else rc = -EPERM; break; case MMUEXT_INVLPG_LOCAL: if ( unlikely(currd != pg_owner) ) rc = -EPERM; else paging_invlpg(curr, op.arg1.linear_addr); break; case MMUEXT_TLB_FLUSH_MULTI: case MMUEXT_INVLPG_MULTI: { cpumask_t *mask = this_cpu(scratch_cpumask); if ( unlikely(currd != pg_owner) ) rc = -EPERM; else if ( unlikely(vcpumask_to_pcpumask(currd, guest_handle_to_param(op.arg2.vcpumask, const_void), mask)) ) rc = -EINVAL; if ( unlikely(rc) ) break; if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI ) flush_tlb_mask(mask); else if ( __addr_ok(op.arg1.linear_addr) ) flush_tlb_one_mask(mask, op.arg1.linear_addr); break; } case MMUEXT_TLB_FLUSH_ALL: if ( likely(currd == pg_owner) ) flush_tlb_mask(currd->dirty_cpumask); else rc = -EPERM; break; case MMUEXT_INVLPG_ALL: if ( unlikely(currd != pg_owner) ) rc = -EPERM; else if ( __addr_ok(op.arg1.linear_addr) ) flush_tlb_one_mask(currd->dirty_cpumask, op.arg1.linear_addr); break; case MMUEXT_FLUSH_CACHE: if ( unlikely(currd != pg_owner) ) rc = -EPERM; else if ( unlikely(!cache_flush_permitted(currd)) ) rc = -EACCES; else wbinvd(); break; case MMUEXT_FLUSH_CACHE_GLOBAL: if ( unlikely(currd != pg_owner) ) rc = -EPERM; else if ( likely(cache_flush_permitted(currd)) ) { unsigned int cpu; cpumask_t *mask = this_cpu(scratch_cpumask); cpumask_clear(mask); for_each_online_cpu(cpu) if ( !cpumask_intersects(mask, per_cpu(cpu_sibling_mask, cpu)) ) __cpumask_set_cpu(cpu, mask); flush_mask(mask, FLUSH_CACHE); } else rc = -EINVAL; break; case MMUEXT_SET_LDT: { unsigned int ents = op.arg2.nr_ents; unsigned long ptr = ents ? op.arg1.linear_addr : (unsigned long)ZERO_BLOCK_PTR; if ( unlikely(currd != pg_owner) ) rc = -EPERM; else if ( paging_mode_external(currd) ) rc = -EINVAL; else if ( (ents > 8192) || (ents && ((ptr & (PAGE_SIZE - 1)) || !__addr_ok(ptr))) ) { gdprintk(XENLOG_WARNING, "Bad args to SET_LDT: ptr=%lx, ents=%x\n", ptr, ents); rc = -EINVAL; } else if ( (curr->arch.pv.ldt_ents != ents) || (curr->arch.pv.ldt_base != ptr) ) { if ( pv_destroy_ldt(curr) ) flush_tlb_local(); curr->arch.pv.ldt_base = ptr; curr->arch.pv.ldt_ents = ents; load_LDT(curr); } break; } case MMUEXT_CLEAR_PAGE: page = get_page_from_gfn(pg_owner, op.arg1.mfn, &p2mt, P2M_ALLOC); if ( unlikely(p2mt != p2m_ram_rw) && page ) { put_page(page); page = NULL; } if ( !page || !get_page_type(page, PGT_writable_page) ) { if ( page ) put_page(page); gdprintk(XENLOG_WARNING, "Error clearing mfn %" PRI_mfn "\n", op.arg1.mfn); rc = -EINVAL; break; } /* A page is dirtied when it's being cleared. 
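 * "Dirtied" in the log-dirty sense: paging_mark_dirty() records the
 * frame in the log-dirty bitmap so that, e.g., a live migration in
 * progress re-sends the now-zeroed contents rather than a stale copy.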
*/ paging_mark_dirty(pg_owner, page_to_mfn(page)); clear_domain_page(page_to_mfn(page)); put_page_and_type(page); break; case MMUEXT_COPY_PAGE: { struct page_info *src_page, *dst_page; src_page = get_page_from_gfn(pg_owner, op.arg2.src_mfn, &p2mt, P2M_ALLOC); if ( unlikely(p2mt != p2m_ram_rw) && src_page ) { put_page(src_page); src_page = NULL; } if ( unlikely(!src_page) ) { gdprintk(XENLOG_WARNING, "Error copying from mfn %" PRI_mfn "\n", op.arg2.src_mfn); rc = -EINVAL; break; } dst_page = get_page_from_gfn(pg_owner, op.arg1.mfn, &p2mt, P2M_ALLOC); if ( unlikely(p2mt != p2m_ram_rw) && dst_page ) { put_page(dst_page); dst_page = NULL; } rc = (dst_page && get_page_type(dst_page, PGT_writable_page)) ? 0 : -EINVAL; if ( unlikely(rc) ) { put_page(src_page); if ( dst_page ) put_page(dst_page); gdprintk(XENLOG_WARNING, "Error copying to mfn %" PRI_mfn "\n", op.arg1.mfn); break; } /* A page is dirtied when it's being copied to. */ paging_mark_dirty(pg_owner, page_to_mfn(dst_page)); copy_domain_page(page_to_mfn(dst_page), page_to_mfn(src_page)); put_page_and_type(dst_page); put_page(src_page); break; } case MMUEXT_MARK_SUPER: case MMUEXT_UNMARK_SUPER: rc = -EOPNOTSUPP; break; default: rc = -ENOSYS; break; } done: if ( unlikely(rc) ) break; guest_handle_add_offset(uops, 1); } if ( rc == -ERESTART ) rc = hypercall_create_continuation( __HYPERVISOR_mmuext_op, "hihi", uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); else if ( curr->arch.old_guest_table ) { XEN_GUEST_HANDLE_PARAM(void) null; ASSERT(rc || i == count); set_xen_guest_handle(null, NULL); /* * In order to have a way to communicate the final return value to * our continuation, we pass this in place of "foreigndom", building * on the fact that this argument isn't needed anymore. */ rc = hypercall_create_continuation( __HYPERVISOR_mmuext_op, "hihi", null, MMU_UPDATE_PREEMPTED, null, rc); } put_pg_owner(pg_owner); perfc_add(num_mmuext_ops, i); /* Add incremental work we have done to the @done output parameter. */ if ( unlikely(!guest_handle_is_null(pdone)) ) { done += i; copy_to_guest(pdone, &done, 1); } return rc; } long do_mmu_update( XEN_GUEST_HANDLE_PARAM(mmu_update_t) ureqs, unsigned int count, XEN_GUEST_HANDLE_PARAM(uint) pdone, unsigned int foreigndom) { struct mmu_update req; void *va = NULL; unsigned long gpfn, gmfn; struct page_info *page; unsigned int cmd, i = 0, done = 0, pt_dom; struct vcpu *curr = current, *v = curr; struct domain *d = v->domain, *pt_owner = d, *pg_owner; mfn_t map_mfn = INVALID_MFN, mfn; bool flush_linear_pt = false, flush_root_pt_local = false, flush_root_pt_others = false; uint32_t xsm_needed = 0; uint32_t xsm_checked = 0; int rc = put_old_guest_table(curr); if ( unlikely(rc) ) { if ( likely(rc == -ERESTART) ) rc = hypercall_create_continuation( __HYPERVISOR_mmu_update, "hihi", ureqs, count, pdone, foreigndom); return rc; } if ( unlikely(count == MMU_UPDATE_PREEMPTED) && likely(guest_handle_is_null(ureqs)) ) { /* * See the curr->arch.old_guest_table related * hypercall_create_continuation() below. */ return (int)foreigndom; } if ( unlikely(count & MMU_UPDATE_PREEMPTED) ) { count &= ~MMU_UPDATE_PREEMPTED; if ( unlikely(!guest_handle_is_null(pdone)) ) (void)copy_from_guest(&done, pdone, 1); } else perfc_incr(calls_to_mmu_update); if ( unlikely(!guest_handle_okay(ureqs, count)) ) return -EFAULT; if ( (pt_dom = foreigndom >> 16) != 0 ) { /* Pagetables belong to a foreign domain (PFD). 
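 *
 * The 32-bit foreigndom argument packs two optional domain IDs: the
 * low 16 bits name the owner of the data pages (FD), handled by
 * get_pg_owner() below, while the high 16 bits name the page table
 * owner (PFD), biased by one so that zero means "the caller itself".
 * Roughly (illustrative variable names):
 *
 *   foreigndom = ((pt_domid + 1) << 16) | (uint16_t)pg_domid;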
*/ if ( (pt_owner = rcu_lock_domain_by_id(pt_dom - 1)) == NULL ) return -ESRCH; if ( pt_owner == d ) rcu_unlock_domain(pt_owner); else if ( !pt_owner->vcpu || (v = pt_owner->vcpu[0]) == NULL ) { rc = -EINVAL; goto out; } } if ( (pg_owner = get_pg_owner((uint16_t)foreigndom)) == NULL ) { rc = -ESRCH; goto out; } for ( i = 0; i < count; i++ ) { if ( curr->arch.old_guest_table || (i && hypercall_preempt_check()) ) { rc = -ERESTART; break; } if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) ) { rc = -EFAULT; break; } cmd = req.ptr & (sizeof(l1_pgentry_t)-1); switch ( cmd ) { /* * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table. * MMU_UPDATE_PT_PRESERVE_AD: As above but also preserve (OR) * current A/D bits. */ case MMU_NORMAL_PT_UPDATE: case MMU_PT_UPDATE_PRESERVE_AD: case MMU_PT_UPDATE_NO_TRANSLATE: { p2m_type_t p2mt; rc = -EOPNOTSUPP; if ( unlikely(paging_mode_refcounts(pt_owner)) ) break; xsm_needed |= XSM_MMU_NORMAL_UPDATE; if ( get_pte_flags(req.val) & _PAGE_PRESENT ) { xsm_needed |= XSM_MMU_UPDATE_READ; if ( get_pte_flags(req.val) & _PAGE_RW ) xsm_needed |= XSM_MMU_UPDATE_WRITE; } if ( xsm_needed != xsm_checked ) { rc = xsm_mmu_update(XSM_TARGET, d, pt_owner, pg_owner, xsm_needed); if ( rc ) break; xsm_checked = xsm_needed; } rc = -EINVAL; req.ptr -= cmd; gmfn = req.ptr >> PAGE_SHIFT; page = get_page_from_gfn(pt_owner, gmfn, &p2mt, P2M_ALLOC); if ( unlikely(!page) || p2mt != p2m_ram_rw ) { if ( page ) put_page(page); if ( p2m_is_paged(p2mt) ) { p2m_mem_paging_populate(pt_owner, _gfn(gmfn)); rc = -ENOENT; } else gdprintk(XENLOG_WARNING, "Could not get page for normal update\n"); break; } mfn = page_to_mfn(page); if ( !mfn_eq(mfn, map_mfn) ) { if ( va ) unmap_domain_page(va); va = map_domain_page(mfn); map_mfn = mfn; } va = _p(((unsigned long)va & PAGE_MASK) + (req.ptr & ~PAGE_MASK)); if ( page_lock(page) ) { switch ( page->u.inuse.type_info & PGT_type_mask ) { case PGT_l1_page_table: rc = mod_l1_entry(va, l1e_from_intpte(req.val), mfn, cmd, v, pg_owner); break; case PGT_l2_page_table: if ( unlikely(pg_owner != pt_owner) ) break; rc = mod_l2_entry(va, l2e_from_intpte(req.val), mfn, cmd == MMU_PT_UPDATE_PRESERVE_AD, v); if ( !rc ) flush_linear_pt = true; break; case PGT_l3_page_table: if ( unlikely(pg_owner != pt_owner) ) break; rc = mod_l3_entry(va, l3e_from_intpte(req.val), mfn, cmd == MMU_PT_UPDATE_PRESERVE_AD, v); if ( !rc ) flush_linear_pt = true; break; case PGT_l4_page_table: if ( unlikely(pg_owner != pt_owner) ) break; rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn, cmd == MMU_PT_UPDATE_PRESERVE_AD, v); if ( !rc ) flush_linear_pt = true; if ( !rc && pt_owner->arch.pv.xpti ) { bool local_in_use = false; if ( mfn_eq(pagetable_get_mfn(curr->arch.guest_table), mfn) ) { local_in_use = true; flush_root_pt_local = true; } /* * No need to sync if all uses of the page can be * accounted to the page lock we hold, its pinned * status, and uses on this (v)CPU. 
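 *
 * Concretely, the type count seen here includes one reference taken
 * by the page_lock() above (page_lock() bumps the type count as well
 * as setting PGT_locked), one if the table is pinned, one if it is
 * this vCPU's user-mode root (guest_table_user), and one if it is the
 * root currently in use on this CPU (local_in_use).  Anything beyond
 * that sum must be an installation on some other (v)CPU, whose XPTI
 * root page table copy needs resyncing, hence flush_root_pt_others.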
*/ if ( (page->u.inuse.type_info & PGT_count_mask) > (1 + !!(page->u.inuse.type_info & PGT_pinned) + mfn_eq(pagetable_get_mfn(curr->arch.guest_table_user), mfn) + local_in_use) ) flush_root_pt_others = true; } break; case PGT_writable_page: perfc_incr(writable_mmu_updates); paging_write_guest_entry(v, va, req.val, mfn); rc = 0; break; } page_unlock(page); if ( rc == -EINTR ) rc = -ERESTART; } else if ( get_page_type(page, PGT_writable_page) ) { perfc_incr(writable_mmu_updates); paging_write_guest_entry(v, va, req.val, mfn); put_page_type(page); rc = 0; } put_page(page); } break; case MMU_MACHPHYS_UPDATE: if ( unlikely(d != pt_owner) ) { rc = -EPERM; break; } if ( unlikely(paging_mode_translate(pg_owner)) ) { rc = -EINVAL; break; } mfn = maddr_to_mfn(req.ptr); gpfn = req.val; xsm_needed |= XSM_MMU_MACHPHYS_UPDATE; if ( xsm_needed != xsm_checked ) { rc = xsm_mmu_update(XSM_TARGET, d, NULL, pg_owner, xsm_needed); if ( rc ) break; xsm_checked = xsm_needed; } page = get_page_from_mfn(mfn, pg_owner); if ( unlikely(!page) ) { gdprintk(XENLOG_WARNING, "Could not get page for mach->phys update\n"); rc = -EINVAL; break; } set_gpfn_from_mfn(mfn_x(mfn), gpfn); paging_mark_pfn_dirty(pg_owner, _pfn(gpfn)); put_page(page); break; default: rc = -ENOSYS; break; } if ( unlikely(rc) ) break; guest_handle_add_offset(ureqs, 1); } if ( rc == -ERESTART ) rc = hypercall_create_continuation( __HYPERVISOR_mmu_update, "hihi", ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); else if ( curr->arch.old_guest_table ) { XEN_GUEST_HANDLE_PARAM(void) null; ASSERT(rc || i == count); set_xen_guest_handle(null, NULL); /* * In order to have a way to communicate the final return value to * our continuation, we pass this in place of "foreigndom", building * on the fact that this argument isn't needed anymore. */ rc = hypercall_create_continuation( __HYPERVISOR_mmu_update, "hihi", null, MMU_UPDATE_PREEMPTED, null, rc); } put_pg_owner(pg_owner); if ( va ) unmap_domain_page(va); /* * Perform required TLB maintenance. * * This logic currently depends on flush_linear_pt being a superset of the * flush_root_pt_* conditions. * * pt_owner may not be current->domain. This may occur during * construction of 32bit PV guests, or debugging of PV guests. The * behaviour cannot be correct with domain unpaused. We therefore expect * pt_owner->dirty_cpumask to be empty, but it is a waste of effort to * explicitly check for, and exclude, this corner case. * * flush_linear_pt requires a FLUSH_TLB to all dirty CPUs. The flush must * be performed now to maintain correct behaviour across a multicall. * i.e. we cannot relax FLUSH_TLB to FLUSH_ROOT_PGTBL, given that the * former is a side effect of the latter, because the resync (which is in * the return-to-guest path) happens too late. * * flush_root_pt_* requires FLUSH_ROOT_PGTBL on either the local CPU * (implies pt_owner == current->domain and current->processor set in * pt_owner->dirty_cpumask), and/or all *other* dirty CPUs as there are * references we can't account for locally. */ if ( flush_linear_pt /* || flush_root_pt_local || flush_root_pt_others */ ) { unsigned int cpu = smp_processor_id(); cpumask_t *mask = pt_owner->dirty_cpumask; /* * Always handle local flushing separately (if applicable), to * separate the flush invocations appropriately for scope of the two * flush_root_pt_* variables. 
*/ if ( likely(cpumask_test_cpu(cpu, mask)) ) { mask = per_cpu(scratch_cpumask, cpu); cpumask_copy(mask, pt_owner->dirty_cpumask); __cpumask_clear_cpu(cpu, mask); flush_local(FLUSH_TLB | (flush_root_pt_local ? FLUSH_ROOT_PGTBL : 0)); } else /* Sanity check. flush_root_pt_local implies local cpu is dirty. */ ASSERT(!flush_root_pt_local); /* Flush the remote dirty CPUs. Does not include the local CPU. */ if ( !cpumask_empty(mask) ) flush_mask(mask, FLUSH_TLB | (flush_root_pt_others ? FLUSH_ROOT_PGTBL : 0)); } else /* Sanity check. flush_root_pt_* implies flush_linear_pt. */ ASSERT(!flush_root_pt_local && !flush_root_pt_others); perfc_add(num_page_updates, i); out: if ( pt_owner != d ) rcu_unlock_domain(pt_owner); /* Add incremental work we have done to the @done output parameter. */ if ( unlikely(!guest_handle_is_null(pdone)) ) { done += i; copy_to_guest(pdone, &done, 1); } return rc; } #endif /* CONFIG_PV */ /* * Steal page will attempt to remove `page` from domain `d`. Upon * return, `page` will be in a state similar to the state of a page * returned from alloc_domheap_page() with MEMF_no_owner set: * - refcount 0 * - type count cleared * - owner NULL * - page caching attributes cleaned up * - removed from the domain's page_list * * If MEMF_no_refcount is not set, the domain's tot_pages will be * adjusted. If this results in the page count falling to 0, * put_domain() will be called. * * The caller should either call free_domheap_page() to free the * page, or assign_pages() to put it back on some domain's page list. */ int steal_page( struct domain *d, struct page_info *page, unsigned int memflags) { unsigned long x, y; bool drop_dom_ref = false; const struct domain *owner; int rc; if ( paging_mode_external(d) ) return -EOPNOTSUPP; /* Grab a reference to make sure the page doesn't change under our feet */ rc = -EINVAL; if ( !(owner = page_get_owner_and_reference(page)) ) goto fail; if ( owner != d || is_special_page(page) ) goto fail_put; /* * We require there are exactly two references -- the one we just * took, and PGC_allocated. We temporarily drop both these * references so that the page becomes effectively non-"live" for * the domain. */ y = page->count_info; do { x = y; if ( (x & (PGC_count_mask|PGC_allocated)) != (2 | PGC_allocated) ) goto fail_put; y = cmpxchg(&page->count_info, x, x & ~(PGC_count_mask|PGC_allocated)); } while ( y != x ); /* * NB this is safe even if the page ends up being given back to * the domain, because the count is zero: subsequent mappings will * cause the cache attributes to be re-instated inside * get_page_from_l1e(), or the page to be added back to the IOMMU * upon the type changing to PGT_writeable, as appropriate. */ if ( (rc = cleanup_page_mappings(page)) ) { /* * Couldn't fixup Xen's mappings; put things the way we found * it and return an error */ page->count_info |= PGC_allocated | 1; goto fail; } /* * With the reference count now zero, nobody can grab references * to do anything else with the page. Return the page to a state * that it might be upon return from alloc_domheap_pages with * MEMF_no_owner set. */ spin_lock(&d->page_alloc_lock); BUG_ON(page->u.inuse.type_info & (PGT_count_mask | PGT_locked | PGT_pinned)); page->u.inuse.type_info = 0; page_set_owner(page, NULL); page_list_del(page, &d->page_list); /* Unlink from original owner. 
*/ if ( !(memflags & MEMF_no_refcount) && !domain_adjust_tot_pages(d, -1) ) drop_dom_ref = true; spin_unlock(&d->page_alloc_lock); if ( unlikely(drop_dom_ref) ) put_domain(d); return 0; fail_put: put_page(page); fail: gdprintk(XENLOG_WARNING, "Bad steal mfn %" PRI_mfn " from d%d (owner d%d) caf=%08lx taf=%" PRtype_info "\n", mfn_x(page_to_mfn(page)), d->domain_id, owner ? owner->domain_id : DOMID_INVALID, page->count_info, page->u.inuse.type_info); return rc; } #ifdef CONFIG_PV static int __do_update_va_mapping( unsigned long va, u64 val64, unsigned long flags, struct domain *pg_owner) { l1_pgentry_t val = l1e_from_intpte(val64); struct vcpu *v = current; struct domain *d = v->domain; struct page_info *gl1pg; l1_pgentry_t *pl1e; unsigned long bmap_ptr; mfn_t gl1mfn; cpumask_t *mask = NULL; int rc; perfc_incr(calls_to_update_va); rc = xsm_update_va_mapping(XSM_TARGET, d, pg_owner, val); if ( rc ) return rc; rc = -EINVAL; pl1e = map_guest_l1e(va, &gl1mfn); gl1pg = pl1e ? get_page_from_mfn(gl1mfn, d) : NULL; if ( unlikely(!gl1pg) ) goto out; if ( !page_lock(gl1pg) ) { put_page(gl1pg); goto out; } if ( (gl1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table ) { page_unlock(gl1pg); put_page(gl1pg); goto out; } rc = mod_l1_entry(pl1e, val, gl1mfn, MMU_NORMAL_PT_UPDATE, v, pg_owner); page_unlock(gl1pg); put_page(gl1pg); out: if ( pl1e ) unmap_domain_page(pl1e); /* * Any error at this point means that we haven't change the L1e. Skip the * flush, as it won't do anything useful. Furthermore, va is guest * controlled and not necesserily audited by this point. */ if ( rc ) return rc; switch ( flags & UVMF_FLUSHTYPE_MASK ) { case UVMF_TLB_FLUSH: switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) ) { case UVMF_LOCAL: flush_tlb_local(); break; case UVMF_ALL: mask = d->dirty_cpumask; break; default: mask = this_cpu(scratch_cpumask); rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr, void), mask); break; } if ( mask ) flush_tlb_mask(mask); break; case UVMF_INVLPG: switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) ) { case UVMF_LOCAL: paging_invlpg(v, va); break; case UVMF_ALL: mask = d->dirty_cpumask; break; default: mask = this_cpu(scratch_cpumask); rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr, void), mask); break; } if ( mask ) flush_tlb_one_mask(mask, va); break; } return rc; } long do_update_va_mapping( unsigned long va, u64 val64, unsigned long flags) { int rc = __do_update_va_mapping(va, val64, flags, current->domain); if ( rc == -ERESTART ) rc = hypercall_create_continuation( __HYPERVISOR_update_va_mapping, "lll", va, val64, flags); return rc; } long do_update_va_mapping_otherdomain( unsigned long va, u64 val64, unsigned long flags, domid_t domid) { struct domain *pg_owner; int rc; if ( (pg_owner = get_pg_owner(domid)) == NULL ) return -ESRCH; rc = __do_update_va_mapping(va, val64, flags, pg_owner); put_pg_owner(pg_owner); if ( rc == -ERESTART ) rc = hypercall_create_continuation( __HYPERVISOR_update_va_mapping_otherdomain, "llli", va, val64, flags, domid); return rc; } #endif /* CONFIG_PV */ #ifdef CONFIG_PV32 int compat_update_va_mapping( unsigned int va, uint32_t lo, uint32_t hi, unsigned int flags) { int rc = __do_update_va_mapping(va, ((uint64_t)hi << 32) | lo, flags, current->domain); if ( rc == -ERESTART ) rc = hypercall_create_continuation( __HYPERVISOR_update_va_mapping, "iiii", va, lo, hi, flags); return rc; } int compat_update_va_mapping_otherdomain( unsigned int va, uint32_t lo, uint32_t hi, unsigned int flags, domid_t domid) { struct domain 
*pg_owner; int rc; if ( (pg_owner = get_pg_owner(domid)) == NULL ) return -ESRCH; rc = __do_update_va_mapping(va, ((uint64_t)hi << 32) | lo, flags, pg_owner); put_pg_owner(pg_owner); if ( rc == -ERESTART ) rc = hypercall_create_continuation( __HYPERVISOR_update_va_mapping_otherdomain, "iiiii", va, lo, hi, flags, domid); return rc; } #endif /* CONFIG_PV32 */ typedef struct e820entry e820entry_t; DEFINE_XEN_GUEST_HANDLE(e820entry_t); struct memory_map_context { unsigned int n; unsigned long s; struct xen_memory_map map; }; static int _handle_iomem_range(unsigned long s, unsigned long e, struct memory_map_context *ctxt) { if ( s > ctxt->s && !(s >> (paddr_bits - PAGE_SHIFT)) ) { if ( !guest_handle_is_null(ctxt->map.buffer) ) { e820entry_t ent; if ( ctxt->n + 1 >= ctxt->map.nr_entries ) return -EINVAL; ent.addr = (uint64_t)ctxt->s << PAGE_SHIFT; ent.size = (uint64_t)(s - ctxt->s) << PAGE_SHIFT; ent.type = E820_RESERVED; if ( __copy_to_guest_offset(ctxt->map.buffer, ctxt->n, &ent, 1) ) return -EFAULT; } ctxt->n++; } ctxt->s = e + 1; return 0; } static int cf_check handle_iomem_range( unsigned long s, unsigned long e, void *p) { int err = 0; do { unsigned long low = -1UL; unsigned int i; for ( i = 0; i < nr_ioapics; ++i ) { unsigned long mfn = paddr_to_pfn(mp_ioapics[i].mpc_apicaddr); if ( mfn >= s && mfn <= e && mfn < low ) low = mfn; } if ( !(low + 1) ) break; if ( s < low ) err = _handle_iomem_range(s, low - 1, p); s = low + 1; } while ( !err ); return err || s > e ? err : _handle_iomem_range(s, e, p); } long arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg) { int rc; switch ( cmd ) { case XENMEM_set_memory_map: { struct xen_foreign_memory_map fmap; struct domain *d; struct e820entry *e820; if ( copy_from_guest(&fmap, arg, 1) ) return -EFAULT; if ( fmap.map.nr_entries > E820MAX ) return -EINVAL; d = rcu_lock_domain_by_any_id(fmap.domid); if ( d == NULL ) return -ESRCH; rc = xsm_domain_memory_map(XSM_TARGET, d); if ( rc ) { rcu_unlock_domain(d); return rc; } e820 = xmalloc_array(e820entry_t, fmap.map.nr_entries); if ( e820 == NULL ) { rcu_unlock_domain(d); return -ENOMEM; } if ( copy_from_guest(e820, fmap.map.buffer, fmap.map.nr_entries) ) { xfree(e820); rcu_unlock_domain(d); return -EFAULT; } spin_lock(&d->arch.e820_lock); xfree(d->arch.e820); d->arch.e820 = e820; d->arch.nr_e820 = fmap.map.nr_entries; spin_unlock(&d->arch.e820_lock); rcu_unlock_domain(d); return rc; } case XENMEM_memory_map: { struct xen_memory_map map; struct domain *d = current->domain; if ( copy_from_guest(&map, arg, 1) ) return -EFAULT; spin_lock(&d->arch.e820_lock); /* Backwards compatibility. 
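 * If no pseudo-physical e820 map has been registered for the domain
 * (e.g. via XENMEM_set_memory_map above), fail with -ENOSYS just as a
 * hypervisor without this data would have done, so that older guests
 * fall back to constructing their own memory map.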
*/ if ( (d->arch.nr_e820 == 0) || (d->arch.e820 == NULL) ) { spin_unlock(&d->arch.e820_lock); return -ENOSYS; } map.nr_entries = min(map.nr_entries, d->arch.nr_e820); if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) || __copy_to_guest(arg, &map, 1) ) { spin_unlock(&d->arch.e820_lock); return -EFAULT; } spin_unlock(&d->arch.e820_lock); return 0; } case XENMEM_machine_memory_map: { struct memory_map_context ctxt; XEN_GUEST_HANDLE_PARAM(e820entry_t) buffer; unsigned int i; bool store; rc = xsm_machine_memory_map(XSM_PRIV); if ( rc ) return rc; if ( copy_from_guest(&ctxt.map, arg, 1) ) return -EFAULT; store = !guest_handle_is_null(ctxt.map.buffer); if ( store && ctxt.map.nr_entries < e820.nr_map + 1 ) return -EINVAL; buffer = guest_handle_cast(ctxt.map.buffer, e820entry_t); if ( store && !guest_handle_okay(buffer, ctxt.map.nr_entries) ) return -EFAULT; for ( i = 0, ctxt.n = 0, ctxt.s = 0; i < e820.nr_map; ++i, ++ctxt.n ) { unsigned long s = PFN_DOWN(e820.map[i].addr); if ( s > ctxt.s ) { rc = rangeset_report_ranges(current->domain->iomem_caps, ctxt.s, s - 1, handle_iomem_range, &ctxt); if ( !rc ) rc = handle_iomem_range(s, s, &ctxt); if ( rc ) return rc; } if ( store ) { if ( ctxt.map.nr_entries <= ctxt.n + (e820.nr_map - i) ) return -EINVAL; if ( __copy_to_guest_offset(buffer, ctxt.n, e820.map + i, 1) ) return -EFAULT; } ctxt.s = PFN_UP(e820.map[i].addr + e820.map[i].size); } if ( ctxt.s ) { rc = rangeset_report_ranges(current->domain->iomem_caps, ctxt.s, ~0UL, handle_iomem_range, &ctxt); if ( !rc && ctxt.s ) rc = handle_iomem_range(~0UL, ~0UL, &ctxt); if ( rc ) return rc; } ctxt.map.nr_entries = ctxt.n; if ( __copy_to_guest(arg, &ctxt.map, 1) ) return -EFAULT; return 0; } case XENMEM_machphys_mapping: { struct xen_machphys_mapping mapping = { .v_start = MACH2PHYS_VIRT_START, .v_end = MACH2PHYS_VIRT_END, .max_mfn = MACH2PHYS_NR_ENTRIES - 1 }; if ( !mem_hotplug && is_hardware_domain(current->domain) ) mapping.max_mfn = max_page - 1; if ( copy_to_guest(arg, &mapping, 1) ) return -EFAULT; return 0; } #ifdef CONFIG_HVM case XENMEM_set_pod_target: case XENMEM_get_pod_target: { xen_pod_target_t target; struct domain *d; if ( copy_from_guest(&target, arg, 1) ) return -EFAULT; d = rcu_lock_domain_by_any_id(target.domid); if ( d == NULL ) return -ESRCH; if ( !is_hvm_domain(d) ) rc = -EINVAL; else if ( cmd == XENMEM_set_pod_target ) { rc = xsm_set_pod_target(XSM_PRIV, d); if ( rc ) ASSERT(rc < 0); else if ( target.target_pages > d->max_pages ) rc = -EINVAL; else rc = p2m_pod_set_mem_target(d, target.target_pages); } else rc = xsm_get_pod_target(XSM_PRIV, d); if ( rc == -ERESTART ) { rc = hypercall_create_continuation( __HYPERVISOR_memory_op, "lh", cmd, arg); } else if ( rc >= 0 ) { p2m_pod_get_mem_target(d, &target); if ( __copy_to_guest(arg, &target, 1) ) rc = -EFAULT; } rcu_unlock_domain(d); return rc; } #endif default: return subarch_memory_op(cmd, arg); } return 0; } int cf_check mmio_ro_emulated_write( enum x86_segment seg, unsigned long offset, void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { struct mmio_ro_emulate_ctxt *mmio_ro_ctxt = ctxt->data; /* Only allow naturally-aligned stores at the original %cr2 address. 
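 *
 * ((bytes | offset) & (bytes - 1)) is a compact test for a naturally
 * aligned, power-of-two sized access.  Illustrative values:
 *
 *   bytes = 4, offset = 0x...8  ->  (4 | 0x8) & 3 == 0   allowed
 *   bytes = 4, offset = 0x...6  ->  (4 | 0x6) & 3 == 2   rejected
 *   bytes = 3, offset = 0x...8  ->  (3 | 0x8) & 2 == 2   rejected
 *
 * The explicit !bytes test covers the zero-length case, which the mask
 * trick alone would not reliably reject.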
*/ if ( ((bytes | offset) & (bytes - 1)) || !bytes || offset != mmio_ro_ctxt->cr2 ) { gdprintk(XENLOG_WARNING, "bad access (cr2=%lx, addr=%lx, bytes=%u)\n", mmio_ro_ctxt->cr2, offset, bytes); return X86EMUL_UNHANDLEABLE; } return X86EMUL_OKAY; } int cf_check mmcfg_intercept_write( enum x86_segment seg, unsigned long offset, void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { struct mmio_ro_emulate_ctxt *mmio_ctxt = ctxt->data; /* * Only allow naturally-aligned stores no wider than 4 bytes to the * original %cr2 address. */ if ( ((bytes | offset) & (bytes - 1)) || bytes > 4 || !bytes || offset != mmio_ctxt->cr2 ) { gdprintk(XENLOG_WARNING, "bad write (cr2=%lx, addr=%lx, bytes=%u)\n", mmio_ctxt->cr2, offset, bytes); return X86EMUL_UNHANDLEABLE; } offset &= 0xfff; if ( pci_conf_write_intercept(mmio_ctxt->seg, mmio_ctxt->bdf, offset, bytes, p_data) >= 0 ) pci_mmcfg_write(mmio_ctxt->seg, PCI_BUS(mmio_ctxt->bdf), PCI_DEVFN(mmio_ctxt->bdf), offset, bytes, *(uint32_t *)p_data); return X86EMUL_OKAY; } /* * For these PTE APIs, the caller must follow the alloc-map-unmap-free * lifecycle, which means explicitly mapping the PTE pages before accessing * them. The caller must check whether the allocation has succeeded, and only * pass valid MFNs to map_domain_page(). */ mfn_t alloc_xen_pagetable(void) { if ( system_state != SYS_STATE_early_boot ) { const struct page_info *pg = alloc_domheap_page(NULL, 0); BUG_ON(!hardware_domain && !pg); return pg ? page_to_mfn(pg) : INVALID_MFN; } return alloc_boot_pages(1, 1); } /* mfn can be INVALID_MFN */ void free_xen_pagetable(mfn_t mfn) { if ( system_state != SYS_STATE_early_boot && !mfn_eq(mfn, INVALID_MFN) ) free_domheap_page(mfn_to_page(mfn)); } void *alloc_mapped_pagetable(mfn_t *pmfn) { mfn_t mfn = alloc_xen_pagetable(); void *ret; if ( mfn_eq(mfn, INVALID_MFN) ) return NULL; if ( pmfn ) *pmfn = mfn; ret = map_domain_page(mfn); clear_page(ret); return ret; } static DEFINE_SPINLOCK(map_pgdir_lock); /* * For virt_to_xen_lXe() functions, they take a linear address and return a * pointer to Xen's LX entry. Caller needs to unmap the pointer. 
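 *
 * Typical usage (illustrative only; l1e_from_mfn()/PAGE_HYPERVISOR and
 * the error handling are the caller's choice):
 *
 *   l1_pgentry_t *pl1e = virt_to_xen_l1e(va);
 *
 *   if ( !pl1e )
 *       return -ENOMEM;              <- intermediate table alloc failed
 *   l1e_write(pl1e, l1e_from_mfn(mfn, PAGE_HYPERVISOR));
 *   unmap_domain_page(pl1e);         <- or UNMAP_DOMAIN_PAGE(pl1e)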
*/ static l3_pgentry_t *virt_to_xen_l3e(unsigned long v) { l4_pgentry_t *pl4e; pl4e = &idle_pg_table[l4_table_offset(v)]; if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) ) { bool locking = system_state > SYS_STATE_boot; mfn_t l3mfn; l3_pgentry_t *l3t = alloc_mapped_pagetable(&l3mfn); if ( !l3t ) return NULL; UNMAP_DOMAIN_PAGE(l3t); if ( locking ) spin_lock(&map_pgdir_lock); if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) ) { l4_pgentry_t l4e = l4e_from_mfn(l3mfn, __PAGE_HYPERVISOR); l4e_write(pl4e, l4e); efi_update_l4_pgtable(l4_table_offset(v), l4e); l3mfn = INVALID_MFN; } if ( locking ) spin_unlock(&map_pgdir_lock); free_xen_pagetable(l3mfn); } return map_l3t_from_l4e(*pl4e) + l3_table_offset(v); } static l2_pgentry_t *virt_to_xen_l2e(unsigned long v) { l3_pgentry_t *pl3e, l3e; pl3e = virt_to_xen_l3e(v); if ( !pl3e ) return NULL; if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ) { bool locking = system_state > SYS_STATE_boot; mfn_t l2mfn; l2_pgentry_t *l2t = alloc_mapped_pagetable(&l2mfn); if ( !l2t ) { unmap_domain_page(pl3e); return NULL; } UNMAP_DOMAIN_PAGE(l2t); if ( locking ) spin_lock(&map_pgdir_lock); if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ) { l3e_write(pl3e, l3e_from_mfn(l2mfn, __PAGE_HYPERVISOR)); l2mfn = INVALID_MFN; } if ( locking ) spin_unlock(&map_pgdir_lock); free_xen_pagetable(l2mfn); } BUG_ON(l3e_get_flags(*pl3e) & _PAGE_PSE); l3e = *pl3e; unmap_domain_page(pl3e); return map_l2t_from_l3e(l3e) + l2_table_offset(v); } l1_pgentry_t *virt_to_xen_l1e(unsigned long v) { l2_pgentry_t *pl2e, l2e; pl2e = virt_to_xen_l2e(v); if ( !pl2e ) return NULL; if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ) { bool locking = system_state > SYS_STATE_boot; mfn_t l1mfn; l1_pgentry_t *l1t = alloc_mapped_pagetable(&l1mfn); if ( !l1t ) { unmap_domain_page(pl2e); return NULL; } UNMAP_DOMAIN_PAGE(l1t); if ( locking ) spin_lock(&map_pgdir_lock); if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ) { l2e_write(pl2e, l2e_from_mfn(l1mfn, __PAGE_HYPERVISOR)); l1mfn = INVALID_MFN; } if ( locking ) spin_unlock(&map_pgdir_lock); free_xen_pagetable(l1mfn); } BUG_ON(l2e_get_flags(*pl2e) & _PAGE_PSE); l2e = *pl2e; unmap_domain_page(pl2e); return map_l1t_from_l2e(l2e) + l1_table_offset(v); } /* Convert to from superpage-mapping flags for map_pages_to_xen(). */ #define l1f_to_lNf(f) (((f) & _PAGE_PRESENT) ? ((f) | _PAGE_PSE) : (f)) #define lNf_to_l1f(f) (((f) & _PAGE_PRESENT) ? ((f) & ~_PAGE_PSE) : (f)) /* flush_area_all() can be used prior to any other CPU being online. */ #define flush_area(v, f) flush_area_all((const void *)(v), f) #define L3T_INIT(page) (page) = ZERO_BLOCK_PTR #define L3T_LOCK(page) \ do { \ if ( locking ) \ l3t_lock(page); \ } while ( false ) #define L3T_UNLOCK(page) \ do { \ if ( locking && (page) != ZERO_BLOCK_PTR ) \ { \ l3t_unlock(page); \ (page) = ZERO_BLOCK_PTR; \ } \ } while ( false ) /* Translate mapped Xen address to MFN. 
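 *
 * Illustrative usage (not from the original code): for an address that
 * was previously mapped with map_pages_to_xen(), e.g.
 *
 *     mfn_t mfn = xen_map_to_mfn((unsigned long)ptr);
 *
 * the walk below handles 1G and 2M superpage mappings as well as 4k
 * ones, and yields INVALID_MFN (after ASSERT_UNREACHABLE()) if any
 * level turns out not to be present -- callers are expected to pass
 * only currently mapped addresses.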
*/ mfn_t xen_map_to_mfn(unsigned long va) { #define CHECK_MAPPED(cond) \ do { \ if ( !(cond) ) \ { \ ASSERT_UNREACHABLE(); \ ret = INVALID_MFN; \ goto out; \ } \ } while ( false ) bool locking = system_state > SYS_STATE_boot; unsigned int l2_offset = l2_table_offset(va); unsigned int l1_offset = l1_table_offset(va); const l3_pgentry_t *pl3e = virt_to_xen_l3e(va); const l2_pgentry_t *pl2e = NULL; const l1_pgentry_t *pl1e = NULL; struct page_info *l3page; mfn_t ret; L3T_INIT(l3page); CHECK_MAPPED(pl3e); l3page = mfn_to_page(domain_page_map_to_mfn(pl3e)); L3T_LOCK(l3page); CHECK_MAPPED(l3e_get_flags(*pl3e) & _PAGE_PRESENT); if ( l3e_get_flags(*pl3e) & _PAGE_PSE ) { ret = mfn_add(l3e_get_mfn(*pl3e), (l2_offset << PAGETABLE_ORDER) + l1_offset); goto out; } pl2e = map_l2t_from_l3e(*pl3e) + l2_offset; CHECK_MAPPED(l2e_get_flags(*pl2e) & _PAGE_PRESENT); if ( l2e_get_flags(*pl2e) & _PAGE_PSE ) { ret = mfn_add(l2e_get_mfn(*pl2e), l1_offset); goto out; } pl1e = map_l1t_from_l2e(*pl2e) + l1_offset; CHECK_MAPPED(l1e_get_flags(*pl1e) & _PAGE_PRESENT); ret = l1e_get_mfn(*pl1e); #undef CHECK_MAPPED out: L3T_UNLOCK(l3page); unmap_domain_page(pl1e); unmap_domain_page(pl2e); unmap_domain_page(pl3e); return ret; } int map_pages_to_xen( unsigned long virt, mfn_t mfn, unsigned long nr_mfns, unsigned int flags) { bool locking = system_state > SYS_STATE_boot; l3_pgentry_t *pl3e = NULL, ol3e; l2_pgentry_t *pl2e = NULL, ol2e; l1_pgentry_t *pl1e, ol1e; unsigned int i; int rc = -ENOMEM; struct page_info *current_l3page; #define flush_flags(oldf) do { \ unsigned int o_ = (oldf); \ if ( (o_) & _PAGE_GLOBAL ) \ flush_flags |= FLUSH_TLB_GLOBAL; \ if ( (flags & _PAGE_PRESENT) && \ (((o_) ^ flags) & PAGE_CACHE_ATTRS) ) \ { \ flush_flags |= FLUSH_CACHE; \ if ( virt >= DIRECTMAP_VIRT_START && \ virt < HYPERVISOR_VIRT_END ) \ flush_flags |= FLUSH_VA_VALID; \ } \ } while (0) L3T_INIT(current_l3page); while ( nr_mfns != 0 ) { /* Clean up the previous iteration. */ L3T_UNLOCK(current_l3page); UNMAP_DOMAIN_PAGE(pl3e); UNMAP_DOMAIN_PAGE(pl2e); pl3e = virt_to_xen_l3e(virt); if ( !pl3e ) goto out; current_l3page = mfn_to_page(domain_page_map_to_mfn(pl3e)); L3T_LOCK(current_l3page); ol3e = *pl3e; if ( cpu_has_page1gb && !(((virt >> PAGE_SHIFT) | mfn_x(mfn)) & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)) && nr_mfns >= (1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) && !(flags & (_PAGE_PAT | MAP_SMALL_PAGES)) ) { /* 1GB-page mapping. 
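 *
 * Worked example of how this path is reached (restating the condition
 * above with L3_PAGETABLE_SHIFT - PAGE_SHIFT = 18): the CPU supports 1G
 * pages, neither _PAGE_PAT nor MAP_SMALL_PAGES was requested, at least
 * 1UL << 18 = 262144 frames remain, and
 *
 *     !(((virt >> PAGE_SHIFT) | mfn_x(mfn)) & ((1UL << 18) - 1))
 *
 * i.e. both virt and the MFN are 1G-aligned.  E.g. mfn 0x40000 (the
 * frame at physical 1GiB) with a 1G-aligned virt qualifies; mfn 0x40001
 * does not and falls through to the 2M/4k handling below.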
*/ l3e_write_atomic(pl3e, l3e_from_mfn(mfn, l1f_to_lNf(flags))); if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) ) { unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER); if ( l3e_get_flags(ol3e) & _PAGE_PSE ) { flush_flags(lNf_to_l1f(l3e_get_flags(ol3e))); flush_area(virt, flush_flags); } else { l2_pgentry_t *l2t = map_l2t_from_l3e(ol3e); for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) { ol2e = l2t[i]; if ( !(l2e_get_flags(ol2e) & _PAGE_PRESENT) ) continue; if ( l2e_get_flags(ol2e) & _PAGE_PSE ) flush_flags(lNf_to_l1f(l2e_get_flags(ol2e))); else { unsigned int j; const l1_pgentry_t *l1t = map_l1t_from_l2e(ol2e); for ( j = 0; j < L1_PAGETABLE_ENTRIES; j++ ) flush_flags(l1e_get_flags(l1t[j])); unmap_domain_page(l1t); } } flush_area(virt, flush_flags); for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) { ol2e = l2t[i]; if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) && !(l2e_get_flags(ol2e) & _PAGE_PSE) ) free_xen_pagetable(l2e_get_mfn(ol2e)); } unmap_domain_page(l2t); free_xen_pagetable(l3e_get_mfn(ol3e)); } } virt += 1UL << L3_PAGETABLE_SHIFT; if ( !mfn_eq(mfn, INVALID_MFN) ) mfn = mfn_add(mfn, 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)); nr_mfns -= 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT); continue; } if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) && (l3e_get_flags(ol3e) & _PAGE_PSE) ) { unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER); l2_pgentry_t *l2t; mfn_t l2mfn; /* Skip this PTE if there is no change. */ if ( ((l3e_get_pfn(ol3e) & ~(L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES - 1)) + (l2_table_offset(virt) << PAGETABLE_ORDER) + l1_table_offset(virt) == mfn_x(mfn)) && ((lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) & ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0 ) { /* We can skip to end of L3 superpage if we got a match. */ i = (1u << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - (mfn_x(mfn) & ((1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)); if ( i > nr_mfns ) i = nr_mfns; virt += i << PAGE_SHIFT; if ( !mfn_eq(mfn, INVALID_MFN) ) mfn = mfn_add(mfn, i); nr_mfns -= i; continue; } l2mfn = alloc_xen_pagetable(); if ( mfn_eq(l2mfn, INVALID_MFN) ) goto out; l2t = map_domain_page(l2mfn); for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) l2e_write(l2t + i, l2e_from_pfn(l3e_get_pfn(ol3e) + (i << PAGETABLE_ORDER), l3e_get_flags(ol3e))); UNMAP_DOMAIN_PAGE(l2t); if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL ) flush_flags |= FLUSH_TLB_GLOBAL; if ( locking ) spin_lock(&map_pgdir_lock); if ( (l3e_get_flags(*pl3e) & _PAGE_PRESENT) && (l3e_get_flags(*pl3e) & _PAGE_PSE) ) { l3e_write_atomic(pl3e, l3e_from_mfn(l2mfn, __PAGE_HYPERVISOR)); l2mfn = INVALID_MFN; } if ( locking ) spin_unlock(&map_pgdir_lock); flush_area(virt, flush_flags); free_xen_pagetable(l2mfn); } pl2e = virt_to_xen_l2e(virt); if ( !pl2e ) goto out; if ( ((((virt >> PAGE_SHIFT) | mfn_x(mfn)) & ((1u << PAGETABLE_ORDER) - 1)) == 0) && (nr_mfns >= (1u << PAGETABLE_ORDER)) && !(flags & (_PAGE_PAT|MAP_SMALL_PAGES)) ) { /* Super-page mapping. 
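 *
 * Analogous worked example for this 2M path (PAGETABLE_ORDER = 9): it
 * requires nr_mfns >= 512 and both virt >> PAGE_SHIFT and mfn_x(mfn) to
 * have their low 9 bits clear.  Mapping 512 frames starting at mfn
 * 0x1200 to a 2M-aligned virt takes this branch; starting at mfn 0x1201
 * instead drops into the 4k loop further down.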
*/ ol2e = *pl2e; l2e_write_atomic(pl2e, l2e_from_mfn(mfn, l1f_to_lNf(flags))); if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) ) { unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER); if ( l2e_get_flags(ol2e) & _PAGE_PSE ) { flush_flags(lNf_to_l1f(l2e_get_flags(ol2e))); flush_area(virt, flush_flags); } else { l1_pgentry_t *l1t = map_l1t_from_l2e(ol2e); for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) flush_flags(l1e_get_flags(l1t[i])); flush_area(virt, flush_flags); unmap_domain_page(l1t); free_xen_pagetable(l2e_get_mfn(ol2e)); } } virt += 1UL << L2_PAGETABLE_SHIFT; if ( !mfn_eq(mfn, INVALID_MFN) ) mfn = mfn_add(mfn, 1UL << PAGETABLE_ORDER); nr_mfns -= 1UL << PAGETABLE_ORDER; } else { pl1e = NULL; /* Normal page mapping. */ if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ) { pl1e = virt_to_xen_l1e(virt); if ( pl1e == NULL ) goto out; } else if ( l2e_get_flags(*pl2e) & _PAGE_PSE ) { unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER); l1_pgentry_t *l1t; mfn_t l1mfn; /* Skip this PTE if there is no change. */ if ( (((l2e_get_pfn(*pl2e) & ~(L1_PAGETABLE_ENTRIES - 1)) + l1_table_offset(virt)) == mfn_x(mfn)) && (((lNf_to_l1f(l2e_get_flags(*pl2e)) ^ flags) & ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0) ) { /* We can skip to end of L2 superpage if we got a match. */ i = (1u << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) - (mfn_x(mfn) & ((1u << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)); if ( i > nr_mfns ) i = nr_mfns; virt += i << L1_PAGETABLE_SHIFT; if ( !mfn_eq(mfn, INVALID_MFN) ) mfn = mfn_add(mfn, i); nr_mfns -= i; goto check_l3; } l1mfn = alloc_xen_pagetable(); if ( mfn_eq(l1mfn, INVALID_MFN) ) goto out; l1t = map_domain_page(l1mfn); for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) l1e_write(&l1t[i], l1e_from_pfn(l2e_get_pfn(*pl2e) + i, lNf_to_l1f(l2e_get_flags(*pl2e)))); UNMAP_DOMAIN_PAGE(l1t); if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL ) flush_flags |= FLUSH_TLB_GLOBAL; if ( locking ) spin_lock(&map_pgdir_lock); if ( (l2e_get_flags(*pl2e) & _PAGE_PRESENT) && (l2e_get_flags(*pl2e) & _PAGE_PSE) ) { l2e_write_atomic(pl2e, l2e_from_mfn(l1mfn, __PAGE_HYPERVISOR)); l1mfn = INVALID_MFN; } if ( locking ) spin_unlock(&map_pgdir_lock); flush_area(virt, flush_flags); free_xen_pagetable(l1mfn); } if ( !pl1e ) pl1e = map_l1t_from_l2e(*pl2e) + l1_table_offset(virt); ol1e = *pl1e; l1e_write_atomic(pl1e, l1e_from_mfn(mfn, flags)); UNMAP_DOMAIN_PAGE(pl1e); if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) ) { unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(0); flush_flags(l1e_get_flags(ol1e)); flush_area(virt, flush_flags); } virt += 1UL << L1_PAGETABLE_SHIFT; if ( !mfn_eq(mfn, INVALID_MFN) ) mfn = mfn_add(mfn, 1UL); nr_mfns -= 1UL; if ( (flags == PAGE_HYPERVISOR) && ((nr_mfns == 0) || ((((virt >> PAGE_SHIFT) | mfn_x(mfn)) & ((1u << PAGETABLE_ORDER) - 1)) == 0)) ) { unsigned long base_mfn; const l1_pgentry_t *l1t; if ( locking ) spin_lock(&map_pgdir_lock); ol2e = *pl2e; /* * L2E may be already cleared, or set to a superpage, by * concurrent paging structure modifications on other CPUs. 
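 *
 * Illustrative outcome of the scan below: if the 512 L1 entries turn
 * out to describe one contiguous, identically-flagged 2M range (i.e.
 * l1e_get_pfn(l1t[i]) == base_mfn + i with flags equal to the requested
 * ones for every i), the L2E is rewritten as a superpage, the old range
 * is flushed, and the now-redundant L1 table is freed.  Any mismatch
 * leaves the individual 4k mappings as they are.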
*/ if ( !(l2e_get_flags(ol2e) & _PAGE_PRESENT) ) { if ( locking ) spin_unlock(&map_pgdir_lock); continue; } if ( l2e_get_flags(ol2e) & _PAGE_PSE ) { if ( locking ) spin_unlock(&map_pgdir_lock); goto check_l3; } l1t = map_l1t_from_l2e(ol2e); base_mfn = l1e_get_pfn(l1t[0]) & ~(L1_PAGETABLE_ENTRIES - 1); for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) if ( (l1e_get_pfn(l1t[i]) != (base_mfn + i)) || (l1e_get_flags(l1t[i]) != flags) ) break; UNMAP_DOMAIN_PAGE(l1t); if ( i == L1_PAGETABLE_ENTRIES ) { l2e_write_atomic(pl2e, l2e_from_pfn(base_mfn, l1f_to_lNf(flags))); if ( locking ) spin_unlock(&map_pgdir_lock); flush_area(virt - PAGE_SIZE, FLUSH_TLB_GLOBAL | FLUSH_ORDER(PAGETABLE_ORDER)); free_xen_pagetable(l2e_get_mfn(ol2e)); } else if ( locking ) spin_unlock(&map_pgdir_lock); } } check_l3: if ( cpu_has_page1gb && (flags == PAGE_HYPERVISOR) && ((nr_mfns == 0) || !(((virt >> PAGE_SHIFT) | mfn_x(mfn)) & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))) ) { unsigned long base_mfn; const l2_pgentry_t *l2t; if ( locking ) spin_lock(&map_pgdir_lock); ol3e = *pl3e; /* * L3E may be already cleared, or set to a superpage, by * concurrent paging structure modifications on other CPUs. */ if ( !(l3e_get_flags(ol3e) & _PAGE_PRESENT) || (l3e_get_flags(ol3e) & _PAGE_PSE) ) { if ( locking ) spin_unlock(&map_pgdir_lock); continue; } l2t = map_l2t_from_l3e(ol3e); base_mfn = l2e_get_pfn(l2t[0]) & ~(L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES - 1); for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) if ( (l2e_get_pfn(l2t[i]) != (base_mfn + (i << PAGETABLE_ORDER))) || (l2e_get_flags(l2t[i]) != l1f_to_lNf(flags)) ) break; UNMAP_DOMAIN_PAGE(l2t); if ( i == L2_PAGETABLE_ENTRIES ) { l3e_write_atomic(pl3e, l3e_from_pfn(base_mfn, l1f_to_lNf(flags))); if ( locking ) spin_unlock(&map_pgdir_lock); flush_area(virt - PAGE_SIZE, FLUSH_TLB_GLOBAL | FLUSH_ORDER(2*PAGETABLE_ORDER)); free_xen_pagetable(l3e_get_mfn(ol3e)); } else if ( locking ) spin_unlock(&map_pgdir_lock); } } #undef flush_flags rc = 0; out: L3T_UNLOCK(current_l3page); unmap_domain_page(pl3e); unmap_domain_page(pl2e); return rc; } int populate_pt_range(unsigned long virt, unsigned long nr_mfns) { return map_pages_to_xen(virt, INVALID_MFN, nr_mfns, MAP_SMALL_PAGES); } /* * Alter the permissions of a range of Xen virtual address space. * * Does not create new mappings, and does not modify the mfn in existing * mappings, but will shatter superpages if necessary, and will destroy * mappings if not passed _PAGE_PRESENT. * * The only flags considered are NX, D, A, RW and PRESENT. All other input * flags are ignored. * * It is an error to call with present flags over an unpopulated range. */ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) { bool locking = system_state > SYS_STATE_boot; l3_pgentry_t *pl3e = NULL; l2_pgentry_t *pl2e = NULL; l1_pgentry_t *pl1e; unsigned int i; unsigned long v = s; int rc = -ENOMEM; struct page_info *current_l3page; /* Set of valid PTE bits which may be altered. */ #define FLAGS_MASK (_PAGE_NX|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|_PAGE_PRESENT) nf &= FLAGS_MASK; ASSERT(IS_ALIGNED(s, PAGE_SIZE)); ASSERT(IS_ALIGNED(e, PAGE_SIZE)); L3T_INIT(current_l3page); while ( v < e ) { /* Clean up the previous iteration. */ L3T_UNLOCK(current_l3page); UNMAP_DOMAIN_PAGE(pl2e); UNMAP_DOMAIN_PAGE(pl3e); pl3e = virt_to_xen_l3e(v); if ( !pl3e ) goto out; current_l3page = mfn_to_page(domain_page_map_to_mfn(pl3e)); L3T_LOCK(current_l3page); if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ) { /* Confirm the caller isn't trying to create new mappings. 
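 *
 * Hedged example of what this catches (hypothetical callers): passing
 * nf without _PAGE_PRESENT, as destroy_xen_mappings() does, may
 * legitimately cross holes, and the two statements below simply advance
 * v to the next 1G boundary; passing a present set of flags such as
 * PAGE_HYPERVISOR_RO over an unpopulated range is a caller bug and
 * trips the ASSERT().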
*/ ASSERT(!(nf & _PAGE_PRESENT)); v += 1UL << L3_PAGETABLE_SHIFT; v &= ~((1UL << L3_PAGETABLE_SHIFT) - 1); continue; } if ( l3e_get_flags(*pl3e) & _PAGE_PSE ) { l2_pgentry_t *l2t; mfn_t l2mfn; if ( l2_table_offset(v) == 0 && l1_table_offset(v) == 0 && ((e - v) >= (1UL << L3_PAGETABLE_SHIFT)) ) { /* PAGE1GB: whole superpage is modified. */ l3_pgentry_t nl3e = !(nf & _PAGE_PRESENT) ? l3e_empty() : l3e_from_pfn(l3e_get_pfn(*pl3e), (l3e_get_flags(*pl3e) & ~FLAGS_MASK) | nf); l3e_write_atomic(pl3e, nl3e); v += 1UL << L3_PAGETABLE_SHIFT; continue; } /* PAGE1GB: shatter the superpage and fall through. */ l2mfn = alloc_xen_pagetable(); if ( mfn_eq(l2mfn, INVALID_MFN) ) goto out; l2t = map_domain_page(l2mfn); for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) l2e_write(l2t + i, l2e_from_pfn(l3e_get_pfn(*pl3e) + (i << PAGETABLE_ORDER), l3e_get_flags(*pl3e))); UNMAP_DOMAIN_PAGE(l2t); if ( locking ) spin_lock(&map_pgdir_lock); if ( (l3e_get_flags(*pl3e) & _PAGE_PRESENT) && (l3e_get_flags(*pl3e) & _PAGE_PSE) ) { l3e_write_atomic(pl3e, l3e_from_mfn(l2mfn, __PAGE_HYPERVISOR)); l2mfn = INVALID_MFN; } if ( locking ) spin_unlock(&map_pgdir_lock); free_xen_pagetable(l2mfn); } /* * The L3 entry has been verified to be present, and we've dealt with * 1G pages as well, so the L2 table cannot require allocation. */ pl2e = map_l2t_from_l3e(*pl3e) + l2_table_offset(v); if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ) { /* Confirm the caller isn't trying to create new mappings. */ ASSERT(!(nf & _PAGE_PRESENT)); v += 1UL << L2_PAGETABLE_SHIFT; v &= ~((1UL << L2_PAGETABLE_SHIFT) - 1); continue; } if ( l2e_get_flags(*pl2e) & _PAGE_PSE ) { if ( (l1_table_offset(v) == 0) && ((e-v) >= (1UL << L2_PAGETABLE_SHIFT)) ) { /* PSE: whole superpage is modified. */ l2_pgentry_t nl2e = !(nf & _PAGE_PRESENT) ? l2e_empty() : l2e_from_pfn(l2e_get_pfn(*pl2e), (l2e_get_flags(*pl2e) & ~FLAGS_MASK) | nf); l2e_write_atomic(pl2e, nl2e); v += 1UL << L2_PAGETABLE_SHIFT; } else { l1_pgentry_t *l1t; /* PSE: shatter the superpage and try again. */ mfn_t l1mfn = alloc_xen_pagetable(); if ( mfn_eq(l1mfn, INVALID_MFN) ) goto out; l1t = map_domain_page(l1mfn); for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) l1e_write(&l1t[i], l1e_from_pfn(l2e_get_pfn(*pl2e) + i, l2e_get_flags(*pl2e) & ~_PAGE_PSE)); UNMAP_DOMAIN_PAGE(l1t); if ( locking ) spin_lock(&map_pgdir_lock); if ( (l2e_get_flags(*pl2e) & _PAGE_PRESENT) && (l2e_get_flags(*pl2e) & _PAGE_PSE) ) { l2e_write_atomic(pl2e, l2e_from_mfn(l1mfn, __PAGE_HYPERVISOR)); l1mfn = INVALID_MFN; } if ( locking ) spin_unlock(&map_pgdir_lock); free_xen_pagetable(l1mfn); } } else { l1_pgentry_t nl1e, *l1t; mfn_t l1mfn; /* * Ordinary 4kB mapping: The L2 entry has been verified to be * present, and we've dealt with 2M pages as well, so the L1 table * cannot require allocation. */ pl1e = map_l1t_from_l2e(*pl2e) + l1_table_offset(v); /* Confirm the caller isn't trying to create new mappings. */ if ( !(l1e_get_flags(*pl1e) & _PAGE_PRESENT) ) ASSERT(!(nf & _PAGE_PRESENT)); nl1e = !(nf & _PAGE_PRESENT) ? l1e_empty() : l1e_from_pfn(l1e_get_pfn(*pl1e), (l1e_get_flags(*pl1e) & ~FLAGS_MASK) | nf); l1e_write_atomic(pl1e, nl1e); UNMAP_DOMAIN_PAGE(pl1e); v += PAGE_SIZE; /* * If we are not destroying mappings, or not done with the L2E, * skip the empty&free check. */ if ( (nf & _PAGE_PRESENT) || ((v != e) && (l1_table_offset(v) != 0)) ) continue; if ( locking ) spin_lock(&map_pgdir_lock); /* * L2E may be already cleared, or set to a superpage, by * concurrent paging structure modifications on other CPUs. 
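 *
 * Illustrative outcome of the scan below: if the L1 table behind this
 * L2E has just become entirely empty (every l1e_get_intpte(l1t[i]) is
 * zero, e.g. because the final 4k mapping in this 2M region was just
 * destroyed), the L2E is zapped, a global TLB flush is issued, and only
 * then is the L1 page handed back via free_xen_pagetable() -- the
 * flush-before-free ordering keeps other CPUs from walking a freed
 * table.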
*/ if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ) { if ( locking ) spin_unlock(&map_pgdir_lock); goto check_l3; } if ( l2e_get_flags(*pl2e) & _PAGE_PSE ) { if ( locking ) spin_unlock(&map_pgdir_lock); continue; } l1mfn = l2e_get_mfn(*pl2e); l1t = map_domain_page(l1mfn); for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) if ( l1e_get_intpte(l1t[i]) != 0 ) break; UNMAP_DOMAIN_PAGE(l1t); if ( i == L1_PAGETABLE_ENTRIES ) { /* Empty: zap the L2E and free the L1 page. */ l2e_write_atomic(pl2e, l2e_empty()); if ( locking ) spin_unlock(&map_pgdir_lock); flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */ free_xen_pagetable(l1mfn); } else if ( locking ) spin_unlock(&map_pgdir_lock); } check_l3: /* * If we are not destroying mappings, or not done with the L3E, * skip the empty&free check. */ if ( (nf & _PAGE_PRESENT) || ((v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0)) ) continue; if ( locking ) spin_lock(&map_pgdir_lock); /* * L3E may be already cleared, or set to a superpage, by * concurrent paging structure modifications on other CPUs. */ if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) || (l3e_get_flags(*pl3e) & _PAGE_PSE) ) { if ( locking ) spin_unlock(&map_pgdir_lock); continue; } { l2_pgentry_t *l2t; mfn_t l2mfn = l3e_get_mfn(*pl3e); l2t = map_domain_page(l2mfn); for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) if ( l2e_get_intpte(l2t[i]) != 0 ) break; UNMAP_DOMAIN_PAGE(l2t); if ( i == L2_PAGETABLE_ENTRIES ) { /* Empty: zap the L3E and free the L2 page. */ l3e_write_atomic(pl3e, l3e_empty()); if ( locking ) spin_unlock(&map_pgdir_lock); flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */ free_xen_pagetable(l2mfn); } else if ( locking ) spin_unlock(&map_pgdir_lock); } } flush_area(NULL, FLUSH_TLB_GLOBAL); #undef FLAGS_MASK rc = 0; out: L3T_UNLOCK(current_l3page); unmap_domain_page(pl3e); unmap_domain_page(pl2e); return rc; } #undef L3T_LOCK #undef L3T_UNLOCK #undef flush_area int destroy_xen_mappings(unsigned long s, unsigned long e) { return modify_xen_mappings(s, e, _PAGE_NONE); } /* * Similar to modify_xen_mappings(), but used by the alternatives and * livepatch in weird contexts. All synchronization, TLB flushing, etc is the * responsibility of the caller, and *MUST* not be introduced here. * * Must be limited to XEN_VIRT_{START,END}, i.e. over l2_xenmap[]. * Must be called with present flags, and over present mappings. * It is the callers responsibility to not pass s or e in the middle of * superpages if changing the permission on the whole superpage is going to be * a problem. */ void init_or_livepatch modify_xen_mappings_lite( unsigned long s, unsigned long e, unsigned int _nf) { unsigned long v = s, fm, nf; /* Set of valid PTE bits which may be altered. 
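 *
 * A minimal usage sketch (illustrative only; the range symbols are
 * placeholders, and the real callers live in the alternatives/livepatch
 * code): temporarily make a patched text range writable and restore it,
 * with every flush and synchronisation left to the caller as stated
 * above:
 *
 *     modify_xen_mappings_lite(text_start, text_end, PAGE_HYPERVISOR_RWX);
 *     ... patch the instructions ...
 *     modify_xen_mappings_lite(text_start, text_end, PAGE_HYPERVISOR_RX);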
*/ #define FLAGS_MASK (_PAGE_NX|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|_PAGE_PRESENT) fm = put_pte_flags(FLAGS_MASK); nf = put_pte_flags(_nf & FLAGS_MASK); #undef FLAGS_MASK ASSERT(nf & _PAGE_PRESENT); ASSERT(IS_ALIGNED(s, PAGE_SIZE) && s >= XEN_VIRT_START); ASSERT(IS_ALIGNED(e, PAGE_SIZE) && e <= XEN_VIRT_END); while ( v < e ) { l2_pgentry_t *pl2e = &l2_xenmap[l2_table_offset(v)]; l2_pgentry_t l2e = l2e_read_atomic(pl2e); unsigned int l2f = l2e_get_flags(l2e); ASSERT(l2f & _PAGE_PRESENT); if ( l2e_get_flags(l2e) & _PAGE_PSE ) { l2e_write_atomic(pl2e, l2e_from_intpte((l2e.l2 & ~fm) | nf)); v += 1UL << L2_PAGETABLE_SHIFT; continue; } /* else descend to l1 */ { l1_pgentry_t *pl1t = map_l1t_from_l2e(l2e); while ( v < e ) { l1_pgentry_t *pl1e = &pl1t[l1_table_offset(v)]; l1_pgentry_t l1e = l1e_read_atomic(pl1e); unsigned int l1f = l1e_get_flags(l1e); ASSERT(l1f & _PAGE_PRESENT); l1e_write_atomic(pl1e, l1e_from_intpte((l1e.l1 & ~fm) | nf)); v += 1UL << L1_PAGETABLE_SHIFT; if ( l2_table_offset(v) == 0 ) break; } unmap_domain_page(pl1t); } } } void __set_fixmap( enum fixed_addresses idx, unsigned long mfn, unsigned long flags) { BUG_ON(idx >= __end_of_fixed_addresses || idx <= FIX_RESERVED); map_pages_to_xen(__fix_to_virt(idx), _mfn(mfn), 1, flags); } void __set_fixmap_x( enum fixed_addresses_x idx, unsigned long mfn, unsigned long flags) { BUG_ON(idx >= __end_of_fixed_addresses_x || idx <= FIX_X_RESERVED); map_pages_to_xen(__fix_x_to_virt(idx), _mfn(mfn), 1, flags); } void *__init arch_vmap_virt_end(void) { return fix_to_virt(__end_of_fixed_addresses); } void __iomem *ioremap(paddr_t pa, size_t len) { mfn_t mfn = _mfn(PFN_DOWN(pa)); void *va; WARN_ON(page_is_ram_type(mfn_x(mfn), RAM_TYPE_CONVENTIONAL)); /* The low first Mb is always mapped. */ if ( !((pa + len - 1) >> 20) ) va = __va(pa); else { unsigned int offs = pa & (PAGE_SIZE - 1); unsigned int nr = PFN_UP(offs + len); va = __vmap(&mfn, nr, 1, 1, PAGE_HYPERVISOR_UCMINUS, VMAP_DEFAULT) + offs; } return (void __force __iomem *)va; } void __iomem *__init ioremap_wc(paddr_t pa, size_t len) { mfn_t mfn = _mfn(PFN_DOWN(pa)); unsigned int offs = pa & (PAGE_SIZE - 1); unsigned int nr = PFN_UP(offs + len); void *va; WARN_ON(page_is_ram_type(mfn_x(mfn), RAM_TYPE_CONVENTIONAL)); va = __vmap(&mfn, nr, 1, 1, PAGE_HYPERVISOR_WC, VMAP_DEFAULT); return (void __force __iomem *)(va + offs); } int create_perdomain_mapping(struct domain *d, unsigned long va, unsigned int nr, l1_pgentry_t **pl1tab, struct page_info **ppg) { struct page_info *pg; l3_pgentry_t *l3tab; l2_pgentry_t *l2tab; l1_pgentry_t *l1tab; int rc = 0; ASSERT(va >= PERDOMAIN_VIRT_START && va < PERDOMAIN_VIRT_SLOT(PERDOMAIN_SLOTS)); if ( !d->arch.perdomain_l3_pg ) { pg = alloc_domheap_page(d, MEMF_no_owner); if ( !pg ) return -ENOMEM; l3tab = __map_domain_page(pg); clear_page(l3tab); d->arch.perdomain_l3_pg = pg; if ( !nr ) { unmap_domain_page(l3tab); return 0; } } else if ( !nr ) return 0; else l3tab = __map_domain_page(d->arch.perdomain_l3_pg); ASSERT(!l3_table_offset(va ^ (va + nr * PAGE_SIZE - 1))); if ( !(l3e_get_flags(l3tab[l3_table_offset(va)]) & _PAGE_PRESENT) ) { pg = alloc_domheap_page(d, MEMF_no_owner); if ( !pg ) { unmap_domain_page(l3tab); return -ENOMEM; } l2tab = __map_domain_page(pg); clear_page(l2tab); l3tab[l3_table_offset(va)] = l3e_from_page(pg, __PAGE_HYPERVISOR_RW); } else l2tab = map_l2t_from_l3e(l3tab[l3_table_offset(va)]); unmap_domain_page(l3tab); if ( !pl1tab && !ppg ) { unmap_domain_page(l2tab); return 0; } for ( l1tab = NULL; !rc && nr--; ) { l2_pgentry_t *pl2e = l2tab 
+ l2_table_offset(va); if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ) { if ( pl1tab && !IS_NIL(pl1tab) ) { l1tab = alloc_xenheap_pages(0, MEMF_node(domain_to_node(d))); if ( !l1tab ) { rc = -ENOMEM; break; } ASSERT(!pl1tab[l2_table_offset(va)]); pl1tab[l2_table_offset(va)] = l1tab; pg = virt_to_page(l1tab); } else { pg = alloc_domheap_page(d, MEMF_no_owner); if ( !pg ) { rc = -ENOMEM; break; } l1tab = __map_domain_page(pg); } clear_page(l1tab); *pl2e = l2e_from_page(pg, __PAGE_HYPERVISOR_RW); } else if ( !l1tab ) l1tab = map_l1t_from_l2e(*pl2e); if ( ppg && !(l1e_get_flags(l1tab[l1_table_offset(va)]) & _PAGE_PRESENT) ) { pg = alloc_domheap_page(d, MEMF_no_owner); if ( pg ) { clear_domain_page(page_to_mfn(pg)); if ( !IS_NIL(ppg) ) *ppg++ = pg; l1tab[l1_table_offset(va)] = l1e_from_page(pg, __PAGE_HYPERVISOR_RW | _PAGE_AVAIL0); l2e_add_flags(*pl2e, _PAGE_AVAIL0); } else rc = -ENOMEM; } va += PAGE_SIZE; if ( rc || !nr || !l1_table_offset(va) ) { /* Note that this is a no-op for the alloc_xenheap_page() case. */ unmap_domain_page(l1tab); l1tab = NULL; } } ASSERT(!l1tab); unmap_domain_page(l2tab); return rc; } void destroy_perdomain_mapping(struct domain *d, unsigned long va, unsigned int nr) { const l3_pgentry_t *l3tab, *pl3e; ASSERT(va >= PERDOMAIN_VIRT_START && va < PERDOMAIN_VIRT_SLOT(PERDOMAIN_SLOTS)); ASSERT(!nr || !l3_table_offset(va ^ (va + nr * PAGE_SIZE - 1))); if ( !d->arch.perdomain_l3_pg ) return; l3tab = __map_domain_page(d->arch.perdomain_l3_pg); pl3e = l3tab + l3_table_offset(va); if ( l3e_get_flags(*pl3e) & _PAGE_PRESENT ) { const l2_pgentry_t *l2tab = map_l2t_from_l3e(*pl3e); const l2_pgentry_t *pl2e = l2tab + l2_table_offset(va); unsigned int i = l1_table_offset(va); while ( nr ) { if ( l2e_get_flags(*pl2e) & _PAGE_PRESENT ) { l1_pgentry_t *l1tab = map_l1t_from_l2e(*pl2e); for ( ; nr && i < L1_PAGETABLE_ENTRIES; --nr, ++i ) { if ( (l1e_get_flags(l1tab[i]) & (_PAGE_PRESENT | _PAGE_AVAIL0)) == (_PAGE_PRESENT | _PAGE_AVAIL0) ) free_domheap_page(l1e_get_page(l1tab[i])); l1tab[i] = l1e_empty(); } unmap_domain_page(l1tab); } else if ( nr + i < L1_PAGETABLE_ENTRIES ) break; else nr -= L1_PAGETABLE_ENTRIES - i; ++pl2e; i = 0; } unmap_domain_page(l2tab); } unmap_domain_page(l3tab); } void free_perdomain_mappings(struct domain *d) { l3_pgentry_t *l3tab; unsigned int i; if ( !d->arch.perdomain_l3_pg ) return; l3tab = __map_domain_page(d->arch.perdomain_l3_pg); for ( i = 0; i < PERDOMAIN_SLOTS; ++i) if ( l3e_get_flags(l3tab[i]) & _PAGE_PRESENT ) { struct page_info *l2pg = l3e_get_page(l3tab[i]); l2_pgentry_t *l2tab = __map_domain_page(l2pg); unsigned int j; for ( j = 0; j < L2_PAGETABLE_ENTRIES; ++j ) if ( l2e_get_flags(l2tab[j]) & _PAGE_PRESENT ) { struct page_info *l1pg = l2e_get_page(l2tab[j]); if ( l2e_get_flags(l2tab[j]) & _PAGE_AVAIL0 ) { l1_pgentry_t *l1tab = __map_domain_page(l1pg); unsigned int k; for ( k = 0; k < L1_PAGETABLE_ENTRIES; ++k ) if ( (l1e_get_flags(l1tab[k]) & (_PAGE_PRESENT | _PAGE_AVAIL0)) == (_PAGE_PRESENT | _PAGE_AVAIL0) ) free_domheap_page(l1e_get_page(l1tab[k])); unmap_domain_page(l1tab); } if ( is_xen_heap_page(l1pg) ) free_xenheap_page(page_to_virt(l1pg)); else free_domheap_page(l1pg); } unmap_domain_page(l2tab); free_domheap_page(l2pg); } unmap_domain_page(l3tab); free_domheap_page(d->arch.perdomain_l3_pg); d->arch.perdomain_l3_pg = NULL; } static void write_sss_token(unsigned long *ptr) { /* * A supervisor shadow stack token is its own linear address, with the * busy bit (0) clear. 
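 *
 * Worked example (illustrative): memguard_guard_stack() below passes
 * e.g. p + PAGE_SIZE - 8 for the primary shadow stack, so the top
 * 8-byte slot of that stack ends up holding its own linear address;
 * because the pointer is 8-byte aligned, bit 0 (the busy bit) is
 * naturally clear, which is exactly what "*ptr = (unsigned long)ptr"
 * below stores.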
*/ *ptr = (unsigned long)ptr; } void memguard_guard_stack(void *p) { /* IST Shadow stacks. 4x 1k in stack page 0. */ if ( IS_ENABLED(CONFIG_XEN_SHSTK) ) { write_sss_token(p + (IST_MCE * IST_SHSTK_SIZE) - 8); write_sss_token(p + (IST_NMI * IST_SHSTK_SIZE) - 8); write_sss_token(p + (IST_DB * IST_SHSTK_SIZE) - 8); write_sss_token(p + (IST_DF * IST_SHSTK_SIZE) - 8); } map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, PAGE_HYPERVISOR_SHSTK); /* Primary Shadow Stack. 1x 4k in stack page 5. */ p += PRIMARY_SHSTK_SLOT * PAGE_SIZE; if ( IS_ENABLED(CONFIG_XEN_SHSTK) ) write_sss_token(p + PAGE_SIZE - 8); map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, PAGE_HYPERVISOR_SHSTK); } void memguard_unguard_stack(void *p) { map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, PAGE_HYPERVISOR_RW); p += PRIMARY_SHSTK_SLOT * PAGE_SIZE; map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, PAGE_HYPERVISOR_RW); } void arch_dump_shared_mem_info(void) { printk("Shared frames %u -- Saved frames %u\n", mem_sharing_get_nr_shared_mfns(), mem_sharing_get_nr_saved_mfns()); } const struct platform_bad_page *__init get_platform_badpages(unsigned int *array_size) { u32 igd_id; static const struct platform_bad_page __initconst snb_bad_pages[] = { { .mfn = 0x20050000 >> PAGE_SHIFT }, { .mfn = 0x20110000 >> PAGE_SHIFT }, { .mfn = 0x20130000 >> PAGE_SHIFT }, { .mfn = 0x20138000 >> PAGE_SHIFT }, { .mfn = 0x40004000 >> PAGE_SHIFT }, }; static const struct platform_bad_page __initconst hle_bad_page = { .mfn = 0x40000000 >> PAGE_SHIFT, .order = 10 }; switch ( cpuid_eax(1) & 0x000f3ff0 ) { case 0x000406e0: /* erratum SKL167 */ case 0x00050650: /* erratum SKZ63 */ case 0x000506e0: /* errata SKL167 / SKW159 */ case 0x000806e0: /* erratum KBL??? */ case 0x000906e0: /* errata KBL??? / KBW114 / CFW103 */ *array_size = (cpuid_eax(0) >= 7 && !cpu_has_hypervisor && (cpuid_count_ebx(7, 0) & cpufeat_mask(X86_FEATURE_HLE))); return &hle_bad_page; } *array_size = ARRAY_SIZE(snb_bad_pages); igd_id = pci_conf_read32(PCI_SBDF(0, 0, 2, 0), 0); if ( IS_SNB_GFX(igd_id) ) return snb_bad_pages; return NULL; } void paging_invlpg(struct vcpu *v, unsigned long linear) { if ( !is_canonical_address(linear) ) return; if ( paging_mode_enabled(v->domain) && !paging_get_hostmode(v)->invlpg(v, linear) ) return; if ( is_pv_vcpu(v) ) flush_tlb_one_local(linear); else hvm_invlpg(v, linear); } /* Build a 32bit PSE page table using 4MB pages. */ void write_32bit_pse_identmap(uint32_t *l2) { unsigned int i; for ( i = 0; i < PAGE_SIZE / sizeof(*l2); i++ ) l2[i] = ((i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); } unsigned long get_upper_mfn_bound(void) { unsigned long max_mfn; max_mfn = mem_hotplug ? PFN_DOWN(mem_hotplug) : max_page; #ifndef CONFIG_BIGMEM max_mfn = min(max_mfn, 1UL << 32); #endif return min(max_mfn, 1UL << (paddr_bits - PAGE_SHIFT)) - 1; } static void __init __maybe_unused build_assertions(void) { /* * If this trips, any guests that blindly rely on the public API in xen.h * (instead of reading the PAT from Xen, as Linux 3.19+ does) will be * broken. Furthermore, live migration of PV guests between Xen versions * using different PATs will not work. */ BUILD_BUG_ON(XEN_MSR_PAT != 0x050100070406ULL); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */