/*
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; If not, see <http://www.gnu.org/licenses/>.
 */

#include <xen/cpu.h>
#include <xen/sched.h>
#include <xen/iocap.h>
#include <xen/iommu.h>
#include <xen/paging.h>
#include <xen/guest_access.h>
#include <xen/event.h>
#include <xen/softirq.h>
#include <xen/vm_event.h>
#include <xsm/xsm.h>

#include <asm/hvm/io.h>
#include <asm/io_apic.h>
#include <asm/mem_paging.h>
#include <asm/pt-contig-markers.h>
#include <asm/setup.h>

const struct iommu_init_ops *__initdata iommu_init_ops;
struct iommu_ops __ro_after_init iommu_ops;
bool __read_mostly iommu_non_coherent;
bool __initdata iommu_superpages = true;

enum iommu_intremap __read_mostly iommu_intremap = iommu_intremap_full;

#ifdef CONFIG_PV
/* Possible unfiltered LAPIC/MSI messages from untrusted sources? */
bool __read_mostly untrusted_msi;
#endif

#ifndef iommu_intpost
/*
 * In the current implementation of VT-d posted interrupts, in some extreme
 * cases the per-CPU list holding blocked vCPUs can grow very long, hurting
 * interrupt latency.  Hence leave this feature off by default until a good
 * solution to resolve that is found.
 */
bool __read_mostly iommu_intpost;
#endif

void __init acpi_iommu_init(void)
{
    int ret = -ENODEV;

    if ( !iommu_enable && !iommu_intremap )
        return;

    if ( !acpi_disabled )
    {
        ret = acpi_dmar_init();

#ifndef iommu_snoop
        /*
         * As long as there's no per-domain snoop control, and as long as on
         * AMD we uniformly force coherent accesses, a possible command line
         * override should affect VT-d only.
         */
        if ( ret )
            iommu_snoop = true;
#endif

        if ( ret == -ENODEV )
            ret = acpi_ivrs_init();
    }

    if ( ret )
    {
        iommu_enable = false;
        iommu_intremap = iommu_intremap_off;
    }
}

int __init iommu_hardware_setup(void)
{
    struct IO_APIC_route_entry **ioapic_entries = NULL;
    int rc;

    if ( !iommu_init_ops )
        return -ENODEV;

    rc = scan_pci_devices();
    if ( rc )
        return rc;

    if ( !iommu_ops.init )
        iommu_ops = *iommu_init_ops->ops;
    else
        /* x2apic setup may have previously initialised the struct. */
        ASSERT(iommu_ops.init == iommu_init_ops->ops->init);

    if ( !x2apic_enabled && iommu_intremap )
    {
        /*
         * If x2APIC is enabled, interrupt remapping is already enabled, so
         * there's no need to mess with the IO-APIC because the remapping
         * entries are already correctly set up by x2apic_bsp_setup.
         */
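        /*
         * Otherwise save and mask the legacy interrupt controllers (IO-APIC
         * RTEs and the 8259A) up front, so interrupt remapping can be
         * enabled without stray interrupts getting through; the saved state
         * is restored (and the 8259A unmasked) once setup has completed
         * further down.
         */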
        ioapic_entries = alloc_ioapic_entries();
        if ( !ioapic_entries )
            return -ENOMEM;

        rc = save_IO_APIC_setup(ioapic_entries);
        if ( rc )
        {
            free_ioapic_entries(ioapic_entries);
            return rc;
        }

        mask_8259A();
        mask_IO_APIC_setup(ioapic_entries);
    }

    if ( !iommu_superpages )
        iommu_ops.page_sizes &= PAGE_SIZE_4K;

    rc = iommu_init_ops->setup();
    ASSERT(iommu_superpages || iommu_ops.page_sizes == PAGE_SIZE_4K);

    if ( ioapic_entries )
    {
        restore_IO_APIC_setup(ioapic_entries, rc);
        unmask_8259A();
        free_ioapic_entries(ioapic_entries);
    }

    return rc;
}

int iommu_enable_x2apic(void)
{
    if ( system_state < SYS_STATE_active )
    {
        if ( !iommu_supports_x2apic() )
            return -EOPNOTSUPP;

        iommu_ops = *iommu_init_ops->ops;
    }
    else if ( !x2apic_enabled )
        return -EOPNOTSUPP;

    if ( !iommu_ops.enable_x2apic )
        return -EOPNOTSUPP;

    return iommu_call(&iommu_ops, enable_x2apic);
}

void iommu_update_ire_from_apic(
    unsigned int apic, unsigned int reg, unsigned int value)
{
    iommu_vcall(&iommu_ops, update_ire_from_apic, apic, reg, value);
}

unsigned int iommu_read_apic_from_ire(unsigned int apic, unsigned int reg)
{
    return iommu_call(&iommu_ops, read_apic_from_ire, apic, reg);
}

int __init iommu_setup_hpet_msi(struct msi_desc *msi)
{
    const struct iommu_ops *ops = iommu_get_ops();

    return ops->setup_hpet_msi ? iommu_call(ops, setup_hpet_msi, msi)
                               : -ENODEV;
}

void __hwdom_init arch_iommu_check_autotranslated_hwdom(struct domain *d)
{
    if ( !is_iommu_enabled(d) )
        panic("Presently, iommu must be enabled for PVH hardware domain\n");

    if ( !iommu_hwdom_strict )
        panic("PVH hardware domain iommu must be set in 'strict' mode\n");
}

int arch_iommu_domain_init(struct domain *d)
{
    struct domain_iommu *hd = dom_iommu(d);

    spin_lock_init(&hd->arch.mapping_lock);

    INIT_PAGE_LIST_HEAD(&hd->arch.pgtables.list);
    spin_lock_init(&hd->arch.pgtables.lock);
    INIT_LIST_HEAD(&hd->arch.identity_maps);

    return 0;
}

void arch_iommu_domain_destroy(struct domain *d)
{
    /*
     * There should be no page tables left allocated by the time the
     * domain is destroyed.  Note that arch_iommu_domain_destroy() is
     * called unconditionally, so pgtables may be uninitialized.
     */
    ASSERT(!dom_iommu(d)->platform_ops ||
           page_list_empty(&dom_iommu(d)->arch.pgtables.list));
}

struct identity_map {
    struct list_head list;
    paddr_t base, end;
    p2m_access_t access;
    unsigned int count;
};

int iommu_identity_mapping(struct domain *d, p2m_access_t p2ma,
                           paddr_t base, paddr_t end,
                           unsigned int flag)
{
    unsigned long base_pfn = base >> PAGE_SHIFT_4K;
    unsigned long end_pfn = PAGE_ALIGN_4K(end) >> PAGE_SHIFT_4K;
    struct identity_map *map;
    struct domain_iommu *hd = dom_iommu(d);

    ASSERT(pcidevs_locked());
    ASSERT(base < end);

    /*
     * No need to acquire hd->arch.mapping_lock: Both insertion and removal
     * get done while holding pcidevs_lock.
     */
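    /*
     * Look for an existing identity map of exactly this region: requests to
     * add a mapping (p2ma != p2m_access_x) merely bump its reference count
     * (and require a matching access type), while removal requests
     * (p2m_access_x) drop a reference, tearing down the p2m entries once the
     * count reaches zero.  Partially overlapping regions are rejected.
     */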
    list_for_each_entry( map, &hd->arch.identity_maps, list )
    {
        if ( map->base == base && map->end == end )
        {
            int ret = 0;

            if ( p2ma != p2m_access_x )
            {
                if ( map->access != p2ma )
                    return -EADDRINUSE;
                ++map->count;
                return 0;
            }

            if ( --map->count )
                return 0;

            while ( base_pfn < end_pfn )
            {
                if ( clear_identity_p2m_entry(d, base_pfn) )
                    ret = -ENXIO;
                base_pfn++;
            }

            list_del(&map->list);
            xfree(map);

            return ret;
        }

        if ( end >= map->base && map->end >= base )
            return -EADDRINUSE;
    }

    if ( p2ma == p2m_access_x )
        return -ENOENT;

    while ( base_pfn < end_pfn )
    {
        int err = set_identity_p2m_entry(d, base_pfn, p2ma, flag);

        if ( err )
            return err;
        base_pfn++;
    }

    map = xmalloc(struct identity_map);
    if ( !map )
        return -ENOMEM;
    map->base = base;
    map->end = end;
    map->access = p2ma;
    map->count = 1;
    list_add_tail(&map->list, &hd->arch.identity_maps);

    return 0;
}

void iommu_identity_map_teardown(struct domain *d)
{
    struct domain_iommu *hd = dom_iommu(d);
    struct identity_map *map, *tmp;

    list_for_each_entry_safe ( map, tmp, &hd->arch.identity_maps, list )
    {
        list_del(&map->list);
        xfree(map);
    }
}

static unsigned int __hwdom_init hwdom_iommu_map(const struct domain *d,
                                                 unsigned long pfn,
                                                 unsigned long max_pfn)
{
    mfn_t mfn = _mfn(pfn);
    unsigned int i, type, perms = IOMMUF_readable | IOMMUF_writable;

    /*
     * Set up 1:1 mapping for dom0. Default to include only conventional RAM
     * areas and let RMRRs include needed reserved regions. When set, the
     * inclusive mapping additionally maps in every pfn up to 4GB except
     * those that fall in unusable ranges for PV Dom0.
     */
    if ( (pfn > max_pfn && !mfn_valid(mfn)) || xen_in_range(pfn) )
        return 0;

    switch ( type = page_get_ram_type(mfn) )
    {
    case RAM_TYPE_UNUSABLE:
        return 0;

    case RAM_TYPE_CONVENTIONAL:
        if ( iommu_hwdom_strict )
            return 0;
        break;

    default:
        if ( type & RAM_TYPE_RESERVED )
        {
            if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved )
                perms = 0;
        }
        else if ( is_hvm_domain(d) )
            return 0;
        else if ( !iommu_hwdom_inclusive || pfn > max_pfn )
            perms = 0;
    }

    /* Check that it doesn't overlap with the Interrupt Address Range. */
    if ( pfn >= 0xfee00 && pfn <= 0xfeeff )
        return 0;

    /* ... or the IO-APIC */
    if ( has_vioapic(d) )
    {
        for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ )
            if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) )
                return 0;
    }
    else if ( is_pv_domain(d) )
    {
        /*
         * Be consistent with CPU mappings: Dom0 is permitted to establish
         * r/o ones there (also for e.g. HPET in certain cases), so it should
         * also have such established for IOMMUs.
         */
        if ( iomem_access_permitted(d, pfn, pfn) &&
             rangeset_contains_singleton(mmio_ro_ranges, pfn) )
            perms = IOMMUF_readable;
    }

    /*
     * ... or the PCIe MCFG regions.
     * TODO: runtime added MMCFG regions are not checked to make sure they
     * don't overlap with already mapped regions, thus preventing trapping.
     */
    if ( has_vpci(d) && vpci_is_mmcfg_address(d, pfn_to_paddr(pfn)) )
        return 0;

    return perms;
}

void __hwdom_init arch_iommu_hwdom_init(struct domain *d)
{
    unsigned long i, top, max_pfn, start, count;
    unsigned int flush_flags = 0, start_perms = 0;

    BUG_ON(!is_hardware_domain(d));

    /* Reserved IOMMU mappings are enabled by default. */
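    /*
     * iommu_hwdom_reserved being negative here means the setting wasn't
     * explicitly overridden either way, so apply the default of mapping
     * reserved regions.
     */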
    if ( iommu_hwdom_reserved == -1 )
        iommu_hwdom_reserved = 1;

    if ( iommu_hwdom_inclusive )
    {
        printk(XENLOG_WARNING
               "IOMMU inclusive mappings are deprecated and will be removed in future versions\n");

        if ( !is_pv_domain(d) )
        {
            printk(XENLOG_WARNING
                   "IOMMU inclusive mappings are only supported on PV Dom0\n");
            iommu_hwdom_inclusive = false;
        }
    }

    if ( iommu_hwdom_passthrough )
        return;

    max_pfn = (GB(4) >> PAGE_SHIFT) - 1;
    top = max(max_pdx, pfn_to_pdx(max_pfn) + 1);

    /*
     * First Mb will get mapped in one go by pvh_populate_p2m(). Avoid
     * setting up potentially conflicting mappings here.
     */
    start = paging_mode_translate(d) ? PFN_DOWN(MB(1)) : 0;

    for ( i = start, count = 0; i < top; )
    {
        unsigned long pfn = pdx_to_pfn(i);
        unsigned int perms = hwdom_iommu_map(d, pfn, max_pfn);

        if ( !perms )
            /* nothing */;
        else if ( paging_mode_translate(d) )
        {
            int rc;

            rc = p2m_add_identity_entry(d, pfn,
                                        perms & IOMMUF_writable ? p2m_access_rw
                                                                : p2m_access_r,
                                        0);
            if ( rc )
                printk(XENLOG_WARNING
                       "%pd: identity mapping of %lx failed: %d\n",
                       d, pfn, rc);
        }
        else if ( pfn != start + count || perms != start_perms )
        {
            long rc;

        commit:
            while ( (rc = iommu_map(d, _dfn(start), _mfn(start), count,
                                    start_perms | IOMMUF_preempt,
                                    &flush_flags)) > 0 )
            {
                start += rc;
                count -= rc;
                process_pending_softirqs();
            }
            if ( rc )
                printk(XENLOG_WARNING
                       "%pd: IOMMU identity mapping of [%lx,%lx) failed: %ld\n",
                       d, start, start + count, rc);
            start = pfn;
            count = 1;
            start_perms = perms;
        }
        else
            ++count;

        if ( !(++i & 0xfffff) )
            process_pending_softirqs();

        if ( i == top && count )
            goto commit;
    }

    /* Use "if" to avoid a compiler warning about ignoring the return value. */
    if ( iommu_iotlb_flush_all(d, flush_flags) )
        return;
}

void arch_pci_init_pdev(struct pci_dev *pdev)
{
    pdev->arch.pseudo_domid = DOMID_INVALID;
}

unsigned long *__init iommu_init_domid(domid_t reserve)
{
    unsigned long *map;

    if ( !iommu_quarantine )
        return ZERO_BLOCK_PTR;

    BUILD_BUG_ON(DOMID_MASK * 2U >= UINT16_MAX);

    map = xzalloc_array(unsigned long,
                        BITS_TO_LONGS(UINT16_MAX - DOMID_MASK));

    if ( map && reserve != DOMID_INVALID )
    {
        ASSERT(reserve > DOMID_MASK);
        __set_bit(reserve & DOMID_MASK, map);
    }

    return map;
}

domid_t iommu_alloc_domid(unsigned long *map)
{
    /*
     * This is used uniformly across all IOMMUs, such that on typical
     * systems we wouldn't re-use the same ID very quickly (perhaps never).
     */
    static unsigned int start;
    unsigned int idx = find_next_zero_bit(map, UINT16_MAX - DOMID_MASK, start);

    ASSERT(pcidevs_locked());

    if ( idx >= UINT16_MAX - DOMID_MASK )
        idx = find_first_zero_bit(map, UINT16_MAX - DOMID_MASK);
    if ( idx >= UINT16_MAX - DOMID_MASK )
        return DOMID_INVALID;

    __set_bit(idx, map);

    start = idx + 1;

    return idx | (DOMID_MASK + 1);
}

void iommu_free_domid(domid_t domid, unsigned long *map)
{
    ASSERT(pcidevs_locked());

    if ( domid == DOMID_INVALID )
        return;

    ASSERT(domid > DOMID_MASK);

    if ( !__test_and_clear_bit(domid & DOMID_MASK, map) )
        BUG();
}

int iommu_free_pgtables(struct domain *d)
{
    struct domain_iommu *hd = dom_iommu(d);
    struct page_info *pg;
    unsigned int done = 0;

    if ( !is_iommu_enabled(d) )
        return 0;

    /* After this barrier, no new IOMMU mappings can be inserted. */
    spin_barrier(&hd->arch.mapping_lock);

    /*
     * Pages will be moved to the free list below, so clear the root
     * page table first to avoid any potential use-after-free.
     */
    iommu_vcall(hd->platform_ops, clear_root_pgtable, d);

    while ( (pg = page_list_remove_head(&hd->arch.pgtables.list)) )
    {
        free_domheap_page(pg);

        /* Preempt every 256 pages freed, letting the caller retry via -ERESTART. */
        if ( !(++done & 0xff) && general_preempt_check() )
            return -ERESTART;
    }

    return 0;
}

struct page_info *iommu_alloc_pgtable(struct domain_iommu *hd,
                                      uint64_t contig_mask)
{
    unsigned int memflags = 0;
    struct page_info *pg;
    uint64_t *p;

#ifdef CONFIG_NUMA
    if ( hd->node != NUMA_NO_NODE )
        memflags = MEMF_node(hd->node);
#endif

    pg = alloc_domheap_page(NULL, memflags);
    if ( !pg )
        return NULL;

    p = __map_domain_page(pg);

    if ( contig_mask )
    {
        /* See pt-contig-markers.h for a description of the marker scheme. */
        unsigned int i, shift = find_first_set_bit(contig_mask);

        ASSERT((CONTIG_LEVEL_SHIFT & (contig_mask >> shift)) ==
               CONTIG_LEVEL_SHIFT);

        p[0] = (CONTIG_LEVEL_SHIFT + 0ull) << shift;
        p[1] = 0;
        p[2] = 1ull << shift;
        p[3] = 0;

        for ( i = 4; i < PAGE_SIZE / sizeof(*p); i += 4 )
        {
            p[i + 0] = (find_first_set_bit(i) + 0ull) << shift;
            p[i + 1] = 0;
            p[i + 2] = 1ull << shift;
            p[i + 3] = 0;
        }
    }
    else
        clear_page(p);

    iommu_sync_cache(p, PAGE_SIZE);

    unmap_domain_page(p);

    spin_lock(&hd->arch.pgtables.lock);
    page_list_add(pg, &hd->arch.pgtables.list);
    spin_unlock(&hd->arch.pgtables.lock);

    return pg;
}

/*
 * Intermediate page tables which get replaced by large pages may only be
 * freed after a suitable IOTLB flush. Hence such pages get queued on a
 * per-CPU list, with a per-CPU tasklet processing the list on the assumption
 * that the necessary IOTLB flush will have occurred by the time tasklets get
 * to run. (List and tasklet being per-CPU has the benefit of accesses not
 * requiring any locking.)
 */
static DEFINE_PER_CPU(struct page_list_head, free_pgt_list);
static DEFINE_PER_CPU(struct tasklet, free_pgt_tasklet);

static void cf_check free_queued_pgtables(void *arg)
{
    struct page_list_head *list = arg;
    struct page_info *pg;
    unsigned int done = 0;

    ASSERT(list == &this_cpu(free_pgt_list));

    while ( (pg = page_list_remove_head(list)) )
    {
        free_domheap_page(pg);

        /*
         * Just to be on the safe side, check for processing softirqs every
         * once in a while. Generally it is expected that parties queuing
         * pages for freeing will find a need for preemption before too many
         * pages can be queued. Granularity of checking is somewhat
         * arbitrary.
         */
        if ( !(++done & 0x1ff) )
            process_pending_softirqs();
    }
}

void iommu_queue_free_pgtable(struct domain_iommu *hd, struct page_info *pg)
{
    unsigned int cpu = smp_processor_id();

    spin_lock(&hd->arch.pgtables.lock);
    page_list_del(pg, &hd->arch.pgtables.list);
    spin_unlock(&hd->arch.pgtables.lock);

    page_list_add_tail(pg, &per_cpu(free_pgt_list, cpu));

    tasklet_schedule(&per_cpu(free_pgt_tasklet, cpu));
}

static int cf_check cpu_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    unsigned int cpu = (unsigned long)hcpu;
    struct page_list_head *list = &per_cpu(free_pgt_list, cpu);
    struct tasklet *tasklet = &per_cpu(free_pgt_tasklet, cpu);

    switch ( action )
    {
    case CPU_DOWN_PREPARE:
        tasklet_kill(tasklet);
        break;

    case CPU_DEAD:
        if ( !page_list_empty(list) )
        {
            page_list_splice(list, &this_cpu(free_pgt_list));
            INIT_PAGE_LIST_HEAD(list);
            tasklet_schedule(&this_cpu(free_pgt_tasklet));
        }
        break;

    case CPU_UP_PREPARE:
        INIT_PAGE_LIST_HEAD(list);
        fallthrough;
    case CPU_DOWN_FAILED:
        tasklet_init(tasklet, free_queued_pgtables, list);
        if ( !page_list_empty(list) )
            tasklet_schedule(tasklet);
        break;
    }

    return NOTIFY_DONE;
}

static struct notifier_block cpu_nfb = {
    .notifier_call = cpu_callback,
};

static int __init cf_check bsp_init(void)
{
    if ( iommu_enabled )
    {
        cpu_callback(&cpu_nfb, CPU_UP_PREPARE,
                     (void *)(unsigned long)smp_processor_id());
        register_cpu_notifier(&cpu_nfb);
    }

    return 0;
}
presmp_initcall(bsp_init);

bool arch_iommu_use_permitted(const struct domain *d)
{
    /*
     * Prevent device assignment if mem paging, mem sharing or log-dirty
     * have been enabled for this domain, or if PoD is still in active use.
     */
    return d == dom_io ||
           (likely(!mem_sharing_enabled(d)) &&
            likely(!mem_paging_enabled(d)) &&
            likely(!p2m_pod_active(d)) &&
            likely(!p2m_is_global_logdirty(d)));
}

static int __init cf_check adjust_irq_affinities(void)
{
    iommu_adjust_irq_affinities();

    return 0;
}
__initcall(adjust_irq_affinities);

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * indent-tabs-mode: nil
 * End:
 */