/*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/sched.h>
#include <xen/iocap.h>
#include <xen/iommu.h>
#include <xen/paging.h>
#include <xen/guest_access.h>
#include <xen/event.h>
#include <xen/softirq.h>
#include <xen/vm_event.h>
#include <xsm/xsm.h>
#include <asm/hvm/io.h>
#include <asm/io_apic.h>
#include <asm/mem_paging.h>
#include <asm/pt-contig-markers.h>
#include <asm/setup.h>
#include <public/physdev.h>
const struct iommu_init_ops *__initdata iommu_init_ops;
struct iommu_ops __ro_after_init iommu_ops;
bool __read_mostly iommu_non_coherent;
bool __initdata iommu_superpages = true;
enum iommu_intremap __read_mostly iommu_intremap = iommu_intremap_full;
#ifdef CONFIG_PV
/* Possible unfiltered LAPIC/MSI messages from untrusted sources? */
bool __read_mostly untrusted_msi;
#endif
#ifndef iommu_intpost
/*
* In the current implementation of VT-d posted interrupts, in some extreme
* cases the per-CPU list which tracks blocked vCPUs can grow very long,
* hurting interrupt latency. Hence leave this feature off by default
* until a good solution to resolve the issue is found.
*/
bool __read_mostly iommu_intpost;
#endif
void __init acpi_iommu_init(void)
{
int ret = -ENODEV;
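/* Nothing to do with both DMA and interrupt remapping disabled. */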
if ( !iommu_enable && !iommu_intremap )
return;
if ( !acpi_disabled )
{
ret = acpi_dmar_init();
#ifndef iommu_snoop
/*
* As long as there's no per-domain snoop control, and as long as on
* AMD we uniformly force coherent accesses, a possible command line
* override should affect VT-d only.
*/
if ( ret )
iommu_snoop = true;
#endif
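/* -ENODEV means no DMAR (VT-d) tables; try the AMD-Vi IVRS tables. */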
if ( ret == -ENODEV )
ret = acpi_ivrs_init();
}
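/* Neither vendor's ACPI tables were usable: disable the IOMMU entirely. */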
if ( ret )
{
iommu_enable = false;
iommu_intremap = iommu_intremap_off;
}
}
int __init iommu_hardware_setup(void)
{
struct IO_APIC_route_entry **ioapic_entries = NULL;
int rc;
if ( !iommu_init_ops )
return -ENODEV;
rc = scan_pci_devices();
if ( rc )
return rc;
if ( !iommu_ops.init )
iommu_ops = *iommu_init_ops->ops;
else
/* x2apic setup may have previously initialised the struct. */
ASSERT(iommu_ops.init == iommu_init_ops->ops->init);
if ( !x2apic_enabled && iommu_intremap )
{
/*
* If x2APIC is enabled, interrupt remapping is already enabled, so
* there's no need to mess with the IO-APIC because the remapping
* entries are already correctly setup by x2apic_bsp_setup.
*/
ioapic_entries = alloc_ioapic_entries();
if ( !ioapic_entries )
return -ENOMEM;
rc = save_IO_APIC_setup(ioapic_entries);
if ( rc )
{
free_ioapic_entries(ioapic_entries);
return rc;
}
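/*
* Mask the PIC and all IO-APIC pins while interrupt remapping is being
* enabled; the saved state is restored further down.
*/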
mask_8259A();
mask_IO_APIC_setup(ioapic_entries);
}
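/* Honour a request to avoid superpages by permitting 4k mappings only. */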
if ( !iommu_superpages )
iommu_ops.page_sizes &= PAGE_SIZE_4K;
rc = iommu_init_ops->setup();
ASSERT(iommu_superpages || iommu_ops.page_sizes == PAGE_SIZE_4K);
if ( ioapic_entries )
{
restore_IO_APIC_setup(ioapic_entries, rc);
unmask_8259A();
free_ioapic_entries(ioapic_entries);
}
return rc;
}
int iommu_enable_x2apic(void)
{
if ( system_state < SYS_STATE_active )
{
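/*
* Early in boot, x2APIC bring-up may run before iommu_hardware_setup(),
* in which case iommu_ops hasn't been populated yet: copy it from the
* init-time hooks here.
*/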
if ( !iommu_supports_x2apic() )
return -EOPNOTSUPP;
iommu_ops = *iommu_init_ops->ops;
}
else if ( !x2apic_enabled )
return -EOPNOTSUPP;
if ( !iommu_ops.enable_x2apic )
return -EOPNOTSUPP;
return iommu_call(&iommu_ops, enable_x2apic);
}
void iommu_update_ire_from_apic(
unsigned int apic, unsigned int reg, unsigned int value)
{
iommu_vcall(&iommu_ops, update_ire_from_apic, apic, reg, value);
}
unsigned int iommu_read_apic_from_ire(unsigned int apic, unsigned int reg)
{
return iommu_call(&iommu_ops, read_apic_from_ire, apic, reg);
}
int __init iommu_setup_hpet_msi(struct msi_desc *msi)
{
const struct iommu_ops *ops = iommu_get_ops();
return ops->setup_hpet_msi ? iommu_call(ops, setup_hpet_msi, msi) : -ENODEV;
}
void __hwdom_init arch_iommu_check_autotranslated_hwdom(struct domain *d)
{
if ( !is_iommu_enabled(d) )
panic("Presently, iommu must be enabled for PVH hardware domain\n");
if ( !iommu_hwdom_strict )
panic("PVH hardware domain iommu must be set in 'strict' mode\n");
}
int arch_iommu_domain_init(struct domain *d)
{
struct domain_iommu *hd = dom_iommu(d);
spin_lock_init(&hd->arch.mapping_lock);
INIT_PAGE_LIST_HEAD(&hd->arch.pgtables.list);
spin_lock_init(&hd->arch.pgtables.lock);
INIT_LIST_HEAD(&hd->arch.identity_maps);
return 0;
}
void arch_iommu_domain_destroy(struct domain *d)
{
/*
* There should be no page tables left allocated by the time the
* domain is destroyed. Note that arch_iommu_domain_destroy() is
* called unconditionally, so pgtables may be uninitialized.
*/
ASSERT(!dom_iommu(d)->platform_ops ||
page_list_empty(&dom_iommu(d)->arch.pgtables.list));
}
struct identity_map {
struct list_head list;
paddr_t base, end;
p2m_access_t access;
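/* Number of extant requests for this exact range. */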
unsigned int count;
};
int iommu_identity_mapping(struct domain *d, p2m_access_t p2ma,
paddr_t base, paddr_t end,
unsigned int flag)
{
unsigned long base_pfn = base >> PAGE_SHIFT_4K;
unsigned long end_pfn = PAGE_ALIGN_4K(end) >> PAGE_SHIFT_4K;
struct identity_map *map;
struct domain_iommu *hd = dom_iommu(d);
ASSERT(pcidevs_locked());
ASSERT(base < end);
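/*
* p2ma == p2m_access_x acts as a sentinel: it requests removal of a
* previously established identity mapping rather than insertion of a
* new one.
*/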
/*
* No need to acquire hd->arch.mapping_lock: Both insertion and removal
* get done while holding pcidevs_lock.
*/
list_for_each_entry( map, &hd->arch.identity_maps, list )
{
if ( map->base == base && map->end == end )
{
int ret = 0;
if ( p2ma != p2m_access_x )
{
if ( map->access != p2ma )
return -EADDRINUSE;
++map->count;
return 0;
}
if ( --map->count )
return 0;
while ( base_pfn < end_pfn )
{
if ( clear_identity_p2m_entry(d, base_pfn) )
ret = -ENXIO;
base_pfn++;
}
list_del(&map->list);
xfree(map);
return ret;
}
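/* Any partial overlap with an existing region is an error. */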
if ( end >= map->base && map->end >= base )
return -EADDRINUSE;
}
if ( p2ma == p2m_access_x )
return -ENOENT;
while ( base_pfn < end_pfn )
{
int err = set_identity_p2m_entry(d, base_pfn, p2ma, flag);
if ( err )
return err;
base_pfn++;
}
map = xmalloc(struct identity_map);
if ( !map )
return -ENOMEM;
map->base = base;
map->end = end;
map->access = p2ma;
map->count = 1;
list_add_tail(&map->list, &hd->arch.identity_maps);
return 0;
}
void iommu_identity_map_teardown(struct domain *d)
{
struct domain_iommu *hd = dom_iommu(d);
struct identity_map *map, *tmp;
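/*
* Only the bookkeeping is freed here; any remaining p2m entries are
* expected to be torn down together with the domain's p2m.
*/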
list_for_each_entry_safe ( map, tmp, &hd->arch.identity_maps, list )
{
list_del(&map->list);
xfree(map);
}
}
static unsigned int __hwdom_init hwdom_iommu_map(const struct domain *d,
unsigned long pfn,
unsigned long max_pfn)
{
mfn_t mfn = _mfn(pfn);
unsigned int i, type, perms = IOMMUF_readable | IOMMUF_writable;
/*
* Set up 1:1 mapping for dom0. Default to include only conventional RAM
* areas and let RMRRs include needed reserved regions. When set, the
* inclusive mapping additionally maps in every pfn up to 4GB except those
* that fall in unusable ranges for PV Dom0.
*/
if ( (pfn > max_pfn && !mfn_valid(mfn)) || xen_in_range(pfn) )
return 0;
switch ( type = page_get_ram_type(mfn) )
{
case RAM_TYPE_UNUSABLE:
return 0;
case RAM_TYPE_CONVENTIONAL:
if ( iommu_hwdom_strict )
return 0;
break;
default:
if ( type & RAM_TYPE_RESERVED )
{
if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved )
perms = 0;
}
else if ( is_hvm_domain(d) )
return 0;
else if ( !iommu_hwdom_inclusive || pfn > max_pfn )
perms = 0;
}
/*
* Check that it doesn't overlap with the Interrupt Address Range
* (0xfee00000 - 0xfeefffff).
*/
if ( pfn >= 0xfee00 && pfn <= 0xfeeff )
return 0;
/* ... or the IO-APIC */
if ( has_vioapic(d) )
{
for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ )
if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) )
return 0;
}
else if ( is_pv_domain(d) )
{
/*
* Be consistent with CPU mappings: Dom0 is permitted to establish r/o
* ones there (also for e.g. HPET in certain cases), so it should also
* have such mappings established in the IOMMU.
*/
if ( iomem_access_permitted(d, pfn, pfn) &&
rangeset_contains_singleton(mmio_ro_ranges, pfn) )
perms = IOMMUF_readable;
}
/*
* ... or the PCIe MCFG regions.
* TODO: runtime-added MMCFG regions are not checked to make sure they
* don't overlap with already mapped regions, thus preventing trapping.
*/
if ( has_vpci(d) && vpci_is_mmcfg_address(d, pfn_to_paddr(pfn)) )
return 0;
return perms;
}
void __hwdom_init arch_iommu_hwdom_init(struct domain *d)
{
unsigned long i, top, max_pfn, start, count;
unsigned int flush_flags = 0, start_perms = 0;
BUG_ON(!is_hardware_domain(d));
/* Reserved IOMMU mappings are enabled by default. */
if ( iommu_hwdom_reserved == -1 )
iommu_hwdom_reserved = 1;
if ( iommu_hwdom_inclusive )
{
printk(XENLOG_WARNING
"IOMMU inclusive mappings are deprecated and will be removed in future versions\n");
if ( !is_pv_domain(d) )
{
printk(XENLOG_WARNING
"IOMMU inclusive mappings are only supported on PV Dom0\n");
iommu_hwdom_inclusive = false;
}
}
if ( iommu_hwdom_passthrough )
return;
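/* Cover the low 4GB as well as all of host memory, whichever is larger. */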
max_pfn = (GB(4) >> PAGE_SHIFT) - 1;
top = max(max_pdx, pfn_to_pdx(max_pfn) + 1);
/*
* The first MB will get mapped in one go by pvh_populate_p2m(). Avoid
* setting up potentially conflicting mappings here.
*/
start = paging_mode_translate(d) ? PFN_DOWN(MB(1)) : 0;
for ( i = start, count = 0; i < top; )
{
unsigned long pfn = pdx_to_pfn(i);
unsigned int perms = hwdom_iommu_map(d, pfn, max_pfn);
if ( !perms )
/* nothing */;
else if ( paging_mode_translate(d) )
{
int rc;
rc = p2m_add_identity_entry(d, pfn,
perms & IOMMUF_writable ? p2m_access_rw
: p2m_access_r,
0);
if ( rc )
printk(XENLOG_WARNING
"%pd: identity mapping of %lx failed: %d\n",
d, pfn, rc);
}
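/*
* PV path: batch contiguous pfns with identical permissions into a
* single run, committing it (preemptibly) once the run breaks or the
* loop ends (via the goto below).
*/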
else if ( pfn != start + count || perms != start_perms )
{
long rc;
commit:
while ( (rc = iommu_map(d, _dfn(start), _mfn(start), count,
start_perms | IOMMUF_preempt,
&flush_flags)) > 0 )
{
start += rc;
count -= rc;
process_pending_softirqs();
}
if ( rc )
printk(XENLOG_WARNING
"%pd: IOMMU identity mapping of [%lx,%lx) failed: %ld\n",
d, start, start + count, rc);
start = pfn;
count = 1;
start_perms = perms;
}
else
++count;
if ( !(++i & 0xfffff) )
process_pending_softirqs();
if ( i == top && count )
goto commit;
}
/* Use an if() to consume the return value and avoid a compiler warning. */
if ( iommu_iotlb_flush_all(d, flush_flags) )
return;
}
void arch_pci_init_pdev(struct pci_dev *pdev)
{
pdev->arch.pseudo_domid = DOMID_INVALID;
}
unsigned long *__init iommu_init_domid(domid_t reserve)
{
unsigned long *map;
if ( !iommu_quarantine )
return ZERO_BLOCK_PTR;
BUILD_BUG_ON(DOMID_MASK * 2U >= UINT16_MAX);
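/*
* One bit per pseudo-domid; these IDs live above DOMID_MASK, outside the
* range used by real domains.
*/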
map = xzalloc_array(unsigned long, BITS_TO_LONGS(UINT16_MAX - DOMID_MASK));
if ( map && reserve != DOMID_INVALID )
{
ASSERT(reserve > DOMID_MASK);
__set_bit(reserve & DOMID_MASK, map);
}
return map;
}
domid_t iommu_alloc_domid(unsigned long *map)
{
/*
* A single allocation cursor is used uniformly across all IOMMUs, such
* that on typical systems the same ID wouldn't be re-used very quickly
* (perhaps never).
*/
static unsigned int start;
unsigned int idx = find_next_zero_bit(map, UINT16_MAX - DOMID_MASK, start);
ASSERT(pcidevs_locked());
if ( idx >= UINT16_MAX - DOMID_MASK )
idx = find_first_zero_bit(map, UINT16_MAX - DOMID_MASK);
if ( idx >= UINT16_MAX - DOMID_MASK )
return DOMID_INVALID;
__set_bit(idx, map);
start = idx + 1;
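/* Bias the index up into the pseudo-domid range. */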
return idx | (DOMID_MASK + 1);
}
void iommu_free_domid(domid_t domid, unsigned long *map)
{
ASSERT(pcidevs_locked());
if ( domid == DOMID_INVALID )
return;
ASSERT(domid > DOMID_MASK);
if ( !__test_and_clear_bit(domid & DOMID_MASK, map) )
BUG();
}
int iommu_free_pgtables(struct domain *d)
{
struct domain_iommu *hd = dom_iommu(d);
struct page_info *pg;
unsigned int done = 0;
if ( !is_iommu_enabled(d) )
return 0;
/* After this barrier, no new IOMMU mappings can be inserted. */
spin_barrier(&hd->arch.mapping_lock);
/*
* Pages will be moved to the free list below, so clear the root
* page table first to avoid any potential use-after-free.
*/
iommu_vcall(hd->platform_ops, clear_root_pgtable, d);
while ( (pg = page_list_remove_head(&hd->arch.pgtables.list)) )
{
free_domheap_page(pg);
if ( !(++done & 0xff) && general_preempt_check() )
return -ERESTART;
}
return 0;
}
struct page_info *iommu_alloc_pgtable(struct domain_iommu *hd,
uint64_t contig_mask)
{
unsigned int memflags = 0;
struct page_info *pg;
uint64_t *p;
#ifdef CONFIG_NUMA
if ( hd->node != NUMA_NO_NODE )
memflags = MEMF_node(hd->node);
#endif
pg = alloc_domheap_page(NULL, memflags);
if ( !pg )
return NULL;
p = __map_domain_page(pg);
if ( contig_mask )
{
/* See pt-contig-markers.h for a description of the marker scheme. */
unsigned int i, shift = find_first_set_bit(contig_mask);
ASSERT((CONTIG_LEVEL_SHIFT & (contig_mask >> shift)) == CONTIG_LEVEL_SHIFT);
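/*
* Seed the markers for an all-zero table: slot 0 holds the full
* CONTIG_LEVEL_SHIFT, while every other slot i holds ctz(i).
*/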
p[0] = (CONTIG_LEVEL_SHIFT + 0ull) << shift;
p[1] = 0;
p[2] = 1ull << shift;
p[3] = 0;
for ( i = 4; i < PAGE_SIZE / sizeof(*p); i += 4 )
{
p[i + 0] = (find_first_set_bit(i) + 0ull) << shift;
p[i + 1] = 0;
p[i + 2] = 1ull << shift;
p[i + 3] = 0;
}
}
else
clear_page(p);
iommu_sync_cache(p, PAGE_SIZE);
unmap_domain_page(p);
spin_lock(&hd->arch.pgtables.lock);
page_list_add(pg, &hd->arch.pgtables.list);
spin_unlock(&hd->arch.pgtables.lock);
return pg;
}
/*
* Intermediate page tables which get replaced by large pages may only be
* freed after a suitable IOTLB flush. Hence such pages get queued on a
* per-CPU list, with a per-CPU tasklet processing the list on the assumption
* that the necessary IOTLB flush will have occurred by the time tasklets get
* to run. (List and tasklet being per-CPU has the benefit of accesses not
* requiring any locking.)
*/
static DEFINE_PER_CPU(struct page_list_head, free_pgt_list);
static DEFINE_PER_CPU(struct tasklet, free_pgt_tasklet);
static void cf_check free_queued_pgtables(void *arg)
{
struct page_list_head *list = arg;
struct page_info *pg;
unsigned int done = 0;
ASSERT(list == &this_cpu(free_pgt_list));
while ( (pg = page_list_remove_head(list)) )
{
free_domheap_page(pg);
/*
* Just to be on the safe side, check for processing softirqs every
* once in a while. Generally it is expected that parties queuing
* pages for freeing will find a need for preemption before too many
* pages can be queued. Granularity of checking is somewhat arbitrary.
*/
if ( !(++done & 0x1ff) )
process_pending_softirqs();
}
}
void iommu_queue_free_pgtable(struct domain_iommu *hd, struct page_info *pg)
{
unsigned int cpu = smp_processor_id();
spin_lock(&hd->arch.pgtables.lock);
page_list_del(pg, &hd->arch.pgtables.list);
spin_unlock(&hd->arch.pgtables.lock);
page_list_add_tail(pg, &per_cpu(free_pgt_list, cpu));
tasklet_schedule(&per_cpu(free_pgt_tasklet, cpu));
}
static int cf_check cpu_callback(
struct notifier_block *nfb, unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
struct page_list_head *list = &per_cpu(free_pgt_list, cpu);
struct tasklet *tasklet = &per_cpu(free_pgt_tasklet, cpu);
switch ( action )
{
case CPU_DOWN_PREPARE:
tasklet_kill(tasklet);
break;
case CPU_DEAD:
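/* Take over any pages still queued on the now-dead CPU. */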
if ( !page_list_empty(list) )
{
page_list_splice(list, &this_cpu(free_pgt_list));
INIT_PAGE_LIST_HEAD(list);
tasklet_schedule(&this_cpu(free_pgt_tasklet));
}
break;
case CPU_UP_PREPARE:
INIT_PAGE_LIST_HEAD(list);
fallthrough;
case CPU_DOWN_FAILED:
tasklet_init(tasklet, free_queued_pgtables, list);
if ( !page_list_empty(list) )
tasklet_schedule(tasklet);
break;
}
return NOTIFY_DONE;
}
static struct notifier_block cpu_nfb = {
.notifier_call = cpu_callback,
};
static int __init cf_check bsp_init(void)
{
if ( iommu_enabled )
{
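/* Set up the BSP's list and tasklet; APs go through the notifier. */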
cpu_callback(&cpu_nfb, CPU_UP_PREPARE,
(void *)(unsigned long)smp_processor_id());
register_cpu_notifier(&cpu_nfb);
}
return 0;
}
presmp_initcall(bsp_init);
bool arch_iommu_use_permitted(const struct domain *d)
{
/*
* Prevent device assignment if mem paging, mem sharing or log-dirty
* have been enabled for this domain, or if PoD is still in active use.
*/
return d == dom_io ||
(likely(!mem_sharing_enabled(d)) &&
likely(!mem_paging_enabled(d)) &&
likely(!p2m_pod_active(d)) &&
likely(!p2m_is_global_logdirty(d)));
}
static int __init cf_check adjust_irq_affinities(void)
{
iommu_adjust_irq_affinities();
return 0;
}
__initcall(adjust_irq_affinities);
/*
* Local variables:
* mode: C
* c-file-style: "BSD"
* c-basic-offset: 4
* indent-tabs-mode: nil
* End:
*/