/*
* pci.c: HVM PCI setup.
*
* Leendert van Doorn, leendert@watson.ibm.com
* Copyright (c) 2005, International Business Machines Corporation.
*
* Copyright (c) 2006, Keir Fraser, XenSource Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include "util.h"
#include "hypercall.h"
#include "config.h"
#include "pci_regs.h"
#include <xen/memory.h>
#include <xen/hvm/ioreq.h>
#include <xen/hvm/hvm_xs_strings.h>
#include <xen/hvm/e820.h>
uint32_t pci_mem_start = HVM_BELOW_4G_MMIO_START;
const uint32_t pci_mem_end = RESERVED_MEMBASE;
uint64_t pci_hi_mem_start = 0, pci_hi_mem_end = 0;
enum virtual_vga virtual_vga = VGA_none;
unsigned long igd_opregion_pgbase = 0;
/* Check if the specified range conflicts with any reserved device memory. */
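/*
 * check_overlap() (a util.h helper) treats each range as the half-open
 * interval [start, start + size) and returns true iff the two intersect.
 */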
static bool check_overlap_all(uint64_t start, uint64_t size)
{
unsigned int i;
for ( i = 0; i < memory_map.nr_map; i++ )
{
if ( memory_map.map[i].type == E820_RESERVED &&
check_overlap(start, size,
memory_map.map[i].addr,
memory_map.map[i].size) )
return true;
}
return false;
}
/* Find the lowest RMRR ending above base but below 4G. */
static int find_next_rmrr(uint32_t base)
{
unsigned int i;
int next_rmrr = -1;
uint64_t end, min_end = GB(4);
for ( i = 0; i < memory_map.nr_map ; i++ )
{
end = memory_map.map[i].addr + memory_map.map[i].size;
if ( memory_map.map[i].type == E820_RESERVED &&
end > base && end <= min_end )
{
next_rmrr = i;
min_end = end;
}
}
return next_rmrr;
}
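/*
 * Illustrative example: with reserved entries ending at 0xa0001000 and
 * 0xc0002000, find_next_rmrr(0xb0000000) returns the index of the entry
 * ending at 0xc0002000 (the lowest end strictly above base); calling it
 * again with base = 0xc0002000 skips that entry, since only ends strictly
 * above base qualify.
 */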
void pci_setup(void)
{
uint8_t is_64bar, using_64bar, bar64_relocate = 0;
uint32_t devfn, bar_reg, cmd, bar_data, bar_data_upper;
uint64_t base, bar_sz, bar_sz_upper, mmio_total = 0;
uint32_t vga_devfn = 256;
uint16_t class, vendor_id, device_id;
unsigned int bar, pin, link, isa_irq;
uint8_t pci_devfn_decode_type[256] = {};
/* Resources assignable to PCI devices via BARs. */
struct resource {
uint64_t base, max;
} *resource, mem_resource, high_mem_resource, io_resource;
/* Create a list of device BARs in descending order of size. */
struct bars {
uint32_t is_64bar;
uint32_t devfn;
uint32_t bar_reg;
uint64_t bar_sz;
} *bars = (struct bars *)scratch_start;
unsigned int i, nr_bars = 0;
uint64_t mmio_hole_size = 0;
const char *s;
/*
* Do we allow hvmloader to relocate guest memory in order to
* increase the size of the lowmem MMIO hole? Defaulting to 1
* here means that non-libxl toolstacks (including xend and
* home-grown ones) using qemu-xen will still experience the
* memory relocation bug described below; but it also means that
* those using qemu-traditional will *not* experience any change;
* and it also means that there is a work-around for those using
* qemu-xen, namely switching to qemu-traditional.
*
* If we defaulted to 0, and failing to resize the hole caused any
* problems with qemu-traditional, then there is no work-around.
*
* Since xend can only use qemu-traditional, I think this is the
* option that will have the least impact.
*/
bool allow_memory_relocate = 1;
BUILD_BUG_ON((typeof(*pci_devfn_decode_type))PCI_COMMAND_IO !=
PCI_COMMAND_IO);
BUILD_BUG_ON((typeof(*pci_devfn_decode_type))PCI_COMMAND_MEMORY !=
PCI_COMMAND_MEMORY);
BUILD_BUG_ON((typeof(*pci_devfn_decode_type))PCI_COMMAND_MASTER !=
PCI_COMMAND_MASTER);
s = xenstore_read(HVM_XS_ALLOW_MEMORY_RELOCATE, NULL);
if ( s )
allow_memory_relocate = strtoll(s, NULL, 0);
printf("Relocating guest memory for lowmem MMIO space %s\n",
allow_memory_relocate?"enabled":"disabled");
s = xenstore_read("platform/mmio_hole_size", NULL);
if ( s )
mmio_hole_size = strtoll(s, NULL, 0);
/* Program PCI-ISA bridge with appropriate link routes. */
isa_irq = 0;
for ( link = 0; link < 4; link++ )
{
do { isa_irq = (isa_irq + 1) & 15;
} while ( !(PCI_ISA_IRQ_MASK & (1U << isa_irq)) );
pci_writeb(PCI_ISA_DEVFN, 0x60 + link, isa_irq);
printf("PCI-ISA link %u routed to IRQ%u\n", link, isa_irq);
}
/* Program ELCR to match PCI-wired IRQs. */
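/*
 * In the ELCR (ports 0x4d0/0x4d1, one bit per IRQ), a set bit makes the
 * IRQ level-triggered, as required for shared PCI interrupts, while a
 * clear bit leaves it edge-triggered for ISA devices.
 */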
outb(0x4d0, (uint8_t)(PCI_ISA_IRQ_MASK >> 0));
outb(0x4d1, (uint8_t)(PCI_ISA_IRQ_MASK >> 8));
/* Scan the PCI bus and map resources. */
for ( devfn = 0; devfn < 256; devfn++ )
{
class = pci_readw(devfn, PCI_CLASS_DEVICE);
vendor_id = pci_readw(devfn, PCI_VENDOR_ID);
device_id = pci_readw(devfn, PCI_DEVICE_ID);
if ( (vendor_id == 0xffff) && (device_id == 0xffff) )
continue;
ASSERT((devfn != PCI_ISA_DEVFN) ||
((vendor_id == 0x8086) && (device_id == 0x7000)));
switch ( class )
{
case 0x0300:
/* If emulated VGA is found, preserve it as primary VGA. */
if ( (vendor_id == 0x1234) && (device_id == 0x1111) )
{
vga_devfn = devfn;
virtual_vga = VGA_std;
}
else if ( (vendor_id == 0x1013) && (device_id == 0xb8) )
{
vga_devfn = devfn;
virtual_vga = VGA_cirrus;
}
else if ( virtual_vga == VGA_none )
{
vga_devfn = devfn;
virtual_vga = VGA_pt;
if ( vendor_id == 0x8086 )
{
igd_opregion_pgbase = mem_hole_alloc(IGD_OPREGION_PAGES);
/*
* Write the OpRegion offset to give the opregion address to the
* device model. The device model will trap and map the OpRegion
* at the given address.
*/
pci_writel(vga_devfn, PCI_INTEL_OPREGION,
igd_opregion_pgbase << PAGE_SHIFT);
}
}
break;
case 0x0680:
/* PIIX4 ACPI PM. Special device with special PCI config space. */
ASSERT((vendor_id == 0x8086) && (device_id == 0x7113));
pci_writew(devfn, 0x20, 0x0000); /* No smb bus IO enable */
pci_writew(devfn, 0xd2, 0x0000); /* No smb bus IO enable */
pci_writew(devfn, 0x22, 0x0000);
pci_writew(devfn, 0x3c, 0x0009); /* Hardcoded IRQ9 */
pci_writew(devfn, 0x3d, 0x0001);
pci_writel(devfn, 0x40, ACPI_PM1A_EVT_BLK_ADDRESS_V1 | 1);
pci_writeb(devfn, 0x80, 0x01); /* enable PM io space */
break;
case 0x0101:
if ( vendor_id == 0x8086 )
{
/* Intel ICHs since PIIX3: enable IDE legacy mode. */
pci_writew(devfn, 0x40, 0x8000); /* enable IDE0 */
pci_writew(devfn, 0x42, 0x8000); /* enable IDE1 */
}
break;
}
/*
* It is recommended that BAR programming be done whilst decode
* bits are cleared to avoid incorrect mappings being created.
* When a 64-bit memory BAR is programmed by writing the lower
* half first and then the upper half, the BAR transiently decodes
* at an address below 4G as soon as the lower half is written,
* replacing any RAM mapped at that address. That RAM is not
* restored after the upper half is written and the BAR is finally
* mapped at its intended high-memory address.
*/
cmd = pci_readw(devfn, PCI_COMMAND);
cmd &= ~(PCI_COMMAND_MEMORY | PCI_COMMAND_IO);
pci_writew(devfn, PCI_COMMAND, cmd);
/* Map the I/O memory and port resources. */
for ( bar = 0; bar < 7; bar++ )
{
bar_sz_upper = 0;
bar_reg = PCI_BASE_ADDRESS_0 + 4*bar;
if ( bar == 6 )
bar_reg = PCI_ROM_ADDRESS;
bar_data = pci_readl(devfn, bar_reg);
if ( bar_reg != PCI_ROM_ADDRESS )
{
is_64bar = !!((bar_data & (PCI_BASE_ADDRESS_SPACE |
PCI_BASE_ADDRESS_MEM_TYPE_MASK)) ==
(PCI_BASE_ADDRESS_SPACE_MEMORY |
PCI_BASE_ADDRESS_MEM_TYPE_64));
pci_writel(devfn, bar_reg, ~0);
}
else
{
is_64bar = 0;
pci_writel(devfn, bar_reg,
(bar_data | PCI_ROM_ADDRESS_MASK) &
~PCI_ROM_ADDRESS_ENABLE);
}
bar_sz = pci_readl(devfn, bar_reg);
pci_writel(devfn, bar_reg, bar_data);
if ( bar_reg != PCI_ROM_ADDRESS )
bar_sz &= (((bar_data & PCI_BASE_ADDRESS_SPACE) ==
PCI_BASE_ADDRESS_SPACE_MEMORY) ?
PCI_BASE_ADDRESS_MEM_MASK :
(PCI_BASE_ADDRESS_IO_MASK & 0xffff));
else
bar_sz &= PCI_ROM_ADDRESS_MASK;
if (is_64bar) {
bar_data_upper = pci_readl(devfn, bar_reg + 4);
pci_writel(devfn, bar_reg + 4, ~0);
bar_sz_upper = pci_readl(devfn, bar_reg + 4);
pci_writel(devfn, bar_reg + 4, bar_data_upper);
bar_sz = (bar_sz_upper << 32) | bar_sz;
}
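/*
 * The size probe above left bar_sz holding the BAR's mask, in which
 * all size-determining high bits are set; isolating the lowest set
 * bit below yields the BAR's power-of-two size.
 */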
bar_sz &= ~(bar_sz - 1);
if ( bar_sz == 0 )
continue;
for ( i = 0; i < nr_bars; i++ )
if ( bars[i].bar_sz < bar_sz )
break;
if ( i != nr_bars )
memmove(&bars[i+1], &bars[i], (nr_bars-i) * sizeof(*bars));
bars[i].is_64bar = is_64bar;
bars[i].devfn = devfn;
bars[i].bar_reg = bar_reg;
bars[i].bar_sz = bar_sz;
if ( ((bar_data & PCI_BASE_ADDRESS_SPACE) ==
PCI_BASE_ADDRESS_SPACE_MEMORY) ||
(bar_reg == PCI_ROM_ADDRESS) )
mmio_total += bar_sz;
nr_bars++;
/* The upper half is already calculated, skip it! */
if (is_64bar)
bar++;
}
/* Map the interrupt. */
pin = pci_readb(devfn, PCI_INTERRUPT_PIN);
if ( pin != 0 )
{
/* This is the barber's pole mapping used by Xen. */
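/*
 * Worked example: the device in slot 3 asserting INTA# (pin = 1) uses
 * link = ((1 - 1) + 3) & 3 = 3, i.e. the fourth PCI-ISA link.
 */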
link = ((pin - 1) + (devfn >> 3)) & 3;
isa_irq = pci_readb(PCI_ISA_DEVFN, 0x60 + link);
pci_writeb(devfn, PCI_INTERRUPT_LINE, isa_irq);
printf("pci dev %02x:%x INT%c->IRQ%u\n",
devfn>>3, devfn&7, 'A'+pin-1, isa_irq);
}
/* Enable bus master for this function later */
pci_devfn_decode_type[devfn] = PCI_COMMAND_MASTER;
}
if ( mmio_hole_size )
{
uint64_t max_ram_below_4g = GB(4) - mmio_hole_size;
if ( max_ram_below_4g > HVM_BELOW_4G_MMIO_START )
{
printf("max_ram_below_4g=0x"PRIllx
" too big for mmio_hole_size=0x"PRIllx
" has been ignored.\n",
PRIllx_arg(max_ram_below_4g),
PRIllx_arg(mmio_hole_size));
}
else
{
pci_mem_start = max_ram_below_4g;
printf("pci_mem_start=0x%x (was 0x%x) for mmio_hole_size=0x%lx\n",
pci_mem_start, HVM_BELOW_4G_MMIO_START,
(long)mmio_hole_size);
}
}
else
{
/*
* At the moment qemu-xen can't deal with relocated memory regions.
* It's too close to the release to make a proper fix; for now,
* only allow the MMIO hole to grow large enough to move guest memory
* if we're running qemu-traditional. Items that don't fit will be
* relocated into the 64-bit address space.
*
* This loop now does the following:
* - If allow_memory_relocate, increase the MMIO hole until it's
* big enough, or until it's 2GiB
* - If !allow_memory_relocate, increase the MMIO hole until it's
* big enough, or until it's 2GiB, or until it overlaps guest
* memory
*/
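/*
 * Note on the shift arithmetic below: pci_mem_start is a uint32_t, so
 * "<<= 1" discards the carry bit and doubles the hole:
 * 0xf0000000 -> 0xe0000000 -> 0xc0000000 -> 0x80000000, after which
 * (pci_mem_start << 1) == 0 terminates the loop.
 */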
while ( (mmio_total > (pci_mem_end - pci_mem_start))
&& ((pci_mem_start << 1) != 0)
&& (allow_memory_relocate
|| (((pci_mem_start << 1) >> PAGE_SHIFT)
>= hvm_info->low_mem_pgend)) )
pci_mem_start <<= 1;
/*
* Try to accommodate RMRRs in our MMIO region on a best-effort basis.
* If we have RMRRs in the range, then make pci_mem_start just after
* hvm_info->low_mem_pgend.
*/
if ( pci_mem_start > (hvm_info->low_mem_pgend << PAGE_SHIFT) &&
check_overlap_all(pci_mem_start, pci_mem_end-pci_mem_start) )
pci_mem_start = hvm_info->low_mem_pgend << PAGE_SHIFT;
}
if ( mmio_total > (pci_mem_end - pci_mem_start) )
{
printf("Low MMIO hole not large enough for all devices,"
" relocating some BARs to 64-bit\n");
bar64_relocate = 1;
}
/* Relocate RAM that overlaps PCI space (in 64k-page chunks). */
while ( (pci_mem_start >> PAGE_SHIFT) < hvm_info->low_mem_pgend )
{
struct xen_add_to_physmap xatp;
unsigned int nr_pages = min_t(
unsigned int,
hvm_info->low_mem_pgend - (pci_mem_start >> PAGE_SHIFT),
(1u << 16) - 1);
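/*
 * If nothing has been relocated above 4GiB yet, start the high-memory
 * area exactly at the 4GiB boundary; 1ull << (32 - PAGE_SHIFT) is the
 * page frame number of the 4GiB address.
 */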
if ( hvm_info->high_mem_pgend == 0 )
hvm_info->high_mem_pgend = 1ull << (32 - PAGE_SHIFT);
hvm_info->low_mem_pgend -= nr_pages;
printf("Relocating 0x%x pages from "PRIllx" to "PRIllx\
" for lowmem MMIO hole\n",
nr_pages,
PRIllx_arg(((uint64_t)hvm_info->low_mem_pgend)<high_mem_pgend)<low_mem_pgend;
xatp.gpfn = hvm_info->high_mem_pgend;
xatp.size = nr_pages;
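/*
 * XENMAPSPACE_gmfn_range remaps a contiguous GFN range: the nr_pages
 * frames currently at idx (the new low_mem_pgend) are moved to appear
 * at gpfn (the current high_mem_pgend), i.e. above 4GiB.
 */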
if ( hypercall_memory_op(XENMEM_add_to_physmap, &xatp) != 0 )
BUG();
hvm_info->high_mem_pgend += nr_pages;
}
/* Sync memory map[] if necessary. */
adjust_memory_map();
high_mem_resource.base = ((uint64_t)hvm_info->high_mem_pgend) << PAGE_SHIFT;
if ( high_mem_resource.base < GB(4) )
{
if ( hvm_info->high_mem_pgend != 0 )
printf("WARNING: hvm_info->high_mem_pgend %x"
" does not point into high memory!",
hvm_info->high_mem_pgend);
high_mem_resource.base = GB(4);
}
printf("%sRAM in high memory; setting high_mem resource base to "PRIllx"\n",
hvm_info->high_mem_pgend?"":"No ",
PRIllx_arg(high_mem_resource.base));
high_mem_resource.max = 1ull << cpu_phys_addr();
mem_resource.base = pci_mem_start;
mem_resource.max = pci_mem_end;
io_resource.base = 0xc000;
io_resource.max = 0x10000;
/* Assign iomem and ioport resources in descending order of size. */
for ( i = 0; i < nr_bars; i++ )
{
devfn = bars[i].devfn;
bar_reg = bars[i].bar_reg;
bar_sz = bars[i].bar_sz;
/*
* Relocate to high memory if the total amount of MMIO needed
* is more than the low MMIO available. Because devices are
* processed in order of bar_sz, this will preferentially
* relocate larger devices to high memory first.
*
* NB: The code here is rather fragile, as the check here to see
* whether bar_sz will fit in the low MMIO region doesn't match the
* real check made below, which involves aligning the base offset of the
* bar with the size of the bar itself. As it happens, this will always
* be satisfied because:
* - The first one will succeed because the MMIO hole can only start at
* 0x{f,e,c,8}0000000. If it fits, it will be aligned properly.
* - All subsequent ones will be aligned because the list is ordered
* large to small, and bar_sz is always a power of 2. (At least
* the code here assumes it to be.)
* Should either of those two conditions change, this code will break.
*/
using_64bar = bars[i].is_64bar && bar64_relocate
&& (mmio_total > (mem_resource.max - mem_resource.base));
bar_data = pci_readl(devfn, bar_reg);
if ( (bar_data & PCI_BASE_ADDRESS_SPACE) ==
PCI_BASE_ADDRESS_SPACE_MEMORY )
{
/* Map into high memory if this is a relocated 64-bit BAR. */
if ( using_64bar ) {
if ( high_mem_resource.base & (bar_sz - 1) )
high_mem_resource.base = high_mem_resource.base -
(high_mem_resource.base & (bar_sz - 1)) + bar_sz;
if ( !pci_hi_mem_start )
pci_hi_mem_start = high_mem_resource.base;
resource = &high_mem_resource;
bar_data &= ~PCI_BASE_ADDRESS_MEM_MASK;
}
else {
resource = &mem_resource;
bar_data &= ~PCI_BASE_ADDRESS_MEM_MASK;
}
mmio_total -= bar_sz;
}
else
{
resource = &io_resource;
bar_data &= ~PCI_BASE_ADDRESS_IO_MASK;
}
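/*
 * bar_sz is a power of two, so this rounds resource->base up to the
 * next bar_sz-aligned address: PCI BARs are naturally aligned to
 * their size.
 */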
base = (resource->base + bar_sz - 1) & ~(uint64_t)(bar_sz - 1);
/* If we're using mem_resource, check for RMRR conflicts. */
if ( resource == &mem_resource )
{
int next_rmrr = find_next_rmrr(base);
while ( next_rmrr >= 0 &&
check_overlap(base, bar_sz,
memory_map.map[next_rmrr].addr,
memory_map.map[next_rmrr].size) )
{
base = memory_map.map[next_rmrr].addr +
memory_map.map[next_rmrr].size;
base = (base + bar_sz - 1) & ~(bar_sz - 1);
next_rmrr = find_next_rmrr(base);
}
}
bar_data |= (uint32_t)base;
bar_data_upper = (uint32_t)(base >> 32);
base += bar_sz;
if ( (base < resource->base) || (base > resource->max) )
{
printf("pci dev %02x:%x bar %02x size "PRIllx": no space for "
"resource!\n", devfn>>3, devfn&7, bar_reg,
PRIllx_arg(bar_sz));
continue;
}
resource->base = base;
pci_writel(devfn, bar_reg, bar_data);
if (using_64bar)
pci_writel(devfn, bar_reg + 4, bar_data_upper);
printf("pci dev %02x:%x bar %02x size "PRIllx": %x%08x\n",
devfn>>3, devfn&7, bar_reg,
PRIllx_arg(bar_sz),
bar_data_upper, bar_data);
if ( (bar_reg == PCI_ROM_ADDRESS) ||
((bar_data & PCI_BASE_ADDRESS_SPACE) ==
PCI_BASE_ADDRESS_SPACE_MEMORY) )
pci_devfn_decode_type[devfn] |= PCI_COMMAND_MEMORY;
else
pci_devfn_decode_type[devfn] |= PCI_COMMAND_IO;
}
if ( pci_hi_mem_start )
{
/*
* Align the end address to the same boundary as the start address,
* so that fewer variable range MTRRs are needed to cover the range.
*/
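/*
 * Worked example: with pci_hi_mem_start = 0x440000000 (lowest set bit,
 * i.e. alignment, 0x40000000) and high_mem_resource.base = 0x448000000,
 * pci_hi_mem_end = ((0x448000000 - 1) | (0x40000000 - 1)) + 1
 *                = 0x480000000, the next 1GiB boundary.
 */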
pci_hi_mem_end = ((high_mem_resource.base - 1) |
((pci_hi_mem_start & -pci_hi_mem_start) - 1)) + 1;
}
if ( vga_devfn != 256 )
{
/*
* VGA registers live in I/O space so ensure that primary VGA
* has IO enabled, even if there is no I/O BAR on that
* particular device.
*/
pci_devfn_decode_type[vga_devfn] |= PCI_COMMAND_IO;
}
/* Enable bus master, memory and I/O decode for all valid functions. */
for ( devfn = 0; devfn < 256; devfn++ )
if ( pci_devfn_decode_type[devfn] )
{
cmd = pci_readw(devfn, PCI_COMMAND);
cmd |= pci_devfn_decode_type[devfn];
pci_writew(devfn, PCI_COMMAND, cmd);
}
}
/*
* Local variables:
* mode: C
* c-file-style: "BSD"
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil
* End:
*/