path: root/nova/compute/pci_placement_translator.py
Diffstat (limited to 'nova/compute/pci_placement_translator.py')
-rw-r--r--  nova/compute/pci_placement_translator.py  623
1 files changed, 623 insertions, 0 deletions
diff --git a/nova/compute/pci_placement_translator.py b/nova/compute/pci_placement_translator.py
new file mode 100644
index 0000000000..016efd9122
--- /dev/null
+++ b/nova/compute/pci_placement_translator.py
@@ -0,0 +1,623 @@
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import collections
+import copy
+import typing as ty
+
+import os_resource_classes
+import os_traits
+from oslo_log import log as logging
+from oslo_utils import uuidutils
+
+from nova.compute import provider_tree
+import nova.conf
+from nova import exception
+from nova.i18n import _
+from nova.objects import fields
+from nova.objects import pci_device
+from nova.pci import devspec
+from nova.pci import manager as pci_manager
+
+
+CONF = nova.conf.CONF
+LOG = logging.getLogger(__name__)
+
+
+# Devs with these types are in one-to-one mapping with an RP in placement
+PARENT_TYPES = (
+ fields.PciDeviceType.STANDARD, fields.PciDeviceType.SRIOV_PF)
+# Devs with these types need to have a parent and that parent is the one
+# that is mapped to a placement RP
+CHILD_TYPES = (
+ fields.PciDeviceType.SRIOV_VF, fields.PciDeviceType.VDPA)
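+# For illustration (an assumed example, not taken from the code below): a
+# physical function at 0000:81:00.0 with dev_type SRIOV_PF gets its own RP,
+# while its SRIOV_VF children do not get RPs of their own; they are counted
+# as inventory on the parent PF's RP instead.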
+
+
+def _is_placement_tracking_enabled() -> bool:
+ return CONF.pci.report_in_placement
+
+
+def _normalize_traits(traits: ty.List[str]) -> ty.List[str]:
+ """Make the trait names acceptable for placement.
+
+ It keeps the already valid standard and custom traits and normalizes the
+ remaining names into valid custom traits.
+ """
+ standard_traits, rest = os_traits.check_traits(traits)
+ custom_traits = []
+ for name in rest:
+ name = name.upper()
+ if os_traits.is_custom(name):
+ custom_traits.append(name)
+ else:
+ custom_traits.append(os_traits.normalize_name(name))
+
+ return list(standard_traits) + custom_traits
+
+
+def get_traits(traits_str: str) -> ty.Set[str]:
+ """Return a normalized set of placement standard and custom traits from
+ a string of comma separated trait names.
+ """
+ # traits is a comma separated list of placement trait names
+ if not traits_str:
+ return set()
+ return set(_normalize_traits(traits_str.split(',')))
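+
+# A minimal illustration of get_traits() above (hypothetical trait names):
+# get_traits("large,CUSTOM_FAST,COMPUTE_MANAGED_PCI_DEVICE") returns
+# {"CUSTOM_LARGE", "CUSTOM_FAST", "COMPUTE_MANAGED_PCI_DEVICE"}. The unknown
+# name "large" is upper-cased and prefixed to become a custom trait, while
+# the already valid custom and standard traits are kept as they are.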
+
+
+def _get_traits_for_dev(
+ dev_spec_tags: ty.Dict[str, str],
+) -> ty.Set[str]:
+ return get_traits(dev_spec_tags.get("traits", "")) | {
+ os_traits.COMPUTE_MANAGED_PCI_DEVICE
+ }
+
+
+def _normalize_resource_class(rc: str) -> str:
+ rc = rc.upper()
+ if (
+ rc not in os_resource_classes.STANDARDS and
+ not os_resource_classes.is_custom(rc)
+ ):
+ rc = os_resource_classes.normalize_name(rc)
+ # mypy: normalize_name will return non None for non None input
+ assert rc
+
+ return rc
+
+
+def get_resource_class(
+ requested_name: ty.Optional[str], vendor_id: str, product_id: str
+) -> str:
+ """Return the normalized resource class name based on what is requested
+ or if nothing is requested then generated from the vendor_id and product_id
+ """
+ if requested_name:
+ rc = _normalize_resource_class(requested_name)
+ else:
+ rc = f"CUSTOM_PCI_{vendor_id}_{product_id}".upper()
+ return rc
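+
+# A minimal illustration of get_resource_class() above (hypothetical IDs):
+# get_resource_class(None, "8086", "1563") returns "CUSTOM_PCI_8086_1563",
+# while get_resource_class("my_rc", "8086", "1563") is normalized to
+# "CUSTOM_MY_RC".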
+
+
+def _get_rc_for_dev(
+ dev: pci_device.PciDevice,
+ dev_spec_tags: ty.Dict[str, str],
+) -> str:
+ """Return the resource class to represent the device.
+
+ It is either provided by the user in the configuration as the
+ resource_class tag, or we generate one from the vendor_id and product_id.
+
+ The user specified resource class is normalized if it is not already an
+ acceptable standard or custom resource class.
+ """
+ rc = dev_spec_tags.get("resource_class")
+ return get_resource_class(rc, dev.vendor_id, dev.product_id)
+
+
+class PciResourceProvider:
+ """A PCI Resource Provider"""
+
+ def __init__(self, name: str) -> None:
+ self.name = name
+ self.parent_dev: ty.Optional[pci_device.PciDevice] = None
+ self.children_devs: ty.List[pci_device.PciDevice] = []
+ self.resource_class: ty.Optional[str] = None
+ self.traits: ty.Optional[ty.Set[str]] = None
+
+ @property
+ def devs(self) -> ty.List[pci_device.PciDevice]:
+ return [self.parent_dev] if self.parent_dev else self.children_devs
+
+ @property
+ def to_be_deleted(self) -> bool:
+ return not bool(self.devs)
+
+ def add_child(self, dev, dev_spec_tags: ty.Dict[str, str]) -> None:
+ if self.parent_dev:
+ raise exception.PlacementPciDependentDeviceException(
+ parent_dev=dev.address,
+ children_devs=",".join(dev.address for dev in self.devs)
+ )
+
+ rc = _get_rc_for_dev(dev, dev_spec_tags)
+ if self.resource_class and rc != self.resource_class:
+ raise exception.PlacementPciMixedResourceClassException(
+ new_rc=rc,
+ new_dev=dev.address,
+ current_rc=self.resource_class,
+ current_devs=",".join(
+ dev.address for dev in self.children_devs)
+ )
+
+ traits = _get_traits_for_dev(dev_spec_tags)
+ if self.traits is not None and self.traits != traits:
+ raise exception.PlacementPciMixedTraitsException(
+ new_traits=",".join(sorted(traits)),
+ new_dev=dev.address,
+ current_traits=",".join(sorted(self.traits)),
+ current_devs=",".join(
+ dev.address for dev in self.children_devs),
+ )
+
+ self.children_devs.append(dev)
+ self.resource_class = rc
+ self.traits = traits
+
+ def add_parent(self, dev, dev_spec_tags: ty.Dict[str, str]) -> None:
+ if self.parent_dev or self.children_devs:
+ raise exception.PlacementPciDependentDeviceException(
+ parent_dev=dev.address,
+ children_devs=",".join(dev.address for dev in self.devs)
+ )
+
+ self.parent_dev = dev
+ self.resource_class = _get_rc_for_dev(dev, dev_spec_tags)
+ self.traits = _get_traits_for_dev(dev_spec_tags)
+
+ def remove_child(self, dev: pci_device.PciDevice) -> None:
+ # Nothing to do here. The update_provider_tree will handle the
+ # inventory decrease or the full RP removal
+ pass
+
+ def remove_parent(self, dev: pci_device.PciDevice) -> None:
+ # Nothing to do here. The update_provider_tree will handle the full RP
+ # removal.
+ pass
+
+ def _get_allocations(self) -> ty.Mapping[str, int]:
+ """Return a dict of used resources keyed by consumer UUID.
+
+ Note that:
+ 1) a single consumer can consume more than one resource from a single
+ RP. I.e. A VM with two VFs from the same parent PF
+ 2) multiple consumers can consume resources from a single RP. I.e. two
+ VMs consuming one VF from the same PF each
+ 3) regardless of how many consumers we have on a single PCI RP, they
+ are always consuming resources from the same resource class as
+ we are not supporting dependent devices modelled by the same RP but
+ different resource classes.
+ """
+ return collections.Counter(
+ [
+ dev.instance_uuid
+ for dev in self.devs
+ if "instance_uuid" in dev and dev.instance_uuid
+ ]
+ )
+
+ def update_provider_tree(
+ self,
+ provider_tree: provider_tree.ProviderTree,
+ parent_rp_name: str,
+ ) -> None:
+
+ if self.to_be_deleted:
+ # This means we need to delete the RP from placement if it exists
+ if provider_tree.exists(self.name):
+ # NOTE(gibi): If there are allocations on this RP then
+ # Placement will reject the update when the provider_tree is
+ # synced up.
+ provider_tree.remove(self.name)
+
+ return
+
+ if not provider_tree.exists(self.name):
+ # NOTE(gibi): We need to generate a UUID for the new provider in Nova
+ # instead of letting Placement assign one. We are potentially
+ # healing a missing RP along with missing allocations on that RP.
+ # The allocation healing happens with POST /reshape, and that API
+ # only takes RP UUIDs.
+ provider_tree.new_child(
+ self.name,
+ parent_rp_name,
+ uuid=uuidutils.generate_uuid(dashed=True)
+ )
+
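+ # As an assumed example: a PF represented by this RP having 8 VFs with
+ # resource_class CUSTOM_MY_VF results in an inventory of
+ # {"CUSTOM_MY_VF": {"total": 8, "max_unit": 8}} on this RP.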
+ provider_tree.update_inventory(
+ self.name,
+ # NOTE(gibi): The rest of the inventory fields (reserved,
+ # allocation_ratio, etc.) are defaulted by placement and the
+ # default values make sense for PCI devices, i.e. no overallocation
+ # and PCI devices are allocated one by one.
+ # Also, this way if the operator sets a reserved value in placement
+ # for the PCI inventories directly then nova will not override that
+ # value periodically.
+ {
+ self.resource_class: {
+ "total": len(self.devs),
+ "max_unit": len(self.devs),
+ }
+ },
+ )
+ provider_tree.update_traits(self.name, self.traits)
+
+ # Here we are sure the RP exists in the provider_tree. So, we can
+ # record the RP UUID in each PciDevice this RP represents
+ rp_uuid = provider_tree.data(self.name).uuid
+ for dev in self.devs:
+ dev.extra_info['rp_uuid'] = rp_uuid
+
+ def update_allocations(
+ self,
+ allocations: dict,
+ provider_tree: provider_tree.ProviderTree,
+ same_host_instances: ty.List[str],
+ ) -> bool:
+ updated = False
+
+ if self.to_be_deleted:
+ # the RP is going away because the device was either removed from the
+ # hypervisor or the compute's config was changed to ignore the device.
+ return updated
+
+ # we assume here that if this RP has been created in the current round
+ # of healing then it already has a UUID assigned.
+ rp_uuid = provider_tree.data(self.name).uuid
+
+ for consumer, amount in self._get_allocations().items():
+ if consumer not in allocations:
+ # We have PCI device(s) allocated to an instance, but we don't
+ # see any instance allocation in placement. This
+ # happens for two reasons:
+ # 1) The instance is being migrated and therefore the
+ # allocation is held by the migration UUID in placement. In
+ # this case the PciDevice is still allocated to the instance
+ # UUID in the nova DB hence our lookup for the instance
+ # allocation here. We can ignore this case as: i) We healed
+ # the PCI allocation for the instance before the migration
+ # was started. ii) Nova simply moves the allocation from the
+ # instance UUID to the migration UUID in placement. So we
+ # assume the migration allocation is correct without
+ # healing. One limitation of this is that if there is an
+ # in-progress migration when nova is upgraded, then the PCI
+ # allocation of that migration will be missing from
+ # placement on the source host. But it is temporary and the
+ # allocation will be fixed as soon as the migration is
+ # completed or reverted.
+ # 2) We have a bug in the scheduler or placement and the whole
+ # instance allocation is lost. We cannot handle that here.
+ # It is expected to be healed via nova-manage placement
+ # heal_allocation CLI instead.
+ continue
+
+ if consumer in same_host_instances:
+ # This is a nasty special case. This instance is undergoing
+ # a same host resize. So in Placement the source host
+ # allocation is held by the migration UUID *but* the
+ # PciDevice.instance_uuid is set for the instance UUID both
+ # on the source and on the destination host. As the source and
+ # dest are the same for migration we will see PciDevice
+ # objects assigned to this instance that should not be
+ # allocated to the instance UUID in placement.
+ # As noted above we don't want to handle in-progress
+ # migrations during healing. So we simply ignore this instance.
+ # If the instance needs healing then it will be healed after
+ # the migration is confirmed or reverted.
+ continue
+
+ current_allocs = allocations[consumer]['allocations']
+ current_rp_allocs = current_allocs.get(rp_uuid)
+
+ if current_rp_allocs:
+ # update an existing allocation if the current one differs
+ current_rc_allocs = current_rp_allocs["resources"].get(
+ self.resource_class, 0)
+ if current_rc_allocs != amount:
+ current_rp_allocs[
+ "resources"][self.resource_class] = amount
+ updated = True
+ else:
+ # insert a new allocation as it is missing
+ current_allocs[rp_uuid] = {
+ "resources": {self.resource_class: amount}
+ }
+ updated = True
+
+ return updated
+
+ def __str__(self) -> str:
+ if self.devs:
+ return (
+ f"RP({self.name}, {self.resource_class}={len(self.devs)}, "
+ f"traits={','.join(sorted(self.traits or set()))})"
+ )
+ else:
+ return f"RP({self.name}, <EMPTY>)"
+
+
+class PlacementView:
+ """The PCI Placement view"""
+
+ def __init__(
+ self,
+ hypervisor_hostname: str,
+ instances_under_same_host_resize: ty.List[str],
+ ) -> None:
+ self.rps: ty.Dict[str, PciResourceProvider] = {}
+ self.root_rp_name = hypervisor_hostname
+ self.same_host_instances = instances_under_same_host_resize
+
+ def _get_rp_name_for_address(self, addr: str) -> str:
+ return f"{self.root_rp_name}_{addr.upper()}"
+
+ def _ensure_rp(self, rp_name: str) -> PciResourceProvider:
+ return self.rps.setdefault(rp_name, PciResourceProvider(rp_name))
+
+ def _get_rp_name_for_child(self, dev: pci_device.PciDevice) -> str:
+ if not dev.parent_addr:
+ msg = _(
+ "Missing parent address for PCI device s(dev)% with "
+ "type s(type)s"
+ ) % {
+ "dev": dev.address,
+ "type": dev.dev_type,
+ }
+ raise exception.PlacementPciException(error=msg)
+
+ return self._get_rp_name_for_address(dev.parent_addr)
+
+ def _add_child(
+ self, dev: pci_device.PciDevice, dev_spec_tags: ty.Dict[str, str]
+ ) -> None:
+ rp_name = self._get_rp_name_for_child(dev)
+ self._ensure_rp(rp_name).add_child(dev, dev_spec_tags)
+
+ def _add_parent(
+ self, dev: pci_device.PciDevice, dev_spec_tags: ty.Dict[str, str]
+ ) -> None:
+ rp_name = self._get_rp_name_for_address(dev.address)
+ self._ensure_rp(rp_name).add_parent(dev, dev_spec_tags)
+
+ def _add_dev(
+ self, dev: pci_device.PciDevice, dev_spec_tags: ty.Dict[str, str]
+ ) -> None:
+ if dev_spec_tags.get("physical_network"):
+ # NOTE(gibi): We ignore devices that have a physnet configured as
+ # those are there for Neutron based SRIOV and that is out of scope
+ # for now. Later these devices will be tracked as PCI_NETDEV
+ # devices in placement.
+ return
+
+ if dev.dev_type in PARENT_TYPES:
+ self._add_parent(dev, dev_spec_tags)
+ elif dev.dev_type in CHILD_TYPES:
+ self._add_child(dev, dev_spec_tags)
+ else:
+ msg = _(
+ "Unhandled PCI device type %(type)s for %(dev)s. Please "
+ "report a bug."
+ ) % {
+ "type": dev.dev_type,
+ "dev": dev.address,
+ }
+ raise exception.PlacementPciException(error=msg)
+
+ def _remove_child(self, dev: pci_device.PciDevice) -> None:
+ rp_name = self._get_rp_name_for_child(dev)
+ self._ensure_rp(rp_name).remove_child(dev)
+
+ def _remove_parent(self, dev: pci_device.PciDevice) -> None:
+ rp_name = self._get_rp_name_for_address(dev.address)
+ self._ensure_rp(rp_name).remove_parent(dev)
+
+ def _remove_dev(self, dev: pci_device.PciDevice) -> None:
+ """Remove PCI devices from Placement that existed before but now
+ deleted from the hypervisor or unlisted from [pci]device_spec
+ """
+ if dev.dev_type in PARENT_TYPES:
+ self._remove_parent(dev)
+ elif dev.dev_type in CHILD_TYPES:
+ self._remove_child(dev)
+
+ def process_dev(
+ self,
+ dev: pci_device.PciDevice,
+ dev_spec: ty.Optional[devspec.PciDeviceSpec],
+ ) -> None:
+
+ if dev.status in (
+ fields.PciDeviceStatus.DELETED,
+ fields.PciDeviceStatus.REMOVED,
+ ):
+ # If the PCI tracker marked the device DELETED or REMOVED then
+ # the device is not allocated, so we are free to drop it from
+ # placement too.
+ self._remove_dev(dev)
+ else:
+ if not dev_spec:
+ if dev.instance_uuid:
+ LOG.warning(
+ "Device spec is not found for device %s in "
+ "[pci]device_spec. We are skipping this devices "
+ "during Placement update. The device is allocated by "
+ "%s. You should not remove an allocated device from "
+ "the configuration. Please restore the configuration "
+ "or cold migrate the instance to resolve the "
+ "inconsistency.",
+ dev.address,
+ dev.instance_uuid
+ )
+ else:
+ LOG.warning(
+ "Device spec is not found for device %s in "
+ "[pci]device_spec. Ignoring device in Placement "
+ "resource view. This should not happen. Please file a "
+ "bug.",
+ dev.address
+ )
+
+ return
+
+ self._add_dev(dev, dev_spec.get_tags())
+
+ def __str__(self) -> str:
+ return (
+ f"Placement PCI view on {self.root_rp_name}: "
+ f"{', '.join(str(rp) for rp in self.rps.values())}"
+ )
+
+ def update_provider_tree(
+ self, provider_tree: provider_tree.ProviderTree
+ ) -> None:
+ for rp in self.rps.values():
+ rp.update_provider_tree(provider_tree, self.root_rp_name)
+
+ def update_allocations(
+ self,
+ allocations: dict,
+ provider_tree: provider_tree.ProviderTree
+ ) -> bool:
+ """Updates the passed in allocations dict inplace with any PCI
+ allocations that is inferred from the PciDevice objects already added
+ to the view. It returns True if the allocations dict has been changed,
+ False otherwise.
+ """
+ updated = False
+ for rp in self.rps.values():
+ updated |= rp.update_allocations(
+ allocations,
+ provider_tree,
+ self.same_host_instances,
+ )
+ return updated
+
+
+def ensure_no_dev_spec_with_devname(dev_specs: ty.List[devspec.PciDeviceSpec]):
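+ # For illustration (assumed config snippets): a device_spec entry like
+ # {"devname": "eth0", "physical_network": "physnet0"} is rejected below,
+ # while an address based entry like {"address": "0000:81:00.0"} is
+ # accepted.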
+ for dev_spec in dev_specs:
+ if dev_spec.dev_spec_conf.get("devname"):
+ msg = _(
+ "Invalid [pci]device_spec configuration. PCI Placement "
+ "reporting does not support 'devname' based device "
+ "specification but we got %(dev_spec)s. "
+ "Please use PCI address in the configuration instead."
+ ) % {"dev_spec": dev_spec.dev_spec_conf}
+ raise exception.PlacementPciException(error=msg)
+
+
+def ensure_tracking_was_not_enabled_before(
+ provider_tree: provider_tree.ProviderTree
+) -> None:
+ # If placement tracking was enabled before then we do not support
+ # disabling it later. To check for that we can look for RPs with
+ # the COMPUTE_MANAGED_PCI_DEVICE trait. If any is found then we raise
+ # to kill the service.
+ for rp_uuid in provider_tree.get_provider_uuids():
+ if (
+ os_traits.COMPUTE_MANAGED_PCI_DEVICE
+ in provider_tree.data(rp_uuid).traits
+ ):
+ msg = _(
+ "The [pci]report_in_placement is False but it was enabled "
+ "before on this compute. Nova does not support disabling "
+ "it after it is enabled."
+ )
+ raise exception.PlacementPciException(error=msg)
+
+
+def update_provider_tree_for_pci(
+ provider_tree: provider_tree.ProviderTree,
+ nodename: str,
+ pci_tracker: pci_manager.PciDevTracker,
+ allocations: dict,
+ instances_under_same_host_resize: ty.List[str],
+) -> bool:
+ """Based on the PciDevice objects in the pci_tracker it calculates what
+ inventories and allocations need to exist in placement and creates the
+ missing pieces.
+
+ It returns True if not just the provider_tree but also the allocations
+ needed to be changed.
+
+ :param allocations:
+ Dict of allocation data of the form:
+ { $CONSUMER_UUID: {
+ # The shape of each "allocations" dict below is identical
+ # to the return from GET /allocations/{consumer_uuid}
+ "allocations": {
+ $RP_UUID: {
+ "generation": $RP_GEN,
+ "resources": {
+ $RESOURCE_CLASS: $AMOUNT,
+ ...
+ },
+ },
+ ...
+ },
+ "project_id": $PROJ_ID,
+ "user_id": $USER_ID,
+ "consumer_generation": $CONSUMER_GEN,
+ },
+ ...
+ }
+ :param instances_under_same_host_resize: A list of instance UUIDs that
+ are undergoing same host resize on this host.
+ """
+ if not _is_placement_tracking_enabled():
+ ensure_tracking_was_not_enabled_before(provider_tree)
+ # If tracking is not enabled we just return without touching anything
+ return False
+
+ ensure_no_dev_spec_with_devname(pci_tracker.dev_filter.specs)
+
+ LOG.debug(
+ 'Collecting PCI inventories and allocations to track them in Placement'
+ )
+
+ pv = PlacementView(nodename, instances_under_same_host_resize)
+ for dev in pci_tracker.pci_devs:
+ # match the PCI device with the [pci]device_spec config to access
+ # the configuration metadata tags
+ dev_spec = pci_tracker.dev_filter.get_devspec(dev)
+ pv.process_dev(dev, dev_spec)
+
+ LOG.info("Placement PCI resource view: %s", pv)
+
+ pv.update_provider_tree(provider_tree)
+ old_alloc = copy.deepcopy(allocations)
+ # update_provider_tree correlated the PciDevice objects with RPs in
+ # placement and recorded the RP UUID in the PciDevice object. We need to
+ # trigger an update on the device pools in the tracker to get the device
+ # RP UUID mapped to the device pools
+ pci_tracker.stats.populate_pools_metadata_from_assigned_devices()
+ updated = pv.update_allocations(allocations, provider_tree)
+
+ if updated:
+ LOG.debug(
+ "Placement PCI view needs allocation healing. This should only "
+ "happen if [filter_scheduler]pci_in_placement is still disabled. "
+ "Original allocations: %s New allocations: %s",
+ old_alloc,
+ allocations,
+ )
+
+ return updated